{ "best_metric": 1.95067668, "best_model_checkpoint": "/scratch/ms-swift-chatas/exp_output_qwen2_vl_imagechat/v1-20250529-055538/checkpoint-103000", "epoch": 4.412835782528598, "eval_steps": 500, "global_step": 103000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.284306584979221e-05, "grad_norm": 8.035392761230469, "learning_rate": 9.999999998188407e-05, "loss": 4.592377185821533, "memory(GiB)": 10.09, "step": 1, "token_acc": 0.33783783783783783, "train_speed(iter/s)": 0.352947 }, { "epoch": 0.00021421532924896104, "grad_norm": 7.244219779968262, "learning_rate": 9.999999954710156e-05, "loss": 4.315703392028809, "memory(GiB)": 10.09, "step": 5, "token_acc": 0.31620553359683795, "train_speed(iter/s)": 0.883762 }, { "epoch": 0.0004284306584979221, "grad_norm": 5.759903907775879, "learning_rate": 9.99999981884062e-05, "loss": 3.7349918365478514, "memory(GiB)": 17.02, "step": 10, "token_acc": 0.3154121863799283, "train_speed(iter/s)": 1.092766 }, { "epoch": 0.0006426459877468832, "grad_norm": 6.463886737823486, "learning_rate": 9.999999592391398e-05, "loss": 3.3617221832275392, "memory(GiB)": 21.59, "step": 15, "token_acc": 0.3447204968944099, "train_speed(iter/s)": 1.177054 }, { "epoch": 0.0008568613169958442, "grad_norm": 5.3920979499816895, "learning_rate": 9.999999275362494e-05, "loss": 3.489911651611328, "memory(GiB)": 21.59, "step": 20, "token_acc": 0.4070175438596491, "train_speed(iter/s)": 1.254055 }, { "epoch": 0.0010710766462448053, "grad_norm": 4.974893093109131, "learning_rate": 9.999998867753912e-05, "loss": 3.31513671875, "memory(GiB)": 26.72, "step": 25, "token_acc": 0.3643410852713178, "train_speed(iter/s)": 1.238349 }, { "epoch": 0.0012852919754937663, "grad_norm": 4.026622772216797, "learning_rate": 9.999998369565659e-05, "loss": 3.0826137542724608, "memory(GiB)": 26.72, "step": 30, "token_acc": 0.39664804469273746, "train_speed(iter/s)": 1.233881 }, { "epoch": 0.0014995073047427273, "grad_norm": 4.793406009674072, "learning_rate": 9.999997780797748e-05, "loss": 3.117991638183594, "memory(GiB)": 26.72, "step": 35, "token_acc": 0.4230769230769231, "train_speed(iter/s)": 1.280209 }, { "epoch": 0.0017137226339916883, "grad_norm": 7.271947860717773, "learning_rate": 9.999997101450185e-05, "loss": 3.013927459716797, "memory(GiB)": 26.72, "step": 40, "token_acc": 0.4099722991689751, "train_speed(iter/s)": 1.286531 }, { "epoch": 0.0019279379632406496, "grad_norm": 4.337677478790283, "learning_rate": 9.999996331522983e-05, "loss": 2.9280925750732423, "memory(GiB)": 26.72, "step": 45, "token_acc": 0.4026315789473684, "train_speed(iter/s)": 1.323395 }, { "epoch": 0.0021421532924896106, "grad_norm": 7.95403528213501, "learning_rate": 9.99999547101616e-05, "loss": 3.090596008300781, "memory(GiB)": 26.72, "step": 50, "token_acc": 0.38127090301003347, "train_speed(iter/s)": 1.337204 }, { "epoch": 0.002356368621738572, "grad_norm": 11.402766227722168, "learning_rate": 9.999994519929725e-05, "loss": 3.0341163635253907, "memory(GiB)": 26.72, "step": 55, "token_acc": 0.38175675675675674, "train_speed(iter/s)": 1.372759 }, { "epoch": 0.0025705839509875326, "grad_norm": 6.415196418762207, "learning_rate": 9.9999934782637e-05, "loss": 2.797837829589844, "memory(GiB)": 26.72, "step": 60, "token_acc": 0.4051094890510949, "train_speed(iter/s)": 1.397544 }, { "epoch": 0.002784799280236494, "grad_norm": 5.0220208168029785, "learning_rate": 9.999992346018105e-05, "loss": 2.9029520034790037, "memory(GiB)": 26.72, "step": 65, "token_acc": 0.3992932862190813, "train_speed(iter/s)": 1.402898 }, { "epoch": 0.0029990146094854547, "grad_norm": 4.8612518310546875, "learning_rate": 9.999991123192957e-05, "loss": 2.9941619873046874, "memory(GiB)": 26.72, "step": 70, "token_acc": 0.4043887147335423, "train_speed(iter/s)": 1.425915 }, { "epoch": 0.003213229938734416, "grad_norm": 4.054269313812256, "learning_rate": 9.99998980978828e-05, "loss": 2.784201812744141, "memory(GiB)": 26.72, "step": 75, "token_acc": 0.4263322884012539, "train_speed(iter/s)": 1.454161 }, { "epoch": 0.0034274452679833767, "grad_norm": 22.715557098388672, "learning_rate": 9.999988405804095e-05, "loss": 3.0202789306640625, "memory(GiB)": 26.72, "step": 80, "token_acc": 0.4314516129032258, "train_speed(iter/s)": 1.47951 }, { "epoch": 0.003641660597232338, "grad_norm": 4.0691986083984375, "learning_rate": 9.999986911240431e-05, "loss": 2.7469051361083983, "memory(GiB)": 26.72, "step": 85, "token_acc": 0.4430379746835443, "train_speed(iter/s)": 1.485103 }, { "epoch": 0.003855875926481299, "grad_norm": 5.723708629608154, "learning_rate": 9.999985326097314e-05, "loss": 2.6862131118774415, "memory(GiB)": 26.72, "step": 90, "token_acc": 0.4379310344827586, "train_speed(iter/s)": 1.502945 }, { "epoch": 0.00407009125573026, "grad_norm": 5.090478420257568, "learning_rate": 9.999983650374773e-05, "loss": 2.78324031829834, "memory(GiB)": 26.72, "step": 95, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.490174 }, { "epoch": 0.004284306584979221, "grad_norm": 5.822147369384766, "learning_rate": 9.999981884072838e-05, "loss": 2.932465934753418, "memory(GiB)": 26.72, "step": 100, "token_acc": 0.42120343839541546, "train_speed(iter/s)": 1.494382 }, { "epoch": 0.004498521914228182, "grad_norm": 6.790576457977295, "learning_rate": 9.999980027191539e-05, "loss": 2.8091917037963867, "memory(GiB)": 26.72, "step": 105, "token_acc": 0.43190661478599224, "train_speed(iter/s)": 1.494894 }, { "epoch": 0.004712737243477144, "grad_norm": 4.94995641708374, "learning_rate": 9.999978079730912e-05, "loss": 2.6159130096435548, "memory(GiB)": 26.72, "step": 110, "token_acc": 0.43911439114391143, "train_speed(iter/s)": 1.499744 }, { "epoch": 0.0049269525727261045, "grad_norm": 4.683931827545166, "learning_rate": 9.999976041690993e-05, "loss": 2.5537393569946287, "memory(GiB)": 26.72, "step": 115, "token_acc": 0.49823321554770317, "train_speed(iter/s)": 1.495688 }, { "epoch": 0.005141167901975065, "grad_norm": 4.683681488037109, "learning_rate": 9.999973913071817e-05, "loss": 2.5687942504882812, "memory(GiB)": 26.72, "step": 120, "token_acc": 0.475, "train_speed(iter/s)": 1.498772 }, { "epoch": 0.005355383231224026, "grad_norm": 6.400060653686523, "learning_rate": 9.999971693873423e-05, "loss": 2.461386489868164, "memory(GiB)": 26.72, "step": 125, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.500086 }, { "epoch": 0.005569598560472988, "grad_norm": 5.713675498962402, "learning_rate": 9.999969384095851e-05, "loss": 2.952980613708496, "memory(GiB)": 26.72, "step": 130, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.496834 }, { "epoch": 0.0057838138897219485, "grad_norm": 5.028921127319336, "learning_rate": 9.999966983739143e-05, "loss": 2.865002250671387, "memory(GiB)": 26.72, "step": 135, "token_acc": 0.4670846394984326, "train_speed(iter/s)": 1.508888 }, { "epoch": 0.005998029218970909, "grad_norm": 4.72570276260376, "learning_rate": 9.999964492803344e-05, "loss": 2.7058420181274414, "memory(GiB)": 26.72, "step": 140, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.515212 }, { "epoch": 0.006212244548219871, "grad_norm": 3.9119255542755127, "learning_rate": 9.999961911288497e-05, "loss": 2.6955331802368163, "memory(GiB)": 26.72, "step": 145, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.521761 }, { "epoch": 0.006426459877468832, "grad_norm": 4.968887805938721, "learning_rate": 9.99995923919465e-05, "loss": 2.752030944824219, "memory(GiB)": 26.72, "step": 150, "token_acc": 0.4365079365079365, "train_speed(iter/s)": 1.518681 }, { "epoch": 0.006640675206717793, "grad_norm": 4.824211597442627, "learning_rate": 9.99995647652185e-05, "loss": 3.1181013107299806, "memory(GiB)": 26.72, "step": 155, "token_acc": 0.3935860058309038, "train_speed(iter/s)": 1.513063 }, { "epoch": 0.006854890535966753, "grad_norm": 8.688904762268066, "learning_rate": 9.99995362327015e-05, "loss": 2.7499887466430666, "memory(GiB)": 26.72, "step": 160, "token_acc": 0.45733788395904434, "train_speed(iter/s)": 1.522206 }, { "epoch": 0.007069105865215715, "grad_norm": 4.412156105041504, "learning_rate": 9.999950679439598e-05, "loss": 3.1137237548828125, "memory(GiB)": 32.37, "step": 165, "token_acc": 0.3924050632911392, "train_speed(iter/s)": 1.511342 }, { "epoch": 0.007283321194464676, "grad_norm": 6.1439971923828125, "learning_rate": 9.99994764503025e-05, "loss": 2.9106130599975586, "memory(GiB)": 32.37, "step": 170, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.513431 }, { "epoch": 0.007497536523713637, "grad_norm": 5.294766426086426, "learning_rate": 9.99994452004216e-05, "loss": 2.8748470306396485, "memory(GiB)": 32.37, "step": 175, "token_acc": 0.45819397993311034, "train_speed(iter/s)": 1.51648 }, { "epoch": 0.007711751852962598, "grad_norm": 5.394874572753906, "learning_rate": 9.999941304475385e-05, "loss": 2.5859235763549804, "memory(GiB)": 32.37, "step": 180, "token_acc": 0.43670886075949367, "train_speed(iter/s)": 1.500775 }, { "epoch": 0.007925967182211559, "grad_norm": 5.008523464202881, "learning_rate": 9.999937998329982e-05, "loss": 2.9563148498535154, "memory(GiB)": 32.37, "step": 185, "token_acc": 0.3745928338762215, "train_speed(iter/s)": 1.500915 }, { "epoch": 0.00814018251146052, "grad_norm": 4.091294288635254, "learning_rate": 9.999934601606014e-05, "loss": 2.705361557006836, "memory(GiB)": 32.37, "step": 190, "token_acc": 0.4192546583850932, "train_speed(iter/s)": 1.504781 }, { "epoch": 0.00835439784070948, "grad_norm": 4.025635242462158, "learning_rate": 9.999931114303538e-05, "loss": 2.8227870941162108, "memory(GiB)": 32.37, "step": 195, "token_acc": 0.41975308641975306, "train_speed(iter/s)": 1.509104 }, { "epoch": 0.008568613169958442, "grad_norm": 4.693427562713623, "learning_rate": 9.99992753642262e-05, "loss": 2.8427942276000975, "memory(GiB)": 32.37, "step": 200, "token_acc": 0.44787644787644787, "train_speed(iter/s)": 1.512413 }, { "epoch": 0.008782828499207404, "grad_norm": 6.930469512939453, "learning_rate": 9.999923867963326e-05, "loss": 3.0147377014160157, "memory(GiB)": 32.37, "step": 205, "token_acc": 0.3939393939393939, "train_speed(iter/s)": 1.510234 }, { "epoch": 0.008997043828456364, "grad_norm": 7.443196773529053, "learning_rate": 9.999920108925719e-05, "loss": 2.4351734161376952, "memory(GiB)": 32.37, "step": 210, "token_acc": 0.5021459227467812, "train_speed(iter/s)": 1.510754 }, { "epoch": 0.009211259157705326, "grad_norm": 7.709830284118652, "learning_rate": 9.99991625930987e-05, "loss": 2.7562721252441404, "memory(GiB)": 32.37, "step": 215, "token_acc": 0.41603053435114506, "train_speed(iter/s)": 1.507848 }, { "epoch": 0.009425474486954287, "grad_norm": 4.83250093460083, "learning_rate": 9.999912319115848e-05, "loss": 2.6840383529663088, "memory(GiB)": 32.37, "step": 220, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.510214 }, { "epoch": 0.009639689816203247, "grad_norm": 5.140326499938965, "learning_rate": 9.999908288343722e-05, "loss": 2.8931657791137697, "memory(GiB)": 32.37, "step": 225, "token_acc": 0.40540540540540543, "train_speed(iter/s)": 1.518391 }, { "epoch": 0.009853905145452209, "grad_norm": 4.432214736938477, "learning_rate": 9.999904166993568e-05, "loss": 2.5513858795166016, "memory(GiB)": 32.37, "step": 230, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 1.521624 }, { "epoch": 0.010068120474701169, "grad_norm": 5.439079761505127, "learning_rate": 9.999899955065461e-05, "loss": 2.964206314086914, "memory(GiB)": 32.37, "step": 235, "token_acc": 0.4262295081967213, "train_speed(iter/s)": 1.52316 }, { "epoch": 0.01028233580395013, "grad_norm": 9.50673770904541, "learning_rate": 9.999895652559475e-05, "loss": 2.587946128845215, "memory(GiB)": 32.37, "step": 240, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.522841 }, { "epoch": 0.010496551133199092, "grad_norm": 5.015991687774658, "learning_rate": 9.999891259475688e-05, "loss": 2.7738439559936525, "memory(GiB)": 32.37, "step": 245, "token_acc": 0.4020979020979021, "train_speed(iter/s)": 1.526058 }, { "epoch": 0.010710766462448052, "grad_norm": 5.400449752807617, "learning_rate": 9.999886775814182e-05, "loss": 2.7554553985595702, "memory(GiB)": 32.37, "step": 250, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.522284 }, { "epoch": 0.010924981791697014, "grad_norm": 4.275041103363037, "learning_rate": 9.999882201575036e-05, "loss": 2.767551803588867, "memory(GiB)": 32.37, "step": 255, "token_acc": 0.4542483660130719, "train_speed(iter/s)": 1.521842 }, { "epoch": 0.011139197120945975, "grad_norm": 4.915047645568848, "learning_rate": 9.999877536758334e-05, "loss": 2.9720163345336914, "memory(GiB)": 32.37, "step": 260, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.524295 }, { "epoch": 0.011353412450194935, "grad_norm": 5.237390041351318, "learning_rate": 9.99987278136416e-05, "loss": 2.8887733459472655, "memory(GiB)": 32.37, "step": 265, "token_acc": 0.4246575342465753, "train_speed(iter/s)": 1.519953 }, { "epoch": 0.011567627779443897, "grad_norm": 6.189242839813232, "learning_rate": 9.9998679353926e-05, "loss": 2.7354158401489257, "memory(GiB)": 32.37, "step": 270, "token_acc": 0.4826086956521739, "train_speed(iter/s)": 1.521764 }, { "epoch": 0.011781843108692859, "grad_norm": 4.483863353729248, "learning_rate": 9.999862998843743e-05, "loss": 2.5961429595947267, "memory(GiB)": 38.45, "step": 275, "token_acc": 0.44171779141104295, "train_speed(iter/s)": 1.514263 }, { "epoch": 0.011996058437941819, "grad_norm": 4.702669143676758, "learning_rate": 9.999857971717678e-05, "loss": 2.832667350769043, "memory(GiB)": 38.45, "step": 280, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.509821 }, { "epoch": 0.01221027376719078, "grad_norm": 4.098536014556885, "learning_rate": 9.999852854014495e-05, "loss": 2.7756668090820313, "memory(GiB)": 38.45, "step": 285, "token_acc": 0.42333333333333334, "train_speed(iter/s)": 1.511148 }, { "epoch": 0.012424489096439742, "grad_norm": 4.666366100311279, "learning_rate": 9.999847645734286e-05, "loss": 2.5693937301635743, "memory(GiB)": 38.45, "step": 290, "token_acc": 0.44649446494464945, "train_speed(iter/s)": 1.518436 }, { "epoch": 0.012638704425688702, "grad_norm": 3.2439024448394775, "learning_rate": 9.999842346877148e-05, "loss": 2.6344221115112303, "memory(GiB)": 38.45, "step": 295, "token_acc": 0.458955223880597, "train_speed(iter/s)": 1.524232 }, { "epoch": 0.012852919754937664, "grad_norm": 4.860950946807861, "learning_rate": 9.999836957443175e-05, "loss": 2.9313159942626954, "memory(GiB)": 38.45, "step": 300, "token_acc": 0.3890909090909091, "train_speed(iter/s)": 1.528104 }, { "epoch": 0.013067135084186625, "grad_norm": 4.235132694244385, "learning_rate": 9.999831477432466e-05, "loss": 2.4759666442871096, "memory(GiB)": 38.45, "step": 305, "token_acc": 0.5, "train_speed(iter/s)": 1.520383 }, { "epoch": 0.013281350413435585, "grad_norm": 5.354401111602783, "learning_rate": 9.99982590684512e-05, "loss": 2.9464181900024413, "memory(GiB)": 38.45, "step": 310, "token_acc": 0.39655172413793105, "train_speed(iter/s)": 1.520972 }, { "epoch": 0.013495565742684547, "grad_norm": 4.568800449371338, "learning_rate": 9.999820245681236e-05, "loss": 2.7896701812744142, "memory(GiB)": 38.45, "step": 315, "token_acc": 0.412568306010929, "train_speed(iter/s)": 1.515008 }, { "epoch": 0.013709781071933507, "grad_norm": 4.526827812194824, "learning_rate": 9.999814493940918e-05, "loss": 2.6762229919433596, "memory(GiB)": 38.45, "step": 320, "token_acc": 0.4098939929328622, "train_speed(iter/s)": 1.516438 }, { "epoch": 0.013923996401182468, "grad_norm": 4.059617042541504, "learning_rate": 9.999808651624271e-05, "loss": 2.3909589767456056, "memory(GiB)": 38.45, "step": 325, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.51252 }, { "epoch": 0.01413821173043143, "grad_norm": 8.303412437438965, "learning_rate": 9.9998027187314e-05, "loss": 2.690586280822754, "memory(GiB)": 38.45, "step": 330, "token_acc": 0.47345132743362833, "train_speed(iter/s)": 1.515164 }, { "epoch": 0.01435242705968039, "grad_norm": 3.9448540210723877, "learning_rate": 9.999796695262413e-05, "loss": 2.6628677368164064, "memory(GiB)": 38.45, "step": 335, "token_acc": 0.43209876543209874, "train_speed(iter/s)": 1.514875 }, { "epoch": 0.014566642388929352, "grad_norm": 4.270360469818115, "learning_rate": 9.999790581217418e-05, "loss": 2.5807426452636717, "memory(GiB)": 38.45, "step": 340, "token_acc": 0.460431654676259, "train_speed(iter/s)": 1.512271 }, { "epoch": 0.014780857718178313, "grad_norm": 3.328791379928589, "learning_rate": 9.999784376596526e-05, "loss": 2.8379131317138673, "memory(GiB)": 38.45, "step": 345, "token_acc": 0.40963855421686746, "train_speed(iter/s)": 1.516468 }, { "epoch": 0.014995073047427273, "grad_norm": 4.7893385887146, "learning_rate": 9.999778081399851e-05, "loss": 2.5286256790161135, "memory(GiB)": 38.45, "step": 350, "token_acc": 0.4641509433962264, "train_speed(iter/s)": 1.517933 }, { "epoch": 0.015209288376676235, "grad_norm": 6.374311447143555, "learning_rate": 9.999771695627505e-05, "loss": 2.6312910079956056, "memory(GiB)": 38.45, "step": 355, "token_acc": 0.44404332129963897, "train_speed(iter/s)": 1.521818 }, { "epoch": 0.015423503705925197, "grad_norm": 4.904713153839111, "learning_rate": 9.999765219279605e-05, "loss": 2.7969123840332033, "memory(GiB)": 38.45, "step": 360, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.526488 }, { "epoch": 0.015637719035174157, "grad_norm": 3.51206111907959, "learning_rate": 9.999758652356268e-05, "loss": 2.7190811157226564, "memory(GiB)": 38.45, "step": 365, "token_acc": 0.43205574912891986, "train_speed(iter/s)": 1.529405 }, { "epoch": 0.015851934364423118, "grad_norm": 4.942959785461426, "learning_rate": 9.999751994857614e-05, "loss": 2.495261573791504, "memory(GiB)": 38.45, "step": 370, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.530248 }, { "epoch": 0.01606614969367208, "grad_norm": 6.035073280334473, "learning_rate": 9.999745246783761e-05, "loss": 2.8744131088256837, "memory(GiB)": 38.45, "step": 375, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.52934 }, { "epoch": 0.01628036502292104, "grad_norm": 4.9291462898254395, "learning_rate": 9.999738408134832e-05, "loss": 2.8079748153686523, "memory(GiB)": 38.45, "step": 380, "token_acc": 0.4146341463414634, "train_speed(iter/s)": 1.528471 }, { "epoch": 0.01649458035217, "grad_norm": 3.258988380432129, "learning_rate": 9.999731478910952e-05, "loss": 2.445315933227539, "memory(GiB)": 38.45, "step": 385, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.529275 }, { "epoch": 0.01670879568141896, "grad_norm": 12.798467636108398, "learning_rate": 9.999724459112247e-05, "loss": 2.7420465469360353, "memory(GiB)": 38.45, "step": 390, "token_acc": 0.43162393162393164, "train_speed(iter/s)": 1.524791 }, { "epoch": 0.016923011010667923, "grad_norm": 7.007190704345703, "learning_rate": 9.999717348738843e-05, "loss": 2.942500114440918, "memory(GiB)": 38.45, "step": 395, "token_acc": 0.40942028985507245, "train_speed(iter/s)": 1.527629 }, { "epoch": 0.017137226339916885, "grad_norm": 6.345561981201172, "learning_rate": 9.999710147790869e-05, "loss": 2.7135833740234374, "memory(GiB)": 38.45, "step": 400, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.529885 }, { "epoch": 0.017351441669165846, "grad_norm": 5.034018039703369, "learning_rate": 9.999702856268456e-05, "loss": 2.9034446716308593, "memory(GiB)": 38.45, "step": 405, "token_acc": 0.42435424354243545, "train_speed(iter/s)": 1.526357 }, { "epoch": 0.017565656998414808, "grad_norm": 4.46183967590332, "learning_rate": 9.999695474171735e-05, "loss": 2.5323673248291017, "memory(GiB)": 38.45, "step": 410, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.524539 }, { "epoch": 0.017779872327663766, "grad_norm": 4.101302623748779, "learning_rate": 9.99968800150084e-05, "loss": 2.836459732055664, "memory(GiB)": 38.45, "step": 415, "token_acc": 0.4169611307420495, "train_speed(iter/s)": 1.525782 }, { "epoch": 0.017994087656912728, "grad_norm": 5.307928085327148, "learning_rate": 9.999680438255908e-05, "loss": 2.5662986755371096, "memory(GiB)": 38.45, "step": 420, "token_acc": 0.4855072463768116, "train_speed(iter/s)": 1.526494 }, { "epoch": 0.01820830298616169, "grad_norm": 4.104459285736084, "learning_rate": 9.999672784437075e-05, "loss": 2.9653785705566404, "memory(GiB)": 38.45, "step": 425, "token_acc": 0.42379182156133827, "train_speed(iter/s)": 1.523517 }, { "epoch": 0.01842251831541065, "grad_norm": 4.8199462890625, "learning_rate": 9.999665040044477e-05, "loss": 3.1615955352783205, "memory(GiB)": 38.45, "step": 430, "token_acc": 0.41312741312741313, "train_speed(iter/s)": 1.520442 }, { "epoch": 0.018636733644659613, "grad_norm": 5.425066947937012, "learning_rate": 9.99965720507826e-05, "loss": 2.6454307556152346, "memory(GiB)": 38.45, "step": 435, "token_acc": 0.4384057971014493, "train_speed(iter/s)": 1.519672 }, { "epoch": 0.018850948973908575, "grad_norm": 7.106417179107666, "learning_rate": 9.999649279538562e-05, "loss": 2.5446660995483397, "memory(GiB)": 38.45, "step": 440, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.519192 }, { "epoch": 0.019065164303157533, "grad_norm": 3.9185760021209717, "learning_rate": 9.999641263425524e-05, "loss": 2.862480354309082, "memory(GiB)": 38.45, "step": 445, "token_acc": 0.43026706231454004, "train_speed(iter/s)": 1.512258 }, { "epoch": 0.019279379632406494, "grad_norm": 4.255904197692871, "learning_rate": 9.999633156739298e-05, "loss": 2.7113834381103517, "memory(GiB)": 38.45, "step": 450, "token_acc": 0.46360153256704983, "train_speed(iter/s)": 1.512535 }, { "epoch": 0.019493594961655456, "grad_norm": 3.6170578002929688, "learning_rate": 9.999624959480025e-05, "loss": 2.712862014770508, "memory(GiB)": 38.45, "step": 455, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.51161 }, { "epoch": 0.019707810290904418, "grad_norm": 4.32484245300293, "learning_rate": 9.999616671647858e-05, "loss": 2.6673866271972657, "memory(GiB)": 38.45, "step": 460, "token_acc": 0.44398340248962653, "train_speed(iter/s)": 1.511981 }, { "epoch": 0.01992202562015338, "grad_norm": 4.658557891845703, "learning_rate": 9.999608293242943e-05, "loss": 2.3850494384765626, "memory(GiB)": 38.45, "step": 465, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.51372 }, { "epoch": 0.020136240949402338, "grad_norm": 5.076476573944092, "learning_rate": 9.999599824265436e-05, "loss": 3.014224624633789, "memory(GiB)": 38.45, "step": 470, "token_acc": 0.4306569343065693, "train_speed(iter/s)": 1.515909 }, { "epoch": 0.0203504562786513, "grad_norm": 3.9983785152435303, "learning_rate": 9.999591264715485e-05, "loss": 2.65859375, "memory(GiB)": 38.45, "step": 475, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.512097 }, { "epoch": 0.02056467160790026, "grad_norm": 4.255746841430664, "learning_rate": 9.99958261459325e-05, "loss": 2.274767303466797, "memory(GiB)": 38.45, "step": 480, "token_acc": 0.5085470085470085, "train_speed(iter/s)": 1.510742 }, { "epoch": 0.020778886937149223, "grad_norm": 3.8533356189727783, "learning_rate": 9.999573873898887e-05, "loss": 2.840310287475586, "memory(GiB)": 38.45, "step": 485, "token_acc": 0.44635193133047213, "train_speed(iter/s)": 1.514164 }, { "epoch": 0.020993102266398184, "grad_norm": 3.6588633060455322, "learning_rate": 9.999565042632553e-05, "loss": 2.7850326538085937, "memory(GiB)": 38.45, "step": 490, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.514103 }, { "epoch": 0.021207317595647146, "grad_norm": 5.175492286682129, "learning_rate": 9.999556120794407e-05, "loss": 2.610948181152344, "memory(GiB)": 38.45, "step": 495, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.511599 }, { "epoch": 0.021421532924896104, "grad_norm": 4.59173059463501, "learning_rate": 9.999547108384612e-05, "loss": 2.926106262207031, "memory(GiB)": 38.45, "step": 500, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.510686 }, { "epoch": 0.021421532924896104, "eval_loss": 2.28562331199646, "eval_runtime": 14.014, "eval_samples_per_second": 7.136, "eval_steps_per_second": 7.136, "eval_token_acc": 0.46879150066401065, "step": 500 }, { "epoch": 0.021635748254145066, "grad_norm": 10.294635772705078, "learning_rate": 9.999538005403332e-05, "loss": 2.579853820800781, "memory(GiB)": 38.45, "step": 505, "token_acc": 0.46791707798617965, "train_speed(iter/s)": 1.447716 }, { "epoch": 0.021849963583394028, "grad_norm": 4.110366344451904, "learning_rate": 9.999528811850732e-05, "loss": 2.8328296661376955, "memory(GiB)": 38.45, "step": 510, "token_acc": 0.4592833876221498, "train_speed(iter/s)": 1.45066 }, { "epoch": 0.02206417891264299, "grad_norm": 3.967574119567871, "learning_rate": 9.999519527726974e-05, "loss": 2.645603370666504, "memory(GiB)": 38.45, "step": 515, "token_acc": 0.45126353790613716, "train_speed(iter/s)": 1.4524 }, { "epoch": 0.02227839424189195, "grad_norm": 4.127718925476074, "learning_rate": 9.999510153032234e-05, "loss": 2.7630361557006835, "memory(GiB)": 38.45, "step": 520, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.449849 }, { "epoch": 0.022492609571140913, "grad_norm": 6.0096845626831055, "learning_rate": 9.999500687766675e-05, "loss": 2.773067665100098, "memory(GiB)": 38.45, "step": 525, "token_acc": 0.4152823920265781, "train_speed(iter/s)": 1.452213 }, { "epoch": 0.02270682490038987, "grad_norm": 3.1059420108795166, "learning_rate": 9.999491131930472e-05, "loss": 2.55417537689209, "memory(GiB)": 38.45, "step": 530, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.454439 }, { "epoch": 0.022921040229638832, "grad_norm": 4.492700576782227, "learning_rate": 9.999481485523798e-05, "loss": 2.4823116302490233, "memory(GiB)": 38.45, "step": 535, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.454122 }, { "epoch": 0.023135255558887794, "grad_norm": 3.9863710403442383, "learning_rate": 9.999471748546826e-05, "loss": 2.539709281921387, "memory(GiB)": 38.45, "step": 540, "token_acc": 0.48091603053435117, "train_speed(iter/s)": 1.453699 }, { "epoch": 0.023349470888136756, "grad_norm": 13.657026290893555, "learning_rate": 9.999461920999733e-05, "loss": 2.495493698120117, "memory(GiB)": 38.45, "step": 545, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.455843 }, { "epoch": 0.023563686217385717, "grad_norm": 5.109379291534424, "learning_rate": 9.999452002882699e-05, "loss": 2.5587100982666016, "memory(GiB)": 38.45, "step": 550, "token_acc": 0.4467455621301775, "train_speed(iter/s)": 1.455301 }, { "epoch": 0.023777901546634676, "grad_norm": 9.8173828125, "learning_rate": 9.999441994195902e-05, "loss": 2.557139778137207, "memory(GiB)": 38.45, "step": 555, "token_acc": 0.4781021897810219, "train_speed(iter/s)": 1.452023 }, { "epoch": 0.023992116875883637, "grad_norm": 3.308816432952881, "learning_rate": 9.999431894939522e-05, "loss": 2.7724546432495116, "memory(GiB)": 38.45, "step": 560, "token_acc": 0.44404332129963897, "train_speed(iter/s)": 1.454533 }, { "epoch": 0.0242063322051326, "grad_norm": 8.385897636413574, "learning_rate": 9.999421705113743e-05, "loss": 2.816848564147949, "memory(GiB)": 38.45, "step": 565, "token_acc": 0.43034055727554177, "train_speed(iter/s)": 1.457619 }, { "epoch": 0.02442054753438156, "grad_norm": 4.684794902801514, "learning_rate": 9.999411424718752e-05, "loss": 2.410172462463379, "memory(GiB)": 38.45, "step": 570, "token_acc": 0.4807121661721068, "train_speed(iter/s)": 1.459184 }, { "epoch": 0.024634762863630522, "grad_norm": 5.972314357757568, "learning_rate": 9.999401053754731e-05, "loss": 2.4901058197021486, "memory(GiB)": 38.45, "step": 575, "token_acc": 0.4607142857142857, "train_speed(iter/s)": 1.459399 }, { "epoch": 0.024848978192879484, "grad_norm": 4.343634128570557, "learning_rate": 9.999390592221871e-05, "loss": 2.8735374450683593, "memory(GiB)": 38.45, "step": 580, "token_acc": 0.45132743362831856, "train_speed(iter/s)": 1.460147 }, { "epoch": 0.025063193522128442, "grad_norm": 4.469681739807129, "learning_rate": 9.999380040120362e-05, "loss": 2.8024024963378906, "memory(GiB)": 38.45, "step": 585, "token_acc": 0.4223433242506812, "train_speed(iter/s)": 1.462626 }, { "epoch": 0.025277408851377404, "grad_norm": 4.772650718688965, "learning_rate": 9.999369397450392e-05, "loss": 2.6395135879516602, "memory(GiB)": 38.45, "step": 590, "token_acc": 0.43853820598006643, "train_speed(iter/s)": 1.465825 }, { "epoch": 0.025491624180626365, "grad_norm": 4.2591633796691895, "learning_rate": 9.999358664212158e-05, "loss": 2.9030269622802733, "memory(GiB)": 38.45, "step": 595, "token_acc": 0.44921875, "train_speed(iter/s)": 1.467251 }, { "epoch": 0.025705839509875327, "grad_norm": 4.614182949066162, "learning_rate": 9.999347840405848e-05, "loss": 2.5291936874389647, "memory(GiB)": 38.45, "step": 600, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.466106 }, { "epoch": 0.02592005483912429, "grad_norm": 3.163677215576172, "learning_rate": 9.999336926031666e-05, "loss": 2.676190948486328, "memory(GiB)": 38.45, "step": 605, "token_acc": 0.4517133956386293, "train_speed(iter/s)": 1.465845 }, { "epoch": 0.02613427016837325, "grad_norm": 4.9240498542785645, "learning_rate": 9.999325921089804e-05, "loss": 2.810549736022949, "memory(GiB)": 38.45, "step": 610, "token_acc": 0.4483985765124555, "train_speed(iter/s)": 1.46277 }, { "epoch": 0.02634848549762221, "grad_norm": 4.851043701171875, "learning_rate": 9.999314825580464e-05, "loss": 3.121914100646973, "memory(GiB)": 38.45, "step": 615, "token_acc": 0.36271186440677966, "train_speed(iter/s)": 1.464436 }, { "epoch": 0.02656270082687117, "grad_norm": 3.2191321849823, "learning_rate": 9.999303639503845e-05, "loss": 2.8040241241455077, "memory(GiB)": 38.45, "step": 620, "token_acc": 0.4169381107491857, "train_speed(iter/s)": 1.467151 }, { "epoch": 0.026776916156120132, "grad_norm": 6.146385192871094, "learning_rate": 9.999292362860151e-05, "loss": 2.6678306579589846, "memory(GiB)": 38.45, "step": 625, "token_acc": 0.4452296819787986, "train_speed(iter/s)": 1.465956 }, { "epoch": 0.026991131485369094, "grad_norm": 4.547609329223633, "learning_rate": 9.999280995649588e-05, "loss": 2.668691635131836, "memory(GiB)": 38.45, "step": 630, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.466443 }, { "epoch": 0.027205346814618055, "grad_norm": 3.406956195831299, "learning_rate": 9.999269537872357e-05, "loss": 2.705013656616211, "memory(GiB)": 38.45, "step": 635, "token_acc": 0.44, "train_speed(iter/s)": 1.469634 }, { "epoch": 0.027419562143867014, "grad_norm": 4.823546409606934, "learning_rate": 9.99925798952867e-05, "loss": 2.1641578674316406, "memory(GiB)": 38.45, "step": 640, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.469394 }, { "epoch": 0.027633777473115975, "grad_norm": 6.831277847290039, "learning_rate": 9.999246350618734e-05, "loss": 2.6867881774902345, "memory(GiB)": 38.45, "step": 645, "token_acc": 0.45394736842105265, "train_speed(iter/s)": 1.470678 }, { "epoch": 0.027847992802364937, "grad_norm": 4.051003932952881, "learning_rate": 9.999234621142764e-05, "loss": 2.5303443908691405, "memory(GiB)": 38.45, "step": 650, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.472711 }, { "epoch": 0.0280622081316139, "grad_norm": 5.49873161315918, "learning_rate": 9.999222801100966e-05, "loss": 2.797929573059082, "memory(GiB)": 38.45, "step": 655, "token_acc": 0.41754385964912283, "train_speed(iter/s)": 1.471859 }, { "epoch": 0.02827642346086286, "grad_norm": 4.199821949005127, "learning_rate": 9.999210890493558e-05, "loss": 2.677899932861328, "memory(GiB)": 38.45, "step": 660, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.47234 }, { "epoch": 0.028490638790111822, "grad_norm": 6.619091987609863, "learning_rate": 9.999198889320755e-05, "loss": 2.7923709869384767, "memory(GiB)": 38.45, "step": 665, "token_acc": 0.40483383685800606, "train_speed(iter/s)": 1.474378 }, { "epoch": 0.02870485411936078, "grad_norm": 2.97367525100708, "learning_rate": 9.999186797582775e-05, "loss": 2.836111068725586, "memory(GiB)": 38.45, "step": 670, "token_acc": 0.43260188087774293, "train_speed(iter/s)": 1.472144 }, { "epoch": 0.028919069448609742, "grad_norm": 3.7363877296447754, "learning_rate": 9.999174615279836e-05, "loss": 2.5940717697143554, "memory(GiB)": 38.45, "step": 675, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.470481 }, { "epoch": 0.029133284777858703, "grad_norm": 4.364439964294434, "learning_rate": 9.999162342412161e-05, "loss": 2.753948783874512, "memory(GiB)": 38.45, "step": 680, "token_acc": 0.4229390681003584, "train_speed(iter/s)": 1.470741 }, { "epoch": 0.029347500107107665, "grad_norm": 3.7315828800201416, "learning_rate": 9.999149978979969e-05, "loss": 2.5765497207641603, "memory(GiB)": 38.45, "step": 685, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.47357 }, { "epoch": 0.029561715436356627, "grad_norm": 5.6918487548828125, "learning_rate": 9.999137524983485e-05, "loss": 2.78399543762207, "memory(GiB)": 38.45, "step": 690, "token_acc": 0.45357142857142857, "train_speed(iter/s)": 1.476662 }, { "epoch": 0.02977593076560559, "grad_norm": 2.9097728729248047, "learning_rate": 9.999124980422936e-05, "loss": 2.7044647216796873, "memory(GiB)": 38.45, "step": 695, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.47552 }, { "epoch": 0.029990146094854547, "grad_norm": 4.9471306800842285, "learning_rate": 9.999112345298549e-05, "loss": 2.4053537368774416, "memory(GiB)": 38.45, "step": 700, "token_acc": 0.4664179104477612, "train_speed(iter/s)": 1.472989 }, { "epoch": 0.03020436142410351, "grad_norm": 4.085890293121338, "learning_rate": 9.999099619610552e-05, "loss": 2.660585403442383, "memory(GiB)": 38.45, "step": 705, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.474779 }, { "epoch": 0.03041857675335247, "grad_norm": 6.359280109405518, "learning_rate": 9.999086803359174e-05, "loss": 2.5837047576904295, "memory(GiB)": 38.45, "step": 710, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 1.476224 }, { "epoch": 0.03063279208260143, "grad_norm": 5.505381107330322, "learning_rate": 9.99907389654465e-05, "loss": 2.4027435302734377, "memory(GiB)": 38.45, "step": 715, "token_acc": 0.4811715481171548, "train_speed(iter/s)": 1.478193 }, { "epoch": 0.030847007411850393, "grad_norm": 3.7724270820617676, "learning_rate": 9.999060899167214e-05, "loss": 2.5833389282226564, "memory(GiB)": 38.45, "step": 720, "token_acc": 0.48520710059171596, "train_speed(iter/s)": 1.480148 }, { "epoch": 0.03106122274109935, "grad_norm": 4.620935916900635, "learning_rate": 9.9990478112271e-05, "loss": 2.739794921875, "memory(GiB)": 38.45, "step": 725, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.47927 }, { "epoch": 0.03127543807034831, "grad_norm": 3.3408830165863037, "learning_rate": 9.999034632724546e-05, "loss": 2.862911605834961, "memory(GiB)": 38.45, "step": 730, "token_acc": 0.4508670520231214, "train_speed(iter/s)": 1.477498 }, { "epoch": 0.03148965339959728, "grad_norm": 6.213047027587891, "learning_rate": 9.999021363659788e-05, "loss": 2.7660221099853515, "memory(GiB)": 38.45, "step": 735, "token_acc": 0.44074074074074077, "train_speed(iter/s)": 1.478877 }, { "epoch": 0.031703868728846236, "grad_norm": 7.579422473907471, "learning_rate": 9.99900800403307e-05, "loss": 2.365354537963867, "memory(GiB)": 38.45, "step": 740, "token_acc": 0.48134328358208955, "train_speed(iter/s)": 1.47808 }, { "epoch": 0.031918084058095195, "grad_norm": 5.464479446411133, "learning_rate": 9.998994553844632e-05, "loss": 2.5875072479248047, "memory(GiB)": 38.45, "step": 745, "token_acc": 0.49407114624505927, "train_speed(iter/s)": 1.478492 }, { "epoch": 0.03213229938734416, "grad_norm": 2.7492892742156982, "learning_rate": 9.99898101309472e-05, "loss": 3.0725284576416017, "memory(GiB)": 38.45, "step": 750, "token_acc": 0.3860182370820669, "train_speed(iter/s)": 1.479731 }, { "epoch": 0.03234651471659312, "grad_norm": 4.940596103668213, "learning_rate": 9.998967381783575e-05, "loss": 2.602863311767578, "memory(GiB)": 38.45, "step": 755, "token_acc": 0.4409722222222222, "train_speed(iter/s)": 1.480387 }, { "epoch": 0.03256073004584208, "grad_norm": 4.953486442565918, "learning_rate": 9.998953659911447e-05, "loss": 2.981593132019043, "memory(GiB)": 38.45, "step": 760, "token_acc": 0.3782051282051282, "train_speed(iter/s)": 1.47926 }, { "epoch": 0.03277494537509104, "grad_norm": 3.4166150093078613, "learning_rate": 9.998939847478585e-05, "loss": 2.607655143737793, "memory(GiB)": 38.45, "step": 765, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.47998 }, { "epoch": 0.03298916070434, "grad_norm": 3.688446283340454, "learning_rate": 9.998925944485238e-05, "loss": 2.6573068618774416, "memory(GiB)": 38.45, "step": 770, "token_acc": 0.4421768707482993, "train_speed(iter/s)": 1.481473 }, { "epoch": 0.033203376033588965, "grad_norm": 3.088634490966797, "learning_rate": 9.998911950931658e-05, "loss": 2.8627756118774412, "memory(GiB)": 38.45, "step": 775, "token_acc": 0.4155844155844156, "train_speed(iter/s)": 1.482194 }, { "epoch": 0.03341759136283792, "grad_norm": 4.363468170166016, "learning_rate": 9.998897866818099e-05, "loss": 2.7546947479248045, "memory(GiB)": 38.45, "step": 780, "token_acc": 0.41403508771929826, "train_speed(iter/s)": 1.483031 }, { "epoch": 0.03363180669208689, "grad_norm": 5.336426258087158, "learning_rate": 9.998883692144817e-05, "loss": 2.5782140731811523, "memory(GiB)": 38.45, "step": 785, "token_acc": 0.496875, "train_speed(iter/s)": 1.484279 }, { "epoch": 0.033846022021335846, "grad_norm": 3.5997202396392822, "learning_rate": 9.998869426912065e-05, "loss": 2.6509056091308594, "memory(GiB)": 38.45, "step": 790, "token_acc": 0.44755244755244755, "train_speed(iter/s)": 1.484196 }, { "epoch": 0.034060237350584804, "grad_norm": 3.448155403137207, "learning_rate": 9.998855071120104e-05, "loss": 2.881861114501953, "memory(GiB)": 38.45, "step": 795, "token_acc": 0.4403409090909091, "train_speed(iter/s)": 1.485208 }, { "epoch": 0.03427445267983377, "grad_norm": 3.7866625785827637, "learning_rate": 9.998840624769196e-05, "loss": 2.8026958465576173, "memory(GiB)": 38.45, "step": 800, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.486111 }, { "epoch": 0.03448866800908273, "grad_norm": 3.824439525604248, "learning_rate": 9.9988260878596e-05, "loss": 2.645696449279785, "memory(GiB)": 38.45, "step": 805, "token_acc": 0.44370860927152317, "train_speed(iter/s)": 1.488295 }, { "epoch": 0.03470288333833169, "grad_norm": 4.691617965698242, "learning_rate": 9.99881146039158e-05, "loss": 2.7255084991455076, "memory(GiB)": 38.45, "step": 810, "token_acc": 0.42323651452282157, "train_speed(iter/s)": 1.48988 }, { "epoch": 0.03491709866758065, "grad_norm": 3.136521100997925, "learning_rate": 9.998796742365402e-05, "loss": 2.609482192993164, "memory(GiB)": 38.45, "step": 815, "token_acc": 0.4309859154929577, "train_speed(iter/s)": 1.489195 }, { "epoch": 0.035131313996829616, "grad_norm": 3.6892218589782715, "learning_rate": 9.99878193378133e-05, "loss": 2.4301197052001955, "memory(GiB)": 38.45, "step": 820, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.491319 }, { "epoch": 0.035345529326078574, "grad_norm": 3.0272536277770996, "learning_rate": 9.998767034639636e-05, "loss": 2.8461444854736326, "memory(GiB)": 38.45, "step": 825, "token_acc": 0.42902208201892744, "train_speed(iter/s)": 1.492437 }, { "epoch": 0.03555974465532753, "grad_norm": 3.5021369457244873, "learning_rate": 9.998752044940587e-05, "loss": 2.7193660736083984, "memory(GiB)": 38.45, "step": 830, "token_acc": 0.4416403785488959, "train_speed(iter/s)": 1.493741 }, { "epoch": 0.0357739599845765, "grad_norm": 3.633901596069336, "learning_rate": 9.998736964684454e-05, "loss": 2.5642627716064452, "memory(GiB)": 38.45, "step": 835, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.493056 }, { "epoch": 0.035988175313825456, "grad_norm": 4.662364482879639, "learning_rate": 9.998721793871513e-05, "loss": 2.5761337280273438, "memory(GiB)": 38.45, "step": 840, "token_acc": 0.45054945054945056, "train_speed(iter/s)": 1.494073 }, { "epoch": 0.03620239064307442, "grad_norm": 5.195523262023926, "learning_rate": 9.998706532502038e-05, "loss": 2.638579177856445, "memory(GiB)": 38.45, "step": 845, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.492949 }, { "epoch": 0.03641660597232338, "grad_norm": 4.824570655822754, "learning_rate": 9.998691180576306e-05, "loss": 2.672596549987793, "memory(GiB)": 38.45, "step": 850, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.492738 }, { "epoch": 0.03663082130157234, "grad_norm": 2.6324217319488525, "learning_rate": 9.998675738094591e-05, "loss": 2.4606231689453124, "memory(GiB)": 38.45, "step": 855, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.492514 }, { "epoch": 0.0368450366308213, "grad_norm": 6.1279191970825195, "learning_rate": 9.998660205057179e-05, "loss": 2.696563148498535, "memory(GiB)": 38.45, "step": 860, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 1.494138 }, { "epoch": 0.03705925196007026, "grad_norm": 3.731215238571167, "learning_rate": 9.998644581464348e-05, "loss": 2.98821964263916, "memory(GiB)": 38.45, "step": 865, "token_acc": 0.429042904290429, "train_speed(iter/s)": 1.494777 }, { "epoch": 0.037273467289319226, "grad_norm": 4.00960111618042, "learning_rate": 9.998628867316378e-05, "loss": 2.7611104965209963, "memory(GiB)": 38.45, "step": 870, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.493999 }, { "epoch": 0.037487682618568184, "grad_norm": 3.2089624404907227, "learning_rate": 9.99861306261356e-05, "loss": 2.7675397872924803, "memory(GiB)": 38.45, "step": 875, "token_acc": 0.4125, "train_speed(iter/s)": 1.494044 }, { "epoch": 0.03770189794781715, "grad_norm": 5.143516540527344, "learning_rate": 9.998597167356176e-05, "loss": 2.595576858520508, "memory(GiB)": 38.45, "step": 880, "token_acc": 0.459546925566343, "train_speed(iter/s)": 1.494389 }, { "epoch": 0.03791611327706611, "grad_norm": 4.214605808258057, "learning_rate": 9.998581181544516e-05, "loss": 2.522385025024414, "memory(GiB)": 38.45, "step": 885, "token_acc": 0.4668769716088328, "train_speed(iter/s)": 1.494258 }, { "epoch": 0.038130328606315066, "grad_norm": 3.4676084518432617, "learning_rate": 9.998565105178869e-05, "loss": 2.440280532836914, "memory(GiB)": 38.45, "step": 890, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.496183 }, { "epoch": 0.03834454393556403, "grad_norm": 4.099660873413086, "learning_rate": 9.998548938259525e-05, "loss": 2.7564018249511717, "memory(GiB)": 38.45, "step": 895, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.496428 }, { "epoch": 0.03855875926481299, "grad_norm": 3.6905577182769775, "learning_rate": 9.998532680786778e-05, "loss": 2.4071306228637694, "memory(GiB)": 38.81, "step": 900, "token_acc": 0.5, "train_speed(iter/s)": 1.496277 }, { "epoch": 0.038772974594061954, "grad_norm": 3.0314619541168213, "learning_rate": 9.998516332760924e-05, "loss": 2.5806015014648436, "memory(GiB)": 38.81, "step": 905, "token_acc": 0.4557377049180328, "train_speed(iter/s)": 1.496542 }, { "epoch": 0.03898718992331091, "grad_norm": 4.86159086227417, "learning_rate": 9.998499894182255e-05, "loss": 2.5048938751220704, "memory(GiB)": 38.81, "step": 910, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.49598 }, { "epoch": 0.03920140525255987, "grad_norm": 3.315594434738159, "learning_rate": 9.998483365051075e-05, "loss": 2.897009086608887, "memory(GiB)": 38.81, "step": 915, "token_acc": 0.43018867924528303, "train_speed(iter/s)": 1.493181 }, { "epoch": 0.039415620581808836, "grad_norm": 3.834003210067749, "learning_rate": 9.998466745367678e-05, "loss": 2.5996976852416993, "memory(GiB)": 38.81, "step": 920, "token_acc": 0.4365781710914454, "train_speed(iter/s)": 1.492773 }, { "epoch": 0.039629835911057794, "grad_norm": 3.0740156173706055, "learning_rate": 9.998450035132367e-05, "loss": 2.842950439453125, "memory(GiB)": 38.81, "step": 925, "token_acc": 0.43506493506493504, "train_speed(iter/s)": 1.492662 }, { "epoch": 0.03984405124030676, "grad_norm": 4.596339225769043, "learning_rate": 9.998433234345446e-05, "loss": 2.635214424133301, "memory(GiB)": 38.81, "step": 930, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.493597 }, { "epoch": 0.04005826656955572, "grad_norm": 3.8838858604431152, "learning_rate": 9.998416343007215e-05, "loss": 2.54049072265625, "memory(GiB)": 38.81, "step": 935, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.493119 }, { "epoch": 0.040272481898804675, "grad_norm": 3.087055206298828, "learning_rate": 9.998399361117985e-05, "loss": 2.6478448867797852, "memory(GiB)": 38.81, "step": 940, "token_acc": 0.43444730077120824, "train_speed(iter/s)": 1.493678 }, { "epoch": 0.04048669722805364, "grad_norm": 3.6623730659484863, "learning_rate": 9.998382288678062e-05, "loss": 3.016253662109375, "memory(GiB)": 38.81, "step": 945, "token_acc": 0.4161290322580645, "train_speed(iter/s)": 1.495231 }, { "epoch": 0.0407009125573026, "grad_norm": 5.71432638168335, "learning_rate": 9.998365125687754e-05, "loss": 2.4765270233154295, "memory(GiB)": 38.81, "step": 950, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.494298 }, { "epoch": 0.040915127886551564, "grad_norm": 3.9099810123443604, "learning_rate": 9.998347872147374e-05, "loss": 2.8657649993896483, "memory(GiB)": 38.81, "step": 955, "token_acc": 0.43050847457627117, "train_speed(iter/s)": 1.494705 }, { "epoch": 0.04112934321580052, "grad_norm": 3.657832384109497, "learning_rate": 9.998330528057234e-05, "loss": 2.5781782150268553, "memory(GiB)": 38.81, "step": 960, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.495667 }, { "epoch": 0.04134355854504949, "grad_norm": 4.406288146972656, "learning_rate": 9.998313093417646e-05, "loss": 2.432485008239746, "memory(GiB)": 44.8, "step": 965, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 1.494695 }, { "epoch": 0.041557773874298445, "grad_norm": 3.894352912902832, "learning_rate": 9.998295568228928e-05, "loss": 2.6198606491088867, "memory(GiB)": 44.8, "step": 970, "token_acc": 0.4541984732824427, "train_speed(iter/s)": 1.495261 }, { "epoch": 0.041771989203547404, "grad_norm": 2.8862714767456055, "learning_rate": 9.998277952491397e-05, "loss": 2.6841056823730467, "memory(GiB)": 44.8, "step": 975, "token_acc": 0.45592705167173253, "train_speed(iter/s)": 1.493921 }, { "epoch": 0.04198620453279637, "grad_norm": 3.875049352645874, "learning_rate": 9.998260246205372e-05, "loss": 2.9591068267822265, "memory(GiB)": 44.8, "step": 980, "token_acc": 0.4228395061728395, "train_speed(iter/s)": 1.495308 }, { "epoch": 0.04220041986204533, "grad_norm": 3.3060050010681152, "learning_rate": 9.998242449371174e-05, "loss": 2.8508010864257813, "memory(GiB)": 44.8, "step": 985, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.49519 }, { "epoch": 0.04241463519129429, "grad_norm": 6.291374206542969, "learning_rate": 9.998224561989124e-05, "loss": 2.6170028686523437, "memory(GiB)": 44.8, "step": 990, "token_acc": 0.4527687296416938, "train_speed(iter/s)": 1.495631 }, { "epoch": 0.04262885052054325, "grad_norm": 3.2309229373931885, "learning_rate": 9.99820658405955e-05, "loss": 2.529778480529785, "memory(GiB)": 44.8, "step": 995, "token_acc": 0.4478114478114478, "train_speed(iter/s)": 1.493933 }, { "epoch": 0.04284306584979221, "grad_norm": 7.27476167678833, "learning_rate": 9.998188515582773e-05, "loss": 2.746327781677246, "memory(GiB)": 44.8, "step": 1000, "token_acc": 0.44, "train_speed(iter/s)": 1.49489 }, { "epoch": 0.04284306584979221, "eval_loss": 2.4384655952453613, "eval_runtime": 14.749, "eval_samples_per_second": 6.78, "eval_steps_per_second": 6.78, "eval_token_acc": 0.44221698113207547, "step": 1000 }, { "epoch": 0.043057281179041174, "grad_norm": 3.783703565597534, "learning_rate": 9.998170356559125e-05, "loss": 2.604329299926758, "memory(GiB)": 44.8, "step": 1005, "token_acc": 0.4456993918331885, "train_speed(iter/s)": 1.460458 }, { "epoch": 0.04327149650829013, "grad_norm": 3.809075355529785, "learning_rate": 9.99815210698893e-05, "loss": 2.831036376953125, "memory(GiB)": 44.8, "step": 1010, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.461448 }, { "epoch": 0.0434857118375391, "grad_norm": 3.267028570175171, "learning_rate": 9.99813376687252e-05, "loss": 2.627566909790039, "memory(GiB)": 44.8, "step": 1015, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.461377 }, { "epoch": 0.043699927166788055, "grad_norm": 4.049720764160156, "learning_rate": 9.998115336210229e-05, "loss": 2.621623229980469, "memory(GiB)": 44.8, "step": 1020, "token_acc": 0.44609665427509293, "train_speed(iter/s)": 1.463755 }, { "epoch": 0.04391414249603701, "grad_norm": 4.8473429679870605, "learning_rate": 9.998096815002391e-05, "loss": 2.7243886947631837, "memory(GiB)": 44.8, "step": 1025, "token_acc": 0.4290657439446367, "train_speed(iter/s)": 1.464354 }, { "epoch": 0.04412835782528598, "grad_norm": 7.37783670425415, "learning_rate": 9.998078203249341e-05, "loss": 2.462293243408203, "memory(GiB)": 44.8, "step": 1030, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.465813 }, { "epoch": 0.04434257315453494, "grad_norm": 3.511589288711548, "learning_rate": 9.998059500951415e-05, "loss": 2.864263725280762, "memory(GiB)": 44.8, "step": 1035, "token_acc": 0.4065573770491803, "train_speed(iter/s)": 1.465889 }, { "epoch": 0.0445567884837839, "grad_norm": 3.4809505939483643, "learning_rate": 9.998040708108953e-05, "loss": 2.7204833984375, "memory(GiB)": 44.8, "step": 1040, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.466553 }, { "epoch": 0.04477100381303286, "grad_norm": 4.293155670166016, "learning_rate": 9.998021824722295e-05, "loss": 2.4542926788330077, "memory(GiB)": 44.8, "step": 1045, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 1.467031 }, { "epoch": 0.044985219142281825, "grad_norm": 3.556633949279785, "learning_rate": 9.998002850791782e-05, "loss": 2.9304813385009765, "memory(GiB)": 44.8, "step": 1050, "token_acc": 0.41754385964912283, "train_speed(iter/s)": 1.46766 }, { "epoch": 0.04519943447153078, "grad_norm": 6.549159526824951, "learning_rate": 9.99798378631776e-05, "loss": 2.503632354736328, "memory(GiB)": 44.8, "step": 1055, "token_acc": 0.4559386973180077, "train_speed(iter/s)": 1.467155 }, { "epoch": 0.04541364980077974, "grad_norm": 4.304167747497559, "learning_rate": 9.997964631300573e-05, "loss": 2.888266372680664, "memory(GiB)": 44.8, "step": 1060, "token_acc": 0.41694915254237286, "train_speed(iter/s)": 1.466603 }, { "epoch": 0.04562786513002871, "grad_norm": 4.314018726348877, "learning_rate": 9.997945385740568e-05, "loss": 2.6884815216064455, "memory(GiB)": 44.8, "step": 1065, "token_acc": 0.4276729559748428, "train_speed(iter/s)": 1.466345 }, { "epoch": 0.045842080459277665, "grad_norm": 3.1511995792388916, "learning_rate": 9.997926049638094e-05, "loss": 2.5820171356201174, "memory(GiB)": 44.8, "step": 1070, "token_acc": 0.5, "train_speed(iter/s)": 1.468005 }, { "epoch": 0.04605629578852663, "grad_norm": 5.352568626403809, "learning_rate": 9.9979066229935e-05, "loss": 2.7415964126586916, "memory(GiB)": 44.8, "step": 1075, "token_acc": 0.4830188679245283, "train_speed(iter/s)": 1.467664 }, { "epoch": 0.04627051111777559, "grad_norm": 2.98688006401062, "learning_rate": 9.997887105807141e-05, "loss": 2.7664222717285156, "memory(GiB)": 44.8, "step": 1080, "token_acc": 0.4382716049382716, "train_speed(iter/s)": 1.466714 }, { "epoch": 0.046484726447024546, "grad_norm": 4.0362114906311035, "learning_rate": 9.997867498079368e-05, "loss": 2.9460811614990234, "memory(GiB)": 44.8, "step": 1085, "token_acc": 0.43018867924528303, "train_speed(iter/s)": 1.466689 }, { "epoch": 0.04669894177627351, "grad_norm": 3.121070623397827, "learning_rate": 9.997847799810537e-05, "loss": 2.4676792144775392, "memory(GiB)": 44.8, "step": 1090, "token_acc": 0.4887459807073955, "train_speed(iter/s)": 1.466383 }, { "epoch": 0.04691315710552247, "grad_norm": 3.5065741539001465, "learning_rate": 9.997828011001005e-05, "loss": 2.639570617675781, "memory(GiB)": 44.8, "step": 1095, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.465902 }, { "epoch": 0.047127372434771435, "grad_norm": 3.773871898651123, "learning_rate": 9.99780813165113e-05, "loss": 2.541779899597168, "memory(GiB)": 44.8, "step": 1100, "token_acc": 0.456973293768546, "train_speed(iter/s)": 1.46657 }, { "epoch": 0.04734158776402039, "grad_norm": 4.274632453918457, "learning_rate": 9.997788161761272e-05, "loss": 2.544404220581055, "memory(GiB)": 44.8, "step": 1105, "token_acc": 0.4872881355932203, "train_speed(iter/s)": 1.466749 }, { "epoch": 0.04755580309326935, "grad_norm": 5.100984573364258, "learning_rate": 9.997768101331793e-05, "loss": 2.5767072677612304, "memory(GiB)": 44.8, "step": 1110, "token_acc": 0.43653250773993807, "train_speed(iter/s)": 1.465494 }, { "epoch": 0.047770018422518316, "grad_norm": 3.7199249267578125, "learning_rate": 9.997747950363056e-05, "loss": 2.7794525146484377, "memory(GiB)": 44.8, "step": 1115, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.466184 }, { "epoch": 0.047984233751767275, "grad_norm": 3.925203323364258, "learning_rate": 9.997727708855429e-05, "loss": 2.4455743789672852, "memory(GiB)": 44.8, "step": 1120, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.466064 }, { "epoch": 0.04819844908101624, "grad_norm": 5.212133407592773, "learning_rate": 9.997707376809273e-05, "loss": 2.6166486740112305, "memory(GiB)": 44.8, "step": 1125, "token_acc": 0.5150375939849624, "train_speed(iter/s)": 1.467475 }, { "epoch": 0.0484126644102652, "grad_norm": 5.15431022644043, "learning_rate": 9.997686954224963e-05, "loss": 2.6525360107421876, "memory(GiB)": 44.8, "step": 1130, "token_acc": 0.4527687296416938, "train_speed(iter/s)": 1.466977 }, { "epoch": 0.04862687973951416, "grad_norm": 3.462268829345703, "learning_rate": 9.997666441102864e-05, "loss": 2.814067077636719, "memory(GiB)": 44.8, "step": 1135, "token_acc": 0.3853211009174312, "train_speed(iter/s)": 1.467258 }, { "epoch": 0.04884109506876312, "grad_norm": 3.796671152114868, "learning_rate": 9.99764583744335e-05, "loss": 2.741273307800293, "memory(GiB)": 44.8, "step": 1140, "token_acc": 0.44, "train_speed(iter/s)": 1.466789 }, { "epoch": 0.04905531039801208, "grad_norm": 4.840392112731934, "learning_rate": 9.997625143246791e-05, "loss": 2.6720537185668944, "memory(GiB)": 44.8, "step": 1145, "token_acc": 0.46774193548387094, "train_speed(iter/s)": 1.466716 }, { "epoch": 0.049269525727261045, "grad_norm": 3.183471202850342, "learning_rate": 9.997604358513566e-05, "loss": 2.748935317993164, "memory(GiB)": 44.8, "step": 1150, "token_acc": 0.4290322580645161, "train_speed(iter/s)": 1.465586 }, { "epoch": 0.04948374105651, "grad_norm": 3.9429311752319336, "learning_rate": 9.997583483244051e-05, "loss": 2.6724550247192385, "memory(GiB)": 44.8, "step": 1155, "token_acc": 0.43661971830985913, "train_speed(iter/s)": 1.465914 }, { "epoch": 0.04969795638575897, "grad_norm": 4.010347366333008, "learning_rate": 9.997562517438621e-05, "loss": 2.78275146484375, "memory(GiB)": 44.8, "step": 1160, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.467317 }, { "epoch": 0.049912171715007926, "grad_norm": 3.6683743000030518, "learning_rate": 9.997541461097659e-05, "loss": 2.859244537353516, "memory(GiB)": 44.8, "step": 1165, "token_acc": 0.39473684210526316, "train_speed(iter/s)": 1.468769 }, { "epoch": 0.050126387044256884, "grad_norm": 3.7796099185943604, "learning_rate": 9.997520314221547e-05, "loss": 2.8719547271728514, "memory(GiB)": 44.8, "step": 1170, "token_acc": 0.42813455657492355, "train_speed(iter/s)": 1.467851 }, { "epoch": 0.05034060237350585, "grad_norm": 3.327528715133667, "learning_rate": 9.997499076810664e-05, "loss": 2.911891555786133, "memory(GiB)": 44.8, "step": 1175, "token_acc": 0.44694533762057875, "train_speed(iter/s)": 1.468474 }, { "epoch": 0.05055481770275481, "grad_norm": 4.597551345825195, "learning_rate": 9.997477748865399e-05, "loss": 2.5749902725219727, "memory(GiB)": 44.8, "step": 1180, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.468247 }, { "epoch": 0.05076903303200377, "grad_norm": 4.473481178283691, "learning_rate": 9.997456330386135e-05, "loss": 2.702045440673828, "memory(GiB)": 44.8, "step": 1185, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.46898 }, { "epoch": 0.05098324836125273, "grad_norm": 3.2365341186523438, "learning_rate": 9.997434821373262e-05, "loss": 2.5826753616333007, "memory(GiB)": 44.8, "step": 1190, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.469398 }, { "epoch": 0.05119746369050169, "grad_norm": 5.502979278564453, "learning_rate": 9.997413221827169e-05, "loss": 2.5995899200439454, "memory(GiB)": 44.8, "step": 1195, "token_acc": 0.48264984227129337, "train_speed(iter/s)": 1.470224 }, { "epoch": 0.051411679019750654, "grad_norm": 5.472434997558594, "learning_rate": 9.997391531748248e-05, "loss": 2.81494026184082, "memory(GiB)": 44.8, "step": 1200, "token_acc": 0.5036764705882353, "train_speed(iter/s)": 1.471308 }, { "epoch": 0.05162589434899961, "grad_norm": 3.0724525451660156, "learning_rate": 9.997369751136891e-05, "loss": 2.6393428802490235, "memory(GiB)": 44.8, "step": 1205, "token_acc": 0.4662576687116564, "train_speed(iter/s)": 1.470597 }, { "epoch": 0.05184010967824858, "grad_norm": 5.035545349121094, "learning_rate": 9.997347879993495e-05, "loss": 2.508587646484375, "memory(GiB)": 44.8, "step": 1210, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.469467 }, { "epoch": 0.052054325007497536, "grad_norm": 3.6228578090667725, "learning_rate": 9.997325918318452e-05, "loss": 2.4395151138305664, "memory(GiB)": 44.8, "step": 1215, "token_acc": 0.4875, "train_speed(iter/s)": 1.468092 }, { "epoch": 0.0522685403367465, "grad_norm": 4.701809406280518, "learning_rate": 9.997303866112163e-05, "loss": 2.589558982849121, "memory(GiB)": 44.8, "step": 1220, "token_acc": 0.4472843450479233, "train_speed(iter/s)": 1.466086 }, { "epoch": 0.05248275566599546, "grad_norm": 4.492598533630371, "learning_rate": 9.997281723375027e-05, "loss": 2.6755788803100584, "memory(GiB)": 44.8, "step": 1225, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.467089 }, { "epoch": 0.05269697099524442, "grad_norm": 3.3833775520324707, "learning_rate": 9.997259490107444e-05, "loss": 2.5095600128173827, "memory(GiB)": 44.8, "step": 1230, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.468978 }, { "epoch": 0.05291118632449338, "grad_norm": 5.671452045440674, "learning_rate": 9.997237166309818e-05, "loss": 2.6694406509399413, "memory(GiB)": 44.8, "step": 1235, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.469868 }, { "epoch": 0.05312540165374234, "grad_norm": 8.992464065551758, "learning_rate": 9.997214751982552e-05, "loss": 2.5071449279785156, "memory(GiB)": 44.8, "step": 1240, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.469788 }, { "epoch": 0.053339616982991306, "grad_norm": 3.487832546234131, "learning_rate": 9.997192247126055e-05, "loss": 2.59873046875, "memory(GiB)": 44.8, "step": 1245, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.469325 }, { "epoch": 0.053553832312240264, "grad_norm": 3.6097612380981445, "learning_rate": 9.99716965174073e-05, "loss": 2.4442211151123048, "memory(GiB)": 44.8, "step": 1250, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.471082 }, { "epoch": 0.05376804764148922, "grad_norm": 4.641023635864258, "learning_rate": 9.997146965826991e-05, "loss": 2.5057844161987304, "memory(GiB)": 44.8, "step": 1255, "token_acc": 0.45348837209302323, "train_speed(iter/s)": 1.471497 }, { "epoch": 0.05398226297073819, "grad_norm": 3.097318172454834, "learning_rate": 9.997124189385246e-05, "loss": 2.6078752517700194, "memory(GiB)": 44.8, "step": 1260, "token_acc": 0.46006389776357826, "train_speed(iter/s)": 1.47263 }, { "epoch": 0.054196478299987146, "grad_norm": 3.847569465637207, "learning_rate": 9.99710132241591e-05, "loss": 2.4449111938476564, "memory(GiB)": 44.8, "step": 1265, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.472272 }, { "epoch": 0.05441069362923611, "grad_norm": 4.654448509216309, "learning_rate": 9.997078364919395e-05, "loss": 2.8197586059570314, "memory(GiB)": 44.8, "step": 1270, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.473333 }, { "epoch": 0.05462490895848507, "grad_norm": 3.8379368782043457, "learning_rate": 9.997055316896118e-05, "loss": 2.7199775695800783, "memory(GiB)": 44.8, "step": 1275, "token_acc": 0.45724907063197023, "train_speed(iter/s)": 1.473807 }, { "epoch": 0.05483912428773403, "grad_norm": 4.283139705657959, "learning_rate": 9.997032178346495e-05, "loss": 2.676108741760254, "memory(GiB)": 44.8, "step": 1280, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.475485 }, { "epoch": 0.05505333961698299, "grad_norm": 4.635933876037598, "learning_rate": 9.997008949270947e-05, "loss": 2.517548179626465, "memory(GiB)": 44.8, "step": 1285, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.476331 }, { "epoch": 0.05526755494623195, "grad_norm": 8.383788108825684, "learning_rate": 9.996985629669894e-05, "loss": 2.239846420288086, "memory(GiB)": 44.8, "step": 1290, "token_acc": 0.48372093023255813, "train_speed(iter/s)": 1.475744 }, { "epoch": 0.055481770275480916, "grad_norm": 4.16169548034668, "learning_rate": 9.996962219543762e-05, "loss": 2.500482177734375, "memory(GiB)": 44.8, "step": 1295, "token_acc": 0.45482866043613707, "train_speed(iter/s)": 1.474096 }, { "epoch": 0.055695985604729874, "grad_norm": 3.961717367172241, "learning_rate": 9.996938718892969e-05, "loss": 2.8550065994262694, "memory(GiB)": 44.8, "step": 1300, "token_acc": 0.423728813559322, "train_speed(iter/s)": 1.473889 }, { "epoch": 0.05591020093397884, "grad_norm": 4.51633358001709, "learning_rate": 9.996915127717944e-05, "loss": 2.781209182739258, "memory(GiB)": 44.8, "step": 1305, "token_acc": 0.37992831541218636, "train_speed(iter/s)": 1.472603 }, { "epoch": 0.0561244162632278, "grad_norm": 2.7895588874816895, "learning_rate": 9.996891446019114e-05, "loss": 2.6287181854248045, "memory(GiB)": 44.8, "step": 1310, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.473199 }, { "epoch": 0.056338631592476755, "grad_norm": 3.305253028869629, "learning_rate": 9.996867673796908e-05, "loss": 2.6148349761962892, "memory(GiB)": 44.8, "step": 1315, "token_acc": 0.4626334519572954, "train_speed(iter/s)": 1.472924 }, { "epoch": 0.05655284692172572, "grad_norm": 3.111687421798706, "learning_rate": 9.996843811051757e-05, "loss": 2.5816940307617187, "memory(GiB)": 44.8, "step": 1320, "token_acc": 0.45741324921135645, "train_speed(iter/s)": 1.47242 }, { "epoch": 0.05676706225097468, "grad_norm": 3.986253499984741, "learning_rate": 9.996819857784093e-05, "loss": 2.53792667388916, "memory(GiB)": 44.8, "step": 1325, "token_acc": 0.47038327526132406, "train_speed(iter/s)": 1.472465 }, { "epoch": 0.056981277580223644, "grad_norm": 3.378695487976074, "learning_rate": 9.996795813994348e-05, "loss": 2.870656967163086, "memory(GiB)": 44.8, "step": 1330, "token_acc": 0.41721854304635764, "train_speed(iter/s)": 1.472482 }, { "epoch": 0.0571954929094726, "grad_norm": 5.106979846954346, "learning_rate": 9.996771679682961e-05, "loss": 2.8496253967285154, "memory(GiB)": 44.8, "step": 1335, "token_acc": 0.41284403669724773, "train_speed(iter/s)": 1.473367 }, { "epoch": 0.05740970823872156, "grad_norm": 5.238814353942871, "learning_rate": 9.996747454850368e-05, "loss": 2.7841476440429687, "memory(GiB)": 44.8, "step": 1340, "token_acc": 0.44571428571428573, "train_speed(iter/s)": 1.473836 }, { "epoch": 0.057623923567970525, "grad_norm": 3.4206931591033936, "learning_rate": 9.996723139497008e-05, "loss": 2.5446672439575195, "memory(GiB)": 44.8, "step": 1345, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.472902 }, { "epoch": 0.057838138897219483, "grad_norm": 3.4484548568725586, "learning_rate": 9.99669873362332e-05, "loss": 2.4125125885009764, "memory(GiB)": 44.8, "step": 1350, "token_acc": 0.4813664596273292, "train_speed(iter/s)": 1.47399 }, { "epoch": 0.05805235422646845, "grad_norm": 4.250040054321289, "learning_rate": 9.996674237229748e-05, "loss": 2.986496162414551, "memory(GiB)": 44.8, "step": 1355, "token_acc": 0.4309623430962343, "train_speed(iter/s)": 1.474815 }, { "epoch": 0.05826656955571741, "grad_norm": 3.5987865924835205, "learning_rate": 9.996649650316734e-05, "loss": 2.768035888671875, "memory(GiB)": 44.8, "step": 1360, "token_acc": 0.44086021505376344, "train_speed(iter/s)": 1.475955 }, { "epoch": 0.058480784884966365, "grad_norm": 4.841388702392578, "learning_rate": 9.996624972884725e-05, "loss": 2.8981708526611327, "memory(GiB)": 44.8, "step": 1365, "token_acc": 0.44368600682593856, "train_speed(iter/s)": 1.477139 }, { "epoch": 0.05869500021421533, "grad_norm": 5.3648295402526855, "learning_rate": 9.996600204934165e-05, "loss": 2.895512008666992, "memory(GiB)": 44.8, "step": 1370, "token_acc": 0.41434262948207173, "train_speed(iter/s)": 1.478515 }, { "epoch": 0.05890921554346429, "grad_norm": 5.131505012512207, "learning_rate": 9.996575346465508e-05, "loss": 2.605813980102539, "memory(GiB)": 44.8, "step": 1375, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.478754 }, { "epoch": 0.059123430872713253, "grad_norm": 2.7781922817230225, "learning_rate": 9.9965503974792e-05, "loss": 2.5423892974853515, "memory(GiB)": 44.8, "step": 1380, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.478682 }, { "epoch": 0.05933764620196221, "grad_norm": 4.505220413208008, "learning_rate": 9.996525357975697e-05, "loss": 2.81160888671875, "memory(GiB)": 44.8, "step": 1385, "token_acc": 0.4107744107744108, "train_speed(iter/s)": 1.47827 }, { "epoch": 0.05955186153121118, "grad_norm": 3.481152296066284, "learning_rate": 9.996500227955447e-05, "loss": 2.6756229400634766, "memory(GiB)": 44.8, "step": 1390, "token_acc": 0.44947735191637633, "train_speed(iter/s)": 1.479092 }, { "epoch": 0.059766076860460135, "grad_norm": 28.509784698486328, "learning_rate": 9.99647500741891e-05, "loss": 2.8012458801269533, "memory(GiB)": 44.8, "step": 1395, "token_acc": 0.4092526690391459, "train_speed(iter/s)": 1.479515 }, { "epoch": 0.05998029218970909, "grad_norm": 4.454799652099609, "learning_rate": 9.99644969636654e-05, "loss": 2.4655921936035154, "memory(GiB)": 44.8, "step": 1400, "token_acc": 0.49603174603174605, "train_speed(iter/s)": 1.479241 }, { "epoch": 0.06019450751895806, "grad_norm": 4.337578773498535, "learning_rate": 9.996424294798796e-05, "loss": 2.85654296875, "memory(GiB)": 44.8, "step": 1405, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.480932 }, { "epoch": 0.06040872284820702, "grad_norm": 4.483986854553223, "learning_rate": 9.99639880271614e-05, "loss": 2.564306640625, "memory(GiB)": 44.8, "step": 1410, "token_acc": 0.5022421524663677, "train_speed(iter/s)": 1.481714 }, { "epoch": 0.06062293817745598, "grad_norm": 3.3506863117218018, "learning_rate": 9.996373220119034e-05, "loss": 2.8136690139770506, "memory(GiB)": 44.8, "step": 1415, "token_acc": 0.4037854889589905, "train_speed(iter/s)": 1.481858 }, { "epoch": 0.06083715350670494, "grad_norm": 3.624328374862671, "learning_rate": 9.996347547007938e-05, "loss": 2.490763473510742, "memory(GiB)": 44.8, "step": 1420, "token_acc": 0.46875, "train_speed(iter/s)": 1.480731 }, { "epoch": 0.0610513688359539, "grad_norm": 3.642944574356079, "learning_rate": 9.99632178338332e-05, "loss": 2.7100624084472655, "memory(GiB)": 44.8, "step": 1425, "token_acc": 0.45535714285714285, "train_speed(iter/s)": 1.482317 }, { "epoch": 0.06126558416520286, "grad_norm": 4.454731464385986, "learning_rate": 9.996295929245647e-05, "loss": 2.6536014556884764, "memory(GiB)": 44.8, "step": 1430, "token_acc": 0.4139344262295082, "train_speed(iter/s)": 1.483121 }, { "epoch": 0.06147979949445182, "grad_norm": 4.322632312774658, "learning_rate": 9.996269984595386e-05, "loss": 2.7533239364624023, "memory(GiB)": 44.8, "step": 1435, "token_acc": 0.43776824034334766, "train_speed(iter/s)": 1.483148 }, { "epoch": 0.06169401482370079, "grad_norm": 3.861013412475586, "learning_rate": 9.996243949433006e-05, "loss": 2.5387073516845704, "memory(GiB)": 44.8, "step": 1440, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.483337 }, { "epoch": 0.061908230152949745, "grad_norm": 5.236702919006348, "learning_rate": 9.996217823758981e-05, "loss": 2.83597469329834, "memory(GiB)": 44.8, "step": 1445, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.482374 }, { "epoch": 0.0621224454821987, "grad_norm": 2.9103434085845947, "learning_rate": 9.996191607573782e-05, "loss": 2.809426689147949, "memory(GiB)": 44.8, "step": 1450, "token_acc": 0.4393939393939394, "train_speed(iter/s)": 1.481722 }, { "epoch": 0.06233666081144767, "grad_norm": 4.294338703155518, "learning_rate": 9.996165300877886e-05, "loss": 2.5830265045166017, "memory(GiB)": 44.8, "step": 1455, "token_acc": 0.43617021276595747, "train_speed(iter/s)": 1.481991 }, { "epoch": 0.06255087614069663, "grad_norm": 5.332017421722412, "learning_rate": 9.99613890367177e-05, "loss": 2.322412109375, "memory(GiB)": 44.8, "step": 1460, "token_acc": 0.5394190871369294, "train_speed(iter/s)": 1.480375 }, { "epoch": 0.06276509146994559, "grad_norm": 3.3627238273620605, "learning_rate": 9.99611241595591e-05, "loss": 2.8908124923706056, "memory(GiB)": 44.8, "step": 1465, "token_acc": 0.4270833333333333, "train_speed(iter/s)": 1.481475 }, { "epoch": 0.06297930679919456, "grad_norm": 6.100260257720947, "learning_rate": 9.996085837730787e-05, "loss": 2.6980457305908203, "memory(GiB)": 44.8, "step": 1470, "token_acc": 0.4414715719063545, "train_speed(iter/s)": 1.48171 }, { "epoch": 0.06319352212844351, "grad_norm": 3.991400957107544, "learning_rate": 9.996059168996883e-05, "loss": 2.712911033630371, "memory(GiB)": 44.8, "step": 1475, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.483066 }, { "epoch": 0.06340773745769247, "grad_norm": 4.39341926574707, "learning_rate": 9.996032409754679e-05, "loss": 2.7493879318237306, "memory(GiB)": 44.8, "step": 1480, "token_acc": 0.4074074074074074, "train_speed(iter/s)": 1.483578 }, { "epoch": 0.06362195278694144, "grad_norm": 4.587990760803223, "learning_rate": 9.996005560004662e-05, "loss": 2.8541988372802733, "memory(GiB)": 44.8, "step": 1485, "token_acc": 0.445578231292517, "train_speed(iter/s)": 1.484213 }, { "epoch": 0.06383616811619039, "grad_norm": 3.9578187465667725, "learning_rate": 9.995978619747317e-05, "loss": 2.6278173446655275, "memory(GiB)": 44.8, "step": 1490, "token_acc": 0.4, "train_speed(iter/s)": 1.4848 }, { "epoch": 0.06405038344543935, "grad_norm": 6.137836933135986, "learning_rate": 9.995951588983135e-05, "loss": 2.4550449371337892, "memory(GiB)": 44.8, "step": 1495, "token_acc": 0.5147679324894515, "train_speed(iter/s)": 1.484099 }, { "epoch": 0.06426459877468832, "grad_norm": 4.868353366851807, "learning_rate": 9.995924467712601e-05, "loss": 2.4769081115722655, "memory(GiB)": 44.8, "step": 1500, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.48376 }, { "epoch": 0.06426459877468832, "eval_loss": 2.4918837547302246, "eval_runtime": 13.488, "eval_samples_per_second": 7.414, "eval_steps_per_second": 7.414, "eval_token_acc": 0.4443155452436195, "step": 1500 }, { "epoch": 0.06447881410393727, "grad_norm": 3.260181188583374, "learning_rate": 9.995897255936209e-05, "loss": 2.8356361389160156, "memory(GiB)": 44.8, "step": 1505, "token_acc": 0.43099068585944117, "train_speed(iter/s)": 1.464067 }, { "epoch": 0.06469302943318624, "grad_norm": 4.076349258422852, "learning_rate": 9.995869953654452e-05, "loss": 2.3611581802368162, "memory(GiB)": 44.8, "step": 1510, "token_acc": 0.5037878787878788, "train_speed(iter/s)": 1.4648 }, { "epoch": 0.0649072447624352, "grad_norm": 4.669004440307617, "learning_rate": 9.995842560867826e-05, "loss": 2.7666147232055662, "memory(GiB)": 44.8, "step": 1515, "token_acc": 0.45907473309608543, "train_speed(iter/s)": 1.465446 }, { "epoch": 0.06512146009168417, "grad_norm": 4.889930725097656, "learning_rate": 9.995815077576822e-05, "loss": 2.6024490356445313, "memory(GiB)": 44.8, "step": 1520, "token_acc": 0.46706586826347307, "train_speed(iter/s)": 1.465824 }, { "epoch": 0.06533567542093312, "grad_norm": 4.200222492218018, "learning_rate": 9.995787503781944e-05, "loss": 2.8650585174560548, "memory(GiB)": 44.8, "step": 1525, "token_acc": 0.4419475655430712, "train_speed(iter/s)": 1.466276 }, { "epoch": 0.06554989075018208, "grad_norm": 3.2476115226745605, "learning_rate": 9.995759839483687e-05, "loss": 2.609589767456055, "memory(GiB)": 44.8, "step": 1530, "token_acc": 0.4434250764525994, "train_speed(iter/s)": 1.467032 }, { "epoch": 0.06576410607943105, "grad_norm": 5.784073352813721, "learning_rate": 9.995732084682557e-05, "loss": 2.655089569091797, "memory(GiB)": 44.8, "step": 1535, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.4679 }, { "epoch": 0.06597832140868, "grad_norm": 4.483177661895752, "learning_rate": 9.995704239379051e-05, "loss": 2.618147850036621, "memory(GiB)": 44.8, "step": 1540, "token_acc": 0.4559748427672956, "train_speed(iter/s)": 1.468061 }, { "epoch": 0.06619253673792896, "grad_norm": 3.7003676891326904, "learning_rate": 9.995676303573678e-05, "loss": 2.632536697387695, "memory(GiB)": 44.8, "step": 1545, "token_acc": 0.5162337662337663, "train_speed(iter/s)": 1.469355 }, { "epoch": 0.06640675206717793, "grad_norm": 5.321671485900879, "learning_rate": 9.995648277266942e-05, "loss": 2.593501663208008, "memory(GiB)": 44.8, "step": 1550, "token_acc": 0.4828897338403042, "train_speed(iter/s)": 1.469549 }, { "epoch": 0.0666209673964269, "grad_norm": 4.74328088760376, "learning_rate": 9.995620160459351e-05, "loss": 2.894466209411621, "memory(GiB)": 44.8, "step": 1555, "token_acc": 0.41901408450704225, "train_speed(iter/s)": 1.469672 }, { "epoch": 0.06683518272567585, "grad_norm": 3.247278928756714, "learning_rate": 9.995591953151415e-05, "loss": 2.749079704284668, "memory(GiB)": 44.8, "step": 1560, "token_acc": 0.44904458598726116, "train_speed(iter/s)": 1.469661 }, { "epoch": 0.06704939805492481, "grad_norm": 4.189208030700684, "learning_rate": 9.995563655343645e-05, "loss": 2.3000814437866213, "memory(GiB)": 44.8, "step": 1565, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.468532 }, { "epoch": 0.06726361338417378, "grad_norm": 3.8709604740142822, "learning_rate": 9.995535267036551e-05, "loss": 2.960202217102051, "memory(GiB)": 44.8, "step": 1570, "token_acc": 0.4084967320261438, "train_speed(iter/s)": 1.468213 }, { "epoch": 0.06747782871342273, "grad_norm": 3.0151495933532715, "learning_rate": 9.995506788230652e-05, "loss": 2.9425230026245117, "memory(GiB)": 44.8, "step": 1575, "token_acc": 0.421875, "train_speed(iter/s)": 1.467536 }, { "epoch": 0.06769204404267169, "grad_norm": 4.0558857917785645, "learning_rate": 9.99547821892646e-05, "loss": 2.702001762390137, "memory(GiB)": 44.8, "step": 1580, "token_acc": 0.4326923076923077, "train_speed(iter/s)": 1.467546 }, { "epoch": 0.06790625937192066, "grad_norm": 4.547561168670654, "learning_rate": 9.995449559124495e-05, "loss": 2.6235677719116213, "memory(GiB)": 44.8, "step": 1585, "token_acc": 0.4921259842519685, "train_speed(iter/s)": 1.468384 }, { "epoch": 0.06812047470116961, "grad_norm": 3.4789013862609863, "learning_rate": 9.995420808825274e-05, "loss": 2.6372425079345705, "memory(GiB)": 44.8, "step": 1590, "token_acc": 0.4419475655430712, "train_speed(iter/s)": 1.468392 }, { "epoch": 0.06833469003041857, "grad_norm": 3.6526811122894287, "learning_rate": 9.995391968029318e-05, "loss": 2.6607057571411135, "memory(GiB)": 44.8, "step": 1595, "token_acc": 0.44765342960288806, "train_speed(iter/s)": 1.465144 }, { "epoch": 0.06854890535966754, "grad_norm": 3.4350223541259766, "learning_rate": 9.995363036737152e-05, "loss": 2.662113380432129, "memory(GiB)": 44.8, "step": 1600, "token_acc": 0.4306569343065693, "train_speed(iter/s)": 1.465347 }, { "epoch": 0.0687631206889165, "grad_norm": 10.293490409851074, "learning_rate": 9.995334014949297e-05, "loss": 2.257463073730469, "memory(GiB)": 44.8, "step": 1605, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.464816 }, { "epoch": 0.06897733601816546, "grad_norm": 3.9372994899749756, "learning_rate": 9.995304902666283e-05, "loss": 2.976861763000488, "memory(GiB)": 44.8, "step": 1610, "token_acc": 0.4072948328267477, "train_speed(iter/s)": 1.464416 }, { "epoch": 0.06919155134741442, "grad_norm": 5.508297920227051, "learning_rate": 9.995275699888633e-05, "loss": 2.9259031295776365, "memory(GiB)": 44.8, "step": 1615, "token_acc": 0.42028985507246375, "train_speed(iter/s)": 1.464444 }, { "epoch": 0.06940576667666339, "grad_norm": 5.092051982879639, "learning_rate": 9.995246406616879e-05, "loss": 2.6340478897094726, "memory(GiB)": 44.8, "step": 1620, "token_acc": 0.49096385542168675, "train_speed(iter/s)": 1.465172 }, { "epoch": 0.06961998200591234, "grad_norm": 4.231055736541748, "learning_rate": 9.995217022851548e-05, "loss": 2.37537841796875, "memory(GiB)": 44.8, "step": 1625, "token_acc": 0.5107296137339056, "train_speed(iter/s)": 1.464972 }, { "epoch": 0.0698341973351613, "grad_norm": 4.58602237701416, "learning_rate": 9.995187548593176e-05, "loss": 2.751350975036621, "memory(GiB)": 44.8, "step": 1630, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.465782 }, { "epoch": 0.07004841266441027, "grad_norm": 3.5853564739227295, "learning_rate": 9.995157983842297e-05, "loss": 2.858659362792969, "memory(GiB)": 44.8, "step": 1635, "token_acc": 0.3962765957446808, "train_speed(iter/s)": 1.46704 }, { "epoch": 0.07026262799365923, "grad_norm": 3.7685813903808594, "learning_rate": 9.995128328599444e-05, "loss": 2.536533546447754, "memory(GiB)": 44.8, "step": 1640, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.467404 }, { "epoch": 0.07047684332290818, "grad_norm": 2.9384233951568604, "learning_rate": 9.995098582865157e-05, "loss": 2.468268394470215, "memory(GiB)": 44.8, "step": 1645, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.467752 }, { "epoch": 0.07069105865215715, "grad_norm": 6.0222039222717285, "learning_rate": 9.99506874663997e-05, "loss": 2.4725214004516602, "memory(GiB)": 44.8, "step": 1650, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.467767 }, { "epoch": 0.07090527398140611, "grad_norm": 3.1422464847564697, "learning_rate": 9.995038819924429e-05, "loss": 2.6452556610107423, "memory(GiB)": 44.8, "step": 1655, "token_acc": 0.46920821114369504, "train_speed(iter/s)": 1.468471 }, { "epoch": 0.07111948931065507, "grad_norm": 4.771244525909424, "learning_rate": 9.995008802719075e-05, "loss": 2.6549068450927735, "memory(GiB)": 44.8, "step": 1660, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.469027 }, { "epoch": 0.07133370463990403, "grad_norm": 3.4673192501068115, "learning_rate": 9.99497869502445e-05, "loss": 2.8690052032470703, "memory(GiB)": 44.8, "step": 1665, "token_acc": 0.3874538745387454, "train_speed(iter/s)": 1.469623 }, { "epoch": 0.071547919969153, "grad_norm": 6.550461292266846, "learning_rate": 9.9949484968411e-05, "loss": 2.624686431884766, "memory(GiB)": 44.8, "step": 1670, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.469485 }, { "epoch": 0.07176213529840195, "grad_norm": 4.609001159667969, "learning_rate": 9.994918208169572e-05, "loss": 2.849748229980469, "memory(GiB)": 44.8, "step": 1675, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.46835 }, { "epoch": 0.07197635062765091, "grad_norm": 3.287458896636963, "learning_rate": 9.994887829010416e-05, "loss": 2.5628742218017577, "memory(GiB)": 44.8, "step": 1680, "token_acc": 0.46124031007751937, "train_speed(iter/s)": 1.468356 }, { "epoch": 0.07219056595689988, "grad_norm": 4.0205078125, "learning_rate": 9.994857359364181e-05, "loss": 2.4077875137329103, "memory(GiB)": 44.8, "step": 1685, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.466875 }, { "epoch": 0.07240478128614884, "grad_norm": 4.003814697265625, "learning_rate": 9.99482679923142e-05, "loss": 2.4937423706054687, "memory(GiB)": 44.8, "step": 1690, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.467723 }, { "epoch": 0.0726189966153978, "grad_norm": 6.048070430755615, "learning_rate": 9.994796148612684e-05, "loss": 2.6457901000976562, "memory(GiB)": 44.8, "step": 1695, "token_acc": 0.48253968253968255, "train_speed(iter/s)": 1.467999 }, { "epoch": 0.07283321194464676, "grad_norm": 3.582834482192993, "learning_rate": 9.994765407508532e-05, "loss": 2.455926704406738, "memory(GiB)": 44.8, "step": 1700, "token_acc": 0.45525291828793774, "train_speed(iter/s)": 1.467177 }, { "epoch": 0.07304742727389572, "grad_norm": 4.16580867767334, "learning_rate": 9.99473457591952e-05, "loss": 2.518654441833496, "memory(GiB)": 44.8, "step": 1705, "token_acc": 0.4222873900293255, "train_speed(iter/s)": 1.466489 }, { "epoch": 0.07326164260314467, "grad_norm": 3.9179389476776123, "learning_rate": 9.994703653846203e-05, "loss": 2.7111934661865233, "memory(GiB)": 44.8, "step": 1710, "token_acc": 0.44370860927152317, "train_speed(iter/s)": 1.467215 }, { "epoch": 0.07347585793239364, "grad_norm": 3.6408209800720215, "learning_rate": 9.994672641289146e-05, "loss": 3.0178693771362304, "memory(GiB)": 44.8, "step": 1715, "token_acc": 0.4272151898734177, "train_speed(iter/s)": 1.46759 }, { "epoch": 0.0736900732616426, "grad_norm": 3.802363634109497, "learning_rate": 9.994641538248907e-05, "loss": 2.4939434051513674, "memory(GiB)": 44.8, "step": 1720, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 1.467842 }, { "epoch": 0.07390428859089157, "grad_norm": 3.98805570602417, "learning_rate": 9.994610344726055e-05, "loss": 2.795812225341797, "memory(GiB)": 44.8, "step": 1725, "token_acc": 0.4476987447698745, "train_speed(iter/s)": 1.467882 }, { "epoch": 0.07411850392014052, "grad_norm": 4.293564796447754, "learning_rate": 9.994579060721147e-05, "loss": 2.7436058044433596, "memory(GiB)": 44.8, "step": 1730, "token_acc": 0.44857142857142857, "train_speed(iter/s)": 1.467884 }, { "epoch": 0.07433271924938949, "grad_norm": 4.725361347198486, "learning_rate": 9.994547686234755e-05, "loss": 2.7824867248535154, "memory(GiB)": 44.8, "step": 1735, "token_acc": 0.422680412371134, "train_speed(iter/s)": 1.468395 }, { "epoch": 0.07454693457863845, "grad_norm": 3.632896661758423, "learning_rate": 9.994516221267446e-05, "loss": 2.6807174682617188, "memory(GiB)": 44.8, "step": 1740, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 1.468945 }, { "epoch": 0.0747611499078874, "grad_norm": 5.27315616607666, "learning_rate": 9.994484665819793e-05, "loss": 2.6903053283691407, "memory(GiB)": 44.8, "step": 1745, "token_acc": 0.44404332129963897, "train_speed(iter/s)": 1.468373 }, { "epoch": 0.07497536523713637, "grad_norm": 4.695183277130127, "learning_rate": 9.994453019892364e-05, "loss": 2.579717445373535, "memory(GiB)": 44.8, "step": 1750, "token_acc": 0.475, "train_speed(iter/s)": 1.469647 }, { "epoch": 0.07518958056638533, "grad_norm": 3.082949161529541, "learning_rate": 9.994421283485733e-05, "loss": 2.7353631973266603, "memory(GiB)": 44.8, "step": 1755, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.470082 }, { "epoch": 0.0754037958956343, "grad_norm": 4.967978000640869, "learning_rate": 9.994389456600474e-05, "loss": 2.6519132614135743, "memory(GiB)": 44.8, "step": 1760, "token_acc": 0.44761904761904764, "train_speed(iter/s)": 1.46967 }, { "epoch": 0.07561801122488325, "grad_norm": 4.676754474639893, "learning_rate": 9.994357539237166e-05, "loss": 2.588191604614258, "memory(GiB)": 44.8, "step": 1765, "token_acc": 0.46494464944649444, "train_speed(iter/s)": 1.46891 }, { "epoch": 0.07583222655413221, "grad_norm": 3.819511651992798, "learning_rate": 9.994325531396387e-05, "loss": 2.554705047607422, "memory(GiB)": 44.8, "step": 1770, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.468704 }, { "epoch": 0.07604644188338118, "grad_norm": 4.374409198760986, "learning_rate": 9.994293433078714e-05, "loss": 2.24488525390625, "memory(GiB)": 44.8, "step": 1775, "token_acc": 0.4554794520547945, "train_speed(iter/s)": 1.468235 }, { "epoch": 0.07626065721263013, "grad_norm": 3.7358691692352295, "learning_rate": 9.994261244284733e-05, "loss": 2.6257869720458986, "memory(GiB)": 44.8, "step": 1780, "token_acc": 0.45517241379310347, "train_speed(iter/s)": 1.468495 }, { "epoch": 0.0764748725418791, "grad_norm": 3.8033885955810547, "learning_rate": 9.994228965015022e-05, "loss": 2.5802871704101564, "memory(GiB)": 44.8, "step": 1785, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.467199 }, { "epoch": 0.07668908787112806, "grad_norm": 5.154775619506836, "learning_rate": 9.99419659527017e-05, "loss": 2.960002899169922, "memory(GiB)": 44.8, "step": 1790, "token_acc": 0.43416370106761565, "train_speed(iter/s)": 1.46801 }, { "epoch": 0.07690330320037701, "grad_norm": 2.979706048965454, "learning_rate": 9.994164135050761e-05, "loss": 2.6417831420898437, "memory(GiB)": 44.8, "step": 1795, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.469445 }, { "epoch": 0.07711751852962598, "grad_norm": 5.776839256286621, "learning_rate": 9.994131584357384e-05, "loss": 2.742685317993164, "memory(GiB)": 44.8, "step": 1800, "token_acc": 0.461038961038961, "train_speed(iter/s)": 1.469615 }, { "epoch": 0.07733173385887494, "grad_norm": 3.6369216442108154, "learning_rate": 9.994098943190629e-05, "loss": 2.7681421279907226, "memory(GiB)": 44.8, "step": 1805, "token_acc": 0.437125748502994, "train_speed(iter/s)": 1.469898 }, { "epoch": 0.07754594918812391, "grad_norm": 3.295574903488159, "learning_rate": 9.994066211551085e-05, "loss": 2.7584259033203127, "memory(GiB)": 44.8, "step": 1810, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.470905 }, { "epoch": 0.07776016451737286, "grad_norm": 4.260218143463135, "learning_rate": 9.994033389439348e-05, "loss": 2.7495498657226562, "memory(GiB)": 44.8, "step": 1815, "token_acc": 0.4413793103448276, "train_speed(iter/s)": 1.470433 }, { "epoch": 0.07797437984662182, "grad_norm": 4.7456374168396, "learning_rate": 9.994000476856011e-05, "loss": 2.791260528564453, "memory(GiB)": 44.8, "step": 1820, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.470185 }, { "epoch": 0.07818859517587079, "grad_norm": 3.4733006954193115, "learning_rate": 9.993967473801671e-05, "loss": 2.4547632217407225, "memory(GiB)": 44.8, "step": 1825, "token_acc": 0.49429657794676807, "train_speed(iter/s)": 1.47054 }, { "epoch": 0.07840281050511974, "grad_norm": 4.259070873260498, "learning_rate": 9.993934380276926e-05, "loss": 2.639309310913086, "memory(GiB)": 44.8, "step": 1830, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.471465 }, { "epoch": 0.0786170258343687, "grad_norm": 3.966062545776367, "learning_rate": 9.993901196282374e-05, "loss": 2.3673242568969726, "memory(GiB)": 44.8, "step": 1835, "token_acc": 0.5023474178403756, "train_speed(iter/s)": 1.47104 }, { "epoch": 0.07883124116361767, "grad_norm": 5.579440116882324, "learning_rate": 9.993867921818619e-05, "loss": 2.858770179748535, "memory(GiB)": 44.8, "step": 1840, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.470998 }, { "epoch": 0.07904545649286664, "grad_norm": 3.153552532196045, "learning_rate": 9.993834556886259e-05, "loss": 2.3209178924560545, "memory(GiB)": 44.8, "step": 1845, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.47136 }, { "epoch": 0.07925967182211559, "grad_norm": 2.9614076614379883, "learning_rate": 9.993801101485903e-05, "loss": 2.7119884490966797, "memory(GiB)": 44.8, "step": 1850, "token_acc": 0.44984802431610943, "train_speed(iter/s)": 1.471664 }, { "epoch": 0.07947388715136455, "grad_norm": 4.343069553375244, "learning_rate": 9.993767555618157e-05, "loss": 2.759410858154297, "memory(GiB)": 44.8, "step": 1855, "token_acc": 0.4502923976608187, "train_speed(iter/s)": 1.471036 }, { "epoch": 0.07968810248061352, "grad_norm": 3.1520256996154785, "learning_rate": 9.993733919283624e-05, "loss": 3.0197704315185545, "memory(GiB)": 44.8, "step": 1860, "token_acc": 0.4014336917562724, "train_speed(iter/s)": 1.471265 }, { "epoch": 0.07990231780986247, "grad_norm": 3.6374616622924805, "learning_rate": 9.993700192482918e-05, "loss": 2.8920530319213866, "memory(GiB)": 44.8, "step": 1865, "token_acc": 0.4244791666666667, "train_speed(iter/s)": 1.471729 }, { "epoch": 0.08011653313911143, "grad_norm": 2.874727964401245, "learning_rate": 9.99366637521665e-05, "loss": 2.738356018066406, "memory(GiB)": 44.8, "step": 1870, "token_acc": 0.4391691394658754, "train_speed(iter/s)": 1.472679 }, { "epoch": 0.0803307484683604, "grad_norm": 2.983149290084839, "learning_rate": 9.99363246748543e-05, "loss": 2.816168785095215, "memory(GiB)": 44.8, "step": 1875, "token_acc": 0.4426229508196721, "train_speed(iter/s)": 1.472957 }, { "epoch": 0.08054496379760935, "grad_norm": 2.8804805278778076, "learning_rate": 9.993598469289874e-05, "loss": 2.5867034912109377, "memory(GiB)": 44.8, "step": 1880, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.47399 }, { "epoch": 0.08075917912685832, "grad_norm": 3.374390125274658, "learning_rate": 9.993564380630595e-05, "loss": 2.8110849380493166, "memory(GiB)": 44.8, "step": 1885, "token_acc": 0.45110410094637227, "train_speed(iter/s)": 1.474029 }, { "epoch": 0.08097339445610728, "grad_norm": 3.384381055831909, "learning_rate": 9.993530201508216e-05, "loss": 2.699570083618164, "memory(GiB)": 44.8, "step": 1890, "token_acc": 0.48125, "train_speed(iter/s)": 1.473377 }, { "epoch": 0.08118760978535625, "grad_norm": 6.725127696990967, "learning_rate": 9.993495931923352e-05, "loss": 2.5107748031616213, "memory(GiB)": 44.8, "step": 1895, "token_acc": 0.46037735849056605, "train_speed(iter/s)": 1.472904 }, { "epoch": 0.0814018251146052, "grad_norm": 3.8275153636932373, "learning_rate": 9.993461571876624e-05, "loss": 2.467954635620117, "memory(GiB)": 44.8, "step": 1900, "token_acc": 0.44594594594594594, "train_speed(iter/s)": 1.472298 }, { "epoch": 0.08161604044385416, "grad_norm": 4.472103118896484, "learning_rate": 9.993427121368656e-05, "loss": 2.788699913024902, "memory(GiB)": 44.8, "step": 1905, "token_acc": 0.4506578947368421, "train_speed(iter/s)": 1.471735 }, { "epoch": 0.08183025577310313, "grad_norm": 4.863548278808594, "learning_rate": 9.993392580400072e-05, "loss": 2.5571044921875, "memory(GiB)": 44.8, "step": 1910, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.471922 }, { "epoch": 0.08204447110235208, "grad_norm": 5.535677909851074, "learning_rate": 9.993357948971496e-05, "loss": 2.714681053161621, "memory(GiB)": 44.8, "step": 1915, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.472022 }, { "epoch": 0.08225868643160104, "grad_norm": 3.3805017471313477, "learning_rate": 9.993323227083557e-05, "loss": 2.60046443939209, "memory(GiB)": 44.8, "step": 1920, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.473066 }, { "epoch": 0.08247290176085001, "grad_norm": 3.1437110900878906, "learning_rate": 9.993288414736885e-05, "loss": 2.686944770812988, "memory(GiB)": 44.8, "step": 1925, "token_acc": 0.48742138364779874, "train_speed(iter/s)": 1.472615 }, { "epoch": 0.08268711709009897, "grad_norm": 6.048345565795898, "learning_rate": 9.993253511932108e-05, "loss": 2.567666244506836, "memory(GiB)": 44.8, "step": 1930, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.473231 }, { "epoch": 0.08290133241934793, "grad_norm": 4.505509853363037, "learning_rate": 9.99321851866986e-05, "loss": 2.66304931640625, "memory(GiB)": 44.8, "step": 1935, "token_acc": 0.4354243542435424, "train_speed(iter/s)": 1.473824 }, { "epoch": 0.08311554774859689, "grad_norm": 3.6984739303588867, "learning_rate": 9.993183434950774e-05, "loss": 2.5534067153930664, "memory(GiB)": 44.8, "step": 1940, "token_acc": 0.4983922829581994, "train_speed(iter/s)": 1.473403 }, { "epoch": 0.08332976307784586, "grad_norm": 3.472925901412964, "learning_rate": 9.993148260775487e-05, "loss": 2.303940200805664, "memory(GiB)": 44.8, "step": 1945, "token_acc": 0.516728624535316, "train_speed(iter/s)": 1.473391 }, { "epoch": 0.08354397840709481, "grad_norm": 3.518807888031006, "learning_rate": 9.993112996144634e-05, "loss": 2.582549285888672, "memory(GiB)": 44.8, "step": 1950, "token_acc": 0.47079037800687284, "train_speed(iter/s)": 1.473647 }, { "epoch": 0.08375819373634377, "grad_norm": 2.8743302822113037, "learning_rate": 9.993077641058856e-05, "loss": 2.4542316436767577, "memory(GiB)": 44.8, "step": 1955, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.473027 }, { "epoch": 0.08397240906559274, "grad_norm": 5.637447357177734, "learning_rate": 9.993042195518793e-05, "loss": 2.5404361724853515, "memory(GiB)": 44.8, "step": 1960, "token_acc": 0.45774647887323944, "train_speed(iter/s)": 1.473449 }, { "epoch": 0.08418662439484169, "grad_norm": 3.8969013690948486, "learning_rate": 9.993006659525087e-05, "loss": 2.9124237060546876, "memory(GiB)": 44.8, "step": 1965, "token_acc": 0.4014336917562724, "train_speed(iter/s)": 1.473957 }, { "epoch": 0.08440083972409065, "grad_norm": 4.842186450958252, "learning_rate": 9.99297103307838e-05, "loss": 2.7313570022583007, "memory(GiB)": 44.8, "step": 1970, "token_acc": 0.46175637393767704, "train_speed(iter/s)": 1.47275 }, { "epoch": 0.08461505505333962, "grad_norm": 3.2330164909362793, "learning_rate": 9.99293531617932e-05, "loss": 2.4571990966796875, "memory(GiB)": 44.8, "step": 1975, "token_acc": 0.4872611464968153, "train_speed(iter/s)": 1.47132 }, { "epoch": 0.08482927038258858, "grad_norm": 3.849078416824341, "learning_rate": 9.992899508828553e-05, "loss": 2.63256721496582, "memory(GiB)": 44.8, "step": 1980, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.471193 }, { "epoch": 0.08504348571183754, "grad_norm": 5.2319183349609375, "learning_rate": 9.992863611026725e-05, "loss": 2.579254722595215, "memory(GiB)": 44.8, "step": 1985, "token_acc": 0.45084745762711864, "train_speed(iter/s)": 1.470934 }, { "epoch": 0.0852577010410865, "grad_norm": 4.7803120613098145, "learning_rate": 9.992827622774494e-05, "loss": 2.7317478179931642, "memory(GiB)": 44.8, "step": 1990, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.471228 }, { "epoch": 0.08547191637033547, "grad_norm": 3.165527105331421, "learning_rate": 9.992791544072504e-05, "loss": 2.8859058380126954, "memory(GiB)": 44.8, "step": 1995, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.471087 }, { "epoch": 0.08568613169958442, "grad_norm": 4.092401027679443, "learning_rate": 9.992755374921412e-05, "loss": 2.788203811645508, "memory(GiB)": 44.8, "step": 2000, "token_acc": 0.4258064516129032, "train_speed(iter/s)": 1.471473 }, { "epoch": 0.08568613169958442, "eval_loss": 2.2102930545806885, "eval_runtime": 14.1978, "eval_samples_per_second": 7.043, "eval_steps_per_second": 7.043, "eval_token_acc": 0.4676753782668501, "step": 2000 }, { "epoch": 0.08590034702883338, "grad_norm": 5.016899585723877, "learning_rate": 9.992719115321872e-05, "loss": 2.5726345062255858, "memory(GiB)": 44.8, "step": 2005, "token_acc": 0.4597918637653737, "train_speed(iter/s)": 1.455839 }, { "epoch": 0.08611456235808235, "grad_norm": 3.140343427658081, "learning_rate": 9.992682765274543e-05, "loss": 3.0107297897338867, "memory(GiB)": 44.8, "step": 2010, "token_acc": 0.4467455621301775, "train_speed(iter/s)": 1.455615 }, { "epoch": 0.08632877768733131, "grad_norm": 5.080636024475098, "learning_rate": 9.992646324780082e-05, "loss": 2.307963752746582, "memory(GiB)": 44.8, "step": 2015, "token_acc": 0.5245283018867924, "train_speed(iter/s)": 1.455992 }, { "epoch": 0.08654299301658026, "grad_norm": 3.719008684158325, "learning_rate": 9.992609793839148e-05, "loss": 2.5544200897216798, "memory(GiB)": 44.8, "step": 2020, "token_acc": 0.458955223880597, "train_speed(iter/s)": 1.455796 }, { "epoch": 0.08675720834582923, "grad_norm": 4.171456813812256, "learning_rate": 9.992573172452406e-05, "loss": 2.617792510986328, "memory(GiB)": 44.8, "step": 2025, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.456087 }, { "epoch": 0.0869714236750782, "grad_norm": 4.259059429168701, "learning_rate": 9.992536460620516e-05, "loss": 2.66180477142334, "memory(GiB)": 44.8, "step": 2030, "token_acc": 0.4826254826254826, "train_speed(iter/s)": 1.45579 }, { "epoch": 0.08718563900432715, "grad_norm": 3.160526990890503, "learning_rate": 9.992499658344145e-05, "loss": 2.69201545715332, "memory(GiB)": 44.8, "step": 2035, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.456549 }, { "epoch": 0.08739985433357611, "grad_norm": 4.474733352661133, "learning_rate": 9.99246276562396e-05, "loss": 2.8203947067260744, "memory(GiB)": 44.8, "step": 2040, "token_acc": 0.40217391304347827, "train_speed(iter/s)": 1.457013 }, { "epoch": 0.08761406966282508, "grad_norm": 3.573944330215454, "learning_rate": 9.992425782460628e-05, "loss": 2.5302507400512697, "memory(GiB)": 44.8, "step": 2045, "token_acc": 0.43666666666666665, "train_speed(iter/s)": 1.457445 }, { "epoch": 0.08782828499207403, "grad_norm": 5.393106460571289, "learning_rate": 9.99238870885482e-05, "loss": 2.9964015960693358, "memory(GiB)": 44.8, "step": 2050, "token_acc": 0.4199288256227758, "train_speed(iter/s)": 1.458832 }, { "epoch": 0.08804250032132299, "grad_norm": 2.755122423171997, "learning_rate": 9.992351544807208e-05, "loss": 2.91259708404541, "memory(GiB)": 44.8, "step": 2055, "token_acc": 0.4180064308681672, "train_speed(iter/s)": 1.459263 }, { "epoch": 0.08825671565057196, "grad_norm": 3.6033883094787598, "learning_rate": 9.992314290318465e-05, "loss": 2.6889392852783205, "memory(GiB)": 44.8, "step": 2060, "token_acc": 0.41776315789473684, "train_speed(iter/s)": 1.459457 }, { "epoch": 0.08847093097982092, "grad_norm": 3.053887128829956, "learning_rate": 9.992276945389263e-05, "loss": 2.7946678161621095, "memory(GiB)": 44.8, "step": 2065, "token_acc": 0.4340175953079179, "train_speed(iter/s)": 1.459748 }, { "epoch": 0.08868514630906987, "grad_norm": 6.401619911193848, "learning_rate": 9.992239510020282e-05, "loss": 2.6602827072143556, "memory(GiB)": 44.8, "step": 2070, "token_acc": 0.45038167938931295, "train_speed(iter/s)": 1.4601 }, { "epoch": 0.08889936163831884, "grad_norm": 4.626623630523682, "learning_rate": 9.9922019842122e-05, "loss": 2.8021312713623048, "memory(GiB)": 44.8, "step": 2075, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.460213 }, { "epoch": 0.0891135769675678, "grad_norm": 3.489022970199585, "learning_rate": 9.992164367965696e-05, "loss": 2.7191648483276367, "memory(GiB)": 44.8, "step": 2080, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.460497 }, { "epoch": 0.08932779229681675, "grad_norm": 3.8433265686035156, "learning_rate": 9.992126661281452e-05, "loss": 2.543224906921387, "memory(GiB)": 44.8, "step": 2085, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.460068 }, { "epoch": 0.08954200762606572, "grad_norm": 3.1771106719970703, "learning_rate": 9.99208886416015e-05, "loss": 2.6399940490722655, "memory(GiB)": 44.8, "step": 2090, "token_acc": 0.428125, "train_speed(iter/s)": 1.460713 }, { "epoch": 0.08975622295531469, "grad_norm": 3.5798282623291016, "learning_rate": 9.992050976602474e-05, "loss": 2.520054244995117, "memory(GiB)": 44.8, "step": 2095, "token_acc": 0.4448051948051948, "train_speed(iter/s)": 1.460751 }, { "epoch": 0.08997043828456365, "grad_norm": 3.968090772628784, "learning_rate": 9.992012998609112e-05, "loss": 2.8136629104614257, "memory(GiB)": 44.8, "step": 2100, "token_acc": 0.4470198675496689, "train_speed(iter/s)": 1.460699 }, { "epoch": 0.0901846536138126, "grad_norm": 3.531860828399658, "learning_rate": 9.991974930180752e-05, "loss": 2.687778091430664, "memory(GiB)": 44.8, "step": 2105, "token_acc": 0.46303501945525294, "train_speed(iter/s)": 1.461212 }, { "epoch": 0.09039886894306157, "grad_norm": 2.9376018047332764, "learning_rate": 9.991936771318083e-05, "loss": 2.5948768615722657, "memory(GiB)": 44.8, "step": 2110, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.46008 }, { "epoch": 0.09061308427231053, "grad_norm": 5.6689229011535645, "learning_rate": 9.991898522021798e-05, "loss": 2.7493515014648438, "memory(GiB)": 44.8, "step": 2115, "token_acc": 0.45390070921985815, "train_speed(iter/s)": 1.460445 }, { "epoch": 0.09082729960155948, "grad_norm": 3.887681245803833, "learning_rate": 9.991860182292587e-05, "loss": 2.595868682861328, "memory(GiB)": 44.8, "step": 2120, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.460787 }, { "epoch": 0.09104151493080845, "grad_norm": 3.7782750129699707, "learning_rate": 9.991821752131146e-05, "loss": 2.73706169128418, "memory(GiB)": 44.8, "step": 2125, "token_acc": 0.4485049833887043, "train_speed(iter/s)": 1.460954 }, { "epoch": 0.09125573026005741, "grad_norm": 4.064642906188965, "learning_rate": 9.991783231538172e-05, "loss": 2.8414638519287108, "memory(GiB)": 44.8, "step": 2130, "token_acc": 0.41946308724832215, "train_speed(iter/s)": 1.460693 }, { "epoch": 0.09146994558930636, "grad_norm": 4.133368015289307, "learning_rate": 9.991744620514363e-05, "loss": 2.9908676147460938, "memory(GiB)": 44.8, "step": 2135, "token_acc": 0.392, "train_speed(iter/s)": 1.459536 }, { "epoch": 0.09168416091855533, "grad_norm": 4.057086944580078, "learning_rate": 9.991705919060416e-05, "loss": 2.6876079559326174, "memory(GiB)": 44.8, "step": 2140, "token_acc": 0.4453125, "train_speed(iter/s)": 1.459825 }, { "epoch": 0.0918983762478043, "grad_norm": 5.1124043464660645, "learning_rate": 9.991667127177033e-05, "loss": 2.4993526458740236, "memory(GiB)": 44.8, "step": 2145, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.459294 }, { "epoch": 0.09211259157705326, "grad_norm": 4.4299397468566895, "learning_rate": 9.99162824486492e-05, "loss": 2.6979110717773436, "memory(GiB)": 44.8, "step": 2150, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.459595 }, { "epoch": 0.09232680690630221, "grad_norm": 4.406503677368164, "learning_rate": 9.991589272124778e-05, "loss": 2.6013330459594726, "memory(GiB)": 44.8, "step": 2155, "token_acc": 0.5, "train_speed(iter/s)": 1.458888 }, { "epoch": 0.09254102223555118, "grad_norm": 5.777492523193359, "learning_rate": 9.991550208957312e-05, "loss": 2.6056400299072267, "memory(GiB)": 44.8, "step": 2160, "token_acc": 0.42024539877300615, "train_speed(iter/s)": 1.459472 }, { "epoch": 0.09275523756480014, "grad_norm": 4.015589237213135, "learning_rate": 9.991511055363232e-05, "loss": 2.3596290588378905, "memory(GiB)": 44.8, "step": 2165, "token_acc": 0.5, "train_speed(iter/s)": 1.459388 }, { "epoch": 0.09296945289404909, "grad_norm": 3.3259499073028564, "learning_rate": 9.991471811343248e-05, "loss": 2.6802696228027343, "memory(GiB)": 44.8, "step": 2170, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.459627 }, { "epoch": 0.09318366822329806, "grad_norm": 3.664127826690674, "learning_rate": 9.991432476898069e-05, "loss": 2.6583927154541014, "memory(GiB)": 44.8, "step": 2175, "token_acc": 0.48534201954397393, "train_speed(iter/s)": 1.459292 }, { "epoch": 0.09339788355254702, "grad_norm": 6.050678253173828, "learning_rate": 9.991393052028408e-05, "loss": 2.6330432891845703, "memory(GiB)": 44.8, "step": 2180, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.458408 }, { "epoch": 0.09361209888179599, "grad_norm": 5.467209815979004, "learning_rate": 9.991353536734981e-05, "loss": 2.928238868713379, "memory(GiB)": 44.8, "step": 2185, "token_acc": 0.39935064935064934, "train_speed(iter/s)": 1.457615 }, { "epoch": 0.09382631421104494, "grad_norm": 3.2603909969329834, "learning_rate": 9.991313931018503e-05, "loss": 2.7299537658691406, "memory(GiB)": 44.8, "step": 2190, "token_acc": 0.4431137724550898, "train_speed(iter/s)": 1.457003 }, { "epoch": 0.0940405295402939, "grad_norm": 3.839003086090088, "learning_rate": 9.991274234879688e-05, "loss": 2.3807676315307615, "memory(GiB)": 44.8, "step": 2195, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.457335 }, { "epoch": 0.09425474486954287, "grad_norm": 3.2841320037841797, "learning_rate": 9.99123444831926e-05, "loss": 2.8565706253051757, "memory(GiB)": 44.8, "step": 2200, "token_acc": 0.4325153374233129, "train_speed(iter/s)": 1.457065 }, { "epoch": 0.09446896019879182, "grad_norm": 4.429266929626465, "learning_rate": 9.991194571337937e-05, "loss": 2.6315616607666015, "memory(GiB)": 44.8, "step": 2205, "token_acc": 0.5, "train_speed(iter/s)": 1.457396 }, { "epoch": 0.09468317552804079, "grad_norm": 3.7662506103515625, "learning_rate": 9.991154603936443e-05, "loss": 2.3513725280761717, "memory(GiB)": 44.8, "step": 2210, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.457128 }, { "epoch": 0.09489739085728975, "grad_norm": 4.712777137756348, "learning_rate": 9.991114546115502e-05, "loss": 2.537898635864258, "memory(GiB)": 44.8, "step": 2215, "token_acc": 0.44805194805194803, "train_speed(iter/s)": 1.45697 }, { "epoch": 0.0951116061865387, "grad_norm": 4.573584079742432, "learning_rate": 9.991074397875836e-05, "loss": 2.9043529510498045, "memory(GiB)": 44.8, "step": 2220, "token_acc": 0.4107142857142857, "train_speed(iter/s)": 1.456899 }, { "epoch": 0.09532582151578767, "grad_norm": 5.321566581726074, "learning_rate": 9.991034159218178e-05, "loss": 2.7422548294067384, "memory(GiB)": 44.8, "step": 2225, "token_acc": 0.4697674418604651, "train_speed(iter/s)": 1.456959 }, { "epoch": 0.09554003684503663, "grad_norm": 3.833298683166504, "learning_rate": 9.990993830143253e-05, "loss": 2.548801040649414, "memory(GiB)": 44.8, "step": 2230, "token_acc": 0.5061224489795918, "train_speed(iter/s)": 1.456937 }, { "epoch": 0.0957542521742856, "grad_norm": 5.392580986022949, "learning_rate": 9.990953410651793e-05, "loss": 3.013517379760742, "memory(GiB)": 44.8, "step": 2235, "token_acc": 0.38515901060070673, "train_speed(iter/s)": 1.456928 }, { "epoch": 0.09596846750353455, "grad_norm": 3.42513370513916, "learning_rate": 9.990912900744529e-05, "loss": 2.3812801361083986, "memory(GiB)": 44.8, "step": 2240, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.45778 }, { "epoch": 0.09618268283278351, "grad_norm": 4.226132869720459, "learning_rate": 9.990872300422198e-05, "loss": 2.447560119628906, "memory(GiB)": 44.8, "step": 2245, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.457581 }, { "epoch": 0.09639689816203248, "grad_norm": 4.196174621582031, "learning_rate": 9.990831609685532e-05, "loss": 2.4533769607543947, "memory(GiB)": 44.8, "step": 2250, "token_acc": 0.47278911564625853, "train_speed(iter/s)": 1.457717 }, { "epoch": 0.09661111349128143, "grad_norm": 3.42431640625, "learning_rate": 9.990790828535271e-05, "loss": 2.4951969146728517, "memory(GiB)": 44.8, "step": 2255, "token_acc": 0.474025974025974, "train_speed(iter/s)": 1.456446 }, { "epoch": 0.0968253288205304, "grad_norm": 4.205296039581299, "learning_rate": 9.990749956972152e-05, "loss": 2.945000648498535, "memory(GiB)": 44.8, "step": 2260, "token_acc": 0.4, "train_speed(iter/s)": 1.456896 }, { "epoch": 0.09703954414977936, "grad_norm": 3.1933600902557373, "learning_rate": 9.990708994996916e-05, "loss": 2.9696237564086916, "memory(GiB)": 44.8, "step": 2265, "token_acc": 0.43455497382198954, "train_speed(iter/s)": 1.457204 }, { "epoch": 0.09725375947902833, "grad_norm": 2.845933437347412, "learning_rate": 9.990667942610303e-05, "loss": 2.838313674926758, "memory(GiB)": 44.8, "step": 2270, "token_acc": 0.45244956772334294, "train_speed(iter/s)": 1.456595 }, { "epoch": 0.09746797480827728, "grad_norm": 3.597959518432617, "learning_rate": 9.990626799813061e-05, "loss": 2.954367446899414, "memory(GiB)": 44.8, "step": 2275, "token_acc": 0.40808823529411764, "train_speed(iter/s)": 1.456928 }, { "epoch": 0.09768219013752624, "grad_norm": 4.257412910461426, "learning_rate": 9.990585566605932e-05, "loss": 2.70975341796875, "memory(GiB)": 44.8, "step": 2280, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 1.456894 }, { "epoch": 0.09789640546677521, "grad_norm": 5.009130001068115, "learning_rate": 9.990544242989663e-05, "loss": 2.705342483520508, "memory(GiB)": 44.8, "step": 2285, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.457066 }, { "epoch": 0.09811062079602416, "grad_norm": 3.4967539310455322, "learning_rate": 9.990502828965005e-05, "loss": 2.5337432861328124, "memory(GiB)": 44.8, "step": 2290, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 1.457007 }, { "epoch": 0.09832483612527312, "grad_norm": 3.3518471717834473, "learning_rate": 9.990461324532705e-05, "loss": 2.6714311599731446, "memory(GiB)": 44.8, "step": 2295, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.456389 }, { "epoch": 0.09853905145452209, "grad_norm": 4.614869594573975, "learning_rate": 9.990419729693519e-05, "loss": 2.8144582748413085, "memory(GiB)": 44.8, "step": 2300, "token_acc": 0.4266666666666667, "train_speed(iter/s)": 1.456147 }, { "epoch": 0.09875326678377104, "grad_norm": 3.9386801719665527, "learning_rate": 9.990378044448197e-05, "loss": 2.8945823669433595, "memory(GiB)": 44.8, "step": 2305, "token_acc": 0.445578231292517, "train_speed(iter/s)": 1.456448 }, { "epoch": 0.09896748211302, "grad_norm": 4.9402570724487305, "learning_rate": 9.990336268797494e-05, "loss": 2.9529052734375, "memory(GiB)": 44.8, "step": 2310, "token_acc": 0.4432624113475177, "train_speed(iter/s)": 1.456636 }, { "epoch": 0.09918169744226897, "grad_norm": 2.6104836463928223, "learning_rate": 9.990294402742171e-05, "loss": 2.6046127319335937, "memory(GiB)": 44.8, "step": 2315, "token_acc": 0.47592067988668557, "train_speed(iter/s)": 1.456901 }, { "epoch": 0.09939591277151794, "grad_norm": 5.195140361785889, "learning_rate": 9.990252446282982e-05, "loss": 2.6561588287353515, "memory(GiB)": 44.8, "step": 2320, "token_acc": 0.45136186770428016, "train_speed(iter/s)": 1.457078 }, { "epoch": 0.09961012810076689, "grad_norm": 3.7514607906341553, "learning_rate": 9.990210399420688e-05, "loss": 2.476618766784668, "memory(GiB)": 44.8, "step": 2325, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.457343 }, { "epoch": 0.09982434343001585, "grad_norm": 3.4020187854766846, "learning_rate": 9.990168262156052e-05, "loss": 2.8265119552612306, "memory(GiB)": 44.8, "step": 2330, "token_acc": 0.45, "train_speed(iter/s)": 1.457231 }, { "epoch": 0.10003855875926482, "grad_norm": 2.6397881507873535, "learning_rate": 9.990126034489837e-05, "loss": 2.339085006713867, "memory(GiB)": 44.8, "step": 2335, "token_acc": 0.5032258064516129, "train_speed(iter/s)": 1.45785 }, { "epoch": 0.10025277408851377, "grad_norm": 3.193979024887085, "learning_rate": 9.990083716422808e-05, "loss": 2.9215423583984377, "memory(GiB)": 44.8, "step": 2340, "token_acc": 0.4077669902912621, "train_speed(iter/s)": 1.457972 }, { "epoch": 0.10046698941776273, "grad_norm": 3.46121883392334, "learning_rate": 9.990041307955731e-05, "loss": 2.833090400695801, "memory(GiB)": 44.8, "step": 2345, "token_acc": 0.43962848297213625, "train_speed(iter/s)": 1.457646 }, { "epoch": 0.1006812047470117, "grad_norm": 3.982710599899292, "learning_rate": 9.989998809089376e-05, "loss": 2.6022457122802733, "memory(GiB)": 44.8, "step": 2350, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 1.457811 }, { "epoch": 0.10089542007626066, "grad_norm": 4.487769603729248, "learning_rate": 9.98995621982451e-05, "loss": 2.5113431930541994, "memory(GiB)": 44.8, "step": 2355, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.457838 }, { "epoch": 0.10110963540550962, "grad_norm": 4.391627788543701, "learning_rate": 9.989913540161906e-05, "loss": 2.76812801361084, "memory(GiB)": 44.8, "step": 2360, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.458005 }, { "epoch": 0.10132385073475858, "grad_norm": 4.283302307128906, "learning_rate": 9.989870770102339e-05, "loss": 2.2327449798583983, "memory(GiB)": 44.8, "step": 2365, "token_acc": 0.5320754716981132, "train_speed(iter/s)": 1.457033 }, { "epoch": 0.10153806606400755, "grad_norm": 2.9694247245788574, "learning_rate": 9.989827909646581e-05, "loss": 2.6974130630493165, "memory(GiB)": 44.8, "step": 2370, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.456564 }, { "epoch": 0.1017522813932565, "grad_norm": 3.814884662628174, "learning_rate": 9.989784958795409e-05, "loss": 2.5132705688476564, "memory(GiB)": 44.8, "step": 2375, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.457108 }, { "epoch": 0.10196649672250546, "grad_norm": 6.999202251434326, "learning_rate": 9.989741917549603e-05, "loss": 2.3899309158325197, "memory(GiB)": 44.8, "step": 2380, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.457502 }, { "epoch": 0.10218071205175443, "grad_norm": 3.5750794410705566, "learning_rate": 9.98969878590994e-05, "loss": 2.8229116439819335, "memory(GiB)": 44.8, "step": 2385, "token_acc": 0.45819397993311034, "train_speed(iter/s)": 1.457884 }, { "epoch": 0.10239492738100338, "grad_norm": 4.229066848754883, "learning_rate": 9.989655563877203e-05, "loss": 2.663188934326172, "memory(GiB)": 44.8, "step": 2390, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.458034 }, { "epoch": 0.10260914271025234, "grad_norm": 4.6534857749938965, "learning_rate": 9.989612251452176e-05, "loss": 2.7501869201660156, "memory(GiB)": 44.8, "step": 2395, "token_acc": 0.4290322580645161, "train_speed(iter/s)": 1.458171 }, { "epoch": 0.10282335803950131, "grad_norm": 4.1199822425842285, "learning_rate": 9.98956884863564e-05, "loss": 2.644135665893555, "memory(GiB)": 44.8, "step": 2400, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.458413 }, { "epoch": 0.10303757336875027, "grad_norm": 4.341559410095215, "learning_rate": 9.989525355428386e-05, "loss": 2.3370315551757814, "memory(GiB)": 44.8, "step": 2405, "token_acc": 0.4797687861271676, "train_speed(iter/s)": 1.458611 }, { "epoch": 0.10325178869799922, "grad_norm": 2.8268990516662598, "learning_rate": 9.9894817718312e-05, "loss": 2.4584678649902343, "memory(GiB)": 44.8, "step": 2410, "token_acc": 0.5292207792207793, "train_speed(iter/s)": 1.458368 }, { "epoch": 0.10346600402724819, "grad_norm": 3.776341199874878, "learning_rate": 9.98943809784487e-05, "loss": 2.5440006256103516, "memory(GiB)": 44.8, "step": 2415, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.458954 }, { "epoch": 0.10368021935649716, "grad_norm": 5.095711708068848, "learning_rate": 9.989394333470188e-05, "loss": 3.028711700439453, "memory(GiB)": 44.8, "step": 2420, "token_acc": 0.4163568773234201, "train_speed(iter/s)": 1.459408 }, { "epoch": 0.1038944346857461, "grad_norm": 3.311483144760132, "learning_rate": 9.989350478707949e-05, "loss": 2.6387834548950195, "memory(GiB)": 44.8, "step": 2425, "token_acc": 0.4664804469273743, "train_speed(iter/s)": 1.460114 }, { "epoch": 0.10410865001499507, "grad_norm": 2.6038498878479004, "learning_rate": 9.989306533558944e-05, "loss": 2.5682758331298827, "memory(GiB)": 44.8, "step": 2430, "token_acc": 0.45671641791044776, "train_speed(iter/s)": 1.460074 }, { "epoch": 0.10432286534424404, "grad_norm": 3.486302614212036, "learning_rate": 9.98926249802397e-05, "loss": 2.5280033111572267, "memory(GiB)": 44.8, "step": 2435, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.459851 }, { "epoch": 0.104537080673493, "grad_norm": 4.014797210693359, "learning_rate": 9.989218372103829e-05, "loss": 2.842324066162109, "memory(GiB)": 44.8, "step": 2440, "token_acc": 0.40431266846361186, "train_speed(iter/s)": 1.460128 }, { "epoch": 0.10475129600274195, "grad_norm": 4.540597915649414, "learning_rate": 9.989174155799314e-05, "loss": 2.459308624267578, "memory(GiB)": 44.8, "step": 2445, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.459452 }, { "epoch": 0.10496551133199092, "grad_norm": 2.8123247623443604, "learning_rate": 9.989129849111229e-05, "loss": 2.5809600830078123, "memory(GiB)": 44.8, "step": 2450, "token_acc": 0.48589341692789967, "train_speed(iter/s)": 1.459326 }, { "epoch": 0.10517972666123988, "grad_norm": 3.3037538528442383, "learning_rate": 9.989085452040377e-05, "loss": 2.6184814453125, "memory(GiB)": 44.8, "step": 2455, "token_acc": 0.47706422018348627, "train_speed(iter/s)": 1.458921 }, { "epoch": 0.10539394199048883, "grad_norm": 3.4188737869262695, "learning_rate": 9.989040964587563e-05, "loss": 2.8132484436035154, "memory(GiB)": 44.8, "step": 2460, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.459192 }, { "epoch": 0.1056081573197378, "grad_norm": 5.026494979858398, "learning_rate": 9.988996386753591e-05, "loss": 2.631730651855469, "memory(GiB)": 44.8, "step": 2465, "token_acc": 0.44625407166123776, "train_speed(iter/s)": 1.459446 }, { "epoch": 0.10582237264898676, "grad_norm": 7.5906782150268555, "learning_rate": 9.988951718539269e-05, "loss": 2.9284337997436523, "memory(GiB)": 44.8, "step": 2470, "token_acc": 0.4609053497942387, "train_speed(iter/s)": 1.459675 }, { "epoch": 0.10603658797823572, "grad_norm": 3.620561122894287, "learning_rate": 9.988906959945407e-05, "loss": 2.4706584930419924, "memory(GiB)": 44.8, "step": 2475, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.460131 }, { "epoch": 0.10625080330748468, "grad_norm": 3.532071828842163, "learning_rate": 9.988862110972816e-05, "loss": 2.6927810668945313, "memory(GiB)": 44.8, "step": 2480, "token_acc": 0.4246575342465753, "train_speed(iter/s)": 1.460031 }, { "epoch": 0.10646501863673365, "grad_norm": 6.2653021812438965, "learning_rate": 9.988817171622307e-05, "loss": 2.4296083450317383, "memory(GiB)": 44.8, "step": 2485, "token_acc": 0.44813278008298757, "train_speed(iter/s)": 1.46073 }, { "epoch": 0.10667923396598261, "grad_norm": 4.349013328552246, "learning_rate": 9.988772141894696e-05, "loss": 2.7962924957275392, "memory(GiB)": 44.8, "step": 2490, "token_acc": 0.43956043956043955, "train_speed(iter/s)": 1.461129 }, { "epoch": 0.10689344929523156, "grad_norm": 3.796917200088501, "learning_rate": 9.988727021790796e-05, "loss": 2.7859775543212892, "memory(GiB)": 44.8, "step": 2495, "token_acc": 0.4421364985163205, "train_speed(iter/s)": 1.461341 }, { "epoch": 0.10710766462448053, "grad_norm": 3.583979845046997, "learning_rate": 9.988681811311428e-05, "loss": 2.744350624084473, "memory(GiB)": 44.8, "step": 2500, "token_acc": 0.41924398625429554, "train_speed(iter/s)": 1.461723 }, { "epoch": 0.10710766462448053, "eval_loss": 2.315615177154541, "eval_runtime": 13.9767, "eval_samples_per_second": 7.155, "eval_steps_per_second": 7.155, "eval_token_acc": 0.45179856115107914, "step": 2500 }, { "epoch": 0.1073218799537295, "grad_norm": 3.135530710220337, "learning_rate": 9.988636510457408e-05, "loss": 2.7376922607421874, "memory(GiB)": 44.8, "step": 2505, "token_acc": 0.4548076923076923, "train_speed(iter/s)": 1.449332 }, { "epoch": 0.10753609528297844, "grad_norm": 4.523245811462402, "learning_rate": 9.988591119229559e-05, "loss": 2.723159980773926, "memory(GiB)": 44.8, "step": 2510, "token_acc": 0.43508771929824563, "train_speed(iter/s)": 1.448981 }, { "epoch": 0.10775031061222741, "grad_norm": 4.681279182434082, "learning_rate": 9.988545637628702e-05, "loss": 2.5212112426757813, "memory(GiB)": 44.8, "step": 2515, "token_acc": 0.46176470588235297, "train_speed(iter/s)": 1.448885 }, { "epoch": 0.10796452594147637, "grad_norm": 4.707463264465332, "learning_rate": 9.98850006565566e-05, "loss": 2.535818862915039, "memory(GiB)": 44.8, "step": 2520, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.448804 }, { "epoch": 0.10817874127072534, "grad_norm": 4.106057167053223, "learning_rate": 9.988454403311262e-05, "loss": 2.681136703491211, "memory(GiB)": 44.8, "step": 2525, "token_acc": 0.4090909090909091, "train_speed(iter/s)": 1.448967 }, { "epoch": 0.10839295659997429, "grad_norm": 3.8454689979553223, "learning_rate": 9.98840865059633e-05, "loss": 2.5826454162597656, "memory(GiB)": 44.8, "step": 2530, "token_acc": 0.4728682170542636, "train_speed(iter/s)": 1.449246 }, { "epoch": 0.10860717192922326, "grad_norm": 3.1298320293426514, "learning_rate": 9.988362807511697e-05, "loss": 2.8767068862915037, "memory(GiB)": 44.8, "step": 2535, "token_acc": 0.43086816720257237, "train_speed(iter/s)": 1.449185 }, { "epoch": 0.10882138725847222, "grad_norm": 3.548053741455078, "learning_rate": 9.988316874058195e-05, "loss": 2.7723020553588866, "memory(GiB)": 44.8, "step": 2540, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.449884 }, { "epoch": 0.10903560258772117, "grad_norm": 3.266469717025757, "learning_rate": 9.988270850236649e-05, "loss": 2.901295280456543, "memory(GiB)": 44.8, "step": 2545, "token_acc": 0.4296577946768061, "train_speed(iter/s)": 1.449428 }, { "epoch": 0.10924981791697014, "grad_norm": 4.520561218261719, "learning_rate": 9.9882247360479e-05, "loss": 2.9140830993652345, "memory(GiB)": 44.8, "step": 2550, "token_acc": 0.41947565543071164, "train_speed(iter/s)": 1.449681 }, { "epoch": 0.1094640332462191, "grad_norm": 6.197085380554199, "learning_rate": 9.98817853149278e-05, "loss": 2.654117965698242, "memory(GiB)": 44.8, "step": 2555, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.449702 }, { "epoch": 0.10967824857546805, "grad_norm": 3.7634689807891846, "learning_rate": 9.988132236572126e-05, "loss": 2.733693313598633, "memory(GiB)": 44.8, "step": 2560, "token_acc": 0.43174603174603177, "train_speed(iter/s)": 1.449961 }, { "epoch": 0.10989246390471702, "grad_norm": 3.0915908813476562, "learning_rate": 9.988085851286778e-05, "loss": 2.782105255126953, "memory(GiB)": 44.8, "step": 2565, "token_acc": 0.4415954415954416, "train_speed(iter/s)": 1.450478 }, { "epoch": 0.11010667923396598, "grad_norm": 3.5323173999786377, "learning_rate": 9.988039375637574e-05, "loss": 2.514674758911133, "memory(GiB)": 44.8, "step": 2570, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.450904 }, { "epoch": 0.11032089456321495, "grad_norm": 3.008054733276367, "learning_rate": 9.98799280962536e-05, "loss": 2.7011260986328125, "memory(GiB)": 44.8, "step": 2575, "token_acc": 0.43137254901960786, "train_speed(iter/s)": 1.451432 }, { "epoch": 0.1105351098924639, "grad_norm": 3.8846633434295654, "learning_rate": 9.987946153250976e-05, "loss": 2.735409164428711, "memory(GiB)": 44.8, "step": 2580, "token_acc": 0.4318181818181818, "train_speed(iter/s)": 1.451935 }, { "epoch": 0.11074932522171287, "grad_norm": 3.538858413696289, "learning_rate": 9.987899406515268e-05, "loss": 2.938157844543457, "memory(GiB)": 44.8, "step": 2585, "token_acc": 0.3815789473684211, "train_speed(iter/s)": 1.452487 }, { "epoch": 0.11096354055096183, "grad_norm": 8.99933910369873, "learning_rate": 9.987852569419084e-05, "loss": 2.7111330032348633, "memory(GiB)": 44.8, "step": 2590, "token_acc": 0.38436482084690554, "train_speed(iter/s)": 1.452583 }, { "epoch": 0.11117775588021078, "grad_norm": 4.776661396026611, "learning_rate": 9.98780564196327e-05, "loss": 2.924737548828125, "memory(GiB)": 44.8, "step": 2595, "token_acc": 0.40327868852459015, "train_speed(iter/s)": 1.45278 }, { "epoch": 0.11139197120945975, "grad_norm": 3.6866297721862793, "learning_rate": 9.98775862414868e-05, "loss": 2.336515426635742, "memory(GiB)": 44.8, "step": 2600, "token_acc": 0.5, "train_speed(iter/s)": 1.453238 }, { "epoch": 0.11160618653870871, "grad_norm": 4.3828816413879395, "learning_rate": 9.987711515976164e-05, "loss": 2.57989387512207, "memory(GiB)": 44.8, "step": 2605, "token_acc": 0.46096654275092935, "train_speed(iter/s)": 1.453247 }, { "epoch": 0.11182040186795768, "grad_norm": 3.051657199859619, "learning_rate": 9.987664317446572e-05, "loss": 2.899135971069336, "memory(GiB)": 44.8, "step": 2610, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.453688 }, { "epoch": 0.11203461719720663, "grad_norm": 5.278879165649414, "learning_rate": 9.987617028560765e-05, "loss": 2.8429651260375977, "memory(GiB)": 44.8, "step": 2615, "token_acc": 0.4174174174174174, "train_speed(iter/s)": 1.453461 }, { "epoch": 0.1122488325264556, "grad_norm": 3.091171979904175, "learning_rate": 9.987569649319595e-05, "loss": 2.5073850631713865, "memory(GiB)": 44.8, "step": 2620, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.454114 }, { "epoch": 0.11246304785570456, "grad_norm": 3.2750463485717773, "learning_rate": 9.987522179723923e-05, "loss": 2.5251392364501952, "memory(GiB)": 44.8, "step": 2625, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.454021 }, { "epoch": 0.11267726318495351, "grad_norm": 5.543854236602783, "learning_rate": 9.987474619774609e-05, "loss": 2.63586368560791, "memory(GiB)": 44.8, "step": 2630, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.45397 }, { "epoch": 0.11289147851420248, "grad_norm": 3.494727849960327, "learning_rate": 9.98742696947251e-05, "loss": 2.8177515029907227, "memory(GiB)": 44.8, "step": 2635, "token_acc": 0.44785276073619634, "train_speed(iter/s)": 1.453574 }, { "epoch": 0.11310569384345144, "grad_norm": 5.487606525421143, "learning_rate": 9.987379228818497e-05, "loss": 2.436424446105957, "memory(GiB)": 44.8, "step": 2640, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.45333 }, { "epoch": 0.11331990917270039, "grad_norm": 3.1628775596618652, "learning_rate": 9.987331397813429e-05, "loss": 2.429340362548828, "memory(GiB)": 44.8, "step": 2645, "token_acc": 0.4819672131147541, "train_speed(iter/s)": 1.453584 }, { "epoch": 0.11353412450194936, "grad_norm": 3.776961326599121, "learning_rate": 9.987283476458174e-05, "loss": 2.6166324615478516, "memory(GiB)": 44.8, "step": 2650, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.454593 }, { "epoch": 0.11374833983119832, "grad_norm": 3.17450213432312, "learning_rate": 9.9872354647536e-05, "loss": 2.866488456726074, "memory(GiB)": 44.8, "step": 2655, "token_acc": 0.4386503067484663, "train_speed(iter/s)": 1.454631 }, { "epoch": 0.11396255516044729, "grad_norm": 4.373410701751709, "learning_rate": 9.987187362700579e-05, "loss": 2.6585012435913087, "memory(GiB)": 44.8, "step": 2660, "token_acc": 0.458041958041958, "train_speed(iter/s)": 1.454704 }, { "epoch": 0.11417677048969624, "grad_norm": 4.524393081665039, "learning_rate": 9.987139170299978e-05, "loss": 2.6862470626831056, "memory(GiB)": 44.8, "step": 2665, "token_acc": 0.41935483870967744, "train_speed(iter/s)": 1.454475 }, { "epoch": 0.1143909858189452, "grad_norm": 4.258168697357178, "learning_rate": 9.987090887552675e-05, "loss": 2.646759605407715, "memory(GiB)": 44.8, "step": 2670, "token_acc": 0.4407894736842105, "train_speed(iter/s)": 1.455336 }, { "epoch": 0.11460520114819417, "grad_norm": 3.578068256378174, "learning_rate": 9.987042514459541e-05, "loss": 2.4732179641723633, "memory(GiB)": 44.8, "step": 2675, "token_acc": 0.4734848484848485, "train_speed(iter/s)": 1.455521 }, { "epoch": 0.11481941647744312, "grad_norm": 5.073055267333984, "learning_rate": 9.986994051021454e-05, "loss": 2.4216739654541017, "memory(GiB)": 44.8, "step": 2680, "token_acc": 0.47416413373860183, "train_speed(iter/s)": 1.455442 }, { "epoch": 0.11503363180669209, "grad_norm": 3.739480495452881, "learning_rate": 9.98694549723929e-05, "loss": 2.73177547454834, "memory(GiB)": 44.8, "step": 2685, "token_acc": 0.46089385474860334, "train_speed(iter/s)": 1.455095 }, { "epoch": 0.11524784713594105, "grad_norm": 4.096671104431152, "learning_rate": 9.986896853113932e-05, "loss": 2.700012969970703, "memory(GiB)": 44.8, "step": 2690, "token_acc": 0.44871794871794873, "train_speed(iter/s)": 1.454966 }, { "epoch": 0.11546206246519002, "grad_norm": 4.17447566986084, "learning_rate": 9.98684811864626e-05, "loss": 2.514265251159668, "memory(GiB)": 44.8, "step": 2695, "token_acc": 0.48627450980392156, "train_speed(iter/s)": 1.455102 }, { "epoch": 0.11567627779443897, "grad_norm": 4.056727886199951, "learning_rate": 9.986799293837155e-05, "loss": 2.7692144393920897, "memory(GiB)": 44.8, "step": 2700, "token_acc": 0.45977011494252873, "train_speed(iter/s)": 1.455007 }, { "epoch": 0.11589049312368793, "grad_norm": 3.6206116676330566, "learning_rate": 9.986750378687502e-05, "loss": 2.3905899047851564, "memory(GiB)": 44.8, "step": 2705, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.454789 }, { "epoch": 0.1161047084529369, "grad_norm": 5.165829181671143, "learning_rate": 9.98670137319819e-05, "loss": 2.703978729248047, "memory(GiB)": 44.8, "step": 2710, "token_acc": 0.4453125, "train_speed(iter/s)": 1.454077 }, { "epoch": 0.11631892378218585, "grad_norm": 3.4856631755828857, "learning_rate": 9.986652277370103e-05, "loss": 2.891558074951172, "memory(GiB)": 44.8, "step": 2715, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.454489 }, { "epoch": 0.11653313911143481, "grad_norm": 3.9351048469543457, "learning_rate": 9.986603091204134e-05, "loss": 2.5380075454711912, "memory(GiB)": 44.8, "step": 2720, "token_acc": 0.4588235294117647, "train_speed(iter/s)": 1.454791 }, { "epoch": 0.11674735444068378, "grad_norm": 5.181811809539795, "learning_rate": 9.98655381470117e-05, "loss": 2.854144287109375, "memory(GiB)": 44.8, "step": 2725, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.454509 }, { "epoch": 0.11696156976993273, "grad_norm": 4.077805042266846, "learning_rate": 9.986504447862108e-05, "loss": 2.4561676025390624, "memory(GiB)": 44.8, "step": 2730, "token_acc": 0.46875, "train_speed(iter/s)": 1.454177 }, { "epoch": 0.1171757850991817, "grad_norm": 3.2708146572113037, "learning_rate": 9.986454990687839e-05, "loss": 2.478154182434082, "memory(GiB)": 44.8, "step": 2735, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.453891 }, { "epoch": 0.11739000042843066, "grad_norm": 4.113687992095947, "learning_rate": 9.986405443179261e-05, "loss": 2.59047908782959, "memory(GiB)": 44.8, "step": 2740, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.454461 }, { "epoch": 0.11760421575767963, "grad_norm": 3.1191680431365967, "learning_rate": 9.986355805337268e-05, "loss": 2.495977783203125, "memory(GiB)": 44.8, "step": 2745, "token_acc": 0.4227941176470588, "train_speed(iter/s)": 1.454518 }, { "epoch": 0.11781843108692858, "grad_norm": 4.838729381561279, "learning_rate": 9.986306077162766e-05, "loss": 2.7230384826660154, "memory(GiB)": 44.8, "step": 2750, "token_acc": 0.4392156862745098, "train_speed(iter/s)": 1.455454 }, { "epoch": 0.11803264641617754, "grad_norm": 4.422349452972412, "learning_rate": 9.98625625865665e-05, "loss": 2.635031509399414, "memory(GiB)": 44.8, "step": 2755, "token_acc": 0.43317972350230416, "train_speed(iter/s)": 1.455255 }, { "epoch": 0.11824686174542651, "grad_norm": 3.5057785511016846, "learning_rate": 9.986206349819825e-05, "loss": 2.7041976928710936, "memory(GiB)": 44.8, "step": 2760, "token_acc": 0.4669421487603306, "train_speed(iter/s)": 1.455021 }, { "epoch": 0.11846107707467546, "grad_norm": 4.519093990325928, "learning_rate": 9.986156350653193e-05, "loss": 2.80269775390625, "memory(GiB)": 44.8, "step": 2765, "token_acc": 0.43795620437956206, "train_speed(iter/s)": 1.455074 }, { "epoch": 0.11867529240392442, "grad_norm": 3.121180295944214, "learning_rate": 9.986106261157662e-05, "loss": 2.543857955932617, "memory(GiB)": 44.8, "step": 2770, "token_acc": 0.5115511551155115, "train_speed(iter/s)": 1.45498 }, { "epoch": 0.11888950773317339, "grad_norm": 3.253769636154175, "learning_rate": 9.986056081334139e-05, "loss": 2.355792236328125, "memory(GiB)": 44.8, "step": 2775, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.455323 }, { "epoch": 0.11910372306242235, "grad_norm": 4.304771900177002, "learning_rate": 9.986005811183533e-05, "loss": 2.3715681076049804, "memory(GiB)": 44.8, "step": 2780, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.455675 }, { "epoch": 0.1193179383916713, "grad_norm": 4.088959217071533, "learning_rate": 9.985955450706755e-05, "loss": 2.558426284790039, "memory(GiB)": 44.8, "step": 2785, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.456214 }, { "epoch": 0.11953215372092027, "grad_norm": 3.599224328994751, "learning_rate": 9.985904999904715e-05, "loss": 2.6276081085205076, "memory(GiB)": 44.8, "step": 2790, "token_acc": 0.44765342960288806, "train_speed(iter/s)": 1.455336 }, { "epoch": 0.11974636905016924, "grad_norm": 3.8726751804351807, "learning_rate": 9.98585445877833e-05, "loss": 3.040993499755859, "memory(GiB)": 44.8, "step": 2795, "token_acc": 0.4266666666666667, "train_speed(iter/s)": 1.455832 }, { "epoch": 0.11996058437941819, "grad_norm": 5.547581672668457, "learning_rate": 9.985803827328515e-05, "loss": 2.7757171630859374, "memory(GiB)": 44.8, "step": 2800, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.455775 }, { "epoch": 0.12017479970866715, "grad_norm": 4.747446060180664, "learning_rate": 9.985753105556185e-05, "loss": 2.4627155303955077, "memory(GiB)": 44.8, "step": 2805, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.456337 }, { "epoch": 0.12038901503791612, "grad_norm": 5.369457721710205, "learning_rate": 9.985702293462261e-05, "loss": 2.4614301681518556, "memory(GiB)": 44.8, "step": 2810, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.456445 }, { "epoch": 0.12060323036716507, "grad_norm": 3.3231163024902344, "learning_rate": 9.985651391047663e-05, "loss": 2.8215492248535154, "memory(GiB)": 44.8, "step": 2815, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.456758 }, { "epoch": 0.12081744569641403, "grad_norm": 2.909749984741211, "learning_rate": 9.985600398313313e-05, "loss": 2.706439399719238, "memory(GiB)": 44.8, "step": 2820, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 1.456957 }, { "epoch": 0.121031661025663, "grad_norm": 6.230549335479736, "learning_rate": 9.985549315260137e-05, "loss": 2.837756538391113, "memory(GiB)": 44.8, "step": 2825, "token_acc": 0.4246575342465753, "train_speed(iter/s)": 1.457146 }, { "epoch": 0.12124587635491196, "grad_norm": 3.530992269515991, "learning_rate": 9.985498141889056e-05, "loss": 2.4590328216552733, "memory(GiB)": 44.8, "step": 2830, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.457587 }, { "epoch": 0.12146009168416091, "grad_norm": 4.051492214202881, "learning_rate": 9.985446878201e-05, "loss": 2.934111404418945, "memory(GiB)": 44.8, "step": 2835, "token_acc": 0.40509915014164305, "train_speed(iter/s)": 1.458241 }, { "epoch": 0.12167430701340988, "grad_norm": 5.797842025756836, "learning_rate": 9.985395524196896e-05, "loss": 2.639509391784668, "memory(GiB)": 44.8, "step": 2840, "token_acc": 0.44525547445255476, "train_speed(iter/s)": 1.45777 }, { "epoch": 0.12188852234265884, "grad_norm": 3.338931083679199, "learning_rate": 9.985344079877677e-05, "loss": 2.498863410949707, "memory(GiB)": 44.8, "step": 2845, "token_acc": 0.44, "train_speed(iter/s)": 1.457512 }, { "epoch": 0.1221027376719078, "grad_norm": 3.9982433319091797, "learning_rate": 9.985292545244274e-05, "loss": 2.7938840866088865, "memory(GiB)": 44.8, "step": 2850, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.457688 }, { "epoch": 0.12231695300115676, "grad_norm": 3.668144941329956, "learning_rate": 9.985240920297618e-05, "loss": 2.768280029296875, "memory(GiB)": 44.8, "step": 2855, "token_acc": 0.43944636678200694, "train_speed(iter/s)": 1.457747 }, { "epoch": 0.12253116833040573, "grad_norm": 3.793884038925171, "learning_rate": 9.985189205038647e-05, "loss": 2.6596202850341797, "memory(GiB)": 44.8, "step": 2860, "token_acc": 0.46099290780141844, "train_speed(iter/s)": 1.458169 }, { "epoch": 0.12274538365965469, "grad_norm": 5.051962852478027, "learning_rate": 9.985137399468298e-05, "loss": 2.7195808410644533, "memory(GiB)": 44.8, "step": 2865, "token_acc": 0.4492753623188406, "train_speed(iter/s)": 1.458557 }, { "epoch": 0.12295959898890364, "grad_norm": 4.148073673248291, "learning_rate": 9.985085503587507e-05, "loss": 2.688954162597656, "memory(GiB)": 44.8, "step": 2870, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.458639 }, { "epoch": 0.12317381431815261, "grad_norm": 9.053473472595215, "learning_rate": 9.985033517397217e-05, "loss": 2.5241180419921876, "memory(GiB)": 44.8, "step": 2875, "token_acc": 0.4575645756457565, "train_speed(iter/s)": 1.458511 }, { "epoch": 0.12338802964740157, "grad_norm": 4.768235206604004, "learning_rate": 9.984981440898366e-05, "loss": 2.556084632873535, "memory(GiB)": 44.8, "step": 2880, "token_acc": 0.4724137931034483, "train_speed(iter/s)": 1.458841 }, { "epoch": 0.12360224497665052, "grad_norm": 3.2739498615264893, "learning_rate": 9.984929274091903e-05, "loss": 2.971639633178711, "memory(GiB)": 44.8, "step": 2885, "token_acc": 0.3942307692307692, "train_speed(iter/s)": 1.459376 }, { "epoch": 0.12381646030589949, "grad_norm": 3.549626111984253, "learning_rate": 9.984877016978768e-05, "loss": 2.8099727630615234, "memory(GiB)": 44.8, "step": 2890, "token_acc": 0.45272206303724927, "train_speed(iter/s)": 1.460056 }, { "epoch": 0.12403067563514845, "grad_norm": 2.765273094177246, "learning_rate": 9.984824669559911e-05, "loss": 2.5519046783447266, "memory(GiB)": 44.8, "step": 2895, "token_acc": 0.463855421686747, "train_speed(iter/s)": 1.460508 }, { "epoch": 0.1242448909643974, "grad_norm": 3.7635700702667236, "learning_rate": 9.984772231836279e-05, "loss": 2.78131217956543, "memory(GiB)": 44.8, "step": 2900, "token_acc": 0.4398496240601504, "train_speed(iter/s)": 1.460391 }, { "epoch": 0.12445910629364637, "grad_norm": 4.400283336639404, "learning_rate": 9.98471970380882e-05, "loss": 2.8672739028930665, "memory(GiB)": 47.63, "step": 2905, "token_acc": 0.4222222222222222, "train_speed(iter/s)": 1.45939 }, { "epoch": 0.12467332162289534, "grad_norm": 3.5733256340026855, "learning_rate": 9.98466708547849e-05, "loss": 2.6260372161865235, "memory(GiB)": 47.63, "step": 2910, "token_acc": 0.4370860927152318, "train_speed(iter/s)": 1.459296 }, { "epoch": 0.1248875369521443, "grad_norm": 2.9659810066223145, "learning_rate": 9.984614376846238e-05, "loss": 2.588872528076172, "memory(GiB)": 47.63, "step": 2915, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.459633 }, { "epoch": 0.12510175228139325, "grad_norm": 2.75057053565979, "learning_rate": 9.98456157791302e-05, "loss": 2.133049201965332, "memory(GiB)": 47.63, "step": 2920, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.459971 }, { "epoch": 0.1253159676106422, "grad_norm": 3.801823139190674, "learning_rate": 9.984508688679796e-05, "loss": 2.666956329345703, "memory(GiB)": 47.63, "step": 2925, "token_acc": 0.4978165938864629, "train_speed(iter/s)": 1.460505 }, { "epoch": 0.12553018293989118, "grad_norm": 2.849738597869873, "learning_rate": 9.984455709147519e-05, "loss": 2.7858362197875977, "memory(GiB)": 47.63, "step": 2930, "token_acc": 0.4401294498381877, "train_speed(iter/s)": 1.460717 }, { "epoch": 0.12574439826914013, "grad_norm": 3.5369927883148193, "learning_rate": 9.984402639317152e-05, "loss": 2.5744321823120115, "memory(GiB)": 47.63, "step": 2935, "token_acc": 0.46229508196721314, "train_speed(iter/s)": 1.46155 }, { "epoch": 0.1259586135983891, "grad_norm": 5.130538463592529, "learning_rate": 9.984349479189654e-05, "loss": 2.6871822357177733, "memory(GiB)": 47.63, "step": 2940, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.462424 }, { "epoch": 0.12617282892763806, "grad_norm": 4.449698448181152, "learning_rate": 9.984296228765991e-05, "loss": 2.8014768600463866, "memory(GiB)": 47.63, "step": 2945, "token_acc": 0.4135593220338983, "train_speed(iter/s)": 1.462565 }, { "epoch": 0.12638704425688702, "grad_norm": 3.4198319911956787, "learning_rate": 9.984242888047127e-05, "loss": 2.7933071136474608, "memory(GiB)": 47.63, "step": 2950, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.462375 }, { "epoch": 0.126601259586136, "grad_norm": 5.142146587371826, "learning_rate": 9.984189457034026e-05, "loss": 2.7760799407958983, "memory(GiB)": 47.63, "step": 2955, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.461615 }, { "epoch": 0.12681547491538495, "grad_norm": 4.261962413787842, "learning_rate": 9.984135935727657e-05, "loss": 2.888034629821777, "memory(GiB)": 47.63, "step": 2960, "token_acc": 0.41916167664670656, "train_speed(iter/s)": 1.461648 }, { "epoch": 0.1270296902446339, "grad_norm": 3.322274684906006, "learning_rate": 9.984082324128991e-05, "loss": 2.7487483978271485, "memory(GiB)": 47.63, "step": 2965, "token_acc": 0.41694915254237286, "train_speed(iter/s)": 1.461641 }, { "epoch": 0.12724390557388288, "grad_norm": 6.232876300811768, "learning_rate": 9.984028622238997e-05, "loss": 2.972730255126953, "memory(GiB)": 47.63, "step": 2970, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.461888 }, { "epoch": 0.12745812090313183, "grad_norm": 4.244607448577881, "learning_rate": 9.98397483005865e-05, "loss": 2.574996757507324, "memory(GiB)": 47.63, "step": 2975, "token_acc": 0.5221238938053098, "train_speed(iter/s)": 1.462386 }, { "epoch": 0.12767233623238078, "grad_norm": 3.4390854835510254, "learning_rate": 9.983920947588923e-05, "loss": 3.0913532257080076, "memory(GiB)": 47.63, "step": 2980, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.462509 }, { "epoch": 0.12788655156162976, "grad_norm": 3.9557533264160156, "learning_rate": 9.983866974830794e-05, "loss": 2.6116329193115235, "memory(GiB)": 47.63, "step": 2985, "token_acc": 0.44807121661721067, "train_speed(iter/s)": 1.462362 }, { "epoch": 0.1281007668908787, "grad_norm": 4.995320796966553, "learning_rate": 9.983812911785238e-05, "loss": 2.72194881439209, "memory(GiB)": 47.63, "step": 2990, "token_acc": 0.475, "train_speed(iter/s)": 1.462744 }, { "epoch": 0.12831498222012766, "grad_norm": 4.00553035736084, "learning_rate": 9.983758758453238e-05, "loss": 2.504826545715332, "memory(GiB)": 47.63, "step": 2995, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.462946 }, { "epoch": 0.12852919754937664, "grad_norm": 4.180451393127441, "learning_rate": 9.983704514835771e-05, "loss": 2.6988336563110353, "memory(GiB)": 47.63, "step": 3000, "token_acc": 0.47019867549668876, "train_speed(iter/s)": 1.463163 }, { "epoch": 0.12852919754937664, "eval_loss": 2.293785333633423, "eval_runtime": 14.3443, "eval_samples_per_second": 6.971, "eval_steps_per_second": 6.971, "eval_token_acc": 0.453015427769986, "step": 3000 }, { "epoch": 0.1287434128786256, "grad_norm": 4.106462001800537, "learning_rate": 9.983650180933822e-05, "loss": 2.8215705871582033, "memory(GiB)": 47.63, "step": 3005, "token_acc": 0.44668008048289737, "train_speed(iter/s)": 1.452652 }, { "epoch": 0.12895762820787454, "grad_norm": 7.2187347412109375, "learning_rate": 9.983595756748376e-05, "loss": 2.584635543823242, "memory(GiB)": 47.63, "step": 3010, "token_acc": 0.4431818181818182, "train_speed(iter/s)": 1.452488 }, { "epoch": 0.12917184353712352, "grad_norm": 4.628119945526123, "learning_rate": 9.983541242280417e-05, "loss": 2.9025840759277344, "memory(GiB)": 47.63, "step": 3015, "token_acc": 0.42024539877300615, "train_speed(iter/s)": 1.452558 }, { "epoch": 0.12938605886637247, "grad_norm": 4.669239044189453, "learning_rate": 9.983486637530934e-05, "loss": 2.6678781509399414, "memory(GiB)": 47.63, "step": 3020, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.453222 }, { "epoch": 0.12960027419562145, "grad_norm": 3.1429362297058105, "learning_rate": 9.983431942500915e-05, "loss": 2.8684564590454102, "memory(GiB)": 47.63, "step": 3025, "token_acc": 0.41389728096676737, "train_speed(iter/s)": 1.453618 }, { "epoch": 0.1298144895248704, "grad_norm": 2.8368780612945557, "learning_rate": 9.983377157191352e-05, "loss": 2.491801643371582, "memory(GiB)": 47.63, "step": 3030, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.453491 }, { "epoch": 0.13002870485411935, "grad_norm": 2.881819486618042, "learning_rate": 9.983322281603235e-05, "loss": 2.719405746459961, "memory(GiB)": 47.63, "step": 3035, "token_acc": 0.4368231046931408, "train_speed(iter/s)": 1.453369 }, { "epoch": 0.13024292018336833, "grad_norm": 4.35675048828125, "learning_rate": 9.983267315737563e-05, "loss": 2.4980485916137694, "memory(GiB)": 47.63, "step": 3040, "token_acc": 0.45041322314049587, "train_speed(iter/s)": 1.454165 }, { "epoch": 0.13045713551261728, "grad_norm": 3.4245173931121826, "learning_rate": 9.983212259595328e-05, "loss": 2.6589332580566407, "memory(GiB)": 47.63, "step": 3045, "token_acc": 0.4355400696864111, "train_speed(iter/s)": 1.454215 }, { "epoch": 0.13067135084186624, "grad_norm": 4.292496204376221, "learning_rate": 9.983157113177528e-05, "loss": 2.5680091857910154, "memory(GiB)": 47.63, "step": 3050, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.454931 }, { "epoch": 0.13088556617111521, "grad_norm": 3.379364490509033, "learning_rate": 9.983101876485162e-05, "loss": 3.025213623046875, "memory(GiB)": 47.63, "step": 3055, "token_acc": 0.43217665615141954, "train_speed(iter/s)": 1.455643 }, { "epoch": 0.13109978150036417, "grad_norm": 5.147782325744629, "learning_rate": 9.983046549519232e-05, "loss": 2.453817367553711, "memory(GiB)": 47.63, "step": 3060, "token_acc": 0.45528455284552843, "train_speed(iter/s)": 1.455458 }, { "epoch": 0.13131399682961312, "grad_norm": 4.744100570678711, "learning_rate": 9.982991132280739e-05, "loss": 2.5531290054321287, "memory(GiB)": 47.63, "step": 3065, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.455646 }, { "epoch": 0.1315282121588621, "grad_norm": 4.192126750946045, "learning_rate": 9.982935624770687e-05, "loss": 2.2457679748535155, "memory(GiB)": 47.63, "step": 3070, "token_acc": 0.515527950310559, "train_speed(iter/s)": 1.455399 }, { "epoch": 0.13174242748811105, "grad_norm": 3.6630325317382812, "learning_rate": 9.98288002699008e-05, "loss": 2.8035654067993163, "memory(GiB)": 47.63, "step": 3075, "token_acc": 0.43727598566308246, "train_speed(iter/s)": 1.455998 }, { "epoch": 0.13195664281736, "grad_norm": 4.754130840301514, "learning_rate": 9.982824338939929e-05, "loss": 2.475682830810547, "memory(GiB)": 47.63, "step": 3080, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 1.455822 }, { "epoch": 0.13217085814660898, "grad_norm": 4.164949893951416, "learning_rate": 9.98276856062124e-05, "loss": 2.553522300720215, "memory(GiB)": 47.63, "step": 3085, "token_acc": 0.4779116465863454, "train_speed(iter/s)": 1.455773 }, { "epoch": 0.13238507347585793, "grad_norm": 5.448624134063721, "learning_rate": 9.982712692035025e-05, "loss": 2.7034210205078124, "memory(GiB)": 47.63, "step": 3090, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.455792 }, { "epoch": 0.13259928880510688, "grad_norm": 3.43050217628479, "learning_rate": 9.982656733182296e-05, "loss": 2.0949424743652343, "memory(GiB)": 47.63, "step": 3095, "token_acc": 0.5330396475770925, "train_speed(iter/s)": 1.4562 }, { "epoch": 0.13281350413435586, "grad_norm": 4.507286071777344, "learning_rate": 9.982600684064065e-05, "loss": 2.70364933013916, "memory(GiB)": 47.63, "step": 3100, "token_acc": 0.4395973154362416, "train_speed(iter/s)": 1.456434 }, { "epoch": 0.1330277194636048, "grad_norm": 4.425159931182861, "learning_rate": 9.98254454468135e-05, "loss": 2.7447975158691404, "memory(GiB)": 47.63, "step": 3105, "token_acc": 0.391304347826087, "train_speed(iter/s)": 1.456779 }, { "epoch": 0.1332419347928538, "grad_norm": 3.55261492729187, "learning_rate": 9.982488315035166e-05, "loss": 2.6549182891845704, "memory(GiB)": 47.63, "step": 3110, "token_acc": 0.4664804469273743, "train_speed(iter/s)": 1.456559 }, { "epoch": 0.13345615012210274, "grad_norm": 4.19412899017334, "learning_rate": 9.982431995126531e-05, "loss": 2.496395492553711, "memory(GiB)": 47.63, "step": 3115, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.456263 }, { "epoch": 0.1336703654513517, "grad_norm": 3.5189948081970215, "learning_rate": 9.982375584956467e-05, "loss": 2.6310930252075195, "memory(GiB)": 47.63, "step": 3120, "token_acc": 0.42574257425742573, "train_speed(iter/s)": 1.456234 }, { "epoch": 0.13388458078060067, "grad_norm": 3.1445939540863037, "learning_rate": 9.982319084525995e-05, "loss": 2.954359436035156, "memory(GiB)": 47.63, "step": 3125, "token_acc": 0.42033898305084744, "train_speed(iter/s)": 1.456507 }, { "epoch": 0.13409879610984962, "grad_norm": 4.120058536529541, "learning_rate": 9.98226249383614e-05, "loss": 2.300580596923828, "memory(GiB)": 47.63, "step": 3130, "token_acc": 0.4618320610687023, "train_speed(iter/s)": 1.456747 }, { "epoch": 0.13431301143909857, "grad_norm": 3.6941018104553223, "learning_rate": 9.982205812887925e-05, "loss": 2.590833854675293, "memory(GiB)": 47.63, "step": 3135, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.457097 }, { "epoch": 0.13452722676834755, "grad_norm": 4.080898284912109, "learning_rate": 9.98214904168238e-05, "loss": 2.440216064453125, "memory(GiB)": 47.63, "step": 3140, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.45684 }, { "epoch": 0.1347414420975965, "grad_norm": 3.7610936164855957, "learning_rate": 9.98209218022053e-05, "loss": 2.539932632446289, "memory(GiB)": 47.63, "step": 3145, "token_acc": 0.46567164179104475, "train_speed(iter/s)": 1.456996 }, { "epoch": 0.13495565742684545, "grad_norm": 3.659843683242798, "learning_rate": 9.982035228503407e-05, "loss": 2.4131053924560546, "memory(GiB)": 47.63, "step": 3150, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.45732 }, { "epoch": 0.13516987275609443, "grad_norm": 4.357789993286133, "learning_rate": 9.981978186532041e-05, "loss": 2.670502853393555, "memory(GiB)": 47.63, "step": 3155, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.457363 }, { "epoch": 0.13538408808534338, "grad_norm": 3.6408872604370117, "learning_rate": 9.981921054307469e-05, "loss": 2.79071044921875, "memory(GiB)": 47.63, "step": 3160, "token_acc": 0.44886363636363635, "train_speed(iter/s)": 1.457951 }, { "epoch": 0.13559830341459234, "grad_norm": 3.7442562580108643, "learning_rate": 9.981863831830723e-05, "loss": 2.481209564208984, "memory(GiB)": 47.63, "step": 3165, "token_acc": 0.4968553459119497, "train_speed(iter/s)": 1.458041 }, { "epoch": 0.13581251874384132, "grad_norm": 3.842127799987793, "learning_rate": 9.98180651910284e-05, "loss": 2.656930923461914, "memory(GiB)": 47.63, "step": 3170, "token_acc": 0.45209580838323354, "train_speed(iter/s)": 1.45861 }, { "epoch": 0.13602673407309027, "grad_norm": 5.372028827667236, "learning_rate": 9.981749116124859e-05, "loss": 2.739255905151367, "memory(GiB)": 47.63, "step": 3175, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.458852 }, { "epoch": 0.13624094940233922, "grad_norm": 4.918059349060059, "learning_rate": 9.981691622897818e-05, "loss": 2.712474060058594, "memory(GiB)": 47.63, "step": 3180, "token_acc": 0.4389438943894389, "train_speed(iter/s)": 1.458461 }, { "epoch": 0.1364551647315882, "grad_norm": 3.887791156768799, "learning_rate": 9.981634039422761e-05, "loss": 2.9390422821044924, "memory(GiB)": 47.63, "step": 3185, "token_acc": 0.3894080996884735, "train_speed(iter/s)": 1.458451 }, { "epoch": 0.13666938006083715, "grad_norm": 3.610445976257324, "learning_rate": 9.98157636570073e-05, "loss": 2.300612449645996, "memory(GiB)": 47.63, "step": 3190, "token_acc": 0.5015479876160991, "train_speed(iter/s)": 1.458752 }, { "epoch": 0.13688359539008613, "grad_norm": 3.383845567703247, "learning_rate": 9.981518601732771e-05, "loss": 2.4171300888061524, "memory(GiB)": 47.63, "step": 3195, "token_acc": 0.5226480836236934, "train_speed(iter/s)": 1.458971 }, { "epoch": 0.13709781071933508, "grad_norm": 3.146437644958496, "learning_rate": 9.981460747519928e-05, "loss": 2.724361801147461, "memory(GiB)": 47.63, "step": 3200, "token_acc": 0.4158730158730159, "train_speed(iter/s)": 1.459388 }, { "epoch": 0.13731202604858403, "grad_norm": 3.7056562900543213, "learning_rate": 9.98140280306325e-05, "loss": 2.829375076293945, "memory(GiB)": 47.63, "step": 3205, "token_acc": 0.4489795918367347, "train_speed(iter/s)": 1.459722 }, { "epoch": 0.137526241377833, "grad_norm": 4.848848342895508, "learning_rate": 9.981344768363791e-05, "loss": 2.6239721298217775, "memory(GiB)": 47.63, "step": 3210, "token_acc": 0.43859649122807015, "train_speed(iter/s)": 1.458474 }, { "epoch": 0.13774045670708196, "grad_norm": 3.3288214206695557, "learning_rate": 9.981286643422596e-05, "loss": 2.5134204864501952, "memory(GiB)": 47.63, "step": 3215, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.459086 }, { "epoch": 0.1379546720363309, "grad_norm": 4.821554183959961, "learning_rate": 9.981228428240721e-05, "loss": 2.6251523971557615, "memory(GiB)": 47.63, "step": 3220, "token_acc": 0.4580152671755725, "train_speed(iter/s)": 1.45836 }, { "epoch": 0.1381688873655799, "grad_norm": 3.739990472793579, "learning_rate": 9.98117012281922e-05, "loss": 2.6130451202392577, "memory(GiB)": 47.63, "step": 3225, "token_acc": 0.49174917491749176, "train_speed(iter/s)": 1.458788 }, { "epoch": 0.13838310269482884, "grad_norm": 3.886059284210205, "learning_rate": 9.98111172715915e-05, "loss": 2.6881637573242188, "memory(GiB)": 47.63, "step": 3230, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.459028 }, { "epoch": 0.1385973180240778, "grad_norm": 3.547057867050171, "learning_rate": 9.981053241261567e-05, "loss": 2.7989946365356446, "memory(GiB)": 47.63, "step": 3235, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.458744 }, { "epoch": 0.13881153335332677, "grad_norm": 3.5991673469543457, "learning_rate": 9.980994665127535e-05, "loss": 2.447433853149414, "memory(GiB)": 47.63, "step": 3240, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.45873 }, { "epoch": 0.13902574868257572, "grad_norm": 4.359513282775879, "learning_rate": 9.980935998758109e-05, "loss": 2.5521196365356444, "memory(GiB)": 47.63, "step": 3245, "token_acc": 0.46946564885496184, "train_speed(iter/s)": 1.458775 }, { "epoch": 0.13923996401182467, "grad_norm": 3.155566692352295, "learning_rate": 9.980877242154356e-05, "loss": 2.5134252548217773, "memory(GiB)": 47.63, "step": 3250, "token_acc": 0.48328267477203646, "train_speed(iter/s)": 1.458647 }, { "epoch": 0.13945417934107365, "grad_norm": 3.671736717224121, "learning_rate": 9.980818395317339e-05, "loss": 2.5342523574829103, "memory(GiB)": 47.63, "step": 3255, "token_acc": 0.46886446886446886, "train_speed(iter/s)": 1.458844 }, { "epoch": 0.1396683946703226, "grad_norm": 2.9696121215820312, "learning_rate": 9.980759458248125e-05, "loss": 2.466375732421875, "memory(GiB)": 47.63, "step": 3260, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.458919 }, { "epoch": 0.13988260999957156, "grad_norm": 4.028738975524902, "learning_rate": 9.98070043094778e-05, "loss": 2.8555065155029298, "memory(GiB)": 47.63, "step": 3265, "token_acc": 0.45426829268292684, "train_speed(iter/s)": 1.458649 }, { "epoch": 0.14009682532882053, "grad_norm": 3.731355667114258, "learning_rate": 9.980641313417376e-05, "loss": 2.763540267944336, "memory(GiB)": 47.63, "step": 3270, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.458694 }, { "epoch": 0.14031104065806949, "grad_norm": 4.28919792175293, "learning_rate": 9.98058210565798e-05, "loss": 2.7386585235595704, "memory(GiB)": 47.63, "step": 3275, "token_acc": 0.46204620462046203, "train_speed(iter/s)": 1.45912 }, { "epoch": 0.14052525598731846, "grad_norm": 4.4661970138549805, "learning_rate": 9.980522807670669e-05, "loss": 2.326940155029297, "memory(GiB)": 47.63, "step": 3280, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.459468 }, { "epoch": 0.14073947131656742, "grad_norm": 4.619133949279785, "learning_rate": 9.980463419456516e-05, "loss": 2.806599998474121, "memory(GiB)": 47.63, "step": 3285, "token_acc": 0.4358974358974359, "train_speed(iter/s)": 1.46012 }, { "epoch": 0.14095368664581637, "grad_norm": 4.316283702850342, "learning_rate": 9.980403941016594e-05, "loss": 2.7667058944702148, "memory(GiB)": 47.63, "step": 3290, "token_acc": 0.4435146443514644, "train_speed(iter/s)": 1.46037 }, { "epoch": 0.14116790197506535, "grad_norm": 3.754873752593994, "learning_rate": 9.980344372351983e-05, "loss": 2.634560775756836, "memory(GiB)": 47.63, "step": 3295, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.460697 }, { "epoch": 0.1413821173043143, "grad_norm": 4.2558913230896, "learning_rate": 9.980284713463763e-05, "loss": 2.62609806060791, "memory(GiB)": 47.63, "step": 3300, "token_acc": 0.47876447876447875, "train_speed(iter/s)": 1.461006 }, { "epoch": 0.14159633263356325, "grad_norm": 4.759919166564941, "learning_rate": 9.980224964353014e-05, "loss": 2.6200397491455076, "memory(GiB)": 47.63, "step": 3305, "token_acc": 0.4217687074829932, "train_speed(iter/s)": 1.4612 }, { "epoch": 0.14181054796281223, "grad_norm": 3.752591848373413, "learning_rate": 9.980165125020816e-05, "loss": 2.603768539428711, "memory(GiB)": 47.63, "step": 3310, "token_acc": 0.4491803278688525, "train_speed(iter/s)": 1.461194 }, { "epoch": 0.14202476329206118, "grad_norm": 3.5088789463043213, "learning_rate": 9.980105195468256e-05, "loss": 2.9116655349731446, "memory(GiB)": 47.63, "step": 3315, "token_acc": 0.41785714285714287, "train_speed(iter/s)": 1.460879 }, { "epoch": 0.14223897862131013, "grad_norm": 5.019904613494873, "learning_rate": 9.980045175696418e-05, "loss": 2.826046371459961, "memory(GiB)": 47.63, "step": 3320, "token_acc": 0.4197183098591549, "train_speed(iter/s)": 1.460983 }, { "epoch": 0.1424531939505591, "grad_norm": 4.364889621734619, "learning_rate": 9.97998506570639e-05, "loss": 2.498710823059082, "memory(GiB)": 47.63, "step": 3325, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.461307 }, { "epoch": 0.14266740927980806, "grad_norm": 2.8879406452178955, "learning_rate": 9.979924865499262e-05, "loss": 2.739917182922363, "memory(GiB)": 47.63, "step": 3330, "token_acc": 0.44668587896253603, "train_speed(iter/s)": 1.461107 }, { "epoch": 0.142881624609057, "grad_norm": 3.873210906982422, "learning_rate": 9.979864575076124e-05, "loss": 2.6274206161499025, "memory(GiB)": 47.63, "step": 3335, "token_acc": 0.4588235294117647, "train_speed(iter/s)": 1.461477 }, { "epoch": 0.143095839938306, "grad_norm": 3.0949182510375977, "learning_rate": 9.979804194438065e-05, "loss": 2.3082038879394533, "memory(GiB)": 47.63, "step": 3340, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.461001 }, { "epoch": 0.14331005526755494, "grad_norm": 4.63090181350708, "learning_rate": 9.979743723586184e-05, "loss": 2.840785598754883, "memory(GiB)": 47.63, "step": 3345, "token_acc": 0.422360248447205, "train_speed(iter/s)": 1.460772 }, { "epoch": 0.1435242705968039, "grad_norm": 3.684236526489258, "learning_rate": 9.979683162521573e-05, "loss": 2.8608634948730467, "memory(GiB)": 47.63, "step": 3350, "token_acc": 0.4337748344370861, "train_speed(iter/s)": 1.461542 }, { "epoch": 0.14373848592605287, "grad_norm": 4.538915634155273, "learning_rate": 9.97962251124533e-05, "loss": 2.7592752456665037, "memory(GiB)": 47.63, "step": 3355, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.461603 }, { "epoch": 0.14395270125530182, "grad_norm": 4.658306121826172, "learning_rate": 9.979561769758555e-05, "loss": 2.6472780227661135, "memory(GiB)": 47.63, "step": 3360, "token_acc": 0.4359861591695502, "train_speed(iter/s)": 1.461189 }, { "epoch": 0.1441669165845508, "grad_norm": 3.5684494972229004, "learning_rate": 9.979500938062345e-05, "loss": 2.387424087524414, "memory(GiB)": 47.63, "step": 3365, "token_acc": 0.48134328358208955, "train_speed(iter/s)": 1.461779 }, { "epoch": 0.14438113191379975, "grad_norm": 6.197642803192139, "learning_rate": 9.979440016157805e-05, "loss": 2.476662445068359, "memory(GiB)": 47.63, "step": 3370, "token_acc": 0.43309859154929575, "train_speed(iter/s)": 1.461999 }, { "epoch": 0.1445953472430487, "grad_norm": 3.708441972732544, "learning_rate": 9.97937900404604e-05, "loss": 2.873057174682617, "memory(GiB)": 47.63, "step": 3375, "token_acc": 0.43354430379746833, "train_speed(iter/s)": 1.462099 }, { "epoch": 0.14480956257229768, "grad_norm": 4.045034408569336, "learning_rate": 9.979317901728153e-05, "loss": 2.8710140228271483, "memory(GiB)": 47.63, "step": 3380, "token_acc": 0.45936395759717313, "train_speed(iter/s)": 1.462166 }, { "epoch": 0.14502377790154664, "grad_norm": 3.011522054672241, "learning_rate": 9.979256709205251e-05, "loss": 2.744778633117676, "memory(GiB)": 47.63, "step": 3385, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.461826 }, { "epoch": 0.1452379932307956, "grad_norm": 3.169530153274536, "learning_rate": 9.979195426478443e-05, "loss": 2.4550779342651365, "memory(GiB)": 47.63, "step": 3390, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.462158 }, { "epoch": 0.14545220856004457, "grad_norm": 4.8921380043029785, "learning_rate": 9.979134053548838e-05, "loss": 2.5971981048583985, "memory(GiB)": 47.63, "step": 3395, "token_acc": 0.3935018050541516, "train_speed(iter/s)": 1.461687 }, { "epoch": 0.14566642388929352, "grad_norm": 3.600477695465088, "learning_rate": 9.979072590417549e-05, "loss": 2.3302986145019533, "memory(GiB)": 47.63, "step": 3400, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.462275 }, { "epoch": 0.14588063921854247, "grad_norm": 6.550259590148926, "learning_rate": 9.97901103708569e-05, "loss": 2.7483451843261717, "memory(GiB)": 47.63, "step": 3405, "token_acc": 0.48120300751879697, "train_speed(iter/s)": 1.462555 }, { "epoch": 0.14609485454779145, "grad_norm": 4.765453815460205, "learning_rate": 9.978949393554374e-05, "loss": 2.8481876373291017, "memory(GiB)": 47.63, "step": 3410, "token_acc": 0.4467455621301775, "train_speed(iter/s)": 1.462951 }, { "epoch": 0.1463090698770404, "grad_norm": 4.505581378936768, "learning_rate": 9.978887659824721e-05, "loss": 2.6652400970458983, "memory(GiB)": 47.63, "step": 3415, "token_acc": 0.48398576512455516, "train_speed(iter/s)": 1.462657 }, { "epoch": 0.14652328520628935, "grad_norm": 3.9403669834136963, "learning_rate": 9.978825835897844e-05, "loss": 2.4215864181518554, "memory(GiB)": 47.63, "step": 3420, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.462608 }, { "epoch": 0.14673750053553833, "grad_norm": 4.667635440826416, "learning_rate": 9.978763921774869e-05, "loss": 2.425082206726074, "memory(GiB)": 47.63, "step": 3425, "token_acc": 0.4536082474226804, "train_speed(iter/s)": 1.462776 }, { "epoch": 0.14695171586478728, "grad_norm": 3.6880412101745605, "learning_rate": 9.978701917456916e-05, "loss": 2.4595867156982423, "memory(GiB)": 47.63, "step": 3430, "token_acc": 0.49258160237388726, "train_speed(iter/s)": 1.462755 }, { "epoch": 0.14716593119403626, "grad_norm": 3.2978837490081787, "learning_rate": 9.978639822945107e-05, "loss": 2.511236000061035, "memory(GiB)": 47.63, "step": 3435, "token_acc": 0.4511784511784512, "train_speed(iter/s)": 1.46316 }, { "epoch": 0.1473801465232852, "grad_norm": 3.917314291000366, "learning_rate": 9.978577638240567e-05, "loss": 2.7325916290283203, "memory(GiB)": 47.63, "step": 3440, "token_acc": 0.4581818181818182, "train_speed(iter/s)": 1.463603 }, { "epoch": 0.14759436185253416, "grad_norm": 3.8789377212524414, "learning_rate": 9.978515363344422e-05, "loss": 2.5562042236328124, "memory(GiB)": 47.63, "step": 3445, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.464349 }, { "epoch": 0.14780857718178314, "grad_norm": 7.125819206237793, "learning_rate": 9.978452998257801e-05, "loss": 2.5441293716430664, "memory(GiB)": 47.63, "step": 3450, "token_acc": 0.44871794871794873, "train_speed(iter/s)": 1.464385 }, { "epoch": 0.1480227925110321, "grad_norm": 4.757804870605469, "learning_rate": 9.978390542981835e-05, "loss": 2.660295104980469, "memory(GiB)": 47.63, "step": 3455, "token_acc": 0.43555555555555553, "train_speed(iter/s)": 1.464718 }, { "epoch": 0.14823700784028104, "grad_norm": 4.2779107093811035, "learning_rate": 9.978327997517652e-05, "loss": 2.7758384704589845, "memory(GiB)": 47.63, "step": 3460, "token_acc": 0.43795620437956206, "train_speed(iter/s)": 1.465205 }, { "epoch": 0.14845122316953002, "grad_norm": 3.928638219833374, "learning_rate": 9.978265361866389e-05, "loss": 2.592887115478516, "memory(GiB)": 47.63, "step": 3465, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.465435 }, { "epoch": 0.14866543849877897, "grad_norm": 3.746091842651367, "learning_rate": 9.978202636029179e-05, "loss": 2.6098140716552733, "memory(GiB)": 47.63, "step": 3470, "token_acc": 0.4689265536723164, "train_speed(iter/s)": 1.465081 }, { "epoch": 0.14887965382802792, "grad_norm": 4.820042610168457, "learning_rate": 9.978139820007158e-05, "loss": 2.5279256820678713, "memory(GiB)": 47.63, "step": 3475, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.464572 }, { "epoch": 0.1490938691572769, "grad_norm": 4.189342975616455, "learning_rate": 9.978076913801464e-05, "loss": 2.3815740585327148, "memory(GiB)": 47.63, "step": 3480, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.464616 }, { "epoch": 0.14930808448652585, "grad_norm": 6.875144958496094, "learning_rate": 9.978013917413237e-05, "loss": 2.633605194091797, "memory(GiB)": 47.63, "step": 3485, "token_acc": 0.4875, "train_speed(iter/s)": 1.465166 }, { "epoch": 0.1495222998157748, "grad_norm": 2.6285159587860107, "learning_rate": 9.977950830843619e-05, "loss": 2.563618850708008, "memory(GiB)": 47.63, "step": 3490, "token_acc": 0.4487534626038781, "train_speed(iter/s)": 1.465612 }, { "epoch": 0.14973651514502379, "grad_norm": 7.238272190093994, "learning_rate": 9.977887654093751e-05, "loss": 2.6613895416259767, "memory(GiB)": 47.63, "step": 3495, "token_acc": 0.48398576512455516, "train_speed(iter/s)": 1.465451 }, { "epoch": 0.14995073047427274, "grad_norm": 3.0726985931396484, "learning_rate": 9.977824387164778e-05, "loss": 2.343222999572754, "memory(GiB)": 47.63, "step": 3500, "token_acc": 0.53125, "train_speed(iter/s)": 1.465652 }, { "epoch": 0.14995073047427274, "eval_loss": 2.3537485599517822, "eval_runtime": 14.0822, "eval_samples_per_second": 7.101, "eval_steps_per_second": 7.101, "eval_token_acc": 0.4715447154471545, "step": 3500 }, { "epoch": 0.1501649458035217, "grad_norm": 3.4169580936431885, "learning_rate": 9.977761030057847e-05, "loss": 2.290047836303711, "memory(GiB)": 47.63, "step": 3505, "token_acc": 0.49447236180904525, "train_speed(iter/s)": 1.456895 }, { "epoch": 0.15037916113277067, "grad_norm": 3.6757216453552246, "learning_rate": 9.977697582774106e-05, "loss": 2.4777524948120115, "memory(GiB)": 47.63, "step": 3510, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.456862 }, { "epoch": 0.15059337646201962, "grad_norm": 4.6084513664245605, "learning_rate": 9.977634045314703e-05, "loss": 2.6394153594970704, "memory(GiB)": 47.63, "step": 3515, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.456717 }, { "epoch": 0.1508075917912686, "grad_norm": 3.7747673988342285, "learning_rate": 9.977570417680791e-05, "loss": 2.7754934310913084, "memory(GiB)": 47.63, "step": 3520, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.45684 }, { "epoch": 0.15102180712051755, "grad_norm": 3.9443018436431885, "learning_rate": 9.977506699873521e-05, "loss": 2.4865036010742188, "memory(GiB)": 47.63, "step": 3525, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.457102 }, { "epoch": 0.1512360224497665, "grad_norm": 3.8653881549835205, "learning_rate": 9.977442891894048e-05, "loss": 2.498517608642578, "memory(GiB)": 47.63, "step": 3530, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.457026 }, { "epoch": 0.15145023777901548, "grad_norm": 3.7123584747314453, "learning_rate": 9.977378993743528e-05, "loss": 2.8637248992919924, "memory(GiB)": 47.63, "step": 3535, "token_acc": 0.4596774193548387, "train_speed(iter/s)": 1.457719 }, { "epoch": 0.15166445310826443, "grad_norm": 4.383836269378662, "learning_rate": 9.977315005423117e-05, "loss": 2.6163511276245117, "memory(GiB)": 47.63, "step": 3540, "token_acc": 0.4537037037037037, "train_speed(iter/s)": 1.457578 }, { "epoch": 0.15187866843751338, "grad_norm": 2.944066047668457, "learning_rate": 9.977250926933977e-05, "loss": 2.6205913543701174, "memory(GiB)": 47.63, "step": 3545, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.457971 }, { "epoch": 0.15209288376676236, "grad_norm": 3.992919445037842, "learning_rate": 9.977186758277268e-05, "loss": 2.6933469772338867, "memory(GiB)": 47.63, "step": 3550, "token_acc": 0.41818181818181815, "train_speed(iter/s)": 1.457775 }, { "epoch": 0.1523070990960113, "grad_norm": 7.702213764190674, "learning_rate": 9.97712249945415e-05, "loss": 2.7108774185180664, "memory(GiB)": 47.63, "step": 3555, "token_acc": 0.4452054794520548, "train_speed(iter/s)": 1.457589 }, { "epoch": 0.15252131442526026, "grad_norm": 3.4477903842926025, "learning_rate": 9.97705815046579e-05, "loss": 2.6474851608276366, "memory(GiB)": 47.63, "step": 3560, "token_acc": 0.45357142857142857, "train_speed(iter/s)": 1.457768 }, { "epoch": 0.15273552975450924, "grad_norm": 3.7272629737854004, "learning_rate": 9.976993711313354e-05, "loss": 2.744158935546875, "memory(GiB)": 47.63, "step": 3565, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.458 }, { "epoch": 0.1529497450837582, "grad_norm": 3.5521841049194336, "learning_rate": 9.976929181998007e-05, "loss": 2.6562219619750977, "memory(GiB)": 47.63, "step": 3570, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.458203 }, { "epoch": 0.15316396041300714, "grad_norm": 4.787902355194092, "learning_rate": 9.976864562520918e-05, "loss": 2.75551815032959, "memory(GiB)": 47.63, "step": 3575, "token_acc": 0.4262948207171315, "train_speed(iter/s)": 1.458356 }, { "epoch": 0.15337817574225612, "grad_norm": 2.84358286857605, "learning_rate": 9.976799852883261e-05, "loss": 2.6625064849853515, "memory(GiB)": 47.63, "step": 3580, "token_acc": 0.4413793103448276, "train_speed(iter/s)": 1.458068 }, { "epoch": 0.15359239107150507, "grad_norm": 4.125617027282715, "learning_rate": 9.976735053086204e-05, "loss": 2.428995895385742, "memory(GiB)": 47.63, "step": 3585, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.457472 }, { "epoch": 0.15380660640075403, "grad_norm": 3.760828733444214, "learning_rate": 9.976670163130924e-05, "loss": 2.9189868927001954, "memory(GiB)": 47.63, "step": 3590, "token_acc": 0.4415204678362573, "train_speed(iter/s)": 1.45773 }, { "epoch": 0.154020821730003, "grad_norm": 3.6647214889526367, "learning_rate": 9.976605183018594e-05, "loss": 2.695289421081543, "memory(GiB)": 47.63, "step": 3595, "token_acc": 0.43573667711598746, "train_speed(iter/s)": 1.458066 }, { "epoch": 0.15423503705925196, "grad_norm": 3.5985748767852783, "learning_rate": 9.976540112750394e-05, "loss": 2.5852956771850586, "memory(GiB)": 47.63, "step": 3600, "token_acc": 0.4532871972318339, "train_speed(iter/s)": 1.457399 }, { "epoch": 0.15444925238850093, "grad_norm": 3.0911688804626465, "learning_rate": 9.976474952327502e-05, "loss": 2.802461051940918, "memory(GiB)": 47.63, "step": 3605, "token_acc": 0.4758364312267658, "train_speed(iter/s)": 1.457697 }, { "epoch": 0.1546634677177499, "grad_norm": 4.63688325881958, "learning_rate": 9.976409701751097e-05, "loss": 2.487857627868652, "memory(GiB)": 47.63, "step": 3610, "token_acc": 0.48398576512455516, "train_speed(iter/s)": 1.457331 }, { "epoch": 0.15487768304699884, "grad_norm": 3.1934165954589844, "learning_rate": 9.976344361022363e-05, "loss": 2.3247928619384766, "memory(GiB)": 47.63, "step": 3615, "token_acc": 0.5115384615384615, "train_speed(iter/s)": 1.457743 }, { "epoch": 0.15509189837624782, "grad_norm": 3.4557573795318604, "learning_rate": 9.976278930142482e-05, "loss": 2.5618444442749024, "memory(GiB)": 47.63, "step": 3620, "token_acc": 0.49224806201550386, "train_speed(iter/s)": 1.458065 }, { "epoch": 0.15530611370549677, "grad_norm": 4.077261447906494, "learning_rate": 9.97621340911264e-05, "loss": 2.6343793869018555, "memory(GiB)": 47.63, "step": 3625, "token_acc": 0.43944636678200694, "train_speed(iter/s)": 1.458295 }, { "epoch": 0.15552032903474572, "grad_norm": 3.412285566329956, "learning_rate": 9.976147797934024e-05, "loss": 2.5630825042724608, "memory(GiB)": 47.63, "step": 3630, "token_acc": 0.44375, "train_speed(iter/s)": 1.457827 }, { "epoch": 0.1557345443639947, "grad_norm": 3.526050329208374, "learning_rate": 9.976082096607823e-05, "loss": 2.741041564941406, "memory(GiB)": 47.63, "step": 3635, "token_acc": 0.4104938271604938, "train_speed(iter/s)": 1.457739 }, { "epoch": 0.15594875969324365, "grad_norm": 5.315544605255127, "learning_rate": 9.976016305135228e-05, "loss": 2.279109573364258, "memory(GiB)": 47.63, "step": 3640, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.457882 }, { "epoch": 0.1561629750224926, "grad_norm": 3.2124979496002197, "learning_rate": 9.975950423517426e-05, "loss": 2.6411577224731446, "memory(GiB)": 47.63, "step": 3645, "token_acc": 0.45182724252491696, "train_speed(iter/s)": 1.4577 }, { "epoch": 0.15637719035174158, "grad_norm": 7.270047187805176, "learning_rate": 9.975884451755618e-05, "loss": 2.6146183013916016, "memory(GiB)": 47.63, "step": 3650, "token_acc": 0.44983818770226536, "train_speed(iter/s)": 1.457684 }, { "epoch": 0.15659140568099053, "grad_norm": 3.6394429206848145, "learning_rate": 9.975818389850993e-05, "loss": 2.5720624923706055, "memory(GiB)": 47.63, "step": 3655, "token_acc": 0.4732824427480916, "train_speed(iter/s)": 1.457808 }, { "epoch": 0.15680562101023948, "grad_norm": 3.7785966396331787, "learning_rate": 9.975752237804752e-05, "loss": 2.522658920288086, "memory(GiB)": 47.63, "step": 3660, "token_acc": 0.43006993006993005, "train_speed(iter/s)": 1.457458 }, { "epoch": 0.15701983633948846, "grad_norm": 3.6492886543273926, "learning_rate": 9.975685995618092e-05, "loss": 2.747486686706543, "memory(GiB)": 47.63, "step": 3665, "token_acc": 0.4624505928853755, "train_speed(iter/s)": 1.457678 }, { "epoch": 0.1572340516687374, "grad_norm": 4.997968673706055, "learning_rate": 9.97561966329221e-05, "loss": 2.707684135437012, "memory(GiB)": 47.63, "step": 3670, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.457882 }, { "epoch": 0.15744826699798636, "grad_norm": 2.9105184078216553, "learning_rate": 9.975553240828312e-05, "loss": 2.459530258178711, "memory(GiB)": 47.63, "step": 3675, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.458198 }, { "epoch": 0.15766248232723534, "grad_norm": 3.2676429748535156, "learning_rate": 9.975486728227599e-05, "loss": 2.582370567321777, "memory(GiB)": 47.63, "step": 3680, "token_acc": 0.4420485175202156, "train_speed(iter/s)": 1.458089 }, { "epoch": 0.1578766976564843, "grad_norm": 4.302821159362793, "learning_rate": 9.975420125491277e-05, "loss": 2.7656349182128905, "memory(GiB)": 47.63, "step": 3685, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.45836 }, { "epoch": 0.15809091298573327, "grad_norm": 3.3045613765716553, "learning_rate": 9.975353432620552e-05, "loss": 2.489249038696289, "memory(GiB)": 47.63, "step": 3690, "token_acc": 0.4940239043824701, "train_speed(iter/s)": 1.458707 }, { "epoch": 0.15830512831498222, "grad_norm": 5.259227275848389, "learning_rate": 9.975286649616633e-05, "loss": 2.4811010360717773, "memory(GiB)": 47.63, "step": 3695, "token_acc": 0.48739495798319327, "train_speed(iter/s)": 1.458281 }, { "epoch": 0.15851934364423118, "grad_norm": 5.344473361968994, "learning_rate": 9.975219776480727e-05, "loss": 2.438692092895508, "memory(GiB)": 47.63, "step": 3700, "token_acc": 0.5073529411764706, "train_speed(iter/s)": 1.458013 }, { "epoch": 0.15873355897348015, "grad_norm": 4.790504455566406, "learning_rate": 9.97515281321405e-05, "loss": 2.609920120239258, "memory(GiB)": 47.63, "step": 3705, "token_acc": 0.4855072463768116, "train_speed(iter/s)": 1.458119 }, { "epoch": 0.1589477743027291, "grad_norm": 3.29830002784729, "learning_rate": 9.975085759817811e-05, "loss": 2.884556770324707, "memory(GiB)": 47.63, "step": 3710, "token_acc": 0.4303030303030303, "train_speed(iter/s)": 1.457928 }, { "epoch": 0.15916198963197806, "grad_norm": 6.27782678604126, "learning_rate": 9.975018616293228e-05, "loss": 2.6151010513305666, "memory(GiB)": 47.63, "step": 3715, "token_acc": 0.47, "train_speed(iter/s)": 1.458229 }, { "epoch": 0.15937620496122704, "grad_norm": 4.553070545196533, "learning_rate": 9.974951382641516e-05, "loss": 2.7687604904174803, "memory(GiB)": 47.63, "step": 3720, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.458198 }, { "epoch": 0.159590420290476, "grad_norm": 5.753072261810303, "learning_rate": 9.97488405886389e-05, "loss": 2.497258758544922, "memory(GiB)": 47.63, "step": 3725, "token_acc": 0.47232472324723246, "train_speed(iter/s)": 1.45841 }, { "epoch": 0.15980463561972494, "grad_norm": 5.164744853973389, "learning_rate": 9.974816644961575e-05, "loss": 2.6047916412353516, "memory(GiB)": 47.63, "step": 3730, "token_acc": 0.42391304347826086, "train_speed(iter/s)": 1.459016 }, { "epoch": 0.16001885094897392, "grad_norm": 4.997974395751953, "learning_rate": 9.974749140935789e-05, "loss": 2.688969612121582, "memory(GiB)": 47.63, "step": 3735, "token_acc": 0.4460431654676259, "train_speed(iter/s)": 1.459304 }, { "epoch": 0.16023306627822287, "grad_norm": 3.5237510204315186, "learning_rate": 9.974681546787755e-05, "loss": 2.5835519790649415, "memory(GiB)": 47.63, "step": 3740, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.459492 }, { "epoch": 0.16044728160747182, "grad_norm": 4.978376388549805, "learning_rate": 9.974613862518699e-05, "loss": 2.5920886993408203, "memory(GiB)": 47.63, "step": 3745, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.459628 }, { "epoch": 0.1606614969367208, "grad_norm": 3.2934536933898926, "learning_rate": 9.974546088129846e-05, "loss": 2.704619598388672, "memory(GiB)": 47.63, "step": 3750, "token_acc": 0.4688427299703264, "train_speed(iter/s)": 1.459236 }, { "epoch": 0.16087571226596975, "grad_norm": 3.007472515106201, "learning_rate": 9.974478223622424e-05, "loss": 2.632056999206543, "memory(GiB)": 47.63, "step": 3755, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.459154 }, { "epoch": 0.1610899275952187, "grad_norm": 3.6066036224365234, "learning_rate": 9.974410268997662e-05, "loss": 2.520039367675781, "memory(GiB)": 47.63, "step": 3760, "token_acc": 0.4674329501915709, "train_speed(iter/s)": 1.459432 }, { "epoch": 0.16130414292446768, "grad_norm": 4.061606407165527, "learning_rate": 9.974342224256795e-05, "loss": 2.6116653442382813, "memory(GiB)": 47.63, "step": 3765, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.459787 }, { "epoch": 0.16151835825371663, "grad_norm": 3.367438554763794, "learning_rate": 9.974274089401048e-05, "loss": 2.678365707397461, "memory(GiB)": 47.63, "step": 3770, "token_acc": 0.44664031620553357, "train_speed(iter/s)": 1.460093 }, { "epoch": 0.1617325735829656, "grad_norm": 7.6087727546691895, "learning_rate": 9.974205864431661e-05, "loss": 2.593564033508301, "memory(GiB)": 47.63, "step": 3775, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.460482 }, { "epoch": 0.16194678891221456, "grad_norm": 4.246761798858643, "learning_rate": 9.974137549349869e-05, "loss": 2.468999671936035, "memory(GiB)": 47.63, "step": 3780, "token_acc": 0.5, "train_speed(iter/s)": 1.460402 }, { "epoch": 0.1621610042414635, "grad_norm": 3.301891803741455, "learning_rate": 9.97406914415691e-05, "loss": 2.7166805267333984, "memory(GiB)": 47.63, "step": 3785, "token_acc": 0.45180722891566266, "train_speed(iter/s)": 1.460596 }, { "epoch": 0.1623752195707125, "grad_norm": 4.316013813018799, "learning_rate": 9.97400064885402e-05, "loss": 2.4938470840454103, "memory(GiB)": 47.63, "step": 3790, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.460808 }, { "epoch": 0.16258943489996144, "grad_norm": 6.042535781860352, "learning_rate": 9.973932063442444e-05, "loss": 2.47210750579834, "memory(GiB)": 47.63, "step": 3795, "token_acc": 0.5083333333333333, "train_speed(iter/s)": 1.461412 }, { "epoch": 0.1628036502292104, "grad_norm": 3.7296314239501953, "learning_rate": 9.973863387923423e-05, "loss": 2.7515296936035156, "memory(GiB)": 47.63, "step": 3800, "token_acc": 0.4540059347181009, "train_speed(iter/s)": 1.461468 }, { "epoch": 0.16301786555845937, "grad_norm": 3.433135747909546, "learning_rate": 9.9737946222982e-05, "loss": 2.6556316375732423, "memory(GiB)": 47.63, "step": 3805, "token_acc": 0.4712328767123288, "train_speed(iter/s)": 1.461323 }, { "epoch": 0.16323208088770833, "grad_norm": 3.6524879932403564, "learning_rate": 9.973725766568023e-05, "loss": 2.7036582946777346, "memory(GiB)": 47.63, "step": 3810, "token_acc": 0.4463087248322148, "train_speed(iter/s)": 1.461134 }, { "epoch": 0.16344629621695728, "grad_norm": 3.3620219230651855, "learning_rate": 9.973656820734136e-05, "loss": 2.4934627532958986, "memory(GiB)": 47.63, "step": 3815, "token_acc": 0.5117056856187291, "train_speed(iter/s)": 1.461772 }, { "epoch": 0.16366051154620626, "grad_norm": 3.825281858444214, "learning_rate": 9.973587784797791e-05, "loss": 2.373024749755859, "memory(GiB)": 47.63, "step": 3820, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.462295 }, { "epoch": 0.1638747268754552, "grad_norm": 3.9945309162139893, "learning_rate": 9.973518658760236e-05, "loss": 2.5321113586425783, "memory(GiB)": 47.63, "step": 3825, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 1.462047 }, { "epoch": 0.16408894220470416, "grad_norm": 3.375340223312378, "learning_rate": 9.973449442622729e-05, "loss": 3.001796913146973, "memory(GiB)": 47.63, "step": 3830, "token_acc": 0.40641711229946526, "train_speed(iter/s)": 1.461967 }, { "epoch": 0.16430315753395314, "grad_norm": 4.272892475128174, "learning_rate": 9.973380136386516e-05, "loss": 2.889964294433594, "memory(GiB)": 47.63, "step": 3835, "token_acc": 0.44964028776978415, "train_speed(iter/s)": 1.462502 }, { "epoch": 0.1645173728632021, "grad_norm": 4.154975414276123, "learning_rate": 9.973310740052859e-05, "loss": 2.9697288513183593, "memory(GiB)": 47.63, "step": 3840, "token_acc": 0.43617021276595747, "train_speed(iter/s)": 1.462468 }, { "epoch": 0.16473158819245104, "grad_norm": 3.1028828620910645, "learning_rate": 9.973241253623011e-05, "loss": 2.0401206970214845, "memory(GiB)": 47.63, "step": 3845, "token_acc": 0.5498154981549815, "train_speed(iter/s)": 1.462328 }, { "epoch": 0.16494580352170002, "grad_norm": 8.306822776794434, "learning_rate": 9.973171677098233e-05, "loss": 2.6747810363769533, "memory(GiB)": 47.63, "step": 3850, "token_acc": 0.44485294117647056, "train_speed(iter/s)": 1.462838 }, { "epoch": 0.16516001885094897, "grad_norm": 3.0384585857391357, "learning_rate": 9.973102010479784e-05, "loss": 2.571843147277832, "memory(GiB)": 47.63, "step": 3855, "token_acc": 0.4664179104477612, "train_speed(iter/s)": 1.4628 }, { "epoch": 0.16537423418019795, "grad_norm": 4.891858100891113, "learning_rate": 9.973032253768927e-05, "loss": 2.5152931213378906, "memory(GiB)": 47.63, "step": 3860, "token_acc": 0.44528301886792454, "train_speed(iter/s)": 1.463491 }, { "epoch": 0.1655884495094469, "grad_norm": 5.790170192718506, "learning_rate": 9.972962406966928e-05, "loss": 3.0690141677856446, "memory(GiB)": 47.63, "step": 3865, "token_acc": 0.38481675392670156, "train_speed(iter/s)": 1.463805 }, { "epoch": 0.16580266483869585, "grad_norm": 4.68450403213501, "learning_rate": 9.972892470075048e-05, "loss": 2.5472696304321287, "memory(GiB)": 47.63, "step": 3870, "token_acc": 0.45864661654135336, "train_speed(iter/s)": 1.464278 }, { "epoch": 0.16601688016794483, "grad_norm": 9.12940788269043, "learning_rate": 9.972822443094557e-05, "loss": 2.8336633682250976, "memory(GiB)": 47.63, "step": 3875, "token_acc": 0.4109090909090909, "train_speed(iter/s)": 1.464145 }, { "epoch": 0.16623109549719378, "grad_norm": 3.497802972793579, "learning_rate": 9.972752326026722e-05, "loss": 2.9579750061035157, "memory(GiB)": 47.63, "step": 3880, "token_acc": 0.47115384615384615, "train_speed(iter/s)": 1.463643 }, { "epoch": 0.16644531082644273, "grad_norm": 4.052250862121582, "learning_rate": 9.972682118872813e-05, "loss": 2.5573352813720702, "memory(GiB)": 47.63, "step": 3885, "token_acc": 0.4524714828897338, "train_speed(iter/s)": 1.463967 }, { "epoch": 0.1666595261556917, "grad_norm": 3.2251203060150146, "learning_rate": 9.972611821634104e-05, "loss": 2.8575668334960938, "memory(GiB)": 47.63, "step": 3890, "token_acc": 0.44807121661721067, "train_speed(iter/s)": 1.463968 }, { "epoch": 0.16687374148494066, "grad_norm": 3.248079538345337, "learning_rate": 9.972541434311866e-05, "loss": 2.5865352630615233, "memory(GiB)": 47.63, "step": 3895, "token_acc": 0.4626334519572954, "train_speed(iter/s)": 1.46376 }, { "epoch": 0.16708795681418961, "grad_norm": 4.117458343505859, "learning_rate": 9.972470956907375e-05, "loss": 2.7083208084106447, "memory(GiB)": 47.63, "step": 3900, "token_acc": 0.43597560975609756, "train_speed(iter/s)": 1.464029 }, { "epoch": 0.1673021721434386, "grad_norm": 2.8918769359588623, "learning_rate": 9.97240038942191e-05, "loss": 2.768160820007324, "memory(GiB)": 47.63, "step": 3905, "token_acc": 0.4117647058823529, "train_speed(iter/s)": 1.463813 }, { "epoch": 0.16751638747268754, "grad_norm": 3.165897846221924, "learning_rate": 9.972329731856745e-05, "loss": 2.3745092391967773, "memory(GiB)": 47.63, "step": 3910, "token_acc": 0.5363636363636364, "train_speed(iter/s)": 1.463548 }, { "epoch": 0.1677306028019365, "grad_norm": 5.167481899261475, "learning_rate": 9.972258984213164e-05, "loss": 2.7123252868652346, "memory(GiB)": 47.63, "step": 3915, "token_acc": 0.4430379746835443, "train_speed(iter/s)": 1.463868 }, { "epoch": 0.16794481813118547, "grad_norm": 7.225919723510742, "learning_rate": 9.972188146492446e-05, "loss": 2.6176177978515627, "memory(GiB)": 47.63, "step": 3920, "token_acc": 0.4486301369863014, "train_speed(iter/s)": 1.463912 }, { "epoch": 0.16815903346043443, "grad_norm": 4.501922607421875, "learning_rate": 9.972117218695877e-05, "loss": 2.619853973388672, "memory(GiB)": 47.63, "step": 3925, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.46408 }, { "epoch": 0.16837324878968338, "grad_norm": 3.6003787517547607, "learning_rate": 9.97204620082474e-05, "loss": 2.591394233703613, "memory(GiB)": 47.63, "step": 3930, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.464035 }, { "epoch": 0.16858746411893236, "grad_norm": 3.0413100719451904, "learning_rate": 9.971975092880321e-05, "loss": 2.5557174682617188, "memory(GiB)": 47.63, "step": 3935, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.46356 }, { "epoch": 0.1688016794481813, "grad_norm": 3.5041685104370117, "learning_rate": 9.97190389486391e-05, "loss": 2.6853059768676757, "memory(GiB)": 47.63, "step": 3940, "token_acc": 0.4619883040935672, "train_speed(iter/s)": 1.463606 }, { "epoch": 0.1690158947774303, "grad_norm": 3.9365909099578857, "learning_rate": 9.971832606776797e-05, "loss": 2.800808334350586, "memory(GiB)": 47.63, "step": 3945, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.463303 }, { "epoch": 0.16923011010667924, "grad_norm": 3.8099727630615234, "learning_rate": 9.97176122862027e-05, "loss": 2.8381914138793944, "memory(GiB)": 47.63, "step": 3950, "token_acc": 0.4297082228116711, "train_speed(iter/s)": 1.463091 }, { "epoch": 0.1694443254359282, "grad_norm": 3.864774465560913, "learning_rate": 9.971689760395626e-05, "loss": 2.587199592590332, "memory(GiB)": 47.63, "step": 3955, "token_acc": 0.44518272425249167, "train_speed(iter/s)": 1.462596 }, { "epoch": 0.16965854076517717, "grad_norm": 3.4290759563446045, "learning_rate": 9.971618202104157e-05, "loss": 2.573192024230957, "memory(GiB)": 47.63, "step": 3960, "token_acc": 0.4574780058651026, "train_speed(iter/s)": 1.462598 }, { "epoch": 0.16987275609442612, "grad_norm": 3.7943031787872314, "learning_rate": 9.971546553747163e-05, "loss": 2.7710811614990236, "memory(GiB)": 47.63, "step": 3965, "token_acc": 0.46387832699619774, "train_speed(iter/s)": 1.462764 }, { "epoch": 0.17008697142367507, "grad_norm": 3.480952501296997, "learning_rate": 9.971474815325936e-05, "loss": 2.6828598022460937, "memory(GiB)": 47.63, "step": 3970, "token_acc": 0.4290657439446367, "train_speed(iter/s)": 1.462971 }, { "epoch": 0.17030118675292405, "grad_norm": 4.311180591583252, "learning_rate": 9.97140298684178e-05, "loss": 2.555287170410156, "memory(GiB)": 47.63, "step": 3975, "token_acc": 0.5282258064516129, "train_speed(iter/s)": 1.462515 }, { "epoch": 0.170515402082173, "grad_norm": 4.069502830505371, "learning_rate": 9.971331068295998e-05, "loss": 2.439258575439453, "memory(GiB)": 47.63, "step": 3980, "token_acc": 0.49508196721311476, "train_speed(iter/s)": 1.462453 }, { "epoch": 0.17072961741142195, "grad_norm": 3.7039411067962646, "learning_rate": 9.971259059689887e-05, "loss": 3.065530014038086, "memory(GiB)": 47.63, "step": 3985, "token_acc": 0.4068241469816273, "train_speed(iter/s)": 1.462715 }, { "epoch": 0.17094383274067093, "grad_norm": 4.358893394470215, "learning_rate": 9.971186961024756e-05, "loss": 2.7284961700439454, "memory(GiB)": 47.63, "step": 3990, "token_acc": 0.4368231046931408, "train_speed(iter/s)": 1.462657 }, { "epoch": 0.17115804806991988, "grad_norm": 3.1369922161102295, "learning_rate": 9.97111477230191e-05, "loss": 2.567704200744629, "memory(GiB)": 47.63, "step": 3995, "token_acc": 0.45652173913043476, "train_speed(iter/s)": 1.463042 }, { "epoch": 0.17137226339916883, "grad_norm": 4.521172523498535, "learning_rate": 9.971042493522655e-05, "loss": 2.6029748916625977, "memory(GiB)": 47.63, "step": 4000, "token_acc": 0.466403162055336, "train_speed(iter/s)": 1.463397 }, { "epoch": 0.17137226339916883, "eval_loss": 2.2883613109588623, "eval_runtime": 14.1768, "eval_samples_per_second": 7.054, "eval_steps_per_second": 7.054, "eval_token_acc": 0.4913294797687861, "step": 4000 }, { "epoch": 0.1715864787284178, "grad_norm": 9.697166442871094, "learning_rate": 9.970970124688305e-05, "loss": 2.6056758880615236, "memory(GiB)": 47.63, "step": 4005, "token_acc": 0.4711155378486056, "train_speed(iter/s)": 1.455794 }, { "epoch": 0.17180069405766676, "grad_norm": 4.103949069976807, "learning_rate": 9.970897665800167e-05, "loss": 2.529444122314453, "memory(GiB)": 47.63, "step": 4010, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.456129 }, { "epoch": 0.17201490938691572, "grad_norm": 4.441743850708008, "learning_rate": 9.970825116859554e-05, "loss": 3.038761329650879, "memory(GiB)": 47.63, "step": 4015, "token_acc": 0.44375, "train_speed(iter/s)": 1.456316 }, { "epoch": 0.1722291247161647, "grad_norm": 3.7059218883514404, "learning_rate": 9.97075247786778e-05, "loss": 2.5436962127685545, "memory(GiB)": 47.63, "step": 4020, "token_acc": 0.5100401606425703, "train_speed(iter/s)": 1.456573 }, { "epoch": 0.17244334004541365, "grad_norm": 5.761212348937988, "learning_rate": 9.970679748826164e-05, "loss": 3.0638465881347656, "memory(GiB)": 47.63, "step": 4025, "token_acc": 0.4084084084084084, "train_speed(iter/s)": 1.45652 }, { "epoch": 0.17265755537466262, "grad_norm": 4.596331596374512, "learning_rate": 9.97060692973602e-05, "loss": 2.9444766998291017, "memory(GiB)": 47.63, "step": 4030, "token_acc": 0.42214532871972316, "train_speed(iter/s)": 1.456569 }, { "epoch": 0.17287177070391158, "grad_norm": 5.4121856689453125, "learning_rate": 9.97053402059867e-05, "loss": 2.8367212295532225, "memory(GiB)": 47.63, "step": 4035, "token_acc": 0.4276729559748428, "train_speed(iter/s)": 1.456212 }, { "epoch": 0.17308598603316053, "grad_norm": 5.506450653076172, "learning_rate": 9.970461021415433e-05, "loss": 2.3924999237060547, "memory(GiB)": 47.63, "step": 4040, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.455904 }, { "epoch": 0.1733002013624095, "grad_norm": 4.011473655700684, "learning_rate": 9.97038793218763e-05, "loss": 2.673995590209961, "memory(GiB)": 47.63, "step": 4045, "token_acc": 0.45255474452554745, "train_speed(iter/s)": 1.456114 }, { "epoch": 0.17351441669165846, "grad_norm": 4.279806613922119, "learning_rate": 9.97031475291659e-05, "loss": 2.7830909729003905, "memory(GiB)": 47.63, "step": 4050, "token_acc": 0.462406015037594, "train_speed(iter/s)": 1.456281 }, { "epoch": 0.1737286320209074, "grad_norm": 5.206503391265869, "learning_rate": 9.970241483603635e-05, "loss": 2.5369184494018553, "memory(GiB)": 47.63, "step": 4055, "token_acc": 0.45849802371541504, "train_speed(iter/s)": 1.456209 }, { "epoch": 0.1739428473501564, "grad_norm": 3.9950671195983887, "learning_rate": 9.970168124250093e-05, "loss": 2.4039979934692384, "memory(GiB)": 47.63, "step": 4060, "token_acc": 0.49609375, "train_speed(iter/s)": 1.456348 }, { "epoch": 0.17415706267940534, "grad_norm": 6.458606243133545, "learning_rate": 9.970094674857291e-05, "loss": 2.686125373840332, "memory(GiB)": 47.63, "step": 4065, "token_acc": 0.43609022556390975, "train_speed(iter/s)": 1.456607 }, { "epoch": 0.1743712780086543, "grad_norm": 7.220560550689697, "learning_rate": 9.970021135426564e-05, "loss": 3.1351200103759767, "memory(GiB)": 47.63, "step": 4070, "token_acc": 0.43508771929824563, "train_speed(iter/s)": 1.456987 }, { "epoch": 0.17458549333790327, "grad_norm": 3.4459733963012695, "learning_rate": 9.969947505959241e-05, "loss": 2.503296661376953, "memory(GiB)": 47.63, "step": 4075, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.456934 }, { "epoch": 0.17479970866715222, "grad_norm": 9.989091873168945, "learning_rate": 9.969873786456656e-05, "loss": 2.734407424926758, "memory(GiB)": 47.63, "step": 4080, "token_acc": 0.4431137724550898, "train_speed(iter/s)": 1.456992 }, { "epoch": 0.17501392399640117, "grad_norm": 3.8455822467803955, "learning_rate": 9.969799976920147e-05, "loss": 2.4487541198730467, "memory(GiB)": 47.63, "step": 4085, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.456482 }, { "epoch": 0.17522813932565015, "grad_norm": 4.026459693908691, "learning_rate": 9.969726077351047e-05, "loss": 2.525579833984375, "memory(GiB)": 47.63, "step": 4090, "token_acc": 0.4620253164556962, "train_speed(iter/s)": 1.456669 }, { "epoch": 0.1754423546548991, "grad_norm": 3.5084986686706543, "learning_rate": 9.969652087750698e-05, "loss": 2.494241142272949, "memory(GiB)": 47.63, "step": 4095, "token_acc": 0.4641638225255973, "train_speed(iter/s)": 1.456741 }, { "epoch": 0.17565656998414805, "grad_norm": 4.460583686828613, "learning_rate": 9.969578008120441e-05, "loss": 2.6435054779052733, "memory(GiB)": 47.63, "step": 4100, "token_acc": 0.4395973154362416, "train_speed(iter/s)": 1.456208 }, { "epoch": 0.17587078531339703, "grad_norm": 2.9879324436187744, "learning_rate": 9.969503838461615e-05, "loss": 2.456716537475586, "memory(GiB)": 47.63, "step": 4105, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.456558 }, { "epoch": 0.17608500064264598, "grad_norm": 2.53955078125, "learning_rate": 9.969429578775567e-05, "loss": 2.7428861618041993, "memory(GiB)": 47.63, "step": 4110, "token_acc": 0.42450142450142453, "train_speed(iter/s)": 1.457029 }, { "epoch": 0.17629921597189496, "grad_norm": 8.147093772888184, "learning_rate": 9.969355229063638e-05, "loss": 2.627157211303711, "memory(GiB)": 47.63, "step": 4115, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.456923 }, { "epoch": 0.1765134313011439, "grad_norm": 4.598494529724121, "learning_rate": 9.969280789327179e-05, "loss": 2.4965232849121093, "memory(GiB)": 47.63, "step": 4120, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.457068 }, { "epoch": 0.17672764663039287, "grad_norm": 4.967527389526367, "learning_rate": 9.969206259567537e-05, "loss": 2.524370574951172, "memory(GiB)": 47.63, "step": 4125, "token_acc": 0.46905537459283386, "train_speed(iter/s)": 1.457255 }, { "epoch": 0.17694186195964184, "grad_norm": 4.3180928230285645, "learning_rate": 9.969131639786061e-05, "loss": 2.4822004318237303, "memory(GiB)": 47.63, "step": 4130, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 1.456867 }, { "epoch": 0.1771560772888908, "grad_norm": 3.8205959796905518, "learning_rate": 9.969056929984105e-05, "loss": 2.5836681365966796, "memory(GiB)": 47.63, "step": 4135, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.456748 }, { "epoch": 0.17737029261813975, "grad_norm": 3.3553004264831543, "learning_rate": 9.968982130163021e-05, "loss": 2.7959674835205077, "memory(GiB)": 47.63, "step": 4140, "token_acc": 0.4578313253012048, "train_speed(iter/s)": 1.456909 }, { "epoch": 0.17758450794738873, "grad_norm": 3.382815361022949, "learning_rate": 9.968907240324165e-05, "loss": 2.6812389373779295, "memory(GiB)": 47.63, "step": 4145, "token_acc": 0.4835820895522388, "train_speed(iter/s)": 1.457057 }, { "epoch": 0.17779872327663768, "grad_norm": 3.5095791816711426, "learning_rate": 9.968832260468892e-05, "loss": 2.73370304107666, "memory(GiB)": 47.63, "step": 4150, "token_acc": 0.4253246753246753, "train_speed(iter/s)": 1.457179 }, { "epoch": 0.17801293860588663, "grad_norm": 6.752483367919922, "learning_rate": 9.96875719059856e-05, "loss": 2.489898109436035, "memory(GiB)": 47.63, "step": 4155, "token_acc": 0.47580645161290325, "train_speed(iter/s)": 1.457324 }, { "epoch": 0.1782271539351356, "grad_norm": 3.5551066398620605, "learning_rate": 9.968682030714534e-05, "loss": 2.5549184799194338, "memory(GiB)": 47.63, "step": 4160, "token_acc": 0.46200607902735563, "train_speed(iter/s)": 1.457832 }, { "epoch": 0.17844136926438456, "grad_norm": 3.5505824089050293, "learning_rate": 9.96860678081817e-05, "loss": 2.686321258544922, "memory(GiB)": 47.63, "step": 4165, "token_acc": 0.4382716049382716, "train_speed(iter/s)": 1.458368 }, { "epoch": 0.1786555845936335, "grad_norm": 4.782723426818848, "learning_rate": 9.968531440910835e-05, "loss": 2.4822921752929688, "memory(GiB)": 47.63, "step": 4170, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.45876 }, { "epoch": 0.1788697999228825, "grad_norm": 3.174055337905884, "learning_rate": 9.96845601099389e-05, "loss": 2.507801818847656, "memory(GiB)": 47.63, "step": 4175, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.458457 }, { "epoch": 0.17908401525213144, "grad_norm": 4.528962135314941, "learning_rate": 9.968380491068705e-05, "loss": 2.70302734375, "memory(GiB)": 47.63, "step": 4180, "token_acc": 0.426056338028169, "train_speed(iter/s)": 1.458921 }, { "epoch": 0.1792982305813804, "grad_norm": 4.106651306152344, "learning_rate": 9.968304881136645e-05, "loss": 2.530369758605957, "memory(GiB)": 47.63, "step": 4185, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.459121 }, { "epoch": 0.17951244591062937, "grad_norm": 4.651950836181641, "learning_rate": 9.968229181199083e-05, "loss": 2.392143440246582, "memory(GiB)": 47.63, "step": 4190, "token_acc": 0.4497991967871486, "train_speed(iter/s)": 1.459139 }, { "epoch": 0.17972666123987832, "grad_norm": 2.965430498123169, "learning_rate": 9.968153391257388e-05, "loss": 2.497489166259766, "memory(GiB)": 47.63, "step": 4195, "token_acc": 0.4421052631578947, "train_speed(iter/s)": 1.459019 }, { "epoch": 0.1799408765691273, "grad_norm": 3.2989437580108643, "learning_rate": 9.968077511312934e-05, "loss": 2.5011953353881835, "memory(GiB)": 47.63, "step": 4200, "token_acc": 0.4797507788161994, "train_speed(iter/s)": 1.459391 }, { "epoch": 0.18015509189837625, "grad_norm": 3.1855950355529785, "learning_rate": 9.968001541367096e-05, "loss": 2.597939682006836, "memory(GiB)": 47.63, "step": 4205, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.45943 }, { "epoch": 0.1803693072276252, "grad_norm": 6.3596086502075195, "learning_rate": 9.967925481421249e-05, "loss": 2.8404872894287108, "memory(GiB)": 47.63, "step": 4210, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.459484 }, { "epoch": 0.18058352255687418, "grad_norm": 3.202695608139038, "learning_rate": 9.967849331476771e-05, "loss": 2.640762519836426, "memory(GiB)": 47.63, "step": 4215, "token_acc": 0.4360655737704918, "train_speed(iter/s)": 1.459632 }, { "epoch": 0.18079773788612313, "grad_norm": 3.3546950817108154, "learning_rate": 9.967773091535045e-05, "loss": 2.3101552963256835, "memory(GiB)": 47.63, "step": 4220, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.459579 }, { "epoch": 0.18101195321537208, "grad_norm": 3.6231908798217773, "learning_rate": 9.967696761597446e-05, "loss": 2.50335578918457, "memory(GiB)": 47.63, "step": 4225, "token_acc": 0.46360153256704983, "train_speed(iter/s)": 1.460172 }, { "epoch": 0.18122616854462106, "grad_norm": 4.873368740081787, "learning_rate": 9.967620341665361e-05, "loss": 2.5752490997314452, "memory(GiB)": 47.63, "step": 4230, "token_acc": 0.45317220543806647, "train_speed(iter/s)": 1.460317 }, { "epoch": 0.18144038387387001, "grad_norm": 4.936944484710693, "learning_rate": 9.967543831740174e-05, "loss": 2.3862064361572264, "memory(GiB)": 47.63, "step": 4235, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.460182 }, { "epoch": 0.18165459920311897, "grad_norm": 3.8961172103881836, "learning_rate": 9.967467231823271e-05, "loss": 2.3928709030151367, "memory(GiB)": 47.63, "step": 4240, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.460028 }, { "epoch": 0.18186881453236795, "grad_norm": 3.8946142196655273, "learning_rate": 9.967390541916037e-05, "loss": 2.912725639343262, "memory(GiB)": 47.63, "step": 4245, "token_acc": 0.4098939929328622, "train_speed(iter/s)": 1.46028 }, { "epoch": 0.1820830298616169, "grad_norm": 3.5400397777557373, "learning_rate": 9.967313762019866e-05, "loss": 2.9137847900390623, "memory(GiB)": 47.63, "step": 4250, "token_acc": 0.4354243542435424, "train_speed(iter/s)": 1.460411 }, { "epoch": 0.18229724519086585, "grad_norm": 2.751776695251465, "learning_rate": 9.967236892136145e-05, "loss": 2.5477777481079102, "memory(GiB)": 47.63, "step": 4255, "token_acc": 0.4567901234567901, "train_speed(iter/s)": 1.460698 }, { "epoch": 0.18251146052011483, "grad_norm": 3.9068148136138916, "learning_rate": 9.967159932266271e-05, "loss": 2.6640533447265624, "memory(GiB)": 47.63, "step": 4260, "token_acc": 0.4575645756457565, "train_speed(iter/s)": 1.460887 }, { "epoch": 0.18272567584936378, "grad_norm": 4.075963020324707, "learning_rate": 9.967082882411631e-05, "loss": 2.9097997665405275, "memory(GiB)": 47.63, "step": 4265, "token_acc": 0.40397350993377484, "train_speed(iter/s)": 1.46056 }, { "epoch": 0.18293989117861273, "grad_norm": 3.5479114055633545, "learning_rate": 9.967005742573627e-05, "loss": 2.8735851287841796, "memory(GiB)": 47.63, "step": 4270, "token_acc": 0.4472049689440994, "train_speed(iter/s)": 1.460882 }, { "epoch": 0.1831541065078617, "grad_norm": 3.406883478164673, "learning_rate": 9.966928512753656e-05, "loss": 2.4399322509765624, "memory(GiB)": 47.63, "step": 4275, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.461111 }, { "epoch": 0.18336832183711066, "grad_norm": 5.102316379547119, "learning_rate": 9.966851192953114e-05, "loss": 2.696395683288574, "memory(GiB)": 47.63, "step": 4280, "token_acc": 0.4774774774774775, "train_speed(iter/s)": 1.460907 }, { "epoch": 0.18358253716635964, "grad_norm": 4.178129196166992, "learning_rate": 9.966773783173403e-05, "loss": 2.9731023788452147, "memory(GiB)": 47.63, "step": 4285, "token_acc": 0.44609665427509293, "train_speed(iter/s)": 1.460806 }, { "epoch": 0.1837967524956086, "grad_norm": 4.420988082885742, "learning_rate": 9.966696283415926e-05, "loss": 2.349736785888672, "memory(GiB)": 47.63, "step": 4290, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.460971 }, { "epoch": 0.18401096782485754, "grad_norm": 3.1192681789398193, "learning_rate": 9.966618693682089e-05, "loss": 2.9653976440429686, "memory(GiB)": 47.63, "step": 4295, "token_acc": 0.4080267558528428, "train_speed(iter/s)": 1.461057 }, { "epoch": 0.18422518315410652, "grad_norm": 4.423130035400391, "learning_rate": 9.966541013973291e-05, "loss": 2.4411151885986326, "memory(GiB)": 47.63, "step": 4300, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.461167 }, { "epoch": 0.18443939848335547, "grad_norm": 3.7723641395568848, "learning_rate": 9.966463244290947e-05, "loss": 2.408475875854492, "memory(GiB)": 47.63, "step": 4305, "token_acc": 0.4875, "train_speed(iter/s)": 1.461318 }, { "epoch": 0.18465361381260442, "grad_norm": 4.908487796783447, "learning_rate": 9.966385384636461e-05, "loss": 2.6260196685791017, "memory(GiB)": 47.63, "step": 4310, "token_acc": 0.41603053435114506, "train_speed(iter/s)": 1.461368 }, { "epoch": 0.1848678291418534, "grad_norm": 4.305746555328369, "learning_rate": 9.966307435011245e-05, "loss": 2.60677547454834, "memory(GiB)": 47.63, "step": 4315, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.460971 }, { "epoch": 0.18508204447110235, "grad_norm": 4.457401752471924, "learning_rate": 9.96622939541671e-05, "loss": 2.923782730102539, "memory(GiB)": 47.63, "step": 4320, "token_acc": 0.45723684210526316, "train_speed(iter/s)": 1.461007 }, { "epoch": 0.1852962598003513, "grad_norm": 4.0793914794921875, "learning_rate": 9.966151265854273e-05, "loss": 2.727579116821289, "memory(GiB)": 47.63, "step": 4325, "token_acc": 0.4541832669322709, "train_speed(iter/s)": 1.461072 }, { "epoch": 0.18551047512960028, "grad_norm": 5.2035017013549805, "learning_rate": 9.966073046325346e-05, "loss": 2.4274356842041014, "memory(GiB)": 47.63, "step": 4330, "token_acc": 0.49361702127659574, "train_speed(iter/s)": 1.460589 }, { "epoch": 0.18572469045884923, "grad_norm": 4.7049880027771, "learning_rate": 9.965994736831348e-05, "loss": 2.6569192886352537, "memory(GiB)": 47.63, "step": 4335, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.461008 }, { "epoch": 0.18593890578809819, "grad_norm": 3.325430154800415, "learning_rate": 9.965916337373696e-05, "loss": 2.6684452056884767, "memory(GiB)": 47.63, "step": 4340, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.461264 }, { "epoch": 0.18615312111734716, "grad_norm": 2.775076389312744, "learning_rate": 9.965837847953812e-05, "loss": 2.372978401184082, "memory(GiB)": 47.63, "step": 4345, "token_acc": 0.48881789137380194, "train_speed(iter/s)": 1.461303 }, { "epoch": 0.18636733644659612, "grad_norm": 5.524483680725098, "learning_rate": 9.965759268573117e-05, "loss": 2.436813545227051, "memory(GiB)": 47.63, "step": 4350, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.461202 }, { "epoch": 0.18658155177584507, "grad_norm": 3.795135974884033, "learning_rate": 9.965680599233034e-05, "loss": 2.4541547775268553, "memory(GiB)": 47.63, "step": 4355, "token_acc": 0.5038759689922481, "train_speed(iter/s)": 1.461225 }, { "epoch": 0.18679576710509405, "grad_norm": 2.6173551082611084, "learning_rate": 9.965601839934988e-05, "loss": 2.671848487854004, "memory(GiB)": 47.63, "step": 4360, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.461331 }, { "epoch": 0.187009982434343, "grad_norm": 4.096261024475098, "learning_rate": 9.965522990680406e-05, "loss": 2.9269435882568358, "memory(GiB)": 47.63, "step": 4365, "token_acc": 0.4387755102040816, "train_speed(iter/s)": 1.461532 }, { "epoch": 0.18722419776359198, "grad_norm": 4.488049507141113, "learning_rate": 9.965444051470721e-05, "loss": 3.037659454345703, "memory(GiB)": 47.63, "step": 4370, "token_acc": 0.3881578947368421, "train_speed(iter/s)": 1.462003 }, { "epoch": 0.18743841309284093, "grad_norm": 2.9177701473236084, "learning_rate": 9.965365022307356e-05, "loss": 2.3297451019287108, "memory(GiB)": 47.63, "step": 4375, "token_acc": 0.45936395759717313, "train_speed(iter/s)": 1.46206 }, { "epoch": 0.18765262842208988, "grad_norm": 3.5849967002868652, "learning_rate": 9.965285903191746e-05, "loss": 2.6896245956420897, "memory(GiB)": 47.63, "step": 4380, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.461985 }, { "epoch": 0.18786684375133886, "grad_norm": 4.3653154373168945, "learning_rate": 9.965206694125324e-05, "loss": 2.6202011108398438, "memory(GiB)": 47.63, "step": 4385, "token_acc": 0.465625, "train_speed(iter/s)": 1.461821 }, { "epoch": 0.1880810590805878, "grad_norm": 3.7579469680786133, "learning_rate": 9.965127395109525e-05, "loss": 2.800714874267578, "memory(GiB)": 47.63, "step": 4390, "token_acc": 0.43853820598006643, "train_speed(iter/s)": 1.461655 }, { "epoch": 0.18829527440983676, "grad_norm": 4.8983259201049805, "learning_rate": 9.965048006145785e-05, "loss": 2.5570486068725584, "memory(GiB)": 47.63, "step": 4395, "token_acc": 0.4621212121212121, "train_speed(iter/s)": 1.462053 }, { "epoch": 0.18850948973908574, "grad_norm": 5.242818355560303, "learning_rate": 9.964968527235544e-05, "loss": 2.620072364807129, "memory(GiB)": 47.63, "step": 4400, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.462177 }, { "epoch": 0.1887237050683347, "grad_norm": 3.5841519832611084, "learning_rate": 9.96488895838024e-05, "loss": 2.581031608581543, "memory(GiB)": 47.63, "step": 4405, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.461995 }, { "epoch": 0.18893792039758364, "grad_norm": 3.4121358394622803, "learning_rate": 9.964809299581315e-05, "loss": 2.467431831359863, "memory(GiB)": 47.63, "step": 4410, "token_acc": 0.46494464944649444, "train_speed(iter/s)": 1.462257 }, { "epoch": 0.18915213572683262, "grad_norm": 5.127429008483887, "learning_rate": 9.964729550840212e-05, "loss": 2.5965818405151366, "memory(GiB)": 47.63, "step": 4415, "token_acc": 0.43302180685358255, "train_speed(iter/s)": 1.462006 }, { "epoch": 0.18936635105608157, "grad_norm": 3.961388111114502, "learning_rate": 9.964649712158377e-05, "loss": 2.677463150024414, "memory(GiB)": 47.63, "step": 4420, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.462467 }, { "epoch": 0.18958056638533052, "grad_norm": 3.791003942489624, "learning_rate": 9.964569783537255e-05, "loss": 2.9339937210083007, "memory(GiB)": 47.63, "step": 4425, "token_acc": 0.4678111587982833, "train_speed(iter/s)": 1.462977 }, { "epoch": 0.1897947817145795, "grad_norm": 3.5306408405303955, "learning_rate": 9.964489764978293e-05, "loss": 2.5060840606689454, "memory(GiB)": 47.63, "step": 4430, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.463159 }, { "epoch": 0.19000899704382845, "grad_norm": 3.824587821960449, "learning_rate": 9.964409656482943e-05, "loss": 2.289750862121582, "memory(GiB)": 47.63, "step": 4435, "token_acc": 0.49624060150375937, "train_speed(iter/s)": 1.463251 }, { "epoch": 0.1902232123730774, "grad_norm": 3.808020830154419, "learning_rate": 9.964329458052655e-05, "loss": 2.6161310195922853, "memory(GiB)": 47.63, "step": 4440, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.46351 }, { "epoch": 0.19043742770232638, "grad_norm": 3.0389132499694824, "learning_rate": 9.964249169688882e-05, "loss": 2.7402992248535156, "memory(GiB)": 47.63, "step": 4445, "token_acc": 0.43934426229508194, "train_speed(iter/s)": 1.463621 }, { "epoch": 0.19065164303157534, "grad_norm": 3.4146127700805664, "learning_rate": 9.964168791393077e-05, "loss": 2.8018726348876952, "memory(GiB)": 47.63, "step": 4450, "token_acc": 0.43272727272727274, "train_speed(iter/s)": 1.464128 }, { "epoch": 0.19086585836082431, "grad_norm": 3.163907527923584, "learning_rate": 9.964088323166698e-05, "loss": 3.381665802001953, "memory(GiB)": 47.63, "step": 4455, "token_acc": 0.415625, "train_speed(iter/s)": 1.463784 }, { "epoch": 0.19108007369007327, "grad_norm": 3.469853162765503, "learning_rate": 9.964007765011204e-05, "loss": 2.964613914489746, "memory(GiB)": 47.63, "step": 4460, "token_acc": 0.3939393939393939, "train_speed(iter/s)": 1.464038 }, { "epoch": 0.19129428901932222, "grad_norm": 4.974525451660156, "learning_rate": 9.963927116928051e-05, "loss": 2.4353404998779298, "memory(GiB)": 47.63, "step": 4465, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.464027 }, { "epoch": 0.1915085043485712, "grad_norm": 4.314286231994629, "learning_rate": 9.963846378918702e-05, "loss": 2.8339258193969727, "memory(GiB)": 47.63, "step": 4470, "token_acc": 0.4349315068493151, "train_speed(iter/s)": 1.463985 }, { "epoch": 0.19172271967782015, "grad_norm": 4.252258777618408, "learning_rate": 9.96376555098462e-05, "loss": 2.8118757247924804, "memory(GiB)": 47.63, "step": 4475, "token_acc": 0.4327956989247312, "train_speed(iter/s)": 1.464139 }, { "epoch": 0.1919369350070691, "grad_norm": 4.8369011878967285, "learning_rate": 9.963684633127269e-05, "loss": 2.559932327270508, "memory(GiB)": 47.63, "step": 4480, "token_acc": 0.4536741214057508, "train_speed(iter/s)": 1.46394 }, { "epoch": 0.19215115033631808, "grad_norm": 3.3614726066589355, "learning_rate": 9.963603625348114e-05, "loss": 2.5462081909179686, "memory(GiB)": 47.63, "step": 4485, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.463864 }, { "epoch": 0.19236536566556703, "grad_norm": 4.094184398651123, "learning_rate": 9.963522527648623e-05, "loss": 2.351723289489746, "memory(GiB)": 47.63, "step": 4490, "token_acc": 0.4961832061068702, "train_speed(iter/s)": 1.463849 }, { "epoch": 0.19257958099481598, "grad_norm": 3.6614444255828857, "learning_rate": 9.963441340030267e-05, "loss": 2.748393249511719, "memory(GiB)": 47.63, "step": 4495, "token_acc": 0.43213296398891965, "train_speed(iter/s)": 1.463856 }, { "epoch": 0.19279379632406496, "grad_norm": 3.257753372192383, "learning_rate": 9.963360062494512e-05, "loss": 2.688405990600586, "memory(GiB)": 47.63, "step": 4500, "token_acc": 0.4554794520547945, "train_speed(iter/s)": 1.464127 }, { "epoch": 0.19279379632406496, "eval_loss": 2.1965346336364746, "eval_runtime": 14.2949, "eval_samples_per_second": 6.995, "eval_steps_per_second": 6.995, "eval_token_acc": 0.4822888283378747, "step": 4500 }, { "epoch": 0.1930080116533139, "grad_norm": 3.3588240146636963, "learning_rate": 9.963278695042835e-05, "loss": 2.779301071166992, "memory(GiB)": 47.63, "step": 4505, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.457289 }, { "epoch": 0.19322222698256286, "grad_norm": 4.67567777633667, "learning_rate": 9.963197237676709e-05, "loss": 2.8437389373779296, "memory(GiB)": 47.63, "step": 4510, "token_acc": 0.4560669456066946, "train_speed(iter/s)": 1.457184 }, { "epoch": 0.19343644231181184, "grad_norm": 3.164679527282715, "learning_rate": 9.963115690397608e-05, "loss": 2.7683971405029295, "memory(GiB)": 47.63, "step": 4515, "token_acc": 0.4384858044164038, "train_speed(iter/s)": 1.457478 }, { "epoch": 0.1936506576410608, "grad_norm": 5.027650833129883, "learning_rate": 9.96303405320701e-05, "loss": 2.2967552185058593, "memory(GiB)": 47.63, "step": 4520, "token_acc": 0.525691699604743, "train_speed(iter/s)": 1.457404 }, { "epoch": 0.19386487297030974, "grad_norm": 5.658149242401123, "learning_rate": 9.962952326106396e-05, "loss": 2.7964462280273437, "memory(GiB)": 47.63, "step": 4525, "token_acc": 0.4463087248322148, "train_speed(iter/s)": 1.457647 }, { "epoch": 0.19407908829955872, "grad_norm": 3.484639883041382, "learning_rate": 9.962870509097245e-05, "loss": 2.441773223876953, "memory(GiB)": 47.63, "step": 4530, "token_acc": 0.4708333333333333, "train_speed(iter/s)": 1.457409 }, { "epoch": 0.19429330362880767, "grad_norm": 3.731177568435669, "learning_rate": 9.962788602181039e-05, "loss": 2.435281181335449, "memory(GiB)": 47.63, "step": 4535, "token_acc": 0.43197278911564624, "train_speed(iter/s)": 1.457833 }, { "epoch": 0.19450751895805665, "grad_norm": 2.832918167114258, "learning_rate": 9.962706605359262e-05, "loss": 2.6169305801391602, "memory(GiB)": 47.63, "step": 4540, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.457925 }, { "epoch": 0.1947217342873056, "grad_norm": 3.472694158554077, "learning_rate": 9.962624518633399e-05, "loss": 2.596920967102051, "memory(GiB)": 47.63, "step": 4545, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.45815 }, { "epoch": 0.19493594961655455, "grad_norm": 3.1194911003112793, "learning_rate": 9.962542342004937e-05, "loss": 2.5030025482177733, "memory(GiB)": 47.63, "step": 4550, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 1.458317 }, { "epoch": 0.19515016494580353, "grad_norm": 3.147874116897583, "learning_rate": 9.962460075475366e-05, "loss": 2.5221424102783203, "memory(GiB)": 47.63, "step": 4555, "token_acc": 0.43410852713178294, "train_speed(iter/s)": 1.458664 }, { "epoch": 0.19536438027505248, "grad_norm": 3.951706886291504, "learning_rate": 9.962377719046176e-05, "loss": 2.5376556396484373, "memory(GiB)": 47.63, "step": 4560, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.458976 }, { "epoch": 0.19557859560430144, "grad_norm": 4.471684455871582, "learning_rate": 9.962295272718858e-05, "loss": 2.3781890869140625, "memory(GiB)": 47.63, "step": 4565, "token_acc": 0.5638766519823789, "train_speed(iter/s)": 1.458898 }, { "epoch": 0.19579281093355042, "grad_norm": 3.68644118309021, "learning_rate": 9.962212736494905e-05, "loss": 2.5323280334472655, "memory(GiB)": 47.63, "step": 4570, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.458497 }, { "epoch": 0.19600702626279937, "grad_norm": 4.825692176818848, "learning_rate": 9.962130110375815e-05, "loss": 2.6095434188842774, "memory(GiB)": 47.63, "step": 4575, "token_acc": 0.4528985507246377, "train_speed(iter/s)": 1.458933 }, { "epoch": 0.19622124159204832, "grad_norm": 3.770578384399414, "learning_rate": 9.962047394363083e-05, "loss": 2.5597244262695313, "memory(GiB)": 47.63, "step": 4580, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.458866 }, { "epoch": 0.1964354569212973, "grad_norm": 3.474961996078491, "learning_rate": 9.961964588458207e-05, "loss": 2.66544189453125, "memory(GiB)": 47.63, "step": 4585, "token_acc": 0.516, "train_speed(iter/s)": 1.459118 }, { "epoch": 0.19664967225054625, "grad_norm": 3.6821129322052, "learning_rate": 9.961881692662688e-05, "loss": 2.726749610900879, "memory(GiB)": 47.63, "step": 4590, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.459355 }, { "epoch": 0.1968638875797952, "grad_norm": 5.148923397064209, "learning_rate": 9.961798706978027e-05, "loss": 2.8032838821411135, "memory(GiB)": 47.63, "step": 4595, "token_acc": 0.4405144694533762, "train_speed(iter/s)": 1.459654 }, { "epoch": 0.19707810290904418, "grad_norm": 4.147294044494629, "learning_rate": 9.96171563140573e-05, "loss": 2.798273277282715, "memory(GiB)": 47.63, "step": 4600, "token_acc": 0.4262295081967213, "train_speed(iter/s)": 1.459743 }, { "epoch": 0.19729231823829313, "grad_norm": 4.429314613342285, "learning_rate": 9.961632465947297e-05, "loss": 2.8597497940063477, "memory(GiB)": 47.63, "step": 4605, "token_acc": 0.4368932038834951, "train_speed(iter/s)": 1.459741 }, { "epoch": 0.19750653356754208, "grad_norm": 3.4142091274261475, "learning_rate": 9.96154921060424e-05, "loss": 2.630633544921875, "memory(GiB)": 47.63, "step": 4610, "token_acc": 0.44, "train_speed(iter/s)": 1.459614 }, { "epoch": 0.19772074889679106, "grad_norm": 6.562750339508057, "learning_rate": 9.961465865378063e-05, "loss": 2.3304882049560547, "memory(GiB)": 47.63, "step": 4615, "token_acc": 0.49034749034749037, "train_speed(iter/s)": 1.459435 }, { "epoch": 0.19793496422604, "grad_norm": 5.5141282081604, "learning_rate": 9.961382430270278e-05, "loss": 2.7084377288818358, "memory(GiB)": 47.63, "step": 4620, "token_acc": 0.4161676646706587, "train_speed(iter/s)": 1.459577 }, { "epoch": 0.198149179555289, "grad_norm": 4.459146022796631, "learning_rate": 9.961298905282397e-05, "loss": 2.5132150650024414, "memory(GiB)": 47.63, "step": 4625, "token_acc": 0.4175084175084175, "train_speed(iter/s)": 1.459566 }, { "epoch": 0.19836339488453794, "grad_norm": 4.930593967437744, "learning_rate": 9.961215290415931e-05, "loss": 2.896724891662598, "memory(GiB)": 47.63, "step": 4630, "token_acc": 0.42585551330798477, "train_speed(iter/s)": 1.459714 }, { "epoch": 0.1985776102137869, "grad_norm": 5.480079650878906, "learning_rate": 9.961131585672396e-05, "loss": 2.681516647338867, "memory(GiB)": 47.63, "step": 4635, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.460097 }, { "epoch": 0.19879182554303587, "grad_norm": 5.128887176513672, "learning_rate": 9.961047791053309e-05, "loss": 2.6390451431274413, "memory(GiB)": 47.63, "step": 4640, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.460115 }, { "epoch": 0.19900604087228482, "grad_norm": 3.6200530529022217, "learning_rate": 9.960963906560188e-05, "loss": 2.7532272338867188, "memory(GiB)": 47.63, "step": 4645, "token_acc": 0.4426751592356688, "train_speed(iter/s)": 1.459993 }, { "epoch": 0.19922025620153377, "grad_norm": 3.444218873977661, "learning_rate": 9.96087993219455e-05, "loss": 2.4867788314819337, "memory(GiB)": 47.63, "step": 4650, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.459665 }, { "epoch": 0.19943447153078275, "grad_norm": 4.417496681213379, "learning_rate": 9.960795867957921e-05, "loss": 2.3958377838134766, "memory(GiB)": 47.63, "step": 4655, "token_acc": 0.5261194029850746, "train_speed(iter/s)": 1.459639 }, { "epoch": 0.1996486868600317, "grad_norm": 3.7539174556732178, "learning_rate": 9.96071171385182e-05, "loss": 2.6219974517822267, "memory(GiB)": 47.63, "step": 4660, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.459999 }, { "epoch": 0.19986290218928066, "grad_norm": 3.0214028358459473, "learning_rate": 9.960627469877773e-05, "loss": 2.370345115661621, "memory(GiB)": 47.63, "step": 4665, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.460206 }, { "epoch": 0.20007711751852963, "grad_norm": 6.439914226531982, "learning_rate": 9.960543136037306e-05, "loss": 2.6158199310302734, "memory(GiB)": 47.63, "step": 4670, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.4604 }, { "epoch": 0.20029133284777859, "grad_norm": 4.3249993324279785, "learning_rate": 9.960458712331946e-05, "loss": 2.337392807006836, "memory(GiB)": 47.63, "step": 4675, "token_acc": 0.5520833333333334, "train_speed(iter/s)": 1.460867 }, { "epoch": 0.20050554817702754, "grad_norm": 3.847118377685547, "learning_rate": 9.960374198763225e-05, "loss": 2.4481176376342773, "memory(GiB)": 47.63, "step": 4680, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.460848 }, { "epoch": 0.20071976350627652, "grad_norm": 3.2964940071105957, "learning_rate": 9.96028959533267e-05, "loss": 2.810785102844238, "memory(GiB)": 47.63, "step": 4685, "token_acc": 0.431266846361186, "train_speed(iter/s)": 1.461136 }, { "epoch": 0.20093397883552547, "grad_norm": 4.322447299957275, "learning_rate": 9.960204902041816e-05, "loss": 2.8656097412109376, "memory(GiB)": 47.63, "step": 4690, "token_acc": 0.467680608365019, "train_speed(iter/s)": 1.461308 }, { "epoch": 0.20114819416477442, "grad_norm": 5.639817714691162, "learning_rate": 9.960120118892199e-05, "loss": 2.6024860382080077, "memory(GiB)": 47.63, "step": 4695, "token_acc": 0.44106463878326996, "train_speed(iter/s)": 1.460901 }, { "epoch": 0.2013624094940234, "grad_norm": 3.873734951019287, "learning_rate": 9.960035245885352e-05, "loss": 2.8162139892578124, "memory(GiB)": 47.63, "step": 4700, "token_acc": 0.4626038781163435, "train_speed(iter/s)": 1.460887 }, { "epoch": 0.20157662482327235, "grad_norm": 4.02492618560791, "learning_rate": 9.959950283022813e-05, "loss": 2.5812231063842774, "memory(GiB)": 47.63, "step": 4705, "token_acc": 0.4837758112094395, "train_speed(iter/s)": 1.46106 }, { "epoch": 0.20179084015252133, "grad_norm": 4.980051517486572, "learning_rate": 9.959865230306122e-05, "loss": 2.585065460205078, "memory(GiB)": 47.63, "step": 4710, "token_acc": 0.4830188679245283, "train_speed(iter/s)": 1.460656 }, { "epoch": 0.20200505548177028, "grad_norm": 3.0168967247009277, "learning_rate": 9.959780087736821e-05, "loss": 2.552627372741699, "memory(GiB)": 47.63, "step": 4715, "token_acc": 0.46283783783783783, "train_speed(iter/s)": 1.460619 }, { "epoch": 0.20221927081101923, "grad_norm": 7.62388277053833, "learning_rate": 9.959694855316451e-05, "loss": 2.8916736602783204, "memory(GiB)": 47.63, "step": 4720, "token_acc": 0.4303030303030303, "train_speed(iter/s)": 1.46052 }, { "epoch": 0.2024334861402682, "grad_norm": 4.516824722290039, "learning_rate": 9.959609533046554e-05, "loss": 2.6287261962890627, "memory(GiB)": 47.63, "step": 4725, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.460771 }, { "epoch": 0.20264770146951716, "grad_norm": 4.003058910369873, "learning_rate": 9.959524120928678e-05, "loss": 2.6882965087890627, "memory(GiB)": 47.63, "step": 4730, "token_acc": 0.4155124653739612, "train_speed(iter/s)": 1.46094 }, { "epoch": 0.2028619167987661, "grad_norm": 4.212945938110352, "learning_rate": 9.959438618964372e-05, "loss": 2.74023551940918, "memory(GiB)": 47.63, "step": 4735, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.460756 }, { "epoch": 0.2030761321280151, "grad_norm": 4.152052879333496, "learning_rate": 9.959353027155183e-05, "loss": 2.5586921691894533, "memory(GiB)": 47.63, "step": 4740, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.461207 }, { "epoch": 0.20329034745726404, "grad_norm": 4.194866180419922, "learning_rate": 9.959267345502661e-05, "loss": 2.62713623046875, "memory(GiB)": 47.63, "step": 4745, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.461366 }, { "epoch": 0.203504562786513, "grad_norm": 4.050347805023193, "learning_rate": 9.959181574008358e-05, "loss": 2.79455509185791, "memory(GiB)": 47.63, "step": 4750, "token_acc": 0.42424242424242425, "train_speed(iter/s)": 1.461483 }, { "epoch": 0.20371877811576197, "grad_norm": 5.109241962432861, "learning_rate": 9.95909571267383e-05, "loss": 2.6086854934692383, "memory(GiB)": 47.63, "step": 4755, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.460906 }, { "epoch": 0.20393299344501092, "grad_norm": 4.411006927490234, "learning_rate": 9.95900976150063e-05, "loss": 2.4530099868774413, "memory(GiB)": 47.63, "step": 4760, "token_acc": 0.458955223880597, "train_speed(iter/s)": 1.460873 }, { "epoch": 0.20414720877425988, "grad_norm": 4.208810329437256, "learning_rate": 9.958923720490317e-05, "loss": 2.8047142028808594, "memory(GiB)": 47.63, "step": 4765, "token_acc": 0.43050847457627117, "train_speed(iter/s)": 1.461025 }, { "epoch": 0.20436142410350885, "grad_norm": 5.848051071166992, "learning_rate": 9.958837589644449e-05, "loss": 2.3469343185424805, "memory(GiB)": 47.63, "step": 4770, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.460712 }, { "epoch": 0.2045756394327578, "grad_norm": 3.1968307495117188, "learning_rate": 9.958751368964585e-05, "loss": 2.4620996475219727, "memory(GiB)": 47.63, "step": 4775, "token_acc": 0.5038759689922481, "train_speed(iter/s)": 1.460871 }, { "epoch": 0.20478985476200676, "grad_norm": 4.222325325012207, "learning_rate": 9.95866505845229e-05, "loss": 2.704343795776367, "memory(GiB)": 47.63, "step": 4780, "token_acc": 0.4359861591695502, "train_speed(iter/s)": 1.461128 }, { "epoch": 0.20500407009125574, "grad_norm": 3.7278714179992676, "learning_rate": 9.958578658109125e-05, "loss": 2.838206100463867, "memory(GiB)": 47.63, "step": 4785, "token_acc": 0.45323741007194246, "train_speed(iter/s)": 1.461544 }, { "epoch": 0.2052182854205047, "grad_norm": 3.182396650314331, "learning_rate": 9.958492167936658e-05, "loss": 2.4889122009277345, "memory(GiB)": 47.63, "step": 4790, "token_acc": 0.4872611464968153, "train_speed(iter/s)": 1.461955 }, { "epoch": 0.20543250074975367, "grad_norm": 3.691033363342285, "learning_rate": 9.958405587936452e-05, "loss": 2.598044013977051, "memory(GiB)": 47.63, "step": 4795, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.461961 }, { "epoch": 0.20564671607900262, "grad_norm": 4.164496898651123, "learning_rate": 9.958318918110077e-05, "loss": 2.354471206665039, "memory(GiB)": 47.63, "step": 4800, "token_acc": 0.5313531353135313, "train_speed(iter/s)": 1.461688 }, { "epoch": 0.20586093140825157, "grad_norm": 5.045453071594238, "learning_rate": 9.958232158459104e-05, "loss": 2.482447052001953, "memory(GiB)": 47.63, "step": 4805, "token_acc": 0.47639484978540775, "train_speed(iter/s)": 1.461909 }, { "epoch": 0.20607514673750055, "grad_norm": 3.663247585296631, "learning_rate": 9.958145308985105e-05, "loss": 2.739253616333008, "memory(GiB)": 47.63, "step": 4810, "token_acc": 0.4456140350877193, "train_speed(iter/s)": 1.461809 }, { "epoch": 0.2062893620667495, "grad_norm": 3.318002462387085, "learning_rate": 9.958058369689651e-05, "loss": 2.5637796401977537, "memory(GiB)": 47.63, "step": 4815, "token_acc": 0.491869918699187, "train_speed(iter/s)": 1.462036 }, { "epoch": 0.20650357739599845, "grad_norm": 3.3069710731506348, "learning_rate": 9.95797134057432e-05, "loss": 2.7852516174316406, "memory(GiB)": 47.63, "step": 4820, "token_acc": 0.4592833876221498, "train_speed(iter/s)": 1.462307 }, { "epoch": 0.20671779272524743, "grad_norm": 2.944810628890991, "learning_rate": 9.957884221640687e-05, "loss": 2.959232711791992, "memory(GiB)": 47.63, "step": 4825, "token_acc": 0.4103448275862069, "train_speed(iter/s)": 1.462467 }, { "epoch": 0.20693200805449638, "grad_norm": 5.038721084594727, "learning_rate": 9.95779701289033e-05, "loss": 2.544814682006836, "memory(GiB)": 47.63, "step": 4830, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 1.462689 }, { "epoch": 0.20714622338374533, "grad_norm": 4.850725173950195, "learning_rate": 9.95770971432483e-05, "loss": 2.634608268737793, "memory(GiB)": 47.63, "step": 4835, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.462719 }, { "epoch": 0.2073604387129943, "grad_norm": 4.1277174949646, "learning_rate": 9.957622325945768e-05, "loss": 2.6970439910888673, "memory(GiB)": 47.63, "step": 4840, "token_acc": 0.4519230769230769, "train_speed(iter/s)": 1.46275 }, { "epoch": 0.20757465404224326, "grad_norm": 3.763849973678589, "learning_rate": 9.957534847754726e-05, "loss": 2.8271854400634764, "memory(GiB)": 47.63, "step": 4845, "token_acc": 0.44805194805194803, "train_speed(iter/s)": 1.462949 }, { "epoch": 0.2077888693714922, "grad_norm": 4.403916835784912, "learning_rate": 9.95744727975329e-05, "loss": 2.81970100402832, "memory(GiB)": 47.63, "step": 4850, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.463301 }, { "epoch": 0.2080030847007412, "grad_norm": 6.187358379364014, "learning_rate": 9.957359621943045e-05, "loss": 2.487234878540039, "memory(GiB)": 47.63, "step": 4855, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.463579 }, { "epoch": 0.20821730002999014, "grad_norm": 3.3152196407318115, "learning_rate": 9.957271874325582e-05, "loss": 2.536726379394531, "memory(GiB)": 47.63, "step": 4860, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.464034 }, { "epoch": 0.2084315153592391, "grad_norm": 3.83278489112854, "learning_rate": 9.957184036902488e-05, "loss": 2.633724594116211, "memory(GiB)": 47.63, "step": 4865, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.464009 }, { "epoch": 0.20864573068848807, "grad_norm": 5.216366291046143, "learning_rate": 9.957096109675354e-05, "loss": 2.8043491363525392, "memory(GiB)": 47.63, "step": 4870, "token_acc": 0.46706586826347307, "train_speed(iter/s)": 1.463764 }, { "epoch": 0.20885994601773702, "grad_norm": 3.232802629470825, "learning_rate": 9.957008092645777e-05, "loss": 2.591070365905762, "memory(GiB)": 47.63, "step": 4875, "token_acc": 0.4454828660436137, "train_speed(iter/s)": 1.463808 }, { "epoch": 0.209074161346986, "grad_norm": 20.843944549560547, "learning_rate": 9.956919985815345e-05, "loss": 2.6538322448730467, "memory(GiB)": 47.63, "step": 4880, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.463636 }, { "epoch": 0.20928837667623496, "grad_norm": 3.754133701324463, "learning_rate": 9.956831789185659e-05, "loss": 2.4146175384521484, "memory(GiB)": 47.63, "step": 4885, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.463367 }, { "epoch": 0.2095025920054839, "grad_norm": 7.558262825012207, "learning_rate": 9.956743502758315e-05, "loss": 2.7335124969482423, "memory(GiB)": 47.63, "step": 4890, "token_acc": 0.4652777777777778, "train_speed(iter/s)": 1.463483 }, { "epoch": 0.20971680733473289, "grad_norm": 2.960104465484619, "learning_rate": 9.956655126534911e-05, "loss": 2.3875946044921874, "memory(GiB)": 47.63, "step": 4895, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.463796 }, { "epoch": 0.20993102266398184, "grad_norm": 5.942025661468506, "learning_rate": 9.956566660517053e-05, "loss": 2.312450408935547, "memory(GiB)": 47.63, "step": 4900, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.463928 }, { "epoch": 0.2101452379932308, "grad_norm": 5.1008687019348145, "learning_rate": 9.956478104706337e-05, "loss": 2.6202777862548827, "memory(GiB)": 47.63, "step": 4905, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.464154 }, { "epoch": 0.21035945332247977, "grad_norm": 4.184366226196289, "learning_rate": 9.956389459104372e-05, "loss": 2.444474220275879, "memory(GiB)": 47.63, "step": 4910, "token_acc": 0.43272727272727274, "train_speed(iter/s)": 1.464534 }, { "epoch": 0.21057366865172872, "grad_norm": 3.3858139514923096, "learning_rate": 9.956300723712764e-05, "loss": 2.7998870849609374, "memory(GiB)": 47.63, "step": 4915, "token_acc": 0.41040462427745666, "train_speed(iter/s)": 1.464655 }, { "epoch": 0.21078788398097767, "grad_norm": 3.690751552581787, "learning_rate": 9.956211898533117e-05, "loss": 2.331569290161133, "memory(GiB)": 47.63, "step": 4920, "token_acc": 0.5143769968051118, "train_speed(iter/s)": 1.465006 }, { "epoch": 0.21100209931022665, "grad_norm": 6.2631354331970215, "learning_rate": 9.956122983567042e-05, "loss": 2.435498046875, "memory(GiB)": 47.63, "step": 4925, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.464621 }, { "epoch": 0.2112163146394756, "grad_norm": 3.335134744644165, "learning_rate": 9.95603397881615e-05, "loss": 2.3452857971191405, "memory(GiB)": 47.63, "step": 4930, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.464787 }, { "epoch": 0.21143052996872455, "grad_norm": 3.005373477935791, "learning_rate": 9.955944884282055e-05, "loss": 2.612366485595703, "memory(GiB)": 47.63, "step": 4935, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.465028 }, { "epoch": 0.21164474529797353, "grad_norm": 3.1305315494537354, "learning_rate": 9.955855699966368e-05, "loss": 2.6456478118896483, "memory(GiB)": 47.63, "step": 4940, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.465002 }, { "epoch": 0.21185896062722248, "grad_norm": 3.9591519832611084, "learning_rate": 9.955766425870706e-05, "loss": 3.0028560638427733, "memory(GiB)": 47.63, "step": 4945, "token_acc": 0.4119496855345912, "train_speed(iter/s)": 1.464684 }, { "epoch": 0.21207317595647143, "grad_norm": 5.0495758056640625, "learning_rate": 9.955677061996687e-05, "loss": 2.730988311767578, "memory(GiB)": 47.63, "step": 4950, "token_acc": 0.43636363636363634, "train_speed(iter/s)": 1.464715 }, { "epoch": 0.2122873912857204, "grad_norm": 3.475385904312134, "learning_rate": 9.955587608345928e-05, "loss": 2.548473930358887, "memory(GiB)": 47.63, "step": 4955, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.464675 }, { "epoch": 0.21250160661496936, "grad_norm": 3.494689702987671, "learning_rate": 9.955498064920052e-05, "loss": 2.870347595214844, "memory(GiB)": 47.63, "step": 4960, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.464637 }, { "epoch": 0.21271582194421834, "grad_norm": 3.259819507598877, "learning_rate": 9.955408431720681e-05, "loss": 2.629207992553711, "memory(GiB)": 47.63, "step": 4965, "token_acc": 0.43934426229508194, "train_speed(iter/s)": 1.464807 }, { "epoch": 0.2129300372734673, "grad_norm": 4.76029634475708, "learning_rate": 9.955318708749435e-05, "loss": 2.643509864807129, "memory(GiB)": 47.63, "step": 4970, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.464783 }, { "epoch": 0.21314425260271624, "grad_norm": 4.266506671905518, "learning_rate": 9.955228896007944e-05, "loss": 2.6341203689575194, "memory(GiB)": 47.63, "step": 4975, "token_acc": 0.44313725490196076, "train_speed(iter/s)": 1.464767 }, { "epoch": 0.21335846793196522, "grad_norm": 4.462551593780518, "learning_rate": 9.955138993497832e-05, "loss": 2.8413829803466797, "memory(GiB)": 47.63, "step": 4980, "token_acc": 0.42905405405405406, "train_speed(iter/s)": 1.464523 }, { "epoch": 0.21357268326121417, "grad_norm": 4.464143753051758, "learning_rate": 9.95504900122073e-05, "loss": 2.2535449981689455, "memory(GiB)": 47.63, "step": 4985, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.464764 }, { "epoch": 0.21378689859046313, "grad_norm": 4.146317481994629, "learning_rate": 9.954958919178265e-05, "loss": 2.2576440811157226, "memory(GiB)": 47.63, "step": 4990, "token_acc": 0.5102880658436214, "train_speed(iter/s)": 1.464386 }, { "epoch": 0.2140011139197121, "grad_norm": 3.9403140544891357, "learning_rate": 9.954868747372074e-05, "loss": 2.723891830444336, "memory(GiB)": 47.63, "step": 4995, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.464076 }, { "epoch": 0.21421532924896106, "grad_norm": 3.9458513259887695, "learning_rate": 9.954778485803787e-05, "loss": 2.4841426849365233, "memory(GiB)": 47.63, "step": 5000, "token_acc": 0.45038167938931295, "train_speed(iter/s)": 1.463911 }, { "epoch": 0.21421532924896106, "eval_loss": 2.3903603553771973, "eval_runtime": 13.3836, "eval_samples_per_second": 7.472, "eval_steps_per_second": 7.472, "eval_token_acc": 0.4508816120906801, "step": 5000 }, { "epoch": 0.21442954457821, "grad_norm": 4.423367023468018, "learning_rate": 9.954688134475038e-05, "loss": 2.759851837158203, "memory(GiB)": 47.63, "step": 5005, "token_acc": 0.4473924977127173, "train_speed(iter/s)": 1.457879 }, { "epoch": 0.214643759907459, "grad_norm": 3.4486584663391113, "learning_rate": 9.954597693387467e-05, "loss": 2.809971046447754, "memory(GiB)": 47.63, "step": 5010, "token_acc": 0.40853658536585363, "train_speed(iter/s)": 1.457742 }, { "epoch": 0.21485797523670794, "grad_norm": 3.5653340816497803, "learning_rate": 9.95450716254271e-05, "loss": 2.5606967926025392, "memory(GiB)": 47.63, "step": 5015, "token_acc": 0.44, "train_speed(iter/s)": 1.45784 }, { "epoch": 0.2150721905659569, "grad_norm": 6.181088924407959, "learning_rate": 9.954416541942408e-05, "loss": 3.0325870513916016, "memory(GiB)": 47.63, "step": 5020, "token_acc": 0.4220532319391635, "train_speed(iter/s)": 1.458003 }, { "epoch": 0.21528640589520587, "grad_norm": 5.29486083984375, "learning_rate": 9.954325831588204e-05, "loss": 2.6247535705566407, "memory(GiB)": 47.63, "step": 5025, "token_acc": 0.45674740484429066, "train_speed(iter/s)": 1.458211 }, { "epoch": 0.21550062122445482, "grad_norm": 4.460639953613281, "learning_rate": 9.954235031481739e-05, "loss": 2.4387359619140625, "memory(GiB)": 47.63, "step": 5030, "token_acc": 0.4769874476987448, "train_speed(iter/s)": 1.45829 }, { "epoch": 0.21571483655370377, "grad_norm": 3.4331841468811035, "learning_rate": 9.95414414162466e-05, "loss": 2.5991966247558596, "memory(GiB)": 47.63, "step": 5035, "token_acc": 0.4754601226993865, "train_speed(iter/s)": 1.458297 }, { "epoch": 0.21592905188295275, "grad_norm": 6.638716697692871, "learning_rate": 9.95405316201861e-05, "loss": 2.6689929962158203, "memory(GiB)": 47.63, "step": 5040, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.458547 }, { "epoch": 0.2161432672122017, "grad_norm": 4.650533676147461, "learning_rate": 9.953962092665243e-05, "loss": 2.536483383178711, "memory(GiB)": 47.63, "step": 5045, "token_acc": 0.45149253731343286, "train_speed(iter/s)": 1.45859 }, { "epoch": 0.21635748254145068, "grad_norm": 4.859783172607422, "learning_rate": 9.953870933566203e-05, "loss": 2.8836605072021486, "memory(GiB)": 47.63, "step": 5050, "token_acc": 0.4358974358974359, "train_speed(iter/s)": 1.458408 }, { "epoch": 0.21657169787069963, "grad_norm": 3.7098331451416016, "learning_rate": 9.953779684723146e-05, "loss": 2.5904306411743163, "memory(GiB)": 47.63, "step": 5055, "token_acc": 0.444794952681388, "train_speed(iter/s)": 1.458505 }, { "epoch": 0.21678591319994858, "grad_norm": 3.6314024925231934, "learning_rate": 9.953688346137722e-05, "loss": 2.781103515625, "memory(GiB)": 47.63, "step": 5060, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.458329 }, { "epoch": 0.21700012852919756, "grad_norm": 6.57258939743042, "learning_rate": 9.953596917811586e-05, "loss": 2.7235851287841797, "memory(GiB)": 47.63, "step": 5065, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.458626 }, { "epoch": 0.2172143438584465, "grad_norm": 5.607734680175781, "learning_rate": 9.953505399746395e-05, "loss": 2.539692687988281, "memory(GiB)": 47.63, "step": 5070, "token_acc": 0.4384858044164038, "train_speed(iter/s)": 1.458775 }, { "epoch": 0.21742855918769546, "grad_norm": 3.690596103668213, "learning_rate": 9.953413791943808e-05, "loss": 2.4824382781982424, "memory(GiB)": 47.63, "step": 5075, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.45831 }, { "epoch": 0.21764277451694444, "grad_norm": 4.219900131225586, "learning_rate": 9.953322094405482e-05, "loss": 2.744942855834961, "memory(GiB)": 47.63, "step": 5080, "token_acc": 0.4326241134751773, "train_speed(iter/s)": 1.458451 }, { "epoch": 0.2178569898461934, "grad_norm": 4.080595970153809, "learning_rate": 9.953230307133082e-05, "loss": 2.258548355102539, "memory(GiB)": 47.63, "step": 5085, "token_acc": 0.5038759689922481, "train_speed(iter/s)": 1.458626 }, { "epoch": 0.21807120517544235, "grad_norm": 5.973474025726318, "learning_rate": 9.953138430128266e-05, "loss": 2.804652786254883, "memory(GiB)": 47.63, "step": 5090, "token_acc": 0.41124260355029585, "train_speed(iter/s)": 1.458938 }, { "epoch": 0.21828542050469132, "grad_norm": 3.681203603744507, "learning_rate": 9.953046463392703e-05, "loss": 2.58138370513916, "memory(GiB)": 47.63, "step": 5095, "token_acc": 0.42990654205607476, "train_speed(iter/s)": 1.458989 }, { "epoch": 0.21849963583394028, "grad_norm": 3.3350465297698975, "learning_rate": 9.952954406928056e-05, "loss": 2.6960800170898436, "memory(GiB)": 47.63, "step": 5100, "token_acc": 0.4306784660766962, "train_speed(iter/s)": 1.459144 }, { "epoch": 0.21871385116318923, "grad_norm": 3.524101495742798, "learning_rate": 9.952862260735993e-05, "loss": 2.7578704833984373, "memory(GiB)": 47.63, "step": 5105, "token_acc": 0.4548611111111111, "train_speed(iter/s)": 1.459363 }, { "epoch": 0.2189280664924382, "grad_norm": 4.6973876953125, "learning_rate": 9.952770024818185e-05, "loss": 2.4339035034179686, "memory(GiB)": 47.63, "step": 5110, "token_acc": 0.5060728744939271, "train_speed(iter/s)": 1.459395 }, { "epoch": 0.21914228182168716, "grad_norm": 4.874196529388428, "learning_rate": 9.952677699176301e-05, "loss": 2.8538881301879884, "memory(GiB)": 47.63, "step": 5115, "token_acc": 0.4121212121212121, "train_speed(iter/s)": 1.459289 }, { "epoch": 0.2193564971509361, "grad_norm": 4.868494510650635, "learning_rate": 9.952585283812015e-05, "loss": 2.8529348373413086, "memory(GiB)": 47.63, "step": 5120, "token_acc": 0.43253968253968256, "train_speed(iter/s)": 1.459651 }, { "epoch": 0.2195707124801851, "grad_norm": 3.379431962966919, "learning_rate": 9.952492778727e-05, "loss": 2.5719249725341795, "memory(GiB)": 47.63, "step": 5125, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.459488 }, { "epoch": 0.21978492780943404, "grad_norm": 5.336353302001953, "learning_rate": 9.952400183922933e-05, "loss": 2.547113037109375, "memory(GiB)": 47.63, "step": 5130, "token_acc": 0.44357976653696496, "train_speed(iter/s)": 1.459376 }, { "epoch": 0.21999914313868302, "grad_norm": 3.798307418823242, "learning_rate": 9.952307499401492e-05, "loss": 2.483036422729492, "memory(GiB)": 47.63, "step": 5135, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.459064 }, { "epoch": 0.22021335846793197, "grad_norm": 4.275089740753174, "learning_rate": 9.952214725164355e-05, "loss": 2.9046915054321287, "memory(GiB)": 47.63, "step": 5140, "token_acc": 0.43359375, "train_speed(iter/s)": 1.459014 }, { "epoch": 0.22042757379718092, "grad_norm": 3.687201499938965, "learning_rate": 9.952121861213202e-05, "loss": 2.355246734619141, "memory(GiB)": 47.63, "step": 5145, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.45946 }, { "epoch": 0.2206417891264299, "grad_norm": 4.156665325164795, "learning_rate": 9.952028907549715e-05, "loss": 2.884083557128906, "memory(GiB)": 47.63, "step": 5150, "token_acc": 0.4307228915662651, "train_speed(iter/s)": 1.459692 }, { "epoch": 0.22085600445567885, "grad_norm": 6.38535213470459, "learning_rate": 9.951935864175581e-05, "loss": 2.8706649780273437, "memory(GiB)": 47.63, "step": 5155, "token_acc": 0.4163934426229508, "train_speed(iter/s)": 1.459908 }, { "epoch": 0.2210702197849278, "grad_norm": 5.345571994781494, "learning_rate": 9.951842731092482e-05, "loss": 2.9905134201049806, "memory(GiB)": 47.63, "step": 5160, "token_acc": 0.4596774193548387, "train_speed(iter/s)": 1.459867 }, { "epoch": 0.22128443511417678, "grad_norm": 3.282136917114258, "learning_rate": 9.951749508302106e-05, "loss": 2.6577373504638673, "memory(GiB)": 47.63, "step": 5165, "token_acc": 0.45121951219512196, "train_speed(iter/s)": 1.459848 }, { "epoch": 0.22149865044342573, "grad_norm": 3.052436590194702, "learning_rate": 9.951656195806145e-05, "loss": 2.5006778717041014, "memory(GiB)": 47.63, "step": 5170, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.459664 }, { "epoch": 0.22171286577267468, "grad_norm": 4.098628997802734, "learning_rate": 9.951562793606286e-05, "loss": 2.9629280090332033, "memory(GiB)": 47.63, "step": 5175, "token_acc": 0.4275618374558304, "train_speed(iter/s)": 1.460055 }, { "epoch": 0.22192708110192366, "grad_norm": 4.604274272918701, "learning_rate": 9.951469301704221e-05, "loss": 2.59681396484375, "memory(GiB)": 47.63, "step": 5180, "token_acc": 0.461864406779661, "train_speed(iter/s)": 1.460104 }, { "epoch": 0.2221412964311726, "grad_norm": 3.2579591274261475, "learning_rate": 9.951375720101645e-05, "loss": 2.9324100494384764, "memory(GiB)": 47.63, "step": 5185, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.460165 }, { "epoch": 0.22235551176042156, "grad_norm": 3.7421793937683105, "learning_rate": 9.951282048800255e-05, "loss": 2.741208076477051, "memory(GiB)": 47.63, "step": 5190, "token_acc": 0.44166666666666665, "train_speed(iter/s)": 1.460467 }, { "epoch": 0.22256972708967054, "grad_norm": 3.7896907329559326, "learning_rate": 9.951188287801744e-05, "loss": 2.325700569152832, "memory(GiB)": 47.63, "step": 5195, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.460124 }, { "epoch": 0.2227839424189195, "grad_norm": 4.166207790374756, "learning_rate": 9.951094437107812e-05, "loss": 2.6317943572998046, "memory(GiB)": 47.63, "step": 5200, "token_acc": 0.4773662551440329, "train_speed(iter/s)": 1.459908 }, { "epoch": 0.22299815774816845, "grad_norm": 3.2314565181732178, "learning_rate": 9.951000496720162e-05, "loss": 2.9743555068969725, "memory(GiB)": 47.63, "step": 5205, "token_acc": 0.41883116883116883, "train_speed(iter/s)": 1.460172 }, { "epoch": 0.22321237307741743, "grad_norm": 3.753066301345825, "learning_rate": 9.950906466640493e-05, "loss": 2.726948356628418, "memory(GiB)": 47.63, "step": 5210, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.460003 }, { "epoch": 0.22342658840666638, "grad_norm": 3.9496212005615234, "learning_rate": 9.950812346870508e-05, "loss": 2.732045555114746, "memory(GiB)": 47.63, "step": 5215, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.460337 }, { "epoch": 0.22364080373591536, "grad_norm": 5.344078063964844, "learning_rate": 9.950718137411913e-05, "loss": 2.6071565628051756, "memory(GiB)": 47.63, "step": 5220, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.460541 }, { "epoch": 0.2238550190651643, "grad_norm": 8.683192253112793, "learning_rate": 9.950623838266415e-05, "loss": 2.644216537475586, "memory(GiB)": 47.63, "step": 5225, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.46044 }, { "epoch": 0.22406923439441326, "grad_norm": 4.4640583992004395, "learning_rate": 9.950529449435722e-05, "loss": 2.8277259826660157, "memory(GiB)": 47.63, "step": 5230, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.460534 }, { "epoch": 0.22428344972366224, "grad_norm": 4.138047218322754, "learning_rate": 9.950434970921544e-05, "loss": 2.5987066268920898, "memory(GiB)": 47.63, "step": 5235, "token_acc": 0.471875, "train_speed(iter/s)": 1.460354 }, { "epoch": 0.2244976650529112, "grad_norm": 2.9940555095672607, "learning_rate": 9.950340402725596e-05, "loss": 2.5609241485595704, "memory(GiB)": 47.63, "step": 5240, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.460535 }, { "epoch": 0.22471188038216014, "grad_norm": 3.2722036838531494, "learning_rate": 9.950245744849583e-05, "loss": 2.501548004150391, "memory(GiB)": 47.63, "step": 5245, "token_acc": 0.43727598566308246, "train_speed(iter/s)": 1.46079 }, { "epoch": 0.22492609571140912, "grad_norm": 5.219012260437012, "learning_rate": 9.950150997295226e-05, "loss": 2.4772377014160156, "memory(GiB)": 47.63, "step": 5250, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.460816 }, { "epoch": 0.22514031104065807, "grad_norm": 4.120798110961914, "learning_rate": 9.950056160064242e-05, "loss": 2.575048637390137, "memory(GiB)": 47.63, "step": 5255, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.460547 }, { "epoch": 0.22535452636990702, "grad_norm": 3.0218071937561035, "learning_rate": 9.949961233158346e-05, "loss": 2.4449541091918947, "memory(GiB)": 47.63, "step": 5260, "token_acc": 0.46226415094339623, "train_speed(iter/s)": 1.46073 }, { "epoch": 0.225568741699156, "grad_norm": 4.176855564117432, "learning_rate": 9.949866216579258e-05, "loss": 2.899913787841797, "memory(GiB)": 47.63, "step": 5265, "token_acc": 0.43986254295532645, "train_speed(iter/s)": 1.460585 }, { "epoch": 0.22578295702840495, "grad_norm": 4.538032054901123, "learning_rate": 9.949771110328701e-05, "loss": 2.5551383972167967, "memory(GiB)": 47.63, "step": 5270, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.460747 }, { "epoch": 0.2259971723576539, "grad_norm": 3.67073392868042, "learning_rate": 9.949675914408396e-05, "loss": 2.813393402099609, "memory(GiB)": 47.63, "step": 5275, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.461141 }, { "epoch": 0.22621138768690288, "grad_norm": 4.147843837738037, "learning_rate": 9.94958062882007e-05, "loss": 2.7792625427246094, "memory(GiB)": 47.63, "step": 5280, "token_acc": 0.46394984326018807, "train_speed(iter/s)": 1.461271 }, { "epoch": 0.22642560301615183, "grad_norm": 3.92873477935791, "learning_rate": 9.949485253565446e-05, "loss": 2.63089542388916, "memory(GiB)": 47.63, "step": 5285, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.461306 }, { "epoch": 0.22663981834540078, "grad_norm": 3.239232301712036, "learning_rate": 9.949389788646257e-05, "loss": 2.5243959426879883, "memory(GiB)": 47.63, "step": 5290, "token_acc": 0.4674922600619195, "train_speed(iter/s)": 1.461332 }, { "epoch": 0.22685403367464976, "grad_norm": 3.9025206565856934, "learning_rate": 9.949294234064226e-05, "loss": 2.4677541732788084, "memory(GiB)": 47.63, "step": 5295, "token_acc": 0.47278911564625853, "train_speed(iter/s)": 1.461653 }, { "epoch": 0.22706824900389871, "grad_norm": 3.9914939403533936, "learning_rate": 9.949198589821089e-05, "loss": 2.453841972351074, "memory(GiB)": 47.63, "step": 5300, "token_acc": 0.4981549815498155, "train_speed(iter/s)": 1.46185 }, { "epoch": 0.2272824643331477, "grad_norm": 4.803912162780762, "learning_rate": 9.949102855918575e-05, "loss": 2.6950902938842773, "memory(GiB)": 47.63, "step": 5305, "token_acc": 0.4481707317073171, "train_speed(iter/s)": 1.462137 }, { "epoch": 0.22749667966239664, "grad_norm": 3.7592363357543945, "learning_rate": 9.949007032358422e-05, "loss": 2.448274040222168, "memory(GiB)": 47.63, "step": 5310, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.462024 }, { "epoch": 0.2277108949916456, "grad_norm": 7.815985202789307, "learning_rate": 9.948911119142363e-05, "loss": 2.5520002365112306, "memory(GiB)": 47.63, "step": 5315, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.461873 }, { "epoch": 0.22792511032089458, "grad_norm": 5.387121200561523, "learning_rate": 9.948815116272137e-05, "loss": 2.404407501220703, "memory(GiB)": 47.63, "step": 5320, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.462019 }, { "epoch": 0.22813932565014353, "grad_norm": 3.177846670150757, "learning_rate": 9.948719023749482e-05, "loss": 2.678445816040039, "memory(GiB)": 47.63, "step": 5325, "token_acc": 0.44656488549618323, "train_speed(iter/s)": 1.46235 }, { "epoch": 0.22835354097939248, "grad_norm": 5.241051197052002, "learning_rate": 9.94862284157614e-05, "loss": 2.7848968505859375, "memory(GiB)": 47.63, "step": 5330, "token_acc": 0.44402985074626866, "train_speed(iter/s)": 1.462502 }, { "epoch": 0.22856775630864146, "grad_norm": 2.985356569290161, "learning_rate": 9.948526569753853e-05, "loss": 2.830743598937988, "memory(GiB)": 47.63, "step": 5335, "token_acc": 0.4746268656716418, "train_speed(iter/s)": 1.462809 }, { "epoch": 0.2287819716378904, "grad_norm": 3.4994444847106934, "learning_rate": 9.948430208284366e-05, "loss": 2.6473003387451173, "memory(GiB)": 47.63, "step": 5340, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.462925 }, { "epoch": 0.22899618696713936, "grad_norm": 2.861715078353882, "learning_rate": 9.948333757169424e-05, "loss": 2.493571662902832, "memory(GiB)": 47.63, "step": 5345, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 1.462864 }, { "epoch": 0.22921040229638834, "grad_norm": 5.085661888122559, "learning_rate": 9.948237216410771e-05, "loss": 2.2965946197509766, "memory(GiB)": 47.63, "step": 5350, "token_acc": 0.47249190938511326, "train_speed(iter/s)": 1.463124 }, { "epoch": 0.2294246176256373, "grad_norm": 4.955268383026123, "learning_rate": 9.948140586010162e-05, "loss": 2.217868614196777, "memory(GiB)": 47.63, "step": 5355, "token_acc": 0.555956678700361, "train_speed(iter/s)": 1.463258 }, { "epoch": 0.22963883295488624, "grad_norm": 6.237241268157959, "learning_rate": 9.948043865969344e-05, "loss": 2.8866138458251953, "memory(GiB)": 47.63, "step": 5360, "token_acc": 0.44871794871794873, "train_speed(iter/s)": 1.463327 }, { "epoch": 0.22985304828413522, "grad_norm": 3.9926109313964844, "learning_rate": 9.94794705629007e-05, "loss": 2.698740005493164, "memory(GiB)": 47.63, "step": 5365, "token_acc": 0.4380952380952381, "train_speed(iter/s)": 1.462953 }, { "epoch": 0.23006726361338417, "grad_norm": 5.1145339012146, "learning_rate": 9.947850156974093e-05, "loss": 2.566942596435547, "memory(GiB)": 47.63, "step": 5370, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.462972 }, { "epoch": 0.23028147894263312, "grad_norm": 5.715196132659912, "learning_rate": 9.947753168023168e-05, "loss": 2.5916378021240236, "memory(GiB)": 47.63, "step": 5375, "token_acc": 0.4714828897338403, "train_speed(iter/s)": 1.46289 }, { "epoch": 0.2304956942718821, "grad_norm": 3.0483667850494385, "learning_rate": 9.947656089439055e-05, "loss": 2.7588014602661133, "memory(GiB)": 47.63, "step": 5380, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.462634 }, { "epoch": 0.23070990960113105, "grad_norm": 2.7828853130340576, "learning_rate": 9.947558921223509e-05, "loss": 2.7146541595458986, "memory(GiB)": 47.63, "step": 5385, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.462516 }, { "epoch": 0.23092412493038003, "grad_norm": 3.149014472961426, "learning_rate": 9.947461663378292e-05, "loss": 2.404269981384277, "memory(GiB)": 47.63, "step": 5390, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.462651 }, { "epoch": 0.23113834025962898, "grad_norm": 4.664316177368164, "learning_rate": 9.947364315905168e-05, "loss": 2.6491329193115236, "memory(GiB)": 47.63, "step": 5395, "token_acc": 0.4368932038834951, "train_speed(iter/s)": 1.462678 }, { "epoch": 0.23135255558887793, "grad_norm": 4.375911712646484, "learning_rate": 9.947266878805896e-05, "loss": 2.630513381958008, "memory(GiB)": 47.63, "step": 5400, "token_acc": 0.4548611111111111, "train_speed(iter/s)": 1.462808 }, { "epoch": 0.2315667709181269, "grad_norm": 4.754401683807373, "learning_rate": 9.947169352082245e-05, "loss": 2.536018943786621, "memory(GiB)": 47.63, "step": 5405, "token_acc": 0.4770992366412214, "train_speed(iter/s)": 1.462484 }, { "epoch": 0.23178098624737586, "grad_norm": 3.4064626693725586, "learning_rate": 9.94707173573598e-05, "loss": 2.026996040344238, "memory(GiB)": 47.63, "step": 5410, "token_acc": 0.5372549019607843, "train_speed(iter/s)": 1.462017 }, { "epoch": 0.23199520157662482, "grad_norm": 3.4410533905029297, "learning_rate": 9.946974029768869e-05, "loss": 2.7539953231811523, "memory(GiB)": 47.63, "step": 5415, "token_acc": 0.4701492537313433, "train_speed(iter/s)": 1.46208 }, { "epoch": 0.2322094169058738, "grad_norm": 4.9195404052734375, "learning_rate": 9.946876234182682e-05, "loss": 2.7599765777587892, "memory(GiB)": 47.63, "step": 5420, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.462127 }, { "epoch": 0.23242363223512275, "grad_norm": 5.885335922241211, "learning_rate": 9.946778348979193e-05, "loss": 2.507585144042969, "memory(GiB)": 47.63, "step": 5425, "token_acc": 0.5119047619047619, "train_speed(iter/s)": 1.462147 }, { "epoch": 0.2326378475643717, "grad_norm": 3.6174395084381104, "learning_rate": 9.946680374160174e-05, "loss": 2.5303054809570313, "memory(GiB)": 47.63, "step": 5430, "token_acc": 0.4805194805194805, "train_speed(iter/s)": 1.462138 }, { "epoch": 0.23285206289362068, "grad_norm": 4.536384105682373, "learning_rate": 9.9465823097274e-05, "loss": 2.6479154586791993, "memory(GiB)": 47.63, "step": 5435, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.462136 }, { "epoch": 0.23306627822286963, "grad_norm": 3.7358009815216064, "learning_rate": 9.946484155682646e-05, "loss": 2.7107013702392577, "memory(GiB)": 47.63, "step": 5440, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.4625 }, { "epoch": 0.23328049355211858, "grad_norm": 4.361748695373535, "learning_rate": 9.946385912027692e-05, "loss": 2.82617244720459, "memory(GiB)": 47.63, "step": 5445, "token_acc": 0.42142857142857143, "train_speed(iter/s)": 1.462705 }, { "epoch": 0.23349470888136756, "grad_norm": 4.033404350280762, "learning_rate": 9.946287578764318e-05, "loss": 2.5749502182006836, "memory(GiB)": 47.63, "step": 5450, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.462794 }, { "epoch": 0.2337089242106165, "grad_norm": 4.20624303817749, "learning_rate": 9.946189155894303e-05, "loss": 2.6544652938842774, "memory(GiB)": 47.63, "step": 5455, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.462835 }, { "epoch": 0.23392313953986546, "grad_norm": 3.0296459197998047, "learning_rate": 9.946090643419432e-05, "loss": 2.552129554748535, "memory(GiB)": 47.63, "step": 5460, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.462884 }, { "epoch": 0.23413735486911444, "grad_norm": 4.592703342437744, "learning_rate": 9.945992041341489e-05, "loss": 2.430109214782715, "memory(GiB)": 47.63, "step": 5465, "token_acc": 0.47477744807121663, "train_speed(iter/s)": 1.463021 }, { "epoch": 0.2343515701983634, "grad_norm": 3.492677688598633, "learning_rate": 9.945893349662261e-05, "loss": 2.6183616638183596, "memory(GiB)": 47.63, "step": 5470, "token_acc": 0.46546546546546547, "train_speed(iter/s)": 1.462729 }, { "epoch": 0.23456578552761237, "grad_norm": 4.0725626945495605, "learning_rate": 9.945794568383534e-05, "loss": 2.785839080810547, "memory(GiB)": 47.63, "step": 5475, "token_acc": 0.444, "train_speed(iter/s)": 1.463084 }, { "epoch": 0.23478000085686132, "grad_norm": 3.672146797180176, "learning_rate": 9.945695697507101e-05, "loss": 2.4932071685791017, "memory(GiB)": 47.63, "step": 5480, "token_acc": 0.42586750788643535, "train_speed(iter/s)": 1.463114 }, { "epoch": 0.23499421618611027, "grad_norm": 4.550312519073486, "learning_rate": 9.945596737034748e-05, "loss": 2.583989715576172, "memory(GiB)": 47.63, "step": 5485, "token_acc": 0.4311594202898551, "train_speed(iter/s)": 1.463369 }, { "epoch": 0.23520843151535925, "grad_norm": 4.472805500030518, "learning_rate": 9.945497686968273e-05, "loss": 2.3705770492553713, "memory(GiB)": 47.63, "step": 5490, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 1.463439 }, { "epoch": 0.2354226468446082, "grad_norm": 3.246840000152588, "learning_rate": 9.945398547309467e-05, "loss": 2.5402286529541014, "memory(GiB)": 47.63, "step": 5495, "token_acc": 0.46518105849582175, "train_speed(iter/s)": 1.463436 }, { "epoch": 0.23563686217385715, "grad_norm": 3.9132003784179688, "learning_rate": 9.945299318060129e-05, "loss": 2.4419858932495115, "memory(GiB)": 47.63, "step": 5500, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.463269 }, { "epoch": 0.23563686217385715, "eval_loss": 2.3884971141815186, "eval_runtime": 13.5173, "eval_samples_per_second": 7.398, "eval_steps_per_second": 7.398, "eval_token_acc": 0.48157894736842105, "step": 5500 }, { "epoch": 0.23585107750310613, "grad_norm": 3.4197490215301514, "learning_rate": 9.945199999222053e-05, "loss": 2.9500453948974608, "memory(GiB)": 47.63, "step": 5505, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.457209 }, { "epoch": 0.23606529283235508, "grad_norm": 5.53607177734375, "learning_rate": 9.945100590797041e-05, "loss": 2.5514347076416017, "memory(GiB)": 47.63, "step": 5510, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.457341 }, { "epoch": 0.23627950816160403, "grad_norm": 3.660693883895874, "learning_rate": 9.945001092786893e-05, "loss": 2.706028366088867, "memory(GiB)": 47.63, "step": 5515, "token_acc": 0.4261168384879725, "train_speed(iter/s)": 1.457104 }, { "epoch": 0.23649372349085301, "grad_norm": 3.8297765254974365, "learning_rate": 9.944901505193411e-05, "loss": 2.6862775802612306, "memory(GiB)": 47.63, "step": 5520, "token_acc": 0.4496124031007752, "train_speed(iter/s)": 1.457171 }, { "epoch": 0.23670793882010197, "grad_norm": 3.6341445446014404, "learning_rate": 9.9448018280184e-05, "loss": 2.7686746597290037, "memory(GiB)": 47.63, "step": 5525, "token_acc": 0.43213296398891965, "train_speed(iter/s)": 1.457003 }, { "epoch": 0.23692215414935092, "grad_norm": 3.6948294639587402, "learning_rate": 9.944702061263664e-05, "loss": 2.6286340713500977, "memory(GiB)": 47.63, "step": 5530, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.456811 }, { "epoch": 0.2371363694785999, "grad_norm": 5.83001708984375, "learning_rate": 9.944602204931013e-05, "loss": 2.410238838195801, "memory(GiB)": 47.63, "step": 5535, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.456941 }, { "epoch": 0.23735058480784885, "grad_norm": 4.257678031921387, "learning_rate": 9.944502259022255e-05, "loss": 2.2674959182739256, "memory(GiB)": 47.63, "step": 5540, "token_acc": 0.5576923076923077, "train_speed(iter/s)": 1.457264 }, { "epoch": 0.2375648001370978, "grad_norm": 4.245832443237305, "learning_rate": 9.944402223539199e-05, "loss": 2.7157052993774413, "memory(GiB)": 47.63, "step": 5545, "token_acc": 0.4554794520547945, "train_speed(iter/s)": 1.457246 }, { "epoch": 0.23777901546634678, "grad_norm": 4.118874549865723, "learning_rate": 9.944302098483659e-05, "loss": 2.604669952392578, "memory(GiB)": 47.63, "step": 5550, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.457545 }, { "epoch": 0.23799323079559573, "grad_norm": 4.63669490814209, "learning_rate": 9.944201883857449e-05, "loss": 2.7955175399780274, "memory(GiB)": 47.63, "step": 5555, "token_acc": 0.41935483870967744, "train_speed(iter/s)": 1.457627 }, { "epoch": 0.2382074461248447, "grad_norm": 3.287989616394043, "learning_rate": 9.944101579662381e-05, "loss": 2.568241500854492, "memory(GiB)": 47.63, "step": 5560, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.457715 }, { "epoch": 0.23842166145409366, "grad_norm": 3.317380666732788, "learning_rate": 9.944001185900278e-05, "loss": 2.580239486694336, "memory(GiB)": 47.63, "step": 5565, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.457854 }, { "epoch": 0.2386358767833426, "grad_norm": 4.391005039215088, "learning_rate": 9.943900702572955e-05, "loss": 2.5028066635131836, "memory(GiB)": 47.63, "step": 5570, "token_acc": 0.4892703862660944, "train_speed(iter/s)": 1.457292 }, { "epoch": 0.2388500921125916, "grad_norm": 3.535794496536255, "learning_rate": 9.943800129682233e-05, "loss": 2.5869670867919923, "memory(GiB)": 47.63, "step": 5575, "token_acc": 0.4872611464968153, "train_speed(iter/s)": 1.457505 }, { "epoch": 0.23906430744184054, "grad_norm": 3.965402364730835, "learning_rate": 9.943699467229935e-05, "loss": 2.7497854232788086, "memory(GiB)": 47.63, "step": 5580, "token_acc": 0.4396551724137931, "train_speed(iter/s)": 1.457407 }, { "epoch": 0.2392785227710895, "grad_norm": 4.6321258544921875, "learning_rate": 9.94359871521788e-05, "loss": 2.588104248046875, "memory(GiB)": 47.63, "step": 5585, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.457048 }, { "epoch": 0.23949273810033847, "grad_norm": 3.646897792816162, "learning_rate": 9.9434978736479e-05, "loss": 2.4612993240356444, "memory(GiB)": 47.63, "step": 5590, "token_acc": 0.4794952681388013, "train_speed(iter/s)": 1.457051 }, { "epoch": 0.23970695342958742, "grad_norm": 3.5167148113250732, "learning_rate": 9.943396942521818e-05, "loss": 2.779859161376953, "memory(GiB)": 47.63, "step": 5595, "token_acc": 0.43137254901960786, "train_speed(iter/s)": 1.457356 }, { "epoch": 0.23992116875883637, "grad_norm": 5.377708911895752, "learning_rate": 9.943295921841462e-05, "loss": 2.449590492248535, "memory(GiB)": 47.63, "step": 5600, "token_acc": 0.5252918287937743, "train_speed(iter/s)": 1.457577 }, { "epoch": 0.24013538408808535, "grad_norm": 4.4027204513549805, "learning_rate": 9.943194811608665e-05, "loss": 2.644864082336426, "memory(GiB)": 47.63, "step": 5605, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.457633 }, { "epoch": 0.2403495994173343, "grad_norm": 3.8333849906921387, "learning_rate": 9.943093611825256e-05, "loss": 2.370524597167969, "memory(GiB)": 47.63, "step": 5610, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.457995 }, { "epoch": 0.24056381474658325, "grad_norm": 2.767317771911621, "learning_rate": 9.942992322493068e-05, "loss": 2.7053028106689454, "memory(GiB)": 47.63, "step": 5615, "token_acc": 0.46405228758169936, "train_speed(iter/s)": 1.458169 }, { "epoch": 0.24077803007583223, "grad_norm": 3.6258201599121094, "learning_rate": 9.942890943613939e-05, "loss": 2.4264352798461912, "memory(GiB)": 47.63, "step": 5620, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.458285 }, { "epoch": 0.24099224540508118, "grad_norm": 4.235227584838867, "learning_rate": 9.942789475189704e-05, "loss": 2.6738864898681642, "memory(GiB)": 47.63, "step": 5625, "token_acc": 0.476, "train_speed(iter/s)": 1.458511 }, { "epoch": 0.24120646073433014, "grad_norm": 4.216747760772705, "learning_rate": 9.9426879172222e-05, "loss": 2.611791801452637, "memory(GiB)": 47.63, "step": 5630, "token_acc": 0.45555555555555555, "train_speed(iter/s)": 1.458891 }, { "epoch": 0.24142067606357911, "grad_norm": 5.329629421234131, "learning_rate": 9.942586269713268e-05, "loss": 2.5263603210449217, "memory(GiB)": 47.63, "step": 5635, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.459087 }, { "epoch": 0.24163489139282807, "grad_norm": 4.288660526275635, "learning_rate": 9.942484532664748e-05, "loss": 2.8504562377929688, "memory(GiB)": 47.63, "step": 5640, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.459321 }, { "epoch": 0.24184910672207705, "grad_norm": 3.7665090560913086, "learning_rate": 9.942382706078486e-05, "loss": 3.1255748748779295, "memory(GiB)": 47.63, "step": 5645, "token_acc": 0.3892857142857143, "train_speed(iter/s)": 1.459575 }, { "epoch": 0.242063322051326, "grad_norm": 4.620436191558838, "learning_rate": 9.942280789956325e-05, "loss": 2.322142791748047, "memory(GiB)": 47.63, "step": 5650, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.459908 }, { "epoch": 0.24227753738057495, "grad_norm": 4.055863380432129, "learning_rate": 9.942178784300109e-05, "loss": 2.8052928924560545, "memory(GiB)": 47.63, "step": 5655, "token_acc": 0.45722713864306785, "train_speed(iter/s)": 1.460168 }, { "epoch": 0.24249175270982393, "grad_norm": 3.8129770755767822, "learning_rate": 9.94207668911169e-05, "loss": 2.8385086059570312, "memory(GiB)": 47.63, "step": 5660, "token_acc": 0.4163568773234201, "train_speed(iter/s)": 1.460169 }, { "epoch": 0.24270596803907288, "grad_norm": 3.9301958084106445, "learning_rate": 9.941974504392916e-05, "loss": 2.569260025024414, "memory(GiB)": 47.63, "step": 5665, "token_acc": 0.4421364985163205, "train_speed(iter/s)": 1.459913 }, { "epoch": 0.24292018336832183, "grad_norm": 4.454365253448486, "learning_rate": 9.941872230145637e-05, "loss": 2.8422536849975586, "memory(GiB)": 47.63, "step": 5670, "token_acc": 0.4421768707482993, "train_speed(iter/s)": 1.460053 }, { "epoch": 0.2431343986975708, "grad_norm": 3.1430835723876953, "learning_rate": 9.941769866371708e-05, "loss": 2.2880783081054688, "memory(GiB)": 47.63, "step": 5675, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.460004 }, { "epoch": 0.24334861402681976, "grad_norm": 4.095118045806885, "learning_rate": 9.94166741307298e-05, "loss": 2.471821975708008, "memory(GiB)": 47.63, "step": 5680, "token_acc": 0.4440789473684211, "train_speed(iter/s)": 1.460028 }, { "epoch": 0.2435628293560687, "grad_norm": 3.6256887912750244, "learning_rate": 9.941564870251312e-05, "loss": 2.397996520996094, "memory(GiB)": 47.63, "step": 5685, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.459812 }, { "epoch": 0.2437770446853177, "grad_norm": 3.829911708831787, "learning_rate": 9.941462237908561e-05, "loss": 2.7572067260742186, "memory(GiB)": 47.63, "step": 5690, "token_acc": 0.4427710843373494, "train_speed(iter/s)": 1.460168 }, { "epoch": 0.24399126001456664, "grad_norm": 5.137447834014893, "learning_rate": 9.941359516046586e-05, "loss": 2.923348236083984, "memory(GiB)": 47.63, "step": 5695, "token_acc": 0.437984496124031, "train_speed(iter/s)": 1.460379 }, { "epoch": 0.2442054753438156, "grad_norm": 4.275759220123291, "learning_rate": 9.941256704667249e-05, "loss": 2.330388069152832, "memory(GiB)": 47.63, "step": 5700, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.460243 }, { "epoch": 0.24441969067306457, "grad_norm": 5.067497253417969, "learning_rate": 9.941153803772412e-05, "loss": 2.49843807220459, "memory(GiB)": 47.63, "step": 5705, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 1.460151 }, { "epoch": 0.24463390600231352, "grad_norm": 4.475621223449707, "learning_rate": 9.941050813363937e-05, "loss": 2.559776496887207, "memory(GiB)": 47.63, "step": 5710, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.460197 }, { "epoch": 0.24484812133156247, "grad_norm": 5.182840824127197, "learning_rate": 9.94094773344369e-05, "loss": 2.7183162689208986, "memory(GiB)": 47.63, "step": 5715, "token_acc": 0.4575757575757576, "train_speed(iter/s)": 1.460034 }, { "epoch": 0.24506233666081145, "grad_norm": 6.315879821777344, "learning_rate": 9.940844564013542e-05, "loss": 2.696755790710449, "memory(GiB)": 47.63, "step": 5720, "token_acc": 0.4452054794520548, "train_speed(iter/s)": 1.460214 }, { "epoch": 0.2452765519900604, "grad_norm": 6.686441421508789, "learning_rate": 9.94074130507536e-05, "loss": 2.8254520416259767, "memory(GiB)": 47.63, "step": 5725, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.460206 }, { "epoch": 0.24549076731930938, "grad_norm": 4.55892276763916, "learning_rate": 9.940637956631013e-05, "loss": 2.3481155395507813, "memory(GiB)": 47.63, "step": 5730, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.460336 }, { "epoch": 0.24570498264855833, "grad_norm": 4.8279218673706055, "learning_rate": 9.940534518682376e-05, "loss": 2.682110595703125, "memory(GiB)": 47.63, "step": 5735, "token_acc": 0.45936395759717313, "train_speed(iter/s)": 1.460624 }, { "epoch": 0.24591919797780729, "grad_norm": 4.679349422454834, "learning_rate": 9.940430991231322e-05, "loss": 3.0118648529052736, "memory(GiB)": 47.63, "step": 5740, "token_acc": 0.4158730158730159, "train_speed(iter/s)": 1.460892 }, { "epoch": 0.24613341330705626, "grad_norm": 3.5831639766693115, "learning_rate": 9.940327374279725e-05, "loss": 2.7695430755615233, "memory(GiB)": 47.63, "step": 5745, "token_acc": 0.4199288256227758, "train_speed(iter/s)": 1.460889 }, { "epoch": 0.24634762863630522, "grad_norm": 4.248871803283691, "learning_rate": 9.940223667829465e-05, "loss": 2.6276681900024412, "memory(GiB)": 47.63, "step": 5750, "token_acc": 0.422680412371134, "train_speed(iter/s)": 1.460391 }, { "epoch": 0.24656184396555417, "grad_norm": 4.029080390930176, "learning_rate": 9.940119871882416e-05, "loss": 2.8304515838623048, "memory(GiB)": 47.63, "step": 5755, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.460395 }, { "epoch": 0.24677605929480315, "grad_norm": 5.108576774597168, "learning_rate": 9.940015986440464e-05, "loss": 2.7089282989501955, "memory(GiB)": 47.63, "step": 5760, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.460593 }, { "epoch": 0.2469902746240521, "grad_norm": 5.069339275360107, "learning_rate": 9.939912011505487e-05, "loss": 2.565351104736328, "memory(GiB)": 47.63, "step": 5765, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.460477 }, { "epoch": 0.24720448995330105, "grad_norm": 4.0777130126953125, "learning_rate": 9.939807947079369e-05, "loss": 2.4939748764038088, "memory(GiB)": 47.63, "step": 5770, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.460238 }, { "epoch": 0.24741870528255003, "grad_norm": 4.485532760620117, "learning_rate": 9.939703793163998e-05, "loss": 2.81497859954834, "memory(GiB)": 47.63, "step": 5775, "token_acc": 0.4171974522292994, "train_speed(iter/s)": 1.460413 }, { "epoch": 0.24763292061179898, "grad_norm": 3.2145578861236572, "learning_rate": 9.939599549761259e-05, "loss": 2.407269096374512, "memory(GiB)": 47.63, "step": 5780, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.460456 }, { "epoch": 0.24784713594104793, "grad_norm": 4.005586624145508, "learning_rate": 9.939495216873038e-05, "loss": 2.7351686477661135, "memory(GiB)": 47.63, "step": 5785, "token_acc": 0.4340277777777778, "train_speed(iter/s)": 1.460402 }, { "epoch": 0.2480613512702969, "grad_norm": 5.348986625671387, "learning_rate": 9.93939079450123e-05, "loss": 2.5608905792236327, "memory(GiB)": 47.63, "step": 5790, "token_acc": 0.4608150470219436, "train_speed(iter/s)": 1.460249 }, { "epoch": 0.24827556659954586, "grad_norm": 4.527293682098389, "learning_rate": 9.939286282647723e-05, "loss": 2.7762317657470703, "memory(GiB)": 47.63, "step": 5795, "token_acc": 0.4568345323741007, "train_speed(iter/s)": 1.460275 }, { "epoch": 0.2484897819287948, "grad_norm": 3.6915411949157715, "learning_rate": 9.939181681314411e-05, "loss": 2.6612146377563475, "memory(GiB)": 47.63, "step": 5800, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.460155 }, { "epoch": 0.2487039972580438, "grad_norm": 3.42191481590271, "learning_rate": 9.939076990503191e-05, "loss": 2.2639301300048826, "memory(GiB)": 47.63, "step": 5805, "token_acc": 0.5261194029850746, "train_speed(iter/s)": 1.460182 }, { "epoch": 0.24891821258729274, "grad_norm": 3.4134628772735596, "learning_rate": 9.938972210215958e-05, "loss": 2.7226140975952147, "memory(GiB)": 47.63, "step": 5810, "token_acc": 0.42567567567567566, "train_speed(iter/s)": 1.460077 }, { "epoch": 0.24913242791654172, "grad_norm": 3.800591230392456, "learning_rate": 9.938867340454609e-05, "loss": 2.8251644134521485, "memory(GiB)": 47.63, "step": 5815, "token_acc": 0.45874587458745875, "train_speed(iter/s)": 1.460465 }, { "epoch": 0.24934664324579067, "grad_norm": 4.803477764129639, "learning_rate": 9.938762381221047e-05, "loss": 2.6071054458618166, "memory(GiB)": 47.63, "step": 5820, "token_acc": 0.43490304709141275, "train_speed(iter/s)": 1.460566 }, { "epoch": 0.24956085857503962, "grad_norm": 4.181412696838379, "learning_rate": 9.93865733251717e-05, "loss": 2.6922088623046876, "memory(GiB)": 47.63, "step": 5825, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.460679 }, { "epoch": 0.2497750739042886, "grad_norm": 3.804574966430664, "learning_rate": 9.938552194344883e-05, "loss": 2.6233333587646483, "memory(GiB)": 47.63, "step": 5830, "token_acc": 0.41818181818181815, "train_speed(iter/s)": 1.460525 }, { "epoch": 0.24998928923353755, "grad_norm": 4.469377517700195, "learning_rate": 9.93844696670609e-05, "loss": 2.653834915161133, "memory(GiB)": 47.63, "step": 5835, "token_acc": 0.43582089552238806, "train_speed(iter/s)": 1.460463 }, { "epoch": 0.2502035045627865, "grad_norm": 3.8620765209198, "learning_rate": 9.938341649602698e-05, "loss": 2.595180130004883, "memory(GiB)": 47.63, "step": 5840, "token_acc": 0.462406015037594, "train_speed(iter/s)": 1.460496 }, { "epoch": 0.2504177198920355, "grad_norm": 5.089691162109375, "learning_rate": 9.938236243036613e-05, "loss": 2.6849374771118164, "memory(GiB)": 47.63, "step": 5845, "token_acc": 0.43727598566308246, "train_speed(iter/s)": 1.46056 }, { "epoch": 0.2506319352212844, "grad_norm": 4.5347418785095215, "learning_rate": 9.938130747009748e-05, "loss": 2.4504329681396486, "memory(GiB)": 47.63, "step": 5850, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.460624 }, { "epoch": 0.2508461505505334, "grad_norm": 4.512951850891113, "learning_rate": 9.938025161524012e-05, "loss": 2.6327817916870115, "memory(GiB)": 47.63, "step": 5855, "token_acc": 0.45724907063197023, "train_speed(iter/s)": 1.460624 }, { "epoch": 0.25106036587978237, "grad_norm": 3.6371476650238037, "learning_rate": 9.937919486581317e-05, "loss": 2.5981365203857423, "memory(GiB)": 47.63, "step": 5860, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.460525 }, { "epoch": 0.25127458120903134, "grad_norm": 3.2102556228637695, "learning_rate": 9.937813722183579e-05, "loss": 2.607102966308594, "memory(GiB)": 47.63, "step": 5865, "token_acc": 0.4658385093167702, "train_speed(iter/s)": 1.460289 }, { "epoch": 0.25148879653828027, "grad_norm": 3.3604695796966553, "learning_rate": 9.937707868332713e-05, "loss": 2.401707077026367, "memory(GiB)": 47.63, "step": 5870, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.460493 }, { "epoch": 0.25170301186752925, "grad_norm": 6.565609931945801, "learning_rate": 9.937601925030638e-05, "loss": 2.6921478271484376, "memory(GiB)": 47.63, "step": 5875, "token_acc": 0.4295774647887324, "train_speed(iter/s)": 1.460509 }, { "epoch": 0.2519172271967782, "grad_norm": 5.022071838378906, "learning_rate": 9.937495892279272e-05, "loss": 2.6079330444335938, "memory(GiB)": 47.63, "step": 5880, "token_acc": 0.4798387096774194, "train_speed(iter/s)": 1.460815 }, { "epoch": 0.25213144252602715, "grad_norm": 3.526156425476074, "learning_rate": 9.937389770080535e-05, "loss": 2.1729354858398438, "memory(GiB)": 47.63, "step": 5885, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.460567 }, { "epoch": 0.25234565785527613, "grad_norm": 5.314532279968262, "learning_rate": 9.937283558436352e-05, "loss": 2.5845405578613283, "memory(GiB)": 47.63, "step": 5890, "token_acc": 0.4676258992805755, "train_speed(iter/s)": 1.460932 }, { "epoch": 0.2525598731845251, "grad_norm": 3.1484577655792236, "learning_rate": 9.937177257348645e-05, "loss": 2.2195556640625, "memory(GiB)": 47.63, "step": 5895, "token_acc": 0.517799352750809, "train_speed(iter/s)": 1.460927 }, { "epoch": 0.25277408851377403, "grad_norm": 3.156454563140869, "learning_rate": 9.93707086681934e-05, "loss": 2.505203437805176, "memory(GiB)": 47.63, "step": 5900, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.460815 }, { "epoch": 0.252988303843023, "grad_norm": 3.0690927505493164, "learning_rate": 9.936964386850366e-05, "loss": 2.4637630462646483, "memory(GiB)": 47.63, "step": 5905, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.460764 }, { "epoch": 0.253202519172272, "grad_norm": 3.5255067348480225, "learning_rate": 9.93685781744365e-05, "loss": 2.435877227783203, "memory(GiB)": 47.63, "step": 5910, "token_acc": 0.458041958041958, "train_speed(iter/s)": 1.460675 }, { "epoch": 0.2534167345015209, "grad_norm": 3.420804023742676, "learning_rate": 9.936751158601124e-05, "loss": 2.7171825408935546, "memory(GiB)": 47.63, "step": 5915, "token_acc": 0.4075342465753425, "train_speed(iter/s)": 1.460974 }, { "epoch": 0.2536309498307699, "grad_norm": 7.083895206451416, "learning_rate": 9.93664441032472e-05, "loss": 2.4212947845458985, "memory(GiB)": 47.63, "step": 5920, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.461293 }, { "epoch": 0.25384516516001887, "grad_norm": 4.429103374481201, "learning_rate": 9.936537572616372e-05, "loss": 2.6738073348999025, "memory(GiB)": 47.63, "step": 5925, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.46146 }, { "epoch": 0.2540593804892678, "grad_norm": 4.744236946105957, "learning_rate": 9.936430645478014e-05, "loss": 2.6790178298950194, "memory(GiB)": 47.63, "step": 5930, "token_acc": 0.4440677966101695, "train_speed(iter/s)": 1.461148 }, { "epoch": 0.2542735958185168, "grad_norm": 4.812023162841797, "learning_rate": 9.936323628911584e-05, "loss": 2.839132308959961, "memory(GiB)": 47.63, "step": 5935, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.46138 }, { "epoch": 0.25448781114776575, "grad_norm": 5.341465950012207, "learning_rate": 9.936216522919021e-05, "loss": 2.2750164031982423, "memory(GiB)": 47.63, "step": 5940, "token_acc": 0.5450980392156862, "train_speed(iter/s)": 1.461362 }, { "epoch": 0.2547020264770147, "grad_norm": 3.449023723602295, "learning_rate": 9.936109327502266e-05, "loss": 2.6081863403320313, "memory(GiB)": 47.63, "step": 5945, "token_acc": 0.48255813953488375, "train_speed(iter/s)": 1.461467 }, { "epoch": 0.25491624180626365, "grad_norm": 5.677049160003662, "learning_rate": 9.936002042663258e-05, "loss": 2.8544898986816407, "memory(GiB)": 47.63, "step": 5950, "token_acc": 0.41843971631205673, "train_speed(iter/s)": 1.461554 }, { "epoch": 0.25513045713551263, "grad_norm": 4.046048641204834, "learning_rate": 9.935894668403945e-05, "loss": 2.7638923645019533, "memory(GiB)": 47.63, "step": 5955, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.461758 }, { "epoch": 0.25534467246476156, "grad_norm": 5.44657564163208, "learning_rate": 9.935787204726268e-05, "loss": 2.677348518371582, "memory(GiB)": 47.63, "step": 5960, "token_acc": 0.4426751592356688, "train_speed(iter/s)": 1.461933 }, { "epoch": 0.25555888779401054, "grad_norm": 3.4075331687927246, "learning_rate": 9.935679651632177e-05, "loss": 2.6634239196777343, "memory(GiB)": 47.63, "step": 5965, "token_acc": 0.4423676012461059, "train_speed(iter/s)": 1.46191 }, { "epoch": 0.2557731031232595, "grad_norm": 4.674412250518799, "learning_rate": 9.935572009123619e-05, "loss": 2.6897483825683595, "memory(GiB)": 47.63, "step": 5970, "token_acc": 0.4180064308681672, "train_speed(iter/s)": 1.461873 }, { "epoch": 0.25598731845250844, "grad_norm": 3.6903035640716553, "learning_rate": 9.935464277202544e-05, "loss": 2.708321189880371, "memory(GiB)": 47.63, "step": 5975, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.46182 }, { "epoch": 0.2562015337817574, "grad_norm": 3.725214958190918, "learning_rate": 9.935356455870904e-05, "loss": 2.417351722717285, "memory(GiB)": 47.63, "step": 5980, "token_acc": 0.4789272030651341, "train_speed(iter/s)": 1.461616 }, { "epoch": 0.2564157491110064, "grad_norm": 3.872201919555664, "learning_rate": 9.93524854513065e-05, "loss": 2.609356689453125, "memory(GiB)": 47.63, "step": 5985, "token_acc": 0.4548736462093863, "train_speed(iter/s)": 1.461727 }, { "epoch": 0.2566299644402553, "grad_norm": 3.2694787979125977, "learning_rate": 9.93514054498374e-05, "loss": 2.5014877319335938, "memory(GiB)": 47.63, "step": 5990, "token_acc": 0.49615384615384617, "train_speed(iter/s)": 1.461838 }, { "epoch": 0.2568441797695043, "grad_norm": 5.387357234954834, "learning_rate": 9.93503245543213e-05, "loss": 2.867146110534668, "memory(GiB)": 47.63, "step": 5995, "token_acc": 0.41754385964912283, "train_speed(iter/s)": 1.461973 }, { "epoch": 0.2570583950987533, "grad_norm": 4.068348407745361, "learning_rate": 9.934924276477779e-05, "loss": 2.9688154220581056, "memory(GiB)": 47.63, "step": 6000, "token_acc": 0.43521594684385384, "train_speed(iter/s)": 1.461975 }, { "epoch": 0.2570583950987533, "eval_loss": 2.512045383453369, "eval_runtime": 14.2773, "eval_samples_per_second": 7.004, "eval_steps_per_second": 7.004, "eval_token_acc": 0.4553140096618358, "step": 6000 }, { "epoch": 0.2572726104280022, "grad_norm": 3.5032174587249756, "learning_rate": 9.934816008122643e-05, "loss": 2.8273218154907225, "memory(GiB)": 47.63, "step": 6005, "token_acc": 0.4603463992707384, "train_speed(iter/s)": 1.456857 }, { "epoch": 0.2574868257572512, "grad_norm": 4.137598991394043, "learning_rate": 9.934707650368686e-05, "loss": 2.599644660949707, "memory(GiB)": 47.63, "step": 6010, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.456824 }, { "epoch": 0.25770104108650016, "grad_norm": 4.335493087768555, "learning_rate": 9.934599203217874e-05, "loss": 2.692888069152832, "memory(GiB)": 47.63, "step": 6015, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.456937 }, { "epoch": 0.2579152564157491, "grad_norm": 4.111094951629639, "learning_rate": 9.934490666672164e-05, "loss": 2.4740135192871096, "memory(GiB)": 47.63, "step": 6020, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.457194 }, { "epoch": 0.25812947174499806, "grad_norm": 5.307859420776367, "learning_rate": 9.93438204073353e-05, "loss": 2.5906599044799803, "memory(GiB)": 47.63, "step": 6025, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.456628 }, { "epoch": 0.25834368707424704, "grad_norm": 4.8536272048950195, "learning_rate": 9.934273325403935e-05, "loss": 2.8745859146118162, "memory(GiB)": 47.63, "step": 6030, "token_acc": 0.44656488549618323, "train_speed(iter/s)": 1.456801 }, { "epoch": 0.258557902403496, "grad_norm": 4.217504501342773, "learning_rate": 9.934164520685349e-05, "loss": 2.4962957382202147, "memory(GiB)": 47.63, "step": 6035, "token_acc": 0.4942084942084942, "train_speed(iter/s)": 1.45659 }, { "epoch": 0.25877211773274494, "grad_norm": 3.7887179851531982, "learning_rate": 9.934055626579746e-05, "loss": 2.66213436126709, "memory(GiB)": 47.63, "step": 6040, "token_acc": 0.5060728744939271, "train_speed(iter/s)": 1.456629 }, { "epoch": 0.2589863330619939, "grad_norm": 3.306767225265503, "learning_rate": 9.933946643089096e-05, "loss": 2.7682987213134767, "memory(GiB)": 47.63, "step": 6045, "token_acc": 0.42165242165242167, "train_speed(iter/s)": 1.456714 }, { "epoch": 0.2592005483912429, "grad_norm": 4.921095371246338, "learning_rate": 9.933837570215374e-05, "loss": 2.4791046142578126, "memory(GiB)": 47.63, "step": 6050, "token_acc": 0.4808510638297872, "train_speed(iter/s)": 1.456622 }, { "epoch": 0.2594147637204918, "grad_norm": 3.72432279586792, "learning_rate": 9.933728407960556e-05, "loss": 2.349840545654297, "memory(GiB)": 47.63, "step": 6055, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 1.456687 }, { "epoch": 0.2596289790497408, "grad_norm": 3.4007792472839355, "learning_rate": 9.933619156326621e-05, "loss": 2.6064830780029298, "memory(GiB)": 47.63, "step": 6060, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.456835 }, { "epoch": 0.2598431943789898, "grad_norm": 4.636086940765381, "learning_rate": 9.933509815315545e-05, "loss": 2.4625308990478514, "memory(GiB)": 47.63, "step": 6065, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.456911 }, { "epoch": 0.2600574097082387, "grad_norm": 3.8582499027252197, "learning_rate": 9.933400384929313e-05, "loss": 2.564784812927246, "memory(GiB)": 47.63, "step": 6070, "token_acc": 0.4779874213836478, "train_speed(iter/s)": 1.45693 }, { "epoch": 0.2602716250374877, "grad_norm": 2.999443531036377, "learning_rate": 9.933290865169903e-05, "loss": 2.6505287170410154, "memory(GiB)": 47.63, "step": 6075, "token_acc": 0.46578947368421053, "train_speed(iter/s)": 1.457018 }, { "epoch": 0.26048584036673667, "grad_norm": 3.236865758895874, "learning_rate": 9.933181256039301e-05, "loss": 2.6455543518066404, "memory(GiB)": 47.63, "step": 6080, "token_acc": 0.4503105590062112, "train_speed(iter/s)": 1.457262 }, { "epoch": 0.2607000556959856, "grad_norm": 2.9523332118988037, "learning_rate": 9.933071557539494e-05, "loss": 2.6446355819702148, "memory(GiB)": 47.63, "step": 6085, "token_acc": 0.4862068965517241, "train_speed(iter/s)": 1.457529 }, { "epoch": 0.26091427102523457, "grad_norm": 3.2571287155151367, "learning_rate": 9.932961769672469e-05, "loss": 2.8434967041015624, "memory(GiB)": 47.63, "step": 6090, "token_acc": 0.4384858044164038, "train_speed(iter/s)": 1.457772 }, { "epoch": 0.26112848635448355, "grad_norm": 4.042792797088623, "learning_rate": 9.932851892440211e-05, "loss": 2.4596343994140626, "memory(GiB)": 47.63, "step": 6095, "token_acc": 0.45774647887323944, "train_speed(iter/s)": 1.458079 }, { "epoch": 0.26134270168373247, "grad_norm": 5.742770671844482, "learning_rate": 9.932741925844717e-05, "loss": 2.4574043273925783, "memory(GiB)": 47.63, "step": 6100, "token_acc": 0.476, "train_speed(iter/s)": 1.458153 }, { "epoch": 0.26155691701298145, "grad_norm": 3.689504623413086, "learning_rate": 9.932631869887974e-05, "loss": 2.8216827392578123, "memory(GiB)": 47.63, "step": 6105, "token_acc": 0.4440789473684211, "train_speed(iter/s)": 1.458459 }, { "epoch": 0.26177113234223043, "grad_norm": 3.9181625843048096, "learning_rate": 9.932521724571977e-05, "loss": 2.758797454833984, "memory(GiB)": 47.63, "step": 6110, "token_acc": 0.40672782874617736, "train_speed(iter/s)": 1.458645 }, { "epoch": 0.26198534767147935, "grad_norm": 3.422567367553711, "learning_rate": 9.932411489898723e-05, "loss": 2.3415912628173827, "memory(GiB)": 47.63, "step": 6115, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 1.458501 }, { "epoch": 0.26219956300072833, "grad_norm": 3.795133590698242, "learning_rate": 9.932301165870206e-05, "loss": 2.775946617126465, "memory(GiB)": 47.63, "step": 6120, "token_acc": 0.4395973154362416, "train_speed(iter/s)": 1.458252 }, { "epoch": 0.2624137783299773, "grad_norm": 3.046976327896118, "learning_rate": 9.932190752488428e-05, "loss": 2.5555233001708983, "memory(GiB)": 47.63, "step": 6125, "token_acc": 0.4398826979472141, "train_speed(iter/s)": 1.457662 }, { "epoch": 0.26262799365922623, "grad_norm": 3.9895241260528564, "learning_rate": 9.932080249755389e-05, "loss": 2.3629072189331053, "memory(GiB)": 47.63, "step": 6130, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.457824 }, { "epoch": 0.2628422089884752, "grad_norm": 5.900223731994629, "learning_rate": 9.931969657673088e-05, "loss": 2.648196220397949, "memory(GiB)": 47.63, "step": 6135, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.458188 }, { "epoch": 0.2630564243177242, "grad_norm": 4.17408561706543, "learning_rate": 9.931858976243531e-05, "loss": 2.629598617553711, "memory(GiB)": 47.63, "step": 6140, "token_acc": 0.4323308270676692, "train_speed(iter/s)": 1.458345 }, { "epoch": 0.2632706396469731, "grad_norm": 5.355582237243652, "learning_rate": 9.931748205468721e-05, "loss": 2.7148637771606445, "memory(GiB)": 47.63, "step": 6145, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.458381 }, { "epoch": 0.2634848549762221, "grad_norm": 3.701125144958496, "learning_rate": 9.931637345350667e-05, "loss": 2.8413522720336912, "memory(GiB)": 47.63, "step": 6150, "token_acc": 0.42696629213483145, "train_speed(iter/s)": 1.458714 }, { "epoch": 0.2636990703054711, "grad_norm": 5.480071544647217, "learning_rate": 9.931526395891375e-05, "loss": 2.6477405548095705, "memory(GiB)": 47.63, "step": 6155, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.458807 }, { "epoch": 0.26391328563472, "grad_norm": 4.121298789978027, "learning_rate": 9.931415357092858e-05, "loss": 2.474870300292969, "memory(GiB)": 47.63, "step": 6160, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.458935 }, { "epoch": 0.264127500963969, "grad_norm": 4.323131084442139, "learning_rate": 9.931304228957123e-05, "loss": 2.642374038696289, "memory(GiB)": 47.63, "step": 6165, "token_acc": 0.4134275618374558, "train_speed(iter/s)": 1.459283 }, { "epoch": 0.26434171629321795, "grad_norm": 3.5837652683258057, "learning_rate": 9.931193011486188e-05, "loss": 2.4898326873779295, "memory(GiB)": 47.63, "step": 6170, "token_acc": 0.4978540772532189, "train_speed(iter/s)": 1.459232 }, { "epoch": 0.2645559316224669, "grad_norm": 3.816152811050415, "learning_rate": 9.931081704682066e-05, "loss": 2.5765811920166017, "memory(GiB)": 47.63, "step": 6175, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.459484 }, { "epoch": 0.26477014695171586, "grad_norm": 4.142642974853516, "learning_rate": 9.930970308546772e-05, "loss": 2.852547836303711, "memory(GiB)": 47.63, "step": 6180, "token_acc": 0.4564459930313589, "train_speed(iter/s)": 1.459496 }, { "epoch": 0.26498436228096484, "grad_norm": 4.207241058349609, "learning_rate": 9.930858823082327e-05, "loss": 2.589390754699707, "memory(GiB)": 47.63, "step": 6185, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.459538 }, { "epoch": 0.26519857761021376, "grad_norm": 3.89192795753479, "learning_rate": 9.930747248290747e-05, "loss": 2.2295835494995115, "memory(GiB)": 47.63, "step": 6190, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.459644 }, { "epoch": 0.26541279293946274, "grad_norm": 5.49457311630249, "learning_rate": 9.930635584174056e-05, "loss": 2.608285903930664, "memory(GiB)": 47.63, "step": 6195, "token_acc": 0.44333333333333336, "train_speed(iter/s)": 1.459845 }, { "epoch": 0.2656270082687117, "grad_norm": 3.7768564224243164, "learning_rate": 9.930523830734276e-05, "loss": 2.72476863861084, "memory(GiB)": 47.63, "step": 6200, "token_acc": 0.4560810810810811, "train_speed(iter/s)": 1.459594 }, { "epoch": 0.2658412235979607, "grad_norm": 7.339259147644043, "learning_rate": 9.930411987973431e-05, "loss": 2.422555923461914, "memory(GiB)": 47.63, "step": 6205, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.459532 }, { "epoch": 0.2660554389272096, "grad_norm": 3.4869730472564697, "learning_rate": 9.930300055893549e-05, "loss": 2.5603126525878905, "memory(GiB)": 47.63, "step": 6210, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.459648 }, { "epoch": 0.2662696542564586, "grad_norm": 5.080729007720947, "learning_rate": 9.930188034496655e-05, "loss": 2.684031105041504, "memory(GiB)": 47.63, "step": 6215, "token_acc": 0.478125, "train_speed(iter/s)": 1.459563 }, { "epoch": 0.2664838695857076, "grad_norm": 6.193282604217529, "learning_rate": 9.93007592378478e-05, "loss": 2.7133209228515627, "memory(GiB)": 47.63, "step": 6220, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.459394 }, { "epoch": 0.2666980849149565, "grad_norm": 7.4428205490112305, "learning_rate": 9.929963723759956e-05, "loss": 2.7466331481933595, "memory(GiB)": 47.63, "step": 6225, "token_acc": 0.450354609929078, "train_speed(iter/s)": 1.459673 }, { "epoch": 0.2669123002442055, "grad_norm": 3.6143174171447754, "learning_rate": 9.929851434424216e-05, "loss": 2.7537635803222655, "memory(GiB)": 47.63, "step": 6230, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.459388 }, { "epoch": 0.26712651557345446, "grad_norm": 3.4207046031951904, "learning_rate": 9.92973905577959e-05, "loss": 2.6948078155517576, "memory(GiB)": 47.63, "step": 6235, "token_acc": 0.4440894568690096, "train_speed(iter/s)": 1.459492 }, { "epoch": 0.2673407309027034, "grad_norm": 3.170596122741699, "learning_rate": 9.929626587828118e-05, "loss": 2.574795150756836, "memory(GiB)": 47.63, "step": 6240, "token_acc": 0.4430379746835443, "train_speed(iter/s)": 1.459642 }, { "epoch": 0.26755494623195236, "grad_norm": 7.684406280517578, "learning_rate": 9.929514030571834e-05, "loss": 2.733842468261719, "memory(GiB)": 47.63, "step": 6245, "token_acc": 0.43209876543209874, "train_speed(iter/s)": 1.459705 }, { "epoch": 0.26776916156120134, "grad_norm": 4.399084568023682, "learning_rate": 9.92940138401278e-05, "loss": 2.87264347076416, "memory(GiB)": 47.63, "step": 6250, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.459662 }, { "epoch": 0.26798337689045026, "grad_norm": 3.7682693004608154, "learning_rate": 9.929288648152997e-05, "loss": 3.175109100341797, "memory(GiB)": 47.63, "step": 6255, "token_acc": 0.4161676646706587, "train_speed(iter/s)": 1.459561 }, { "epoch": 0.26819759221969924, "grad_norm": 4.389972686767578, "learning_rate": 9.929175822994526e-05, "loss": 2.734465408325195, "memory(GiB)": 47.63, "step": 6260, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.459654 }, { "epoch": 0.2684118075489482, "grad_norm": 3.4758071899414062, "learning_rate": 9.929062908539408e-05, "loss": 2.669170951843262, "memory(GiB)": 47.63, "step": 6265, "token_acc": 0.44405594405594406, "train_speed(iter/s)": 1.459501 }, { "epoch": 0.26862602287819715, "grad_norm": 3.4275498390197754, "learning_rate": 9.928949904789695e-05, "loss": 2.6379329681396486, "memory(GiB)": 47.63, "step": 6270, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.459464 }, { "epoch": 0.2688402382074461, "grad_norm": 4.899318218231201, "learning_rate": 9.928836811747429e-05, "loss": 2.7687206268310547, "memory(GiB)": 47.63, "step": 6275, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.459217 }, { "epoch": 0.2690544535366951, "grad_norm": 4.113958358764648, "learning_rate": 9.928723629414662e-05, "loss": 2.7000185012817384, "memory(GiB)": 47.63, "step": 6280, "token_acc": 0.5037037037037037, "train_speed(iter/s)": 1.459385 }, { "epoch": 0.269268668865944, "grad_norm": 4.145396709442139, "learning_rate": 9.928610357793441e-05, "loss": 2.6391624450683593, "memory(GiB)": 47.63, "step": 6285, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.459432 }, { "epoch": 0.269482884195193, "grad_norm": 5.5968918800354, "learning_rate": 9.928496996885821e-05, "loss": 2.4019641876220703, "memory(GiB)": 47.63, "step": 6290, "token_acc": 0.5060728744939271, "train_speed(iter/s)": 1.459583 }, { "epoch": 0.269697099524442, "grad_norm": 5.587214946746826, "learning_rate": 9.928383546693854e-05, "loss": 2.5242671966552734, "memory(GiB)": 47.63, "step": 6295, "token_acc": 0.5, "train_speed(iter/s)": 1.45965 }, { "epoch": 0.2699113148536909, "grad_norm": 4.530307292938232, "learning_rate": 9.928270007219598e-05, "loss": 2.593612289428711, "memory(GiB)": 47.63, "step": 6300, "token_acc": 0.4790996784565916, "train_speed(iter/s)": 1.459693 }, { "epoch": 0.2701255301829399, "grad_norm": 5.821206569671631, "learning_rate": 9.928156378465106e-05, "loss": 2.5492328643798827, "memory(GiB)": 47.63, "step": 6305, "token_acc": 0.47843137254901963, "train_speed(iter/s)": 1.460083 }, { "epoch": 0.27033974551218887, "grad_norm": 4.19746732711792, "learning_rate": 9.928042660432437e-05, "loss": 2.6815792083740235, "memory(GiB)": 47.63, "step": 6310, "token_acc": 0.42729970326409494, "train_speed(iter/s)": 1.460085 }, { "epoch": 0.2705539608414378, "grad_norm": 3.615483283996582, "learning_rate": 9.927928853123654e-05, "loss": 2.443739891052246, "memory(GiB)": 47.63, "step": 6315, "token_acc": 0.49842271293375395, "train_speed(iter/s)": 1.460239 }, { "epoch": 0.27076817617068677, "grad_norm": 5.410396099090576, "learning_rate": 9.927814956540818e-05, "loss": 2.5455259323120116, "memory(GiB)": 47.63, "step": 6320, "token_acc": 0.44981412639405205, "train_speed(iter/s)": 1.460304 }, { "epoch": 0.27098239149993575, "grad_norm": 5.281170845031738, "learning_rate": 9.927700970685989e-05, "loss": 2.694020652770996, "memory(GiB)": 47.63, "step": 6325, "token_acc": 0.46586345381526106, "train_speed(iter/s)": 1.460188 }, { "epoch": 0.27119660682918467, "grad_norm": 2.8500008583068848, "learning_rate": 9.927586895561237e-05, "loss": 2.289530372619629, "memory(GiB)": 47.63, "step": 6330, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.460152 }, { "epoch": 0.27141082215843365, "grad_norm": 4.507504463195801, "learning_rate": 9.927472731168623e-05, "loss": 2.4930225372314454, "memory(GiB)": 47.63, "step": 6335, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.460259 }, { "epoch": 0.27162503748768263, "grad_norm": 3.69659161567688, "learning_rate": 9.92735847751022e-05, "loss": 2.4775733947753906, "memory(GiB)": 47.63, "step": 6340, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.460101 }, { "epoch": 0.27183925281693155, "grad_norm": 12.177428245544434, "learning_rate": 9.927244134588095e-05, "loss": 2.730167007446289, "memory(GiB)": 47.63, "step": 6345, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.460416 }, { "epoch": 0.27205346814618053, "grad_norm": 4.370859622955322, "learning_rate": 9.927129702404321e-05, "loss": 2.5016876220703126, "memory(GiB)": 47.63, "step": 6350, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.460325 }, { "epoch": 0.2722676834754295, "grad_norm": 3.9999990463256836, "learning_rate": 9.92701518096097e-05, "loss": 2.726361083984375, "memory(GiB)": 47.63, "step": 6355, "token_acc": 0.4317343173431734, "train_speed(iter/s)": 1.460069 }, { "epoch": 0.27248189880467844, "grad_norm": 4.737161159515381, "learning_rate": 9.926900570260118e-05, "loss": 2.558404541015625, "memory(GiB)": 47.63, "step": 6360, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.460098 }, { "epoch": 0.2726961141339274, "grad_norm": 4.153711795806885, "learning_rate": 9.926785870303839e-05, "loss": 2.630443572998047, "memory(GiB)": 47.63, "step": 6365, "token_acc": 0.4682080924855491, "train_speed(iter/s)": 1.460066 }, { "epoch": 0.2729103294631764, "grad_norm": 3.8597517013549805, "learning_rate": 9.926671081094214e-05, "loss": 2.488731575012207, "memory(GiB)": 47.63, "step": 6370, "token_acc": 0.4549019607843137, "train_speed(iter/s)": 1.460238 }, { "epoch": 0.27312454479242537, "grad_norm": 4.428126335144043, "learning_rate": 9.92655620263332e-05, "loss": 2.739374351501465, "memory(GiB)": 47.63, "step": 6375, "token_acc": 0.45396825396825397, "train_speed(iter/s)": 1.460059 }, { "epoch": 0.2733387601216743, "grad_norm": 4.17251443862915, "learning_rate": 9.926441234923239e-05, "loss": 2.6665557861328124, "memory(GiB)": 47.63, "step": 6380, "token_acc": 0.44135802469135804, "train_speed(iter/s)": 1.459965 }, { "epoch": 0.2735529754509233, "grad_norm": 4.361823558807373, "learning_rate": 9.926326177966052e-05, "loss": 2.818448066711426, "memory(GiB)": 47.63, "step": 6385, "token_acc": 0.44333333333333336, "train_speed(iter/s)": 1.460136 }, { "epoch": 0.27376719078017225, "grad_norm": 3.3004982471466064, "learning_rate": 9.926211031763846e-05, "loss": 2.5723459243774416, "memory(GiB)": 47.63, "step": 6390, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.460242 }, { "epoch": 0.2739814061094212, "grad_norm": 4.8515543937683105, "learning_rate": 9.926095796318706e-05, "loss": 2.7295555114746093, "memory(GiB)": 47.63, "step": 6395, "token_acc": 0.45634920634920634, "train_speed(iter/s)": 1.460357 }, { "epoch": 0.27419562143867016, "grad_norm": 4.730801582336426, "learning_rate": 9.92598047163272e-05, "loss": 2.5801937103271486, "memory(GiB)": 47.63, "step": 6400, "token_acc": 0.4541832669322709, "train_speed(iter/s)": 1.460409 }, { "epoch": 0.27440983676791914, "grad_norm": 3.8852200508117676, "learning_rate": 9.925865057707977e-05, "loss": 3.041835403442383, "memory(GiB)": 47.63, "step": 6405, "token_acc": 0.4192546583850932, "train_speed(iter/s)": 1.460437 }, { "epoch": 0.27462405209716806, "grad_norm": 4.520515441894531, "learning_rate": 9.925749554546568e-05, "loss": 2.643325996398926, "memory(GiB)": 47.63, "step": 6410, "token_acc": 0.46440677966101696, "train_speed(iter/s)": 1.460586 }, { "epoch": 0.27483826742641704, "grad_norm": 3.491898775100708, "learning_rate": 9.925633962150584e-05, "loss": 2.39678955078125, "memory(GiB)": 47.63, "step": 6415, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.460348 }, { "epoch": 0.275052482755666, "grad_norm": 3.603445291519165, "learning_rate": 9.925518280522121e-05, "loss": 2.7103759765625, "memory(GiB)": 47.63, "step": 6420, "token_acc": 0.42946708463949845, "train_speed(iter/s)": 1.46015 }, { "epoch": 0.27526669808491494, "grad_norm": 3.179222345352173, "learning_rate": 9.925402509663273e-05, "loss": 2.5286163330078124, "memory(GiB)": 47.63, "step": 6425, "token_acc": 0.45652173913043476, "train_speed(iter/s)": 1.460259 }, { "epoch": 0.2754809134141639, "grad_norm": 3.502445936203003, "learning_rate": 9.92528664957614e-05, "loss": 2.733730697631836, "memory(GiB)": 47.63, "step": 6430, "token_acc": 0.447887323943662, "train_speed(iter/s)": 1.460065 }, { "epoch": 0.2756951287434129, "grad_norm": 4.481658935546875, "learning_rate": 9.925170700262817e-05, "loss": 2.6959110260009767, "memory(GiB)": 47.63, "step": 6435, "token_acc": 0.4489795918367347, "train_speed(iter/s)": 1.46009 }, { "epoch": 0.2759093440726618, "grad_norm": 3.666271448135376, "learning_rate": 9.925054661725406e-05, "loss": 2.81857967376709, "memory(GiB)": 47.63, "step": 6440, "token_acc": 0.4232081911262799, "train_speed(iter/s)": 1.46017 }, { "epoch": 0.2761235594019108, "grad_norm": 5.204419136047363, "learning_rate": 9.924938533966012e-05, "loss": 2.518722152709961, "memory(GiB)": 47.63, "step": 6445, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.460288 }, { "epoch": 0.2763377747311598, "grad_norm": 3.9146623611450195, "learning_rate": 9.924822316986735e-05, "loss": 2.5668308258056642, "memory(GiB)": 47.63, "step": 6450, "token_acc": 0.4738955823293173, "train_speed(iter/s)": 1.46037 }, { "epoch": 0.2765519900604087, "grad_norm": 4.472992897033691, "learning_rate": 9.924706010789683e-05, "loss": 2.757154846191406, "memory(GiB)": 47.63, "step": 6455, "token_acc": 0.39928057553956836, "train_speed(iter/s)": 1.460136 }, { "epoch": 0.2767662053896577, "grad_norm": 4.2314605712890625, "learning_rate": 9.924589615376962e-05, "loss": 2.5920293807983397, "memory(GiB)": 47.63, "step": 6460, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.460015 }, { "epoch": 0.27698042071890666, "grad_norm": 3.437063217163086, "learning_rate": 9.92447313075068e-05, "loss": 2.541089630126953, "memory(GiB)": 47.63, "step": 6465, "token_acc": 0.4691358024691358, "train_speed(iter/s)": 1.459735 }, { "epoch": 0.2771946360481556, "grad_norm": 5.263180255889893, "learning_rate": 9.924356556912946e-05, "loss": 2.538657379150391, "memory(GiB)": 47.63, "step": 6470, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.459798 }, { "epoch": 0.27740885137740456, "grad_norm": 4.6755852699279785, "learning_rate": 9.924239893865874e-05, "loss": 2.6377643585205077, "memory(GiB)": 47.63, "step": 6475, "token_acc": 0.48616600790513836, "train_speed(iter/s)": 1.46002 }, { "epoch": 0.27762306670665354, "grad_norm": 3.1691155433654785, "learning_rate": 9.924123141611578e-05, "loss": 2.4227458953857424, "memory(GiB)": 47.63, "step": 6480, "token_acc": 0.4901185770750988, "train_speed(iter/s)": 1.460192 }, { "epoch": 0.27783728203590247, "grad_norm": 5.500218391418457, "learning_rate": 9.924006300152173e-05, "loss": 2.5818939208984375, "memory(GiB)": 47.63, "step": 6485, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.460192 }, { "epoch": 0.27805149736515145, "grad_norm": 3.576978921890259, "learning_rate": 9.923889369489774e-05, "loss": 2.544826126098633, "memory(GiB)": 47.63, "step": 6490, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.460026 }, { "epoch": 0.2782657126944004, "grad_norm": 4.320828914642334, "learning_rate": 9.9237723496265e-05, "loss": 2.27396297454834, "memory(GiB)": 47.63, "step": 6495, "token_acc": 0.504950495049505, "train_speed(iter/s)": 1.460027 }, { "epoch": 0.27847992802364935, "grad_norm": 5.061696529388428, "learning_rate": 9.923655240564472e-05, "loss": 2.7767749786376954, "memory(GiB)": 47.63, "step": 6500, "token_acc": 0.43661971830985913, "train_speed(iter/s)": 1.460112 }, { "epoch": 0.27847992802364935, "eval_loss": 2.2300853729248047, "eval_runtime": 13.9267, "eval_samples_per_second": 7.18, "eval_steps_per_second": 7.18, "eval_token_acc": 0.47348951911220716, "step": 6500 }, { "epoch": 0.2786941433528983, "grad_norm": 3.8164281845092773, "learning_rate": 9.92353804230581e-05, "loss": 2.7503490447998047, "memory(GiB)": 47.63, "step": 6505, "token_acc": 0.4629294755877034, "train_speed(iter/s)": 1.455262 }, { "epoch": 0.2789083586821473, "grad_norm": 5.132174015045166, "learning_rate": 9.923420754852634e-05, "loss": 2.845453453063965, "memory(GiB)": 47.63, "step": 6510, "token_acc": 0.4187725631768953, "train_speed(iter/s)": 1.455479 }, { "epoch": 0.27912257401139623, "grad_norm": 4.182595252990723, "learning_rate": 9.923303378207077e-05, "loss": 2.765598678588867, "memory(GiB)": 47.63, "step": 6515, "token_acc": 0.46875, "train_speed(iter/s)": 1.455863 }, { "epoch": 0.2793367893406452, "grad_norm": 5.0708394050598145, "learning_rate": 9.92318591237126e-05, "loss": 2.082514762878418, "memory(GiB)": 47.63, "step": 6520, "token_acc": 0.5726141078838174, "train_speed(iter/s)": 1.455809 }, { "epoch": 0.2795510046698942, "grad_norm": 4.086470127105713, "learning_rate": 9.923068357347312e-05, "loss": 2.5974002838134767, "memory(GiB)": 47.63, "step": 6525, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.456167 }, { "epoch": 0.2797652199991431, "grad_norm": 10.000062942504883, "learning_rate": 9.92295071313736e-05, "loss": 2.7181060791015623, "memory(GiB)": 47.63, "step": 6530, "token_acc": 0.4554794520547945, "train_speed(iter/s)": 1.456112 }, { "epoch": 0.2799794353283921, "grad_norm": 3.311910629272461, "learning_rate": 9.922832979743542e-05, "loss": 2.5410552978515626, "memory(GiB)": 47.63, "step": 6535, "token_acc": 0.44598337950138506, "train_speed(iter/s)": 1.456212 }, { "epoch": 0.28019365065764107, "grad_norm": 4.854501724243164, "learning_rate": 9.922715157167984e-05, "loss": 2.5051097869873047, "memory(GiB)": 47.63, "step": 6540, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.456141 }, { "epoch": 0.28040786598689005, "grad_norm": 4.3800129890441895, "learning_rate": 9.922597245412822e-05, "loss": 2.8104381561279297, "memory(GiB)": 47.63, "step": 6545, "token_acc": 0.4405144694533762, "train_speed(iter/s)": 1.456161 }, { "epoch": 0.28062208131613897, "grad_norm": 3.968067169189453, "learning_rate": 9.922479244480194e-05, "loss": 2.8367366790771484, "memory(GiB)": 47.63, "step": 6550, "token_acc": 0.44155844155844154, "train_speed(iter/s)": 1.456125 }, { "epoch": 0.28083629664538795, "grad_norm": 3.8599963188171387, "learning_rate": 9.922361154372237e-05, "loss": 2.4901859283447267, "memory(GiB)": 47.63, "step": 6555, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.456231 }, { "epoch": 0.28105051197463693, "grad_norm": 4.0197296142578125, "learning_rate": 9.922242975091092e-05, "loss": 2.4429126739501954, "memory(GiB)": 47.63, "step": 6560, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.456392 }, { "epoch": 0.28126472730388585, "grad_norm": 5.221249580383301, "learning_rate": 9.922124706638896e-05, "loss": 2.9108280181884765, "memory(GiB)": 47.63, "step": 6565, "token_acc": 0.43859649122807015, "train_speed(iter/s)": 1.456277 }, { "epoch": 0.28147894263313483, "grad_norm": 3.7353672981262207, "learning_rate": 9.922006349017793e-05, "loss": 2.744065284729004, "memory(GiB)": 47.63, "step": 6570, "token_acc": 0.44274809160305345, "train_speed(iter/s)": 1.456425 }, { "epoch": 0.2816931579623838, "grad_norm": 3.9291000366210938, "learning_rate": 9.92188790222993e-05, "loss": 2.686320114135742, "memory(GiB)": 47.63, "step": 6575, "token_acc": 0.47388059701492535, "train_speed(iter/s)": 1.456198 }, { "epoch": 0.28190737329163273, "grad_norm": 4.82513427734375, "learning_rate": 9.921769366277449e-05, "loss": 2.505121612548828, "memory(GiB)": 47.63, "step": 6580, "token_acc": 0.5038461538461538, "train_speed(iter/s)": 1.456238 }, { "epoch": 0.2821215886208817, "grad_norm": 4.253800868988037, "learning_rate": 9.9216507411625e-05, "loss": 2.924699401855469, "memory(GiB)": 47.63, "step": 6585, "token_acc": 0.4198250728862974, "train_speed(iter/s)": 1.456429 }, { "epoch": 0.2823358039501307, "grad_norm": 4.070432186126709, "learning_rate": 9.92153202688723e-05, "loss": 2.3470638275146483, "memory(GiB)": 47.63, "step": 6590, "token_acc": 0.5409252669039146, "train_speed(iter/s)": 1.456437 }, { "epoch": 0.2825500192793796, "grad_norm": 3.762282371520996, "learning_rate": 9.921413223453791e-05, "loss": 2.666696548461914, "memory(GiB)": 47.63, "step": 6595, "token_acc": 0.4218289085545723, "train_speed(iter/s)": 1.456519 }, { "epoch": 0.2827642346086286, "grad_norm": 4.166807174682617, "learning_rate": 9.921294330864334e-05, "loss": 2.8448354721069338, "memory(GiB)": 47.63, "step": 6600, "token_acc": 0.46686746987951805, "train_speed(iter/s)": 1.456385 }, { "epoch": 0.2829784499378776, "grad_norm": 5.437010765075684, "learning_rate": 9.921175349121015e-05, "loss": 2.6816150665283205, "memory(GiB)": 47.63, "step": 6605, "token_acc": 0.4416058394160584, "train_speed(iter/s)": 1.456789 }, { "epoch": 0.2831926652671265, "grad_norm": 3.6787517070770264, "learning_rate": 9.921056278225986e-05, "loss": 2.4739849090576174, "memory(GiB)": 47.63, "step": 6610, "token_acc": 0.5101351351351351, "train_speed(iter/s)": 1.456958 }, { "epoch": 0.2834068805963755, "grad_norm": 2.8377857208251953, "learning_rate": 9.920937118181408e-05, "loss": 2.6592750549316406, "memory(GiB)": 47.63, "step": 6615, "token_acc": 0.4169278996865204, "train_speed(iter/s)": 1.456925 }, { "epoch": 0.28362109592562446, "grad_norm": 3.365974187850952, "learning_rate": 9.920817868989439e-05, "loss": 2.3467428207397463, "memory(GiB)": 47.63, "step": 6620, "token_acc": 0.5303514376996805, "train_speed(iter/s)": 1.457005 }, { "epoch": 0.2838353112548734, "grad_norm": 4.327791213989258, "learning_rate": 9.920698530652235e-05, "loss": 2.6440120697021485, "memory(GiB)": 47.63, "step": 6625, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.456917 }, { "epoch": 0.28404952658412236, "grad_norm": 2.8808248043060303, "learning_rate": 9.920579103171963e-05, "loss": 2.6981765747070314, "memory(GiB)": 47.63, "step": 6630, "token_acc": 0.5016835016835017, "train_speed(iter/s)": 1.457114 }, { "epoch": 0.28426374191337134, "grad_norm": 4.228165626525879, "learning_rate": 9.920459586550785e-05, "loss": 2.6802579879760744, "memory(GiB)": 47.63, "step": 6635, "token_acc": 0.45660377358490567, "train_speed(iter/s)": 1.457054 }, { "epoch": 0.28447795724262026, "grad_norm": 4.075457572937012, "learning_rate": 9.920339980790864e-05, "loss": 2.4691463470458985, "memory(GiB)": 47.63, "step": 6640, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.45718 }, { "epoch": 0.28469217257186924, "grad_norm": 3.6696760654449463, "learning_rate": 9.92022028589437e-05, "loss": 2.7696184158325194, "memory(GiB)": 47.63, "step": 6645, "token_acc": 0.46886446886446886, "train_speed(iter/s)": 1.457392 }, { "epoch": 0.2849063879011182, "grad_norm": 3.7545242309570312, "learning_rate": 9.920100501863472e-05, "loss": 2.781435012817383, "memory(GiB)": 47.63, "step": 6650, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.457431 }, { "epoch": 0.28512060323036714, "grad_norm": 3.9622879028320312, "learning_rate": 9.919980628700335e-05, "loss": 2.5763864517211914, "memory(GiB)": 47.63, "step": 6655, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.457464 }, { "epoch": 0.2853348185596161, "grad_norm": 5.135830879211426, "learning_rate": 9.919860666407135e-05, "loss": 2.903422546386719, "memory(GiB)": 47.63, "step": 6660, "token_acc": 0.4277456647398844, "train_speed(iter/s)": 1.457552 }, { "epoch": 0.2855490338888651, "grad_norm": 4.834655284881592, "learning_rate": 9.919740614986043e-05, "loss": 2.7380495071411133, "memory(GiB)": 47.63, "step": 6665, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.457405 }, { "epoch": 0.285763249218114, "grad_norm": 3.739605188369751, "learning_rate": 9.919620474439236e-05, "loss": 2.609376907348633, "memory(GiB)": 47.63, "step": 6670, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.457537 }, { "epoch": 0.285977464547363, "grad_norm": 3.5914204120635986, "learning_rate": 9.919500244768886e-05, "loss": 2.6851119995117188, "memory(GiB)": 47.63, "step": 6675, "token_acc": 0.43728813559322033, "train_speed(iter/s)": 1.457429 }, { "epoch": 0.286191679876612, "grad_norm": 3.5740292072296143, "learning_rate": 9.919379925977177e-05, "loss": 2.534268379211426, "memory(GiB)": 47.63, "step": 6680, "token_acc": 0.48, "train_speed(iter/s)": 1.457522 }, { "epoch": 0.2864058952058609, "grad_norm": 3.6544411182403564, "learning_rate": 9.919259518066285e-05, "loss": 2.413597297668457, "memory(GiB)": 47.63, "step": 6685, "token_acc": 0.4701195219123506, "train_speed(iter/s)": 1.457378 }, { "epoch": 0.2866201105351099, "grad_norm": 4.151614665985107, "learning_rate": 9.919139021038392e-05, "loss": 2.855759620666504, "memory(GiB)": 47.63, "step": 6690, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.457442 }, { "epoch": 0.28683432586435886, "grad_norm": 5.356203556060791, "learning_rate": 9.919018434895681e-05, "loss": 2.4592670440673827, "memory(GiB)": 47.63, "step": 6695, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.457558 }, { "epoch": 0.2870485411936078, "grad_norm": 4.413278579711914, "learning_rate": 9.918897759640338e-05, "loss": 2.4328805923461916, "memory(GiB)": 47.63, "step": 6700, "token_acc": 0.496, "train_speed(iter/s)": 1.457612 }, { "epoch": 0.28726275652285677, "grad_norm": 4.578810691833496, "learning_rate": 9.918776995274547e-05, "loss": 2.7367570877075194, "memory(GiB)": 47.63, "step": 6705, "token_acc": 0.43506493506493504, "train_speed(iter/s)": 1.4576 }, { "epoch": 0.28747697185210574, "grad_norm": 4.948911190032959, "learning_rate": 9.918656141800496e-05, "loss": 2.6622766494750976, "memory(GiB)": 47.63, "step": 6710, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.45722 }, { "epoch": 0.2876911871813547, "grad_norm": 4.788825035095215, "learning_rate": 9.918535199220376e-05, "loss": 2.6083139419555663, "memory(GiB)": 47.63, "step": 6715, "token_acc": 0.44660194174757284, "train_speed(iter/s)": 1.457461 }, { "epoch": 0.28790540251060365, "grad_norm": 3.6893582344055176, "learning_rate": 9.918414167536376e-05, "loss": 2.7440122604370116, "memory(GiB)": 47.63, "step": 6720, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.457459 }, { "epoch": 0.2881196178398526, "grad_norm": 5.994373798370361, "learning_rate": 9.918293046750689e-05, "loss": 2.5064340591430665, "memory(GiB)": 47.63, "step": 6725, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 1.45765 }, { "epoch": 0.2883338331691016, "grad_norm": 4.485553741455078, "learning_rate": 9.918171836865511e-05, "loss": 2.9492183685302735, "memory(GiB)": 47.63, "step": 6730, "token_acc": 0.4421052631578947, "train_speed(iter/s)": 1.457606 }, { "epoch": 0.28854804849835053, "grad_norm": 4.04892635345459, "learning_rate": 9.918050537883037e-05, "loss": 2.4176584243774415, "memory(GiB)": 47.63, "step": 6735, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.457516 }, { "epoch": 0.2887622638275995, "grad_norm": 3.8965468406677246, "learning_rate": 9.917929149805462e-05, "loss": 2.4605600357055666, "memory(GiB)": 47.63, "step": 6740, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.457132 }, { "epoch": 0.2889764791568485, "grad_norm": 5.634366989135742, "learning_rate": 9.917807672634989e-05, "loss": 2.6476724624633787, "memory(GiB)": 47.63, "step": 6745, "token_acc": 0.416988416988417, "train_speed(iter/s)": 1.457053 }, { "epoch": 0.2891906944860974, "grad_norm": 3.4423749446868896, "learning_rate": 9.917686106373816e-05, "loss": 2.393769454956055, "memory(GiB)": 47.63, "step": 6750, "token_acc": 0.5198412698412699, "train_speed(iter/s)": 1.457149 }, { "epoch": 0.2894049098153464, "grad_norm": 3.979480028152466, "learning_rate": 9.917564451024147e-05, "loss": 2.6255361557006838, "memory(GiB)": 47.63, "step": 6755, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.457225 }, { "epoch": 0.28961912514459537, "grad_norm": 3.3376545906066895, "learning_rate": 9.917442706588183e-05, "loss": 2.601759910583496, "memory(GiB)": 47.63, "step": 6760, "token_acc": 0.46107784431137727, "train_speed(iter/s)": 1.457244 }, { "epoch": 0.2898333404738443, "grad_norm": 3.493684768676758, "learning_rate": 9.917320873068132e-05, "loss": 2.507145118713379, "memory(GiB)": 47.63, "step": 6765, "token_acc": 0.4876712328767123, "train_speed(iter/s)": 1.457043 }, { "epoch": 0.29004755580309327, "grad_norm": 4.453976631164551, "learning_rate": 9.9171989504662e-05, "loss": 2.54097843170166, "memory(GiB)": 47.63, "step": 6770, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.457142 }, { "epoch": 0.29026177113234225, "grad_norm": 4.38257360458374, "learning_rate": 9.917076938784597e-05, "loss": 2.573847198486328, "memory(GiB)": 47.63, "step": 6775, "token_acc": 0.4459016393442623, "train_speed(iter/s)": 1.457361 }, { "epoch": 0.2904759864615912, "grad_norm": 4.03444766998291, "learning_rate": 9.916954838025533e-05, "loss": 2.2041826248168945, "memory(GiB)": 47.63, "step": 6780, "token_acc": 0.5365853658536586, "train_speed(iter/s)": 1.457458 }, { "epoch": 0.29069020179084015, "grad_norm": 4.5989789962768555, "learning_rate": 9.91683264819122e-05, "loss": 2.7307586669921875, "memory(GiB)": 47.63, "step": 6785, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.457661 }, { "epoch": 0.29090441712008913, "grad_norm": 6.401525497436523, "learning_rate": 9.91671036928387e-05, "loss": 2.385505485534668, "memory(GiB)": 47.63, "step": 6790, "token_acc": 0.48398576512455516, "train_speed(iter/s)": 1.457658 }, { "epoch": 0.29111863244933806, "grad_norm": 4.399469375610352, "learning_rate": 9.9165880013057e-05, "loss": 2.6143795013427735, "memory(GiB)": 47.63, "step": 6795, "token_acc": 0.45977011494252873, "train_speed(iter/s)": 1.457322 }, { "epoch": 0.29133284777858703, "grad_norm": 3.8054795265197754, "learning_rate": 9.916465544258926e-05, "loss": 2.313238525390625, "memory(GiB)": 47.63, "step": 6800, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.457467 }, { "epoch": 0.291547063107836, "grad_norm": 4.2663984298706055, "learning_rate": 9.916342998145766e-05, "loss": 2.6732725143432616, "memory(GiB)": 47.63, "step": 6805, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.457199 }, { "epoch": 0.29176127843708494, "grad_norm": 7.208676815032959, "learning_rate": 9.916220362968443e-05, "loss": 2.809499740600586, "memory(GiB)": 47.63, "step": 6810, "token_acc": 0.4392523364485981, "train_speed(iter/s)": 1.457302 }, { "epoch": 0.2919754937663339, "grad_norm": 5.0455427169799805, "learning_rate": 9.916097638729174e-05, "loss": 2.493167686462402, "memory(GiB)": 47.63, "step": 6815, "token_acc": 0.49226006191950467, "train_speed(iter/s)": 1.457444 }, { "epoch": 0.2921897090955829, "grad_norm": 3.188335418701172, "learning_rate": 9.915974825430187e-05, "loss": 2.6794239044189454, "memory(GiB)": 47.63, "step": 6820, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.457734 }, { "epoch": 0.2924039244248318, "grad_norm": 4.809487819671631, "learning_rate": 9.915851923073702e-05, "loss": 2.5735183715820313, "memory(GiB)": 47.63, "step": 6825, "token_acc": 0.49166666666666664, "train_speed(iter/s)": 1.457673 }, { "epoch": 0.2926181397540808, "grad_norm": 3.612699031829834, "learning_rate": 9.915728931661949e-05, "loss": 2.7919570922851564, "memory(GiB)": 47.63, "step": 6830, "token_acc": 0.43728813559322033, "train_speed(iter/s)": 1.457787 }, { "epoch": 0.2928323550833298, "grad_norm": 3.884958267211914, "learning_rate": 9.915605851197156e-05, "loss": 2.87333984375, "memory(GiB)": 47.63, "step": 6835, "token_acc": 0.4367816091954023, "train_speed(iter/s)": 1.457947 }, { "epoch": 0.2930465704125787, "grad_norm": 4.934978485107422, "learning_rate": 9.91548268168155e-05, "loss": 2.6373437881469726, "memory(GiB)": 47.63, "step": 6840, "token_acc": 0.4657534246575342, "train_speed(iter/s)": 1.457901 }, { "epoch": 0.2932607857418277, "grad_norm": 5.34118127822876, "learning_rate": 9.915359423117366e-05, "loss": 2.6255226135253906, "memory(GiB)": 47.63, "step": 6845, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.457858 }, { "epoch": 0.29347500107107666, "grad_norm": 3.7396907806396484, "learning_rate": 9.915236075506833e-05, "loss": 2.570332336425781, "memory(GiB)": 47.63, "step": 6850, "token_acc": 0.4318181818181818, "train_speed(iter/s)": 1.458032 }, { "epoch": 0.2936892164003256, "grad_norm": 4.2154059410095215, "learning_rate": 9.915112638852188e-05, "loss": 2.5960338592529295, "memory(GiB)": 47.63, "step": 6855, "token_acc": 0.47843137254901963, "train_speed(iter/s)": 1.457974 }, { "epoch": 0.29390343172957456, "grad_norm": 5.163266658782959, "learning_rate": 9.914989113155668e-05, "loss": 2.8140689849853517, "memory(GiB)": 47.63, "step": 6860, "token_acc": 0.40594059405940597, "train_speed(iter/s)": 1.458094 }, { "epoch": 0.29411764705882354, "grad_norm": 5.603063106536865, "learning_rate": 9.91486549841951e-05, "loss": 2.5724794387817385, "memory(GiB)": 47.63, "step": 6865, "token_acc": 0.440625, "train_speed(iter/s)": 1.458086 }, { "epoch": 0.2943318623880725, "grad_norm": 4.696154594421387, "learning_rate": 9.914741794645952e-05, "loss": 2.582904815673828, "memory(GiB)": 47.63, "step": 6870, "token_acc": 0.47115384615384615, "train_speed(iter/s)": 1.458003 }, { "epoch": 0.29454607771732144, "grad_norm": 4.207950115203857, "learning_rate": 9.914618001837235e-05, "loss": 2.408650016784668, "memory(GiB)": 47.63, "step": 6875, "token_acc": 0.5211267605633803, "train_speed(iter/s)": 1.457964 }, { "epoch": 0.2947602930465704, "grad_norm": 4.780416965484619, "learning_rate": 9.914494119995604e-05, "loss": 2.9929107666015624, "memory(GiB)": 47.63, "step": 6880, "token_acc": 0.40202702702702703, "train_speed(iter/s)": 1.457783 }, { "epoch": 0.2949745083758194, "grad_norm": 4.01948881149292, "learning_rate": 9.914370149123302e-05, "loss": 2.7052196502685546, "memory(GiB)": 47.63, "step": 6885, "token_acc": 0.46254071661237783, "train_speed(iter/s)": 1.45741 }, { "epoch": 0.2951887237050683, "grad_norm": 3.457794666290283, "learning_rate": 9.914246089222575e-05, "loss": 2.8521696090698243, "memory(GiB)": 47.63, "step": 6890, "token_acc": 0.42036553524804177, "train_speed(iter/s)": 1.457387 }, { "epoch": 0.2954029390343173, "grad_norm": 6.059695243835449, "learning_rate": 9.914121940295669e-05, "loss": 2.523684501647949, "memory(GiB)": 47.63, "step": 6895, "token_acc": 0.48046875, "train_speed(iter/s)": 1.457385 }, { "epoch": 0.2956171543635663, "grad_norm": 5.192733287811279, "learning_rate": 9.913997702344834e-05, "loss": 2.4845111846923826, "memory(GiB)": 47.63, "step": 6900, "token_acc": 0.5111940298507462, "train_speed(iter/s)": 1.457472 }, { "epoch": 0.2958313696928152, "grad_norm": 3.800037145614624, "learning_rate": 9.913873375372321e-05, "loss": 2.552079772949219, "memory(GiB)": 47.63, "step": 6905, "token_acc": 0.4645390070921986, "train_speed(iter/s)": 1.457481 }, { "epoch": 0.2960455850220642, "grad_norm": 3.646867513656616, "learning_rate": 9.913748959380382e-05, "loss": 2.5172801971435548, "memory(GiB)": 47.63, "step": 6910, "token_acc": 0.477124183006536, "train_speed(iter/s)": 1.457499 }, { "epoch": 0.29625980035131316, "grad_norm": 3.363344669342041, "learning_rate": 9.913624454371273e-05, "loss": 2.3248594284057615, "memory(GiB)": 47.63, "step": 6915, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.457524 }, { "epoch": 0.2964740156805621, "grad_norm": 3.9670493602752686, "learning_rate": 9.913499860347246e-05, "loss": 2.7420141220092775, "memory(GiB)": 47.63, "step": 6920, "token_acc": 0.41304347826086957, "train_speed(iter/s)": 1.457685 }, { "epoch": 0.29668823100981107, "grad_norm": 5.507998466491699, "learning_rate": 9.91337517731056e-05, "loss": 2.9616031646728516, "memory(GiB)": 47.63, "step": 6925, "token_acc": 0.4177215189873418, "train_speed(iter/s)": 1.457645 }, { "epoch": 0.29690244633906004, "grad_norm": 3.6019091606140137, "learning_rate": 9.913250405263474e-05, "loss": 2.7343515396118163, "memory(GiB)": 47.63, "step": 6930, "token_acc": 0.4696969696969697, "train_speed(iter/s)": 1.45789 }, { "epoch": 0.29711666166830897, "grad_norm": 4.727652549743652, "learning_rate": 9.913125544208248e-05, "loss": 2.784088897705078, "memory(GiB)": 47.63, "step": 6935, "token_acc": 0.4229390681003584, "train_speed(iter/s)": 1.458068 }, { "epoch": 0.29733087699755795, "grad_norm": 5.329716682434082, "learning_rate": 9.913000594147144e-05, "loss": 2.8175352096557615, "memory(GiB)": 47.63, "step": 6940, "token_acc": 0.4262295081967213, "train_speed(iter/s)": 1.458009 }, { "epoch": 0.2975450923268069, "grad_norm": 3.970320224761963, "learning_rate": 9.912875555082425e-05, "loss": 2.7154836654663086, "memory(GiB)": 47.63, "step": 6945, "token_acc": 0.43380281690140843, "train_speed(iter/s)": 1.457928 }, { "epoch": 0.29775930765605585, "grad_norm": 3.9871678352355957, "learning_rate": 9.912750427016356e-05, "loss": 2.4553781509399415, "memory(GiB)": 47.63, "step": 6950, "token_acc": 0.45660377358490567, "train_speed(iter/s)": 1.457848 }, { "epoch": 0.29797352298530483, "grad_norm": 3.7019360065460205, "learning_rate": 9.912625209951206e-05, "loss": 2.7154155731201173, "memory(GiB)": 47.63, "step": 6955, "token_acc": 0.4331983805668016, "train_speed(iter/s)": 1.458222 }, { "epoch": 0.2981877383145538, "grad_norm": 3.8082334995269775, "learning_rate": 9.91249990388924e-05, "loss": 2.478436279296875, "memory(GiB)": 47.63, "step": 6960, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.45805 }, { "epoch": 0.29840195364380273, "grad_norm": 2.990219831466675, "learning_rate": 9.912374508832732e-05, "loss": 2.3979909896850584, "memory(GiB)": 47.63, "step": 6965, "token_acc": 0.5, "train_speed(iter/s)": 1.457916 }, { "epoch": 0.2986161689730517, "grad_norm": 4.876752853393555, "learning_rate": 9.912249024783951e-05, "loss": 2.6312931060791014, "memory(GiB)": 47.63, "step": 6970, "token_acc": 0.4504792332268371, "train_speed(iter/s)": 1.457852 }, { "epoch": 0.2988303843023007, "grad_norm": 3.5672826766967773, "learning_rate": 9.91212345174517e-05, "loss": 2.378168487548828, "memory(GiB)": 47.63, "step": 6975, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.458002 }, { "epoch": 0.2990445996315496, "grad_norm": 3.7002193927764893, "learning_rate": 9.911997789718666e-05, "loss": 2.4658145904541016, "memory(GiB)": 47.63, "step": 6980, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.457727 }, { "epoch": 0.2992588149607986, "grad_norm": 3.7269675731658936, "learning_rate": 9.911872038706713e-05, "loss": 2.325260353088379, "memory(GiB)": 47.63, "step": 6985, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.45787 }, { "epoch": 0.29947303029004757, "grad_norm": 3.892512559890747, "learning_rate": 9.911746198711591e-05, "loss": 2.2504718780517576, "memory(GiB)": 47.63, "step": 6990, "token_acc": 0.5288888888888889, "train_speed(iter/s)": 1.457971 }, { "epoch": 0.2996872456192965, "grad_norm": 5.19597053527832, "learning_rate": 9.911620269735578e-05, "loss": 2.3118865966796873, "memory(GiB)": 47.63, "step": 6995, "token_acc": 0.5269709543568465, "train_speed(iter/s)": 1.457796 }, { "epoch": 0.2999014609485455, "grad_norm": 3.593531370162964, "learning_rate": 9.911494251780957e-05, "loss": 2.300238037109375, "memory(GiB)": 47.63, "step": 7000, "token_acc": 0.47333333333333333, "train_speed(iter/s)": 1.457821 }, { "epoch": 0.2999014609485455, "eval_loss": 2.3745665550231934, "eval_runtime": 14.7829, "eval_samples_per_second": 6.765, "eval_steps_per_second": 6.765, "eval_token_acc": 0.4563445867287544, "step": 7000 }, { "epoch": 0.30011567627779445, "grad_norm": 4.208698749542236, "learning_rate": 9.911368144850011e-05, "loss": 2.863749694824219, "memory(GiB)": 47.63, "step": 7005, "token_acc": 0.4390444810543657, "train_speed(iter/s)": 1.453302 }, { "epoch": 0.3003298916070434, "grad_norm": 3.87911319732666, "learning_rate": 9.911241948945022e-05, "loss": 2.394370269775391, "memory(GiB)": 47.63, "step": 7010, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.453202 }, { "epoch": 0.30054410693629235, "grad_norm": 4.127148151397705, "learning_rate": 9.91111566406828e-05, "loss": 2.476901626586914, "memory(GiB)": 47.63, "step": 7015, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.453335 }, { "epoch": 0.30075832226554133, "grad_norm": 5.481370449066162, "learning_rate": 9.91098929022207e-05, "loss": 3.0413379669189453, "memory(GiB)": 47.63, "step": 7020, "token_acc": 0.39416058394160586, "train_speed(iter/s)": 1.453233 }, { "epoch": 0.30097253759479026, "grad_norm": 3.688483238220215, "learning_rate": 9.910862827408682e-05, "loss": 2.909566879272461, "memory(GiB)": 47.63, "step": 7025, "token_acc": 0.4053156146179402, "train_speed(iter/s)": 1.453437 }, { "epoch": 0.30118675292403924, "grad_norm": 5.38715934753418, "learning_rate": 9.910736275630408e-05, "loss": 2.6324440002441407, "memory(GiB)": 47.63, "step": 7030, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.453355 }, { "epoch": 0.3014009682532882, "grad_norm": 3.6263034343719482, "learning_rate": 9.910609634889538e-05, "loss": 2.7652103424072267, "memory(GiB)": 47.63, "step": 7035, "token_acc": 0.4485294117647059, "train_speed(iter/s)": 1.453448 }, { "epoch": 0.3016151835825372, "grad_norm": 3.28666615486145, "learning_rate": 9.91048290518837e-05, "loss": 2.8325773239135743, "memory(GiB)": 47.63, "step": 7040, "token_acc": 0.44565217391304346, "train_speed(iter/s)": 1.4538 }, { "epoch": 0.3018293989117861, "grad_norm": 3.835930109024048, "learning_rate": 9.910356086529196e-05, "loss": 2.8261871337890625, "memory(GiB)": 47.63, "step": 7045, "token_acc": 0.42641509433962266, "train_speed(iter/s)": 1.453892 }, { "epoch": 0.3020436142410351, "grad_norm": 4.7793145179748535, "learning_rate": 9.910229178914317e-05, "loss": 2.817231369018555, "memory(GiB)": 47.63, "step": 7050, "token_acc": 0.436241610738255, "train_speed(iter/s)": 1.453946 }, { "epoch": 0.3022578295702841, "grad_norm": 3.891536235809326, "learning_rate": 9.910102182346029e-05, "loss": 2.7835426330566406, "memory(GiB)": 47.63, "step": 7055, "token_acc": 0.4294478527607362, "train_speed(iter/s)": 1.454071 }, { "epoch": 0.302472044899533, "grad_norm": 3.9592056274414062, "learning_rate": 9.909975096826634e-05, "loss": 2.850838851928711, "memory(GiB)": 47.63, "step": 7060, "token_acc": 0.4098360655737705, "train_speed(iter/s)": 1.454033 }, { "epoch": 0.302686260228782, "grad_norm": 4.836696147918701, "learning_rate": 9.909847922358432e-05, "loss": 2.7942623138427733, "memory(GiB)": 47.63, "step": 7065, "token_acc": 0.46, "train_speed(iter/s)": 1.45406 }, { "epoch": 0.30290047555803096, "grad_norm": 3.4888906478881836, "learning_rate": 9.909720658943733e-05, "loss": 2.695857810974121, "memory(GiB)": 47.63, "step": 7070, "token_acc": 0.4300341296928328, "train_speed(iter/s)": 1.454272 }, { "epoch": 0.3031146908872799, "grad_norm": 3.878528118133545, "learning_rate": 9.909593306584837e-05, "loss": 2.5962528228759765, "memory(GiB)": 47.63, "step": 7075, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.454447 }, { "epoch": 0.30332890621652886, "grad_norm": 4.596851348876953, "learning_rate": 9.909465865284052e-05, "loss": 2.633403778076172, "memory(GiB)": 47.63, "step": 7080, "token_acc": 0.4676258992805755, "train_speed(iter/s)": 1.454547 }, { "epoch": 0.30354312154577784, "grad_norm": 5.361253261566162, "learning_rate": 9.909338335043688e-05, "loss": 2.7792930603027344, "memory(GiB)": 47.63, "step": 7085, "token_acc": 0.40942028985507245, "train_speed(iter/s)": 1.4548 }, { "epoch": 0.30375733687502676, "grad_norm": 5.789522647857666, "learning_rate": 9.909210715866055e-05, "loss": 2.5569536209106447, "memory(GiB)": 47.63, "step": 7090, "token_acc": 0.5064935064935064, "train_speed(iter/s)": 1.455016 }, { "epoch": 0.30397155220427574, "grad_norm": 4.068614482879639, "learning_rate": 9.909083007753464e-05, "loss": 2.817095947265625, "memory(GiB)": 47.63, "step": 7095, "token_acc": 0.4180602006688963, "train_speed(iter/s)": 1.455218 }, { "epoch": 0.3041857675335247, "grad_norm": 3.8201069831848145, "learning_rate": 9.90895521070823e-05, "loss": 2.5596477508544924, "memory(GiB)": 47.63, "step": 7100, "token_acc": 0.42073170731707316, "train_speed(iter/s)": 1.454855 }, { "epoch": 0.30439998286277364, "grad_norm": 4.3132781982421875, "learning_rate": 9.908827324732667e-05, "loss": 2.786307525634766, "memory(GiB)": 47.63, "step": 7105, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.454885 }, { "epoch": 0.3046141981920226, "grad_norm": 4.772188186645508, "learning_rate": 9.908699349829091e-05, "loss": 2.690424346923828, "memory(GiB)": 47.63, "step": 7110, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.45481 }, { "epoch": 0.3048284135212716, "grad_norm": 3.452068567276001, "learning_rate": 9.908571285999824e-05, "loss": 2.748585891723633, "memory(GiB)": 47.63, "step": 7115, "token_acc": 0.4362017804154303, "train_speed(iter/s)": 1.454919 }, { "epoch": 0.3050426288505205, "grad_norm": 4.686628818511963, "learning_rate": 9.908443133247182e-05, "loss": 3.0903095245361327, "memory(GiB)": 47.63, "step": 7120, "token_acc": 0.42901234567901236, "train_speed(iter/s)": 1.455089 }, { "epoch": 0.3052568441797695, "grad_norm": 5.241159915924072, "learning_rate": 9.908314891573489e-05, "loss": 2.8816459655761717, "memory(GiB)": 47.63, "step": 7125, "token_acc": 0.41333333333333333, "train_speed(iter/s)": 1.455082 }, { "epoch": 0.3054710595090185, "grad_norm": 3.407045602798462, "learning_rate": 9.908186560981066e-05, "loss": 2.6806121826171876, "memory(GiB)": 47.63, "step": 7130, "token_acc": 0.5210727969348659, "train_speed(iter/s)": 1.455067 }, { "epoch": 0.3056852748382674, "grad_norm": 4.179222106933594, "learning_rate": 9.908058141472239e-05, "loss": 2.7187618255615233, "memory(GiB)": 47.63, "step": 7135, "token_acc": 0.4358108108108108, "train_speed(iter/s)": 1.455106 }, { "epoch": 0.3058994901675164, "grad_norm": 3.6603033542633057, "learning_rate": 9.907929633049336e-05, "loss": 2.6038034439086912, "memory(GiB)": 47.63, "step": 7140, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.455309 }, { "epoch": 0.30611370549676536, "grad_norm": 4.7150983810424805, "learning_rate": 9.907801035714684e-05, "loss": 2.654140281677246, "memory(GiB)": 47.63, "step": 7145, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.455513 }, { "epoch": 0.3063279208260143, "grad_norm": 4.321269989013672, "learning_rate": 9.907672349470612e-05, "loss": 3.074951934814453, "memory(GiB)": 47.63, "step": 7150, "token_acc": 0.43389830508474575, "train_speed(iter/s)": 1.455334 }, { "epoch": 0.30654213615526327, "grad_norm": 4.386789321899414, "learning_rate": 9.90754357431945e-05, "loss": 2.465946006774902, "memory(GiB)": 47.63, "step": 7155, "token_acc": 0.47633136094674555, "train_speed(iter/s)": 1.455479 }, { "epoch": 0.30675635148451225, "grad_norm": 3.6322429180145264, "learning_rate": 9.907414710263534e-05, "loss": 2.7125003814697264, "memory(GiB)": 47.63, "step": 7160, "token_acc": 0.46, "train_speed(iter/s)": 1.455717 }, { "epoch": 0.30697056681376117, "grad_norm": 3.60015606880188, "learning_rate": 9.907285757305198e-05, "loss": 2.580677795410156, "memory(GiB)": 47.63, "step": 7165, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.456008 }, { "epoch": 0.30718478214301015, "grad_norm": 5.091457366943359, "learning_rate": 9.907156715446775e-05, "loss": 2.790735054016113, "memory(GiB)": 47.63, "step": 7170, "token_acc": 0.4506172839506173, "train_speed(iter/s)": 1.456158 }, { "epoch": 0.30739899747225913, "grad_norm": 4.9130988121032715, "learning_rate": 9.907027584690605e-05, "loss": 2.609231376647949, "memory(GiB)": 47.63, "step": 7175, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.456166 }, { "epoch": 0.30761321280150805, "grad_norm": 4.512624740600586, "learning_rate": 9.906898365039027e-05, "loss": 2.864467239379883, "memory(GiB)": 47.63, "step": 7180, "token_acc": 0.43416370106761565, "train_speed(iter/s)": 1.45623 }, { "epoch": 0.30782742813075703, "grad_norm": 7.18649959564209, "learning_rate": 9.906769056494384e-05, "loss": 2.4311191558837892, "memory(GiB)": 47.63, "step": 7185, "token_acc": 0.4881656804733728, "train_speed(iter/s)": 1.456284 }, { "epoch": 0.308041643460006, "grad_norm": 6.2223920822143555, "learning_rate": 9.906639659059015e-05, "loss": 2.4184572219848635, "memory(GiB)": 47.63, "step": 7190, "token_acc": 0.4966442953020134, "train_speed(iter/s)": 1.456319 }, { "epoch": 0.30825585878925493, "grad_norm": 3.70603609085083, "learning_rate": 9.906510172735266e-05, "loss": 2.337209701538086, "memory(GiB)": 47.63, "step": 7195, "token_acc": 0.47129909365558914, "train_speed(iter/s)": 1.456077 }, { "epoch": 0.3084700741185039, "grad_norm": 5.951071739196777, "learning_rate": 9.906380597525484e-05, "loss": 2.6221616744995115, "memory(GiB)": 47.63, "step": 7200, "token_acc": 0.41338582677165353, "train_speed(iter/s)": 1.456293 }, { "epoch": 0.3086842894477529, "grad_norm": 3.1243138313293457, "learning_rate": 9.906250933432013e-05, "loss": 2.4911632537841797, "memory(GiB)": 47.63, "step": 7205, "token_acc": 0.45151515151515154, "train_speed(iter/s)": 1.456391 }, { "epoch": 0.30889850477700187, "grad_norm": 3.2041783332824707, "learning_rate": 9.906121180457204e-05, "loss": 2.439503860473633, "memory(GiB)": 47.63, "step": 7210, "token_acc": 0.5, "train_speed(iter/s)": 1.456254 }, { "epoch": 0.3091127201062508, "grad_norm": 5.2725934982299805, "learning_rate": 9.905991338603409e-05, "loss": 2.8739700317382812, "memory(GiB)": 47.63, "step": 7215, "token_acc": 0.43014705882352944, "train_speed(iter/s)": 1.456324 }, { "epoch": 0.3093269354354998, "grad_norm": 4.381966590881348, "learning_rate": 9.905861407872977e-05, "loss": 2.599055290222168, "memory(GiB)": 47.63, "step": 7220, "token_acc": 0.484, "train_speed(iter/s)": 1.456322 }, { "epoch": 0.30954115076474875, "grad_norm": 3.6992130279541016, "learning_rate": 9.905731388268265e-05, "loss": 2.710344123840332, "memory(GiB)": 47.63, "step": 7225, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.456415 }, { "epoch": 0.3097553660939977, "grad_norm": 5.55756139755249, "learning_rate": 9.905601279791626e-05, "loss": 2.60537052154541, "memory(GiB)": 47.63, "step": 7230, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 1.45652 }, { "epoch": 0.30996958142324665, "grad_norm": 3.7960941791534424, "learning_rate": 9.905471082445419e-05, "loss": 2.5660427093505858, "memory(GiB)": 47.63, "step": 7235, "token_acc": 0.4300341296928328, "train_speed(iter/s)": 1.456691 }, { "epoch": 0.31018379675249563, "grad_norm": 4.023140907287598, "learning_rate": 9.905340796232e-05, "loss": 2.454560089111328, "memory(GiB)": 47.63, "step": 7240, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.456605 }, { "epoch": 0.31039801208174456, "grad_norm": 4.8561787605285645, "learning_rate": 9.905210421153732e-05, "loss": 2.681989860534668, "memory(GiB)": 47.63, "step": 7245, "token_acc": 0.4401294498381877, "train_speed(iter/s)": 1.45664 }, { "epoch": 0.31061222741099354, "grad_norm": 4.080799579620361, "learning_rate": 9.905079957212975e-05, "loss": 2.655870819091797, "memory(GiB)": 47.63, "step": 7250, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.456485 }, { "epoch": 0.3108264427402425, "grad_norm": 3.901381731033325, "learning_rate": 9.904949404412094e-05, "loss": 2.425362205505371, "memory(GiB)": 47.63, "step": 7255, "token_acc": 0.43666666666666665, "train_speed(iter/s)": 1.456531 }, { "epoch": 0.31104065806949144, "grad_norm": 4.594584941864014, "learning_rate": 9.904818762753454e-05, "loss": 2.3024227142333986, "memory(GiB)": 47.63, "step": 7260, "token_acc": 0.4896265560165975, "train_speed(iter/s)": 1.456676 }, { "epoch": 0.3112548733987404, "grad_norm": 5.4759392738342285, "learning_rate": 9.904688032239419e-05, "loss": 2.772439193725586, "memory(GiB)": 47.63, "step": 7265, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.456646 }, { "epoch": 0.3114690887279894, "grad_norm": 3.5301244258880615, "learning_rate": 9.904557212872361e-05, "loss": 2.629535675048828, "memory(GiB)": 47.63, "step": 7270, "token_acc": 0.43119266055045874, "train_speed(iter/s)": 1.456787 }, { "epoch": 0.3116833040572383, "grad_norm": 4.165886402130127, "learning_rate": 9.904426304654648e-05, "loss": 2.7619659423828127, "memory(GiB)": 47.63, "step": 7275, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.456813 }, { "epoch": 0.3118975193864873, "grad_norm": 4.079471111297607, "learning_rate": 9.904295307588651e-05, "loss": 2.648187446594238, "memory(GiB)": 47.63, "step": 7280, "token_acc": 0.4778481012658228, "train_speed(iter/s)": 1.456821 }, { "epoch": 0.3121117347157363, "grad_norm": 5.125851154327393, "learning_rate": 9.904164221676745e-05, "loss": 2.295179748535156, "memory(GiB)": 47.63, "step": 7285, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.45704 }, { "epoch": 0.3123259500449852, "grad_norm": 4.464629173278809, "learning_rate": 9.904033046921303e-05, "loss": 2.5714160919189455, "memory(GiB)": 47.63, "step": 7290, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.457077 }, { "epoch": 0.3125401653742342, "grad_norm": 4.502624034881592, "learning_rate": 9.903901783324702e-05, "loss": 2.7460556030273438, "memory(GiB)": 47.63, "step": 7295, "token_acc": 0.45307443365695793, "train_speed(iter/s)": 1.457023 }, { "epoch": 0.31275438070348316, "grad_norm": 4.947858810424805, "learning_rate": 9.90377043088932e-05, "loss": 2.956756591796875, "memory(GiB)": 47.63, "step": 7300, "token_acc": 0.43174603174603177, "train_speed(iter/s)": 1.457335 }, { "epoch": 0.3129685960327321, "grad_norm": 4.027510166168213, "learning_rate": 9.903638989617537e-05, "loss": 2.6416173934936524, "memory(GiB)": 47.63, "step": 7305, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.457109 }, { "epoch": 0.31318281136198106, "grad_norm": 6.303963661193848, "learning_rate": 9.903507459511733e-05, "loss": 3.154567909240723, "memory(GiB)": 47.63, "step": 7310, "token_acc": 0.40217391304347827, "train_speed(iter/s)": 1.457338 }, { "epoch": 0.31339702669123004, "grad_norm": 5.36435604095459, "learning_rate": 9.903375840574291e-05, "loss": 2.681963348388672, "memory(GiB)": 47.63, "step": 7315, "token_acc": 0.44485294117647056, "train_speed(iter/s)": 1.457225 }, { "epoch": 0.31361124202047896, "grad_norm": 3.716139554977417, "learning_rate": 9.903244132807597e-05, "loss": 2.3914871215820312, "memory(GiB)": 47.63, "step": 7320, "token_acc": 0.5105633802816901, "train_speed(iter/s)": 1.457414 }, { "epoch": 0.31382545734972794, "grad_norm": 3.5286977291107178, "learning_rate": 9.903112336214035e-05, "loss": 2.6671157836914063, "memory(GiB)": 47.63, "step": 7325, "token_acc": 0.4407894736842105, "train_speed(iter/s)": 1.457429 }, { "epoch": 0.3140396726789769, "grad_norm": 5.585231304168701, "learning_rate": 9.902980450795996e-05, "loss": 2.8353412628173826, "memory(GiB)": 47.63, "step": 7330, "token_acc": 0.41823056300268097, "train_speed(iter/s)": 1.457398 }, { "epoch": 0.31425388800822585, "grad_norm": 3.8338727951049805, "learning_rate": 9.902848476555864e-05, "loss": 2.7938262939453127, "memory(GiB)": 47.63, "step": 7335, "token_acc": 0.39184952978056425, "train_speed(iter/s)": 1.457507 }, { "epoch": 0.3144681033374748, "grad_norm": 5.961755752563477, "learning_rate": 9.902716413496034e-05, "loss": 2.2917255401611327, "memory(GiB)": 47.63, "step": 7340, "token_acc": 0.47413793103448276, "train_speed(iter/s)": 1.457397 }, { "epoch": 0.3146823186667238, "grad_norm": 3.7651877403259277, "learning_rate": 9.902584261618896e-05, "loss": 2.5533000946044924, "memory(GiB)": 47.63, "step": 7345, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.45717 }, { "epoch": 0.3148965339959727, "grad_norm": 6.507554531097412, "learning_rate": 9.902452020926845e-05, "loss": 2.6301080703735353, "memory(GiB)": 47.63, "step": 7350, "token_acc": 0.43621399176954734, "train_speed(iter/s)": 1.457171 }, { "epoch": 0.3151107493252217, "grad_norm": 4.027996063232422, "learning_rate": 9.902319691422277e-05, "loss": 2.4271074295043946, "memory(GiB)": 47.63, "step": 7355, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.457035 }, { "epoch": 0.3153249646544707, "grad_norm": 3.8188610076904297, "learning_rate": 9.902187273107591e-05, "loss": 2.706253242492676, "memory(GiB)": 47.63, "step": 7360, "token_acc": 0.45874587458745875, "train_speed(iter/s)": 1.457046 }, { "epoch": 0.3155391799837196, "grad_norm": 3.748914957046509, "learning_rate": 9.902054765985182e-05, "loss": 2.91416015625, "memory(GiB)": 47.63, "step": 7365, "token_acc": 0.42990654205607476, "train_speed(iter/s)": 1.457168 }, { "epoch": 0.3157533953129686, "grad_norm": 3.667433023452759, "learning_rate": 9.901922170057452e-05, "loss": 2.3270805358886717, "memory(GiB)": 47.63, "step": 7370, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.45699 }, { "epoch": 0.31596761064221757, "grad_norm": 4.356845378875732, "learning_rate": 9.901789485326804e-05, "loss": 2.5035377502441407, "memory(GiB)": 47.63, "step": 7375, "token_acc": 0.4744525547445255, "train_speed(iter/s)": 1.457163 }, { "epoch": 0.31618182597146655, "grad_norm": 3.480801820755005, "learning_rate": 9.901656711795641e-05, "loss": 2.443022537231445, "memory(GiB)": 47.63, "step": 7380, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 1.456968 }, { "epoch": 0.31639604130071547, "grad_norm": 4.483109474182129, "learning_rate": 9.90152384946637e-05, "loss": 2.6643272399902345, "memory(GiB)": 47.63, "step": 7385, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.456782 }, { "epoch": 0.31661025662996445, "grad_norm": 4.516879081726074, "learning_rate": 9.901390898341397e-05, "loss": 2.4153942108154296, "memory(GiB)": 47.63, "step": 7390, "token_acc": 0.47540983606557374, "train_speed(iter/s)": 1.456984 }, { "epoch": 0.3168244719592134, "grad_norm": 4.858016014099121, "learning_rate": 9.901257858423127e-05, "loss": 2.756536865234375, "memory(GiB)": 47.63, "step": 7395, "token_acc": 0.4555160142348754, "train_speed(iter/s)": 1.45716 }, { "epoch": 0.31703868728846235, "grad_norm": 4.1707844734191895, "learning_rate": 9.901124729713975e-05, "loss": 2.4957103729248047, "memory(GiB)": 52.0, "step": 7400, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.456692 }, { "epoch": 0.31725290261771133, "grad_norm": 3.6951959133148193, "learning_rate": 9.900991512216351e-05, "loss": 2.726665496826172, "memory(GiB)": 52.0, "step": 7405, "token_acc": 0.46496815286624205, "train_speed(iter/s)": 1.456565 }, { "epoch": 0.3174671179469603, "grad_norm": 3.8989763259887695, "learning_rate": 9.900858205932668e-05, "loss": 2.5814361572265625, "memory(GiB)": 52.0, "step": 7410, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.456628 }, { "epoch": 0.31768133327620923, "grad_norm": 5.1748785972595215, "learning_rate": 9.900724810865341e-05, "loss": 2.6261388778686525, "memory(GiB)": 52.0, "step": 7415, "token_acc": 0.44108761329305135, "train_speed(iter/s)": 1.456834 }, { "epoch": 0.3178955486054582, "grad_norm": 4.179869174957275, "learning_rate": 9.900591327016786e-05, "loss": 2.5476551055908203, "memory(GiB)": 52.0, "step": 7420, "token_acc": 0.4609053497942387, "train_speed(iter/s)": 1.456736 }, { "epoch": 0.3181097639347072, "grad_norm": 7.137809753417969, "learning_rate": 9.900457754389422e-05, "loss": 2.7810552597045897, "memory(GiB)": 52.0, "step": 7425, "token_acc": 0.44694533762057875, "train_speed(iter/s)": 1.456672 }, { "epoch": 0.3183239792639561, "grad_norm": 4.1653361320495605, "learning_rate": 9.900324092985671e-05, "loss": 2.253307914733887, "memory(GiB)": 52.0, "step": 7430, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.456618 }, { "epoch": 0.3185381945932051, "grad_norm": 3.961106538772583, "learning_rate": 9.900190342807951e-05, "loss": 2.568301963806152, "memory(GiB)": 52.0, "step": 7435, "token_acc": 0.44483985765124556, "train_speed(iter/s)": 1.456629 }, { "epoch": 0.31875240992245407, "grad_norm": 5.501273155212402, "learning_rate": 9.900056503858685e-05, "loss": 2.644911003112793, "memory(GiB)": 52.0, "step": 7440, "token_acc": 0.4618181818181818, "train_speed(iter/s)": 1.456846 }, { "epoch": 0.318966625251703, "grad_norm": 4.654253005981445, "learning_rate": 9.8999225761403e-05, "loss": 2.213212585449219, "memory(GiB)": 52.0, "step": 7445, "token_acc": 0.524390243902439, "train_speed(iter/s)": 1.456834 }, { "epoch": 0.319180840580952, "grad_norm": 3.2251601219177246, "learning_rate": 9.899788559655221e-05, "loss": 2.4558940887451173, "memory(GiB)": 52.0, "step": 7450, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 1.456843 }, { "epoch": 0.31939505591020095, "grad_norm": 3.600304126739502, "learning_rate": 9.899654454405876e-05, "loss": 2.6671876907348633, "memory(GiB)": 52.0, "step": 7455, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.456869 }, { "epoch": 0.3196092712394499, "grad_norm": 4.009782314300537, "learning_rate": 9.899520260394695e-05, "loss": 2.7939844131469727, "memory(GiB)": 52.0, "step": 7460, "token_acc": 0.4354243542435424, "train_speed(iter/s)": 1.457085 }, { "epoch": 0.31982348656869886, "grad_norm": 3.5540733337402344, "learning_rate": 9.899385977624107e-05, "loss": 2.6768720626831053, "memory(GiB)": 52.0, "step": 7465, "token_acc": 0.4515235457063712, "train_speed(iter/s)": 1.45714 }, { "epoch": 0.32003770189794783, "grad_norm": 5.435524940490723, "learning_rate": 9.899251606096546e-05, "loss": 2.6305347442626954, "memory(GiB)": 52.0, "step": 7470, "token_acc": 0.4275618374558304, "train_speed(iter/s)": 1.457141 }, { "epoch": 0.32025191722719676, "grad_norm": 4.0572590827941895, "learning_rate": 9.899117145814448e-05, "loss": 2.443489646911621, "memory(GiB)": 52.0, "step": 7475, "token_acc": 0.45387453874538747, "train_speed(iter/s)": 1.457394 }, { "epoch": 0.32046613255644574, "grad_norm": 4.868009567260742, "learning_rate": 9.898982596780244e-05, "loss": 2.3461074829101562, "memory(GiB)": 52.0, "step": 7480, "token_acc": 0.4765625, "train_speed(iter/s)": 1.457273 }, { "epoch": 0.3206803478856947, "grad_norm": 3.9423940181732178, "learning_rate": 9.898847958996377e-05, "loss": 2.801871109008789, "memory(GiB)": 52.0, "step": 7485, "token_acc": 0.43529411764705883, "train_speed(iter/s)": 1.457453 }, { "epoch": 0.32089456321494364, "grad_norm": 8.300311088562012, "learning_rate": 9.898713232465283e-05, "loss": 2.9120140075683594, "memory(GiB)": 52.0, "step": 7490, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.45761 }, { "epoch": 0.3211087785441926, "grad_norm": 4.529783248901367, "learning_rate": 9.898578417189403e-05, "loss": 2.4431697845458986, "memory(GiB)": 52.0, "step": 7495, "token_acc": 0.4966442953020134, "train_speed(iter/s)": 1.457671 }, { "epoch": 0.3213229938734416, "grad_norm": 4.425193786621094, "learning_rate": 9.89844351317118e-05, "loss": 2.6823225021362305, "memory(GiB)": 52.0, "step": 7500, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.45785 }, { "epoch": 0.3213229938734416, "eval_loss": 2.142605781555176, "eval_runtime": 14.2023, "eval_samples_per_second": 7.041, "eval_steps_per_second": 7.041, "eval_token_acc": 0.46029776674937967, "step": 7500 }, { "epoch": 0.3215372092026905, "grad_norm": 4.624557971954346, "learning_rate": 9.89830852041306e-05, "loss": 2.642671012878418, "memory(GiB)": 52.0, "step": 7505, "token_acc": 0.461044912923923, "train_speed(iter/s)": 1.45373 }, { "epoch": 0.3217514245319395, "grad_norm": 4.579354286193848, "learning_rate": 9.898173438917485e-05, "loss": 2.476900100708008, "memory(GiB)": 52.0, "step": 7510, "token_acc": 0.5, "train_speed(iter/s)": 1.453855 }, { "epoch": 0.3219656398611885, "grad_norm": 3.5670883655548096, "learning_rate": 9.8980382686869e-05, "loss": 2.5919353485107424, "memory(GiB)": 52.0, "step": 7515, "token_acc": 0.43389830508474575, "train_speed(iter/s)": 1.453897 }, { "epoch": 0.3221798551904374, "grad_norm": 6.061057090759277, "learning_rate": 9.89790300972376e-05, "loss": 3.1762290954589845, "memory(GiB)": 52.0, "step": 7520, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.453838 }, { "epoch": 0.3223940705196864, "grad_norm": 4.990741729736328, "learning_rate": 9.897767662030512e-05, "loss": 2.7945816040039064, "memory(GiB)": 52.0, "step": 7525, "token_acc": 0.46360153256704983, "train_speed(iter/s)": 1.453979 }, { "epoch": 0.32260828584893536, "grad_norm": 3.867506742477417, "learning_rate": 9.897632225609607e-05, "loss": 2.8618717193603516, "memory(GiB)": 52.0, "step": 7530, "token_acc": 0.4107142857142857, "train_speed(iter/s)": 1.454086 }, { "epoch": 0.3228225011781843, "grad_norm": 3.3391501903533936, "learning_rate": 9.897496700463502e-05, "loss": 2.7338409423828125, "memory(GiB)": 52.0, "step": 7535, "token_acc": 0.4716417910447761, "train_speed(iter/s)": 1.454119 }, { "epoch": 0.32303671650743326, "grad_norm": 3.1852691173553467, "learning_rate": 9.897361086594649e-05, "loss": 2.7723899841308595, "memory(GiB)": 52.0, "step": 7540, "token_acc": 0.43490304709141275, "train_speed(iter/s)": 1.454214 }, { "epoch": 0.32325093183668224, "grad_norm": 3.827683687210083, "learning_rate": 9.897225384005507e-05, "loss": 2.6990550994873046, "memory(GiB)": 52.0, "step": 7545, "token_acc": 0.4304635761589404, "train_speed(iter/s)": 1.45433 }, { "epoch": 0.3234651471659312, "grad_norm": 3.6892833709716797, "learning_rate": 9.897089592698532e-05, "loss": 2.631779670715332, "memory(GiB)": 52.0, "step": 7550, "token_acc": 0.4479495268138801, "train_speed(iter/s)": 1.454572 }, { "epoch": 0.32367936249518015, "grad_norm": 3.134920835494995, "learning_rate": 9.896953712676184e-05, "loss": 2.7226795196533202, "memory(GiB)": 52.0, "step": 7555, "token_acc": 0.4253246753246753, "train_speed(iter/s)": 1.454461 }, { "epoch": 0.3238935778244291, "grad_norm": 3.281334400177002, "learning_rate": 9.896817743940928e-05, "loss": 2.4763965606689453, "memory(GiB)": 52.0, "step": 7560, "token_acc": 0.49146757679180886, "train_speed(iter/s)": 1.454641 }, { "epoch": 0.3241077931536781, "grad_norm": 4.33760929107666, "learning_rate": 9.896681686495224e-05, "loss": 2.911746025085449, "memory(GiB)": 52.0, "step": 7565, "token_acc": 0.4524714828897338, "train_speed(iter/s)": 1.454782 }, { "epoch": 0.324322008482927, "grad_norm": 4.222476959228516, "learning_rate": 9.896545540341538e-05, "loss": 2.5468311309814453, "memory(GiB)": 52.0, "step": 7570, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.454743 }, { "epoch": 0.324536223812176, "grad_norm": 5.119168758392334, "learning_rate": 9.896409305482336e-05, "loss": 2.677763748168945, "memory(GiB)": 52.0, "step": 7575, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.454568 }, { "epoch": 0.324750439141425, "grad_norm": 4.543260097503662, "learning_rate": 9.896272981920087e-05, "loss": 2.8203689575195314, "memory(GiB)": 52.0, "step": 7580, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.454539 }, { "epoch": 0.3249646544706739, "grad_norm": 4.372576713562012, "learning_rate": 9.89613656965726e-05, "loss": 2.743465805053711, "memory(GiB)": 52.0, "step": 7585, "token_acc": 0.4351145038167939, "train_speed(iter/s)": 1.454685 }, { "epoch": 0.3251788697999229, "grad_norm": 3.731536388397217, "learning_rate": 9.896000068696325e-05, "loss": 2.5834712982177734, "memory(GiB)": 52.0, "step": 7590, "token_acc": 0.49557522123893805, "train_speed(iter/s)": 1.454757 }, { "epoch": 0.32539308512917187, "grad_norm": 4.425579071044922, "learning_rate": 9.895863479039756e-05, "loss": 2.3880266189575194, "memory(GiB)": 52.0, "step": 7595, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.454846 }, { "epoch": 0.3256073004584208, "grad_norm": 3.3718419075012207, "learning_rate": 9.895726800690028e-05, "loss": 2.6899112701416015, "memory(GiB)": 52.0, "step": 7600, "token_acc": 0.4980694980694981, "train_speed(iter/s)": 1.454966 }, { "epoch": 0.32582151578766977, "grad_norm": 4.074090003967285, "learning_rate": 9.895590033649616e-05, "loss": 2.691311836242676, "memory(GiB)": 52.0, "step": 7605, "token_acc": 0.45733788395904434, "train_speed(iter/s)": 1.455153 }, { "epoch": 0.32603573111691875, "grad_norm": 4.140510082244873, "learning_rate": 9.895453177920997e-05, "loss": 2.578537368774414, "memory(GiB)": 52.0, "step": 7610, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.45514 }, { "epoch": 0.32624994644616767, "grad_norm": 6.439145088195801, "learning_rate": 9.895316233506653e-05, "loss": 2.5949462890625, "memory(GiB)": 52.0, "step": 7615, "token_acc": 0.4503311258278146, "train_speed(iter/s)": 1.45499 }, { "epoch": 0.32646416177541665, "grad_norm": 4.326587200164795, "learning_rate": 9.895179200409063e-05, "loss": 2.413687324523926, "memory(GiB)": 52.0, "step": 7620, "token_acc": 0.476038338658147, "train_speed(iter/s)": 1.454962 }, { "epoch": 0.32667837710466563, "grad_norm": 5.436228275299072, "learning_rate": 9.895042078630709e-05, "loss": 2.5811790466308593, "memory(GiB)": 52.0, "step": 7625, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.454725 }, { "epoch": 0.32689259243391455, "grad_norm": 3.5004026889801025, "learning_rate": 9.894904868174076e-05, "loss": 2.6833271026611327, "memory(GiB)": 52.0, "step": 7630, "token_acc": 0.4954128440366973, "train_speed(iter/s)": 1.454869 }, { "epoch": 0.32710680776316353, "grad_norm": 5.4990034103393555, "learning_rate": 9.89476756904165e-05, "loss": 2.5884525299072267, "memory(GiB)": 52.0, "step": 7635, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.454954 }, { "epoch": 0.3273210230924125, "grad_norm": 4.441423416137695, "learning_rate": 9.894630181235917e-05, "loss": 2.7414947509765626, "memory(GiB)": 52.0, "step": 7640, "token_acc": 0.4527687296416938, "train_speed(iter/s)": 1.455111 }, { "epoch": 0.32753523842166143, "grad_norm": 3.819051504135132, "learning_rate": 9.894492704759369e-05, "loss": 2.5649023056030273, "memory(GiB)": 52.0, "step": 7645, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.455221 }, { "epoch": 0.3277494537509104, "grad_norm": 5.052639007568359, "learning_rate": 9.894355139614493e-05, "loss": 2.536255645751953, "memory(GiB)": 52.0, "step": 7650, "token_acc": 0.4763779527559055, "train_speed(iter/s)": 1.455362 }, { "epoch": 0.3279636690801594, "grad_norm": 4.879740238189697, "learning_rate": 9.89421748580378e-05, "loss": 2.621837043762207, "memory(GiB)": 52.0, "step": 7655, "token_acc": 0.46886446886446886, "train_speed(iter/s)": 1.455426 }, { "epoch": 0.3281778844094083, "grad_norm": 7.240545272827148, "learning_rate": 9.894079743329729e-05, "loss": 2.7611183166503905, "memory(GiB)": 52.0, "step": 7660, "token_acc": 0.4370629370629371, "train_speed(iter/s)": 1.455601 }, { "epoch": 0.3283920997386573, "grad_norm": 3.6262354850769043, "learning_rate": 9.893941912194831e-05, "loss": 2.543458938598633, "memory(GiB)": 52.0, "step": 7665, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.455476 }, { "epoch": 0.3286063150679063, "grad_norm": 2.893031120300293, "learning_rate": 9.893803992401586e-05, "loss": 2.453871154785156, "memory(GiB)": 52.0, "step": 7670, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.455521 }, { "epoch": 0.3288205303971552, "grad_norm": 3.959601879119873, "learning_rate": 9.893665983952489e-05, "loss": 2.8583328247070314, "memory(GiB)": 52.0, "step": 7675, "token_acc": 0.402555910543131, "train_speed(iter/s)": 1.455505 }, { "epoch": 0.3290347457264042, "grad_norm": 3.764578104019165, "learning_rate": 9.893527886850044e-05, "loss": 2.5479013442993166, "memory(GiB)": 52.0, "step": 7680, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.455537 }, { "epoch": 0.32924896105565316, "grad_norm": 6.591464042663574, "learning_rate": 9.89338970109675e-05, "loss": 2.2869140625, "memory(GiB)": 52.0, "step": 7685, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.455264 }, { "epoch": 0.3294631763849021, "grad_norm": 4.059225082397461, "learning_rate": 9.893251426695111e-05, "loss": 2.933719444274902, "memory(GiB)": 52.0, "step": 7690, "token_acc": 0.4041916167664671, "train_speed(iter/s)": 1.455251 }, { "epoch": 0.32967739171415106, "grad_norm": 4.148004055023193, "learning_rate": 9.893113063647632e-05, "loss": 2.7961151123046877, "memory(GiB)": 52.0, "step": 7695, "token_acc": 0.4793388429752066, "train_speed(iter/s)": 1.455352 }, { "epoch": 0.32989160704340004, "grad_norm": 4.07358980178833, "learning_rate": 9.89297461195682e-05, "loss": 2.361439323425293, "memory(GiB)": 58.3, "step": 7700, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.455065 }, { "epoch": 0.33010582237264896, "grad_norm": 3.9765727519989014, "learning_rate": 9.892836071625182e-05, "loss": 2.282497787475586, "memory(GiB)": 58.3, "step": 7705, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.454959 }, { "epoch": 0.33032003770189794, "grad_norm": 3.9762468338012695, "learning_rate": 9.89269744265523e-05, "loss": 2.883592414855957, "memory(GiB)": 58.3, "step": 7710, "token_acc": 0.42443729903536975, "train_speed(iter/s)": 1.455027 }, { "epoch": 0.3305342530311469, "grad_norm": 4.177420139312744, "learning_rate": 9.892558725049474e-05, "loss": 2.6837316513061524, "memory(GiB)": 58.3, "step": 7715, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.455137 }, { "epoch": 0.3307484683603959, "grad_norm": 3.9853692054748535, "learning_rate": 9.892419918810426e-05, "loss": 2.7274417877197266, "memory(GiB)": 58.3, "step": 7720, "token_acc": 0.43214285714285716, "train_speed(iter/s)": 1.455245 }, { "epoch": 0.3309626836896448, "grad_norm": 5.162968158721924, "learning_rate": 9.892281023940602e-05, "loss": 2.5281951904296873, "memory(GiB)": 58.3, "step": 7725, "token_acc": 0.44727272727272727, "train_speed(iter/s)": 1.455536 }, { "epoch": 0.3311768990188938, "grad_norm": 3.5378823280334473, "learning_rate": 9.892142040442518e-05, "loss": 2.6750280380249025, "memory(GiB)": 58.3, "step": 7730, "token_acc": 0.47337278106508873, "train_speed(iter/s)": 1.455346 }, { "epoch": 0.3313911143481428, "grad_norm": 5.67416524887085, "learning_rate": 9.892002968318692e-05, "loss": 2.2620065689086912, "memory(GiB)": 58.3, "step": 7735, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.455051 }, { "epoch": 0.3316053296773917, "grad_norm": 6.915927410125732, "learning_rate": 9.891863807571644e-05, "loss": 2.4981914520263673, "memory(GiB)": 58.3, "step": 7740, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.455187 }, { "epoch": 0.3318195450066407, "grad_norm": 3.4285967350006104, "learning_rate": 9.891724558203893e-05, "loss": 2.540232276916504, "memory(GiB)": 58.3, "step": 7745, "token_acc": 0.44654088050314467, "train_speed(iter/s)": 1.455221 }, { "epoch": 0.33203376033588966, "grad_norm": 3.635664224624634, "learning_rate": 9.891585220217964e-05, "loss": 2.550495147705078, "memory(GiB)": 58.3, "step": 7750, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.455151 }, { "epoch": 0.3322479756651386, "grad_norm": 5.062612056732178, "learning_rate": 9.891445793616378e-05, "loss": 2.6595279693603517, "memory(GiB)": 58.3, "step": 7755, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.454911 }, { "epoch": 0.33246219099438756, "grad_norm": 3.8729732036590576, "learning_rate": 9.891306278401665e-05, "loss": 2.349718475341797, "memory(GiB)": 58.3, "step": 7760, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.454786 }, { "epoch": 0.33267640632363654, "grad_norm": 4.594541549682617, "learning_rate": 9.891166674576349e-05, "loss": 2.9694677352905274, "memory(GiB)": 58.3, "step": 7765, "token_acc": 0.4006734006734007, "train_speed(iter/s)": 1.454485 }, { "epoch": 0.33289062165288547, "grad_norm": 3.407460927963257, "learning_rate": 9.891026982142962e-05, "loss": 2.890412139892578, "memory(GiB)": 58.3, "step": 7770, "token_acc": 0.3989071038251366, "train_speed(iter/s)": 1.454381 }, { "epoch": 0.33310483698213444, "grad_norm": 4.638185501098633, "learning_rate": 9.890887201104032e-05, "loss": 2.8380165100097656, "memory(GiB)": 58.3, "step": 7775, "token_acc": 0.45224719101123595, "train_speed(iter/s)": 1.454308 }, { "epoch": 0.3333190523113834, "grad_norm": 4.219641208648682, "learning_rate": 9.890747331462092e-05, "loss": 2.702608108520508, "memory(GiB)": 58.3, "step": 7780, "token_acc": 0.428125, "train_speed(iter/s)": 1.454617 }, { "epoch": 0.33353326764063235, "grad_norm": 3.846561908721924, "learning_rate": 9.890607373219676e-05, "loss": 2.5522266387939454, "memory(GiB)": 58.3, "step": 7785, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 1.454803 }, { "epoch": 0.3337474829698813, "grad_norm": 4.175077438354492, "learning_rate": 9.89046732637932e-05, "loss": 2.759039878845215, "memory(GiB)": 58.3, "step": 7790, "token_acc": 0.4370629370629371, "train_speed(iter/s)": 1.45498 }, { "epoch": 0.3339616982991303, "grad_norm": 3.8363518714904785, "learning_rate": 9.890327190943561e-05, "loss": 2.6114875793457033, "memory(GiB)": 58.3, "step": 7795, "token_acc": 0.4537037037037037, "train_speed(iter/s)": 1.454938 }, { "epoch": 0.33417591362837923, "grad_norm": 3.5491459369659424, "learning_rate": 9.890186966914938e-05, "loss": 2.532672882080078, "memory(GiB)": 58.3, "step": 7800, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.455006 }, { "epoch": 0.3343901289576282, "grad_norm": 3.699162244796753, "learning_rate": 9.89004665429599e-05, "loss": 2.1899948120117188, "memory(GiB)": 58.3, "step": 7805, "token_acc": 0.5341880341880342, "train_speed(iter/s)": 1.454635 }, { "epoch": 0.3346043442868772, "grad_norm": 3.653193950653076, "learning_rate": 9.88990625308926e-05, "loss": 2.282488250732422, "memory(GiB)": 58.3, "step": 7810, "token_acc": 0.5290322580645161, "train_speed(iter/s)": 1.454536 }, { "epoch": 0.3348185596161261, "grad_norm": 4.398053169250488, "learning_rate": 9.889765763297291e-05, "loss": 2.5617956161499023, "memory(GiB)": 58.3, "step": 7815, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.454735 }, { "epoch": 0.3350327749453751, "grad_norm": 4.201615333557129, "learning_rate": 9.889625184922628e-05, "loss": 2.499456787109375, "memory(GiB)": 58.3, "step": 7820, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.454658 }, { "epoch": 0.33524699027462407, "grad_norm": 3.0528340339660645, "learning_rate": 9.889484517967818e-05, "loss": 2.5310670852661135, "memory(GiB)": 58.3, "step": 7825, "token_acc": 0.49117647058823527, "train_speed(iter/s)": 1.454819 }, { "epoch": 0.335461205603873, "grad_norm": 6.417255401611328, "learning_rate": 9.889343762435409e-05, "loss": 2.919891929626465, "memory(GiB)": 58.3, "step": 7830, "token_acc": 0.43564356435643564, "train_speed(iter/s)": 1.454859 }, { "epoch": 0.33567542093312197, "grad_norm": 4.355087757110596, "learning_rate": 9.88920291832795e-05, "loss": 2.7533243179321287, "memory(GiB)": 58.3, "step": 7835, "token_acc": 0.41033434650455924, "train_speed(iter/s)": 1.454752 }, { "epoch": 0.33588963626237095, "grad_norm": 3.2876315116882324, "learning_rate": 9.889061985647996e-05, "loss": 2.9273612976074217, "memory(GiB)": 58.3, "step": 7840, "token_acc": 0.40540540540540543, "train_speed(iter/s)": 1.454844 }, { "epoch": 0.3361038515916199, "grad_norm": 4.384795665740967, "learning_rate": 9.888920964398099e-05, "loss": 2.6128000259399413, "memory(GiB)": 58.3, "step": 7845, "token_acc": 0.45733788395904434, "train_speed(iter/s)": 1.455004 }, { "epoch": 0.33631806692086885, "grad_norm": 3.9293158054351807, "learning_rate": 9.88877985458081e-05, "loss": 2.5912336349487304, "memory(GiB)": 58.3, "step": 7850, "token_acc": 0.4753521126760563, "train_speed(iter/s)": 1.455202 }, { "epoch": 0.33653228225011783, "grad_norm": 3.4973764419555664, "learning_rate": 9.888638656198688e-05, "loss": 2.6211244583129885, "memory(GiB)": 58.3, "step": 7855, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.455112 }, { "epoch": 0.33674649757936675, "grad_norm": 3.9123497009277344, "learning_rate": 9.88849736925429e-05, "loss": 2.5589523315429688, "memory(GiB)": 58.3, "step": 7860, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.455165 }, { "epoch": 0.33696071290861573, "grad_norm": 4.121347904205322, "learning_rate": 9.888355993750178e-05, "loss": 2.5394596099853515, "memory(GiB)": 58.3, "step": 7865, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.455397 }, { "epoch": 0.3371749282378647, "grad_norm": 5.938421726226807, "learning_rate": 9.888214529688912e-05, "loss": 2.6497512817382813, "memory(GiB)": 58.3, "step": 7870, "token_acc": 0.4609053497942387, "train_speed(iter/s)": 1.455437 }, { "epoch": 0.33738914356711364, "grad_norm": 3.8449952602386475, "learning_rate": 9.888072977073053e-05, "loss": 2.4433937072753906, "memory(GiB)": 58.3, "step": 7875, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.455498 }, { "epoch": 0.3376033588963626, "grad_norm": 3.967132806777954, "learning_rate": 9.887931335905168e-05, "loss": 2.3723243713378905, "memory(GiB)": 58.3, "step": 7880, "token_acc": 0.484375, "train_speed(iter/s)": 1.455701 }, { "epoch": 0.3378175742256116, "grad_norm": 3.0878348350524902, "learning_rate": 9.887789606187819e-05, "loss": 2.5916696548461915, "memory(GiB)": 58.3, "step": 7885, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.455677 }, { "epoch": 0.3380317895548606, "grad_norm": 5.347700119018555, "learning_rate": 9.887647787923578e-05, "loss": 2.7415653228759767, "memory(GiB)": 58.3, "step": 7890, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.455828 }, { "epoch": 0.3382460048841095, "grad_norm": 3.3523037433624268, "learning_rate": 9.887505881115013e-05, "loss": 2.3701921463012696, "memory(GiB)": 58.3, "step": 7895, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.456003 }, { "epoch": 0.3384602202133585, "grad_norm": 3.8140828609466553, "learning_rate": 9.887363885764693e-05, "loss": 2.7807552337646486, "memory(GiB)": 58.3, "step": 7900, "token_acc": 0.4296028880866426, "train_speed(iter/s)": 1.455988 }, { "epoch": 0.33867443554260745, "grad_norm": 5.0800700187683105, "learning_rate": 9.887221801875192e-05, "loss": 2.481638717651367, "memory(GiB)": 58.3, "step": 7905, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.455828 }, { "epoch": 0.3388886508718564, "grad_norm": 4.114391326904297, "learning_rate": 9.887079629449083e-05, "loss": 2.49346923828125, "memory(GiB)": 58.3, "step": 7910, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.455905 }, { "epoch": 0.33910286620110536, "grad_norm": 4.894964218139648, "learning_rate": 9.886937368488942e-05, "loss": 2.7256725311279295, "memory(GiB)": 58.3, "step": 7915, "token_acc": 0.4205607476635514, "train_speed(iter/s)": 1.455999 }, { "epoch": 0.33931708153035434, "grad_norm": 4.770758628845215, "learning_rate": 9.886795018997347e-05, "loss": 2.708760643005371, "memory(GiB)": 58.3, "step": 7920, "token_acc": 0.426056338028169, "train_speed(iter/s)": 1.456035 }, { "epoch": 0.33953129685960326, "grad_norm": 2.974980354309082, "learning_rate": 9.886652580976876e-05, "loss": 2.669449806213379, "memory(GiB)": 58.3, "step": 7925, "token_acc": 0.4575645756457565, "train_speed(iter/s)": 1.456164 }, { "epoch": 0.33974551218885224, "grad_norm": 3.535210132598877, "learning_rate": 9.886510054430108e-05, "loss": 2.564958953857422, "memory(GiB)": 58.3, "step": 7930, "token_acc": 0.4389438943894389, "train_speed(iter/s)": 1.456303 }, { "epoch": 0.3399597275181012, "grad_norm": 6.0368733406066895, "learning_rate": 9.886367439359627e-05, "loss": 2.9167285919189454, "memory(GiB)": 58.3, "step": 7935, "token_acc": 0.42585551330798477, "train_speed(iter/s)": 1.456396 }, { "epoch": 0.34017394284735014, "grad_norm": 3.674422025680542, "learning_rate": 9.886224735768017e-05, "loss": 2.611805725097656, "memory(GiB)": 58.3, "step": 7940, "token_acc": 0.4486301369863014, "train_speed(iter/s)": 1.456485 }, { "epoch": 0.3403881581765991, "grad_norm": 5.602137565612793, "learning_rate": 9.886081943657862e-05, "loss": 2.7226539611816407, "memory(GiB)": 58.3, "step": 7945, "token_acc": 0.4490566037735849, "train_speed(iter/s)": 1.456601 }, { "epoch": 0.3406023735058481, "grad_norm": 5.106326580047607, "learning_rate": 9.885939063031748e-05, "loss": 2.40622615814209, "memory(GiB)": 58.3, "step": 7950, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.456585 }, { "epoch": 0.340816588835097, "grad_norm": 4.052453994750977, "learning_rate": 9.885796093892266e-05, "loss": 2.5365081787109376, "memory(GiB)": 58.3, "step": 7955, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.456634 }, { "epoch": 0.341030804164346, "grad_norm": 5.224856853485107, "learning_rate": 9.885653036242004e-05, "loss": 2.4870349884033205, "memory(GiB)": 58.3, "step": 7960, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 1.456684 }, { "epoch": 0.341245019493595, "grad_norm": 4.263911724090576, "learning_rate": 9.885509890083555e-05, "loss": 2.5133150100708006, "memory(GiB)": 58.3, "step": 7965, "token_acc": 0.5038461538461538, "train_speed(iter/s)": 1.456588 }, { "epoch": 0.3414592348228439, "grad_norm": 3.6719298362731934, "learning_rate": 9.88536665541951e-05, "loss": 2.6517005920410157, "memory(GiB)": 58.3, "step": 7970, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.456716 }, { "epoch": 0.3416734501520929, "grad_norm": 3.5599942207336426, "learning_rate": 9.885223332252464e-05, "loss": 2.5277809143066405, "memory(GiB)": 58.3, "step": 7975, "token_acc": 0.44089456869009586, "train_speed(iter/s)": 1.456793 }, { "epoch": 0.34188766548134186, "grad_norm": 3.620069742202759, "learning_rate": 9.885079920585017e-05, "loss": 2.6460525512695314, "memory(GiB)": 58.3, "step": 7980, "token_acc": 0.45686900958466453, "train_speed(iter/s)": 1.456853 }, { "epoch": 0.3421018808105908, "grad_norm": 3.9305548667907715, "learning_rate": 9.884936420419763e-05, "loss": 2.7537521362304687, "memory(GiB)": 58.3, "step": 7985, "token_acc": 0.44983818770226536, "train_speed(iter/s)": 1.456874 }, { "epoch": 0.34231609613983977, "grad_norm": 5.006189346313477, "learning_rate": 9.884792831759305e-05, "loss": 2.5016101837158202, "memory(GiB)": 58.3, "step": 7990, "token_acc": 0.45934959349593496, "train_speed(iter/s)": 1.45684 }, { "epoch": 0.34253031146908874, "grad_norm": 4.487048149108887, "learning_rate": 9.884649154606242e-05, "loss": 2.4803647994995117, "memory(GiB)": 58.3, "step": 7995, "token_acc": 0.4472843450479233, "train_speed(iter/s)": 1.456952 }, { "epoch": 0.34274452679833767, "grad_norm": 3.460939645767212, "learning_rate": 9.884505388963176e-05, "loss": 2.5205123901367186, "memory(GiB)": 58.3, "step": 8000, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.456872 }, { "epoch": 0.34274452679833767, "eval_loss": 2.3589062690734863, "eval_runtime": 14.3657, "eval_samples_per_second": 6.961, "eval_steps_per_second": 6.961, "eval_token_acc": 0.47733333333333333, "step": 8000 }, { "epoch": 0.34295874212758665, "grad_norm": 3.464186191558838, "learning_rate": 9.884361534832716e-05, "loss": 2.453921890258789, "memory(GiB)": 58.3, "step": 8005, "token_acc": 0.47714285714285715, "train_speed(iter/s)": 1.452755 }, { "epoch": 0.3431729574568356, "grad_norm": 4.4978413581848145, "learning_rate": 9.884217592217461e-05, "loss": 2.5027889251708983, "memory(GiB)": 58.3, "step": 8010, "token_acc": 0.43962848297213625, "train_speed(iter/s)": 1.452906 }, { "epoch": 0.34338717278608455, "grad_norm": 4.614760875701904, "learning_rate": 9.884073561120026e-05, "loss": 2.2955766677856446, "memory(GiB)": 58.3, "step": 8015, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.452973 }, { "epoch": 0.34360138811533353, "grad_norm": 3.8762171268463135, "learning_rate": 9.883929441543014e-05, "loss": 2.495701217651367, "memory(GiB)": 58.3, "step": 8020, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.452594 }, { "epoch": 0.3438156034445825, "grad_norm": 4.507558345794678, "learning_rate": 9.88378523348904e-05, "loss": 2.6824146270751954, "memory(GiB)": 58.3, "step": 8025, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.452281 }, { "epoch": 0.34402981877383143, "grad_norm": 3.8340647220611572, "learning_rate": 9.883640936960716e-05, "loss": 2.4789306640625, "memory(GiB)": 58.3, "step": 8030, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.452378 }, { "epoch": 0.3442440341030804, "grad_norm": 4.548669815063477, "learning_rate": 9.883496551960654e-05, "loss": 2.6601104736328125, "memory(GiB)": 58.3, "step": 8035, "token_acc": 0.45686900958466453, "train_speed(iter/s)": 1.45245 }, { "epoch": 0.3444582494323294, "grad_norm": 3.9123921394348145, "learning_rate": 9.88335207849147e-05, "loss": 2.433611297607422, "memory(GiB)": 58.3, "step": 8040, "token_acc": 0.49523809523809526, "train_speed(iter/s)": 1.45224 }, { "epoch": 0.3446724647615783, "grad_norm": 4.004251003265381, "learning_rate": 9.883207516555784e-05, "loss": 2.5678157806396484, "memory(GiB)": 58.3, "step": 8045, "token_acc": 0.43234323432343236, "train_speed(iter/s)": 1.452301 }, { "epoch": 0.3448866800908273, "grad_norm": 4.088587760925293, "learning_rate": 9.883062866156213e-05, "loss": 2.7233516693115236, "memory(GiB)": 58.3, "step": 8050, "token_acc": 0.4301369863013699, "train_speed(iter/s)": 1.452181 }, { "epoch": 0.34510089542007627, "grad_norm": 5.49698543548584, "learning_rate": 9.882918127295376e-05, "loss": 2.473859405517578, "memory(GiB)": 58.3, "step": 8055, "token_acc": 0.4405594405594406, "train_speed(iter/s)": 1.452229 }, { "epoch": 0.34531511074932525, "grad_norm": 4.037271499633789, "learning_rate": 9.882773299975897e-05, "loss": 3.2395488739013674, "memory(GiB)": 58.3, "step": 8060, "token_acc": 0.4034090909090909, "train_speed(iter/s)": 1.452279 }, { "epoch": 0.3455293260785742, "grad_norm": 3.700503349304199, "learning_rate": 9.8826283842004e-05, "loss": 2.470033073425293, "memory(GiB)": 58.3, "step": 8065, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.452196 }, { "epoch": 0.34574354140782315, "grad_norm": 3.711183547973633, "learning_rate": 9.882483379971509e-05, "loss": 2.552614212036133, "memory(GiB)": 58.3, "step": 8070, "token_acc": 0.48501362397820164, "train_speed(iter/s)": 1.452313 }, { "epoch": 0.34595775673707213, "grad_norm": 3.106621026992798, "learning_rate": 9.882338287291851e-05, "loss": 2.7338134765625, "memory(GiB)": 58.3, "step": 8075, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.452415 }, { "epoch": 0.34617197206632105, "grad_norm": 4.303894519805908, "learning_rate": 9.882193106164055e-05, "loss": 3.065963554382324, "memory(GiB)": 58.3, "step": 8080, "token_acc": 0.3867403314917127, "train_speed(iter/s)": 1.45252 }, { "epoch": 0.34638618739557003, "grad_norm": 4.250564098358154, "learning_rate": 9.882047836590752e-05, "loss": 2.4909812927246096, "memory(GiB)": 58.3, "step": 8085, "token_acc": 0.4895833333333333, "train_speed(iter/s)": 1.452694 }, { "epoch": 0.346600402724819, "grad_norm": 3.559344530105591, "learning_rate": 9.881902478574571e-05, "loss": 2.9835681915283203, "memory(GiB)": 58.3, "step": 8090, "token_acc": 0.40625, "train_speed(iter/s)": 1.452842 }, { "epoch": 0.34681461805406794, "grad_norm": 3.629955768585205, "learning_rate": 9.88175703211815e-05, "loss": 2.6201688766479494, "memory(GiB)": 58.3, "step": 8095, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.452799 }, { "epoch": 0.3470288333833169, "grad_norm": 3.5932557582855225, "learning_rate": 9.88161149722412e-05, "loss": 2.4452743530273438, "memory(GiB)": 58.3, "step": 8100, "token_acc": 0.47416413373860183, "train_speed(iter/s)": 1.452743 }, { "epoch": 0.3472430487125659, "grad_norm": 3.6034882068634033, "learning_rate": 9.881465873895116e-05, "loss": 3.03234977722168, "memory(GiB)": 58.3, "step": 8105, "token_acc": 0.4391891891891892, "train_speed(iter/s)": 1.452636 }, { "epoch": 0.3474572640418148, "grad_norm": 5.087571620941162, "learning_rate": 9.881320162133781e-05, "loss": 2.5947038650512697, "memory(GiB)": 58.3, "step": 8110, "token_acc": 0.4332129963898917, "train_speed(iter/s)": 1.452818 }, { "epoch": 0.3476714793710638, "grad_norm": 3.706878423690796, "learning_rate": 9.881174361942751e-05, "loss": 2.3738880157470703, "memory(GiB)": 58.3, "step": 8115, "token_acc": 0.5370370370370371, "train_speed(iter/s)": 1.452831 }, { "epoch": 0.3478856947003128, "grad_norm": 3.635059356689453, "learning_rate": 9.881028473324669e-05, "loss": 2.608393096923828, "memory(GiB)": 58.3, "step": 8120, "token_acc": 0.4006514657980456, "train_speed(iter/s)": 1.452928 }, { "epoch": 0.3480999100295617, "grad_norm": 3.160557985305786, "learning_rate": 9.880882496282176e-05, "loss": 2.796361541748047, "memory(GiB)": 58.3, "step": 8125, "token_acc": 0.445141065830721, "train_speed(iter/s)": 1.453067 }, { "epoch": 0.3483141253588107, "grad_norm": 4.458441257476807, "learning_rate": 9.88073643081792e-05, "loss": 2.282944679260254, "memory(GiB)": 58.3, "step": 8130, "token_acc": 0.5196850393700787, "train_speed(iter/s)": 1.453111 }, { "epoch": 0.34852834068805966, "grad_norm": 4.312187671661377, "learning_rate": 9.880590276934543e-05, "loss": 2.600994873046875, "memory(GiB)": 58.3, "step": 8135, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.453122 }, { "epoch": 0.3487425560173086, "grad_norm": 3.5220563411712646, "learning_rate": 9.880444034634698e-05, "loss": 2.680997848510742, "memory(GiB)": 58.3, "step": 8140, "token_acc": 0.43661971830985913, "train_speed(iter/s)": 1.453279 }, { "epoch": 0.34895677134655756, "grad_norm": 3.735930919647217, "learning_rate": 9.880297703921027e-05, "loss": 2.503192901611328, "memory(GiB)": 58.3, "step": 8145, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.453363 }, { "epoch": 0.34917098667580654, "grad_norm": 5.526791572570801, "learning_rate": 9.880151284796187e-05, "loss": 2.8281320571899413, "memory(GiB)": 58.3, "step": 8150, "token_acc": 0.40794223826714804, "train_speed(iter/s)": 1.453434 }, { "epoch": 0.34938520200505546, "grad_norm": 6.912009239196777, "learning_rate": 9.880004777262829e-05, "loss": 2.6142414093017576, "memory(GiB)": 58.3, "step": 8155, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.453425 }, { "epoch": 0.34959941733430444, "grad_norm": 4.234181880950928, "learning_rate": 9.879858181323607e-05, "loss": 2.4705215454101563, "memory(GiB)": 58.3, "step": 8160, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.453481 }, { "epoch": 0.3498136326635534, "grad_norm": 4.864664554595947, "learning_rate": 9.879711496981174e-05, "loss": 2.624741554260254, "memory(GiB)": 58.3, "step": 8165, "token_acc": 0.46715328467153283, "train_speed(iter/s)": 1.453658 }, { "epoch": 0.35002784799280234, "grad_norm": 6.615359306335449, "learning_rate": 9.879564724238193e-05, "loss": 2.94897403717041, "memory(GiB)": 58.3, "step": 8170, "token_acc": 0.3763440860215054, "train_speed(iter/s)": 1.45379 }, { "epoch": 0.3502420633220513, "grad_norm": 3.722355365753174, "learning_rate": 9.879417863097318e-05, "loss": 2.5865571975708006, "memory(GiB)": 58.3, "step": 8175, "token_acc": 0.4351851851851852, "train_speed(iter/s)": 1.453813 }, { "epoch": 0.3504562786513003, "grad_norm": 4.878786563873291, "learning_rate": 9.879270913561209e-05, "loss": 2.4978450775146483, "memory(GiB)": 58.3, "step": 8180, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.453702 }, { "epoch": 0.3506704939805492, "grad_norm": 4.959492206573486, "learning_rate": 9.879123875632534e-05, "loss": 2.77236385345459, "memory(GiB)": 58.3, "step": 8185, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.453822 }, { "epoch": 0.3508847093097982, "grad_norm": 4.195497512817383, "learning_rate": 9.878976749313951e-05, "loss": 2.7344221115112304, "memory(GiB)": 58.3, "step": 8190, "token_acc": 0.4658385093167702, "train_speed(iter/s)": 1.453561 }, { "epoch": 0.3510989246390472, "grad_norm": 4.9788947105407715, "learning_rate": 9.878829534608127e-05, "loss": 2.6990888595581053, "memory(GiB)": 58.3, "step": 8195, "token_acc": 0.4493927125506073, "train_speed(iter/s)": 1.45361 }, { "epoch": 0.3513131399682961, "grad_norm": 4.959716796875, "learning_rate": 9.878682231517731e-05, "loss": 2.3707590103149414, "memory(GiB)": 58.3, "step": 8200, "token_acc": 0.5167173252279635, "train_speed(iter/s)": 1.453289 }, { "epoch": 0.3515273552975451, "grad_norm": 3.226439952850342, "learning_rate": 9.878534840045428e-05, "loss": 2.610869789123535, "memory(GiB)": 58.3, "step": 8205, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.453178 }, { "epoch": 0.35174157062679406, "grad_norm": 3.548597574234009, "learning_rate": 9.878387360193891e-05, "loss": 2.5473773956298826, "memory(GiB)": 58.3, "step": 8210, "token_acc": 0.4512987012987013, "train_speed(iter/s)": 1.453252 }, { "epoch": 0.351955785956043, "grad_norm": 6.055249214172363, "learning_rate": 9.87823979196579e-05, "loss": 2.609727478027344, "memory(GiB)": 58.3, "step": 8215, "token_acc": 0.45041322314049587, "train_speed(iter/s)": 1.453461 }, { "epoch": 0.35217000128529197, "grad_norm": 3.633150100708008, "learning_rate": 9.8780921353638e-05, "loss": 2.622083282470703, "memory(GiB)": 58.3, "step": 8220, "token_acc": 0.4603658536585366, "train_speed(iter/s)": 1.453509 }, { "epoch": 0.35238421661454095, "grad_norm": 4.759337902069092, "learning_rate": 9.877944390390594e-05, "loss": 2.794211769104004, "memory(GiB)": 58.3, "step": 8225, "token_acc": 0.4139194139194139, "train_speed(iter/s)": 1.453525 }, { "epoch": 0.3525984319437899, "grad_norm": 4.9054131507873535, "learning_rate": 9.87779655704885e-05, "loss": 2.510045051574707, "memory(GiB)": 58.3, "step": 8230, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.453481 }, { "epoch": 0.35281264727303885, "grad_norm": 3.7385787963867188, "learning_rate": 9.877648635341245e-05, "loss": 2.4643476486206053, "memory(GiB)": 58.3, "step": 8235, "token_acc": 0.44727272727272727, "train_speed(iter/s)": 1.453471 }, { "epoch": 0.3530268626022878, "grad_norm": 4.54408073425293, "learning_rate": 9.877500625270459e-05, "loss": 2.8306304931640627, "memory(GiB)": 58.3, "step": 8240, "token_acc": 0.4409722222222222, "train_speed(iter/s)": 1.453642 }, { "epoch": 0.3532410779315368, "grad_norm": 6.045342922210693, "learning_rate": 9.877352526839174e-05, "loss": 2.7010244369506835, "memory(GiB)": 58.3, "step": 8245, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.45362 }, { "epoch": 0.35345529326078573, "grad_norm": 3.9131600856781006, "learning_rate": 9.877204340050075e-05, "loss": 2.348368835449219, "memory(GiB)": 58.3, "step": 8250, "token_acc": 0.5098814229249012, "train_speed(iter/s)": 1.453505 }, { "epoch": 0.3536695085900347, "grad_norm": 4.147241115570068, "learning_rate": 9.87705606490584e-05, "loss": 2.5031055450439452, "memory(GiB)": 58.3, "step": 8255, "token_acc": 0.4629080118694362, "train_speed(iter/s)": 1.453343 }, { "epoch": 0.3538837239192837, "grad_norm": 5.277980327606201, "learning_rate": 9.876907701409164e-05, "loss": 2.503815841674805, "memory(GiB)": 58.3, "step": 8260, "token_acc": 0.4548736462093863, "train_speed(iter/s)": 1.453521 }, { "epoch": 0.3540979392485326, "grad_norm": 4.859445095062256, "learning_rate": 9.876759249562727e-05, "loss": 2.5020065307617188, "memory(GiB)": 58.3, "step": 8265, "token_acc": 0.44483985765124556, "train_speed(iter/s)": 1.453608 }, { "epoch": 0.3543121545777816, "grad_norm": 4.326780319213867, "learning_rate": 9.876610709369221e-05, "loss": 2.516596794128418, "memory(GiB)": 58.3, "step": 8270, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.453578 }, { "epoch": 0.35452636990703057, "grad_norm": 10.966876983642578, "learning_rate": 9.876462080831338e-05, "loss": 3.038375663757324, "memory(GiB)": 58.3, "step": 8275, "token_acc": 0.43356643356643354, "train_speed(iter/s)": 1.453699 }, { "epoch": 0.3547405852362795, "grad_norm": 5.078854084014893, "learning_rate": 9.876313363951772e-05, "loss": 2.286676788330078, "memory(GiB)": 58.3, "step": 8280, "token_acc": 0.46846846846846846, "train_speed(iter/s)": 1.453735 }, { "epoch": 0.35495480056552847, "grad_norm": 3.905805826187134, "learning_rate": 9.876164558733213e-05, "loss": 2.6840614318847655, "memory(GiB)": 58.3, "step": 8285, "token_acc": 0.42295081967213116, "train_speed(iter/s)": 1.45379 }, { "epoch": 0.35516901589477745, "grad_norm": 4.04298734664917, "learning_rate": 9.87601566517836e-05, "loss": 2.526031494140625, "memory(GiB)": 58.3, "step": 8290, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.453525 }, { "epoch": 0.3553832312240264, "grad_norm": 3.051178455352783, "learning_rate": 9.875866683289907e-05, "loss": 2.277154731750488, "memory(GiB)": 58.3, "step": 8295, "token_acc": 0.5374592833876222, "train_speed(iter/s)": 1.453497 }, { "epoch": 0.35559744655327535, "grad_norm": 3.419931173324585, "learning_rate": 9.875717613070558e-05, "loss": 2.6096355438232424, "memory(GiB)": 58.3, "step": 8300, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.45345 }, { "epoch": 0.35581166188252433, "grad_norm": 3.775873899459839, "learning_rate": 9.875568454523008e-05, "loss": 2.7648088455200197, "memory(GiB)": 58.3, "step": 8305, "token_acc": 0.41914191419141916, "train_speed(iter/s)": 1.453653 }, { "epoch": 0.35602587721177326, "grad_norm": 4.657072067260742, "learning_rate": 9.875419207649963e-05, "loss": 2.469293975830078, "memory(GiB)": 58.3, "step": 8310, "token_acc": 0.4318181818181818, "train_speed(iter/s)": 1.453677 }, { "epoch": 0.35624009254102224, "grad_norm": 3.663996934890747, "learning_rate": 9.875269872454127e-05, "loss": 2.6888759613037108, "memory(GiB)": 58.3, "step": 8315, "token_acc": 0.4407894736842105, "train_speed(iter/s)": 1.453757 }, { "epoch": 0.3564543078702712, "grad_norm": 4.971199035644531, "learning_rate": 9.875120448938201e-05, "loss": 2.319040870666504, "memory(GiB)": 58.3, "step": 8320, "token_acc": 0.5590551181102362, "train_speed(iter/s)": 1.453754 }, { "epoch": 0.35666852319952014, "grad_norm": 3.797389030456543, "learning_rate": 9.874970937104897e-05, "loss": 2.7418405532836916, "memory(GiB)": 58.3, "step": 8325, "token_acc": 0.4452296819787986, "train_speed(iter/s)": 1.453754 }, { "epoch": 0.3568827385287691, "grad_norm": 3.2229490280151367, "learning_rate": 9.87482133695692e-05, "loss": 2.6428028106689454, "memory(GiB)": 58.3, "step": 8330, "token_acc": 0.4491803278688525, "train_speed(iter/s)": 1.453805 }, { "epoch": 0.3570969538580181, "grad_norm": 5.507114887237549, "learning_rate": 9.874671648496983e-05, "loss": 2.785486602783203, "memory(GiB)": 58.3, "step": 8335, "token_acc": 0.4304635761589404, "train_speed(iter/s)": 1.453797 }, { "epoch": 0.357311169187267, "grad_norm": 4.2238311767578125, "learning_rate": 9.874521871727795e-05, "loss": 2.508288764953613, "memory(GiB)": 58.3, "step": 8340, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.453767 }, { "epoch": 0.357525384516516, "grad_norm": 3.8674449920654297, "learning_rate": 9.87437200665207e-05, "loss": 2.3915096282958985, "memory(GiB)": 58.3, "step": 8345, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.453642 }, { "epoch": 0.357739599845765, "grad_norm": 4.345756530761719, "learning_rate": 9.874222053272526e-05, "loss": 2.6997234344482424, "memory(GiB)": 58.3, "step": 8350, "token_acc": 0.4472049689440994, "train_speed(iter/s)": 1.453505 }, { "epoch": 0.3579538151750139, "grad_norm": 4.850778579711914, "learning_rate": 9.874072011591875e-05, "loss": 2.4386642456054686, "memory(GiB)": 58.3, "step": 8355, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.453669 }, { "epoch": 0.3581680305042629, "grad_norm": 3.9055323600769043, "learning_rate": 9.873921881612839e-05, "loss": 2.7483781814575194, "memory(GiB)": 58.3, "step": 8360, "token_acc": 0.46099290780141844, "train_speed(iter/s)": 1.453667 }, { "epoch": 0.35838224583351186, "grad_norm": 3.9249587059020996, "learning_rate": 9.873771663338134e-05, "loss": 2.520079231262207, "memory(GiB)": 58.3, "step": 8365, "token_acc": 0.46706586826347307, "train_speed(iter/s)": 1.453655 }, { "epoch": 0.3585964611627608, "grad_norm": 4.477032661437988, "learning_rate": 9.873621356770485e-05, "loss": 2.4322654724121096, "memory(GiB)": 58.3, "step": 8370, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.453563 }, { "epoch": 0.35881067649200976, "grad_norm": 4.751509666442871, "learning_rate": 9.873470961912612e-05, "loss": 2.9733051300048827, "memory(GiB)": 58.3, "step": 8375, "token_acc": 0.4370860927152318, "train_speed(iter/s)": 1.45348 }, { "epoch": 0.35902489182125874, "grad_norm": 5.375596523284912, "learning_rate": 9.873320478767242e-05, "loss": 2.792676544189453, "memory(GiB)": 58.3, "step": 8380, "token_acc": 0.4474708171206226, "train_speed(iter/s)": 1.453546 }, { "epoch": 0.35923910715050766, "grad_norm": 4.573060035705566, "learning_rate": 9.873169907337098e-05, "loss": 2.275337791442871, "memory(GiB)": 58.3, "step": 8385, "token_acc": 0.5224489795918368, "train_speed(iter/s)": 1.453406 }, { "epoch": 0.35945332247975664, "grad_norm": 4.999676704406738, "learning_rate": 9.873019247624913e-05, "loss": 2.385422134399414, "memory(GiB)": 58.3, "step": 8390, "token_acc": 0.48134328358208955, "train_speed(iter/s)": 1.45333 }, { "epoch": 0.3596675378090056, "grad_norm": 3.7001821994781494, "learning_rate": 9.87286849963341e-05, "loss": 2.6264087677001955, "memory(GiB)": 58.3, "step": 8395, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.453251 }, { "epoch": 0.3598817531382546, "grad_norm": 2.916172504425049, "learning_rate": 9.872717663365325e-05, "loss": 2.6320735931396486, "memory(GiB)": 58.3, "step": 8400, "token_acc": 0.46866485013623976, "train_speed(iter/s)": 1.453154 }, { "epoch": 0.3600959684675035, "grad_norm": 4.477244853973389, "learning_rate": 9.872566738823388e-05, "loss": 2.71956729888916, "memory(GiB)": 58.3, "step": 8405, "token_acc": 0.4406779661016949, "train_speed(iter/s)": 1.453321 }, { "epoch": 0.3603101837967525, "grad_norm": 3.813852310180664, "learning_rate": 9.872415726010334e-05, "loss": 2.7722780227661135, "memory(GiB)": 58.3, "step": 8410, "token_acc": 0.45918367346938777, "train_speed(iter/s)": 1.453045 }, { "epoch": 0.3605243991260015, "grad_norm": 4.583105564117432, "learning_rate": 9.872264624928898e-05, "loss": 2.407649803161621, "memory(GiB)": 58.3, "step": 8415, "token_acc": 0.5055762081784386, "train_speed(iter/s)": 1.452983 }, { "epoch": 0.3607386144552504, "grad_norm": 3.8841018676757812, "learning_rate": 9.872113435581818e-05, "loss": 2.5894634246826174, "memory(GiB)": 58.3, "step": 8420, "token_acc": 0.4743202416918429, "train_speed(iter/s)": 1.45283 }, { "epoch": 0.3609528297844994, "grad_norm": 5.313314437866211, "learning_rate": 9.871962157971832e-05, "loss": 2.122224235534668, "memory(GiB)": 58.3, "step": 8425, "token_acc": 0.5283842794759825, "train_speed(iter/s)": 1.452825 }, { "epoch": 0.36116704511374836, "grad_norm": 4.664624214172363, "learning_rate": 9.871810792101681e-05, "loss": 2.8059823989868162, "memory(GiB)": 58.3, "step": 8430, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 1.452979 }, { "epoch": 0.3613812604429973, "grad_norm": 4.487034320831299, "learning_rate": 9.871659337974109e-05, "loss": 2.6467830657958986, "memory(GiB)": 58.3, "step": 8435, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.452817 }, { "epoch": 0.36159547577224627, "grad_norm": 4.632307529449463, "learning_rate": 9.871507795591857e-05, "loss": 2.727056694030762, "memory(GiB)": 58.3, "step": 8440, "token_acc": 0.4491803278688525, "train_speed(iter/s)": 1.452824 }, { "epoch": 0.36180969110149525, "grad_norm": 4.677274227142334, "learning_rate": 9.87135616495767e-05, "loss": 2.635938835144043, "memory(GiB)": 58.3, "step": 8445, "token_acc": 0.4959349593495935, "train_speed(iter/s)": 1.452819 }, { "epoch": 0.36202390643074417, "grad_norm": 4.1115336418151855, "learning_rate": 9.871204446074298e-05, "loss": 2.6627834320068358, "memory(GiB)": 58.3, "step": 8450, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.452615 }, { "epoch": 0.36223812175999315, "grad_norm": 4.5900750160217285, "learning_rate": 9.871052638944489e-05, "loss": 2.5138330459594727, "memory(GiB)": 58.3, "step": 8455, "token_acc": 0.4858156028368794, "train_speed(iter/s)": 1.452559 }, { "epoch": 0.3624523370892421, "grad_norm": 6.558152675628662, "learning_rate": 9.87090074357099e-05, "loss": 2.6777801513671875, "memory(GiB)": 58.3, "step": 8460, "token_acc": 0.4790996784565916, "train_speed(iter/s)": 1.452618 }, { "epoch": 0.36266655241849105, "grad_norm": 3.3310494422912598, "learning_rate": 9.870748759956556e-05, "loss": 2.4743141174316405, "memory(GiB)": 58.3, "step": 8465, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.452814 }, { "epoch": 0.36288076774774003, "grad_norm": 4.387814044952393, "learning_rate": 9.87059668810394e-05, "loss": 2.566577911376953, "memory(GiB)": 58.3, "step": 8470, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.452952 }, { "epoch": 0.363094983076989, "grad_norm": 4.381251811981201, "learning_rate": 9.870444528015895e-05, "loss": 2.745221710205078, "memory(GiB)": 58.3, "step": 8475, "token_acc": 0.41389728096676737, "train_speed(iter/s)": 1.452875 }, { "epoch": 0.36330919840623793, "grad_norm": 4.326045513153076, "learning_rate": 9.870292279695177e-05, "loss": 2.5108896255493165, "memory(GiB)": 58.3, "step": 8480, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.453156 }, { "epoch": 0.3635234137354869, "grad_norm": 3.560746908187866, "learning_rate": 9.870139943144547e-05, "loss": 2.908544731140137, "memory(GiB)": 58.3, "step": 8485, "token_acc": 0.43272727272727274, "train_speed(iter/s)": 1.453043 }, { "epoch": 0.3637376290647359, "grad_norm": 4.861599445343018, "learning_rate": 9.869987518366763e-05, "loss": 2.437498092651367, "memory(GiB)": 58.3, "step": 8490, "token_acc": 0.5034722222222222, "train_speed(iter/s)": 1.453108 }, { "epoch": 0.3639518443939848, "grad_norm": 4.313116550445557, "learning_rate": 9.869835005364587e-05, "loss": 2.867035484313965, "memory(GiB)": 58.3, "step": 8495, "token_acc": 0.4520547945205479, "train_speed(iter/s)": 1.453244 }, { "epoch": 0.3641660597232338, "grad_norm": 3.0654003620147705, "learning_rate": 9.869682404140781e-05, "loss": 2.5062629699707033, "memory(GiB)": 58.3, "step": 8500, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.453331 }, { "epoch": 0.3641660597232338, "eval_loss": 2.2773027420043945, "eval_runtime": 14.6047, "eval_samples_per_second": 6.847, "eval_steps_per_second": 6.847, "eval_token_acc": 0.4375, "step": 8500 }, { "epoch": 0.36438027505248277, "grad_norm": 4.361446857452393, "learning_rate": 9.869529714698111e-05, "loss": 2.8385292053222657, "memory(GiB)": 58.3, "step": 8505, "token_acc": 0.43667296786389415, "train_speed(iter/s)": 1.449559 }, { "epoch": 0.3645944903817317, "grad_norm": 4.428905963897705, "learning_rate": 9.869376937039342e-05, "loss": 2.4612777709960936, "memory(GiB)": 58.3, "step": 8510, "token_acc": 0.49624060150375937, "train_speed(iter/s)": 1.449048 }, { "epoch": 0.3648087057109807, "grad_norm": 3.629077911376953, "learning_rate": 9.869224071167242e-05, "loss": 2.7202875137329103, "memory(GiB)": 58.3, "step": 8515, "token_acc": 0.4522058823529412, "train_speed(iter/s)": 1.449052 }, { "epoch": 0.36502292104022965, "grad_norm": 3.568740129470825, "learning_rate": 9.869071117084581e-05, "loss": 2.969765472412109, "memory(GiB)": 58.3, "step": 8520, "token_acc": 0.41420118343195267, "train_speed(iter/s)": 1.448825 }, { "epoch": 0.3652371363694786, "grad_norm": 4.509570598602295, "learning_rate": 9.868918074794126e-05, "loss": 2.496523857116699, "memory(GiB)": 58.3, "step": 8525, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.448846 }, { "epoch": 0.36545135169872756, "grad_norm": 4.692948341369629, "learning_rate": 9.868764944298656e-05, "loss": 2.6465007781982424, "memory(GiB)": 58.3, "step": 8530, "token_acc": 0.4471299093655589, "train_speed(iter/s)": 1.448729 }, { "epoch": 0.36566556702797653, "grad_norm": 4.059881210327148, "learning_rate": 9.86861172560094e-05, "loss": 2.5645797729492186, "memory(GiB)": 58.3, "step": 8535, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.448923 }, { "epoch": 0.36587978235722546, "grad_norm": 4.927090644836426, "learning_rate": 9.868458418703756e-05, "loss": 2.505494499206543, "memory(GiB)": 58.3, "step": 8540, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.449056 }, { "epoch": 0.36609399768647444, "grad_norm": 5.3619866371154785, "learning_rate": 9.868305023609881e-05, "loss": 2.314160919189453, "memory(GiB)": 58.3, "step": 8545, "token_acc": 0.515527950310559, "train_speed(iter/s)": 1.44932 }, { "epoch": 0.3663082130157234, "grad_norm": 4.235127925872803, "learning_rate": 9.868151540322094e-05, "loss": 2.611054611206055, "memory(GiB)": 58.3, "step": 8550, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.449418 }, { "epoch": 0.36652242834497234, "grad_norm": 4.088761806488037, "learning_rate": 9.867997968843175e-05, "loss": 2.767421340942383, "memory(GiB)": 58.3, "step": 8555, "token_acc": 0.4199288256227758, "train_speed(iter/s)": 1.449525 }, { "epoch": 0.3667366436742213, "grad_norm": 5.123048782348633, "learning_rate": 9.867844309175906e-05, "loss": 2.5393394470214843, "memory(GiB)": 58.3, "step": 8560, "token_acc": 0.47076023391812866, "train_speed(iter/s)": 1.449602 }, { "epoch": 0.3669508590034703, "grad_norm": 3.409019947052002, "learning_rate": 9.86769056132307e-05, "loss": 2.4834274291992187, "memory(GiB)": 58.3, "step": 8565, "token_acc": 0.4709480122324159, "train_speed(iter/s)": 1.449539 }, { "epoch": 0.3671650743327193, "grad_norm": 4.338001251220703, "learning_rate": 9.867536725287455e-05, "loss": 2.516732406616211, "memory(GiB)": 58.3, "step": 8570, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.449536 }, { "epoch": 0.3673792896619682, "grad_norm": 3.4665451049804688, "learning_rate": 9.867382801071844e-05, "loss": 2.4565670013427736, "memory(GiB)": 58.3, "step": 8575, "token_acc": 0.43666666666666665, "train_speed(iter/s)": 1.449611 }, { "epoch": 0.3675935049912172, "grad_norm": 3.400789260864258, "learning_rate": 9.86722878867903e-05, "loss": 2.612316703796387, "memory(GiB)": 58.3, "step": 8580, "token_acc": 0.4290909090909091, "train_speed(iter/s)": 1.449648 }, { "epoch": 0.36780772032046616, "grad_norm": 4.158246040344238, "learning_rate": 9.867074688111799e-05, "loss": 2.8523075103759767, "memory(GiB)": 58.3, "step": 8585, "token_acc": 0.424812030075188, "train_speed(iter/s)": 1.449566 }, { "epoch": 0.3680219356497151, "grad_norm": 3.6310372352600098, "learning_rate": 9.866920499372944e-05, "loss": 2.381735610961914, "memory(GiB)": 58.3, "step": 8590, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.4495 }, { "epoch": 0.36823615097896406, "grad_norm": 4.189392566680908, "learning_rate": 9.86676622246526e-05, "loss": 2.3994396209716795, "memory(GiB)": 58.3, "step": 8595, "token_acc": 0.5145228215767634, "train_speed(iter/s)": 1.449502 }, { "epoch": 0.36845036630821304, "grad_norm": 5.331646919250488, "learning_rate": 9.866611857391541e-05, "loss": 2.479269027709961, "memory(GiB)": 58.3, "step": 8600, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.44951 }, { "epoch": 0.36866458163746196, "grad_norm": 3.725822687149048, "learning_rate": 9.866457404154581e-05, "loss": 2.485365867614746, "memory(GiB)": 58.3, "step": 8605, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.449552 }, { "epoch": 0.36887879696671094, "grad_norm": 4.105701923370361, "learning_rate": 9.86630286275718e-05, "loss": 2.7235858917236326, "memory(GiB)": 58.3, "step": 8610, "token_acc": 0.4441176470588235, "train_speed(iter/s)": 1.44967 }, { "epoch": 0.3690930122959599, "grad_norm": 4.021725177764893, "learning_rate": 9.866148233202139e-05, "loss": 2.4119684219360353, "memory(GiB)": 58.3, "step": 8615, "token_acc": 0.4858156028368794, "train_speed(iter/s)": 1.449672 }, { "epoch": 0.36930722762520884, "grad_norm": 3.6348023414611816, "learning_rate": 9.865993515492258e-05, "loss": 2.7645082473754883, "memory(GiB)": 58.3, "step": 8620, "token_acc": 0.4540059347181009, "train_speed(iter/s)": 1.449739 }, { "epoch": 0.3695214429544578, "grad_norm": 3.8476059436798096, "learning_rate": 9.865838709630339e-05, "loss": 2.7304811477661133, "memory(GiB)": 58.3, "step": 8625, "token_acc": 0.48985507246376814, "train_speed(iter/s)": 1.449832 }, { "epoch": 0.3697356582837068, "grad_norm": 5.059749126434326, "learning_rate": 9.865683815619188e-05, "loss": 2.665720748901367, "memory(GiB)": 58.3, "step": 8630, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.449935 }, { "epoch": 0.3699498736129557, "grad_norm": 3.95491099357605, "learning_rate": 9.865528833461611e-05, "loss": 2.3191860198974608, "memory(GiB)": 58.3, "step": 8635, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 1.449937 }, { "epoch": 0.3701640889422047, "grad_norm": 3.9109537601470947, "learning_rate": 9.865373763160413e-05, "loss": 2.7906370162963867, "memory(GiB)": 58.3, "step": 8640, "token_acc": 0.39792387543252594, "train_speed(iter/s)": 1.449898 }, { "epoch": 0.3703783042714537, "grad_norm": 4.044189453125, "learning_rate": 9.865218604718405e-05, "loss": 2.555398368835449, "memory(GiB)": 58.3, "step": 8645, "token_acc": 0.490625, "train_speed(iter/s)": 1.450051 }, { "epoch": 0.3705925196007026, "grad_norm": 4.719189167022705, "learning_rate": 9.8650633581384e-05, "loss": 2.5803035736083983, "memory(GiB)": 58.3, "step": 8650, "token_acc": 0.4627831715210356, "train_speed(iter/s)": 1.449982 }, { "epoch": 0.3708067349299516, "grad_norm": 4.118319034576416, "learning_rate": 9.864908023423207e-05, "loss": 2.4400157928466797, "memory(GiB)": 58.3, "step": 8655, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.4499 }, { "epoch": 0.37102095025920057, "grad_norm": 3.280740261077881, "learning_rate": 9.864752600575641e-05, "loss": 2.8693151473999023, "memory(GiB)": 58.3, "step": 8660, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.45001 }, { "epoch": 0.3712351655884495, "grad_norm": 4.42762565612793, "learning_rate": 9.864597089598519e-05, "loss": 2.679347038269043, "memory(GiB)": 58.3, "step": 8665, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.449935 }, { "epoch": 0.37144938091769847, "grad_norm": 4.9428887367248535, "learning_rate": 9.864441490494658e-05, "loss": 2.621337127685547, "memory(GiB)": 58.3, "step": 8670, "token_acc": 0.475, "train_speed(iter/s)": 1.449815 }, { "epoch": 0.37166359624694745, "grad_norm": 3.82576584815979, "learning_rate": 9.864285803266876e-05, "loss": 2.7555059432983398, "memory(GiB)": 58.3, "step": 8675, "token_acc": 0.43130990415335463, "train_speed(iter/s)": 1.449837 }, { "epoch": 0.37187781157619637, "grad_norm": 3.818668842315674, "learning_rate": 9.864130027917993e-05, "loss": 2.8221467971801757, "memory(GiB)": 58.3, "step": 8680, "token_acc": 0.42813455657492355, "train_speed(iter/s)": 1.44998 }, { "epoch": 0.37209202690544535, "grad_norm": 4.1920485496521, "learning_rate": 9.863974164450833e-05, "loss": 2.164962959289551, "memory(GiB)": 58.3, "step": 8685, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.449936 }, { "epoch": 0.37230624223469433, "grad_norm": 3.3402833938598633, "learning_rate": 9.863818212868217e-05, "loss": 2.559827423095703, "memory(GiB)": 58.3, "step": 8690, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.449817 }, { "epoch": 0.37252045756394325, "grad_norm": 3.664734125137329, "learning_rate": 9.863662173172971e-05, "loss": 2.6619312286376955, "memory(GiB)": 58.3, "step": 8695, "token_acc": 0.4270833333333333, "train_speed(iter/s)": 1.449801 }, { "epoch": 0.37273467289319223, "grad_norm": 4.694035530090332, "learning_rate": 9.863506045367923e-05, "loss": 2.540768051147461, "memory(GiB)": 58.3, "step": 8700, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.449959 }, { "epoch": 0.3729488882224412, "grad_norm": 6.23726749420166, "learning_rate": 9.863349829455899e-05, "loss": 2.611031341552734, "memory(GiB)": 58.3, "step": 8705, "token_acc": 0.48091603053435117, "train_speed(iter/s)": 1.449981 }, { "epoch": 0.37316310355169013, "grad_norm": 3.676201105117798, "learning_rate": 9.863193525439734e-05, "loss": 2.4545129776000976, "memory(GiB)": 58.3, "step": 8710, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.449955 }, { "epoch": 0.3733773188809391, "grad_norm": 4.304929256439209, "learning_rate": 9.863037133322252e-05, "loss": 2.3013126373291017, "memory(GiB)": 58.3, "step": 8715, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.449877 }, { "epoch": 0.3735915342101881, "grad_norm": 4.57396125793457, "learning_rate": 9.862880653106294e-05, "loss": 2.7888824462890627, "memory(GiB)": 58.3, "step": 8720, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.449935 }, { "epoch": 0.373805749539437, "grad_norm": 4.094754695892334, "learning_rate": 9.86272408479469e-05, "loss": 2.3246604919433596, "memory(GiB)": 58.3, "step": 8725, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.449748 }, { "epoch": 0.374019964868686, "grad_norm": 5.775775909423828, "learning_rate": 9.862567428390277e-05, "loss": 2.8006607055664063, "memory(GiB)": 58.3, "step": 8730, "token_acc": 0.45416666666666666, "train_speed(iter/s)": 1.449714 }, { "epoch": 0.374234180197935, "grad_norm": 4.251965045928955, "learning_rate": 9.862410683895895e-05, "loss": 2.8104974746704103, "memory(GiB)": 58.3, "step": 8735, "token_acc": 0.41530054644808745, "train_speed(iter/s)": 1.449563 }, { "epoch": 0.37444839552718395, "grad_norm": 4.093268871307373, "learning_rate": 9.86225385131438e-05, "loss": 2.719206428527832, "memory(GiB)": 58.3, "step": 8740, "token_acc": 0.4472049689440994, "train_speed(iter/s)": 1.449646 }, { "epoch": 0.3746626108564329, "grad_norm": 3.7247488498687744, "learning_rate": 9.862096930648577e-05, "loss": 2.518401336669922, "memory(GiB)": 58.3, "step": 8745, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.449695 }, { "epoch": 0.37487682618568186, "grad_norm": 6.050350189208984, "learning_rate": 9.861939921901326e-05, "loss": 2.544095993041992, "memory(GiB)": 58.3, "step": 8750, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.449597 }, { "epoch": 0.37509104151493083, "grad_norm": 5.940671920776367, "learning_rate": 9.861782825075475e-05, "loss": 2.575368309020996, "memory(GiB)": 58.3, "step": 8755, "token_acc": 0.4591439688715953, "train_speed(iter/s)": 1.449682 }, { "epoch": 0.37530525684417976, "grad_norm": 4.251453399658203, "learning_rate": 9.861625640173865e-05, "loss": 2.481136703491211, "memory(GiB)": 58.3, "step": 8760, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.449969 }, { "epoch": 0.37551947217342874, "grad_norm": 4.797479152679443, "learning_rate": 9.861468367199346e-05, "loss": 2.675431823730469, "memory(GiB)": 58.3, "step": 8765, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.449976 }, { "epoch": 0.3757336875026777, "grad_norm": 4.372289657592773, "learning_rate": 9.861311006154767e-05, "loss": 2.4585329055786134, "memory(GiB)": 58.3, "step": 8770, "token_acc": 0.5057915057915058, "train_speed(iter/s)": 1.449499 }, { "epoch": 0.37594790283192664, "grad_norm": 4.571930885314941, "learning_rate": 9.86115355704298e-05, "loss": 2.6418178558349608, "memory(GiB)": 58.3, "step": 8775, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.44961 }, { "epoch": 0.3761621181611756, "grad_norm": 3.6882073879241943, "learning_rate": 9.860996019866836e-05, "loss": 2.5285486221313476, "memory(GiB)": 58.3, "step": 8780, "token_acc": 0.45871559633027525, "train_speed(iter/s)": 1.449899 }, { "epoch": 0.3763763334904246, "grad_norm": 3.6706016063690186, "learning_rate": 9.860838394629188e-05, "loss": 2.7798057556152345, "memory(GiB)": 58.3, "step": 8785, "token_acc": 0.46794871794871795, "train_speed(iter/s)": 1.449993 }, { "epoch": 0.3765905488196735, "grad_norm": 3.9987995624542236, "learning_rate": 9.860680681332894e-05, "loss": 2.616022491455078, "memory(GiB)": 58.3, "step": 8790, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.450214 }, { "epoch": 0.3768047641489225, "grad_norm": 2.946453094482422, "learning_rate": 9.860522879980809e-05, "loss": 2.4633434295654295, "memory(GiB)": 58.3, "step": 8795, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 1.450324 }, { "epoch": 0.3770189794781715, "grad_norm": 4.215868949890137, "learning_rate": 9.860364990575792e-05, "loss": 2.6088342666625977, "memory(GiB)": 58.3, "step": 8800, "token_acc": 0.463768115942029, "train_speed(iter/s)": 1.450562 }, { "epoch": 0.3772331948074204, "grad_norm": 3.7749364376068115, "learning_rate": 9.860207013120706e-05, "loss": 2.602193832397461, "memory(GiB)": 58.3, "step": 8805, "token_acc": 0.4676258992805755, "train_speed(iter/s)": 1.450677 }, { "epoch": 0.3774474101366694, "grad_norm": 6.906366348266602, "learning_rate": 9.860048947618408e-05, "loss": 2.79365234375, "memory(GiB)": 58.3, "step": 8810, "token_acc": 0.4196078431372549, "train_speed(iter/s)": 1.450832 }, { "epoch": 0.37766162546591836, "grad_norm": 5.40219783782959, "learning_rate": 9.859890794071767e-05, "loss": 2.6922300338745115, "memory(GiB)": 58.3, "step": 8815, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.450909 }, { "epoch": 0.3778758407951673, "grad_norm": 4.223740100860596, "learning_rate": 9.859732552483642e-05, "loss": 2.652150344848633, "memory(GiB)": 58.3, "step": 8820, "token_acc": 0.4277456647398844, "train_speed(iter/s)": 1.450953 }, { "epoch": 0.37809005612441626, "grad_norm": 4.2968597412109375, "learning_rate": 9.859574222856905e-05, "loss": 2.3183837890625, "memory(GiB)": 58.3, "step": 8825, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.450927 }, { "epoch": 0.37830427145366524, "grad_norm": 5.012269973754883, "learning_rate": 9.859415805194422e-05, "loss": 2.4743255615234374, "memory(GiB)": 58.3, "step": 8830, "token_acc": 0.48220064724919093, "train_speed(iter/s)": 1.450937 }, { "epoch": 0.37851848678291417, "grad_norm": 3.280315399169922, "learning_rate": 9.859257299499064e-05, "loss": 2.5424777984619142, "memory(GiB)": 58.3, "step": 8835, "token_acc": 0.4530612244897959, "train_speed(iter/s)": 1.451087 }, { "epoch": 0.37873270211216314, "grad_norm": 3.276996612548828, "learning_rate": 9.859098705773701e-05, "loss": 2.8809751510620116, "memory(GiB)": 58.3, "step": 8840, "token_acc": 0.44, "train_speed(iter/s)": 1.451148 }, { "epoch": 0.3789469174414121, "grad_norm": 2.588827133178711, "learning_rate": 9.858940024021205e-05, "loss": 2.124911880493164, "memory(GiB)": 58.3, "step": 8845, "token_acc": 0.53125, "train_speed(iter/s)": 1.451228 }, { "epoch": 0.37916113277066105, "grad_norm": 4.800065040588379, "learning_rate": 9.858781254244455e-05, "loss": 2.6147735595703123, "memory(GiB)": 58.3, "step": 8850, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.451035 }, { "epoch": 0.37937534809991, "grad_norm": 4.340216636657715, "learning_rate": 9.858622396446325e-05, "loss": 2.6263282775878904, "memory(GiB)": 58.3, "step": 8855, "token_acc": 0.45051194539249145, "train_speed(iter/s)": 1.451073 }, { "epoch": 0.379589563429159, "grad_norm": 6.374390602111816, "learning_rate": 9.85846345062969e-05, "loss": 2.4751708984375, "memory(GiB)": 58.3, "step": 8860, "token_acc": 0.4789272030651341, "train_speed(iter/s)": 1.451247 }, { "epoch": 0.37980377875840793, "grad_norm": 6.287930488586426, "learning_rate": 9.858304416797433e-05, "loss": 2.4763072967529296, "memory(GiB)": 58.3, "step": 8865, "token_acc": 0.46586345381526106, "train_speed(iter/s)": 1.451324 }, { "epoch": 0.3800179940876569, "grad_norm": 4.323367118835449, "learning_rate": 9.858145294952435e-05, "loss": 2.5795166015625, "memory(GiB)": 58.3, "step": 8870, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.451165 }, { "epoch": 0.3802322094169059, "grad_norm": 4.53786563873291, "learning_rate": 9.857986085097577e-05, "loss": 2.6223476409912108, "memory(GiB)": 58.3, "step": 8875, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.451014 }, { "epoch": 0.3804464247461548, "grad_norm": 3.346238851547241, "learning_rate": 9.857826787235744e-05, "loss": 2.848287010192871, "memory(GiB)": 58.3, "step": 8880, "token_acc": 0.4641638225255973, "train_speed(iter/s)": 1.451134 }, { "epoch": 0.3806606400754038, "grad_norm": 3.6543309688568115, "learning_rate": 9.85766740136982e-05, "loss": 2.6900163650512696, "memory(GiB)": 58.3, "step": 8885, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.451118 }, { "epoch": 0.38087485540465277, "grad_norm": 3.6415889263153076, "learning_rate": 9.857507927502697e-05, "loss": 2.757152557373047, "memory(GiB)": 58.3, "step": 8890, "token_acc": 0.46646341463414637, "train_speed(iter/s)": 1.45105 }, { "epoch": 0.3810890707339017, "grad_norm": 3.651844024658203, "learning_rate": 9.85734836563726e-05, "loss": 2.892853355407715, "memory(GiB)": 58.3, "step": 8895, "token_acc": 0.4171597633136095, "train_speed(iter/s)": 1.450978 }, { "epoch": 0.38130328606315067, "grad_norm": 4.364047527313232, "learning_rate": 9.857188715776401e-05, "loss": 2.6258750915527345, "memory(GiB)": 58.3, "step": 8900, "token_acc": 0.44891640866873067, "train_speed(iter/s)": 1.450983 }, { "epoch": 0.38151750139239965, "grad_norm": 3.3479998111724854, "learning_rate": 9.857028977923011e-05, "loss": 2.9866472244262696, "memory(GiB)": 58.3, "step": 8905, "token_acc": 0.43213296398891965, "train_speed(iter/s)": 1.451027 }, { "epoch": 0.38173171672164863, "grad_norm": 3.5985403060913086, "learning_rate": 9.856869152079986e-05, "loss": 2.602186393737793, "memory(GiB)": 58.3, "step": 8910, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.451058 }, { "epoch": 0.38194593205089755, "grad_norm": 3.907210350036621, "learning_rate": 9.85670923825022e-05, "loss": 2.5993650436401365, "memory(GiB)": 58.3, "step": 8915, "token_acc": 0.41, "train_speed(iter/s)": 1.45122 }, { "epoch": 0.38216014738014653, "grad_norm": 5.691684246063232, "learning_rate": 9.856549236436609e-05, "loss": 2.7065284729003904, "memory(GiB)": 58.3, "step": 8920, "token_acc": 0.4392857142857143, "train_speed(iter/s)": 1.451179 }, { "epoch": 0.3823743627093955, "grad_norm": 4.579247951507568, "learning_rate": 9.856389146642054e-05, "loss": 2.5624696731567385, "memory(GiB)": 58.3, "step": 8925, "token_acc": 0.5089605734767025, "train_speed(iter/s)": 1.451182 }, { "epoch": 0.38258857803864443, "grad_norm": 3.177950143814087, "learning_rate": 9.856228968869454e-05, "loss": 2.6077198028564452, "memory(GiB)": 58.3, "step": 8930, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.451272 }, { "epoch": 0.3828027933678934, "grad_norm": 6.067451000213623, "learning_rate": 9.85606870312171e-05, "loss": 2.707725715637207, "memory(GiB)": 58.3, "step": 8935, "token_acc": 0.4560260586319218, "train_speed(iter/s)": 1.45128 }, { "epoch": 0.3830170086971424, "grad_norm": 4.170866966247559, "learning_rate": 9.855908349401727e-05, "loss": 2.529203987121582, "memory(GiB)": 58.3, "step": 8940, "token_acc": 0.49050632911392406, "train_speed(iter/s)": 1.451371 }, { "epoch": 0.3832312240263913, "grad_norm": 4.348936557769775, "learning_rate": 9.855747907712408e-05, "loss": 2.575935935974121, "memory(GiB)": 58.3, "step": 8945, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.451507 }, { "epoch": 0.3834454393556403, "grad_norm": 5.326323986053467, "learning_rate": 9.855587378056661e-05, "loss": 2.651362991333008, "memory(GiB)": 58.3, "step": 8950, "token_acc": 0.45791245791245794, "train_speed(iter/s)": 1.451672 }, { "epoch": 0.3836596546848893, "grad_norm": 4.141667366027832, "learning_rate": 9.855426760437394e-05, "loss": 2.64807014465332, "memory(GiB)": 58.3, "step": 8955, "token_acc": 0.4721189591078067, "train_speed(iter/s)": 1.451907 }, { "epoch": 0.3838738700141382, "grad_norm": 4.396313667297363, "learning_rate": 9.855266054857518e-05, "loss": 2.696456718444824, "memory(GiB)": 58.3, "step": 8960, "token_acc": 0.4318181818181818, "train_speed(iter/s)": 1.452041 }, { "epoch": 0.3840880853433872, "grad_norm": 4.367549896240234, "learning_rate": 9.855105261319939e-05, "loss": 2.4935434341430662, "memory(GiB)": 58.3, "step": 8965, "token_acc": 0.49173553719008267, "train_speed(iter/s)": 1.452163 }, { "epoch": 0.38430230067263615, "grad_norm": 3.9490792751312256, "learning_rate": 9.854944379827577e-05, "loss": 2.7790895462036134, "memory(GiB)": 58.3, "step": 8970, "token_acc": 0.4520123839009288, "train_speed(iter/s)": 1.452369 }, { "epoch": 0.3845165160018851, "grad_norm": 5.052127838134766, "learning_rate": 9.854783410383341e-05, "loss": 2.434416961669922, "memory(GiB)": 58.3, "step": 8975, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.452607 }, { "epoch": 0.38473073133113406, "grad_norm": 5.247725486755371, "learning_rate": 9.85462235299015e-05, "loss": 2.680352973937988, "memory(GiB)": 58.3, "step": 8980, "token_acc": 0.49166666666666664, "train_speed(iter/s)": 1.452584 }, { "epoch": 0.38494494666038304, "grad_norm": 3.5917298793792725, "learning_rate": 9.854461207650922e-05, "loss": 2.3029912948608398, "memory(GiB)": 58.3, "step": 8985, "token_acc": 0.5160142348754448, "train_speed(iter/s)": 1.452604 }, { "epoch": 0.38515916198963196, "grad_norm": 5.086394309997559, "learning_rate": 9.854299974368575e-05, "loss": 2.502714157104492, "memory(GiB)": 58.3, "step": 8990, "token_acc": 0.5120967741935484, "train_speed(iter/s)": 1.452735 }, { "epoch": 0.38537337731888094, "grad_norm": 2.9717671871185303, "learning_rate": 9.85413865314603e-05, "loss": 2.5790231704711912, "memory(GiB)": 58.3, "step": 8995, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.452849 }, { "epoch": 0.3855875926481299, "grad_norm": 6.031939506530762, "learning_rate": 9.85397724398621e-05, "loss": 2.724561309814453, "memory(GiB)": 58.3, "step": 9000, "token_acc": 0.46558704453441296, "train_speed(iter/s)": 1.452797 }, { "epoch": 0.3855875926481299, "eval_loss": 2.1903438568115234, "eval_runtime": 13.7791, "eval_samples_per_second": 7.257, "eval_steps_per_second": 7.257, "eval_token_acc": 0.47277936962750716, "step": 9000 }, { "epoch": 0.38580180797737884, "grad_norm": 5.6124958992004395, "learning_rate": 9.853815746892039e-05, "loss": 2.979076385498047, "memory(GiB)": 58.3, "step": 9005, "token_acc": 0.45857418111753373, "train_speed(iter/s)": 1.449633 }, { "epoch": 0.3860160233066278, "grad_norm": 6.156009674072266, "learning_rate": 9.853654161866442e-05, "loss": 2.6359397888183596, "memory(GiB)": 58.3, "step": 9010, "token_acc": 0.452, "train_speed(iter/s)": 1.449689 }, { "epoch": 0.3862302386358768, "grad_norm": 3.8203330039978027, "learning_rate": 9.853492488912347e-05, "loss": 2.7224937438964845, "memory(GiB)": 58.3, "step": 9015, "token_acc": 0.44, "train_speed(iter/s)": 1.449882 }, { "epoch": 0.3864444539651257, "grad_norm": 4.893547534942627, "learning_rate": 9.853330728032682e-05, "loss": 2.5215385437011717, "memory(GiB)": 58.3, "step": 9020, "token_acc": 0.468503937007874, "train_speed(iter/s)": 1.449883 }, { "epoch": 0.3866586692943747, "grad_norm": 3.1681206226348877, "learning_rate": 9.853168879230379e-05, "loss": 2.557341957092285, "memory(GiB)": 58.3, "step": 9025, "token_acc": 0.4984894259818731, "train_speed(iter/s)": 1.449515 }, { "epoch": 0.3868728846236237, "grad_norm": 4.308308124542236, "learning_rate": 9.853006942508369e-05, "loss": 2.6847625732421876, "memory(GiB)": 58.3, "step": 9030, "token_acc": 0.4461942257217848, "train_speed(iter/s)": 1.449602 }, { "epoch": 0.3870870999528726, "grad_norm": 3.6191303730010986, "learning_rate": 9.852844917869585e-05, "loss": 2.3625089645385744, "memory(GiB)": 58.3, "step": 9035, "token_acc": 0.48945147679324896, "train_speed(iter/s)": 1.449776 }, { "epoch": 0.3873013152821216, "grad_norm": 3.9500842094421387, "learning_rate": 9.852682805316964e-05, "loss": 2.590120315551758, "memory(GiB)": 58.3, "step": 9040, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.449744 }, { "epoch": 0.38751553061137056, "grad_norm": 3.949265480041504, "learning_rate": 9.852520604853442e-05, "loss": 2.4950660705566405, "memory(GiB)": 58.3, "step": 9045, "token_acc": 0.49074074074074076, "train_speed(iter/s)": 1.449912 }, { "epoch": 0.3877297459406195, "grad_norm": 3.363978624343872, "learning_rate": 9.852358316481955e-05, "loss": 2.665091323852539, "memory(GiB)": 58.3, "step": 9050, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.450148 }, { "epoch": 0.38794396126986846, "grad_norm": 3.606065273284912, "learning_rate": 9.852195940205448e-05, "loss": 2.7039356231689453, "memory(GiB)": 58.3, "step": 9055, "token_acc": 0.44366197183098594, "train_speed(iter/s)": 1.450109 }, { "epoch": 0.38815817659911744, "grad_norm": 4.576938152313232, "learning_rate": 9.852033476026859e-05, "loss": 2.7347015380859374, "memory(GiB)": 58.3, "step": 9060, "token_acc": 0.43728813559322033, "train_speed(iter/s)": 1.450198 }, { "epoch": 0.38837239192836637, "grad_norm": 4.903129577636719, "learning_rate": 9.851870923949131e-05, "loss": 2.5407920837402345, "memory(GiB)": 58.3, "step": 9065, "token_acc": 0.43137254901960786, "train_speed(iter/s)": 1.450246 }, { "epoch": 0.38858660725761535, "grad_norm": 3.3986198902130127, "learning_rate": 9.85170828397521e-05, "loss": 2.535195159912109, "memory(GiB)": 58.3, "step": 9070, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.450421 }, { "epoch": 0.3888008225868643, "grad_norm": 4.44728422164917, "learning_rate": 9.851545556108042e-05, "loss": 2.296782684326172, "memory(GiB)": 58.3, "step": 9075, "token_acc": 0.4528985507246377, "train_speed(iter/s)": 1.450491 }, { "epoch": 0.3890150379161133, "grad_norm": 4.065402984619141, "learning_rate": 9.851382740350576e-05, "loss": 2.7275468826293947, "memory(GiB)": 58.3, "step": 9080, "token_acc": 0.43283582089552236, "train_speed(iter/s)": 1.450614 }, { "epoch": 0.38922925324536223, "grad_norm": 4.332919120788574, "learning_rate": 9.851219836705761e-05, "loss": 2.8587520599365233, "memory(GiB)": 58.3, "step": 9085, "token_acc": 0.41935483870967744, "train_speed(iter/s)": 1.450797 }, { "epoch": 0.3894434685746112, "grad_norm": 4.267101287841797, "learning_rate": 9.851056845176547e-05, "loss": 2.5675411224365234, "memory(GiB)": 58.3, "step": 9090, "token_acc": 0.4541984732824427, "train_speed(iter/s)": 1.450742 }, { "epoch": 0.3896576839038602, "grad_norm": 4.282329082489014, "learning_rate": 9.850893765765887e-05, "loss": 2.5712690353393555, "memory(GiB)": 58.3, "step": 9095, "token_acc": 0.4134275618374558, "train_speed(iter/s)": 1.450987 }, { "epoch": 0.3898718992331091, "grad_norm": 4.885793685913086, "learning_rate": 9.850730598476737e-05, "loss": 2.836735725402832, "memory(GiB)": 58.3, "step": 9100, "token_acc": 0.45387453874538747, "train_speed(iter/s)": 1.451068 }, { "epoch": 0.3900861145623581, "grad_norm": 3.5165767669677734, "learning_rate": 9.850567343312051e-05, "loss": 2.3478668212890623, "memory(GiB)": 58.3, "step": 9105, "token_acc": 0.545774647887324, "train_speed(iter/s)": 1.45102 }, { "epoch": 0.39030032989160707, "grad_norm": 3.6581642627716064, "learning_rate": 9.850404000274789e-05, "loss": 2.976247024536133, "memory(GiB)": 58.3, "step": 9110, "token_acc": 0.3993174061433447, "train_speed(iter/s)": 1.451054 }, { "epoch": 0.390514545220856, "grad_norm": 3.52714204788208, "learning_rate": 9.850240569367908e-05, "loss": 2.715854263305664, "memory(GiB)": 58.3, "step": 9115, "token_acc": 0.43515850144092216, "train_speed(iter/s)": 1.451117 }, { "epoch": 0.39072876055010497, "grad_norm": 3.781163454055786, "learning_rate": 9.850077050594368e-05, "loss": 2.555784225463867, "memory(GiB)": 58.3, "step": 9120, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.451243 }, { "epoch": 0.39094297587935395, "grad_norm": 4.061525821685791, "learning_rate": 9.849913443957133e-05, "loss": 2.7376424789428713, "memory(GiB)": 58.3, "step": 9125, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.451207 }, { "epoch": 0.3911571912086029, "grad_norm": 4.145669460296631, "learning_rate": 9.849749749459167e-05, "loss": 2.382442092895508, "memory(GiB)": 58.3, "step": 9130, "token_acc": 0.5129032258064516, "train_speed(iter/s)": 1.451246 }, { "epoch": 0.39137140653785185, "grad_norm": 5.671361923217773, "learning_rate": 9.849585967103434e-05, "loss": 2.8731260299682617, "memory(GiB)": 58.3, "step": 9135, "token_acc": 0.4375, "train_speed(iter/s)": 1.451174 }, { "epoch": 0.39158562186710083, "grad_norm": 3.527975559234619, "learning_rate": 9.849422096892902e-05, "loss": 2.7227451324462892, "memory(GiB)": 58.3, "step": 9140, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.451381 }, { "epoch": 0.39179983719634975, "grad_norm": 5.232457160949707, "learning_rate": 9.84925813883054e-05, "loss": 2.906509780883789, "memory(GiB)": 58.3, "step": 9145, "token_acc": 0.4144144144144144, "train_speed(iter/s)": 1.451339 }, { "epoch": 0.39201405252559873, "grad_norm": 4.1620635986328125, "learning_rate": 9.849094092919318e-05, "loss": 2.8824880599975584, "memory(GiB)": 58.3, "step": 9150, "token_acc": 0.4209039548022599, "train_speed(iter/s)": 1.451499 }, { "epoch": 0.3922282678548477, "grad_norm": 4.052217960357666, "learning_rate": 9.848929959162207e-05, "loss": 2.8716266632080076, "memory(GiB)": 58.3, "step": 9155, "token_acc": 0.439873417721519, "train_speed(iter/s)": 1.451558 }, { "epoch": 0.39244248318409664, "grad_norm": 3.931018590927124, "learning_rate": 9.848765737562183e-05, "loss": 2.4714435577392577, "memory(GiB)": 58.3, "step": 9160, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.451644 }, { "epoch": 0.3926566985133456, "grad_norm": 3.8844563961029053, "learning_rate": 9.848601428122217e-05, "loss": 2.8242166519165037, "memory(GiB)": 58.3, "step": 9165, "token_acc": 0.43209876543209874, "train_speed(iter/s)": 1.451748 }, { "epoch": 0.3928709138425946, "grad_norm": 5.687803268432617, "learning_rate": 9.848437030845288e-05, "loss": 2.89019718170166, "memory(GiB)": 58.3, "step": 9170, "token_acc": 0.44609665427509293, "train_speed(iter/s)": 1.451899 }, { "epoch": 0.3930851291718435, "grad_norm": 5.416097164154053, "learning_rate": 9.848272545734374e-05, "loss": 2.4476722717285155, "memory(GiB)": 58.3, "step": 9175, "token_acc": 0.4901315789473684, "train_speed(iter/s)": 1.451799 }, { "epoch": 0.3932993445010925, "grad_norm": 3.545675754547119, "learning_rate": 9.848107972792455e-05, "loss": 2.467362976074219, "memory(GiB)": 58.3, "step": 9180, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.451979 }, { "epoch": 0.3935135598303415, "grad_norm": 5.395636081695557, "learning_rate": 9.847943312022511e-05, "loss": 2.738362121582031, "memory(GiB)": 58.3, "step": 9185, "token_acc": 0.4155405405405405, "train_speed(iter/s)": 1.452146 }, { "epoch": 0.3937277751595904, "grad_norm": 4.503758430480957, "learning_rate": 9.847778563427529e-05, "loss": 2.4091026306152346, "memory(GiB)": 58.3, "step": 9190, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.452289 }, { "epoch": 0.3939419904888394, "grad_norm": 4.088322639465332, "learning_rate": 9.84761372701049e-05, "loss": 2.3184776306152344, "memory(GiB)": 58.3, "step": 9195, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.452244 }, { "epoch": 0.39415620581808836, "grad_norm": 3.4466676712036133, "learning_rate": 9.847448802774379e-05, "loss": 2.609083557128906, "memory(GiB)": 58.3, "step": 9200, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.452226 }, { "epoch": 0.3943704211473373, "grad_norm": 3.4711740016937256, "learning_rate": 9.847283790722187e-05, "loss": 2.7564361572265623, "memory(GiB)": 58.3, "step": 9205, "token_acc": 0.4405144694533762, "train_speed(iter/s)": 1.452387 }, { "epoch": 0.39458463647658626, "grad_norm": 5.136669158935547, "learning_rate": 9.847118690856903e-05, "loss": 2.4305177688598634, "memory(GiB)": 58.3, "step": 9210, "token_acc": 0.46381578947368424, "train_speed(iter/s)": 1.452453 }, { "epoch": 0.39479885180583524, "grad_norm": 3.6338794231414795, "learning_rate": 9.846953503181515e-05, "loss": 2.3469114303588867, "memory(GiB)": 58.3, "step": 9215, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.452559 }, { "epoch": 0.39501306713508416, "grad_norm": 3.952202796936035, "learning_rate": 9.846788227699017e-05, "loss": 2.719685745239258, "memory(GiB)": 58.3, "step": 9220, "token_acc": 0.41590214067278286, "train_speed(iter/s)": 1.452582 }, { "epoch": 0.39522728246433314, "grad_norm": 6.918583869934082, "learning_rate": 9.846622864412406e-05, "loss": 2.6086856842041017, "memory(GiB)": 58.3, "step": 9225, "token_acc": 0.45878136200716846, "train_speed(iter/s)": 1.452516 }, { "epoch": 0.3954414977935821, "grad_norm": 4.2701334953308105, "learning_rate": 9.846457413324675e-05, "loss": 2.6882152557373047, "memory(GiB)": 58.3, "step": 9230, "token_acc": 0.43097643097643096, "train_speed(iter/s)": 1.45238 }, { "epoch": 0.39565571312283104, "grad_norm": 3.582998037338257, "learning_rate": 9.84629187443882e-05, "loss": 2.6443161010742187, "memory(GiB)": 58.3, "step": 9235, "token_acc": 0.438871473354232, "train_speed(iter/s)": 1.452554 }, { "epoch": 0.39586992845208, "grad_norm": 4.341336250305176, "learning_rate": 9.846126247757843e-05, "loss": 2.3565851211547852, "memory(GiB)": 58.3, "step": 9240, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.452753 }, { "epoch": 0.396084143781329, "grad_norm": 4.659677982330322, "learning_rate": 9.845960533284742e-05, "loss": 2.480521392822266, "memory(GiB)": 58.3, "step": 9245, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.452623 }, { "epoch": 0.396298359110578, "grad_norm": 3.5144901275634766, "learning_rate": 9.84579473102252e-05, "loss": 1.987625503540039, "memory(GiB)": 58.3, "step": 9250, "token_acc": 0.5800865800865801, "train_speed(iter/s)": 1.452228 }, { "epoch": 0.3965125744398269, "grad_norm": 4.90177059173584, "learning_rate": 9.845628840974182e-05, "loss": 2.4235244750976563, "memory(GiB)": 58.3, "step": 9255, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.452361 }, { "epoch": 0.3967267897690759, "grad_norm": 4.903687477111816, "learning_rate": 9.84546286314273e-05, "loss": 2.8135444641113283, "memory(GiB)": 58.3, "step": 9260, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.452342 }, { "epoch": 0.39694100509832486, "grad_norm": 4.126767635345459, "learning_rate": 9.845296797531173e-05, "loss": 2.4977348327636717, "memory(GiB)": 58.3, "step": 9265, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.452403 }, { "epoch": 0.3971552204275738, "grad_norm": 3.942574977874756, "learning_rate": 9.845130644142522e-05, "loss": 2.568085861206055, "memory(GiB)": 58.3, "step": 9270, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.45252 }, { "epoch": 0.39736943575682276, "grad_norm": 4.052740573883057, "learning_rate": 9.844964402979781e-05, "loss": 2.3654411315917967, "memory(GiB)": 58.3, "step": 9275, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.452639 }, { "epoch": 0.39758365108607174, "grad_norm": 4.126075267791748, "learning_rate": 9.844798074045967e-05, "loss": 2.4733795166015624, "memory(GiB)": 58.3, "step": 9280, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.452459 }, { "epoch": 0.39779786641532067, "grad_norm": 5.828798294067383, "learning_rate": 9.84463165734409e-05, "loss": 2.325762176513672, "memory(GiB)": 58.3, "step": 9285, "token_acc": 0.5198675496688742, "train_speed(iter/s)": 1.452431 }, { "epoch": 0.39801208174456965, "grad_norm": 3.830606698989868, "learning_rate": 9.844465152877167e-05, "loss": 2.5024595260620117, "memory(GiB)": 58.3, "step": 9290, "token_acc": 0.4645390070921986, "train_speed(iter/s)": 1.452523 }, { "epoch": 0.3982262970738186, "grad_norm": 4.568037986755371, "learning_rate": 9.844298560648213e-05, "loss": 2.521292877197266, "memory(GiB)": 58.3, "step": 9295, "token_acc": 0.45323741007194246, "train_speed(iter/s)": 1.452491 }, { "epoch": 0.39844051240306755, "grad_norm": 3.5581140518188477, "learning_rate": 9.844131880660246e-05, "loss": 2.6647274017333986, "memory(GiB)": 58.3, "step": 9300, "token_acc": 0.4752851711026616, "train_speed(iter/s)": 1.452494 }, { "epoch": 0.3986547277323165, "grad_norm": 4.720825672149658, "learning_rate": 9.843965112916285e-05, "loss": 2.689658355712891, "memory(GiB)": 58.3, "step": 9305, "token_acc": 0.422680412371134, "train_speed(iter/s)": 1.45263 }, { "epoch": 0.3988689430615655, "grad_norm": 3.6050236225128174, "learning_rate": 9.843798257419352e-05, "loss": 2.657181167602539, "memory(GiB)": 58.3, "step": 9310, "token_acc": 0.46394984326018807, "train_speed(iter/s)": 1.452611 }, { "epoch": 0.39908315839081443, "grad_norm": 4.149864196777344, "learning_rate": 9.843631314172471e-05, "loss": 2.538443946838379, "memory(GiB)": 58.3, "step": 9315, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.452678 }, { "epoch": 0.3992973737200634, "grad_norm": 5.8810858726501465, "learning_rate": 9.843464283178665e-05, "loss": 2.6902797698974608, "memory(GiB)": 58.3, "step": 9320, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.452744 }, { "epoch": 0.3995115890493124, "grad_norm": 5.2606306076049805, "learning_rate": 9.843297164440959e-05, "loss": 2.446046829223633, "memory(GiB)": 58.3, "step": 9325, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.452755 }, { "epoch": 0.3997258043785613, "grad_norm": 3.9002749919891357, "learning_rate": 9.843129957962381e-05, "loss": 2.461764907836914, "memory(GiB)": 58.3, "step": 9330, "token_acc": 0.4511494252873563, "train_speed(iter/s)": 1.452673 }, { "epoch": 0.3999400197078103, "grad_norm": 3.7831766605377197, "learning_rate": 9.842962663745963e-05, "loss": 2.641637420654297, "memory(GiB)": 58.3, "step": 9335, "token_acc": 0.44299674267100975, "train_speed(iter/s)": 1.452426 }, { "epoch": 0.40015423503705927, "grad_norm": 4.262258529663086, "learning_rate": 9.842795281794732e-05, "loss": 2.7654655456542967, "memory(GiB)": 58.3, "step": 9340, "token_acc": 0.42718446601941745, "train_speed(iter/s)": 1.452414 }, { "epoch": 0.4003684503663082, "grad_norm": 3.4499847888946533, "learning_rate": 9.84262781211172e-05, "loss": 2.6401214599609375, "memory(GiB)": 58.3, "step": 9345, "token_acc": 0.41216216216216217, "train_speed(iter/s)": 1.452275 }, { "epoch": 0.40058266569555717, "grad_norm": 4.302866458892822, "learning_rate": 9.842460254699963e-05, "loss": 2.5645931243896483, "memory(GiB)": 58.3, "step": 9350, "token_acc": 0.484, "train_speed(iter/s)": 1.45239 }, { "epoch": 0.40079688102480615, "grad_norm": 4.590943813323975, "learning_rate": 9.842292609562498e-05, "loss": 2.5758602142333986, "memory(GiB)": 58.3, "step": 9355, "token_acc": 0.4460431654676259, "train_speed(iter/s)": 1.452489 }, { "epoch": 0.4010110963540551, "grad_norm": 3.751528263092041, "learning_rate": 9.84212487670236e-05, "loss": 2.50805778503418, "memory(GiB)": 58.3, "step": 9360, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.452515 }, { "epoch": 0.40122531168330405, "grad_norm": 2.8437740802764893, "learning_rate": 9.841957056122584e-05, "loss": 2.6894599914550783, "memory(GiB)": 58.3, "step": 9365, "token_acc": 0.4631268436578171, "train_speed(iter/s)": 1.452476 }, { "epoch": 0.40143952701255303, "grad_norm": 4.310125827789307, "learning_rate": 9.841789147826217e-05, "loss": 2.7040168762207033, "memory(GiB)": 58.3, "step": 9370, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.452376 }, { "epoch": 0.40165374234180196, "grad_norm": 3.55946683883667, "learning_rate": 9.841621151816296e-05, "loss": 2.5776132583618163, "memory(GiB)": 58.3, "step": 9375, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.452496 }, { "epoch": 0.40186795767105093, "grad_norm": 3.3330557346343994, "learning_rate": 9.841453068095867e-05, "loss": 2.650056838989258, "memory(GiB)": 58.3, "step": 9380, "token_acc": 0.397887323943662, "train_speed(iter/s)": 1.452514 }, { "epoch": 0.4020821730002999, "grad_norm": 4.043461322784424, "learning_rate": 9.841284896667973e-05, "loss": 2.3511322021484373, "memory(GiB)": 58.3, "step": 9385, "token_acc": 0.524, "train_speed(iter/s)": 1.452401 }, { "epoch": 0.40229638832954884, "grad_norm": 3.820551633834839, "learning_rate": 9.841116637535662e-05, "loss": 2.6962852478027344, "memory(GiB)": 58.3, "step": 9390, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.452407 }, { "epoch": 0.4025106036587978, "grad_norm": 3.8388638496398926, "learning_rate": 9.840948290701982e-05, "loss": 2.6646074295043944, "memory(GiB)": 58.3, "step": 9395, "token_acc": 0.4697986577181208, "train_speed(iter/s)": 1.452536 }, { "epoch": 0.4027248189880468, "grad_norm": 4.511655330657959, "learning_rate": 9.840779856169982e-05, "loss": 2.613489532470703, "memory(GiB)": 58.3, "step": 9400, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.4525 }, { "epoch": 0.4029390343172957, "grad_norm": 5.018468379974365, "learning_rate": 9.840611333942715e-05, "loss": 2.9536693572998045, "memory(GiB)": 58.3, "step": 9405, "token_acc": 0.44014084507042256, "train_speed(iter/s)": 1.452561 }, { "epoch": 0.4031532496465447, "grad_norm": 3.4255735874176025, "learning_rate": 9.840442724023232e-05, "loss": 2.7968017578125, "memory(GiB)": 58.3, "step": 9410, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.452641 }, { "epoch": 0.4033674649757937, "grad_norm": 6.153944969177246, "learning_rate": 9.840274026414589e-05, "loss": 2.592268943786621, "memory(GiB)": 58.3, "step": 9415, "token_acc": 0.4810126582278481, "train_speed(iter/s)": 1.452797 }, { "epoch": 0.40358168030504266, "grad_norm": 3.660299301147461, "learning_rate": 9.840105241119841e-05, "loss": 2.437905502319336, "memory(GiB)": 58.3, "step": 9420, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.452994 }, { "epoch": 0.4037958956342916, "grad_norm": 4.591060161590576, "learning_rate": 9.839936368142046e-05, "loss": 2.7226917266845705, "memory(GiB)": 58.3, "step": 9425, "token_acc": 0.45045045045045046, "train_speed(iter/s)": 1.453125 }, { "epoch": 0.40401011096354056, "grad_norm": 3.767965078353882, "learning_rate": 9.839767407484264e-05, "loss": 2.5389484405517577, "memory(GiB)": 58.3, "step": 9430, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.453173 }, { "epoch": 0.40422432629278954, "grad_norm": 4.654022693634033, "learning_rate": 9.839598359149556e-05, "loss": 2.8059436798095705, "memory(GiB)": 58.3, "step": 9435, "token_acc": 0.4511784511784512, "train_speed(iter/s)": 1.45322 }, { "epoch": 0.40443854162203846, "grad_norm": 3.7642242908477783, "learning_rate": 9.839429223140981e-05, "loss": 2.648666000366211, "memory(GiB)": 58.3, "step": 9440, "token_acc": 0.44886363636363635, "train_speed(iter/s)": 1.453134 }, { "epoch": 0.40465275695128744, "grad_norm": 3.5528132915496826, "learning_rate": 9.839259999461609e-05, "loss": 2.6834552764892576, "memory(GiB)": 58.3, "step": 9445, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.452979 }, { "epoch": 0.4048669722805364, "grad_norm": 3.637704372406006, "learning_rate": 9.839090688114501e-05, "loss": 2.749816131591797, "memory(GiB)": 58.3, "step": 9450, "token_acc": 0.43174603174603177, "train_speed(iter/s)": 1.452977 }, { "epoch": 0.40508118760978534, "grad_norm": 4.624016761779785, "learning_rate": 9.838921289102726e-05, "loss": 2.6001607894897463, "memory(GiB)": 58.3, "step": 9455, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.452984 }, { "epoch": 0.4052954029390343, "grad_norm": 4.223477363586426, "learning_rate": 9.838751802429352e-05, "loss": 2.517749214172363, "memory(GiB)": 58.3, "step": 9460, "token_acc": 0.4423791821561338, "train_speed(iter/s)": 1.452943 }, { "epoch": 0.4055096182682833, "grad_norm": 3.8644094467163086, "learning_rate": 9.83858222809745e-05, "loss": 2.6502376556396485, "memory(GiB)": 58.3, "step": 9465, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.452595 }, { "epoch": 0.4057238335975322, "grad_norm": 3.79787015914917, "learning_rate": 9.838412566110094e-05, "loss": 2.7505889892578126, "memory(GiB)": 58.3, "step": 9470, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.452699 }, { "epoch": 0.4059380489267812, "grad_norm": 4.230555534362793, "learning_rate": 9.838242816470353e-05, "loss": 2.776617431640625, "memory(GiB)": 58.3, "step": 9475, "token_acc": 0.43197278911564624, "train_speed(iter/s)": 1.45274 }, { "epoch": 0.4061522642560302, "grad_norm": 3.367751121520996, "learning_rate": 9.838072979181306e-05, "loss": 2.65602970123291, "memory(GiB)": 58.3, "step": 9480, "token_acc": 0.4222222222222222, "train_speed(iter/s)": 1.452776 }, { "epoch": 0.4063664795852791, "grad_norm": 4.935830593109131, "learning_rate": 9.83790305424603e-05, "loss": 2.667824935913086, "memory(GiB)": 58.3, "step": 9485, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.452834 }, { "epoch": 0.4065806949145281, "grad_norm": 4.7056965827941895, "learning_rate": 9.837733041667598e-05, "loss": 2.516107749938965, "memory(GiB)": 58.3, "step": 9490, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.452742 }, { "epoch": 0.40679491024377706, "grad_norm": 5.4926252365112305, "learning_rate": 9.837562941449096e-05, "loss": 2.472407913208008, "memory(GiB)": 58.3, "step": 9495, "token_acc": 0.44649446494464945, "train_speed(iter/s)": 1.452924 }, { "epoch": 0.407009125573026, "grad_norm": 3.6378047466278076, "learning_rate": 9.837392753593604e-05, "loss": 2.4419391632080076, "memory(GiB)": 58.3, "step": 9500, "token_acc": 0.5328185328185329, "train_speed(iter/s)": 1.452911 }, { "epoch": 0.407009125573026, "eval_loss": 2.20744252204895, "eval_runtime": 13.7695, "eval_samples_per_second": 7.262, "eval_steps_per_second": 7.262, "eval_token_acc": 0.47275204359673023, "step": 9500 }, { "epoch": 0.40722334090227497, "grad_norm": 4.563931941986084, "learning_rate": 9.837222478104205e-05, "loss": 2.410663032531738, "memory(GiB)": 58.3, "step": 9505, "token_acc": 0.4746450304259635, "train_speed(iter/s)": 1.449619 }, { "epoch": 0.40743755623152395, "grad_norm": 4.4739580154418945, "learning_rate": 9.837052114983982e-05, "loss": 2.921822738647461, "memory(GiB)": 58.3, "step": 9510, "token_acc": 0.44571428571428573, "train_speed(iter/s)": 1.449609 }, { "epoch": 0.40765177156077287, "grad_norm": 3.862518787384033, "learning_rate": 9.836881664236021e-05, "loss": 2.6591524124145507, "memory(GiB)": 58.3, "step": 9515, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.449542 }, { "epoch": 0.40786598689002185, "grad_norm": 3.1519999504089355, "learning_rate": 9.836711125863413e-05, "loss": 2.584940528869629, "memory(GiB)": 58.3, "step": 9520, "token_acc": 0.47, "train_speed(iter/s)": 1.449509 }, { "epoch": 0.4080802022192708, "grad_norm": 6.747684955596924, "learning_rate": 9.836540499869244e-05, "loss": 2.619213676452637, "memory(GiB)": 58.3, "step": 9525, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.449433 }, { "epoch": 0.40829441754851975, "grad_norm": 3.8937957286834717, "learning_rate": 9.83636978625661e-05, "loss": 2.3868877410888674, "memory(GiB)": 58.3, "step": 9530, "token_acc": 0.4637223974763407, "train_speed(iter/s)": 1.449305 }, { "epoch": 0.40850863287776873, "grad_norm": 3.600677013397217, "learning_rate": 9.836198985028597e-05, "loss": 2.573666572570801, "memory(GiB)": 58.3, "step": 9535, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 1.449465 }, { "epoch": 0.4087228482070177, "grad_norm": 4.229767322540283, "learning_rate": 9.836028096188306e-05, "loss": 2.581707000732422, "memory(GiB)": 58.3, "step": 9540, "token_acc": 0.462882096069869, "train_speed(iter/s)": 1.449476 }, { "epoch": 0.40893706353626663, "grad_norm": 6.386398792266846, "learning_rate": 9.835857119738827e-05, "loss": 2.889179229736328, "memory(GiB)": 58.3, "step": 9545, "token_acc": 0.41379310344827586, "train_speed(iter/s)": 1.449569 }, { "epoch": 0.4091512788655156, "grad_norm": 4.296133995056152, "learning_rate": 9.83568605568326e-05, "loss": 2.600038528442383, "memory(GiB)": 58.3, "step": 9550, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.449703 }, { "epoch": 0.4093654941947646, "grad_norm": 3.547651767730713, "learning_rate": 9.835514904024705e-05, "loss": 2.411071014404297, "memory(GiB)": 58.3, "step": 9555, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.449956 }, { "epoch": 0.4095797095240135, "grad_norm": 4.404221534729004, "learning_rate": 9.83534366476626e-05, "loss": 2.6571138381958006, "memory(GiB)": 58.3, "step": 9560, "token_acc": 0.4294478527607362, "train_speed(iter/s)": 1.449929 }, { "epoch": 0.4097939248532625, "grad_norm": 3.6686346530914307, "learning_rate": 9.83517233791103e-05, "loss": 2.6325796127319334, "memory(GiB)": 58.3, "step": 9565, "token_acc": 0.426056338028169, "train_speed(iter/s)": 1.449801 }, { "epoch": 0.41000814018251147, "grad_norm": 3.781003952026367, "learning_rate": 9.835000923462117e-05, "loss": 2.7300729751586914, "memory(GiB)": 58.3, "step": 9570, "token_acc": 0.4334470989761092, "train_speed(iter/s)": 1.449634 }, { "epoch": 0.4102223555117604, "grad_norm": 4.024383068084717, "learning_rate": 9.834829421422627e-05, "loss": 2.4482383728027344, "memory(GiB)": 58.3, "step": 9575, "token_acc": 0.494949494949495, "train_speed(iter/s)": 1.449674 }, { "epoch": 0.4104365708410094, "grad_norm": 3.7689874172210693, "learning_rate": 9.834657831795666e-05, "loss": 2.5864463806152345, "memory(GiB)": 58.3, "step": 9580, "token_acc": 0.5, "train_speed(iter/s)": 1.449801 }, { "epoch": 0.41065078617025835, "grad_norm": 4.039371490478516, "learning_rate": 9.834486154584342e-05, "loss": 2.506942367553711, "memory(GiB)": 58.3, "step": 9585, "token_acc": 0.5259515570934256, "train_speed(iter/s)": 1.44988 }, { "epoch": 0.41086500149950733, "grad_norm": 5.111746311187744, "learning_rate": 9.834314389791767e-05, "loss": 2.5287267684936525, "memory(GiB)": 58.3, "step": 9590, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.450055 }, { "epoch": 0.41107921682875626, "grad_norm": 3.6341724395751953, "learning_rate": 9.834142537421053e-05, "loss": 2.613294982910156, "memory(GiB)": 58.3, "step": 9595, "token_acc": 0.445141065830721, "train_speed(iter/s)": 1.450211 }, { "epoch": 0.41129343215800523, "grad_norm": 4.9661359786987305, "learning_rate": 9.833970597475311e-05, "loss": 2.364989471435547, "memory(GiB)": 58.3, "step": 9600, "token_acc": 0.4942084942084942, "train_speed(iter/s)": 1.450216 }, { "epoch": 0.4115076474872542, "grad_norm": 3.4044036865234375, "learning_rate": 9.833798569957657e-05, "loss": 2.380361557006836, "memory(GiB)": 58.3, "step": 9605, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.450301 }, { "epoch": 0.41172186281650314, "grad_norm": 2.8980321884155273, "learning_rate": 9.833626454871208e-05, "loss": 2.6926889419555664, "memory(GiB)": 58.3, "step": 9610, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.450566 }, { "epoch": 0.4119360781457521, "grad_norm": 3.441521644592285, "learning_rate": 9.833454252219082e-05, "loss": 2.644577217102051, "memory(GiB)": 58.3, "step": 9615, "token_acc": 0.49814126394052044, "train_speed(iter/s)": 1.450409 }, { "epoch": 0.4121502934750011, "grad_norm": 4.102413654327393, "learning_rate": 9.833281962004397e-05, "loss": 2.4470069885253904, "memory(GiB)": 58.3, "step": 9620, "token_acc": 0.5175097276264592, "train_speed(iter/s)": 1.450341 }, { "epoch": 0.41236450880425, "grad_norm": 4.925978183746338, "learning_rate": 9.833109584230275e-05, "loss": 2.8542598724365233, "memory(GiB)": 58.3, "step": 9625, "token_acc": 0.44715447154471544, "train_speed(iter/s)": 1.450291 }, { "epoch": 0.412578724133499, "grad_norm": 3.057582139968872, "learning_rate": 9.832937118899842e-05, "loss": 2.4771472930908205, "memory(GiB)": 58.3, "step": 9630, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.450556 }, { "epoch": 0.412792939462748, "grad_norm": 4.964419841766357, "learning_rate": 9.832764566016216e-05, "loss": 2.4388219833374025, "memory(GiB)": 58.3, "step": 9635, "token_acc": 0.5145985401459854, "train_speed(iter/s)": 1.450656 }, { "epoch": 0.4130071547919969, "grad_norm": 4.488737106323242, "learning_rate": 9.832591925582527e-05, "loss": 2.5482051849365233, "memory(GiB)": 58.3, "step": 9640, "token_acc": 0.4981549815498155, "train_speed(iter/s)": 1.450624 }, { "epoch": 0.4132213701212459, "grad_norm": 5.323754787445068, "learning_rate": 9.832419197601903e-05, "loss": 2.298923873901367, "memory(GiB)": 58.3, "step": 9645, "token_acc": 0.4779116465863454, "train_speed(iter/s)": 1.450678 }, { "epoch": 0.41343558545049486, "grad_norm": 4.610477924346924, "learning_rate": 9.832246382077471e-05, "loss": 2.7490882873535156, "memory(GiB)": 58.3, "step": 9650, "token_acc": 0.44, "train_speed(iter/s)": 1.450778 }, { "epoch": 0.4136498007797438, "grad_norm": 5.109628677368164, "learning_rate": 9.832073479012364e-05, "loss": 2.6343645095825194, "memory(GiB)": 58.3, "step": 9655, "token_acc": 0.45136186770428016, "train_speed(iter/s)": 1.450964 }, { "epoch": 0.41386401610899276, "grad_norm": 4.2545390129089355, "learning_rate": 9.83190048840971e-05, "loss": 2.8213424682617188, "memory(GiB)": 58.3, "step": 9660, "token_acc": 0.4553846153846154, "train_speed(iter/s)": 1.450974 }, { "epoch": 0.41407823143824174, "grad_norm": 3.911921501159668, "learning_rate": 9.83172741027265e-05, "loss": 2.620610237121582, "memory(GiB)": 58.3, "step": 9665, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.450997 }, { "epoch": 0.41429244676749066, "grad_norm": 3.68625545501709, "learning_rate": 9.831554244604313e-05, "loss": 2.453909492492676, "memory(GiB)": 58.3, "step": 9670, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.451004 }, { "epoch": 0.41450666209673964, "grad_norm": 4.247152328491211, "learning_rate": 9.831380991407841e-05, "loss": 2.911936569213867, "memory(GiB)": 58.3, "step": 9675, "token_acc": 0.4398496240601504, "train_speed(iter/s)": 1.45096 }, { "epoch": 0.4147208774259886, "grad_norm": 4.288780212402344, "learning_rate": 9.831207650686367e-05, "loss": 2.635361099243164, "memory(GiB)": 58.3, "step": 9680, "token_acc": 0.4391891891891892, "train_speed(iter/s)": 1.451075 }, { "epoch": 0.41493509275523754, "grad_norm": 3.842052459716797, "learning_rate": 9.831034222443037e-05, "loss": 2.533888244628906, "memory(GiB)": 58.3, "step": 9685, "token_acc": 0.524, "train_speed(iter/s)": 1.451013 }, { "epoch": 0.4151493080844865, "grad_norm": 3.9143149852752686, "learning_rate": 9.830860706680989e-05, "loss": 2.6112188339233398, "memory(GiB)": 58.3, "step": 9690, "token_acc": 0.49691358024691357, "train_speed(iter/s)": 1.451177 }, { "epoch": 0.4153635234137355, "grad_norm": 4.415402889251709, "learning_rate": 9.830687103403367e-05, "loss": 2.455311584472656, "memory(GiB)": 58.3, "step": 9695, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.451305 }, { "epoch": 0.4155777387429844, "grad_norm": 3.138246536254883, "learning_rate": 9.830513412613318e-05, "loss": 2.635904884338379, "memory(GiB)": 58.3, "step": 9700, "token_acc": 0.4420289855072464, "train_speed(iter/s)": 1.45111 }, { "epoch": 0.4157919540722334, "grad_norm": 5.393638610839844, "learning_rate": 9.830339634313985e-05, "loss": 2.5539560317993164, "memory(GiB)": 58.3, "step": 9705, "token_acc": 0.4887459807073955, "train_speed(iter/s)": 1.451068 }, { "epoch": 0.4160061694014824, "grad_norm": 3.9030025005340576, "learning_rate": 9.830165768508519e-05, "loss": 2.660310173034668, "memory(GiB)": 58.3, "step": 9710, "token_acc": 0.47413793103448276, "train_speed(iter/s)": 1.450961 }, { "epoch": 0.4162203847307313, "grad_norm": 6.049018859863281, "learning_rate": 9.82999181520007e-05, "loss": 2.5478237152099608, "memory(GiB)": 58.3, "step": 9715, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.451001 }, { "epoch": 0.4164346000599803, "grad_norm": 3.858489751815796, "learning_rate": 9.829817774391788e-05, "loss": 2.5838314056396485, "memory(GiB)": 58.3, "step": 9720, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.451008 }, { "epoch": 0.41664881538922927, "grad_norm": 4.7568440437316895, "learning_rate": 9.829643646086826e-05, "loss": 2.889063262939453, "memory(GiB)": 58.3, "step": 9725, "token_acc": 0.41924398625429554, "train_speed(iter/s)": 1.451026 }, { "epoch": 0.4168630307184782, "grad_norm": 4.264733791351318, "learning_rate": 9.829469430288338e-05, "loss": 2.687091064453125, "memory(GiB)": 58.3, "step": 9730, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.451126 }, { "epoch": 0.41707724604772717, "grad_norm": 3.2557778358459473, "learning_rate": 9.829295126999482e-05, "loss": 2.403067779541016, "memory(GiB)": 58.3, "step": 9735, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.451114 }, { "epoch": 0.41729146137697615, "grad_norm": 4.914039611816406, "learning_rate": 9.829120736223417e-05, "loss": 2.671504020690918, "memory(GiB)": 58.3, "step": 9740, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.451288 }, { "epoch": 0.41750567670622507, "grad_norm": 3.6416661739349365, "learning_rate": 9.828946257963296e-05, "loss": 2.2212257385253906, "memory(GiB)": 58.3, "step": 9745, "token_acc": 0.5387755102040817, "train_speed(iter/s)": 1.45145 }, { "epoch": 0.41771989203547405, "grad_norm": 8.937661170959473, "learning_rate": 9.828771692222284e-05, "loss": 2.4570201873779296, "memory(GiB)": 58.3, "step": 9750, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 1.451537 }, { "epoch": 0.41793410736472303, "grad_norm": 3.8024401664733887, "learning_rate": 9.828597039003544e-05, "loss": 2.5888465881347655, "memory(GiB)": 58.3, "step": 9755, "token_acc": 0.5019607843137255, "train_speed(iter/s)": 1.451665 }, { "epoch": 0.418148322693972, "grad_norm": 4.451294422149658, "learning_rate": 9.82842229831024e-05, "loss": 2.936477851867676, "memory(GiB)": 58.3, "step": 9760, "token_acc": 0.4294478527607362, "train_speed(iter/s)": 1.451735 }, { "epoch": 0.41836253802322093, "grad_norm": 4.3740434646606445, "learning_rate": 9.828247470145535e-05, "loss": 2.576313781738281, "memory(GiB)": 58.3, "step": 9765, "token_acc": 0.4979757085020243, "train_speed(iter/s)": 1.451647 }, { "epoch": 0.4185767533524699, "grad_norm": 5.103072166442871, "learning_rate": 9.828072554512597e-05, "loss": 2.6400827407836913, "memory(GiB)": 58.3, "step": 9770, "token_acc": 0.4847560975609756, "train_speed(iter/s)": 1.451642 }, { "epoch": 0.4187909686817189, "grad_norm": 4.11093282699585, "learning_rate": 9.827897551414598e-05, "loss": 2.6991107940673826, "memory(GiB)": 58.3, "step": 9775, "token_acc": 0.45544554455445546, "train_speed(iter/s)": 1.451696 }, { "epoch": 0.4190051840109678, "grad_norm": 4.389251232147217, "learning_rate": 9.827722460854705e-05, "loss": 2.7348217010498046, "memory(GiB)": 58.3, "step": 9780, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.451829 }, { "epoch": 0.4192193993402168, "grad_norm": 4.366090774536133, "learning_rate": 9.82754728283609e-05, "loss": 2.5567569732666016, "memory(GiB)": 58.3, "step": 9785, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.451781 }, { "epoch": 0.41943361466946577, "grad_norm": 3.707515239715576, "learning_rate": 9.827372017361929e-05, "loss": 2.6124616622924806, "memory(GiB)": 58.3, "step": 9790, "token_acc": 0.44936708860759494, "train_speed(iter/s)": 1.451855 }, { "epoch": 0.4196478299987147, "grad_norm": 4.596794605255127, "learning_rate": 9.827196664435394e-05, "loss": 2.4758602142333985, "memory(GiB)": 58.3, "step": 9795, "token_acc": 0.5219123505976095, "train_speed(iter/s)": 1.451924 }, { "epoch": 0.4198620453279637, "grad_norm": 5.132248401641846, "learning_rate": 9.827021224059663e-05, "loss": 2.538256072998047, "memory(GiB)": 58.3, "step": 9800, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.452101 }, { "epoch": 0.42007626065721265, "grad_norm": 3.3905787467956543, "learning_rate": 9.826845696237917e-05, "loss": 2.232204818725586, "memory(GiB)": 58.3, "step": 9805, "token_acc": 0.47580645161290325, "train_speed(iter/s)": 1.451935 }, { "epoch": 0.4202904759864616, "grad_norm": 3.9138851165771484, "learning_rate": 9.826670080973331e-05, "loss": 2.61285400390625, "memory(GiB)": 58.3, "step": 9810, "token_acc": 0.44150943396226416, "train_speed(iter/s)": 1.452012 }, { "epoch": 0.42050469131571055, "grad_norm": 5.183884620666504, "learning_rate": 9.826494378269088e-05, "loss": 2.547484588623047, "memory(GiB)": 58.3, "step": 9815, "token_acc": 0.467680608365019, "train_speed(iter/s)": 1.452109 }, { "epoch": 0.42071890664495953, "grad_norm": 4.3976850509643555, "learning_rate": 9.826318588128373e-05, "loss": 2.4847038269042967, "memory(GiB)": 58.3, "step": 9820, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.452278 }, { "epoch": 0.42093312197420846, "grad_norm": 4.496557712554932, "learning_rate": 9.82614271055437e-05, "loss": 2.5820926666259765, "memory(GiB)": 58.3, "step": 9825, "token_acc": 0.46441947565543074, "train_speed(iter/s)": 1.452321 }, { "epoch": 0.42114733730345744, "grad_norm": 6.010549068450928, "learning_rate": 9.825966745550262e-05, "loss": 2.8377025604248045, "memory(GiB)": 58.3, "step": 9830, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.45234 }, { "epoch": 0.4213615526327064, "grad_norm": 3.693732261657715, "learning_rate": 9.825790693119241e-05, "loss": 2.646694564819336, "memory(GiB)": 58.3, "step": 9835, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.452264 }, { "epoch": 0.42157576796195534, "grad_norm": 5.219442844390869, "learning_rate": 9.825614553264495e-05, "loss": 3.0060569763183596, "memory(GiB)": 58.3, "step": 9840, "token_acc": 0.42657342657342656, "train_speed(iter/s)": 1.45242 }, { "epoch": 0.4217899832912043, "grad_norm": 3.7272820472717285, "learning_rate": 9.825438325989214e-05, "loss": 2.4056961059570314, "memory(GiB)": 58.3, "step": 9845, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.452337 }, { "epoch": 0.4220041986204533, "grad_norm": 4.125674724578857, "learning_rate": 9.825262011296591e-05, "loss": 2.541624069213867, "memory(GiB)": 58.3, "step": 9850, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.452179 }, { "epoch": 0.4222184139497022, "grad_norm": 3.719965696334839, "learning_rate": 9.825085609189821e-05, "loss": 2.3237314224243164, "memory(GiB)": 58.3, "step": 9855, "token_acc": 0.5020746887966805, "train_speed(iter/s)": 1.452181 }, { "epoch": 0.4224326292789512, "grad_norm": 5.1334028244018555, "learning_rate": 9.824909119672098e-05, "loss": 2.254361152648926, "memory(GiB)": 58.3, "step": 9860, "token_acc": 0.4981132075471698, "train_speed(iter/s)": 1.452126 }, { "epoch": 0.4226468446082002, "grad_norm": 5.238258361816406, "learning_rate": 9.82473254274662e-05, "loss": 2.55759334564209, "memory(GiB)": 58.3, "step": 9865, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.452199 }, { "epoch": 0.4228610599374491, "grad_norm": 5.095531940460205, "learning_rate": 9.824555878416586e-05, "loss": 2.494092559814453, "memory(GiB)": 58.3, "step": 9870, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.452066 }, { "epoch": 0.4230752752666981, "grad_norm": 4.849524974822998, "learning_rate": 9.824379126685197e-05, "loss": 2.3354536056518556, "memory(GiB)": 58.3, "step": 9875, "token_acc": 0.5038461538461538, "train_speed(iter/s)": 1.452032 }, { "epoch": 0.42328949059594706, "grad_norm": 3.707350730895996, "learning_rate": 9.824202287555655e-05, "loss": 2.878459167480469, "memory(GiB)": 58.3, "step": 9880, "token_acc": 0.41208791208791207, "train_speed(iter/s)": 1.452004 }, { "epoch": 0.423503705925196, "grad_norm": 4.141573905944824, "learning_rate": 9.824025361031162e-05, "loss": 2.334977722167969, "memory(GiB)": 58.3, "step": 9885, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.451923 }, { "epoch": 0.42371792125444496, "grad_norm": 4.0235209465026855, "learning_rate": 9.823848347114925e-05, "loss": 2.7896957397460938, "memory(GiB)": 58.3, "step": 9890, "token_acc": 0.4198717948717949, "train_speed(iter/s)": 1.451988 }, { "epoch": 0.42393213658369394, "grad_norm": 3.8881659507751465, "learning_rate": 9.82367124581015e-05, "loss": 2.3459598541259767, "memory(GiB)": 58.3, "step": 9895, "token_acc": 0.48412698412698413, "train_speed(iter/s)": 1.4522 }, { "epoch": 0.42414635191294286, "grad_norm": 4.352472305297852, "learning_rate": 9.823494057120046e-05, "loss": 2.568310356140137, "memory(GiB)": 58.3, "step": 9900, "token_acc": 0.43214285714285716, "train_speed(iter/s)": 1.45229 }, { "epoch": 0.42436056724219184, "grad_norm": 4.939019680023193, "learning_rate": 9.82331678104782e-05, "loss": 2.5175588607788084, "memory(GiB)": 58.3, "step": 9905, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.452451 }, { "epoch": 0.4245747825714408, "grad_norm": 6.635680198669434, "learning_rate": 9.823139417596688e-05, "loss": 2.3019681930541993, "memory(GiB)": 58.3, "step": 9910, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.452353 }, { "epoch": 0.42478899790068975, "grad_norm": 4.656494617462158, "learning_rate": 9.822961966769861e-05, "loss": 2.536408042907715, "memory(GiB)": 58.3, "step": 9915, "token_acc": 0.45692883895131087, "train_speed(iter/s)": 1.452396 }, { "epoch": 0.4250032132299387, "grad_norm": 5.094083309173584, "learning_rate": 9.822784428570552e-05, "loss": 2.776628112792969, "memory(GiB)": 58.3, "step": 9920, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.452285 }, { "epoch": 0.4252174285591877, "grad_norm": 3.064948081970215, "learning_rate": 9.822606803001981e-05, "loss": 2.715955924987793, "memory(GiB)": 58.3, "step": 9925, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.452241 }, { "epoch": 0.4254316438884367, "grad_norm": 3.244896650314331, "learning_rate": 9.822429090067363e-05, "loss": 2.6002098083496095, "memory(GiB)": 58.3, "step": 9930, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.452365 }, { "epoch": 0.4256458592176856, "grad_norm": 4.495660305023193, "learning_rate": 9.822251289769917e-05, "loss": 2.604629707336426, "memory(GiB)": 58.3, "step": 9935, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.452571 }, { "epoch": 0.4258600745469346, "grad_norm": 6.743414878845215, "learning_rate": 9.822073402112867e-05, "loss": 2.359467697143555, "memory(GiB)": 58.3, "step": 9940, "token_acc": 0.46568627450980393, "train_speed(iter/s)": 1.452555 }, { "epoch": 0.42607428987618357, "grad_norm": 3.630054235458374, "learning_rate": 9.821895427099434e-05, "loss": 2.600129318237305, "memory(GiB)": 58.3, "step": 9945, "token_acc": 0.4326647564469914, "train_speed(iter/s)": 1.452691 }, { "epoch": 0.4262885052054325, "grad_norm": 3.3595852851867676, "learning_rate": 9.821717364732841e-05, "loss": 2.707257843017578, "memory(GiB)": 58.3, "step": 9950, "token_acc": 0.43653250773993807, "train_speed(iter/s)": 1.452842 }, { "epoch": 0.42650272053468147, "grad_norm": 4.772958755493164, "learning_rate": 9.821539215016314e-05, "loss": 2.568878936767578, "memory(GiB)": 58.3, "step": 9955, "token_acc": 0.42765273311897106, "train_speed(iter/s)": 1.452812 }, { "epoch": 0.42671693586393045, "grad_norm": 4.185490131378174, "learning_rate": 9.821360977953083e-05, "loss": 2.379243087768555, "memory(GiB)": 58.3, "step": 9960, "token_acc": 0.5126353790613718, "train_speed(iter/s)": 1.452913 }, { "epoch": 0.42693115119317937, "grad_norm": 3.699676752090454, "learning_rate": 9.821182653546374e-05, "loss": 2.3603675842285154, "memory(GiB)": 58.3, "step": 9965, "token_acc": 0.4533333333333333, "train_speed(iter/s)": 1.452909 }, { "epoch": 0.42714536652242835, "grad_norm": 4.556610107421875, "learning_rate": 9.821004241799419e-05, "loss": 2.3917613983154298, "memory(GiB)": 58.3, "step": 9970, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.452939 }, { "epoch": 0.42735958185167733, "grad_norm": 3.3329644203186035, "learning_rate": 9.82082574271545e-05, "loss": 2.701329231262207, "memory(GiB)": 58.3, "step": 9975, "token_acc": 0.4268774703557312, "train_speed(iter/s)": 1.452964 }, { "epoch": 0.42757379718092625, "grad_norm": 4.63325309753418, "learning_rate": 9.8206471562977e-05, "loss": 2.4283470153808593, "memory(GiB)": 58.3, "step": 9980, "token_acc": 0.4788135593220339, "train_speed(iter/s)": 1.45299 }, { "epoch": 0.42778801251017523, "grad_norm": 5.624799728393555, "learning_rate": 9.820468482549403e-05, "loss": 2.7866586685180663, "memory(GiB)": 58.3, "step": 9985, "token_acc": 0.4316109422492401, "train_speed(iter/s)": 1.453091 }, { "epoch": 0.4280022278394242, "grad_norm": 10.387460708618164, "learning_rate": 9.8202897214738e-05, "loss": 2.660850715637207, "memory(GiB)": 58.3, "step": 9990, "token_acc": 0.4364406779661017, "train_speed(iter/s)": 1.45329 }, { "epoch": 0.42821644316867313, "grad_norm": 5.224329471588135, "learning_rate": 9.820110873074127e-05, "loss": 2.52262020111084, "memory(GiB)": 58.3, "step": 9995, "token_acc": 0.4845814977973568, "train_speed(iter/s)": 1.453339 }, { "epoch": 0.4284306584979221, "grad_norm": 4.111954212188721, "learning_rate": 9.819931937353622e-05, "loss": 2.7449642181396485, "memory(GiB)": 58.3, "step": 10000, "token_acc": 0.448559670781893, "train_speed(iter/s)": 1.453436 }, { "epoch": 0.4284306584979221, "eval_loss": 2.2310776710510254, "eval_runtime": 14.2185, "eval_samples_per_second": 7.033, "eval_steps_per_second": 7.033, "eval_token_acc": 0.46911764705882353, "step": 10000 }, { "epoch": 0.4286448738271711, "grad_norm": 3.7548162937164307, "learning_rate": 9.81975291431553e-05, "loss": 2.677891159057617, "memory(GiB)": 58.3, "step": 10005, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.450476 }, { "epoch": 0.42885908915642, "grad_norm": 4.450655937194824, "learning_rate": 9.819573803963092e-05, "loss": 2.459446144104004, "memory(GiB)": 58.3, "step": 10010, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.450607 }, { "epoch": 0.429073304485669, "grad_norm": 5.074817180633545, "learning_rate": 9.819394606299552e-05, "loss": 2.340096092224121, "memory(GiB)": 58.3, "step": 10015, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.450687 }, { "epoch": 0.429287519814918, "grad_norm": 3.4895269870758057, "learning_rate": 9.819215321328159e-05, "loss": 2.4784250259399414, "memory(GiB)": 58.3, "step": 10020, "token_acc": 0.45791245791245794, "train_speed(iter/s)": 1.450856 }, { "epoch": 0.4295017351441669, "grad_norm": 4.1343488693237305, "learning_rate": 9.81903594905216e-05, "loss": 2.873653030395508, "memory(GiB)": 58.3, "step": 10025, "token_acc": 0.4229390681003584, "train_speed(iter/s)": 1.450903 }, { "epoch": 0.4297159504734159, "grad_norm": 3.646547794342041, "learning_rate": 9.818856489474803e-05, "loss": 2.7377740859985353, "memory(GiB)": 58.3, "step": 10030, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.451047 }, { "epoch": 0.42993016580266485, "grad_norm": 4.119231700897217, "learning_rate": 9.818676942599343e-05, "loss": 2.670035552978516, "memory(GiB)": 58.3, "step": 10035, "token_acc": 0.43526170798898073, "train_speed(iter/s)": 1.450986 }, { "epoch": 0.4301443811319138, "grad_norm": 5.240797519683838, "learning_rate": 9.818497308429028e-05, "loss": 2.6698062896728514, "memory(GiB)": 58.3, "step": 10040, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.451002 }, { "epoch": 0.43035859646116276, "grad_norm": 5.5719895362854, "learning_rate": 9.818317586967114e-05, "loss": 2.6206993103027343, "memory(GiB)": 58.3, "step": 10045, "token_acc": 0.46394984326018807, "train_speed(iter/s)": 1.450987 }, { "epoch": 0.43057281179041174, "grad_norm": 4.11723518371582, "learning_rate": 9.818137778216857e-05, "loss": 2.7491432189941407, "memory(GiB)": 58.3, "step": 10050, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.451135 }, { "epoch": 0.43078702711966066, "grad_norm": 4.059296131134033, "learning_rate": 9.817957882181514e-05, "loss": 2.349420166015625, "memory(GiB)": 58.3, "step": 10055, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.451191 }, { "epoch": 0.43100124244890964, "grad_norm": 4.097110271453857, "learning_rate": 9.817777898864345e-05, "loss": 2.798237991333008, "memory(GiB)": 58.3, "step": 10060, "token_acc": 0.4364820846905538, "train_speed(iter/s)": 1.451312 }, { "epoch": 0.4312154577781586, "grad_norm": 5.689593315124512, "learning_rate": 9.81759782826861e-05, "loss": 2.465549659729004, "memory(GiB)": 58.3, "step": 10065, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.451293 }, { "epoch": 0.43142967310740754, "grad_norm": 4.548140525817871, "learning_rate": 9.817417670397571e-05, "loss": 2.3398454666137694, "memory(GiB)": 58.3, "step": 10070, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.4512 }, { "epoch": 0.4316438884366565, "grad_norm": 6.6089863777160645, "learning_rate": 9.817237425254492e-05, "loss": 2.405929946899414, "memory(GiB)": 58.3, "step": 10075, "token_acc": 0.44654088050314467, "train_speed(iter/s)": 1.451241 }, { "epoch": 0.4318581037659055, "grad_norm": 5.215310573577881, "learning_rate": 9.817057092842639e-05, "loss": 2.610513114929199, "memory(GiB)": 58.3, "step": 10080, "token_acc": 0.44025157232704404, "train_speed(iter/s)": 1.451249 }, { "epoch": 0.4320723190951544, "grad_norm": 3.731516122817993, "learning_rate": 9.816876673165276e-05, "loss": 2.3214963912963866, "memory(GiB)": 58.3, "step": 10085, "token_acc": 0.4574898785425101, "train_speed(iter/s)": 1.451396 }, { "epoch": 0.4322865344244034, "grad_norm": 4.801403522491455, "learning_rate": 9.816696166225674e-05, "loss": 2.7218461990356446, "memory(GiB)": 58.3, "step": 10090, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.451422 }, { "epoch": 0.4325007497536524, "grad_norm": 4.4056878089904785, "learning_rate": 9.816515572027103e-05, "loss": 2.6250595092773437, "memory(GiB)": 58.3, "step": 10095, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.451457 }, { "epoch": 0.43271496508290136, "grad_norm": 5.308544158935547, "learning_rate": 9.816334890572834e-05, "loss": 2.5006891250610352, "memory(GiB)": 58.3, "step": 10100, "token_acc": 0.486013986013986, "train_speed(iter/s)": 1.451602 }, { "epoch": 0.4329291804121503, "grad_norm": 7.511530876159668, "learning_rate": 9.81615412186614e-05, "loss": 2.5856000900268556, "memory(GiB)": 58.3, "step": 10105, "token_acc": 0.46122448979591835, "train_speed(iter/s)": 1.451599 }, { "epoch": 0.43314339574139926, "grad_norm": 4.691821098327637, "learning_rate": 9.815973265910296e-05, "loss": 2.6388996124267576, "memory(GiB)": 58.3, "step": 10110, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.451403 }, { "epoch": 0.43335761107064824, "grad_norm": 4.410147190093994, "learning_rate": 9.815792322708579e-05, "loss": 2.643895721435547, "memory(GiB)": 58.3, "step": 10115, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.451398 }, { "epoch": 0.43357182639989716, "grad_norm": 4.606101989746094, "learning_rate": 9.815611292264267e-05, "loss": 2.6611881256103516, "memory(GiB)": 58.3, "step": 10120, "token_acc": 0.4620938628158845, "train_speed(iter/s)": 1.451477 }, { "epoch": 0.43378604172914614, "grad_norm": 4.348424911499023, "learning_rate": 9.815430174580638e-05, "loss": 2.3891780853271483, "memory(GiB)": 58.3, "step": 10125, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.451523 }, { "epoch": 0.4340002570583951, "grad_norm": 4.787311553955078, "learning_rate": 9.815248969660975e-05, "loss": 2.6099613189697264, "memory(GiB)": 58.3, "step": 10130, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.451583 }, { "epoch": 0.43421447238764405, "grad_norm": 4.865276336669922, "learning_rate": 9.81506767750856e-05, "loss": 2.622056770324707, "memory(GiB)": 58.3, "step": 10135, "token_acc": 0.47265625, "train_speed(iter/s)": 1.451635 }, { "epoch": 0.434428687716893, "grad_norm": 4.160851001739502, "learning_rate": 9.814886298126678e-05, "loss": 2.6428096771240233, "memory(GiB)": 58.3, "step": 10140, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.451525 }, { "epoch": 0.434642903046142, "grad_norm": 3.2754552364349365, "learning_rate": 9.814704831518612e-05, "loss": 2.5365121841430662, "memory(GiB)": 58.3, "step": 10145, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.451612 }, { "epoch": 0.4348571183753909, "grad_norm": 3.87518310546875, "learning_rate": 9.814523277687651e-05, "loss": 2.6110063552856446, "memory(GiB)": 58.3, "step": 10150, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.45182 }, { "epoch": 0.4350713337046399, "grad_norm": 4.890989303588867, "learning_rate": 9.814341636637085e-05, "loss": 3.039305305480957, "memory(GiB)": 58.3, "step": 10155, "token_acc": 0.460431654676259, "train_speed(iter/s)": 1.45179 }, { "epoch": 0.4352855490338889, "grad_norm": 4.779794692993164, "learning_rate": 9.814159908370206e-05, "loss": 2.5788080215454103, "memory(GiB)": 58.3, "step": 10160, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 1.451743 }, { "epoch": 0.4354997643631378, "grad_norm": 4.143568992614746, "learning_rate": 9.813978092890302e-05, "loss": 2.623064422607422, "memory(GiB)": 58.3, "step": 10165, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.451692 }, { "epoch": 0.4357139796923868, "grad_norm": 4.308685302734375, "learning_rate": 9.813796190200671e-05, "loss": 2.2172544479370115, "memory(GiB)": 58.3, "step": 10170, "token_acc": 0.5130111524163569, "train_speed(iter/s)": 1.451685 }, { "epoch": 0.43592819502163577, "grad_norm": 5.4449543952941895, "learning_rate": 9.813614200304604e-05, "loss": 2.7361270904541017, "memory(GiB)": 58.3, "step": 10175, "token_acc": 0.4318181818181818, "train_speed(iter/s)": 1.451728 }, { "epoch": 0.4361424103508847, "grad_norm": 4.251591682434082, "learning_rate": 9.813432123205401e-05, "loss": 2.722416114807129, "memory(GiB)": 58.3, "step": 10180, "token_acc": 0.44011976047904194, "train_speed(iter/s)": 1.451803 }, { "epoch": 0.43635662568013367, "grad_norm": 5.440887928009033, "learning_rate": 9.813249958906362e-05, "loss": 2.454940414428711, "memory(GiB)": 58.3, "step": 10185, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.451828 }, { "epoch": 0.43657084100938265, "grad_norm": 4.593027114868164, "learning_rate": 9.813067707410781e-05, "loss": 2.643841361999512, "memory(GiB)": 58.3, "step": 10190, "token_acc": 0.4645390070921986, "train_speed(iter/s)": 1.451931 }, { "epoch": 0.43678505633863157, "grad_norm": 5.123544692993164, "learning_rate": 9.812885368721966e-05, "loss": 2.8350969314575196, "memory(GiB)": 58.3, "step": 10195, "token_acc": 0.4200626959247649, "train_speed(iter/s)": 1.451857 }, { "epoch": 0.43699927166788055, "grad_norm": 5.503055572509766, "learning_rate": 9.812702942843218e-05, "loss": 2.770107460021973, "memory(GiB)": 58.3, "step": 10200, "token_acc": 0.4659400544959128, "train_speed(iter/s)": 1.45171 }, { "epoch": 0.43721348699712953, "grad_norm": 4.224015235900879, "learning_rate": 9.812520429777839e-05, "loss": 2.5481491088867188, "memory(GiB)": 58.3, "step": 10205, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.451527 }, { "epoch": 0.43742770232637845, "grad_norm": 4.281762599945068, "learning_rate": 9.81233782952914e-05, "loss": 2.680375671386719, "memory(GiB)": 58.3, "step": 10210, "token_acc": 0.4630681818181818, "train_speed(iter/s)": 1.451532 }, { "epoch": 0.43764191765562743, "grad_norm": 4.127501964569092, "learning_rate": 9.812155142100425e-05, "loss": 2.637327194213867, "memory(GiB)": 58.3, "step": 10215, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.451541 }, { "epoch": 0.4378561329848764, "grad_norm": 5.3888325691223145, "learning_rate": 9.811972367495008e-05, "loss": 2.6785909652709963, "memory(GiB)": 58.3, "step": 10220, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.451575 }, { "epoch": 0.43807034831412534, "grad_norm": 3.7547709941864014, "learning_rate": 9.811789505716195e-05, "loss": 2.6827533721923826, "memory(GiB)": 58.3, "step": 10225, "token_acc": 0.4696969696969697, "train_speed(iter/s)": 1.451577 }, { "epoch": 0.4382845636433743, "grad_norm": 6.084743499755859, "learning_rate": 9.811606556767303e-05, "loss": 2.6391664505004884, "memory(GiB)": 58.3, "step": 10230, "token_acc": 0.4576923076923077, "train_speed(iter/s)": 1.451655 }, { "epoch": 0.4384987789726233, "grad_norm": 3.2450554370880127, "learning_rate": 9.811423520651644e-05, "loss": 2.8701803207397463, "memory(GiB)": 58.3, "step": 10235, "token_acc": 0.4391691394658754, "train_speed(iter/s)": 1.451551 }, { "epoch": 0.4387129943018722, "grad_norm": 3.278779983520508, "learning_rate": 9.811240397372535e-05, "loss": 2.6099533081054687, "memory(GiB)": 58.3, "step": 10240, "token_acc": 0.42295081967213116, "train_speed(iter/s)": 1.4517 }, { "epoch": 0.4389272096311212, "grad_norm": 4.576379299163818, "learning_rate": 9.811057186933293e-05, "loss": 2.3236488342285155, "memory(GiB)": 58.3, "step": 10245, "token_acc": 0.5287356321839081, "train_speed(iter/s)": 1.451766 }, { "epoch": 0.4391414249603702, "grad_norm": 3.7729880809783936, "learning_rate": 9.810873889337235e-05, "loss": 2.6522123336791994, "memory(GiB)": 58.3, "step": 10250, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.451955 }, { "epoch": 0.4393556402896191, "grad_norm": 4.309666633605957, "learning_rate": 9.810690504587685e-05, "loss": 2.790571594238281, "memory(GiB)": 58.3, "step": 10255, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.452002 }, { "epoch": 0.4395698556188681, "grad_norm": 4.178102016448975, "learning_rate": 9.810507032687964e-05, "loss": 2.5907960891723634, "memory(GiB)": 58.3, "step": 10260, "token_acc": 0.45666666666666667, "train_speed(iter/s)": 1.451945 }, { "epoch": 0.43978407094811706, "grad_norm": 3.921598196029663, "learning_rate": 9.810323473641395e-05, "loss": 2.7393951416015625, "memory(GiB)": 58.3, "step": 10265, "token_acc": 0.42524916943521596, "train_speed(iter/s)": 1.451981 }, { "epoch": 0.43999828627736604, "grad_norm": 3.2552382946014404, "learning_rate": 9.810139827451305e-05, "loss": 2.5158832550048826, "memory(GiB)": 58.3, "step": 10270, "token_acc": 0.46785714285714286, "train_speed(iter/s)": 1.45196 }, { "epoch": 0.44021250160661496, "grad_norm": 3.122969627380371, "learning_rate": 9.809956094121017e-05, "loss": 2.4390892028808593, "memory(GiB)": 58.3, "step": 10275, "token_acc": 0.4697986577181208, "train_speed(iter/s)": 1.452051 }, { "epoch": 0.44042671693586394, "grad_norm": 3.823323965072632, "learning_rate": 9.809772273653866e-05, "loss": 2.5466094970703126, "memory(GiB)": 58.3, "step": 10280, "token_acc": 0.45093457943925236, "train_speed(iter/s)": 1.452017 }, { "epoch": 0.4406409322651129, "grad_norm": 3.602243661880493, "learning_rate": 9.809588366053175e-05, "loss": 2.7130664825439452, "memory(GiB)": 58.3, "step": 10285, "token_acc": 0.4392523364485981, "train_speed(iter/s)": 1.452128 }, { "epoch": 0.44085514759436184, "grad_norm": 3.871955156326294, "learning_rate": 9.80940437132228e-05, "loss": 2.9540714263916015, "memory(GiB)": 58.3, "step": 10290, "token_acc": 0.427536231884058, "train_speed(iter/s)": 1.45217 }, { "epoch": 0.4410693629236108, "grad_norm": 3.894803285598755, "learning_rate": 9.809220289464513e-05, "loss": 2.9123489379882814, "memory(GiB)": 58.3, "step": 10295, "token_acc": 0.4298780487804878, "train_speed(iter/s)": 1.451978 }, { "epoch": 0.4412835782528598, "grad_norm": 3.7303433418273926, "learning_rate": 9.809036120483211e-05, "loss": 2.538544845581055, "memory(GiB)": 58.3, "step": 10300, "token_acc": 0.5, "train_speed(iter/s)": 1.452051 }, { "epoch": 0.4414977935821087, "grad_norm": 4.130242347717285, "learning_rate": 9.808851864381706e-05, "loss": 2.7464969635009764, "memory(GiB)": 58.3, "step": 10305, "token_acc": 0.4792746113989637, "train_speed(iter/s)": 1.452174 }, { "epoch": 0.4417120089113577, "grad_norm": 3.701951742172241, "learning_rate": 9.80866752116334e-05, "loss": 2.5881362915039063, "memory(GiB)": 58.3, "step": 10310, "token_acc": 0.44984802431610943, "train_speed(iter/s)": 1.452229 }, { "epoch": 0.4419262242406067, "grad_norm": 4.804937839508057, "learning_rate": 9.80848309083145e-05, "loss": 2.648440361022949, "memory(GiB)": 58.3, "step": 10315, "token_acc": 0.44206008583690987, "train_speed(iter/s)": 1.452319 }, { "epoch": 0.4421404395698556, "grad_norm": 4.358564853668213, "learning_rate": 9.808298573389379e-05, "loss": 2.5335319519042967, "memory(GiB)": 58.3, "step": 10320, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.45236 }, { "epoch": 0.4423546548991046, "grad_norm": 4.186276912689209, "learning_rate": 9.808113968840468e-05, "loss": 2.5856439590454103, "memory(GiB)": 58.3, "step": 10325, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.452504 }, { "epoch": 0.44256887022835356, "grad_norm": 4.695544242858887, "learning_rate": 9.807929277188061e-05, "loss": 2.5881237030029296, "memory(GiB)": 58.3, "step": 10330, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.452594 }, { "epoch": 0.4427830855576025, "grad_norm": 3.1374800205230713, "learning_rate": 9.807744498435507e-05, "loss": 2.614379119873047, "memory(GiB)": 58.3, "step": 10335, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.452636 }, { "epoch": 0.44299730088685146, "grad_norm": 4.29791259765625, "learning_rate": 9.80755963258615e-05, "loss": 2.776248741149902, "memory(GiB)": 58.3, "step": 10340, "token_acc": 0.44285714285714284, "train_speed(iter/s)": 1.452854 }, { "epoch": 0.44321151621610044, "grad_norm": 4.541945934295654, "learning_rate": 9.807374679643342e-05, "loss": 2.270546531677246, "memory(GiB)": 58.3, "step": 10345, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.452839 }, { "epoch": 0.44342573154534937, "grad_norm": 3.594449520111084, "learning_rate": 9.80718963961043e-05, "loss": 2.5613216400146483, "memory(GiB)": 58.3, "step": 10350, "token_acc": 0.4953560371517028, "train_speed(iter/s)": 1.452746 }, { "epoch": 0.44363994687459835, "grad_norm": 3.237761974334717, "learning_rate": 9.807004512490769e-05, "loss": 2.365348052978516, "memory(GiB)": 58.3, "step": 10355, "token_acc": 0.5299145299145299, "train_speed(iter/s)": 1.452781 }, { "epoch": 0.4438541622038473, "grad_norm": 4.93245267868042, "learning_rate": 9.806819298287713e-05, "loss": 2.696757698059082, "memory(GiB)": 58.3, "step": 10360, "token_acc": 0.4382716049382716, "train_speed(iter/s)": 1.452898 }, { "epoch": 0.44406837753309625, "grad_norm": 5.638664722442627, "learning_rate": 9.806633997004615e-05, "loss": 2.884890556335449, "memory(GiB)": 58.3, "step": 10365, "token_acc": 0.42124542124542125, "train_speed(iter/s)": 1.45285 }, { "epoch": 0.4442825928623452, "grad_norm": 3.3431310653686523, "learning_rate": 9.806448608644834e-05, "loss": 2.3671947479248048, "memory(GiB)": 58.3, "step": 10370, "token_acc": 0.504424778761062, "train_speed(iter/s)": 1.452998 }, { "epoch": 0.4444968081915942, "grad_norm": 3.615079879760742, "learning_rate": 9.806263133211728e-05, "loss": 2.407727813720703, "memory(GiB)": 58.3, "step": 10375, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.453045 }, { "epoch": 0.44471102352084313, "grad_norm": 6.307542324066162, "learning_rate": 9.806077570708654e-05, "loss": 2.398736763000488, "memory(GiB)": 58.3, "step": 10380, "token_acc": 0.5284280936454849, "train_speed(iter/s)": 1.453167 }, { "epoch": 0.4449252388500921, "grad_norm": 6.1022443771362305, "learning_rate": 9.805891921138979e-05, "loss": 2.2987117767333984, "memory(GiB)": 58.3, "step": 10385, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.453223 }, { "epoch": 0.4451394541793411, "grad_norm": 5.117356300354004, "learning_rate": 9.805706184506062e-05, "loss": 2.508358955383301, "memory(GiB)": 58.3, "step": 10390, "token_acc": 0.46835443037974683, "train_speed(iter/s)": 1.453035 }, { "epoch": 0.44535366950859, "grad_norm": 4.126290321350098, "learning_rate": 9.805520360813272e-05, "loss": 2.717280960083008, "memory(GiB)": 58.3, "step": 10395, "token_acc": 0.44072948328267475, "train_speed(iter/s)": 1.4529 }, { "epoch": 0.445567884837839, "grad_norm": 5.412095069885254, "learning_rate": 9.80533445006397e-05, "loss": 2.6317028045654296, "memory(GiB)": 58.3, "step": 10400, "token_acc": 0.44074074074074077, "train_speed(iter/s)": 1.453116 }, { "epoch": 0.44578210016708797, "grad_norm": 8.25463581085205, "learning_rate": 9.805148452261528e-05, "loss": 2.9181276321411134, "memory(GiB)": 58.3, "step": 10405, "token_acc": 0.43812709030100333, "train_speed(iter/s)": 1.453152 }, { "epoch": 0.4459963154963369, "grad_norm": 3.7849009037017822, "learning_rate": 9.804962367409313e-05, "loss": 2.539438247680664, "memory(GiB)": 58.3, "step": 10410, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.453323 }, { "epoch": 0.44621053082558587, "grad_norm": 3.961578130722046, "learning_rate": 9.804776195510699e-05, "loss": 2.5199079513549805, "memory(GiB)": 58.3, "step": 10415, "token_acc": 0.5508474576271186, "train_speed(iter/s)": 1.453375 }, { "epoch": 0.44642474615483485, "grad_norm": 4.937049388885498, "learning_rate": 9.804589936569055e-05, "loss": 2.718492889404297, "memory(GiB)": 58.3, "step": 10420, "token_acc": 0.4318181818181818, "train_speed(iter/s)": 1.453055 }, { "epoch": 0.4466389614840838, "grad_norm": 3.9586424827575684, "learning_rate": 9.804403590587758e-05, "loss": 2.673126792907715, "memory(GiB)": 58.3, "step": 10425, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.452989 }, { "epoch": 0.44685317681333275, "grad_norm": 3.0467731952667236, "learning_rate": 9.804217157570184e-05, "loss": 2.875569152832031, "memory(GiB)": 58.3, "step": 10430, "token_acc": 0.4103448275862069, "train_speed(iter/s)": 1.452952 }, { "epoch": 0.44706739214258173, "grad_norm": 2.768760919570923, "learning_rate": 9.804030637519708e-05, "loss": 2.668725776672363, "memory(GiB)": 58.3, "step": 10435, "token_acc": 0.46175637393767704, "train_speed(iter/s)": 1.453009 }, { "epoch": 0.4472816074718307, "grad_norm": 3.534268617630005, "learning_rate": 9.803844030439711e-05, "loss": 2.4038856506347654, "memory(GiB)": 58.3, "step": 10440, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.453162 }, { "epoch": 0.44749582280107963, "grad_norm": 4.700990676879883, "learning_rate": 9.803657336333574e-05, "loss": 2.8660232543945314, "memory(GiB)": 58.3, "step": 10445, "token_acc": 0.4234527687296417, "train_speed(iter/s)": 1.453075 }, { "epoch": 0.4477100381303286, "grad_norm": 5.253242015838623, "learning_rate": 9.803470555204676e-05, "loss": 2.3562389373779298, "memory(GiB)": 58.3, "step": 10450, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.452904 }, { "epoch": 0.4479242534595776, "grad_norm": 14.840235710144043, "learning_rate": 9.803283687056404e-05, "loss": 2.645217704772949, "memory(GiB)": 58.3, "step": 10455, "token_acc": 0.43097643097643096, "train_speed(iter/s)": 1.45275 }, { "epoch": 0.4481384687888265, "grad_norm": 4.14026403427124, "learning_rate": 9.803096731892142e-05, "loss": 2.549029541015625, "memory(GiB)": 58.3, "step": 10460, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.452964 }, { "epoch": 0.4483526841180755, "grad_norm": 5.461249351501465, "learning_rate": 9.802909689715278e-05, "loss": 2.739639091491699, "memory(GiB)": 58.3, "step": 10465, "token_acc": 0.4558011049723757, "train_speed(iter/s)": 1.45317 }, { "epoch": 0.4485668994473245, "grad_norm": 4.478499412536621, "learning_rate": 9.802722560529199e-05, "loss": 2.686005401611328, "memory(GiB)": 58.3, "step": 10470, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.453096 }, { "epoch": 0.4487811147765734, "grad_norm": 6.938562870025635, "learning_rate": 9.802535344337296e-05, "loss": 2.4839391708374023, "memory(GiB)": 58.3, "step": 10475, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.452976 }, { "epoch": 0.4489953301058224, "grad_norm": 4.087958812713623, "learning_rate": 9.80234804114296e-05, "loss": 2.4675451278686524, "memory(GiB)": 58.3, "step": 10480, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.453059 }, { "epoch": 0.44920954543507136, "grad_norm": 4.140214443206787, "learning_rate": 9.802160650949584e-05, "loss": 2.826065254211426, "memory(GiB)": 58.3, "step": 10485, "token_acc": 0.44025157232704404, "train_speed(iter/s)": 1.453005 }, { "epoch": 0.4494237607643203, "grad_norm": 4.372977256774902, "learning_rate": 9.801973173760562e-05, "loss": 2.6633235931396486, "memory(GiB)": 58.3, "step": 10490, "token_acc": 0.459546925566343, "train_speed(iter/s)": 1.45318 }, { "epoch": 0.44963797609356926, "grad_norm": 6.438418865203857, "learning_rate": 9.801785609579292e-05, "loss": 2.941879653930664, "memory(GiB)": 58.3, "step": 10495, "token_acc": 0.44200626959247646, "train_speed(iter/s)": 1.453301 }, { "epoch": 0.44985219142281824, "grad_norm": 3.240180730819702, "learning_rate": 9.801597958409172e-05, "loss": 2.5285844802856445, "memory(GiB)": 58.3, "step": 10500, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.453377 }, { "epoch": 0.44985219142281824, "eval_loss": 2.126368522644043, "eval_runtime": 14.1191, "eval_samples_per_second": 7.083, "eval_steps_per_second": 7.083, "eval_token_acc": 0.49923195084485406, "step": 10500 }, { "epoch": 0.45006640675206716, "grad_norm": 3.5414857864379883, "learning_rate": 9.8014102202536e-05, "loss": 2.6892297744750975, "memory(GiB)": 58.3, "step": 10505, "token_acc": 0.48895582329317266, "train_speed(iter/s)": 1.450312 }, { "epoch": 0.45028062208131614, "grad_norm": 4.189624309539795, "learning_rate": 9.801222395115976e-05, "loss": 2.5900028228759764, "memory(GiB)": 58.3, "step": 10510, "token_acc": 0.45936395759717313, "train_speed(iter/s)": 1.450381 }, { "epoch": 0.4504948374105651, "grad_norm": 6.751174449920654, "learning_rate": 9.801034482999707e-05, "loss": 2.3130468368530273, "memory(GiB)": 58.3, "step": 10515, "token_acc": 0.4858757062146893, "train_speed(iter/s)": 1.450449 }, { "epoch": 0.45070905273981404, "grad_norm": 3.647382974624634, "learning_rate": 9.800846483908195e-05, "loss": 2.590544891357422, "memory(GiB)": 58.3, "step": 10520, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.450405 }, { "epoch": 0.450923268069063, "grad_norm": 5.126917362213135, "learning_rate": 9.800658397844844e-05, "loss": 2.6092994689941404, "memory(GiB)": 58.3, "step": 10525, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.450581 }, { "epoch": 0.451137483398312, "grad_norm": 4.543416500091553, "learning_rate": 9.800470224813064e-05, "loss": 2.889688491821289, "memory(GiB)": 58.3, "step": 10530, "token_acc": 0.44481605351170567, "train_speed(iter/s)": 1.450654 }, { "epoch": 0.4513516987275609, "grad_norm": 4.379642963409424, "learning_rate": 9.80028196481626e-05, "loss": 2.648379898071289, "memory(GiB)": 58.3, "step": 10535, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.450449 }, { "epoch": 0.4515659140568099, "grad_norm": 3.2970504760742188, "learning_rate": 9.800093617857846e-05, "loss": 2.5996782302856447, "memory(GiB)": 58.3, "step": 10540, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.450531 }, { "epoch": 0.4517801293860589, "grad_norm": 5.440197467803955, "learning_rate": 9.799905183941236e-05, "loss": 2.53688907623291, "memory(GiB)": 58.3, "step": 10545, "token_acc": 0.45774647887323944, "train_speed(iter/s)": 1.450623 }, { "epoch": 0.4519943447153078, "grad_norm": 4.509754657745361, "learning_rate": 9.799716663069838e-05, "loss": 2.78790397644043, "memory(GiB)": 58.3, "step": 10550, "token_acc": 0.46204620462046203, "train_speed(iter/s)": 1.450625 }, { "epoch": 0.4522085600445568, "grad_norm": 4.187043190002441, "learning_rate": 9.799528055247071e-05, "loss": 2.655309867858887, "memory(GiB)": 58.3, "step": 10555, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.450604 }, { "epoch": 0.45242277537380576, "grad_norm": 4.815025329589844, "learning_rate": 9.799339360476352e-05, "loss": 2.7851726531982424, "memory(GiB)": 58.3, "step": 10560, "token_acc": 0.44360902255639095, "train_speed(iter/s)": 1.450622 }, { "epoch": 0.4526369907030547, "grad_norm": 3.575446128845215, "learning_rate": 9.799150578761098e-05, "loss": 2.543497848510742, "memory(GiB)": 58.3, "step": 10565, "token_acc": 0.5, "train_speed(iter/s)": 1.450637 }, { "epoch": 0.45285120603230367, "grad_norm": 4.181159496307373, "learning_rate": 9.798961710104728e-05, "loss": 2.623805618286133, "memory(GiB)": 58.3, "step": 10570, "token_acc": 0.4753521126760563, "train_speed(iter/s)": 1.450629 }, { "epoch": 0.45306542136155264, "grad_norm": 3.958008050918579, "learning_rate": 9.798772754510666e-05, "loss": 2.475885200500488, "memory(GiB)": 58.3, "step": 10575, "token_acc": 0.46827794561933533, "train_speed(iter/s)": 1.450384 }, { "epoch": 0.45327963669080157, "grad_norm": 4.189507007598877, "learning_rate": 9.798583711982332e-05, "loss": 2.7183481216430665, "memory(GiB)": 58.3, "step": 10580, "token_acc": 0.46229508196721314, "train_speed(iter/s)": 1.450573 }, { "epoch": 0.45349385202005055, "grad_norm": 4.074063301086426, "learning_rate": 9.798394582523154e-05, "loss": 2.802981948852539, "memory(GiB)": 58.3, "step": 10585, "token_acc": 0.41534391534391535, "train_speed(iter/s)": 1.450591 }, { "epoch": 0.4537080673492995, "grad_norm": 5.128859043121338, "learning_rate": 9.798205366136558e-05, "loss": 2.7304380416870115, "memory(GiB)": 58.3, "step": 10590, "token_acc": 0.4405594405594406, "train_speed(iter/s)": 1.450703 }, { "epoch": 0.45392228267854845, "grad_norm": 3.6847312450408936, "learning_rate": 9.79801606282597e-05, "loss": 2.2724870681762694, "memory(GiB)": 58.3, "step": 10595, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.450779 }, { "epoch": 0.45413649800779743, "grad_norm": 4.458700656890869, "learning_rate": 9.797826672594819e-05, "loss": 2.497824478149414, "memory(GiB)": 58.3, "step": 10600, "token_acc": 0.48638132295719844, "train_speed(iter/s)": 1.450905 }, { "epoch": 0.4543507133370464, "grad_norm": 4.180672645568848, "learning_rate": 9.797637195446538e-05, "loss": 2.615709686279297, "memory(GiB)": 58.3, "step": 10605, "token_acc": 0.477124183006536, "train_speed(iter/s)": 1.450776 }, { "epoch": 0.4545649286662954, "grad_norm": 4.374013423919678, "learning_rate": 9.797447631384559e-05, "loss": 2.6567962646484373, "memory(GiB)": 58.3, "step": 10610, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.450747 }, { "epoch": 0.4547791439955443, "grad_norm": 4.041299343109131, "learning_rate": 9.797257980412316e-05, "loss": 2.621622657775879, "memory(GiB)": 58.3, "step": 10615, "token_acc": 0.4584837545126354, "train_speed(iter/s)": 1.450717 }, { "epoch": 0.4549933593247933, "grad_norm": 4.688261985778809, "learning_rate": 9.797068242533243e-05, "loss": 2.5129161834716798, "memory(GiB)": 58.3, "step": 10620, "token_acc": 0.46255506607929514, "train_speed(iter/s)": 1.450356 }, { "epoch": 0.45520757465404227, "grad_norm": 3.561720132827759, "learning_rate": 9.79687841775078e-05, "loss": 2.6358680725097656, "memory(GiB)": 58.3, "step": 10625, "token_acc": 0.4423791821561338, "train_speed(iter/s)": 1.450467 }, { "epoch": 0.4554217899832912, "grad_norm": 4.223676681518555, "learning_rate": 9.796688506068364e-05, "loss": 2.379704475402832, "memory(GiB)": 58.3, "step": 10630, "token_acc": 0.4536741214057508, "train_speed(iter/s)": 1.45058 }, { "epoch": 0.45563600531254017, "grad_norm": 5.094300746917725, "learning_rate": 9.796498507489436e-05, "loss": 2.7889373779296873, "memory(GiB)": 58.3, "step": 10635, "token_acc": 0.4532871972318339, "train_speed(iter/s)": 1.450619 }, { "epoch": 0.45585022064178915, "grad_norm": 4.634842395782471, "learning_rate": 9.79630842201744e-05, "loss": 2.478668785095215, "memory(GiB)": 58.3, "step": 10640, "token_acc": 0.46540880503144655, "train_speed(iter/s)": 1.450697 }, { "epoch": 0.4560644359710381, "grad_norm": 4.784429550170898, "learning_rate": 9.796118249655814e-05, "loss": 2.6598688125610352, "memory(GiB)": 58.3, "step": 10645, "token_acc": 0.486404833836858, "train_speed(iter/s)": 1.450567 }, { "epoch": 0.45627865130028705, "grad_norm": 3.1336419582366943, "learning_rate": 9.795927990408009e-05, "loss": 2.588586616516113, "memory(GiB)": 58.3, "step": 10650, "token_acc": 0.4773413897280967, "train_speed(iter/s)": 1.450478 }, { "epoch": 0.45649286662953603, "grad_norm": 4.033613204956055, "learning_rate": 9.79573764427747e-05, "loss": 2.7217081069946287, "memory(GiB)": 58.3, "step": 10655, "token_acc": 0.48638132295719844, "train_speed(iter/s)": 1.450615 }, { "epoch": 0.45670708195878496, "grad_norm": 5.214172840118408, "learning_rate": 9.795547211267643e-05, "loss": 2.4614585876464843, "memory(GiB)": 58.3, "step": 10660, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.450557 }, { "epoch": 0.45692129728803393, "grad_norm": 6.807183742523193, "learning_rate": 9.795356691381983e-05, "loss": 2.5423999786376954, "memory(GiB)": 58.3, "step": 10665, "token_acc": 0.4530612244897959, "train_speed(iter/s)": 1.450437 }, { "epoch": 0.4571355126172829, "grad_norm": 3.8348653316497803, "learning_rate": 9.795166084623934e-05, "loss": 2.6810192108154296, "memory(GiB)": 58.3, "step": 10670, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.450467 }, { "epoch": 0.45734972794653184, "grad_norm": 6.727038860321045, "learning_rate": 9.794975390996956e-05, "loss": 2.6809814453125, "memory(GiB)": 58.3, "step": 10675, "token_acc": 0.4462809917355372, "train_speed(iter/s)": 1.450499 }, { "epoch": 0.4575639432757808, "grad_norm": 4.346179008483887, "learning_rate": 9.7947846105045e-05, "loss": 2.444858932495117, "memory(GiB)": 58.3, "step": 10680, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.450585 }, { "epoch": 0.4577781586050298, "grad_norm": 6.229421615600586, "learning_rate": 9.794593743150022e-05, "loss": 2.836994171142578, "memory(GiB)": 58.3, "step": 10685, "token_acc": 0.43911439114391143, "train_speed(iter/s)": 1.450806 }, { "epoch": 0.4579923739342787, "grad_norm": 3.387035846710205, "learning_rate": 9.794402788936983e-05, "loss": 2.5257070541381834, "memory(GiB)": 58.3, "step": 10690, "token_acc": 0.48582995951417, "train_speed(iter/s)": 1.450953 }, { "epoch": 0.4582065892635277, "grad_norm": 3.9601640701293945, "learning_rate": 9.794211747868838e-05, "loss": 2.578817939758301, "memory(GiB)": 58.3, "step": 10695, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.450979 }, { "epoch": 0.4584208045927767, "grad_norm": 5.536830425262451, "learning_rate": 9.79402061994905e-05, "loss": 2.96250057220459, "memory(GiB)": 58.3, "step": 10700, "token_acc": 0.41132075471698115, "train_speed(iter/s)": 1.451014 }, { "epoch": 0.4586350199220256, "grad_norm": 4.493021488189697, "learning_rate": 9.793829405181081e-05, "loss": 2.4744644165039062, "memory(GiB)": 58.3, "step": 10705, "token_acc": 0.49, "train_speed(iter/s)": 1.451083 }, { "epoch": 0.4588492352512746, "grad_norm": 4.748714923858643, "learning_rate": 9.793638103568397e-05, "loss": 2.5041988372802733, "memory(GiB)": 58.3, "step": 10710, "token_acc": 0.46629213483146065, "train_speed(iter/s)": 1.451054 }, { "epoch": 0.45906345058052356, "grad_norm": 3.1664113998413086, "learning_rate": 9.79344671511446e-05, "loss": 2.6128122329711916, "memory(GiB)": 58.3, "step": 10715, "token_acc": 0.46006389776357826, "train_speed(iter/s)": 1.450868 }, { "epoch": 0.4592776659097725, "grad_norm": 5.172250747680664, "learning_rate": 9.79325523982274e-05, "loss": 2.7653314590454103, "memory(GiB)": 58.3, "step": 10720, "token_acc": 0.4651898734177215, "train_speed(iter/s)": 1.450748 }, { "epoch": 0.45949188123902146, "grad_norm": 4.8607001304626465, "learning_rate": 9.793063677696706e-05, "loss": 2.7596765518188477, "memory(GiB)": 58.3, "step": 10725, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.450681 }, { "epoch": 0.45970609656827044, "grad_norm": 4.228477954864502, "learning_rate": 9.792872028739826e-05, "loss": 2.715163803100586, "memory(GiB)": 58.3, "step": 10730, "token_acc": 0.43234323432343236, "train_speed(iter/s)": 1.450609 }, { "epoch": 0.45992031189751936, "grad_norm": 4.469358444213867, "learning_rate": 9.792680292955571e-05, "loss": 2.675069046020508, "memory(GiB)": 58.3, "step": 10735, "token_acc": 0.4405594405594406, "train_speed(iter/s)": 1.450748 }, { "epoch": 0.46013452722676834, "grad_norm": 3.619096517562866, "learning_rate": 9.792488470347421e-05, "loss": 2.6168188095092773, "memory(GiB)": 58.3, "step": 10740, "token_acc": 0.45084745762711864, "train_speed(iter/s)": 1.450713 }, { "epoch": 0.4603487425560173, "grad_norm": 3.301767587661743, "learning_rate": 9.792296560918844e-05, "loss": 2.9165679931640627, "memory(GiB)": 58.3, "step": 10745, "token_acc": 0.42084942084942084, "train_speed(iter/s)": 1.450789 }, { "epoch": 0.46056295788526624, "grad_norm": 4.956804275512695, "learning_rate": 9.792104564673319e-05, "loss": 2.60153751373291, "memory(GiB)": 58.3, "step": 10750, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.450813 }, { "epoch": 0.4607771732145152, "grad_norm": 3.9421226978302, "learning_rate": 9.791912481614324e-05, "loss": 2.554246520996094, "memory(GiB)": 58.3, "step": 10755, "token_acc": 0.4550898203592814, "train_speed(iter/s)": 1.450818 }, { "epoch": 0.4609913885437642, "grad_norm": 6.13618278503418, "learning_rate": 9.791720311745342e-05, "loss": 2.6295148849487306, "memory(GiB)": 58.3, "step": 10760, "token_acc": 0.5245901639344263, "train_speed(iter/s)": 1.450973 }, { "epoch": 0.4612056038730131, "grad_norm": 4.67579460144043, "learning_rate": 9.791528055069849e-05, "loss": 2.4764760971069335, "memory(GiB)": 58.3, "step": 10765, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.451022 }, { "epoch": 0.4614198192022621, "grad_norm": 5.211319446563721, "learning_rate": 9.791335711591332e-05, "loss": 2.703862190246582, "memory(GiB)": 58.3, "step": 10770, "token_acc": 0.42948717948717946, "train_speed(iter/s)": 1.45096 }, { "epoch": 0.4616340345315111, "grad_norm": 4.508678913116455, "learning_rate": 9.791143281313274e-05, "loss": 2.5066123962402345, "memory(GiB)": 58.3, "step": 10775, "token_acc": 0.4508670520231214, "train_speed(iter/s)": 1.450878 }, { "epoch": 0.46184824986076006, "grad_norm": 3.7897000312805176, "learning_rate": 9.79095076423916e-05, "loss": 2.6497310638427733, "memory(GiB)": 58.3, "step": 10780, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.450857 }, { "epoch": 0.462062465190009, "grad_norm": 3.7451624870300293, "learning_rate": 9.790758160372479e-05, "loss": 2.488443374633789, "memory(GiB)": 58.3, "step": 10785, "token_acc": 0.47232472324723246, "train_speed(iter/s)": 1.450973 }, { "epoch": 0.46227668051925797, "grad_norm": 5.758881568908691, "learning_rate": 9.79056546971672e-05, "loss": 2.585683059692383, "memory(GiB)": 58.3, "step": 10790, "token_acc": 0.44565217391304346, "train_speed(iter/s)": 1.450978 }, { "epoch": 0.46249089584850694, "grad_norm": 3.644552230834961, "learning_rate": 9.790372692275374e-05, "loss": 2.5159221649169923, "memory(GiB)": 58.3, "step": 10795, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.451049 }, { "epoch": 0.46270511117775587, "grad_norm": 4.923215389251709, "learning_rate": 9.790179828051931e-05, "loss": 2.3791568756103514, "memory(GiB)": 58.3, "step": 10800, "token_acc": 0.4703703703703704, "train_speed(iter/s)": 1.451007 }, { "epoch": 0.46291932650700485, "grad_norm": 3.903329849243164, "learning_rate": 9.789986877049888e-05, "loss": 2.612640380859375, "memory(GiB)": 58.3, "step": 10805, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.451101 }, { "epoch": 0.4631335418362538, "grad_norm": 4.178727149963379, "learning_rate": 9.78979383927274e-05, "loss": 2.5980998992919924, "memory(GiB)": 58.3, "step": 10810, "token_acc": 0.44193548387096776, "train_speed(iter/s)": 1.450968 }, { "epoch": 0.46334775716550275, "grad_norm": 3.5005838871002197, "learning_rate": 9.789600714723983e-05, "loss": 2.5595413208007813, "memory(GiB)": 58.3, "step": 10815, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.450794 }, { "epoch": 0.46356197249475173, "grad_norm": 3.065406560897827, "learning_rate": 9.789407503407115e-05, "loss": 2.8204328536987306, "memory(GiB)": 58.3, "step": 10820, "token_acc": 0.4253164556962025, "train_speed(iter/s)": 1.451003 }, { "epoch": 0.4637761878240007, "grad_norm": 5.285351276397705, "learning_rate": 9.789214205325638e-05, "loss": 2.522801971435547, "memory(GiB)": 58.3, "step": 10825, "token_acc": 0.46229508196721314, "train_speed(iter/s)": 1.450962 }, { "epoch": 0.46399040315324963, "grad_norm": 3.872533082962036, "learning_rate": 9.789020820483055e-05, "loss": 2.9009300231933595, "memory(GiB)": 58.3, "step": 10830, "token_acc": 0.375, "train_speed(iter/s)": 1.450856 }, { "epoch": 0.4642046184824986, "grad_norm": 3.1430745124816895, "learning_rate": 9.788827348882865e-05, "loss": 2.362409973144531, "memory(GiB)": 58.3, "step": 10835, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.451044 }, { "epoch": 0.4644188338117476, "grad_norm": 3.7982048988342285, "learning_rate": 9.788633790528576e-05, "loss": 2.4230241775512695, "memory(GiB)": 58.3, "step": 10840, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.451088 }, { "epoch": 0.4646330491409965, "grad_norm": 5.107214450836182, "learning_rate": 9.788440145423695e-05, "loss": 2.7976070404052735, "memory(GiB)": 58.3, "step": 10845, "token_acc": 0.4944649446494465, "train_speed(iter/s)": 1.45126 }, { "epoch": 0.4648472644702455, "grad_norm": 4.19752311706543, "learning_rate": 9.788246413571727e-05, "loss": 2.6039764404296877, "memory(GiB)": 58.3, "step": 10850, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.451428 }, { "epoch": 0.46506147979949447, "grad_norm": 9.007368087768555, "learning_rate": 9.788052594976184e-05, "loss": 2.68975715637207, "memory(GiB)": 58.3, "step": 10855, "token_acc": 0.4396887159533074, "train_speed(iter/s)": 1.451448 }, { "epoch": 0.4652756951287434, "grad_norm": 6.284673690795898, "learning_rate": 9.787858689640577e-05, "loss": 2.6068992614746094, "memory(GiB)": 58.3, "step": 10860, "token_acc": 0.44285714285714284, "train_speed(iter/s)": 1.451358 }, { "epoch": 0.4654899104579924, "grad_norm": 4.84188985824585, "learning_rate": 9.787664697568418e-05, "loss": 2.7556400299072266, "memory(GiB)": 58.3, "step": 10865, "token_acc": 0.4426229508196721, "train_speed(iter/s)": 1.451505 }, { "epoch": 0.46570412578724135, "grad_norm": 4.268581867218018, "learning_rate": 9.787470618763222e-05, "loss": 2.7689865112304686, "memory(GiB)": 58.3, "step": 10870, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.451717 }, { "epoch": 0.4659183411164903, "grad_norm": 3.183107614517212, "learning_rate": 9.787276453228504e-05, "loss": 2.609415817260742, "memory(GiB)": 58.3, "step": 10875, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.451829 }, { "epoch": 0.46613255644573925, "grad_norm": 8.195131301879883, "learning_rate": 9.787082200967784e-05, "loss": 2.664288330078125, "memory(GiB)": 58.3, "step": 10880, "token_acc": 0.4251497005988024, "train_speed(iter/s)": 1.452029 }, { "epoch": 0.46634677177498823, "grad_norm": 4.695074558258057, "learning_rate": 9.786887861984578e-05, "loss": 2.6576133728027345, "memory(GiB)": 58.3, "step": 10885, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.452024 }, { "epoch": 0.46656098710423716, "grad_norm": 4.121676445007324, "learning_rate": 9.786693436282408e-05, "loss": 2.577968978881836, "memory(GiB)": 58.3, "step": 10890, "token_acc": 0.4979919678714859, "train_speed(iter/s)": 1.452152 }, { "epoch": 0.46677520243348614, "grad_norm": 3.376992702484131, "learning_rate": 9.786498923864796e-05, "loss": 2.4011980056762696, "memory(GiB)": 58.3, "step": 10895, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.45232 }, { "epoch": 0.4669894177627351, "grad_norm": 3.7245006561279297, "learning_rate": 9.786304324735267e-05, "loss": 2.729457473754883, "memory(GiB)": 58.3, "step": 10900, "token_acc": 0.48623853211009177, "train_speed(iter/s)": 1.452395 }, { "epoch": 0.46720363309198404, "grad_norm": 3.90267276763916, "learning_rate": 9.786109638897344e-05, "loss": 2.938713264465332, "memory(GiB)": 58.3, "step": 10905, "token_acc": 0.44108761329305135, "train_speed(iter/s)": 1.452512 }, { "epoch": 0.467417848421233, "grad_norm": 4.368808269500732, "learning_rate": 9.785914866354556e-05, "loss": 3.125321960449219, "memory(GiB)": 58.3, "step": 10910, "token_acc": 0.3785900783289817, "train_speed(iter/s)": 1.452511 }, { "epoch": 0.467632063750482, "grad_norm": 5.022944927215576, "learning_rate": 9.78572000711043e-05, "loss": 2.748683738708496, "memory(GiB)": 58.3, "step": 10915, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.452626 }, { "epoch": 0.4678462790797309, "grad_norm": 3.6183857917785645, "learning_rate": 9.785525061168497e-05, "loss": 2.657187652587891, "memory(GiB)": 58.3, "step": 10920, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.452518 }, { "epoch": 0.4680604944089799, "grad_norm": 4.752320289611816, "learning_rate": 9.785330028532288e-05, "loss": 2.315539741516113, "memory(GiB)": 58.3, "step": 10925, "token_acc": 0.5018181818181818, "train_speed(iter/s)": 1.45255 }, { "epoch": 0.4682747097382289, "grad_norm": 4.117373943328857, "learning_rate": 9.785134909205337e-05, "loss": 2.528944396972656, "memory(GiB)": 58.3, "step": 10930, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.4526 }, { "epoch": 0.4684889250674778, "grad_norm": 5.804306507110596, "learning_rate": 9.784939703191179e-05, "loss": 2.4984342575073244, "memory(GiB)": 58.3, "step": 10935, "token_acc": 0.5, "train_speed(iter/s)": 1.452635 }, { "epoch": 0.4687031403967268, "grad_norm": 3.236720561981201, "learning_rate": 9.784744410493348e-05, "loss": 2.6287277221679686, "memory(GiB)": 58.3, "step": 10940, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.452843 }, { "epoch": 0.46891735572597576, "grad_norm": 5.333811283111572, "learning_rate": 9.784549031115384e-05, "loss": 2.7113250732421874, "memory(GiB)": 58.3, "step": 10945, "token_acc": 0.44964028776978415, "train_speed(iter/s)": 1.452899 }, { "epoch": 0.46913157105522474, "grad_norm": 4.313626766204834, "learning_rate": 9.784353565060826e-05, "loss": 2.402693176269531, "memory(GiB)": 58.3, "step": 10950, "token_acc": 0.49421965317919075, "train_speed(iter/s)": 1.452959 }, { "epoch": 0.46934578638447366, "grad_norm": 3.7378852367401123, "learning_rate": 9.784158012333216e-05, "loss": 2.800084686279297, "memory(GiB)": 58.3, "step": 10955, "token_acc": 0.4424778761061947, "train_speed(iter/s)": 1.452848 }, { "epoch": 0.46956000171372264, "grad_norm": 3.3986082077026367, "learning_rate": 9.783962372936095e-05, "loss": 2.5746301651000976, "memory(GiB)": 58.3, "step": 10960, "token_acc": 0.4608695652173913, "train_speed(iter/s)": 1.453001 }, { "epoch": 0.4697742170429716, "grad_norm": 5.421916484832764, "learning_rate": 9.783766646873008e-05, "loss": 2.3186120986938477, "memory(GiB)": 58.3, "step": 10965, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.452974 }, { "epoch": 0.46998843237222054, "grad_norm": 4.455541133880615, "learning_rate": 9.7835708341475e-05, "loss": 2.440431594848633, "memory(GiB)": 58.3, "step": 10970, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.453093 }, { "epoch": 0.4702026477014695, "grad_norm": 4.785274028778076, "learning_rate": 9.78337493476312e-05, "loss": 2.561728858947754, "memory(GiB)": 58.3, "step": 10975, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.453272 }, { "epoch": 0.4704168630307185, "grad_norm": 6.452266693115234, "learning_rate": 9.783178948723415e-05, "loss": 2.5045833587646484, "memory(GiB)": 58.3, "step": 10980, "token_acc": 0.43728813559322033, "train_speed(iter/s)": 1.453093 }, { "epoch": 0.4706310783599674, "grad_norm": 4.0317511558532715, "learning_rate": 9.782982876031938e-05, "loss": 2.4897130966186523, "memory(GiB)": 58.3, "step": 10985, "token_acc": 0.44727272727272727, "train_speed(iter/s)": 1.452998 }, { "epoch": 0.4708452936892164, "grad_norm": 9.115554809570312, "learning_rate": 9.782786716692239e-05, "loss": 2.3984521865844726, "memory(GiB)": 58.3, "step": 10990, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.452987 }, { "epoch": 0.4710595090184654, "grad_norm": 4.206314563751221, "learning_rate": 9.782590470707871e-05, "loss": 2.9820087432861326, "memory(GiB)": 58.3, "step": 10995, "token_acc": 0.4431137724550898, "train_speed(iter/s)": 1.45302 }, { "epoch": 0.4712737243477143, "grad_norm": 5.37673282623291, "learning_rate": 9.78239413808239e-05, "loss": 2.4091548919677734, "memory(GiB)": 58.3, "step": 11000, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.45301 }, { "epoch": 0.4712737243477143, "eval_loss": 2.286198616027832, "eval_runtime": 14.3672, "eval_samples_per_second": 6.96, "eval_steps_per_second": 6.96, "eval_token_acc": 0.49477806788511747, "step": 11000 }, { "epoch": 0.4714879396769633, "grad_norm": 4.41751766204834, "learning_rate": 9.782197718819352e-05, "loss": 2.7890464782714846, "memory(GiB)": 58.3, "step": 11005, "token_acc": 0.48725212464589235, "train_speed(iter/s)": 1.450004 }, { "epoch": 0.47170215500621226, "grad_norm": 5.613940238952637, "learning_rate": 9.782001212922319e-05, "loss": 2.665138053894043, "memory(GiB)": 58.3, "step": 11010, "token_acc": 0.4372623574144487, "train_speed(iter/s)": 1.450127 }, { "epoch": 0.4719163703354612, "grad_norm": 5.2741265296936035, "learning_rate": 9.781804620394847e-05, "loss": 2.850432586669922, "memory(GiB)": 58.3, "step": 11015, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.450154 }, { "epoch": 0.47213058566471017, "grad_norm": 4.864946365356445, "learning_rate": 9.781607941240498e-05, "loss": 2.886791229248047, "memory(GiB)": 58.3, "step": 11020, "token_acc": 0.41904761904761906, "train_speed(iter/s)": 1.450107 }, { "epoch": 0.47234480099395915, "grad_norm": 3.490731716156006, "learning_rate": 9.781411175462836e-05, "loss": 2.668185234069824, "memory(GiB)": 58.3, "step": 11025, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.450274 }, { "epoch": 0.47255901632320807, "grad_norm": 4.4292097091674805, "learning_rate": 9.781214323065426e-05, "loss": 2.3660659790039062, "memory(GiB)": 58.3, "step": 11030, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.450323 }, { "epoch": 0.47277323165245705, "grad_norm": 9.111448287963867, "learning_rate": 9.781017384051832e-05, "loss": 2.639826774597168, "memory(GiB)": 58.3, "step": 11035, "token_acc": 0.4574898785425101, "train_speed(iter/s)": 1.450164 }, { "epoch": 0.47298744698170603, "grad_norm": 4.101097106933594, "learning_rate": 9.780820358425625e-05, "loss": 2.7708057403564452, "memory(GiB)": 58.3, "step": 11040, "token_acc": 0.4119718309859155, "train_speed(iter/s)": 1.450106 }, { "epoch": 0.47320166231095495, "grad_norm": 4.074931621551514, "learning_rate": 9.780623246190371e-05, "loss": 2.6240386962890625, "memory(GiB)": 58.3, "step": 11045, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.450164 }, { "epoch": 0.47341587764020393, "grad_norm": 4.386037349700928, "learning_rate": 9.780426047349642e-05, "loss": 2.390812301635742, "memory(GiB)": 58.3, "step": 11050, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.450212 }, { "epoch": 0.4736300929694529, "grad_norm": 3.773947238922119, "learning_rate": 9.78022876190701e-05, "loss": 2.5393697738647463, "memory(GiB)": 58.3, "step": 11055, "token_acc": 0.44360902255639095, "train_speed(iter/s)": 1.450069 }, { "epoch": 0.47384430829870183, "grad_norm": 4.300468921661377, "learning_rate": 9.780031389866053e-05, "loss": 2.258570098876953, "memory(GiB)": 58.3, "step": 11060, "token_acc": 0.5281385281385281, "train_speed(iter/s)": 1.449887 }, { "epoch": 0.4740585236279508, "grad_norm": 4.8900628089904785, "learning_rate": 9.77983393123034e-05, "loss": 2.48463134765625, "memory(GiB)": 58.3, "step": 11065, "token_acc": 0.4559386973180077, "train_speed(iter/s)": 1.450048 }, { "epoch": 0.4742727389571998, "grad_norm": 5.431396007537842, "learning_rate": 9.779636386003453e-05, "loss": 2.3649187088012695, "memory(GiB)": 58.3, "step": 11070, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.45013 }, { "epoch": 0.4744869542864487, "grad_norm": 3.571594715118408, "learning_rate": 9.77943875418897e-05, "loss": 2.6671390533447266, "memory(GiB)": 58.3, "step": 11075, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.450237 }, { "epoch": 0.4747011696156977, "grad_norm": 3.7833197116851807, "learning_rate": 9.77924103579047e-05, "loss": 2.6256244659423826, "memory(GiB)": 58.3, "step": 11080, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.450348 }, { "epoch": 0.4749153849449467, "grad_norm": 3.9306674003601074, "learning_rate": 9.779043230811534e-05, "loss": 2.424663543701172, "memory(GiB)": 58.3, "step": 11085, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.450376 }, { "epoch": 0.4751296002741956, "grad_norm": 8.253090858459473, "learning_rate": 9.778845339255749e-05, "loss": 2.9180736541748047, "memory(GiB)": 58.3, "step": 11090, "token_acc": 0.4290909090909091, "train_speed(iter/s)": 1.45026 }, { "epoch": 0.4753438156034446, "grad_norm": 4.530609607696533, "learning_rate": 9.778647361126696e-05, "loss": 2.809162712097168, "memory(GiB)": 58.3, "step": 11095, "token_acc": 0.4057971014492754, "train_speed(iter/s)": 1.450212 }, { "epoch": 0.47555803093269355, "grad_norm": 3.136714458465576, "learning_rate": 9.778449296427962e-05, "loss": 2.52059268951416, "memory(GiB)": 58.3, "step": 11100, "token_acc": 0.46048109965635736, "train_speed(iter/s)": 1.450245 }, { "epoch": 0.4757722462619425, "grad_norm": 4.584779262542725, "learning_rate": 9.778251145163139e-05, "loss": 2.684937286376953, "memory(GiB)": 58.3, "step": 11105, "token_acc": 0.46886446886446886, "train_speed(iter/s)": 1.450349 }, { "epoch": 0.47598646159119146, "grad_norm": 4.542789459228516, "learning_rate": 9.778052907335814e-05, "loss": 2.3787593841552734, "memory(GiB)": 58.3, "step": 11110, "token_acc": 0.48955223880597015, "train_speed(iter/s)": 1.450429 }, { "epoch": 0.47620067692044044, "grad_norm": 5.453615188598633, "learning_rate": 9.777854582949578e-05, "loss": 2.5061763763427733, "memory(GiB)": 58.3, "step": 11115, "token_acc": 0.46646341463414637, "train_speed(iter/s)": 1.45045 }, { "epoch": 0.4764148922496894, "grad_norm": 3.53247332572937, "learning_rate": 9.777656172008023e-05, "loss": 2.703365707397461, "memory(GiB)": 58.3, "step": 11120, "token_acc": 0.4650537634408602, "train_speed(iter/s)": 1.450509 }, { "epoch": 0.47662910757893834, "grad_norm": 3.5506818294525146, "learning_rate": 9.777457674514748e-05, "loss": 2.737448310852051, "memory(GiB)": 58.3, "step": 11125, "token_acc": 0.46107784431137727, "train_speed(iter/s)": 1.450372 }, { "epoch": 0.4768433229081873, "grad_norm": 4.337513446807861, "learning_rate": 9.777259090473341e-05, "loss": 2.4103200912475584, "memory(GiB)": 58.3, "step": 11130, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.450535 }, { "epoch": 0.4770575382374363, "grad_norm": 5.818124771118164, "learning_rate": 9.777060419887407e-05, "loss": 2.7565093994140626, "memory(GiB)": 58.3, "step": 11135, "token_acc": 0.4339622641509434, "train_speed(iter/s)": 1.450591 }, { "epoch": 0.4772717535666852, "grad_norm": 3.3356211185455322, "learning_rate": 9.776861662760541e-05, "loss": 2.607416534423828, "memory(GiB)": 58.3, "step": 11140, "token_acc": 0.47232472324723246, "train_speed(iter/s)": 1.450469 }, { "epoch": 0.4774859688959342, "grad_norm": 4.544053077697754, "learning_rate": 9.776662819096347e-05, "loss": 2.7298851013183594, "memory(GiB)": 58.3, "step": 11145, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.450424 }, { "epoch": 0.4777001842251832, "grad_norm": 5.424997329711914, "learning_rate": 9.776463888898423e-05, "loss": 2.64554443359375, "memory(GiB)": 58.3, "step": 11150, "token_acc": 0.392, "train_speed(iter/s)": 1.450504 }, { "epoch": 0.4779143995544321, "grad_norm": 4.7738542556762695, "learning_rate": 9.776264872170376e-05, "loss": 2.76705379486084, "memory(GiB)": 58.3, "step": 11155, "token_acc": 0.4379310344827586, "train_speed(iter/s)": 1.450424 }, { "epoch": 0.4781286148836811, "grad_norm": 4.916574478149414, "learning_rate": 9.77606576891581e-05, "loss": 2.580837631225586, "memory(GiB)": 58.3, "step": 11160, "token_acc": 0.452, "train_speed(iter/s)": 1.450398 }, { "epoch": 0.47834283021293006, "grad_norm": 5.020221710205078, "learning_rate": 9.775866579138332e-05, "loss": 2.681121253967285, "memory(GiB)": 58.3, "step": 11165, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.450373 }, { "epoch": 0.478557045542179, "grad_norm": 4.253873825073242, "learning_rate": 9.775667302841551e-05, "loss": 2.583617401123047, "memory(GiB)": 58.3, "step": 11170, "token_acc": 0.45722713864306785, "train_speed(iter/s)": 1.45027 }, { "epoch": 0.47877126087142796, "grad_norm": 5.4516167640686035, "learning_rate": 9.775467940029077e-05, "loss": 2.4560674667358398, "memory(GiB)": 58.3, "step": 11175, "token_acc": 0.4793388429752066, "train_speed(iter/s)": 1.450318 }, { "epoch": 0.47898547620067694, "grad_norm": 4.386411190032959, "learning_rate": 9.775268490704522e-05, "loss": 2.5876808166503906, "memory(GiB)": 58.3, "step": 11180, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.450354 }, { "epoch": 0.47919969152992586, "grad_norm": 4.466586589813232, "learning_rate": 9.775068954871498e-05, "loss": 2.7462501525878906, "memory(GiB)": 58.3, "step": 11185, "token_acc": 0.46863468634686345, "train_speed(iter/s)": 1.450413 }, { "epoch": 0.47941390685917484, "grad_norm": 5.115545749664307, "learning_rate": 9.774869332533622e-05, "loss": 2.7964197158813477, "memory(GiB)": 58.3, "step": 11190, "token_acc": 0.45625, "train_speed(iter/s)": 1.450541 }, { "epoch": 0.4796281221884238, "grad_norm": 4.668806076049805, "learning_rate": 9.774669623694507e-05, "loss": 2.9263195037841796, "memory(GiB)": 58.3, "step": 11195, "token_acc": 0.4169381107491857, "train_speed(iter/s)": 1.450589 }, { "epoch": 0.47984233751767275, "grad_norm": 4.503677845001221, "learning_rate": 9.774469828357773e-05, "loss": 2.2816009521484375, "memory(GiB)": 58.3, "step": 11200, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.450739 }, { "epoch": 0.4800565528469217, "grad_norm": 4.348319053649902, "learning_rate": 9.77426994652704e-05, "loss": 2.1784135818481447, "memory(GiB)": 58.3, "step": 11205, "token_acc": 0.5164473684210527, "train_speed(iter/s)": 1.450811 }, { "epoch": 0.4802707681761707, "grad_norm": 5.390242576599121, "learning_rate": 9.774069978205928e-05, "loss": 2.409230422973633, "memory(GiB)": 58.3, "step": 11210, "token_acc": 0.5022026431718062, "train_speed(iter/s)": 1.450864 }, { "epoch": 0.4804849835054196, "grad_norm": 3.67338490486145, "learning_rate": 9.77386992339806e-05, "loss": 2.435714912414551, "memory(GiB)": 58.3, "step": 11215, "token_acc": 0.5285714285714286, "train_speed(iter/s)": 1.450881 }, { "epoch": 0.4806991988346686, "grad_norm": 4.455507278442383, "learning_rate": 9.77366978210706e-05, "loss": 2.4204294204711916, "memory(GiB)": 58.3, "step": 11220, "token_acc": 0.45819397993311034, "train_speed(iter/s)": 1.450766 }, { "epoch": 0.4809134141639176, "grad_norm": 3.2890052795410156, "learning_rate": 9.773469554336553e-05, "loss": 2.987748718261719, "memory(GiB)": 58.3, "step": 11225, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.451006 }, { "epoch": 0.4811276294931665, "grad_norm": 3.461409330368042, "learning_rate": 9.773269240090169e-05, "loss": 2.811074066162109, "memory(GiB)": 58.3, "step": 11230, "token_acc": 0.4746268656716418, "train_speed(iter/s)": 1.451177 }, { "epoch": 0.4813418448224155, "grad_norm": 6.8326311111450195, "learning_rate": 9.773068839371534e-05, "loss": 2.378412628173828, "memory(GiB)": 58.3, "step": 11235, "token_acc": 0.5037878787878788, "train_speed(iter/s)": 1.451157 }, { "epoch": 0.48155606015166447, "grad_norm": 4.739665985107422, "learning_rate": 9.772868352184279e-05, "loss": 2.4834747314453125, "memory(GiB)": 58.3, "step": 11240, "token_acc": 0.4959677419354839, "train_speed(iter/s)": 1.451184 }, { "epoch": 0.4817702754809134, "grad_norm": 4.117530345916748, "learning_rate": 9.772667778532036e-05, "loss": 2.51016845703125, "memory(GiB)": 58.3, "step": 11245, "token_acc": 0.4717514124293785, "train_speed(iter/s)": 1.451296 }, { "epoch": 0.48198449081016237, "grad_norm": 4.38101863861084, "learning_rate": 9.77246711841844e-05, "loss": 2.8449834823608398, "memory(GiB)": 58.3, "step": 11250, "token_acc": 0.42207792207792205, "train_speed(iter/s)": 1.45116 }, { "epoch": 0.48219870613941135, "grad_norm": 5.53923225402832, "learning_rate": 9.772266371847125e-05, "loss": 2.6350059509277344, "memory(GiB)": 58.3, "step": 11255, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.451289 }, { "epoch": 0.48241292146866027, "grad_norm": 3.890557050704956, "learning_rate": 9.772065538821728e-05, "loss": 2.357133483886719, "memory(GiB)": 58.3, "step": 11260, "token_acc": 0.516, "train_speed(iter/s)": 1.451357 }, { "epoch": 0.48262713679790925, "grad_norm": 3.507786750793457, "learning_rate": 9.771864619345888e-05, "loss": 2.5293365478515626, "memory(GiB)": 58.3, "step": 11265, "token_acc": 0.5, "train_speed(iter/s)": 1.451479 }, { "epoch": 0.48284135212715823, "grad_norm": 3.9261813163757324, "learning_rate": 9.771663613423243e-05, "loss": 2.661027526855469, "memory(GiB)": 58.3, "step": 11270, "token_acc": 0.422360248447205, "train_speed(iter/s)": 1.451399 }, { "epoch": 0.48305556745640715, "grad_norm": 4.198917388916016, "learning_rate": 9.771462521057436e-05, "loss": 2.894261932373047, "memory(GiB)": 58.3, "step": 11275, "token_acc": 0.43312101910828027, "train_speed(iter/s)": 1.451358 }, { "epoch": 0.48326978278565613, "grad_norm": 3.858447551727295, "learning_rate": 9.771261342252109e-05, "loss": 2.839884567260742, "memory(GiB)": 58.3, "step": 11280, "token_acc": 0.43309859154929575, "train_speed(iter/s)": 1.451374 }, { "epoch": 0.4834839981149051, "grad_norm": 4.041822910308838, "learning_rate": 9.771060077010907e-05, "loss": 2.575081443786621, "memory(GiB)": 58.3, "step": 11285, "token_acc": 0.4692982456140351, "train_speed(iter/s)": 1.451444 }, { "epoch": 0.4836982134441541, "grad_norm": 4.165565490722656, "learning_rate": 9.770858725337477e-05, "loss": 2.52838020324707, "memory(GiB)": 58.3, "step": 11290, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.451671 }, { "epoch": 0.483912428773403, "grad_norm": 4.5889105796813965, "learning_rate": 9.770657287235465e-05, "loss": 2.9204345703125, "memory(GiB)": 58.3, "step": 11295, "token_acc": 0.421875, "train_speed(iter/s)": 1.451666 }, { "epoch": 0.484126644102652, "grad_norm": 4.789783477783203, "learning_rate": 9.770455762708521e-05, "loss": 2.543626594543457, "memory(GiB)": 58.3, "step": 11300, "token_acc": 0.46229508196721314, "train_speed(iter/s)": 1.451668 }, { "epoch": 0.48434085943190097, "grad_norm": 4.947669982910156, "learning_rate": 9.770254151760297e-05, "loss": 2.5403892517089846, "memory(GiB)": 58.3, "step": 11305, "token_acc": 0.4505928853754941, "train_speed(iter/s)": 1.45176 }, { "epoch": 0.4845550747611499, "grad_norm": 5.263824939727783, "learning_rate": 9.770052454394443e-05, "loss": 2.775040054321289, "memory(GiB)": 58.3, "step": 11310, "token_acc": 0.43548387096774194, "train_speed(iter/s)": 1.451967 }, { "epoch": 0.4847692900903989, "grad_norm": 5.169901371002197, "learning_rate": 9.769850670614613e-05, "loss": 2.853218841552734, "memory(GiB)": 58.3, "step": 11315, "token_acc": 0.4235294117647059, "train_speed(iter/s)": 1.451824 }, { "epoch": 0.48498350541964785, "grad_norm": 3.248640298843384, "learning_rate": 9.769648800424465e-05, "loss": 2.5355600357055663, "memory(GiB)": 58.3, "step": 11320, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.451889 }, { "epoch": 0.4851977207488968, "grad_norm": 3.849860668182373, "learning_rate": 9.769446843827655e-05, "loss": 2.318301773071289, "memory(GiB)": 58.3, "step": 11325, "token_acc": 0.538135593220339, "train_speed(iter/s)": 1.451945 }, { "epoch": 0.48541193607814576, "grad_norm": 4.20799446105957, "learning_rate": 9.769244800827841e-05, "loss": 2.681135368347168, "memory(GiB)": 58.3, "step": 11330, "token_acc": 0.4567901234567901, "train_speed(iter/s)": 1.452038 }, { "epoch": 0.48562615140739473, "grad_norm": 5.085803031921387, "learning_rate": 9.769042671428682e-05, "loss": 2.6751615524291994, "memory(GiB)": 58.3, "step": 11335, "token_acc": 0.4470198675496689, "train_speed(iter/s)": 1.452087 }, { "epoch": 0.48584036673664366, "grad_norm": 3.483921766281128, "learning_rate": 9.768840455633842e-05, "loss": 2.9844387054443358, "memory(GiB)": 58.3, "step": 11340, "token_acc": 0.4158415841584158, "train_speed(iter/s)": 1.452105 }, { "epoch": 0.48605458206589264, "grad_norm": 4.813082218170166, "learning_rate": 9.768638153446986e-05, "loss": 2.9042110443115234, "memory(GiB)": 58.3, "step": 11345, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.452154 }, { "epoch": 0.4862687973951416, "grad_norm": 4.587554931640625, "learning_rate": 9.768435764871775e-05, "loss": 2.5322193145751952, "memory(GiB)": 58.3, "step": 11350, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.452256 }, { "epoch": 0.48648301272439054, "grad_norm": 43.5738410949707, "learning_rate": 9.768233289911877e-05, "loss": 2.3911769866943358, "memory(GiB)": 58.3, "step": 11355, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.452392 }, { "epoch": 0.4866972280536395, "grad_norm": 4.606513977050781, "learning_rate": 9.768030728570959e-05, "loss": 2.770819664001465, "memory(GiB)": 58.3, "step": 11360, "token_acc": 0.4409722222222222, "train_speed(iter/s)": 1.452449 }, { "epoch": 0.4869114433828885, "grad_norm": 5.725339889526367, "learning_rate": 9.767828080852693e-05, "loss": 2.4176609039306642, "memory(GiB)": 58.3, "step": 11365, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.45258 }, { "epoch": 0.4871256587121374, "grad_norm": 4.472254753112793, "learning_rate": 9.767625346760748e-05, "loss": 2.5790700912475586, "memory(GiB)": 58.3, "step": 11370, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.452634 }, { "epoch": 0.4873398740413864, "grad_norm": 4.8378167152404785, "learning_rate": 9.767422526298798e-05, "loss": 2.669243812561035, "memory(GiB)": 58.3, "step": 11375, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.45254 }, { "epoch": 0.4875540893706354, "grad_norm": 3.5412521362304688, "learning_rate": 9.767219619470516e-05, "loss": 2.5998735427856445, "memory(GiB)": 58.3, "step": 11380, "token_acc": 0.46439628482972134, "train_speed(iter/s)": 1.452617 }, { "epoch": 0.4877683046998843, "grad_norm": 4.090600490570068, "learning_rate": 9.767016626279577e-05, "loss": 2.7880121231079102, "memory(GiB)": 58.3, "step": 11385, "token_acc": 0.4384057971014493, "train_speed(iter/s)": 1.452739 }, { "epoch": 0.4879825200291333, "grad_norm": 4.44712495803833, "learning_rate": 9.766813546729663e-05, "loss": 2.4887435913085936, "memory(GiB)": 58.3, "step": 11390, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.452517 }, { "epoch": 0.48819673535838226, "grad_norm": 4.05634069442749, "learning_rate": 9.766610380824448e-05, "loss": 2.8442813873291017, "memory(GiB)": 58.3, "step": 11395, "token_acc": 0.414985590778098, "train_speed(iter/s)": 1.45259 }, { "epoch": 0.4884109506876312, "grad_norm": 4.251303672790527, "learning_rate": 9.766407128567617e-05, "loss": 2.378520202636719, "memory(GiB)": 58.3, "step": 11400, "token_acc": 0.451505016722408, "train_speed(iter/s)": 1.452563 }, { "epoch": 0.48862516601688016, "grad_norm": 4.743833541870117, "learning_rate": 9.766203789962846e-05, "loss": 2.995096778869629, "memory(GiB)": 58.3, "step": 11405, "token_acc": 0.38823529411764707, "train_speed(iter/s)": 1.452374 }, { "epoch": 0.48883938134612914, "grad_norm": 5.054455757141113, "learning_rate": 9.766000365013824e-05, "loss": 2.7642860412597656, "memory(GiB)": 58.3, "step": 11410, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.452323 }, { "epoch": 0.48905359667537807, "grad_norm": 4.550734996795654, "learning_rate": 9.765796853724235e-05, "loss": 2.798261260986328, "memory(GiB)": 58.3, "step": 11415, "token_acc": 0.40350877192982454, "train_speed(iter/s)": 1.452376 }, { "epoch": 0.48926781200462705, "grad_norm": 3.1900641918182373, "learning_rate": 9.765593256097764e-05, "loss": 2.6664859771728517, "memory(GiB)": 58.3, "step": 11420, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.452396 }, { "epoch": 0.489482027333876, "grad_norm": 4.950425624847412, "learning_rate": 9.765389572138103e-05, "loss": 2.4235610961914062, "memory(GiB)": 58.3, "step": 11425, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.452476 }, { "epoch": 0.48969624266312495, "grad_norm": 4.0858154296875, "learning_rate": 9.765185801848938e-05, "loss": 2.7007659912109374, "memory(GiB)": 58.3, "step": 11430, "token_acc": 0.4520123839009288, "train_speed(iter/s)": 1.45247 }, { "epoch": 0.4899104579923739, "grad_norm": 5.181708335876465, "learning_rate": 9.764981945233962e-05, "loss": 2.6881858825683596, "memory(GiB)": 58.3, "step": 11435, "token_acc": 0.4355400696864111, "train_speed(iter/s)": 1.452572 }, { "epoch": 0.4901246733216229, "grad_norm": 4.984737396240234, "learning_rate": 9.764778002296868e-05, "loss": 2.8407032012939455, "memory(GiB)": 58.3, "step": 11440, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.452719 }, { "epoch": 0.49033888865087183, "grad_norm": 4.215512752532959, "learning_rate": 9.764573973041352e-05, "loss": 2.6679666519165037, "memory(GiB)": 58.3, "step": 11445, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.452746 }, { "epoch": 0.4905531039801208, "grad_norm": 5.14586877822876, "learning_rate": 9.764369857471109e-05, "loss": 2.652199363708496, "memory(GiB)": 58.3, "step": 11450, "token_acc": 0.43260188087774293, "train_speed(iter/s)": 1.452642 }, { "epoch": 0.4907673193093698, "grad_norm": 4.5310845375061035, "learning_rate": 9.764165655589835e-05, "loss": 2.650895118713379, "memory(GiB)": 58.3, "step": 11455, "token_acc": 0.4753623188405797, "train_speed(iter/s)": 1.452825 }, { "epoch": 0.49098153463861877, "grad_norm": 5.773660182952881, "learning_rate": 9.763961367401231e-05, "loss": 2.690845489501953, "memory(GiB)": 58.3, "step": 11460, "token_acc": 0.4157706093189964, "train_speed(iter/s)": 1.452876 }, { "epoch": 0.4911957499678677, "grad_norm": 3.6825263500213623, "learning_rate": 9.763756992909e-05, "loss": 2.5997386932373048, "memory(GiB)": 58.3, "step": 11465, "token_acc": 0.475, "train_speed(iter/s)": 1.452998 }, { "epoch": 0.49140996529711667, "grad_norm": 5.825430870056152, "learning_rate": 9.763552532116841e-05, "loss": 2.6364845275878905, "memory(GiB)": 58.3, "step": 11470, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.453105 }, { "epoch": 0.49162418062636565, "grad_norm": 4.11679220199585, "learning_rate": 9.76334798502846e-05, "loss": 2.563291549682617, "memory(GiB)": 58.3, "step": 11475, "token_acc": 0.4262295081967213, "train_speed(iter/s)": 1.45313 }, { "epoch": 0.49183839595561457, "grad_norm": 4.311800003051758, "learning_rate": 9.763143351647561e-05, "loss": 2.289215850830078, "memory(GiB)": 58.3, "step": 11480, "token_acc": 0.46494464944649444, "train_speed(iter/s)": 1.453215 }, { "epoch": 0.49205261128486355, "grad_norm": 7.280170440673828, "learning_rate": 9.762938631977852e-05, "loss": 2.361956024169922, "memory(GiB)": 58.3, "step": 11485, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.453248 }, { "epoch": 0.49226682661411253, "grad_norm": 7.868833541870117, "learning_rate": 9.762733826023042e-05, "loss": 2.538089370727539, "memory(GiB)": 58.3, "step": 11490, "token_acc": 0.4628099173553719, "train_speed(iter/s)": 1.453321 }, { "epoch": 0.49248104194336145, "grad_norm": 4.240434169769287, "learning_rate": 9.76252893378684e-05, "loss": 2.8499610900878904, "memory(GiB)": 58.3, "step": 11495, "token_acc": 0.43309859154929575, "train_speed(iter/s)": 1.453397 }, { "epoch": 0.49269525727261043, "grad_norm": 4.263349533081055, "learning_rate": 9.76232395527296e-05, "loss": 2.6419279098510744, "memory(GiB)": 58.3, "step": 11500, "token_acc": 0.4895833333333333, "train_speed(iter/s)": 1.453492 }, { "epoch": 0.49269525727261043, "eval_loss": 2.122830867767334, "eval_runtime": 13.3411, "eval_samples_per_second": 7.496, "eval_steps_per_second": 7.496, "eval_token_acc": 0.47485207100591714, "step": 11500 }, { "epoch": 0.4929094726018594, "grad_norm": 3.7236857414245605, "learning_rate": 9.762118890485115e-05, "loss": 2.3778526306152346, "memory(GiB)": 58.3, "step": 11505, "token_acc": 0.4754601226993865, "train_speed(iter/s)": 1.45099 }, { "epoch": 0.49312368793110833, "grad_norm": 12.476144790649414, "learning_rate": 9.761913739427017e-05, "loss": 2.415938377380371, "memory(GiB)": 58.3, "step": 11510, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.451143 }, { "epoch": 0.4933379032603573, "grad_norm": 3.7864506244659424, "learning_rate": 9.761708502102384e-05, "loss": 2.613703727722168, "memory(GiB)": 58.3, "step": 11515, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.451055 }, { "epoch": 0.4935521185896063, "grad_norm": 7.46455192565918, "learning_rate": 9.761503178514938e-05, "loss": 2.7990486145019533, "memory(GiB)": 58.3, "step": 11520, "token_acc": 0.4426751592356688, "train_speed(iter/s)": 1.451047 }, { "epoch": 0.4937663339188552, "grad_norm": 4.192196846008301, "learning_rate": 9.761297768668393e-05, "loss": 2.8287534713745117, "memory(GiB)": 58.3, "step": 11525, "token_acc": 0.445141065830721, "train_speed(iter/s)": 1.450996 }, { "epoch": 0.4939805492481042, "grad_norm": 3.70595121383667, "learning_rate": 9.761092272566472e-05, "loss": 2.641410064697266, "memory(GiB)": 58.3, "step": 11530, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.450973 }, { "epoch": 0.4941947645773532, "grad_norm": 6.178360939025879, "learning_rate": 9.7608866902129e-05, "loss": 2.670266342163086, "memory(GiB)": 58.3, "step": 11535, "token_acc": 0.45075757575757575, "train_speed(iter/s)": 1.451131 }, { "epoch": 0.4944089799066021, "grad_norm": 3.9855008125305176, "learning_rate": 9.760681021611398e-05, "loss": 2.3463743209838865, "memory(GiB)": 58.3, "step": 11540, "token_acc": 0.5064516129032258, "train_speed(iter/s)": 1.45128 }, { "epoch": 0.4946231952358511, "grad_norm": 3.2524452209472656, "learning_rate": 9.760475266765694e-05, "loss": 2.5913551330566404, "memory(GiB)": 58.3, "step": 11545, "token_acc": 0.3978102189781022, "train_speed(iter/s)": 1.451253 }, { "epoch": 0.49483741056510006, "grad_norm": 6.220343589782715, "learning_rate": 9.760269425679514e-05, "loss": 2.8090896606445312, "memory(GiB)": 58.3, "step": 11550, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.451357 }, { "epoch": 0.495051625894349, "grad_norm": 4.260661602020264, "learning_rate": 9.760063498356589e-05, "loss": 2.58850154876709, "memory(GiB)": 58.3, "step": 11555, "token_acc": 0.48598130841121495, "train_speed(iter/s)": 1.451495 }, { "epoch": 0.49526584122359796, "grad_norm": 3.7763559818267822, "learning_rate": 9.759857484800647e-05, "loss": 2.461856460571289, "memory(GiB)": 58.3, "step": 11560, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.451426 }, { "epoch": 0.49548005655284694, "grad_norm": 4.351131439208984, "learning_rate": 9.759651385015423e-05, "loss": 2.551533317565918, "memory(GiB)": 58.3, "step": 11565, "token_acc": 0.46449704142011833, "train_speed(iter/s)": 1.451246 }, { "epoch": 0.49569427188209586, "grad_norm": 4.711762428283691, "learning_rate": 9.759445199004649e-05, "loss": 2.5739336013793945, "memory(GiB)": 58.3, "step": 11570, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.451283 }, { "epoch": 0.49590848721134484, "grad_norm": 4.900021553039551, "learning_rate": 9.75923892677206e-05, "loss": 2.4031457901000977, "memory(GiB)": 58.3, "step": 11575, "token_acc": 0.5, "train_speed(iter/s)": 1.451165 }, { "epoch": 0.4961227025405938, "grad_norm": 4.870184898376465, "learning_rate": 9.759032568321395e-05, "loss": 2.3678571701049806, "memory(GiB)": 58.3, "step": 11580, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.451148 }, { "epoch": 0.49633691786984274, "grad_norm": 4.825766086578369, "learning_rate": 9.758826123656388e-05, "loss": 2.607929801940918, "memory(GiB)": 58.3, "step": 11585, "token_acc": 0.45819397993311034, "train_speed(iter/s)": 1.451158 }, { "epoch": 0.4965511331990917, "grad_norm": 3.561554193496704, "learning_rate": 9.758619592780784e-05, "loss": 2.5645885467529297, "memory(GiB)": 58.3, "step": 11590, "token_acc": 0.49258160237388726, "train_speed(iter/s)": 1.451167 }, { "epoch": 0.4967653485283407, "grad_norm": 5.883943557739258, "learning_rate": 9.758412975698321e-05, "loss": 2.891803169250488, "memory(GiB)": 58.3, "step": 11595, "token_acc": 0.4241573033707865, "train_speed(iter/s)": 1.451143 }, { "epoch": 0.4969795638575896, "grad_norm": 4.955366611480713, "learning_rate": 9.758206272412743e-05, "loss": 2.5933320999145506, "memory(GiB)": 58.3, "step": 11600, "token_acc": 0.46062992125984253, "train_speed(iter/s)": 1.451096 }, { "epoch": 0.4971937791868386, "grad_norm": 3.6083242893218994, "learning_rate": 9.757999482927795e-05, "loss": 2.7880538940429687, "memory(GiB)": 58.3, "step": 11605, "token_acc": 0.3843843843843844, "train_speed(iter/s)": 1.450971 }, { "epoch": 0.4974079945160876, "grad_norm": 3.6765027046203613, "learning_rate": 9.757792607247224e-05, "loss": 2.4341617584228517, "memory(GiB)": 58.3, "step": 11610, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.450936 }, { "epoch": 0.4976222098453365, "grad_norm": 4.222357749938965, "learning_rate": 9.757585645374777e-05, "loss": 2.667819023132324, "memory(GiB)": 58.3, "step": 11615, "token_acc": 0.4618181818181818, "train_speed(iter/s)": 1.451034 }, { "epoch": 0.4978364251745855, "grad_norm": 4.218835353851318, "learning_rate": 9.757378597314201e-05, "loss": 2.8918861389160155, "memory(GiB)": 58.3, "step": 11620, "token_acc": 0.4200626959247649, "train_speed(iter/s)": 1.451137 }, { "epoch": 0.49805064050383446, "grad_norm": 4.356986999511719, "learning_rate": 9.75717146306925e-05, "loss": 2.3594308853149415, "memory(GiB)": 58.3, "step": 11625, "token_acc": 0.5103734439834025, "train_speed(iter/s)": 1.451228 }, { "epoch": 0.49826485583308344, "grad_norm": 5.578523635864258, "learning_rate": 9.756964242643674e-05, "loss": 2.4100622177124023, "memory(GiB)": 58.3, "step": 11630, "token_acc": 0.483271375464684, "train_speed(iter/s)": 1.451132 }, { "epoch": 0.49847907116233237, "grad_norm": 4.419356346130371, "learning_rate": 9.75675693604123e-05, "loss": 2.533791351318359, "memory(GiB)": 58.3, "step": 11635, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.45122 }, { "epoch": 0.49869328649158134, "grad_norm": 3.9564931392669678, "learning_rate": 9.756549543265671e-05, "loss": 2.726533126831055, "memory(GiB)": 58.3, "step": 11640, "token_acc": 0.4353312302839117, "train_speed(iter/s)": 1.451084 }, { "epoch": 0.4989075018208303, "grad_norm": 4.707624435424805, "learning_rate": 9.756342064320756e-05, "loss": 2.6874515533447267, "memory(GiB)": 58.3, "step": 11645, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.451192 }, { "epoch": 0.49912171715007925, "grad_norm": 3.3866236209869385, "learning_rate": 9.756134499210243e-05, "loss": 2.8155418395996095, "memory(GiB)": 58.3, "step": 11650, "token_acc": 0.42363112391930835, "train_speed(iter/s)": 1.451229 }, { "epoch": 0.4993359324793282, "grad_norm": 5.313538074493408, "learning_rate": 9.755926847937891e-05, "loss": 2.5449777603149415, "memory(GiB)": 58.3, "step": 11655, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.451112 }, { "epoch": 0.4995501478085772, "grad_norm": 3.765958547592163, "learning_rate": 9.755719110507462e-05, "loss": 2.7128576278686523, "memory(GiB)": 58.3, "step": 11660, "token_acc": 0.45307443365695793, "train_speed(iter/s)": 1.451031 }, { "epoch": 0.49976436313782613, "grad_norm": 4.323287010192871, "learning_rate": 9.75551128692272e-05, "loss": 2.391266632080078, "memory(GiB)": 58.3, "step": 11665, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.450972 }, { "epoch": 0.4999785784670751, "grad_norm": 3.935492753982544, "learning_rate": 9.755303377187433e-05, "loss": 2.717921829223633, "memory(GiB)": 58.3, "step": 11670, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.451131 }, { "epoch": 0.5001927937963241, "grad_norm": 5.324994087219238, "learning_rate": 9.755095381305362e-05, "loss": 2.6325710296630858, "memory(GiB)": 58.3, "step": 11675, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.451327 }, { "epoch": 0.500407009125573, "grad_norm": 5.840517997741699, "learning_rate": 9.754887299280277e-05, "loss": 2.7376882553100588, "memory(GiB)": 58.3, "step": 11680, "token_acc": 0.43097643097643096, "train_speed(iter/s)": 1.451365 }, { "epoch": 0.5006212244548219, "grad_norm": 4.677224159240723, "learning_rate": 9.754679131115949e-05, "loss": 2.4044061660766602, "memory(GiB)": 58.3, "step": 11685, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.451416 }, { "epoch": 0.500835439784071, "grad_norm": 4.544371128082275, "learning_rate": 9.754470876816148e-05, "loss": 2.551678276062012, "memory(GiB)": 58.3, "step": 11690, "token_acc": 0.44396551724137934, "train_speed(iter/s)": 1.451492 }, { "epoch": 0.5010496551133199, "grad_norm": 4.3035054206848145, "learning_rate": 9.754262536384649e-05, "loss": 2.9027379989624023, "memory(GiB)": 58.3, "step": 11695, "token_acc": 0.4576923076923077, "train_speed(iter/s)": 1.451529 }, { "epoch": 0.5012638704425688, "grad_norm": 4.433408737182617, "learning_rate": 9.754054109825223e-05, "loss": 2.643610191345215, "memory(GiB)": 58.3, "step": 11700, "token_acc": 0.4669260700389105, "train_speed(iter/s)": 1.451467 }, { "epoch": 0.5014780857718178, "grad_norm": 3.3892669677734375, "learning_rate": 9.753845597141647e-05, "loss": 2.4351217269897463, "memory(GiB)": 58.3, "step": 11705, "token_acc": 0.5020080321285141, "train_speed(iter/s)": 1.451427 }, { "epoch": 0.5016923011010668, "grad_norm": 4.115910053253174, "learning_rate": 9.753636998337698e-05, "loss": 2.770868682861328, "memory(GiB)": 58.3, "step": 11710, "token_acc": 0.44314868804664725, "train_speed(iter/s)": 1.451487 }, { "epoch": 0.5019065164303158, "grad_norm": 4.731926918029785, "learning_rate": 9.753428313417155e-05, "loss": 3.0838483810424804, "memory(GiB)": 58.3, "step": 11715, "token_acc": 0.4070175438596491, "train_speed(iter/s)": 1.45147 }, { "epoch": 0.5021207317595647, "grad_norm": 4.004453182220459, "learning_rate": 9.7532195423838e-05, "loss": 2.358524131774902, "memory(GiB)": 58.3, "step": 11720, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.451402 }, { "epoch": 0.5023349470888137, "grad_norm": 4.105185508728027, "learning_rate": 9.753010685241415e-05, "loss": 2.7120088577270507, "memory(GiB)": 58.3, "step": 11725, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.451454 }, { "epoch": 0.5025491624180627, "grad_norm": 4.423943519592285, "learning_rate": 9.752801741993781e-05, "loss": 2.531999206542969, "memory(GiB)": 58.3, "step": 11730, "token_acc": 0.48497854077253216, "train_speed(iter/s)": 1.451519 }, { "epoch": 0.5027633777473116, "grad_norm": 4.0634846687316895, "learning_rate": 9.752592712644686e-05, "loss": 2.777116394042969, "memory(GiB)": 58.3, "step": 11735, "token_acc": 0.42382271468144045, "train_speed(iter/s)": 1.451661 }, { "epoch": 0.5029775930765605, "grad_norm": 5.358497619628906, "learning_rate": 9.752383597197916e-05, "loss": 2.463951301574707, "memory(GiB)": 58.3, "step": 11740, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.451653 }, { "epoch": 0.5031918084058096, "grad_norm": 3.7360455989837646, "learning_rate": 9.752174395657259e-05, "loss": 2.750463104248047, "memory(GiB)": 58.3, "step": 11745, "token_acc": 0.4212218649517685, "train_speed(iter/s)": 1.451706 }, { "epoch": 0.5034060237350585, "grad_norm": 3.7836999893188477, "learning_rate": 9.751965108026505e-05, "loss": 2.4442745208740235, "memory(GiB)": 58.3, "step": 11750, "token_acc": 0.511400651465798, "train_speed(iter/s)": 1.451742 }, { "epoch": 0.5036202390643074, "grad_norm": 4.425050735473633, "learning_rate": 9.751755734309443e-05, "loss": 2.639969825744629, "memory(GiB)": 58.3, "step": 11755, "token_acc": 0.4353312302839117, "train_speed(iter/s)": 1.451744 }, { "epoch": 0.5038344543935565, "grad_norm": 4.007545471191406, "learning_rate": 9.751546274509871e-05, "loss": 2.988727569580078, "memory(GiB)": 58.3, "step": 11760, "token_acc": 0.45907473309608543, "train_speed(iter/s)": 1.451953 }, { "epoch": 0.5040486697228054, "grad_norm": 4.643023490905762, "learning_rate": 9.75133672863158e-05, "loss": 2.9101932525634764, "memory(GiB)": 58.3, "step": 11765, "token_acc": 0.43934426229508194, "train_speed(iter/s)": 1.452005 }, { "epoch": 0.5042628850520543, "grad_norm": 4.114360809326172, "learning_rate": 9.751127096678366e-05, "loss": 2.6409418106079103, "memory(GiB)": 58.3, "step": 11770, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.452116 }, { "epoch": 0.5044771003813033, "grad_norm": 3.6016287803649902, "learning_rate": 9.75091737865403e-05, "loss": 2.69439811706543, "memory(GiB)": 58.3, "step": 11775, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.452114 }, { "epoch": 0.5046913157105523, "grad_norm": 3.8433313369750977, "learning_rate": 9.750707574562368e-05, "loss": 2.542442512512207, "memory(GiB)": 58.3, "step": 11780, "token_acc": 0.5082508250825083, "train_speed(iter/s)": 1.452184 }, { "epoch": 0.5049055310398012, "grad_norm": 4.735212802886963, "learning_rate": 9.75049768440718e-05, "loss": 2.599843215942383, "memory(GiB)": 58.3, "step": 11785, "token_acc": 0.511326860841424, "train_speed(iter/s)": 1.452248 }, { "epoch": 0.5051197463690502, "grad_norm": 4.743094444274902, "learning_rate": 9.750287708192272e-05, "loss": 2.8545053482055662, "memory(GiB)": 58.3, "step": 11790, "token_acc": 0.3988439306358382, "train_speed(iter/s)": 1.452284 }, { "epoch": 0.5053339616982991, "grad_norm": 4.007525444030762, "learning_rate": 9.750077645921445e-05, "loss": 2.507660675048828, "memory(GiB)": 58.3, "step": 11795, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.452403 }, { "epoch": 0.5055481770275481, "grad_norm": 3.828134536743164, "learning_rate": 9.749867497598508e-05, "loss": 2.9148635864257812, "memory(GiB)": 58.3, "step": 11800, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.452611 }, { "epoch": 0.5057623923567971, "grad_norm": 4.234160900115967, "learning_rate": 9.749657263227263e-05, "loss": 2.424146270751953, "memory(GiB)": 58.3, "step": 11805, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.452753 }, { "epoch": 0.505976607686046, "grad_norm": 8.152174949645996, "learning_rate": 9.749446942811522e-05, "loss": 2.65775146484375, "memory(GiB)": 58.3, "step": 11810, "token_acc": 0.4632352941176471, "train_speed(iter/s)": 1.452743 }, { "epoch": 0.5061908230152949, "grad_norm": 4.062210559844971, "learning_rate": 9.749236536355094e-05, "loss": 2.568818473815918, "memory(GiB)": 58.3, "step": 11815, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.452694 }, { "epoch": 0.506405038344544, "grad_norm": 5.438949108123779, "learning_rate": 9.749026043861791e-05, "loss": 2.648557662963867, "memory(GiB)": 58.3, "step": 11820, "token_acc": 0.4491525423728814, "train_speed(iter/s)": 1.452805 }, { "epoch": 0.5066192536737929, "grad_norm": 5.653181076049805, "learning_rate": 9.748815465335426e-05, "loss": 2.7346431732177736, "memory(GiB)": 58.3, "step": 11825, "token_acc": 0.4573643410852713, "train_speed(iter/s)": 1.452829 }, { "epoch": 0.5068334690030418, "grad_norm": 4.348195552825928, "learning_rate": 9.748604800779814e-05, "loss": 2.5485939025878905, "memory(GiB)": 58.3, "step": 11830, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.452821 }, { "epoch": 0.5070476843322909, "grad_norm": 4.624939441680908, "learning_rate": 9.748394050198773e-05, "loss": 2.787402534484863, "memory(GiB)": 58.3, "step": 11835, "token_acc": 0.4076655052264808, "train_speed(iter/s)": 1.452931 }, { "epoch": 0.5072618996615398, "grad_norm": 4.1721882820129395, "learning_rate": 9.748183213596118e-05, "loss": 2.7364587783813477, "memory(GiB)": 58.3, "step": 11840, "token_acc": 0.45084745762711864, "train_speed(iter/s)": 1.452854 }, { "epoch": 0.5074761149907887, "grad_norm": 3.9521102905273438, "learning_rate": 9.747972290975671e-05, "loss": 2.575947952270508, "memory(GiB)": 58.3, "step": 11845, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.452885 }, { "epoch": 0.5076903303200377, "grad_norm": 3.7558672428131104, "learning_rate": 9.747761282341253e-05, "loss": 2.4672239303588865, "memory(GiB)": 58.3, "step": 11850, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.452844 }, { "epoch": 0.5079045456492867, "grad_norm": 4.29282283782959, "learning_rate": 9.747550187696684e-05, "loss": 2.7997127532958985, "memory(GiB)": 58.3, "step": 11855, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.452977 }, { "epoch": 0.5081187609785356, "grad_norm": 3.626417398452759, "learning_rate": 9.747339007045789e-05, "loss": 2.789495086669922, "memory(GiB)": 58.3, "step": 11860, "token_acc": 0.44, "train_speed(iter/s)": 1.453052 }, { "epoch": 0.5083329763077846, "grad_norm": 4.912052154541016, "learning_rate": 9.747127740392394e-05, "loss": 2.5460906982421876, "memory(GiB)": 58.3, "step": 11865, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.45313 }, { "epoch": 0.5085471916370335, "grad_norm": 4.189931392669678, "learning_rate": 9.746916387740329e-05, "loss": 2.536775588989258, "memory(GiB)": 58.3, "step": 11870, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.45317 }, { "epoch": 0.5087614069662825, "grad_norm": 3.7563111782073975, "learning_rate": 9.74670494909342e-05, "loss": 2.6769477844238283, "memory(GiB)": 58.3, "step": 11875, "token_acc": 0.45209580838323354, "train_speed(iter/s)": 1.453222 }, { "epoch": 0.5089756222955315, "grad_norm": 3.833993673324585, "learning_rate": 9.746493424455498e-05, "loss": 2.463917350769043, "memory(GiB)": 58.3, "step": 11880, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.453369 }, { "epoch": 0.5091898376247804, "grad_norm": 4.479302406311035, "learning_rate": 9.746281813830394e-05, "loss": 2.7719192504882812, "memory(GiB)": 58.3, "step": 11885, "token_acc": 0.43283582089552236, "train_speed(iter/s)": 1.453378 }, { "epoch": 0.5094040529540294, "grad_norm": 4.144737720489502, "learning_rate": 9.746070117221944e-05, "loss": 2.903744697570801, "memory(GiB)": 58.3, "step": 11890, "token_acc": 0.428125, "train_speed(iter/s)": 1.453447 }, { "epoch": 0.5096182682832784, "grad_norm": 3.634902000427246, "learning_rate": 9.745858334633982e-05, "loss": 2.30502872467041, "memory(GiB)": 58.3, "step": 11895, "token_acc": 0.5122950819672131, "train_speed(iter/s)": 1.453514 }, { "epoch": 0.5098324836125273, "grad_norm": 4.992306709289551, "learning_rate": 9.745646466070342e-05, "loss": 2.5915502548217773, "memory(GiB)": 58.3, "step": 11900, "token_acc": 0.44569288389513106, "train_speed(iter/s)": 1.453516 }, { "epoch": 0.5100466989417762, "grad_norm": 4.262874603271484, "learning_rate": 9.745434511534867e-05, "loss": 2.4882678985595703, "memory(GiB)": 58.3, "step": 11905, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.453402 }, { "epoch": 0.5102609142710253, "grad_norm": 3.4171600341796875, "learning_rate": 9.745222471031392e-05, "loss": 2.9526262283325195, "memory(GiB)": 58.3, "step": 11910, "token_acc": 0.4292803970223325, "train_speed(iter/s)": 1.453441 }, { "epoch": 0.5104751296002742, "grad_norm": 4.713479518890381, "learning_rate": 9.745010344563763e-05, "loss": 2.548380661010742, "memory(GiB)": 58.3, "step": 11915, "token_acc": 0.4652567975830816, "train_speed(iter/s)": 1.453429 }, { "epoch": 0.5106893449295231, "grad_norm": 3.457210063934326, "learning_rate": 9.744798132135819e-05, "loss": 2.721689224243164, "memory(GiB)": 58.3, "step": 11920, "token_acc": 0.46779661016949153, "train_speed(iter/s)": 1.453146 }, { "epoch": 0.5109035602587721, "grad_norm": 4.835439682006836, "learning_rate": 9.744585833751405e-05, "loss": 2.7133157730102537, "memory(GiB)": 58.3, "step": 11925, "token_acc": 0.43416370106761565, "train_speed(iter/s)": 1.453285 }, { "epoch": 0.5111177755880211, "grad_norm": 5.564133644104004, "learning_rate": 9.74437344941437e-05, "loss": 2.6488723754882812, "memory(GiB)": 58.3, "step": 11930, "token_acc": 0.4509090909090909, "train_speed(iter/s)": 1.45303 }, { "epoch": 0.51133199091727, "grad_norm": 4.037095069885254, "learning_rate": 9.744160979128559e-05, "loss": 2.5098573684692385, "memory(GiB)": 58.3, "step": 11935, "token_acc": 0.4766355140186916, "train_speed(iter/s)": 1.452984 }, { "epoch": 0.511546206246519, "grad_norm": 4.985520362854004, "learning_rate": 9.743948422897819e-05, "loss": 2.3646467208862303, "memory(GiB)": 58.3, "step": 11940, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.45315 }, { "epoch": 0.511760421575768, "grad_norm": 4.7432355880737305, "learning_rate": 9.743735780726005e-05, "loss": 2.8441465377807615, "memory(GiB)": 58.3, "step": 11945, "token_acc": 0.4, "train_speed(iter/s)": 1.453117 }, { "epoch": 0.5119746369050169, "grad_norm": 4.232179641723633, "learning_rate": 9.743523052616968e-05, "loss": 2.597243881225586, "memory(GiB)": 58.3, "step": 11950, "token_acc": 0.48, "train_speed(iter/s)": 1.453083 }, { "epoch": 0.5121888522342659, "grad_norm": 3.3830976486206055, "learning_rate": 9.743310238574561e-05, "loss": 2.7058849334716797, "memory(GiB)": 58.3, "step": 11955, "token_acc": 0.44528301886792454, "train_speed(iter/s)": 1.453081 }, { "epoch": 0.5124030675635148, "grad_norm": 4.269236087799072, "learning_rate": 9.743097338602638e-05, "loss": 2.4047061920166017, "memory(GiB)": 58.3, "step": 11960, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.453046 }, { "epoch": 0.5126172828927638, "grad_norm": 4.670446395874023, "learning_rate": 9.742884352705057e-05, "loss": 2.70489387512207, "memory(GiB)": 58.3, "step": 11965, "token_acc": 0.4307228915662651, "train_speed(iter/s)": 1.452803 }, { "epoch": 0.5128314982220128, "grad_norm": 5.872429847717285, "learning_rate": 9.742671280885678e-05, "loss": 2.67957763671875, "memory(GiB)": 58.3, "step": 11970, "token_acc": 0.46, "train_speed(iter/s)": 1.452902 }, { "epoch": 0.5130457135512617, "grad_norm": 4.57229471206665, "learning_rate": 9.74245812314836e-05, "loss": 2.635705757141113, "memory(GiB)": 58.3, "step": 11975, "token_acc": 0.41007194244604317, "train_speed(iter/s)": 1.452959 }, { "epoch": 0.5132599288805106, "grad_norm": 6.656675815582275, "learning_rate": 9.742244879496964e-05, "loss": 2.6367509841918944, "memory(GiB)": 58.3, "step": 11980, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.453006 }, { "epoch": 0.5134741442097597, "grad_norm": 7.522746562957764, "learning_rate": 9.742031549935353e-05, "loss": 2.6589969635009765, "memory(GiB)": 58.3, "step": 11985, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.453102 }, { "epoch": 0.5136883595390086, "grad_norm": 5.553147792816162, "learning_rate": 9.741818134467393e-05, "loss": 2.8405023574829102, "memory(GiB)": 58.3, "step": 11990, "token_acc": 0.4222972972972973, "train_speed(iter/s)": 1.453165 }, { "epoch": 0.5139025748682575, "grad_norm": 4.80103063583374, "learning_rate": 9.741604633096947e-05, "loss": 2.891823577880859, "memory(GiB)": 58.3, "step": 11995, "token_acc": 0.44280442804428044, "train_speed(iter/s)": 1.452883 }, { "epoch": 0.5141167901975066, "grad_norm": 6.593349933624268, "learning_rate": 9.741391045827888e-05, "loss": 2.5027820587158205, "memory(GiB)": 58.3, "step": 12000, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.452948 }, { "epoch": 0.5141167901975066, "eval_loss": 2.1061935424804688, "eval_runtime": 13.9199, "eval_samples_per_second": 7.184, "eval_steps_per_second": 7.184, "eval_token_acc": 0.4846715328467153, "step": 12000 }, { "epoch": 0.5143310055267555, "grad_norm": 7.821264266967773, "learning_rate": 9.741177372664081e-05, "loss": 2.4967153549194334, "memory(GiB)": 58.3, "step": 12005, "token_acc": 0.48237885462555063, "train_speed(iter/s)": 1.450436 }, { "epoch": 0.5145452208560044, "grad_norm": 4.144789695739746, "learning_rate": 9.7409636136094e-05, "loss": 2.572602081298828, "memory(GiB)": 58.3, "step": 12010, "token_acc": 0.4794952681388013, "train_speed(iter/s)": 1.450444 }, { "epoch": 0.5147594361852534, "grad_norm": 6.287311553955078, "learning_rate": 9.740749768667715e-05, "loss": 2.457497978210449, "memory(GiB)": 58.3, "step": 12015, "token_acc": 0.5, "train_speed(iter/s)": 1.450458 }, { "epoch": 0.5149736515145024, "grad_norm": 4.425909042358398, "learning_rate": 9.740535837842899e-05, "loss": 2.573872947692871, "memory(GiB)": 58.3, "step": 12020, "token_acc": 0.4307692307692308, "train_speed(iter/s)": 1.450501 }, { "epoch": 0.5151878668437513, "grad_norm": 4.586042881011963, "learning_rate": 9.74032182113883e-05, "loss": 2.8564168930053713, "memory(GiB)": 58.3, "step": 12025, "token_acc": 0.42366412213740456, "train_speed(iter/s)": 1.450662 }, { "epoch": 0.5154020821730003, "grad_norm": 5.136746406555176, "learning_rate": 9.740107718559385e-05, "loss": 2.612055206298828, "memory(GiB)": 58.3, "step": 12030, "token_acc": 0.4673202614379085, "train_speed(iter/s)": 1.450771 }, { "epoch": 0.5156162975022492, "grad_norm": 5.197052478790283, "learning_rate": 9.739893530108442e-05, "loss": 2.603532600402832, "memory(GiB)": 58.3, "step": 12035, "token_acc": 0.47257383966244726, "train_speed(iter/s)": 1.450714 }, { "epoch": 0.5158305128314982, "grad_norm": 5.981902599334717, "learning_rate": 9.739679255789881e-05, "loss": 2.574320602416992, "memory(GiB)": 58.3, "step": 12040, "token_acc": 0.4785992217898833, "train_speed(iter/s)": 1.450684 }, { "epoch": 0.5160447281607472, "grad_norm": 3.7969653606414795, "learning_rate": 9.739464895607584e-05, "loss": 2.3138635635375975, "memory(GiB)": 58.3, "step": 12045, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.450664 }, { "epoch": 0.5162589434899961, "grad_norm": 3.929399013519287, "learning_rate": 9.739250449565436e-05, "loss": 3.0134450912475588, "memory(GiB)": 58.3, "step": 12050, "token_acc": 0.42105263157894735, "train_speed(iter/s)": 1.45073 }, { "epoch": 0.5164731588192452, "grad_norm": 3.7105753421783447, "learning_rate": 9.739035917667319e-05, "loss": 2.5796510696411135, "memory(GiB)": 58.3, "step": 12055, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.45072 }, { "epoch": 0.5166873741484941, "grad_norm": 5.453879356384277, "learning_rate": 9.738821299917121e-05, "loss": 2.9756711959838866, "memory(GiB)": 58.3, "step": 12060, "token_acc": 0.4421768707482993, "train_speed(iter/s)": 1.450885 }, { "epoch": 0.516901589477743, "grad_norm": 4.616195201873779, "learning_rate": 9.738606596318728e-05, "loss": 2.7886936187744142, "memory(GiB)": 58.3, "step": 12065, "token_acc": 0.44982698961937717, "train_speed(iter/s)": 1.450952 }, { "epoch": 0.517115804806992, "grad_norm": 3.2179694175720215, "learning_rate": 9.738391806876034e-05, "loss": 2.543000602722168, "memory(GiB)": 58.3, "step": 12070, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.450944 }, { "epoch": 0.517330020136241, "grad_norm": 3.4223556518554688, "learning_rate": 9.738176931592926e-05, "loss": 2.7501804351806642, "memory(GiB)": 58.3, "step": 12075, "token_acc": 0.45689655172413796, "train_speed(iter/s)": 1.450925 }, { "epoch": 0.5175442354654899, "grad_norm": 2.8262991905212402, "learning_rate": 9.7379619704733e-05, "loss": 2.565086555480957, "memory(GiB)": 58.3, "step": 12080, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.450993 }, { "epoch": 0.5177584507947389, "grad_norm": 4.205055236816406, "learning_rate": 9.737746923521046e-05, "loss": 2.6331724166870116, "memory(GiB)": 58.3, "step": 12085, "token_acc": 0.45051194539249145, "train_speed(iter/s)": 1.451112 }, { "epoch": 0.5179726661239878, "grad_norm": 4.022721290588379, "learning_rate": 9.737531790740062e-05, "loss": 2.4493757247924806, "memory(GiB)": 58.3, "step": 12090, "token_acc": 0.5211267605633803, "train_speed(iter/s)": 1.451162 }, { "epoch": 0.5181868814532368, "grad_norm": 12.431967735290527, "learning_rate": 9.737316572134247e-05, "loss": 2.714704132080078, "memory(GiB)": 58.3, "step": 12095, "token_acc": 0.43730886850152906, "train_speed(iter/s)": 1.451171 }, { "epoch": 0.5184010967824858, "grad_norm": 4.202085971832275, "learning_rate": 9.737101267707498e-05, "loss": 2.7350027084350588, "memory(GiB)": 58.3, "step": 12100, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.451307 }, { "epoch": 0.5186153121117347, "grad_norm": 4.404984951019287, "learning_rate": 9.736885877463716e-05, "loss": 2.9046512603759767, "memory(GiB)": 58.3, "step": 12105, "token_acc": 0.46096654275092935, "train_speed(iter/s)": 1.451393 }, { "epoch": 0.5188295274409837, "grad_norm": 4.3891191482543945, "learning_rate": 9.736670401406802e-05, "loss": 2.510883903503418, "memory(GiB)": 58.3, "step": 12110, "token_acc": 0.5211726384364821, "train_speed(iter/s)": 1.451349 }, { "epoch": 0.5190437427702327, "grad_norm": 4.217000484466553, "learning_rate": 9.736454839540661e-05, "loss": 2.7631343841552733, "memory(GiB)": 58.3, "step": 12115, "token_acc": 0.42394822006472493, "train_speed(iter/s)": 1.451441 }, { "epoch": 0.5192579580994816, "grad_norm": 4.029565334320068, "learning_rate": 9.736239191869197e-05, "loss": 2.4568695068359374, "memory(GiB)": 58.3, "step": 12120, "token_acc": 0.471875, "train_speed(iter/s)": 1.451457 }, { "epoch": 0.5194721734287305, "grad_norm": 3.3806543350219727, "learning_rate": 9.736023458396318e-05, "loss": 2.556052398681641, "memory(GiB)": 58.3, "step": 12125, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.451233 }, { "epoch": 0.5196863887579796, "grad_norm": 4.3811564445495605, "learning_rate": 9.735807639125928e-05, "loss": 2.3946300506591798, "memory(GiB)": 58.3, "step": 12130, "token_acc": 0.4430769230769231, "train_speed(iter/s)": 1.451196 }, { "epoch": 0.5199006040872285, "grad_norm": 4.903264999389648, "learning_rate": 9.735591734061945e-05, "loss": 2.472573089599609, "memory(GiB)": 58.3, "step": 12135, "token_acc": 0.4505928853754941, "train_speed(iter/s)": 1.451045 }, { "epoch": 0.5201148194164774, "grad_norm": 4.18032169342041, "learning_rate": 9.735375743208273e-05, "loss": 2.7119796752929686, "memory(GiB)": 58.3, "step": 12140, "token_acc": 0.49377593360995853, "train_speed(iter/s)": 1.451109 }, { "epoch": 0.5203290347457264, "grad_norm": 4.25031042098999, "learning_rate": 9.735159666568826e-05, "loss": 2.797419548034668, "memory(GiB)": 58.3, "step": 12145, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.451134 }, { "epoch": 0.5205432500749754, "grad_norm": 3.4540727138519287, "learning_rate": 9.734943504147521e-05, "loss": 2.6920007705688476, "memory(GiB)": 58.3, "step": 12150, "token_acc": 0.42033898305084744, "train_speed(iter/s)": 1.451134 }, { "epoch": 0.5207574654042243, "grad_norm": 3.4848079681396484, "learning_rate": 9.734727255948273e-05, "loss": 2.753078269958496, "memory(GiB)": 58.3, "step": 12155, "token_acc": 0.4416961130742049, "train_speed(iter/s)": 1.450984 }, { "epoch": 0.5209716807334733, "grad_norm": 3.585627555847168, "learning_rate": 9.734510921974998e-05, "loss": 2.6603424072265627, "memory(GiB)": 58.3, "step": 12160, "token_acc": 0.46774193548387094, "train_speed(iter/s)": 1.450981 }, { "epoch": 0.5211858960627223, "grad_norm": 4.278395175933838, "learning_rate": 9.734294502231618e-05, "loss": 2.345458984375, "memory(GiB)": 58.3, "step": 12165, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.45096 }, { "epoch": 0.5214001113919712, "grad_norm": 3.882774829864502, "learning_rate": 9.73407799672205e-05, "loss": 2.5837369918823243, "memory(GiB)": 58.3, "step": 12170, "token_acc": 0.4485049833887043, "train_speed(iter/s)": 1.450963 }, { "epoch": 0.5216143267212202, "grad_norm": 6.81789493560791, "learning_rate": 9.73386140545022e-05, "loss": 2.9007122039794924, "memory(GiB)": 58.3, "step": 12175, "token_acc": 0.3978102189781022, "train_speed(iter/s)": 1.450844 }, { "epoch": 0.5218285420504691, "grad_norm": 3.43770432472229, "learning_rate": 9.73364472842005e-05, "loss": 2.546364974975586, "memory(GiB)": 58.3, "step": 12180, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 1.450865 }, { "epoch": 0.5220427573797181, "grad_norm": 4.896144866943359, "learning_rate": 9.733427965635465e-05, "loss": 2.6740402221679687, "memory(GiB)": 58.3, "step": 12185, "token_acc": 0.4507936507936508, "train_speed(iter/s)": 1.450979 }, { "epoch": 0.5222569727089671, "grad_norm": 5.150900363922119, "learning_rate": 9.73321111710039e-05, "loss": 2.641109085083008, "memory(GiB)": 58.3, "step": 12190, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.45109 }, { "epoch": 0.522471188038216, "grad_norm": 4.9929046630859375, "learning_rate": 9.732994182818757e-05, "loss": 2.557425880432129, "memory(GiB)": 58.3, "step": 12195, "token_acc": 0.4370860927152318, "train_speed(iter/s)": 1.451104 }, { "epoch": 0.5226854033674649, "grad_norm": 3.6203742027282715, "learning_rate": 9.732777162794496e-05, "loss": 2.441172790527344, "memory(GiB)": 58.3, "step": 12200, "token_acc": 0.43174603174603177, "train_speed(iter/s)": 1.451158 }, { "epoch": 0.522899618696714, "grad_norm": 4.222485065460205, "learning_rate": 9.732560057031534e-05, "loss": 2.535055923461914, "memory(GiB)": 58.3, "step": 12205, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.451187 }, { "epoch": 0.5231138340259629, "grad_norm": 5.140800952911377, "learning_rate": 9.732342865533809e-05, "loss": 2.519024658203125, "memory(GiB)": 58.3, "step": 12210, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.451206 }, { "epoch": 0.5233280493552118, "grad_norm": 7.724201202392578, "learning_rate": 9.732125588305252e-05, "loss": 2.722677993774414, "memory(GiB)": 58.3, "step": 12215, "token_acc": 0.429042904290429, "train_speed(iter/s)": 1.451296 }, { "epoch": 0.5235422646844609, "grad_norm": 4.023663520812988, "learning_rate": 9.731908225349803e-05, "loss": 2.4608808517456056, "memory(GiB)": 58.3, "step": 12220, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.451089 }, { "epoch": 0.5237564800137098, "grad_norm": 3.1600120067596436, "learning_rate": 9.731690776671396e-05, "loss": 2.505036544799805, "memory(GiB)": 58.3, "step": 12225, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.451101 }, { "epoch": 0.5239706953429587, "grad_norm": 6.346956729888916, "learning_rate": 9.731473242273974e-05, "loss": 2.5708852767944337, "memory(GiB)": 58.3, "step": 12230, "token_acc": 0.4618181818181818, "train_speed(iter/s)": 1.451158 }, { "epoch": 0.5241849106722077, "grad_norm": 3.5241777896881104, "learning_rate": 9.731255622161474e-05, "loss": 2.58681640625, "memory(GiB)": 58.3, "step": 12235, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.451249 }, { "epoch": 0.5243991260014567, "grad_norm": 4.737888336181641, "learning_rate": 9.731037916337839e-05, "loss": 2.253646659851074, "memory(GiB)": 58.3, "step": 12240, "token_acc": 0.5341365461847389, "train_speed(iter/s)": 1.451189 }, { "epoch": 0.5246133413307056, "grad_norm": 3.948641300201416, "learning_rate": 9.730820124807017e-05, "loss": 2.4099130630493164, "memory(GiB)": 58.3, "step": 12245, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.451249 }, { "epoch": 0.5248275566599546, "grad_norm": 4.104923725128174, "learning_rate": 9.730602247572948e-05, "loss": 2.530244827270508, "memory(GiB)": 58.3, "step": 12250, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.451226 }, { "epoch": 0.5250417719892035, "grad_norm": 4.411331653594971, "learning_rate": 9.730384284639584e-05, "loss": 2.8195140838623045, "memory(GiB)": 58.3, "step": 12255, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.451425 }, { "epoch": 0.5252559873184525, "grad_norm": 4.188760757446289, "learning_rate": 9.730166236010869e-05, "loss": 2.518187141418457, "memory(GiB)": 58.3, "step": 12260, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.451508 }, { "epoch": 0.5254702026477015, "grad_norm": 5.70860481262207, "learning_rate": 9.729948101690757e-05, "loss": 2.4894786834716798, "memory(GiB)": 58.3, "step": 12265, "token_acc": 0.5203252032520326, "train_speed(iter/s)": 1.451656 }, { "epoch": 0.5256844179769504, "grad_norm": 4.290028095245361, "learning_rate": 9.729729881683197e-05, "loss": 2.852960395812988, "memory(GiB)": 58.3, "step": 12270, "token_acc": 0.4372623574144487, "train_speed(iter/s)": 1.451747 }, { "epoch": 0.5258986333061993, "grad_norm": 5.143388271331787, "learning_rate": 9.729511575992145e-05, "loss": 2.5350303649902344, "memory(GiB)": 58.3, "step": 12275, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.451699 }, { "epoch": 0.5261128486354484, "grad_norm": 4.118575096130371, "learning_rate": 9.729293184621552e-05, "loss": 2.620782661437988, "memory(GiB)": 58.3, "step": 12280, "token_acc": 0.4716417910447761, "train_speed(iter/s)": 1.451748 }, { "epoch": 0.5263270639646973, "grad_norm": 3.6123764514923096, "learning_rate": 9.729074707575377e-05, "loss": 2.578450012207031, "memory(GiB)": 58.3, "step": 12285, "token_acc": 0.4676923076923077, "train_speed(iter/s)": 1.451596 }, { "epoch": 0.5265412792939462, "grad_norm": 5.424049377441406, "learning_rate": 9.728856144857579e-05, "loss": 2.8362285614013674, "memory(GiB)": 58.3, "step": 12290, "token_acc": 0.42574257425742573, "train_speed(iter/s)": 1.451664 }, { "epoch": 0.5267554946231953, "grad_norm": 4.256664276123047, "learning_rate": 9.728637496472115e-05, "loss": 2.4898361206054687, "memory(GiB)": 58.3, "step": 12295, "token_acc": 0.45255474452554745, "train_speed(iter/s)": 1.451589 }, { "epoch": 0.5269697099524442, "grad_norm": 6.542172431945801, "learning_rate": 9.728418762422947e-05, "loss": 2.6374752044677736, "memory(GiB)": 58.3, "step": 12300, "token_acc": 0.3870967741935484, "train_speed(iter/s)": 1.451678 }, { "epoch": 0.5271839252816931, "grad_norm": 3.748230218887329, "learning_rate": 9.728199942714036e-05, "loss": 2.5267093658447264, "memory(GiB)": 58.3, "step": 12305, "token_acc": 0.5050167224080268, "train_speed(iter/s)": 1.451577 }, { "epoch": 0.5273981406109421, "grad_norm": 3.6318066120147705, "learning_rate": 9.727981037349349e-05, "loss": 2.425985336303711, "memory(GiB)": 58.3, "step": 12310, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.451635 }, { "epoch": 0.5276123559401911, "grad_norm": 6.197627067565918, "learning_rate": 9.727762046332851e-05, "loss": 2.6292070388793944, "memory(GiB)": 58.3, "step": 12315, "token_acc": 0.4686192468619247, "train_speed(iter/s)": 1.451699 }, { "epoch": 0.52782657126944, "grad_norm": 4.748229026794434, "learning_rate": 9.727542969668507e-05, "loss": 2.525190734863281, "memory(GiB)": 58.3, "step": 12320, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.451718 }, { "epoch": 0.528040786598689, "grad_norm": 4.905789852142334, "learning_rate": 9.72732380736029e-05, "loss": 2.280447769165039, "memory(GiB)": 58.3, "step": 12325, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.451725 }, { "epoch": 0.528255001927938, "grad_norm": 5.149145603179932, "learning_rate": 9.727104559412163e-05, "loss": 2.813428497314453, "memory(GiB)": 58.3, "step": 12330, "token_acc": 0.4246575342465753, "train_speed(iter/s)": 1.45169 }, { "epoch": 0.5284692172571869, "grad_norm": 4.573469161987305, "learning_rate": 9.726885225828106e-05, "loss": 2.6495691299438477, "memory(GiB)": 58.3, "step": 12335, "token_acc": 0.44402985074626866, "train_speed(iter/s)": 1.451755 }, { "epoch": 0.5286834325864359, "grad_norm": 4.365417957305908, "learning_rate": 9.726665806612087e-05, "loss": 2.465804100036621, "memory(GiB)": 58.3, "step": 12340, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 1.451658 }, { "epoch": 0.5288976479156848, "grad_norm": 3.845370292663574, "learning_rate": 9.726446301768085e-05, "loss": 2.7484079360961915, "memory(GiB)": 58.3, "step": 12345, "token_acc": 0.46273291925465837, "train_speed(iter/s)": 1.451639 }, { "epoch": 0.5291118632449338, "grad_norm": 3.590235471725464, "learning_rate": 9.726226711300074e-05, "loss": 2.6285877227783203, "memory(GiB)": 58.3, "step": 12350, "token_acc": 0.40584415584415584, "train_speed(iter/s)": 1.451685 }, { "epoch": 0.5293260785741828, "grad_norm": 4.054399013519287, "learning_rate": 9.726007035212033e-05, "loss": 2.486698532104492, "memory(GiB)": 58.3, "step": 12355, "token_acc": 0.44694533762057875, "train_speed(iter/s)": 1.451579 }, { "epoch": 0.5295402939034317, "grad_norm": 5.202297687530518, "learning_rate": 9.725787273507939e-05, "loss": 2.4875463485717773, "memory(GiB)": 58.3, "step": 12360, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.451589 }, { "epoch": 0.5297545092326806, "grad_norm": 4.829860210418701, "learning_rate": 9.725567426191777e-05, "loss": 2.2026813507080076, "memory(GiB)": 58.3, "step": 12365, "token_acc": 0.5458015267175572, "train_speed(iter/s)": 1.45161 }, { "epoch": 0.5299687245619297, "grad_norm": 6.145134449005127, "learning_rate": 9.725347493267529e-05, "loss": 2.4365734100341796, "memory(GiB)": 58.3, "step": 12370, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.451589 }, { "epoch": 0.5301829398911786, "grad_norm": 5.194461822509766, "learning_rate": 9.725127474739176e-05, "loss": 2.5591482162475585, "memory(GiB)": 58.3, "step": 12375, "token_acc": 0.41379310344827586, "train_speed(iter/s)": 1.451754 }, { "epoch": 0.5303971552204275, "grad_norm": 3.0579028129577637, "learning_rate": 9.724907370610707e-05, "loss": 2.559821891784668, "memory(GiB)": 58.3, "step": 12380, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.451796 }, { "epoch": 0.5306113705496766, "grad_norm": 4.266158580780029, "learning_rate": 9.724687180886109e-05, "loss": 2.6127849578857423, "memory(GiB)": 58.3, "step": 12385, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.451756 }, { "epoch": 0.5308255858789255, "grad_norm": 5.282445430755615, "learning_rate": 9.724466905569372e-05, "loss": 2.6352052688598633, "memory(GiB)": 58.3, "step": 12390, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.451722 }, { "epoch": 0.5310398012081745, "grad_norm": 4.208840370178223, "learning_rate": 9.724246544664484e-05, "loss": 2.904363250732422, "memory(GiB)": 58.3, "step": 12395, "token_acc": 0.41471571906354515, "train_speed(iter/s)": 1.451688 }, { "epoch": 0.5312540165374234, "grad_norm": 3.660950183868408, "learning_rate": 9.724026098175438e-05, "loss": 2.935175323486328, "memory(GiB)": 58.3, "step": 12400, "token_acc": 0.4363143631436314, "train_speed(iter/s)": 1.451635 }, { "epoch": 0.5314682318666724, "grad_norm": 5.2977519035339355, "learning_rate": 9.723805566106227e-05, "loss": 2.7877475738525392, "memory(GiB)": 58.3, "step": 12405, "token_acc": 0.44649446494464945, "train_speed(iter/s)": 1.451639 }, { "epoch": 0.5316824471959214, "grad_norm": 4.768824577331543, "learning_rate": 9.723584948460848e-05, "loss": 2.896093559265137, "memory(GiB)": 58.3, "step": 12410, "token_acc": 0.42424242424242425, "train_speed(iter/s)": 1.451483 }, { "epoch": 0.5318966625251703, "grad_norm": 5.282357215881348, "learning_rate": 9.723364245243296e-05, "loss": 2.7335073471069338, "memory(GiB)": 58.3, "step": 12415, "token_acc": 0.4423076923076923, "train_speed(iter/s)": 1.451515 }, { "epoch": 0.5321108778544192, "grad_norm": 6.384718894958496, "learning_rate": 9.723143456457571e-05, "loss": 2.750216484069824, "memory(GiB)": 58.3, "step": 12420, "token_acc": 0.4296028880866426, "train_speed(iter/s)": 1.451517 }, { "epoch": 0.5323250931836683, "grad_norm": 5.642190933227539, "learning_rate": 9.72292258210767e-05, "loss": 2.60189208984375, "memory(GiB)": 58.3, "step": 12425, "token_acc": 0.5043103448275862, "train_speed(iter/s)": 1.451657 }, { "epoch": 0.5325393085129172, "grad_norm": 5.021167755126953, "learning_rate": 9.722701622197596e-05, "loss": 2.67369384765625, "memory(GiB)": 58.3, "step": 12430, "token_acc": 0.4664429530201342, "train_speed(iter/s)": 1.451786 }, { "epoch": 0.5327535238421661, "grad_norm": 5.418324947357178, "learning_rate": 9.722480576731354e-05, "loss": 2.748818206787109, "memory(GiB)": 58.3, "step": 12435, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.45174 }, { "epoch": 0.5329677391714152, "grad_norm": 4.710017681121826, "learning_rate": 9.722259445712945e-05, "loss": 2.6717979431152346, "memory(GiB)": 58.3, "step": 12440, "token_acc": 0.45660377358490567, "train_speed(iter/s)": 1.451499 }, { "epoch": 0.5331819545006641, "grad_norm": 4.8615031242370605, "learning_rate": 9.722038229146375e-05, "loss": 2.6751766204833984, "memory(GiB)": 58.3, "step": 12445, "token_acc": 0.4383116883116883, "train_speed(iter/s)": 1.451323 }, { "epoch": 0.533396169829913, "grad_norm": 4.771420001983643, "learning_rate": 9.721816927035654e-05, "loss": 2.6626399993896483, "memory(GiB)": 58.3, "step": 12450, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.451212 }, { "epoch": 0.533610385159162, "grad_norm": 5.13985538482666, "learning_rate": 9.721595539384791e-05, "loss": 2.873168182373047, "memory(GiB)": 58.3, "step": 12455, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.45132 }, { "epoch": 0.533824600488411, "grad_norm": 3.6609761714935303, "learning_rate": 9.721374066197795e-05, "loss": 2.7462268829345704, "memory(GiB)": 58.3, "step": 12460, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 1.451409 }, { "epoch": 0.5340388158176599, "grad_norm": 6.595832824707031, "learning_rate": 9.721152507478677e-05, "loss": 2.399384117126465, "memory(GiB)": 58.3, "step": 12465, "token_acc": 0.4855072463768116, "train_speed(iter/s)": 1.451458 }, { "epoch": 0.5342530311469089, "grad_norm": 3.449709415435791, "learning_rate": 9.720930863231454e-05, "loss": 2.1814918518066406, "memory(GiB)": 58.3, "step": 12470, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 1.451342 }, { "epoch": 0.5344672464761578, "grad_norm": 4.599881172180176, "learning_rate": 9.72070913346014e-05, "loss": 2.8916391372680663, "memory(GiB)": 58.3, "step": 12475, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.451297 }, { "epoch": 0.5346814618054068, "grad_norm": 4.440439224243164, "learning_rate": 9.720487318168752e-05, "loss": 2.7131805419921875, "memory(GiB)": 58.3, "step": 12480, "token_acc": 0.4690909090909091, "train_speed(iter/s)": 1.451327 }, { "epoch": 0.5348956771346558, "grad_norm": 4.202776908874512, "learning_rate": 9.720265417361306e-05, "loss": 2.696521759033203, "memory(GiB)": 58.3, "step": 12485, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.451387 }, { "epoch": 0.5351098924639047, "grad_norm": 4.543173789978027, "learning_rate": 9.720043431041825e-05, "loss": 2.8303363800048826, "memory(GiB)": 58.3, "step": 12490, "token_acc": 0.42955326460481097, "train_speed(iter/s)": 1.451482 }, { "epoch": 0.5353241077931536, "grad_norm": 3.8472940921783447, "learning_rate": 9.719821359214331e-05, "loss": 2.638447952270508, "memory(GiB)": 58.3, "step": 12495, "token_acc": 0.4908424908424908, "train_speed(iter/s)": 1.451592 }, { "epoch": 0.5355383231224027, "grad_norm": 6.116992950439453, "learning_rate": 9.719599201882845e-05, "loss": 2.5670450210571287, "memory(GiB)": 58.3, "step": 12500, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 1.45172 }, { "epoch": 0.5355383231224027, "eval_loss": 1.9528933763504028, "eval_runtime": 13.6132, "eval_samples_per_second": 7.346, "eval_steps_per_second": 7.346, "eval_token_acc": 0.5, "step": 12500 }, { "epoch": 0.5357525384516516, "grad_norm": 4.405342102050781, "learning_rate": 9.71937695905139e-05, "loss": 2.6192043304443358, "memory(GiB)": 58.3, "step": 12505, "token_acc": 0.4769392033542977, "train_speed(iter/s)": 1.449136 }, { "epoch": 0.5359667537809005, "grad_norm": 4.714395046234131, "learning_rate": 9.719154630723996e-05, "loss": 2.841592025756836, "memory(GiB)": 58.3, "step": 12510, "token_acc": 0.4608433734939759, "train_speed(iter/s)": 1.449103 }, { "epoch": 0.5361809691101496, "grad_norm": 4.565947532653809, "learning_rate": 9.718932216904689e-05, "loss": 2.6948455810546874, "memory(GiB)": 58.3, "step": 12515, "token_acc": 0.42248062015503873, "train_speed(iter/s)": 1.449295 }, { "epoch": 0.5363951844393985, "grad_norm": 5.4101996421813965, "learning_rate": 9.718709717597498e-05, "loss": 2.7623340606689455, "memory(GiB)": 58.3, "step": 12520, "token_acc": 0.42045454545454547, "train_speed(iter/s)": 1.449142 }, { "epoch": 0.5366093997686474, "grad_norm": 3.920884132385254, "learning_rate": 9.718487132806453e-05, "loss": 2.7843671798706056, "memory(GiB)": 58.3, "step": 12525, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.449283 }, { "epoch": 0.5368236150978964, "grad_norm": 3.6120080947875977, "learning_rate": 9.71826446253559e-05, "loss": 2.5558847427368163, "memory(GiB)": 58.3, "step": 12530, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.449465 }, { "epoch": 0.5370378304271454, "grad_norm": 4.816959857940674, "learning_rate": 9.718041706788937e-05, "loss": 2.7386369705200195, "memory(GiB)": 58.3, "step": 12535, "token_acc": 0.47701149425287354, "train_speed(iter/s)": 1.449543 }, { "epoch": 0.5372520457563943, "grad_norm": 3.709352731704712, "learning_rate": 9.717818865570534e-05, "loss": 2.7484809875488283, "memory(GiB)": 58.3, "step": 12540, "token_acc": 0.4481707317073171, "train_speed(iter/s)": 1.449612 }, { "epoch": 0.5374662610856433, "grad_norm": 3.888885736465454, "learning_rate": 9.717595938884416e-05, "loss": 2.809458541870117, "memory(GiB)": 58.3, "step": 12545, "token_acc": 0.43843843843843844, "train_speed(iter/s)": 1.449496 }, { "epoch": 0.5376804764148922, "grad_norm": 4.670475482940674, "learning_rate": 9.717372926734623e-05, "loss": 3.0961544036865236, "memory(GiB)": 58.3, "step": 12550, "token_acc": 0.37942122186495175, "train_speed(iter/s)": 1.449608 }, { "epoch": 0.5378946917441412, "grad_norm": 4.811488628387451, "learning_rate": 9.717149829125194e-05, "loss": 2.8523914337158205, "memory(GiB)": 58.3, "step": 12555, "token_acc": 0.44802867383512546, "train_speed(iter/s)": 1.449638 }, { "epoch": 0.5381089070733902, "grad_norm": 5.6197428703308105, "learning_rate": 9.71692664606017e-05, "loss": 2.5427404403686524, "memory(GiB)": 58.3, "step": 12560, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.449733 }, { "epoch": 0.5383231224026391, "grad_norm": 5.28272819519043, "learning_rate": 9.716703377543595e-05, "loss": 2.851909637451172, "memory(GiB)": 58.3, "step": 12565, "token_acc": 0.4454828660436137, "train_speed(iter/s)": 1.449798 }, { "epoch": 0.538537337731888, "grad_norm": 4.3687591552734375, "learning_rate": 9.716480023579516e-05, "loss": 2.531698226928711, "memory(GiB)": 58.3, "step": 12570, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.449789 }, { "epoch": 0.5387515530611371, "grad_norm": 3.5607926845550537, "learning_rate": 9.716256584171974e-05, "loss": 2.599342918395996, "memory(GiB)": 58.3, "step": 12575, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.449691 }, { "epoch": 0.538965768390386, "grad_norm": 4.8093085289001465, "learning_rate": 9.71603305932502e-05, "loss": 2.6707332611083983, "memory(GiB)": 58.3, "step": 12580, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.449786 }, { "epoch": 0.5391799837196349, "grad_norm": 3.1623497009277344, "learning_rate": 9.715809449042706e-05, "loss": 2.695049285888672, "memory(GiB)": 58.3, "step": 12585, "token_acc": 0.44744744744744747, "train_speed(iter/s)": 1.449648 }, { "epoch": 0.539394199048884, "grad_norm": 4.399327278137207, "learning_rate": 9.715585753329077e-05, "loss": 2.579286575317383, "memory(GiB)": 58.3, "step": 12590, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.449724 }, { "epoch": 0.5396084143781329, "grad_norm": 5.561838150024414, "learning_rate": 9.715361972188191e-05, "loss": 2.3928447723388673, "memory(GiB)": 58.3, "step": 12595, "token_acc": 0.48828125, "train_speed(iter/s)": 1.44968 }, { "epoch": 0.5398226297073818, "grad_norm": 4.708993434906006, "learning_rate": 9.715138105624097e-05, "loss": 2.908864212036133, "memory(GiB)": 58.3, "step": 12600, "token_acc": 0.4104938271604938, "train_speed(iter/s)": 1.449764 }, { "epoch": 0.5400368450366309, "grad_norm": 6.453427314758301, "learning_rate": 9.714914153640854e-05, "loss": 2.405851364135742, "memory(GiB)": 58.3, "step": 12605, "token_acc": 0.4942084942084942, "train_speed(iter/s)": 1.449807 }, { "epoch": 0.5402510603658798, "grad_norm": 7.729811191558838, "learning_rate": 9.714690116242518e-05, "loss": 2.809157943725586, "memory(GiB)": 58.3, "step": 12610, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.449851 }, { "epoch": 0.5404652756951287, "grad_norm": 3.0981762409210205, "learning_rate": 9.714465993433148e-05, "loss": 3.0204233169555663, "memory(GiB)": 58.3, "step": 12615, "token_acc": 0.44542772861356933, "train_speed(iter/s)": 1.449977 }, { "epoch": 0.5406794910243777, "grad_norm": 4.510212421417236, "learning_rate": 9.714241785216804e-05, "loss": 2.4773700714111326, "memory(GiB)": 58.3, "step": 12620, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.450007 }, { "epoch": 0.5408937063536267, "grad_norm": 3.8259317874908447, "learning_rate": 9.714017491597548e-05, "loss": 2.7129066467285154, "memory(GiB)": 58.3, "step": 12625, "token_acc": 0.4621212121212121, "train_speed(iter/s)": 1.450023 }, { "epoch": 0.5411079216828756, "grad_norm": 4.733958721160889, "learning_rate": 9.713793112579443e-05, "loss": 2.3526430130004883, "memory(GiB)": 58.3, "step": 12630, "token_acc": 0.5100401606425703, "train_speed(iter/s)": 1.45012 }, { "epoch": 0.5413221370121246, "grad_norm": 4.693044185638428, "learning_rate": 9.713568648166555e-05, "loss": 2.585987091064453, "memory(GiB)": 58.3, "step": 12635, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.450247 }, { "epoch": 0.5415363523413735, "grad_norm": 5.047013759613037, "learning_rate": 9.713344098362948e-05, "loss": 3.068511962890625, "memory(GiB)": 58.3, "step": 12640, "token_acc": 0.40977443609022557, "train_speed(iter/s)": 1.450129 }, { "epoch": 0.5417505676706225, "grad_norm": 3.8072783946990967, "learning_rate": 9.71311946317269e-05, "loss": 2.135857582092285, "memory(GiB)": 58.3, "step": 12645, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.450077 }, { "epoch": 0.5419647829998715, "grad_norm": 3.554363489151001, "learning_rate": 9.712894742599854e-05, "loss": 2.5715707778930663, "memory(GiB)": 58.3, "step": 12650, "token_acc": 0.46254071661237783, "train_speed(iter/s)": 1.450074 }, { "epoch": 0.5421789983291204, "grad_norm": 3.843611240386963, "learning_rate": 9.712669936648507e-05, "loss": 2.515215301513672, "memory(GiB)": 58.3, "step": 12655, "token_acc": 0.4559386973180077, "train_speed(iter/s)": 1.45004 }, { "epoch": 0.5423932136583693, "grad_norm": 5.072500228881836, "learning_rate": 9.712445045322725e-05, "loss": 2.6251152038574217, "memory(GiB)": 58.3, "step": 12660, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.450226 }, { "epoch": 0.5426074289876184, "grad_norm": 5.165890693664551, "learning_rate": 9.712220068626578e-05, "loss": 2.5669994354248047, "memory(GiB)": 58.3, "step": 12665, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.450245 }, { "epoch": 0.5428216443168673, "grad_norm": 7.052614688873291, "learning_rate": 9.711995006564145e-05, "loss": 2.443429946899414, "memory(GiB)": 58.3, "step": 12670, "token_acc": 0.4979919678714859, "train_speed(iter/s)": 1.450332 }, { "epoch": 0.5430358596461162, "grad_norm": 4.854347229003906, "learning_rate": 9.711769859139503e-05, "loss": 2.286446952819824, "memory(GiB)": 58.3, "step": 12675, "token_acc": 0.5258620689655172, "train_speed(iter/s)": 1.450368 }, { "epoch": 0.5432500749753653, "grad_norm": 4.551116466522217, "learning_rate": 9.71154462635673e-05, "loss": 2.7156274795532225, "memory(GiB)": 58.3, "step": 12680, "token_acc": 0.4751552795031056, "train_speed(iter/s)": 1.450338 }, { "epoch": 0.5434642903046142, "grad_norm": 4.808742046356201, "learning_rate": 9.711319308219906e-05, "loss": 2.6372392654418944, "memory(GiB)": 58.3, "step": 12685, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.450455 }, { "epoch": 0.5436785056338631, "grad_norm": 3.753126859664917, "learning_rate": 9.711093904733113e-05, "loss": 2.5791189193725588, "memory(GiB)": 58.3, "step": 12690, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.450517 }, { "epoch": 0.5438927209631121, "grad_norm": 3.754671573638916, "learning_rate": 9.710868415900433e-05, "loss": 2.4302879333496095, "memory(GiB)": 58.3, "step": 12695, "token_acc": 0.45955882352941174, "train_speed(iter/s)": 1.450481 }, { "epoch": 0.5441069362923611, "grad_norm": 4.710944175720215, "learning_rate": 9.710642841725954e-05, "loss": 2.624073791503906, "memory(GiB)": 58.3, "step": 12700, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.450594 }, { "epoch": 0.54432115162161, "grad_norm": 6.307151794433594, "learning_rate": 9.710417182213762e-05, "loss": 2.7054271697998047, "memory(GiB)": 58.3, "step": 12705, "token_acc": 0.46875, "train_speed(iter/s)": 1.450679 }, { "epoch": 0.544535366950859, "grad_norm": 7.1200361251831055, "learning_rate": 9.710191437367942e-05, "loss": 2.6211063385009767, "memory(GiB)": 58.3, "step": 12710, "token_acc": 0.4323308270676692, "train_speed(iter/s)": 1.450773 }, { "epoch": 0.544749582280108, "grad_norm": 4.841012001037598, "learning_rate": 9.709965607192589e-05, "loss": 2.5262451171875, "memory(GiB)": 58.3, "step": 12715, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.450709 }, { "epoch": 0.5449637976093569, "grad_norm": 3.5466911792755127, "learning_rate": 9.709739691691787e-05, "loss": 2.6461902618408204, "memory(GiB)": 58.3, "step": 12720, "token_acc": 0.4659400544959128, "train_speed(iter/s)": 1.450779 }, { "epoch": 0.5451780129386059, "grad_norm": 4.010507106781006, "learning_rate": 9.709513690869634e-05, "loss": 2.596065139770508, "memory(GiB)": 58.3, "step": 12725, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.450676 }, { "epoch": 0.5453922282678548, "grad_norm": 3.871396780014038, "learning_rate": 9.709287604730222e-05, "loss": 2.5589183807373046, "memory(GiB)": 58.3, "step": 12730, "token_acc": 0.4380165289256198, "train_speed(iter/s)": 1.450752 }, { "epoch": 0.5456064435971039, "grad_norm": 4.161299705505371, "learning_rate": 9.709061433277647e-05, "loss": 2.575978088378906, "memory(GiB)": 58.3, "step": 12735, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.450644 }, { "epoch": 0.5458206589263528, "grad_norm": 5.430095672607422, "learning_rate": 9.708835176516008e-05, "loss": 2.4091758728027344, "memory(GiB)": 58.3, "step": 12740, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.450437 }, { "epoch": 0.5460348742556017, "grad_norm": 5.0184245109558105, "learning_rate": 9.708608834449402e-05, "loss": 2.7799776077270506, "memory(GiB)": 58.3, "step": 12745, "token_acc": 0.4637096774193548, "train_speed(iter/s)": 1.450624 }, { "epoch": 0.5462490895848507, "grad_norm": 7.726380825042725, "learning_rate": 9.708382407081929e-05, "loss": 2.339492988586426, "memory(GiB)": 58.3, "step": 12750, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.450706 }, { "epoch": 0.5464633049140997, "grad_norm": 5.161066055297852, "learning_rate": 9.708155894417693e-05, "loss": 2.641633224487305, "memory(GiB)": 58.3, "step": 12755, "token_acc": 0.46, "train_speed(iter/s)": 1.450709 }, { "epoch": 0.5466775202433486, "grad_norm": 3.77683424949646, "learning_rate": 9.707929296460796e-05, "loss": 2.5671504974365233, "memory(GiB)": 58.3, "step": 12760, "token_acc": 0.47953216374269003, "train_speed(iter/s)": 1.450634 }, { "epoch": 0.5468917355725976, "grad_norm": 5.168262958526611, "learning_rate": 9.707702613215344e-05, "loss": 2.7779285430908205, "memory(GiB)": 58.3, "step": 12765, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.450815 }, { "epoch": 0.5471059509018465, "grad_norm": 4.115829944610596, "learning_rate": 9.707475844685443e-05, "loss": 2.7260311126708983, "memory(GiB)": 58.3, "step": 12770, "token_acc": 0.46981627296587924, "train_speed(iter/s)": 1.450769 }, { "epoch": 0.5473201662310955, "grad_norm": 4.663558006286621, "learning_rate": 9.7072489908752e-05, "loss": 2.793963623046875, "memory(GiB)": 58.3, "step": 12775, "token_acc": 0.4245614035087719, "train_speed(iter/s)": 1.450838 }, { "epoch": 0.5475343815603445, "grad_norm": 3.709885835647583, "learning_rate": 9.707022051788725e-05, "loss": 2.8072214126586914, "memory(GiB)": 58.3, "step": 12780, "token_acc": 0.4431818181818182, "train_speed(iter/s)": 1.450803 }, { "epoch": 0.5477485968895934, "grad_norm": 4.756540298461914, "learning_rate": 9.706795027430133e-05, "loss": 2.7352317810058593, "memory(GiB)": 58.3, "step": 12785, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.450834 }, { "epoch": 0.5479628122188424, "grad_norm": 4.949060916900635, "learning_rate": 9.706567917803532e-05, "loss": 2.598540115356445, "memory(GiB)": 58.3, "step": 12790, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.450694 }, { "epoch": 0.5481770275480914, "grad_norm": 4.608774185180664, "learning_rate": 9.706340722913037e-05, "loss": 2.7334430694580076, "memory(GiB)": 58.3, "step": 12795, "token_acc": 0.48923076923076925, "train_speed(iter/s)": 1.450803 }, { "epoch": 0.5483912428773403, "grad_norm": 3.5583767890930176, "learning_rate": 9.706113442762768e-05, "loss": 2.512015533447266, "memory(GiB)": 58.3, "step": 12800, "token_acc": 0.5236363636363637, "train_speed(iter/s)": 1.450884 }, { "epoch": 0.5486054582065892, "grad_norm": 5.349429130554199, "learning_rate": 9.705886077356837e-05, "loss": 2.61452693939209, "memory(GiB)": 58.3, "step": 12805, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.451024 }, { "epoch": 0.5488196735358383, "grad_norm": 3.242171287536621, "learning_rate": 9.705658626699365e-05, "loss": 2.4409656524658203, "memory(GiB)": 58.3, "step": 12810, "token_acc": 0.4884393063583815, "train_speed(iter/s)": 1.450948 }, { "epoch": 0.5490338888650872, "grad_norm": 3.8514227867126465, "learning_rate": 9.705431090794472e-05, "loss": 2.807894706726074, "memory(GiB)": 58.3, "step": 12815, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.451088 }, { "epoch": 0.5492481041943361, "grad_norm": 4.336859703063965, "learning_rate": 9.705203469646282e-05, "loss": 2.4014379501342775, "memory(GiB)": 58.3, "step": 12820, "token_acc": 0.5188284518828452, "train_speed(iter/s)": 1.451135 }, { "epoch": 0.5494623195235852, "grad_norm": 5.311072826385498, "learning_rate": 9.704975763258919e-05, "loss": 2.6091053009033205, "memory(GiB)": 58.3, "step": 12825, "token_acc": 0.45188284518828453, "train_speed(iter/s)": 1.451138 }, { "epoch": 0.5496765348528341, "grad_norm": 5.685634613037109, "learning_rate": 9.704747971636504e-05, "loss": 2.8710981369018556, "memory(GiB)": 58.3, "step": 12830, "token_acc": 0.45318352059925093, "train_speed(iter/s)": 1.451154 }, { "epoch": 0.549890750182083, "grad_norm": 4.341067790985107, "learning_rate": 9.704520094783167e-05, "loss": 2.6993837356567383, "memory(GiB)": 58.3, "step": 12835, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.451161 }, { "epoch": 0.550104965511332, "grad_norm": 4.665333271026611, "learning_rate": 9.704292132703034e-05, "loss": 2.406839370727539, "memory(GiB)": 58.3, "step": 12840, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.45117 }, { "epoch": 0.550319180840581, "grad_norm": 4.147494316101074, "learning_rate": 9.704064085400237e-05, "loss": 2.5593090057373047, "memory(GiB)": 58.3, "step": 12845, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.451185 }, { "epoch": 0.5505333961698299, "grad_norm": 8.908293724060059, "learning_rate": 9.703835952878908e-05, "loss": 2.672134208679199, "memory(GiB)": 58.3, "step": 12850, "token_acc": 0.45110410094637227, "train_speed(iter/s)": 1.451197 }, { "epoch": 0.5507476114990789, "grad_norm": 5.46074104309082, "learning_rate": 9.703607735143176e-05, "loss": 2.5653865814208983, "memory(GiB)": 58.3, "step": 12855, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.451356 }, { "epoch": 0.5509618268283278, "grad_norm": 5.539285659790039, "learning_rate": 9.703379432197178e-05, "loss": 2.5961772918701174, "memory(GiB)": 58.3, "step": 12860, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.45141 }, { "epoch": 0.5511760421575768, "grad_norm": 3.4348926544189453, "learning_rate": 9.70315104404505e-05, "loss": 2.52852783203125, "memory(GiB)": 58.3, "step": 12865, "token_acc": 0.453125, "train_speed(iter/s)": 1.451285 }, { "epoch": 0.5513902574868258, "grad_norm": 4.793821811676025, "learning_rate": 9.70292257069093e-05, "loss": 2.3236888885498046, "memory(GiB)": 58.3, "step": 12870, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 1.451377 }, { "epoch": 0.5516044728160747, "grad_norm": 4.095990180969238, "learning_rate": 9.702694012138953e-05, "loss": 2.354779815673828, "memory(GiB)": 58.3, "step": 12875, "token_acc": 0.43450479233226835, "train_speed(iter/s)": 1.451371 }, { "epoch": 0.5518186881453236, "grad_norm": 3.842207193374634, "learning_rate": 9.702465368393264e-05, "loss": 2.506814956665039, "memory(GiB)": 58.3, "step": 12880, "token_acc": 0.48757763975155277, "train_speed(iter/s)": 1.451259 }, { "epoch": 0.5520329034745727, "grad_norm": 8.617725372314453, "learning_rate": 9.702236639458003e-05, "loss": 2.399854850769043, "memory(GiB)": 58.3, "step": 12885, "token_acc": 0.434640522875817, "train_speed(iter/s)": 1.45107 }, { "epoch": 0.5522471188038216, "grad_norm": 4.077536582946777, "learning_rate": 9.702007825337316e-05, "loss": 2.788869094848633, "memory(GiB)": 58.3, "step": 12890, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.45096 }, { "epoch": 0.5524613341330705, "grad_norm": 4.308690547943115, "learning_rate": 9.701778926035344e-05, "loss": 2.6484283447265624, "memory(GiB)": 58.3, "step": 12895, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.45103 }, { "epoch": 0.5526755494623196, "grad_norm": 4.967340469360352, "learning_rate": 9.701549941556238e-05, "loss": 2.627827453613281, "memory(GiB)": 58.3, "step": 12900, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.450966 }, { "epoch": 0.5528897647915685, "grad_norm": 3.6724588871002197, "learning_rate": 9.701320871904143e-05, "loss": 2.8314285278320312, "memory(GiB)": 58.3, "step": 12905, "token_acc": 0.444141689373297, "train_speed(iter/s)": 1.451098 }, { "epoch": 0.5531039801208174, "grad_norm": 4.921066761016846, "learning_rate": 9.70109171708321e-05, "loss": 2.488862228393555, "memory(GiB)": 58.3, "step": 12910, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.451267 }, { "epoch": 0.5533181954500664, "grad_norm": 3.338116407394409, "learning_rate": 9.700862477097592e-05, "loss": 2.708690643310547, "memory(GiB)": 58.3, "step": 12915, "token_acc": 0.44931506849315067, "train_speed(iter/s)": 1.451401 }, { "epoch": 0.5535324107793154, "grad_norm": 5.2589874267578125, "learning_rate": 9.700633151951441e-05, "loss": 2.536569595336914, "memory(GiB)": 58.3, "step": 12920, "token_acc": 0.5054151624548736, "train_speed(iter/s)": 1.451434 }, { "epoch": 0.5537466261085643, "grad_norm": 4.2836785316467285, "learning_rate": 9.700403741648909e-05, "loss": 2.6208091735839845, "memory(GiB)": 58.3, "step": 12925, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.451421 }, { "epoch": 0.5539608414378133, "grad_norm": 4.430039882659912, "learning_rate": 9.700174246194155e-05, "loss": 2.7207714080810548, "memory(GiB)": 58.3, "step": 12930, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.451375 }, { "epoch": 0.5541750567670622, "grad_norm": 4.568837642669678, "learning_rate": 9.699944665591336e-05, "loss": 2.876577377319336, "memory(GiB)": 58.3, "step": 12935, "token_acc": 0.4377358490566038, "train_speed(iter/s)": 1.45135 }, { "epoch": 0.5543892720963112, "grad_norm": 4.635406494140625, "learning_rate": 9.699714999844608e-05, "loss": 2.5632776260375976, "memory(GiB)": 58.3, "step": 12940, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.451482 }, { "epoch": 0.5546034874255602, "grad_norm": 3.9506866931915283, "learning_rate": 9.699485248958137e-05, "loss": 3.1370044708251954, "memory(GiB)": 58.3, "step": 12945, "token_acc": 0.40524781341107874, "train_speed(iter/s)": 1.451616 }, { "epoch": 0.5548177027548091, "grad_norm": 4.8641133308410645, "learning_rate": 9.69925541293608e-05, "loss": 2.539725494384766, "memory(GiB)": 58.3, "step": 12950, "token_acc": 0.512, "train_speed(iter/s)": 1.451677 }, { "epoch": 0.555031918084058, "grad_norm": 4.505123138427734, "learning_rate": 9.699025491782606e-05, "loss": 2.677878570556641, "memory(GiB)": 58.3, "step": 12955, "token_acc": 0.43795620437956206, "train_speed(iter/s)": 1.451786 }, { "epoch": 0.5552461334133071, "grad_norm": 9.601716995239258, "learning_rate": 9.698795485501873e-05, "loss": 2.6862897872924805, "memory(GiB)": 58.3, "step": 12960, "token_acc": 0.43700787401574803, "train_speed(iter/s)": 1.451697 }, { "epoch": 0.555460348742556, "grad_norm": 4.412141799926758, "learning_rate": 9.698565394098054e-05, "loss": 2.8015703201293944, "memory(GiB)": 58.3, "step": 12965, "token_acc": 0.41637010676156583, "train_speed(iter/s)": 1.451778 }, { "epoch": 0.5556745640718049, "grad_norm": 4.217938423156738, "learning_rate": 9.698335217575316e-05, "loss": 2.674403762817383, "memory(GiB)": 58.3, "step": 12970, "token_acc": 0.44696969696969696, "train_speed(iter/s)": 1.451799 }, { "epoch": 0.555888779401054, "grad_norm": 4.214227676391602, "learning_rate": 9.698104955937827e-05, "loss": 2.4468994140625, "memory(GiB)": 58.3, "step": 12975, "token_acc": 0.4435146443514644, "train_speed(iter/s)": 1.451741 }, { "epoch": 0.5561029947303029, "grad_norm": 4.063323497772217, "learning_rate": 9.697874609189759e-05, "loss": 2.7058795928955077, "memory(GiB)": 58.3, "step": 12980, "token_acc": 0.4377104377104377, "train_speed(iter/s)": 1.451781 }, { "epoch": 0.5563172100595518, "grad_norm": 5.039035797119141, "learning_rate": 9.697644177335287e-05, "loss": 2.4908977508544923, "memory(GiB)": 58.3, "step": 12985, "token_acc": 0.46875, "train_speed(iter/s)": 1.45187 }, { "epoch": 0.5565314253888008, "grad_norm": 4.5800604820251465, "learning_rate": 9.697413660378584e-05, "loss": 2.4564802169799806, "memory(GiB)": 58.3, "step": 12990, "token_acc": 0.4573643410852713, "train_speed(iter/s)": 1.451969 }, { "epoch": 0.5567456407180498, "grad_norm": 5.657577991485596, "learning_rate": 9.697183058323826e-05, "loss": 2.9000558853149414, "memory(GiB)": 58.3, "step": 12995, "token_acc": 0.43617021276595747, "train_speed(iter/s)": 1.451962 }, { "epoch": 0.5569598560472987, "grad_norm": 4.3039655685424805, "learning_rate": 9.69695237117519e-05, "loss": 2.6337804794311523, "memory(GiB)": 58.3, "step": 13000, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.451979 }, { "epoch": 0.5569598560472987, "eval_loss": 2.31534481048584, "eval_runtime": 15.1459, "eval_samples_per_second": 6.602, "eval_steps_per_second": 6.602, "eval_token_acc": 0.45041322314049587, "step": 13000 }, { "epoch": 0.5571740713765477, "grad_norm": 4.509006977081299, "learning_rate": 9.696721598936856e-05, "loss": 2.721030426025391, "memory(GiB)": 58.3, "step": 13005, "token_acc": 0.4441233140655106, "train_speed(iter/s)": 1.449492 }, { "epoch": 0.5573882867057967, "grad_norm": 6.866149425506592, "learning_rate": 9.696490741613002e-05, "loss": 2.7085390090942383, "memory(GiB)": 58.3, "step": 13010, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.449568 }, { "epoch": 0.5576025020350456, "grad_norm": 3.926135778427124, "learning_rate": 9.696259799207816e-05, "loss": 2.2735612869262694, "memory(GiB)": 58.3, "step": 13015, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.449593 }, { "epoch": 0.5578167173642946, "grad_norm": 4.030943870544434, "learning_rate": 9.696028771725476e-05, "loss": 2.5962993621826174, "memory(GiB)": 58.3, "step": 13020, "token_acc": 0.4501510574018127, "train_speed(iter/s)": 1.449596 }, { "epoch": 0.5580309326935435, "grad_norm": 4.363567352294922, "learning_rate": 9.695797659170171e-05, "loss": 2.469046974182129, "memory(GiB)": 58.3, "step": 13025, "token_acc": 0.45429362880886426, "train_speed(iter/s)": 1.449647 }, { "epoch": 0.5582451480227925, "grad_norm": 3.859976291656494, "learning_rate": 9.695566461546086e-05, "loss": 2.4266117095947264, "memory(GiB)": 58.3, "step": 13030, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.449571 }, { "epoch": 0.5584593633520415, "grad_norm": 3.6077330112457275, "learning_rate": 9.695335178857409e-05, "loss": 2.3070953369140623, "memory(GiB)": 58.3, "step": 13035, "token_acc": 0.5175097276264592, "train_speed(iter/s)": 1.449596 }, { "epoch": 0.5586735786812904, "grad_norm": 3.4801628589630127, "learning_rate": 9.695103811108333e-05, "loss": 2.861098861694336, "memory(GiB)": 58.3, "step": 13040, "token_acc": 0.41754385964912283, "train_speed(iter/s)": 1.449626 }, { "epoch": 0.5588877940105393, "grad_norm": 4.068077564239502, "learning_rate": 9.694872358303044e-05, "loss": 2.5247350692749024, "memory(GiB)": 58.3, "step": 13045, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.449642 }, { "epoch": 0.5591020093397884, "grad_norm": 7.038202285766602, "learning_rate": 9.69464082044574e-05, "loss": 2.351951789855957, "memory(GiB)": 58.3, "step": 13050, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.449544 }, { "epoch": 0.5593162246690373, "grad_norm": 2.840205669403076, "learning_rate": 9.694409197540613e-05, "loss": 2.796143341064453, "memory(GiB)": 58.3, "step": 13055, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.449556 }, { "epoch": 0.5595304399982862, "grad_norm": 4.22896671295166, "learning_rate": 9.69417748959186e-05, "loss": 2.82263298034668, "memory(GiB)": 58.3, "step": 13060, "token_acc": 0.42258064516129035, "train_speed(iter/s)": 1.449657 }, { "epoch": 0.5597446553275353, "grad_norm": 4.6137003898620605, "learning_rate": 9.693945696603678e-05, "loss": 2.875554084777832, "memory(GiB)": 58.3, "step": 13065, "token_acc": 0.4317343173431734, "train_speed(iter/s)": 1.449754 }, { "epoch": 0.5599588706567842, "grad_norm": 4.3805694580078125, "learning_rate": 9.693713818580265e-05, "loss": 2.7376270294189453, "memory(GiB)": 58.3, "step": 13070, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.449686 }, { "epoch": 0.5601730859860332, "grad_norm": 3.852978467941284, "learning_rate": 9.693481855525826e-05, "loss": 2.638378143310547, "memory(GiB)": 58.3, "step": 13075, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.449615 }, { "epoch": 0.5603873013152821, "grad_norm": 4.143217086791992, "learning_rate": 9.693249807444559e-05, "loss": 2.5436470031738283, "memory(GiB)": 58.3, "step": 13080, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.449628 }, { "epoch": 0.5606015166445311, "grad_norm": 4.595394134521484, "learning_rate": 9.693017674340669e-05, "loss": 2.449018096923828, "memory(GiB)": 58.3, "step": 13085, "token_acc": 0.4981684981684982, "train_speed(iter/s)": 1.449735 }, { "epoch": 0.5608157319737801, "grad_norm": 5.33073616027832, "learning_rate": 9.69278545621836e-05, "loss": 2.654311752319336, "memory(GiB)": 58.3, "step": 13090, "token_acc": 0.45149253731343286, "train_speed(iter/s)": 1.449899 }, { "epoch": 0.561029947303029, "grad_norm": 5.2792558670043945, "learning_rate": 9.692553153081842e-05, "loss": 2.7055185317993162, "memory(GiB)": 58.3, "step": 13095, "token_acc": 0.46613545816733065, "train_speed(iter/s)": 1.449902 }, { "epoch": 0.5612441626322779, "grad_norm": 6.160458087921143, "learning_rate": 9.692320764935322e-05, "loss": 2.9085187911987305, "memory(GiB)": 58.3, "step": 13100, "token_acc": 0.376425855513308, "train_speed(iter/s)": 1.44995 }, { "epoch": 0.561458377961527, "grad_norm": 4.582027435302734, "learning_rate": 9.692088291783009e-05, "loss": 2.349048614501953, "memory(GiB)": 58.3, "step": 13105, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.450039 }, { "epoch": 0.5616725932907759, "grad_norm": 4.76365327835083, "learning_rate": 9.691855733629115e-05, "loss": 2.264979934692383, "memory(GiB)": 58.3, "step": 13110, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.450036 }, { "epoch": 0.5618868086200248, "grad_norm": 6.014882564544678, "learning_rate": 9.691623090477852e-05, "loss": 2.1806407928466798, "memory(GiB)": 58.3, "step": 13115, "token_acc": 0.5083333333333333, "train_speed(iter/s)": 1.450114 }, { "epoch": 0.5621010239492739, "grad_norm": 4.662960529327393, "learning_rate": 9.691390362333437e-05, "loss": 2.566688346862793, "memory(GiB)": 58.3, "step": 13120, "token_acc": 0.47289156626506024, "train_speed(iter/s)": 1.450212 }, { "epoch": 0.5623152392785228, "grad_norm": 4.14929723739624, "learning_rate": 9.691157549200085e-05, "loss": 3.0323604583740233, "memory(GiB)": 58.3, "step": 13125, "token_acc": 0.4, "train_speed(iter/s)": 1.450343 }, { "epoch": 0.5625294546077717, "grad_norm": 3.6472291946411133, "learning_rate": 9.690924651082014e-05, "loss": 2.609294891357422, "memory(GiB)": 58.3, "step": 13130, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.450296 }, { "epoch": 0.5627436699370207, "grad_norm": 6.209442615509033, "learning_rate": 9.69069166798344e-05, "loss": 2.5425687789916993, "memory(GiB)": 58.3, "step": 13135, "token_acc": 0.4628099173553719, "train_speed(iter/s)": 1.450398 }, { "epoch": 0.5629578852662697, "grad_norm": 4.023408889770508, "learning_rate": 9.690458599908588e-05, "loss": 2.6907154083251954, "memory(GiB)": 58.3, "step": 13140, "token_acc": 0.40784313725490196, "train_speed(iter/s)": 1.450402 }, { "epoch": 0.5631721005955186, "grad_norm": 3.9270732402801514, "learning_rate": 9.690225446861678e-05, "loss": 2.7321781158447265, "memory(GiB)": 58.3, "step": 13145, "token_acc": 0.45323741007194246, "train_speed(iter/s)": 1.450516 }, { "epoch": 0.5633863159247676, "grad_norm": 5.218646049499512, "learning_rate": 9.689992208846934e-05, "loss": 2.55145206451416, "memory(GiB)": 58.3, "step": 13150, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.450647 }, { "epoch": 0.5636005312540165, "grad_norm": 4.007495880126953, "learning_rate": 9.689758885868582e-05, "loss": 2.489964485168457, "memory(GiB)": 58.3, "step": 13155, "token_acc": 0.47794117647058826, "train_speed(iter/s)": 1.450663 }, { "epoch": 0.5638147465832655, "grad_norm": 3.621326208114624, "learning_rate": 9.689525477930848e-05, "loss": 2.556059646606445, "memory(GiB)": 58.3, "step": 13160, "token_acc": 0.4597014925373134, "train_speed(iter/s)": 1.450761 }, { "epoch": 0.5640289619125145, "grad_norm": 4.4627838134765625, "learning_rate": 9.68929198503796e-05, "loss": 2.153653144836426, "memory(GiB)": 58.3, "step": 13165, "token_acc": 0.5205992509363296, "train_speed(iter/s)": 1.45079 }, { "epoch": 0.5642431772417634, "grad_norm": 4.90310525894165, "learning_rate": 9.689058407194152e-05, "loss": 2.8451507568359373, "memory(GiB)": 58.3, "step": 13170, "token_acc": 0.44, "train_speed(iter/s)": 1.450798 }, { "epoch": 0.5644573925710124, "grad_norm": 4.354957580566406, "learning_rate": 9.688824744403648e-05, "loss": 3.0005460739135743, "memory(GiB)": 58.3, "step": 13175, "token_acc": 0.39501779359430605, "train_speed(iter/s)": 1.450831 }, { "epoch": 0.5646716079002614, "grad_norm": 3.2764077186584473, "learning_rate": 9.688590996670688e-05, "loss": 2.5787830352783203, "memory(GiB)": 58.3, "step": 13180, "token_acc": 0.44765342960288806, "train_speed(iter/s)": 1.45096 }, { "epoch": 0.5648858232295103, "grad_norm": 4.579800128936768, "learning_rate": 9.688357163999504e-05, "loss": 2.542692184448242, "memory(GiB)": 58.3, "step": 13185, "token_acc": 0.4756554307116105, "train_speed(iter/s)": 1.451041 }, { "epoch": 0.5651000385587592, "grad_norm": 4.687442779541016, "learning_rate": 9.68812324639433e-05, "loss": 2.341318893432617, "memory(GiB)": 58.3, "step": 13190, "token_acc": 0.47808764940239046, "train_speed(iter/s)": 1.451113 }, { "epoch": 0.5653142538880083, "grad_norm": 4.083542823791504, "learning_rate": 9.687889243859407e-05, "loss": 2.4852067947387697, "memory(GiB)": 58.3, "step": 13195, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.451116 }, { "epoch": 0.5655284692172572, "grad_norm": 5.367619037628174, "learning_rate": 9.687655156398972e-05, "loss": 2.3516740798950195, "memory(GiB)": 58.3, "step": 13200, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.451069 }, { "epoch": 0.5657426845465061, "grad_norm": 5.573348522186279, "learning_rate": 9.687420984017268e-05, "loss": 2.6972415924072264, "memory(GiB)": 58.3, "step": 13205, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.451185 }, { "epoch": 0.5659568998757551, "grad_norm": 4.3836774826049805, "learning_rate": 9.687186726718533e-05, "loss": 2.289139747619629, "memory(GiB)": 58.3, "step": 13210, "token_acc": 0.5296610169491526, "train_speed(iter/s)": 1.451183 }, { "epoch": 0.5661711152050041, "grad_norm": 4.545618057250977, "learning_rate": 9.686952384507017e-05, "loss": 3.006224822998047, "memory(GiB)": 58.3, "step": 13215, "token_acc": 0.43666666666666665, "train_speed(iter/s)": 1.451122 }, { "epoch": 0.566385330534253, "grad_norm": 6.145777225494385, "learning_rate": 9.686717957386959e-05, "loss": 2.8068992614746096, "memory(GiB)": 58.3, "step": 13220, "token_acc": 0.45394736842105265, "train_speed(iter/s)": 1.451137 }, { "epoch": 0.566599545863502, "grad_norm": 4.864924430847168, "learning_rate": 9.686483445362611e-05, "loss": 2.664941596984863, "memory(GiB)": 58.3, "step": 13225, "token_acc": 0.4318181818181818, "train_speed(iter/s)": 1.45113 }, { "epoch": 0.566813761192751, "grad_norm": 4.378489971160889, "learning_rate": 9.686248848438217e-05, "loss": 2.6865907669067384, "memory(GiB)": 58.3, "step": 13230, "token_acc": 0.4778481012658228, "train_speed(iter/s)": 1.451299 }, { "epoch": 0.5670279765219999, "grad_norm": 3.1974430084228516, "learning_rate": 9.68601416661803e-05, "loss": 2.5594970703125, "memory(GiB)": 58.3, "step": 13235, "token_acc": 0.4489795918367347, "train_speed(iter/s)": 1.451306 }, { "epoch": 0.5672421918512489, "grad_norm": 3.9626917839050293, "learning_rate": 9.6857793999063e-05, "loss": 2.546051025390625, "memory(GiB)": 58.3, "step": 13240, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.451345 }, { "epoch": 0.5674564071804978, "grad_norm": 4.272452354431152, "learning_rate": 9.685544548307281e-05, "loss": 2.6064426422119142, "memory(GiB)": 58.3, "step": 13245, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.451342 }, { "epoch": 0.5676706225097468, "grad_norm": 8.32752513885498, "learning_rate": 9.685309611825226e-05, "loss": 2.732959175109863, "memory(GiB)": 58.3, "step": 13250, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.451461 }, { "epoch": 0.5678848378389958, "grad_norm": 4.122401714324951, "learning_rate": 9.685074590464394e-05, "loss": 2.6859151840209963, "memory(GiB)": 58.3, "step": 13255, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.451412 }, { "epoch": 0.5680990531682447, "grad_norm": 3.7030677795410156, "learning_rate": 9.68483948422904e-05, "loss": 2.4719486236572266, "memory(GiB)": 58.3, "step": 13260, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.451446 }, { "epoch": 0.5683132684974936, "grad_norm": 4.952679634094238, "learning_rate": 9.684604293123425e-05, "loss": 2.449727249145508, "memory(GiB)": 58.3, "step": 13265, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.451449 }, { "epoch": 0.5685274838267427, "grad_norm": 4.573519229888916, "learning_rate": 9.684369017151806e-05, "loss": 2.691756820678711, "memory(GiB)": 58.3, "step": 13270, "token_acc": 0.45387453874538747, "train_speed(iter/s)": 1.451499 }, { "epoch": 0.5687416991559916, "grad_norm": 3.881869077682495, "learning_rate": 9.68413365631845e-05, "loss": 2.715593719482422, "memory(GiB)": 58.3, "step": 13275, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.451538 }, { "epoch": 0.5689559144852405, "grad_norm": 3.503706216812134, "learning_rate": 9.68389821062762e-05, "loss": 2.540977668762207, "memory(GiB)": 58.3, "step": 13280, "token_acc": 0.45878136200716846, "train_speed(iter/s)": 1.451594 }, { "epoch": 0.5691701298144896, "grad_norm": 5.725363254547119, "learning_rate": 9.683662680083578e-05, "loss": 2.4994224548339843, "memory(GiB)": 58.3, "step": 13285, "token_acc": 0.49603174603174605, "train_speed(iter/s)": 1.451763 }, { "epoch": 0.5693843451437385, "grad_norm": 4.425648212432861, "learning_rate": 9.683427064690593e-05, "loss": 2.2694299697875975, "memory(GiB)": 58.3, "step": 13290, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 1.451885 }, { "epoch": 0.5695985604729874, "grad_norm": 3.9148318767547607, "learning_rate": 9.683191364452934e-05, "loss": 2.630241584777832, "memory(GiB)": 58.3, "step": 13295, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.452041 }, { "epoch": 0.5698127758022364, "grad_norm": 4.354046821594238, "learning_rate": 9.68295557937487e-05, "loss": 2.615975570678711, "memory(GiB)": 58.3, "step": 13300, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.452075 }, { "epoch": 0.5700269911314854, "grad_norm": 3.723904609680176, "learning_rate": 9.682719709460672e-05, "loss": 2.825900650024414, "memory(GiB)": 58.3, "step": 13305, "token_acc": 0.4507042253521127, "train_speed(iter/s)": 1.452179 }, { "epoch": 0.5702412064607343, "grad_norm": 3.485006809234619, "learning_rate": 9.682483754714615e-05, "loss": 2.4839103698730467, "memory(GiB)": 58.3, "step": 13310, "token_acc": 0.484375, "train_speed(iter/s)": 1.452157 }, { "epoch": 0.5704554217899833, "grad_norm": 5.618500232696533, "learning_rate": 9.682247715140974e-05, "loss": 2.64285945892334, "memory(GiB)": 58.3, "step": 13315, "token_acc": 0.49224806201550386, "train_speed(iter/s)": 1.452165 }, { "epoch": 0.5706696371192322, "grad_norm": 4.187808036804199, "learning_rate": 9.68201159074402e-05, "loss": 2.7868972778320313, "memory(GiB)": 58.3, "step": 13320, "token_acc": 0.4222873900293255, "train_speed(iter/s)": 1.452189 }, { "epoch": 0.5708838524484812, "grad_norm": 5.1764397621154785, "learning_rate": 9.681775381528034e-05, "loss": 2.9550350189208983, "memory(GiB)": 58.3, "step": 13325, "token_acc": 0.4054878048780488, "train_speed(iter/s)": 1.452265 }, { "epoch": 0.5710980677777302, "grad_norm": 4.35557746887207, "learning_rate": 9.681539087497296e-05, "loss": 2.9351924896240233, "memory(GiB)": 58.3, "step": 13330, "token_acc": 0.3776978417266187, "train_speed(iter/s)": 1.45238 }, { "epoch": 0.5713122831069791, "grad_norm": 3.86228346824646, "learning_rate": 9.681302708656086e-05, "loss": 2.2315444946289062, "memory(GiB)": 58.3, "step": 13335, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.452455 }, { "epoch": 0.571526498436228, "grad_norm": 5.441580772399902, "learning_rate": 9.681066245008687e-05, "loss": 2.4947357177734375, "memory(GiB)": 58.3, "step": 13340, "token_acc": 0.4745098039215686, "train_speed(iter/s)": 1.452475 }, { "epoch": 0.5717407137654771, "grad_norm": 4.5027618408203125, "learning_rate": 9.68082969655938e-05, "loss": 2.5578094482421876, "memory(GiB)": 58.3, "step": 13345, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.452481 }, { "epoch": 0.571954929094726, "grad_norm": 5.213467597961426, "learning_rate": 9.680593063312454e-05, "loss": 2.4859502792358397, "memory(GiB)": 58.3, "step": 13350, "token_acc": 0.43853820598006643, "train_speed(iter/s)": 1.452356 }, { "epoch": 0.5721691444239749, "grad_norm": 4.5291242599487305, "learning_rate": 9.680356345272193e-05, "loss": 2.4200429916381836, "memory(GiB)": 58.3, "step": 13355, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.452537 }, { "epoch": 0.572383359753224, "grad_norm": 5.182436943054199, "learning_rate": 9.680119542442885e-05, "loss": 2.8287117004394533, "memory(GiB)": 58.3, "step": 13360, "token_acc": 0.39666666666666667, "train_speed(iter/s)": 1.452691 }, { "epoch": 0.5725975750824729, "grad_norm": 4.105990886688232, "learning_rate": 9.679882654828822e-05, "loss": 2.871139144897461, "memory(GiB)": 58.3, "step": 13365, "token_acc": 0.3916349809885932, "train_speed(iter/s)": 1.452636 }, { "epoch": 0.5728117904117218, "grad_norm": 4.6735711097717285, "learning_rate": 9.679645682434295e-05, "loss": 2.658009910583496, "memory(GiB)": 58.3, "step": 13370, "token_acc": 0.4355400696864111, "train_speed(iter/s)": 1.452619 }, { "epoch": 0.5730260057409708, "grad_norm": 4.655219078063965, "learning_rate": 9.679408625263596e-05, "loss": 3.003887176513672, "memory(GiB)": 58.3, "step": 13375, "token_acc": 0.40058479532163743, "train_speed(iter/s)": 1.452713 }, { "epoch": 0.5732402210702198, "grad_norm": 3.323198080062866, "learning_rate": 9.679171483321022e-05, "loss": 2.4898963928222657, "memory(GiB)": 58.3, "step": 13380, "token_acc": 0.4980544747081712, "train_speed(iter/s)": 1.45276 }, { "epoch": 0.5734544363994687, "grad_norm": 9.692115783691406, "learning_rate": 9.678934256610864e-05, "loss": 2.6398977279663085, "memory(GiB)": 58.3, "step": 13385, "token_acc": 0.45625, "train_speed(iter/s)": 1.452801 }, { "epoch": 0.5736686517287177, "grad_norm": 6.924264907836914, "learning_rate": 9.678696945137424e-05, "loss": 2.2850440979003905, "memory(GiB)": 58.3, "step": 13390, "token_acc": 0.4789915966386555, "train_speed(iter/s)": 1.452856 }, { "epoch": 0.5738828670579667, "grad_norm": 3.8131535053253174, "learning_rate": 9.678459548905001e-05, "loss": 2.346193313598633, "memory(GiB)": 58.3, "step": 13395, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.452807 }, { "epoch": 0.5740970823872156, "grad_norm": 4.590564250946045, "learning_rate": 9.678222067917893e-05, "loss": 2.387767219543457, "memory(GiB)": 58.3, "step": 13400, "token_acc": 0.4979253112033195, "train_speed(iter/s)": 1.452786 }, { "epoch": 0.5743112977164646, "grad_norm": 4.188224792480469, "learning_rate": 9.677984502180405e-05, "loss": 2.72305965423584, "memory(GiB)": 58.3, "step": 13405, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.452682 }, { "epoch": 0.5745255130457135, "grad_norm": 3.7970058917999268, "learning_rate": 9.677746851696838e-05, "loss": 2.6192707061767577, "memory(GiB)": 58.3, "step": 13410, "token_acc": 0.45871559633027525, "train_speed(iter/s)": 1.452724 }, { "epoch": 0.5747397283749626, "grad_norm": 3.8336691856384277, "learning_rate": 9.6775091164715e-05, "loss": 2.6549158096313477, "memory(GiB)": 58.3, "step": 13415, "token_acc": 0.47289156626506024, "train_speed(iter/s)": 1.452819 }, { "epoch": 0.5749539437042115, "grad_norm": 4.714938163757324, "learning_rate": 9.677271296508697e-05, "loss": 2.6755447387695312, "memory(GiB)": 58.3, "step": 13420, "token_acc": 0.4357142857142857, "train_speed(iter/s)": 1.45289 }, { "epoch": 0.5751681590334604, "grad_norm": 5.060439109802246, "learning_rate": 9.677033391812736e-05, "loss": 2.711924362182617, "memory(GiB)": 58.3, "step": 13425, "token_acc": 0.4730290456431535, "train_speed(iter/s)": 1.452918 }, { "epoch": 0.5753823743627094, "grad_norm": 5.794318675994873, "learning_rate": 9.676795402387927e-05, "loss": 2.855929374694824, "memory(GiB)": 58.3, "step": 13430, "token_acc": 0.46381578947368424, "train_speed(iter/s)": 1.452941 }, { "epoch": 0.5755965896919584, "grad_norm": 6.0406951904296875, "learning_rate": 9.676557328238582e-05, "loss": 2.692300796508789, "memory(GiB)": 58.3, "step": 13435, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.453039 }, { "epoch": 0.5758108050212073, "grad_norm": 4.528384685516357, "learning_rate": 9.676319169369016e-05, "loss": 2.774568557739258, "memory(GiB)": 58.3, "step": 13440, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.45302 }, { "epoch": 0.5760250203504563, "grad_norm": 4.644454002380371, "learning_rate": 9.67608092578354e-05, "loss": 2.7741506576538084, "memory(GiB)": 58.3, "step": 13445, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.453008 }, { "epoch": 0.5762392356797053, "grad_norm": 4.520377159118652, "learning_rate": 9.675842597486471e-05, "loss": 2.6387094497680663, "memory(GiB)": 58.3, "step": 13450, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.45299 }, { "epoch": 0.5764534510089542, "grad_norm": 3.4945321083068848, "learning_rate": 9.675604184482129e-05, "loss": 2.656844711303711, "memory(GiB)": 58.3, "step": 13455, "token_acc": 0.4525316455696203, "train_speed(iter/s)": 1.453088 }, { "epoch": 0.5766676663382032, "grad_norm": 3.8861958980560303, "learning_rate": 9.67536568677483e-05, "loss": 2.69125919342041, "memory(GiB)": 58.3, "step": 13460, "token_acc": 0.4608150470219436, "train_speed(iter/s)": 1.453107 }, { "epoch": 0.5768818816674521, "grad_norm": 4.431004047393799, "learning_rate": 9.675127104368895e-05, "loss": 2.7479740142822267, "memory(GiB)": 58.3, "step": 13465, "token_acc": 0.46303501945525294, "train_speed(iter/s)": 1.453098 }, { "epoch": 0.5770960969967011, "grad_norm": 3.8639848232269287, "learning_rate": 9.674888437268648e-05, "loss": 3.028243637084961, "memory(GiB)": 58.3, "step": 13470, "token_acc": 0.4275092936802974, "train_speed(iter/s)": 1.453157 }, { "epoch": 0.5773103123259501, "grad_norm": 4.122154712677002, "learning_rate": 9.674649685478412e-05, "loss": 2.6113021850585936, "memory(GiB)": 58.3, "step": 13475, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.45307 }, { "epoch": 0.577524527655199, "grad_norm": 4.971024036407471, "learning_rate": 9.67441084900251e-05, "loss": 2.4940296173095704, "memory(GiB)": 58.3, "step": 13480, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.453074 }, { "epoch": 0.5777387429844479, "grad_norm": 4.5956130027771, "learning_rate": 9.674171927845272e-05, "loss": 2.6294580459594727, "memory(GiB)": 58.3, "step": 13485, "token_acc": 0.4591549295774648, "train_speed(iter/s)": 1.453102 }, { "epoch": 0.577952958313697, "grad_norm": 5.023074626922607, "learning_rate": 9.673932922011024e-05, "loss": 2.645197868347168, "memory(GiB)": 58.3, "step": 13490, "token_acc": 0.4437869822485207, "train_speed(iter/s)": 1.453032 }, { "epoch": 0.5781671736429459, "grad_norm": 4.95644474029541, "learning_rate": 9.673693831504096e-05, "loss": 2.6930280685424806, "memory(GiB)": 58.3, "step": 13495, "token_acc": 0.4349442379182156, "train_speed(iter/s)": 1.453145 }, { "epoch": 0.5783813889721948, "grad_norm": 6.048409461975098, "learning_rate": 9.673454656328823e-05, "loss": 2.411910057067871, "memory(GiB)": 58.3, "step": 13500, "token_acc": 0.5265017667844523, "train_speed(iter/s)": 1.453176 }, { "epoch": 0.5783813889721948, "eval_loss": 2.1747255325317383, "eval_runtime": 14.1219, "eval_samples_per_second": 7.081, "eval_steps_per_second": 7.081, "eval_token_acc": 0.4956268221574344, "step": 13500 }, { "epoch": 0.5785956043014439, "grad_norm": 5.690944671630859, "learning_rate": 9.673215396489531e-05, "loss": 2.3216552734375, "memory(GiB)": 58.3, "step": 13505, "token_acc": 0.5005452562704471, "train_speed(iter/s)": 1.450877 }, { "epoch": 0.5788098196306928, "grad_norm": 6.579805850982666, "learning_rate": 9.67297605199056e-05, "loss": 2.703226089477539, "memory(GiB)": 58.3, "step": 13510, "token_acc": 0.4708333333333333, "train_speed(iter/s)": 1.450922 }, { "epoch": 0.5790240349599417, "grad_norm": 4.9528656005859375, "learning_rate": 9.672736622836245e-05, "loss": 2.45620174407959, "memory(GiB)": 58.3, "step": 13515, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.450966 }, { "epoch": 0.5792382502891907, "grad_norm": 3.16727614402771, "learning_rate": 9.672497109030922e-05, "loss": 2.566771125793457, "memory(GiB)": 58.3, "step": 13520, "token_acc": 0.4623955431754875, "train_speed(iter/s)": 1.451063 }, { "epoch": 0.5794524656184397, "grad_norm": 3.9224820137023926, "learning_rate": 9.67225751057893e-05, "loss": 2.4791086196899412, "memory(GiB)": 58.3, "step": 13525, "token_acc": 0.4576923076923077, "train_speed(iter/s)": 1.451165 }, { "epoch": 0.5796666809476886, "grad_norm": 4.189754962921143, "learning_rate": 9.672017827484611e-05, "loss": 2.561331367492676, "memory(GiB)": 58.3, "step": 13530, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.451234 }, { "epoch": 0.5798808962769376, "grad_norm": 4.538440704345703, "learning_rate": 9.671778059752305e-05, "loss": 2.524432373046875, "memory(GiB)": 58.3, "step": 13535, "token_acc": 0.5, "train_speed(iter/s)": 1.451346 }, { "epoch": 0.5800951116061865, "grad_norm": 5.3694376945495605, "learning_rate": 9.67153820738636e-05, "loss": 2.5391387939453125, "memory(GiB)": 58.3, "step": 13540, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.451429 }, { "epoch": 0.5803093269354355, "grad_norm": 5.829346179962158, "learning_rate": 9.671298270391114e-05, "loss": 2.6271434783935548, "memory(GiB)": 58.3, "step": 13545, "token_acc": 0.45151515151515154, "train_speed(iter/s)": 1.451451 }, { "epoch": 0.5805235422646845, "grad_norm": 4.982487678527832, "learning_rate": 9.671058248770922e-05, "loss": 2.541544532775879, "memory(GiB)": 58.3, "step": 13550, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.451254 }, { "epoch": 0.5807377575939334, "grad_norm": 4.437562942504883, "learning_rate": 9.670818142530125e-05, "loss": 2.4570968627929686, "memory(GiB)": 58.3, "step": 13555, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.450998 }, { "epoch": 0.5809519729231823, "grad_norm": 5.505538463592529, "learning_rate": 9.670577951673076e-05, "loss": 2.573525238037109, "memory(GiB)": 58.3, "step": 13560, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.45109 }, { "epoch": 0.5811661882524314, "grad_norm": 5.254263401031494, "learning_rate": 9.670337676204127e-05, "loss": 2.2848575592041014, "memory(GiB)": 58.3, "step": 13565, "token_acc": 0.5342465753424658, "train_speed(iter/s)": 1.451095 }, { "epoch": 0.5813804035816803, "grad_norm": 4.772819519042969, "learning_rate": 9.670097316127631e-05, "loss": 2.5541152954101562, "memory(GiB)": 58.3, "step": 13570, "token_acc": 0.4662756598240469, "train_speed(iter/s)": 1.451135 }, { "epoch": 0.5815946189109292, "grad_norm": 3.9432716369628906, "learning_rate": 9.66985687144794e-05, "loss": 2.6513044357299806, "memory(GiB)": 58.3, "step": 13575, "token_acc": 0.45555555555555555, "train_speed(iter/s)": 1.451187 }, { "epoch": 0.5818088342401783, "grad_norm": 4.397954940795898, "learning_rate": 9.66961634216941e-05, "loss": 2.5172119140625, "memory(GiB)": 58.3, "step": 13580, "token_acc": 0.46794871794871795, "train_speed(iter/s)": 1.451316 }, { "epoch": 0.5820230495694272, "grad_norm": 4.439431190490723, "learning_rate": 9.669375728296402e-05, "loss": 2.5462577819824217, "memory(GiB)": 58.3, "step": 13585, "token_acc": 0.4646153846153846, "train_speed(iter/s)": 1.451258 }, { "epoch": 0.5822372648986761, "grad_norm": 5.510266304016113, "learning_rate": 9.66913502983327e-05, "loss": 2.8166122436523438, "memory(GiB)": 58.3, "step": 13590, "token_acc": 0.436046511627907, "train_speed(iter/s)": 1.451351 }, { "epoch": 0.5824514802279251, "grad_norm": 4.007482051849365, "learning_rate": 9.668894246784378e-05, "loss": 2.554096984863281, "memory(GiB)": 58.3, "step": 13595, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.451545 }, { "epoch": 0.5826656955571741, "grad_norm": 5.670605182647705, "learning_rate": 9.668653379154086e-05, "loss": 2.6899135589599608, "memory(GiB)": 58.3, "step": 13600, "token_acc": 0.4155844155844156, "train_speed(iter/s)": 1.45173 }, { "epoch": 0.582879910886423, "grad_norm": 6.042681694030762, "learning_rate": 9.66841242694676e-05, "loss": 2.5125286102294924, "memory(GiB)": 58.3, "step": 13605, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.451881 }, { "epoch": 0.583094126215672, "grad_norm": 5.8735480308532715, "learning_rate": 9.668171390166763e-05, "loss": 2.506675148010254, "memory(GiB)": 58.3, "step": 13610, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.451908 }, { "epoch": 0.583308341544921, "grad_norm": 4.454692840576172, "learning_rate": 9.667930268818462e-05, "loss": 2.5537179946899413, "memory(GiB)": 58.3, "step": 13615, "token_acc": 0.4377224199288256, "train_speed(iter/s)": 1.451952 }, { "epoch": 0.5835225568741699, "grad_norm": 3.755244493484497, "learning_rate": 9.667689062906226e-05, "loss": 2.945166015625, "memory(GiB)": 58.3, "step": 13620, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.452108 }, { "epoch": 0.5837367722034189, "grad_norm": 3.8499667644500732, "learning_rate": 9.667447772434423e-05, "loss": 2.7186246871948243, "memory(GiB)": 58.3, "step": 13625, "token_acc": 0.4720670391061452, "train_speed(iter/s)": 1.452228 }, { "epoch": 0.5839509875326678, "grad_norm": 4.239790439605713, "learning_rate": 9.667206397407426e-05, "loss": 2.6613414764404295, "memory(GiB)": 58.3, "step": 13630, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.452201 }, { "epoch": 0.5841652028619168, "grad_norm": 4.446804046630859, "learning_rate": 9.666964937829606e-05, "loss": 2.292164611816406, "memory(GiB)": 58.3, "step": 13635, "token_acc": 0.5393258426966292, "train_speed(iter/s)": 1.452264 }, { "epoch": 0.5843794181911658, "grad_norm": 4.99111270904541, "learning_rate": 9.666723393705339e-05, "loss": 2.611367034912109, "memory(GiB)": 58.3, "step": 13640, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.452283 }, { "epoch": 0.5845936335204147, "grad_norm": 5.620711803436279, "learning_rate": 9.666481765038999e-05, "loss": 2.590936851501465, "memory(GiB)": 58.3, "step": 13645, "token_acc": 0.5054151624548736, "train_speed(iter/s)": 1.452348 }, { "epoch": 0.5848078488496636, "grad_norm": 3.6960132122039795, "learning_rate": 9.666240051834965e-05, "loss": 2.449190139770508, "memory(GiB)": 58.3, "step": 13650, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.452411 }, { "epoch": 0.5850220641789127, "grad_norm": 4.801359176635742, "learning_rate": 9.665998254097616e-05, "loss": 2.8072816848754885, "memory(GiB)": 58.3, "step": 13655, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.452259 }, { "epoch": 0.5852362795081616, "grad_norm": 6.511500358581543, "learning_rate": 9.66575637183133e-05, "loss": 2.620912551879883, "memory(GiB)": 58.3, "step": 13660, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.452318 }, { "epoch": 0.5854504948374105, "grad_norm": 4.2787652015686035, "learning_rate": 9.665514405040491e-05, "loss": 2.427029037475586, "memory(GiB)": 58.3, "step": 13665, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.4523 }, { "epoch": 0.5856647101666596, "grad_norm": 5.601728916168213, "learning_rate": 9.665272353729482e-05, "loss": 2.292038917541504, "memory(GiB)": 58.3, "step": 13670, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.452293 }, { "epoch": 0.5858789254959085, "grad_norm": 5.865860462188721, "learning_rate": 9.665030217902688e-05, "loss": 2.4172096252441406, "memory(GiB)": 58.3, "step": 13675, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.452235 }, { "epoch": 0.5860931408251574, "grad_norm": 4.575709342956543, "learning_rate": 9.664787997564496e-05, "loss": 2.5697406768798827, "memory(GiB)": 58.3, "step": 13680, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.452081 }, { "epoch": 0.5863073561544064, "grad_norm": 3.8804333209991455, "learning_rate": 9.664545692719293e-05, "loss": 2.788030815124512, "memory(GiB)": 58.3, "step": 13685, "token_acc": 0.4523076923076923, "train_speed(iter/s)": 1.452111 }, { "epoch": 0.5865215714836554, "grad_norm": 4.499039649963379, "learning_rate": 9.66430330337147e-05, "loss": 2.9364303588867187, "memory(GiB)": 58.3, "step": 13690, "token_acc": 0.4273255813953488, "train_speed(iter/s)": 1.452193 }, { "epoch": 0.5867357868129043, "grad_norm": 3.6702146530151367, "learning_rate": 9.664060829525416e-05, "loss": 2.805882453918457, "memory(GiB)": 58.3, "step": 13695, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.452258 }, { "epoch": 0.5869500021421533, "grad_norm": 2.9204540252685547, "learning_rate": 9.663818271185525e-05, "loss": 2.2844181060791016, "memory(GiB)": 58.3, "step": 13700, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.452122 }, { "epoch": 0.5871642174714022, "grad_norm": 4.366288661956787, "learning_rate": 9.663575628356191e-05, "loss": 2.7558219909667967, "memory(GiB)": 58.3, "step": 13705, "token_acc": 0.41479099678456594, "train_speed(iter/s)": 1.452145 }, { "epoch": 0.5873784328006512, "grad_norm": 3.9321718215942383, "learning_rate": 9.663332901041809e-05, "loss": 2.560669708251953, "memory(GiB)": 58.3, "step": 13710, "token_acc": 0.5, "train_speed(iter/s)": 1.452137 }, { "epoch": 0.5875926481299002, "grad_norm": 4.446691513061523, "learning_rate": 9.663090089246778e-05, "loss": 2.5037235260009765, "memory(GiB)": 58.3, "step": 13715, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.452131 }, { "epoch": 0.5878068634591491, "grad_norm": 5.75213098526001, "learning_rate": 9.662847192975496e-05, "loss": 2.622756004333496, "memory(GiB)": 58.3, "step": 13720, "token_acc": 0.5, "train_speed(iter/s)": 1.452148 }, { "epoch": 0.588021078788398, "grad_norm": 3.5477755069732666, "learning_rate": 9.662604212232362e-05, "loss": 2.5508455276489257, "memory(GiB)": 58.3, "step": 13725, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.452162 }, { "epoch": 0.5882352941176471, "grad_norm": 4.282444953918457, "learning_rate": 9.662361147021779e-05, "loss": 2.6218746185302733, "memory(GiB)": 58.3, "step": 13730, "token_acc": 0.45396825396825397, "train_speed(iter/s)": 1.452029 }, { "epoch": 0.588449509446896, "grad_norm": 6.025940895080566, "learning_rate": 9.662117997348151e-05, "loss": 2.7067113876342774, "memory(GiB)": 58.3, "step": 13735, "token_acc": 0.4275092936802974, "train_speed(iter/s)": 1.452049 }, { "epoch": 0.588663724776145, "grad_norm": 5.109095573425293, "learning_rate": 9.661874763215881e-05, "loss": 2.4650619506835936, "memory(GiB)": 58.3, "step": 13740, "token_acc": 0.49836065573770494, "train_speed(iter/s)": 1.452106 }, { "epoch": 0.588877940105394, "grad_norm": 3.9077296257019043, "learning_rate": 9.661631444629378e-05, "loss": 2.58374080657959, "memory(GiB)": 58.3, "step": 13745, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.452199 }, { "epoch": 0.5890921554346429, "grad_norm": 4.706995964050293, "learning_rate": 9.661388041593047e-05, "loss": 2.468706512451172, "memory(GiB)": 58.3, "step": 13750, "token_acc": 0.4770992366412214, "train_speed(iter/s)": 1.452121 }, { "epoch": 0.5893063707638919, "grad_norm": 6.8934197425842285, "learning_rate": 9.661144554111298e-05, "loss": 2.5233896255493162, "memory(GiB)": 58.3, "step": 13755, "token_acc": 0.4315068493150685, "train_speed(iter/s)": 1.452287 }, { "epoch": 0.5895205860931408, "grad_norm": 5.084076404571533, "learning_rate": 9.660900982188545e-05, "loss": 2.863209533691406, "memory(GiB)": 58.3, "step": 13760, "token_acc": 0.41721854304635764, "train_speed(iter/s)": 1.452329 }, { "epoch": 0.5897348014223898, "grad_norm": 4.910953998565674, "learning_rate": 9.660657325829198e-05, "loss": 2.768596649169922, "memory(GiB)": 58.3, "step": 13765, "token_acc": 0.4542124542124542, "train_speed(iter/s)": 1.452159 }, { "epoch": 0.5899490167516388, "grad_norm": 4.866161823272705, "learning_rate": 9.660413585037671e-05, "loss": 2.6919881820678713, "memory(GiB)": 58.3, "step": 13770, "token_acc": 0.44285714285714284, "train_speed(iter/s)": 1.452161 }, { "epoch": 0.5901632320808877, "grad_norm": 4.18895959854126, "learning_rate": 9.66016975981838e-05, "loss": 2.540228843688965, "memory(GiB)": 58.3, "step": 13775, "token_acc": 0.46099290780141844, "train_speed(iter/s)": 1.452056 }, { "epoch": 0.5903774474101366, "grad_norm": 3.4329934120178223, "learning_rate": 9.659925850175742e-05, "loss": 2.3678192138671874, "memory(GiB)": 58.3, "step": 13780, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.452087 }, { "epoch": 0.5905916627393857, "grad_norm": 5.119359016418457, "learning_rate": 9.659681856114177e-05, "loss": 2.637171173095703, "memory(GiB)": 58.3, "step": 13785, "token_acc": 0.4624505928853755, "train_speed(iter/s)": 1.452083 }, { "epoch": 0.5908058780686346, "grad_norm": 5.700256824493408, "learning_rate": 9.659437777638102e-05, "loss": 2.883258819580078, "memory(GiB)": 58.3, "step": 13790, "token_acc": 0.42962962962962964, "train_speed(iter/s)": 1.452093 }, { "epoch": 0.5910200933978835, "grad_norm": 5.514988899230957, "learning_rate": 9.659193614751942e-05, "loss": 2.347756195068359, "memory(GiB)": 58.3, "step": 13795, "token_acc": 0.528957528957529, "train_speed(iter/s)": 1.45209 }, { "epoch": 0.5912343087271326, "grad_norm": 5.882185935974121, "learning_rate": 9.658949367460119e-05, "loss": 2.6247514724731444, "memory(GiB)": 58.3, "step": 13800, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.452121 }, { "epoch": 0.5914485240563815, "grad_norm": 8.835413932800293, "learning_rate": 9.658705035767059e-05, "loss": 2.618433380126953, "memory(GiB)": 58.3, "step": 13805, "token_acc": 0.47651006711409394, "train_speed(iter/s)": 1.452129 }, { "epoch": 0.5916627393856304, "grad_norm": 4.470920085906982, "learning_rate": 9.658460619677185e-05, "loss": 2.285435676574707, "memory(GiB)": 58.3, "step": 13810, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.45213 }, { "epoch": 0.5918769547148794, "grad_norm": 4.125425815582275, "learning_rate": 9.658216119194929e-05, "loss": 2.206900978088379, "memory(GiB)": 58.3, "step": 13815, "token_acc": 0.5318352059925093, "train_speed(iter/s)": 1.452085 }, { "epoch": 0.5920911700441284, "grad_norm": 3.692875862121582, "learning_rate": 9.657971534324717e-05, "loss": 2.577462005615234, "memory(GiB)": 58.3, "step": 13820, "token_acc": 0.47, "train_speed(iter/s)": 1.452077 }, { "epoch": 0.5923053853733773, "grad_norm": 3.980250835418701, "learning_rate": 9.657726865070981e-05, "loss": 2.7860286712646483, "memory(GiB)": 58.3, "step": 13825, "token_acc": 0.40540540540540543, "train_speed(iter/s)": 1.452021 }, { "epoch": 0.5925196007026263, "grad_norm": 4.529167175292969, "learning_rate": 9.657482111438154e-05, "loss": 2.538942337036133, "memory(GiB)": 58.3, "step": 13830, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.452063 }, { "epoch": 0.5927338160318752, "grad_norm": 3.8707027435302734, "learning_rate": 9.657237273430669e-05, "loss": 2.565081024169922, "memory(GiB)": 58.3, "step": 13835, "token_acc": 0.4653846153846154, "train_speed(iter/s)": 1.452147 }, { "epoch": 0.5929480313611242, "grad_norm": 4.658082962036133, "learning_rate": 9.656992351052961e-05, "loss": 2.672258758544922, "memory(GiB)": 58.3, "step": 13840, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.452335 }, { "epoch": 0.5931622466903732, "grad_norm": 4.3584465980529785, "learning_rate": 9.656747344309469e-05, "loss": 2.639421081542969, "memory(GiB)": 58.3, "step": 13845, "token_acc": 0.42105263157894735, "train_speed(iter/s)": 1.45237 }, { "epoch": 0.5933764620196221, "grad_norm": 3.647871255874634, "learning_rate": 9.656502253204632e-05, "loss": 2.8180770874023438, "memory(GiB)": 58.3, "step": 13850, "token_acc": 0.44155844155844154, "train_speed(iter/s)": 1.45213 }, { "epoch": 0.593590677348871, "grad_norm": 4.493406772613525, "learning_rate": 9.656257077742886e-05, "loss": 2.958848571777344, "memory(GiB)": 58.3, "step": 13855, "token_acc": 0.40304182509505704, "train_speed(iter/s)": 1.452213 }, { "epoch": 0.5938048926781201, "grad_norm": 4.315821170806885, "learning_rate": 9.656011817928676e-05, "loss": 2.5926551818847656, "memory(GiB)": 58.3, "step": 13860, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.452184 }, { "epoch": 0.594019108007369, "grad_norm": 3.80910325050354, "learning_rate": 9.655766473766444e-05, "loss": 2.6649192810058593, "memory(GiB)": 58.3, "step": 13865, "token_acc": 0.490625, "train_speed(iter/s)": 1.452193 }, { "epoch": 0.5942333233366179, "grad_norm": 4.4840288162231445, "learning_rate": 9.655521045260636e-05, "loss": 2.464204216003418, "memory(GiB)": 58.3, "step": 13870, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.452183 }, { "epoch": 0.594447538665867, "grad_norm": 4.357649803161621, "learning_rate": 9.655275532415696e-05, "loss": 2.6859384536743165, "memory(GiB)": 58.3, "step": 13875, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.452042 }, { "epoch": 0.5946617539951159, "grad_norm": 3.79764986038208, "learning_rate": 9.655029935236073e-05, "loss": 2.5860641479492186, "memory(GiB)": 58.3, "step": 13880, "token_acc": 0.4727272727272727, "train_speed(iter/s)": 1.452174 }, { "epoch": 0.5948759693243648, "grad_norm": 4.252235412597656, "learning_rate": 9.654784253726216e-05, "loss": 2.846329689025879, "memory(GiB)": 58.3, "step": 13885, "token_acc": 0.4377358490566038, "train_speed(iter/s)": 1.452104 }, { "epoch": 0.5950901846536139, "grad_norm": 4.272499084472656, "learning_rate": 9.654538487890577e-05, "loss": 2.596055603027344, "memory(GiB)": 58.3, "step": 13890, "token_acc": 0.5150501672240803, "train_speed(iter/s)": 1.452166 }, { "epoch": 0.5953043999828628, "grad_norm": 4.841752052307129, "learning_rate": 9.654292637733605e-05, "loss": 2.573365592956543, "memory(GiB)": 58.3, "step": 13895, "token_acc": 0.44074074074074077, "train_speed(iter/s)": 1.452163 }, { "epoch": 0.5955186153121117, "grad_norm": 3.2819955348968506, "learning_rate": 9.654046703259755e-05, "loss": 2.2940391540527343, "memory(GiB)": 58.3, "step": 13900, "token_acc": 0.49044585987261147, "train_speed(iter/s)": 1.452121 }, { "epoch": 0.5957328306413607, "grad_norm": 4.382943153381348, "learning_rate": 9.653800684473485e-05, "loss": 2.7047683715820314, "memory(GiB)": 58.3, "step": 13905, "token_acc": 0.461038961038961, "train_speed(iter/s)": 1.452076 }, { "epoch": 0.5959470459706097, "grad_norm": 3.7000436782836914, "learning_rate": 9.653554581379249e-05, "loss": 2.6412469863891603, "memory(GiB)": 58.3, "step": 13910, "token_acc": 0.45, "train_speed(iter/s)": 1.452066 }, { "epoch": 0.5961612612998586, "grad_norm": 3.6861205101013184, "learning_rate": 9.653308393981508e-05, "loss": 2.781342697143555, "memory(GiB)": 58.3, "step": 13915, "token_acc": 0.4152823920265781, "train_speed(iter/s)": 1.45207 }, { "epoch": 0.5963754766291076, "grad_norm": 4.117077827453613, "learning_rate": 9.65306212228472e-05, "loss": 2.6871110916137697, "memory(GiB)": 58.3, "step": 13920, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.452144 }, { "epoch": 0.5965896919583565, "grad_norm": 4.764352798461914, "learning_rate": 9.652815766293344e-05, "loss": 2.4248533248901367, "memory(GiB)": 58.3, "step": 13925, "token_acc": 0.4983277591973244, "train_speed(iter/s)": 1.452105 }, { "epoch": 0.5968039072876055, "grad_norm": 4.078861236572266, "learning_rate": 9.652569326011849e-05, "loss": 2.328511619567871, "memory(GiB)": 58.3, "step": 13930, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.452074 }, { "epoch": 0.5970181226168545, "grad_norm": 3.8641281127929688, "learning_rate": 9.652322801444695e-05, "loss": 2.7165510177612306, "memory(GiB)": 58.3, "step": 13935, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.452143 }, { "epoch": 0.5972323379461034, "grad_norm": 5.55017614364624, "learning_rate": 9.652076192596349e-05, "loss": 2.7470321655273438, "memory(GiB)": 58.3, "step": 13940, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.452266 }, { "epoch": 0.5974465532753523, "grad_norm": 3.43029522895813, "learning_rate": 9.65182949947128e-05, "loss": 2.4457880020141602, "memory(GiB)": 58.3, "step": 13945, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.452376 }, { "epoch": 0.5976607686046014, "grad_norm": 3.934824228286743, "learning_rate": 9.651582722073953e-05, "loss": 2.572235870361328, "memory(GiB)": 58.3, "step": 13950, "token_acc": 0.4645390070921986, "train_speed(iter/s)": 1.452451 }, { "epoch": 0.5978749839338503, "grad_norm": 4.444162845611572, "learning_rate": 9.651335860408843e-05, "loss": 2.563711929321289, "memory(GiB)": 58.3, "step": 13955, "token_acc": 0.5056179775280899, "train_speed(iter/s)": 1.452566 }, { "epoch": 0.5980891992630992, "grad_norm": 5.351120948791504, "learning_rate": 9.651088914480421e-05, "loss": 2.1993560791015625, "memory(GiB)": 58.3, "step": 13960, "token_acc": 0.5021834061135371, "train_speed(iter/s)": 1.452669 }, { "epoch": 0.5983034145923483, "grad_norm": 5.675091743469238, "learning_rate": 9.650841884293159e-05, "loss": 2.4855014801025392, "memory(GiB)": 58.3, "step": 13965, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.452715 }, { "epoch": 0.5985176299215972, "grad_norm": 6.399936676025391, "learning_rate": 9.650594769851535e-05, "loss": 2.689995002746582, "memory(GiB)": 58.3, "step": 13970, "token_acc": 0.4892703862660944, "train_speed(iter/s)": 1.452744 }, { "epoch": 0.5987318452508461, "grad_norm": 4.217350959777832, "learning_rate": 9.650347571160023e-05, "loss": 2.794980049133301, "memory(GiB)": 58.3, "step": 13975, "token_acc": 0.423728813559322, "train_speed(iter/s)": 1.452855 }, { "epoch": 0.5989460605800951, "grad_norm": 3.8354604244232178, "learning_rate": 9.650100288223102e-05, "loss": 2.725836181640625, "memory(GiB)": 58.3, "step": 13980, "token_acc": 0.42902208201892744, "train_speed(iter/s)": 1.453026 }, { "epoch": 0.5991602759093441, "grad_norm": 4.5916900634765625, "learning_rate": 9.649852921045254e-05, "loss": 2.405137634277344, "memory(GiB)": 58.3, "step": 13985, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.453117 }, { "epoch": 0.599374491238593, "grad_norm": 3.5865020751953125, "learning_rate": 9.649605469630958e-05, "loss": 2.7852060317993166, "memory(GiB)": 58.3, "step": 13990, "token_acc": 0.4230769230769231, "train_speed(iter/s)": 1.453108 }, { "epoch": 0.599588706567842, "grad_norm": 4.899942874908447, "learning_rate": 9.649357933984697e-05, "loss": 2.580856704711914, "memory(GiB)": 58.3, "step": 13995, "token_acc": 0.4206008583690987, "train_speed(iter/s)": 1.453176 }, { "epoch": 0.599802921897091, "grad_norm": 4.871976375579834, "learning_rate": 9.649110314110954e-05, "loss": 2.4768436431884764, "memory(GiB)": 58.3, "step": 14000, "token_acc": 0.5, "train_speed(iter/s)": 1.453175 }, { "epoch": 0.599802921897091, "eval_loss": 2.266469717025757, "eval_runtime": 14.8183, "eval_samples_per_second": 6.748, "eval_steps_per_second": 6.748, "eval_token_acc": 0.45633456334563344, "step": 14000 }, { "epoch": 0.6000171372263399, "grad_norm": 4.842276096343994, "learning_rate": 9.648862610014218e-05, "loss": 2.5272029876708983, "memory(GiB)": 58.3, "step": 14005, "token_acc": 0.45356176735798015, "train_speed(iter/s)": 1.450832 }, { "epoch": 0.6002313525555889, "grad_norm": 5.21980619430542, "learning_rate": 9.648614821698975e-05, "loss": 2.73854923248291, "memory(GiB)": 58.3, "step": 14010, "token_acc": 0.4532871972318339, "train_speed(iter/s)": 1.450948 }, { "epoch": 0.6004455678848378, "grad_norm": 5.694094181060791, "learning_rate": 9.648366949169712e-05, "loss": 2.8148332595825196, "memory(GiB)": 58.3, "step": 14015, "token_acc": 0.43125, "train_speed(iter/s)": 1.451003 }, { "epoch": 0.6006597832140868, "grad_norm": 4.2976484298706055, "learning_rate": 9.648118992430923e-05, "loss": 2.3018362045288088, "memory(GiB)": 58.3, "step": 14020, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.450999 }, { "epoch": 0.6008739985433358, "grad_norm": 6.890940189361572, "learning_rate": 9.647870951487098e-05, "loss": 2.737139892578125, "memory(GiB)": 58.3, "step": 14025, "token_acc": 0.4107142857142857, "train_speed(iter/s)": 1.450921 }, { "epoch": 0.6010882138725847, "grad_norm": 7.609458923339844, "learning_rate": 9.64762282634273e-05, "loss": 2.702621269226074, "memory(GiB)": 58.3, "step": 14030, "token_acc": 0.46332046332046334, "train_speed(iter/s)": 1.45093 }, { "epoch": 0.6013024292018336, "grad_norm": 4.618037223815918, "learning_rate": 9.647374617002314e-05, "loss": 2.8386764526367188, "memory(GiB)": 58.3, "step": 14035, "token_acc": 0.47854785478547857, "train_speed(iter/s)": 1.450964 }, { "epoch": 0.6015166445310827, "grad_norm": 5.122060298919678, "learning_rate": 9.647126323470349e-05, "loss": 2.4943979263305662, "memory(GiB)": 58.3, "step": 14040, "token_acc": 0.47580645161290325, "train_speed(iter/s)": 1.450872 }, { "epoch": 0.6017308598603316, "grad_norm": 4.990699291229248, "learning_rate": 9.646877945751332e-05, "loss": 2.692562484741211, "memory(GiB)": 58.3, "step": 14045, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.451002 }, { "epoch": 0.6019450751895805, "grad_norm": 4.44877290725708, "learning_rate": 9.646629483849758e-05, "loss": 2.585972213745117, "memory(GiB)": 58.3, "step": 14050, "token_acc": 0.4463276836158192, "train_speed(iter/s)": 1.450945 }, { "epoch": 0.6021592905188295, "grad_norm": 4.9418792724609375, "learning_rate": 9.646380937770135e-05, "loss": 2.6613460540771485, "memory(GiB)": 58.3, "step": 14055, "token_acc": 0.4407894736842105, "train_speed(iter/s)": 1.451073 }, { "epoch": 0.6023735058480785, "grad_norm": 5.462378978729248, "learning_rate": 9.646132307516961e-05, "loss": 2.4605152130126955, "memory(GiB)": 58.3, "step": 14060, "token_acc": 0.5171102661596958, "train_speed(iter/s)": 1.451083 }, { "epoch": 0.6025877211773274, "grad_norm": 4.256039142608643, "learning_rate": 9.645883593094745e-05, "loss": 2.7011783599853514, "memory(GiB)": 58.3, "step": 14065, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.451087 }, { "epoch": 0.6028019365065764, "grad_norm": 3.353818655014038, "learning_rate": 9.645634794507987e-05, "loss": 2.4486928939819337, "memory(GiB)": 58.3, "step": 14070, "token_acc": 0.4735202492211838, "train_speed(iter/s)": 1.451147 }, { "epoch": 0.6030161518358254, "grad_norm": 4.841371059417725, "learning_rate": 9.645385911761196e-05, "loss": 2.2656063079833983, "memory(GiB)": 58.3, "step": 14075, "token_acc": 0.5, "train_speed(iter/s)": 1.450907 }, { "epoch": 0.6032303671650744, "grad_norm": 5.12148904800415, "learning_rate": 9.645136944858883e-05, "loss": 2.6110525131225586, "memory(GiB)": 58.3, "step": 14080, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.450824 }, { "epoch": 0.6034445824943233, "grad_norm": 4.80199670791626, "learning_rate": 9.644887893805557e-05, "loss": 2.7595199584960937, "memory(GiB)": 58.3, "step": 14085, "token_acc": 0.4554794520547945, "train_speed(iter/s)": 1.450818 }, { "epoch": 0.6036587978235722, "grad_norm": 4.588850975036621, "learning_rate": 9.644638758605729e-05, "loss": 2.67706413269043, "memory(GiB)": 58.3, "step": 14090, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.450912 }, { "epoch": 0.6038730131528213, "grad_norm": 6.626927852630615, "learning_rate": 9.644389539263911e-05, "loss": 2.828997993469238, "memory(GiB)": 58.3, "step": 14095, "token_acc": 0.39622641509433965, "train_speed(iter/s)": 1.450938 }, { "epoch": 0.6040872284820702, "grad_norm": 4.618725776672363, "learning_rate": 9.644140235784623e-05, "loss": 2.8019035339355467, "memory(GiB)": 58.3, "step": 14100, "token_acc": 0.4652777777777778, "train_speed(iter/s)": 1.45098 }, { "epoch": 0.6043014438113191, "grad_norm": 3.681743860244751, "learning_rate": 9.643890848172377e-05, "loss": 2.538693428039551, "memory(GiB)": 58.3, "step": 14105, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.450892 }, { "epoch": 0.6045156591405682, "grad_norm": 4.0561842918396, "learning_rate": 9.643641376431691e-05, "loss": 2.4754875183105467, "memory(GiB)": 58.3, "step": 14110, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.450895 }, { "epoch": 0.6047298744698171, "grad_norm": 5.372085094451904, "learning_rate": 9.643391820567084e-05, "loss": 2.580911064147949, "memory(GiB)": 58.3, "step": 14115, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.450821 }, { "epoch": 0.604944089799066, "grad_norm": 10.50987720489502, "learning_rate": 9.643142180583081e-05, "loss": 2.5099327087402346, "memory(GiB)": 58.3, "step": 14120, "token_acc": 0.4489051094890511, "train_speed(iter/s)": 1.450813 }, { "epoch": 0.605158305128315, "grad_norm": 5.173738956451416, "learning_rate": 9.642892456484201e-05, "loss": 2.69002742767334, "memory(GiB)": 58.3, "step": 14125, "token_acc": 0.4037854889589905, "train_speed(iter/s)": 1.450797 }, { "epoch": 0.605372520457564, "grad_norm": 4.199275970458984, "learning_rate": 9.642642648274968e-05, "loss": 2.5305654525756838, "memory(GiB)": 58.3, "step": 14130, "token_acc": 0.4961832061068702, "train_speed(iter/s)": 1.450635 }, { "epoch": 0.6055867357868129, "grad_norm": 3.784954786300659, "learning_rate": 9.64239275595991e-05, "loss": 2.4811016082763673, "memory(GiB)": 58.3, "step": 14135, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.450555 }, { "epoch": 0.6058009511160619, "grad_norm": 6.651519298553467, "learning_rate": 9.642142779543551e-05, "loss": 2.6361852645874024, "memory(GiB)": 58.3, "step": 14140, "token_acc": 0.4892966360856269, "train_speed(iter/s)": 1.450674 }, { "epoch": 0.6060151664453108, "grad_norm": 3.5732951164245605, "learning_rate": 9.641892719030421e-05, "loss": 2.4061546325683594, "memory(GiB)": 58.3, "step": 14145, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.450729 }, { "epoch": 0.6062293817745598, "grad_norm": 3.4998037815093994, "learning_rate": 9.64164257442505e-05, "loss": 2.5439785003662108, "memory(GiB)": 58.3, "step": 14150, "token_acc": 0.44404332129963897, "train_speed(iter/s)": 1.450569 }, { "epoch": 0.6064435971038088, "grad_norm": 4.5941290855407715, "learning_rate": 9.64139234573197e-05, "loss": 2.0855281829833983, "memory(GiB)": 58.3, "step": 14155, "token_acc": 0.5274261603375527, "train_speed(iter/s)": 1.450455 }, { "epoch": 0.6066578124330577, "grad_norm": 5.384954452514648, "learning_rate": 9.641142032955714e-05, "loss": 2.896497344970703, "memory(GiB)": 58.3, "step": 14160, "token_acc": 0.4253731343283582, "train_speed(iter/s)": 1.450449 }, { "epoch": 0.6068720277623066, "grad_norm": 4.107150554656982, "learning_rate": 9.640891636100815e-05, "loss": 2.663096809387207, "memory(GiB)": 58.3, "step": 14165, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.450381 }, { "epoch": 0.6070862430915557, "grad_norm": 4.029058456420898, "learning_rate": 9.640641155171812e-05, "loss": 2.3572019577026366, "memory(GiB)": 58.3, "step": 14170, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.450346 }, { "epoch": 0.6073004584208046, "grad_norm": 4.141958236694336, "learning_rate": 9.64039059017324e-05, "loss": 2.4927330017089844, "memory(GiB)": 58.3, "step": 14175, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.450355 }, { "epoch": 0.6075146737500535, "grad_norm": 4.207716941833496, "learning_rate": 9.64013994110964e-05, "loss": 2.625646209716797, "memory(GiB)": 58.3, "step": 14180, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.450491 }, { "epoch": 0.6077288890793026, "grad_norm": 4.127267360687256, "learning_rate": 9.639889207985552e-05, "loss": 2.9878454208374023, "memory(GiB)": 58.3, "step": 14185, "token_acc": 0.44648318042813456, "train_speed(iter/s)": 1.450512 }, { "epoch": 0.6079431044085515, "grad_norm": 4.650112628936768, "learning_rate": 9.639638390805518e-05, "loss": 2.593561363220215, "memory(GiB)": 58.3, "step": 14190, "token_acc": 0.4911504424778761, "train_speed(iter/s)": 1.450517 }, { "epoch": 0.6081573197378004, "grad_norm": 4.303674697875977, "learning_rate": 9.639387489574083e-05, "loss": 2.535626220703125, "memory(GiB)": 58.3, "step": 14195, "token_acc": 0.46757679180887374, "train_speed(iter/s)": 1.450536 }, { "epoch": 0.6083715350670494, "grad_norm": 4.284163951873779, "learning_rate": 9.639136504295792e-05, "loss": 2.706220245361328, "memory(GiB)": 58.3, "step": 14200, "token_acc": 0.43670886075949367, "train_speed(iter/s)": 1.450551 }, { "epoch": 0.6085857503962984, "grad_norm": 4.55698823928833, "learning_rate": 9.638885434975189e-05, "loss": 2.5487178802490233, "memory(GiB)": 58.3, "step": 14205, "token_acc": 0.46405228758169936, "train_speed(iter/s)": 1.450581 }, { "epoch": 0.6087999657255473, "grad_norm": 4.081040382385254, "learning_rate": 9.638634281616827e-05, "loss": 2.6076353073120115, "memory(GiB)": 58.3, "step": 14210, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.450535 }, { "epoch": 0.6090141810547963, "grad_norm": 3.8725972175598145, "learning_rate": 9.638383044225252e-05, "loss": 2.4013492584228517, "memory(GiB)": 58.3, "step": 14215, "token_acc": 0.5, "train_speed(iter/s)": 1.450483 }, { "epoch": 0.6092283963840452, "grad_norm": 5.643509387969971, "learning_rate": 9.638131722805018e-05, "loss": 2.8583192825317383, "memory(GiB)": 58.3, "step": 14220, "token_acc": 0.41964285714285715, "train_speed(iter/s)": 1.450534 }, { "epoch": 0.6094426117132942, "grad_norm": 3.9989659786224365, "learning_rate": 9.637880317360677e-05, "loss": 2.5155548095703124, "memory(GiB)": 58.3, "step": 14225, "token_acc": 0.4356060606060606, "train_speed(iter/s)": 1.450504 }, { "epoch": 0.6096568270425432, "grad_norm": 4.759162902832031, "learning_rate": 9.637628827896784e-05, "loss": 2.6664148330688477, "memory(GiB)": 58.3, "step": 14230, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.450415 }, { "epoch": 0.6098710423717921, "grad_norm": 4.269876003265381, "learning_rate": 9.637377254417892e-05, "loss": 2.6349323272705076, "memory(GiB)": 58.3, "step": 14235, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.4504 }, { "epoch": 0.610085257701041, "grad_norm": 5.584471225738525, "learning_rate": 9.637125596928562e-05, "loss": 2.5454620361328124, "memory(GiB)": 58.3, "step": 14240, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.4503 }, { "epoch": 0.6102994730302901, "grad_norm": 3.627065658569336, "learning_rate": 9.636873855433353e-05, "loss": 2.417140769958496, "memory(GiB)": 58.3, "step": 14245, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.450325 }, { "epoch": 0.610513688359539, "grad_norm": 3.7807254791259766, "learning_rate": 9.636622029936823e-05, "loss": 2.7936275482177733, "memory(GiB)": 58.3, "step": 14250, "token_acc": 0.40350877192982454, "train_speed(iter/s)": 1.45039 }, { "epoch": 0.6107279036887879, "grad_norm": 5.293404579162598, "learning_rate": 9.636370120443536e-05, "loss": 2.5364202499389648, "memory(GiB)": 58.3, "step": 14255, "token_acc": 0.4746268656716418, "train_speed(iter/s)": 1.450284 }, { "epoch": 0.610942119018037, "grad_norm": 4.4679131507873535, "learning_rate": 9.636118126958056e-05, "loss": 2.5206331253051757, "memory(GiB)": 58.3, "step": 14260, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.450263 }, { "epoch": 0.6111563343472859, "grad_norm": 3.8023853302001953, "learning_rate": 9.635866049484945e-05, "loss": 2.8486602783203123, "memory(GiB)": 58.3, "step": 14265, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.450305 }, { "epoch": 0.6113705496765348, "grad_norm": 4.473516941070557, "learning_rate": 9.635613888028773e-05, "loss": 2.6485260009765623, "memory(GiB)": 58.3, "step": 14270, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.450278 }, { "epoch": 0.6115847650057838, "grad_norm": 3.4229116439819336, "learning_rate": 9.635361642594107e-05, "loss": 2.694051742553711, "memory(GiB)": 58.3, "step": 14275, "token_acc": 0.42900302114803623, "train_speed(iter/s)": 1.450324 }, { "epoch": 0.6117989803350328, "grad_norm": 5.261970043182373, "learning_rate": 9.635109313185515e-05, "loss": 2.8413217544555662, "memory(GiB)": 58.3, "step": 14280, "token_acc": 0.4208860759493671, "train_speed(iter/s)": 1.450424 }, { "epoch": 0.6120131956642817, "grad_norm": 3.982478380203247, "learning_rate": 9.634856899807571e-05, "loss": 2.7169370651245117, "memory(GiB)": 58.3, "step": 14285, "token_acc": 0.4269005847953216, "train_speed(iter/s)": 1.450419 }, { "epoch": 0.6122274109935307, "grad_norm": 5.789243221282959, "learning_rate": 9.634604402464846e-05, "loss": 2.6715490341186525, "memory(GiB)": 58.3, "step": 14290, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.450434 }, { "epoch": 0.6124416263227797, "grad_norm": 5.56578254699707, "learning_rate": 9.634351821161916e-05, "loss": 2.5312240600585936, "memory(GiB)": 58.3, "step": 14295, "token_acc": 0.46557377049180326, "train_speed(iter/s)": 1.450463 }, { "epoch": 0.6126558416520286, "grad_norm": 4.066060543060303, "learning_rate": 9.634099155903353e-05, "loss": 2.700741004943848, "memory(GiB)": 58.3, "step": 14300, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.450592 }, { "epoch": 0.6128700569812776, "grad_norm": 3.4789788722991943, "learning_rate": 9.633846406693738e-05, "loss": 2.366909980773926, "memory(GiB)": 58.3, "step": 14305, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 1.450625 }, { "epoch": 0.6130842723105265, "grad_norm": 5.285825252532959, "learning_rate": 9.633593573537649e-05, "loss": 2.700864791870117, "memory(GiB)": 58.3, "step": 14310, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.450696 }, { "epoch": 0.6132984876397755, "grad_norm": 4.436102867126465, "learning_rate": 9.633340656439664e-05, "loss": 2.7850835800170897, "memory(GiB)": 58.3, "step": 14315, "token_acc": 0.4540059347181009, "train_speed(iter/s)": 1.450669 }, { "epoch": 0.6135127029690245, "grad_norm": 4.019272804260254, "learning_rate": 9.633087655404369e-05, "loss": 2.4706047058105467, "memory(GiB)": 58.3, "step": 14320, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.450561 }, { "epoch": 0.6137269182982734, "grad_norm": 4.5024824142456055, "learning_rate": 9.632834570436343e-05, "loss": 2.671927642822266, "memory(GiB)": 58.3, "step": 14325, "token_acc": 0.4380952380952381, "train_speed(iter/s)": 1.450663 }, { "epoch": 0.6139411336275223, "grad_norm": 3.2568390369415283, "learning_rate": 9.632581401540174e-05, "loss": 2.699192428588867, "memory(GiB)": 58.3, "step": 14330, "token_acc": 0.4506172839506173, "train_speed(iter/s)": 1.450665 }, { "epoch": 0.6141553489567714, "grad_norm": 7.013794898986816, "learning_rate": 9.632328148720447e-05, "loss": 2.422679901123047, "memory(GiB)": 58.3, "step": 14335, "token_acc": 0.5152838427947598, "train_speed(iter/s)": 1.450637 }, { "epoch": 0.6143695642860203, "grad_norm": 5.078404903411865, "learning_rate": 9.632074811981749e-05, "loss": 2.7318899154663088, "memory(GiB)": 58.3, "step": 14340, "token_acc": 0.4405144694533762, "train_speed(iter/s)": 1.450648 }, { "epoch": 0.6145837796152692, "grad_norm": 3.7661755084991455, "learning_rate": 9.631821391328671e-05, "loss": 3.045706367492676, "memory(GiB)": 58.3, "step": 14345, "token_acc": 0.40236686390532544, "train_speed(iter/s)": 1.450602 }, { "epoch": 0.6147979949445183, "grad_norm": 3.5946297645568848, "learning_rate": 9.631567886765804e-05, "loss": 2.706712341308594, "memory(GiB)": 58.3, "step": 14350, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.450656 }, { "epoch": 0.6150122102737672, "grad_norm": 4.757500171661377, "learning_rate": 9.63131429829774e-05, "loss": 2.5729793548583983, "memory(GiB)": 58.3, "step": 14355, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.45078 }, { "epoch": 0.6152264256030161, "grad_norm": 3.7429420948028564, "learning_rate": 9.631060625929073e-05, "loss": 2.5921089172363283, "memory(GiB)": 58.3, "step": 14360, "token_acc": 0.4492753623188406, "train_speed(iter/s)": 1.450923 }, { "epoch": 0.6154406409322651, "grad_norm": 4.863556385040283, "learning_rate": 9.630806869664397e-05, "loss": 2.179573822021484, "memory(GiB)": 58.3, "step": 14365, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.45097 }, { "epoch": 0.6156548562615141, "grad_norm": 4.245652198791504, "learning_rate": 9.630553029508312e-05, "loss": 2.7798912048339846, "memory(GiB)": 58.3, "step": 14370, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.451016 }, { "epoch": 0.615869071590763, "grad_norm": 6.306260585784912, "learning_rate": 9.630299105465414e-05, "loss": 2.9054943084716798, "memory(GiB)": 58.3, "step": 14375, "token_acc": 0.4653846153846154, "train_speed(iter/s)": 1.450915 }, { "epoch": 0.616083286920012, "grad_norm": 4.311077117919922, "learning_rate": 9.630045097540305e-05, "loss": 2.494624137878418, "memory(GiB)": 58.3, "step": 14380, "token_acc": 0.4725274725274725, "train_speed(iter/s)": 1.450996 }, { "epoch": 0.6162975022492609, "grad_norm": 5.55035400390625, "learning_rate": 9.629791005737587e-05, "loss": 2.644223213195801, "memory(GiB)": 58.3, "step": 14385, "token_acc": 0.4532871972318339, "train_speed(iter/s)": 1.451078 }, { "epoch": 0.6165117175785099, "grad_norm": 5.805172443389893, "learning_rate": 9.62953683006186e-05, "loss": 2.422890281677246, "memory(GiB)": 58.3, "step": 14390, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.451204 }, { "epoch": 0.6167259329077589, "grad_norm": 4.4684953689575195, "learning_rate": 9.62928257051773e-05, "loss": 2.603474426269531, "memory(GiB)": 58.3, "step": 14395, "token_acc": 0.45307443365695793, "train_speed(iter/s)": 1.451256 }, { "epoch": 0.6169401482370078, "grad_norm": 4.715904712677002, "learning_rate": 9.629028227109806e-05, "loss": 2.8599857330322265, "memory(GiB)": 58.3, "step": 14400, "token_acc": 0.436950146627566, "train_speed(iter/s)": 1.451093 }, { "epoch": 0.6171543635662567, "grad_norm": 4.722514629364014, "learning_rate": 9.628773799842692e-05, "loss": 2.5614501953125, "memory(GiB)": 58.3, "step": 14405, "token_acc": 0.48606811145510836, "train_speed(iter/s)": 1.451136 }, { "epoch": 0.6173685788955058, "grad_norm": 4.117427349090576, "learning_rate": 9.628519288720998e-05, "loss": 2.8555744171142576, "memory(GiB)": 58.3, "step": 14410, "token_acc": 0.4268292682926829, "train_speed(iter/s)": 1.451169 }, { "epoch": 0.6175827942247547, "grad_norm": 10.364761352539062, "learning_rate": 9.628264693749336e-05, "loss": 2.869075393676758, "memory(GiB)": 58.3, "step": 14415, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.451201 }, { "epoch": 0.6177970095540037, "grad_norm": 4.2784552574157715, "learning_rate": 9.628010014932315e-05, "loss": 2.4816051483154298, "memory(GiB)": 58.3, "step": 14420, "token_acc": 0.4858757062146893, "train_speed(iter/s)": 1.451221 }, { "epoch": 0.6180112248832527, "grad_norm": 3.3749730587005615, "learning_rate": 9.627755252274555e-05, "loss": 2.7301563262939452, "memory(GiB)": 58.3, "step": 14425, "token_acc": 0.4351851851851852, "train_speed(iter/s)": 1.451293 }, { "epoch": 0.6182254402125016, "grad_norm": 3.5255680084228516, "learning_rate": 9.627500405780664e-05, "loss": 2.773752975463867, "memory(GiB)": 58.3, "step": 14430, "token_acc": 0.4602272727272727, "train_speed(iter/s)": 1.451264 }, { "epoch": 0.6184396555417506, "grad_norm": 4.291807174682617, "learning_rate": 9.627245475455265e-05, "loss": 2.408559036254883, "memory(GiB)": 58.3, "step": 14435, "token_acc": 0.45484949832775917, "train_speed(iter/s)": 1.45117 }, { "epoch": 0.6186538708709995, "grad_norm": 4.134547710418701, "learning_rate": 9.626990461302972e-05, "loss": 2.3926017761230467, "memory(GiB)": 58.3, "step": 14440, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.451235 }, { "epoch": 0.6188680862002485, "grad_norm": 5.719235420227051, "learning_rate": 9.626735363328405e-05, "loss": 2.5206960678100585, "memory(GiB)": 58.3, "step": 14445, "token_acc": 0.44363636363636366, "train_speed(iter/s)": 1.45133 }, { "epoch": 0.6190823015294975, "grad_norm": 4.217067718505859, "learning_rate": 9.626480181536188e-05, "loss": 2.5698015213012697, "memory(GiB)": 58.3, "step": 14450, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.451226 }, { "epoch": 0.6192965168587464, "grad_norm": 5.145505428314209, "learning_rate": 9.626224915930943e-05, "loss": 2.5638763427734377, "memory(GiB)": 58.3, "step": 14455, "token_acc": 0.45051194539249145, "train_speed(iter/s)": 1.451161 }, { "epoch": 0.6195107321879953, "grad_norm": 4.336952209472656, "learning_rate": 9.625969566517294e-05, "loss": 2.7218080520629884, "memory(GiB)": 58.3, "step": 14460, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.451182 }, { "epoch": 0.6197249475172444, "grad_norm": 3.6626038551330566, "learning_rate": 9.625714133299866e-05, "loss": 2.8155496597290037, "memory(GiB)": 58.3, "step": 14465, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.451176 }, { "epoch": 0.6199391628464933, "grad_norm": 6.035238265991211, "learning_rate": 9.625458616283289e-05, "loss": 2.8277408599853517, "memory(GiB)": 58.3, "step": 14470, "token_acc": 0.4423676012461059, "train_speed(iter/s)": 1.451142 }, { "epoch": 0.6201533781757422, "grad_norm": 3.5719058513641357, "learning_rate": 9.625203015472189e-05, "loss": 2.819818115234375, "memory(GiB)": 58.3, "step": 14475, "token_acc": 0.42388059701492536, "train_speed(iter/s)": 1.45106 }, { "epoch": 0.6203675935049913, "grad_norm": 6.20145320892334, "learning_rate": 9.624947330871197e-05, "loss": 2.3607290267944334, "memory(GiB)": 58.3, "step": 14480, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.450937 }, { "epoch": 0.6205818088342402, "grad_norm": 4.41830587387085, "learning_rate": 9.624691562484945e-05, "loss": 2.8208541870117188, "memory(GiB)": 58.3, "step": 14485, "token_acc": 0.4205607476635514, "train_speed(iter/s)": 1.451021 }, { "epoch": 0.6207960241634891, "grad_norm": 4.027515888214111, "learning_rate": 9.624435710318067e-05, "loss": 2.599051666259766, "memory(GiB)": 58.3, "step": 14490, "token_acc": 0.44107744107744107, "train_speed(iter/s)": 1.450996 }, { "epoch": 0.6210102394927381, "grad_norm": 4.197312831878662, "learning_rate": 9.6241797743752e-05, "loss": 2.4655889511108398, "memory(GiB)": 58.3, "step": 14495, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 1.451173 }, { "epoch": 0.6212244548219871, "grad_norm": 4.853488922119141, "learning_rate": 9.623923754660977e-05, "loss": 2.570265197753906, "memory(GiB)": 58.3, "step": 14500, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.451 }, { "epoch": 0.6212244548219871, "eval_loss": 2.4207069873809814, "eval_runtime": 14.1062, "eval_samples_per_second": 7.089, "eval_steps_per_second": 7.089, "eval_token_acc": 0.4683053040103493, "step": 14500 }, { "epoch": 0.621438670151236, "grad_norm": 5.254773139953613, "learning_rate": 9.623667651180039e-05, "loss": 2.7100574493408205, "memory(GiB)": 58.3, "step": 14505, "token_acc": 0.46378174976481656, "train_speed(iter/s)": 1.448937 }, { "epoch": 0.621652885480485, "grad_norm": 5.069192409515381, "learning_rate": 9.623411463937022e-05, "loss": 2.627546501159668, "memory(GiB)": 58.3, "step": 14510, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.448913 }, { "epoch": 0.621867100809734, "grad_norm": 4.279447078704834, "learning_rate": 9.62315519293657e-05, "loss": 2.289988327026367, "memory(GiB)": 58.3, "step": 14515, "token_acc": 0.49173553719008267, "train_speed(iter/s)": 1.448999 }, { "epoch": 0.6220813161389829, "grad_norm": 4.658717632293701, "learning_rate": 9.622898838183325e-05, "loss": 2.7342790603637694, "memory(GiB)": 58.3, "step": 14520, "token_acc": 0.45569620253164556, "train_speed(iter/s)": 1.449107 }, { "epoch": 0.6222955314682319, "grad_norm": 4.0352606773376465, "learning_rate": 9.622642399681931e-05, "loss": 2.62386531829834, "memory(GiB)": 58.3, "step": 14525, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.449165 }, { "epoch": 0.6225097467974808, "grad_norm": 4.412250995635986, "learning_rate": 9.622385877437034e-05, "loss": 2.4408428192138674, "memory(GiB)": 58.3, "step": 14530, "token_acc": 0.4591194968553459, "train_speed(iter/s)": 1.449163 }, { "epoch": 0.6227239621267298, "grad_norm": 4.083387851715088, "learning_rate": 9.622129271453281e-05, "loss": 2.6369386672973634, "memory(GiB)": 58.3, "step": 14535, "token_acc": 0.4539877300613497, "train_speed(iter/s)": 1.449091 }, { "epoch": 0.6229381774559788, "grad_norm": 5.386481285095215, "learning_rate": 9.62187258173532e-05, "loss": 3.10937614440918, "memory(GiB)": 58.3, "step": 14540, "token_acc": 0.39880952380952384, "train_speed(iter/s)": 1.449229 }, { "epoch": 0.6231523927852277, "grad_norm": 3.7416281700134277, "learning_rate": 9.621615808287801e-05, "loss": 2.6056060791015625, "memory(GiB)": 58.3, "step": 14545, "token_acc": 0.46616541353383456, "train_speed(iter/s)": 1.4492 }, { "epoch": 0.6233666081144766, "grad_norm": 5.48376989364624, "learning_rate": 9.621358951115377e-05, "loss": 2.794524574279785, "memory(GiB)": 58.3, "step": 14550, "token_acc": 0.44368600682593856, "train_speed(iter/s)": 1.449036 }, { "epoch": 0.6235808234437257, "grad_norm": 3.7273807525634766, "learning_rate": 9.6211020102227e-05, "loss": 2.8398351669311523, "memory(GiB)": 58.3, "step": 14555, "token_acc": 0.44368600682593856, "train_speed(iter/s)": 1.449018 }, { "epoch": 0.6237950387729746, "grad_norm": 5.19226598739624, "learning_rate": 9.620844985614425e-05, "loss": 2.773996353149414, "memory(GiB)": 58.3, "step": 14560, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.449044 }, { "epoch": 0.6240092541022235, "grad_norm": 4.169505596160889, "learning_rate": 9.620587877295209e-05, "loss": 2.681260108947754, "memory(GiB)": 58.3, "step": 14565, "token_acc": 0.45907473309608543, "train_speed(iter/s)": 1.448974 }, { "epoch": 0.6242234694314726, "grad_norm": 3.792816400527954, "learning_rate": 9.620330685269707e-05, "loss": 2.4349451065063477, "memory(GiB)": 58.3, "step": 14570, "token_acc": 0.453125, "train_speed(iter/s)": 1.448966 }, { "epoch": 0.6244376847607215, "grad_norm": 4.641355514526367, "learning_rate": 9.620073409542583e-05, "loss": 2.322515678405762, "memory(GiB)": 58.3, "step": 14575, "token_acc": 0.5234657039711191, "train_speed(iter/s)": 1.449039 }, { "epoch": 0.6246519000899704, "grad_norm": 5.689401149749756, "learning_rate": 9.619816050118494e-05, "loss": 2.7147125244140624, "memory(GiB)": 58.3, "step": 14580, "token_acc": 0.43548387096774194, "train_speed(iter/s)": 1.449087 }, { "epoch": 0.6248661154192194, "grad_norm": 5.070311069488525, "learning_rate": 9.619558607002103e-05, "loss": 2.847411346435547, "memory(GiB)": 58.3, "step": 14585, "token_acc": 0.4375, "train_speed(iter/s)": 1.449143 }, { "epoch": 0.6250803307484684, "grad_norm": 4.0744524002075195, "learning_rate": 9.619301080198075e-05, "loss": 2.4808425903320312, "memory(GiB)": 58.3, "step": 14590, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 1.449218 }, { "epoch": 0.6252945460777173, "grad_norm": 9.106696128845215, "learning_rate": 9.619043469711074e-05, "loss": 2.7112491607666014, "memory(GiB)": 58.3, "step": 14595, "token_acc": 0.4819672131147541, "train_speed(iter/s)": 1.449326 }, { "epoch": 0.6255087614069663, "grad_norm": 6.736730575561523, "learning_rate": 9.618785775545769e-05, "loss": 2.572785186767578, "memory(GiB)": 58.3, "step": 14600, "token_acc": 0.43829787234042555, "train_speed(iter/s)": 1.449366 }, { "epoch": 0.6257229767362152, "grad_norm": 4.710590362548828, "learning_rate": 9.618527997706825e-05, "loss": 2.7171817779541017, "memory(GiB)": 58.3, "step": 14605, "token_acc": 0.4584837545126354, "train_speed(iter/s)": 1.449359 }, { "epoch": 0.6259371920654642, "grad_norm": 8.038471221923828, "learning_rate": 9.618270136198915e-05, "loss": 2.5910783767700196, "memory(GiB)": 58.3, "step": 14610, "token_acc": 0.45864661654135336, "train_speed(iter/s)": 1.449265 }, { "epoch": 0.6261514073947132, "grad_norm": 3.7803711891174316, "learning_rate": 9.61801219102671e-05, "loss": 2.5998628616333006, "memory(GiB)": 58.3, "step": 14615, "token_acc": 0.425531914893617, "train_speed(iter/s)": 1.449428 }, { "epoch": 0.6263656227239621, "grad_norm": 4.469743728637695, "learning_rate": 9.617754162194881e-05, "loss": 2.6015525817871095, "memory(GiB)": 58.3, "step": 14620, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.449515 }, { "epoch": 0.626579838053211, "grad_norm": 5.250475883483887, "learning_rate": 9.617496049708103e-05, "loss": 2.6085208892822265, "memory(GiB)": 58.3, "step": 14625, "token_acc": 0.46785714285714286, "train_speed(iter/s)": 1.449516 }, { "epoch": 0.6267940533824601, "grad_norm": 3.4902091026306152, "learning_rate": 9.617237853571054e-05, "loss": 2.5155656814575194, "memory(GiB)": 58.3, "step": 14630, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.449539 }, { "epoch": 0.627008268711709, "grad_norm": 6.242879390716553, "learning_rate": 9.61697957378841e-05, "loss": 2.553343963623047, "memory(GiB)": 58.3, "step": 14635, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.449504 }, { "epoch": 0.6272224840409579, "grad_norm": 6.107500076293945, "learning_rate": 9.616721210364849e-05, "loss": 2.7408821105957033, "memory(GiB)": 58.3, "step": 14640, "token_acc": 0.4318936877076412, "train_speed(iter/s)": 1.449604 }, { "epoch": 0.627436699370207, "grad_norm": 5.43466854095459, "learning_rate": 9.616462763305052e-05, "loss": 2.3727228164672853, "memory(GiB)": 58.3, "step": 14645, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.449502 }, { "epoch": 0.6276509146994559, "grad_norm": 4.648040771484375, "learning_rate": 9.616204232613701e-05, "loss": 2.435367965698242, "memory(GiB)": 58.3, "step": 14650, "token_acc": 0.5127388535031847, "train_speed(iter/s)": 1.449526 }, { "epoch": 0.6278651300287048, "grad_norm": 4.420108318328857, "learning_rate": 9.615945618295483e-05, "loss": 2.3956180572509767, "memory(GiB)": 58.3, "step": 14655, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.449593 }, { "epoch": 0.6280793453579538, "grad_norm": 4.477234840393066, "learning_rate": 9.615686920355077e-05, "loss": 2.6692153930664064, "memory(GiB)": 58.3, "step": 14660, "token_acc": 0.42729970326409494, "train_speed(iter/s)": 1.449737 }, { "epoch": 0.6282935606872028, "grad_norm": 3.919339179992676, "learning_rate": 9.615428138797174e-05, "loss": 2.6346946716308595, "memory(GiB)": 58.3, "step": 14665, "token_acc": 0.46062992125984253, "train_speed(iter/s)": 1.449803 }, { "epoch": 0.6285077760164517, "grad_norm": 4.786696910858154, "learning_rate": 9.61516927362646e-05, "loss": 2.4836502075195312, "memory(GiB)": 58.3, "step": 14670, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.449779 }, { "epoch": 0.6287219913457007, "grad_norm": 4.565245151519775, "learning_rate": 9.614910324847626e-05, "loss": 2.596850776672363, "memory(GiB)": 58.3, "step": 14675, "token_acc": 0.4490566037735849, "train_speed(iter/s)": 1.449794 }, { "epoch": 0.6289362066749496, "grad_norm": 3.0799903869628906, "learning_rate": 9.614651292465361e-05, "loss": 2.465973472595215, "memory(GiB)": 58.3, "step": 14680, "token_acc": 0.5085227272727273, "train_speed(iter/s)": 1.449859 }, { "epoch": 0.6291504220041986, "grad_norm": 4.583732604980469, "learning_rate": 9.614392176484359e-05, "loss": 2.8886692047119142, "memory(GiB)": 58.3, "step": 14685, "token_acc": 0.410958904109589, "train_speed(iter/s)": 1.449801 }, { "epoch": 0.6293646373334476, "grad_norm": 3.2853856086730957, "learning_rate": 9.614132976909316e-05, "loss": 2.834272575378418, "memory(GiB)": 58.3, "step": 14690, "token_acc": 0.42105263157894735, "train_speed(iter/s)": 1.449768 }, { "epoch": 0.6295788526626965, "grad_norm": 4.822491645812988, "learning_rate": 9.613873693744924e-05, "loss": 2.610274314880371, "memory(GiB)": 58.3, "step": 14695, "token_acc": 0.41762452107279696, "train_speed(iter/s)": 1.449903 }, { "epoch": 0.6297930679919455, "grad_norm": 4.612882137298584, "learning_rate": 9.613614326995882e-05, "loss": 2.860098457336426, "memory(GiB)": 58.3, "step": 14700, "token_acc": 0.4253731343283582, "train_speed(iter/s)": 1.44996 }, { "epoch": 0.6300072833211945, "grad_norm": 4.541575908660889, "learning_rate": 9.613354876666889e-05, "loss": 2.702968406677246, "memory(GiB)": 58.3, "step": 14705, "token_acc": 0.44904458598726116, "train_speed(iter/s)": 1.450032 }, { "epoch": 0.6302214986504434, "grad_norm": 4.412137031555176, "learning_rate": 9.613095342762646e-05, "loss": 2.6839962005615234, "memory(GiB)": 58.3, "step": 14710, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.450089 }, { "epoch": 0.6304357139796923, "grad_norm": 4.365449905395508, "learning_rate": 9.612835725287852e-05, "loss": 2.8457691192626955, "memory(GiB)": 58.3, "step": 14715, "token_acc": 0.4389438943894389, "train_speed(iter/s)": 1.450139 }, { "epoch": 0.6306499293089414, "grad_norm": 4.461371898651123, "learning_rate": 9.61257602424721e-05, "loss": 2.569447326660156, "memory(GiB)": 58.3, "step": 14720, "token_acc": 0.4368932038834951, "train_speed(iter/s)": 1.450082 }, { "epoch": 0.6308641446381903, "grad_norm": 4.058807373046875, "learning_rate": 9.612316239645429e-05, "loss": 2.6915924072265627, "memory(GiB)": 58.3, "step": 14725, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.450199 }, { "epoch": 0.6310783599674392, "grad_norm": 3.8894944190979004, "learning_rate": 9.612056371487213e-05, "loss": 2.5796859741210936, "memory(GiB)": 58.3, "step": 14730, "token_acc": 0.463855421686747, "train_speed(iter/s)": 1.450191 }, { "epoch": 0.6312925752966883, "grad_norm": 3.9801249504089355, "learning_rate": 9.611796419777266e-05, "loss": 2.786220741271973, "memory(GiB)": 58.3, "step": 14735, "token_acc": 0.445578231292517, "train_speed(iter/s)": 1.450221 }, { "epoch": 0.6315067906259372, "grad_norm": 3.8651113510131836, "learning_rate": 9.611536384520303e-05, "loss": 2.436446952819824, "memory(GiB)": 58.3, "step": 14740, "token_acc": 0.4887459807073955, "train_speed(iter/s)": 1.450162 }, { "epoch": 0.6317210059551861, "grad_norm": 4.436505317687988, "learning_rate": 9.611276265721032e-05, "loss": 2.6532703399658204, "memory(GiB)": 58.3, "step": 14745, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 1.449949 }, { "epoch": 0.6319352212844351, "grad_norm": 4.747355937957764, "learning_rate": 9.611016063384168e-05, "loss": 2.555825424194336, "memory(GiB)": 58.3, "step": 14750, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.449947 }, { "epoch": 0.6321494366136841, "grad_norm": 4.22128963470459, "learning_rate": 9.61075577751442e-05, "loss": 2.3227397918701174, "memory(GiB)": 58.3, "step": 14755, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.449919 }, { "epoch": 0.6323636519429331, "grad_norm": 5.194764137268066, "learning_rate": 9.610495408116507e-05, "loss": 2.728322410583496, "memory(GiB)": 58.3, "step": 14760, "token_acc": 0.42292490118577075, "train_speed(iter/s)": 1.449878 }, { "epoch": 0.632577867272182, "grad_norm": 4.795577049255371, "learning_rate": 9.610234955195143e-05, "loss": 2.3574304580688477, "memory(GiB)": 58.3, "step": 14765, "token_acc": 0.4978723404255319, "train_speed(iter/s)": 1.44982 }, { "epoch": 0.6327920826014309, "grad_norm": 5.83280086517334, "learning_rate": 9.60997441875505e-05, "loss": 2.8128021240234373, "memory(GiB)": 58.3, "step": 14770, "token_acc": 0.4305555555555556, "train_speed(iter/s)": 1.449905 }, { "epoch": 0.63300629793068, "grad_norm": 4.12320613861084, "learning_rate": 9.609713798800943e-05, "loss": 2.672073745727539, "memory(GiB)": 58.3, "step": 14775, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.44985 }, { "epoch": 0.6332205132599289, "grad_norm": 5.067982196807861, "learning_rate": 9.609453095337548e-05, "loss": 2.612883758544922, "memory(GiB)": 58.3, "step": 14780, "token_acc": 0.46794871794871795, "train_speed(iter/s)": 1.449889 }, { "epoch": 0.6334347285891778, "grad_norm": 4.375309944152832, "learning_rate": 9.609192308369588e-05, "loss": 2.7582046508789064, "memory(GiB)": 58.3, "step": 14785, "token_acc": 0.46417445482866043, "train_speed(iter/s)": 1.449929 }, { "epoch": 0.6336489439184269, "grad_norm": 5.104846954345703, "learning_rate": 9.608931437901782e-05, "loss": 2.6408689498901365, "memory(GiB)": 58.3, "step": 14790, "token_acc": 0.43564356435643564, "train_speed(iter/s)": 1.449887 }, { "epoch": 0.6338631592476758, "grad_norm": 4.74508810043335, "learning_rate": 9.608670483938862e-05, "loss": 2.599860191345215, "memory(GiB)": 58.3, "step": 14795, "token_acc": 0.4259818731117825, "train_speed(iter/s)": 1.449771 }, { "epoch": 0.6340773745769247, "grad_norm": 3.539123773574829, "learning_rate": 9.608409446485553e-05, "loss": 2.5515140533447265, "memory(GiB)": 58.3, "step": 14800, "token_acc": 0.44573643410852715, "train_speed(iter/s)": 1.44983 }, { "epoch": 0.6342915899061737, "grad_norm": 3.3713126182556152, "learning_rate": 9.608148325546583e-05, "loss": 2.586057662963867, "memory(GiB)": 58.3, "step": 14805, "token_acc": 0.4426751592356688, "train_speed(iter/s)": 1.44987 }, { "epoch": 0.6345058052354227, "grad_norm": 4.70167875289917, "learning_rate": 9.607887121126684e-05, "loss": 2.3151166915893553, "memory(GiB)": 58.3, "step": 14810, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.449931 }, { "epoch": 0.6347200205646716, "grad_norm": 3.8407154083251953, "learning_rate": 9.607625833230586e-05, "loss": 2.755192756652832, "memory(GiB)": 58.3, "step": 14815, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.449956 }, { "epoch": 0.6349342358939206, "grad_norm": 5.744650840759277, "learning_rate": 9.607364461863024e-05, "loss": 2.551161766052246, "memory(GiB)": 58.3, "step": 14820, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 1.449978 }, { "epoch": 0.6351484512231695, "grad_norm": 4.699014663696289, "learning_rate": 9.607103007028733e-05, "loss": 2.8635089874267576, "memory(GiB)": 58.3, "step": 14825, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.450109 }, { "epoch": 0.6353626665524185, "grad_norm": 5.925031661987305, "learning_rate": 9.606841468732451e-05, "loss": 2.715308952331543, "memory(GiB)": 58.3, "step": 14830, "token_acc": 0.4635036496350365, "train_speed(iter/s)": 1.450097 }, { "epoch": 0.6355768818816675, "grad_norm": 5.102819919586182, "learning_rate": 9.606579846978912e-05, "loss": 2.9080530166625977, "memory(GiB)": 58.3, "step": 14835, "token_acc": 0.446875, "train_speed(iter/s)": 1.450097 }, { "epoch": 0.6357910972109164, "grad_norm": 3.8110156059265137, "learning_rate": 9.606318141772858e-05, "loss": 2.618484687805176, "memory(GiB)": 58.3, "step": 14840, "token_acc": 0.43354430379746833, "train_speed(iter/s)": 1.45006 }, { "epoch": 0.6360053125401653, "grad_norm": 5.4171366691589355, "learning_rate": 9.606056353119031e-05, "loss": 2.465500259399414, "memory(GiB)": 58.3, "step": 14845, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.44985 }, { "epoch": 0.6362195278694144, "grad_norm": 4.653716087341309, "learning_rate": 9.60579448102217e-05, "loss": 2.785238265991211, "memory(GiB)": 58.3, "step": 14850, "token_acc": 0.4456140350877193, "train_speed(iter/s)": 1.449817 }, { "epoch": 0.6364337431986633, "grad_norm": 5.076840877532959, "learning_rate": 9.605532525487024e-05, "loss": 2.428042411804199, "memory(GiB)": 58.3, "step": 14855, "token_acc": 0.5260869565217391, "train_speed(iter/s)": 1.44994 }, { "epoch": 0.6366479585279122, "grad_norm": 4.1579670906066895, "learning_rate": 9.605270486518335e-05, "loss": 2.4552005767822265, "memory(GiB)": 58.3, "step": 14860, "token_acc": 0.5064102564102564, "train_speed(iter/s)": 1.449914 }, { "epoch": 0.6368621738571613, "grad_norm": 4.259786128997803, "learning_rate": 9.60500836412085e-05, "loss": 2.512075424194336, "memory(GiB)": 58.3, "step": 14865, "token_acc": 0.4492753623188406, "train_speed(iter/s)": 1.449956 }, { "epoch": 0.6370763891864102, "grad_norm": 3.9307875633239746, "learning_rate": 9.604746158299319e-05, "loss": 2.515934181213379, "memory(GiB)": 58.3, "step": 14870, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.449822 }, { "epoch": 0.6372906045156591, "grad_norm": 4.962164402008057, "learning_rate": 9.604483869058492e-05, "loss": 2.844198226928711, "memory(GiB)": 58.3, "step": 14875, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.449927 }, { "epoch": 0.6375048198449081, "grad_norm": 5.517521381378174, "learning_rate": 9.604221496403121e-05, "loss": 2.4753753662109377, "memory(GiB)": 58.3, "step": 14880, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.449976 }, { "epoch": 0.6377190351741571, "grad_norm": 4.912667751312256, "learning_rate": 9.603959040337958e-05, "loss": 2.599565887451172, "memory(GiB)": 58.3, "step": 14885, "token_acc": 0.4143835616438356, "train_speed(iter/s)": 1.449923 }, { "epoch": 0.637933250503406, "grad_norm": 4.038173198699951, "learning_rate": 9.603696500867758e-05, "loss": 2.6866586685180662, "memory(GiB)": 58.3, "step": 14890, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.449931 }, { "epoch": 0.638147465832655, "grad_norm": 6.335878372192383, "learning_rate": 9.603433877997278e-05, "loss": 2.87451286315918, "memory(GiB)": 58.3, "step": 14895, "token_acc": 0.46, "train_speed(iter/s)": 1.45004 }, { "epoch": 0.638361681161904, "grad_norm": 6.428534507751465, "learning_rate": 9.603171171731273e-05, "loss": 2.958345413208008, "memory(GiB)": 58.3, "step": 14900, "token_acc": 0.46122448979591835, "train_speed(iter/s)": 1.450129 }, { "epoch": 0.6385758964911529, "grad_norm": 4.494062900543213, "learning_rate": 9.602908382074508e-05, "loss": 2.7423254013061524, "memory(GiB)": 58.3, "step": 14905, "token_acc": 0.45427728613569324, "train_speed(iter/s)": 1.450149 }, { "epoch": 0.6387901118204019, "grad_norm": 4.378876209259033, "learning_rate": 9.602645509031735e-05, "loss": 2.493967819213867, "memory(GiB)": 58.3, "step": 14910, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.450043 }, { "epoch": 0.6390043271496508, "grad_norm": 4.599621772766113, "learning_rate": 9.602382552607724e-05, "loss": 2.6918758392333983, "memory(GiB)": 58.3, "step": 14915, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.450114 }, { "epoch": 0.6392185424788998, "grad_norm": 4.914430141448975, "learning_rate": 9.602119512807234e-05, "loss": 2.548309326171875, "memory(GiB)": 58.3, "step": 14920, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.450072 }, { "epoch": 0.6394327578081488, "grad_norm": 5.141462326049805, "learning_rate": 9.601856389635034e-05, "loss": 2.8038703918457033, "memory(GiB)": 58.3, "step": 14925, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.450165 }, { "epoch": 0.6396469731373977, "grad_norm": 5.20515251159668, "learning_rate": 9.601593183095887e-05, "loss": 2.6030431747436524, "memory(GiB)": 58.3, "step": 14930, "token_acc": 0.44366197183098594, "train_speed(iter/s)": 1.450181 }, { "epoch": 0.6398611884666466, "grad_norm": 5.146335601806641, "learning_rate": 9.601329893194563e-05, "loss": 2.6439170837402344, "memory(GiB)": 58.3, "step": 14935, "token_acc": 0.4630225080385852, "train_speed(iter/s)": 1.450191 }, { "epoch": 0.6400754037958957, "grad_norm": 4.347159385681152, "learning_rate": 9.601066519935832e-05, "loss": 2.5498104095458984, "memory(GiB)": 58.3, "step": 14940, "token_acc": 0.47115384615384615, "train_speed(iter/s)": 1.450194 }, { "epoch": 0.6402896191251446, "grad_norm": 6.115302562713623, "learning_rate": 9.600803063324465e-05, "loss": 2.3640268325805662, "memory(GiB)": 58.3, "step": 14945, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.450126 }, { "epoch": 0.6405038344543935, "grad_norm": 5.782168865203857, "learning_rate": 9.600539523365234e-05, "loss": 2.961846923828125, "memory(GiB)": 58.3, "step": 14950, "token_acc": 0.4331395348837209, "train_speed(iter/s)": 1.450112 }, { "epoch": 0.6407180497836426, "grad_norm": 3.8622121810913086, "learning_rate": 9.600275900062915e-05, "loss": 2.6871658325195313, "memory(GiB)": 58.3, "step": 14955, "token_acc": 0.4438202247191011, "train_speed(iter/s)": 1.450199 }, { "epoch": 0.6409322651128915, "grad_norm": 3.9965529441833496, "learning_rate": 9.600012193422281e-05, "loss": 2.574698066711426, "memory(GiB)": 58.3, "step": 14960, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.450078 }, { "epoch": 0.6411464804421404, "grad_norm": 3.745694637298584, "learning_rate": 9.599748403448113e-05, "loss": 2.608061981201172, "memory(GiB)": 58.3, "step": 14965, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.4499 }, { "epoch": 0.6413606957713894, "grad_norm": 4.3401408195495605, "learning_rate": 9.599484530145186e-05, "loss": 2.2880584716796877, "memory(GiB)": 58.3, "step": 14970, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 1.449876 }, { "epoch": 0.6415749111006384, "grad_norm": 4.041738986968994, "learning_rate": 9.599220573518282e-05, "loss": 2.6305240631103515, "memory(GiB)": 58.3, "step": 14975, "token_acc": 0.45084745762711864, "train_speed(iter/s)": 1.449915 }, { "epoch": 0.6417891264298873, "grad_norm": 5.006702423095703, "learning_rate": 9.598956533572183e-05, "loss": 2.7863771438598635, "memory(GiB)": 58.3, "step": 14980, "token_acc": 0.4483985765124555, "train_speed(iter/s)": 1.449805 }, { "epoch": 0.6420033417591363, "grad_norm": 5.555137634277344, "learning_rate": 9.598692410311674e-05, "loss": 2.6082275390625, "memory(GiB)": 58.3, "step": 14985, "token_acc": 0.4746376811594203, "train_speed(iter/s)": 1.449877 }, { "epoch": 0.6422175570883852, "grad_norm": 4.539990425109863, "learning_rate": 9.598428203741536e-05, "loss": 2.6776247024536133, "memory(GiB)": 58.3, "step": 14990, "token_acc": 0.44525547445255476, "train_speed(iter/s)": 1.449895 }, { "epoch": 0.6424317724176342, "grad_norm": 4.6724853515625, "learning_rate": 9.598163913866558e-05, "loss": 2.6252384185791016, "memory(GiB)": 58.3, "step": 14995, "token_acc": 0.49466192170818507, "train_speed(iter/s)": 1.450055 }, { "epoch": 0.6426459877468832, "grad_norm": 5.745842933654785, "learning_rate": 9.597899540691527e-05, "loss": 2.830762481689453, "memory(GiB)": 58.3, "step": 15000, "token_acc": 0.46254071661237783, "train_speed(iter/s)": 1.450112 }, { "epoch": 0.6426459877468832, "eval_loss": 2.21223521232605, "eval_runtime": 13.5047, "eval_samples_per_second": 7.405, "eval_steps_per_second": 7.405, "eval_token_acc": 0.48322147651006714, "step": 15000 }, { "epoch": 0.6428602030761321, "grad_norm": 5.049619674682617, "learning_rate": 9.597635084221234e-05, "loss": 2.7212646484375, "memory(GiB)": 58.3, "step": 15005, "token_acc": 0.47092469018112487, "train_speed(iter/s)": 1.448171 }, { "epoch": 0.643074418405381, "grad_norm": 4.307399749755859, "learning_rate": 9.597370544460467e-05, "loss": 2.6538330078125, "memory(GiB)": 58.3, "step": 15010, "token_acc": 0.4384057971014493, "train_speed(iter/s)": 1.448084 }, { "epoch": 0.6432886337346301, "grad_norm": 4.235497951507568, "learning_rate": 9.59710592141402e-05, "loss": 2.7035709381103517, "memory(GiB)": 58.3, "step": 15015, "token_acc": 0.45151515151515154, "train_speed(iter/s)": 1.448238 }, { "epoch": 0.643502849063879, "grad_norm": 6.353463649749756, "learning_rate": 9.596841215086689e-05, "loss": 2.3623008728027344, "memory(GiB)": 58.3, "step": 15020, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 1.448341 }, { "epoch": 0.6437170643931279, "grad_norm": 8.712040901184082, "learning_rate": 9.596576425483264e-05, "loss": 2.5878549575805665, "memory(GiB)": 58.3, "step": 15025, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.448338 }, { "epoch": 0.643931279722377, "grad_norm": 4.22176456451416, "learning_rate": 9.596311552608547e-05, "loss": 2.953642654418945, "memory(GiB)": 58.3, "step": 15030, "token_acc": 0.44242424242424244, "train_speed(iter/s)": 1.448318 }, { "epoch": 0.6441454950516259, "grad_norm": 4.134119987487793, "learning_rate": 9.596046596467334e-05, "loss": 2.391531562805176, "memory(GiB)": 58.3, "step": 15035, "token_acc": 0.5073529411764706, "train_speed(iter/s)": 1.448516 }, { "epoch": 0.6443597103808748, "grad_norm": 3.641388177871704, "learning_rate": 9.595781557064427e-05, "loss": 2.477495574951172, "memory(GiB)": 58.3, "step": 15040, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.4485 }, { "epoch": 0.6445739257101238, "grad_norm": 7.13884162902832, "learning_rate": 9.595516434404624e-05, "loss": 2.774261474609375, "memory(GiB)": 58.3, "step": 15045, "token_acc": 0.4375, "train_speed(iter/s)": 1.448479 }, { "epoch": 0.6447881410393728, "grad_norm": 3.5477068424224854, "learning_rate": 9.59525122849273e-05, "loss": 2.663995552062988, "memory(GiB)": 58.3, "step": 15050, "token_acc": 0.4753623188405797, "train_speed(iter/s)": 1.448425 }, { "epoch": 0.6450023563686217, "grad_norm": 4.730018615722656, "learning_rate": 9.594985939333549e-05, "loss": 2.543264389038086, "memory(GiB)": 58.3, "step": 15055, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.448499 }, { "epoch": 0.6452165716978707, "grad_norm": 4.0632195472717285, "learning_rate": 9.594720566931889e-05, "loss": 2.618559455871582, "memory(GiB)": 58.3, "step": 15060, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.448317 }, { "epoch": 0.6454307870271196, "grad_norm": 3.4372284412384033, "learning_rate": 9.594455111292555e-05, "loss": 2.6636754989624025, "memory(GiB)": 58.3, "step": 15065, "token_acc": 0.4296028880866426, "train_speed(iter/s)": 1.448291 }, { "epoch": 0.6456450023563686, "grad_norm": 4.137977123260498, "learning_rate": 9.594189572420356e-05, "loss": 2.300486183166504, "memory(GiB)": 58.3, "step": 15070, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.448254 }, { "epoch": 0.6458592176856176, "grad_norm": 4.151328086853027, "learning_rate": 9.593923950320104e-05, "loss": 2.7640867233276367, "memory(GiB)": 58.3, "step": 15075, "token_acc": 0.4405594405594406, "train_speed(iter/s)": 1.448279 }, { "epoch": 0.6460734330148665, "grad_norm": 4.091307163238525, "learning_rate": 9.593658244996609e-05, "loss": 2.4460733413696287, "memory(GiB)": 58.3, "step": 15080, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.448334 }, { "epoch": 0.6462876483441155, "grad_norm": 4.356914043426514, "learning_rate": 9.593392456454686e-05, "loss": 2.548112487792969, "memory(GiB)": 58.3, "step": 15085, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.44833 }, { "epoch": 0.6465018636733645, "grad_norm": 4.232750415802002, "learning_rate": 9.59312658469915e-05, "loss": 2.892285919189453, "memory(GiB)": 58.3, "step": 15090, "token_acc": 0.4280821917808219, "train_speed(iter/s)": 1.448328 }, { "epoch": 0.6467160790026134, "grad_norm": 4.230464458465576, "learning_rate": 9.592860629734819e-05, "loss": 2.692633628845215, "memory(GiB)": 58.3, "step": 15095, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.448443 }, { "epoch": 0.6469302943318624, "grad_norm": 5.196896553039551, "learning_rate": 9.592594591566508e-05, "loss": 2.826826477050781, "memory(GiB)": 58.3, "step": 15100, "token_acc": 0.4296028880866426, "train_speed(iter/s)": 1.448516 }, { "epoch": 0.6471445096611114, "grad_norm": 9.97440242767334, "learning_rate": 9.592328470199037e-05, "loss": 2.8210947036743166, "memory(GiB)": 58.3, "step": 15105, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.448551 }, { "epoch": 0.6473587249903603, "grad_norm": 4.933980941772461, "learning_rate": 9.592062265637227e-05, "loss": 2.51159725189209, "memory(GiB)": 58.3, "step": 15110, "token_acc": 0.46788990825688076, "train_speed(iter/s)": 1.448651 }, { "epoch": 0.6475729403196093, "grad_norm": 5.6142497062683105, "learning_rate": 9.591795977885903e-05, "loss": 2.496227264404297, "memory(GiB)": 58.3, "step": 15115, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.448722 }, { "epoch": 0.6477871556488582, "grad_norm": 6.159660339355469, "learning_rate": 9.591529606949887e-05, "loss": 3.0869821548461913, "memory(GiB)": 58.3, "step": 15120, "token_acc": 0.39492753623188404, "train_speed(iter/s)": 1.448821 }, { "epoch": 0.6480013709781072, "grad_norm": 6.724240779876709, "learning_rate": 9.591263152834005e-05, "loss": 2.614556312561035, "memory(GiB)": 58.3, "step": 15125, "token_acc": 0.452755905511811, "train_speed(iter/s)": 1.448823 }, { "epoch": 0.6482155863073562, "grad_norm": 4.72502326965332, "learning_rate": 9.590996615543084e-05, "loss": 2.723069953918457, "memory(GiB)": 58.3, "step": 15130, "token_acc": 0.44333333333333336, "train_speed(iter/s)": 1.448852 }, { "epoch": 0.6484298016366051, "grad_norm": 4.011580467224121, "learning_rate": 9.590729995081953e-05, "loss": 2.35583438873291, "memory(GiB)": 58.3, "step": 15135, "token_acc": 0.5017064846416383, "train_speed(iter/s)": 1.4489 }, { "epoch": 0.648644016965854, "grad_norm": 3.4947268962860107, "learning_rate": 9.590463291455442e-05, "loss": 2.531879425048828, "memory(GiB)": 58.3, "step": 15140, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.448927 }, { "epoch": 0.6488582322951031, "grad_norm": 4.407508373260498, "learning_rate": 9.59019650466838e-05, "loss": 2.477404022216797, "memory(GiB)": 58.3, "step": 15145, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 1.449022 }, { "epoch": 0.649072447624352, "grad_norm": 3.6686882972717285, "learning_rate": 9.589929634725605e-05, "loss": 2.609175109863281, "memory(GiB)": 58.3, "step": 15150, "token_acc": 0.4743202416918429, "train_speed(iter/s)": 1.449006 }, { "epoch": 0.6492866629536009, "grad_norm": 5.289790630340576, "learning_rate": 9.589662681631948e-05, "loss": 2.7299274444580077, "memory(GiB)": 58.3, "step": 15155, "token_acc": 0.42402826855123676, "train_speed(iter/s)": 1.448868 }, { "epoch": 0.64950087828285, "grad_norm": 6.507251262664795, "learning_rate": 9.589395645392245e-05, "loss": 2.722701072692871, "memory(GiB)": 58.3, "step": 15160, "token_acc": 0.4263565891472868, "train_speed(iter/s)": 1.448838 }, { "epoch": 0.6497150936120989, "grad_norm": 4.34958028793335, "learning_rate": 9.589128526011336e-05, "loss": 2.349719429016113, "memory(GiB)": 58.3, "step": 15165, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.448831 }, { "epoch": 0.6499293089413478, "grad_norm": 3.771009922027588, "learning_rate": 9.588861323494058e-05, "loss": 2.47757511138916, "memory(GiB)": 58.3, "step": 15170, "token_acc": 0.45058139534883723, "train_speed(iter/s)": 1.448901 }, { "epoch": 0.6501435242705969, "grad_norm": 3.5361275672912598, "learning_rate": 9.588594037845254e-05, "loss": 2.5539297103881835, "memory(GiB)": 58.3, "step": 15175, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.448977 }, { "epoch": 0.6503577395998458, "grad_norm": 4.314279556274414, "learning_rate": 9.588326669069763e-05, "loss": 2.497235107421875, "memory(GiB)": 58.3, "step": 15180, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.449097 }, { "epoch": 0.6505719549290947, "grad_norm": 9.188122749328613, "learning_rate": 9.588059217172432e-05, "loss": 2.6258724212646483, "memory(GiB)": 58.3, "step": 15185, "token_acc": 0.45625, "train_speed(iter/s)": 1.449022 }, { "epoch": 0.6507861702583437, "grad_norm": 4.531054973602295, "learning_rate": 9.587791682158102e-05, "loss": 2.5726291656494142, "memory(GiB)": 58.3, "step": 15190, "token_acc": 0.4820846905537459, "train_speed(iter/s)": 1.449143 }, { "epoch": 0.6510003855875927, "grad_norm": 4.9413604736328125, "learning_rate": 9.587524064031624e-05, "loss": 2.427939796447754, "memory(GiB)": 58.3, "step": 15195, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 1.449209 }, { "epoch": 0.6512146009168416, "grad_norm": 5.501973628997803, "learning_rate": 9.587256362797842e-05, "loss": 2.771798515319824, "memory(GiB)": 58.3, "step": 15200, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.449229 }, { "epoch": 0.6514288162460906, "grad_norm": 3.9437625408172607, "learning_rate": 9.586988578461609e-05, "loss": 2.7026594161987303, "memory(GiB)": 58.3, "step": 15205, "token_acc": 0.4271523178807947, "train_speed(iter/s)": 1.449269 }, { "epoch": 0.6516430315753395, "grad_norm": 4.041426658630371, "learning_rate": 9.586720711027775e-05, "loss": 2.468884086608887, "memory(GiB)": 58.3, "step": 15210, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.449257 }, { "epoch": 0.6518572469045885, "grad_norm": 3.4535014629364014, "learning_rate": 9.586452760501193e-05, "loss": 2.4665542602539063, "memory(GiB)": 58.3, "step": 15215, "token_acc": 0.4745222929936306, "train_speed(iter/s)": 1.449229 }, { "epoch": 0.6520714622338375, "grad_norm": 5.335357666015625, "learning_rate": 9.586184726886715e-05, "loss": 2.476365661621094, "memory(GiB)": 58.3, "step": 15220, "token_acc": 0.5182186234817814, "train_speed(iter/s)": 1.449209 }, { "epoch": 0.6522856775630864, "grad_norm": 3.933119535446167, "learning_rate": 9.5859166101892e-05, "loss": 2.5218833923339843, "memory(GiB)": 58.3, "step": 15225, "token_acc": 0.4186046511627907, "train_speed(iter/s)": 1.449328 }, { "epoch": 0.6524998928923353, "grad_norm": 5.322347164154053, "learning_rate": 9.585648410413503e-05, "loss": 2.808527183532715, "memory(GiB)": 58.3, "step": 15230, "token_acc": 0.4392857142857143, "train_speed(iter/s)": 1.449424 }, { "epoch": 0.6527141082215844, "grad_norm": 4.473291873931885, "learning_rate": 9.585380127564484e-05, "loss": 2.8380367279052736, "memory(GiB)": 58.3, "step": 15235, "token_acc": 0.45396825396825397, "train_speed(iter/s)": 1.449552 }, { "epoch": 0.6529283235508333, "grad_norm": 5.80584192276001, "learning_rate": 9.585111761647002e-05, "loss": 2.588529586791992, "memory(GiB)": 58.3, "step": 15240, "token_acc": 0.45864661654135336, "train_speed(iter/s)": 1.449591 }, { "epoch": 0.6531425388800822, "grad_norm": 3.6384706497192383, "learning_rate": 9.58484331266592e-05, "loss": 2.661268615722656, "memory(GiB)": 58.3, "step": 15245, "token_acc": 0.4440894568690096, "train_speed(iter/s)": 1.449575 }, { "epoch": 0.6533567542093313, "grad_norm": 3.9359335899353027, "learning_rate": 9.5845747806261e-05, "loss": 2.6027481079101564, "memory(GiB)": 58.3, "step": 15250, "token_acc": 0.4472843450479233, "train_speed(iter/s)": 1.449665 }, { "epoch": 0.6535709695385802, "grad_norm": 3.7551069259643555, "learning_rate": 9.584306165532406e-05, "loss": 2.597681427001953, "memory(GiB)": 58.3, "step": 15255, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.449719 }, { "epoch": 0.6537851848678291, "grad_norm": 4.248190879821777, "learning_rate": 9.584037467389708e-05, "loss": 2.6357345581054688, "memory(GiB)": 58.3, "step": 15260, "token_acc": 0.42142857142857143, "train_speed(iter/s)": 1.449625 }, { "epoch": 0.6539994001970781, "grad_norm": 4.554111480712891, "learning_rate": 9.583768686202869e-05, "loss": 2.529732894897461, "memory(GiB)": 58.3, "step": 15265, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.449628 }, { "epoch": 0.6542136155263271, "grad_norm": 7.0177130699157715, "learning_rate": 9.583499821976762e-05, "loss": 2.3018617630004883, "memory(GiB)": 58.3, "step": 15270, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.44969 }, { "epoch": 0.654427830855576, "grad_norm": 3.57904314994812, "learning_rate": 9.583230874716253e-05, "loss": 2.4643199920654295, "memory(GiB)": 58.3, "step": 15275, "token_acc": 0.49337748344370863, "train_speed(iter/s)": 1.449541 }, { "epoch": 0.654642046184825, "grad_norm": 4.028859615325928, "learning_rate": 9.582961844426221e-05, "loss": 2.6705575942993165, "memory(GiB)": 58.3, "step": 15280, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.449535 }, { "epoch": 0.654856261514074, "grad_norm": 4.198547840118408, "learning_rate": 9.582692731111535e-05, "loss": 2.5124027252197267, "memory(GiB)": 58.3, "step": 15285, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.449683 }, { "epoch": 0.6550704768433229, "grad_norm": 5.1375532150268555, "learning_rate": 9.58242353477707e-05, "loss": 2.7559852600097656, "memory(GiB)": 58.3, "step": 15290, "token_acc": 0.42990654205607476, "train_speed(iter/s)": 1.449703 }, { "epoch": 0.6552846921725719, "grad_norm": 4.5739850997924805, "learning_rate": 9.582154255427705e-05, "loss": 2.802250289916992, "memory(GiB)": 58.3, "step": 15295, "token_acc": 0.47540983606557374, "train_speed(iter/s)": 1.449707 }, { "epoch": 0.6554989075018208, "grad_norm": 3.3840103149414062, "learning_rate": 9.581884893068315e-05, "loss": 2.700469398498535, "memory(GiB)": 58.3, "step": 15300, "token_acc": 0.4858757062146893, "train_speed(iter/s)": 1.449663 }, { "epoch": 0.6557131228310698, "grad_norm": 3.8742876052856445, "learning_rate": 9.581615447703784e-05, "loss": 2.60408878326416, "memory(GiB)": 58.3, "step": 15305, "token_acc": 0.45185185185185184, "train_speed(iter/s)": 1.449727 }, { "epoch": 0.6559273381603188, "grad_norm": 4.047839641571045, "learning_rate": 9.581345919338994e-05, "loss": 2.711720275878906, "memory(GiB)": 58.3, "step": 15310, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.449896 }, { "epoch": 0.6561415534895677, "grad_norm": 6.770996570587158, "learning_rate": 9.58107630797882e-05, "loss": 2.7397390365600587, "memory(GiB)": 58.3, "step": 15315, "token_acc": 0.45126353790613716, "train_speed(iter/s)": 1.449886 }, { "epoch": 0.6563557688188166, "grad_norm": 3.7116146087646484, "learning_rate": 9.580806613628155e-05, "loss": 2.1243192672729494, "memory(GiB)": 58.3, "step": 15320, "token_acc": 0.5271317829457365, "train_speed(iter/s)": 1.449861 }, { "epoch": 0.6565699841480657, "grad_norm": 4.117305278778076, "learning_rate": 9.580536836291878e-05, "loss": 2.8824741363525392, "memory(GiB)": 58.3, "step": 15325, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.449924 }, { "epoch": 0.6567841994773146, "grad_norm": 4.3368144035339355, "learning_rate": 9.58026697597488e-05, "loss": 2.6865615844726562, "memory(GiB)": 58.3, "step": 15330, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.449948 }, { "epoch": 0.6569984148065635, "grad_norm": 4.002928256988525, "learning_rate": 9.579997032682052e-05, "loss": 2.4928972244262697, "memory(GiB)": 58.3, "step": 15335, "token_acc": 0.459375, "train_speed(iter/s)": 1.450055 }, { "epoch": 0.6572126301358125, "grad_norm": 5.437620162963867, "learning_rate": 9.579727006418279e-05, "loss": 2.749365043640137, "memory(GiB)": 58.3, "step": 15340, "token_acc": 0.45723684210526316, "train_speed(iter/s)": 1.450044 }, { "epoch": 0.6574268454650615, "grad_norm": 4.67940616607666, "learning_rate": 9.579456897188455e-05, "loss": 2.677341651916504, "memory(GiB)": 58.3, "step": 15345, "token_acc": 0.43462897526501765, "train_speed(iter/s)": 1.450066 }, { "epoch": 0.6576410607943104, "grad_norm": 5.337660312652588, "learning_rate": 9.579186704997474e-05, "loss": 2.707118797302246, "memory(GiB)": 58.3, "step": 15350, "token_acc": 0.4539249146757679, "train_speed(iter/s)": 1.450126 }, { "epoch": 0.6578552761235594, "grad_norm": 4.947908878326416, "learning_rate": 9.57891642985023e-05, "loss": 2.550575065612793, "memory(GiB)": 58.3, "step": 15355, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.450199 }, { "epoch": 0.6580694914528084, "grad_norm": 3.9470489025115967, "learning_rate": 9.57864607175162e-05, "loss": 2.4807998657226564, "memory(GiB)": 58.3, "step": 15360, "token_acc": 0.4610169491525424, "train_speed(iter/s)": 1.45009 }, { "epoch": 0.6582837067820573, "grad_norm": 3.980710506439209, "learning_rate": 9.57837563070654e-05, "loss": 2.661483383178711, "memory(GiB)": 58.3, "step": 15365, "token_acc": 0.4486301369863014, "train_speed(iter/s)": 1.450136 }, { "epoch": 0.6584979221113063, "grad_norm": 5.150298595428467, "learning_rate": 9.578105106719893e-05, "loss": 2.567930221557617, "memory(GiB)": 58.3, "step": 15370, "token_acc": 0.4731182795698925, "train_speed(iter/s)": 1.450088 }, { "epoch": 0.6587121374405552, "grad_norm": 4.133373737335205, "learning_rate": 9.577834499796575e-05, "loss": 2.7865345001220705, "memory(GiB)": 58.3, "step": 15375, "token_acc": 0.48264984227129337, "train_speed(iter/s)": 1.450218 }, { "epoch": 0.6589263527698042, "grad_norm": 3.8452861309051514, "learning_rate": 9.577563809941492e-05, "loss": 2.725305938720703, "memory(GiB)": 58.3, "step": 15380, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.450212 }, { "epoch": 0.6591405680990532, "grad_norm": 3.7102482318878174, "learning_rate": 9.577293037159544e-05, "loss": 2.448910140991211, "memory(GiB)": 58.3, "step": 15385, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.450242 }, { "epoch": 0.6593547834283021, "grad_norm": 5.404450416564941, "learning_rate": 9.577022181455641e-05, "loss": 2.9752422332763673, "memory(GiB)": 58.3, "step": 15390, "token_acc": 0.4380952380952381, "train_speed(iter/s)": 1.450391 }, { "epoch": 0.659568998757551, "grad_norm": 7.460755348205566, "learning_rate": 9.57675124283469e-05, "loss": 3.09713134765625, "memory(GiB)": 58.3, "step": 15395, "token_acc": 0.42136498516320475, "train_speed(iter/s)": 1.450439 }, { "epoch": 0.6597832140868001, "grad_norm": 4.8870673179626465, "learning_rate": 9.576480221301593e-05, "loss": 2.526996040344238, "memory(GiB)": 58.3, "step": 15400, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 1.450396 }, { "epoch": 0.659997429416049, "grad_norm": 5.4269938468933105, "learning_rate": 9.576209116861265e-05, "loss": 2.477591705322266, "memory(GiB)": 58.3, "step": 15405, "token_acc": 0.45229681978798586, "train_speed(iter/s)": 1.450455 }, { "epoch": 0.6602116447452979, "grad_norm": 4.405874252319336, "learning_rate": 9.575937929518616e-05, "loss": 2.419519233703613, "memory(GiB)": 58.3, "step": 15410, "token_acc": 0.5, "train_speed(iter/s)": 1.450492 }, { "epoch": 0.660425860074547, "grad_norm": 3.6095001697540283, "learning_rate": 9.575666659278559e-05, "loss": 2.7280147552490233, "memory(GiB)": 58.3, "step": 15415, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.450433 }, { "epoch": 0.6606400754037959, "grad_norm": 4.082696437835693, "learning_rate": 9.575395306146008e-05, "loss": 2.7314144134521485, "memory(GiB)": 58.3, "step": 15420, "token_acc": 0.4676923076923077, "train_speed(iter/s)": 1.450416 }, { "epoch": 0.6608542907330448, "grad_norm": 3.2907700538635254, "learning_rate": 9.575123870125879e-05, "loss": 2.544133758544922, "memory(GiB)": 58.3, "step": 15425, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.450361 }, { "epoch": 0.6610685060622938, "grad_norm": 4.113558769226074, "learning_rate": 9.574852351223089e-05, "loss": 2.8627801895141602, "memory(GiB)": 58.3, "step": 15430, "token_acc": 0.42934782608695654, "train_speed(iter/s)": 1.450241 }, { "epoch": 0.6612827213915428, "grad_norm": 5.359152317047119, "learning_rate": 9.574580749442557e-05, "loss": 2.5982944488525392, "memory(GiB)": 58.3, "step": 15435, "token_acc": 0.49794238683127573, "train_speed(iter/s)": 1.450292 }, { "epoch": 0.6614969367207918, "grad_norm": 3.745368242263794, "learning_rate": 9.574309064789205e-05, "loss": 2.4612525939941405, "memory(GiB)": 58.3, "step": 15440, "token_acc": 0.4808510638297872, "train_speed(iter/s)": 1.450211 }, { "epoch": 0.6617111520500407, "grad_norm": 5.729100227355957, "learning_rate": 9.57403729726795e-05, "loss": 2.8477020263671875, "memory(GiB)": 58.3, "step": 15445, "token_acc": 0.4117647058823529, "train_speed(iter/s)": 1.450283 }, { "epoch": 0.6619253673792896, "grad_norm": 3.986607074737549, "learning_rate": 9.573765446883722e-05, "loss": 2.642181396484375, "memory(GiB)": 58.3, "step": 15450, "token_acc": 0.4520123839009288, "train_speed(iter/s)": 1.450259 }, { "epoch": 0.6621395827085387, "grad_norm": 4.759144306182861, "learning_rate": 9.57349351364144e-05, "loss": 2.9675331115722656, "memory(GiB)": 58.3, "step": 15455, "token_acc": 0.43508771929824563, "train_speed(iter/s)": 1.45029 }, { "epoch": 0.6623537980377876, "grad_norm": 3.890176296234131, "learning_rate": 9.573221497546035e-05, "loss": 2.729171562194824, "memory(GiB)": 58.3, "step": 15460, "token_acc": 0.42948717948717946, "train_speed(iter/s)": 1.450391 }, { "epoch": 0.6625680133670365, "grad_norm": 3.833308458328247, "learning_rate": 9.57294939860243e-05, "loss": 2.7343345642089845, "memory(GiB)": 58.3, "step": 15465, "token_acc": 0.4409937888198758, "train_speed(iter/s)": 1.450206 }, { "epoch": 0.6627822286962856, "grad_norm": 5.782052993774414, "learning_rate": 9.572677216815559e-05, "loss": 2.8174747467041015, "memory(GiB)": 58.3, "step": 15470, "token_acc": 0.4420289855072464, "train_speed(iter/s)": 1.450226 }, { "epoch": 0.6629964440255345, "grad_norm": 4.530306339263916, "learning_rate": 9.572404952190349e-05, "loss": 2.4785350799560546, "memory(GiB)": 58.3, "step": 15475, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.450288 }, { "epoch": 0.6632106593547834, "grad_norm": 3.7064712047576904, "learning_rate": 9.572132604731735e-05, "loss": 2.698088836669922, "memory(GiB)": 58.3, "step": 15480, "token_acc": 0.42933333333333334, "train_speed(iter/s)": 1.450391 }, { "epoch": 0.6634248746840324, "grad_norm": 3.4631004333496094, "learning_rate": 9.571860174444649e-05, "loss": 2.378741455078125, "memory(GiB)": 58.3, "step": 15485, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.450334 }, { "epoch": 0.6636390900132814, "grad_norm": 4.903103351593018, "learning_rate": 9.571587661334028e-05, "loss": 2.87160701751709, "memory(GiB)": 58.3, "step": 15490, "token_acc": 0.42485549132947975, "train_speed(iter/s)": 1.450324 }, { "epoch": 0.6638533053425303, "grad_norm": 5.625885486602783, "learning_rate": 9.571315065404808e-05, "loss": 2.830636405944824, "memory(GiB)": 58.3, "step": 15495, "token_acc": 0.43302180685358255, "train_speed(iter/s)": 1.450328 }, { "epoch": 0.6640675206717793, "grad_norm": 3.764037847518921, "learning_rate": 9.571042386661928e-05, "loss": 2.3046358108520506, "memory(GiB)": 58.3, "step": 15500, "token_acc": 0.5148809523809523, "train_speed(iter/s)": 1.450257 }, { "epoch": 0.6640675206717793, "eval_loss": 2.1536643505096436, "eval_runtime": 14.8387, "eval_samples_per_second": 6.739, "eval_steps_per_second": 6.739, "eval_token_acc": 0.48297604035308955, "step": 15500 }, { "epoch": 0.6642817360010282, "grad_norm": 4.8557024002075195, "learning_rate": 9.570769625110325e-05, "loss": 2.5669265747070313, "memory(GiB)": 58.3, "step": 15505, "token_acc": 0.47685185185185186, "train_speed(iter/s)": 1.44813 }, { "epoch": 0.6644959513302772, "grad_norm": 4.15380859375, "learning_rate": 9.570496780754945e-05, "loss": 2.394583892822266, "memory(GiB)": 58.3, "step": 15510, "token_acc": 0.5425101214574899, "train_speed(iter/s)": 1.448189 }, { "epoch": 0.6647101666595262, "grad_norm": 3.6459875106811523, "learning_rate": 9.570223853600727e-05, "loss": 2.4049057006835937, "memory(GiB)": 58.3, "step": 15515, "token_acc": 0.46407185628742514, "train_speed(iter/s)": 1.448203 }, { "epoch": 0.6649243819887751, "grad_norm": 4.712183952331543, "learning_rate": 9.569950843652618e-05, "loss": 2.4937490463256835, "memory(GiB)": 58.3, "step": 15520, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.448305 }, { "epoch": 0.665138597318024, "grad_norm": 4.0576252937316895, "learning_rate": 9.56967775091556e-05, "loss": 2.5970298767089846, "memory(GiB)": 58.3, "step": 15525, "token_acc": 0.45481927710843373, "train_speed(iter/s)": 1.448361 }, { "epoch": 0.6653528126472731, "grad_norm": 7.180209159851074, "learning_rate": 9.569404575394505e-05, "loss": 2.9798162460327147, "memory(GiB)": 58.3, "step": 15530, "token_acc": 0.3971631205673759, "train_speed(iter/s)": 1.448372 }, { "epoch": 0.665567027976522, "grad_norm": 6.177602767944336, "learning_rate": 9.569131317094399e-05, "loss": 2.54278450012207, "memory(GiB)": 58.3, "step": 15535, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.448288 }, { "epoch": 0.6657812433057709, "grad_norm": 4.771605968475342, "learning_rate": 9.568857976020193e-05, "loss": 2.2341651916503906, "memory(GiB)": 58.3, "step": 15540, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 1.448205 }, { "epoch": 0.66599545863502, "grad_norm": 6.403241157531738, "learning_rate": 9.568584552176838e-05, "loss": 2.461362838745117, "memory(GiB)": 58.3, "step": 15545, "token_acc": 0.458041958041958, "train_speed(iter/s)": 1.448194 }, { "epoch": 0.6662096739642689, "grad_norm": 6.478058338165283, "learning_rate": 9.568311045569289e-05, "loss": 2.625710296630859, "memory(GiB)": 58.3, "step": 15550, "token_acc": 0.4513888888888889, "train_speed(iter/s)": 1.448289 }, { "epoch": 0.6664238892935178, "grad_norm": 4.833446979522705, "learning_rate": 9.568037456202501e-05, "loss": 2.674976348876953, "memory(GiB)": 58.3, "step": 15555, "token_acc": 0.44404332129963897, "train_speed(iter/s)": 1.448339 }, { "epoch": 0.6666381046227668, "grad_norm": 5.176446437835693, "learning_rate": 9.567763784081428e-05, "loss": 2.465069389343262, "memory(GiB)": 58.3, "step": 15560, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.44838 }, { "epoch": 0.6668523199520158, "grad_norm": 4.159816741943359, "learning_rate": 9.567490029211029e-05, "loss": 2.8300382614135744, "memory(GiB)": 58.3, "step": 15565, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.448447 }, { "epoch": 0.6670665352812647, "grad_norm": 5.9345173835754395, "learning_rate": 9.567216191596264e-05, "loss": 2.7121652603149413, "memory(GiB)": 58.3, "step": 15570, "token_acc": 0.4022140221402214, "train_speed(iter/s)": 1.448456 }, { "epoch": 0.6672807506105137, "grad_norm": 5.538405895233154, "learning_rate": 9.566942271242093e-05, "loss": 2.6163633346557615, "memory(GiB)": 58.3, "step": 15575, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 1.448494 }, { "epoch": 0.6674949659397627, "grad_norm": 4.507320404052734, "learning_rate": 9.566668268153479e-05, "loss": 2.7107967376708983, "memory(GiB)": 58.3, "step": 15580, "token_acc": 0.4542483660130719, "train_speed(iter/s)": 1.44853 }, { "epoch": 0.6677091812690116, "grad_norm": 4.423649311065674, "learning_rate": 9.566394182335385e-05, "loss": 2.3837303161621093, "memory(GiB)": 58.3, "step": 15585, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.448467 }, { "epoch": 0.6679233965982606, "grad_norm": 4.200186252593994, "learning_rate": 9.566120013792776e-05, "loss": 2.636524963378906, "memory(GiB)": 58.3, "step": 15590, "token_acc": 0.42948717948717946, "train_speed(iter/s)": 1.448527 }, { "epoch": 0.6681376119275095, "grad_norm": 4.650906562805176, "learning_rate": 9.56584576253062e-05, "loss": 2.7633777618408204, "memory(GiB)": 58.3, "step": 15595, "token_acc": 0.44866920152091255, "train_speed(iter/s)": 1.448562 }, { "epoch": 0.6683518272567585, "grad_norm": 4.903339385986328, "learning_rate": 9.565571428553887e-05, "loss": 2.8440830230712892, "memory(GiB)": 58.3, "step": 15600, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.448635 }, { "epoch": 0.6685660425860075, "grad_norm": 5.680325031280518, "learning_rate": 9.565297011867543e-05, "loss": 2.523952293395996, "memory(GiB)": 58.3, "step": 15605, "token_acc": 0.4872881355932203, "train_speed(iter/s)": 1.448532 }, { "epoch": 0.6687802579152564, "grad_norm": 3.57682728767395, "learning_rate": 9.565022512476561e-05, "loss": 2.3742599487304688, "memory(GiB)": 58.3, "step": 15610, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.448619 }, { "epoch": 0.6689944732445053, "grad_norm": 3.680056571960449, "learning_rate": 9.564747930385915e-05, "loss": 2.515506553649902, "memory(GiB)": 58.3, "step": 15615, "token_acc": 0.4365781710914454, "train_speed(iter/s)": 1.448648 }, { "epoch": 0.6692086885737544, "grad_norm": 4.250692367553711, "learning_rate": 9.564473265600576e-05, "loss": 2.7476613998413084, "memory(GiB)": 58.3, "step": 15620, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.448617 }, { "epoch": 0.6694229039030033, "grad_norm": 3.3645665645599365, "learning_rate": 9.564198518125523e-05, "loss": 2.7650035858154296, "memory(GiB)": 58.3, "step": 15625, "token_acc": 0.44200626959247646, "train_speed(iter/s)": 1.448681 }, { "epoch": 0.6696371192322522, "grad_norm": 4.581256866455078, "learning_rate": 9.563923687965733e-05, "loss": 2.4920093536376955, "memory(GiB)": 58.3, "step": 15630, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.448558 }, { "epoch": 0.6698513345615013, "grad_norm": 4.66019344329834, "learning_rate": 9.563648775126184e-05, "loss": 2.6510133743286133, "memory(GiB)": 58.3, "step": 15635, "token_acc": 0.41605839416058393, "train_speed(iter/s)": 1.448541 }, { "epoch": 0.6700655498907502, "grad_norm": 3.4649465084075928, "learning_rate": 9.563373779611855e-05, "loss": 3.2039012908935547, "memory(GiB)": 58.3, "step": 15640, "token_acc": 0.4186666666666667, "train_speed(iter/s)": 1.448552 }, { "epoch": 0.6702797652199991, "grad_norm": 3.6231043338775635, "learning_rate": 9.563098701427731e-05, "loss": 2.628450393676758, "memory(GiB)": 58.3, "step": 15645, "token_acc": 0.42805755395683454, "train_speed(iter/s)": 1.448482 }, { "epoch": 0.6704939805492481, "grad_norm": 3.9865915775299072, "learning_rate": 9.562823540578791e-05, "loss": 2.263797569274902, "memory(GiB)": 58.3, "step": 15650, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.448468 }, { "epoch": 0.6707081958784971, "grad_norm": 4.097041130065918, "learning_rate": 9.562548297070025e-05, "loss": 2.6353050231933595, "memory(GiB)": 58.3, "step": 15655, "token_acc": 0.4420289855072464, "train_speed(iter/s)": 1.448541 }, { "epoch": 0.670922411207746, "grad_norm": 4.2176947593688965, "learning_rate": 9.562272970906416e-05, "loss": 2.4473430633544924, "memory(GiB)": 58.3, "step": 15660, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.448567 }, { "epoch": 0.671136626536995, "grad_norm": 4.233860015869141, "learning_rate": 9.561997562092951e-05, "loss": 2.614794158935547, "memory(GiB)": 58.3, "step": 15665, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 1.448615 }, { "epoch": 0.6713508418662439, "grad_norm": 5.902677059173584, "learning_rate": 9.561722070634623e-05, "loss": 2.4403099060058593, "memory(GiB)": 58.3, "step": 15670, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.448708 }, { "epoch": 0.6715650571954929, "grad_norm": 4.197824001312256, "learning_rate": 9.561446496536418e-05, "loss": 2.6506038665771485, "memory(GiB)": 58.3, "step": 15675, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.448782 }, { "epoch": 0.6717792725247419, "grad_norm": 4.982149600982666, "learning_rate": 9.561170839803331e-05, "loss": 2.8639751434326173, "memory(GiB)": 58.3, "step": 15680, "token_acc": 0.4394904458598726, "train_speed(iter/s)": 1.448724 }, { "epoch": 0.6719934878539908, "grad_norm": 3.7808735370635986, "learning_rate": 9.560895100440357e-05, "loss": 2.6068578720092774, "memory(GiB)": 58.3, "step": 15685, "token_acc": 0.47384615384615386, "train_speed(iter/s)": 1.448636 }, { "epoch": 0.6722077031832397, "grad_norm": 5.031792163848877, "learning_rate": 9.56061927845249e-05, "loss": 2.752046585083008, "memory(GiB)": 58.3, "step": 15690, "token_acc": 0.4415954415954416, "train_speed(iter/s)": 1.448581 }, { "epoch": 0.6724219185124888, "grad_norm": 4.1393866539001465, "learning_rate": 9.560343373844724e-05, "loss": 2.550790214538574, "memory(GiB)": 58.3, "step": 15695, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.448735 }, { "epoch": 0.6726361338417377, "grad_norm": 3.781517505645752, "learning_rate": 9.560067386622063e-05, "loss": 2.2200366973876955, "memory(GiB)": 58.3, "step": 15700, "token_acc": 0.4690909090909091, "train_speed(iter/s)": 1.448676 }, { "epoch": 0.6728503491709866, "grad_norm": 3.903384208679199, "learning_rate": 9.559791316789502e-05, "loss": 2.4927507400512696, "memory(GiB)": 58.3, "step": 15705, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 1.448766 }, { "epoch": 0.6730645645002357, "grad_norm": 4.564765453338623, "learning_rate": 9.559515164352044e-05, "loss": 2.3333017349243166, "memory(GiB)": 58.3, "step": 15710, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.448836 }, { "epoch": 0.6732787798294846, "grad_norm": 4.970495223999023, "learning_rate": 9.559238929314692e-05, "loss": 2.400589942932129, "memory(GiB)": 58.3, "step": 15715, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.448981 }, { "epoch": 0.6734929951587335, "grad_norm": 6.2129316329956055, "learning_rate": 9.558962611682447e-05, "loss": 2.517072296142578, "memory(GiB)": 58.3, "step": 15720, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.448921 }, { "epoch": 0.6737072104879825, "grad_norm": 4.229959011077881, "learning_rate": 9.558686211460322e-05, "loss": 2.586163330078125, "memory(GiB)": 58.3, "step": 15725, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.449017 }, { "epoch": 0.6739214258172315, "grad_norm": 5.169749736785889, "learning_rate": 9.558409728653317e-05, "loss": 2.6079883575439453, "memory(GiB)": 58.3, "step": 15730, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.449006 }, { "epoch": 0.6741356411464804, "grad_norm": 4.491950988769531, "learning_rate": 9.558133163266444e-05, "loss": 2.63720817565918, "memory(GiB)": 58.3, "step": 15735, "token_acc": 0.42700729927007297, "train_speed(iter/s)": 1.449099 }, { "epoch": 0.6743498564757294, "grad_norm": 3.3280744552612305, "learning_rate": 9.557856515304713e-05, "loss": 2.5957775115966797, "memory(GiB)": 58.3, "step": 15740, "token_acc": 0.5095785440613027, "train_speed(iter/s)": 1.449122 }, { "epoch": 0.6745640718049783, "grad_norm": 6.450634479522705, "learning_rate": 9.557579784773137e-05, "loss": 2.9286909103393555, "memory(GiB)": 58.3, "step": 15745, "token_acc": 0.415929203539823, "train_speed(iter/s)": 1.449168 }, { "epoch": 0.6747782871342273, "grad_norm": 4.917341232299805, "learning_rate": 9.557302971676727e-05, "loss": 2.7538082122802736, "memory(GiB)": 58.3, "step": 15750, "token_acc": 0.44816053511705684, "train_speed(iter/s)": 1.44924 }, { "epoch": 0.6749925024634763, "grad_norm": 4.088737487792969, "learning_rate": 9.557026076020498e-05, "loss": 2.5165273666381838, "memory(GiB)": 58.3, "step": 15755, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.449304 }, { "epoch": 0.6752067177927252, "grad_norm": 5.825265407562256, "learning_rate": 9.556749097809468e-05, "loss": 2.7471025466918944, "memory(GiB)": 58.3, "step": 15760, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.449354 }, { "epoch": 0.6754209331219742, "grad_norm": 3.5806992053985596, "learning_rate": 9.556472037048651e-05, "loss": 2.5167312622070312, "memory(GiB)": 58.3, "step": 15765, "token_acc": 0.5060728744939271, "train_speed(iter/s)": 1.449396 }, { "epoch": 0.6756351484512232, "grad_norm": 5.661139965057373, "learning_rate": 9.556194893743071e-05, "loss": 2.584786224365234, "memory(GiB)": 58.3, "step": 15770, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.4495 }, { "epoch": 0.6758493637804721, "grad_norm": 3.8455541133880615, "learning_rate": 9.555917667897746e-05, "loss": 2.7654178619384764, "memory(GiB)": 58.3, "step": 15775, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.449355 }, { "epoch": 0.6760635791097211, "grad_norm": 3.6448280811309814, "learning_rate": 9.555640359517699e-05, "loss": 2.662990379333496, "memory(GiB)": 58.3, "step": 15780, "token_acc": 0.43217665615141954, "train_speed(iter/s)": 1.449363 }, { "epoch": 0.6762777944389701, "grad_norm": 5.703397274017334, "learning_rate": 9.555362968607952e-05, "loss": 2.5035566329956054, "memory(GiB)": 58.3, "step": 15785, "token_acc": 0.45864661654135336, "train_speed(iter/s)": 1.44935 }, { "epoch": 0.676492009768219, "grad_norm": 5.291288375854492, "learning_rate": 9.555085495173532e-05, "loss": 2.7179203033447266, "memory(GiB)": 58.3, "step": 15790, "token_acc": 0.44744744744744747, "train_speed(iter/s)": 1.449386 }, { "epoch": 0.676706225097468, "grad_norm": 3.1444194316864014, "learning_rate": 9.554807939219467e-05, "loss": 2.593650245666504, "memory(GiB)": 58.3, "step": 15795, "token_acc": 0.453551912568306, "train_speed(iter/s)": 1.449422 }, { "epoch": 0.676920440426717, "grad_norm": 4.930181503295898, "learning_rate": 9.554530300750782e-05, "loss": 2.3686145782470702, "memory(GiB)": 58.3, "step": 15800, "token_acc": 0.5096153846153846, "train_speed(iter/s)": 1.449459 }, { "epoch": 0.6771346557559659, "grad_norm": 3.371978282928467, "learning_rate": 9.554252579772509e-05, "loss": 2.7324657440185547, "memory(GiB)": 58.3, "step": 15805, "token_acc": 0.4409937888198758, "train_speed(iter/s)": 1.449497 }, { "epoch": 0.6773488710852149, "grad_norm": 4.526597499847412, "learning_rate": 9.553974776289678e-05, "loss": 2.4020816802978517, "memory(GiB)": 58.3, "step": 15810, "token_acc": 0.5228070175438596, "train_speed(iter/s)": 1.449483 }, { "epoch": 0.6775630864144638, "grad_norm": 5.530508518218994, "learning_rate": 9.55369689030732e-05, "loss": 2.7830686569213867, "memory(GiB)": 58.3, "step": 15815, "token_acc": 0.4423791821561338, "train_speed(iter/s)": 1.449503 }, { "epoch": 0.6777773017437128, "grad_norm": 5.447394847869873, "learning_rate": 9.553418921830473e-05, "loss": 2.8545257568359377, "memory(GiB)": 58.3, "step": 15820, "token_acc": 0.4367816091954023, "train_speed(iter/s)": 1.449539 }, { "epoch": 0.6779915170729618, "grad_norm": 4.551540374755859, "learning_rate": 9.55314087086417e-05, "loss": 2.6005834579467773, "memory(GiB)": 58.3, "step": 15825, "token_acc": 0.44722222222222224, "train_speed(iter/s)": 1.449545 }, { "epoch": 0.6782057324022107, "grad_norm": 4.18668794631958, "learning_rate": 9.552862737413449e-05, "loss": 2.5934478759765627, "memory(GiB)": 58.3, "step": 15830, "token_acc": 0.45934959349593496, "train_speed(iter/s)": 1.44949 }, { "epoch": 0.6784199477314596, "grad_norm": 4.853087425231934, "learning_rate": 9.55258452148335e-05, "loss": 2.9316173553466798, "memory(GiB)": 58.3, "step": 15835, "token_acc": 0.3720136518771331, "train_speed(iter/s)": 1.449685 }, { "epoch": 0.6786341630607087, "grad_norm": 4.226537227630615, "learning_rate": 9.552306223078911e-05, "loss": 2.6189754486083983, "memory(GiB)": 58.3, "step": 15840, "token_acc": 0.44814814814814813, "train_speed(iter/s)": 1.449792 }, { "epoch": 0.6788483783899576, "grad_norm": 4.9391984939575195, "learning_rate": 9.552027842205174e-05, "loss": 2.5882129669189453, "memory(GiB)": 58.3, "step": 15845, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.449899 }, { "epoch": 0.6790625937192065, "grad_norm": 4.2914886474609375, "learning_rate": 9.55174937886718e-05, "loss": 2.534084510803223, "memory(GiB)": 58.3, "step": 15850, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.449918 }, { "epoch": 0.6792768090484556, "grad_norm": 5.369961738586426, "learning_rate": 9.551470833069979e-05, "loss": 2.339730644226074, "memory(GiB)": 58.3, "step": 15855, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.44991 }, { "epoch": 0.6794910243777045, "grad_norm": 4.803164005279541, "learning_rate": 9.551192204818615e-05, "loss": 2.491539192199707, "memory(GiB)": 58.3, "step": 15860, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.449898 }, { "epoch": 0.6797052397069534, "grad_norm": 3.6270666122436523, "learning_rate": 9.550913494118133e-05, "loss": 2.540834426879883, "memory(GiB)": 58.3, "step": 15865, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.449983 }, { "epoch": 0.6799194550362024, "grad_norm": 4.375837802886963, "learning_rate": 9.550634700973585e-05, "loss": 2.5530811309814454, "memory(GiB)": 58.3, "step": 15870, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.450053 }, { "epoch": 0.6801336703654514, "grad_norm": 4.825113773345947, "learning_rate": 9.55035582539002e-05, "loss": 2.3854511260986326, "memory(GiB)": 58.3, "step": 15875, "token_acc": 0.495114006514658, "train_speed(iter/s)": 1.450144 }, { "epoch": 0.6803478856947003, "grad_norm": 4.216777801513672, "learning_rate": 9.550076867372491e-05, "loss": 2.5397287368774415, "memory(GiB)": 58.3, "step": 15880, "token_acc": 0.5020408163265306, "train_speed(iter/s)": 1.450225 }, { "epoch": 0.6805621010239493, "grad_norm": 3.958674907684326, "learning_rate": 9.549797826926052e-05, "loss": 3.0595335006713866, "memory(GiB)": 58.3, "step": 15885, "token_acc": 0.4208754208754209, "train_speed(iter/s)": 1.450291 }, { "epoch": 0.6807763163531982, "grad_norm": 4.0088629722595215, "learning_rate": 9.549518704055755e-05, "loss": 2.646309661865234, "memory(GiB)": 58.3, "step": 15890, "token_acc": 0.4847560975609756, "train_speed(iter/s)": 1.450348 }, { "epoch": 0.6809905316824472, "grad_norm": 3.2504427433013916, "learning_rate": 9.549239498766661e-05, "loss": 2.3795700073242188, "memory(GiB)": 58.3, "step": 15895, "token_acc": 0.5296167247386759, "train_speed(iter/s)": 1.450378 }, { "epoch": 0.6812047470116962, "grad_norm": 6.107132911682129, "learning_rate": 9.548960211063824e-05, "loss": 2.814947319030762, "memory(GiB)": 58.3, "step": 15900, "token_acc": 0.4377358490566038, "train_speed(iter/s)": 1.450523 }, { "epoch": 0.6814189623409451, "grad_norm": 4.817509174346924, "learning_rate": 9.548680840952308e-05, "loss": 2.4671628952026365, "memory(GiB)": 58.3, "step": 15905, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.450489 }, { "epoch": 0.681633177670194, "grad_norm": 3.37170672416687, "learning_rate": 9.548401388437169e-05, "loss": 2.4301380157470702, "memory(GiB)": 58.3, "step": 15910, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.450458 }, { "epoch": 0.6818473929994431, "grad_norm": 3.7406206130981445, "learning_rate": 9.548121853523475e-05, "loss": 2.5562255859375, "memory(GiB)": 58.3, "step": 15915, "token_acc": 0.4421364985163205, "train_speed(iter/s)": 1.450442 }, { "epoch": 0.682061608328692, "grad_norm": 5.093752861022949, "learning_rate": 9.547842236216285e-05, "loss": 2.5152847290039064, "memory(GiB)": 58.3, "step": 15920, "token_acc": 0.44074074074074077, "train_speed(iter/s)": 1.450432 }, { "epoch": 0.6822758236579409, "grad_norm": 7.331698894500732, "learning_rate": 9.547562536520667e-05, "loss": 2.437386894226074, "memory(GiB)": 58.3, "step": 15925, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.450507 }, { "epoch": 0.68249003898719, "grad_norm": 3.6611979007720947, "learning_rate": 9.547282754441687e-05, "loss": 2.376340866088867, "memory(GiB)": 58.3, "step": 15930, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.45038 }, { "epoch": 0.6827042543164389, "grad_norm": 4.315795421600342, "learning_rate": 9.547002889984415e-05, "loss": 2.6874521255493162, "memory(GiB)": 58.3, "step": 15935, "token_acc": 0.45808383233532934, "train_speed(iter/s)": 1.450423 }, { "epoch": 0.6829184696456878, "grad_norm": 3.830885648727417, "learning_rate": 9.546722943153921e-05, "loss": 2.7009918212890627, "memory(GiB)": 58.3, "step": 15940, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.450553 }, { "epoch": 0.6831326849749368, "grad_norm": 5.225132942199707, "learning_rate": 9.546442913955276e-05, "loss": 2.9436458587646483, "memory(GiB)": 58.3, "step": 15945, "token_acc": 0.42679127725856697, "train_speed(iter/s)": 1.450532 }, { "epoch": 0.6833469003041858, "grad_norm": 4.207620143890381, "learning_rate": 9.54616280239355e-05, "loss": 2.712986373901367, "memory(GiB)": 58.3, "step": 15950, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.450611 }, { "epoch": 0.6835611156334347, "grad_norm": 4.295711040496826, "learning_rate": 9.545882608473823e-05, "loss": 2.7001279830932616, "memory(GiB)": 58.3, "step": 15955, "token_acc": 0.41420118343195267, "train_speed(iter/s)": 1.450667 }, { "epoch": 0.6837753309626837, "grad_norm": 3.617671489715576, "learning_rate": 9.545602332201167e-05, "loss": 2.2829994201660155, "memory(GiB)": 58.3, "step": 15960, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.450622 }, { "epoch": 0.6839895462919326, "grad_norm": 6.1937456130981445, "learning_rate": 9.545321973580661e-05, "loss": 2.655887985229492, "memory(GiB)": 58.3, "step": 15965, "token_acc": 0.4813664596273292, "train_speed(iter/s)": 1.450634 }, { "epoch": 0.6842037616211816, "grad_norm": 4.067612648010254, "learning_rate": 9.545041532617382e-05, "loss": 2.786912536621094, "memory(GiB)": 58.3, "step": 15970, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.450701 }, { "epoch": 0.6844179769504306, "grad_norm": 4.4033989906311035, "learning_rate": 9.544761009316414e-05, "loss": 2.200278472900391, "memory(GiB)": 58.3, "step": 15975, "token_acc": 0.484375, "train_speed(iter/s)": 1.450778 }, { "epoch": 0.6846321922796795, "grad_norm": 3.363515853881836, "learning_rate": 9.544480403682836e-05, "loss": 2.694540596008301, "memory(GiB)": 58.3, "step": 15980, "token_acc": 0.45016077170418006, "train_speed(iter/s)": 1.450812 }, { "epoch": 0.6848464076089285, "grad_norm": 4.277008533477783, "learning_rate": 9.544199715721734e-05, "loss": 2.2847898483276365, "memory(GiB)": 58.3, "step": 15985, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.450816 }, { "epoch": 0.6850606229381775, "grad_norm": 5.444164276123047, "learning_rate": 9.543918945438189e-05, "loss": 2.752191162109375, "memory(GiB)": 58.3, "step": 15990, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.450821 }, { "epoch": 0.6852748382674264, "grad_norm": 4.117075443267822, "learning_rate": 9.543638092837291e-05, "loss": 2.4991092681884766, "memory(GiB)": 58.3, "step": 15995, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.450865 }, { "epoch": 0.6854890535966753, "grad_norm": 5.4325151443481445, "learning_rate": 9.543357157924126e-05, "loss": 2.451528549194336, "memory(GiB)": 58.3, "step": 16000, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.450834 }, { "epoch": 0.6854890535966753, "eval_loss": 2.322693109512329, "eval_runtime": 13.6174, "eval_samples_per_second": 7.344, "eval_steps_per_second": 7.344, "eval_token_acc": 0.4449877750611247, "step": 16000 }, { "epoch": 0.6857032689259244, "grad_norm": 7.669858455657959, "learning_rate": 9.543076140703786e-05, "loss": 2.4565792083740234, "memory(GiB)": 58.3, "step": 16005, "token_acc": 0.45146520146520147, "train_speed(iter/s)": 1.448885 }, { "epoch": 0.6859174842551733, "grad_norm": 4.37327241897583, "learning_rate": 9.542795041181359e-05, "loss": 2.625646209716797, "memory(GiB)": 58.3, "step": 16010, "token_acc": 0.475, "train_speed(iter/s)": 1.449003 }, { "epoch": 0.6861316995844222, "grad_norm": 3.5756986141204834, "learning_rate": 9.542513859361937e-05, "loss": 2.615800476074219, "memory(GiB)": 58.3, "step": 16015, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.449111 }, { "epoch": 0.6863459149136713, "grad_norm": 3.9372990131378174, "learning_rate": 9.542232595250615e-05, "loss": 2.419285011291504, "memory(GiB)": 58.3, "step": 16020, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 1.449101 }, { "epoch": 0.6865601302429202, "grad_norm": 3.793991804122925, "learning_rate": 9.54195124885249e-05, "loss": 2.5227998733520507, "memory(GiB)": 58.3, "step": 16025, "token_acc": 0.46715328467153283, "train_speed(iter/s)": 1.449144 }, { "epoch": 0.6867743455721691, "grad_norm": 5.286299705505371, "learning_rate": 9.541669820172656e-05, "loss": 2.799925422668457, "memory(GiB)": 58.3, "step": 16030, "token_acc": 0.4266666666666667, "train_speed(iter/s)": 1.449197 }, { "epoch": 0.6869885609014181, "grad_norm": 3.6555585861206055, "learning_rate": 9.541388309216214e-05, "loss": 2.8805471420288087, "memory(GiB)": 58.3, "step": 16035, "token_acc": 0.4252199413489736, "train_speed(iter/s)": 1.449221 }, { "epoch": 0.6872027762306671, "grad_norm": 5.6552252769470215, "learning_rate": 9.541106715988263e-05, "loss": 2.4908044815063475, "memory(GiB)": 58.3, "step": 16040, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.449226 }, { "epoch": 0.687416991559916, "grad_norm": 4.04453182220459, "learning_rate": 9.540825040493903e-05, "loss": 2.737270736694336, "memory(GiB)": 58.3, "step": 16045, "token_acc": 0.4623955431754875, "train_speed(iter/s)": 1.449303 }, { "epoch": 0.687631206889165, "grad_norm": 6.701990604400635, "learning_rate": 9.540543282738239e-05, "loss": 2.8500442504882812, "memory(GiB)": 58.3, "step": 16050, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.44938 }, { "epoch": 0.6878454222184139, "grad_norm": 3.8837637901306152, "learning_rate": 9.540261442726373e-05, "loss": 2.3897096633911135, "memory(GiB)": 58.3, "step": 16055, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.449392 }, { "epoch": 0.6880596375476629, "grad_norm": 4.747652530670166, "learning_rate": 9.539979520463413e-05, "loss": 2.5115901947021486, "memory(GiB)": 58.3, "step": 16060, "token_acc": 0.48046875, "train_speed(iter/s)": 1.449424 }, { "epoch": 0.6882738528769119, "grad_norm": 3.818439245223999, "learning_rate": 9.539697515954465e-05, "loss": 2.573995590209961, "memory(GiB)": 58.3, "step": 16065, "token_acc": 0.5274725274725275, "train_speed(iter/s)": 1.449487 }, { "epoch": 0.6884880682061608, "grad_norm": 5.686701774597168, "learning_rate": 9.539415429204636e-05, "loss": 2.7241022109985353, "memory(GiB)": 58.3, "step": 16070, "token_acc": 0.43986254295532645, "train_speed(iter/s)": 1.449489 }, { "epoch": 0.6887022835354097, "grad_norm": 4.41595983505249, "learning_rate": 9.53913326021904e-05, "loss": 2.47973518371582, "memory(GiB)": 58.3, "step": 16075, "token_acc": 0.49377593360995853, "train_speed(iter/s)": 1.449536 }, { "epoch": 0.6889164988646588, "grad_norm": 4.110328197479248, "learning_rate": 9.538851009002785e-05, "loss": 2.469658851623535, "memory(GiB)": 58.3, "step": 16080, "token_acc": 0.49693251533742333, "train_speed(iter/s)": 1.44951 }, { "epoch": 0.6891307141939077, "grad_norm": 3.547121047973633, "learning_rate": 9.538568675560988e-05, "loss": 2.4900867462158205, "memory(GiB)": 58.3, "step": 16085, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.449508 }, { "epoch": 0.6893449295231566, "grad_norm": 4.928086757659912, "learning_rate": 9.538286259898762e-05, "loss": 2.4570621490478515, "memory(GiB)": 58.3, "step": 16090, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.449573 }, { "epoch": 0.6895591448524057, "grad_norm": 4.7055583000183105, "learning_rate": 9.538003762021221e-05, "loss": 2.6836000442504884, "memory(GiB)": 58.3, "step": 16095, "token_acc": 0.4236111111111111, "train_speed(iter/s)": 1.449735 }, { "epoch": 0.6897733601816546, "grad_norm": 4.162377834320068, "learning_rate": 9.537721181933488e-05, "loss": 2.3971416473388674, "memory(GiB)": 58.3, "step": 16100, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.449735 }, { "epoch": 0.6899875755109035, "grad_norm": 5.180971145629883, "learning_rate": 9.537438519640675e-05, "loss": 2.8276094436645507, "memory(GiB)": 58.3, "step": 16105, "token_acc": 0.4219269102990033, "train_speed(iter/s)": 1.449773 }, { "epoch": 0.6902017908401525, "grad_norm": 4.417033672332764, "learning_rate": 9.53715577514791e-05, "loss": 2.6529945373535155, "memory(GiB)": 58.3, "step": 16110, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.449903 }, { "epoch": 0.6904160061694015, "grad_norm": 4.234143257141113, "learning_rate": 9.536872948460312e-05, "loss": 3.0863554000854494, "memory(GiB)": 58.3, "step": 16115, "token_acc": 0.3931034482758621, "train_speed(iter/s)": 1.449982 }, { "epoch": 0.6906302214986505, "grad_norm": 5.599666595458984, "learning_rate": 9.536590039583004e-05, "loss": 3.0715717315673827, "memory(GiB)": 58.3, "step": 16120, "token_acc": 0.428169014084507, "train_speed(iter/s)": 1.449985 }, { "epoch": 0.6908444368278994, "grad_norm": 4.459110736846924, "learning_rate": 9.536307048521112e-05, "loss": 2.604759407043457, "memory(GiB)": 58.3, "step": 16125, "token_acc": 0.44569288389513106, "train_speed(iter/s)": 1.449951 }, { "epoch": 0.6910586521571483, "grad_norm": 3.9584357738494873, "learning_rate": 9.53602397527976e-05, "loss": 2.5693227767944338, "memory(GiB)": 58.3, "step": 16130, "token_acc": 0.4906832298136646, "train_speed(iter/s)": 1.449953 }, { "epoch": 0.6912728674863974, "grad_norm": 4.391383647918701, "learning_rate": 9.535740819864081e-05, "loss": 2.773303985595703, "memory(GiB)": 58.3, "step": 16135, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.449884 }, { "epoch": 0.6914870828156463, "grad_norm": 5.701388359069824, "learning_rate": 9.535457582279203e-05, "loss": 2.2828033447265623, "memory(GiB)": 58.3, "step": 16140, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.449785 }, { "epoch": 0.6917012981448952, "grad_norm": 5.432440280914307, "learning_rate": 9.535174262530254e-05, "loss": 2.720505714416504, "memory(GiB)": 58.3, "step": 16145, "token_acc": 0.4357142857142857, "train_speed(iter/s)": 1.449811 }, { "epoch": 0.6919155134741443, "grad_norm": 3.811840534210205, "learning_rate": 9.53489086062237e-05, "loss": 2.417336654663086, "memory(GiB)": 58.3, "step": 16150, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.449639 }, { "epoch": 0.6921297288033932, "grad_norm": 4.712042331695557, "learning_rate": 9.534607376560684e-05, "loss": 2.4939800262451173, "memory(GiB)": 58.3, "step": 16155, "token_acc": 0.4440789473684211, "train_speed(iter/s)": 1.449653 }, { "epoch": 0.6923439441326421, "grad_norm": 4.263582706451416, "learning_rate": 9.534323810350332e-05, "loss": 2.709640884399414, "memory(GiB)": 58.3, "step": 16160, "token_acc": 0.43370165745856354, "train_speed(iter/s)": 1.449606 }, { "epoch": 0.6925581594618911, "grad_norm": 3.9528398513793945, "learning_rate": 9.534040161996449e-05, "loss": 2.777052307128906, "memory(GiB)": 58.3, "step": 16165, "token_acc": 0.44753086419753085, "train_speed(iter/s)": 1.449681 }, { "epoch": 0.6927723747911401, "grad_norm": 5.489184379577637, "learning_rate": 9.533756431504177e-05, "loss": 2.6040952682495115, "memory(GiB)": 58.3, "step": 16170, "token_acc": 0.45, "train_speed(iter/s)": 1.449658 }, { "epoch": 0.692986590120389, "grad_norm": 4.068277359008789, "learning_rate": 9.533472618878653e-05, "loss": 2.476105499267578, "memory(GiB)": 58.3, "step": 16175, "token_acc": 0.47854785478547857, "train_speed(iter/s)": 1.449665 }, { "epoch": 0.693200805449638, "grad_norm": 4.737843036651611, "learning_rate": 9.53318872412502e-05, "loss": 2.7528892517089845, "memory(GiB)": 58.31, "step": 16180, "token_acc": 0.43389830508474575, "train_speed(iter/s)": 1.44956 }, { "epoch": 0.693415020778887, "grad_norm": 4.122523784637451, "learning_rate": 9.532904747248422e-05, "loss": 2.7778581619262694, "memory(GiB)": 58.31, "step": 16185, "token_acc": 0.44363636363636366, "train_speed(iter/s)": 1.449671 }, { "epoch": 0.6936292361081359, "grad_norm": 3.963362216949463, "learning_rate": 9.532620688254e-05, "loss": 2.5131149291992188, "memory(GiB)": 58.31, "step": 16190, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.44966 }, { "epoch": 0.6938434514373849, "grad_norm": 4.278669357299805, "learning_rate": 9.532336547146904e-05, "loss": 2.8307126998901366, "memory(GiB)": 58.31, "step": 16195, "token_acc": 0.4340175953079179, "train_speed(iter/s)": 1.449623 }, { "epoch": 0.6940576667666338, "grad_norm": 3.4925103187561035, "learning_rate": 9.532052323932279e-05, "loss": 2.4903324127197264, "memory(GiB)": 58.31, "step": 16200, "token_acc": 0.48024316109422494, "train_speed(iter/s)": 1.449628 }, { "epoch": 0.6942718820958828, "grad_norm": 3.750378370285034, "learning_rate": 9.531768018615276e-05, "loss": 2.578135108947754, "memory(GiB)": 58.31, "step": 16205, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.449617 }, { "epoch": 0.6944860974251318, "grad_norm": 3.7137584686279297, "learning_rate": 9.531483631201044e-05, "loss": 2.89431209564209, "memory(GiB)": 58.31, "step": 16210, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.449604 }, { "epoch": 0.6947003127543807, "grad_norm": 4.538028240203857, "learning_rate": 9.531199161694732e-05, "loss": 2.784868049621582, "memory(GiB)": 58.31, "step": 16215, "token_acc": 0.42813455657492355, "train_speed(iter/s)": 1.449597 }, { "epoch": 0.6949145280836296, "grad_norm": 3.5266575813293457, "learning_rate": 9.5309146101015e-05, "loss": 2.85870418548584, "memory(GiB)": 58.31, "step": 16220, "token_acc": 0.45321637426900585, "train_speed(iter/s)": 1.449693 }, { "epoch": 0.6951287434128787, "grad_norm": 4.1946845054626465, "learning_rate": 9.530629976426499e-05, "loss": 2.6545372009277344, "memory(GiB)": 58.31, "step": 16225, "token_acc": 0.43302180685358255, "train_speed(iter/s)": 1.449739 }, { "epoch": 0.6953429587421276, "grad_norm": 4.715703964233398, "learning_rate": 9.530345260674885e-05, "loss": 2.595602798461914, "memory(GiB)": 58.31, "step": 16230, "token_acc": 0.4921875, "train_speed(iter/s)": 1.449611 }, { "epoch": 0.6955571740713765, "grad_norm": 5.864632606506348, "learning_rate": 9.530060462851818e-05, "loss": 2.577613639831543, "memory(GiB)": 58.31, "step": 16235, "token_acc": 0.4309210526315789, "train_speed(iter/s)": 1.449668 }, { "epoch": 0.6957713894006256, "grad_norm": 4.584555625915527, "learning_rate": 9.529775582962455e-05, "loss": 2.4517467498779295, "memory(GiB)": 58.31, "step": 16240, "token_acc": 0.4746376811594203, "train_speed(iter/s)": 1.449669 }, { "epoch": 0.6959856047298745, "grad_norm": 3.677097797393799, "learning_rate": 9.529490621011958e-05, "loss": 2.4341060638427736, "memory(GiB)": 58.31, "step": 16245, "token_acc": 0.5103857566765578, "train_speed(iter/s)": 1.449694 }, { "epoch": 0.6961998200591234, "grad_norm": 4.16178560256958, "learning_rate": 9.529205577005491e-05, "loss": 2.491594696044922, "memory(GiB)": 58.31, "step": 16250, "token_acc": 0.48493975903614456, "train_speed(iter/s)": 1.449732 }, { "epoch": 0.6964140353883724, "grad_norm": 3.9551873207092285, "learning_rate": 9.528920450948215e-05, "loss": 2.1626350402832033, "memory(GiB)": 58.31, "step": 16255, "token_acc": 0.5194805194805194, "train_speed(iter/s)": 1.449701 }, { "epoch": 0.6966282507176214, "grad_norm": 3.9807679653167725, "learning_rate": 9.528635242845297e-05, "loss": 2.726948547363281, "memory(GiB)": 58.31, "step": 16260, "token_acc": 0.45660377358490567, "train_speed(iter/s)": 1.449736 }, { "epoch": 0.6968424660468703, "grad_norm": 2.9111335277557373, "learning_rate": 9.528349952701902e-05, "loss": 2.7875213623046875, "memory(GiB)": 58.31, "step": 16265, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.449802 }, { "epoch": 0.6970566813761193, "grad_norm": 3.767491102218628, "learning_rate": 9.528064580523201e-05, "loss": 2.675881195068359, "memory(GiB)": 58.31, "step": 16270, "token_acc": 0.4174454828660436, "train_speed(iter/s)": 1.449799 }, { "epoch": 0.6972708967053682, "grad_norm": 6.079091548919678, "learning_rate": 9.527779126314362e-05, "loss": 2.9438419342041016, "memory(GiB)": 58.31, "step": 16275, "token_acc": 0.4125, "train_speed(iter/s)": 1.449884 }, { "epoch": 0.6974851120346172, "grad_norm": 4.782175064086914, "learning_rate": 9.527493590080557e-05, "loss": 2.6365474700927733, "memory(GiB)": 58.31, "step": 16280, "token_acc": 0.49, "train_speed(iter/s)": 1.449817 }, { "epoch": 0.6976993273638662, "grad_norm": 4.4325432777404785, "learning_rate": 9.527207971826959e-05, "loss": 2.650682830810547, "memory(GiB)": 58.31, "step": 16285, "token_acc": 0.4522058823529412, "train_speed(iter/s)": 1.449787 }, { "epoch": 0.6979135426931151, "grad_norm": 4.810183048248291, "learning_rate": 9.52692227155874e-05, "loss": 2.6694765090942383, "memory(GiB)": 58.31, "step": 16290, "token_acc": 0.43653250773993807, "train_speed(iter/s)": 1.449844 }, { "epoch": 0.698127758022364, "grad_norm": 4.229180812835693, "learning_rate": 9.526636489281078e-05, "loss": 2.7364078521728517, "memory(GiB)": 58.31, "step": 16295, "token_acc": 0.3884057971014493, "train_speed(iter/s)": 1.449878 }, { "epoch": 0.6983419733516131, "grad_norm": 4.053791522979736, "learning_rate": 9.526350624999153e-05, "loss": 2.5006582260131838, "memory(GiB)": 58.31, "step": 16300, "token_acc": 0.527027027027027, "train_speed(iter/s)": 1.449816 }, { "epoch": 0.698556188680862, "grad_norm": 6.355086326599121, "learning_rate": 9.526064678718137e-05, "loss": 2.558262252807617, "memory(GiB)": 58.31, "step": 16305, "token_acc": 0.47580645161290325, "train_speed(iter/s)": 1.449899 }, { "epoch": 0.6987704040101109, "grad_norm": 4.354630947113037, "learning_rate": 9.525778650443214e-05, "loss": 2.4823657989501955, "memory(GiB)": 58.31, "step": 16310, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.449947 }, { "epoch": 0.69898461933936, "grad_norm": 5.398983478546143, "learning_rate": 9.525492540179563e-05, "loss": 2.803253936767578, "memory(GiB)": 58.31, "step": 16315, "token_acc": 0.4020979020979021, "train_speed(iter/s)": 1.449923 }, { "epoch": 0.6991988346686089, "grad_norm": 4.110025405883789, "learning_rate": 9.525206347932373e-05, "loss": 2.8333145141601563, "memory(GiB)": 58.31, "step": 16320, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.449874 }, { "epoch": 0.6994130499978578, "grad_norm": 3.702155828475952, "learning_rate": 9.524920073706824e-05, "loss": 2.674330711364746, "memory(GiB)": 58.31, "step": 16325, "token_acc": 0.44171779141104295, "train_speed(iter/s)": 1.449955 }, { "epoch": 0.6996272653271068, "grad_norm": 5.382178783416748, "learning_rate": 9.524633717508103e-05, "loss": 2.4870960235595705, "memory(GiB)": 58.31, "step": 16330, "token_acc": 0.48328267477203646, "train_speed(iter/s)": 1.449953 }, { "epoch": 0.6998414806563558, "grad_norm": 3.2930479049682617, "learning_rate": 9.524347279341397e-05, "loss": 2.3145076751708986, "memory(GiB)": 58.31, "step": 16335, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.449905 }, { "epoch": 0.7000556959856047, "grad_norm": 3.5885164737701416, "learning_rate": 9.524060759211897e-05, "loss": 2.5117565155029298, "memory(GiB)": 58.31, "step": 16340, "token_acc": 0.48739495798319327, "train_speed(iter/s)": 1.449904 }, { "epoch": 0.7002699113148537, "grad_norm": 5.129047393798828, "learning_rate": 9.523774157124791e-05, "loss": 2.7829776763916017, "memory(GiB)": 58.31, "step": 16345, "token_acc": 0.4142857142857143, "train_speed(iter/s)": 1.449922 }, { "epoch": 0.7004841266441026, "grad_norm": 3.988901376724243, "learning_rate": 9.523487473085274e-05, "loss": 2.69129638671875, "memory(GiB)": 58.31, "step": 16350, "token_acc": 0.4462025316455696, "train_speed(iter/s)": 1.450021 }, { "epoch": 0.7006983419733516, "grad_norm": 4.978018760681152, "learning_rate": 9.523200707098537e-05, "loss": 2.768937683105469, "memory(GiB)": 58.31, "step": 16355, "token_acc": 0.4961832061068702, "train_speed(iter/s)": 1.449972 }, { "epoch": 0.7009125573026006, "grad_norm": 3.4455044269561768, "learning_rate": 9.522913859169778e-05, "loss": 2.683019828796387, "memory(GiB)": 58.31, "step": 16360, "token_acc": 0.45535714285714285, "train_speed(iter/s)": 1.449999 }, { "epoch": 0.7011267726318495, "grad_norm": 5.0793776512146, "learning_rate": 9.52262692930419e-05, "loss": 2.762808418273926, "memory(GiB)": 58.31, "step": 16365, "token_acc": 0.46441947565543074, "train_speed(iter/s)": 1.450002 }, { "epoch": 0.7013409879610984, "grad_norm": 5.498508930206299, "learning_rate": 9.522339917506973e-05, "loss": 2.7171464920043946, "memory(GiB)": 58.31, "step": 16370, "token_acc": 0.4653846153846154, "train_speed(iter/s)": 1.450021 }, { "epoch": 0.7015552032903475, "grad_norm": 5.40051794052124, "learning_rate": 9.522052823783325e-05, "loss": 2.6595659255981445, "memory(GiB)": 58.31, "step": 16375, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.450067 }, { "epoch": 0.7017694186195964, "grad_norm": 4.447750568389893, "learning_rate": 9.521765648138449e-05, "loss": 2.330704689025879, "memory(GiB)": 58.31, "step": 16380, "token_acc": 0.48, "train_speed(iter/s)": 1.450206 }, { "epoch": 0.7019836339488453, "grad_norm": 3.448335886001587, "learning_rate": 9.521478390577546e-05, "loss": 2.478415298461914, "memory(GiB)": 58.31, "step": 16385, "token_acc": 0.4696969696969697, "train_speed(iter/s)": 1.450255 }, { "epoch": 0.7021978492780944, "grad_norm": 4.745573043823242, "learning_rate": 9.521191051105823e-05, "loss": 2.713380241394043, "memory(GiB)": 58.31, "step": 16390, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.450227 }, { "epoch": 0.7024120646073433, "grad_norm": 5.548971652984619, "learning_rate": 9.520903629728479e-05, "loss": 2.829987907409668, "memory(GiB)": 58.31, "step": 16395, "token_acc": 0.42028985507246375, "train_speed(iter/s)": 1.450252 }, { "epoch": 0.7026262799365922, "grad_norm": 6.130307674407959, "learning_rate": 9.520616126450726e-05, "loss": 2.919237518310547, "memory(GiB)": 58.31, "step": 16400, "token_acc": 0.42379182156133827, "train_speed(iter/s)": 1.450224 }, { "epoch": 0.7028404952658412, "grad_norm": 3.8262250423431396, "learning_rate": 9.520328541277772e-05, "loss": 2.577581214904785, "memory(GiB)": 58.31, "step": 16405, "token_acc": 0.46037735849056605, "train_speed(iter/s)": 1.450253 }, { "epoch": 0.7030547105950902, "grad_norm": 4.228984832763672, "learning_rate": 9.520040874214827e-05, "loss": 2.5994098663330076, "memory(GiB)": 66.02, "step": 16410, "token_acc": 0.4126984126984127, "train_speed(iter/s)": 1.450028 }, { "epoch": 0.7032689259243391, "grad_norm": 5.115041732788086, "learning_rate": 9.519753125267101e-05, "loss": 2.663760185241699, "memory(GiB)": 66.02, "step": 16415, "token_acc": 0.45084745762711864, "train_speed(iter/s)": 1.449944 }, { "epoch": 0.7034831412535881, "grad_norm": 4.100739479064941, "learning_rate": 9.519465294439805e-05, "loss": 2.630366325378418, "memory(GiB)": 66.02, "step": 16420, "token_acc": 0.4627831715210356, "train_speed(iter/s)": 1.449906 }, { "epoch": 0.703697356582837, "grad_norm": 4.6938300132751465, "learning_rate": 9.519177381738158e-05, "loss": 2.6072622299194337, "memory(GiB)": 66.02, "step": 16425, "token_acc": 0.48770491803278687, "train_speed(iter/s)": 1.449985 }, { "epoch": 0.703911571912086, "grad_norm": 3.35369873046875, "learning_rate": 9.518889387167373e-05, "loss": 2.489060974121094, "memory(GiB)": 66.02, "step": 16430, "token_acc": 0.48398576512455516, "train_speed(iter/s)": 1.45005 }, { "epoch": 0.704125787241335, "grad_norm": 4.35598087310791, "learning_rate": 9.518601310732667e-05, "loss": 2.4032833099365236, "memory(GiB)": 66.02, "step": 16435, "token_acc": 0.49096385542168675, "train_speed(iter/s)": 1.450024 }, { "epoch": 0.7043400025705839, "grad_norm": 4.064105987548828, "learning_rate": 9.518313152439259e-05, "loss": 2.6915843963623045, "memory(GiB)": 66.02, "step": 16440, "token_acc": 0.43597560975609756, "train_speed(iter/s)": 1.450105 }, { "epoch": 0.7045542178998329, "grad_norm": 4.789132595062256, "learning_rate": 9.51802491229237e-05, "loss": 2.5562313079833983, "memory(GiB)": 66.02, "step": 16445, "token_acc": 0.4228187919463087, "train_speed(iter/s)": 1.450243 }, { "epoch": 0.7047684332290819, "grad_norm": 3.658273935317993, "learning_rate": 9.517736590297223e-05, "loss": 2.533077430725098, "memory(GiB)": 66.02, "step": 16450, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.45024 }, { "epoch": 0.7049826485583308, "grad_norm": 3.736837148666382, "learning_rate": 9.517448186459038e-05, "loss": 2.8293155670166015, "memory(GiB)": 66.02, "step": 16455, "token_acc": 0.40502793296089384, "train_speed(iter/s)": 1.450317 }, { "epoch": 0.7051968638875799, "grad_norm": 5.096014976501465, "learning_rate": 9.517159700783042e-05, "loss": 2.6939470291137697, "memory(GiB)": 66.02, "step": 16460, "token_acc": 0.4471830985915493, "train_speed(iter/s)": 1.450365 }, { "epoch": 0.7054110792168288, "grad_norm": 3.996666431427002, "learning_rate": 9.516871133274461e-05, "loss": 2.4760942459106445, "memory(GiB)": 66.02, "step": 16465, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.450455 }, { "epoch": 0.7056252945460777, "grad_norm": 4.290310382843018, "learning_rate": 9.51658248393852e-05, "loss": 2.5249183654785154, "memory(GiB)": 66.02, "step": 16470, "token_acc": 0.44483985765124556, "train_speed(iter/s)": 1.450324 }, { "epoch": 0.7058395098753267, "grad_norm": 3.6787147521972656, "learning_rate": 9.516293752780454e-05, "loss": 2.4754077911376955, "memory(GiB)": 66.02, "step": 16475, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.450288 }, { "epoch": 0.7060537252045757, "grad_norm": 6.543824195861816, "learning_rate": 9.516004939805488e-05, "loss": 2.9514930725097654, "memory(GiB)": 66.02, "step": 16480, "token_acc": 0.44, "train_speed(iter/s)": 1.450242 }, { "epoch": 0.7062679405338246, "grad_norm": 4.192415237426758, "learning_rate": 9.515716045018856e-05, "loss": 2.3985437393188476, "memory(GiB)": 66.02, "step": 16485, "token_acc": 0.4765957446808511, "train_speed(iter/s)": 1.450307 }, { "epoch": 0.7064821558630736, "grad_norm": 3.979339122772217, "learning_rate": 9.515427068425793e-05, "loss": 2.457245635986328, "memory(GiB)": 66.02, "step": 16490, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.450247 }, { "epoch": 0.7066963711923225, "grad_norm": 3.7735750675201416, "learning_rate": 9.515138010031532e-05, "loss": 2.5792287826538085, "memory(GiB)": 66.02, "step": 16495, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.450258 }, { "epoch": 0.7069105865215715, "grad_norm": 4.328885555267334, "learning_rate": 9.51484886984131e-05, "loss": 2.369606590270996, "memory(GiB)": 66.02, "step": 16500, "token_acc": 0.47388059701492535, "train_speed(iter/s)": 1.450326 }, { "epoch": 0.7069105865215715, "eval_loss": 2.395688533782959, "eval_runtime": 14.091, "eval_samples_per_second": 7.097, "eval_steps_per_second": 7.097, "eval_token_acc": 0.46732429099876693, "step": 16500 }, { "epoch": 0.7071248018508205, "grad_norm": 4.743238925933838, "learning_rate": 9.514559647860366e-05, "loss": 2.4409147262573243, "memory(GiB)": 66.02, "step": 16505, "token_acc": 0.47569113441372735, "train_speed(iter/s)": 1.448363 }, { "epoch": 0.7073390171800694, "grad_norm": 3.831484079360962, "learning_rate": 9.514270344093939e-05, "loss": 2.3554454803466798, "memory(GiB)": 66.02, "step": 16510, "token_acc": 0.5, "train_speed(iter/s)": 1.448306 }, { "epoch": 0.7075532325093183, "grad_norm": 3.9123921394348145, "learning_rate": 9.513980958547269e-05, "loss": 2.552790069580078, "memory(GiB)": 66.02, "step": 16515, "token_acc": 0.4637223974763407, "train_speed(iter/s)": 1.448271 }, { "epoch": 0.7077674478385674, "grad_norm": 4.251744747161865, "learning_rate": 9.5136914912256e-05, "loss": 2.697835922241211, "memory(GiB)": 66.02, "step": 16520, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.448233 }, { "epoch": 0.7079816631678163, "grad_norm": 6.784512042999268, "learning_rate": 9.513401942134177e-05, "loss": 2.770741844177246, "memory(GiB)": 66.02, "step": 16525, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.448175 }, { "epoch": 0.7081958784970652, "grad_norm": 6.408402442932129, "learning_rate": 9.513112311278243e-05, "loss": 2.4230630874633787, "memory(GiB)": 66.02, "step": 16530, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.44824 }, { "epoch": 0.7084100938263143, "grad_norm": 4.877574443817139, "learning_rate": 9.512822598663045e-05, "loss": 2.4652992248535157, "memory(GiB)": 66.02, "step": 16535, "token_acc": 0.45514950166112955, "train_speed(iter/s)": 1.448293 }, { "epoch": 0.7086243091555632, "grad_norm": 7.459070682525635, "learning_rate": 9.512532804293832e-05, "loss": 2.2614675521850587, "memory(GiB)": 66.02, "step": 16540, "token_acc": 0.5, "train_speed(iter/s)": 1.448332 }, { "epoch": 0.7088385244848121, "grad_norm": 4.735755920410156, "learning_rate": 9.512242928175857e-05, "loss": 2.4417387008666993, "memory(GiB)": 66.02, "step": 16545, "token_acc": 0.44981412639405205, "train_speed(iter/s)": 1.448261 }, { "epoch": 0.7090527398140611, "grad_norm": 4.738497734069824, "learning_rate": 9.511952970314365e-05, "loss": 2.6786081314086916, "memory(GiB)": 66.02, "step": 16550, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.448292 }, { "epoch": 0.7092669551433101, "grad_norm": 3.969362735748291, "learning_rate": 9.511662930714614e-05, "loss": 2.6648998260498047, "memory(GiB)": 66.02, "step": 16555, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.448269 }, { "epoch": 0.709481170472559, "grad_norm": 3.7801785469055176, "learning_rate": 9.511372809381856e-05, "loss": 2.4989370346069335, "memory(GiB)": 66.02, "step": 16560, "token_acc": 0.4901315789473684, "train_speed(iter/s)": 1.448182 }, { "epoch": 0.709695385801808, "grad_norm": 5.0340423583984375, "learning_rate": 9.511082606321348e-05, "loss": 2.795084571838379, "memory(GiB)": 66.02, "step": 16565, "token_acc": 0.46325878594249204, "train_speed(iter/s)": 1.448032 }, { "epoch": 0.7099096011310569, "grad_norm": 5.707828521728516, "learning_rate": 9.510792321538348e-05, "loss": 2.8757003784179687, "memory(GiB)": 66.02, "step": 16570, "token_acc": 0.411371237458194, "train_speed(iter/s)": 1.44814 }, { "epoch": 0.7101238164603059, "grad_norm": 4.380470275878906, "learning_rate": 9.510501955038112e-05, "loss": 2.385051727294922, "memory(GiB)": 66.02, "step": 16575, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.448197 }, { "epoch": 0.7103380317895549, "grad_norm": 4.149919509887695, "learning_rate": 9.510211506825904e-05, "loss": 2.710375213623047, "memory(GiB)": 66.02, "step": 16580, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.448253 }, { "epoch": 0.7105522471188038, "grad_norm": 3.3036952018737793, "learning_rate": 9.509920976906981e-05, "loss": 2.7132644653320312, "memory(GiB)": 66.02, "step": 16585, "token_acc": 0.43440233236151604, "train_speed(iter/s)": 1.448354 }, { "epoch": 0.7107664624480527, "grad_norm": 3.1686246395111084, "learning_rate": 9.50963036528661e-05, "loss": 2.793349266052246, "memory(GiB)": 66.02, "step": 16590, "token_acc": 0.3955431754874652, "train_speed(iter/s)": 1.448285 }, { "epoch": 0.7109806777773018, "grad_norm": 3.1137187480926514, "learning_rate": 9.509339671970054e-05, "loss": 2.4462833404541016, "memory(GiB)": 66.02, "step": 16595, "token_acc": 0.5120481927710844, "train_speed(iter/s)": 1.448261 }, { "epoch": 0.7111948931065507, "grad_norm": 4.400082111358643, "learning_rate": 9.50904889696258e-05, "loss": 2.8768001556396485, "memory(GiB)": 66.02, "step": 16600, "token_acc": 0.4296577946768061, "train_speed(iter/s)": 1.448281 }, { "epoch": 0.7114091084357996, "grad_norm": 3.2652695178985596, "learning_rate": 9.508758040269457e-05, "loss": 2.4992000579833986, "memory(GiB)": 66.02, "step": 16605, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.44834 }, { "epoch": 0.7116233237650487, "grad_norm": 5.064366340637207, "learning_rate": 9.50846710189595e-05, "loss": 2.4636676788330076, "memory(GiB)": 66.02, "step": 16610, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.448332 }, { "epoch": 0.7118375390942976, "grad_norm": 3.61647891998291, "learning_rate": 9.508176081847333e-05, "loss": 2.437930679321289, "memory(GiB)": 66.02, "step": 16615, "token_acc": 0.47305389221556887, "train_speed(iter/s)": 1.448347 }, { "epoch": 0.7120517544235465, "grad_norm": 4.5554280281066895, "learning_rate": 9.507884980128879e-05, "loss": 2.6646337509155273, "memory(GiB)": 66.02, "step": 16620, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.448246 }, { "epoch": 0.7122659697527955, "grad_norm": 4.849153518676758, "learning_rate": 9.507593796745858e-05, "loss": 2.6109994888305663, "memory(GiB)": 66.02, "step": 16625, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.448316 }, { "epoch": 0.7124801850820445, "grad_norm": 4.082527160644531, "learning_rate": 9.507302531703549e-05, "loss": 2.654677391052246, "memory(GiB)": 66.02, "step": 16630, "token_acc": 0.4781021897810219, "train_speed(iter/s)": 1.448201 }, { "epoch": 0.7126944004112934, "grad_norm": 6.488663196563721, "learning_rate": 9.507011185007224e-05, "loss": 2.562953567504883, "memory(GiB)": 66.02, "step": 16635, "token_acc": 0.458041958041958, "train_speed(iter/s)": 1.448269 }, { "epoch": 0.7129086157405424, "grad_norm": 3.6953341960906982, "learning_rate": 9.506719756662163e-05, "loss": 2.6324296951293946, "memory(GiB)": 66.02, "step": 16640, "token_acc": 0.509090909090909, "train_speed(iter/s)": 1.448295 }, { "epoch": 0.7131228310697914, "grad_norm": 4.222318172454834, "learning_rate": 9.50642824667365e-05, "loss": 2.5764081954956053, "memory(GiB)": 66.02, "step": 16645, "token_acc": 0.41389728096676737, "train_speed(iter/s)": 1.448312 }, { "epoch": 0.7133370463990403, "grad_norm": 5.272609233856201, "learning_rate": 9.506136655046957e-05, "loss": 2.3582765579223635, "memory(GiB)": 66.02, "step": 16650, "token_acc": 0.5316455696202531, "train_speed(iter/s)": 1.448299 }, { "epoch": 0.7135512617282893, "grad_norm": 5.22477912902832, "learning_rate": 9.505844981787374e-05, "loss": 2.6418216705322264, "memory(GiB)": 66.02, "step": 16655, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.448346 }, { "epoch": 0.7137654770575382, "grad_norm": 3.1178903579711914, "learning_rate": 9.505553226900181e-05, "loss": 2.271025848388672, "memory(GiB)": 66.02, "step": 16660, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.448402 }, { "epoch": 0.7139796923867872, "grad_norm": 4.743771076202393, "learning_rate": 9.505261390390667e-05, "loss": 2.644114685058594, "memory(GiB)": 66.02, "step": 16665, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.448367 }, { "epoch": 0.7141939077160362, "grad_norm": 3.894268274307251, "learning_rate": 9.504969472264114e-05, "loss": 2.396790313720703, "memory(GiB)": 66.02, "step": 16670, "token_acc": 0.48923076923076925, "train_speed(iter/s)": 1.448485 }, { "epoch": 0.7144081230452851, "grad_norm": 3.8684892654418945, "learning_rate": 9.504677472525816e-05, "loss": 2.4858880996704102, "memory(GiB)": 66.02, "step": 16675, "token_acc": 0.4696969696969697, "train_speed(iter/s)": 1.448527 }, { "epoch": 0.714622338374534, "grad_norm": 4.849381446838379, "learning_rate": 9.504385391181058e-05, "loss": 2.515666198730469, "memory(GiB)": 66.02, "step": 16680, "token_acc": 0.48221343873517786, "train_speed(iter/s)": 1.4485 }, { "epoch": 0.7148365537037831, "grad_norm": 4.635295391082764, "learning_rate": 9.504093228235132e-05, "loss": 2.6861574172973635, "memory(GiB)": 66.02, "step": 16685, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.44858 }, { "epoch": 0.715050769033032, "grad_norm": 4.157852649688721, "learning_rate": 9.503800983693334e-05, "loss": 2.3195526123046877, "memory(GiB)": 66.02, "step": 16690, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.448612 }, { "epoch": 0.7152649843622809, "grad_norm": 4.365450382232666, "learning_rate": 9.503508657560956e-05, "loss": 2.799073600769043, "memory(GiB)": 66.02, "step": 16695, "token_acc": 0.4124629080118694, "train_speed(iter/s)": 1.44873 }, { "epoch": 0.71547919969153, "grad_norm": 4.813438415527344, "learning_rate": 9.503216249843294e-05, "loss": 2.6884090423583986, "memory(GiB)": 66.02, "step": 16700, "token_acc": 0.4265927977839335, "train_speed(iter/s)": 1.448762 }, { "epoch": 0.7156934150207789, "grad_norm": 4.648043155670166, "learning_rate": 9.502923760545644e-05, "loss": 2.622831344604492, "memory(GiB)": 66.02, "step": 16705, "token_acc": 0.4862068965517241, "train_speed(iter/s)": 1.448763 }, { "epoch": 0.7159076303500278, "grad_norm": 4.139097690582275, "learning_rate": 9.502631189673307e-05, "loss": 2.3960453033447267, "memory(GiB)": 66.02, "step": 16710, "token_acc": 0.5043859649122807, "train_speed(iter/s)": 1.448877 }, { "epoch": 0.7161218456792768, "grad_norm": 3.825942277908325, "learning_rate": 9.502338537231583e-05, "loss": 2.6495628356933594, "memory(GiB)": 66.02, "step": 16715, "token_acc": 0.44868035190615835, "train_speed(iter/s)": 1.448897 }, { "epoch": 0.7163360610085258, "grad_norm": 4.570528030395508, "learning_rate": 9.502045803225772e-05, "loss": 2.729033660888672, "memory(GiB)": 66.02, "step": 16720, "token_acc": 0.4196078431372549, "train_speed(iter/s)": 1.448988 }, { "epoch": 0.7165502763377747, "grad_norm": 4.131954669952393, "learning_rate": 9.501752987661177e-05, "loss": 2.60513973236084, "memory(GiB)": 66.02, "step": 16725, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.449049 }, { "epoch": 0.7167644916670237, "grad_norm": 3.556879758834839, "learning_rate": 9.501460090543105e-05, "loss": 2.2864448547363283, "memory(GiB)": 66.02, "step": 16730, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.44903 }, { "epoch": 0.7169787069962726, "grad_norm": 6.622162818908691, "learning_rate": 9.50116711187686e-05, "loss": 2.583823394775391, "memory(GiB)": 66.02, "step": 16735, "token_acc": 0.432, "train_speed(iter/s)": 1.449011 }, { "epoch": 0.7171929223255216, "grad_norm": 4.226747989654541, "learning_rate": 9.500874051667751e-05, "loss": 2.53782958984375, "memory(GiB)": 66.02, "step": 16740, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.449067 }, { "epoch": 0.7174071376547706, "grad_norm": 3.340885877609253, "learning_rate": 9.500580909921086e-05, "loss": 2.438665580749512, "memory(GiB)": 66.02, "step": 16745, "token_acc": 0.44594594594594594, "train_speed(iter/s)": 1.449148 }, { "epoch": 0.7176213529840195, "grad_norm": 4.5980424880981445, "learning_rate": 9.500287686642177e-05, "loss": 2.5203458786010744, "memory(GiB)": 66.02, "step": 16750, "token_acc": 0.45517241379310347, "train_speed(iter/s)": 1.449138 }, { "epoch": 0.7178355683132684, "grad_norm": 3.902045726776123, "learning_rate": 9.499994381836334e-05, "loss": 2.7687450408935548, "memory(GiB)": 66.02, "step": 16755, "token_acc": 0.48746518105849584, "train_speed(iter/s)": 1.449159 }, { "epoch": 0.7180497836425175, "grad_norm": 3.6287145614624023, "learning_rate": 9.499700995508871e-05, "loss": 2.5825531005859377, "memory(GiB)": 66.02, "step": 16760, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.449072 }, { "epoch": 0.7182639989717664, "grad_norm": 4.56061315536499, "learning_rate": 9.499407527665103e-05, "loss": 2.476934242248535, "memory(GiB)": 66.02, "step": 16765, "token_acc": 0.4881889763779528, "train_speed(iter/s)": 1.448946 }, { "epoch": 0.7184782143010153, "grad_norm": 3.9306862354278564, "learning_rate": 9.499113978310348e-05, "loss": 2.6438152313232424, "memory(GiB)": 66.02, "step": 16770, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.448964 }, { "epoch": 0.7186924296302644, "grad_norm": 4.159835338592529, "learning_rate": 9.498820347449923e-05, "loss": 2.686585807800293, "memory(GiB)": 66.02, "step": 16775, "token_acc": 0.44528301886792454, "train_speed(iter/s)": 1.448902 }, { "epoch": 0.7189066449595133, "grad_norm": 4.010687351226807, "learning_rate": 9.498526635089147e-05, "loss": 2.5310346603393556, "memory(GiB)": 66.02, "step": 16780, "token_acc": 0.4402730375426621, "train_speed(iter/s)": 1.44893 }, { "epoch": 0.7191208602887622, "grad_norm": 5.286407470703125, "learning_rate": 9.498232841233341e-05, "loss": 2.6868690490722655, "memory(GiB)": 66.02, "step": 16785, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.449007 }, { "epoch": 0.7193350756180112, "grad_norm": 5.136654376983643, "learning_rate": 9.497938965887827e-05, "loss": 2.529734230041504, "memory(GiB)": 66.02, "step": 16790, "token_acc": 0.4669421487603306, "train_speed(iter/s)": 1.448982 }, { "epoch": 0.7195492909472602, "grad_norm": 3.7295262813568115, "learning_rate": 9.497645009057929e-05, "loss": 2.6471458435058595, "memory(GiB)": 66.02, "step": 16795, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.449068 }, { "epoch": 0.7197635062765092, "grad_norm": 3.391190767288208, "learning_rate": 9.497350970748973e-05, "loss": 2.6329538345336916, "memory(GiB)": 66.02, "step": 16800, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.449178 }, { "epoch": 0.7199777216057581, "grad_norm": 4.856434345245361, "learning_rate": 9.497056850966286e-05, "loss": 2.485714149475098, "memory(GiB)": 66.02, "step": 16805, "token_acc": 0.4978723404255319, "train_speed(iter/s)": 1.449291 }, { "epoch": 0.720191936935007, "grad_norm": 4.344027519226074, "learning_rate": 9.496762649715194e-05, "loss": 2.601712226867676, "memory(GiB)": 66.02, "step": 16810, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.449369 }, { "epoch": 0.7204061522642561, "grad_norm": 5.066960334777832, "learning_rate": 9.496468367001027e-05, "loss": 2.5768354415893553, "memory(GiB)": 66.02, "step": 16815, "token_acc": 0.44, "train_speed(iter/s)": 1.449416 }, { "epoch": 0.720620367593505, "grad_norm": 5.329759120941162, "learning_rate": 9.496174002829121e-05, "loss": 2.587724304199219, "memory(GiB)": 66.02, "step": 16820, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.449337 }, { "epoch": 0.7208345829227539, "grad_norm": 4.92232084274292, "learning_rate": 9.495879557204803e-05, "loss": 2.499320220947266, "memory(GiB)": 66.02, "step": 16825, "token_acc": 0.45849802371541504, "train_speed(iter/s)": 1.449149 }, { "epoch": 0.721048798252003, "grad_norm": 4.489537239074707, "learning_rate": 9.49558503013341e-05, "loss": 2.6504798889160157, "memory(GiB)": 66.02, "step": 16830, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.449243 }, { "epoch": 0.7212630135812519, "grad_norm": 4.168896675109863, "learning_rate": 9.495290421620278e-05, "loss": 2.498708724975586, "memory(GiB)": 66.02, "step": 16835, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.449283 }, { "epoch": 0.7214772289105008, "grad_norm": 3.9724533557891846, "learning_rate": 9.494995731670742e-05, "loss": 2.7124778747558596, "memory(GiB)": 66.02, "step": 16840, "token_acc": 0.445859872611465, "train_speed(iter/s)": 1.449288 }, { "epoch": 0.7216914442397498, "grad_norm": 5.2523651123046875, "learning_rate": 9.494700960290141e-05, "loss": 2.6246126174926756, "memory(GiB)": 66.02, "step": 16845, "token_acc": 0.45934959349593496, "train_speed(iter/s)": 1.449383 }, { "epoch": 0.7219056595689988, "grad_norm": 5.6819987297058105, "learning_rate": 9.494406107483817e-05, "loss": 2.6334814071655273, "memory(GiB)": 66.02, "step": 16850, "token_acc": 0.4360655737704918, "train_speed(iter/s)": 1.449414 }, { "epoch": 0.7221198748982477, "grad_norm": 5.335559844970703, "learning_rate": 9.49411117325711e-05, "loss": 2.5696552276611326, "memory(GiB)": 66.02, "step": 16855, "token_acc": 0.47520661157024796, "train_speed(iter/s)": 1.449446 }, { "epoch": 0.7223340902274967, "grad_norm": 6.168452739715576, "learning_rate": 9.493816157615363e-05, "loss": 2.649845314025879, "memory(GiB)": 66.02, "step": 16860, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.449463 }, { "epoch": 0.7225483055567457, "grad_norm": 4.4678239822387695, "learning_rate": 9.493521060563921e-05, "loss": 2.5617391586303713, "memory(GiB)": 66.02, "step": 16865, "token_acc": 0.45016077170418006, "train_speed(iter/s)": 1.449549 }, { "epoch": 0.7227625208859946, "grad_norm": 4.201544284820557, "learning_rate": 9.49322588210813e-05, "loss": 3.019285774230957, "memory(GiB)": 66.02, "step": 16870, "token_acc": 0.40524781341107874, "train_speed(iter/s)": 1.449525 }, { "epoch": 0.7229767362152436, "grad_norm": 3.6340491771698, "learning_rate": 9.492930622253336e-05, "loss": 2.402752685546875, "memory(GiB)": 66.02, "step": 16875, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.449615 }, { "epoch": 0.7231909515444925, "grad_norm": 4.638591766357422, "learning_rate": 9.49263528100489e-05, "loss": 2.734838676452637, "memory(GiB)": 66.02, "step": 16880, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.449627 }, { "epoch": 0.7234051668737415, "grad_norm": 7.417115211486816, "learning_rate": 9.492339858368141e-05, "loss": 3.159964370727539, "memory(GiB)": 66.02, "step": 16885, "token_acc": 0.3783783783783784, "train_speed(iter/s)": 1.44968 }, { "epoch": 0.7236193822029905, "grad_norm": 5.388756275177002, "learning_rate": 9.492044354348442e-05, "loss": 2.4829620361328124, "memory(GiB)": 66.02, "step": 16890, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.449608 }, { "epoch": 0.7238335975322394, "grad_norm": 4.924877166748047, "learning_rate": 9.491748768951145e-05, "loss": 2.8124759674072264, "memory(GiB)": 66.02, "step": 16895, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.449694 }, { "epoch": 0.7240478128614883, "grad_norm": 5.154012680053711, "learning_rate": 9.491453102181606e-05, "loss": 2.5469661712646485, "memory(GiB)": 66.02, "step": 16900, "token_acc": 0.4911660777385159, "train_speed(iter/s)": 1.4498 }, { "epoch": 0.7242620281907374, "grad_norm": 4.461989879608154, "learning_rate": 9.49115735404518e-05, "loss": 2.574415588378906, "memory(GiB)": 66.02, "step": 16905, "token_acc": 0.44410876132930516, "train_speed(iter/s)": 1.449766 }, { "epoch": 0.7244762435199863, "grad_norm": 4.099794387817383, "learning_rate": 9.490861524547225e-05, "loss": 2.5401105880737305, "memory(GiB)": 66.02, "step": 16910, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.449788 }, { "epoch": 0.7246904588492352, "grad_norm": 4.441971302032471, "learning_rate": 9.490565613693102e-05, "loss": 2.5891605377197267, "memory(GiB)": 66.02, "step": 16915, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.449874 }, { "epoch": 0.7249046741784843, "grad_norm": 4.345794677734375, "learning_rate": 9.49026962148817e-05, "loss": 2.6970636367797853, "memory(GiB)": 66.02, "step": 16920, "token_acc": 0.44505494505494503, "train_speed(iter/s)": 1.449987 }, { "epoch": 0.7251188895077332, "grad_norm": 4.45413875579834, "learning_rate": 9.489973547937792e-05, "loss": 2.512656402587891, "memory(GiB)": 66.02, "step": 16925, "token_acc": 0.48598130841121495, "train_speed(iter/s)": 1.450104 }, { "epoch": 0.7253331048369821, "grad_norm": 4.909793853759766, "learning_rate": 9.48967739304733e-05, "loss": 2.256917190551758, "memory(GiB)": 66.02, "step": 16930, "token_acc": 0.47653429602888087, "train_speed(iter/s)": 1.450177 }, { "epoch": 0.7255473201662311, "grad_norm": 3.5779850482940674, "learning_rate": 9.489381156822152e-05, "loss": 2.7278827667236327, "memory(GiB)": 66.02, "step": 16935, "token_acc": 0.4740740740740741, "train_speed(iter/s)": 1.450192 }, { "epoch": 0.7257615354954801, "grad_norm": 5.764263153076172, "learning_rate": 9.489084839267621e-05, "loss": 2.571573257446289, "memory(GiB)": 66.02, "step": 16940, "token_acc": 0.48031496062992124, "train_speed(iter/s)": 1.450253 }, { "epoch": 0.725975750824729, "grad_norm": 3.739387273788452, "learning_rate": 9.488788440389109e-05, "loss": 2.6918308258056642, "memory(GiB)": 66.02, "step": 16945, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.45033 }, { "epoch": 0.726189966153978, "grad_norm": 3.6499791145324707, "learning_rate": 9.488491960191984e-05, "loss": 2.6464052200317383, "memory(GiB)": 66.02, "step": 16950, "token_acc": 0.4386503067484663, "train_speed(iter/s)": 1.45033 }, { "epoch": 0.7264041814832269, "grad_norm": 5.935890197753906, "learning_rate": 9.488195398681614e-05, "loss": 2.749914360046387, "memory(GiB)": 66.02, "step": 16955, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.450322 }, { "epoch": 0.7266183968124759, "grad_norm": 3.368055582046509, "learning_rate": 9.487898755863376e-05, "loss": 2.6027194976806642, "memory(GiB)": 66.02, "step": 16960, "token_acc": 0.4602076124567474, "train_speed(iter/s)": 1.450311 }, { "epoch": 0.7268326121417249, "grad_norm": 5.101498603820801, "learning_rate": 9.487602031742642e-05, "loss": 2.441142272949219, "memory(GiB)": 66.02, "step": 16965, "token_acc": 0.5049833887043189, "train_speed(iter/s)": 1.450289 }, { "epoch": 0.7270468274709738, "grad_norm": 4.607686996459961, "learning_rate": 9.487305226324786e-05, "loss": 2.597370147705078, "memory(GiB)": 66.02, "step": 16970, "token_acc": 0.45018450184501846, "train_speed(iter/s)": 1.450263 }, { "epoch": 0.7272610428002227, "grad_norm": 3.64021635055542, "learning_rate": 9.487008339615187e-05, "loss": 2.4405391693115233, "memory(GiB)": 66.02, "step": 16975, "token_acc": 0.53125, "train_speed(iter/s)": 1.45027 }, { "epoch": 0.7274752581294718, "grad_norm": 4.465456962585449, "learning_rate": 9.486711371619224e-05, "loss": 2.839253234863281, "memory(GiB)": 66.02, "step": 16980, "token_acc": 0.4214046822742475, "train_speed(iter/s)": 1.450409 }, { "epoch": 0.7276894734587207, "grad_norm": 3.530322790145874, "learning_rate": 9.486414322342274e-05, "loss": 2.679307556152344, "memory(GiB)": 66.02, "step": 16985, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.450452 }, { "epoch": 0.7279036887879696, "grad_norm": 10.375099182128906, "learning_rate": 9.486117191789721e-05, "loss": 2.8104286193847656, "memory(GiB)": 66.02, "step": 16990, "token_acc": 0.4384057971014493, "train_speed(iter/s)": 1.450459 }, { "epoch": 0.7281179041172187, "grad_norm": 3.8432939052581787, "learning_rate": 9.485819979966946e-05, "loss": 2.687255859375, "memory(GiB)": 66.02, "step": 16995, "token_acc": 0.43103448275862066, "train_speed(iter/s)": 1.450417 }, { "epoch": 0.7283321194464676, "grad_norm": 5.823300361633301, "learning_rate": 9.485522686879336e-05, "loss": 2.5549243927001952, "memory(GiB)": 66.02, "step": 17000, "token_acc": 0.47305389221556887, "train_speed(iter/s)": 1.450509 }, { "epoch": 0.7283321194464676, "eval_loss": 2.220410108566284, "eval_runtime": 13.2803, "eval_samples_per_second": 7.53, "eval_steps_per_second": 7.53, "eval_token_acc": 0.466484268125855, "step": 17000 }, { "epoch": 0.7285463347757165, "grad_norm": 3.686988115310669, "learning_rate": 9.485225312532274e-05, "loss": 2.4599887847900392, "memory(GiB)": 66.02, "step": 17005, "token_acc": 0.45675413022351796, "train_speed(iter/s)": 1.448857 }, { "epoch": 0.7287605501049655, "grad_norm": 4.44323205947876, "learning_rate": 9.484927856931146e-05, "loss": 2.3029518127441406, "memory(GiB)": 66.02, "step": 17010, "token_acc": 0.49767441860465117, "train_speed(iter/s)": 1.448929 }, { "epoch": 0.7289747654342145, "grad_norm": 6.396110534667969, "learning_rate": 9.484630320081343e-05, "loss": 2.4460918426513674, "memory(GiB)": 66.02, "step": 17015, "token_acc": 0.5052264808362369, "train_speed(iter/s)": 1.448915 }, { "epoch": 0.7291889807634634, "grad_norm": 3.8762753009796143, "learning_rate": 9.484332701988257e-05, "loss": 2.427947425842285, "memory(GiB)": 66.02, "step": 17020, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.448997 }, { "epoch": 0.7294031960927124, "grad_norm": 4.288548469543457, "learning_rate": 9.484035002657276e-05, "loss": 2.561084747314453, "memory(GiB)": 66.02, "step": 17025, "token_acc": 0.46706586826347307, "train_speed(iter/s)": 1.448909 }, { "epoch": 0.7296174114219613, "grad_norm": 4.329801559448242, "learning_rate": 9.483737222093794e-05, "loss": 2.4620681762695313, "memory(GiB)": 66.02, "step": 17030, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.448988 }, { "epoch": 0.7298316267512103, "grad_norm": 5.658819675445557, "learning_rate": 9.483439360303208e-05, "loss": 2.3278966903686524, "memory(GiB)": 66.02, "step": 17035, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.4491 }, { "epoch": 0.7300458420804593, "grad_norm": 5.306597709655762, "learning_rate": 9.48314141729091e-05, "loss": 2.765092468261719, "memory(GiB)": 66.02, "step": 17040, "token_acc": 0.45564516129032256, "train_speed(iter/s)": 1.449075 }, { "epoch": 0.7302600574097082, "grad_norm": 13.803184509277344, "learning_rate": 9.482843393062302e-05, "loss": 2.764059638977051, "memory(GiB)": 66.02, "step": 17045, "token_acc": 0.4228395061728395, "train_speed(iter/s)": 1.449114 }, { "epoch": 0.7304742727389572, "grad_norm": 3.409217357635498, "learning_rate": 9.482545287622779e-05, "loss": 2.5887920379638674, "memory(GiB)": 66.02, "step": 17050, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.449142 }, { "epoch": 0.7306884880682062, "grad_norm": 3.3520493507385254, "learning_rate": 9.482247100977744e-05, "loss": 2.419658088684082, "memory(GiB)": 66.02, "step": 17055, "token_acc": 0.4786885245901639, "train_speed(iter/s)": 1.449056 }, { "epoch": 0.7309027033974551, "grad_norm": 7.660039901733398, "learning_rate": 9.481948833132597e-05, "loss": 2.8795650482177733, "memory(GiB)": 66.02, "step": 17060, "token_acc": 0.41825095057034223, "train_speed(iter/s)": 1.449113 }, { "epoch": 0.731116918726704, "grad_norm": 3.8919930458068848, "learning_rate": 9.481650484092744e-05, "loss": 2.5224071502685548, "memory(GiB)": 66.02, "step": 17065, "token_acc": 0.5021645021645021, "train_speed(iter/s)": 1.449143 }, { "epoch": 0.7313311340559531, "grad_norm": 3.0317535400390625, "learning_rate": 9.481352053863585e-05, "loss": 2.791655921936035, "memory(GiB)": 66.02, "step": 17070, "token_acc": 0.4423076923076923, "train_speed(iter/s)": 1.449188 }, { "epoch": 0.731545349385202, "grad_norm": 5.536355972290039, "learning_rate": 9.481053542450533e-05, "loss": 2.79862003326416, "memory(GiB)": 66.02, "step": 17075, "token_acc": 0.43130990415335463, "train_speed(iter/s)": 1.449188 }, { "epoch": 0.7317595647144509, "grad_norm": 4.253134727478027, "learning_rate": 9.480754949858992e-05, "loss": 2.510112762451172, "memory(GiB)": 66.02, "step": 17080, "token_acc": 0.4689922480620155, "train_speed(iter/s)": 1.449218 }, { "epoch": 0.7319737800437, "grad_norm": 5.482084274291992, "learning_rate": 9.480456276094372e-05, "loss": 2.4189802169799806, "memory(GiB)": 66.02, "step": 17085, "token_acc": 0.4541984732824427, "train_speed(iter/s)": 1.449333 }, { "epoch": 0.7321879953729489, "grad_norm": 4.540184020996094, "learning_rate": 9.480157521162084e-05, "loss": 2.663313293457031, "memory(GiB)": 66.02, "step": 17090, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.449373 }, { "epoch": 0.7324022107021978, "grad_norm": 4.649466514587402, "learning_rate": 9.479858685067538e-05, "loss": 2.7879491806030274, "memory(GiB)": 66.02, "step": 17095, "token_acc": 0.4125, "train_speed(iter/s)": 1.449412 }, { "epoch": 0.7326164260314468, "grad_norm": 3.7161691188812256, "learning_rate": 9.479559767816151e-05, "loss": 2.9131710052490236, "memory(GiB)": 66.02, "step": 17100, "token_acc": 0.44036697247706424, "train_speed(iter/s)": 1.449286 }, { "epoch": 0.7328306413606958, "grad_norm": 4.126161098480225, "learning_rate": 9.479260769413335e-05, "loss": 2.775347900390625, "memory(GiB)": 66.02, "step": 17105, "token_acc": 0.44625407166123776, "train_speed(iter/s)": 1.44938 }, { "epoch": 0.7330448566899447, "grad_norm": 3.6814563274383545, "learning_rate": 9.47896168986451e-05, "loss": 2.5244625091552733, "memory(GiB)": 66.02, "step": 17110, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 1.449404 }, { "epoch": 0.7332590720191937, "grad_norm": 5.023619174957275, "learning_rate": 9.478662529175091e-05, "loss": 2.7300262451171875, "memory(GiB)": 66.02, "step": 17115, "token_acc": 0.45544554455445546, "train_speed(iter/s)": 1.44941 }, { "epoch": 0.7334732873484426, "grad_norm": 3.727031707763672, "learning_rate": 9.4783632873505e-05, "loss": 2.1511747360229494, "memory(GiB)": 66.02, "step": 17120, "token_acc": 0.5038759689922481, "train_speed(iter/s)": 1.449482 }, { "epoch": 0.7336875026776916, "grad_norm": 6.000275135040283, "learning_rate": 9.478063964396156e-05, "loss": 2.6481086730957033, "memory(GiB)": 66.02, "step": 17125, "token_acc": 0.4396887159533074, "train_speed(iter/s)": 1.449574 }, { "epoch": 0.7339017180069406, "grad_norm": 3.840193033218384, "learning_rate": 9.477764560317483e-05, "loss": 2.6058633804321287, "memory(GiB)": 66.02, "step": 17130, "token_acc": 0.4669421487603306, "train_speed(iter/s)": 1.449719 }, { "epoch": 0.7341159333361895, "grad_norm": 4.2880754470825195, "learning_rate": 9.477465075119905e-05, "loss": 2.7650218963623048, "memory(GiB)": 66.02, "step": 17135, "token_acc": 0.43086816720257237, "train_speed(iter/s)": 1.449769 }, { "epoch": 0.7343301486654386, "grad_norm": 4.753639221191406, "learning_rate": 9.477165508808847e-05, "loss": 2.8226192474365233, "memory(GiB)": 66.02, "step": 17140, "token_acc": 0.43050847457627117, "train_speed(iter/s)": 1.449841 }, { "epoch": 0.7345443639946875, "grad_norm": 3.7705907821655273, "learning_rate": 9.476865861389735e-05, "loss": 2.580490303039551, "memory(GiB)": 66.02, "step": 17145, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.449804 }, { "epoch": 0.7347585793239364, "grad_norm": 4.285682678222656, "learning_rate": 9.476566132867999e-05, "loss": 2.20565242767334, "memory(GiB)": 66.02, "step": 17150, "token_acc": 0.5225225225225225, "train_speed(iter/s)": 1.449832 }, { "epoch": 0.7349727946531854, "grad_norm": 3.546729564666748, "learning_rate": 9.476266323249068e-05, "loss": 2.4750633239746094, "memory(GiB)": 66.02, "step": 17155, "token_acc": 0.5203761755485894, "train_speed(iter/s)": 1.449815 }, { "epoch": 0.7351870099824344, "grad_norm": 4.114686489105225, "learning_rate": 9.475966432538372e-05, "loss": 2.786367988586426, "memory(GiB)": 66.02, "step": 17160, "token_acc": 0.3859060402684564, "train_speed(iter/s)": 1.449752 }, { "epoch": 0.7354012253116833, "grad_norm": 4.056107044219971, "learning_rate": 9.475666460741348e-05, "loss": 2.5953720092773436, "memory(GiB)": 66.02, "step": 17165, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.449773 }, { "epoch": 0.7356154406409323, "grad_norm": 4.376646518707275, "learning_rate": 9.475366407863427e-05, "loss": 2.645228385925293, "memory(GiB)": 66.02, "step": 17170, "token_acc": 0.47230320699708456, "train_speed(iter/s)": 1.449742 }, { "epoch": 0.7358296559701812, "grad_norm": 5.003983974456787, "learning_rate": 9.475066273910043e-05, "loss": 2.7561511993408203, "memory(GiB)": 66.02, "step": 17175, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.449773 }, { "epoch": 0.7360438712994302, "grad_norm": 3.858750343322754, "learning_rate": 9.474766058886637e-05, "loss": 2.612289047241211, "memory(GiB)": 66.02, "step": 17180, "token_acc": 0.49085365853658536, "train_speed(iter/s)": 1.449818 }, { "epoch": 0.7362580866286792, "grad_norm": 5.102039337158203, "learning_rate": 9.474465762798646e-05, "loss": 2.579855728149414, "memory(GiB)": 66.02, "step": 17185, "token_acc": 0.46099290780141844, "train_speed(iter/s)": 1.449892 }, { "epoch": 0.7364723019579281, "grad_norm": 4.245791912078857, "learning_rate": 9.474165385651511e-05, "loss": 2.4613508224487304, "memory(GiB)": 66.02, "step": 17190, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.449868 }, { "epoch": 0.736686517287177, "grad_norm": 5.47557258605957, "learning_rate": 9.473864927450673e-05, "loss": 2.4753299713134767, "memory(GiB)": 66.02, "step": 17195, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.449868 }, { "epoch": 0.7369007326164261, "grad_norm": 4.8811869621276855, "learning_rate": 9.473564388201575e-05, "loss": 2.5692346572875975, "memory(GiB)": 66.02, "step": 17200, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.449913 }, { "epoch": 0.737114947945675, "grad_norm": 4.696002006530762, "learning_rate": 9.473263767909659e-05, "loss": 2.8190792083740233, "memory(GiB)": 66.02, "step": 17205, "token_acc": 0.4504792332268371, "train_speed(iter/s)": 1.449948 }, { "epoch": 0.7373291632749239, "grad_norm": 4.203622341156006, "learning_rate": 9.472963066580375e-05, "loss": 3.2341758728027346, "memory(GiB)": 66.02, "step": 17210, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.450074 }, { "epoch": 0.737543378604173, "grad_norm": 4.111051559448242, "learning_rate": 9.472662284219171e-05, "loss": 2.8425106048583983, "memory(GiB)": 66.02, "step": 17215, "token_acc": 0.43260188087774293, "train_speed(iter/s)": 1.450163 }, { "epoch": 0.7377575939334219, "grad_norm": 4.058981418609619, "learning_rate": 9.472361420831493e-05, "loss": 2.6882980346679686, "memory(GiB)": 66.02, "step": 17220, "token_acc": 0.46273291925465837, "train_speed(iter/s)": 1.450247 }, { "epoch": 0.7379718092626708, "grad_norm": 4.370203495025635, "learning_rate": 9.472060476422791e-05, "loss": 2.318088722229004, "memory(GiB)": 66.02, "step": 17225, "token_acc": 0.4954682779456193, "train_speed(iter/s)": 1.450205 }, { "epoch": 0.7381860245919198, "grad_norm": 4.204536437988281, "learning_rate": 9.471759450998521e-05, "loss": 2.496956634521484, "memory(GiB)": 66.02, "step": 17230, "token_acc": 0.4309210526315789, "train_speed(iter/s)": 1.450172 }, { "epoch": 0.7384002399211688, "grad_norm": 4.768651008605957, "learning_rate": 9.471458344564132e-05, "loss": 2.6269115447998046, "memory(GiB)": 66.02, "step": 17235, "token_acc": 0.4641638225255973, "train_speed(iter/s)": 1.450207 }, { "epoch": 0.7386144552504177, "grad_norm": 4.9118781089782715, "learning_rate": 9.47115715712508e-05, "loss": 2.758819007873535, "memory(GiB)": 66.02, "step": 17240, "token_acc": 0.43042071197411, "train_speed(iter/s)": 1.450172 }, { "epoch": 0.7388286705796667, "grad_norm": 4.1345744132995605, "learning_rate": 9.470855888686822e-05, "loss": 2.638313102722168, "memory(GiB)": 66.02, "step": 17245, "token_acc": 0.4491017964071856, "train_speed(iter/s)": 1.450197 }, { "epoch": 0.7390428859089156, "grad_norm": 4.762982368469238, "learning_rate": 9.470554539254816e-05, "loss": 2.7447622299194334, "memory(GiB)": 66.02, "step": 17250, "token_acc": 0.44281524926686217, "train_speed(iter/s)": 1.450241 }, { "epoch": 0.7392571012381646, "grad_norm": 4.486371040344238, "learning_rate": 9.470253108834522e-05, "loss": 2.7322099685668944, "memory(GiB)": 66.02, "step": 17255, "token_acc": 0.41968911917098445, "train_speed(iter/s)": 1.450174 }, { "epoch": 0.7394713165674136, "grad_norm": 3.7400050163269043, "learning_rate": 9.469951597431398e-05, "loss": 2.816219711303711, "memory(GiB)": 66.02, "step": 17260, "token_acc": 0.4208860759493671, "train_speed(iter/s)": 1.450182 }, { "epoch": 0.7396855318966625, "grad_norm": 6.051055431365967, "learning_rate": 9.469650005050907e-05, "loss": 2.57449951171875, "memory(GiB)": 66.02, "step": 17265, "token_acc": 0.47470817120622566, "train_speed(iter/s)": 1.450168 }, { "epoch": 0.7398997472259115, "grad_norm": 5.436326503753662, "learning_rate": 9.469348331698516e-05, "loss": 2.6801521301269533, "memory(GiB)": 66.02, "step": 17270, "token_acc": 0.4186746987951807, "train_speed(iter/s)": 1.450144 }, { "epoch": 0.7401139625551605, "grad_norm": 4.679691314697266, "learning_rate": 9.469046577379685e-05, "loss": 2.7199474334716798, "memory(GiB)": 66.02, "step": 17275, "token_acc": 0.4339622641509434, "train_speed(iter/s)": 1.450258 }, { "epoch": 0.7403281778844094, "grad_norm": 5.3936686515808105, "learning_rate": 9.468744742099885e-05, "loss": 2.459560012817383, "memory(GiB)": 66.02, "step": 17280, "token_acc": 0.5078125, "train_speed(iter/s)": 1.45026 }, { "epoch": 0.7405423932136583, "grad_norm": 4.026109218597412, "learning_rate": 9.468442825864578e-05, "loss": 2.6350627899169923, "memory(GiB)": 66.02, "step": 17285, "token_acc": 0.45724907063197023, "train_speed(iter/s)": 1.450334 }, { "epoch": 0.7407566085429074, "grad_norm": 3.9744529724121094, "learning_rate": 9.468140828679242e-05, "loss": 2.5128152847290037, "memory(GiB)": 66.02, "step": 17290, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.450349 }, { "epoch": 0.7409708238721563, "grad_norm": 4.821835041046143, "learning_rate": 9.467838750549341e-05, "loss": 2.7504316329956056, "memory(GiB)": 66.02, "step": 17295, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.45032 }, { "epoch": 0.7411850392014052, "grad_norm": 4.159036159515381, "learning_rate": 9.46753659148035e-05, "loss": 2.8426998138427733, "memory(GiB)": 66.02, "step": 17300, "token_acc": 0.4271523178807947, "train_speed(iter/s)": 1.450319 }, { "epoch": 0.7413992545306543, "grad_norm": 5.1360554695129395, "learning_rate": 9.467234351477743e-05, "loss": 2.6242515563964846, "memory(GiB)": 66.02, "step": 17305, "token_acc": 0.42452830188679247, "train_speed(iter/s)": 1.45043 }, { "epoch": 0.7416134698599032, "grad_norm": 4.638542175292969, "learning_rate": 9.466932030546996e-05, "loss": 2.698273849487305, "memory(GiB)": 66.02, "step": 17310, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.450486 }, { "epoch": 0.7418276851891521, "grad_norm": 3.99564266204834, "learning_rate": 9.466629628693584e-05, "loss": 2.454903984069824, "memory(GiB)": 66.02, "step": 17315, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.45057 }, { "epoch": 0.7420419005184011, "grad_norm": 4.749660968780518, "learning_rate": 9.466327145922986e-05, "loss": 2.6469778060913085, "memory(GiB)": 66.02, "step": 17320, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.45059 }, { "epoch": 0.74225611584765, "grad_norm": 4.526060581207275, "learning_rate": 9.466024582240683e-05, "loss": 2.284941482543945, "memory(GiB)": 66.02, "step": 17325, "token_acc": 0.5019607843137255, "train_speed(iter/s)": 1.450661 }, { "epoch": 0.742470331176899, "grad_norm": 7.708323001861572, "learning_rate": 9.465721937652155e-05, "loss": 2.456673812866211, "memory(GiB)": 66.02, "step": 17330, "token_acc": 0.4979919678714859, "train_speed(iter/s)": 1.450554 }, { "epoch": 0.742684546506148, "grad_norm": 4.336303234100342, "learning_rate": 9.465419212162883e-05, "loss": 2.5639923095703123, "memory(GiB)": 66.02, "step": 17335, "token_acc": 0.47468354430379744, "train_speed(iter/s)": 1.450559 }, { "epoch": 0.7428987618353969, "grad_norm": 3.728372812271118, "learning_rate": 9.465116405778357e-05, "loss": 2.7540443420410154, "memory(GiB)": 66.02, "step": 17340, "token_acc": 0.47410358565737054, "train_speed(iter/s)": 1.450645 }, { "epoch": 0.7431129771646459, "grad_norm": 5.069708824157715, "learning_rate": 9.464813518504055e-05, "loss": 2.604481506347656, "memory(GiB)": 66.02, "step": 17345, "token_acc": 0.44140625, "train_speed(iter/s)": 1.450759 }, { "epoch": 0.7433271924938949, "grad_norm": 4.775428771972656, "learning_rate": 9.464510550345469e-05, "loss": 2.732996368408203, "memory(GiB)": 66.02, "step": 17350, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.450796 }, { "epoch": 0.7435414078231438, "grad_norm": 4.489157199859619, "learning_rate": 9.464207501308086e-05, "loss": 2.366771697998047, "memory(GiB)": 66.02, "step": 17355, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.450912 }, { "epoch": 0.7437556231523927, "grad_norm": 4.524479389190674, "learning_rate": 9.463904371397397e-05, "loss": 2.513582229614258, "memory(GiB)": 66.02, "step": 17360, "token_acc": 0.48249027237354086, "train_speed(iter/s)": 1.450915 }, { "epoch": 0.7439698384816418, "grad_norm": 4.527760982513428, "learning_rate": 9.463601160618892e-05, "loss": 2.7093578338623048, "memory(GiB)": 66.02, "step": 17365, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.450911 }, { "epoch": 0.7441840538108907, "grad_norm": 5.473022937774658, "learning_rate": 9.463297868978066e-05, "loss": 2.577850341796875, "memory(GiB)": 66.02, "step": 17370, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.450963 }, { "epoch": 0.7443982691401396, "grad_norm": 5.752939701080322, "learning_rate": 9.462994496480411e-05, "loss": 2.5562368392944337, "memory(GiB)": 66.02, "step": 17375, "token_acc": 0.4707792207792208, "train_speed(iter/s)": 1.450881 }, { "epoch": 0.7446124844693887, "grad_norm": 4.581209659576416, "learning_rate": 9.462691043131424e-05, "loss": 2.479447364807129, "memory(GiB)": 66.02, "step": 17380, "token_acc": 0.5126353790613718, "train_speed(iter/s)": 1.450983 }, { "epoch": 0.7448266997986376, "grad_norm": 5.816927909851074, "learning_rate": 9.462387508936605e-05, "loss": 2.7149803161621096, "memory(GiB)": 66.02, "step": 17385, "token_acc": 0.4235294117647059, "train_speed(iter/s)": 1.450978 }, { "epoch": 0.7450409151278865, "grad_norm": 4.131303310394287, "learning_rate": 9.462083893901448e-05, "loss": 2.7552743911743165, "memory(GiB)": 66.02, "step": 17390, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.450967 }, { "epoch": 0.7452551304571355, "grad_norm": 4.87317419052124, "learning_rate": 9.461780198031456e-05, "loss": 2.588461685180664, "memory(GiB)": 66.02, "step": 17395, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.450928 }, { "epoch": 0.7454693457863845, "grad_norm": 6.666911602020264, "learning_rate": 9.46147642133213e-05, "loss": 2.7245845794677734, "memory(GiB)": 66.02, "step": 17400, "token_acc": 0.43389830508474575, "train_speed(iter/s)": 1.451009 }, { "epoch": 0.7456835611156334, "grad_norm": 3.6350271701812744, "learning_rate": 9.461172563808973e-05, "loss": 2.5686161041259767, "memory(GiB)": 66.02, "step": 17405, "token_acc": 0.4646153846153846, "train_speed(iter/s)": 1.451058 }, { "epoch": 0.7458977764448824, "grad_norm": 4.5575666427612305, "learning_rate": 9.46086862546749e-05, "loss": 2.672595977783203, "memory(GiB)": 66.02, "step": 17410, "token_acc": 0.45317220543806647, "train_speed(iter/s)": 1.451049 }, { "epoch": 0.7461119917741313, "grad_norm": 3.5397253036499023, "learning_rate": 9.460564606313189e-05, "loss": 2.6991245269775392, "memory(GiB)": 66.02, "step": 17415, "token_acc": 0.45930232558139533, "train_speed(iter/s)": 1.451117 }, { "epoch": 0.7463262071033803, "grad_norm": 3.950761556625366, "learning_rate": 9.460260506351573e-05, "loss": 2.5688545227050783, "memory(GiB)": 66.02, "step": 17420, "token_acc": 0.43582089552238806, "train_speed(iter/s)": 1.451096 }, { "epoch": 0.7465404224326293, "grad_norm": 4.27651834487915, "learning_rate": 9.459956325588155e-05, "loss": 2.6067344665527346, "memory(GiB)": 66.02, "step": 17425, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.451137 }, { "epoch": 0.7467546377618782, "grad_norm": 4.900177478790283, "learning_rate": 9.459652064028445e-05, "loss": 2.91363468170166, "memory(GiB)": 66.02, "step": 17430, "token_acc": 0.4246987951807229, "train_speed(iter/s)": 1.451214 }, { "epoch": 0.7469688530911271, "grad_norm": 4.552701473236084, "learning_rate": 9.459347721677954e-05, "loss": 2.4086576461791993, "memory(GiB)": 66.02, "step": 17435, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.451154 }, { "epoch": 0.7471830684203762, "grad_norm": 6.436448574066162, "learning_rate": 9.459043298542196e-05, "loss": 2.7376712799072265, "memory(GiB)": 66.02, "step": 17440, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.451254 }, { "epoch": 0.7473972837496251, "grad_norm": 4.306189060211182, "learning_rate": 9.458738794626687e-05, "loss": 2.7037752151489256, "memory(GiB)": 66.02, "step": 17445, "token_acc": 0.5109717868338558, "train_speed(iter/s)": 1.451335 }, { "epoch": 0.747611499078874, "grad_norm": 3.639421224594116, "learning_rate": 9.45843420993694e-05, "loss": 2.3604948043823244, "memory(GiB)": 66.02, "step": 17450, "token_acc": 0.4721189591078067, "train_speed(iter/s)": 1.451412 }, { "epoch": 0.7478257144081231, "grad_norm": 3.6510393619537354, "learning_rate": 9.458129544478476e-05, "loss": 2.79281063079834, "memory(GiB)": 66.02, "step": 17455, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.451391 }, { "epoch": 0.748039929737372, "grad_norm": 4.624139308929443, "learning_rate": 9.457824798256813e-05, "loss": 2.5438098907470703, "memory(GiB)": 66.02, "step": 17460, "token_acc": 0.4315068493150685, "train_speed(iter/s)": 1.451249 }, { "epoch": 0.7482541450666209, "grad_norm": 3.588677406311035, "learning_rate": 9.457519971277471e-05, "loss": 2.483653259277344, "memory(GiB)": 66.02, "step": 17465, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.451298 }, { "epoch": 0.74846836039587, "grad_norm": 3.9264121055603027, "learning_rate": 9.457215063545974e-05, "loss": 2.7714941024780275, "memory(GiB)": 66.02, "step": 17470, "token_acc": 0.4421052631578947, "train_speed(iter/s)": 1.4513 }, { "epoch": 0.7486825757251189, "grad_norm": 3.972663164138794, "learning_rate": 9.456910075067847e-05, "loss": 2.0708208084106445, "memory(GiB)": 66.02, "step": 17475, "token_acc": 0.544, "train_speed(iter/s)": 1.451334 }, { "epoch": 0.7488967910543679, "grad_norm": 6.843869209289551, "learning_rate": 9.45660500584861e-05, "loss": 2.442848014831543, "memory(GiB)": 66.02, "step": 17480, "token_acc": 0.544, "train_speed(iter/s)": 1.451375 }, { "epoch": 0.7491110063836168, "grad_norm": 4.040668487548828, "learning_rate": 9.456299855893794e-05, "loss": 2.41268310546875, "memory(GiB)": 66.02, "step": 17485, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.451335 }, { "epoch": 0.7493252217128658, "grad_norm": 4.642605781555176, "learning_rate": 9.455994625208926e-05, "loss": 2.4285951614379884, "memory(GiB)": 66.02, "step": 17490, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.451439 }, { "epoch": 0.7495394370421148, "grad_norm": 4.087214946746826, "learning_rate": 9.455689313799535e-05, "loss": 2.445493698120117, "memory(GiB)": 66.02, "step": 17495, "token_acc": 0.46932515337423314, "train_speed(iter/s)": 1.451506 }, { "epoch": 0.7497536523713637, "grad_norm": 4.687131404876709, "learning_rate": 9.455383921671153e-05, "loss": 2.786838912963867, "memory(GiB)": 66.02, "step": 17500, "token_acc": 0.43252595155709345, "train_speed(iter/s)": 1.451542 }, { "epoch": 0.7497536523713637, "eval_loss": 2.231870174407959, "eval_runtime": 14.8081, "eval_samples_per_second": 6.753, "eval_steps_per_second": 6.753, "eval_token_acc": 0.47354497354497355, "step": 17500 }, { "epoch": 0.7499678677006126, "grad_norm": 5.118946552276611, "learning_rate": 9.45507844882931e-05, "loss": 2.565242385864258, "memory(GiB)": 66.02, "step": 17505, "token_acc": 0.473633748801534, "train_speed(iter/s)": 1.449738 }, { "epoch": 0.7501820830298617, "grad_norm": 6.066102027893066, "learning_rate": 9.454772895279543e-05, "loss": 2.4543426513671873, "memory(GiB)": 66.02, "step": 17510, "token_acc": 0.5101214574898786, "train_speed(iter/s)": 1.449789 }, { "epoch": 0.7503962983591106, "grad_norm": 3.9736335277557373, "learning_rate": 9.454467261027386e-05, "loss": 2.4278087615966797, "memory(GiB)": 66.02, "step": 17515, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.449808 }, { "epoch": 0.7506105136883595, "grad_norm": 4.78377628326416, "learning_rate": 9.454161546078375e-05, "loss": 2.2966567993164064, "memory(GiB)": 66.02, "step": 17520, "token_acc": 0.444015444015444, "train_speed(iter/s)": 1.449754 }, { "epoch": 0.7508247290176085, "grad_norm": 4.362670421600342, "learning_rate": 9.453855750438049e-05, "loss": 2.4804916381835938, "memory(GiB)": 66.02, "step": 17525, "token_acc": 0.48, "train_speed(iter/s)": 1.44969 }, { "epoch": 0.7510389443468575, "grad_norm": 4.259242057800293, "learning_rate": 9.45354987411195e-05, "loss": 2.792049789428711, "memory(GiB)": 66.02, "step": 17530, "token_acc": 0.45564516129032256, "train_speed(iter/s)": 1.449794 }, { "epoch": 0.7512531596761064, "grad_norm": 5.366889476776123, "learning_rate": 9.453243917105617e-05, "loss": 2.934235954284668, "memory(GiB)": 66.02, "step": 17535, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.449786 }, { "epoch": 0.7514673750053554, "grad_norm": 4.995511054992676, "learning_rate": 9.452937879424592e-05, "loss": 2.5973579406738283, "memory(GiB)": 66.02, "step": 17540, "token_acc": 0.44554455445544555, "train_speed(iter/s)": 1.449902 }, { "epoch": 0.7516815903346044, "grad_norm": 3.5699355602264404, "learning_rate": 9.452631761074419e-05, "loss": 2.5971807479858398, "memory(GiB)": 66.02, "step": 17545, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.449844 }, { "epoch": 0.7518958056638533, "grad_norm": 4.320559024810791, "learning_rate": 9.452325562060647e-05, "loss": 2.797038269042969, "memory(GiB)": 66.02, "step": 17550, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.449845 }, { "epoch": 0.7521100209931023, "grad_norm": 4.72542667388916, "learning_rate": 9.45201928238882e-05, "loss": 2.6849876403808595, "memory(GiB)": 66.02, "step": 17555, "token_acc": 0.4404332129963899, "train_speed(iter/s)": 1.449862 }, { "epoch": 0.7523242363223512, "grad_norm": 7.0616936683654785, "learning_rate": 9.451712922064488e-05, "loss": 2.8545696258544924, "memory(GiB)": 66.02, "step": 17560, "token_acc": 0.3981191222570533, "train_speed(iter/s)": 1.449808 }, { "epoch": 0.7525384516516002, "grad_norm": 4.490034103393555, "learning_rate": 9.4514064810932e-05, "loss": 2.809182357788086, "memory(GiB)": 66.02, "step": 17565, "token_acc": 0.4652567975830816, "train_speed(iter/s)": 1.449811 }, { "epoch": 0.7527526669808492, "grad_norm": 3.929772138595581, "learning_rate": 9.451099959480507e-05, "loss": 2.648844909667969, "memory(GiB)": 66.02, "step": 17570, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.449763 }, { "epoch": 0.7529668823100981, "grad_norm": 3.9892163276672363, "learning_rate": 9.450793357231962e-05, "loss": 2.5053421020507813, "memory(GiB)": 66.02, "step": 17575, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.449791 }, { "epoch": 0.753181097639347, "grad_norm": 4.415933132171631, "learning_rate": 9.450486674353122e-05, "loss": 2.777522087097168, "memory(GiB)": 66.02, "step": 17580, "token_acc": 0.45514950166112955, "train_speed(iter/s)": 1.449786 }, { "epoch": 0.7533953129685961, "grad_norm": 4.361843109130859, "learning_rate": 9.450179910849541e-05, "loss": 2.814442825317383, "memory(GiB)": 66.02, "step": 17585, "token_acc": 0.43130990415335463, "train_speed(iter/s)": 1.449747 }, { "epoch": 0.753609528297845, "grad_norm": 4.309413433074951, "learning_rate": 9.449873066726775e-05, "loss": 2.5275644302368163, "memory(GiB)": 66.02, "step": 17590, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.449846 }, { "epoch": 0.7538237436270939, "grad_norm": 4.586771011352539, "learning_rate": 9.449566141990384e-05, "loss": 2.312491607666016, "memory(GiB)": 66.02, "step": 17595, "token_acc": 0.4847328244274809, "train_speed(iter/s)": 1.44991 }, { "epoch": 0.754037958956343, "grad_norm": 4.46777868270874, "learning_rate": 9.449259136645929e-05, "loss": 2.7186101913452148, "memory(GiB)": 66.02, "step": 17600, "token_acc": 0.45666666666666667, "train_speed(iter/s)": 1.44994 }, { "epoch": 0.7542521742855919, "grad_norm": 4.810561180114746, "learning_rate": 9.448952050698972e-05, "loss": 2.4807811737060548, "memory(GiB)": 66.02, "step": 17605, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.45005 }, { "epoch": 0.7544663896148408, "grad_norm": 6.472301959991455, "learning_rate": 9.448644884155075e-05, "loss": 2.7250158309936525, "memory(GiB)": 66.02, "step": 17610, "token_acc": 0.4765840220385675, "train_speed(iter/s)": 1.450145 }, { "epoch": 0.7546806049440898, "grad_norm": 3.7796828746795654, "learning_rate": 9.448337637019801e-05, "loss": 2.7196197509765625, "memory(GiB)": 66.02, "step": 17615, "token_acc": 0.45132743362831856, "train_speed(iter/s)": 1.450192 }, { "epoch": 0.7548948202733388, "grad_norm": 4.772680282592773, "learning_rate": 9.44803030929872e-05, "loss": 2.7956024169921876, "memory(GiB)": 66.02, "step": 17620, "token_acc": 0.4255952380952381, "train_speed(iter/s)": 1.450212 }, { "epoch": 0.7551090356025877, "grad_norm": 5.025880336761475, "learning_rate": 9.447722900997396e-05, "loss": 2.634055328369141, "memory(GiB)": 66.02, "step": 17625, "token_acc": 0.4357142857142857, "train_speed(iter/s)": 1.450299 }, { "epoch": 0.7553232509318367, "grad_norm": 4.180508613586426, "learning_rate": 9.447415412121399e-05, "loss": 2.4728137969970705, "memory(GiB)": 66.02, "step": 17630, "token_acc": 0.467680608365019, "train_speed(iter/s)": 1.450384 }, { "epoch": 0.7555374662610856, "grad_norm": 7.413858413696289, "learning_rate": 9.4471078426763e-05, "loss": 2.812285232543945, "memory(GiB)": 66.02, "step": 17635, "token_acc": 0.4222972972972973, "train_speed(iter/s)": 1.450437 }, { "epoch": 0.7557516815903346, "grad_norm": 3.3859031200408936, "learning_rate": 9.44680019266767e-05, "loss": 2.370113563537598, "memory(GiB)": 66.02, "step": 17640, "token_acc": 0.5382165605095541, "train_speed(iter/s)": 1.45046 }, { "epoch": 0.7559658969195836, "grad_norm": 6.244542121887207, "learning_rate": 9.446492462101087e-05, "loss": 2.546002006530762, "memory(GiB)": 66.02, "step": 17645, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.450616 }, { "epoch": 0.7561801122488325, "grad_norm": 3.882312297821045, "learning_rate": 9.446184650982117e-05, "loss": 2.5146961212158203, "memory(GiB)": 66.02, "step": 17650, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.45068 }, { "epoch": 0.7563943275780814, "grad_norm": 5.0457940101623535, "learning_rate": 9.445876759316344e-05, "loss": 2.7844831466674806, "memory(GiB)": 66.02, "step": 17655, "token_acc": 0.43434343434343436, "train_speed(iter/s)": 1.450762 }, { "epoch": 0.7566085429073305, "grad_norm": 6.0084943771362305, "learning_rate": 9.445568787109342e-05, "loss": 2.819187545776367, "memory(GiB)": 66.02, "step": 17660, "token_acc": 0.44, "train_speed(iter/s)": 1.450809 }, { "epoch": 0.7568227582365794, "grad_norm": 4.842067241668701, "learning_rate": 9.445260734366692e-05, "loss": 2.751005172729492, "memory(GiB)": 66.02, "step": 17665, "token_acc": 0.4509090909090909, "train_speed(iter/s)": 1.450824 }, { "epoch": 0.7570369735658283, "grad_norm": 3.4147021770477295, "learning_rate": 9.444952601093972e-05, "loss": 2.756491470336914, "memory(GiB)": 66.02, "step": 17670, "token_acc": 0.4440993788819876, "train_speed(iter/s)": 1.450816 }, { "epoch": 0.7572511888950774, "grad_norm": 5.491716384887695, "learning_rate": 9.444644387296767e-05, "loss": 2.3792045593261717, "memory(GiB)": 66.02, "step": 17675, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.450758 }, { "epoch": 0.7574654042243263, "grad_norm": 3.91731333732605, "learning_rate": 9.444336092980661e-05, "loss": 2.4315948486328125, "memory(GiB)": 66.02, "step": 17680, "token_acc": 0.5109034267912772, "train_speed(iter/s)": 1.450777 }, { "epoch": 0.7576796195535752, "grad_norm": 4.7529401779174805, "learning_rate": 9.444027718151235e-05, "loss": 2.8342124938964846, "memory(GiB)": 66.02, "step": 17685, "token_acc": 0.4432624113475177, "train_speed(iter/s)": 1.450729 }, { "epoch": 0.7578938348828242, "grad_norm": 4.680537223815918, "learning_rate": 9.443719262814079e-05, "loss": 2.5661392211914062, "memory(GiB)": 66.02, "step": 17690, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.450765 }, { "epoch": 0.7581080502120732, "grad_norm": 3.670809268951416, "learning_rate": 9.44341072697478e-05, "loss": 2.448692512512207, "memory(GiB)": 66.02, "step": 17695, "token_acc": 0.47384615384615386, "train_speed(iter/s)": 1.450862 }, { "epoch": 0.7583222655413221, "grad_norm": 3.857788324356079, "learning_rate": 9.443102110638928e-05, "loss": 3.052908515930176, "memory(GiB)": 66.02, "step": 17700, "token_acc": 0.39751552795031053, "train_speed(iter/s)": 1.450919 }, { "epoch": 0.7585364808705711, "grad_norm": 5.057369709014893, "learning_rate": 9.442793413812113e-05, "loss": 2.669346809387207, "memory(GiB)": 66.02, "step": 17705, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.450841 }, { "epoch": 0.75875069619982, "grad_norm": 4.74763298034668, "learning_rate": 9.442484636499927e-05, "loss": 2.49759521484375, "memory(GiB)": 66.02, "step": 17710, "token_acc": 0.4380664652567976, "train_speed(iter/s)": 1.45092 }, { "epoch": 0.758964911529069, "grad_norm": 4.796550273895264, "learning_rate": 9.442175778707965e-05, "loss": 2.521480941772461, "memory(GiB)": 66.02, "step": 17715, "token_acc": 0.45126353790613716, "train_speed(iter/s)": 1.450918 }, { "epoch": 0.759179126858318, "grad_norm": 4.271181106567383, "learning_rate": 9.441866840441822e-05, "loss": 2.573577880859375, "memory(GiB)": 66.02, "step": 17720, "token_acc": 0.4794007490636704, "train_speed(iter/s)": 1.450977 }, { "epoch": 0.7593933421875669, "grad_norm": 5.223025798797607, "learning_rate": 9.441557821707094e-05, "loss": 2.578267478942871, "memory(GiB)": 66.02, "step": 17725, "token_acc": 0.44727272727272727, "train_speed(iter/s)": 1.451081 }, { "epoch": 0.7596075575168159, "grad_norm": 4.418719291687012, "learning_rate": 9.44124872250938e-05, "loss": 2.6255107879638673, "memory(GiB)": 66.02, "step": 17730, "token_acc": 0.4494047619047619, "train_speed(iter/s)": 1.451117 }, { "epoch": 0.7598217728460649, "grad_norm": 3.666797637939453, "learning_rate": 9.440939542854278e-05, "loss": 2.6461490631103515, "memory(GiB)": 66.02, "step": 17735, "token_acc": 0.4421364985163205, "train_speed(iter/s)": 1.451094 }, { "epoch": 0.7600359881753138, "grad_norm": 4.563016891479492, "learning_rate": 9.440630282747392e-05, "loss": 2.5761444091796877, "memory(GiB)": 66.02, "step": 17740, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.451114 }, { "epoch": 0.7602502035045627, "grad_norm": 4.366426944732666, "learning_rate": 9.44032094219432e-05, "loss": 2.7161720275878904, "memory(GiB)": 66.02, "step": 17745, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.451125 }, { "epoch": 0.7604644188338118, "grad_norm": 5.017696380615234, "learning_rate": 9.440011521200671e-05, "loss": 2.853174591064453, "memory(GiB)": 66.02, "step": 17750, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.451034 }, { "epoch": 0.7606786341630607, "grad_norm": 4.0472235679626465, "learning_rate": 9.439702019772047e-05, "loss": 2.7683490753173827, "memory(GiB)": 66.02, "step": 17755, "token_acc": 0.4542857142857143, "train_speed(iter/s)": 1.451026 }, { "epoch": 0.7608928494923096, "grad_norm": 6.4518256187438965, "learning_rate": 9.439392437914058e-05, "loss": 2.53245849609375, "memory(GiB)": 66.02, "step": 17760, "token_acc": 0.4222222222222222, "train_speed(iter/s)": 1.451057 }, { "epoch": 0.7611070648215587, "grad_norm": 4.387604236602783, "learning_rate": 9.439082775632309e-05, "loss": 2.4662708282470702, "memory(GiB)": 66.02, "step": 17765, "token_acc": 0.5, "train_speed(iter/s)": 1.451129 }, { "epoch": 0.7613212801508076, "grad_norm": 6.294944763183594, "learning_rate": 9.43877303293241e-05, "loss": 2.5766059875488283, "memory(GiB)": 66.02, "step": 17770, "token_acc": 0.42452830188679247, "train_speed(iter/s)": 1.451235 }, { "epoch": 0.7615354954800565, "grad_norm": 5.089325428009033, "learning_rate": 9.438463209819976e-05, "loss": 2.781337356567383, "memory(GiB)": 66.02, "step": 17775, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.451227 }, { "epoch": 0.7617497108093055, "grad_norm": 5.062034606933594, "learning_rate": 9.438153306300616e-05, "loss": 2.726021575927734, "memory(GiB)": 66.02, "step": 17780, "token_acc": 0.4509090909090909, "train_speed(iter/s)": 1.451221 }, { "epoch": 0.7619639261385545, "grad_norm": 5.949607849121094, "learning_rate": 9.437843322379947e-05, "loss": 2.318865966796875, "memory(GiB)": 66.02, "step": 17785, "token_acc": 0.5179282868525896, "train_speed(iter/s)": 1.451305 }, { "epoch": 0.7621781414678034, "grad_norm": 3.746591091156006, "learning_rate": 9.437533258063582e-05, "loss": 3.1004989624023436, "memory(GiB)": 66.02, "step": 17790, "token_acc": 0.4119496855345912, "train_speed(iter/s)": 1.451344 }, { "epoch": 0.7623923567970524, "grad_norm": 5.227212429046631, "learning_rate": 9.437223113357141e-05, "loss": 2.436220169067383, "memory(GiB)": 66.02, "step": 17795, "token_acc": 0.4600760456273764, "train_speed(iter/s)": 1.4514 }, { "epoch": 0.7626065721263013, "grad_norm": 3.2719600200653076, "learning_rate": 9.436912888266239e-05, "loss": 2.4398998260498046, "memory(GiB)": 66.02, "step": 17800, "token_acc": 0.5064516129032258, "train_speed(iter/s)": 1.451242 }, { "epoch": 0.7628207874555503, "grad_norm": 3.56967830657959, "learning_rate": 9.436602582796498e-05, "loss": 2.569136619567871, "memory(GiB)": 66.02, "step": 17805, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.451348 }, { "epoch": 0.7630350027847993, "grad_norm": 4.502561569213867, "learning_rate": 9.43629219695354e-05, "loss": 2.598386764526367, "memory(GiB)": 66.02, "step": 17810, "token_acc": 0.4483985765124555, "train_speed(iter/s)": 1.451262 }, { "epoch": 0.7632492181140482, "grad_norm": 3.789357900619507, "learning_rate": 9.435981730742986e-05, "loss": 2.718033027648926, "memory(GiB)": 66.02, "step": 17815, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.451329 }, { "epoch": 0.7634634334432973, "grad_norm": 3.2228078842163086, "learning_rate": 9.435671184170463e-05, "loss": 2.681374168395996, "memory(GiB)": 66.02, "step": 17820, "token_acc": 0.467680608365019, "train_speed(iter/s)": 1.451369 }, { "epoch": 0.7636776487725462, "grad_norm": 3.3303074836730957, "learning_rate": 9.435360557241595e-05, "loss": 2.9132007598876952, "memory(GiB)": 66.02, "step": 17825, "token_acc": 0.4143646408839779, "train_speed(iter/s)": 1.451362 }, { "epoch": 0.7638918641017951, "grad_norm": 5.304351806640625, "learning_rate": 9.43504984996201e-05, "loss": 2.518760108947754, "memory(GiB)": 66.02, "step": 17830, "token_acc": 0.5041322314049587, "train_speed(iter/s)": 1.451438 }, { "epoch": 0.7641060794310441, "grad_norm": 3.8242204189300537, "learning_rate": 9.434739062337337e-05, "loss": 2.5196044921875, "memory(GiB)": 66.02, "step": 17835, "token_acc": 0.46285714285714286, "train_speed(iter/s)": 1.451449 }, { "epoch": 0.7643202947602931, "grad_norm": 3.9230220317840576, "learning_rate": 9.434428194373205e-05, "loss": 2.6513303756713866, "memory(GiB)": 66.02, "step": 17840, "token_acc": 0.4728682170542636, "train_speed(iter/s)": 1.451502 }, { "epoch": 0.764534510089542, "grad_norm": 4.761760711669922, "learning_rate": 9.434117246075246e-05, "loss": 2.5037139892578124, "memory(GiB)": 66.02, "step": 17845, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.451526 }, { "epoch": 0.764748725418791, "grad_norm": 4.526192665100098, "learning_rate": 9.433806217449094e-05, "loss": 2.534004974365234, "memory(GiB)": 66.02, "step": 17850, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.451488 }, { "epoch": 0.7649629407480399, "grad_norm": 4.827754974365234, "learning_rate": 9.433495108500384e-05, "loss": 2.4859790802001953, "memory(GiB)": 66.02, "step": 17855, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.451514 }, { "epoch": 0.7651771560772889, "grad_norm": 3.4157497882843018, "learning_rate": 9.43318391923475e-05, "loss": 2.5476917266845702, "memory(GiB)": 66.02, "step": 17860, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.451488 }, { "epoch": 0.7653913714065379, "grad_norm": 4.01407527923584, "learning_rate": 9.432872649657832e-05, "loss": 2.730925750732422, "memory(GiB)": 66.02, "step": 17865, "token_acc": 0.46963562753036436, "train_speed(iter/s)": 1.451435 }, { "epoch": 0.7656055867357868, "grad_norm": 3.7629337310791016, "learning_rate": 9.432561299775267e-05, "loss": 2.6077495574951173, "memory(GiB)": 66.02, "step": 17870, "token_acc": 0.4548736462093863, "train_speed(iter/s)": 1.451443 }, { "epoch": 0.7658198020650357, "grad_norm": 3.8664438724517822, "learning_rate": 9.432249869592696e-05, "loss": 2.6030914306640627, "memory(GiB)": 66.02, "step": 17875, "token_acc": 0.4808743169398907, "train_speed(iter/s)": 1.451476 }, { "epoch": 0.7660340173942848, "grad_norm": 4.5877156257629395, "learning_rate": 9.431938359115759e-05, "loss": 2.7656469345092773, "memory(GiB)": 66.02, "step": 17880, "token_acc": 0.4427710843373494, "train_speed(iter/s)": 1.451538 }, { "epoch": 0.7662482327235337, "grad_norm": 4.128017902374268, "learning_rate": 9.431626768350104e-05, "loss": 2.3668609619140626, "memory(GiB)": 66.02, "step": 17885, "token_acc": 0.49407114624505927, "train_speed(iter/s)": 1.451567 }, { "epoch": 0.7664624480527826, "grad_norm": 3.8643641471862793, "learning_rate": 9.43131509730137e-05, "loss": 2.262523078918457, "memory(GiB)": 66.02, "step": 17890, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.451522 }, { "epoch": 0.7666766633820317, "grad_norm": 4.605358123779297, "learning_rate": 9.43100334597521e-05, "loss": 2.3411001205444335, "memory(GiB)": 66.02, "step": 17895, "token_acc": 0.5140562248995983, "train_speed(iter/s)": 1.451475 }, { "epoch": 0.7668908787112806, "grad_norm": 4.67607307434082, "learning_rate": 9.430691514377264e-05, "loss": 2.4089115142822264, "memory(GiB)": 66.02, "step": 17900, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.451525 }, { "epoch": 0.7671050940405295, "grad_norm": 3.2710061073303223, "learning_rate": 9.430379602513186e-05, "loss": 2.5764934539794924, "memory(GiB)": 66.02, "step": 17905, "token_acc": 0.4768211920529801, "train_speed(iter/s)": 1.451583 }, { "epoch": 0.7673193093697785, "grad_norm": 5.5393195152282715, "learning_rate": 9.430067610388626e-05, "loss": 2.6153459548950195, "memory(GiB)": 66.02, "step": 17910, "token_acc": 0.4828897338403042, "train_speed(iter/s)": 1.451527 }, { "epoch": 0.7675335246990275, "grad_norm": 3.900489568710327, "learning_rate": 9.429755538009235e-05, "loss": 2.8840137481689454, "memory(GiB)": 66.02, "step": 17915, "token_acc": 0.4584837545126354, "train_speed(iter/s)": 1.4516 }, { "epoch": 0.7677477400282764, "grad_norm": 5.313302040100098, "learning_rate": 9.429443385380667e-05, "loss": 2.4606693267822264, "memory(GiB)": 66.02, "step": 17920, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.451684 }, { "epoch": 0.7679619553575254, "grad_norm": 4.444176197052002, "learning_rate": 9.429131152508577e-05, "loss": 2.484004592895508, "memory(GiB)": 66.02, "step": 17925, "token_acc": 0.4867924528301887, "train_speed(iter/s)": 1.451739 }, { "epoch": 0.7681761706867744, "grad_norm": 4.170411586761475, "learning_rate": 9.42881883939862e-05, "loss": 2.614640808105469, "memory(GiB)": 66.02, "step": 17930, "token_acc": 0.46504559270516715, "train_speed(iter/s)": 1.451818 }, { "epoch": 0.7683903860160233, "grad_norm": 5.1936750411987305, "learning_rate": 9.428506446056458e-05, "loss": 2.537836456298828, "memory(GiB)": 66.02, "step": 17935, "token_acc": 0.5362318840579711, "train_speed(iter/s)": 1.451722 }, { "epoch": 0.7686046013452723, "grad_norm": 4.254932403564453, "learning_rate": 9.428193972487746e-05, "loss": 2.4651468276977537, "memory(GiB)": 66.02, "step": 17940, "token_acc": 0.5412186379928315, "train_speed(iter/s)": 1.451836 }, { "epoch": 0.7688188166745212, "grad_norm": 4.5974555015563965, "learning_rate": 9.427881418698147e-05, "loss": 2.428760528564453, "memory(GiB)": 66.02, "step": 17945, "token_acc": 0.502092050209205, "train_speed(iter/s)": 1.451855 }, { "epoch": 0.7690330320037702, "grad_norm": 4.533402442932129, "learning_rate": 9.427568784693321e-05, "loss": 2.4003820419311523, "memory(GiB)": 66.02, "step": 17950, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.451893 }, { "epoch": 0.7692472473330192, "grad_norm": 5.776178359985352, "learning_rate": 9.427256070478934e-05, "loss": 2.593052291870117, "memory(GiB)": 66.02, "step": 17955, "token_acc": 0.4265232974910394, "train_speed(iter/s)": 1.451927 }, { "epoch": 0.7694614626622681, "grad_norm": 4.446044921875, "learning_rate": 9.426943276060649e-05, "loss": 2.844767761230469, "memory(GiB)": 66.02, "step": 17960, "token_acc": 0.4355400696864111, "train_speed(iter/s)": 1.451976 }, { "epoch": 0.769675677991517, "grad_norm": 6.065513610839844, "learning_rate": 9.426630401444136e-05, "loss": 2.5938388824462892, "memory(GiB)": 66.02, "step": 17965, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.452019 }, { "epoch": 0.7698898933207661, "grad_norm": 4.160604476928711, "learning_rate": 9.426317446635059e-05, "loss": 2.8272281646728517, "memory(GiB)": 66.02, "step": 17970, "token_acc": 0.4328767123287671, "train_speed(iter/s)": 1.452031 }, { "epoch": 0.770104108650015, "grad_norm": 4.037982940673828, "learning_rate": 9.426004411639089e-05, "loss": 2.7478988647460936, "memory(GiB)": 66.02, "step": 17975, "token_acc": 0.4676470588235294, "train_speed(iter/s)": 1.452143 }, { "epoch": 0.7703183239792639, "grad_norm": 3.574831247329712, "learning_rate": 9.425691296461898e-05, "loss": 2.6368877410888674, "memory(GiB)": 66.02, "step": 17980, "token_acc": 0.4551083591331269, "train_speed(iter/s)": 1.452149 }, { "epoch": 0.770532539308513, "grad_norm": 4.331003665924072, "learning_rate": 9.425378101109158e-05, "loss": 2.5000373840332033, "memory(GiB)": 66.02, "step": 17985, "token_acc": 0.47474747474747475, "train_speed(iter/s)": 1.452133 }, { "epoch": 0.7707467546377619, "grad_norm": 4.412370681762695, "learning_rate": 9.425064825586541e-05, "loss": 2.7192708969116213, "memory(GiB)": 66.02, "step": 17990, "token_acc": 0.41904761904761906, "train_speed(iter/s)": 1.45218 }, { "epoch": 0.7709609699670108, "grad_norm": 3.9561266899108887, "learning_rate": 9.424751469899724e-05, "loss": 2.809209632873535, "memory(GiB)": 66.02, "step": 17995, "token_acc": 0.4271523178807947, "train_speed(iter/s)": 1.452232 }, { "epoch": 0.7711751852962598, "grad_norm": 3.802600622177124, "learning_rate": 9.424438034054385e-05, "loss": 2.6348880767822265, "memory(GiB)": 66.02, "step": 18000, "token_acc": 0.5176470588235295, "train_speed(iter/s)": 1.452281 }, { "epoch": 0.7711751852962598, "eval_loss": 2.1363372802734375, "eval_runtime": 14.8199, "eval_samples_per_second": 6.748, "eval_steps_per_second": 6.748, "eval_token_acc": 0.4845222072678331, "step": 18000 }, { "epoch": 0.7713894006255088, "grad_norm": 3.982149362564087, "learning_rate": 9.424124518056199e-05, "loss": 2.6583667755126954, "memory(GiB)": 66.02, "step": 18005, "token_acc": 0.48217821782178216, "train_speed(iter/s)": 1.450389 }, { "epoch": 0.7716036159547577, "grad_norm": 4.694133281707764, "learning_rate": 9.423810921910848e-05, "loss": 2.9110685348510743, "memory(GiB)": 66.02, "step": 18010, "token_acc": 0.4383561643835616, "train_speed(iter/s)": 1.450338 }, { "epoch": 0.7718178312840067, "grad_norm": 4.896605491638184, "learning_rate": 9.42349724562401e-05, "loss": 2.373472213745117, "memory(GiB)": 66.02, "step": 18015, "token_acc": 0.5056179775280899, "train_speed(iter/s)": 1.450378 }, { "epoch": 0.7720320466132556, "grad_norm": 3.073610305786133, "learning_rate": 9.423183489201373e-05, "loss": 2.56213321685791, "memory(GiB)": 66.02, "step": 18020, "token_acc": 0.48417721518987344, "train_speed(iter/s)": 1.450362 }, { "epoch": 0.7722462619425046, "grad_norm": 4.432365417480469, "learning_rate": 9.422869652648617e-05, "loss": 2.6012752532958983, "memory(GiB)": 66.02, "step": 18025, "token_acc": 0.4380664652567976, "train_speed(iter/s)": 1.450337 }, { "epoch": 0.7724604772717536, "grad_norm": 3.4521918296813965, "learning_rate": 9.422555735971426e-05, "loss": 2.4666662216186523, "memory(GiB)": 66.02, "step": 18030, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.45032 }, { "epoch": 0.7726746926010025, "grad_norm": 4.163490295410156, "learning_rate": 9.422241739175491e-05, "loss": 2.196506881713867, "memory(GiB)": 66.02, "step": 18035, "token_acc": 0.5355648535564853, "train_speed(iter/s)": 1.450345 }, { "epoch": 0.7728889079302514, "grad_norm": 4.549071311950684, "learning_rate": 9.421927662266499e-05, "loss": 2.367755889892578, "memory(GiB)": 66.02, "step": 18040, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.450314 }, { "epoch": 0.7731031232595005, "grad_norm": 3.2333977222442627, "learning_rate": 9.421613505250138e-05, "loss": 2.5339868545532225, "memory(GiB)": 66.02, "step": 18045, "token_acc": 0.4647058823529412, "train_speed(iter/s)": 1.450155 }, { "epoch": 0.7733173385887494, "grad_norm": 5.415011882781982, "learning_rate": 9.421299268132103e-05, "loss": 2.6513235092163088, "memory(GiB)": 66.02, "step": 18050, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.450173 }, { "epoch": 0.7735315539179983, "grad_norm": 4.054705619812012, "learning_rate": 9.420984950918082e-05, "loss": 2.5059566497802734, "memory(GiB)": 66.02, "step": 18055, "token_acc": 0.42517006802721086, "train_speed(iter/s)": 1.450235 }, { "epoch": 0.7737457692472474, "grad_norm": 4.358946323394775, "learning_rate": 9.42067055361377e-05, "loss": 2.851646614074707, "memory(GiB)": 66.02, "step": 18060, "token_acc": 0.41055718475073316, "train_speed(iter/s)": 1.450248 }, { "epoch": 0.7739599845764963, "grad_norm": 4.50553035736084, "learning_rate": 9.420356076224865e-05, "loss": 2.516449737548828, "memory(GiB)": 66.02, "step": 18065, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.450234 }, { "epoch": 0.7741741999057452, "grad_norm": 5.02170991897583, "learning_rate": 9.420041518757066e-05, "loss": 2.619902992248535, "memory(GiB)": 66.02, "step": 18070, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.450157 }, { "epoch": 0.7743884152349942, "grad_norm": 4.110943794250488, "learning_rate": 9.419726881216066e-05, "loss": 2.503653717041016, "memory(GiB)": 66.02, "step": 18075, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.45022 }, { "epoch": 0.7746026305642432, "grad_norm": 4.794353008270264, "learning_rate": 9.419412163607567e-05, "loss": 2.4214576721191405, "memory(GiB)": 66.02, "step": 18080, "token_acc": 0.504, "train_speed(iter/s)": 1.450294 }, { "epoch": 0.7748168458934921, "grad_norm": 3.4237639904022217, "learning_rate": 9.419097365937272e-05, "loss": 2.720335006713867, "memory(GiB)": 66.02, "step": 18085, "token_acc": 0.42071197411003236, "train_speed(iter/s)": 1.450338 }, { "epoch": 0.7750310612227411, "grad_norm": 4.554632186889648, "learning_rate": 9.418782488210882e-05, "loss": 2.496338653564453, "memory(GiB)": 66.02, "step": 18090, "token_acc": 0.45229681978798586, "train_speed(iter/s)": 1.450282 }, { "epoch": 0.77524527655199, "grad_norm": 3.722590684890747, "learning_rate": 9.418467530434103e-05, "loss": 2.606220245361328, "memory(GiB)": 66.02, "step": 18095, "token_acc": 0.46, "train_speed(iter/s)": 1.450305 }, { "epoch": 0.775459491881239, "grad_norm": 4.078646183013916, "learning_rate": 9.41815249261264e-05, "loss": 2.6520879745483397, "memory(GiB)": 66.02, "step": 18100, "token_acc": 0.42567567567567566, "train_speed(iter/s)": 1.450402 }, { "epoch": 0.775673707210488, "grad_norm": 4.396276950836182, "learning_rate": 9.417837374752199e-05, "loss": 2.7209846496582033, "memory(GiB)": 66.02, "step": 18105, "token_acc": 0.4582043343653251, "train_speed(iter/s)": 1.450452 }, { "epoch": 0.7758879225397369, "grad_norm": 4.03439474105835, "learning_rate": 9.417522176858491e-05, "loss": 2.641116905212402, "memory(GiB)": 66.02, "step": 18110, "token_acc": 0.46397694524495675, "train_speed(iter/s)": 1.45036 }, { "epoch": 0.7761021378689859, "grad_norm": 4.848409652709961, "learning_rate": 9.417206898937223e-05, "loss": 2.436016654968262, "memory(GiB)": 66.02, "step": 18115, "token_acc": 0.49767441860465117, "train_speed(iter/s)": 1.450261 }, { "epoch": 0.7763163531982349, "grad_norm": 7.31134557723999, "learning_rate": 9.41689154099411e-05, "loss": 2.5979223251342773, "memory(GiB)": 66.02, "step": 18120, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.450142 }, { "epoch": 0.7765305685274838, "grad_norm": 3.8297173976898193, "learning_rate": 9.416576103034862e-05, "loss": 2.53304386138916, "memory(GiB)": 66.02, "step": 18125, "token_acc": 0.46839080459770116, "train_speed(iter/s)": 1.450088 }, { "epoch": 0.7767447838567327, "grad_norm": 4.4571099281311035, "learning_rate": 9.416260585065195e-05, "loss": 2.6305755615234374, "memory(GiB)": 66.02, "step": 18130, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.449995 }, { "epoch": 0.7769589991859818, "grad_norm": 3.7489819526672363, "learning_rate": 9.415944987090826e-05, "loss": 2.425373840332031, "memory(GiB)": 66.02, "step": 18135, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.449956 }, { "epoch": 0.7771732145152307, "grad_norm": 3.8894479274749756, "learning_rate": 9.415629309117471e-05, "loss": 2.251003646850586, "memory(GiB)": 66.02, "step": 18140, "token_acc": 0.5531914893617021, "train_speed(iter/s)": 1.449913 }, { "epoch": 0.7773874298444796, "grad_norm": 4.083927154541016, "learning_rate": 9.415313551150847e-05, "loss": 2.693222236633301, "memory(GiB)": 66.02, "step": 18145, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.44986 }, { "epoch": 0.7776016451737287, "grad_norm": 7.487312316894531, "learning_rate": 9.414997713196678e-05, "loss": 2.689481735229492, "memory(GiB)": 66.02, "step": 18150, "token_acc": 0.4280936454849498, "train_speed(iter/s)": 1.449906 }, { "epoch": 0.7778158605029776, "grad_norm": 6.086834907531738, "learning_rate": 9.414681795260683e-05, "loss": 2.7687519073486326, "memory(GiB)": 66.02, "step": 18155, "token_acc": 0.4637223974763407, "train_speed(iter/s)": 1.450002 }, { "epoch": 0.7780300758322266, "grad_norm": 4.046126365661621, "learning_rate": 9.414365797348586e-05, "loss": 2.657626914978027, "memory(GiB)": 66.02, "step": 18160, "token_acc": 0.43217665615141954, "train_speed(iter/s)": 1.449989 }, { "epoch": 0.7782442911614755, "grad_norm": 3.8933658599853516, "learning_rate": 9.414049719466111e-05, "loss": 2.5921215057373046, "memory(GiB)": 66.02, "step": 18165, "token_acc": 0.4681647940074906, "train_speed(iter/s)": 1.449964 }, { "epoch": 0.7784585064907245, "grad_norm": 4.799526691436768, "learning_rate": 9.413733561618985e-05, "loss": 2.7638824462890623, "memory(GiB)": 66.02, "step": 18170, "token_acc": 0.45229681978798586, "train_speed(iter/s)": 1.44989 }, { "epoch": 0.7786727218199735, "grad_norm": 5.367789268493652, "learning_rate": 9.413417323812936e-05, "loss": 2.3000259399414062, "memory(GiB)": 66.02, "step": 18175, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.449868 }, { "epoch": 0.7788869371492224, "grad_norm": 4.351169109344482, "learning_rate": 9.41310100605369e-05, "loss": 2.697555732727051, "memory(GiB)": 66.02, "step": 18180, "token_acc": 0.4478021978021978, "train_speed(iter/s)": 1.449802 }, { "epoch": 0.7791011524784713, "grad_norm": 4.8134541511535645, "learning_rate": 9.412784608346983e-05, "loss": 2.8435754776000977, "memory(GiB)": 66.02, "step": 18185, "token_acc": 0.45, "train_speed(iter/s)": 1.44973 }, { "epoch": 0.7793153678077204, "grad_norm": 5.636275291442871, "learning_rate": 9.41246813069854e-05, "loss": 2.467054557800293, "memory(GiB)": 66.02, "step": 18190, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.449771 }, { "epoch": 0.7795295831369693, "grad_norm": 4.488534927368164, "learning_rate": 9.412151573114098e-05, "loss": 2.6613616943359375, "memory(GiB)": 66.02, "step": 18195, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.449799 }, { "epoch": 0.7797437984662182, "grad_norm": 4.791868209838867, "learning_rate": 9.411834935599393e-05, "loss": 2.77734489440918, "memory(GiB)": 66.02, "step": 18200, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.449803 }, { "epoch": 0.7799580137954673, "grad_norm": 3.9967901706695557, "learning_rate": 9.411518218160158e-05, "loss": 2.680686378479004, "memory(GiB)": 66.02, "step": 18205, "token_acc": 0.44089456869009586, "train_speed(iter/s)": 1.449883 }, { "epoch": 0.7801722291247162, "grad_norm": 3.768009662628174, "learning_rate": 9.411201420802134e-05, "loss": 2.5262779235839843, "memory(GiB)": 66.02, "step": 18210, "token_acc": 0.445859872611465, "train_speed(iter/s)": 1.449904 }, { "epoch": 0.7803864444539651, "grad_norm": 6.320119857788086, "learning_rate": 9.410884543531056e-05, "loss": 2.4924684524536134, "memory(GiB)": 66.02, "step": 18215, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.44976 }, { "epoch": 0.7806006597832141, "grad_norm": 3.9295129776000977, "learning_rate": 9.410567586352668e-05, "loss": 2.752115249633789, "memory(GiB)": 66.02, "step": 18220, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.449737 }, { "epoch": 0.7808148751124631, "grad_norm": 3.391709089279175, "learning_rate": 9.41025054927271e-05, "loss": 2.3972236633300783, "memory(GiB)": 66.02, "step": 18225, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.44969 }, { "epoch": 0.781029090441712, "grad_norm": 4.542713165283203, "learning_rate": 9.409933432296927e-05, "loss": 2.6938899993896483, "memory(GiB)": 66.02, "step": 18230, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.449607 }, { "epoch": 0.781243305770961, "grad_norm": 4.435104846954346, "learning_rate": 9.409616235431062e-05, "loss": 2.4522470474243163, "memory(GiB)": 66.02, "step": 18235, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 1.449424 }, { "epoch": 0.7814575211002099, "grad_norm": 5.2817864418029785, "learning_rate": 9.409298958680864e-05, "loss": 2.65223388671875, "memory(GiB)": 66.02, "step": 18240, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.449496 }, { "epoch": 0.7816717364294589, "grad_norm": 4.501829624176025, "learning_rate": 9.408981602052078e-05, "loss": 2.47408447265625, "memory(GiB)": 66.02, "step": 18245, "token_acc": 0.4725274725274725, "train_speed(iter/s)": 1.449509 }, { "epoch": 0.7818859517587079, "grad_norm": 3.8733203411102295, "learning_rate": 9.408664165550453e-05, "loss": 2.434902572631836, "memory(GiB)": 66.02, "step": 18250, "token_acc": 0.49429657794676807, "train_speed(iter/s)": 1.44958 }, { "epoch": 0.7821001670879568, "grad_norm": 4.0289626121521, "learning_rate": 9.408346649181742e-05, "loss": 2.555371856689453, "memory(GiB)": 66.02, "step": 18255, "token_acc": 0.4536741214057508, "train_speed(iter/s)": 1.449654 }, { "epoch": 0.7823143824172057, "grad_norm": 3.9965853691101074, "learning_rate": 9.408029052951698e-05, "loss": 2.5941064834594725, "memory(GiB)": 66.02, "step": 18260, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.449677 }, { "epoch": 0.7825285977464548, "grad_norm": 4.760998249053955, "learning_rate": 9.407711376866071e-05, "loss": 2.619590187072754, "memory(GiB)": 66.02, "step": 18265, "token_acc": 0.45555555555555555, "train_speed(iter/s)": 1.449678 }, { "epoch": 0.7827428130757037, "grad_norm": 4.3195600509643555, "learning_rate": 9.407393620930618e-05, "loss": 2.5690332412719727, "memory(GiB)": 66.02, "step": 18270, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.449675 }, { "epoch": 0.7829570284049526, "grad_norm": 4.01470947265625, "learning_rate": 9.407075785151094e-05, "loss": 2.6800205230712892, "memory(GiB)": 66.02, "step": 18275, "token_acc": 0.41935483870967744, "train_speed(iter/s)": 1.449692 }, { "epoch": 0.7831712437342017, "grad_norm": 4.600347995758057, "learning_rate": 9.406757869533259e-05, "loss": 2.6699668884277346, "memory(GiB)": 66.02, "step": 18280, "token_acc": 0.4512987012987013, "train_speed(iter/s)": 1.449634 }, { "epoch": 0.7833854590634506, "grad_norm": 4.298788070678711, "learning_rate": 9.406439874082871e-05, "loss": 2.2426826477050783, "memory(GiB)": 66.02, "step": 18285, "token_acc": 0.5117056856187291, "train_speed(iter/s)": 1.449694 }, { "epoch": 0.7835996743926995, "grad_norm": 3.2672784328460693, "learning_rate": 9.406121798805692e-05, "loss": 2.3077823638916017, "memory(GiB)": 66.02, "step": 18290, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.449681 }, { "epoch": 0.7838138897219485, "grad_norm": 4.382030487060547, "learning_rate": 9.405803643707482e-05, "loss": 2.4252155303955076, "memory(GiB)": 66.02, "step": 18295, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.449665 }, { "epoch": 0.7840281050511975, "grad_norm": 4.850598335266113, "learning_rate": 9.405485408794005e-05, "loss": 2.634998321533203, "memory(GiB)": 66.02, "step": 18300, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.449711 }, { "epoch": 0.7842423203804464, "grad_norm": 4.17399263381958, "learning_rate": 9.40516709407103e-05, "loss": 2.8445547103881834, "memory(GiB)": 66.02, "step": 18305, "token_acc": 0.3961661341853035, "train_speed(iter/s)": 1.449564 }, { "epoch": 0.7844565357096954, "grad_norm": 4.272390842437744, "learning_rate": 9.40484869954432e-05, "loss": 2.8256629943847655, "memory(GiB)": 66.02, "step": 18310, "token_acc": 0.4384858044164038, "train_speed(iter/s)": 1.449582 }, { "epoch": 0.7846707510389443, "grad_norm": 4.42106819152832, "learning_rate": 9.404530225219643e-05, "loss": 2.5621002197265623, "memory(GiB)": 66.02, "step": 18315, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.449464 }, { "epoch": 0.7848849663681933, "grad_norm": 6.792846202850342, "learning_rate": 9.404211671102769e-05, "loss": 2.8990150451660157, "memory(GiB)": 66.02, "step": 18320, "token_acc": 0.4152542372881356, "train_speed(iter/s)": 1.449451 }, { "epoch": 0.7850991816974423, "grad_norm": 4.029753684997559, "learning_rate": 9.403893037199469e-05, "loss": 2.663385009765625, "memory(GiB)": 66.02, "step": 18325, "token_acc": 0.4228571428571429, "train_speed(iter/s)": 1.449505 }, { "epoch": 0.7853133970266912, "grad_norm": 3.72320556640625, "learning_rate": 9.403574323515516e-05, "loss": 2.673150634765625, "memory(GiB)": 66.02, "step": 18330, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.449571 }, { "epoch": 0.7855276123559402, "grad_norm": 4.929439544677734, "learning_rate": 9.403255530056682e-05, "loss": 2.5984115600585938, "memory(GiB)": 66.02, "step": 18335, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.449636 }, { "epoch": 0.7857418276851892, "grad_norm": 4.097453594207764, "learning_rate": 9.402936656828745e-05, "loss": 2.5976675033569334, "memory(GiB)": 66.02, "step": 18340, "token_acc": 0.46557377049180326, "train_speed(iter/s)": 1.44965 }, { "epoch": 0.7859560430144381, "grad_norm": 3.460083484649658, "learning_rate": 9.40261770383748e-05, "loss": 2.2677680969238283, "memory(GiB)": 66.02, "step": 18345, "token_acc": 0.5, "train_speed(iter/s)": 1.449617 }, { "epoch": 0.786170258343687, "grad_norm": 5.9269232749938965, "learning_rate": 9.402298671088665e-05, "loss": 2.5967548370361326, "memory(GiB)": 66.02, "step": 18350, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.449481 }, { "epoch": 0.7863844736729361, "grad_norm": 3.7726407051086426, "learning_rate": 9.401979558588081e-05, "loss": 2.911211395263672, "memory(GiB)": 66.02, "step": 18355, "token_acc": 0.4094292803970223, "train_speed(iter/s)": 1.449473 }, { "epoch": 0.786598689002185, "grad_norm": 3.9955241680145264, "learning_rate": 9.401660366341506e-05, "loss": 2.807758331298828, "memory(GiB)": 66.02, "step": 18360, "token_acc": 0.43820224719101125, "train_speed(iter/s)": 1.44954 }, { "epoch": 0.7868129043314339, "grad_norm": 7.486758708953857, "learning_rate": 9.401341094354725e-05, "loss": 2.7489816665649416, "memory(GiB)": 66.02, "step": 18365, "token_acc": 0.437125748502994, "train_speed(iter/s)": 1.449511 }, { "epoch": 0.787027119660683, "grad_norm": 6.179536819458008, "learning_rate": 9.401021742633523e-05, "loss": 2.3739242553710938, "memory(GiB)": 66.02, "step": 18370, "token_acc": 0.5255972696245734, "train_speed(iter/s)": 1.449468 }, { "epoch": 0.7872413349899319, "grad_norm": 3.909493923187256, "learning_rate": 9.400702311183681e-05, "loss": 2.706329345703125, "memory(GiB)": 66.02, "step": 18375, "token_acc": 0.4309210526315789, "train_speed(iter/s)": 1.449486 }, { "epoch": 0.7874555503191808, "grad_norm": 3.4717910289764404, "learning_rate": 9.40038280001099e-05, "loss": 2.4165367126464843, "memory(GiB)": 66.02, "step": 18380, "token_acc": 0.4789915966386555, "train_speed(iter/s)": 1.449526 }, { "epoch": 0.7876697656484298, "grad_norm": 5.798118591308594, "learning_rate": 9.400063209121235e-05, "loss": 2.2983631134033202, "memory(GiB)": 66.02, "step": 18385, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.449484 }, { "epoch": 0.7878839809776788, "grad_norm": 5.7551116943359375, "learning_rate": 9.39974353852021e-05, "loss": 2.595143127441406, "memory(GiB)": 66.02, "step": 18390, "token_acc": 0.4291497975708502, "train_speed(iter/s)": 1.449401 }, { "epoch": 0.7880981963069277, "grad_norm": 3.806825637817383, "learning_rate": 9.399423788213701e-05, "loss": 2.587324333190918, "memory(GiB)": 66.02, "step": 18395, "token_acc": 0.5034965034965035, "train_speed(iter/s)": 1.449503 }, { "epoch": 0.7883124116361767, "grad_norm": 4.1000213623046875, "learning_rate": 9.399103958207505e-05, "loss": 2.3451332092285155, "memory(GiB)": 66.02, "step": 18400, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.449541 }, { "epoch": 0.7885266269654256, "grad_norm": 4.8470940589904785, "learning_rate": 9.398784048507414e-05, "loss": 2.7898975372314454, "memory(GiB)": 66.02, "step": 18405, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.449603 }, { "epoch": 0.7887408422946746, "grad_norm": 3.740290880203247, "learning_rate": 9.398464059119222e-05, "loss": 2.669422912597656, "memory(GiB)": 66.02, "step": 18410, "token_acc": 0.4125, "train_speed(iter/s)": 1.449494 }, { "epoch": 0.7889550576239236, "grad_norm": 3.710299491882324, "learning_rate": 9.398143990048727e-05, "loss": 2.6664682388305665, "memory(GiB)": 66.02, "step": 18415, "token_acc": 0.44244604316546765, "train_speed(iter/s)": 1.4495 }, { "epoch": 0.7891692729531725, "grad_norm": 3.7856040000915527, "learning_rate": 9.397823841301729e-05, "loss": 2.745924949645996, "memory(GiB)": 66.02, "step": 18420, "token_acc": 0.4382716049382716, "train_speed(iter/s)": 1.449529 }, { "epoch": 0.7893834882824214, "grad_norm": 4.311188697814941, "learning_rate": 9.397503612884028e-05, "loss": 2.4199142456054688, "memory(GiB)": 66.02, "step": 18425, "token_acc": 0.4608150470219436, "train_speed(iter/s)": 1.449544 }, { "epoch": 0.7895977036116705, "grad_norm": 4.5722737312316895, "learning_rate": 9.397183304801422e-05, "loss": 2.563701629638672, "memory(GiB)": 66.02, "step": 18430, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.449632 }, { "epoch": 0.7898119189409194, "grad_norm": 3.772848606109619, "learning_rate": 9.396862917059716e-05, "loss": 2.603957939147949, "memory(GiB)": 66.02, "step": 18435, "token_acc": 0.44025157232704404, "train_speed(iter/s)": 1.44969 }, { "epoch": 0.7900261342701683, "grad_norm": 5.602076530456543, "learning_rate": 9.396542449664714e-05, "loss": 2.7424373626708984, "memory(GiB)": 66.02, "step": 18440, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.449766 }, { "epoch": 0.7902403495994174, "grad_norm": 4.258472442626953, "learning_rate": 9.396221902622221e-05, "loss": 2.6096662521362304, "memory(GiB)": 66.02, "step": 18445, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.449837 }, { "epoch": 0.7904545649286663, "grad_norm": 4.308422565460205, "learning_rate": 9.395901275938047e-05, "loss": 2.534760093688965, "memory(GiB)": 66.02, "step": 18450, "token_acc": 0.45652173913043476, "train_speed(iter/s)": 1.449884 }, { "epoch": 0.7906687802579152, "grad_norm": 4.1195220947265625, "learning_rate": 9.395580569617993e-05, "loss": 2.20571346282959, "memory(GiB)": 66.02, "step": 18455, "token_acc": 0.5427509293680297, "train_speed(iter/s)": 1.449898 }, { "epoch": 0.7908829955871642, "grad_norm": 4.290310382843018, "learning_rate": 9.395259783667876e-05, "loss": 2.574893760681152, "memory(GiB)": 66.02, "step": 18460, "token_acc": 0.4483985765124555, "train_speed(iter/s)": 1.449932 }, { "epoch": 0.7910972109164132, "grad_norm": 8.034736633300781, "learning_rate": 9.394938918093506e-05, "loss": 2.740865135192871, "memory(GiB)": 66.02, "step": 18465, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.450036 }, { "epoch": 0.7913114262456621, "grad_norm": 6.057497501373291, "learning_rate": 9.394617972900692e-05, "loss": 2.6498220443725584, "memory(GiB)": 66.02, "step": 18470, "token_acc": 0.4315068493150685, "train_speed(iter/s)": 1.450064 }, { "epoch": 0.7915256415749111, "grad_norm": 4.421770095825195, "learning_rate": 9.394296948095251e-05, "loss": 2.7161481857299803, "memory(GiB)": 66.02, "step": 18475, "token_acc": 0.5045871559633027, "train_speed(iter/s)": 1.450103 }, { "epoch": 0.79173985690416, "grad_norm": 4.770242214202881, "learning_rate": 9.393975843683e-05, "loss": 2.5783054351806642, "memory(GiB)": 66.02, "step": 18480, "token_acc": 0.4383954154727794, "train_speed(iter/s)": 1.450216 }, { "epoch": 0.791954072233409, "grad_norm": 5.175363540649414, "learning_rate": 9.393654659669755e-05, "loss": 2.3660795211791994, "memory(GiB)": 66.02, "step": 18485, "token_acc": 0.5112540192926045, "train_speed(iter/s)": 1.450213 }, { "epoch": 0.792168287562658, "grad_norm": 4.325352191925049, "learning_rate": 9.393333396061335e-05, "loss": 2.932937240600586, "memory(GiB)": 66.02, "step": 18490, "token_acc": 0.3987915407854985, "train_speed(iter/s)": 1.450181 }, { "epoch": 0.7923825028919069, "grad_norm": 3.7582504749298096, "learning_rate": 9.393012052863556e-05, "loss": 2.5134958267211913, "memory(GiB)": 66.02, "step": 18495, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.450093 }, { "epoch": 0.792596718221156, "grad_norm": 3.7133169174194336, "learning_rate": 9.392690630082246e-05, "loss": 2.4369785308837892, "memory(GiB)": 66.02, "step": 18500, "token_acc": 0.49262536873156343, "train_speed(iter/s)": 1.450082 }, { "epoch": 0.792596718221156, "eval_loss": 2.226203203201294, "eval_runtime": 15.2544, "eval_samples_per_second": 6.555, "eval_steps_per_second": 6.555, "eval_token_acc": 0.46776406035665297, "step": 18500 }, { "epoch": 0.7928109335504049, "grad_norm": 4.184749126434326, "learning_rate": 9.392369127723224e-05, "loss": 2.520800971984863, "memory(GiB)": 66.02, "step": 18505, "token_acc": 0.46406570841889117, "train_speed(iter/s)": 1.448273 }, { "epoch": 0.7930251488796538, "grad_norm": 4.034061908721924, "learning_rate": 9.392047545792312e-05, "loss": 2.63183479309082, "memory(GiB)": 66.02, "step": 18510, "token_acc": 0.45180722891566266, "train_speed(iter/s)": 1.448247 }, { "epoch": 0.7932393642089028, "grad_norm": 4.331101417541504, "learning_rate": 9.391725884295342e-05, "loss": 2.3379444122314452, "memory(GiB)": 66.02, "step": 18515, "token_acc": 0.48299319727891155, "train_speed(iter/s)": 1.448287 }, { "epoch": 0.7934535795381518, "grad_norm": 3.51999568939209, "learning_rate": 9.391404143238137e-05, "loss": 2.6013370513916017, "memory(GiB)": 66.02, "step": 18520, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.448288 }, { "epoch": 0.7936677948674007, "grad_norm": 4.5327534675598145, "learning_rate": 9.391082322626526e-05, "loss": 2.2977149963378904, "memory(GiB)": 66.02, "step": 18525, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.448288 }, { "epoch": 0.7938820101966497, "grad_norm": 3.8526196479797363, "learning_rate": 9.39076042246634e-05, "loss": 2.3243608474731445, "memory(GiB)": 66.02, "step": 18530, "token_acc": 0.5375939849624061, "train_speed(iter/s)": 1.448339 }, { "epoch": 0.7940962255258986, "grad_norm": 4.235838890075684, "learning_rate": 9.39043844276341e-05, "loss": 2.608317184448242, "memory(GiB)": 66.02, "step": 18535, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.448436 }, { "epoch": 0.7943104408551476, "grad_norm": 5.187750339508057, "learning_rate": 9.390116383523568e-05, "loss": 2.5674930572509767, "memory(GiB)": 66.02, "step": 18540, "token_acc": 0.45112781954887216, "train_speed(iter/s)": 1.448511 }, { "epoch": 0.7945246561843966, "grad_norm": 4.310109615325928, "learning_rate": 9.38979424475265e-05, "loss": 2.553070831298828, "memory(GiB)": 66.02, "step": 18545, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.448561 }, { "epoch": 0.7947388715136455, "grad_norm": 4.1276469230651855, "learning_rate": 9.389472026456492e-05, "loss": 2.384823799133301, "memory(GiB)": 66.02, "step": 18550, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.448522 }, { "epoch": 0.7949530868428945, "grad_norm": 4.401536464691162, "learning_rate": 9.38914972864093e-05, "loss": 2.794795799255371, "memory(GiB)": 66.02, "step": 18555, "token_acc": 0.41690962099125367, "train_speed(iter/s)": 1.448558 }, { "epoch": 0.7951673021721435, "grad_norm": 5.56785774230957, "learning_rate": 9.388827351311804e-05, "loss": 2.7745285034179688, "memory(GiB)": 66.02, "step": 18560, "token_acc": 0.44744744744744747, "train_speed(iter/s)": 1.44854 }, { "epoch": 0.7953815175013924, "grad_norm": 5.077114105224609, "learning_rate": 9.388504894474953e-05, "loss": 2.5758234024047852, "memory(GiB)": 66.02, "step": 18565, "token_acc": 0.5, "train_speed(iter/s)": 1.448563 }, { "epoch": 0.7955957328306413, "grad_norm": 3.3762216567993164, "learning_rate": 9.38818235813622e-05, "loss": 2.6224308013916016, "memory(GiB)": 66.02, "step": 18570, "token_acc": 0.47352941176470587, "train_speed(iter/s)": 1.448522 }, { "epoch": 0.7958099481598904, "grad_norm": 3.718240261077881, "learning_rate": 9.387859742301445e-05, "loss": 2.5745002746582033, "memory(GiB)": 66.02, "step": 18575, "token_acc": 0.484, "train_speed(iter/s)": 1.44847 }, { "epoch": 0.7960241634891393, "grad_norm": 4.881780624389648, "learning_rate": 9.387537046976476e-05, "loss": 2.486758804321289, "memory(GiB)": 66.02, "step": 18580, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.448512 }, { "epoch": 0.7962383788183882, "grad_norm": 4.2203593254089355, "learning_rate": 9.387214272167157e-05, "loss": 2.6436946868896483, "memory(GiB)": 66.02, "step": 18585, "token_acc": 0.4797507788161994, "train_speed(iter/s)": 1.448512 }, { "epoch": 0.7964525941476372, "grad_norm": 4.751131057739258, "learning_rate": 9.386891417879335e-05, "loss": 2.6586116790771483, "memory(GiB)": 66.02, "step": 18590, "token_acc": 0.45786516853932585, "train_speed(iter/s)": 1.448524 }, { "epoch": 0.7966668094768862, "grad_norm": 4.591014862060547, "learning_rate": 9.386568484118862e-05, "loss": 2.6276678085327148, "memory(GiB)": 66.02, "step": 18595, "token_acc": 0.4393939393939394, "train_speed(iter/s)": 1.448492 }, { "epoch": 0.7968810248061351, "grad_norm": 3.6758246421813965, "learning_rate": 9.386245470891584e-05, "loss": 2.7444358825683595, "memory(GiB)": 66.02, "step": 18600, "token_acc": 0.44376899696048633, "train_speed(iter/s)": 1.448533 }, { "epoch": 0.7970952401353841, "grad_norm": 4.360864639282227, "learning_rate": 9.385922378203356e-05, "loss": 2.509359359741211, "memory(GiB)": 66.02, "step": 18605, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.448577 }, { "epoch": 0.797309455464633, "grad_norm": 4.772853374481201, "learning_rate": 9.38559920606003e-05, "loss": 2.564883232116699, "memory(GiB)": 66.02, "step": 18610, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.448586 }, { "epoch": 0.797523670793882, "grad_norm": 4.266146183013916, "learning_rate": 9.38527595446746e-05, "loss": 2.61375732421875, "memory(GiB)": 66.02, "step": 18615, "token_acc": 0.47774480712166173, "train_speed(iter/s)": 1.448602 }, { "epoch": 0.797737886123131, "grad_norm": 6.605282306671143, "learning_rate": 9.384952623431502e-05, "loss": 2.5659912109375, "memory(GiB)": 66.02, "step": 18620, "token_acc": 0.44594594594594594, "train_speed(iter/s)": 1.448542 }, { "epoch": 0.7979521014523799, "grad_norm": 3.884753704071045, "learning_rate": 9.384629212958014e-05, "loss": 2.5951923370361327, "memory(GiB)": 66.02, "step": 18625, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.448572 }, { "epoch": 0.7981663167816289, "grad_norm": 4.864241123199463, "learning_rate": 9.384305723052855e-05, "loss": 2.782741165161133, "memory(GiB)": 66.02, "step": 18630, "token_acc": 0.43548387096774194, "train_speed(iter/s)": 1.448536 }, { "epoch": 0.7983805321108779, "grad_norm": 4.610132694244385, "learning_rate": 9.383982153721884e-05, "loss": 2.7461664199829103, "memory(GiB)": 66.02, "step": 18635, "token_acc": 0.46774193548387094, "train_speed(iter/s)": 1.448552 }, { "epoch": 0.7985947474401268, "grad_norm": 4.440431594848633, "learning_rate": 9.383658504970965e-05, "loss": 2.855160140991211, "memory(GiB)": 66.02, "step": 18640, "token_acc": 0.44866920152091255, "train_speed(iter/s)": 1.448559 }, { "epoch": 0.7988089627693757, "grad_norm": 4.1292405128479, "learning_rate": 9.383334776805958e-05, "loss": 2.6430744171142577, "memory(GiB)": 66.02, "step": 18645, "token_acc": 0.4117647058823529, "train_speed(iter/s)": 1.448451 }, { "epoch": 0.7990231780986248, "grad_norm": 4.249571323394775, "learning_rate": 9.383010969232731e-05, "loss": 2.4393062591552734, "memory(GiB)": 66.02, "step": 18650, "token_acc": 0.4690909090909091, "train_speed(iter/s)": 1.448438 }, { "epoch": 0.7992373934278737, "grad_norm": 5.063155174255371, "learning_rate": 9.382687082257149e-05, "loss": 2.7747039794921875, "memory(GiB)": 66.02, "step": 18655, "token_acc": 0.4107142857142857, "train_speed(iter/s)": 1.448508 }, { "epoch": 0.7994516087571226, "grad_norm": 4.814256191253662, "learning_rate": 9.38236311588508e-05, "loss": 2.858232116699219, "memory(GiB)": 66.02, "step": 18660, "token_acc": 0.44405594405594406, "train_speed(iter/s)": 1.448423 }, { "epoch": 0.7996658240863717, "grad_norm": 3.941995620727539, "learning_rate": 9.38203907012239e-05, "loss": 2.9491191864013673, "memory(GiB)": 66.02, "step": 18665, "token_acc": 0.3942652329749104, "train_speed(iter/s)": 1.448526 }, { "epoch": 0.7998800394156206, "grad_norm": 4.696353912353516, "learning_rate": 9.381714944974953e-05, "loss": 2.374323272705078, "memory(GiB)": 66.02, "step": 18670, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.448551 }, { "epoch": 0.8000942547448695, "grad_norm": 4.271224021911621, "learning_rate": 9.38139074044864e-05, "loss": 2.416745185852051, "memory(GiB)": 66.02, "step": 18675, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.448411 }, { "epoch": 0.8003084700741185, "grad_norm": 5.279973030090332, "learning_rate": 9.381066456549321e-05, "loss": 2.6077295303344727, "memory(GiB)": 66.02, "step": 18680, "token_acc": 0.4681647940074906, "train_speed(iter/s)": 1.448442 }, { "epoch": 0.8005226854033675, "grad_norm": 4.325677871704102, "learning_rate": 9.380742093282876e-05, "loss": 2.5408451080322267, "memory(GiB)": 66.02, "step": 18685, "token_acc": 0.45229681978798586, "train_speed(iter/s)": 1.448442 }, { "epoch": 0.8007369007326164, "grad_norm": 4.190362453460693, "learning_rate": 9.380417650655178e-05, "loss": 2.664555549621582, "memory(GiB)": 66.02, "step": 18690, "token_acc": 0.44366197183098594, "train_speed(iter/s)": 1.448489 }, { "epoch": 0.8009511160618654, "grad_norm": 3.899315357208252, "learning_rate": 9.380093128672104e-05, "loss": 2.51683349609375, "memory(GiB)": 66.02, "step": 18695, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.448476 }, { "epoch": 0.8011653313911143, "grad_norm": 4.963001251220703, "learning_rate": 9.379768527339534e-05, "loss": 2.554346466064453, "memory(GiB)": 66.02, "step": 18700, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.448524 }, { "epoch": 0.8013795467203633, "grad_norm": 4.530016899108887, "learning_rate": 9.379443846663349e-05, "loss": 2.560807991027832, "memory(GiB)": 66.02, "step": 18705, "token_acc": 0.45229681978798586, "train_speed(iter/s)": 1.448411 }, { "epoch": 0.8015937620496123, "grad_norm": 4.441322326660156, "learning_rate": 9.379119086649432e-05, "loss": 2.585462188720703, "memory(GiB)": 66.02, "step": 18710, "token_acc": 0.449685534591195, "train_speed(iter/s)": 1.448434 }, { "epoch": 0.8018079773788612, "grad_norm": 3.917233467102051, "learning_rate": 9.378794247303664e-05, "loss": 2.6268999099731447, "memory(GiB)": 66.02, "step": 18715, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 1.448454 }, { "epoch": 0.8020221927081101, "grad_norm": 4.160663604736328, "learning_rate": 9.378469328631929e-05, "loss": 2.4982261657714844, "memory(GiB)": 66.02, "step": 18720, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.448509 }, { "epoch": 0.8022364080373592, "grad_norm": 4.181886672973633, "learning_rate": 9.378144330640118e-05, "loss": 2.4761966705322265, "memory(GiB)": 66.02, "step": 18725, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.448536 }, { "epoch": 0.8024506233666081, "grad_norm": 6.050583839416504, "learning_rate": 9.377819253334113e-05, "loss": 2.1962636947631835, "memory(GiB)": 66.02, "step": 18730, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.448589 }, { "epoch": 0.802664838695857, "grad_norm": 4.506659984588623, "learning_rate": 9.377494096719805e-05, "loss": 2.6051998138427734, "memory(GiB)": 66.02, "step": 18735, "token_acc": 0.4375, "train_speed(iter/s)": 1.448622 }, { "epoch": 0.8028790540251061, "grad_norm": 3.6847870349884033, "learning_rate": 9.377168860803085e-05, "loss": 2.5616291046142576, "memory(GiB)": 66.02, "step": 18740, "token_acc": 0.44876325088339225, "train_speed(iter/s)": 1.448687 }, { "epoch": 0.803093269354355, "grad_norm": 4.197353839874268, "learning_rate": 9.376843545589846e-05, "loss": 3.0736923217773438, "memory(GiB)": 66.02, "step": 18745, "token_acc": 0.3853820598006645, "train_speed(iter/s)": 1.448734 }, { "epoch": 0.8033074846836039, "grad_norm": 4.323554992675781, "learning_rate": 9.37651815108598e-05, "loss": 2.567486381530762, "memory(GiB)": 66.02, "step": 18750, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.448703 }, { "epoch": 0.803521700012853, "grad_norm": 9.776946067810059, "learning_rate": 9.376192677297383e-05, "loss": 2.5702493667602537, "memory(GiB)": 66.02, "step": 18755, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.448759 }, { "epoch": 0.8037359153421019, "grad_norm": 7.530881881713867, "learning_rate": 9.37586712422995e-05, "loss": 2.0877525329589846, "memory(GiB)": 66.02, "step": 18760, "token_acc": 0.5382059800664452, "train_speed(iter/s)": 1.448722 }, { "epoch": 0.8039501306713508, "grad_norm": 4.051812171936035, "learning_rate": 9.375541491889579e-05, "loss": 2.8175514221191404, "memory(GiB)": 66.02, "step": 18765, "token_acc": 0.41605839416058393, "train_speed(iter/s)": 1.448692 }, { "epoch": 0.8041643460005998, "grad_norm": 6.542845726013184, "learning_rate": 9.375215780282169e-05, "loss": 2.9809213638305665, "memory(GiB)": 66.02, "step": 18770, "token_acc": 0.41901408450704225, "train_speed(iter/s)": 1.448647 }, { "epoch": 0.8043785613298488, "grad_norm": 4.400534629821777, "learning_rate": 9.374889989413622e-05, "loss": 2.3808618545532227, "memory(GiB)": 66.02, "step": 18775, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.448664 }, { "epoch": 0.8045927766590977, "grad_norm": 4.855630397796631, "learning_rate": 9.374564119289837e-05, "loss": 2.5283443450927736, "memory(GiB)": 66.02, "step": 18780, "token_acc": 0.5, "train_speed(iter/s)": 1.448757 }, { "epoch": 0.8048069919883467, "grad_norm": 4.238831043243408, "learning_rate": 9.37423816991672e-05, "loss": 2.5206716537475584, "memory(GiB)": 66.02, "step": 18785, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.448736 }, { "epoch": 0.8050212073175956, "grad_norm": 5.935600757598877, "learning_rate": 9.373912141300177e-05, "loss": 2.6006214141845705, "memory(GiB)": 66.02, "step": 18790, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.448722 }, { "epoch": 0.8052354226468446, "grad_norm": 6.571051597595215, "learning_rate": 9.37358603344611e-05, "loss": 2.515240478515625, "memory(GiB)": 66.02, "step": 18795, "token_acc": 0.4758364312267658, "train_speed(iter/s)": 1.448696 }, { "epoch": 0.8054496379760936, "grad_norm": 4.672883987426758, "learning_rate": 9.37325984636043e-05, "loss": 2.981559181213379, "memory(GiB)": 66.02, "step": 18800, "token_acc": 0.4307116104868914, "train_speed(iter/s)": 1.448665 }, { "epoch": 0.8056638533053425, "grad_norm": 5.836612701416016, "learning_rate": 9.372933580049047e-05, "loss": 2.3291561126708986, "memory(GiB)": 66.02, "step": 18805, "token_acc": 0.5, "train_speed(iter/s)": 1.448689 }, { "epoch": 0.8058780686345914, "grad_norm": 4.198070526123047, "learning_rate": 9.372607234517868e-05, "loss": 2.516164016723633, "memory(GiB)": 66.02, "step": 18810, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.448664 }, { "epoch": 0.8060922839638405, "grad_norm": 5.6745452880859375, "learning_rate": 9.372280809772808e-05, "loss": 2.4835153579711915, "memory(GiB)": 66.02, "step": 18815, "token_acc": 0.5059288537549407, "train_speed(iter/s)": 1.448702 }, { "epoch": 0.8063064992930894, "grad_norm": 3.5774919986724854, "learning_rate": 9.371954305819779e-05, "loss": 2.755037307739258, "memory(GiB)": 66.02, "step": 18820, "token_acc": 0.4752186588921283, "train_speed(iter/s)": 1.448731 }, { "epoch": 0.8065207146223383, "grad_norm": 4.555241107940674, "learning_rate": 9.371627722664698e-05, "loss": 2.492181396484375, "memory(GiB)": 66.02, "step": 18825, "token_acc": 0.4402730375426621, "train_speed(iter/s)": 1.448703 }, { "epoch": 0.8067349299515874, "grad_norm": 3.933934450149536, "learning_rate": 9.371301060313477e-05, "loss": 2.0711223602294924, "memory(GiB)": 66.02, "step": 18830, "token_acc": 0.5508771929824562, "train_speed(iter/s)": 1.448724 }, { "epoch": 0.8069491452808363, "grad_norm": 4.408028602600098, "learning_rate": 9.370974318772038e-05, "loss": 2.677082061767578, "memory(GiB)": 66.02, "step": 18835, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.448778 }, { "epoch": 0.8071633606100853, "grad_norm": 12.37784194946289, "learning_rate": 9.370647498046302e-05, "loss": 2.6479148864746094, "memory(GiB)": 66.02, "step": 18840, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.448802 }, { "epoch": 0.8073775759393342, "grad_norm": 4.033270359039307, "learning_rate": 9.370320598142183e-05, "loss": 2.695201301574707, "memory(GiB)": 66.02, "step": 18845, "token_acc": 0.43309859154929575, "train_speed(iter/s)": 1.448749 }, { "epoch": 0.8075917912685832, "grad_norm": 4.585914611816406, "learning_rate": 9.369993619065608e-05, "loss": 2.5486961364746095, "memory(GiB)": 66.02, "step": 18850, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.448808 }, { "epoch": 0.8078060065978322, "grad_norm": 4.526068687438965, "learning_rate": 9.369666560822498e-05, "loss": 2.6846969604492186, "memory(GiB)": 66.02, "step": 18855, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.448773 }, { "epoch": 0.8080202219270811, "grad_norm": 4.248173713684082, "learning_rate": 9.36933942341878e-05, "loss": 2.5881731033325197, "memory(GiB)": 66.02, "step": 18860, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.448796 }, { "epoch": 0.80823443725633, "grad_norm": 5.018126487731934, "learning_rate": 9.369012206860381e-05, "loss": 2.653145599365234, "memory(GiB)": 66.02, "step": 18865, "token_acc": 0.45353159851301117, "train_speed(iter/s)": 1.448831 }, { "epoch": 0.8084486525855791, "grad_norm": 4.510242462158203, "learning_rate": 9.368684911153225e-05, "loss": 2.7652151107788088, "memory(GiB)": 66.02, "step": 18870, "token_acc": 0.4169278996865204, "train_speed(iter/s)": 1.448807 }, { "epoch": 0.808662867914828, "grad_norm": 4.0523762702941895, "learning_rate": 9.368357536303245e-05, "loss": 2.5270395278930664, "memory(GiB)": 66.02, "step": 18875, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.448917 }, { "epoch": 0.8088770832440769, "grad_norm": 4.647110462188721, "learning_rate": 9.36803008231637e-05, "loss": 2.404300308227539, "memory(GiB)": 66.02, "step": 18880, "token_acc": 0.5317725752508361, "train_speed(iter/s)": 1.448993 }, { "epoch": 0.809091298573326, "grad_norm": 4.7010321617126465, "learning_rate": 9.367702549198533e-05, "loss": 2.342572021484375, "memory(GiB)": 66.02, "step": 18885, "token_acc": 0.5100401606425703, "train_speed(iter/s)": 1.449005 }, { "epoch": 0.8093055139025749, "grad_norm": 3.4574878215789795, "learning_rate": 9.367374936955666e-05, "loss": 2.5595396041870115, "memory(GiB)": 66.02, "step": 18890, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.448895 }, { "epoch": 0.8095197292318238, "grad_norm": 5.867266654968262, "learning_rate": 9.367047245593705e-05, "loss": 2.6523746490478515, "memory(GiB)": 66.02, "step": 18895, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.448933 }, { "epoch": 0.8097339445610728, "grad_norm": 3.5428473949432373, "learning_rate": 9.366719475118588e-05, "loss": 2.5579483032226564, "memory(GiB)": 66.02, "step": 18900, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.449043 }, { "epoch": 0.8099481598903218, "grad_norm": 4.248826026916504, "learning_rate": 9.36639162553625e-05, "loss": 2.466800308227539, "memory(GiB)": 66.02, "step": 18905, "token_acc": 0.47575757575757577, "train_speed(iter/s)": 1.449012 }, { "epoch": 0.8101623752195707, "grad_norm": 6.2241997718811035, "learning_rate": 9.366063696852634e-05, "loss": 2.4950037002563477, "memory(GiB)": 66.02, "step": 18910, "token_acc": 0.44565217391304346, "train_speed(iter/s)": 1.448962 }, { "epoch": 0.8103765905488197, "grad_norm": 4.128664016723633, "learning_rate": 9.365735689073676e-05, "loss": 2.2902360916137696, "memory(GiB)": 66.02, "step": 18915, "token_acc": 0.55, "train_speed(iter/s)": 1.449014 }, { "epoch": 0.8105908058780686, "grad_norm": 3.3307580947875977, "learning_rate": 9.365407602205322e-05, "loss": 2.375588035583496, "memory(GiB)": 66.02, "step": 18920, "token_acc": 0.5689655172413793, "train_speed(iter/s)": 1.449038 }, { "epoch": 0.8108050212073176, "grad_norm": 4.561521053314209, "learning_rate": 9.365079436253513e-05, "loss": 2.688313102722168, "memory(GiB)": 66.02, "step": 18925, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.44909 }, { "epoch": 0.8110192365365666, "grad_norm": 3.610499143600464, "learning_rate": 9.364751191224197e-05, "loss": 2.392082405090332, "memory(GiB)": 66.02, "step": 18930, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.449154 }, { "epoch": 0.8112334518658155, "grad_norm": 3.648470878601074, "learning_rate": 9.364422867123317e-05, "loss": 2.3470903396606446, "memory(GiB)": 66.02, "step": 18935, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.449157 }, { "epoch": 0.8114476671950644, "grad_norm": 4.74207067489624, "learning_rate": 9.364094463956823e-05, "loss": 2.464779281616211, "memory(GiB)": 66.02, "step": 18940, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 1.449103 }, { "epoch": 0.8116618825243135, "grad_norm": 5.336584091186523, "learning_rate": 9.363765981730664e-05, "loss": 2.446050262451172, "memory(GiB)": 66.02, "step": 18945, "token_acc": 0.4749034749034749, "train_speed(iter/s)": 1.449055 }, { "epoch": 0.8118760978535624, "grad_norm": 3.5935122966766357, "learning_rate": 9.363437420450791e-05, "loss": 2.469440460205078, "memory(GiB)": 66.02, "step": 18950, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 1.448962 }, { "epoch": 0.8120903131828113, "grad_norm": 4.4591875076293945, "learning_rate": 9.363108780123155e-05, "loss": 2.7129817962646485, "memory(GiB)": 66.02, "step": 18955, "token_acc": 0.48598130841121495, "train_speed(iter/s)": 1.448949 }, { "epoch": 0.8123045285120604, "grad_norm": 6.068639278411865, "learning_rate": 9.362780060753712e-05, "loss": 2.301922416687012, "memory(GiB)": 66.02, "step": 18960, "token_acc": 0.5280898876404494, "train_speed(iter/s)": 1.448954 }, { "epoch": 0.8125187438413093, "grad_norm": 4.075112342834473, "learning_rate": 9.362451262348414e-05, "loss": 2.8235143661499023, "memory(GiB)": 66.02, "step": 18965, "token_acc": 0.4716417910447761, "train_speed(iter/s)": 1.449072 }, { "epoch": 0.8127329591705582, "grad_norm": 6.5669708251953125, "learning_rate": 9.362122384913221e-05, "loss": 2.4278364181518555, "memory(GiB)": 66.02, "step": 18970, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.44893 }, { "epoch": 0.8129471744998072, "grad_norm": 3.743220329284668, "learning_rate": 9.361793428454087e-05, "loss": 2.736924171447754, "memory(GiB)": 66.02, "step": 18975, "token_acc": 0.4437299035369775, "train_speed(iter/s)": 1.449007 }, { "epoch": 0.8131613898290562, "grad_norm": 5.922600746154785, "learning_rate": 9.361464392976975e-05, "loss": 2.7900842666625976, "memory(GiB)": 66.02, "step": 18980, "token_acc": 0.43548387096774194, "train_speed(iter/s)": 1.449005 }, { "epoch": 0.8133756051583051, "grad_norm": 4.516504287719727, "learning_rate": 9.361135278487843e-05, "loss": 2.632344055175781, "memory(GiB)": 66.02, "step": 18985, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.448967 }, { "epoch": 0.8135898204875541, "grad_norm": 3.773824453353882, "learning_rate": 9.360806084992655e-05, "loss": 2.6191478729248048, "memory(GiB)": 66.02, "step": 18990, "token_acc": 0.45925925925925926, "train_speed(iter/s)": 1.448993 }, { "epoch": 0.813804035816803, "grad_norm": 3.8800442218780518, "learning_rate": 9.360476812497374e-05, "loss": 2.7537099838256838, "memory(GiB)": 66.02, "step": 18995, "token_acc": 0.4085714285714286, "train_speed(iter/s)": 1.448961 }, { "epoch": 0.814018251146052, "grad_norm": 4.386427402496338, "learning_rate": 9.360147461007964e-05, "loss": 2.56646728515625, "memory(GiB)": 66.02, "step": 19000, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.448939 }, { "epoch": 0.814018251146052, "eval_loss": 2.147026777267456, "eval_runtime": 14.6797, "eval_samples_per_second": 6.812, "eval_steps_per_second": 6.812, "eval_token_acc": 0.4878048780487805, "step": 19000 }, { "epoch": 0.814232466475301, "grad_norm": 4.334740161895752, "learning_rate": 9.359818030530394e-05, "loss": 2.6520919799804688, "memory(GiB)": 66.02, "step": 19005, "token_acc": 0.4805194805194805, "train_speed(iter/s)": 1.447267 }, { "epoch": 0.8144466818045499, "grad_norm": 4.401294231414795, "learning_rate": 9.359488521070629e-05, "loss": 2.2585668563842773, "memory(GiB)": 66.02, "step": 19010, "token_acc": 0.5155038759689923, "train_speed(iter/s)": 1.447278 }, { "epoch": 0.8146608971337989, "grad_norm": 3.8697566986083984, "learning_rate": 9.359158932634642e-05, "loss": 2.6603321075439452, "memory(GiB)": 66.02, "step": 19015, "token_acc": 0.4557377049180328, "train_speed(iter/s)": 1.447262 }, { "epoch": 0.8148751124630479, "grad_norm": 4.988037109375, "learning_rate": 9.3588292652284e-05, "loss": 2.3017856597900392, "memory(GiB)": 66.02, "step": 19020, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.447174 }, { "epoch": 0.8150893277922968, "grad_norm": 4.837860107421875, "learning_rate": 9.358499518857877e-05, "loss": 2.5823156356811525, "memory(GiB)": 66.02, "step": 19025, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.44721 }, { "epoch": 0.8153035431215457, "grad_norm": 3.803999185562134, "learning_rate": 9.358169693529048e-05, "loss": 2.7338947296142577, "memory(GiB)": 66.02, "step": 19030, "token_acc": 0.45768025078369906, "train_speed(iter/s)": 1.447154 }, { "epoch": 0.8155177584507948, "grad_norm": 3.702883243560791, "learning_rate": 9.357839789247886e-05, "loss": 2.5634740829467773, "memory(GiB)": 66.02, "step": 19035, "token_acc": 0.4536741214057508, "train_speed(iter/s)": 1.447201 }, { "epoch": 0.8157319737800437, "grad_norm": 5.722247123718262, "learning_rate": 9.357509806020369e-05, "loss": 2.6622148513793946, "memory(GiB)": 66.02, "step": 19040, "token_acc": 0.4620938628158845, "train_speed(iter/s)": 1.447189 }, { "epoch": 0.8159461891092926, "grad_norm": 3.2920172214508057, "learning_rate": 9.357179743852471e-05, "loss": 2.6106306076049806, "memory(GiB)": 66.02, "step": 19045, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.447124 }, { "epoch": 0.8161604044385417, "grad_norm": 4.404176712036133, "learning_rate": 9.356849602750177e-05, "loss": 2.526082229614258, "memory(GiB)": 66.02, "step": 19050, "token_acc": 0.4519230769230769, "train_speed(iter/s)": 1.447111 }, { "epoch": 0.8163746197677906, "grad_norm": 6.708415508270264, "learning_rate": 9.356519382719467e-05, "loss": 2.5774837493896485, "memory(GiB)": 66.02, "step": 19055, "token_acc": 0.4470284237726098, "train_speed(iter/s)": 1.447098 }, { "epoch": 0.8165888350970395, "grad_norm": 4.452574729919434, "learning_rate": 9.356189083766318e-05, "loss": 2.672476577758789, "memory(GiB)": 66.02, "step": 19060, "token_acc": 0.4232081911262799, "train_speed(iter/s)": 1.447156 }, { "epoch": 0.8168030504262885, "grad_norm": 7.4739484786987305, "learning_rate": 9.35585870589672e-05, "loss": 2.913471984863281, "memory(GiB)": 66.02, "step": 19065, "token_acc": 0.4573643410852713, "train_speed(iter/s)": 1.447248 }, { "epoch": 0.8170172657555375, "grad_norm": 4.520939350128174, "learning_rate": 9.355528249116653e-05, "loss": 2.40564079284668, "memory(GiB)": 66.02, "step": 19070, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.447187 }, { "epoch": 0.8172314810847864, "grad_norm": 3.5942249298095703, "learning_rate": 9.355197713432109e-05, "loss": 2.7534931182861326, "memory(GiB)": 66.02, "step": 19075, "token_acc": 0.459546925566343, "train_speed(iter/s)": 1.447116 }, { "epoch": 0.8174456964140354, "grad_norm": 3.7783420085906982, "learning_rate": 9.354867098849071e-05, "loss": 2.6439193725585937, "memory(GiB)": 66.02, "step": 19080, "token_acc": 0.474025974025974, "train_speed(iter/s)": 1.446914 }, { "epoch": 0.8176599117432843, "grad_norm": 4.424737453460693, "learning_rate": 9.35453640537353e-05, "loss": 2.5768512725830077, "memory(GiB)": 66.02, "step": 19085, "token_acc": 0.4558303886925795, "train_speed(iter/s)": 1.446766 }, { "epoch": 0.8178741270725333, "grad_norm": 4.611454486846924, "learning_rate": 9.354205633011479e-05, "loss": 2.604601287841797, "memory(GiB)": 66.02, "step": 19090, "token_acc": 0.475, "train_speed(iter/s)": 1.446845 }, { "epoch": 0.8180883424017823, "grad_norm": 3.5911543369293213, "learning_rate": 9.353874781768908e-05, "loss": 2.3541257858276365, "memory(GiB)": 66.02, "step": 19095, "token_acc": 0.5, "train_speed(iter/s)": 1.446916 }, { "epoch": 0.8183025577310312, "grad_norm": 4.482860088348389, "learning_rate": 9.353543851651809e-05, "loss": 2.7378005981445312, "memory(GiB)": 66.02, "step": 19100, "token_acc": 0.4294117647058823, "train_speed(iter/s)": 1.446936 }, { "epoch": 0.8185167730602801, "grad_norm": 5.56183385848999, "learning_rate": 9.353212842666181e-05, "loss": 2.664101791381836, "memory(GiB)": 66.02, "step": 19105, "token_acc": 0.48249027237354086, "train_speed(iter/s)": 1.446943 }, { "epoch": 0.8187309883895292, "grad_norm": 3.9395222663879395, "learning_rate": 9.352881754818019e-05, "loss": 2.949480438232422, "memory(GiB)": 66.02, "step": 19110, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.447034 }, { "epoch": 0.8189452037187781, "grad_norm": 4.044198036193848, "learning_rate": 9.352550588113319e-05, "loss": 2.3677745819091798, "memory(GiB)": 66.02, "step": 19115, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.447083 }, { "epoch": 0.819159419048027, "grad_norm": 4.579442977905273, "learning_rate": 9.352219342558083e-05, "loss": 2.8724830627441404, "memory(GiB)": 66.02, "step": 19120, "token_acc": 0.4146341463414634, "train_speed(iter/s)": 1.447095 }, { "epoch": 0.8193736343772761, "grad_norm": 3.308806896209717, "learning_rate": 9.351888018158312e-05, "loss": 2.7751276016235353, "memory(GiB)": 66.02, "step": 19125, "token_acc": 0.4289940828402367, "train_speed(iter/s)": 1.447154 }, { "epoch": 0.819587849706525, "grad_norm": 5.318056106567383, "learning_rate": 9.351556614920005e-05, "loss": 2.530400276184082, "memory(GiB)": 66.02, "step": 19130, "token_acc": 0.48497854077253216, "train_speed(iter/s)": 1.44714 }, { "epoch": 0.8198020650357739, "grad_norm": 4.2228522300720215, "learning_rate": 9.351225132849169e-05, "loss": 2.9578628540039062, "memory(GiB)": 66.02, "step": 19135, "token_acc": 0.40773809523809523, "train_speed(iter/s)": 1.447156 }, { "epoch": 0.8200162803650229, "grad_norm": 4.344516754150391, "learning_rate": 9.350893571951808e-05, "loss": 2.6064916610717774, "memory(GiB)": 66.02, "step": 19140, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.447135 }, { "epoch": 0.8202304956942719, "grad_norm": 5.083049774169922, "learning_rate": 9.350561932233928e-05, "loss": 2.6279670715332033, "memory(GiB)": 66.02, "step": 19145, "token_acc": 0.5247524752475248, "train_speed(iter/s)": 1.447138 }, { "epoch": 0.8204447110235208, "grad_norm": 7.079374313354492, "learning_rate": 9.350230213701537e-05, "loss": 2.7200139999389648, "memory(GiB)": 66.02, "step": 19150, "token_acc": 0.445578231292517, "train_speed(iter/s)": 1.447093 }, { "epoch": 0.8206589263527698, "grad_norm": 4.158751487731934, "learning_rate": 9.349898416360646e-05, "loss": 2.7096811294555665, "memory(GiB)": 66.02, "step": 19155, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.447078 }, { "epoch": 0.8208731416820187, "grad_norm": 3.67203688621521, "learning_rate": 9.349566540217265e-05, "loss": 2.7657135009765623, "memory(GiB)": 66.02, "step": 19160, "token_acc": 0.4511784511784512, "train_speed(iter/s)": 1.44708 }, { "epoch": 0.8210873570112677, "grad_norm": 5.364586353302002, "learning_rate": 9.349234585277404e-05, "loss": 2.5590272903442384, "memory(GiB)": 66.02, "step": 19165, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.447039 }, { "epoch": 0.8213015723405167, "grad_norm": 4.690365791320801, "learning_rate": 9.348902551547081e-05, "loss": 2.5245126724243163, "memory(GiB)": 66.02, "step": 19170, "token_acc": 0.4868913857677903, "train_speed(iter/s)": 1.447057 }, { "epoch": 0.8215157876697656, "grad_norm": 4.606238842010498, "learning_rate": 9.348570439032306e-05, "loss": 2.4096059799194336, "memory(GiB)": 66.02, "step": 19175, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.447084 }, { "epoch": 0.8217300029990147, "grad_norm": 5.548707008361816, "learning_rate": 9.348238247739101e-05, "loss": 2.740232467651367, "memory(GiB)": 66.02, "step": 19180, "token_acc": 0.46218487394957986, "train_speed(iter/s)": 1.44709 }, { "epoch": 0.8219442183282636, "grad_norm": 3.649252414703369, "learning_rate": 9.34790597767348e-05, "loss": 2.513283538818359, "memory(GiB)": 66.02, "step": 19185, "token_acc": 0.483695652173913, "train_speed(iter/s)": 1.44716 }, { "epoch": 0.8221584336575125, "grad_norm": 4.202706336975098, "learning_rate": 9.347573628841462e-05, "loss": 2.3912729263305663, "memory(GiB)": 66.02, "step": 19190, "token_acc": 0.4595375722543353, "train_speed(iter/s)": 1.447188 }, { "epoch": 0.8223726489867615, "grad_norm": 5.691158771514893, "learning_rate": 9.347241201249071e-05, "loss": 2.98565559387207, "memory(GiB)": 66.02, "step": 19195, "token_acc": 0.432258064516129, "train_speed(iter/s)": 1.447173 }, { "epoch": 0.8225868643160105, "grad_norm": 4.348105430603027, "learning_rate": 9.346908694902327e-05, "loss": 2.437234306335449, "memory(GiB)": 66.02, "step": 19200, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.447066 }, { "epoch": 0.8228010796452594, "grad_norm": 3.9248456954956055, "learning_rate": 9.346576109807255e-05, "loss": 2.6050180435180663, "memory(GiB)": 66.02, "step": 19205, "token_acc": 0.4743202416918429, "train_speed(iter/s)": 1.44715 }, { "epoch": 0.8230152949745084, "grad_norm": 5.578907012939453, "learning_rate": 9.346243445969877e-05, "loss": 2.5511499404907227, "memory(GiB)": 66.02, "step": 19210, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.447174 }, { "epoch": 0.8232295103037574, "grad_norm": 4.100949287414551, "learning_rate": 9.345910703396225e-05, "loss": 2.5610687255859377, "memory(GiB)": 66.02, "step": 19215, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.447117 }, { "epoch": 0.8234437256330063, "grad_norm": 4.5237531661987305, "learning_rate": 9.345577882092322e-05, "loss": 2.7338815689086915, "memory(GiB)": 66.02, "step": 19220, "token_acc": 0.43042071197411, "train_speed(iter/s)": 1.447062 }, { "epoch": 0.8236579409622553, "grad_norm": 4.642475605010986, "learning_rate": 9.345244982064201e-05, "loss": 2.573164939880371, "memory(GiB)": 66.02, "step": 19225, "token_acc": 0.4899598393574297, "train_speed(iter/s)": 1.447108 }, { "epoch": 0.8238721562915042, "grad_norm": 5.348715782165527, "learning_rate": 9.344912003317888e-05, "loss": 2.4919498443603514, "memory(GiB)": 66.02, "step": 19230, "token_acc": 0.483739837398374, "train_speed(iter/s)": 1.447074 }, { "epoch": 0.8240863716207532, "grad_norm": 3.156010150909424, "learning_rate": 9.344578945859421e-05, "loss": 2.637017822265625, "memory(GiB)": 66.02, "step": 19235, "token_acc": 0.4520123839009288, "train_speed(iter/s)": 1.447122 }, { "epoch": 0.8243005869500022, "grad_norm": 3.584594488143921, "learning_rate": 9.344245809694829e-05, "loss": 2.5854602813720704, "memory(GiB)": 66.02, "step": 19240, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.446998 }, { "epoch": 0.8245148022792511, "grad_norm": 5.1024322509765625, "learning_rate": 9.34391259483015e-05, "loss": 2.683881378173828, "memory(GiB)": 66.02, "step": 19245, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.44699 }, { "epoch": 0.8247290176085, "grad_norm": 4.292627811431885, "learning_rate": 9.343579301271419e-05, "loss": 2.5055210113525392, "memory(GiB)": 66.02, "step": 19250, "token_acc": 0.47941176470588237, "train_speed(iter/s)": 1.447044 }, { "epoch": 0.8249432329377491, "grad_norm": 6.777864933013916, "learning_rate": 9.343245929024674e-05, "loss": 2.869306755065918, "memory(GiB)": 66.02, "step": 19255, "token_acc": 0.4592833876221498, "train_speed(iter/s)": 1.447 }, { "epoch": 0.825157448266998, "grad_norm": 6.696408271789551, "learning_rate": 9.342912478095955e-05, "loss": 2.294254493713379, "memory(GiB)": 66.02, "step": 19260, "token_acc": 0.5486381322957199, "train_speed(iter/s)": 1.447093 }, { "epoch": 0.8253716635962469, "grad_norm": 4.6820173263549805, "learning_rate": 9.342578948491303e-05, "loss": 2.6426769256591798, "memory(GiB)": 66.02, "step": 19265, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.447136 }, { "epoch": 0.825585878925496, "grad_norm": 4.027639389038086, "learning_rate": 9.342245340216759e-05, "loss": 2.5612937927246096, "memory(GiB)": 66.02, "step": 19270, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.447137 }, { "epoch": 0.8258000942547449, "grad_norm": 3.4892492294311523, "learning_rate": 9.341911653278367e-05, "loss": 2.5044734954833983, "memory(GiB)": 66.02, "step": 19275, "token_acc": 0.5150501672240803, "train_speed(iter/s)": 1.447244 }, { "epoch": 0.8260143095839938, "grad_norm": 4.824902057647705, "learning_rate": 9.341577887682173e-05, "loss": 2.8290802001953126, "memory(GiB)": 66.02, "step": 19280, "token_acc": 0.44807121661721067, "train_speed(iter/s)": 1.447198 }, { "epoch": 0.8262285249132428, "grad_norm": 4.836545467376709, "learning_rate": 9.341244043434224e-05, "loss": 2.495344352722168, "memory(GiB)": 66.02, "step": 19285, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.447192 }, { "epoch": 0.8264427402424918, "grad_norm": 5.338891506195068, "learning_rate": 9.340910120540564e-05, "loss": 2.9964624404907227, "memory(GiB)": 66.02, "step": 19290, "token_acc": 0.4300341296928328, "train_speed(iter/s)": 1.447221 }, { "epoch": 0.8266569555717407, "grad_norm": 4.3757524490356445, "learning_rate": 9.340576119007249e-05, "loss": 2.865975189208984, "memory(GiB)": 66.02, "step": 19295, "token_acc": 0.4702194357366771, "train_speed(iter/s)": 1.447223 }, { "epoch": 0.8268711709009897, "grad_norm": 4.286296844482422, "learning_rate": 9.340242038840322e-05, "loss": 2.495265007019043, "memory(GiB)": 66.02, "step": 19300, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.447216 }, { "epoch": 0.8270853862302386, "grad_norm": 3.7346765995025635, "learning_rate": 9.33990788004584e-05, "loss": 2.980963706970215, "memory(GiB)": 66.02, "step": 19305, "token_acc": 0.4045584045584046, "train_speed(iter/s)": 1.447161 }, { "epoch": 0.8272996015594876, "grad_norm": 4.579444885253906, "learning_rate": 9.339573642629857e-05, "loss": 2.602779579162598, "memory(GiB)": 66.02, "step": 19310, "token_acc": 0.45323741007194246, "train_speed(iter/s)": 1.447242 }, { "epoch": 0.8275138168887366, "grad_norm": 4.895353317260742, "learning_rate": 9.339239326598426e-05, "loss": 2.724811553955078, "memory(GiB)": 66.02, "step": 19315, "token_acc": 0.4398826979472141, "train_speed(iter/s)": 1.44729 }, { "epoch": 0.8277280322179855, "grad_norm": 4.600613594055176, "learning_rate": 9.338904931957603e-05, "loss": 2.6580682754516602, "memory(GiB)": 66.02, "step": 19320, "token_acc": 0.4249201277955272, "train_speed(iter/s)": 1.44728 }, { "epoch": 0.8279422475472344, "grad_norm": 4.4494099617004395, "learning_rate": 9.338570458713447e-05, "loss": 2.9500267028808596, "memory(GiB)": 66.02, "step": 19325, "token_acc": 0.4427710843373494, "train_speed(iter/s)": 1.447383 }, { "epoch": 0.8281564628764835, "grad_norm": 4.115397930145264, "learning_rate": 9.338235906872019e-05, "loss": 2.7892601013183596, "memory(GiB)": 66.02, "step": 19330, "token_acc": 0.4559748427672956, "train_speed(iter/s)": 1.447289 }, { "epoch": 0.8283706782057324, "grad_norm": 5.089390754699707, "learning_rate": 9.337901276439376e-05, "loss": 2.7787311553955076, "memory(GiB)": 66.02, "step": 19335, "token_acc": 0.4550898203592814, "train_speed(iter/s)": 1.447272 }, { "epoch": 0.8285848935349813, "grad_norm": 4.719737529754639, "learning_rate": 9.337566567421583e-05, "loss": 2.5675708770751955, "memory(GiB)": 66.02, "step": 19340, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.4473 }, { "epoch": 0.8287991088642304, "grad_norm": 4.357607841491699, "learning_rate": 9.337231779824703e-05, "loss": 2.4175018310546874, "memory(GiB)": 66.02, "step": 19345, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.447294 }, { "epoch": 0.8290133241934793, "grad_norm": 3.693588972091675, "learning_rate": 9.3368969136548e-05, "loss": 2.6972537994384767, "memory(GiB)": 66.02, "step": 19350, "token_acc": 0.4397590361445783, "train_speed(iter/s)": 1.447209 }, { "epoch": 0.8292275395227282, "grad_norm": 4.481620788574219, "learning_rate": 9.33656196891794e-05, "loss": 2.5331802368164062, "memory(GiB)": 66.02, "step": 19355, "token_acc": 0.43854748603351956, "train_speed(iter/s)": 1.447161 }, { "epoch": 0.8294417548519772, "grad_norm": 3.9050045013427734, "learning_rate": 9.336226945620194e-05, "loss": 2.417461395263672, "memory(GiB)": 66.02, "step": 19360, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.44718 }, { "epoch": 0.8296559701812262, "grad_norm": 4.633638858795166, "learning_rate": 9.335891843767629e-05, "loss": 2.4926803588867186, "memory(GiB)": 66.02, "step": 19365, "token_acc": 0.46774193548387094, "train_speed(iter/s)": 1.447125 }, { "epoch": 0.8298701855104751, "grad_norm": 5.955918312072754, "learning_rate": 9.335556663366314e-05, "loss": 2.575380325317383, "memory(GiB)": 66.02, "step": 19370, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.447101 }, { "epoch": 0.8300844008397241, "grad_norm": 3.949324607849121, "learning_rate": 9.335221404422325e-05, "loss": 2.6844150543212892, "memory(GiB)": 66.02, "step": 19375, "token_acc": 0.4247787610619469, "train_speed(iter/s)": 1.447212 }, { "epoch": 0.830298616168973, "grad_norm": 5.545995712280273, "learning_rate": 9.334886066941733e-05, "loss": 2.6630107879638674, "memory(GiB)": 66.02, "step": 19380, "token_acc": 0.468503937007874, "train_speed(iter/s)": 1.447275 }, { "epoch": 0.830512831498222, "grad_norm": 5.249592304229736, "learning_rate": 9.334550650930613e-05, "loss": 2.828268051147461, "memory(GiB)": 66.02, "step": 19385, "token_acc": 0.4006734006734007, "train_speed(iter/s)": 1.447229 }, { "epoch": 0.830727046827471, "grad_norm": 4.565025329589844, "learning_rate": 9.334215156395042e-05, "loss": 2.4273197174072267, "memory(GiB)": 66.02, "step": 19390, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.447295 }, { "epoch": 0.8309412621567199, "grad_norm": 3.8825125694274902, "learning_rate": 9.333879583341098e-05, "loss": 2.393206024169922, "memory(GiB)": 66.02, "step": 19395, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.447316 }, { "epoch": 0.8311554774859689, "grad_norm": 4.470056533813477, "learning_rate": 9.33354393177486e-05, "loss": 3.019732666015625, "memory(GiB)": 66.02, "step": 19400, "token_acc": 0.42507645259938837, "train_speed(iter/s)": 1.447326 }, { "epoch": 0.8313696928152179, "grad_norm": 5.510180950164795, "learning_rate": 9.333208201702407e-05, "loss": 2.6127593994140623, "memory(GiB)": 66.02, "step": 19405, "token_acc": 0.4339622641509434, "train_speed(iter/s)": 1.447254 }, { "epoch": 0.8315839081444668, "grad_norm": 4.819840908050537, "learning_rate": 9.332872393129823e-05, "loss": 2.9776878356933594, "memory(GiB)": 66.02, "step": 19410, "token_acc": 0.3870967741935484, "train_speed(iter/s)": 1.44722 }, { "epoch": 0.8317981234737157, "grad_norm": 3.3434503078460693, "learning_rate": 9.332536506063192e-05, "loss": 2.2912576675415037, "memory(GiB)": 66.02, "step": 19415, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.447159 }, { "epoch": 0.8320123388029648, "grad_norm": 4.7845540046691895, "learning_rate": 9.332200540508598e-05, "loss": 2.5155660629272463, "memory(GiB)": 66.02, "step": 19420, "token_acc": 0.44274809160305345, "train_speed(iter/s)": 1.447186 }, { "epoch": 0.8322265541322137, "grad_norm": 4.668683052062988, "learning_rate": 9.331864496472125e-05, "loss": 2.446110153198242, "memory(GiB)": 66.02, "step": 19425, "token_acc": 0.4862068965517241, "train_speed(iter/s)": 1.447191 }, { "epoch": 0.8324407694614626, "grad_norm": 4.651973724365234, "learning_rate": 9.331528373959865e-05, "loss": 2.647796630859375, "memory(GiB)": 66.02, "step": 19430, "token_acc": 0.43209876543209874, "train_speed(iter/s)": 1.447187 }, { "epoch": 0.8326549847907116, "grad_norm": 4.906210899353027, "learning_rate": 9.331192172977905e-05, "loss": 2.9209243774414064, "memory(GiB)": 66.02, "step": 19435, "token_acc": 0.4377104377104377, "train_speed(iter/s)": 1.447161 }, { "epoch": 0.8328692001199606, "grad_norm": 4.209266662597656, "learning_rate": 9.330855893532335e-05, "loss": 2.649152374267578, "memory(GiB)": 66.02, "step": 19440, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.447224 }, { "epoch": 0.8330834154492095, "grad_norm": 4.533636569976807, "learning_rate": 9.330519535629249e-05, "loss": 2.589357376098633, "memory(GiB)": 66.02, "step": 19445, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.447291 }, { "epoch": 0.8332976307784585, "grad_norm": 3.3974220752716064, "learning_rate": 9.330183099274739e-05, "loss": 2.4823240280151366, "memory(GiB)": 66.02, "step": 19450, "token_acc": 0.4681647940074906, "train_speed(iter/s)": 1.447261 }, { "epoch": 0.8335118461077075, "grad_norm": 3.6404998302459717, "learning_rate": 9.329846584474899e-05, "loss": 2.297855758666992, "memory(GiB)": 66.02, "step": 19455, "token_acc": 0.5048231511254019, "train_speed(iter/s)": 1.447331 }, { "epoch": 0.8337260614369564, "grad_norm": 6.674795627593994, "learning_rate": 9.329509991235829e-05, "loss": 2.7794775009155273, "memory(GiB)": 66.02, "step": 19460, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.447463 }, { "epoch": 0.8339402767662054, "grad_norm": 4.150593280792236, "learning_rate": 9.329173319563622e-05, "loss": 2.540719985961914, "memory(GiB)": 66.02, "step": 19465, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.447492 }, { "epoch": 0.8341544920954543, "grad_norm": 4.216850757598877, "learning_rate": 9.328836569464379e-05, "loss": 2.5613162994384764, "memory(GiB)": 66.02, "step": 19470, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.447413 }, { "epoch": 0.8343687074247033, "grad_norm": 4.351722717285156, "learning_rate": 9.328499740944201e-05, "loss": 2.4275341033935547, "memory(GiB)": 66.02, "step": 19475, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.447499 }, { "epoch": 0.8345829227539523, "grad_norm": 5.566346645355225, "learning_rate": 9.328162834009192e-05, "loss": 2.4238985061645506, "memory(GiB)": 66.02, "step": 19480, "token_acc": 0.4959349593495935, "train_speed(iter/s)": 1.447538 }, { "epoch": 0.8347971380832012, "grad_norm": 4.994141578674316, "learning_rate": 9.32782584866545e-05, "loss": 2.710413932800293, "memory(GiB)": 66.02, "step": 19485, "token_acc": 0.4261744966442953, "train_speed(iter/s)": 1.447632 }, { "epoch": 0.8350113534124501, "grad_norm": 4.252994537353516, "learning_rate": 9.327488784919084e-05, "loss": 2.8690027236938476, "memory(GiB)": 66.02, "step": 19490, "token_acc": 0.4316109422492401, "train_speed(iter/s)": 1.447531 }, { "epoch": 0.8352255687416992, "grad_norm": 3.779324769973755, "learning_rate": 9.3271516427762e-05, "loss": 2.803751754760742, "memory(GiB)": 66.02, "step": 19495, "token_acc": 0.440625, "train_speed(iter/s)": 1.447599 }, { "epoch": 0.8354397840709481, "grad_norm": 3.6907694339752197, "learning_rate": 9.326814422242905e-05, "loss": 2.467353630065918, "memory(GiB)": 66.02, "step": 19500, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.447611 }, { "epoch": 0.8354397840709481, "eval_loss": 2.274688482284546, "eval_runtime": 14.081, "eval_samples_per_second": 7.102, "eval_steps_per_second": 7.102, "eval_token_acc": 0.47075208913649025, "step": 19500 }, { "epoch": 0.835653999400197, "grad_norm": 4.500802516937256, "learning_rate": 9.326477123325306e-05, "loss": 2.769436073303223, "memory(GiB)": 66.02, "step": 19505, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.446073 }, { "epoch": 0.8358682147294461, "grad_norm": 4.522373199462891, "learning_rate": 9.326139746029516e-05, "loss": 2.3506784439086914, "memory(GiB)": 66.02, "step": 19510, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.446172 }, { "epoch": 0.836082430058695, "grad_norm": 4.026937961578369, "learning_rate": 9.325802290361647e-05, "loss": 2.4716814041137694, "memory(GiB)": 66.02, "step": 19515, "token_acc": 0.439873417721519, "train_speed(iter/s)": 1.446215 }, { "epoch": 0.836296645387944, "grad_norm": 6.328344821929932, "learning_rate": 9.32546475632781e-05, "loss": 2.758492660522461, "memory(GiB)": 66.02, "step": 19520, "token_acc": 0.44106463878326996, "train_speed(iter/s)": 1.446229 }, { "epoch": 0.8365108607171929, "grad_norm": 4.819122791290283, "learning_rate": 9.325127143934122e-05, "loss": 2.472138214111328, "memory(GiB)": 66.02, "step": 19525, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.446279 }, { "epoch": 0.8367250760464419, "grad_norm": 4.229304790496826, "learning_rate": 9.324789453186699e-05, "loss": 2.786085319519043, "memory(GiB)": 66.02, "step": 19530, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.446258 }, { "epoch": 0.8369392913756909, "grad_norm": 3.706948757171631, "learning_rate": 9.32445168409166e-05, "loss": 2.5545644760131836, "memory(GiB)": 66.02, "step": 19535, "token_acc": 0.5035211267605634, "train_speed(iter/s)": 1.446341 }, { "epoch": 0.8371535067049398, "grad_norm": 5.0965986251831055, "learning_rate": 9.324113836655119e-05, "loss": 2.3366153717041014, "memory(GiB)": 66.02, "step": 19540, "token_acc": 0.5020242914979757, "train_speed(iter/s)": 1.446421 }, { "epoch": 0.8373677220341887, "grad_norm": 3.7695391178131104, "learning_rate": 9.323775910883202e-05, "loss": 2.6092723846435546, "memory(GiB)": 66.02, "step": 19545, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.446457 }, { "epoch": 0.8375819373634378, "grad_norm": 3.9775147438049316, "learning_rate": 9.323437906782026e-05, "loss": 2.418645668029785, "memory(GiB)": 66.02, "step": 19550, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.446459 }, { "epoch": 0.8377961526926867, "grad_norm": 6.02524471282959, "learning_rate": 9.323099824357717e-05, "loss": 2.733505630493164, "memory(GiB)": 66.02, "step": 19555, "token_acc": 0.4402730375426621, "train_speed(iter/s)": 1.446475 }, { "epoch": 0.8380103680219356, "grad_norm": 4.156128883361816, "learning_rate": 9.3227616636164e-05, "loss": 2.5118431091308593, "memory(GiB)": 66.02, "step": 19560, "token_acc": 0.5037878787878788, "train_speed(iter/s)": 1.44652 }, { "epoch": 0.8382245833511847, "grad_norm": 4.817605972290039, "learning_rate": 9.322423424564201e-05, "loss": 2.7719282150268554, "memory(GiB)": 66.02, "step": 19565, "token_acc": 0.4382716049382716, "train_speed(iter/s)": 1.446475 }, { "epoch": 0.8384387986804336, "grad_norm": 4.123440265655518, "learning_rate": 9.322085107207245e-05, "loss": 2.5501338958740236, "memory(GiB)": 66.02, "step": 19570, "token_acc": 0.49570200573065903, "train_speed(iter/s)": 1.446514 }, { "epoch": 0.8386530140096825, "grad_norm": 4.149228096008301, "learning_rate": 9.321746711551665e-05, "loss": 2.702737045288086, "memory(GiB)": 66.02, "step": 19575, "token_acc": 0.4609375, "train_speed(iter/s)": 1.4465 }, { "epoch": 0.8388672293389315, "grad_norm": 5.247049808502197, "learning_rate": 9.321408237603589e-05, "loss": 2.679691696166992, "memory(GiB)": 66.02, "step": 19580, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.446498 }, { "epoch": 0.8390814446681805, "grad_norm": 6.044321537017822, "learning_rate": 9.321069685369147e-05, "loss": 2.8071435928344726, "memory(GiB)": 66.02, "step": 19585, "token_acc": 0.46530612244897956, "train_speed(iter/s)": 1.446551 }, { "epoch": 0.8392956599974294, "grad_norm": 3.886892080307007, "learning_rate": 9.320731054854474e-05, "loss": 2.3325725555419923, "memory(GiB)": 66.02, "step": 19590, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.446637 }, { "epoch": 0.8395098753266784, "grad_norm": 3.09775447845459, "learning_rate": 9.320392346065707e-05, "loss": 2.4970985412597657, "memory(GiB)": 66.02, "step": 19595, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.446757 }, { "epoch": 0.8397240906559273, "grad_norm": 4.860101222991943, "learning_rate": 9.320053559008979e-05, "loss": 2.8612951278686523, "memory(GiB)": 66.02, "step": 19600, "token_acc": 0.46394984326018807, "train_speed(iter/s)": 1.44684 }, { "epoch": 0.8399383059851763, "grad_norm": 5.01551628112793, "learning_rate": 9.319714693690429e-05, "loss": 2.6764516830444336, "memory(GiB)": 66.02, "step": 19605, "token_acc": 0.44144144144144143, "train_speed(iter/s)": 1.446925 }, { "epoch": 0.8401525213144253, "grad_norm": 6.008364200592041, "learning_rate": 9.319375750116194e-05, "loss": 2.810929870605469, "memory(GiB)": 66.02, "step": 19610, "token_acc": 0.4401294498381877, "train_speed(iter/s)": 1.446874 }, { "epoch": 0.8403667366436742, "grad_norm": 5.056024551391602, "learning_rate": 9.319036728292415e-05, "loss": 2.5516054153442385, "memory(GiB)": 66.02, "step": 19615, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.446906 }, { "epoch": 0.8405809519729232, "grad_norm": 4.122622966766357, "learning_rate": 9.318697628225235e-05, "loss": 2.4294466018676757, "memory(GiB)": 66.02, "step": 19620, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.446917 }, { "epoch": 0.8407951673021722, "grad_norm": 4.909486293792725, "learning_rate": 9.318358449920795e-05, "loss": 2.573863983154297, "memory(GiB)": 66.02, "step": 19625, "token_acc": 0.46440677966101696, "train_speed(iter/s)": 1.447004 }, { "epoch": 0.8410093826314211, "grad_norm": 3.7781527042388916, "learning_rate": 9.318019193385242e-05, "loss": 2.692704772949219, "memory(GiB)": 66.02, "step": 19630, "token_acc": 0.4472049689440994, "train_speed(iter/s)": 1.447017 }, { "epoch": 0.84122359796067, "grad_norm": 4.423552989959717, "learning_rate": 9.317679858624721e-05, "loss": 2.7754079818725588, "memory(GiB)": 66.02, "step": 19635, "token_acc": 0.4146341463414634, "train_speed(iter/s)": 1.446982 }, { "epoch": 0.8414378132899191, "grad_norm": 6.33167028427124, "learning_rate": 9.317340445645377e-05, "loss": 2.6974552154541014, "memory(GiB)": 66.02, "step": 19640, "token_acc": 0.45918367346938777, "train_speed(iter/s)": 1.447068 }, { "epoch": 0.841652028619168, "grad_norm": 4.291542053222656, "learning_rate": 9.317000954453364e-05, "loss": 2.6369171142578125, "memory(GiB)": 66.02, "step": 19645, "token_acc": 0.4492307692307692, "train_speed(iter/s)": 1.446968 }, { "epoch": 0.8418662439484169, "grad_norm": 4.765768051147461, "learning_rate": 9.316661385054825e-05, "loss": 2.4904666900634767, "memory(GiB)": 66.02, "step": 19650, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.446999 }, { "epoch": 0.842080459277666, "grad_norm": 4.3831467628479, "learning_rate": 9.316321737455919e-05, "loss": 2.6797822952270507, "memory(GiB)": 66.02, "step": 19655, "token_acc": 0.4471299093655589, "train_speed(iter/s)": 1.447075 }, { "epoch": 0.8422946746069149, "grad_norm": 3.520472288131714, "learning_rate": 9.315982011662794e-05, "loss": 2.318328285217285, "memory(GiB)": 66.02, "step": 19660, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.447112 }, { "epoch": 0.8425088899361638, "grad_norm": 3.961155414581299, "learning_rate": 9.315642207681607e-05, "loss": 2.645852470397949, "memory(GiB)": 66.02, "step": 19665, "token_acc": 0.45857988165680474, "train_speed(iter/s)": 1.447191 }, { "epoch": 0.8427231052654128, "grad_norm": 5.533195495605469, "learning_rate": 9.315302325518514e-05, "loss": 2.5814552307128906, "memory(GiB)": 66.02, "step": 19670, "token_acc": 0.4641509433962264, "train_speed(iter/s)": 1.447185 }, { "epoch": 0.8429373205946618, "grad_norm": 3.973841667175293, "learning_rate": 9.31496236517967e-05, "loss": 2.647699546813965, "memory(GiB)": 66.02, "step": 19675, "token_acc": 0.45646437994722955, "train_speed(iter/s)": 1.447228 }, { "epoch": 0.8431515359239107, "grad_norm": 6.171427249908447, "learning_rate": 9.314622326671233e-05, "loss": 2.9970001220703124, "memory(GiB)": 66.02, "step": 19680, "token_acc": 0.39759036144578314, "train_speed(iter/s)": 1.447201 }, { "epoch": 0.8433657512531597, "grad_norm": 6.479748249053955, "learning_rate": 9.314282209999368e-05, "loss": 2.4964162826538088, "memory(GiB)": 66.02, "step": 19685, "token_acc": 0.5168539325842697, "train_speed(iter/s)": 1.447259 }, { "epoch": 0.8435799665824086, "grad_norm": 4.9075236320495605, "learning_rate": 9.313942015170233e-05, "loss": 2.711712646484375, "memory(GiB)": 66.02, "step": 19690, "token_acc": 0.45493562231759654, "train_speed(iter/s)": 1.447262 }, { "epoch": 0.8437941819116576, "grad_norm": 4.661203861236572, "learning_rate": 9.31360174218999e-05, "loss": 2.6397701263427735, "memory(GiB)": 66.02, "step": 19695, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.447214 }, { "epoch": 0.8440083972409066, "grad_norm": 4.87868595123291, "learning_rate": 9.313261391064807e-05, "loss": 2.3757896423339844, "memory(GiB)": 66.02, "step": 19700, "token_acc": 0.4859437751004016, "train_speed(iter/s)": 1.447264 }, { "epoch": 0.8442226125701555, "grad_norm": 3.4998130798339844, "learning_rate": 9.312920961800847e-05, "loss": 2.357621765136719, "memory(GiB)": 66.02, "step": 19705, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 1.447322 }, { "epoch": 0.8444368278994044, "grad_norm": 4.68348503112793, "learning_rate": 9.312580454404278e-05, "loss": 2.655163383483887, "memory(GiB)": 66.02, "step": 19710, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.447273 }, { "epoch": 0.8446510432286535, "grad_norm": 6.707536697387695, "learning_rate": 9.312239868881268e-05, "loss": 2.594804382324219, "memory(GiB)": 66.02, "step": 19715, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.447261 }, { "epoch": 0.8448652585579024, "grad_norm": 5.359857082366943, "learning_rate": 9.311899205237989e-05, "loss": 2.7529628753662108, "memory(GiB)": 66.02, "step": 19720, "token_acc": 0.42748091603053434, "train_speed(iter/s)": 1.447237 }, { "epoch": 0.8450794738871513, "grad_norm": 4.478527069091797, "learning_rate": 9.31155846348061e-05, "loss": 2.4507400512695314, "memory(GiB)": 66.02, "step": 19725, "token_acc": 0.4696485623003195, "train_speed(iter/s)": 1.44725 }, { "epoch": 0.8452936892164004, "grad_norm": 4.161308288574219, "learning_rate": 9.311217643615304e-05, "loss": 2.9077001571655274, "memory(GiB)": 66.02, "step": 19730, "token_acc": 0.46006389776357826, "train_speed(iter/s)": 1.447214 }, { "epoch": 0.8455079045456493, "grad_norm": 3.512089490890503, "learning_rate": 9.310876745648247e-05, "loss": 2.738383483886719, "memory(GiB)": 66.02, "step": 19735, "token_acc": 0.41847826086956524, "train_speed(iter/s)": 1.447308 }, { "epoch": 0.8457221198748982, "grad_norm": 6.097390651702881, "learning_rate": 9.310535769585615e-05, "loss": 2.6992351531982424, "memory(GiB)": 66.02, "step": 19740, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.447323 }, { "epoch": 0.8459363352041472, "grad_norm": 6.157765865325928, "learning_rate": 9.310194715433583e-05, "loss": 2.6572113037109375, "memory(GiB)": 66.02, "step": 19745, "token_acc": 0.4525316455696203, "train_speed(iter/s)": 1.447411 }, { "epoch": 0.8461505505333962, "grad_norm": 5.736446380615234, "learning_rate": 9.309853583198328e-05, "loss": 2.7184785842895507, "memory(GiB)": 66.02, "step": 19750, "token_acc": 0.43283582089552236, "train_speed(iter/s)": 1.447482 }, { "epoch": 0.8463647658626451, "grad_norm": 6.251994609832764, "learning_rate": 9.309512372886036e-05, "loss": 2.4597482681274414, "memory(GiB)": 66.02, "step": 19755, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.447497 }, { "epoch": 0.8465789811918941, "grad_norm": 5.289851665496826, "learning_rate": 9.309171084502883e-05, "loss": 2.600275421142578, "memory(GiB)": 66.02, "step": 19760, "token_acc": 0.44516129032258067, "train_speed(iter/s)": 1.447482 }, { "epoch": 0.846793196521143, "grad_norm": 4.868996620178223, "learning_rate": 9.308829718055054e-05, "loss": 3.076642608642578, "memory(GiB)": 66.02, "step": 19765, "token_acc": 0.3829787234042553, "train_speed(iter/s)": 1.447487 }, { "epoch": 0.847007411850392, "grad_norm": 5.199337482452393, "learning_rate": 9.308488273548732e-05, "loss": 2.368132781982422, "memory(GiB)": 66.02, "step": 19770, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 1.447488 }, { "epoch": 0.847221627179641, "grad_norm": 5.29536247253418, "learning_rate": 9.308146750990106e-05, "loss": 2.670939636230469, "memory(GiB)": 66.02, "step": 19775, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.44754 }, { "epoch": 0.8474358425088899, "grad_norm": 3.8321127891540527, "learning_rate": 9.307805150385356e-05, "loss": 2.5518962860107424, "memory(GiB)": 66.02, "step": 19780, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.447599 }, { "epoch": 0.8476500578381388, "grad_norm": 5.568121433258057, "learning_rate": 9.307463471740678e-05, "loss": 2.5398372650146483, "memory(GiB)": 66.02, "step": 19785, "token_acc": 0.45564516129032256, "train_speed(iter/s)": 1.447645 }, { "epoch": 0.8478642731673879, "grad_norm": 3.0147359371185303, "learning_rate": 9.307121715062257e-05, "loss": 2.749627876281738, "memory(GiB)": 66.02, "step": 19790, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.447588 }, { "epoch": 0.8480784884966368, "grad_norm": 5.127834796905518, "learning_rate": 9.306779880356284e-05, "loss": 2.801944351196289, "memory(GiB)": 66.02, "step": 19795, "token_acc": 0.448, "train_speed(iter/s)": 1.447674 }, { "epoch": 0.8482927038258857, "grad_norm": 4.423109531402588, "learning_rate": 9.306437967628956e-05, "loss": 2.5503236770629885, "memory(GiB)": 66.02, "step": 19800, "token_acc": 0.47950819672131145, "train_speed(iter/s)": 1.447673 }, { "epoch": 0.8485069191551348, "grad_norm": 3.6239840984344482, "learning_rate": 9.306095976886464e-05, "loss": 2.49837646484375, "memory(GiB)": 66.02, "step": 19805, "token_acc": 0.4789156626506024, "train_speed(iter/s)": 1.447689 }, { "epoch": 0.8487211344843837, "grad_norm": 4.444400310516357, "learning_rate": 9.305753908135003e-05, "loss": 2.423139190673828, "memory(GiB)": 66.02, "step": 19810, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.447665 }, { "epoch": 0.8489353498136326, "grad_norm": 4.946013450622559, "learning_rate": 9.305411761380771e-05, "loss": 2.5663604736328125, "memory(GiB)": 66.02, "step": 19815, "token_acc": 0.48659003831417624, "train_speed(iter/s)": 1.447677 }, { "epoch": 0.8491495651428816, "grad_norm": 3.828543186187744, "learning_rate": 9.305069536629967e-05, "loss": 2.6054725646972656, "memory(GiB)": 66.02, "step": 19820, "token_acc": 0.4405144694533762, "train_speed(iter/s)": 1.447681 }, { "epoch": 0.8493637804721306, "grad_norm": 4.285463809967041, "learning_rate": 9.30472723388879e-05, "loss": 2.7441246032714846, "memory(GiB)": 66.02, "step": 19825, "token_acc": 0.4254658385093168, "train_speed(iter/s)": 1.447649 }, { "epoch": 0.8495779958013795, "grad_norm": 4.809802532196045, "learning_rate": 9.30438485316344e-05, "loss": 2.611124801635742, "memory(GiB)": 66.02, "step": 19830, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.447651 }, { "epoch": 0.8497922111306285, "grad_norm": 3.7109873294830322, "learning_rate": 9.30404239446012e-05, "loss": 2.4324920654296873, "memory(GiB)": 66.02, "step": 19835, "token_acc": 0.5134328358208955, "train_speed(iter/s)": 1.447713 }, { "epoch": 0.8500064264598775, "grad_norm": 4.709730625152588, "learning_rate": 9.303699857785035e-05, "loss": 2.7048797607421875, "memory(GiB)": 66.02, "step": 19840, "token_acc": 0.46437994722955145, "train_speed(iter/s)": 1.447795 }, { "epoch": 0.8502206417891264, "grad_norm": 3.773698329925537, "learning_rate": 9.303357243144392e-05, "loss": 2.5571128845214846, "memory(GiB)": 66.02, "step": 19845, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.447745 }, { "epoch": 0.8504348571183754, "grad_norm": 4.688603401184082, "learning_rate": 9.303014550544394e-05, "loss": 2.304994583129883, "memory(GiB)": 66.02, "step": 19850, "token_acc": 0.5211267605633803, "train_speed(iter/s)": 1.447684 }, { "epoch": 0.8506490724476243, "grad_norm": 6.75828218460083, "learning_rate": 9.30267177999125e-05, "loss": 2.6469951629638673, "memory(GiB)": 66.02, "step": 19855, "token_acc": 0.44805194805194803, "train_speed(iter/s)": 1.447527 }, { "epoch": 0.8508632877768734, "grad_norm": 5.713206768035889, "learning_rate": 9.302328931491172e-05, "loss": 2.735860252380371, "memory(GiB)": 66.02, "step": 19860, "token_acc": 0.43846153846153846, "train_speed(iter/s)": 1.447528 }, { "epoch": 0.8510775031061223, "grad_norm": 4.005903244018555, "learning_rate": 9.301986005050369e-05, "loss": 2.430007743835449, "memory(GiB)": 66.02, "step": 19865, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.447545 }, { "epoch": 0.8512917184353712, "grad_norm": 4.3053741455078125, "learning_rate": 9.301643000675054e-05, "loss": 2.3170469284057615, "memory(GiB)": 66.02, "step": 19870, "token_acc": 0.48328267477203646, "train_speed(iter/s)": 1.447449 }, { "epoch": 0.8515059337646202, "grad_norm": 4.132357120513916, "learning_rate": 9.30129991837144e-05, "loss": 2.483839416503906, "memory(GiB)": 66.02, "step": 19875, "token_acc": 0.4732510288065844, "train_speed(iter/s)": 1.44744 }, { "epoch": 0.8517201490938692, "grad_norm": 4.457902908325195, "learning_rate": 9.300956758145742e-05, "loss": 2.625859260559082, "memory(GiB)": 66.02, "step": 19880, "token_acc": 0.4574898785425101, "train_speed(iter/s)": 1.447348 }, { "epoch": 0.8519343644231181, "grad_norm": 5.308292865753174, "learning_rate": 9.30061352000418e-05, "loss": 2.663136291503906, "memory(GiB)": 66.02, "step": 19885, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.447301 }, { "epoch": 0.8521485797523671, "grad_norm": 4.78318977355957, "learning_rate": 9.30027020395297e-05, "loss": 2.8430709838867188, "memory(GiB)": 66.02, "step": 19890, "token_acc": 0.44363636363636366, "train_speed(iter/s)": 1.447231 }, { "epoch": 0.852362795081616, "grad_norm": 4.209441661834717, "learning_rate": 9.299926809998329e-05, "loss": 2.816679763793945, "memory(GiB)": 66.02, "step": 19895, "token_acc": 0.43018867924528303, "train_speed(iter/s)": 1.447296 }, { "epoch": 0.852577010410865, "grad_norm": 7.264736652374268, "learning_rate": 9.299583338146483e-05, "loss": 2.572256851196289, "memory(GiB)": 66.02, "step": 19900, "token_acc": 0.43174603174603177, "train_speed(iter/s)": 1.447234 }, { "epoch": 0.852791225740114, "grad_norm": 4.424783229827881, "learning_rate": 9.299239788403649e-05, "loss": 2.16674690246582, "memory(GiB)": 66.02, "step": 19905, "token_acc": 0.54296875, "train_speed(iter/s)": 1.447311 }, { "epoch": 0.8530054410693629, "grad_norm": 4.210756778717041, "learning_rate": 9.298896160776054e-05, "loss": 2.7489221572875975, "memory(GiB)": 66.02, "step": 19910, "token_acc": 0.40752351097178685, "train_speed(iter/s)": 1.44725 }, { "epoch": 0.8532196563986119, "grad_norm": 5.202207088470459, "learning_rate": 9.298552455269923e-05, "loss": 2.7239757537841798, "memory(GiB)": 66.02, "step": 19915, "token_acc": 0.40502793296089384, "train_speed(iter/s)": 1.447245 }, { "epoch": 0.8534338717278609, "grad_norm": 3.9920990467071533, "learning_rate": 9.298208671891482e-05, "loss": 2.497640609741211, "memory(GiB)": 66.02, "step": 19920, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.447258 }, { "epoch": 0.8536480870571098, "grad_norm": 3.311734914779663, "learning_rate": 9.297864810646958e-05, "loss": 2.72473087310791, "memory(GiB)": 66.02, "step": 19925, "token_acc": 0.42138364779874216, "train_speed(iter/s)": 1.447371 }, { "epoch": 0.8538623023863587, "grad_norm": 4.473320960998535, "learning_rate": 9.297520871542583e-05, "loss": 2.7096553802490235, "memory(GiB)": 66.02, "step": 19930, "token_acc": 0.45544554455445546, "train_speed(iter/s)": 1.447454 }, { "epoch": 0.8540765177156078, "grad_norm": 4.602941513061523, "learning_rate": 9.297176854584582e-05, "loss": 2.5323307037353517, "memory(GiB)": 66.02, "step": 19935, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.44742 }, { "epoch": 0.8542907330448567, "grad_norm": 4.637984752655029, "learning_rate": 9.296832759779195e-05, "loss": 2.4924673080444335, "memory(GiB)": 66.02, "step": 19940, "token_acc": 0.4245614035087719, "train_speed(iter/s)": 1.447434 }, { "epoch": 0.8545049483741056, "grad_norm": 3.401311159133911, "learning_rate": 9.29648858713265e-05, "loss": 2.800104522705078, "memory(GiB)": 66.02, "step": 19945, "token_acc": 0.4602076124567474, "train_speed(iter/s)": 1.447359 }, { "epoch": 0.8547191637033547, "grad_norm": 5.960352420806885, "learning_rate": 9.296144336651185e-05, "loss": 2.698061943054199, "memory(GiB)": 66.02, "step": 19950, "token_acc": 0.44573643410852715, "train_speed(iter/s)": 1.447358 }, { "epoch": 0.8549333790326036, "grad_norm": 4.572206497192383, "learning_rate": 9.295800008341033e-05, "loss": 2.777944564819336, "memory(GiB)": 66.02, "step": 19955, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.447419 }, { "epoch": 0.8551475943618525, "grad_norm": 4.109676837921143, "learning_rate": 9.295455602208438e-05, "loss": 2.779833221435547, "memory(GiB)": 66.02, "step": 19960, "token_acc": 0.4680232558139535, "train_speed(iter/s)": 1.447392 }, { "epoch": 0.8553618096911015, "grad_norm": 4.0725860595703125, "learning_rate": 9.295111118259632e-05, "loss": 2.788290786743164, "memory(GiB)": 66.02, "step": 19965, "token_acc": 0.445578231292517, "train_speed(iter/s)": 1.447476 }, { "epoch": 0.8555760250203505, "grad_norm": 4.575146198272705, "learning_rate": 9.29476655650086e-05, "loss": 3.0003833770751953, "memory(GiB)": 66.02, "step": 19970, "token_acc": 0.4171779141104294, "train_speed(iter/s)": 1.447481 }, { "epoch": 0.8557902403495994, "grad_norm": 3.6619763374328613, "learning_rate": 9.294421916938363e-05, "loss": 2.3414730072021483, "memory(GiB)": 66.02, "step": 19975, "token_acc": 0.49508196721311476, "train_speed(iter/s)": 1.447553 }, { "epoch": 0.8560044556788484, "grad_norm": 3.6070966720581055, "learning_rate": 9.294077199578384e-05, "loss": 2.7277811050415037, "memory(GiB)": 66.02, "step": 19980, "token_acc": 0.45934959349593496, "train_speed(iter/s)": 1.447548 }, { "epoch": 0.8562186710080973, "grad_norm": 4.308478355407715, "learning_rate": 9.293732404427169e-05, "loss": 2.609332275390625, "memory(GiB)": 66.02, "step": 19985, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.447563 }, { "epoch": 0.8564328863373463, "grad_norm": 3.5091500282287598, "learning_rate": 9.293387531490964e-05, "loss": 2.626960563659668, "memory(GiB)": 66.02, "step": 19990, "token_acc": 0.4633431085043988, "train_speed(iter/s)": 1.447647 }, { "epoch": 0.8566471016665953, "grad_norm": 4.095282077789307, "learning_rate": 9.293042580776015e-05, "loss": 2.5017480850219727, "memory(GiB)": 66.02, "step": 19995, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.447702 }, { "epoch": 0.8568613169958442, "grad_norm": 7.032236099243164, "learning_rate": 9.292697552288574e-05, "loss": 2.6451932907104494, "memory(GiB)": 66.02, "step": 20000, "token_acc": 0.47019867549668876, "train_speed(iter/s)": 1.447715 }, { "epoch": 0.8568613169958442, "eval_loss": 2.3184609413146973, "eval_runtime": 14.0948, "eval_samples_per_second": 7.095, "eval_steps_per_second": 7.095, "eval_token_acc": 0.45674300254452926, "step": 20000 }, { "epoch": 0.8570755323250931, "grad_norm": 5.316519737243652, "learning_rate": 9.292352446034889e-05, "loss": 2.8579620361328124, "memory(GiB)": 66.02, "step": 20005, "token_acc": 0.44265232974910396, "train_speed(iter/s)": 1.446228 }, { "epoch": 0.8572897476543422, "grad_norm": 3.648613452911377, "learning_rate": 9.292007262021213e-05, "loss": 2.344171905517578, "memory(GiB)": 66.02, "step": 20010, "token_acc": 0.5119047619047619, "train_speed(iter/s)": 1.446273 }, { "epoch": 0.8575039629835911, "grad_norm": 4.44399881362915, "learning_rate": 9.291662000253799e-05, "loss": 2.589639663696289, "memory(GiB)": 66.02, "step": 20015, "token_acc": 0.44814814814814813, "train_speed(iter/s)": 1.446266 }, { "epoch": 0.85771817831284, "grad_norm": 4.210684299468994, "learning_rate": 9.291316660738903e-05, "loss": 2.7232810974121096, "memory(GiB)": 66.02, "step": 20020, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.446296 }, { "epoch": 0.8579323936420891, "grad_norm": 3.345099925994873, "learning_rate": 9.290971243482778e-05, "loss": 3.020463562011719, "memory(GiB)": 66.02, "step": 20025, "token_acc": 0.3972972972972973, "train_speed(iter/s)": 1.446287 }, { "epoch": 0.858146608971338, "grad_norm": 5.379486083984375, "learning_rate": 9.290625748491686e-05, "loss": 2.8090381622314453, "memory(GiB)": 66.02, "step": 20030, "token_acc": 0.41114982578397213, "train_speed(iter/s)": 1.446346 }, { "epoch": 0.8583608243005869, "grad_norm": 5.432685852050781, "learning_rate": 9.290280175771882e-05, "loss": 2.428244781494141, "memory(GiB)": 66.02, "step": 20035, "token_acc": 0.47035573122529645, "train_speed(iter/s)": 1.44632 }, { "epoch": 0.858575039629836, "grad_norm": 3.5571537017822266, "learning_rate": 9.289934525329628e-05, "loss": 2.463956832885742, "memory(GiB)": 66.02, "step": 20040, "token_acc": 0.5032258064516129, "train_speed(iter/s)": 1.44638 }, { "epoch": 0.8587892549590849, "grad_norm": 4.1907148361206055, "learning_rate": 9.289588797171186e-05, "loss": 2.8935022354125977, "memory(GiB)": 66.02, "step": 20045, "token_acc": 0.4551282051282051, "train_speed(iter/s)": 1.446488 }, { "epoch": 0.8590034702883338, "grad_norm": 6.904212474822998, "learning_rate": 9.289242991302821e-05, "loss": 2.7568593978881837, "memory(GiB)": 66.02, "step": 20050, "token_acc": 0.4647058823529412, "train_speed(iter/s)": 1.446448 }, { "epoch": 0.8592176856175828, "grad_norm": 5.2837700843811035, "learning_rate": 9.288897107730794e-05, "loss": 2.474040985107422, "memory(GiB)": 66.02, "step": 20055, "token_acc": 0.47843137254901963, "train_speed(iter/s)": 1.446472 }, { "epoch": 0.8594319009468318, "grad_norm": 4.97775411605835, "learning_rate": 9.288551146461373e-05, "loss": 2.7760700225830077, "memory(GiB)": 66.02, "step": 20060, "token_acc": 0.4383116883116883, "train_speed(iter/s)": 1.446522 }, { "epoch": 0.8596461162760807, "grad_norm": 4.208581447601318, "learning_rate": 9.288205107500825e-05, "loss": 2.504689025878906, "memory(GiB)": 66.02, "step": 20065, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.446542 }, { "epoch": 0.8598603316053297, "grad_norm": 4.59130859375, "learning_rate": 9.287858990855418e-05, "loss": 2.6606418609619142, "memory(GiB)": 66.02, "step": 20070, "token_acc": 0.42704626334519574, "train_speed(iter/s)": 1.446585 }, { "epoch": 0.8600745469345786, "grad_norm": 4.684729099273682, "learning_rate": 9.287512796531423e-05, "loss": 2.469797134399414, "memory(GiB)": 66.02, "step": 20075, "token_acc": 0.4623287671232877, "train_speed(iter/s)": 1.446597 }, { "epoch": 0.8602887622638276, "grad_norm": 4.1291584968566895, "learning_rate": 9.287166524535111e-05, "loss": 2.669881820678711, "memory(GiB)": 66.02, "step": 20080, "token_acc": 0.42105263157894735, "train_speed(iter/s)": 1.446541 }, { "epoch": 0.8605029775930766, "grad_norm": 3.9851222038269043, "learning_rate": 9.286820174872758e-05, "loss": 2.527341842651367, "memory(GiB)": 66.02, "step": 20085, "token_acc": 0.4625, "train_speed(iter/s)": 1.446552 }, { "epoch": 0.8607171929223255, "grad_norm": 4.214713096618652, "learning_rate": 9.286473747550635e-05, "loss": 2.6485807418823244, "memory(GiB)": 66.02, "step": 20090, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.446583 }, { "epoch": 0.8609314082515744, "grad_norm": 5.188320636749268, "learning_rate": 9.286127242575019e-05, "loss": 2.5529800415039063, "memory(GiB)": 66.02, "step": 20095, "token_acc": 0.46788990825688076, "train_speed(iter/s)": 1.446659 }, { "epoch": 0.8611456235808235, "grad_norm": 4.187458515167236, "learning_rate": 9.285780659952188e-05, "loss": 2.537496566772461, "memory(GiB)": 66.02, "step": 20100, "token_acc": 0.4752851711026616, "train_speed(iter/s)": 1.446528 }, { "epoch": 0.8613598389100724, "grad_norm": 5.257472038269043, "learning_rate": 9.285433999688419e-05, "loss": 2.6902347564697267, "memory(GiB)": 66.02, "step": 20105, "token_acc": 0.488, "train_speed(iter/s)": 1.446573 }, { "epoch": 0.8615740542393213, "grad_norm": 4.594825744628906, "learning_rate": 9.285087261789993e-05, "loss": 2.7862216949462892, "memory(GiB)": 66.02, "step": 20110, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.446581 }, { "epoch": 0.8617882695685704, "grad_norm": 5.086245059967041, "learning_rate": 9.284740446263191e-05, "loss": 2.489419937133789, "memory(GiB)": 66.02, "step": 20115, "token_acc": 0.48338368580060426, "train_speed(iter/s)": 1.446644 }, { "epoch": 0.8620024848978193, "grad_norm": 4.404093265533447, "learning_rate": 9.284393553114298e-05, "loss": 2.7474906921386717, "memory(GiB)": 66.02, "step": 20120, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.446516 }, { "epoch": 0.8622167002270682, "grad_norm": 3.989957094192505, "learning_rate": 9.284046582349596e-05, "loss": 2.380805778503418, "memory(GiB)": 66.02, "step": 20125, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.44659 }, { "epoch": 0.8624309155563172, "grad_norm": 7.043115615844727, "learning_rate": 9.283699533975372e-05, "loss": 2.6302469253540037, "memory(GiB)": 66.02, "step": 20130, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.446553 }, { "epoch": 0.8626451308855662, "grad_norm": 3.9640233516693115, "learning_rate": 9.283352407997912e-05, "loss": 2.661854934692383, "memory(GiB)": 66.02, "step": 20135, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.446512 }, { "epoch": 0.8628593462148151, "grad_norm": 4.779181957244873, "learning_rate": 9.283005204423504e-05, "loss": 2.5626220703125, "memory(GiB)": 66.02, "step": 20140, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.446531 }, { "epoch": 0.8630735615440641, "grad_norm": 5.667942523956299, "learning_rate": 9.28265792325844e-05, "loss": 2.440044403076172, "memory(GiB)": 66.02, "step": 20145, "token_acc": 0.521311475409836, "train_speed(iter/s)": 1.446511 }, { "epoch": 0.863287776873313, "grad_norm": 3.9496004581451416, "learning_rate": 9.282310564509009e-05, "loss": 2.5329425811767576, "memory(GiB)": 66.02, "step": 20150, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.446515 }, { "epoch": 0.863501992202562, "grad_norm": 4.588701248168945, "learning_rate": 9.281963128181508e-05, "loss": 2.47724609375, "memory(GiB)": 66.02, "step": 20155, "token_acc": 0.4728682170542636, "train_speed(iter/s)": 1.446567 }, { "epoch": 0.863716207531811, "grad_norm": 4.740305423736572, "learning_rate": 9.281615614282225e-05, "loss": 3.1929759979248047, "memory(GiB)": 66.02, "step": 20160, "token_acc": 0.4197183098591549, "train_speed(iter/s)": 1.44654 }, { "epoch": 0.8639304228610599, "grad_norm": 4.869343280792236, "learning_rate": 9.28126802281746e-05, "loss": 2.6420190811157225, "memory(GiB)": 66.02, "step": 20165, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.446501 }, { "epoch": 0.8641446381903088, "grad_norm": 5.131339073181152, "learning_rate": 9.280920353793508e-05, "loss": 2.423465919494629, "memory(GiB)": 66.02, "step": 20170, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.446434 }, { "epoch": 0.8643588535195579, "grad_norm": 3.910404682159424, "learning_rate": 9.280572607216668e-05, "loss": 2.235536575317383, "memory(GiB)": 66.02, "step": 20175, "token_acc": 0.5066225165562914, "train_speed(iter/s)": 1.446502 }, { "epoch": 0.8645730688488068, "grad_norm": 4.464833736419678, "learning_rate": 9.280224783093241e-05, "loss": 2.7030141830444334, "memory(GiB)": 66.02, "step": 20180, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.446532 }, { "epoch": 0.8647872841780557, "grad_norm": 4.7071852684021, "learning_rate": 9.279876881429526e-05, "loss": 2.5194549560546875, "memory(GiB)": 66.02, "step": 20185, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 1.446573 }, { "epoch": 0.8650014995073048, "grad_norm": 4.958098888397217, "learning_rate": 9.279528902231827e-05, "loss": 2.520359420776367, "memory(GiB)": 66.02, "step": 20190, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.446588 }, { "epoch": 0.8652157148365537, "grad_norm": 6.080878734588623, "learning_rate": 9.279180845506446e-05, "loss": 2.768878173828125, "memory(GiB)": 66.02, "step": 20195, "token_acc": 0.4233128834355828, "train_speed(iter/s)": 1.446666 }, { "epoch": 0.8654299301658027, "grad_norm": 3.969670295715332, "learning_rate": 9.27883271125969e-05, "loss": 2.7485870361328124, "memory(GiB)": 66.02, "step": 20200, "token_acc": 0.44982698961937717, "train_speed(iter/s)": 1.446722 }, { "epoch": 0.8656441454950516, "grad_norm": 5.300868511199951, "learning_rate": 9.278484499497868e-05, "loss": 2.477072334289551, "memory(GiB)": 66.02, "step": 20205, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.446586 }, { "epoch": 0.8658583608243006, "grad_norm": 4.885673522949219, "learning_rate": 9.278136210227284e-05, "loss": 2.606531524658203, "memory(GiB)": 66.02, "step": 20210, "token_acc": 0.46381578947368424, "train_speed(iter/s)": 1.446598 }, { "epoch": 0.8660725761535496, "grad_norm": 4.389687538146973, "learning_rate": 9.277787843454248e-05, "loss": 2.5676076889038084, "memory(GiB)": 66.02, "step": 20215, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.446609 }, { "epoch": 0.8662867914827985, "grad_norm": 7.971065998077393, "learning_rate": 9.277439399185074e-05, "loss": 2.6598133087158202, "memory(GiB)": 66.02, "step": 20220, "token_acc": 0.46825396825396826, "train_speed(iter/s)": 1.446616 }, { "epoch": 0.8665010068120474, "grad_norm": 4.387348651885986, "learning_rate": 9.277090877426074e-05, "loss": 2.55792350769043, "memory(GiB)": 66.02, "step": 20225, "token_acc": 0.4432624113475177, "train_speed(iter/s)": 1.44661 }, { "epoch": 0.8667152221412965, "grad_norm": 4.705660820007324, "learning_rate": 9.276742278183558e-05, "loss": 2.6393453598022463, "memory(GiB)": 66.02, "step": 20230, "token_acc": 0.48427672955974843, "train_speed(iter/s)": 1.446603 }, { "epoch": 0.8669294374705454, "grad_norm": 3.941145420074463, "learning_rate": 9.276393601463844e-05, "loss": 2.630137825012207, "memory(GiB)": 66.02, "step": 20235, "token_acc": 0.5077881619937694, "train_speed(iter/s)": 1.446611 }, { "epoch": 0.8671436527997943, "grad_norm": 5.056817054748535, "learning_rate": 9.276044847273249e-05, "loss": 2.3690935134887696, "memory(GiB)": 66.02, "step": 20240, "token_acc": 0.4900398406374502, "train_speed(iter/s)": 1.446654 }, { "epoch": 0.8673578681290434, "grad_norm": 4.330813407897949, "learning_rate": 9.27569601561809e-05, "loss": 2.8303770065307616, "memory(GiB)": 66.02, "step": 20245, "token_acc": 0.46037735849056605, "train_speed(iter/s)": 1.446683 }, { "epoch": 0.8675720834582923, "grad_norm": 6.021437644958496, "learning_rate": 9.275347106504689e-05, "loss": 2.4435911178588867, "memory(GiB)": 66.02, "step": 20250, "token_acc": 0.47183098591549294, "train_speed(iter/s)": 1.446776 }, { "epoch": 0.8677862987875412, "grad_norm": 4.0443572998046875, "learning_rate": 9.274998119939362e-05, "loss": 2.717410659790039, "memory(GiB)": 66.02, "step": 20255, "token_acc": 0.48703170028818443, "train_speed(iter/s)": 1.446728 }, { "epoch": 0.8680005141167902, "grad_norm": 4.352334976196289, "learning_rate": 9.274649055928434e-05, "loss": 2.6219051361083983, "memory(GiB)": 66.02, "step": 20260, "token_acc": 0.4208955223880597, "train_speed(iter/s)": 1.446756 }, { "epoch": 0.8682147294460392, "grad_norm": 4.7858781814575195, "learning_rate": 9.274299914478228e-05, "loss": 2.683307075500488, "memory(GiB)": 66.02, "step": 20265, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.446761 }, { "epoch": 0.8684289447752881, "grad_norm": 5.566839218139648, "learning_rate": 9.273950695595071e-05, "loss": 2.4615032196044924, "memory(GiB)": 66.02, "step": 20270, "token_acc": 0.49201277955271566, "train_speed(iter/s)": 1.446721 }, { "epoch": 0.8686431601045371, "grad_norm": 6.246694087982178, "learning_rate": 9.273601399285287e-05, "loss": 2.8437292098999025, "memory(GiB)": 66.02, "step": 20275, "token_acc": 0.43283582089552236, "train_speed(iter/s)": 1.44675 }, { "epoch": 0.868857375433786, "grad_norm": 3.2209200859069824, "learning_rate": 9.273252025555205e-05, "loss": 2.403131294250488, "memory(GiB)": 66.02, "step": 20280, "token_acc": 0.5177304964539007, "train_speed(iter/s)": 1.44675 }, { "epoch": 0.869071590763035, "grad_norm": 6.783487319946289, "learning_rate": 9.272902574411153e-05, "loss": 2.703940200805664, "memory(GiB)": 66.02, "step": 20285, "token_acc": 0.4478114478114478, "train_speed(iter/s)": 1.446795 }, { "epoch": 0.869285806092284, "grad_norm": 5.519693374633789, "learning_rate": 9.272553045859464e-05, "loss": 2.6275714874267577, "memory(GiB)": 66.02, "step": 20290, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.446831 }, { "epoch": 0.8695000214215329, "grad_norm": 4.119229793548584, "learning_rate": 9.272203439906469e-05, "loss": 2.6545949935913087, "memory(GiB)": 66.02, "step": 20295, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.4468 }, { "epoch": 0.8697142367507819, "grad_norm": 5.105564117431641, "learning_rate": 9.271853756558497e-05, "loss": 2.4343557357788086, "memory(GiB)": 66.02, "step": 20300, "token_acc": 0.4664429530201342, "train_speed(iter/s)": 1.446767 }, { "epoch": 0.8699284520800309, "grad_norm": 4.066102981567383, "learning_rate": 9.271503995821891e-05, "loss": 2.8815902709960937, "memory(GiB)": 66.02, "step": 20305, "token_acc": 0.43812709030100333, "train_speed(iter/s)": 1.446776 }, { "epoch": 0.8701426674092798, "grad_norm": 4.553707599639893, "learning_rate": 9.271154157702982e-05, "loss": 2.451702880859375, "memory(GiB)": 66.02, "step": 20310, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.446836 }, { "epoch": 0.8703568827385287, "grad_norm": 3.8057193756103516, "learning_rate": 9.270804242208109e-05, "loss": 2.717175102233887, "memory(GiB)": 66.02, "step": 20315, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.446861 }, { "epoch": 0.8705710980677778, "grad_norm": 4.666566371917725, "learning_rate": 9.27045424934361e-05, "loss": 3.0016950607299804, "memory(GiB)": 66.02, "step": 20320, "token_acc": 0.42517006802721086, "train_speed(iter/s)": 1.446776 }, { "epoch": 0.8707853133970267, "grad_norm": 4.413992404937744, "learning_rate": 9.270104179115825e-05, "loss": 2.8052406311035156, "memory(GiB)": 66.02, "step": 20325, "token_acc": 0.40878378378378377, "train_speed(iter/s)": 1.446853 }, { "epoch": 0.8709995287262756, "grad_norm": 4.927813529968262, "learning_rate": 9.2697540315311e-05, "loss": 2.657339096069336, "memory(GiB)": 66.02, "step": 20330, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.446831 }, { "epoch": 0.8712137440555247, "grad_norm": 4.384943008422852, "learning_rate": 9.269403806595775e-05, "loss": 2.370720100402832, "memory(GiB)": 66.02, "step": 20335, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.446817 }, { "epoch": 0.8714279593847736, "grad_norm": 4.819328784942627, "learning_rate": 9.269053504316194e-05, "loss": 2.832560729980469, "memory(GiB)": 66.02, "step": 20340, "token_acc": 0.43911439114391143, "train_speed(iter/s)": 1.446826 }, { "epoch": 0.8716421747140225, "grad_norm": 5.835651874542236, "learning_rate": 9.268703124698704e-05, "loss": 2.793199920654297, "memory(GiB)": 66.02, "step": 20345, "token_acc": 0.45051194539249145, "train_speed(iter/s)": 1.446845 }, { "epoch": 0.8718563900432715, "grad_norm": 6.481338977813721, "learning_rate": 9.268352667749653e-05, "loss": 2.480810356140137, "memory(GiB)": 66.02, "step": 20350, "token_acc": 0.44947735191637633, "train_speed(iter/s)": 1.446899 }, { "epoch": 0.8720706053725205, "grad_norm": 3.819230556488037, "learning_rate": 9.268002133475388e-05, "loss": 2.4810722351074217, "memory(GiB)": 66.02, "step": 20355, "token_acc": 0.4588235294117647, "train_speed(iter/s)": 1.446964 }, { "epoch": 0.8722848207017694, "grad_norm": 5.878935813903809, "learning_rate": 9.267651521882263e-05, "loss": 2.5962139129638673, "memory(GiB)": 66.02, "step": 20360, "token_acc": 0.48828125, "train_speed(iter/s)": 1.446997 }, { "epoch": 0.8724990360310184, "grad_norm": 4.918642520904541, "learning_rate": 9.267300832976626e-05, "loss": 2.899725914001465, "memory(GiB)": 66.02, "step": 20365, "token_acc": 0.3926174496644295, "train_speed(iter/s)": 1.446972 }, { "epoch": 0.8727132513602673, "grad_norm": 4.57191276550293, "learning_rate": 9.266950066764832e-05, "loss": 2.340840530395508, "memory(GiB)": 66.02, "step": 20370, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.446966 }, { "epoch": 0.8729274666895163, "grad_norm": 4.200679779052734, "learning_rate": 9.266599223253234e-05, "loss": 2.6033843994140624, "memory(GiB)": 66.02, "step": 20375, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.446956 }, { "epoch": 0.8731416820187653, "grad_norm": 3.744687557220459, "learning_rate": 9.266248302448188e-05, "loss": 2.5803451538085938, "memory(GiB)": 66.02, "step": 20380, "token_acc": 0.4452054794520548, "train_speed(iter/s)": 1.447061 }, { "epoch": 0.8733558973480142, "grad_norm": 5.624828815460205, "learning_rate": 9.265897304356055e-05, "loss": 2.7657032012939453, "memory(GiB)": 66.02, "step": 20385, "token_acc": 0.42962962962962964, "train_speed(iter/s)": 1.447105 }, { "epoch": 0.8735701126772631, "grad_norm": 5.167118549346924, "learning_rate": 9.265546228983189e-05, "loss": 2.6485198974609374, "memory(GiB)": 66.02, "step": 20390, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.447051 }, { "epoch": 0.8737843280065122, "grad_norm": 4.024702548980713, "learning_rate": 9.26519507633595e-05, "loss": 2.53060302734375, "memory(GiB)": 66.02, "step": 20395, "token_acc": 0.45514950166112955, "train_speed(iter/s)": 1.447025 }, { "epoch": 0.8739985433357611, "grad_norm": 4.909636974334717, "learning_rate": 9.264843846420702e-05, "loss": 2.6260467529296876, "memory(GiB)": 66.02, "step": 20400, "token_acc": 0.4520547945205479, "train_speed(iter/s)": 1.44697 }, { "epoch": 0.87421275866501, "grad_norm": 4.184407711029053, "learning_rate": 9.264492539243808e-05, "loss": 2.6808338165283203, "memory(GiB)": 66.02, "step": 20405, "token_acc": 0.44285714285714284, "train_speed(iter/s)": 1.446941 }, { "epoch": 0.8744269739942591, "grad_norm": 4.637807369232178, "learning_rate": 9.26414115481163e-05, "loss": 2.5490180969238283, "memory(GiB)": 66.02, "step": 20410, "token_acc": 0.46558704453441296, "train_speed(iter/s)": 1.446984 }, { "epoch": 0.874641189323508, "grad_norm": 5.338601589202881, "learning_rate": 9.263789693130535e-05, "loss": 2.607616996765137, "memory(GiB)": 66.02, "step": 20415, "token_acc": 0.45307443365695793, "train_speed(iter/s)": 1.446926 }, { "epoch": 0.8748554046527569, "grad_norm": 5.417995452880859, "learning_rate": 9.263438154206892e-05, "loss": 2.6932701110839843, "memory(GiB)": 66.02, "step": 20420, "token_acc": 0.4602076124567474, "train_speed(iter/s)": 1.446931 }, { "epoch": 0.8750696199820059, "grad_norm": 3.8684165477752686, "learning_rate": 9.263086538047064e-05, "loss": 2.5578914642333985, "memory(GiB)": 66.02, "step": 20425, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.446889 }, { "epoch": 0.8752838353112549, "grad_norm": 3.9209108352661133, "learning_rate": 9.262734844657425e-05, "loss": 2.691782760620117, "memory(GiB)": 66.02, "step": 20430, "token_acc": 0.479108635097493, "train_speed(iter/s)": 1.446835 }, { "epoch": 0.8754980506405038, "grad_norm": 5.319073677062988, "learning_rate": 9.262383074044347e-05, "loss": 2.475202751159668, "memory(GiB)": 66.02, "step": 20435, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.44692 }, { "epoch": 0.8757122659697528, "grad_norm": 4.863601207733154, "learning_rate": 9.262031226214201e-05, "loss": 2.626394271850586, "memory(GiB)": 66.02, "step": 20440, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.446953 }, { "epoch": 0.8759264812990017, "grad_norm": 6.365195274353027, "learning_rate": 9.261679301173361e-05, "loss": 2.4839855194091798, "memory(GiB)": 66.02, "step": 20445, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.446977 }, { "epoch": 0.8761406966282507, "grad_norm": 6.444025039672852, "learning_rate": 9.261327298928203e-05, "loss": 2.427138328552246, "memory(GiB)": 66.02, "step": 20450, "token_acc": 0.4805194805194805, "train_speed(iter/s)": 1.447008 }, { "epoch": 0.8763549119574997, "grad_norm": 3.8496649265289307, "learning_rate": 9.260975219485104e-05, "loss": 2.5181474685668945, "memory(GiB)": 66.02, "step": 20455, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.44706 }, { "epoch": 0.8765691272867486, "grad_norm": 4.388014793395996, "learning_rate": 9.26062306285044e-05, "loss": 2.391971969604492, "memory(GiB)": 66.02, "step": 20460, "token_acc": 0.5, "train_speed(iter/s)": 1.447112 }, { "epoch": 0.8767833426159976, "grad_norm": 3.3938064575195312, "learning_rate": 9.260270829030594e-05, "loss": 2.385320281982422, "memory(GiB)": 66.02, "step": 20465, "token_acc": 0.5310344827586206, "train_speed(iter/s)": 1.447117 }, { "epoch": 0.8769975579452466, "grad_norm": 4.626340389251709, "learning_rate": 9.259918518031944e-05, "loss": 2.3799556732177733, "memory(GiB)": 66.02, "step": 20470, "token_acc": 0.5155038759689923, "train_speed(iter/s)": 1.447124 }, { "epoch": 0.8772117732744955, "grad_norm": 4.65280294418335, "learning_rate": 9.259566129860874e-05, "loss": 2.9183542251586916, "memory(GiB)": 66.02, "step": 20475, "token_acc": 0.396078431372549, "train_speed(iter/s)": 1.447123 }, { "epoch": 0.8774259886037444, "grad_norm": 4.689497470855713, "learning_rate": 9.25921366452377e-05, "loss": 2.5879859924316406, "memory(GiB)": 66.02, "step": 20480, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.447129 }, { "epoch": 0.8776402039329935, "grad_norm": 7.428919792175293, "learning_rate": 9.258861122027013e-05, "loss": 2.4727554321289062, "memory(GiB)": 66.02, "step": 20485, "token_acc": 0.463768115942029, "train_speed(iter/s)": 1.447223 }, { "epoch": 0.8778544192622424, "grad_norm": 5.427867889404297, "learning_rate": 9.258508502376992e-05, "loss": 2.6388469696044923, "memory(GiB)": 66.02, "step": 20490, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.447279 }, { "epoch": 0.8780686345914913, "grad_norm": 4.802516937255859, "learning_rate": 9.258155805580095e-05, "loss": 2.357114791870117, "memory(GiB)": 66.02, "step": 20495, "token_acc": 0.4775510204081633, "train_speed(iter/s)": 1.447317 }, { "epoch": 0.8782828499207403, "grad_norm": 4.4269795417785645, "learning_rate": 9.257803031642711e-05, "loss": 2.624612045288086, "memory(GiB)": 66.02, "step": 20500, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.447265 }, { "epoch": 0.8782828499207403, "eval_loss": 2.3010377883911133, "eval_runtime": 14.7646, "eval_samples_per_second": 6.773, "eval_steps_per_second": 6.773, "eval_token_acc": 0.46108490566037735, "step": 20500 }, { "epoch": 0.8784970652499893, "grad_norm": 4.085290431976318, "learning_rate": 9.257450180571232e-05, "loss": 2.4852075576782227, "memory(GiB)": 66.02, "step": 20505, "token_acc": 0.4701159678858162, "train_speed(iter/s)": 1.445707 }, { "epoch": 0.8787112805792382, "grad_norm": 3.283400535583496, "learning_rate": 9.257097252372047e-05, "loss": 2.671334457397461, "memory(GiB)": 66.02, "step": 20510, "token_acc": 0.4066265060240964, "train_speed(iter/s)": 1.44566 }, { "epoch": 0.8789254959084872, "grad_norm": 4.380463123321533, "learning_rate": 9.256744247051553e-05, "loss": 2.5247406005859374, "memory(GiB)": 66.02, "step": 20515, "token_acc": 0.45564516129032256, "train_speed(iter/s)": 1.445746 }, { "epoch": 0.8791397112377362, "grad_norm": 4.257956027984619, "learning_rate": 9.256391164616144e-05, "loss": 2.5347347259521484, "memory(GiB)": 66.02, "step": 20520, "token_acc": 0.47633136094674555, "train_speed(iter/s)": 1.445807 }, { "epoch": 0.8793539265669851, "grad_norm": 4.6171417236328125, "learning_rate": 9.256038005072216e-05, "loss": 2.715629005432129, "memory(GiB)": 66.02, "step": 20525, "token_acc": 0.4206896551724138, "train_speed(iter/s)": 1.445852 }, { "epoch": 0.8795681418962341, "grad_norm": 4.8964152336120605, "learning_rate": 9.255684768426168e-05, "loss": 2.3374588012695314, "memory(GiB)": 66.02, "step": 20530, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.445886 }, { "epoch": 0.879782357225483, "grad_norm": 4.855588912963867, "learning_rate": 9.255331454684395e-05, "loss": 2.7803354263305664, "memory(GiB)": 66.02, "step": 20535, "token_acc": 0.4492753623188406, "train_speed(iter/s)": 1.445909 }, { "epoch": 0.8799965725547321, "grad_norm": 4.195810317993164, "learning_rate": 9.254978063853303e-05, "loss": 2.497849464416504, "memory(GiB)": 66.02, "step": 20540, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.445918 }, { "epoch": 0.880210787883981, "grad_norm": 5.927244663238525, "learning_rate": 9.254624595939293e-05, "loss": 2.4883745193481444, "memory(GiB)": 66.02, "step": 20545, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.44584 }, { "epoch": 0.8804250032132299, "grad_norm": 4.772204875946045, "learning_rate": 9.254271050948767e-05, "loss": 2.6938032150268554, "memory(GiB)": 66.02, "step": 20550, "token_acc": 0.4258064516129032, "train_speed(iter/s)": 1.445834 }, { "epoch": 0.880639218542479, "grad_norm": 4.112085342407227, "learning_rate": 9.253917428888129e-05, "loss": 2.398793411254883, "memory(GiB)": 66.02, "step": 20555, "token_acc": 0.4267515923566879, "train_speed(iter/s)": 1.445884 }, { "epoch": 0.8808534338717279, "grad_norm": 4.269628047943115, "learning_rate": 9.253563729763786e-05, "loss": 2.4429323196411135, "memory(GiB)": 66.02, "step": 20560, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.445941 }, { "epoch": 0.8810676492009768, "grad_norm": 3.393285036087036, "learning_rate": 9.253209953582147e-05, "loss": 2.6097085952758787, "memory(GiB)": 66.02, "step": 20565, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.446012 }, { "epoch": 0.8812818645302258, "grad_norm": 4.838438034057617, "learning_rate": 9.25285610034962e-05, "loss": 2.8534555435180664, "memory(GiB)": 66.02, "step": 20570, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.44581 }, { "epoch": 0.8814960798594748, "grad_norm": 4.8219685554504395, "learning_rate": 9.252502170072615e-05, "loss": 2.448726272583008, "memory(GiB)": 66.02, "step": 20575, "token_acc": 0.516728624535316, "train_speed(iter/s)": 1.445895 }, { "epoch": 0.8817102951887237, "grad_norm": 5.260610580444336, "learning_rate": 9.252148162757542e-05, "loss": 2.6347301483154295, "memory(GiB)": 66.02, "step": 20580, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.445927 }, { "epoch": 0.8819245105179727, "grad_norm": 4.973104000091553, "learning_rate": 9.251794078410818e-05, "loss": 2.560206985473633, "memory(GiB)": 66.02, "step": 20585, "token_acc": 0.4779874213836478, "train_speed(iter/s)": 1.445975 }, { "epoch": 0.8821387258472216, "grad_norm": 5.514646053314209, "learning_rate": 9.251439917038856e-05, "loss": 2.6100704193115236, "memory(GiB)": 66.02, "step": 20590, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.445961 }, { "epoch": 0.8823529411764706, "grad_norm": 2.9874496459960938, "learning_rate": 9.251085678648072e-05, "loss": 2.350851058959961, "memory(GiB)": 66.02, "step": 20595, "token_acc": 0.5015673981191222, "train_speed(iter/s)": 1.445936 }, { "epoch": 0.8825671565057196, "grad_norm": 4.191380977630615, "learning_rate": 9.250731363244882e-05, "loss": 2.6914560317993166, "memory(GiB)": 66.02, "step": 20600, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.445953 }, { "epoch": 0.8827813718349685, "grad_norm": 6.398646354675293, "learning_rate": 9.250376970835706e-05, "loss": 2.7345439910888674, "memory(GiB)": 66.02, "step": 20605, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.445932 }, { "epoch": 0.8829955871642174, "grad_norm": 4.889020919799805, "learning_rate": 9.250022501426965e-05, "loss": 2.6503929138183593, "memory(GiB)": 66.02, "step": 20610, "token_acc": 0.43157894736842106, "train_speed(iter/s)": 1.445941 }, { "epoch": 0.8832098024934665, "grad_norm": 4.379112720489502, "learning_rate": 9.249667955025079e-05, "loss": 2.8273773193359375, "memory(GiB)": 66.02, "step": 20615, "token_acc": 0.4281524926686217, "train_speed(iter/s)": 1.445993 }, { "epoch": 0.8834240178227154, "grad_norm": 3.438961982727051, "learning_rate": 9.24931333163647e-05, "loss": 2.3543167114257812, "memory(GiB)": 66.02, "step": 20620, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.445977 }, { "epoch": 0.8836382331519643, "grad_norm": 4.19992733001709, "learning_rate": 9.248958631267566e-05, "loss": 2.9201812744140625, "memory(GiB)": 66.02, "step": 20625, "token_acc": 0.3770491803278688, "train_speed(iter/s)": 1.445957 }, { "epoch": 0.8838524484812134, "grad_norm": 4.5692009925842285, "learning_rate": 9.248603853924789e-05, "loss": 2.5510080337524412, "memory(GiB)": 66.02, "step": 20630, "token_acc": 0.4664429530201342, "train_speed(iter/s)": 1.446031 }, { "epoch": 0.8840666638104623, "grad_norm": 5.530004024505615, "learning_rate": 9.248248999614568e-05, "loss": 2.80223388671875, "memory(GiB)": 66.02, "step": 20635, "token_acc": 0.41947565543071164, "train_speed(iter/s)": 1.446146 }, { "epoch": 0.8842808791397112, "grad_norm": 4.1893463134765625, "learning_rate": 9.247894068343332e-05, "loss": 2.6047887802124023, "memory(GiB)": 66.02, "step": 20640, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.446189 }, { "epoch": 0.8844950944689602, "grad_norm": 4.4396820068359375, "learning_rate": 9.24753906011751e-05, "loss": 2.528820610046387, "memory(GiB)": 66.02, "step": 20645, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.446279 }, { "epoch": 0.8847093097982092, "grad_norm": 5.101334095001221, "learning_rate": 9.247183974943532e-05, "loss": 2.7585237503051756, "memory(GiB)": 66.02, "step": 20650, "token_acc": 0.44912280701754387, "train_speed(iter/s)": 1.446325 }, { "epoch": 0.8849235251274581, "grad_norm": 3.763733148574829, "learning_rate": 9.246828812827834e-05, "loss": 2.4597408294677736, "memory(GiB)": 66.02, "step": 20655, "token_acc": 0.45323741007194246, "train_speed(iter/s)": 1.446286 }, { "epoch": 0.8851377404567071, "grad_norm": 5.465420246124268, "learning_rate": 9.246473573776845e-05, "loss": 2.6887969970703125, "memory(GiB)": 66.02, "step": 20660, "token_acc": 0.45481927710843373, "train_speed(iter/s)": 1.446292 }, { "epoch": 0.885351955785956, "grad_norm": 4.568000316619873, "learning_rate": 9.246118257797007e-05, "loss": 2.639945220947266, "memory(GiB)": 66.02, "step": 20665, "token_acc": 0.4492753623188406, "train_speed(iter/s)": 1.446346 }, { "epoch": 0.885566171115205, "grad_norm": 4.150634765625, "learning_rate": 9.245762864894754e-05, "loss": 2.996517562866211, "memory(GiB)": 66.02, "step": 20670, "token_acc": 0.41194029850746267, "train_speed(iter/s)": 1.446471 }, { "epoch": 0.885780386444454, "grad_norm": 5.055774688720703, "learning_rate": 9.245407395076522e-05, "loss": 2.773493766784668, "memory(GiB)": 66.02, "step": 20675, "token_acc": 0.4493927125506073, "train_speed(iter/s)": 1.446528 }, { "epoch": 0.8859946017737029, "grad_norm": 5.247165679931641, "learning_rate": 9.245051848348753e-05, "loss": 2.7089481353759766, "memory(GiB)": 66.02, "step": 20680, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.44646 }, { "epoch": 0.8862088171029519, "grad_norm": 4.205036640167236, "learning_rate": 9.244696224717889e-05, "loss": 2.763844108581543, "memory(GiB)": 66.02, "step": 20685, "token_acc": 0.45555555555555555, "train_speed(iter/s)": 1.446553 }, { "epoch": 0.8864230324322009, "grad_norm": 4.1134033203125, "learning_rate": 9.244340524190368e-05, "loss": 2.6551750183105467, "memory(GiB)": 66.02, "step": 20690, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.446561 }, { "epoch": 0.8866372477614498, "grad_norm": 3.585674285888672, "learning_rate": 9.243984746772638e-05, "loss": 2.7241180419921873, "memory(GiB)": 66.02, "step": 20695, "token_acc": 0.4591194968553459, "train_speed(iter/s)": 1.446469 }, { "epoch": 0.8868514630906987, "grad_norm": 4.635261535644531, "learning_rate": 9.243628892471145e-05, "loss": 2.594484329223633, "memory(GiB)": 66.02, "step": 20700, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.446361 }, { "epoch": 0.8870656784199478, "grad_norm": 4.641414165496826, "learning_rate": 9.243272961292331e-05, "loss": 2.5772079467773437, "memory(GiB)": 66.02, "step": 20705, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.446347 }, { "epoch": 0.8872798937491967, "grad_norm": 4.592565059661865, "learning_rate": 9.242916953242649e-05, "loss": 2.693921661376953, "memory(GiB)": 66.02, "step": 20710, "token_acc": 0.44534412955465585, "train_speed(iter/s)": 1.446347 }, { "epoch": 0.8874941090784456, "grad_norm": 4.181694984436035, "learning_rate": 9.242560868328545e-05, "loss": 2.5933982849121096, "memory(GiB)": 66.02, "step": 20715, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.446386 }, { "epoch": 0.8877083244076946, "grad_norm": 3.5672190189361572, "learning_rate": 9.242204706556472e-05, "loss": 2.6774166107177733, "memory(GiB)": 66.02, "step": 20720, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.446359 }, { "epoch": 0.8879225397369436, "grad_norm": 4.95294189453125, "learning_rate": 9.241848467932881e-05, "loss": 2.7005905151367187, "memory(GiB)": 66.02, "step": 20725, "token_acc": 0.41114982578397213, "train_speed(iter/s)": 1.44637 }, { "epoch": 0.8881367550661925, "grad_norm": 4.768760681152344, "learning_rate": 9.241492152464225e-05, "loss": 2.5643665313720705, "memory(GiB)": 66.02, "step": 20730, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.446301 }, { "epoch": 0.8883509703954415, "grad_norm": 3.995396375656128, "learning_rate": 9.24113576015696e-05, "loss": 2.3804933547973635, "memory(GiB)": 66.02, "step": 20735, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.446308 }, { "epoch": 0.8885651857246905, "grad_norm": 4.0110697746276855, "learning_rate": 9.240779291017542e-05, "loss": 2.4697078704833983, "memory(GiB)": 66.02, "step": 20740, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.446309 }, { "epoch": 0.8887794010539394, "grad_norm": 5.562434673309326, "learning_rate": 9.240422745052431e-05, "loss": 2.7784284591674804, "memory(GiB)": 66.02, "step": 20745, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.446312 }, { "epoch": 0.8889936163831884, "grad_norm": 6.544858932495117, "learning_rate": 9.240066122268081e-05, "loss": 2.5637544631958007, "memory(GiB)": 66.02, "step": 20750, "token_acc": 0.4751552795031056, "train_speed(iter/s)": 1.446312 }, { "epoch": 0.8892078317124373, "grad_norm": 4.335330009460449, "learning_rate": 9.239709422670958e-05, "loss": 2.6714353561401367, "memory(GiB)": 66.02, "step": 20755, "token_acc": 0.4119601328903654, "train_speed(iter/s)": 1.446367 }, { "epoch": 0.8894220470416863, "grad_norm": 4.983273029327393, "learning_rate": 9.23935264626752e-05, "loss": 2.6674016952514648, "memory(GiB)": 66.02, "step": 20760, "token_acc": 0.4369230769230769, "train_speed(iter/s)": 1.446408 }, { "epoch": 0.8896362623709353, "grad_norm": 4.446269989013672, "learning_rate": 9.238995793064233e-05, "loss": 2.5938438415527343, "memory(GiB)": 66.02, "step": 20765, "token_acc": 0.43874643874643876, "train_speed(iter/s)": 1.44652 }, { "epoch": 0.8898504777001842, "grad_norm": 4.286979675292969, "learning_rate": 9.238638863067561e-05, "loss": 2.5131658554077148, "memory(GiB)": 66.02, "step": 20770, "token_acc": 0.47592067988668557, "train_speed(iter/s)": 1.446522 }, { "epoch": 0.8900646930294331, "grad_norm": 6.719756126403809, "learning_rate": 9.23828185628397e-05, "loss": 2.4843597412109375, "memory(GiB)": 66.02, "step": 20775, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.446584 }, { "epoch": 0.8902789083586822, "grad_norm": 3.612745761871338, "learning_rate": 9.237924772719927e-05, "loss": 2.49188175201416, "memory(GiB)": 66.02, "step": 20780, "token_acc": 0.45689655172413796, "train_speed(iter/s)": 1.446616 }, { "epoch": 0.8904931236879311, "grad_norm": 4.251318454742432, "learning_rate": 9.237567612381901e-05, "loss": 2.505125045776367, "memory(GiB)": 66.02, "step": 20785, "token_acc": 0.5020080321285141, "train_speed(iter/s)": 1.446671 }, { "epoch": 0.89070733901718, "grad_norm": 4.86512565612793, "learning_rate": 9.237210375276361e-05, "loss": 2.444071960449219, "memory(GiB)": 66.02, "step": 20790, "token_acc": 0.4711111111111111, "train_speed(iter/s)": 1.446678 }, { "epoch": 0.8909215543464291, "grad_norm": 6.499415874481201, "learning_rate": 9.236853061409783e-05, "loss": 2.8627498626708983, "memory(GiB)": 66.02, "step": 20795, "token_acc": 0.4297872340425532, "train_speed(iter/s)": 1.446707 }, { "epoch": 0.891135769675678, "grad_norm": 3.7318472862243652, "learning_rate": 9.236495670788636e-05, "loss": 2.509539985656738, "memory(GiB)": 66.02, "step": 20800, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.446716 }, { "epoch": 0.8913499850049269, "grad_norm": 5.019067764282227, "learning_rate": 9.236138203419396e-05, "loss": 2.483055305480957, "memory(GiB)": 66.02, "step": 20805, "token_acc": 0.4707792207792208, "train_speed(iter/s)": 1.446643 }, { "epoch": 0.8915642003341759, "grad_norm": 5.08335018157959, "learning_rate": 9.23578065930854e-05, "loss": 2.661589813232422, "memory(GiB)": 66.02, "step": 20810, "token_acc": 0.4478114478114478, "train_speed(iter/s)": 1.446676 }, { "epoch": 0.8917784156634249, "grad_norm": 4.337501525878906, "learning_rate": 9.235423038462542e-05, "loss": 2.251332664489746, "memory(GiB)": 66.02, "step": 20815, "token_acc": 0.4858490566037736, "train_speed(iter/s)": 1.44666 }, { "epoch": 0.8919926309926738, "grad_norm": 5.828814506530762, "learning_rate": 9.235065340887882e-05, "loss": 2.5227634429931642, "memory(GiB)": 66.02, "step": 20820, "token_acc": 0.4700460829493088, "train_speed(iter/s)": 1.446713 }, { "epoch": 0.8922068463219228, "grad_norm": 4.543949604034424, "learning_rate": 9.234707566591042e-05, "loss": 2.909804916381836, "memory(GiB)": 66.02, "step": 20825, "token_acc": 0.429042904290429, "train_speed(iter/s)": 1.446725 }, { "epoch": 0.8924210616511717, "grad_norm": 3.9868791103363037, "learning_rate": 9.234349715578501e-05, "loss": 2.3819196701049803, "memory(GiB)": 66.02, "step": 20830, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.44667 }, { "epoch": 0.8926352769804207, "grad_norm": 5.507164001464844, "learning_rate": 9.233991787856743e-05, "loss": 2.607301139831543, "memory(GiB)": 66.02, "step": 20835, "token_acc": 0.46616541353383456, "train_speed(iter/s)": 1.446722 }, { "epoch": 0.8928494923096697, "grad_norm": 6.382336139678955, "learning_rate": 9.233633783432252e-05, "loss": 2.650271987915039, "memory(GiB)": 66.02, "step": 20840, "token_acc": 0.4494047619047619, "train_speed(iter/s)": 1.446704 }, { "epoch": 0.8930637076389186, "grad_norm": 4.958181858062744, "learning_rate": 9.233275702311514e-05, "loss": 2.453841972351074, "memory(GiB)": 66.02, "step": 20845, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 1.446667 }, { "epoch": 0.8932779229681675, "grad_norm": 6.049378871917725, "learning_rate": 9.232917544501015e-05, "loss": 2.4255767822265626, "memory(GiB)": 66.02, "step": 20850, "token_acc": 0.553030303030303, "train_speed(iter/s)": 1.446618 }, { "epoch": 0.8934921382974166, "grad_norm": 3.888660430908203, "learning_rate": 9.232559310007245e-05, "loss": 2.5009271621704103, "memory(GiB)": 66.02, "step": 20855, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.446662 }, { "epoch": 0.8937063536266655, "grad_norm": 4.2253265380859375, "learning_rate": 9.23220099883669e-05, "loss": 2.607834815979004, "memory(GiB)": 66.02, "step": 20860, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.44668 }, { "epoch": 0.8939205689559145, "grad_norm": 3.559831142425537, "learning_rate": 9.231842610995847e-05, "loss": 2.653416633605957, "memory(GiB)": 66.02, "step": 20865, "token_acc": 0.4328767123287671, "train_speed(iter/s)": 1.44663 }, { "epoch": 0.8941347842851635, "grad_norm": 5.760776042938232, "learning_rate": 9.231484146491203e-05, "loss": 2.746925163269043, "memory(GiB)": 66.02, "step": 20870, "token_acc": 0.45136186770428016, "train_speed(iter/s)": 1.446614 }, { "epoch": 0.8943489996144124, "grad_norm": 3.818387508392334, "learning_rate": 9.231125605329256e-05, "loss": 2.5633007049560548, "memory(GiB)": 66.02, "step": 20875, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.446561 }, { "epoch": 0.8945632149436614, "grad_norm": 4.665847301483154, "learning_rate": 9.230766987516498e-05, "loss": 2.7993341445922852, "memory(GiB)": 66.02, "step": 20880, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.446619 }, { "epoch": 0.8947774302729103, "grad_norm": 4.266956329345703, "learning_rate": 9.230408293059427e-05, "loss": 2.537516784667969, "memory(GiB)": 66.02, "step": 20885, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.446641 }, { "epoch": 0.8949916456021593, "grad_norm": 5.539259433746338, "learning_rate": 9.230049521964544e-05, "loss": 2.1880725860595702, "memory(GiB)": 66.02, "step": 20890, "token_acc": 0.5635593220338984, "train_speed(iter/s)": 1.446634 }, { "epoch": 0.8952058609314083, "grad_norm": 5.82073450088501, "learning_rate": 9.229690674238343e-05, "loss": 2.5165029525756837, "memory(GiB)": 66.02, "step": 20895, "token_acc": 0.4472049689440994, "train_speed(iter/s)": 1.446635 }, { "epoch": 0.8954200762606572, "grad_norm": 4.196836471557617, "learning_rate": 9.22933174988733e-05, "loss": 2.6166725158691406, "memory(GiB)": 66.02, "step": 20900, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.446632 }, { "epoch": 0.8956342915899062, "grad_norm": 4.747260093688965, "learning_rate": 9.228972748918002e-05, "loss": 2.535319519042969, "memory(GiB)": 66.02, "step": 20905, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.446703 }, { "epoch": 0.8958485069191552, "grad_norm": 3.4472787380218506, "learning_rate": 9.228613671336867e-05, "loss": 2.454991340637207, "memory(GiB)": 66.02, "step": 20910, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.446698 }, { "epoch": 0.8960627222484041, "grad_norm": 3.922443151473999, "learning_rate": 9.228254517150429e-05, "loss": 2.7372570037841797, "memory(GiB)": 66.02, "step": 20915, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.4467 }, { "epoch": 0.896276937577653, "grad_norm": 6.497615337371826, "learning_rate": 9.227895286365194e-05, "loss": 2.6493576049804686, "memory(GiB)": 66.02, "step": 20920, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.446715 }, { "epoch": 0.8964911529069021, "grad_norm": 6.321664810180664, "learning_rate": 9.227535978987669e-05, "loss": 2.916431999206543, "memory(GiB)": 66.02, "step": 20925, "token_acc": 0.41964285714285715, "train_speed(iter/s)": 1.446792 }, { "epoch": 0.896705368236151, "grad_norm": 4.5049238204956055, "learning_rate": 9.227176595024364e-05, "loss": 2.614213562011719, "memory(GiB)": 66.02, "step": 20930, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.446823 }, { "epoch": 0.8969195835653999, "grad_norm": 6.199278831481934, "learning_rate": 9.22681713448179e-05, "loss": 2.6324741363525392, "memory(GiB)": 66.02, "step": 20935, "token_acc": 0.4732824427480916, "train_speed(iter/s)": 1.446876 }, { "epoch": 0.897133798894649, "grad_norm": 3.929837465286255, "learning_rate": 9.226457597366459e-05, "loss": 2.565410614013672, "memory(GiB)": 66.02, "step": 20940, "token_acc": 0.45918367346938777, "train_speed(iter/s)": 1.446925 }, { "epoch": 0.8973480142238979, "grad_norm": 4.632893085479736, "learning_rate": 9.226097983684883e-05, "loss": 2.125894546508789, "memory(GiB)": 66.02, "step": 20945, "token_acc": 0.5269230769230769, "train_speed(iter/s)": 1.446942 }, { "epoch": 0.8975622295531468, "grad_norm": 4.071660995483398, "learning_rate": 9.225738293443578e-05, "loss": 2.6710739135742188, "memory(GiB)": 66.02, "step": 20950, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.447016 }, { "epoch": 0.8977764448823958, "grad_norm": 4.499776840209961, "learning_rate": 9.22537852664906e-05, "loss": 2.9659128189086914, "memory(GiB)": 66.02, "step": 20955, "token_acc": 0.38181818181818183, "train_speed(iter/s)": 1.446973 }, { "epoch": 0.8979906602116448, "grad_norm": 4.787110805511475, "learning_rate": 9.225018683307846e-05, "loss": 2.8772682189941405, "memory(GiB)": 66.02, "step": 20960, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.446918 }, { "epoch": 0.8982048755408937, "grad_norm": 5.112154006958008, "learning_rate": 9.224658763426454e-05, "loss": 2.6717370986938476, "memory(GiB)": 66.02, "step": 20965, "token_acc": 0.4668769716088328, "train_speed(iter/s)": 1.446982 }, { "epoch": 0.8984190908701427, "grad_norm": 3.814579486846924, "learning_rate": 9.224298767011407e-05, "loss": 2.573265266418457, "memory(GiB)": 66.02, "step": 20970, "token_acc": 0.45390070921985815, "train_speed(iter/s)": 1.447048 }, { "epoch": 0.8986333061993916, "grad_norm": 4.138620376586914, "learning_rate": 9.223938694069227e-05, "loss": 2.760302734375, "memory(GiB)": 66.02, "step": 20975, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.447107 }, { "epoch": 0.8988475215286406, "grad_norm": 3.342966079711914, "learning_rate": 9.223578544606432e-05, "loss": 2.569215202331543, "memory(GiB)": 66.02, "step": 20980, "token_acc": 0.4768211920529801, "train_speed(iter/s)": 1.447115 }, { "epoch": 0.8990617368578896, "grad_norm": 5.396997451782227, "learning_rate": 9.223218318629551e-05, "loss": 2.7148935317993166, "memory(GiB)": 66.02, "step": 20985, "token_acc": 0.437984496124031, "train_speed(iter/s)": 1.447138 }, { "epoch": 0.8992759521871385, "grad_norm": 6.1343793869018555, "learning_rate": 9.222858016145108e-05, "loss": 2.3639318466186525, "memory(GiB)": 66.02, "step": 20990, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.447066 }, { "epoch": 0.8994901675163874, "grad_norm": 4.446055889129639, "learning_rate": 9.22249763715963e-05, "loss": 2.5438253402709963, "memory(GiB)": 66.02, "step": 20995, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.447081 }, { "epoch": 0.8997043828456365, "grad_norm": 3.5794870853424072, "learning_rate": 9.222137181679648e-05, "loss": 2.6343250274658203, "memory(GiB)": 66.02, "step": 21000, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.447092 }, { "epoch": 0.8997043828456365, "eval_loss": 2.172961950302124, "eval_runtime": 14.0706, "eval_samples_per_second": 7.107, "eval_steps_per_second": 7.107, "eval_token_acc": 0.47564469914040114, "step": 21000 }, { "epoch": 0.8999185981748854, "grad_norm": 4.638935565948486, "learning_rate": 9.221776649711689e-05, "loss": 2.503186798095703, "memory(GiB)": 66.02, "step": 21005, "token_acc": 0.4863731656184486, "train_speed(iter/s)": 1.445655 }, { "epoch": 0.9001328135041343, "grad_norm": 6.466837406158447, "learning_rate": 9.221416041262285e-05, "loss": 2.6534194946289062, "memory(GiB)": 66.02, "step": 21010, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.445751 }, { "epoch": 0.9003470288333834, "grad_norm": 3.8257768154144287, "learning_rate": 9.22105535633797e-05, "loss": 2.412603759765625, "memory(GiB)": 66.02, "step": 21015, "token_acc": 0.4890282131661442, "train_speed(iter/s)": 1.44583 }, { "epoch": 0.9005612441626323, "grad_norm": 4.2280659675598145, "learning_rate": 9.220694594945278e-05, "loss": 2.7420732498168947, "memory(GiB)": 66.02, "step": 21020, "token_acc": 0.43304843304843305, "train_speed(iter/s)": 1.445807 }, { "epoch": 0.9007754594918812, "grad_norm": 4.948489665985107, "learning_rate": 9.220333757090745e-05, "loss": 2.318073272705078, "memory(GiB)": 66.02, "step": 21025, "token_acc": 0.5164319248826291, "train_speed(iter/s)": 1.445727 }, { "epoch": 0.9009896748211302, "grad_norm": 5.825639724731445, "learning_rate": 9.219972842780907e-05, "loss": 2.5972923278808593, "memory(GiB)": 66.02, "step": 21030, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.445663 }, { "epoch": 0.9012038901503792, "grad_norm": 4.015948295593262, "learning_rate": 9.219611852022301e-05, "loss": 2.6458385467529295, "memory(GiB)": 66.02, "step": 21035, "token_acc": 0.46060606060606063, "train_speed(iter/s)": 1.445659 }, { "epoch": 0.9014181054796281, "grad_norm": 5.295448303222656, "learning_rate": 9.219250784821467e-05, "loss": 2.5657411575317384, "memory(GiB)": 66.02, "step": 21040, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.445702 }, { "epoch": 0.9016323208088771, "grad_norm": 3.9318182468414307, "learning_rate": 9.218889641184949e-05, "loss": 2.386408233642578, "memory(GiB)": 66.02, "step": 21045, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.44568 }, { "epoch": 0.901846536138126, "grad_norm": 4.602927207946777, "learning_rate": 9.218528421119287e-05, "loss": 2.8008617401123046, "memory(GiB)": 66.02, "step": 21050, "token_acc": 0.4690265486725664, "train_speed(iter/s)": 1.445743 }, { "epoch": 0.902060751467375, "grad_norm": 4.676817893981934, "learning_rate": 9.218167124631025e-05, "loss": 2.843794250488281, "memory(GiB)": 66.02, "step": 21055, "token_acc": 0.45878136200716846, "train_speed(iter/s)": 1.445844 }, { "epoch": 0.902274966796624, "grad_norm": 9.051002502441406, "learning_rate": 9.21780575172671e-05, "loss": 2.697144317626953, "memory(GiB)": 66.02, "step": 21060, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.44583 }, { "epoch": 0.9024891821258729, "grad_norm": 5.728804111480713, "learning_rate": 9.217444302412886e-05, "loss": 2.633591079711914, "memory(GiB)": 66.02, "step": 21065, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.445785 }, { "epoch": 0.9027033974551218, "grad_norm": 6.295378684997559, "learning_rate": 9.217082776696101e-05, "loss": 2.569627571105957, "memory(GiB)": 66.02, "step": 21070, "token_acc": 0.45041322314049587, "train_speed(iter/s)": 1.445853 }, { "epoch": 0.9029176127843709, "grad_norm": 6.806280612945557, "learning_rate": 9.216721174582907e-05, "loss": 2.5210811614990236, "memory(GiB)": 66.02, "step": 21075, "token_acc": 0.4735099337748344, "train_speed(iter/s)": 1.445877 }, { "epoch": 0.9031318281136198, "grad_norm": 4.269497871398926, "learning_rate": 9.216359496079851e-05, "loss": 2.9506881713867186, "memory(GiB)": 66.02, "step": 21080, "token_acc": 0.46875, "train_speed(iter/s)": 1.445862 }, { "epoch": 0.9033460434428687, "grad_norm": 4.604902267456055, "learning_rate": 9.215997741193491e-05, "loss": 2.4697088241577148, "memory(GiB)": 66.02, "step": 21085, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 1.445918 }, { "epoch": 0.9035602587721178, "grad_norm": 6.149738788604736, "learning_rate": 9.215635909930376e-05, "loss": 2.571335220336914, "memory(GiB)": 66.02, "step": 21090, "token_acc": 0.4397163120567376, "train_speed(iter/s)": 1.44592 }, { "epoch": 0.9037744741013667, "grad_norm": 4.23390531539917, "learning_rate": 9.21527400229706e-05, "loss": 2.5584247589111326, "memory(GiB)": 66.02, "step": 21095, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.445925 }, { "epoch": 0.9039886894306156, "grad_norm": 5.736716270446777, "learning_rate": 9.214912018300103e-05, "loss": 2.7630359649658205, "memory(GiB)": 66.02, "step": 21100, "token_acc": 0.4230769230769231, "train_speed(iter/s)": 1.445877 }, { "epoch": 0.9042029047598646, "grad_norm": 5.614219665527344, "learning_rate": 9.214549957946061e-05, "loss": 2.31036262512207, "memory(GiB)": 66.02, "step": 21105, "token_acc": 0.4830508474576271, "train_speed(iter/s)": 1.4459 }, { "epoch": 0.9044171200891136, "grad_norm": 6.216885089874268, "learning_rate": 9.214187821241492e-05, "loss": 2.558477592468262, "memory(GiB)": 66.02, "step": 21110, "token_acc": 0.483271375464684, "train_speed(iter/s)": 1.445946 }, { "epoch": 0.9046313354183625, "grad_norm": 3.8255560398101807, "learning_rate": 9.213825608192959e-05, "loss": 2.6169076919555665, "memory(GiB)": 66.02, "step": 21115, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.445972 }, { "epoch": 0.9048455507476115, "grad_norm": 4.41001558303833, "learning_rate": 9.213463318807021e-05, "loss": 2.5118234634399412, "memory(GiB)": 66.02, "step": 21120, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.446026 }, { "epoch": 0.9050597660768605, "grad_norm": 4.655299186706543, "learning_rate": 9.213100953090241e-05, "loss": 2.426678466796875, "memory(GiB)": 66.02, "step": 21125, "token_acc": 0.501432664756447, "train_speed(iter/s)": 1.445969 }, { "epoch": 0.9052739814061094, "grad_norm": 4.1010823249816895, "learning_rate": 9.212738511049187e-05, "loss": 2.468875503540039, "memory(GiB)": 66.02, "step": 21130, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.445903 }, { "epoch": 0.9054881967353584, "grad_norm": 3.8986330032348633, "learning_rate": 9.212375992690423e-05, "loss": 2.4613273620605467, "memory(GiB)": 66.02, "step": 21135, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.445981 }, { "epoch": 0.9057024120646073, "grad_norm": 5.170470714569092, "learning_rate": 9.212013398020516e-05, "loss": 2.6442886352539063, "memory(GiB)": 66.02, "step": 21140, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.445955 }, { "epoch": 0.9059166273938563, "grad_norm": 5.627382755279541, "learning_rate": 9.211650727046033e-05, "loss": 2.7149894714355467, "memory(GiB)": 66.02, "step": 21145, "token_acc": 0.45209580838323354, "train_speed(iter/s)": 1.445963 }, { "epoch": 0.9061308427231053, "grad_norm": 3.8872225284576416, "learning_rate": 9.211287979773548e-05, "loss": 2.677489471435547, "memory(GiB)": 66.02, "step": 21150, "token_acc": 0.476038338658147, "train_speed(iter/s)": 1.445963 }, { "epoch": 0.9063450580523542, "grad_norm": 5.081066131591797, "learning_rate": 9.210925156209632e-05, "loss": 2.3350507736206056, "memory(GiB)": 66.02, "step": 21155, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.445995 }, { "epoch": 0.9065592733816031, "grad_norm": 6.163038730621338, "learning_rate": 9.210562256360855e-05, "loss": 2.7246273040771483, "memory(GiB)": 66.02, "step": 21160, "token_acc": 0.4921259842519685, "train_speed(iter/s)": 1.446069 }, { "epoch": 0.9067734887108522, "grad_norm": 4.30300760269165, "learning_rate": 9.210199280233794e-05, "loss": 2.771466827392578, "memory(GiB)": 66.02, "step": 21165, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.446118 }, { "epoch": 0.9069877040401011, "grad_norm": 3.454042673110962, "learning_rate": 9.209836227835022e-05, "loss": 2.6793468475341795, "memory(GiB)": 66.02, "step": 21170, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.446189 }, { "epoch": 0.90720191936935, "grad_norm": 3.537275552749634, "learning_rate": 9.20947309917112e-05, "loss": 2.6368431091308593, "memory(GiB)": 66.02, "step": 21175, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.446225 }, { "epoch": 0.907416134698599, "grad_norm": 4.747765064239502, "learning_rate": 9.209109894248662e-05, "loss": 2.5317012786865236, "memory(GiB)": 66.02, "step": 21180, "token_acc": 0.4389438943894389, "train_speed(iter/s)": 1.446225 }, { "epoch": 0.907630350027848, "grad_norm": 3.6141278743743896, "learning_rate": 9.208746613074231e-05, "loss": 2.5407249450683596, "memory(GiB)": 66.02, "step": 21185, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.446258 }, { "epoch": 0.9078445653570969, "grad_norm": 4.709258079528809, "learning_rate": 9.208383255654406e-05, "loss": 2.942336654663086, "memory(GiB)": 66.02, "step": 21190, "token_acc": 0.42618384401114207, "train_speed(iter/s)": 1.446252 }, { "epoch": 0.9080587806863459, "grad_norm": 4.7240071296691895, "learning_rate": 9.20801982199577e-05, "loss": 2.747900390625, "memory(GiB)": 66.02, "step": 21195, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.446281 }, { "epoch": 0.9082729960155949, "grad_norm": 5.564262866973877, "learning_rate": 9.20765631210491e-05, "loss": 2.5454097747802735, "memory(GiB)": 66.02, "step": 21200, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.446352 }, { "epoch": 0.9084872113448439, "grad_norm": 4.614888668060303, "learning_rate": 9.207292725988405e-05, "loss": 2.7390678405761717, "memory(GiB)": 66.02, "step": 21205, "token_acc": 0.4117647058823529, "train_speed(iter/s)": 1.44636 }, { "epoch": 0.9087014266740928, "grad_norm": 5.1803154945373535, "learning_rate": 9.206929063652849e-05, "loss": 2.8002925872802735, "memory(GiB)": 66.02, "step": 21210, "token_acc": 0.42258064516129035, "train_speed(iter/s)": 1.44632 }, { "epoch": 0.9089156420033417, "grad_norm": 5.064484119415283, "learning_rate": 9.206565325104826e-05, "loss": 2.586495590209961, "memory(GiB)": 66.02, "step": 21215, "token_acc": 0.49794238683127573, "train_speed(iter/s)": 1.446354 }, { "epoch": 0.9091298573325908, "grad_norm": 4.219216823577881, "learning_rate": 9.206201510350925e-05, "loss": 2.4654720306396483, "memory(GiB)": 66.02, "step": 21220, "token_acc": 0.5202702702702703, "train_speed(iter/s)": 1.446434 }, { "epoch": 0.9093440726618397, "grad_norm": 3.962348222732544, "learning_rate": 9.205837619397738e-05, "loss": 2.737915802001953, "memory(GiB)": 66.02, "step": 21225, "token_acc": 0.4201388888888889, "train_speed(iter/s)": 1.446406 }, { "epoch": 0.9095582879910886, "grad_norm": 3.7598958015441895, "learning_rate": 9.205473652251858e-05, "loss": 2.337178039550781, "memory(GiB)": 66.02, "step": 21230, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.44643 }, { "epoch": 0.9097725033203377, "grad_norm": 4.624716758728027, "learning_rate": 9.205109608919878e-05, "loss": 2.201300048828125, "memory(GiB)": 66.02, "step": 21235, "token_acc": 0.527972027972028, "train_speed(iter/s)": 1.446495 }, { "epoch": 0.9099867186495866, "grad_norm": 4.461903095245361, "learning_rate": 9.20474548940839e-05, "loss": 2.646737289428711, "memory(GiB)": 66.02, "step": 21240, "token_acc": 0.4326923076923077, "train_speed(iter/s)": 1.446513 }, { "epoch": 0.9102009339788355, "grad_norm": 4.8072357177734375, "learning_rate": 9.204381293723996e-05, "loss": 3.1139034271240233, "memory(GiB)": 66.02, "step": 21245, "token_acc": 0.4115942028985507, "train_speed(iter/s)": 1.44649 }, { "epoch": 0.9104151493080845, "grad_norm": 5.154967308044434, "learning_rate": 9.204017021873289e-05, "loss": 2.5804229736328126, "memory(GiB)": 66.02, "step": 21250, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.446539 }, { "epoch": 0.9106293646373335, "grad_norm": 4.2552666664123535, "learning_rate": 9.20365267386287e-05, "loss": 2.462293243408203, "memory(GiB)": 66.02, "step": 21255, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.446631 }, { "epoch": 0.9108435799665824, "grad_norm": 4.117024898529053, "learning_rate": 9.203288249699341e-05, "loss": 2.6193473815917967, "memory(GiB)": 66.02, "step": 21260, "token_acc": 0.41901408450704225, "train_speed(iter/s)": 1.446677 }, { "epoch": 0.9110577952958314, "grad_norm": 4.155416488647461, "learning_rate": 9.202923749389302e-05, "loss": 2.5048810958862306, "memory(GiB)": 66.02, "step": 21265, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.446696 }, { "epoch": 0.9112720106250803, "grad_norm": 5.3150129318237305, "learning_rate": 9.202559172939355e-05, "loss": 2.827397346496582, "memory(GiB)": 66.02, "step": 21270, "token_acc": 0.444015444015444, "train_speed(iter/s)": 1.446715 }, { "epoch": 0.9114862259543293, "grad_norm": 4.588735103607178, "learning_rate": 9.202194520356108e-05, "loss": 2.4561126708984373, "memory(GiB)": 66.02, "step": 21275, "token_acc": 0.48, "train_speed(iter/s)": 1.446786 }, { "epoch": 0.9117004412835783, "grad_norm": 6.2010111808776855, "learning_rate": 9.201829791646165e-05, "loss": 2.461725044250488, "memory(GiB)": 66.02, "step": 21280, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.446777 }, { "epoch": 0.9119146566128272, "grad_norm": 5.03751277923584, "learning_rate": 9.201464986816132e-05, "loss": 2.651559829711914, "memory(GiB)": 66.02, "step": 21285, "token_acc": 0.42953020134228187, "train_speed(iter/s)": 1.446811 }, { "epoch": 0.9121288719420761, "grad_norm": 4.223094940185547, "learning_rate": 9.201100105872622e-05, "loss": 2.8223472595214845, "memory(GiB)": 66.02, "step": 21290, "token_acc": 0.44648318042813456, "train_speed(iter/s)": 1.446818 }, { "epoch": 0.9123430872713252, "grad_norm": 4.478932857513428, "learning_rate": 9.200735148822241e-05, "loss": 2.398407554626465, "memory(GiB)": 66.02, "step": 21295, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.446879 }, { "epoch": 0.9125573026005741, "grad_norm": 5.171123504638672, "learning_rate": 9.200370115671604e-05, "loss": 2.5137500762939453, "memory(GiB)": 66.02, "step": 21300, "token_acc": 0.43630573248407645, "train_speed(iter/s)": 1.446885 }, { "epoch": 0.912771517929823, "grad_norm": 7.187526226043701, "learning_rate": 9.20000500642732e-05, "loss": 2.4938621520996094, "memory(GiB)": 66.02, "step": 21305, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.446738 }, { "epoch": 0.9129857332590721, "grad_norm": 4.40461540222168, "learning_rate": 9.199639821096006e-05, "loss": 2.840202522277832, "memory(GiB)": 66.02, "step": 21310, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.446705 }, { "epoch": 0.913199948588321, "grad_norm": 4.7580742835998535, "learning_rate": 9.199274559684277e-05, "loss": 2.9117860794067383, "memory(GiB)": 66.02, "step": 21315, "token_acc": 0.4181184668989547, "train_speed(iter/s)": 1.44676 }, { "epoch": 0.9134141639175699, "grad_norm": 5.559549808502197, "learning_rate": 9.198909222198751e-05, "loss": 2.766831970214844, "memory(GiB)": 66.02, "step": 21320, "token_acc": 0.41292134831460675, "train_speed(iter/s)": 1.446827 }, { "epoch": 0.913628379246819, "grad_norm": 5.4706926345825195, "learning_rate": 9.198543808646045e-05, "loss": 2.6354143142700197, "memory(GiB)": 66.02, "step": 21325, "token_acc": 0.4980694980694981, "train_speed(iter/s)": 1.446787 }, { "epoch": 0.9138425945760679, "grad_norm": 4.737473964691162, "learning_rate": 9.19817831903278e-05, "loss": 2.52447395324707, "memory(GiB)": 66.02, "step": 21330, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.446824 }, { "epoch": 0.9140568099053168, "grad_norm": 3.8506369590759277, "learning_rate": 9.197812753365575e-05, "loss": 2.699715805053711, "memory(GiB)": 66.02, "step": 21335, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.44685 }, { "epoch": 0.9142710252345658, "grad_norm": 4.3350396156311035, "learning_rate": 9.197447111651055e-05, "loss": 2.677760124206543, "memory(GiB)": 66.02, "step": 21340, "token_acc": 0.4608695652173913, "train_speed(iter/s)": 1.446875 }, { "epoch": 0.9144852405638147, "grad_norm": 3.9501583576202393, "learning_rate": 9.197081393895843e-05, "loss": 2.7794120788574217, "memory(GiB)": 66.02, "step": 21345, "token_acc": 0.43103448275862066, "train_speed(iter/s)": 1.446925 }, { "epoch": 0.9146994558930637, "grad_norm": 4.029372692108154, "learning_rate": 9.196715600106564e-05, "loss": 2.1019163131713867, "memory(GiB)": 66.02, "step": 21350, "token_acc": 0.5378787878787878, "train_speed(iter/s)": 1.446988 }, { "epoch": 0.9149136712223127, "grad_norm": 4.662520885467529, "learning_rate": 9.196349730289845e-05, "loss": 2.593465042114258, "memory(GiB)": 66.02, "step": 21355, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.446987 }, { "epoch": 0.9151278865515616, "grad_norm": 3.5103843212127686, "learning_rate": 9.195983784452315e-05, "loss": 2.5835250854492187, "memory(GiB)": 66.02, "step": 21360, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.446917 }, { "epoch": 0.9153421018808106, "grad_norm": 5.815867900848389, "learning_rate": 9.195617762600601e-05, "loss": 2.379450798034668, "memory(GiB)": 66.02, "step": 21365, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.446917 }, { "epoch": 0.9155563172100596, "grad_norm": 3.733875036239624, "learning_rate": 9.195251664741337e-05, "loss": 2.4395851135253905, "memory(GiB)": 66.02, "step": 21370, "token_acc": 0.48923076923076925, "train_speed(iter/s)": 1.446972 }, { "epoch": 0.9157705325393085, "grad_norm": 5.514097690582275, "learning_rate": 9.194885490881153e-05, "loss": 2.9035226821899416, "memory(GiB)": 66.02, "step": 21375, "token_acc": 0.43729903536977494, "train_speed(iter/s)": 1.447042 }, { "epoch": 0.9159847478685574, "grad_norm": 5.24214506149292, "learning_rate": 9.194519241026684e-05, "loss": 2.7846353530883787, "memory(GiB)": 66.02, "step": 21380, "token_acc": 0.42207792207792205, "train_speed(iter/s)": 1.447045 }, { "epoch": 0.9161989631978065, "grad_norm": 4.489255905151367, "learning_rate": 9.194152915184564e-05, "loss": 2.8071937561035156, "memory(GiB)": 66.02, "step": 21385, "token_acc": 0.43023255813953487, "train_speed(iter/s)": 1.44711 }, { "epoch": 0.9164131785270554, "grad_norm": 3.3107810020446777, "learning_rate": 9.193786513361428e-05, "loss": 2.5622711181640625, "memory(GiB)": 66.02, "step": 21390, "token_acc": 0.48036253776435045, "train_speed(iter/s)": 1.447167 }, { "epoch": 0.9166273938563043, "grad_norm": 5.04474401473999, "learning_rate": 9.193420035563916e-05, "loss": 2.6250728607177733, "memory(GiB)": 66.02, "step": 21395, "token_acc": 0.4437299035369775, "train_speed(iter/s)": 1.447203 }, { "epoch": 0.9168416091855534, "grad_norm": 4.405237197875977, "learning_rate": 9.193053481798667e-05, "loss": 2.6750654220581054, "memory(GiB)": 66.02, "step": 21400, "token_acc": 0.42011834319526625, "train_speed(iter/s)": 1.447245 }, { "epoch": 0.9170558245148023, "grad_norm": 6.6985321044921875, "learning_rate": 9.192686852072321e-05, "loss": 2.8048528671264648, "memory(GiB)": 66.02, "step": 21405, "token_acc": 0.4553191489361702, "train_speed(iter/s)": 1.447324 }, { "epoch": 0.9172700398440512, "grad_norm": 4.291528701782227, "learning_rate": 9.192320146391518e-05, "loss": 2.6468124389648438, "memory(GiB)": 66.02, "step": 21410, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.447399 }, { "epoch": 0.9174842551733002, "grad_norm": 3.8271381855010986, "learning_rate": 9.191953364762904e-05, "loss": 2.7598636627197264, "memory(GiB)": 66.02, "step": 21415, "token_acc": 0.44814814814814813, "train_speed(iter/s)": 1.447329 }, { "epoch": 0.9176984705025492, "grad_norm": 5.290842533111572, "learning_rate": 9.191586507193122e-05, "loss": 2.4327934265136717, "memory(GiB)": 66.02, "step": 21420, "token_acc": 0.5211267605633803, "train_speed(iter/s)": 1.447226 }, { "epoch": 0.9179126858317981, "grad_norm": 4.74054479598999, "learning_rate": 9.191219573688819e-05, "loss": 2.803517532348633, "memory(GiB)": 66.02, "step": 21425, "token_acc": 0.4360655737704918, "train_speed(iter/s)": 1.447203 }, { "epoch": 0.9181269011610471, "grad_norm": 3.487269163131714, "learning_rate": 9.190852564256641e-05, "loss": 2.4647871017456056, "memory(GiB)": 66.02, "step": 21430, "token_acc": 0.4851190476190476, "train_speed(iter/s)": 1.447273 }, { "epoch": 0.918341116490296, "grad_norm": 4.492441654205322, "learning_rate": 9.190485478903238e-05, "loss": 2.5874406814575197, "memory(GiB)": 66.02, "step": 21435, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.44729 }, { "epoch": 0.918555331819545, "grad_norm": 4.885055065155029, "learning_rate": 9.190118317635259e-05, "loss": 2.6426555633544924, "memory(GiB)": 66.02, "step": 21440, "token_acc": 0.4603658536585366, "train_speed(iter/s)": 1.447309 }, { "epoch": 0.918769547148794, "grad_norm": 4.512348175048828, "learning_rate": 9.189751080459357e-05, "loss": 2.3507299423217773, "memory(GiB)": 66.02, "step": 21445, "token_acc": 0.5252918287937743, "train_speed(iter/s)": 1.447285 }, { "epoch": 0.9189837624780429, "grad_norm": 5.213791370391846, "learning_rate": 9.189383767382182e-05, "loss": 2.8945537567138673, "memory(GiB)": 66.02, "step": 21450, "token_acc": 0.4525316455696203, "train_speed(iter/s)": 1.447331 }, { "epoch": 0.9191979778072918, "grad_norm": 4.647881507873535, "learning_rate": 9.189016378410393e-05, "loss": 2.7044212341308596, "memory(GiB)": 66.02, "step": 21455, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.447384 }, { "epoch": 0.9194121931365409, "grad_norm": 4.2234392166137695, "learning_rate": 9.188648913550641e-05, "loss": 2.5259857177734375, "memory(GiB)": 66.02, "step": 21460, "token_acc": 0.4817813765182186, "train_speed(iter/s)": 1.447414 }, { "epoch": 0.9196264084657898, "grad_norm": 5.436551570892334, "learning_rate": 9.188281372809584e-05, "loss": 2.7624088287353517, "memory(GiB)": 66.02, "step": 21465, "token_acc": 0.44545454545454544, "train_speed(iter/s)": 1.447394 }, { "epoch": 0.9198406237950387, "grad_norm": 4.727115631103516, "learning_rate": 9.187913756193882e-05, "loss": 2.9652545928955076, "memory(GiB)": 66.02, "step": 21470, "token_acc": 0.4406779661016949, "train_speed(iter/s)": 1.44747 }, { "epoch": 0.9200548391242878, "grad_norm": 4.309323310852051, "learning_rate": 9.187546063710193e-05, "loss": 2.5118219375610353, "memory(GiB)": 66.02, "step": 21475, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.447523 }, { "epoch": 0.9202690544535367, "grad_norm": 3.8176162242889404, "learning_rate": 9.18717829536518e-05, "loss": 2.592088317871094, "memory(GiB)": 66.02, "step": 21480, "token_acc": 0.42902208201892744, "train_speed(iter/s)": 1.447546 }, { "epoch": 0.9204832697827856, "grad_norm": 4.594817161560059, "learning_rate": 9.186810451165502e-05, "loss": 2.4093158721923826, "memory(GiB)": 66.02, "step": 21485, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.447583 }, { "epoch": 0.9206974851120346, "grad_norm": 4.779289722442627, "learning_rate": 9.186442531117828e-05, "loss": 2.9425094604492186, "memory(GiB)": 66.02, "step": 21490, "token_acc": 0.43130990415335463, "train_speed(iter/s)": 1.447542 }, { "epoch": 0.9209117004412836, "grad_norm": 4.596897602081299, "learning_rate": 9.18607453522882e-05, "loss": 2.5867481231689453, "memory(GiB)": 66.02, "step": 21495, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.44754 }, { "epoch": 0.9211259157705325, "grad_norm": 4.5871195793151855, "learning_rate": 9.185706463505143e-05, "loss": 2.416180229187012, "memory(GiB)": 66.02, "step": 21500, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 1.447429 }, { "epoch": 0.9211259157705325, "eval_loss": 2.2096166610717773, "eval_runtime": 14.5991, "eval_samples_per_second": 6.85, "eval_steps_per_second": 6.85, "eval_token_acc": 0.46794871794871795, "step": 21500 }, { "epoch": 0.9213401310997815, "grad_norm": 4.842560291290283, "learning_rate": 9.185338315953468e-05, "loss": 2.567431831359863, "memory(GiB)": 66.02, "step": 21505, "token_acc": 0.4762790697674419, "train_speed(iter/s)": 1.445855 }, { "epoch": 0.9215543464290304, "grad_norm": 5.74186372756958, "learning_rate": 9.184970092580463e-05, "loss": 2.3399642944335937, "memory(GiB)": 66.02, "step": 21510, "token_acc": 0.47346938775510206, "train_speed(iter/s)": 1.445877 }, { "epoch": 0.9217685617582794, "grad_norm": 3.550483226776123, "learning_rate": 9.1846017933928e-05, "loss": 2.7218713760375977, "memory(GiB)": 66.02, "step": 21515, "token_acc": 0.43466666666666665, "train_speed(iter/s)": 1.44587 }, { "epoch": 0.9219827770875284, "grad_norm": 3.7454476356506348, "learning_rate": 9.184233418397148e-05, "loss": 2.5545284271240236, "memory(GiB)": 66.02, "step": 21520, "token_acc": 0.4669260700389105, "train_speed(iter/s)": 1.445847 }, { "epoch": 0.9221969924167773, "grad_norm": 4.751285076141357, "learning_rate": 9.183864967600184e-05, "loss": 2.7277864456176757, "memory(GiB)": 66.02, "step": 21525, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.445797 }, { "epoch": 0.9224112077460263, "grad_norm": 4.8122782707214355, "learning_rate": 9.183496441008581e-05, "loss": 2.2724990844726562, "memory(GiB)": 66.02, "step": 21530, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.445813 }, { "epoch": 0.9226254230752753, "grad_norm": 4.041903018951416, "learning_rate": 9.183127838629016e-05, "loss": 2.561549186706543, "memory(GiB)": 66.02, "step": 21535, "token_acc": 0.4756554307116105, "train_speed(iter/s)": 1.445824 }, { "epoch": 0.9228396384045242, "grad_norm": 4.776011943817139, "learning_rate": 9.182759160468164e-05, "loss": 2.621981620788574, "memory(GiB)": 66.02, "step": 21540, "token_acc": 0.4717741935483871, "train_speed(iter/s)": 1.445721 }, { "epoch": 0.9230538537337732, "grad_norm": 3.802180767059326, "learning_rate": 9.182390406532708e-05, "loss": 2.591941261291504, "memory(GiB)": 66.02, "step": 21545, "token_acc": 0.4699248120300752, "train_speed(iter/s)": 1.445778 }, { "epoch": 0.9232680690630222, "grad_norm": 4.105423450469971, "learning_rate": 9.182021576829326e-05, "loss": 2.516992378234863, "memory(GiB)": 66.02, "step": 21550, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.445774 }, { "epoch": 0.9234822843922711, "grad_norm": 4.851661682128906, "learning_rate": 9.1816526713647e-05, "loss": 2.7617799758911135, "memory(GiB)": 66.02, "step": 21555, "token_acc": 0.4753521126760563, "train_speed(iter/s)": 1.445757 }, { "epoch": 0.9236964997215201, "grad_norm": 4.442677021026611, "learning_rate": 9.181283690145514e-05, "loss": 2.4971900939941407, "memory(GiB)": 66.02, "step": 21560, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.445826 }, { "epoch": 0.923910715050769, "grad_norm": 3.7416703701019287, "learning_rate": 9.18091463317845e-05, "loss": 2.5543373107910154, "memory(GiB)": 66.02, "step": 21565, "token_acc": 0.4734848484848485, "train_speed(iter/s)": 1.445855 }, { "epoch": 0.924124930380018, "grad_norm": 4.9060540199279785, "learning_rate": 9.180545500470197e-05, "loss": 2.5978450775146484, "memory(GiB)": 66.02, "step": 21570, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 1.44587 }, { "epoch": 0.924339145709267, "grad_norm": 6.2970967292785645, "learning_rate": 9.18017629202744e-05, "loss": 2.798686981201172, "memory(GiB)": 66.02, "step": 21575, "token_acc": 0.4714828897338403, "train_speed(iter/s)": 1.445929 }, { "epoch": 0.9245533610385159, "grad_norm": 5.021624565124512, "learning_rate": 9.179807007856867e-05, "loss": 2.6187065124511717, "memory(GiB)": 66.02, "step": 21580, "token_acc": 0.45774647887323944, "train_speed(iter/s)": 1.446013 }, { "epoch": 0.9247675763677649, "grad_norm": 6.220231056213379, "learning_rate": 9.179437647965172e-05, "loss": 2.4467517852783205, "memory(GiB)": 66.02, "step": 21585, "token_acc": 0.4728682170542636, "train_speed(iter/s)": 1.445955 }, { "epoch": 0.9249817916970139, "grad_norm": 5.461313724517822, "learning_rate": 9.179068212359041e-05, "loss": 2.8383514404296877, "memory(GiB)": 66.02, "step": 21590, "token_acc": 0.4256198347107438, "train_speed(iter/s)": 1.445924 }, { "epoch": 0.9251960070262628, "grad_norm": 5.336484909057617, "learning_rate": 9.178698701045169e-05, "loss": 2.770197296142578, "memory(GiB)": 66.02, "step": 21595, "token_acc": 0.40594059405940597, "train_speed(iter/s)": 1.445919 }, { "epoch": 0.9254102223555117, "grad_norm": 10.905637741088867, "learning_rate": 9.178329114030251e-05, "loss": 2.51379280090332, "memory(GiB)": 66.02, "step": 21600, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.445878 }, { "epoch": 0.9256244376847608, "grad_norm": 4.201478958129883, "learning_rate": 9.177959451320981e-05, "loss": 2.6344709396362305, "memory(GiB)": 66.02, "step": 21605, "token_acc": 0.45018450184501846, "train_speed(iter/s)": 1.445953 }, { "epoch": 0.9258386530140097, "grad_norm": 5.29052209854126, "learning_rate": 9.177589712924055e-05, "loss": 2.8175079345703127, "memory(GiB)": 66.02, "step": 21610, "token_acc": 0.436950146627566, "train_speed(iter/s)": 1.445989 }, { "epoch": 0.9260528683432586, "grad_norm": 4.763354778289795, "learning_rate": 9.177219898846175e-05, "loss": 2.378049850463867, "memory(GiB)": 66.02, "step": 21615, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.445973 }, { "epoch": 0.9262670836725077, "grad_norm": 3.910534620285034, "learning_rate": 9.176850009094037e-05, "loss": 2.36367130279541, "memory(GiB)": 66.02, "step": 21620, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.446045 }, { "epoch": 0.9264812990017566, "grad_norm": 6.240082263946533, "learning_rate": 9.176480043674343e-05, "loss": 2.765350341796875, "memory(GiB)": 66.02, "step": 21625, "token_acc": 0.5073529411764706, "train_speed(iter/s)": 1.446028 }, { "epoch": 0.9266955143310055, "grad_norm": 4.599140167236328, "learning_rate": 9.176110002593794e-05, "loss": 2.3109182357788085, "memory(GiB)": 66.02, "step": 21630, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.446011 }, { "epoch": 0.9269097296602545, "grad_norm": 4.406301021575928, "learning_rate": 9.175739885859095e-05, "loss": 2.5080699920654297, "memory(GiB)": 66.02, "step": 21635, "token_acc": 0.47794117647058826, "train_speed(iter/s)": 1.445968 }, { "epoch": 0.9271239449895035, "grad_norm": 3.972447156906128, "learning_rate": 9.175369693476951e-05, "loss": 2.4904502868652343, "memory(GiB)": 66.02, "step": 21640, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.445882 }, { "epoch": 0.9273381603187524, "grad_norm": 3.660184144973755, "learning_rate": 9.17499942545407e-05, "loss": 2.281089973449707, "memory(GiB)": 66.02, "step": 21645, "token_acc": 0.5310077519379846, "train_speed(iter/s)": 1.445867 }, { "epoch": 0.9275523756480014, "grad_norm": 5.657915115356445, "learning_rate": 9.174629081797156e-05, "loss": 2.4793445587158205, "memory(GiB)": 66.02, "step": 21650, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.445868 }, { "epoch": 0.9277665909772503, "grad_norm": 4.37311315536499, "learning_rate": 9.174258662512921e-05, "loss": 2.641525459289551, "memory(GiB)": 66.02, "step": 21655, "token_acc": 0.46616541353383456, "train_speed(iter/s)": 1.445913 }, { "epoch": 0.9279808063064993, "grad_norm": 4.937234401702881, "learning_rate": 9.173888167608074e-05, "loss": 3.104800987243652, "memory(GiB)": 66.02, "step": 21660, "token_acc": 0.422680412371134, "train_speed(iter/s)": 1.445913 }, { "epoch": 0.9281950216357483, "grad_norm": 4.419463157653809, "learning_rate": 9.173517597089328e-05, "loss": 2.9346687316894533, "memory(GiB)": 66.02, "step": 21665, "token_acc": 0.41924398625429554, "train_speed(iter/s)": 1.44588 }, { "epoch": 0.9284092369649972, "grad_norm": 4.571208477020264, "learning_rate": 9.173146950963396e-05, "loss": 2.823246192932129, "memory(GiB)": 66.02, "step": 21670, "token_acc": 0.4160839160839161, "train_speed(iter/s)": 1.445909 }, { "epoch": 0.9286234522942461, "grad_norm": 4.280839920043945, "learning_rate": 9.17277622923699e-05, "loss": 2.496957206726074, "memory(GiB)": 66.02, "step": 21675, "token_acc": 0.4485981308411215, "train_speed(iter/s)": 1.445946 }, { "epoch": 0.9288376676234952, "grad_norm": 3.9365475177764893, "learning_rate": 9.172405431916831e-05, "loss": 2.644087791442871, "memory(GiB)": 66.02, "step": 21680, "token_acc": 0.42433234421364985, "train_speed(iter/s)": 1.445969 }, { "epoch": 0.9290518829527441, "grad_norm": 19.35648536682129, "learning_rate": 9.172034559009632e-05, "loss": 2.5395315170288084, "memory(GiB)": 66.02, "step": 21685, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.445946 }, { "epoch": 0.929266098281993, "grad_norm": 3.9599685668945312, "learning_rate": 9.171663610522114e-05, "loss": 2.814164924621582, "memory(GiB)": 66.02, "step": 21690, "token_acc": 0.42574257425742573, "train_speed(iter/s)": 1.445997 }, { "epoch": 0.9294803136112421, "grad_norm": 3.758268117904663, "learning_rate": 9.171292586460996e-05, "loss": 2.6677173614501952, "memory(GiB)": 66.02, "step": 21695, "token_acc": 0.4560810810810811, "train_speed(iter/s)": 1.446046 }, { "epoch": 0.929694528940491, "grad_norm": 4.4717631340026855, "learning_rate": 9.170921486833e-05, "loss": 2.7518178939819338, "memory(GiB)": 66.02, "step": 21700, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.446002 }, { "epoch": 0.9299087442697399, "grad_norm": 4.0885748863220215, "learning_rate": 9.170550311644848e-05, "loss": 2.6211666107177733, "memory(GiB)": 66.02, "step": 21705, "token_acc": 0.39144736842105265, "train_speed(iter/s)": 1.445995 }, { "epoch": 0.9301229595989889, "grad_norm": 4.248121738433838, "learning_rate": 9.170179060903265e-05, "loss": 2.4145360946655274, "memory(GiB)": 66.02, "step": 21710, "token_acc": 0.5, "train_speed(iter/s)": 1.446016 }, { "epoch": 0.9303371749282379, "grad_norm": 3.6612682342529297, "learning_rate": 9.169807734614976e-05, "loss": 2.58431453704834, "memory(GiB)": 66.02, "step": 21715, "token_acc": 0.4434782608695652, "train_speed(iter/s)": 1.446114 }, { "epoch": 0.9305513902574868, "grad_norm": 4.904190540313721, "learning_rate": 9.16943633278671e-05, "loss": 2.574121856689453, "memory(GiB)": 66.02, "step": 21720, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.446115 }, { "epoch": 0.9307656055867358, "grad_norm": 4.228428363800049, "learning_rate": 9.169064855425191e-05, "loss": 2.6321155548095705, "memory(GiB)": 66.02, "step": 21725, "token_acc": 0.4786885245901639, "train_speed(iter/s)": 1.446165 }, { "epoch": 0.9309798209159847, "grad_norm": 3.748889207839966, "learning_rate": 9.168693302537155e-05, "loss": 2.486474609375, "memory(GiB)": 66.02, "step": 21730, "token_acc": 0.4487534626038781, "train_speed(iter/s)": 1.446141 }, { "epoch": 0.9311940362452337, "grad_norm": 4.557462692260742, "learning_rate": 9.168321674129326e-05, "loss": 2.7568225860595703, "memory(GiB)": 66.02, "step": 21735, "token_acc": 0.4578313253012048, "train_speed(iter/s)": 1.44616 }, { "epoch": 0.9314082515744827, "grad_norm": 4.188618183135986, "learning_rate": 9.16794997020844e-05, "loss": 2.7182632446289063, "memory(GiB)": 66.02, "step": 21740, "token_acc": 0.46794871794871795, "train_speed(iter/s)": 1.446213 }, { "epoch": 0.9316224669037316, "grad_norm": 4.227839469909668, "learning_rate": 9.167578190781232e-05, "loss": 2.254042053222656, "memory(GiB)": 66.02, "step": 21745, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.446253 }, { "epoch": 0.9318366822329806, "grad_norm": 4.528618812561035, "learning_rate": 9.167206335854435e-05, "loss": 2.4962574005126954, "memory(GiB)": 66.02, "step": 21750, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.446287 }, { "epoch": 0.9320508975622296, "grad_norm": 3.9008610248565674, "learning_rate": 9.166834405434785e-05, "loss": 2.6359542846679687, "memory(GiB)": 66.02, "step": 21755, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.446314 }, { "epoch": 0.9322651128914785, "grad_norm": 6.49666166305542, "learning_rate": 9.166462399529021e-05, "loss": 2.1529220581054687, "memory(GiB)": 66.02, "step": 21760, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.44622 }, { "epoch": 0.9324793282207274, "grad_norm": 6.105951309204102, "learning_rate": 9.166090318143883e-05, "loss": 2.214730644226074, "memory(GiB)": 66.02, "step": 21765, "token_acc": 0.524390243902439, "train_speed(iter/s)": 1.446259 }, { "epoch": 0.9326935435499765, "grad_norm": 5.192975044250488, "learning_rate": 9.165718161286111e-05, "loss": 2.997336578369141, "memory(GiB)": 66.02, "step": 21770, "token_acc": 0.42, "train_speed(iter/s)": 1.446281 }, { "epoch": 0.9329077588792254, "grad_norm": 4.810291767120361, "learning_rate": 9.165345928962446e-05, "loss": 2.4880397796630858, "memory(GiB)": 66.02, "step": 21775, "token_acc": 0.4984520123839009, "train_speed(iter/s)": 1.4463 }, { "epoch": 0.9331219742084743, "grad_norm": 4.143027305603027, "learning_rate": 9.164973621179634e-05, "loss": 2.485639953613281, "memory(GiB)": 66.02, "step": 21780, "token_acc": 0.49809885931558934, "train_speed(iter/s)": 1.446237 }, { "epoch": 0.9333361895377233, "grad_norm": 4.098141193389893, "learning_rate": 9.164601237944415e-05, "loss": 2.7792768478393555, "memory(GiB)": 66.02, "step": 21785, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.446199 }, { "epoch": 0.9335504048669723, "grad_norm": 6.270641803741455, "learning_rate": 9.16422877926354e-05, "loss": 2.7639646530151367, "memory(GiB)": 66.02, "step": 21790, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.446166 }, { "epoch": 0.9337646201962212, "grad_norm": 4.690643787384033, "learning_rate": 9.163856245143752e-05, "loss": 2.5041481018066407, "memory(GiB)": 66.02, "step": 21795, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.446094 }, { "epoch": 0.9339788355254702, "grad_norm": 4.700472831726074, "learning_rate": 9.163483635591804e-05, "loss": 2.362548828125, "memory(GiB)": 66.02, "step": 21800, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.446031 }, { "epoch": 0.9341930508547192, "grad_norm": 4.3396759033203125, "learning_rate": 9.163110950614445e-05, "loss": 2.463043212890625, "memory(GiB)": 66.02, "step": 21805, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.446055 }, { "epoch": 0.9344072661839681, "grad_norm": 3.9828310012817383, "learning_rate": 9.162738190218424e-05, "loss": 3.0610742568969727, "memory(GiB)": 66.02, "step": 21810, "token_acc": 0.37755102040816324, "train_speed(iter/s)": 1.446132 }, { "epoch": 0.9346214815132171, "grad_norm": 3.7744083404541016, "learning_rate": 9.162365354410496e-05, "loss": 2.868472862243652, "memory(GiB)": 66.02, "step": 21815, "token_acc": 0.430635838150289, "train_speed(iter/s)": 1.446164 }, { "epoch": 0.934835696842466, "grad_norm": 3.9515891075134277, "learning_rate": 9.161992443197416e-05, "loss": 2.5441120147705076, "memory(GiB)": 66.02, "step": 21820, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.446156 }, { "epoch": 0.935049912171715, "grad_norm": 3.456465482711792, "learning_rate": 9.161619456585937e-05, "loss": 2.526171875, "memory(GiB)": 66.02, "step": 21825, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.446213 }, { "epoch": 0.935264127500964, "grad_norm": 3.599581718444824, "learning_rate": 9.161246394582818e-05, "loss": 2.8087135314941407, "memory(GiB)": 66.02, "step": 21830, "token_acc": 0.4310850439882698, "train_speed(iter/s)": 1.446215 }, { "epoch": 0.9354783428302129, "grad_norm": 5.116060733795166, "learning_rate": 9.160873257194818e-05, "loss": 2.4573360443115235, "memory(GiB)": 66.02, "step": 21835, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.446302 }, { "epoch": 0.9356925581594618, "grad_norm": 3.930436372756958, "learning_rate": 9.160500044428696e-05, "loss": 2.4878616333007812, "memory(GiB)": 66.02, "step": 21840, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.446285 }, { "epoch": 0.9359067734887109, "grad_norm": 5.1010284423828125, "learning_rate": 9.160126756291211e-05, "loss": 2.496533203125, "memory(GiB)": 66.02, "step": 21845, "token_acc": 0.44983818770226536, "train_speed(iter/s)": 1.44628 }, { "epoch": 0.9361209888179598, "grad_norm": 3.3701560497283936, "learning_rate": 9.15975339278913e-05, "loss": 2.364858627319336, "memory(GiB)": 66.02, "step": 21850, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.44625 }, { "epoch": 0.9363352041472087, "grad_norm": 3.9150311946868896, "learning_rate": 9.159379953929213e-05, "loss": 2.6046993255615236, "memory(GiB)": 66.02, "step": 21855, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.446238 }, { "epoch": 0.9365494194764578, "grad_norm": 4.447291851043701, "learning_rate": 9.159006439718226e-05, "loss": 2.5557540893554687, "memory(GiB)": 66.02, "step": 21860, "token_acc": 0.4895397489539749, "train_speed(iter/s)": 1.446226 }, { "epoch": 0.9367636348057067, "grad_norm": 4.724329948425293, "learning_rate": 9.158632850162935e-05, "loss": 2.7024940490722655, "memory(GiB)": 66.02, "step": 21865, "token_acc": 0.4527687296416938, "train_speed(iter/s)": 1.446314 }, { "epoch": 0.9369778501349556, "grad_norm": 4.627659797668457, "learning_rate": 9.158259185270108e-05, "loss": 2.3267560958862306, "memory(GiB)": 66.02, "step": 21870, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.446319 }, { "epoch": 0.9371920654642046, "grad_norm": 6.513284683227539, "learning_rate": 9.157885445046519e-05, "loss": 2.784957504272461, "memory(GiB)": 66.02, "step": 21875, "token_acc": 0.4378698224852071, "train_speed(iter/s)": 1.446313 }, { "epoch": 0.9374062807934536, "grad_norm": 4.638032913208008, "learning_rate": 9.157511629498932e-05, "loss": 2.5153589248657227, "memory(GiB)": 66.02, "step": 21880, "token_acc": 0.521551724137931, "train_speed(iter/s)": 1.446416 }, { "epoch": 0.9376204961227026, "grad_norm": 4.4747843742370605, "learning_rate": 9.157137738634122e-05, "loss": 3.0156442642211916, "memory(GiB)": 66.02, "step": 21885, "token_acc": 0.41134751773049644, "train_speed(iter/s)": 1.446529 }, { "epoch": 0.9378347114519515, "grad_norm": 4.044059753417969, "learning_rate": 9.156763772458862e-05, "loss": 2.6388324737548827, "memory(GiB)": 66.02, "step": 21890, "token_acc": 0.46905537459283386, "train_speed(iter/s)": 1.44663 }, { "epoch": 0.9380489267812004, "grad_norm": 5.029946327209473, "learning_rate": 9.156389730979928e-05, "loss": 2.612639617919922, "memory(GiB)": 66.02, "step": 21895, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.446742 }, { "epoch": 0.9382631421104495, "grad_norm": 5.445373058319092, "learning_rate": 9.156015614204094e-05, "loss": 2.3072505950927735, "memory(GiB)": 66.02, "step": 21900, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 1.446735 }, { "epoch": 0.9384773574396984, "grad_norm": 5.270296573638916, "learning_rate": 9.155641422138139e-05, "loss": 2.8074317932128907, "memory(GiB)": 66.02, "step": 21905, "token_acc": 0.44107744107744107, "train_speed(iter/s)": 1.446748 }, { "epoch": 0.9386915727689473, "grad_norm": 3.0650320053100586, "learning_rate": 9.15526715478884e-05, "loss": 2.3497812271118166, "memory(GiB)": 66.02, "step": 21910, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.446781 }, { "epoch": 0.9389057880981964, "grad_norm": 5.169348239898682, "learning_rate": 9.15489281216298e-05, "loss": 2.7509584426879883, "memory(GiB)": 66.02, "step": 21915, "token_acc": 0.40794223826714804, "train_speed(iter/s)": 1.446736 }, { "epoch": 0.9391200034274453, "grad_norm": 5.620747089385986, "learning_rate": 9.154518394267338e-05, "loss": 2.5814395904541017, "memory(GiB)": 66.02, "step": 21920, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.446671 }, { "epoch": 0.9393342187566942, "grad_norm": 6.354038715362549, "learning_rate": 9.154143901108699e-05, "loss": 2.5864822387695314, "memory(GiB)": 66.02, "step": 21925, "token_acc": 0.46887966804979253, "train_speed(iter/s)": 1.446552 }, { "epoch": 0.9395484340859432, "grad_norm": 4.962425708770752, "learning_rate": 9.153769332693847e-05, "loss": 2.545681953430176, "memory(GiB)": 66.02, "step": 21930, "token_acc": 0.44485294117647056, "train_speed(iter/s)": 1.446519 }, { "epoch": 0.9397626494151922, "grad_norm": 5.091123580932617, "learning_rate": 9.153394689029566e-05, "loss": 2.5958648681640626, "memory(GiB)": 66.02, "step": 21935, "token_acc": 0.44483985765124556, "train_speed(iter/s)": 1.44649 }, { "epoch": 0.9399768647444411, "grad_norm": 4.598596572875977, "learning_rate": 9.153019970122643e-05, "loss": 2.4702308654785154, "memory(GiB)": 66.02, "step": 21940, "token_acc": 0.44964028776978415, "train_speed(iter/s)": 1.446473 }, { "epoch": 0.9401910800736901, "grad_norm": 5.330958843231201, "learning_rate": 9.15264517597987e-05, "loss": 2.559444808959961, "memory(GiB)": 66.02, "step": 21945, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.446481 }, { "epoch": 0.940405295402939, "grad_norm": 5.53353214263916, "learning_rate": 9.152270306608031e-05, "loss": 2.5792699813842774, "memory(GiB)": 66.02, "step": 21950, "token_acc": 0.4540229885057471, "train_speed(iter/s)": 1.446447 }, { "epoch": 0.940619510732188, "grad_norm": 3.1102919578552246, "learning_rate": 9.151895362013922e-05, "loss": 2.269911193847656, "memory(GiB)": 66.02, "step": 21955, "token_acc": 0.4921875, "train_speed(iter/s)": 1.446492 }, { "epoch": 0.940833726061437, "grad_norm": 4.358954906463623, "learning_rate": 9.151520342204334e-05, "loss": 2.7471052169799806, "memory(GiB)": 66.02, "step": 21960, "token_acc": 0.4570552147239264, "train_speed(iter/s)": 1.446576 }, { "epoch": 0.9410479413906859, "grad_norm": 3.5152010917663574, "learning_rate": 9.151145247186061e-05, "loss": 2.645145797729492, "memory(GiB)": 66.02, "step": 21965, "token_acc": 0.43934426229508194, "train_speed(iter/s)": 1.446525 }, { "epoch": 0.9412621567199349, "grad_norm": 3.773735284805298, "learning_rate": 9.150770076965895e-05, "loss": 2.8553417205810545, "memory(GiB)": 66.02, "step": 21970, "token_acc": 0.3977272727272727, "train_speed(iter/s)": 1.446521 }, { "epoch": 0.9414763720491839, "grad_norm": 3.603227376937866, "learning_rate": 9.15039483155064e-05, "loss": 2.641327667236328, "memory(GiB)": 66.02, "step": 21975, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.446526 }, { "epoch": 0.9416905873784328, "grad_norm": 4.119307518005371, "learning_rate": 9.150019510947086e-05, "loss": 2.4307092666625976, "memory(GiB)": 66.02, "step": 21980, "token_acc": 0.476038338658147, "train_speed(iter/s)": 1.446412 }, { "epoch": 0.9419048027076817, "grad_norm": 5.00947904586792, "learning_rate": 9.149644115162035e-05, "loss": 2.4045654296875, "memory(GiB)": 66.02, "step": 21985, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.44629 }, { "epoch": 0.9421190180369308, "grad_norm": 4.431349277496338, "learning_rate": 9.149268644202289e-05, "loss": 2.8693355560302733, "memory(GiB)": 66.02, "step": 21990, "token_acc": 0.4368932038834951, "train_speed(iter/s)": 1.446285 }, { "epoch": 0.9423332333661797, "grad_norm": 4.919655799865723, "learning_rate": 9.148893098074649e-05, "loss": 2.2277074813842774, "memory(GiB)": 66.02, "step": 21995, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.446275 }, { "epoch": 0.9425474486954286, "grad_norm": 4.579711437225342, "learning_rate": 9.148517476785918e-05, "loss": 2.4744726181030274, "memory(GiB)": 66.02, "step": 22000, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.446332 }, { "epoch": 0.9425474486954286, "eval_loss": 2.200505256652832, "eval_runtime": 15.1137, "eval_samples_per_second": 6.617, "eval_steps_per_second": 6.617, "eval_token_acc": 0.46959896507115134, "step": 22000 }, { "epoch": 0.9427616640246776, "grad_norm": 3.9472317695617676, "learning_rate": 9.148141780342903e-05, "loss": 2.727242279052734, "memory(GiB)": 66.02, "step": 22005, "token_acc": 0.47702834799608995, "train_speed(iter/s)": 1.444886 }, { "epoch": 0.9429758793539266, "grad_norm": 7.040859699249268, "learning_rate": 9.147766008752407e-05, "loss": 2.6169862747192383, "memory(GiB)": 66.02, "step": 22010, "token_acc": 0.47653429602888087, "train_speed(iter/s)": 1.444895 }, { "epoch": 0.9431900946831755, "grad_norm": 5.663588523864746, "learning_rate": 9.14739016202124e-05, "loss": 3.0661840438842773, "memory(GiB)": 66.02, "step": 22015, "token_acc": 0.39365079365079364, "train_speed(iter/s)": 1.444907 }, { "epoch": 0.9434043100124245, "grad_norm": 4.596031665802002, "learning_rate": 9.14701424015621e-05, "loss": 2.503927230834961, "memory(GiB)": 66.02, "step": 22020, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.444992 }, { "epoch": 0.9436185253416735, "grad_norm": 4.302864074707031, "learning_rate": 9.146638243164125e-05, "loss": 2.3167238235473633, "memory(GiB)": 66.02, "step": 22025, "token_acc": 0.5040650406504065, "train_speed(iter/s)": 1.445031 }, { "epoch": 0.9438327406709224, "grad_norm": 3.596513032913208, "learning_rate": 9.1462621710518e-05, "loss": 2.5464473724365235, "memory(GiB)": 66.02, "step": 22030, "token_acc": 0.44074074074074077, "train_speed(iter/s)": 1.445038 }, { "epoch": 0.9440469560001714, "grad_norm": 4.43658447265625, "learning_rate": 9.145886023826044e-05, "loss": 2.6728584289550783, "memory(GiB)": 66.02, "step": 22035, "token_acc": 0.475, "train_speed(iter/s)": 1.445066 }, { "epoch": 0.9442611713294203, "grad_norm": 3.598688840866089, "learning_rate": 9.145509801493677e-05, "loss": 2.5031652450561523, "memory(GiB)": 66.02, "step": 22040, "token_acc": 0.44061302681992337, "train_speed(iter/s)": 1.445092 }, { "epoch": 0.9444753866586693, "grad_norm": 5.328486919403076, "learning_rate": 9.145133504061509e-05, "loss": 2.5981777191162108, "memory(GiB)": 66.02, "step": 22045, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.445123 }, { "epoch": 0.9446896019879183, "grad_norm": 4.462075710296631, "learning_rate": 9.14475713153636e-05, "loss": 2.7924291610717775, "memory(GiB)": 66.02, "step": 22050, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.445136 }, { "epoch": 0.9449038173171672, "grad_norm": 3.7059483528137207, "learning_rate": 9.144380683925044e-05, "loss": 2.7857032775878907, "memory(GiB)": 66.02, "step": 22055, "token_acc": 0.4758364312267658, "train_speed(iter/s)": 1.445128 }, { "epoch": 0.9451180326464161, "grad_norm": 4.5063276290893555, "learning_rate": 9.144004161234388e-05, "loss": 2.778232955932617, "memory(GiB)": 66.02, "step": 22060, "token_acc": 0.4548611111111111, "train_speed(iter/s)": 1.445168 }, { "epoch": 0.9453322479756652, "grad_norm": 5.610306262969971, "learning_rate": 9.143627563471209e-05, "loss": 2.6039690017700194, "memory(GiB)": 66.02, "step": 22065, "token_acc": 0.44876325088339225, "train_speed(iter/s)": 1.445047 }, { "epoch": 0.9455464633049141, "grad_norm": 3.989807605743408, "learning_rate": 9.143250890642327e-05, "loss": 2.428289031982422, "memory(GiB)": 66.02, "step": 22070, "token_acc": 0.5278810408921933, "train_speed(iter/s)": 1.44505 }, { "epoch": 0.945760678634163, "grad_norm": 4.690921306610107, "learning_rate": 9.142874142754572e-05, "loss": 2.592865562438965, "memory(GiB)": 66.02, "step": 22075, "token_acc": 0.4627831715210356, "train_speed(iter/s)": 1.44508 }, { "epoch": 0.9459748939634121, "grad_norm": 3.8750624656677246, "learning_rate": 9.142497319814764e-05, "loss": 2.4155059814453126, "memory(GiB)": 66.02, "step": 22080, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.445113 }, { "epoch": 0.946189109292661, "grad_norm": 5.399001598358154, "learning_rate": 9.142120421829729e-05, "loss": 2.9077178955078127, "memory(GiB)": 66.02, "step": 22085, "token_acc": 0.4295774647887324, "train_speed(iter/s)": 1.445153 }, { "epoch": 0.9464033246219099, "grad_norm": 4.626400470733643, "learning_rate": 9.141743448806301e-05, "loss": 2.6723724365234376, "memory(GiB)": 66.02, "step": 22090, "token_acc": 0.4375, "train_speed(iter/s)": 1.445083 }, { "epoch": 0.9466175399511589, "grad_norm": 4.274816989898682, "learning_rate": 9.141366400751301e-05, "loss": 2.886031723022461, "memory(GiB)": 66.02, "step": 22095, "token_acc": 0.4578313253012048, "train_speed(iter/s)": 1.44517 }, { "epoch": 0.9468317552804079, "grad_norm": 5.222473621368408, "learning_rate": 9.140989277671567e-05, "loss": 2.282111930847168, "memory(GiB)": 66.02, "step": 22100, "token_acc": 0.5101351351351351, "train_speed(iter/s)": 1.44522 }, { "epoch": 0.9470459706096568, "grad_norm": 4.098766803741455, "learning_rate": 9.140612079573927e-05, "loss": 2.4320705413818358, "memory(GiB)": 66.02, "step": 22105, "token_acc": 0.5193798449612403, "train_speed(iter/s)": 1.445214 }, { "epoch": 0.9472601859389058, "grad_norm": 6.469631195068359, "learning_rate": 9.140234806465214e-05, "loss": 2.3476051330566405, "memory(GiB)": 66.02, "step": 22110, "token_acc": 0.5482456140350878, "train_speed(iter/s)": 1.445237 }, { "epoch": 0.9474744012681547, "grad_norm": 3.9835705757141113, "learning_rate": 9.139857458352263e-05, "loss": 2.6543209075927736, "memory(GiB)": 66.02, "step": 22115, "token_acc": 0.43506493506493504, "train_speed(iter/s)": 1.445175 }, { "epoch": 0.9476886165974037, "grad_norm": 5.18387508392334, "learning_rate": 9.139480035241912e-05, "loss": 2.807154083251953, "memory(GiB)": 66.02, "step": 22120, "token_acc": 0.41580756013745707, "train_speed(iter/s)": 1.445154 }, { "epoch": 0.9479028319266527, "grad_norm": 3.8181638717651367, "learning_rate": 9.139102537140996e-05, "loss": 2.4215265274047852, "memory(GiB)": 66.02, "step": 22125, "token_acc": 0.5152542372881356, "train_speed(iter/s)": 1.445089 }, { "epoch": 0.9481170472559016, "grad_norm": 5.602406024932861, "learning_rate": 9.138724964056355e-05, "loss": 2.3998691558837892, "memory(GiB)": 66.02, "step": 22130, "token_acc": 0.4568345323741007, "train_speed(iter/s)": 1.445098 }, { "epoch": 0.9483312625851505, "grad_norm": 3.582289934158325, "learning_rate": 9.13834731599483e-05, "loss": 2.4179683685302735, "memory(GiB)": 66.02, "step": 22135, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.445197 }, { "epoch": 0.9485454779143996, "grad_norm": 5.022667407989502, "learning_rate": 9.13796959296326e-05, "loss": 2.2809337615966796, "memory(GiB)": 66.02, "step": 22140, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.445135 }, { "epoch": 0.9487596932436485, "grad_norm": 4.084569931030273, "learning_rate": 9.137591794968489e-05, "loss": 2.864162826538086, "memory(GiB)": 66.02, "step": 22145, "token_acc": 0.44476744186046513, "train_speed(iter/s)": 1.445142 }, { "epoch": 0.9489739085728974, "grad_norm": 4.691052436828613, "learning_rate": 9.137213922017363e-05, "loss": 2.634572982788086, "memory(GiB)": 66.02, "step": 22150, "token_acc": 0.5458515283842795, "train_speed(iter/s)": 1.445185 }, { "epoch": 0.9491881239021465, "grad_norm": 4.898622512817383, "learning_rate": 9.136835974116724e-05, "loss": 2.829137992858887, "memory(GiB)": 66.02, "step": 22155, "token_acc": 0.47115384615384615, "train_speed(iter/s)": 1.445161 }, { "epoch": 0.9494023392313954, "grad_norm": 4.631893634796143, "learning_rate": 9.136457951273423e-05, "loss": 2.6244916915893555, "memory(GiB)": 66.02, "step": 22160, "token_acc": 0.4595375722543353, "train_speed(iter/s)": 1.445163 }, { "epoch": 0.9496165545606443, "grad_norm": 5.193511962890625, "learning_rate": 9.136079853494304e-05, "loss": 2.705002212524414, "memory(GiB)": 66.02, "step": 22165, "token_acc": 0.48249027237354086, "train_speed(iter/s)": 1.445204 }, { "epoch": 0.9498307698898933, "grad_norm": 4.0157880783081055, "learning_rate": 9.135701680786218e-05, "loss": 2.684200477600098, "memory(GiB)": 66.02, "step": 22170, "token_acc": 0.4194528875379939, "train_speed(iter/s)": 1.445195 }, { "epoch": 0.9500449852191423, "grad_norm": 5.729152679443359, "learning_rate": 9.135323433156018e-05, "loss": 2.731343460083008, "memory(GiB)": 66.02, "step": 22175, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.445248 }, { "epoch": 0.9502592005483912, "grad_norm": 5.193756580352783, "learning_rate": 9.134945110610554e-05, "loss": 2.280427169799805, "memory(GiB)": 66.02, "step": 22180, "token_acc": 0.5175438596491229, "train_speed(iter/s)": 1.445218 }, { "epoch": 0.9504734158776402, "grad_norm": 4.542730331420898, "learning_rate": 9.134566713156679e-05, "loss": 2.470453453063965, "memory(GiB)": 66.02, "step": 22185, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.445295 }, { "epoch": 0.9506876312068891, "grad_norm": 4.4969329833984375, "learning_rate": 9.134188240801251e-05, "loss": 2.6251392364501953, "memory(GiB)": 66.02, "step": 22190, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.445361 }, { "epoch": 0.9509018465361381, "grad_norm": 14.010857582092285, "learning_rate": 9.133809693551125e-05, "loss": 2.4005346298217773, "memory(GiB)": 66.02, "step": 22195, "token_acc": 0.46886446886446886, "train_speed(iter/s)": 1.445393 }, { "epoch": 0.9511160618653871, "grad_norm": 3.6862452030181885, "learning_rate": 9.133431071413158e-05, "loss": 2.5876438140869142, "memory(GiB)": 66.02, "step": 22200, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.445356 }, { "epoch": 0.951330277194636, "grad_norm": 4.559037685394287, "learning_rate": 9.13305237439421e-05, "loss": 2.7423818588256834, "memory(GiB)": 66.02, "step": 22205, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.445339 }, { "epoch": 0.951544492523885, "grad_norm": 5.095754623413086, "learning_rate": 9.13267360250114e-05, "loss": 2.498956871032715, "memory(GiB)": 66.02, "step": 22210, "token_acc": 0.47854785478547857, "train_speed(iter/s)": 1.445377 }, { "epoch": 0.951758707853134, "grad_norm": 4.862742900848389, "learning_rate": 9.132294755740814e-05, "loss": 2.6450281143188477, "memory(GiB)": 66.02, "step": 22215, "token_acc": 0.4580152671755725, "train_speed(iter/s)": 1.445392 }, { "epoch": 0.9519729231823829, "grad_norm": 5.515480995178223, "learning_rate": 9.131915834120088e-05, "loss": 2.463967514038086, "memory(GiB)": 66.02, "step": 22220, "token_acc": 0.47843137254901963, "train_speed(iter/s)": 1.445391 }, { "epoch": 0.952187138511632, "grad_norm": 4.148614883422852, "learning_rate": 9.131536837645833e-05, "loss": 2.6365741729736327, "memory(GiB)": 66.02, "step": 22225, "token_acc": 0.4367469879518072, "train_speed(iter/s)": 1.445428 }, { "epoch": 0.9524013538408809, "grad_norm": 4.757924556732178, "learning_rate": 9.131157766324912e-05, "loss": 2.507137107849121, "memory(GiB)": 66.02, "step": 22230, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.445467 }, { "epoch": 0.9526155691701298, "grad_norm": 4.670322895050049, "learning_rate": 9.130778620164193e-05, "loss": 2.5537620544433595, "memory(GiB)": 66.02, "step": 22235, "token_acc": 0.4395973154362416, "train_speed(iter/s)": 1.445559 }, { "epoch": 0.9528297844993788, "grad_norm": 4.049413204193115, "learning_rate": 9.130399399170544e-05, "loss": 2.618539237976074, "memory(GiB)": 66.02, "step": 22240, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.445635 }, { "epoch": 0.9530439998286278, "grad_norm": 5.538731098175049, "learning_rate": 9.130020103350836e-05, "loss": 2.821795654296875, "memory(GiB)": 66.02, "step": 22245, "token_acc": 0.42592592592592593, "train_speed(iter/s)": 1.445661 }, { "epoch": 0.9532582151578767, "grad_norm": 4.345365047454834, "learning_rate": 9.12964073271194e-05, "loss": 2.3884326934814455, "memory(GiB)": 66.02, "step": 22250, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.445637 }, { "epoch": 0.9534724304871257, "grad_norm": 4.030776500701904, "learning_rate": 9.129261287260726e-05, "loss": 2.64312686920166, "memory(GiB)": 66.02, "step": 22255, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.44562 }, { "epoch": 0.9536866458163746, "grad_norm": 5.273192405700684, "learning_rate": 9.128881767004072e-05, "loss": 2.433702278137207, "memory(GiB)": 69.34, "step": 22260, "token_acc": 0.5107296137339056, "train_speed(iter/s)": 1.445562 }, { "epoch": 0.9539008611456236, "grad_norm": 3.850247383117676, "learning_rate": 9.12850217194885e-05, "loss": 2.754536819458008, "memory(GiB)": 69.34, "step": 22265, "token_acc": 0.4146341463414634, "train_speed(iter/s)": 1.44558 }, { "epoch": 0.9541150764748726, "grad_norm": 5.440518856048584, "learning_rate": 9.12812250210194e-05, "loss": 2.704586410522461, "memory(GiB)": 69.34, "step": 22270, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.445585 }, { "epoch": 0.9543292918041215, "grad_norm": 3.62768816947937, "learning_rate": 9.127742757470217e-05, "loss": 2.175482749938965, "memory(GiB)": 69.34, "step": 22275, "token_acc": 0.5115384615384615, "train_speed(iter/s)": 1.445628 }, { "epoch": 0.9545435071333704, "grad_norm": 4.364960193634033, "learning_rate": 9.127362938060563e-05, "loss": 2.784740447998047, "memory(GiB)": 69.34, "step": 22280, "token_acc": 0.4266666666666667, "train_speed(iter/s)": 1.445673 }, { "epoch": 0.9547577224626195, "grad_norm": 3.407343864440918, "learning_rate": 9.126983043879857e-05, "loss": 2.602976608276367, "memory(GiB)": 69.34, "step": 22285, "token_acc": 0.4887459807073955, "train_speed(iter/s)": 1.445699 }, { "epoch": 0.9549719377918684, "grad_norm": 4.121309280395508, "learning_rate": 9.126603074934982e-05, "loss": 2.6240062713623047, "memory(GiB)": 69.34, "step": 22290, "token_acc": 0.4453125, "train_speed(iter/s)": 1.44571 }, { "epoch": 0.9551861531211173, "grad_norm": 5.251585960388184, "learning_rate": 9.126223031232822e-05, "loss": 2.4973318099975588, "memory(GiB)": 69.34, "step": 22295, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.445701 }, { "epoch": 0.9554003684503664, "grad_norm": 3.9109556674957275, "learning_rate": 9.125842912780259e-05, "loss": 2.4879005432128904, "memory(GiB)": 69.34, "step": 22300, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.445701 }, { "epoch": 0.9556145837796153, "grad_norm": 4.672306537628174, "learning_rate": 9.125462719584183e-05, "loss": 2.45849723815918, "memory(GiB)": 69.34, "step": 22305, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.445789 }, { "epoch": 0.9558287991088642, "grad_norm": 5.2371134757995605, "learning_rate": 9.125082451651479e-05, "loss": 2.801739311218262, "memory(GiB)": 69.34, "step": 22310, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.445817 }, { "epoch": 0.9560430144381132, "grad_norm": 4.777816295623779, "learning_rate": 9.124702108989036e-05, "loss": 2.7504138946533203, "memory(GiB)": 69.34, "step": 22315, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.445853 }, { "epoch": 0.9562572297673622, "grad_norm": 4.661057472229004, "learning_rate": 9.124321691603747e-05, "loss": 2.6545795440673827, "memory(GiB)": 69.34, "step": 22320, "token_acc": 0.4830188679245283, "train_speed(iter/s)": 1.445885 }, { "epoch": 0.9564714450966111, "grad_norm": 4.55148983001709, "learning_rate": 9.123941199502501e-05, "loss": 2.667533302307129, "memory(GiB)": 69.34, "step": 22325, "token_acc": 0.4143835616438356, "train_speed(iter/s)": 1.445795 }, { "epoch": 0.9566856604258601, "grad_norm": 3.683835983276367, "learning_rate": 9.12356063269219e-05, "loss": 2.6578132629394533, "memory(GiB)": 69.34, "step": 22330, "token_acc": 0.4316109422492401, "train_speed(iter/s)": 1.445744 }, { "epoch": 0.956899875755109, "grad_norm": 5.03511381149292, "learning_rate": 9.123179991179711e-05, "loss": 2.63839111328125, "memory(GiB)": 69.34, "step": 22335, "token_acc": 0.4630225080385852, "train_speed(iter/s)": 1.445709 }, { "epoch": 0.957114091084358, "grad_norm": 5.707028388977051, "learning_rate": 9.122799274971959e-05, "loss": 2.4655242919921876, "memory(GiB)": 69.34, "step": 22340, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.445644 }, { "epoch": 0.957328306413607, "grad_norm": 5.44482421875, "learning_rate": 9.12241848407583e-05, "loss": 2.7348758697509767, "memory(GiB)": 69.34, "step": 22345, "token_acc": 0.42953020134228187, "train_speed(iter/s)": 1.44569 }, { "epoch": 0.9575425217428559, "grad_norm": 5.052228927612305, "learning_rate": 9.122037618498225e-05, "loss": 2.2934436798095703, "memory(GiB)": 69.34, "step": 22350, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.445688 }, { "epoch": 0.9577567370721048, "grad_norm": 4.518186092376709, "learning_rate": 9.12165667824604e-05, "loss": 2.6791284561157225, "memory(GiB)": 69.34, "step": 22355, "token_acc": 0.4451219512195122, "train_speed(iter/s)": 1.445751 }, { "epoch": 0.9579709524013539, "grad_norm": 3.880202054977417, "learning_rate": 9.121275663326178e-05, "loss": 2.5310874938964845, "memory(GiB)": 69.34, "step": 22360, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.445707 }, { "epoch": 0.9581851677306028, "grad_norm": 4.373251438140869, "learning_rate": 9.120894573745542e-05, "loss": 2.5756412506103517, "memory(GiB)": 69.34, "step": 22365, "token_acc": 0.44339622641509435, "train_speed(iter/s)": 1.445716 }, { "epoch": 0.9583993830598517, "grad_norm": 4.539257049560547, "learning_rate": 9.120513409511033e-05, "loss": 2.525067138671875, "memory(GiB)": 69.34, "step": 22370, "token_acc": 0.5210727969348659, "train_speed(iter/s)": 1.445738 }, { "epoch": 0.9586135983891008, "grad_norm": 4.783735275268555, "learning_rate": 9.12013217062956e-05, "loss": 2.5327564239501954, "memory(GiB)": 69.34, "step": 22375, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.445749 }, { "epoch": 0.9588278137183497, "grad_norm": 4.265169620513916, "learning_rate": 9.119750857108027e-05, "loss": 2.7773130416870115, "memory(GiB)": 69.34, "step": 22380, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.445732 }, { "epoch": 0.9590420290475986, "grad_norm": 5.149278163909912, "learning_rate": 9.119369468953344e-05, "loss": 2.4701839447021485, "memory(GiB)": 75.22, "step": 22385, "token_acc": 0.47019867549668876, "train_speed(iter/s)": 1.445693 }, { "epoch": 0.9592562443768476, "grad_norm": 3.8407633304595947, "learning_rate": 9.118988006172418e-05, "loss": 2.675972747802734, "memory(GiB)": 75.22, "step": 22390, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.445635 }, { "epoch": 0.9594704597060966, "grad_norm": 5.794604301452637, "learning_rate": 9.11860646877216e-05, "loss": 2.8507226943969726, "memory(GiB)": 75.22, "step": 22395, "token_acc": 0.4098939929328622, "train_speed(iter/s)": 1.445606 }, { "epoch": 0.9596846750353455, "grad_norm": 4.639246463775635, "learning_rate": 9.118224856759482e-05, "loss": 2.7039623260498047, "memory(GiB)": 75.22, "step": 22400, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.4456 }, { "epoch": 0.9598988903645945, "grad_norm": 4.491499423980713, "learning_rate": 9.117843170141297e-05, "loss": 2.843472480773926, "memory(GiB)": 75.22, "step": 22405, "token_acc": 0.4267515923566879, "train_speed(iter/s)": 1.445542 }, { "epoch": 0.9601131056938434, "grad_norm": 4.32130241394043, "learning_rate": 9.117461408924521e-05, "loss": 2.566757392883301, "memory(GiB)": 75.22, "step": 22410, "token_acc": 0.47038327526132406, "train_speed(iter/s)": 1.445491 }, { "epoch": 0.9603273210230924, "grad_norm": 4.164856910705566, "learning_rate": 9.11707957311607e-05, "loss": 2.1697620391845702, "memory(GiB)": 75.22, "step": 22415, "token_acc": 0.5335570469798657, "train_speed(iter/s)": 1.445409 }, { "epoch": 0.9605415363523414, "grad_norm": 3.553744077682495, "learning_rate": 9.116697662722859e-05, "loss": 2.7220294952392576, "memory(GiB)": 75.22, "step": 22420, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.44548 }, { "epoch": 0.9607557516815903, "grad_norm": 4.6254191398620605, "learning_rate": 9.116315677751807e-05, "loss": 2.646860122680664, "memory(GiB)": 75.22, "step": 22425, "token_acc": 0.478125, "train_speed(iter/s)": 1.445515 }, { "epoch": 0.9609699670108393, "grad_norm": 3.833604335784912, "learning_rate": 9.115933618209838e-05, "loss": 2.6391712188720704, "memory(GiB)": 75.22, "step": 22430, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.445509 }, { "epoch": 0.9611841823400883, "grad_norm": 3.3966407775878906, "learning_rate": 9.115551484103869e-05, "loss": 2.586381721496582, "memory(GiB)": 75.22, "step": 22435, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.445533 }, { "epoch": 0.9613983976693372, "grad_norm": 5.000818252563477, "learning_rate": 9.115169275440825e-05, "loss": 2.6876317977905275, "memory(GiB)": 75.22, "step": 22440, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 1.445543 }, { "epoch": 0.9616126129985861, "grad_norm": 3.923110246658325, "learning_rate": 9.114786992227629e-05, "loss": 2.6653194427490234, "memory(GiB)": 75.22, "step": 22445, "token_acc": 0.4608150470219436, "train_speed(iter/s)": 1.445546 }, { "epoch": 0.9618268283278352, "grad_norm": 3.779022455215454, "learning_rate": 9.114404634471205e-05, "loss": 2.6949960708618166, "memory(GiB)": 75.22, "step": 22450, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.445523 }, { "epoch": 0.9620410436570841, "grad_norm": 4.776429176330566, "learning_rate": 9.114022202178483e-05, "loss": 2.4824108123779296, "memory(GiB)": 75.22, "step": 22455, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.44558 }, { "epoch": 0.962255258986333, "grad_norm": 5.741626739501953, "learning_rate": 9.113639695356388e-05, "loss": 2.4363405227661135, "memory(GiB)": 75.22, "step": 22460, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.445542 }, { "epoch": 0.962469474315582, "grad_norm": 4.263327121734619, "learning_rate": 9.113257114011852e-05, "loss": 2.2999876022338865, "memory(GiB)": 75.22, "step": 22465, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.44548 }, { "epoch": 0.962683689644831, "grad_norm": 5.492435932159424, "learning_rate": 9.112874458151805e-05, "loss": 2.3961204528808593, "memory(GiB)": 75.22, "step": 22470, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.445533 }, { "epoch": 0.9628979049740799, "grad_norm": 4.865903854370117, "learning_rate": 9.112491727783179e-05, "loss": 2.5525184631347657, "memory(GiB)": 75.22, "step": 22475, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.44555 }, { "epoch": 0.9631121203033289, "grad_norm": 4.546634197235107, "learning_rate": 9.112108922912907e-05, "loss": 2.5844451904296877, "memory(GiB)": 75.22, "step": 22480, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.445618 }, { "epoch": 0.9633263356325779, "grad_norm": 4.71919584274292, "learning_rate": 9.111726043547926e-05, "loss": 2.9415866851806642, "memory(GiB)": 75.22, "step": 22485, "token_acc": 0.4155844155844156, "train_speed(iter/s)": 1.44567 }, { "epoch": 0.9635405509618268, "grad_norm": 5.9818034172058105, "learning_rate": 9.111343089695168e-05, "loss": 2.68177490234375, "memory(GiB)": 75.22, "step": 22490, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.44573 }, { "epoch": 0.9637547662910758, "grad_norm": 5.77147102355957, "learning_rate": 9.110960061361575e-05, "loss": 2.570528984069824, "memory(GiB)": 75.22, "step": 22495, "token_acc": 0.525, "train_speed(iter/s)": 1.445747 }, { "epoch": 0.9639689816203247, "grad_norm": 7.428699016571045, "learning_rate": 9.110576958554085e-05, "loss": 2.5904220581054687, "memory(GiB)": 77.56, "step": 22500, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.445607 }, { "epoch": 0.9639689816203247, "eval_loss": 2.4378061294555664, "eval_runtime": 12.8503, "eval_samples_per_second": 7.782, "eval_steps_per_second": 7.782, "eval_token_acc": 0.4587280108254398, "step": 22500 }, { "epoch": 0.9641831969495737, "grad_norm": 4.519054412841797, "learning_rate": 9.110193781279635e-05, "loss": 3.037515640258789, "memory(GiB)": 77.56, "step": 22505, "token_acc": 0.45545545545545546, "train_speed(iter/s)": 1.444433 }, { "epoch": 0.9643974122788227, "grad_norm": 4.015757083892822, "learning_rate": 9.109810529545171e-05, "loss": 2.4090030670166014, "memory(GiB)": 77.56, "step": 22510, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.44448 }, { "epoch": 0.9646116276080716, "grad_norm": 3.304832935333252, "learning_rate": 9.109427203357632e-05, "loss": 2.8798879623413085, "memory(GiB)": 77.56, "step": 22515, "token_acc": 0.4437869822485207, "train_speed(iter/s)": 1.444493 }, { "epoch": 0.9648258429373205, "grad_norm": 4.7493133544921875, "learning_rate": 9.109043802723967e-05, "loss": 2.4731964111328124, "memory(GiB)": 77.56, "step": 22520, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 1.44452 }, { "epoch": 0.9650400582665696, "grad_norm": 4.440759658813477, "learning_rate": 9.108660327651116e-05, "loss": 2.348245620727539, "memory(GiB)": 77.56, "step": 22525, "token_acc": 0.5313653136531366, "train_speed(iter/s)": 1.444468 }, { "epoch": 0.9652542735958185, "grad_norm": 4.804719924926758, "learning_rate": 9.10827677814603e-05, "loss": 2.5432621002197267, "memory(GiB)": 77.56, "step": 22530, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.444469 }, { "epoch": 0.9654684889250674, "grad_norm": 4.316156387329102, "learning_rate": 9.107893154215656e-05, "loss": 2.8198347091674805, "memory(GiB)": 77.56, "step": 22535, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.44457 }, { "epoch": 0.9656827042543165, "grad_norm": 5.392191410064697, "learning_rate": 9.107509455866945e-05, "loss": 2.413319778442383, "memory(GiB)": 77.56, "step": 22540, "token_acc": 0.5310077519379846, "train_speed(iter/s)": 1.444569 }, { "epoch": 0.9658969195835654, "grad_norm": 3.2190346717834473, "learning_rate": 9.107125683106848e-05, "loss": 2.6461687088012695, "memory(GiB)": 77.56, "step": 22545, "token_acc": 0.4584450402144772, "train_speed(iter/s)": 1.444529 }, { "epoch": 0.9661111349128143, "grad_norm": 4.319345474243164, "learning_rate": 9.106741835942314e-05, "loss": 2.6277385711669923, "memory(GiB)": 77.56, "step": 22550, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.444508 }, { "epoch": 0.9663253502420633, "grad_norm": 5.934285640716553, "learning_rate": 9.106357914380299e-05, "loss": 2.2906621932983398, "memory(GiB)": 77.56, "step": 22555, "token_acc": 0.5222672064777328, "train_speed(iter/s)": 1.444561 }, { "epoch": 0.9665395655713123, "grad_norm": 4.374237060546875, "learning_rate": 9.105973918427759e-05, "loss": 2.664159393310547, "memory(GiB)": 77.56, "step": 22560, "token_acc": 0.4186851211072664, "train_speed(iter/s)": 1.444569 }, { "epoch": 0.9667537809005613, "grad_norm": 3.712407350540161, "learning_rate": 9.105589848091651e-05, "loss": 2.111763763427734, "memory(GiB)": 77.56, "step": 22565, "token_acc": 0.5313531353135313, "train_speed(iter/s)": 1.444573 }, { "epoch": 0.9669679962298102, "grad_norm": 4.662269115447998, "learning_rate": 9.105205703378931e-05, "loss": 2.396389961242676, "memory(GiB)": 77.56, "step": 22570, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.444454 }, { "epoch": 0.9671822115590591, "grad_norm": 4.030180931091309, "learning_rate": 9.104821484296559e-05, "loss": 2.530960464477539, "memory(GiB)": 77.56, "step": 22575, "token_acc": 0.4953560371517028, "train_speed(iter/s)": 1.44452 }, { "epoch": 0.9673964268883082, "grad_norm": 5.760007858276367, "learning_rate": 9.104437190851493e-05, "loss": 2.733035659790039, "memory(GiB)": 77.56, "step": 22580, "token_acc": 0.4379310344827586, "train_speed(iter/s)": 1.444507 }, { "epoch": 0.9676106422175571, "grad_norm": 3.886164426803589, "learning_rate": 9.104052823050699e-05, "loss": 2.715729331970215, "memory(GiB)": 77.56, "step": 22585, "token_acc": 0.45180722891566266, "train_speed(iter/s)": 1.444442 }, { "epoch": 0.967824857546806, "grad_norm": 4.08539342880249, "learning_rate": 9.103668380901138e-05, "loss": 2.5099754333496094, "memory(GiB)": 77.56, "step": 22590, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.444443 }, { "epoch": 0.9680390728760551, "grad_norm": 4.854295253753662, "learning_rate": 9.103283864409775e-05, "loss": 2.5760751724243165, "memory(GiB)": 77.56, "step": 22595, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.444479 }, { "epoch": 0.968253288205304, "grad_norm": 3.448655605316162, "learning_rate": 9.102899273583575e-05, "loss": 2.6189884185791015, "memory(GiB)": 77.56, "step": 22600, "token_acc": 0.44660194174757284, "train_speed(iter/s)": 1.444528 }, { "epoch": 0.9684675035345529, "grad_norm": 5.7773003578186035, "learning_rate": 9.102514608429507e-05, "loss": 2.601335334777832, "memory(GiB)": 77.56, "step": 22605, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.44453 }, { "epoch": 0.9686817188638019, "grad_norm": 5.030820846557617, "learning_rate": 9.102129868954537e-05, "loss": 2.4951377868652345, "memory(GiB)": 77.56, "step": 22610, "token_acc": 0.45054945054945056, "train_speed(iter/s)": 1.44451 }, { "epoch": 0.9688959341930509, "grad_norm": 4.682309150695801, "learning_rate": 9.101745055165635e-05, "loss": 2.450288009643555, "memory(GiB)": 77.56, "step": 22615, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.444423 }, { "epoch": 0.9691101495222998, "grad_norm": 3.8503715991973877, "learning_rate": 9.101360167069777e-05, "loss": 2.4555179595947267, "memory(GiB)": 77.56, "step": 22620, "token_acc": 0.49508196721311476, "train_speed(iter/s)": 1.444385 }, { "epoch": 0.9693243648515488, "grad_norm": 3.836268186569214, "learning_rate": 9.100975204673929e-05, "loss": 2.758441925048828, "memory(GiB)": 77.56, "step": 22625, "token_acc": 0.4748427672955975, "train_speed(iter/s)": 1.444451 }, { "epoch": 0.9695385801807977, "grad_norm": 4.451447486877441, "learning_rate": 9.10059016798507e-05, "loss": 2.996836853027344, "memory(GiB)": 77.56, "step": 22630, "token_acc": 0.43776824034334766, "train_speed(iter/s)": 1.444482 }, { "epoch": 0.9697527955100467, "grad_norm": 4.166262626647949, "learning_rate": 9.100205057010174e-05, "loss": 2.6655994415283204, "memory(GiB)": 77.56, "step": 22635, "token_acc": 0.44370860927152317, "train_speed(iter/s)": 1.444527 }, { "epoch": 0.9699670108392957, "grad_norm": 4.432879447937012, "learning_rate": 9.099819871756215e-05, "loss": 2.710140609741211, "memory(GiB)": 77.56, "step": 22640, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.444564 }, { "epoch": 0.9701812261685446, "grad_norm": 4.089960098266602, "learning_rate": 9.099434612230175e-05, "loss": 2.690087890625, "memory(GiB)": 77.56, "step": 22645, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.444639 }, { "epoch": 0.9703954414977936, "grad_norm": 4.260512351989746, "learning_rate": 9.099049278439029e-05, "loss": 2.7790201187133787, "memory(GiB)": 77.56, "step": 22650, "token_acc": 0.4153846153846154, "train_speed(iter/s)": 1.44467 }, { "epoch": 0.9706096568270426, "grad_norm": 6.258110523223877, "learning_rate": 9.098663870389763e-05, "loss": 2.717209053039551, "memory(GiB)": 77.56, "step": 22655, "token_acc": 0.41739130434782606, "train_speed(iter/s)": 1.444758 }, { "epoch": 0.9708238721562915, "grad_norm": 4.986264705657959, "learning_rate": 9.098278388089354e-05, "loss": 2.4684144973754885, "memory(GiB)": 77.56, "step": 22660, "token_acc": 0.4734848484848485, "train_speed(iter/s)": 1.444767 }, { "epoch": 0.9710380874855404, "grad_norm": 5.7784247398376465, "learning_rate": 9.097892831544789e-05, "loss": 2.8189533233642576, "memory(GiB)": 77.56, "step": 22665, "token_acc": 0.46785714285714286, "train_speed(iter/s)": 1.444787 }, { "epoch": 0.9712523028147895, "grad_norm": 4.401434421539307, "learning_rate": 9.097507200763052e-05, "loss": 2.5944896697998048, "memory(GiB)": 77.56, "step": 22670, "token_acc": 0.4391891891891892, "train_speed(iter/s)": 1.444849 }, { "epoch": 0.9714665181440384, "grad_norm": 3.987140655517578, "learning_rate": 9.097121495751126e-05, "loss": 2.5924224853515625, "memory(GiB)": 77.56, "step": 22675, "token_acc": 0.49612403100775193, "train_speed(iter/s)": 1.444811 }, { "epoch": 0.9716807334732873, "grad_norm": 5.120415687561035, "learning_rate": 9.096735716516001e-05, "loss": 2.543726348876953, "memory(GiB)": 77.56, "step": 22680, "token_acc": 0.4434250764525994, "train_speed(iter/s)": 1.444818 }, { "epoch": 0.9718949488025364, "grad_norm": 4.902359485626221, "learning_rate": 9.096349863064666e-05, "loss": 2.5126115798950197, "memory(GiB)": 77.56, "step": 22685, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.444883 }, { "epoch": 0.9721091641317853, "grad_norm": 5.709869384765625, "learning_rate": 9.09596393540411e-05, "loss": 2.651871109008789, "memory(GiB)": 77.56, "step": 22690, "token_acc": 0.4383561643835616, "train_speed(iter/s)": 1.444957 }, { "epoch": 0.9723233794610342, "grad_norm": 4.62658166885376, "learning_rate": 9.095577933541326e-05, "loss": 2.5991443634033202, "memory(GiB)": 77.56, "step": 22695, "token_acc": 0.4778481012658228, "train_speed(iter/s)": 1.444979 }, { "epoch": 0.9725375947902832, "grad_norm": 7.195178031921387, "learning_rate": 9.095191857483305e-05, "loss": 2.71396484375, "memory(GiB)": 77.56, "step": 22700, "token_acc": 0.45112781954887216, "train_speed(iter/s)": 1.444978 }, { "epoch": 0.9727518101195322, "grad_norm": 9.005720138549805, "learning_rate": 9.094805707237041e-05, "loss": 2.6929790496826174, "memory(GiB)": 77.56, "step": 22705, "token_acc": 0.4573170731707317, "train_speed(iter/s)": 1.444991 }, { "epoch": 0.9729660254487811, "grad_norm": 3.981722593307495, "learning_rate": 9.094419482809534e-05, "loss": 2.5417993545532225, "memory(GiB)": 77.56, "step": 22710, "token_acc": 0.4695121951219512, "train_speed(iter/s)": 1.445001 }, { "epoch": 0.9731802407780301, "grad_norm": 4.128677845001221, "learning_rate": 9.094033184207774e-05, "loss": 2.4229068756103516, "memory(GiB)": 77.56, "step": 22715, "token_acc": 0.4908424908424908, "train_speed(iter/s)": 1.44502 }, { "epoch": 0.973394456107279, "grad_norm": 3.5569841861724854, "learning_rate": 9.093646811438762e-05, "loss": 2.188077926635742, "memory(GiB)": 77.56, "step": 22720, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.44503 }, { "epoch": 0.973608671436528, "grad_norm": 4.370412826538086, "learning_rate": 9.0932603645095e-05, "loss": 2.7317127227783202, "memory(GiB)": 77.56, "step": 22725, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.444961 }, { "epoch": 0.973822886765777, "grad_norm": 4.974038600921631, "learning_rate": 9.092873843426986e-05, "loss": 2.7294788360595703, "memory(GiB)": 77.56, "step": 22730, "token_acc": 0.4597315436241611, "train_speed(iter/s)": 1.444971 }, { "epoch": 0.9740371020950259, "grad_norm": 6.928504943847656, "learning_rate": 9.092487248198222e-05, "loss": 2.795309066772461, "memory(GiB)": 77.56, "step": 22735, "token_acc": 0.4342105263157895, "train_speed(iter/s)": 1.445 }, { "epoch": 0.9742513174242748, "grad_norm": 4.414449214935303, "learning_rate": 9.092100578830214e-05, "loss": 2.5382831573486326, "memory(GiB)": 77.56, "step": 22740, "token_acc": 0.45625, "train_speed(iter/s)": 1.44499 }, { "epoch": 0.9744655327535239, "grad_norm": 4.456477642059326, "learning_rate": 9.091713835329964e-05, "loss": 2.560292053222656, "memory(GiB)": 77.56, "step": 22745, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.444983 }, { "epoch": 0.9746797480827728, "grad_norm": 8.5945405960083, "learning_rate": 9.091327017704479e-05, "loss": 2.475649833679199, "memory(GiB)": 77.56, "step": 22750, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.444872 }, { "epoch": 0.9748939634120217, "grad_norm": 4.095578670501709, "learning_rate": 9.090940125960769e-05, "loss": 2.6476024627685546, "memory(GiB)": 77.56, "step": 22755, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.444803 }, { "epoch": 0.9751081787412708, "grad_norm": 4.861550807952881, "learning_rate": 9.090553160105839e-05, "loss": 2.380379486083984, "memory(GiB)": 77.56, "step": 22760, "token_acc": 0.42955326460481097, "train_speed(iter/s)": 1.444867 }, { "epoch": 0.9753223940705197, "grad_norm": 4.669407367706299, "learning_rate": 9.090166120146702e-05, "loss": 2.7587869644165037, "memory(GiB)": 77.56, "step": 22765, "token_acc": 0.5155038759689923, "train_speed(iter/s)": 1.444875 }, { "epoch": 0.9755366093997686, "grad_norm": 4.914674282073975, "learning_rate": 9.08977900609037e-05, "loss": 2.7206090927124023, "memory(GiB)": 77.56, "step": 22770, "token_acc": 0.43630573248407645, "train_speed(iter/s)": 1.444953 }, { "epoch": 0.9757508247290176, "grad_norm": 7.4558258056640625, "learning_rate": 9.089391817943853e-05, "loss": 2.3912635803222657, "memory(GiB)": 77.56, "step": 22775, "token_acc": 0.5655737704918032, "train_speed(iter/s)": 1.445006 }, { "epoch": 0.9759650400582666, "grad_norm": 4.739561557769775, "learning_rate": 9.089004555714168e-05, "loss": 2.5282665252685548, "memory(GiB)": 77.56, "step": 22780, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.44506 }, { "epoch": 0.9761792553875155, "grad_norm": 4.167211055755615, "learning_rate": 9.08861721940833e-05, "loss": 2.6400732040405273, "memory(GiB)": 77.56, "step": 22785, "token_acc": 0.4740061162079511, "train_speed(iter/s)": 1.445067 }, { "epoch": 0.9763934707167645, "grad_norm": 4.274940490722656, "learning_rate": 9.088229809033355e-05, "loss": 2.4364160537719726, "memory(GiB)": 77.56, "step": 22790, "token_acc": 0.4770992366412214, "train_speed(iter/s)": 1.445105 }, { "epoch": 0.9766076860460134, "grad_norm": 3.6536030769348145, "learning_rate": 9.087842324596262e-05, "loss": 2.3567413330078124, "memory(GiB)": 77.56, "step": 22795, "token_acc": 0.4307228915662651, "train_speed(iter/s)": 1.445098 }, { "epoch": 0.9768219013752624, "grad_norm": 4.775113105773926, "learning_rate": 9.087454766104071e-05, "loss": 2.8382440567016602, "memory(GiB)": 77.56, "step": 22800, "token_acc": 0.43234323432343236, "train_speed(iter/s)": 1.445129 }, { "epoch": 0.9770361167045114, "grad_norm": 4.350823402404785, "learning_rate": 9.087067133563803e-05, "loss": 2.4608108520507814, "memory(GiB)": 77.56, "step": 22805, "token_acc": 0.4552238805970149, "train_speed(iter/s)": 1.445189 }, { "epoch": 0.9772503320337603, "grad_norm": 3.461327075958252, "learning_rate": 9.086679426982479e-05, "loss": 2.4220829010009766, "memory(GiB)": 77.56, "step": 22810, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.44509 }, { "epoch": 0.9774645473630093, "grad_norm": 4.952081203460693, "learning_rate": 9.086291646367123e-05, "loss": 2.6396175384521485, "memory(GiB)": 77.56, "step": 22815, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.445064 }, { "epoch": 0.9776787626922583, "grad_norm": 4.0555195808410645, "learning_rate": 9.085903791724761e-05, "loss": 2.960065460205078, "memory(GiB)": 77.56, "step": 22820, "token_acc": 0.4635036496350365, "train_speed(iter/s)": 1.44515 }, { "epoch": 0.9778929780215072, "grad_norm": 4.5373945236206055, "learning_rate": 9.085515863062419e-05, "loss": 2.6047840118408203, "memory(GiB)": 77.56, "step": 22825, "token_acc": 0.45051194539249145, "train_speed(iter/s)": 1.445167 }, { "epoch": 0.9781071933507561, "grad_norm": 4.878065586090088, "learning_rate": 9.085127860387126e-05, "loss": 2.434050941467285, "memory(GiB)": 77.56, "step": 22830, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.445219 }, { "epoch": 0.9783214086800052, "grad_norm": 4.08262825012207, "learning_rate": 9.084739783705909e-05, "loss": 2.9012475967407227, "memory(GiB)": 77.56, "step": 22835, "token_acc": 0.4270833333333333, "train_speed(iter/s)": 1.445105 }, { "epoch": 0.9785356240092541, "grad_norm": 3.815702199935913, "learning_rate": 9.084351633025798e-05, "loss": 2.516925239562988, "memory(GiB)": 77.56, "step": 22840, "token_acc": 0.4862068965517241, "train_speed(iter/s)": 1.445117 }, { "epoch": 0.978749839338503, "grad_norm": 4.985994338989258, "learning_rate": 9.083963408353825e-05, "loss": 2.653405952453613, "memory(GiB)": 77.56, "step": 22845, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.445185 }, { "epoch": 0.978964054667752, "grad_norm": 4.019278049468994, "learning_rate": 9.083575109697027e-05, "loss": 2.7412076950073243, "memory(GiB)": 77.56, "step": 22850, "token_acc": 0.42443729903536975, "train_speed(iter/s)": 1.445137 }, { "epoch": 0.979178269997001, "grad_norm": 12.105887413024902, "learning_rate": 9.083186737062432e-05, "loss": 2.926678466796875, "memory(GiB)": 77.56, "step": 22855, "token_acc": 0.41297935103244837, "train_speed(iter/s)": 1.44508 }, { "epoch": 0.9793924853262499, "grad_norm": 3.4400057792663574, "learning_rate": 9.082798290457081e-05, "loss": 2.459037017822266, "memory(GiB)": 77.56, "step": 22860, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.445095 }, { "epoch": 0.9796067006554989, "grad_norm": 3.8603885173797607, "learning_rate": 9.082409769888008e-05, "loss": 2.6857845306396486, "memory(GiB)": 77.56, "step": 22865, "token_acc": 0.4585492227979275, "train_speed(iter/s)": 1.445136 }, { "epoch": 0.9798209159847479, "grad_norm": 3.772304058074951, "learning_rate": 9.082021175362252e-05, "loss": 2.4039142608642576, "memory(GiB)": 77.56, "step": 22870, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.445133 }, { "epoch": 0.9800351313139968, "grad_norm": 5.362132549285889, "learning_rate": 9.081632506886854e-05, "loss": 2.4876777648925783, "memory(GiB)": 77.56, "step": 22875, "token_acc": 0.4362934362934363, "train_speed(iter/s)": 1.445188 }, { "epoch": 0.9802493466432458, "grad_norm": 4.6251702308654785, "learning_rate": 9.081243764468854e-05, "loss": 2.8020580291748045, "memory(GiB)": 77.56, "step": 22880, "token_acc": 0.4393939393939394, "train_speed(iter/s)": 1.445219 }, { "epoch": 0.9804635619724947, "grad_norm": 5.647506237030029, "learning_rate": 9.080854948115295e-05, "loss": 2.6524225234985352, "memory(GiB)": 77.56, "step": 22885, "token_acc": 0.45481927710843373, "train_speed(iter/s)": 1.445251 }, { "epoch": 0.9806777773017437, "grad_norm": 4.38006591796875, "learning_rate": 9.080466057833221e-05, "loss": 2.3606569290161135, "memory(GiB)": 77.56, "step": 22890, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.445305 }, { "epoch": 0.9808919926309927, "grad_norm": 3.4390718936920166, "learning_rate": 9.080077093629675e-05, "loss": 2.8349235534667967, "memory(GiB)": 77.56, "step": 22895, "token_acc": 0.4199395770392749, "train_speed(iter/s)": 1.445333 }, { "epoch": 0.9811062079602416, "grad_norm": 4.981522083282471, "learning_rate": 9.079688055511707e-05, "loss": 2.579480743408203, "memory(GiB)": 77.56, "step": 22900, "token_acc": 0.44, "train_speed(iter/s)": 1.445347 }, { "epoch": 0.9813204232894907, "grad_norm": 4.231136322021484, "learning_rate": 9.079298943486361e-05, "loss": 2.410441207885742, "memory(GiB)": 77.56, "step": 22905, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.445391 }, { "epoch": 0.9815346386187396, "grad_norm": 4.782087326049805, "learning_rate": 9.078909757560687e-05, "loss": 2.3499324798583983, "memory(GiB)": 77.56, "step": 22910, "token_acc": 0.5451388888888888, "train_speed(iter/s)": 1.445423 }, { "epoch": 0.9817488539479885, "grad_norm": 7.973390102386475, "learning_rate": 9.07852049774174e-05, "loss": 2.193668174743652, "memory(GiB)": 77.56, "step": 22915, "token_acc": 0.49407114624505927, "train_speed(iter/s)": 1.445524 }, { "epoch": 0.9819630692772375, "grad_norm": 3.6842994689941406, "learning_rate": 9.078131164036565e-05, "loss": 2.497384262084961, "memory(GiB)": 77.56, "step": 22920, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.445543 }, { "epoch": 0.9821772846064865, "grad_norm": 5.814662933349609, "learning_rate": 9.07774175645222e-05, "loss": 2.5203853607177735, "memory(GiB)": 77.56, "step": 22925, "token_acc": 0.5046153846153846, "train_speed(iter/s)": 1.445567 }, { "epoch": 0.9823914999357354, "grad_norm": 4.383076190948486, "learning_rate": 9.077352274995757e-05, "loss": 2.5637889862060548, "memory(GiB)": 77.56, "step": 22930, "token_acc": 0.4249084249084249, "train_speed(iter/s)": 1.445555 }, { "epoch": 0.9826057152649844, "grad_norm": 4.262376308441162, "learning_rate": 9.076962719674233e-05, "loss": 2.8621448516845702, "memory(GiB)": 77.56, "step": 22935, "token_acc": 0.463855421686747, "train_speed(iter/s)": 1.445597 }, { "epoch": 0.9828199305942333, "grad_norm": 5.175243854522705, "learning_rate": 9.076573090494704e-05, "loss": 2.4571056365966797, "memory(GiB)": 77.56, "step": 22940, "token_acc": 0.4558303886925795, "train_speed(iter/s)": 1.445619 }, { "epoch": 0.9830341459234823, "grad_norm": 3.8173458576202393, "learning_rate": 9.076183387464232e-05, "loss": 2.4322261810302734, "memory(GiB)": 77.56, "step": 22945, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.445669 }, { "epoch": 0.9832483612527313, "grad_norm": 6.433363914489746, "learning_rate": 9.075793610589871e-05, "loss": 2.3937919616699217, "memory(GiB)": 77.56, "step": 22950, "token_acc": 0.48945147679324896, "train_speed(iter/s)": 1.445723 }, { "epoch": 0.9834625765819802, "grad_norm": 7.985085964202881, "learning_rate": 9.075403759878687e-05, "loss": 2.673828887939453, "memory(GiB)": 77.56, "step": 22955, "token_acc": 0.48031496062992124, "train_speed(iter/s)": 1.445751 }, { "epoch": 0.9836767919112291, "grad_norm": 5.2178215980529785, "learning_rate": 9.075013835337742e-05, "loss": 2.692142105102539, "memory(GiB)": 77.56, "step": 22960, "token_acc": 0.47465437788018433, "train_speed(iter/s)": 1.445827 }, { "epoch": 0.9838910072404782, "grad_norm": 3.7585647106170654, "learning_rate": 9.074623836974097e-05, "loss": 2.3891376495361327, "memory(GiB)": 77.56, "step": 22965, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.445857 }, { "epoch": 0.9841052225697271, "grad_norm": 6.010097980499268, "learning_rate": 9.074233764794818e-05, "loss": 2.4187442779541017, "memory(GiB)": 77.56, "step": 22970, "token_acc": 0.4900662251655629, "train_speed(iter/s)": 1.445895 }, { "epoch": 0.984319437898976, "grad_norm": 3.8825864791870117, "learning_rate": 9.073843618806974e-05, "loss": 2.5002138137817385, "memory(GiB)": 77.56, "step": 22975, "token_acc": 0.45791245791245794, "train_speed(iter/s)": 1.445944 }, { "epoch": 0.9845336532282251, "grad_norm": 4.150593280792236, "learning_rate": 9.073453399017631e-05, "loss": 2.698598098754883, "memory(GiB)": 77.56, "step": 22980, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.445926 }, { "epoch": 0.984747868557474, "grad_norm": 7.992836952209473, "learning_rate": 9.073063105433859e-05, "loss": 2.6483861923217775, "memory(GiB)": 77.56, "step": 22985, "token_acc": 0.46496815286624205, "train_speed(iter/s)": 1.445888 }, { "epoch": 0.9849620838867229, "grad_norm": 6.339759826660156, "learning_rate": 9.072672738062726e-05, "loss": 2.6745561599731444, "memory(GiB)": 77.56, "step": 22990, "token_acc": 0.47863247863247865, "train_speed(iter/s)": 1.44598 }, { "epoch": 0.9851762992159719, "grad_norm": 4.41357946395874, "learning_rate": 9.072282296911308e-05, "loss": 2.3496932983398438, "memory(GiB)": 77.56, "step": 22995, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.445981 }, { "epoch": 0.9853905145452209, "grad_norm": 4.551157474517822, "learning_rate": 9.071891781986675e-05, "loss": 2.695197105407715, "memory(GiB)": 77.56, "step": 23000, "token_acc": 0.43125, "train_speed(iter/s)": 1.44598 }, { "epoch": 0.9853905145452209, "eval_loss": 2.008117914199829, "eval_runtime": 14.9281, "eval_samples_per_second": 6.699, "eval_steps_per_second": 6.699, "eval_token_acc": 0.46785225718194257, "step": 23000 }, { "epoch": 0.9856047298744698, "grad_norm": 4.310020923614502, "learning_rate": 9.071501193295903e-05, "loss": 2.8055076599121094, "memory(GiB)": 77.56, "step": 23005, "token_acc": 0.4521224086870681, "train_speed(iter/s)": 1.444534 }, { "epoch": 0.9858189452037188, "grad_norm": 3.9575483798980713, "learning_rate": 9.071110530846067e-05, "loss": 2.4324247360229494, "memory(GiB)": 77.56, "step": 23010, "token_acc": 0.49117647058823527, "train_speed(iter/s)": 1.444589 }, { "epoch": 0.9860331605329677, "grad_norm": 5.247930526733398, "learning_rate": 9.070719794644245e-05, "loss": 2.610725212097168, "memory(GiB)": 77.56, "step": 23015, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.444618 }, { "epoch": 0.9862473758622167, "grad_norm": 4.458913803100586, "learning_rate": 9.070328984697516e-05, "loss": 2.361677360534668, "memory(GiB)": 77.56, "step": 23020, "token_acc": 0.5, "train_speed(iter/s)": 1.444574 }, { "epoch": 0.9864615911914657, "grad_norm": 6.627816677093506, "learning_rate": 9.069938101012958e-05, "loss": 2.5770853042602537, "memory(GiB)": 77.56, "step": 23025, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.444516 }, { "epoch": 0.9866758065207146, "grad_norm": 4.926617622375488, "learning_rate": 9.069547143597655e-05, "loss": 2.5908576965332033, "memory(GiB)": 77.56, "step": 23030, "token_acc": 0.4478114478114478, "train_speed(iter/s)": 1.444564 }, { "epoch": 0.9868900218499636, "grad_norm": 4.594491004943848, "learning_rate": 9.069156112458685e-05, "loss": 2.565652847290039, "memory(GiB)": 77.56, "step": 23035, "token_acc": 0.46726190476190477, "train_speed(iter/s)": 1.444485 }, { "epoch": 0.9871042371792126, "grad_norm": 4.847806453704834, "learning_rate": 9.068765007603137e-05, "loss": 2.6189918518066406, "memory(GiB)": 77.56, "step": 23040, "token_acc": 0.42805755395683454, "train_speed(iter/s)": 1.444437 }, { "epoch": 0.9873184525084615, "grad_norm": 5.198618412017822, "learning_rate": 9.068373829038095e-05, "loss": 2.7961402893066407, "memory(GiB)": 77.56, "step": 23045, "token_acc": 0.5, "train_speed(iter/s)": 1.444406 }, { "epoch": 0.9875326678377104, "grad_norm": 3.8865277767181396, "learning_rate": 9.067982576770644e-05, "loss": 2.4614585876464843, "memory(GiB)": 77.56, "step": 23050, "token_acc": 0.48265895953757226, "train_speed(iter/s)": 1.44438 }, { "epoch": 0.9877468831669595, "grad_norm": 4.218410968780518, "learning_rate": 9.067591250807872e-05, "loss": 2.5644222259521485, "memory(GiB)": 77.56, "step": 23055, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.444476 }, { "epoch": 0.9879610984962084, "grad_norm": 6.073484420776367, "learning_rate": 9.067199851156869e-05, "loss": 2.5710912704467774, "memory(GiB)": 77.56, "step": 23060, "token_acc": 0.4468864468864469, "train_speed(iter/s)": 1.44456 }, { "epoch": 0.9881753138254573, "grad_norm": 3.766753673553467, "learning_rate": 9.066808377824725e-05, "loss": 2.7560733795166015, "memory(GiB)": 77.56, "step": 23065, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.444613 }, { "epoch": 0.9883895291547063, "grad_norm": 5.844721794128418, "learning_rate": 9.066416830818531e-05, "loss": 3.010381507873535, "memory(GiB)": 77.56, "step": 23070, "token_acc": 0.4789272030651341, "train_speed(iter/s)": 1.44468 }, { "epoch": 0.9886037444839553, "grad_norm": 3.5435407161712646, "learning_rate": 9.066025210145384e-05, "loss": 2.3852956771850584, "memory(GiB)": 77.56, "step": 23075, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.44466 }, { "epoch": 0.9888179598132042, "grad_norm": 3.66520619392395, "learning_rate": 9.065633515812376e-05, "loss": 2.4725925445556642, "memory(GiB)": 77.56, "step": 23080, "token_acc": 0.5089820359281437, "train_speed(iter/s)": 1.44473 }, { "epoch": 0.9890321751424532, "grad_norm": 3.9173736572265625, "learning_rate": 9.0652417478266e-05, "loss": 2.9407861709594725, "memory(GiB)": 77.56, "step": 23085, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.444754 }, { "epoch": 0.9892463904717022, "grad_norm": 4.830893516540527, "learning_rate": 9.064849906195159e-05, "loss": 2.4900604248046876, "memory(GiB)": 77.56, "step": 23090, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.444707 }, { "epoch": 0.9894606058009511, "grad_norm": 6.276354789733887, "learning_rate": 9.064457990925149e-05, "loss": 2.5824522018432616, "memory(GiB)": 77.56, "step": 23095, "token_acc": 0.45934959349593496, "train_speed(iter/s)": 1.444654 }, { "epoch": 0.9896748211302001, "grad_norm": 3.8143110275268555, "learning_rate": 9.064066002023668e-05, "loss": 2.5432191848754884, "memory(GiB)": 77.56, "step": 23100, "token_acc": 0.45666666666666667, "train_speed(iter/s)": 1.444589 }, { "epoch": 0.989889036459449, "grad_norm": 4.650741100311279, "learning_rate": 9.06367393949782e-05, "loss": 2.656488227844238, "memory(GiB)": 77.56, "step": 23105, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 1.444601 }, { "epoch": 0.990103251788698, "grad_norm": 4.140844821929932, "learning_rate": 9.063281803354707e-05, "loss": 2.60064811706543, "memory(GiB)": 77.56, "step": 23110, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.444602 }, { "epoch": 0.990317467117947, "grad_norm": 4.256901741027832, "learning_rate": 9.062889593601432e-05, "loss": 2.6823265075683596, "memory(GiB)": 77.56, "step": 23115, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.44459 }, { "epoch": 0.9905316824471959, "grad_norm": 4.084400177001953, "learning_rate": 9.0624973102451e-05, "loss": 2.900673675537109, "memory(GiB)": 77.56, "step": 23120, "token_acc": 0.4378531073446328, "train_speed(iter/s)": 1.444632 }, { "epoch": 0.9907458977764448, "grad_norm": 5.528582572937012, "learning_rate": 9.062104953292819e-05, "loss": 2.6865842819213865, "memory(GiB)": 77.56, "step": 23125, "token_acc": 0.46484375, "train_speed(iter/s)": 1.444671 }, { "epoch": 0.9909601131056939, "grad_norm": 3.866848945617676, "learning_rate": 9.061712522751696e-05, "loss": 2.5581182479858398, "memory(GiB)": 77.56, "step": 23130, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.444633 }, { "epoch": 0.9911743284349428, "grad_norm": 5.471525192260742, "learning_rate": 9.06132001862884e-05, "loss": 2.503224563598633, "memory(GiB)": 77.56, "step": 23135, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.444684 }, { "epoch": 0.9913885437641917, "grad_norm": 5.09264612197876, "learning_rate": 9.060927440931362e-05, "loss": 2.767590141296387, "memory(GiB)": 77.56, "step": 23140, "token_acc": 0.4375, "train_speed(iter/s)": 1.444734 }, { "epoch": 0.9916027590934408, "grad_norm": 4.136631965637207, "learning_rate": 9.060534789666374e-05, "loss": 2.459285926818848, "memory(GiB)": 77.56, "step": 23145, "token_acc": 0.5053763440860215, "train_speed(iter/s)": 1.444673 }, { "epoch": 0.9918169744226897, "grad_norm": 5.8304338455200195, "learning_rate": 9.06014206484099e-05, "loss": 2.5782939910888674, "memory(GiB)": 77.56, "step": 23150, "token_acc": 0.4902597402597403, "train_speed(iter/s)": 1.444676 }, { "epoch": 0.9920311897519386, "grad_norm": 5.046599388122559, "learning_rate": 9.059749266462324e-05, "loss": 2.475712013244629, "memory(GiB)": 77.56, "step": 23155, "token_acc": 0.46195652173913043, "train_speed(iter/s)": 1.444766 }, { "epoch": 0.9922454050811876, "grad_norm": 8.552024841308594, "learning_rate": 9.05935639453749e-05, "loss": 2.714438247680664, "memory(GiB)": 77.56, "step": 23160, "token_acc": 0.4339622641509434, "train_speed(iter/s)": 1.444795 }, { "epoch": 0.9924596204104366, "grad_norm": 4.3296427726745605, "learning_rate": 9.058963449073607e-05, "loss": 2.878192901611328, "memory(GiB)": 77.56, "step": 23165, "token_acc": 0.4128113879003559, "train_speed(iter/s)": 1.444837 }, { "epoch": 0.9926738357396855, "grad_norm": 4.40519905090332, "learning_rate": 9.058570430077795e-05, "loss": 2.5774843215942385, "memory(GiB)": 77.56, "step": 23170, "token_acc": 0.4794007490636704, "train_speed(iter/s)": 1.444823 }, { "epoch": 0.9928880510689345, "grad_norm": 4.012173175811768, "learning_rate": 9.058177337557172e-05, "loss": 2.802840995788574, "memory(GiB)": 77.56, "step": 23175, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.444816 }, { "epoch": 0.9931022663981834, "grad_norm": 4.722075939178467, "learning_rate": 9.057784171518861e-05, "loss": 2.7361244201660155, "memory(GiB)": 77.56, "step": 23180, "token_acc": 0.4610169491525424, "train_speed(iter/s)": 1.444708 }, { "epoch": 0.9933164817274324, "grad_norm": 4.343459606170654, "learning_rate": 9.057390931969981e-05, "loss": 2.681656837463379, "memory(GiB)": 77.56, "step": 23185, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.444693 }, { "epoch": 0.9935306970566814, "grad_norm": 5.151376247406006, "learning_rate": 9.056997618917659e-05, "loss": 2.448335647583008, "memory(GiB)": 77.56, "step": 23190, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.444681 }, { "epoch": 0.9937449123859303, "grad_norm": 4.584866046905518, "learning_rate": 9.056604232369019e-05, "loss": 2.314210319519043, "memory(GiB)": 77.56, "step": 23195, "token_acc": 0.48134328358208955, "train_speed(iter/s)": 1.444695 }, { "epoch": 0.9939591277151792, "grad_norm": 4.461553573608398, "learning_rate": 9.056210772331188e-05, "loss": 2.7964332580566404, "memory(GiB)": 77.56, "step": 23200, "token_acc": 0.45686900958466453, "train_speed(iter/s)": 1.444655 }, { "epoch": 0.9941733430444283, "grad_norm": 4.269008636474609, "learning_rate": 9.055817238811295e-05, "loss": 2.3951398849487306, "memory(GiB)": 77.56, "step": 23205, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.444709 }, { "epoch": 0.9943875583736772, "grad_norm": 4.945583820343018, "learning_rate": 9.055423631816466e-05, "loss": 2.5814277648925783, "memory(GiB)": 77.56, "step": 23210, "token_acc": 0.5064935064935064, "train_speed(iter/s)": 1.444708 }, { "epoch": 0.9946017737029261, "grad_norm": 5.2736358642578125, "learning_rate": 9.055029951353835e-05, "loss": 2.34924259185791, "memory(GiB)": 77.56, "step": 23215, "token_acc": 0.5202020202020202, "train_speed(iter/s)": 1.444764 }, { "epoch": 0.9948159890321752, "grad_norm": 4.148223876953125, "learning_rate": 9.054636197430533e-05, "loss": 2.7521587371826173, "memory(GiB)": 77.56, "step": 23220, "token_acc": 0.43119266055045874, "train_speed(iter/s)": 1.444778 }, { "epoch": 0.9950302043614241, "grad_norm": 4.876492023468018, "learning_rate": 9.054242370053691e-05, "loss": 2.670973777770996, "memory(GiB)": 77.56, "step": 23225, "token_acc": 0.440625, "train_speed(iter/s)": 1.444687 }, { "epoch": 0.995244419690673, "grad_norm": 3.71134352684021, "learning_rate": 9.053848469230446e-05, "loss": 2.806547164916992, "memory(GiB)": 77.56, "step": 23230, "token_acc": 0.4394904458598726, "train_speed(iter/s)": 1.444707 }, { "epoch": 0.995458635019922, "grad_norm": 5.050687313079834, "learning_rate": 9.053454494967935e-05, "loss": 2.493272399902344, "memory(GiB)": 77.56, "step": 23235, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.444714 }, { "epoch": 0.995672850349171, "grad_norm": 4.164772033691406, "learning_rate": 9.053060447273291e-05, "loss": 2.5831829071044923, "memory(GiB)": 77.56, "step": 23240, "token_acc": 0.4537037037037037, "train_speed(iter/s)": 1.444643 }, { "epoch": 0.99588706567842, "grad_norm": 6.227268695831299, "learning_rate": 9.052666326153656e-05, "loss": 2.7408420562744142, "memory(GiB)": 77.56, "step": 23245, "token_acc": 0.44072948328267475, "train_speed(iter/s)": 1.444681 }, { "epoch": 0.9961012810076689, "grad_norm": 4.395718574523926, "learning_rate": 9.052272131616168e-05, "loss": 2.68499698638916, "memory(GiB)": 77.56, "step": 23250, "token_acc": 0.44224422442244227, "train_speed(iter/s)": 1.444717 }, { "epoch": 0.9963154963369178, "grad_norm": 7.666308879852295, "learning_rate": 9.051877863667969e-05, "loss": 2.57991886138916, "memory(GiB)": 77.56, "step": 23255, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.444781 }, { "epoch": 0.9965297116661669, "grad_norm": 5.391880512237549, "learning_rate": 9.051483522316202e-05, "loss": 2.4933521270751955, "memory(GiB)": 77.56, "step": 23260, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.444799 }, { "epoch": 0.9967439269954158, "grad_norm": 7.581659317016602, "learning_rate": 9.05108910756801e-05, "loss": 2.64639892578125, "memory(GiB)": 77.56, "step": 23265, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.444839 }, { "epoch": 0.9969581423246647, "grad_norm": 4.262197971343994, "learning_rate": 9.050694619430539e-05, "loss": 2.3680606842041017, "memory(GiB)": 77.56, "step": 23270, "token_acc": 0.4962121212121212, "train_speed(iter/s)": 1.444799 }, { "epoch": 0.9971723576539138, "grad_norm": 4.928798198699951, "learning_rate": 9.050300057910936e-05, "loss": 2.792987823486328, "memory(GiB)": 77.56, "step": 23275, "token_acc": 0.4405144694533762, "train_speed(iter/s)": 1.44484 }, { "epoch": 0.9973865729831627, "grad_norm": 3.818899154663086, "learning_rate": 9.049905423016347e-05, "loss": 2.4279727935791016, "memory(GiB)": 77.56, "step": 23280, "token_acc": 0.4405594405594406, "train_speed(iter/s)": 1.444897 }, { "epoch": 0.9976007883124116, "grad_norm": 4.312445640563965, "learning_rate": 9.049510714753922e-05, "loss": 2.9236495971679686, "memory(GiB)": 77.56, "step": 23285, "token_acc": 0.41002949852507375, "train_speed(iter/s)": 1.44495 }, { "epoch": 0.9978150036416606, "grad_norm": 4.444835662841797, "learning_rate": 9.049115933130811e-05, "loss": 2.9291763305664062, "memory(GiB)": 77.56, "step": 23290, "token_acc": 0.4312977099236641, "train_speed(iter/s)": 1.44486 }, { "epoch": 0.9980292189709096, "grad_norm": 8.014284133911133, "learning_rate": 9.048721078154168e-05, "loss": 2.5835525512695314, "memory(GiB)": 77.56, "step": 23295, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.4449 }, { "epoch": 0.9982434343001585, "grad_norm": 4.911098957061768, "learning_rate": 9.048326149831143e-05, "loss": 2.60485782623291, "memory(GiB)": 77.56, "step": 23300, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.444913 }, { "epoch": 0.9984576496294075, "grad_norm": 4.6003499031066895, "learning_rate": 9.047931148168894e-05, "loss": 2.5792760848999023, "memory(GiB)": 77.56, "step": 23305, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.444905 }, { "epoch": 0.9986718649586565, "grad_norm": 3.4633169174194336, "learning_rate": 9.047536073174573e-05, "loss": 2.8468677520751955, "memory(GiB)": 77.56, "step": 23310, "token_acc": 0.4322916666666667, "train_speed(iter/s)": 1.444853 }, { "epoch": 0.9988860802879054, "grad_norm": 4.67794942855835, "learning_rate": 9.047140924855342e-05, "loss": 2.48193244934082, "memory(GiB)": 77.56, "step": 23315, "token_acc": 0.511864406779661, "train_speed(iter/s)": 1.444759 }, { "epoch": 0.9991002956171544, "grad_norm": 5.053409099578857, "learning_rate": 9.046745703218356e-05, "loss": 2.7321683883666994, "memory(GiB)": 77.56, "step": 23320, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.444813 }, { "epoch": 0.9993145109464033, "grad_norm": 4.058961868286133, "learning_rate": 9.046350408270772e-05, "loss": 2.5489475250244142, "memory(GiB)": 77.56, "step": 23325, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.444769 }, { "epoch": 0.9995287262756523, "grad_norm": 4.068192481994629, "learning_rate": 9.045955040019758e-05, "loss": 2.542393684387207, "memory(GiB)": 77.56, "step": 23330, "token_acc": 0.44483985765124556, "train_speed(iter/s)": 1.444758 }, { "epoch": 0.9997429416049013, "grad_norm": 6.695660591125488, "learning_rate": 9.045559598472472e-05, "loss": 2.796040153503418, "memory(GiB)": 77.56, "step": 23335, "token_acc": 0.4225352112676056, "train_speed(iter/s)": 1.444775 }, { "epoch": 0.9999571569341502, "grad_norm": 3.9384238719940186, "learning_rate": 9.045164083636079e-05, "loss": 2.6224626541137694, "memory(GiB)": 77.56, "step": 23340, "token_acc": 0.4293628808864266, "train_speed(iter/s)": 1.444831 }, { "epoch": 1.0001713722633991, "grad_norm": 4.683394908905029, "learning_rate": 9.044768495517744e-05, "loss": 2.376451110839844, "memory(GiB)": 77.56, "step": 23345, "token_acc": 0.4957983193277311, "train_speed(iter/s)": 1.444882 }, { "epoch": 1.0003855875926482, "grad_norm": 5.288326263427734, "learning_rate": 9.044372834124632e-05, "loss": 2.3113418579101563, "memory(GiB)": 77.56, "step": 23350, "token_acc": 0.5021097046413502, "train_speed(iter/s)": 1.444906 }, { "epoch": 1.000599802921897, "grad_norm": 5.382696628570557, "learning_rate": 9.043977099463914e-05, "loss": 2.6199560165405273, "memory(GiB)": 77.56, "step": 23355, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.444945 }, { "epoch": 1.000814018251146, "grad_norm": 4.5582780838012695, "learning_rate": 9.043581291542757e-05, "loss": 2.6732173919677735, "memory(GiB)": 77.56, "step": 23360, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.444919 }, { "epoch": 1.001028233580395, "grad_norm": 3.6138248443603516, "learning_rate": 9.043185410368332e-05, "loss": 2.5210849761962892, "memory(GiB)": 77.56, "step": 23365, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.444948 }, { "epoch": 1.0012424489096439, "grad_norm": 6.489718437194824, "learning_rate": 9.042789455947808e-05, "loss": 2.3586307525634767, "memory(GiB)": 77.56, "step": 23370, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.445018 }, { "epoch": 1.001456664238893, "grad_norm": 5.141478538513184, "learning_rate": 9.042393428288363e-05, "loss": 2.3916748046875, "memory(GiB)": 77.56, "step": 23375, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.445023 }, { "epoch": 1.001670879568142, "grad_norm": 4.6214728355407715, "learning_rate": 9.041997327397169e-05, "loss": 2.7524709701538086, "memory(GiB)": 77.56, "step": 23380, "token_acc": 0.483271375464684, "train_speed(iter/s)": 1.445068 }, { "epoch": 1.0018850948973907, "grad_norm": 5.004777908325195, "learning_rate": 9.0416011532814e-05, "loss": 2.4133243560791016, "memory(GiB)": 77.56, "step": 23385, "token_acc": 0.4866666666666667, "train_speed(iter/s)": 1.445081 }, { "epoch": 1.0020993102266398, "grad_norm": 3.792548179626465, "learning_rate": 9.041204905948236e-05, "loss": 2.499088096618652, "memory(GiB)": 77.56, "step": 23390, "token_acc": 0.5070821529745042, "train_speed(iter/s)": 1.445155 }, { "epoch": 1.0023135255558888, "grad_norm": 3.9695231914520264, "learning_rate": 9.040808585404854e-05, "loss": 2.4528261184692384, "memory(GiB)": 77.56, "step": 23395, "token_acc": 0.4902597402597403, "train_speed(iter/s)": 1.44521 }, { "epoch": 1.0025277408851376, "grad_norm": 4.173301696777344, "learning_rate": 9.040412191658434e-05, "loss": 2.3430273056030275, "memory(GiB)": 77.56, "step": 23400, "token_acc": 0.490625, "train_speed(iter/s)": 1.44522 }, { "epoch": 1.0027419562143867, "grad_norm": 5.720931529998779, "learning_rate": 9.040015724716157e-05, "loss": 2.5962554931640627, "memory(GiB)": 77.56, "step": 23405, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.445283 }, { "epoch": 1.0029561715436357, "grad_norm": 4.31567907333374, "learning_rate": 9.039619184585204e-05, "loss": 2.31009521484375, "memory(GiB)": 77.56, "step": 23410, "token_acc": 0.5316455696202531, "train_speed(iter/s)": 1.445276 }, { "epoch": 1.0031703868728847, "grad_norm": 4.946566581726074, "learning_rate": 9.039222571272763e-05, "loss": 2.402265739440918, "memory(GiB)": 77.56, "step": 23415, "token_acc": 0.5245283018867924, "train_speed(iter/s)": 1.445287 }, { "epoch": 1.0033846022021335, "grad_norm": 5.9220709800720215, "learning_rate": 9.038825884786013e-05, "loss": 2.526499557495117, "memory(GiB)": 77.56, "step": 23420, "token_acc": 0.4849624060150376, "train_speed(iter/s)": 1.445301 }, { "epoch": 1.0035988175313826, "grad_norm": 6.5811767578125, "learning_rate": 9.038429125132143e-05, "loss": 2.2615240097045897, "memory(GiB)": 77.56, "step": 23425, "token_acc": 0.5503875968992248, "train_speed(iter/s)": 1.445273 }, { "epoch": 1.0038130328606316, "grad_norm": 6.96791934967041, "learning_rate": 9.038032292318343e-05, "loss": 1.8306434631347657, "memory(GiB)": 77.56, "step": 23430, "token_acc": 0.6028708133971292, "train_speed(iter/s)": 1.445289 }, { "epoch": 1.0040272481898804, "grad_norm": 5.037871837615967, "learning_rate": 9.037635386351801e-05, "loss": 2.6830322265625, "memory(GiB)": 77.56, "step": 23435, "token_acc": 0.47560975609756095, "train_speed(iter/s)": 1.445322 }, { "epoch": 1.0042414635191295, "grad_norm": 4.981171607971191, "learning_rate": 9.037238407239705e-05, "loss": 2.4497539520263674, "memory(GiB)": 77.56, "step": 23440, "token_acc": 0.4607142857142857, "train_speed(iter/s)": 1.445317 }, { "epoch": 1.0044556788483785, "grad_norm": 4.907400608062744, "learning_rate": 9.036841354989248e-05, "loss": 2.5852592468261717, "memory(GiB)": 77.56, "step": 23445, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.445399 }, { "epoch": 1.0046698941776273, "grad_norm": 6.677310943603516, "learning_rate": 9.036444229607623e-05, "loss": 2.505696487426758, "memory(GiB)": 77.56, "step": 23450, "token_acc": 0.45634920634920634, "train_speed(iter/s)": 1.445407 }, { "epoch": 1.0048841095068763, "grad_norm": 5.097861289978027, "learning_rate": 9.036047031102024e-05, "loss": 2.3147756576538088, "memory(GiB)": 77.56, "step": 23455, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.445447 }, { "epoch": 1.0050983248361254, "grad_norm": 4.4812493324279785, "learning_rate": 9.035649759479648e-05, "loss": 2.6580783843994142, "memory(GiB)": 77.56, "step": 23460, "token_acc": 0.4979919678714859, "train_speed(iter/s)": 1.44545 }, { "epoch": 1.0053125401653742, "grad_norm": 4.0916290283203125, "learning_rate": 9.03525241474769e-05, "loss": 2.3689361572265626, "memory(GiB)": 77.56, "step": 23465, "token_acc": 0.49557522123893805, "train_speed(iter/s)": 1.445439 }, { "epoch": 1.0055267554946232, "grad_norm": 4.03123664855957, "learning_rate": 9.034854996913349e-05, "loss": 2.78851203918457, "memory(GiB)": 77.56, "step": 23470, "token_acc": 0.4696485623003195, "train_speed(iter/s)": 1.445535 }, { "epoch": 1.0057409708238723, "grad_norm": 4.383652210235596, "learning_rate": 9.034457505983825e-05, "loss": 2.697202682495117, "memory(GiB)": 77.56, "step": 23475, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.445527 }, { "epoch": 1.005955186153121, "grad_norm": 4.011081695556641, "learning_rate": 9.034059941966318e-05, "loss": 2.5129276275634767, "memory(GiB)": 77.56, "step": 23480, "token_acc": 0.47440273037542663, "train_speed(iter/s)": 1.445491 }, { "epoch": 1.00616940148237, "grad_norm": 5.929107666015625, "learning_rate": 9.033662304868031e-05, "loss": 2.607911491394043, "memory(GiB)": 77.56, "step": 23485, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.445454 }, { "epoch": 1.0063836168116191, "grad_norm": 5.403196811676025, "learning_rate": 9.033264594696169e-05, "loss": 2.611578178405762, "memory(GiB)": 77.56, "step": 23490, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.445462 }, { "epoch": 1.006597832140868, "grad_norm": 5.1242828369140625, "learning_rate": 9.032866811457935e-05, "loss": 2.7485353469848635, "memory(GiB)": 77.56, "step": 23495, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.4455 }, { "epoch": 1.006812047470117, "grad_norm": 4.291436195373535, "learning_rate": 9.032468955160533e-05, "loss": 2.426784706115723, "memory(GiB)": 77.56, "step": 23500, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 1.445468 }, { "epoch": 1.006812047470117, "eval_loss": 2.2164549827575684, "eval_runtime": 14.6719, "eval_samples_per_second": 6.816, "eval_steps_per_second": 6.816, "eval_token_acc": 0.4899598393574297, "step": 23500 }, { "epoch": 1.007026262799366, "grad_norm": 4.435128211975098, "learning_rate": 9.032071025811175e-05, "loss": 2.2500232696533202, "memory(GiB)": 77.56, "step": 23505, "token_acc": 0.5024925224327019, "train_speed(iter/s)": 1.444143 }, { "epoch": 1.0072404781286148, "grad_norm": 3.8093514442443848, "learning_rate": 9.031673023417069e-05, "loss": 2.5141700744628905, "memory(GiB)": 77.56, "step": 23510, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.444128 }, { "epoch": 1.0074546934578639, "grad_norm": 4.975738525390625, "learning_rate": 9.031274947985422e-05, "loss": 2.4633569717407227, "memory(GiB)": 77.56, "step": 23515, "token_acc": 0.5132450331125827, "train_speed(iter/s)": 1.444133 }, { "epoch": 1.007668908787113, "grad_norm": 4.458843231201172, "learning_rate": 9.03087679952345e-05, "loss": 2.706062126159668, "memory(GiB)": 77.56, "step": 23520, "token_acc": 0.4412811387900356, "train_speed(iter/s)": 1.444104 }, { "epoch": 1.0078831241163617, "grad_norm": 3.636833906173706, "learning_rate": 9.030478578038361e-05, "loss": 2.2278390884399415, "memory(GiB)": 77.56, "step": 23525, "token_acc": 0.5256410256410257, "train_speed(iter/s)": 1.444151 }, { "epoch": 1.0080973394456108, "grad_norm": 5.378984451293945, "learning_rate": 9.030080283537374e-05, "loss": 2.7690994262695314, "memory(GiB)": 77.56, "step": 23530, "token_acc": 0.5210727969348659, "train_speed(iter/s)": 1.444155 }, { "epoch": 1.0083115547748598, "grad_norm": 5.114414691925049, "learning_rate": 9.029681916027701e-05, "loss": 2.727855110168457, "memory(GiB)": 77.56, "step": 23535, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.444231 }, { "epoch": 1.0085257701041086, "grad_norm": 4.054899215698242, "learning_rate": 9.029283475516561e-05, "loss": 2.854404830932617, "memory(GiB)": 77.56, "step": 23540, "token_acc": 0.4425087108013937, "train_speed(iter/s)": 1.44417 }, { "epoch": 1.0087399854333576, "grad_norm": 5.455842971801758, "learning_rate": 9.028884962011169e-05, "loss": 2.477247428894043, "memory(GiB)": 77.56, "step": 23545, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.444217 }, { "epoch": 1.0089542007626067, "grad_norm": 4.675895690917969, "learning_rate": 9.028486375518748e-05, "loss": 2.4009916305541994, "memory(GiB)": 77.56, "step": 23550, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.444203 }, { "epoch": 1.0091684160918555, "grad_norm": 4.928582191467285, "learning_rate": 9.028087716046516e-05, "loss": 2.26422004699707, "memory(GiB)": 77.56, "step": 23555, "token_acc": 0.5283687943262412, "train_speed(iter/s)": 1.444107 }, { "epoch": 1.0093826314211045, "grad_norm": 4.196980953216553, "learning_rate": 9.027688983601699e-05, "loss": 2.4747299194335937, "memory(GiB)": 77.56, "step": 23560, "token_acc": 0.4783950617283951, "train_speed(iter/s)": 1.44415 }, { "epoch": 1.0095968467503535, "grad_norm": 4.74194860458374, "learning_rate": 9.027290178191515e-05, "loss": 2.5284299850463867, "memory(GiB)": 77.56, "step": 23565, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 1.444116 }, { "epoch": 1.0098110620796024, "grad_norm": 4.887885570526123, "learning_rate": 9.026891299823192e-05, "loss": 2.711962890625, "memory(GiB)": 77.56, "step": 23570, "token_acc": 0.47183098591549294, "train_speed(iter/s)": 1.4441 }, { "epoch": 1.0100252774088514, "grad_norm": 4.193269729614258, "learning_rate": 9.026492348503957e-05, "loss": 2.5940530776977537, "memory(GiB)": 77.56, "step": 23575, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 1.444108 }, { "epoch": 1.0102394927381004, "grad_norm": 3.5875673294067383, "learning_rate": 9.026093324241035e-05, "loss": 2.2336780548095705, "memory(GiB)": 77.56, "step": 23580, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.444137 }, { "epoch": 1.0104537080673492, "grad_norm": 4.68424129486084, "learning_rate": 9.025694227041656e-05, "loss": 2.7033918380737303, "memory(GiB)": 77.56, "step": 23585, "token_acc": 0.4483870967741935, "train_speed(iter/s)": 1.444116 }, { "epoch": 1.0106679233965983, "grad_norm": 6.216456413269043, "learning_rate": 9.025295056913049e-05, "loss": 2.254861831665039, "memory(GiB)": 77.56, "step": 23590, "token_acc": 0.5274725274725275, "train_speed(iter/s)": 1.444148 }, { "epoch": 1.0108821387258473, "grad_norm": 4.2182698249816895, "learning_rate": 9.024895813862446e-05, "loss": 2.3986690521240233, "memory(GiB)": 77.56, "step": 23595, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 1.444234 }, { "epoch": 1.0110963540550961, "grad_norm": 3.751181125640869, "learning_rate": 9.024496497897082e-05, "loss": 2.772421646118164, "memory(GiB)": 77.56, "step": 23600, "token_acc": 0.49640287769784175, "train_speed(iter/s)": 1.444263 }, { "epoch": 1.0113105693843452, "grad_norm": 5.261978626251221, "learning_rate": 9.024097109024186e-05, "loss": 2.661305618286133, "memory(GiB)": 77.56, "step": 23605, "token_acc": 0.4582043343653251, "train_speed(iter/s)": 1.444172 }, { "epoch": 1.0115247847135942, "grad_norm": 4.343251705169678, "learning_rate": 9.023697647250995e-05, "loss": 2.5762212753295897, "memory(GiB)": 77.56, "step": 23610, "token_acc": 0.45267489711934156, "train_speed(iter/s)": 1.444111 }, { "epoch": 1.011739000042843, "grad_norm": 4.401554584503174, "learning_rate": 9.023298112584749e-05, "loss": 2.5073280334472656, "memory(GiB)": 77.56, "step": 23615, "token_acc": 0.47232472324723246, "train_speed(iter/s)": 1.444144 }, { "epoch": 1.011953215372092, "grad_norm": 5.015374660491943, "learning_rate": 9.022898505032685e-05, "loss": 2.145386505126953, "memory(GiB)": 77.56, "step": 23620, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.444123 }, { "epoch": 1.012167430701341, "grad_norm": 4.877620220184326, "learning_rate": 9.022498824602037e-05, "loss": 2.8943077087402345, "memory(GiB)": 77.56, "step": 23625, "token_acc": 0.42803030303030304, "train_speed(iter/s)": 1.44421 }, { "epoch": 1.0123816460305899, "grad_norm": 4.435857772827148, "learning_rate": 9.022099071300052e-05, "loss": 2.5356782913208007, "memory(GiB)": 77.56, "step": 23630, "token_acc": 0.4548022598870056, "train_speed(iter/s)": 1.444255 }, { "epoch": 1.012595861359839, "grad_norm": 3.3223369121551514, "learning_rate": 9.021699245133967e-05, "loss": 2.3703386306762697, "memory(GiB)": 77.56, "step": 23635, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.444251 }, { "epoch": 1.012810076689088, "grad_norm": 5.082808971405029, "learning_rate": 9.02129934611103e-05, "loss": 2.4239749908447266, "memory(GiB)": 77.56, "step": 23640, "token_acc": 0.49673202614379086, "train_speed(iter/s)": 1.444226 }, { "epoch": 1.0130242920183368, "grad_norm": 4.389357566833496, "learning_rate": 9.020899374238481e-05, "loss": 2.378102111816406, "memory(GiB)": 77.56, "step": 23645, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.444253 }, { "epoch": 1.0132385073475858, "grad_norm": 4.6339521408081055, "learning_rate": 9.020499329523569e-05, "loss": 2.746384620666504, "memory(GiB)": 77.56, "step": 23650, "token_acc": 0.4817813765182186, "train_speed(iter/s)": 1.444261 }, { "epoch": 1.0134527226768348, "grad_norm": 4.083985328674316, "learning_rate": 9.02009921197354e-05, "loss": 2.3197113037109376, "memory(GiB)": 77.56, "step": 23655, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.444324 }, { "epoch": 1.0136669380060837, "grad_norm": 5.729962348937988, "learning_rate": 9.019699021595642e-05, "loss": 2.289439392089844, "memory(GiB)": 77.56, "step": 23660, "token_acc": 0.5285171102661597, "train_speed(iter/s)": 1.444364 }, { "epoch": 1.0138811533353327, "grad_norm": 5.57902193069458, "learning_rate": 9.019298758397127e-05, "loss": 2.804506301879883, "memory(GiB)": 77.56, "step": 23665, "token_acc": 0.39361702127659576, "train_speed(iter/s)": 1.444296 }, { "epoch": 1.0140953686645817, "grad_norm": 3.6773295402526855, "learning_rate": 9.018898422385243e-05, "loss": 2.359520149230957, "memory(GiB)": 77.56, "step": 23670, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.44425 }, { "epoch": 1.0143095839938305, "grad_norm": 5.6231770515441895, "learning_rate": 9.018498013567244e-05, "loss": 2.3890830993652346, "memory(GiB)": 77.56, "step": 23675, "token_acc": 0.5138339920948617, "train_speed(iter/s)": 1.444231 }, { "epoch": 1.0145237993230796, "grad_norm": 5.198417663574219, "learning_rate": 9.018097531950385e-05, "loss": 2.439471435546875, "memory(GiB)": 77.56, "step": 23680, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.444241 }, { "epoch": 1.0147380146523286, "grad_norm": 4.018439292907715, "learning_rate": 9.01769697754192e-05, "loss": 2.27854061126709, "memory(GiB)": 77.56, "step": 23685, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.444187 }, { "epoch": 1.0149522299815774, "grad_norm": 7.3545451164245605, "learning_rate": 9.017296350349105e-05, "loss": 2.6238658905029295, "memory(GiB)": 77.56, "step": 23690, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.444262 }, { "epoch": 1.0151664453108264, "grad_norm": 4.56335973739624, "learning_rate": 9.016895650379198e-05, "loss": 2.372761917114258, "memory(GiB)": 77.56, "step": 23695, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.444253 }, { "epoch": 1.0153806606400755, "grad_norm": 6.182762145996094, "learning_rate": 9.016494877639457e-05, "loss": 2.6362482070922852, "memory(GiB)": 77.56, "step": 23700, "token_acc": 0.43154761904761907, "train_speed(iter/s)": 1.444202 }, { "epoch": 1.0155948759693243, "grad_norm": 3.9759950637817383, "learning_rate": 9.016094032137144e-05, "loss": 2.288136291503906, "memory(GiB)": 77.56, "step": 23705, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.444228 }, { "epoch": 1.0158090912985733, "grad_norm": 4.441798686981201, "learning_rate": 9.015693113879521e-05, "loss": 2.769272232055664, "memory(GiB)": 77.56, "step": 23710, "token_acc": 0.4262295081967213, "train_speed(iter/s)": 1.444223 }, { "epoch": 1.0160233066278224, "grad_norm": 4.964745044708252, "learning_rate": 9.01529212287385e-05, "loss": 2.537934112548828, "memory(GiB)": 77.56, "step": 23715, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.444312 }, { "epoch": 1.0162375219570712, "grad_norm": 6.218685626983643, "learning_rate": 9.014891059127395e-05, "loss": 2.3481212615966798, "memory(GiB)": 77.56, "step": 23720, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 1.444349 }, { "epoch": 1.0164517372863202, "grad_norm": 5.976919174194336, "learning_rate": 9.014489922647423e-05, "loss": 2.3539093017578123, "memory(GiB)": 77.56, "step": 23725, "token_acc": 0.5119047619047619, "train_speed(iter/s)": 1.444384 }, { "epoch": 1.0166659526155692, "grad_norm": 6.729485988616943, "learning_rate": 9.014088713441199e-05, "loss": 2.693385124206543, "memory(GiB)": 77.56, "step": 23730, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.444488 }, { "epoch": 1.016880167944818, "grad_norm": 4.621358394622803, "learning_rate": 9.013687431515994e-05, "loss": 2.6795597076416016, "memory(GiB)": 77.56, "step": 23735, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.444445 }, { "epoch": 1.017094383274067, "grad_norm": 6.505117416381836, "learning_rate": 9.013286076879075e-05, "loss": 2.6510812759399416, "memory(GiB)": 77.56, "step": 23740, "token_acc": 0.4440789473684211, "train_speed(iter/s)": 1.44448 }, { "epoch": 1.0173085986033161, "grad_norm": 4.350959777832031, "learning_rate": 9.012884649537715e-05, "loss": 2.6561456680297852, "memory(GiB)": 77.56, "step": 23745, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.444471 }, { "epoch": 1.017522813932565, "grad_norm": 4.7133002281188965, "learning_rate": 9.012483149499184e-05, "loss": 2.5441621780395507, "memory(GiB)": 77.56, "step": 23750, "token_acc": 0.5089605734767025, "train_speed(iter/s)": 1.44447 }, { "epoch": 1.017737029261814, "grad_norm": 4.532026767730713, "learning_rate": 9.012081576770757e-05, "loss": 2.4489009857177733, "memory(GiB)": 77.56, "step": 23755, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 1.444491 }, { "epoch": 1.017951244591063, "grad_norm": 5.434869289398193, "learning_rate": 9.011679931359708e-05, "loss": 2.395196723937988, "memory(GiB)": 77.56, "step": 23760, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 1.444524 }, { "epoch": 1.0181654599203118, "grad_norm": 4.805933475494385, "learning_rate": 9.011278213273315e-05, "loss": 2.60504264831543, "memory(GiB)": 77.56, "step": 23765, "token_acc": 0.43103448275862066, "train_speed(iter/s)": 1.444568 }, { "epoch": 1.0183796752495609, "grad_norm": 4.878146648406982, "learning_rate": 9.010876422518854e-05, "loss": 2.6618135452270506, "memory(GiB)": 77.56, "step": 23770, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.444585 }, { "epoch": 1.01859389057881, "grad_norm": 3.9071240425109863, "learning_rate": 9.010474559103604e-05, "loss": 2.258107566833496, "memory(GiB)": 77.56, "step": 23775, "token_acc": 0.511326860841424, "train_speed(iter/s)": 1.444679 }, { "epoch": 1.0188081059080587, "grad_norm": 4.440838813781738, "learning_rate": 9.010072623034845e-05, "loss": 2.464816093444824, "memory(GiB)": 77.56, "step": 23780, "token_acc": 0.4875, "train_speed(iter/s)": 1.444688 }, { "epoch": 1.0190223212373077, "grad_norm": 3.7575438022613525, "learning_rate": 9.00967061431986e-05, "loss": 2.516987609863281, "memory(GiB)": 77.56, "step": 23785, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 1.44467 }, { "epoch": 1.0192365365665568, "grad_norm": 4.53010368347168, "learning_rate": 9.009268532965929e-05, "loss": 2.556178092956543, "memory(GiB)": 77.56, "step": 23790, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.444594 }, { "epoch": 1.0194507518958056, "grad_norm": 4.7371602058410645, "learning_rate": 9.008866378980338e-05, "loss": 2.479389190673828, "memory(GiB)": 77.56, "step": 23795, "token_acc": 0.4885057471264368, "train_speed(iter/s)": 1.444564 }, { "epoch": 1.0196649672250546, "grad_norm": 4.864514350891113, "learning_rate": 9.008464152370371e-05, "loss": 2.3049648284912108, "memory(GiB)": 77.56, "step": 23800, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.444415 }, { "epoch": 1.0198791825543037, "grad_norm": 4.705252170562744, "learning_rate": 9.008061853143318e-05, "loss": 2.311351776123047, "memory(GiB)": 77.56, "step": 23805, "token_acc": 0.49624060150375937, "train_speed(iter/s)": 1.444413 }, { "epoch": 1.0200933978835525, "grad_norm": 4.033809661865234, "learning_rate": 9.00765948130646e-05, "loss": 2.330838203430176, "memory(GiB)": 77.56, "step": 23810, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.44437 }, { "epoch": 1.0203076132128015, "grad_norm": 3.6460118293762207, "learning_rate": 9.007257036867095e-05, "loss": 2.457318115234375, "memory(GiB)": 77.56, "step": 23815, "token_acc": 0.47774480712166173, "train_speed(iter/s)": 1.44434 }, { "epoch": 1.0205218285420505, "grad_norm": 5.905128479003906, "learning_rate": 9.006854519832509e-05, "loss": 2.585536575317383, "memory(GiB)": 77.56, "step": 23820, "token_acc": 0.46875, "train_speed(iter/s)": 1.444305 }, { "epoch": 1.0207360438712993, "grad_norm": 6.288656711578369, "learning_rate": 9.006451930209995e-05, "loss": 2.3825435638427734, "memory(GiB)": 77.56, "step": 23825, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.444299 }, { "epoch": 1.0209502592005484, "grad_norm": 4.655263423919678, "learning_rate": 9.006049268006844e-05, "loss": 2.332324981689453, "memory(GiB)": 77.56, "step": 23830, "token_acc": 0.4803921568627451, "train_speed(iter/s)": 1.444371 }, { "epoch": 1.0211644745297974, "grad_norm": 4.464681625366211, "learning_rate": 9.005646533230354e-05, "loss": 2.542655372619629, "memory(GiB)": 77.56, "step": 23835, "token_acc": 0.5110132158590308, "train_speed(iter/s)": 1.444393 }, { "epoch": 1.0213786898590462, "grad_norm": 5.661242485046387, "learning_rate": 9.005243725887819e-05, "loss": 2.388264465332031, "memory(GiB)": 77.56, "step": 23840, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.444441 }, { "epoch": 1.0215929051882953, "grad_norm": 5.917185306549072, "learning_rate": 9.004840845986538e-05, "loss": 2.3632976531982424, "memory(GiB)": 77.56, "step": 23845, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.444459 }, { "epoch": 1.0218071205175443, "grad_norm": 4.519986152648926, "learning_rate": 9.004437893533807e-05, "loss": 2.4975162506103517, "memory(GiB)": 77.56, "step": 23850, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.444394 }, { "epoch": 1.022021335846793, "grad_norm": 6.680171012878418, "learning_rate": 9.004034868536929e-05, "loss": 2.448757553100586, "memory(GiB)": 77.56, "step": 23855, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.444377 }, { "epoch": 1.0222355511760421, "grad_norm": 4.50553560256958, "learning_rate": 9.0036317710032e-05, "loss": 2.5293701171875, "memory(GiB)": 77.56, "step": 23860, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.444341 }, { "epoch": 1.0224497665052912, "grad_norm": 4.118928909301758, "learning_rate": 9.003228600939926e-05, "loss": 2.4206512451171873, "memory(GiB)": 77.56, "step": 23865, "token_acc": 0.4891640866873065, "train_speed(iter/s)": 1.444328 }, { "epoch": 1.02266398183454, "grad_norm": 5.375980377197266, "learning_rate": 9.002825358354414e-05, "loss": 2.9208450317382812, "memory(GiB)": 77.56, "step": 23870, "token_acc": 0.4066666666666667, "train_speed(iter/s)": 1.444306 }, { "epoch": 1.022878197163789, "grad_norm": 4.370960235595703, "learning_rate": 9.002422043253962e-05, "loss": 2.751597595214844, "memory(GiB)": 77.56, "step": 23875, "token_acc": 0.46458923512747874, "train_speed(iter/s)": 1.444348 }, { "epoch": 1.023092412493038, "grad_norm": 5.483879566192627, "learning_rate": 9.002018655645882e-05, "loss": 2.7617847442626955, "memory(GiB)": 77.56, "step": 23880, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.444382 }, { "epoch": 1.0233066278222869, "grad_norm": 4.41718053817749, "learning_rate": 9.00161519553748e-05, "loss": 2.233142280578613, "memory(GiB)": 77.56, "step": 23885, "token_acc": 0.5, "train_speed(iter/s)": 1.444315 }, { "epoch": 1.023520843151536, "grad_norm": 5.621650695800781, "learning_rate": 9.001211662936065e-05, "loss": 2.3641443252563477, "memory(GiB)": 77.56, "step": 23890, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.44436 }, { "epoch": 1.023735058480785, "grad_norm": 3.5997681617736816, "learning_rate": 9.000808057848946e-05, "loss": 2.4787971496582033, "memory(GiB)": 77.56, "step": 23895, "token_acc": 0.4696969696969697, "train_speed(iter/s)": 1.444375 }, { "epoch": 1.0239492738100338, "grad_norm": 4.5780839920043945, "learning_rate": 9.000404380283435e-05, "loss": 2.3750240325927736, "memory(GiB)": 77.56, "step": 23900, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.444328 }, { "epoch": 1.0241634891392828, "grad_norm": 3.27915358543396, "learning_rate": 9.000000630246848e-05, "loss": 2.408158302307129, "memory(GiB)": 77.56, "step": 23905, "token_acc": 0.46449704142011833, "train_speed(iter/s)": 1.444378 }, { "epoch": 1.0243777044685318, "grad_norm": 4.563897609710693, "learning_rate": 8.999596807746497e-05, "loss": 2.3405872344970704, "memory(GiB)": 77.56, "step": 23910, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.444343 }, { "epoch": 1.0245919197977806, "grad_norm": 5.107802391052246, "learning_rate": 8.999192912789697e-05, "loss": 2.611890983581543, "memory(GiB)": 77.56, "step": 23915, "token_acc": 0.4297752808988764, "train_speed(iter/s)": 1.444325 }, { "epoch": 1.0248061351270297, "grad_norm": 4.760217189788818, "learning_rate": 8.998788945383768e-05, "loss": 2.41015510559082, "memory(GiB)": 77.56, "step": 23920, "token_acc": 0.4892966360856269, "train_speed(iter/s)": 1.444407 }, { "epoch": 1.0250203504562787, "grad_norm": 4.066064834594727, "learning_rate": 8.998384905536025e-05, "loss": 2.623775100708008, "memory(GiB)": 77.56, "step": 23925, "token_acc": 0.48348348348348347, "train_speed(iter/s)": 1.444404 }, { "epoch": 1.0252345657855275, "grad_norm": 3.8480799198150635, "learning_rate": 8.997980793253789e-05, "loss": 2.5957548141479494, "memory(GiB)": 77.56, "step": 23930, "token_acc": 0.5, "train_speed(iter/s)": 1.444381 }, { "epoch": 1.0254487811147766, "grad_norm": 9.379072189331055, "learning_rate": 8.99757660854438e-05, "loss": 2.2241458892822266, "memory(GiB)": 77.56, "step": 23935, "token_acc": 0.5703703703703704, "train_speed(iter/s)": 1.444317 }, { "epoch": 1.0256629964440256, "grad_norm": 5.639321327209473, "learning_rate": 8.99717235141512e-05, "loss": 2.548636817932129, "memory(GiB)": 77.56, "step": 23940, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.444376 }, { "epoch": 1.0258772117732744, "grad_norm": 5.260157585144043, "learning_rate": 8.996768021873334e-05, "loss": 2.483246421813965, "memory(GiB)": 77.56, "step": 23945, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.444272 }, { "epoch": 1.0260914271025234, "grad_norm": 5.605607509613037, "learning_rate": 8.996363619926346e-05, "loss": 2.5595935821533202, "memory(GiB)": 77.56, "step": 23950, "token_acc": 0.48627450980392156, "train_speed(iter/s)": 1.444336 }, { "epoch": 1.0263056424317725, "grad_norm": 4.457199573516846, "learning_rate": 8.995959145581482e-05, "loss": 2.798406219482422, "memory(GiB)": 77.56, "step": 23955, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.444318 }, { "epoch": 1.0265198577610213, "grad_norm": 4.708622455596924, "learning_rate": 8.995554598846069e-05, "loss": 2.197768211364746, "memory(GiB)": 77.56, "step": 23960, "token_acc": 0.5282392026578073, "train_speed(iter/s)": 1.444219 }, { "epoch": 1.0267340730902703, "grad_norm": 5.235738277435303, "learning_rate": 8.995149979727437e-05, "loss": 2.715723991394043, "memory(GiB)": 77.56, "step": 23965, "token_acc": 0.47398843930635837, "train_speed(iter/s)": 1.444218 }, { "epoch": 1.0269482884195194, "grad_norm": 4.115472793579102, "learning_rate": 8.994745288232916e-05, "loss": 2.5359683990478517, "memory(GiB)": 77.56, "step": 23970, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.444185 }, { "epoch": 1.0271625037487682, "grad_norm": 3.066713809967041, "learning_rate": 8.994340524369836e-05, "loss": 2.552429962158203, "memory(GiB)": 77.56, "step": 23975, "token_acc": 0.42755344418052255, "train_speed(iter/s)": 1.444213 }, { "epoch": 1.0273767190780172, "grad_norm": 4.575443744659424, "learning_rate": 8.993935688145529e-05, "loss": 2.6501426696777344, "memory(GiB)": 77.56, "step": 23980, "token_acc": 0.4550561797752809, "train_speed(iter/s)": 1.444278 }, { "epoch": 1.0275909344072662, "grad_norm": 4.8414998054504395, "learning_rate": 8.993530779567332e-05, "loss": 2.350907897949219, "memory(GiB)": 77.56, "step": 23985, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.444298 }, { "epoch": 1.027805149736515, "grad_norm": 4.459887504577637, "learning_rate": 8.993125798642579e-05, "loss": 2.5612808227539063, "memory(GiB)": 77.56, "step": 23990, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.444328 }, { "epoch": 1.028019365065764, "grad_norm": 4.45468282699585, "learning_rate": 8.992720745378605e-05, "loss": 2.2972211837768555, "memory(GiB)": 77.56, "step": 23995, "token_acc": 0.5228070175438596, "train_speed(iter/s)": 1.444271 }, { "epoch": 1.0282335803950131, "grad_norm": 5.335805892944336, "learning_rate": 8.99231561978275e-05, "loss": 2.455869674682617, "memory(GiB)": 77.56, "step": 24000, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.444212 }, { "epoch": 1.0282335803950131, "eval_loss": 2.3627328872680664, "eval_runtime": 14.1403, "eval_samples_per_second": 7.072, "eval_steps_per_second": 7.072, "eval_token_acc": 0.44138755980861244, "step": 24000 }, { "epoch": 1.028447795724262, "grad_norm": 5.489840984344482, "learning_rate": 8.991910421862352e-05, "loss": 2.632350730895996, "memory(GiB)": 77.56, "step": 24005, "token_acc": 0.44190140845070425, "train_speed(iter/s)": 1.442874 }, { "epoch": 1.028662011053511, "grad_norm": 4.241428852081299, "learning_rate": 8.991505151624752e-05, "loss": 2.454863166809082, "memory(GiB)": 77.56, "step": 24010, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.442893 }, { "epoch": 1.02887622638276, "grad_norm": 4.7103776931762695, "learning_rate": 8.991099809077292e-05, "loss": 2.3978784561157225, "memory(GiB)": 77.56, "step": 24015, "token_acc": 0.44368600682593856, "train_speed(iter/s)": 1.442851 }, { "epoch": 1.0290904417120088, "grad_norm": 8.375214576721191, "learning_rate": 8.990694394227317e-05, "loss": 2.35728759765625, "memory(GiB)": 77.56, "step": 24020, "token_acc": 0.5047169811320755, "train_speed(iter/s)": 1.442866 }, { "epoch": 1.0293046570412578, "grad_norm": 5.615808010101318, "learning_rate": 8.990288907082168e-05, "loss": 2.4102575302124025, "memory(GiB)": 77.56, "step": 24025, "token_acc": 0.5098684210526315, "train_speed(iter/s)": 1.44277 }, { "epoch": 1.0295188723705069, "grad_norm": 4.144071578979492, "learning_rate": 8.989883347649191e-05, "loss": 2.474465560913086, "memory(GiB)": 77.56, "step": 24030, "token_acc": 0.4715447154471545, "train_speed(iter/s)": 1.442729 }, { "epoch": 1.0297330876997557, "grad_norm": 6.063302040100098, "learning_rate": 8.989477715935735e-05, "loss": 2.8166954040527346, "memory(GiB)": 77.56, "step": 24035, "token_acc": 0.42662116040955633, "train_speed(iter/s)": 1.442752 }, { "epoch": 1.0299473030290047, "grad_norm": 6.846219062805176, "learning_rate": 8.989072011949148e-05, "loss": 2.705123710632324, "memory(GiB)": 77.56, "step": 24040, "token_acc": 0.4357142857142857, "train_speed(iter/s)": 1.442761 }, { "epoch": 1.0301615183582538, "grad_norm": 4.94010591506958, "learning_rate": 8.988666235696779e-05, "loss": 2.6302959442138674, "memory(GiB)": 77.56, "step": 24045, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.442813 }, { "epoch": 1.0303757336875026, "grad_norm": 5.746603012084961, "learning_rate": 8.98826038718598e-05, "loss": 2.5352577209472655, "memory(GiB)": 77.56, "step": 24050, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.442859 }, { "epoch": 1.0305899490167516, "grad_norm": 4.987161159515381, "learning_rate": 8.987854466424103e-05, "loss": 2.4934181213378905, "memory(GiB)": 77.56, "step": 24055, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.442855 }, { "epoch": 1.0308041643460006, "grad_norm": 4.284734725952148, "learning_rate": 8.987448473418502e-05, "loss": 2.2293792724609376, "memory(GiB)": 77.56, "step": 24060, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.442885 }, { "epoch": 1.0310183796752495, "grad_norm": 6.619974136352539, "learning_rate": 8.98704240817653e-05, "loss": 2.5899566650390624, "memory(GiB)": 77.56, "step": 24065, "token_acc": 0.4749034749034749, "train_speed(iter/s)": 1.442843 }, { "epoch": 1.0312325950044985, "grad_norm": 4.957780838012695, "learning_rate": 8.986636270705545e-05, "loss": 2.292130470275879, "memory(GiB)": 77.56, "step": 24070, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.442884 }, { "epoch": 1.0314468103337475, "grad_norm": 4.451689720153809, "learning_rate": 8.986230061012905e-05, "loss": 2.4193578720092774, "memory(GiB)": 77.56, "step": 24075, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.442871 }, { "epoch": 1.0316610256629963, "grad_norm": 5.041201114654541, "learning_rate": 8.985823779105968e-05, "loss": 2.5141883850097657, "memory(GiB)": 77.56, "step": 24080, "token_acc": 0.4880952380952381, "train_speed(iter/s)": 1.442961 }, { "epoch": 1.0318752409922454, "grad_norm": 5.4844536781311035, "learning_rate": 8.985417424992093e-05, "loss": 2.4432388305664063, "memory(GiB)": 77.56, "step": 24085, "token_acc": 0.5099337748344371, "train_speed(iter/s)": 1.443022 }, { "epoch": 1.0320894563214944, "grad_norm": 4.77213191986084, "learning_rate": 8.985010998678642e-05, "loss": 2.46121826171875, "memory(GiB)": 77.56, "step": 24090, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.443072 }, { "epoch": 1.0323036716507432, "grad_norm": 3.9187328815460205, "learning_rate": 8.984604500172982e-05, "loss": 2.5874811172485352, "memory(GiB)": 77.56, "step": 24095, "token_acc": 0.46176470588235297, "train_speed(iter/s)": 1.443101 }, { "epoch": 1.0325178869799922, "grad_norm": 4.982126712799072, "learning_rate": 8.984197929482471e-05, "loss": 2.457944869995117, "memory(GiB)": 77.56, "step": 24100, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.443097 }, { "epoch": 1.0327321023092413, "grad_norm": 4.571814060211182, "learning_rate": 8.983791286614476e-05, "loss": 2.7963314056396484, "memory(GiB)": 77.56, "step": 24105, "token_acc": 0.45871559633027525, "train_speed(iter/s)": 1.443159 }, { "epoch": 1.03294631763849, "grad_norm": 4.690274715423584, "learning_rate": 8.983384571576367e-05, "loss": 2.499724197387695, "memory(GiB)": 77.56, "step": 24110, "token_acc": 0.47653429602888087, "train_speed(iter/s)": 1.443229 }, { "epoch": 1.0331605329677391, "grad_norm": 5.090278148651123, "learning_rate": 8.98297778437551e-05, "loss": 2.5692739486694336, "memory(GiB)": 77.56, "step": 24115, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.443311 }, { "epoch": 1.0333747482969882, "grad_norm": 4.321795463562012, "learning_rate": 8.982570925019273e-05, "loss": 2.5763378143310547, "memory(GiB)": 77.56, "step": 24120, "token_acc": 0.4610169491525424, "train_speed(iter/s)": 1.443317 }, { "epoch": 1.0335889636262372, "grad_norm": 5.195075035095215, "learning_rate": 8.982163993515027e-05, "loss": 2.4982065200805663, "memory(GiB)": 77.56, "step": 24125, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.443351 }, { "epoch": 1.033803178955486, "grad_norm": 4.9751691818237305, "learning_rate": 8.981756989870147e-05, "loss": 2.4682788848876953, "memory(GiB)": 77.56, "step": 24130, "token_acc": 0.4261168384879725, "train_speed(iter/s)": 1.44334 }, { "epoch": 1.034017394284735, "grad_norm": 5.988290309906006, "learning_rate": 8.981349914092002e-05, "loss": 2.4929752349853516, "memory(GiB)": 77.56, "step": 24135, "token_acc": 0.4979757085020243, "train_speed(iter/s)": 1.443408 }, { "epoch": 1.034231609613984, "grad_norm": 5.711163520812988, "learning_rate": 8.98094276618797e-05, "loss": 2.6479726791381837, "memory(GiB)": 77.56, "step": 24140, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.443473 }, { "epoch": 1.034445824943233, "grad_norm": 4.263439655303955, "learning_rate": 8.980535546165422e-05, "loss": 2.4716259002685548, "memory(GiB)": 77.56, "step": 24145, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.443487 }, { "epoch": 1.034660040272482, "grad_norm": 6.227019309997559, "learning_rate": 8.980128254031743e-05, "loss": 2.538905715942383, "memory(GiB)": 77.56, "step": 24150, "token_acc": 0.4560669456066946, "train_speed(iter/s)": 1.443547 }, { "epoch": 1.034874255601731, "grad_norm": 4.804160118103027, "learning_rate": 8.979720889794305e-05, "loss": 2.8498817443847657, "memory(GiB)": 77.56, "step": 24155, "token_acc": 0.47244094488188976, "train_speed(iter/s)": 1.443538 }, { "epoch": 1.0350884709309798, "grad_norm": 3.615837335586548, "learning_rate": 8.979313453460489e-05, "loss": 2.546941375732422, "memory(GiB)": 77.56, "step": 24160, "token_acc": 0.4727272727272727, "train_speed(iter/s)": 1.443586 }, { "epoch": 1.0353026862602288, "grad_norm": 5.576261043548584, "learning_rate": 8.978905945037678e-05, "loss": 2.5293659210205077, "memory(GiB)": 77.56, "step": 24165, "token_acc": 0.4968553459119497, "train_speed(iter/s)": 1.443642 }, { "epoch": 1.0355169015894778, "grad_norm": 5.04969596862793, "learning_rate": 8.978498364533252e-05, "loss": 2.2223846435546877, "memory(GiB)": 77.56, "step": 24170, "token_acc": 0.5, "train_speed(iter/s)": 1.443587 }, { "epoch": 1.0357311169187267, "grad_norm": 3.6998376846313477, "learning_rate": 8.978090711954598e-05, "loss": 2.5067092895507814, "memory(GiB)": 77.56, "step": 24175, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.443563 }, { "epoch": 1.0359453322479757, "grad_norm": 5.043964385986328, "learning_rate": 8.977682987309097e-05, "loss": 2.8575565338134767, "memory(GiB)": 77.56, "step": 24180, "token_acc": 0.4264705882352941, "train_speed(iter/s)": 1.443562 }, { "epoch": 1.0361595475772247, "grad_norm": 4.713266849517822, "learning_rate": 8.977275190604138e-05, "loss": 2.1276174545288087, "memory(GiB)": 77.56, "step": 24185, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.443596 }, { "epoch": 1.0363737629064735, "grad_norm": 4.897609233856201, "learning_rate": 8.97686732184711e-05, "loss": 2.3154813766479494, "memory(GiB)": 77.56, "step": 24190, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.443635 }, { "epoch": 1.0365879782357226, "grad_norm": 4.5343146324157715, "learning_rate": 8.976459381045396e-05, "loss": 2.355120849609375, "memory(GiB)": 77.56, "step": 24195, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.443658 }, { "epoch": 1.0368021935649716, "grad_norm": 5.203270435333252, "learning_rate": 8.976051368206393e-05, "loss": 2.558268356323242, "memory(GiB)": 77.56, "step": 24200, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.443719 }, { "epoch": 1.0370164088942204, "grad_norm": 5.331142425537109, "learning_rate": 8.97564328333749e-05, "loss": 2.3839046478271486, "memory(GiB)": 77.56, "step": 24205, "token_acc": 0.49377593360995853, "train_speed(iter/s)": 1.443713 }, { "epoch": 1.0372306242234695, "grad_norm": 5.468891143798828, "learning_rate": 8.975235126446077e-05, "loss": 2.7484081268310545, "memory(GiB)": 77.56, "step": 24210, "token_acc": 0.4253731343283582, "train_speed(iter/s)": 1.44375 }, { "epoch": 1.0374448395527185, "grad_norm": 3.892148017883301, "learning_rate": 8.974826897539553e-05, "loss": 2.881919288635254, "memory(GiB)": 77.56, "step": 24215, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.443796 }, { "epoch": 1.0376590548819673, "grad_norm": 7.222329616546631, "learning_rate": 8.97441859662531e-05, "loss": 2.2086765289306642, "memory(GiB)": 77.56, "step": 24220, "token_acc": 0.5, "train_speed(iter/s)": 1.443854 }, { "epoch": 1.0378732702112163, "grad_norm": 7.090409755706787, "learning_rate": 8.974010223710744e-05, "loss": 2.5318830490112303, "memory(GiB)": 77.56, "step": 24225, "token_acc": 0.5143769968051118, "train_speed(iter/s)": 1.443819 }, { "epoch": 1.0380874855404654, "grad_norm": 4.4375128746032715, "learning_rate": 8.973601778803257e-05, "loss": 2.4187496185302733, "memory(GiB)": 77.56, "step": 24230, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.443821 }, { "epoch": 1.0383017008697142, "grad_norm": 5.027166843414307, "learning_rate": 8.973193261910247e-05, "loss": 2.5862619400024416, "memory(GiB)": 77.56, "step": 24235, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.44381 }, { "epoch": 1.0385159161989632, "grad_norm": 4.532248497009277, "learning_rate": 8.972784673039111e-05, "loss": 2.365619087219238, "memory(GiB)": 77.56, "step": 24240, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.443818 }, { "epoch": 1.0387301315282123, "grad_norm": 5.359230995178223, "learning_rate": 8.972376012197256e-05, "loss": 2.5995391845703124, "memory(GiB)": 77.56, "step": 24245, "token_acc": 0.44755244755244755, "train_speed(iter/s)": 1.443856 }, { "epoch": 1.038944346857461, "grad_norm": 4.708656311035156, "learning_rate": 8.971967279392083e-05, "loss": 2.489506721496582, "memory(GiB)": 77.56, "step": 24250, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.443856 }, { "epoch": 1.03915856218671, "grad_norm": 5.532922744750977, "learning_rate": 8.971558474630997e-05, "loss": 2.671093559265137, "memory(GiB)": 77.56, "step": 24255, "token_acc": 0.490272373540856, "train_speed(iter/s)": 1.443923 }, { "epoch": 1.0393727775159591, "grad_norm": 4.693142414093018, "learning_rate": 8.971149597921402e-05, "loss": 2.520188331604004, "memory(GiB)": 77.56, "step": 24260, "token_acc": 0.46757679180887374, "train_speed(iter/s)": 1.443914 }, { "epoch": 1.039586992845208, "grad_norm": 4.718460559844971, "learning_rate": 8.970740649270708e-05, "loss": 2.177897644042969, "memory(GiB)": 77.56, "step": 24265, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.443918 }, { "epoch": 1.039801208174457, "grad_norm": 5.367030620574951, "learning_rate": 8.970331628686323e-05, "loss": 2.6302547454833984, "memory(GiB)": 77.56, "step": 24270, "token_acc": 0.44947735191637633, "train_speed(iter/s)": 1.443966 }, { "epoch": 1.040015423503706, "grad_norm": 4.055055141448975, "learning_rate": 8.969922536175654e-05, "loss": 2.6908987045288084, "memory(GiB)": 77.56, "step": 24275, "token_acc": 0.5131578947368421, "train_speed(iter/s)": 1.443992 }, { "epoch": 1.0402296388329548, "grad_norm": 4.85711145401001, "learning_rate": 8.969513371746116e-05, "loss": 2.621944808959961, "memory(GiB)": 77.56, "step": 24280, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 1.444023 }, { "epoch": 1.0404438541622039, "grad_norm": 4.508506774902344, "learning_rate": 8.96910413540512e-05, "loss": 2.5978229522705076, "memory(GiB)": 77.56, "step": 24285, "token_acc": 0.453416149068323, "train_speed(iter/s)": 1.44407 }, { "epoch": 1.040658069491453, "grad_norm": 5.204745769500732, "learning_rate": 8.968694827160078e-05, "loss": 2.6731306076049806, "memory(GiB)": 77.56, "step": 24290, "token_acc": 0.46226415094339623, "train_speed(iter/s)": 1.444053 }, { "epoch": 1.0408722848207017, "grad_norm": 4.502432823181152, "learning_rate": 8.968285447018406e-05, "loss": 2.2701461791992186, "memory(GiB)": 77.56, "step": 24295, "token_acc": 0.4826254826254826, "train_speed(iter/s)": 1.444084 }, { "epoch": 1.0410865001499507, "grad_norm": 5.828651428222656, "learning_rate": 8.967875994987521e-05, "loss": 2.8625324249267576, "memory(GiB)": 77.56, "step": 24300, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.444172 }, { "epoch": 1.0413007154791998, "grad_norm": 4.078756809234619, "learning_rate": 8.967466471074841e-05, "loss": 2.425030517578125, "memory(GiB)": 77.56, "step": 24305, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.444239 }, { "epoch": 1.0415149308084486, "grad_norm": 6.113180637359619, "learning_rate": 8.967056875287783e-05, "loss": 2.4928035736083984, "memory(GiB)": 77.56, "step": 24310, "token_acc": 0.4796511627906977, "train_speed(iter/s)": 1.444285 }, { "epoch": 1.0417291461376976, "grad_norm": 5.307697296142578, "learning_rate": 8.966647207633769e-05, "loss": 2.570161056518555, "memory(GiB)": 77.56, "step": 24315, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.444336 }, { "epoch": 1.0419433614669467, "grad_norm": 5.691319942474365, "learning_rate": 8.966237468120218e-05, "loss": 2.6208337783813476, "memory(GiB)": 77.56, "step": 24320, "token_acc": 0.440625, "train_speed(iter/s)": 1.444402 }, { "epoch": 1.0421575767961955, "grad_norm": 12.893251419067383, "learning_rate": 8.965827656754557e-05, "loss": 2.638018035888672, "memory(GiB)": 77.56, "step": 24325, "token_acc": 0.4483870967741935, "train_speed(iter/s)": 1.444417 }, { "epoch": 1.0423717921254445, "grad_norm": 5.798929214477539, "learning_rate": 8.965417773544207e-05, "loss": 2.61107177734375, "memory(GiB)": 77.56, "step": 24330, "token_acc": 0.44974874371859297, "train_speed(iter/s)": 1.444426 }, { "epoch": 1.0425860074546935, "grad_norm": 6.788809776306152, "learning_rate": 8.965007818496593e-05, "loss": 2.549547576904297, "memory(GiB)": 77.56, "step": 24335, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.444387 }, { "epoch": 1.0428002227839424, "grad_norm": 4.759422779083252, "learning_rate": 8.964597791619145e-05, "loss": 2.246558952331543, "memory(GiB)": 77.56, "step": 24340, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 1.444361 }, { "epoch": 1.0430144381131914, "grad_norm": 7.374278545379639, "learning_rate": 8.964187692919288e-05, "loss": 2.699236297607422, "memory(GiB)": 77.56, "step": 24345, "token_acc": 0.4305555555555556, "train_speed(iter/s)": 1.444368 }, { "epoch": 1.0432286534424404, "grad_norm": 5.6786885261535645, "learning_rate": 8.963777522404451e-05, "loss": 2.6421829223632813, "memory(GiB)": 77.56, "step": 24350, "token_acc": 0.4651898734177215, "train_speed(iter/s)": 1.44443 }, { "epoch": 1.0434428687716892, "grad_norm": 4.032026290893555, "learning_rate": 8.963367280082067e-05, "loss": 2.542629623413086, "memory(GiB)": 77.56, "step": 24355, "token_acc": 0.4820846905537459, "train_speed(iter/s)": 1.444406 }, { "epoch": 1.0436570841009383, "grad_norm": 4.686112403869629, "learning_rate": 8.962956965959568e-05, "loss": 2.6689868927001954, "memory(GiB)": 77.56, "step": 24360, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 1.444383 }, { "epoch": 1.0438712994301873, "grad_norm": 4.627802848815918, "learning_rate": 8.962546580044384e-05, "loss": 2.7924173355102537, "memory(GiB)": 77.56, "step": 24365, "token_acc": 0.459375, "train_speed(iter/s)": 1.44439 }, { "epoch": 1.0440855147594361, "grad_norm": 5.952791213989258, "learning_rate": 8.962136122343952e-05, "loss": 2.4013442993164062, "memory(GiB)": 77.56, "step": 24370, "token_acc": 0.48638132295719844, "train_speed(iter/s)": 1.444436 }, { "epoch": 1.0442997300886852, "grad_norm": 6.701351642608643, "learning_rate": 8.961725592865708e-05, "loss": 2.376656341552734, "memory(GiB)": 77.56, "step": 24375, "token_acc": 0.5379061371841155, "train_speed(iter/s)": 1.444398 }, { "epoch": 1.0445139454179342, "grad_norm": 4.986407279968262, "learning_rate": 8.961314991617089e-05, "loss": 2.6158611297607424, "memory(GiB)": 77.56, "step": 24380, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.444423 }, { "epoch": 1.044728160747183, "grad_norm": 5.211883068084717, "learning_rate": 8.960904318605532e-05, "loss": 2.3534502029418944, "memory(GiB)": 77.56, "step": 24385, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.444372 }, { "epoch": 1.044942376076432, "grad_norm": 4.04542350769043, "learning_rate": 8.960493573838477e-05, "loss": 2.2348995208740234, "memory(GiB)": 77.56, "step": 24390, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.444369 }, { "epoch": 1.045156591405681, "grad_norm": 7.3191680908203125, "learning_rate": 8.960082757323367e-05, "loss": 2.8703903198242187, "memory(GiB)": 77.56, "step": 24395, "token_acc": 0.44329896907216493, "train_speed(iter/s)": 1.444309 }, { "epoch": 1.0453708067349299, "grad_norm": 5.509588241577148, "learning_rate": 8.95967186906764e-05, "loss": 2.4106178283691406, "memory(GiB)": 77.56, "step": 24400, "token_acc": 0.5019305019305019, "train_speed(iter/s)": 1.444368 }, { "epoch": 1.045585022064179, "grad_norm": 4.54595422744751, "learning_rate": 8.959260909078746e-05, "loss": 2.410841941833496, "memory(GiB)": 77.56, "step": 24405, "token_acc": 0.47794117647058826, "train_speed(iter/s)": 1.444406 }, { "epoch": 1.045799237393428, "grad_norm": 5.7470245361328125, "learning_rate": 8.958849877364125e-05, "loss": 2.4927711486816406, "memory(GiB)": 77.56, "step": 24410, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 1.444334 }, { "epoch": 1.0460134527226768, "grad_norm": 6.178435325622559, "learning_rate": 8.958438773931226e-05, "loss": 2.7552379608154296, "memory(GiB)": 77.56, "step": 24415, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.444271 }, { "epoch": 1.0462276680519258, "grad_norm": 4.371453285217285, "learning_rate": 8.958027598787495e-05, "loss": 2.5143932342529296, "memory(GiB)": 77.56, "step": 24420, "token_acc": 0.48656716417910445, "train_speed(iter/s)": 1.444289 }, { "epoch": 1.0464418833811748, "grad_norm": 4.511281490325928, "learning_rate": 8.957616351940381e-05, "loss": 2.380167770385742, "memory(GiB)": 77.56, "step": 24425, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.444221 }, { "epoch": 1.0466560987104236, "grad_norm": 6.007720470428467, "learning_rate": 8.957205033397333e-05, "loss": 2.4257295608520506, "memory(GiB)": 77.56, "step": 24430, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.444288 }, { "epoch": 1.0468703140396727, "grad_norm": 5.955282688140869, "learning_rate": 8.956793643165805e-05, "loss": 2.946666145324707, "memory(GiB)": 77.56, "step": 24435, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.444285 }, { "epoch": 1.0470845293689217, "grad_norm": 4.111509323120117, "learning_rate": 8.956382181253248e-05, "loss": 2.391428565979004, "memory(GiB)": 77.56, "step": 24440, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.444325 }, { "epoch": 1.0472987446981705, "grad_norm": 6.420551776885986, "learning_rate": 8.955970647667115e-05, "loss": 2.5946741104125977, "memory(GiB)": 77.56, "step": 24445, "token_acc": 0.45703125, "train_speed(iter/s)": 1.444409 }, { "epoch": 1.0475129600274196, "grad_norm": 5.487607002258301, "learning_rate": 8.955559042414865e-05, "loss": 2.655599021911621, "memory(GiB)": 77.56, "step": 24450, "token_acc": 0.4375, "train_speed(iter/s)": 1.444335 }, { "epoch": 1.0477271753566686, "grad_norm": 4.227954864501953, "learning_rate": 8.955147365503952e-05, "loss": 2.5085376739501952, "memory(GiB)": 77.56, "step": 24455, "token_acc": 0.4944649446494465, "train_speed(iter/s)": 1.444386 }, { "epoch": 1.0479413906859174, "grad_norm": 6.8962531089782715, "learning_rate": 8.954735616941834e-05, "loss": 2.7214939117431642, "memory(GiB)": 77.56, "step": 24460, "token_acc": 0.4671814671814672, "train_speed(iter/s)": 1.444304 }, { "epoch": 1.0481556060151664, "grad_norm": 5.080885410308838, "learning_rate": 8.954323796735969e-05, "loss": 2.4229759216308593, "memory(GiB)": 77.56, "step": 24465, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.444313 }, { "epoch": 1.0483698213444155, "grad_norm": 5.274581432342529, "learning_rate": 8.953911904893821e-05, "loss": 2.6455875396728517, "memory(GiB)": 77.56, "step": 24470, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.444352 }, { "epoch": 1.0485840366736643, "grad_norm": 4.01211404800415, "learning_rate": 8.953499941422849e-05, "loss": 2.657146453857422, "memory(GiB)": 77.56, "step": 24475, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.444376 }, { "epoch": 1.0487982520029133, "grad_norm": 4.258591651916504, "learning_rate": 8.953087906330514e-05, "loss": 2.4957908630371093, "memory(GiB)": 77.56, "step": 24480, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.444377 }, { "epoch": 1.0490124673321624, "grad_norm": 7.305221080780029, "learning_rate": 8.952675799624288e-05, "loss": 2.6496051788330077, "memory(GiB)": 77.56, "step": 24485, "token_acc": 0.4859437751004016, "train_speed(iter/s)": 1.444378 }, { "epoch": 1.0492266826614112, "grad_norm": 5.413460731506348, "learning_rate": 8.952263621311629e-05, "loss": 2.6764875411987306, "memory(GiB)": 77.56, "step": 24490, "token_acc": 0.46994535519125685, "train_speed(iter/s)": 1.444334 }, { "epoch": 1.0494408979906602, "grad_norm": 4.029872417449951, "learning_rate": 8.951851371400008e-05, "loss": 2.7449920654296873, "memory(GiB)": 77.56, "step": 24495, "token_acc": 0.4066265060240964, "train_speed(iter/s)": 1.444372 }, { "epoch": 1.0496551133199092, "grad_norm": 5.372546672821045, "learning_rate": 8.951439049896892e-05, "loss": 2.267522430419922, "memory(GiB)": 77.56, "step": 24500, "token_acc": 0.525691699604743, "train_speed(iter/s)": 1.444442 }, { "epoch": 1.0496551133199092, "eval_loss": 2.305422306060791, "eval_runtime": 14.6397, "eval_samples_per_second": 6.831, "eval_steps_per_second": 6.831, "eval_token_acc": 0.45632333767926986, "step": 24500 }, { "epoch": 1.049869328649158, "grad_norm": 4.398870468139648, "learning_rate": 8.951026656809753e-05, "loss": 2.537317657470703, "memory(GiB)": 77.56, "step": 24505, "token_acc": 0.4588785046728972, "train_speed(iter/s)": 1.443118 }, { "epoch": 1.050083543978407, "grad_norm": 8.509885787963867, "learning_rate": 8.950614192146058e-05, "loss": 2.567396545410156, "memory(GiB)": 77.56, "step": 24510, "token_acc": 0.4341736694677871, "train_speed(iter/s)": 1.443114 }, { "epoch": 1.0502977593076561, "grad_norm": 4.17772912979126, "learning_rate": 8.950201655913279e-05, "loss": 2.3661081314086916, "memory(GiB)": 77.56, "step": 24515, "token_acc": 0.515625, "train_speed(iter/s)": 1.443044 }, { "epoch": 1.050511974636905, "grad_norm": 5.490793704986572, "learning_rate": 8.949789048118894e-05, "loss": 2.191828155517578, "memory(GiB)": 77.56, "step": 24520, "token_acc": 0.5168539325842697, "train_speed(iter/s)": 1.443072 }, { "epoch": 1.050726189966154, "grad_norm": 4.027836799621582, "learning_rate": 8.949376368770375e-05, "loss": 2.703683090209961, "memory(GiB)": 77.56, "step": 24525, "token_acc": 0.43214285714285716, "train_speed(iter/s)": 1.443033 }, { "epoch": 1.050940405295403, "grad_norm": 4.876866817474365, "learning_rate": 8.948963617875197e-05, "loss": 2.6705379486083984, "memory(GiB)": 77.56, "step": 24530, "token_acc": 0.4027027027027027, "train_speed(iter/s)": 1.442934 }, { "epoch": 1.0511546206246518, "grad_norm": 5.685788154602051, "learning_rate": 8.94855079544084e-05, "loss": 2.6842384338378906, "memory(GiB)": 77.56, "step": 24535, "token_acc": 0.4358974358974359, "train_speed(iter/s)": 1.442982 }, { "epoch": 1.0513688359539008, "grad_norm": 4.552136421203613, "learning_rate": 8.94813790147478e-05, "loss": 2.4423492431640623, "memory(GiB)": 77.56, "step": 24540, "token_acc": 0.4618181818181818, "train_speed(iter/s)": 1.442892 }, { "epoch": 1.0515830512831499, "grad_norm": 4.318337440490723, "learning_rate": 8.947724935984499e-05, "loss": 2.3700738906860352, "memory(GiB)": 77.56, "step": 24545, "token_acc": 0.5468164794007491, "train_speed(iter/s)": 1.442868 }, { "epoch": 1.0517972666123987, "grad_norm": 3.6966564655303955, "learning_rate": 8.947311898977477e-05, "loss": 2.5516988754272463, "memory(GiB)": 77.56, "step": 24550, "token_acc": 0.4629080118694362, "train_speed(iter/s)": 1.442871 }, { "epoch": 1.0520114819416477, "grad_norm": 9.808146476745605, "learning_rate": 8.946898790461197e-05, "loss": 2.388142395019531, "memory(GiB)": 77.56, "step": 24555, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 1.442914 }, { "epoch": 1.0522256972708968, "grad_norm": 4.863648414611816, "learning_rate": 8.946485610443144e-05, "loss": 2.265351676940918, "memory(GiB)": 77.56, "step": 24560, "token_acc": 0.5105740181268882, "train_speed(iter/s)": 1.442929 }, { "epoch": 1.0524399126001456, "grad_norm": 8.26724624633789, "learning_rate": 8.946072358930802e-05, "loss": 2.5950172424316404, "memory(GiB)": 77.56, "step": 24565, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 1.442957 }, { "epoch": 1.0526541279293946, "grad_norm": 5.3287506103515625, "learning_rate": 8.945659035931656e-05, "loss": 2.8080123901367187, "memory(GiB)": 77.56, "step": 24570, "token_acc": 0.41007194244604317, "train_speed(iter/s)": 1.442987 }, { "epoch": 1.0528683432586436, "grad_norm": 4.792881965637207, "learning_rate": 8.945245641453197e-05, "loss": 2.601020622253418, "memory(GiB)": 77.56, "step": 24575, "token_acc": 0.4560810810810811, "train_speed(iter/s)": 1.442946 }, { "epoch": 1.0530825585878925, "grad_norm": 3.9182608127593994, "learning_rate": 8.94483217550291e-05, "loss": 2.4217052459716797, "memory(GiB)": 77.56, "step": 24580, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.442966 }, { "epoch": 1.0532967739171415, "grad_norm": 4.0062642097473145, "learning_rate": 8.94441863808829e-05, "loss": 2.3591335296630858, "memory(GiB)": 77.56, "step": 24585, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.442958 }, { "epoch": 1.0535109892463905, "grad_norm": 4.292083263397217, "learning_rate": 8.944005029216824e-05, "loss": 2.4691034317016602, "memory(GiB)": 77.56, "step": 24590, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.442978 }, { "epoch": 1.0537252045756393, "grad_norm": 6.970109939575195, "learning_rate": 8.94359134889601e-05, "loss": 2.562367057800293, "memory(GiB)": 77.56, "step": 24595, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.442999 }, { "epoch": 1.0539394199048884, "grad_norm": 4.795201778411865, "learning_rate": 8.943177597133336e-05, "loss": 2.4861894607543946, "memory(GiB)": 77.56, "step": 24600, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.443044 }, { "epoch": 1.0541536352341374, "grad_norm": 5.581934452056885, "learning_rate": 8.942763773936304e-05, "loss": 2.726054382324219, "memory(GiB)": 77.56, "step": 24605, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.443053 }, { "epoch": 1.0543678505633862, "grad_norm": 5.570984363555908, "learning_rate": 8.942349879312406e-05, "loss": 2.555591011047363, "memory(GiB)": 77.56, "step": 24610, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.443051 }, { "epoch": 1.0545820658926353, "grad_norm": 14.814002990722656, "learning_rate": 8.941935913269142e-05, "loss": 2.2190454483032225, "memory(GiB)": 77.56, "step": 24615, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.44301 }, { "epoch": 1.0547962812218843, "grad_norm": 5.424521446228027, "learning_rate": 8.94152187581401e-05, "loss": 2.433231163024902, "memory(GiB)": 77.56, "step": 24620, "token_acc": 0.45980707395498394, "train_speed(iter/s)": 1.442938 }, { "epoch": 1.055010496551133, "grad_norm": 4.3254899978637695, "learning_rate": 8.941107766954513e-05, "loss": 2.5708267211914064, "memory(GiB)": 77.56, "step": 24625, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.442888 }, { "epoch": 1.0552247118803821, "grad_norm": 4.32937479019165, "learning_rate": 8.940693586698152e-05, "loss": 2.3701637268066404, "memory(GiB)": 77.56, "step": 24630, "token_acc": 0.5152838427947598, "train_speed(iter/s)": 1.442936 }, { "epoch": 1.0554389272096312, "grad_norm": 6.3104448318481445, "learning_rate": 8.940279335052428e-05, "loss": 2.2762271881103517, "memory(GiB)": 77.56, "step": 24635, "token_acc": 0.5366795366795367, "train_speed(iter/s)": 1.442905 }, { "epoch": 1.05565314253888, "grad_norm": 6.276554107666016, "learning_rate": 8.939865012024849e-05, "loss": 2.366049575805664, "memory(GiB)": 77.56, "step": 24640, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.442926 }, { "epoch": 1.055867357868129, "grad_norm": 5.921716690063477, "learning_rate": 8.939450617622919e-05, "loss": 2.9845218658447266, "memory(GiB)": 77.56, "step": 24645, "token_acc": 0.4523076923076923, "train_speed(iter/s)": 1.44301 }, { "epoch": 1.056081573197378, "grad_norm": 5.03089714050293, "learning_rate": 8.939036151854147e-05, "loss": 2.241654968261719, "memory(GiB)": 77.56, "step": 24650, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.442991 }, { "epoch": 1.0562957885266269, "grad_norm": 5.07546854019165, "learning_rate": 8.93862161472604e-05, "loss": 2.3906084060668946, "memory(GiB)": 77.56, "step": 24655, "token_acc": 0.5, "train_speed(iter/s)": 1.442992 }, { "epoch": 1.056510003855876, "grad_norm": 4.440848350524902, "learning_rate": 8.938207006246106e-05, "loss": 2.414600944519043, "memory(GiB)": 77.56, "step": 24660, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.442964 }, { "epoch": 1.056724219185125, "grad_norm": 4.9508056640625, "learning_rate": 8.937792326421859e-05, "loss": 2.9339134216308596, "memory(GiB)": 77.56, "step": 24665, "token_acc": 0.42356687898089174, "train_speed(iter/s)": 1.442945 }, { "epoch": 1.0569384345143737, "grad_norm": 5.229337692260742, "learning_rate": 8.937377575260809e-05, "loss": 2.421574592590332, "memory(GiB)": 77.56, "step": 24670, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.442975 }, { "epoch": 1.0571526498436228, "grad_norm": 4.360829830169678, "learning_rate": 8.93696275277047e-05, "loss": 2.466918182373047, "memory(GiB)": 77.56, "step": 24675, "token_acc": 0.4425770308123249, "train_speed(iter/s)": 1.442959 }, { "epoch": 1.0573668651728718, "grad_norm": 3.5781688690185547, "learning_rate": 8.93654785895836e-05, "loss": 2.5756629943847655, "memory(GiB)": 77.56, "step": 24680, "token_acc": 0.456973293768546, "train_speed(iter/s)": 1.443027 }, { "epoch": 1.0575810805021206, "grad_norm": 4.3807902336120605, "learning_rate": 8.936132893831992e-05, "loss": 2.5636844635009766, "memory(GiB)": 77.56, "step": 24685, "token_acc": 0.44072948328267475, "train_speed(iter/s)": 1.443009 }, { "epoch": 1.0577952958313697, "grad_norm": 3.8716936111450195, "learning_rate": 8.935717857398883e-05, "loss": 2.277414321899414, "memory(GiB)": 77.56, "step": 24690, "token_acc": 0.5284280936454849, "train_speed(iter/s)": 1.443041 }, { "epoch": 1.0580095111606187, "grad_norm": 5.007645130157471, "learning_rate": 8.935302749666554e-05, "loss": 2.5836578369140626, "memory(GiB)": 77.56, "step": 24695, "token_acc": 0.4591194968553459, "train_speed(iter/s)": 1.443027 }, { "epoch": 1.0582237264898675, "grad_norm": 3.8855621814727783, "learning_rate": 8.934887570642523e-05, "loss": 2.437145233154297, "memory(GiB)": 77.56, "step": 24700, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.443057 }, { "epoch": 1.0584379418191165, "grad_norm": 3.954162836074829, "learning_rate": 8.934472320334312e-05, "loss": 2.5538406372070312, "memory(GiB)": 77.56, "step": 24705, "token_acc": 0.4541984732824427, "train_speed(iter/s)": 1.443058 }, { "epoch": 1.0586521571483656, "grad_norm": 5.068463325500488, "learning_rate": 8.934056998749447e-05, "loss": 2.665746307373047, "memory(GiB)": 77.56, "step": 24710, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.443003 }, { "epoch": 1.0588663724776144, "grad_norm": 4.683329105377197, "learning_rate": 8.933641605895447e-05, "loss": 2.548792266845703, "memory(GiB)": 77.56, "step": 24715, "token_acc": 0.4307228915662651, "train_speed(iter/s)": 1.443016 }, { "epoch": 1.0590805878068634, "grad_norm": 5.163784503936768, "learning_rate": 8.93322614177984e-05, "loss": 2.2101043701171874, "memory(GiB)": 77.56, "step": 24720, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.443062 }, { "epoch": 1.0592948031361125, "grad_norm": 5.771228313446045, "learning_rate": 8.932810606410151e-05, "loss": 2.5359218597412108, "memory(GiB)": 77.56, "step": 24725, "token_acc": 0.46099290780141844, "train_speed(iter/s)": 1.443003 }, { "epoch": 1.0595090184653613, "grad_norm": 5.5019097328186035, "learning_rate": 8.932394999793909e-05, "loss": 2.2854108810424805, "memory(GiB)": 77.56, "step": 24730, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.443001 }, { "epoch": 1.0597232337946103, "grad_norm": 4.975171089172363, "learning_rate": 8.931979321938643e-05, "loss": 2.670800971984863, "memory(GiB)": 77.56, "step": 24735, "token_acc": 0.43260188087774293, "train_speed(iter/s)": 1.443022 }, { "epoch": 1.0599374491238593, "grad_norm": 5.276051998138428, "learning_rate": 8.931563572851883e-05, "loss": 2.4296491622924803, "memory(GiB)": 77.56, "step": 24740, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.443028 }, { "epoch": 1.0601516644531084, "grad_norm": 4.11514949798584, "learning_rate": 8.93114775254116e-05, "loss": 2.6960317611694338, "memory(GiB)": 77.56, "step": 24745, "token_acc": 0.4251497005988024, "train_speed(iter/s)": 1.443002 }, { "epoch": 1.0603658797823572, "grad_norm": 6.036252021789551, "learning_rate": 8.930731861014009e-05, "loss": 2.7282039642333986, "memory(GiB)": 77.56, "step": 24750, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.442955 }, { "epoch": 1.0605800951116062, "grad_norm": 4.531759738922119, "learning_rate": 8.930315898277961e-05, "loss": 2.4699546813964846, "memory(GiB)": 77.56, "step": 24755, "token_acc": 0.44516129032258067, "train_speed(iter/s)": 1.442947 }, { "epoch": 1.0607943104408553, "grad_norm": 3.531190872192383, "learning_rate": 8.929899864340556e-05, "loss": 2.17702579498291, "memory(GiB)": 77.56, "step": 24760, "token_acc": 0.45294117647058824, "train_speed(iter/s)": 1.44294 }, { "epoch": 1.061008525770104, "grad_norm": 4.449081897735596, "learning_rate": 8.929483759209327e-05, "loss": 2.5497406005859373, "memory(GiB)": 77.56, "step": 24765, "token_acc": 0.4649122807017544, "train_speed(iter/s)": 1.442972 }, { "epoch": 1.061222741099353, "grad_norm": 4.878509998321533, "learning_rate": 8.929067582891813e-05, "loss": 2.5043004989624023, "memory(GiB)": 77.56, "step": 24770, "token_acc": 0.4724137931034483, "train_speed(iter/s)": 1.443028 }, { "epoch": 1.0614369564286021, "grad_norm": 5.615623474121094, "learning_rate": 8.928651335395556e-05, "loss": 2.7244335174560548, "memory(GiB)": 77.56, "step": 24775, "token_acc": 0.45294117647058824, "train_speed(iter/s)": 1.44308 }, { "epoch": 1.061651171757851, "grad_norm": 5.27921199798584, "learning_rate": 8.928235016728093e-05, "loss": 2.484927177429199, "memory(GiB)": 77.56, "step": 24780, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.443092 }, { "epoch": 1.0618653870871, "grad_norm": 4.431709289550781, "learning_rate": 8.927818626896968e-05, "loss": 2.606571578979492, "memory(GiB)": 77.56, "step": 24785, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.443143 }, { "epoch": 1.062079602416349, "grad_norm": 4.658729553222656, "learning_rate": 8.927402165909724e-05, "loss": 2.493944549560547, "memory(GiB)": 77.56, "step": 24790, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.443176 }, { "epoch": 1.0622938177455978, "grad_norm": 4.7193074226379395, "learning_rate": 8.926985633773906e-05, "loss": 2.551185417175293, "memory(GiB)": 77.56, "step": 24795, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.443219 }, { "epoch": 1.0625080330748469, "grad_norm": 4.909816265106201, "learning_rate": 8.92656903049706e-05, "loss": 2.3404747009277345, "memory(GiB)": 77.56, "step": 24800, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.44317 }, { "epoch": 1.062722248404096, "grad_norm": 6.403280258178711, "learning_rate": 8.92615235608673e-05, "loss": 2.562880516052246, "memory(GiB)": 77.56, "step": 24805, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.443188 }, { "epoch": 1.0629364637333447, "grad_norm": 4.773261070251465, "learning_rate": 8.925735610550469e-05, "loss": 2.6108932495117188, "memory(GiB)": 77.56, "step": 24810, "token_acc": 0.4860335195530726, "train_speed(iter/s)": 1.44323 }, { "epoch": 1.0631506790625938, "grad_norm": 8.39279556274414, "learning_rate": 8.925318793895825e-05, "loss": 2.3992229461669923, "memory(GiB)": 77.56, "step": 24815, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.443239 }, { "epoch": 1.0633648943918428, "grad_norm": 6.312216281890869, "learning_rate": 8.924901906130348e-05, "loss": 2.4384105682373045, "memory(GiB)": 77.56, "step": 24820, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.443176 }, { "epoch": 1.0635791097210916, "grad_norm": 5.021091461181641, "learning_rate": 8.924484947261592e-05, "loss": 2.6059885025024414, "memory(GiB)": 77.56, "step": 24825, "token_acc": 0.445141065830721, "train_speed(iter/s)": 1.443204 }, { "epoch": 1.0637933250503406, "grad_norm": 4.153078079223633, "learning_rate": 8.92406791729711e-05, "loss": 2.1656402587890624, "memory(GiB)": 77.56, "step": 24830, "token_acc": 0.4952978056426332, "train_speed(iter/s)": 1.443096 }, { "epoch": 1.0640075403795897, "grad_norm": 5.095820426940918, "learning_rate": 8.923650816244455e-05, "loss": 2.668680763244629, "memory(GiB)": 77.56, "step": 24835, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.443129 }, { "epoch": 1.0642217557088385, "grad_norm": 4.86871337890625, "learning_rate": 8.923233644111187e-05, "loss": 2.8476835250854493, "memory(GiB)": 77.56, "step": 24840, "token_acc": 0.44554455445544555, "train_speed(iter/s)": 1.443135 }, { "epoch": 1.0644359710380875, "grad_norm": 6.836292743682861, "learning_rate": 8.922816400904859e-05, "loss": 2.5800466537475586, "memory(GiB)": 77.56, "step": 24845, "token_acc": 0.46441947565543074, "train_speed(iter/s)": 1.443094 }, { "epoch": 1.0646501863673365, "grad_norm": 5.393304824829102, "learning_rate": 8.922399086633032e-05, "loss": 2.926360321044922, "memory(GiB)": 77.56, "step": 24850, "token_acc": 0.4045307443365696, "train_speed(iter/s)": 1.443084 }, { "epoch": 1.0648644016965854, "grad_norm": 5.908994197845459, "learning_rate": 8.921981701303267e-05, "loss": 2.4802446365356445, "memory(GiB)": 77.56, "step": 24855, "token_acc": 0.46311475409836067, "train_speed(iter/s)": 1.443036 }, { "epoch": 1.0650786170258344, "grad_norm": 3.7969794273376465, "learning_rate": 8.921564244923124e-05, "loss": 2.3489316940307616, "memory(GiB)": 77.56, "step": 24860, "token_acc": 0.5183823529411765, "train_speed(iter/s)": 1.442975 }, { "epoch": 1.0652928323550834, "grad_norm": 4.436620712280273, "learning_rate": 8.921146717500166e-05, "loss": 2.749288558959961, "memory(GiB)": 77.56, "step": 24865, "token_acc": 0.4529616724738676, "train_speed(iter/s)": 1.442914 }, { "epoch": 1.0655070476843322, "grad_norm": 4.570539474487305, "learning_rate": 8.920729119041957e-05, "loss": 2.4371414184570312, "memory(GiB)": 77.56, "step": 24870, "token_acc": 0.4849624060150376, "train_speed(iter/s)": 1.44289 }, { "epoch": 1.0657212630135813, "grad_norm": 4.441818714141846, "learning_rate": 8.920311449556062e-05, "loss": 2.522395896911621, "memory(GiB)": 77.56, "step": 24875, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.442891 }, { "epoch": 1.0659354783428303, "grad_norm": 6.656431674957275, "learning_rate": 8.919893709050049e-05, "loss": 2.601426696777344, "memory(GiB)": 77.56, "step": 24880, "token_acc": 0.4554794520547945, "train_speed(iter/s)": 1.442903 }, { "epoch": 1.0661496936720791, "grad_norm": 7.790807723999023, "learning_rate": 8.919475897531482e-05, "loss": 2.493697929382324, "memory(GiB)": 77.56, "step": 24885, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.442963 }, { "epoch": 1.0663639090013282, "grad_norm": 4.048851490020752, "learning_rate": 8.919058015007934e-05, "loss": 2.5156993865966797, "memory(GiB)": 77.56, "step": 24890, "token_acc": 0.478134110787172, "train_speed(iter/s)": 1.442968 }, { "epoch": 1.0665781243305772, "grad_norm": 6.370020866394043, "learning_rate": 8.918640061486974e-05, "loss": 2.289066505432129, "memory(GiB)": 77.56, "step": 24895, "token_acc": 0.515625, "train_speed(iter/s)": 1.442983 }, { "epoch": 1.066792339659826, "grad_norm": 5.541647911071777, "learning_rate": 8.918222036976172e-05, "loss": 2.518605422973633, "memory(GiB)": 77.56, "step": 24900, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.442936 }, { "epoch": 1.067006554989075, "grad_norm": 6.655233860015869, "learning_rate": 8.917803941483101e-05, "loss": 2.372197914123535, "memory(GiB)": 77.56, "step": 24905, "token_acc": 0.5, "train_speed(iter/s)": 1.442965 }, { "epoch": 1.067220770318324, "grad_norm": 5.027594089508057, "learning_rate": 8.917385775015338e-05, "loss": 2.3086753845214845, "memory(GiB)": 77.56, "step": 24910, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.442902 }, { "epoch": 1.0674349856475729, "grad_norm": 4.518784046173096, "learning_rate": 8.916967537580457e-05, "loss": 2.430476760864258, "memory(GiB)": 77.56, "step": 24915, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.442933 }, { "epoch": 1.067649200976822, "grad_norm": 4.362011909484863, "learning_rate": 8.916549229186036e-05, "loss": 2.2647964477539064, "memory(GiB)": 77.56, "step": 24920, "token_acc": 0.49615384615384617, "train_speed(iter/s)": 1.442951 }, { "epoch": 1.067863416306071, "grad_norm": 6.057953357696533, "learning_rate": 8.916130849839649e-05, "loss": 2.3241344451904298, "memory(GiB)": 77.56, "step": 24925, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.442922 }, { "epoch": 1.0680776316353198, "grad_norm": 4.8255534172058105, "learning_rate": 8.915712399548879e-05, "loss": 2.385122299194336, "memory(GiB)": 77.56, "step": 24930, "token_acc": 0.49038461538461536, "train_speed(iter/s)": 1.442985 }, { "epoch": 1.0682918469645688, "grad_norm": 4.3298869132995605, "learning_rate": 8.915293878321308e-05, "loss": 2.2749584197998045, "memory(GiB)": 77.56, "step": 24935, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.443013 }, { "epoch": 1.0685060622938178, "grad_norm": 5.672133445739746, "learning_rate": 8.914875286164512e-05, "loss": 2.6646970748901366, "memory(GiB)": 77.56, "step": 24940, "token_acc": 0.45686900958466453, "train_speed(iter/s)": 1.443017 }, { "epoch": 1.0687202776230667, "grad_norm": 5.998624801635742, "learning_rate": 8.914456623086078e-05, "loss": 2.453325653076172, "memory(GiB)": 77.56, "step": 24945, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.443088 }, { "epoch": 1.0689344929523157, "grad_norm": 5.41029691696167, "learning_rate": 8.914037889093591e-05, "loss": 2.615672302246094, "memory(GiB)": 77.56, "step": 24950, "token_acc": 0.45625, "train_speed(iter/s)": 1.443145 }, { "epoch": 1.0691487082815647, "grad_norm": 4.725058078765869, "learning_rate": 8.913619084194638e-05, "loss": 2.575868034362793, "memory(GiB)": 77.56, "step": 24955, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 1.443191 }, { "epoch": 1.0693629236108135, "grad_norm": 4.950165748596191, "learning_rate": 8.913200208396801e-05, "loss": 2.7590538024902345, "memory(GiB)": 77.56, "step": 24960, "token_acc": 0.42513368983957217, "train_speed(iter/s)": 1.443046 }, { "epoch": 1.0695771389400626, "grad_norm": 3.6775786876678467, "learning_rate": 8.912781261707674e-05, "loss": 2.4041927337646483, "memory(GiB)": 77.56, "step": 24965, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 1.443077 }, { "epoch": 1.0697913542693116, "grad_norm": 5.825639724731445, "learning_rate": 8.912362244134842e-05, "loss": 2.311003875732422, "memory(GiB)": 77.56, "step": 24970, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.443123 }, { "epoch": 1.0700055695985604, "grad_norm": 5.18943452835083, "learning_rate": 8.911943155685897e-05, "loss": 2.6163257598876952, "memory(GiB)": 77.56, "step": 24975, "token_acc": 0.4575757575757576, "train_speed(iter/s)": 1.44317 }, { "epoch": 1.0702197849278094, "grad_norm": 6.513099670410156, "learning_rate": 8.911523996368434e-05, "loss": 2.6651309967041015, "memory(GiB)": 77.56, "step": 24980, "token_acc": 0.45808383233532934, "train_speed(iter/s)": 1.443177 }, { "epoch": 1.0704340002570585, "grad_norm": 5.062175273895264, "learning_rate": 8.911104766190045e-05, "loss": 2.5427068710327148, "memory(GiB)": 77.56, "step": 24985, "token_acc": 0.4609375, "train_speed(iter/s)": 1.443271 }, { "epoch": 1.0706482155863073, "grad_norm": 4.503647804260254, "learning_rate": 8.910685465158324e-05, "loss": 2.5308393478393554, "memory(GiB)": 77.56, "step": 24990, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.443328 }, { "epoch": 1.0708624309155563, "grad_norm": 4.3264079093933105, "learning_rate": 8.910266093280866e-05, "loss": 2.391660690307617, "memory(GiB)": 77.56, "step": 24995, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.443337 }, { "epoch": 1.0710766462448054, "grad_norm": 5.4959940910339355, "learning_rate": 8.90984665056527e-05, "loss": 2.481093978881836, "memory(GiB)": 77.56, "step": 25000, "token_acc": 0.4744525547445255, "train_speed(iter/s)": 1.443315 }, { "epoch": 1.0710766462448054, "eval_loss": 2.2381017208099365, "eval_runtime": 13.5216, "eval_samples_per_second": 7.396, "eval_steps_per_second": 7.396, "eval_token_acc": 0.49591280653950953, "step": 25000 }, { "epoch": 1.0712908615740542, "grad_norm": 5.898514270782471, "learning_rate": 8.909427137019136e-05, "loss": 2.5043663024902343, "memory(GiB)": 77.56, "step": 25005, "token_acc": 0.48971596474045054, "train_speed(iter/s)": 1.442074 }, { "epoch": 1.0715050769033032, "grad_norm": 4.643527030944824, "learning_rate": 8.90900755265006e-05, "loss": 2.3257822036743163, "memory(GiB)": 77.56, "step": 25010, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.442073 }, { "epoch": 1.0717192922325522, "grad_norm": 4.901183605194092, "learning_rate": 8.908587897465644e-05, "loss": 2.6565052032470704, "memory(GiB)": 77.56, "step": 25015, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.442126 }, { "epoch": 1.071933507561801, "grad_norm": 3.8918936252593994, "learning_rate": 8.908168171473494e-05, "loss": 2.39459228515625, "memory(GiB)": 77.56, "step": 25020, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.442094 }, { "epoch": 1.07214772289105, "grad_norm": 4.803125858306885, "learning_rate": 8.907748374681211e-05, "loss": 2.627385902404785, "memory(GiB)": 77.56, "step": 25025, "token_acc": 0.4204946996466431, "train_speed(iter/s)": 1.442145 }, { "epoch": 1.0723619382202991, "grad_norm": 3.630722761154175, "learning_rate": 8.907328507096399e-05, "loss": 2.4388004302978517, "memory(GiB)": 77.56, "step": 25030, "token_acc": 0.4980392156862745, "train_speed(iter/s)": 1.442229 }, { "epoch": 1.072576153549548, "grad_norm": 4.244997501373291, "learning_rate": 8.906908568726669e-05, "loss": 2.653009605407715, "memory(GiB)": 77.56, "step": 25035, "token_acc": 0.48424068767908307, "train_speed(iter/s)": 1.442197 }, { "epoch": 1.072790368878797, "grad_norm": 5.776773452758789, "learning_rate": 8.906488559579623e-05, "loss": 2.4952442169189455, "memory(GiB)": 77.56, "step": 25040, "token_acc": 0.5057915057915058, "train_speed(iter/s)": 1.442185 }, { "epoch": 1.073004584208046, "grad_norm": 7.8657941818237305, "learning_rate": 8.906068479662871e-05, "loss": 2.5570497512817383, "memory(GiB)": 77.56, "step": 25045, "token_acc": 0.5035211267605634, "train_speed(iter/s)": 1.442214 }, { "epoch": 1.0732187995372948, "grad_norm": 5.0104875564575195, "learning_rate": 8.905648328984026e-05, "loss": 2.180466079711914, "memory(GiB)": 77.56, "step": 25050, "token_acc": 0.5018181818181818, "train_speed(iter/s)": 1.442212 }, { "epoch": 1.0734330148665439, "grad_norm": 5.082650661468506, "learning_rate": 8.905228107550697e-05, "loss": 2.3480173110961915, "memory(GiB)": 77.56, "step": 25055, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.442217 }, { "epoch": 1.073647230195793, "grad_norm": 4.033272743225098, "learning_rate": 8.904807815370499e-05, "loss": 2.507711410522461, "memory(GiB)": 77.56, "step": 25060, "token_acc": 0.43465045592705165, "train_speed(iter/s)": 1.442239 }, { "epoch": 1.0738614455250417, "grad_norm": 3.5127065181732178, "learning_rate": 8.904387452451042e-05, "loss": 2.6224090576171877, "memory(GiB)": 77.56, "step": 25065, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.442214 }, { "epoch": 1.0740756608542907, "grad_norm": 4.23453950881958, "learning_rate": 8.903967018799946e-05, "loss": 2.6901920318603514, "memory(GiB)": 77.56, "step": 25070, "token_acc": 0.4408284023668639, "train_speed(iter/s)": 1.442252 }, { "epoch": 1.0742898761835398, "grad_norm": 3.6714584827423096, "learning_rate": 8.903546514424824e-05, "loss": 2.7261823654174804, "memory(GiB)": 77.56, "step": 25075, "token_acc": 0.44857142857142857, "train_speed(iter/s)": 1.442268 }, { "epoch": 1.0745040915127886, "grad_norm": 3.8848061561584473, "learning_rate": 8.903125939333294e-05, "loss": 2.4167057037353517, "memory(GiB)": 77.56, "step": 25080, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.442259 }, { "epoch": 1.0747183068420376, "grad_norm": 4.971715927124023, "learning_rate": 8.902705293532978e-05, "loss": 2.7110219955444337, "memory(GiB)": 77.56, "step": 25085, "token_acc": 0.46875, "train_speed(iter/s)": 1.442213 }, { "epoch": 1.0749325221712867, "grad_norm": 4.538780689239502, "learning_rate": 8.902284577031494e-05, "loss": 2.5083343505859377, "memory(GiB)": 77.56, "step": 25090, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.44222 }, { "epoch": 1.0751467375005355, "grad_norm": 5.06449556350708, "learning_rate": 8.901863789836464e-05, "loss": 2.4191352844238283, "memory(GiB)": 77.56, "step": 25095, "token_acc": 0.5342960288808665, "train_speed(iter/s)": 1.442257 }, { "epoch": 1.0753609528297845, "grad_norm": 3.889514684677124, "learning_rate": 8.901442931955512e-05, "loss": 2.728623390197754, "memory(GiB)": 77.56, "step": 25100, "token_acc": 0.43103448275862066, "train_speed(iter/s)": 1.442268 }, { "epoch": 1.0755751681590335, "grad_norm": 6.102291584014893, "learning_rate": 8.90102200339626e-05, "loss": 2.3905193328857424, "memory(GiB)": 77.56, "step": 25105, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.442319 }, { "epoch": 1.0757893834882823, "grad_norm": 4.236095428466797, "learning_rate": 8.900601004166335e-05, "loss": 2.690999984741211, "memory(GiB)": 77.56, "step": 25110, "token_acc": 0.45217391304347826, "train_speed(iter/s)": 1.442302 }, { "epoch": 1.0760035988175314, "grad_norm": 4.288787364959717, "learning_rate": 8.900179934273366e-05, "loss": 2.4521663665771483, "memory(GiB)": 77.56, "step": 25115, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.442289 }, { "epoch": 1.0762178141467804, "grad_norm": 11.823760032653809, "learning_rate": 8.899758793724978e-05, "loss": 2.724221420288086, "memory(GiB)": 77.56, "step": 25120, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.442362 }, { "epoch": 1.0764320294760292, "grad_norm": 5.338200092315674, "learning_rate": 8.8993375825288e-05, "loss": 2.414128875732422, "memory(GiB)": 77.56, "step": 25125, "token_acc": 0.4627450980392157, "train_speed(iter/s)": 1.442328 }, { "epoch": 1.0766462448052783, "grad_norm": 4.276544570922852, "learning_rate": 8.898916300692463e-05, "loss": 2.6601654052734376, "memory(GiB)": 77.56, "step": 25130, "token_acc": 0.444, "train_speed(iter/s)": 1.442319 }, { "epoch": 1.0768604601345273, "grad_norm": 4.820889949798584, "learning_rate": 8.898494948223604e-05, "loss": 2.3188581466674805, "memory(GiB)": 77.56, "step": 25135, "token_acc": 0.5155038759689923, "train_speed(iter/s)": 1.442306 }, { "epoch": 1.077074675463776, "grad_norm": 4.7706522941589355, "learning_rate": 8.898073525129848e-05, "loss": 2.429790496826172, "memory(GiB)": 77.56, "step": 25140, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.442319 }, { "epoch": 1.0772888907930251, "grad_norm": 5.803450107574463, "learning_rate": 8.897652031418834e-05, "loss": 2.4875125885009766, "memory(GiB)": 77.56, "step": 25145, "token_acc": 0.4714828897338403, "train_speed(iter/s)": 1.442373 }, { "epoch": 1.0775031061222742, "grad_norm": 4.465826988220215, "learning_rate": 8.8972304670982e-05, "loss": 2.676397514343262, "memory(GiB)": 77.56, "step": 25150, "token_acc": 0.43376623376623374, "train_speed(iter/s)": 1.442419 }, { "epoch": 1.077717321451523, "grad_norm": 4.993764877319336, "learning_rate": 8.896808832175576e-05, "loss": 2.623885726928711, "memory(GiB)": 77.56, "step": 25155, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.442421 }, { "epoch": 1.077931536780772, "grad_norm": 4.131471157073975, "learning_rate": 8.896387126658605e-05, "loss": 2.592173385620117, "memory(GiB)": 77.56, "step": 25160, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.442489 }, { "epoch": 1.078145752110021, "grad_norm": 5.068206787109375, "learning_rate": 8.895965350554929e-05, "loss": 2.6493844985961914, "memory(GiB)": 77.56, "step": 25165, "token_acc": 0.45, "train_speed(iter/s)": 1.442505 }, { "epoch": 1.0783599674392699, "grad_norm": 5.090224266052246, "learning_rate": 8.895543503872183e-05, "loss": 2.3927188873291017, "memory(GiB)": 77.56, "step": 25170, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.442485 }, { "epoch": 1.078574182768519, "grad_norm": 8.322251319885254, "learning_rate": 8.895121586618014e-05, "loss": 2.4706796646118163, "memory(GiB)": 77.56, "step": 25175, "token_acc": 0.5473251028806584, "train_speed(iter/s)": 1.442471 }, { "epoch": 1.078788398097768, "grad_norm": 3.7725141048431396, "learning_rate": 8.894699598800064e-05, "loss": 2.457138442993164, "memory(GiB)": 77.56, "step": 25180, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.442367 }, { "epoch": 1.0790026134270168, "grad_norm": 4.180278778076172, "learning_rate": 8.894277540425977e-05, "loss": 2.7426902770996096, "memory(GiB)": 77.56, "step": 25185, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.442405 }, { "epoch": 1.0792168287562658, "grad_norm": 5.424924850463867, "learning_rate": 8.893855411503398e-05, "loss": 2.2191679000854494, "memory(GiB)": 77.56, "step": 25190, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.442394 }, { "epoch": 1.0794310440855148, "grad_norm": 5.147541046142578, "learning_rate": 8.893433212039974e-05, "loss": 2.6832679748535155, "memory(GiB)": 77.56, "step": 25195, "token_acc": 0.42955326460481097, "train_speed(iter/s)": 1.442419 }, { "epoch": 1.0796452594147636, "grad_norm": 4.844173908233643, "learning_rate": 8.893010942043359e-05, "loss": 2.839406204223633, "memory(GiB)": 77.56, "step": 25200, "token_acc": 0.43506493506493504, "train_speed(iter/s)": 1.442488 }, { "epoch": 1.0798594747440127, "grad_norm": 3.2959585189819336, "learning_rate": 8.892588601521197e-05, "loss": 2.4778619766235352, "memory(GiB)": 77.56, "step": 25205, "token_acc": 0.48520710059171596, "train_speed(iter/s)": 1.442461 }, { "epoch": 1.0800736900732617, "grad_norm": 7.255958080291748, "learning_rate": 8.89216619048114e-05, "loss": 2.7225162506103517, "memory(GiB)": 77.56, "step": 25210, "token_acc": 0.43416370106761565, "train_speed(iter/s)": 1.442471 }, { "epoch": 1.0802879054025105, "grad_norm": 4.925755500793457, "learning_rate": 8.891743708930842e-05, "loss": 2.397993469238281, "memory(GiB)": 77.56, "step": 25215, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.442447 }, { "epoch": 1.0805021207317596, "grad_norm": 4.483622074127197, "learning_rate": 8.891321156877957e-05, "loss": 2.415365982055664, "memory(GiB)": 77.56, "step": 25220, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.442361 }, { "epoch": 1.0807163360610086, "grad_norm": 5.890489101409912, "learning_rate": 8.890898534330136e-05, "loss": 2.3566139221191404, "memory(GiB)": 77.56, "step": 25225, "token_acc": 0.49508196721311476, "train_speed(iter/s)": 1.442366 }, { "epoch": 1.0809305513902574, "grad_norm": 3.569340229034424, "learning_rate": 8.89047584129504e-05, "loss": 2.535544586181641, "memory(GiB)": 77.56, "step": 25230, "token_acc": 0.467680608365019, "train_speed(iter/s)": 1.442323 }, { "epoch": 1.0811447667195064, "grad_norm": 5.1981916427612305, "learning_rate": 8.890053077780325e-05, "loss": 2.487428665161133, "memory(GiB)": 77.56, "step": 25235, "token_acc": 0.4602076124567474, "train_speed(iter/s)": 1.442362 }, { "epoch": 1.0813589820487555, "grad_norm": 6.219172477722168, "learning_rate": 8.88963024379365e-05, "loss": 2.3975128173828124, "memory(GiB)": 77.56, "step": 25240, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.442363 }, { "epoch": 1.0815731973780043, "grad_norm": 3.575761556625366, "learning_rate": 8.889207339342673e-05, "loss": 2.8026546478271483, "memory(GiB)": 77.56, "step": 25245, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.442427 }, { "epoch": 1.0817874127072533, "grad_norm": 5.14598274230957, "learning_rate": 8.888784364435056e-05, "loss": 2.301034927368164, "memory(GiB)": 77.56, "step": 25250, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.442425 }, { "epoch": 1.0820016280365023, "grad_norm": 4.888864994049072, "learning_rate": 8.888361319078464e-05, "loss": 2.527876091003418, "memory(GiB)": 77.56, "step": 25255, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.442499 }, { "epoch": 1.0822158433657512, "grad_norm": 5.543771266937256, "learning_rate": 8.887938203280559e-05, "loss": 2.743634033203125, "memory(GiB)": 77.56, "step": 25260, "token_acc": 0.4362017804154303, "train_speed(iter/s)": 1.442459 }, { "epoch": 1.0824300586950002, "grad_norm": 4.484138488769531, "learning_rate": 8.887515017049006e-05, "loss": 2.4278846740722657, "memory(GiB)": 77.56, "step": 25265, "token_acc": 0.5, "train_speed(iter/s)": 1.442484 }, { "epoch": 1.0826442740242492, "grad_norm": 3.929466485977173, "learning_rate": 8.887091760391471e-05, "loss": 2.311697769165039, "memory(GiB)": 77.56, "step": 25270, "token_acc": 0.5, "train_speed(iter/s)": 1.442436 }, { "epoch": 1.082858489353498, "grad_norm": 5.253222465515137, "learning_rate": 8.886668433315622e-05, "loss": 2.537200164794922, "memory(GiB)": 77.56, "step": 25275, "token_acc": 0.4786885245901639, "train_speed(iter/s)": 1.442435 }, { "epoch": 1.083072704682747, "grad_norm": 5.537881851196289, "learning_rate": 8.88624503582913e-05, "loss": 2.906534194946289, "memory(GiB)": 77.56, "step": 25280, "token_acc": 0.43214285714285716, "train_speed(iter/s)": 1.442448 }, { "epoch": 1.0832869200119961, "grad_norm": 4.92050313949585, "learning_rate": 8.885821567939663e-05, "loss": 2.659988021850586, "memory(GiB)": 77.56, "step": 25285, "token_acc": 0.4188679245283019, "train_speed(iter/s)": 1.442461 }, { "epoch": 1.083501135341245, "grad_norm": 6.284702777862549, "learning_rate": 8.885398029654892e-05, "loss": 2.5327856063842775, "memory(GiB)": 77.56, "step": 25290, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 1.442492 }, { "epoch": 1.083715350670494, "grad_norm": 5.071280479431152, "learning_rate": 8.884974420982493e-05, "loss": 2.2914306640625, "memory(GiB)": 77.56, "step": 25295, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.442449 }, { "epoch": 1.083929565999743, "grad_norm": 4.764655113220215, "learning_rate": 8.884550741930135e-05, "loss": 2.6236648559570312, "memory(GiB)": 77.56, "step": 25300, "token_acc": 0.4551083591331269, "train_speed(iter/s)": 1.442407 }, { "epoch": 1.0841437813289918, "grad_norm": 4.0674567222595215, "learning_rate": 8.884126992505498e-05, "loss": 2.7162118911743165, "memory(GiB)": 77.56, "step": 25305, "token_acc": 0.4489795918367347, "train_speed(iter/s)": 1.442393 }, { "epoch": 1.0843579966582408, "grad_norm": 4.113844394683838, "learning_rate": 8.883703172716258e-05, "loss": 2.3346494674682616, "memory(GiB)": 77.56, "step": 25310, "token_acc": 0.5016181229773463, "train_speed(iter/s)": 1.442398 }, { "epoch": 1.0845722119874899, "grad_norm": 5.086583137512207, "learning_rate": 8.883279282570089e-05, "loss": 2.295521545410156, "memory(GiB)": 77.56, "step": 25315, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 1.442369 }, { "epoch": 1.0847864273167387, "grad_norm": 5.205321311950684, "learning_rate": 8.882855322074674e-05, "loss": 2.199264907836914, "memory(GiB)": 77.56, "step": 25320, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.442359 }, { "epoch": 1.0850006426459877, "grad_norm": 5.412535667419434, "learning_rate": 8.882431291237693e-05, "loss": 2.65649356842041, "memory(GiB)": 77.56, "step": 25325, "token_acc": 0.45514950166112955, "train_speed(iter/s)": 1.44243 }, { "epoch": 1.0852148579752368, "grad_norm": 4.51121711730957, "learning_rate": 8.882007190066827e-05, "loss": 2.490675354003906, "memory(GiB)": 77.56, "step": 25330, "token_acc": 0.5096525096525096, "train_speed(iter/s)": 1.442491 }, { "epoch": 1.0854290733044856, "grad_norm": 8.112110137939453, "learning_rate": 8.881583018569761e-05, "loss": 2.6366546630859373, "memory(GiB)": 77.56, "step": 25335, "token_acc": 0.4607142857142857, "train_speed(iter/s)": 1.442421 }, { "epoch": 1.0856432886337346, "grad_norm": 4.496501445770264, "learning_rate": 8.881158776754175e-05, "loss": 2.4299571990966795, "memory(GiB)": 77.56, "step": 25340, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.44244 }, { "epoch": 1.0858575039629836, "grad_norm": 7.006977081298828, "learning_rate": 8.880734464627757e-05, "loss": 2.751093864440918, "memory(GiB)": 77.56, "step": 25345, "token_acc": 0.40828402366863903, "train_speed(iter/s)": 1.442403 }, { "epoch": 1.0860717192922325, "grad_norm": 6.189694404602051, "learning_rate": 8.880310082198196e-05, "loss": 2.4706138610839843, "memory(GiB)": 77.56, "step": 25350, "token_acc": 0.47410358565737054, "train_speed(iter/s)": 1.442373 }, { "epoch": 1.0862859346214815, "grad_norm": 4.348604202270508, "learning_rate": 8.879885629473176e-05, "loss": 2.542814826965332, "memory(GiB)": 77.56, "step": 25355, "token_acc": 0.49812734082397003, "train_speed(iter/s)": 1.442376 }, { "epoch": 1.0865001499507305, "grad_norm": 4.553647994995117, "learning_rate": 8.87946110646039e-05, "loss": 2.4307376861572267, "memory(GiB)": 77.56, "step": 25360, "token_acc": 0.47474747474747475, "train_speed(iter/s)": 1.442389 }, { "epoch": 1.0867143652799793, "grad_norm": 4.445259094238281, "learning_rate": 8.879036513167523e-05, "loss": 2.8615108489990235, "memory(GiB)": 77.56, "step": 25365, "token_acc": 0.40782122905027934, "train_speed(iter/s)": 1.442417 }, { "epoch": 1.0869285806092284, "grad_norm": 5.165308475494385, "learning_rate": 8.878611849602274e-05, "loss": 2.4291994094848635, "memory(GiB)": 77.56, "step": 25370, "token_acc": 0.4962121212121212, "train_speed(iter/s)": 1.442387 }, { "epoch": 1.0871427959384774, "grad_norm": 3.711165428161621, "learning_rate": 8.878187115772331e-05, "loss": 2.595941925048828, "memory(GiB)": 77.56, "step": 25375, "token_acc": 0.5058479532163743, "train_speed(iter/s)": 1.442363 }, { "epoch": 1.0873570112677262, "grad_norm": 4.4258832931518555, "learning_rate": 8.877762311685391e-05, "loss": 2.642012023925781, "memory(GiB)": 77.56, "step": 25380, "token_acc": 0.47962382445141066, "train_speed(iter/s)": 1.442347 }, { "epoch": 1.0875712265969752, "grad_norm": 5.371784687042236, "learning_rate": 8.87733743734915e-05, "loss": 2.484430694580078, "memory(GiB)": 77.56, "step": 25385, "token_acc": 0.47633136094674555, "train_speed(iter/s)": 1.442328 }, { "epoch": 1.0877854419262243, "grad_norm": 4.51119327545166, "learning_rate": 8.876912492771303e-05, "loss": 2.2747976303100588, "memory(GiB)": 77.56, "step": 25390, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.442323 }, { "epoch": 1.087999657255473, "grad_norm": 5.508774757385254, "learning_rate": 8.876487477959551e-05, "loss": 2.6571575164794923, "memory(GiB)": 77.56, "step": 25395, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.442392 }, { "epoch": 1.0882138725847221, "grad_norm": 3.836334228515625, "learning_rate": 8.876062392921591e-05, "loss": 2.3518226623535154, "memory(GiB)": 77.56, "step": 25400, "token_acc": 0.5230125523012552, "train_speed(iter/s)": 1.442411 }, { "epoch": 1.0884280879139712, "grad_norm": 4.80452299118042, "learning_rate": 8.875637237665124e-05, "loss": 2.603152847290039, "memory(GiB)": 77.56, "step": 25405, "token_acc": 0.4703703703703704, "train_speed(iter/s)": 1.442444 }, { "epoch": 1.08864230324322, "grad_norm": 3.5263288021087646, "learning_rate": 8.875212012197853e-05, "loss": 2.3496030807495116, "memory(GiB)": 77.56, "step": 25410, "token_acc": 0.49216300940438873, "train_speed(iter/s)": 1.44248 }, { "epoch": 1.088856518572469, "grad_norm": 5.528037071228027, "learning_rate": 8.874786716527482e-05, "loss": 2.4018138885498046, "memory(GiB)": 77.56, "step": 25415, "token_acc": 0.519434628975265, "train_speed(iter/s)": 1.442525 }, { "epoch": 1.089070733901718, "grad_norm": 4.728265285491943, "learning_rate": 8.874361350661713e-05, "loss": 2.4568099975585938, "memory(GiB)": 77.56, "step": 25420, "token_acc": 0.5073529411764706, "train_speed(iter/s)": 1.442467 }, { "epoch": 1.0892849492309669, "grad_norm": 4.832590579986572, "learning_rate": 8.873935914608256e-05, "loss": 2.357774353027344, "memory(GiB)": 77.56, "step": 25425, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.442495 }, { "epoch": 1.089499164560216, "grad_norm": 7.3577094078063965, "learning_rate": 8.873510408374814e-05, "loss": 2.623159408569336, "memory(GiB)": 77.56, "step": 25430, "token_acc": 0.45075757575757575, "train_speed(iter/s)": 1.442556 }, { "epoch": 1.089713379889465, "grad_norm": 4.762428283691406, "learning_rate": 8.873084831969098e-05, "loss": 2.5407928466796874, "memory(GiB)": 77.56, "step": 25435, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.442537 }, { "epoch": 1.0899275952187137, "grad_norm": 4.607973098754883, "learning_rate": 8.872659185398817e-05, "loss": 2.6841594696044924, "memory(GiB)": 77.56, "step": 25440, "token_acc": 0.43653250773993807, "train_speed(iter/s)": 1.442545 }, { "epoch": 1.0901418105479628, "grad_norm": 8.858938217163086, "learning_rate": 8.872233468671683e-05, "loss": 2.210304832458496, "memory(GiB)": 77.56, "step": 25445, "token_acc": 0.5462555066079295, "train_speed(iter/s)": 1.442606 }, { "epoch": 1.0903560258772118, "grad_norm": 4.562241077423096, "learning_rate": 8.871807681795406e-05, "loss": 2.403728485107422, "memory(GiB)": 77.56, "step": 25450, "token_acc": 0.4731182795698925, "train_speed(iter/s)": 1.442646 }, { "epoch": 1.0905702412064606, "grad_norm": 4.31027889251709, "learning_rate": 8.8713818247777e-05, "loss": 2.3220489501953123, "memory(GiB)": 77.56, "step": 25455, "token_acc": 0.5218978102189781, "train_speed(iter/s)": 1.442695 }, { "epoch": 1.0907844565357097, "grad_norm": 5.132511138916016, "learning_rate": 8.870955897626283e-05, "loss": 2.5519908905029296, "memory(GiB)": 77.56, "step": 25460, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.442773 }, { "epoch": 1.0909986718649587, "grad_norm": 4.796254634857178, "learning_rate": 8.870529900348867e-05, "loss": 2.466431427001953, "memory(GiB)": 77.56, "step": 25465, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 1.442713 }, { "epoch": 1.0912128871942075, "grad_norm": 4.18263053894043, "learning_rate": 8.870103832953172e-05, "loss": 2.6662452697753904, "memory(GiB)": 77.56, "step": 25470, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.442715 }, { "epoch": 1.0914271025234565, "grad_norm": 4.283825397491455, "learning_rate": 8.869677695446914e-05, "loss": 2.7063230514526366, "memory(GiB)": 77.56, "step": 25475, "token_acc": 0.43462897526501765, "train_speed(iter/s)": 1.442695 }, { "epoch": 1.0916413178527056, "grad_norm": 5.661508560180664, "learning_rate": 8.869251487837816e-05, "loss": 2.4352169036865234, "memory(GiB)": 77.56, "step": 25480, "token_acc": 0.4536741214057508, "train_speed(iter/s)": 1.442712 }, { "epoch": 1.0918555331819544, "grad_norm": 4.870262622833252, "learning_rate": 8.868825210133598e-05, "loss": 2.5312122344970702, "memory(GiB)": 77.56, "step": 25485, "token_acc": 0.48253968253968255, "train_speed(iter/s)": 1.442782 }, { "epoch": 1.0920697485112034, "grad_norm": 5.331650257110596, "learning_rate": 8.86839886234198e-05, "loss": 2.5506799697875975, "memory(GiB)": 77.56, "step": 25490, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.442802 }, { "epoch": 1.0922839638404525, "grad_norm": 4.522636890411377, "learning_rate": 8.86797244447069e-05, "loss": 2.5758472442626954, "memory(GiB)": 77.56, "step": 25495, "token_acc": 0.4519230769230769, "train_speed(iter/s)": 1.442817 }, { "epoch": 1.0924981791697013, "grad_norm": 6.4138383865356445, "learning_rate": 8.86754595652745e-05, "loss": 2.6582805633544924, "memory(GiB)": 77.56, "step": 25500, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.442841 }, { "epoch": 1.0924981791697013, "eval_loss": 2.074680805206299, "eval_runtime": 14.4656, "eval_samples_per_second": 6.913, "eval_steps_per_second": 6.913, "eval_token_acc": 0.5049365303244006, "step": 25500 }, { "epoch": 1.0927123944989503, "grad_norm": 5.1362504959106445, "learning_rate": 8.867119398519986e-05, "loss": 2.3715230941772463, "memory(GiB)": 77.56, "step": 25505, "token_acc": 0.5031185031185031, "train_speed(iter/s)": 1.441601 }, { "epoch": 1.0929266098281993, "grad_norm": 6.287789344787598, "learning_rate": 8.866692770456026e-05, "loss": 2.3799259185791017, "memory(GiB)": 77.56, "step": 25510, "token_acc": 0.5, "train_speed(iter/s)": 1.441647 }, { "epoch": 1.0931408251574484, "grad_norm": 8.176146507263184, "learning_rate": 8.866266072343301e-05, "loss": 2.3360300064086914, "memory(GiB)": 77.56, "step": 25515, "token_acc": 0.48412698412698413, "train_speed(iter/s)": 1.441682 }, { "epoch": 1.0933550404866972, "grad_norm": 5.211651802062988, "learning_rate": 8.865839304189538e-05, "loss": 2.5676237106323243, "memory(GiB)": 77.56, "step": 25520, "token_acc": 0.4721189591078067, "train_speed(iter/s)": 1.441695 }, { "epoch": 1.0935692558159462, "grad_norm": 5.002561092376709, "learning_rate": 8.865412466002472e-05, "loss": 2.760858917236328, "memory(GiB)": 77.56, "step": 25525, "token_acc": 0.43730886850152906, "train_speed(iter/s)": 1.441679 }, { "epoch": 1.0937834711451953, "grad_norm": 4.139132499694824, "learning_rate": 8.86498555778983e-05, "loss": 2.6972469329833983, "memory(GiB)": 77.56, "step": 25530, "token_acc": 0.42724458204334365, "train_speed(iter/s)": 1.441691 }, { "epoch": 1.093997686474444, "grad_norm": 6.844869613647461, "learning_rate": 8.86455857955935e-05, "loss": 2.4446144104003906, "memory(GiB)": 77.56, "step": 25535, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.44164 }, { "epoch": 1.094211901803693, "grad_norm": 4.971738815307617, "learning_rate": 8.864131531318766e-05, "loss": 2.5308584213256835, "memory(GiB)": 77.56, "step": 25540, "token_acc": 0.41638225255972694, "train_speed(iter/s)": 1.441673 }, { "epoch": 1.0944261171329421, "grad_norm": 4.738288879394531, "learning_rate": 8.863704413075816e-05, "loss": 2.465706443786621, "memory(GiB)": 77.56, "step": 25545, "token_acc": 0.4867924528301887, "train_speed(iter/s)": 1.441681 }, { "epoch": 1.094640332462191, "grad_norm": 5.577242851257324, "learning_rate": 8.863277224838234e-05, "loss": 2.2139705657958983, "memory(GiB)": 77.56, "step": 25550, "token_acc": 0.5603112840466926, "train_speed(iter/s)": 1.441675 }, { "epoch": 1.09485454779144, "grad_norm": 3.6750497817993164, "learning_rate": 8.862849966613763e-05, "loss": 2.676479530334473, "memory(GiB)": 77.56, "step": 25555, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 1.441686 }, { "epoch": 1.095068763120689, "grad_norm": 4.274869918823242, "learning_rate": 8.862422638410139e-05, "loss": 2.593830680847168, "memory(GiB)": 77.56, "step": 25560, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.441701 }, { "epoch": 1.0952829784499378, "grad_norm": 4.485520839691162, "learning_rate": 8.861995240235106e-05, "loss": 2.6144424438476563, "memory(GiB)": 77.56, "step": 25565, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.441709 }, { "epoch": 1.0954971937791869, "grad_norm": 5.184383869171143, "learning_rate": 8.861567772096408e-05, "loss": 2.807928466796875, "memory(GiB)": 77.56, "step": 25570, "token_acc": 0.460431654676259, "train_speed(iter/s)": 1.441766 }, { "epoch": 1.095711409108436, "grad_norm": 4.967194557189941, "learning_rate": 8.861140234001785e-05, "loss": 2.6933567047119142, "memory(GiB)": 77.56, "step": 25575, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.441754 }, { "epoch": 1.0959256244376847, "grad_norm": 5.986224174499512, "learning_rate": 8.860712625958987e-05, "loss": 2.4381717681884765, "memory(GiB)": 77.56, "step": 25580, "token_acc": 0.45674740484429066, "train_speed(iter/s)": 1.441739 }, { "epoch": 1.0961398397669337, "grad_norm": 4.272314071655273, "learning_rate": 8.860284947975758e-05, "loss": 2.693319320678711, "memory(GiB)": 77.56, "step": 25585, "token_acc": 0.42907801418439717, "train_speed(iter/s)": 1.441721 }, { "epoch": 1.0963540550961828, "grad_norm": 4.546026706695557, "learning_rate": 8.859857200059845e-05, "loss": 2.8001968383789064, "memory(GiB)": 77.56, "step": 25590, "token_acc": 0.4208754208754209, "train_speed(iter/s)": 1.441754 }, { "epoch": 1.0965682704254316, "grad_norm": 6.143930435180664, "learning_rate": 8.859429382218998e-05, "loss": 2.657841682434082, "memory(GiB)": 77.56, "step": 25595, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 1.441783 }, { "epoch": 1.0967824857546806, "grad_norm": 3.526970624923706, "learning_rate": 8.859001494460968e-05, "loss": 2.2485504150390625, "memory(GiB)": 77.56, "step": 25600, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.441715 }, { "epoch": 1.0969967010839297, "grad_norm": 4.2236809730529785, "learning_rate": 8.858573536793504e-05, "loss": 2.58162841796875, "memory(GiB)": 77.56, "step": 25605, "token_acc": 0.4421364985163205, "train_speed(iter/s)": 1.441716 }, { "epoch": 1.0972109164131785, "grad_norm": 4.2564697265625, "learning_rate": 8.858145509224363e-05, "loss": 2.5570674896240235, "memory(GiB)": 77.56, "step": 25610, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.441715 }, { "epoch": 1.0974251317424275, "grad_norm": 4.532004356384277, "learning_rate": 8.857717411761296e-05, "loss": 2.4169822692871095, "memory(GiB)": 77.56, "step": 25615, "token_acc": 0.5212355212355212, "train_speed(iter/s)": 1.441744 }, { "epoch": 1.0976393470716765, "grad_norm": 6.756443977355957, "learning_rate": 8.857289244412059e-05, "loss": 2.1476806640625, "memory(GiB)": 77.56, "step": 25620, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.441836 }, { "epoch": 1.0978535624009254, "grad_norm": 4.35410737991333, "learning_rate": 8.85686100718441e-05, "loss": 2.5056549072265626, "memory(GiB)": 77.56, "step": 25625, "token_acc": 0.475, "train_speed(iter/s)": 1.441846 }, { "epoch": 1.0980677777301744, "grad_norm": 6.6620988845825195, "learning_rate": 8.856432700086104e-05, "loss": 2.506753349304199, "memory(GiB)": 77.56, "step": 25630, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.441814 }, { "epoch": 1.0982819930594234, "grad_norm": 3.700556755065918, "learning_rate": 8.856004323124903e-05, "loss": 2.7397764205932615, "memory(GiB)": 77.56, "step": 25635, "token_acc": 0.4461538461538462, "train_speed(iter/s)": 1.441782 }, { "epoch": 1.0984962083886722, "grad_norm": 3.60774827003479, "learning_rate": 8.855575876308566e-05, "loss": 2.2539976119995115, "memory(GiB)": 77.56, "step": 25640, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.44182 }, { "epoch": 1.0987104237179213, "grad_norm": 6.857401371002197, "learning_rate": 8.855147359644856e-05, "loss": 2.458940124511719, "memory(GiB)": 77.56, "step": 25645, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.441756 }, { "epoch": 1.0989246390471703, "grad_norm": 4.297567844390869, "learning_rate": 8.854718773141535e-05, "loss": 2.6402027130126955, "memory(GiB)": 77.56, "step": 25650, "token_acc": 0.45857988165680474, "train_speed(iter/s)": 1.441758 }, { "epoch": 1.0991388543764191, "grad_norm": 4.711802005767822, "learning_rate": 8.854290116806367e-05, "loss": 2.456376647949219, "memory(GiB)": 77.56, "step": 25655, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.441752 }, { "epoch": 1.0993530697056682, "grad_norm": 5.361373424530029, "learning_rate": 8.853861390647118e-05, "loss": 2.527352714538574, "memory(GiB)": 77.56, "step": 25660, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.441699 }, { "epoch": 1.0995672850349172, "grad_norm": 6.213389873504639, "learning_rate": 8.853432594671554e-05, "loss": 2.4731678009033202, "memory(GiB)": 77.56, "step": 25665, "token_acc": 0.4834710743801653, "train_speed(iter/s)": 1.441746 }, { "epoch": 1.099781500364166, "grad_norm": 4.930428981781006, "learning_rate": 8.853003728887446e-05, "loss": 2.4466569900512694, "memory(GiB)": 77.56, "step": 25670, "token_acc": 0.4568345323741007, "train_speed(iter/s)": 1.441795 }, { "epoch": 1.099995715693415, "grad_norm": 4.919266700744629, "learning_rate": 8.85257479330256e-05, "loss": 2.517359161376953, "memory(GiB)": 77.56, "step": 25675, "token_acc": 0.46494464944649444, "train_speed(iter/s)": 1.441808 }, { "epoch": 1.100209931022664, "grad_norm": 4.77811861038208, "learning_rate": 8.852145787924666e-05, "loss": 2.562126350402832, "memory(GiB)": 77.56, "step": 25680, "token_acc": 0.47720364741641336, "train_speed(iter/s)": 1.441797 }, { "epoch": 1.1004241463519129, "grad_norm": 4.611969947814941, "learning_rate": 8.851716712761538e-05, "loss": 2.1323192596435545, "memory(GiB)": 77.56, "step": 25685, "token_acc": 0.5317460317460317, "train_speed(iter/s)": 1.441781 }, { "epoch": 1.100638361681162, "grad_norm": 5.798098087310791, "learning_rate": 8.851287567820949e-05, "loss": 2.8261680603027344, "memory(GiB)": 77.56, "step": 25690, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.441673 }, { "epoch": 1.100852577010411, "grad_norm": 6.622003555297852, "learning_rate": 8.850858353110674e-05, "loss": 2.2589473724365234, "memory(GiB)": 77.56, "step": 25695, "token_acc": 0.5257731958762887, "train_speed(iter/s)": 1.441634 }, { "epoch": 1.1010667923396598, "grad_norm": 4.319852352142334, "learning_rate": 8.850429068638487e-05, "loss": 2.1083744049072264, "memory(GiB)": 77.56, "step": 25700, "token_acc": 0.555956678700361, "train_speed(iter/s)": 1.441598 }, { "epoch": 1.1012810076689088, "grad_norm": 4.97047233581543, "learning_rate": 8.849999714412165e-05, "loss": 2.4280059814453123, "memory(GiB)": 77.56, "step": 25705, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.441607 }, { "epoch": 1.1014952229981578, "grad_norm": 4.595216274261475, "learning_rate": 8.849570290439486e-05, "loss": 2.5880908966064453, "memory(GiB)": 77.56, "step": 25710, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.441625 }, { "epoch": 1.1017094383274066, "grad_norm": 3.864295482635498, "learning_rate": 8.84914079672823e-05, "loss": 2.6979808807373047, "memory(GiB)": 77.56, "step": 25715, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.441656 }, { "epoch": 1.1019236536566557, "grad_norm": 5.658937931060791, "learning_rate": 8.84871123328618e-05, "loss": 2.458305549621582, "memory(GiB)": 77.56, "step": 25720, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.441594 }, { "epoch": 1.1021378689859047, "grad_norm": 6.146386623382568, "learning_rate": 8.848281600121114e-05, "loss": 2.378607177734375, "memory(GiB)": 77.56, "step": 25725, "token_acc": 0.49795918367346936, "train_speed(iter/s)": 1.441559 }, { "epoch": 1.1023520843151535, "grad_norm": 4.334575176239014, "learning_rate": 8.847851897240815e-05, "loss": 2.3802772521972657, "memory(GiB)": 77.56, "step": 25730, "token_acc": 0.5204081632653061, "train_speed(iter/s)": 1.441587 }, { "epoch": 1.1025662996444026, "grad_norm": 7.283489227294922, "learning_rate": 8.847422124653072e-05, "loss": 2.2444238662719727, "memory(GiB)": 77.56, "step": 25735, "token_acc": 0.524822695035461, "train_speed(iter/s)": 1.441641 }, { "epoch": 1.1027805149736516, "grad_norm": 3.8427562713623047, "learning_rate": 8.846992282365667e-05, "loss": 2.221853256225586, "memory(GiB)": 77.56, "step": 25740, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.441684 }, { "epoch": 1.1029947303029004, "grad_norm": 5.28739595413208, "learning_rate": 8.846562370386389e-05, "loss": 2.318505859375, "memory(GiB)": 77.56, "step": 25745, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.441737 }, { "epoch": 1.1032089456321494, "grad_norm": 5.43912410736084, "learning_rate": 8.846132388723023e-05, "loss": 2.2960477828979493, "memory(GiB)": 77.56, "step": 25750, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.441768 }, { "epoch": 1.1034231609613985, "grad_norm": 5.258962631225586, "learning_rate": 8.845702337383363e-05, "loss": 2.4772518157958983, "memory(GiB)": 77.56, "step": 25755, "token_acc": 0.5, "train_speed(iter/s)": 1.441764 }, { "epoch": 1.1036373762906473, "grad_norm": 5.141008377075195, "learning_rate": 8.845272216375196e-05, "loss": 2.6910627365112303, "memory(GiB)": 77.56, "step": 25760, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.441791 }, { "epoch": 1.1038515916198963, "grad_norm": 5.876232147216797, "learning_rate": 8.844842025706316e-05, "loss": 2.3275226593017577, "memory(GiB)": 77.56, "step": 25765, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.441832 }, { "epoch": 1.1040658069491454, "grad_norm": 5.5710835456848145, "learning_rate": 8.844411765384517e-05, "loss": 2.2320301055908205, "memory(GiB)": 77.56, "step": 25770, "token_acc": 0.5355805243445693, "train_speed(iter/s)": 1.441865 }, { "epoch": 1.1042800222783942, "grad_norm": 4.297060489654541, "learning_rate": 8.843981435417592e-05, "loss": 2.644546890258789, "memory(GiB)": 77.56, "step": 25775, "token_acc": 0.44936708860759494, "train_speed(iter/s)": 1.441847 }, { "epoch": 1.1044942376076432, "grad_norm": 4.347912311553955, "learning_rate": 8.843551035813337e-05, "loss": 2.3509782791137694, "memory(GiB)": 77.56, "step": 25780, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.441892 }, { "epoch": 1.1047084529368922, "grad_norm": 5.660109043121338, "learning_rate": 8.84312056657955e-05, "loss": 2.2397388458251952, "memory(GiB)": 77.56, "step": 25785, "token_acc": 0.4900398406374502, "train_speed(iter/s)": 1.441854 }, { "epoch": 1.104922668266141, "grad_norm": 5.345829010009766, "learning_rate": 8.842690027724029e-05, "loss": 2.4885114669799804, "memory(GiB)": 77.56, "step": 25790, "token_acc": 0.47183098591549294, "train_speed(iter/s)": 1.44187 }, { "epoch": 1.10513688359539, "grad_norm": 4.77627420425415, "learning_rate": 8.842259419254573e-05, "loss": 2.8829463958740233, "memory(GiB)": 77.56, "step": 25795, "token_acc": 0.38848920863309355, "train_speed(iter/s)": 1.4419 }, { "epoch": 1.1053510989246391, "grad_norm": 8.10155200958252, "learning_rate": 8.841828741178984e-05, "loss": 2.7255359649658204, "memory(GiB)": 77.56, "step": 25800, "token_acc": 0.5041322314049587, "train_speed(iter/s)": 1.441914 }, { "epoch": 1.105565314253888, "grad_norm": 6.050825119018555, "learning_rate": 8.841397993505062e-05, "loss": 2.590948486328125, "memory(GiB)": 77.56, "step": 25805, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.441939 }, { "epoch": 1.105779529583137, "grad_norm": 4.583866596221924, "learning_rate": 8.840967176240612e-05, "loss": 2.4210124969482423, "memory(GiB)": 77.56, "step": 25810, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.441939 }, { "epoch": 1.105993744912386, "grad_norm": 5.649902820587158, "learning_rate": 8.840536289393439e-05, "loss": 2.4256710052490233, "memory(GiB)": 77.56, "step": 25815, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.441874 }, { "epoch": 1.1062079602416348, "grad_norm": 7.8919172286987305, "learning_rate": 8.840105332971348e-05, "loss": 2.5325475692749024, "memory(GiB)": 77.56, "step": 25820, "token_acc": 0.4769874476987448, "train_speed(iter/s)": 1.441892 }, { "epoch": 1.1064221755708838, "grad_norm": 5.311280250549316, "learning_rate": 8.839674306982148e-05, "loss": 2.556719970703125, "memory(GiB)": 77.56, "step": 25825, "token_acc": 0.4148606811145511, "train_speed(iter/s)": 1.441943 }, { "epoch": 1.1066363909001329, "grad_norm": 4.978405475616455, "learning_rate": 8.839243211433645e-05, "loss": 2.3605009078979493, "memory(GiB)": 77.56, "step": 25830, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.441958 }, { "epoch": 1.1068506062293817, "grad_norm": 4.0087056159973145, "learning_rate": 8.838812046333648e-05, "loss": 2.4748769760131837, "memory(GiB)": 77.56, "step": 25835, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.44195 }, { "epoch": 1.1070648215586307, "grad_norm": 5.239276885986328, "learning_rate": 8.838380811689973e-05, "loss": 2.437875747680664, "memory(GiB)": 77.56, "step": 25840, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.441853 }, { "epoch": 1.1072790368878798, "grad_norm": 5.4493889808654785, "learning_rate": 8.837949507510427e-05, "loss": 2.2718408584594725, "memory(GiB)": 77.56, "step": 25845, "token_acc": 0.5393258426966292, "train_speed(iter/s)": 1.441857 }, { "epoch": 1.1074932522171286, "grad_norm": 8.784475326538086, "learning_rate": 8.837518133802826e-05, "loss": 2.124480438232422, "memory(GiB)": 77.56, "step": 25850, "token_acc": 0.5546875, "train_speed(iter/s)": 1.441957 }, { "epoch": 1.1077074675463776, "grad_norm": 5.088685035705566, "learning_rate": 8.837086690574983e-05, "loss": 2.448760223388672, "memory(GiB)": 77.56, "step": 25855, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.441989 }, { "epoch": 1.1079216828756266, "grad_norm": 6.981198310852051, "learning_rate": 8.836655177834716e-05, "loss": 2.1593671798706056, "memory(GiB)": 77.56, "step": 25860, "token_acc": 0.5173913043478261, "train_speed(iter/s)": 1.442022 }, { "epoch": 1.1081358982048755, "grad_norm": 4.095747947692871, "learning_rate": 8.836223595589842e-05, "loss": 2.2636686325073243, "memory(GiB)": 77.56, "step": 25865, "token_acc": 0.5050505050505051, "train_speed(iter/s)": 1.442001 }, { "epoch": 1.1083501135341245, "grad_norm": 6.862861633300781, "learning_rate": 8.835791943848178e-05, "loss": 2.42894287109375, "memory(GiB)": 77.56, "step": 25870, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.442041 }, { "epoch": 1.1085643288633735, "grad_norm": 4.592063903808594, "learning_rate": 8.835360222617545e-05, "loss": 2.7197782516479494, "memory(GiB)": 77.56, "step": 25875, "token_acc": 0.4072948328267477, "train_speed(iter/s)": 1.442071 }, { "epoch": 1.1087785441926223, "grad_norm": 5.136329174041748, "learning_rate": 8.834928431905764e-05, "loss": 2.357573127746582, "memory(GiB)": 77.56, "step": 25880, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.442088 }, { "epoch": 1.1089927595218714, "grad_norm": 6.3618855476379395, "learning_rate": 8.834496571720657e-05, "loss": 2.4693374633789062, "memory(GiB)": 77.56, "step": 25885, "token_acc": 0.4246031746031746, "train_speed(iter/s)": 1.442081 }, { "epoch": 1.1092069748511204, "grad_norm": 4.026154518127441, "learning_rate": 8.834064642070045e-05, "loss": 2.4066232681274413, "memory(GiB)": 77.56, "step": 25890, "token_acc": 0.5435540069686411, "train_speed(iter/s)": 1.442062 }, { "epoch": 1.1094211901803692, "grad_norm": 4.661164283752441, "learning_rate": 8.833632642961759e-05, "loss": 2.4651983261108397, "memory(GiB)": 77.56, "step": 25895, "token_acc": 0.4847457627118644, "train_speed(iter/s)": 1.442117 }, { "epoch": 1.1096354055096183, "grad_norm": 4.34984827041626, "learning_rate": 8.83320057440362e-05, "loss": 2.1653614044189453, "memory(GiB)": 77.56, "step": 25900, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.442182 }, { "epoch": 1.1098496208388673, "grad_norm": 6.945590496063232, "learning_rate": 8.832768436403455e-05, "loss": 2.418035888671875, "memory(GiB)": 77.56, "step": 25905, "token_acc": 0.5021459227467812, "train_speed(iter/s)": 1.442229 }, { "epoch": 1.110063836168116, "grad_norm": 7.615697383880615, "learning_rate": 8.832336228969098e-05, "loss": 2.4347999572753904, "memory(GiB)": 77.56, "step": 25910, "token_acc": 0.4919093851132686, "train_speed(iter/s)": 1.442226 }, { "epoch": 1.1102780514973651, "grad_norm": 4.93060827255249, "learning_rate": 8.831903952108372e-05, "loss": 2.6519432067871094, "memory(GiB)": 77.56, "step": 25915, "token_acc": 0.38235294117647056, "train_speed(iter/s)": 1.442238 }, { "epoch": 1.1104922668266142, "grad_norm": 4.502770900726318, "learning_rate": 8.831471605829113e-05, "loss": 2.637506103515625, "memory(GiB)": 77.56, "step": 25920, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.442243 }, { "epoch": 1.110706482155863, "grad_norm": 5.694281101226807, "learning_rate": 8.831039190139152e-05, "loss": 2.467295265197754, "memory(GiB)": 77.56, "step": 25925, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.442203 }, { "epoch": 1.110920697485112, "grad_norm": 7.324888706207275, "learning_rate": 8.830606705046321e-05, "loss": 2.5284379959106444, "memory(GiB)": 77.56, "step": 25930, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.442205 }, { "epoch": 1.111134912814361, "grad_norm": 5.400707721710205, "learning_rate": 8.830174150558456e-05, "loss": 2.1361209869384767, "memory(GiB)": 77.56, "step": 25935, "token_acc": 0.54296875, "train_speed(iter/s)": 1.442123 }, { "epoch": 1.1113491281436099, "grad_norm": 4.79054069519043, "learning_rate": 8.829741526683394e-05, "loss": 2.491075325012207, "memory(GiB)": 77.56, "step": 25940, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.442161 }, { "epoch": 1.111563343472859, "grad_norm": 4.609335899353027, "learning_rate": 8.829308833428972e-05, "loss": 2.286872673034668, "memory(GiB)": 77.56, "step": 25945, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.442177 }, { "epoch": 1.111777558802108, "grad_norm": 6.0736308097839355, "learning_rate": 8.828876070803028e-05, "loss": 2.6855098724365236, "memory(GiB)": 77.56, "step": 25950, "token_acc": 0.43103448275862066, "train_speed(iter/s)": 1.442206 }, { "epoch": 1.1119917741313567, "grad_norm": 5.716984272003174, "learning_rate": 8.828443238813402e-05, "loss": 2.6547584533691406, "memory(GiB)": 77.56, "step": 25955, "token_acc": 0.43288590604026844, "train_speed(iter/s)": 1.442188 }, { "epoch": 1.1122059894606058, "grad_norm": 6.626274108886719, "learning_rate": 8.828010337467936e-05, "loss": 2.5188533782958986, "memory(GiB)": 77.56, "step": 25960, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.442182 }, { "epoch": 1.1124202047898548, "grad_norm": 4.773168563842773, "learning_rate": 8.82757736677447e-05, "loss": 2.5926979064941404, "memory(GiB)": 77.56, "step": 25965, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.442128 }, { "epoch": 1.1126344201191036, "grad_norm": 5.194952964782715, "learning_rate": 8.82714432674085e-05, "loss": 2.8051170349121093, "memory(GiB)": 77.56, "step": 25970, "token_acc": 0.41896024464831805, "train_speed(iter/s)": 1.442127 }, { "epoch": 1.1128486354483527, "grad_norm": 5.4387311935424805, "learning_rate": 8.826711217374921e-05, "loss": 2.7437679290771486, "memory(GiB)": 77.56, "step": 25975, "token_acc": 0.45180722891566266, "train_speed(iter/s)": 1.442167 }, { "epoch": 1.1130628507776017, "grad_norm": 5.421558380126953, "learning_rate": 8.826278038684529e-05, "loss": 2.295772171020508, "memory(GiB)": 77.56, "step": 25980, "token_acc": 0.5541125541125541, "train_speed(iter/s)": 1.442096 }, { "epoch": 1.1132770661068505, "grad_norm": 5.219773769378662, "learning_rate": 8.82584479067752e-05, "loss": 2.402861213684082, "memory(GiB)": 77.56, "step": 25985, "token_acc": 0.5071942446043165, "train_speed(iter/s)": 1.442062 }, { "epoch": 1.1134912814360995, "grad_norm": 5.060913562774658, "learning_rate": 8.825411473361745e-05, "loss": 2.620148849487305, "memory(GiB)": 77.56, "step": 25990, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.442051 }, { "epoch": 1.1137054967653486, "grad_norm": 3.975721597671509, "learning_rate": 8.824978086745051e-05, "loss": 2.5187034606933594, "memory(GiB)": 77.56, "step": 25995, "token_acc": 0.49642857142857144, "train_speed(iter/s)": 1.442092 }, { "epoch": 1.1139197120945974, "grad_norm": 4.269970893859863, "learning_rate": 8.824544630835293e-05, "loss": 2.250044059753418, "memory(GiB)": 77.56, "step": 26000, "token_acc": 0.5175879396984925, "train_speed(iter/s)": 1.442168 }, { "epoch": 1.1139197120945974, "eval_loss": 2.251314640045166, "eval_runtime": 13.7991, "eval_samples_per_second": 7.247, "eval_steps_per_second": 7.247, "eval_token_acc": 0.47183098591549294, "step": 26000 }, { "epoch": 1.1141339274238464, "grad_norm": 4.331987380981445, "learning_rate": 8.82411110564032e-05, "loss": 2.6966156005859374, "memory(GiB)": 77.56, "step": 26005, "token_acc": 0.45722713864306785, "train_speed(iter/s)": 1.441025 }, { "epoch": 1.1143481427530955, "grad_norm": 5.541085243225098, "learning_rate": 8.823677511167986e-05, "loss": 2.686792182922363, "memory(GiB)": 77.56, "step": 26010, "token_acc": 0.44014084507042256, "train_speed(iter/s)": 1.440997 }, { "epoch": 1.1145623580823443, "grad_norm": 4.396245002746582, "learning_rate": 8.823243847426148e-05, "loss": 2.497194290161133, "memory(GiB)": 77.56, "step": 26015, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.441058 }, { "epoch": 1.1147765734115933, "grad_norm": 5.339602947235107, "learning_rate": 8.822810114422662e-05, "loss": 2.597373199462891, "memory(GiB)": 77.56, "step": 26020, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.441028 }, { "epoch": 1.1149907887408423, "grad_norm": 5.3471574783325195, "learning_rate": 8.822376312165384e-05, "loss": 2.3846054077148438, "memory(GiB)": 77.56, "step": 26025, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.440978 }, { "epoch": 1.1152050040700912, "grad_norm": 3.5960137844085693, "learning_rate": 8.821942440662172e-05, "loss": 2.50716552734375, "memory(GiB)": 77.56, "step": 26030, "token_acc": 0.46703296703296704, "train_speed(iter/s)": 1.441037 }, { "epoch": 1.1154192193993402, "grad_norm": 4.165687084197998, "learning_rate": 8.821508499920889e-05, "loss": 2.470718002319336, "memory(GiB)": 77.56, "step": 26035, "token_acc": 0.45348837209302323, "train_speed(iter/s)": 1.441029 }, { "epoch": 1.1156334347285892, "grad_norm": 4.627836227416992, "learning_rate": 8.821074489949395e-05, "loss": 2.4498682022094727, "memory(GiB)": 77.56, "step": 26040, "token_acc": 0.46946564885496184, "train_speed(iter/s)": 1.440992 }, { "epoch": 1.115847650057838, "grad_norm": 5.555534839630127, "learning_rate": 8.820640410755551e-05, "loss": 2.49194393157959, "memory(GiB)": 77.56, "step": 26045, "token_acc": 0.44654088050314467, "train_speed(iter/s)": 1.44102 }, { "epoch": 1.116061865387087, "grad_norm": 4.856287956237793, "learning_rate": 8.820206262347221e-05, "loss": 2.7558454513549804, "memory(GiB)": 77.56, "step": 26050, "token_acc": 0.47706422018348627, "train_speed(iter/s)": 1.44107 }, { "epoch": 1.116276080716336, "grad_norm": 5.018049716949463, "learning_rate": 8.819772044732274e-05, "loss": 2.62425537109375, "memory(GiB)": 77.56, "step": 26055, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.441107 }, { "epoch": 1.1164902960455851, "grad_norm": 5.306628704071045, "learning_rate": 8.81933775791857e-05, "loss": 2.5087875366210937, "memory(GiB)": 77.56, "step": 26060, "token_acc": 0.5375494071146245, "train_speed(iter/s)": 1.441155 }, { "epoch": 1.116704511374834, "grad_norm": 5.25780725479126, "learning_rate": 8.818903401913983e-05, "loss": 2.4063173294067384, "memory(GiB)": 77.56, "step": 26065, "token_acc": 0.4790874524714829, "train_speed(iter/s)": 1.441202 }, { "epoch": 1.116918726704083, "grad_norm": 7.838450908660889, "learning_rate": 8.818468976726377e-05, "loss": 2.5843784332275392, "memory(GiB)": 77.56, "step": 26070, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.441274 }, { "epoch": 1.117132942033332, "grad_norm": 6.996995449066162, "learning_rate": 8.818034482363622e-05, "loss": 2.3615653991699217, "memory(GiB)": 77.56, "step": 26075, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.441275 }, { "epoch": 1.1173471573625808, "grad_norm": 4.48069429397583, "learning_rate": 8.817599918833593e-05, "loss": 2.4722389221191405, "memory(GiB)": 77.56, "step": 26080, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 1.441314 }, { "epoch": 1.1175613726918299, "grad_norm": 6.915204048156738, "learning_rate": 8.817165286144158e-05, "loss": 2.7206195831298827, "memory(GiB)": 77.56, "step": 26085, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.441349 }, { "epoch": 1.117775588021079, "grad_norm": 5.575260162353516, "learning_rate": 8.816730584303194e-05, "loss": 2.3187282562255858, "memory(GiB)": 77.56, "step": 26090, "token_acc": 0.5344827586206896, "train_speed(iter/s)": 1.441359 }, { "epoch": 1.1179898033503277, "grad_norm": 5.173477649688721, "learning_rate": 8.816295813318576e-05, "loss": 2.7541292190551756, "memory(GiB)": 77.56, "step": 26095, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.441352 }, { "epoch": 1.1182040186795768, "grad_norm": 3.787165880203247, "learning_rate": 8.815860973198177e-05, "loss": 2.2994710922241213, "memory(GiB)": 77.56, "step": 26100, "token_acc": 0.563265306122449, "train_speed(iter/s)": 1.441437 }, { "epoch": 1.1184182340088258, "grad_norm": 5.556142330169678, "learning_rate": 8.81542606394988e-05, "loss": 2.636882209777832, "memory(GiB)": 77.56, "step": 26105, "token_acc": 0.4483870967741935, "train_speed(iter/s)": 1.441432 }, { "epoch": 1.1186324493380746, "grad_norm": 5.094545841217041, "learning_rate": 8.814991085581559e-05, "loss": 2.1735029220581055, "memory(GiB)": 77.56, "step": 26110, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.44151 }, { "epoch": 1.1188466646673236, "grad_norm": 5.514937877655029, "learning_rate": 8.814556038101097e-05, "loss": 2.409612274169922, "memory(GiB)": 77.56, "step": 26115, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.441497 }, { "epoch": 1.1190608799965727, "grad_norm": 8.324711799621582, "learning_rate": 8.814120921516372e-05, "loss": 2.4711477279663088, "memory(GiB)": 77.56, "step": 26120, "token_acc": 0.46215139442231074, "train_speed(iter/s)": 1.441562 }, { "epoch": 1.1192750953258215, "grad_norm": 4.866443634033203, "learning_rate": 8.81368573583527e-05, "loss": 2.368330383300781, "memory(GiB)": 77.56, "step": 26125, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.44152 }, { "epoch": 1.1194893106550705, "grad_norm": 4.037306308746338, "learning_rate": 8.813250481065673e-05, "loss": 2.1914657592773437, "memory(GiB)": 77.56, "step": 26130, "token_acc": 0.5343511450381679, "train_speed(iter/s)": 1.441549 }, { "epoch": 1.1197035259843195, "grad_norm": 9.574823379516602, "learning_rate": 8.812815157215466e-05, "loss": 2.874228858947754, "memory(GiB)": 77.56, "step": 26135, "token_acc": 0.4337748344370861, "train_speed(iter/s)": 1.441601 }, { "epoch": 1.1199177413135684, "grad_norm": 4.67606782913208, "learning_rate": 8.812379764292535e-05, "loss": 2.550762939453125, "memory(GiB)": 77.56, "step": 26140, "token_acc": 0.49480968858131485, "train_speed(iter/s)": 1.441655 }, { "epoch": 1.1201319566428174, "grad_norm": 5.895148754119873, "learning_rate": 8.81194430230477e-05, "loss": 2.947303581237793, "memory(GiB)": 77.56, "step": 26145, "token_acc": 0.42946708463949845, "train_speed(iter/s)": 1.441708 }, { "epoch": 1.1203461719720664, "grad_norm": 4.9481377601623535, "learning_rate": 8.811508771260058e-05, "loss": 2.57385368347168, "memory(GiB)": 77.56, "step": 26150, "token_acc": 0.4423076923076923, "train_speed(iter/s)": 1.441689 }, { "epoch": 1.1205603873013152, "grad_norm": 5.031641960144043, "learning_rate": 8.811073171166288e-05, "loss": 2.6819974899291994, "memory(GiB)": 77.56, "step": 26155, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.441711 }, { "epoch": 1.1207746026305643, "grad_norm": 5.014657497406006, "learning_rate": 8.810637502031354e-05, "loss": 2.2858190536499023, "memory(GiB)": 77.56, "step": 26160, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.441557 }, { "epoch": 1.1209888179598133, "grad_norm": 4.458937644958496, "learning_rate": 8.810201763863145e-05, "loss": 2.7890573501586915, "memory(GiB)": 77.56, "step": 26165, "token_acc": 0.44876325088339225, "train_speed(iter/s)": 1.441557 }, { "epoch": 1.1212030332890621, "grad_norm": 5.7796630859375, "learning_rate": 8.809765956669558e-05, "loss": 2.99735107421875, "memory(GiB)": 77.56, "step": 26170, "token_acc": 0.42748091603053434, "train_speed(iter/s)": 1.441611 }, { "epoch": 1.1214172486183112, "grad_norm": 4.740631103515625, "learning_rate": 8.809330080458487e-05, "loss": 2.7155364990234374, "memory(GiB)": 77.56, "step": 26175, "token_acc": 0.43824701195219123, "train_speed(iter/s)": 1.441661 }, { "epoch": 1.1216314639475602, "grad_norm": 4.7244038581848145, "learning_rate": 8.808894135237828e-05, "loss": 2.4301692962646486, "memory(GiB)": 77.56, "step": 26180, "token_acc": 0.5014164305949008, "train_speed(iter/s)": 1.441648 }, { "epoch": 1.121845679276809, "grad_norm": 6.2575178146362305, "learning_rate": 8.808458121015477e-05, "loss": 2.600922966003418, "memory(GiB)": 77.56, "step": 26185, "token_acc": 0.4337748344370861, "train_speed(iter/s)": 1.441592 }, { "epoch": 1.122059894606058, "grad_norm": 4.577580451965332, "learning_rate": 8.808022037799337e-05, "loss": 2.485809326171875, "memory(GiB)": 77.56, "step": 26190, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.4416 }, { "epoch": 1.122274109935307, "grad_norm": 3.8281726837158203, "learning_rate": 8.807585885597305e-05, "loss": 2.5327531814575197, "memory(GiB)": 77.56, "step": 26195, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.441584 }, { "epoch": 1.1224883252645559, "grad_norm": 4.452630043029785, "learning_rate": 8.807149664417283e-05, "loss": 2.776981735229492, "memory(GiB)": 77.56, "step": 26200, "token_acc": 0.4430769230769231, "train_speed(iter/s)": 1.441576 }, { "epoch": 1.122702540593805, "grad_norm": 3.939777135848999, "learning_rate": 8.806713374267172e-05, "loss": 2.3384021759033202, "memory(GiB)": 77.56, "step": 26205, "token_acc": 0.5186567164179104, "train_speed(iter/s)": 1.441603 }, { "epoch": 1.122916755923054, "grad_norm": 5.321191787719727, "learning_rate": 8.806277015154877e-05, "loss": 2.4105514526367187, "memory(GiB)": 77.56, "step": 26210, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.441582 }, { "epoch": 1.1231309712523028, "grad_norm": 5.572187423706055, "learning_rate": 8.805840587088305e-05, "loss": 2.729194450378418, "memory(GiB)": 77.56, "step": 26215, "token_acc": 0.44482758620689655, "train_speed(iter/s)": 1.441523 }, { "epoch": 1.1233451865815518, "grad_norm": 3.8347079753875732, "learning_rate": 8.80540409007536e-05, "loss": 2.781502342224121, "memory(GiB)": 77.56, "step": 26220, "token_acc": 0.45151515151515154, "train_speed(iter/s)": 1.441555 }, { "epoch": 1.1235594019108008, "grad_norm": 4.109711170196533, "learning_rate": 8.80496752412395e-05, "loss": 2.345529556274414, "memory(GiB)": 77.56, "step": 26225, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.441532 }, { "epoch": 1.1237736172400496, "grad_norm": 6.2818732261657715, "learning_rate": 8.804530889241984e-05, "loss": 2.4447484970092774, "memory(GiB)": 77.56, "step": 26230, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.441535 }, { "epoch": 1.1239878325692987, "grad_norm": 4.4610676765441895, "learning_rate": 8.804094185437373e-05, "loss": 2.7864213943481446, "memory(GiB)": 77.56, "step": 26235, "token_acc": 0.43492063492063493, "train_speed(iter/s)": 1.441561 }, { "epoch": 1.1242020478985477, "grad_norm": 4.987005710601807, "learning_rate": 8.803657412718025e-05, "loss": 2.395465850830078, "memory(GiB)": 77.56, "step": 26240, "token_acc": 0.48299319727891155, "train_speed(iter/s)": 1.441586 }, { "epoch": 1.1244162632277965, "grad_norm": 4.750020980834961, "learning_rate": 8.803220571091857e-05, "loss": 2.5770553588867187, "memory(GiB)": 77.56, "step": 26245, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.441585 }, { "epoch": 1.1246304785570456, "grad_norm": 5.316093444824219, "learning_rate": 8.80278366056678e-05, "loss": 2.4066978454589845, "memory(GiB)": 77.56, "step": 26250, "token_acc": 0.47278911564625853, "train_speed(iter/s)": 1.441614 }, { "epoch": 1.1248446938862946, "grad_norm": 8.964237213134766, "learning_rate": 8.80234668115071e-05, "loss": 2.245305061340332, "memory(GiB)": 77.56, "step": 26255, "token_acc": 0.4901185770750988, "train_speed(iter/s)": 1.441625 }, { "epoch": 1.1250589092155434, "grad_norm": 3.6716768741607666, "learning_rate": 8.801909632851561e-05, "loss": 2.6376131057739256, "memory(GiB)": 77.56, "step": 26260, "token_acc": 0.4307692307692308, "train_speed(iter/s)": 1.441581 }, { "epoch": 1.1252731245447924, "grad_norm": 6.126744747161865, "learning_rate": 8.801472515677255e-05, "loss": 2.5930644989013674, "memory(GiB)": 77.56, "step": 26265, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.441585 }, { "epoch": 1.1254873398740415, "grad_norm": 4.10883903503418, "learning_rate": 8.801035329635707e-05, "loss": 2.4922943115234375, "memory(GiB)": 77.56, "step": 26270, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.441547 }, { "epoch": 1.1257015552032903, "grad_norm": 3.914081335067749, "learning_rate": 8.800598074734839e-05, "loss": 2.489690589904785, "memory(GiB)": 77.56, "step": 26275, "token_acc": 0.44662921348314605, "train_speed(iter/s)": 1.441582 }, { "epoch": 1.1259157705325393, "grad_norm": 6.201168060302734, "learning_rate": 8.800160750982572e-05, "loss": 2.7673282623291016, "memory(GiB)": 77.56, "step": 26280, "token_acc": 0.42813455657492355, "train_speed(iter/s)": 1.44164 }, { "epoch": 1.1261299858617884, "grad_norm": 3.8510451316833496, "learning_rate": 8.799723358386826e-05, "loss": 2.39993839263916, "memory(GiB)": 77.56, "step": 26285, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.441629 }, { "epoch": 1.1263442011910372, "grad_norm": 4.904791831970215, "learning_rate": 8.79928589695553e-05, "loss": 2.6010610580444338, "memory(GiB)": 77.56, "step": 26290, "token_acc": 0.4607142857142857, "train_speed(iter/s)": 1.441681 }, { "epoch": 1.1265584165202862, "grad_norm": 3.5493972301483154, "learning_rate": 8.798848366696604e-05, "loss": 2.854984664916992, "memory(GiB)": 77.56, "step": 26295, "token_acc": 0.4559748427672956, "train_speed(iter/s)": 1.441662 }, { "epoch": 1.1267726318495352, "grad_norm": 5.556482315063477, "learning_rate": 8.798410767617977e-05, "loss": 2.749562644958496, "memory(GiB)": 77.56, "step": 26300, "token_acc": 0.47019867549668876, "train_speed(iter/s)": 1.441629 }, { "epoch": 1.126986847178784, "grad_norm": 4.702717304229736, "learning_rate": 8.797973099727575e-05, "loss": 2.464116096496582, "memory(GiB)": 77.56, "step": 26305, "token_acc": 0.5225563909774437, "train_speed(iter/s)": 1.441607 }, { "epoch": 1.127201062508033, "grad_norm": 4.054986000061035, "learning_rate": 8.797535363033327e-05, "loss": 2.187481689453125, "memory(GiB)": 77.56, "step": 26310, "token_acc": 0.5567765567765568, "train_speed(iter/s)": 1.441527 }, { "epoch": 1.1274152778372821, "grad_norm": 4.942107677459717, "learning_rate": 8.797097557543166e-05, "loss": 2.5402469635009766, "memory(GiB)": 77.56, "step": 26315, "token_acc": 0.4652777777777778, "train_speed(iter/s)": 1.441487 }, { "epoch": 1.127629493166531, "grad_norm": 4.518787860870361, "learning_rate": 8.796659683265019e-05, "loss": 2.1068132400512694, "memory(GiB)": 77.56, "step": 26320, "token_acc": 0.5450819672131147, "train_speed(iter/s)": 1.441442 }, { "epoch": 1.12784370849578, "grad_norm": 3.804300308227539, "learning_rate": 8.796221740206819e-05, "loss": 2.612284469604492, "memory(GiB)": 77.56, "step": 26325, "token_acc": 0.47214076246334313, "train_speed(iter/s)": 1.441416 }, { "epoch": 1.128057923825029, "grad_norm": 5.4769086837768555, "learning_rate": 8.795783728376504e-05, "loss": 2.512421226501465, "memory(GiB)": 77.56, "step": 26330, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.441471 }, { "epoch": 1.1282721391542778, "grad_norm": 5.546802997589111, "learning_rate": 8.795345647782002e-05, "loss": 2.368264579772949, "memory(GiB)": 77.56, "step": 26335, "token_acc": 0.47307692307692306, "train_speed(iter/s)": 1.44141 }, { "epoch": 1.1284863544835269, "grad_norm": 17.33209991455078, "learning_rate": 8.794907498431258e-05, "loss": 2.432367706298828, "memory(GiB)": 77.56, "step": 26340, "token_acc": 0.5124555160142349, "train_speed(iter/s)": 1.441372 }, { "epoch": 1.128700569812776, "grad_norm": 5.860182762145996, "learning_rate": 8.794469280332203e-05, "loss": 2.4527870178222657, "memory(GiB)": 77.56, "step": 26345, "token_acc": 0.46258503401360546, "train_speed(iter/s)": 1.441356 }, { "epoch": 1.1289147851420247, "grad_norm": 4.315380573272705, "learning_rate": 8.794030993492777e-05, "loss": 2.508917808532715, "memory(GiB)": 77.56, "step": 26350, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.441335 }, { "epoch": 1.1291290004712737, "grad_norm": 5.9567365646362305, "learning_rate": 8.79359263792092e-05, "loss": 2.632153129577637, "memory(GiB)": 77.56, "step": 26355, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.441371 }, { "epoch": 1.1293432158005228, "grad_norm": 4.8569231033325195, "learning_rate": 8.793154213624572e-05, "loss": 2.5251413345336915, "memory(GiB)": 77.56, "step": 26360, "token_acc": 0.483271375464684, "train_speed(iter/s)": 1.441425 }, { "epoch": 1.1295574311297716, "grad_norm": 6.091347694396973, "learning_rate": 8.792715720611682e-05, "loss": 2.464490509033203, "memory(GiB)": 77.56, "step": 26365, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.441374 }, { "epoch": 1.1297716464590206, "grad_norm": 4.500833034515381, "learning_rate": 8.792277158890185e-05, "loss": 2.649382209777832, "memory(GiB)": 77.56, "step": 26370, "token_acc": 0.46504559270516715, "train_speed(iter/s)": 1.441346 }, { "epoch": 1.1299858617882697, "grad_norm": 7.4974236488342285, "learning_rate": 8.791838528468031e-05, "loss": 2.7042991638183596, "memory(GiB)": 77.56, "step": 26375, "token_acc": 0.45185185185185184, "train_speed(iter/s)": 1.441411 }, { "epoch": 1.1302000771175185, "grad_norm": 4.057533264160156, "learning_rate": 8.791399829353166e-05, "loss": 2.5022491455078124, "memory(GiB)": 77.56, "step": 26380, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.441476 }, { "epoch": 1.1304142924467675, "grad_norm": 5.182284355163574, "learning_rate": 8.790961061553535e-05, "loss": 2.775295639038086, "memory(GiB)": 77.56, "step": 26385, "token_acc": 0.4380664652567976, "train_speed(iter/s)": 1.44142 }, { "epoch": 1.1306285077760165, "grad_norm": 5.530815124511719, "learning_rate": 8.79052222507709e-05, "loss": 2.4753040313720702, "memory(GiB)": 77.56, "step": 26390, "token_acc": 0.4868913857677903, "train_speed(iter/s)": 1.441379 }, { "epoch": 1.1308427231052653, "grad_norm": 4.547403335571289, "learning_rate": 8.790083319931776e-05, "loss": 2.587689208984375, "memory(GiB)": 77.56, "step": 26395, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.44144 }, { "epoch": 1.1310569384345144, "grad_norm": 5.189222812652588, "learning_rate": 8.78964434612555e-05, "loss": 2.399053192138672, "memory(GiB)": 77.56, "step": 26400, "token_acc": 0.49466192170818507, "train_speed(iter/s)": 1.441443 }, { "epoch": 1.1312711537637634, "grad_norm": 4.877856254577637, "learning_rate": 8.789205303666361e-05, "loss": 2.452896308898926, "memory(GiB)": 77.56, "step": 26405, "token_acc": 0.4789272030651341, "train_speed(iter/s)": 1.441458 }, { "epoch": 1.1314853690930122, "grad_norm": 4.806341648101807, "learning_rate": 8.788766192562162e-05, "loss": 2.4968421936035154, "memory(GiB)": 77.56, "step": 26410, "token_acc": 0.46683673469387754, "train_speed(iter/s)": 1.441537 }, { "epoch": 1.1316995844222613, "grad_norm": 4.129325866699219, "learning_rate": 8.78832701282091e-05, "loss": 2.187816047668457, "memory(GiB)": 77.56, "step": 26415, "token_acc": 0.5035211267605634, "train_speed(iter/s)": 1.441527 }, { "epoch": 1.1319137997515103, "grad_norm": 5.51617431640625, "learning_rate": 8.787887764450561e-05, "loss": 2.3775583267211915, "memory(GiB)": 77.56, "step": 26420, "token_acc": 0.44, "train_speed(iter/s)": 1.441477 }, { "epoch": 1.132128015080759, "grad_norm": 5.2360968589782715, "learning_rate": 8.787448447459073e-05, "loss": 2.3888309478759764, "memory(GiB)": 77.56, "step": 26425, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.441508 }, { "epoch": 1.1323422304100081, "grad_norm": 4.695786476135254, "learning_rate": 8.787009061854401e-05, "loss": 2.682851791381836, "memory(GiB)": 77.56, "step": 26430, "token_acc": 0.42124542124542125, "train_speed(iter/s)": 1.441527 }, { "epoch": 1.1325564457392572, "grad_norm": 5.986169815063477, "learning_rate": 8.786569607644509e-05, "loss": 2.5935699462890627, "memory(GiB)": 77.56, "step": 26435, "token_acc": 0.48184818481848185, "train_speed(iter/s)": 1.441575 }, { "epoch": 1.132770661068506, "grad_norm": 6.511702060699463, "learning_rate": 8.786130084837355e-05, "loss": 2.5218067169189453, "memory(GiB)": 77.56, "step": 26440, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.441524 }, { "epoch": 1.132984876397755, "grad_norm": 4.406321048736572, "learning_rate": 8.785690493440904e-05, "loss": 2.4994617462158204, "memory(GiB)": 77.56, "step": 26445, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.441589 }, { "epoch": 1.133199091727004, "grad_norm": 4.7519001960754395, "learning_rate": 8.78525083346312e-05, "loss": 2.6419448852539062, "memory(GiB)": 77.56, "step": 26450, "token_acc": 0.4415204678362573, "train_speed(iter/s)": 1.441641 }, { "epoch": 1.1334133070562529, "grad_norm": 4.948915481567383, "learning_rate": 8.784811104911962e-05, "loss": 2.6675222396850584, "memory(GiB)": 77.56, "step": 26455, "token_acc": 0.4592833876221498, "train_speed(iter/s)": 1.441623 }, { "epoch": 1.133627522385502, "grad_norm": 6.53285026550293, "learning_rate": 8.784371307795403e-05, "loss": 2.504957580566406, "memory(GiB)": 77.56, "step": 26460, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.44161 }, { "epoch": 1.133841737714751, "grad_norm": 3.6881697177886963, "learning_rate": 8.783931442121409e-05, "loss": 2.490731430053711, "memory(GiB)": 77.56, "step": 26465, "token_acc": 0.48026315789473684, "train_speed(iter/s)": 1.441681 }, { "epoch": 1.1340559530439998, "grad_norm": 4.588105201721191, "learning_rate": 8.783491507897946e-05, "loss": 2.483615493774414, "memory(GiB)": 77.56, "step": 26470, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.441687 }, { "epoch": 1.1342701683732488, "grad_norm": 5.5090155601501465, "learning_rate": 8.783051505132984e-05, "loss": 2.467701721191406, "memory(GiB)": 77.56, "step": 26475, "token_acc": 0.4521072796934866, "train_speed(iter/s)": 1.441681 }, { "epoch": 1.1344843837024978, "grad_norm": 8.281801223754883, "learning_rate": 8.782611433834497e-05, "loss": 2.4860801696777344, "memory(GiB)": 77.56, "step": 26480, "token_acc": 0.47950819672131145, "train_speed(iter/s)": 1.441723 }, { "epoch": 1.1346985990317466, "grad_norm": 4.080926895141602, "learning_rate": 8.782171294010455e-05, "loss": 2.6461761474609373, "memory(GiB)": 77.56, "step": 26485, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.441722 }, { "epoch": 1.1349128143609957, "grad_norm": 4.6654510498046875, "learning_rate": 8.781731085668831e-05, "loss": 2.4742658615112303, "memory(GiB)": 77.56, "step": 26490, "token_acc": 0.47854785478547857, "train_speed(iter/s)": 1.441659 }, { "epoch": 1.1351270296902447, "grad_norm": 4.042934417724609, "learning_rate": 8.781290808817602e-05, "loss": 2.532063293457031, "memory(GiB)": 77.56, "step": 26495, "token_acc": 0.47076023391812866, "train_speed(iter/s)": 1.441712 }, { "epoch": 1.1353412450194935, "grad_norm": 5.348151683807373, "learning_rate": 8.780850463464742e-05, "loss": 2.4239437103271486, "memory(GiB)": 77.56, "step": 26500, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.441705 }, { "epoch": 1.1353412450194935, "eval_loss": 2.2603161334991455, "eval_runtime": 15.3761, "eval_samples_per_second": 6.504, "eval_steps_per_second": 6.504, "eval_token_acc": 0.4730077120822622, "step": 26500 }, { "epoch": 1.1355554603487426, "grad_norm": 3.867032527923584, "learning_rate": 8.78041004961823e-05, "loss": 2.438082695007324, "memory(GiB)": 77.56, "step": 26505, "token_acc": 0.47074468085106386, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.1357696756779916, "grad_norm": 5.7271013259887695, "learning_rate": 8.779969567286044e-05, "loss": 2.486477279663086, "memory(GiB)": 77.56, "step": 26510, "token_acc": 0.4784172661870504, "train_speed(iter/s)": 1.440518 }, { "epoch": 1.1359838910072404, "grad_norm": 3.9770939350128174, "learning_rate": 8.779529016476163e-05, "loss": 2.4999099731445313, "memory(GiB)": 77.56, "step": 26515, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.440579 }, { "epoch": 1.1361981063364894, "grad_norm": 4.614752769470215, "learning_rate": 8.779088397196567e-05, "loss": 2.5234304428100587, "memory(GiB)": 77.56, "step": 26520, "token_acc": 0.4326241134751773, "train_speed(iter/s)": 1.440643 }, { "epoch": 1.1364123216657385, "grad_norm": 4.708162307739258, "learning_rate": 8.778647709455241e-05, "loss": 2.6936302185058594, "memory(GiB)": 77.56, "step": 26525, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.440646 }, { "epoch": 1.1366265369949873, "grad_norm": 6.140153884887695, "learning_rate": 8.778206953260166e-05, "loss": 2.4234237670898438, "memory(GiB)": 77.56, "step": 26530, "token_acc": 0.4713375796178344, "train_speed(iter/s)": 1.440624 }, { "epoch": 1.1368407523242363, "grad_norm": 4.023498058319092, "learning_rate": 8.77776612861933e-05, "loss": 2.383848190307617, "memory(GiB)": 77.56, "step": 26535, "token_acc": 0.4715909090909091, "train_speed(iter/s)": 1.440651 }, { "epoch": 1.1370549676534853, "grad_norm": 4.258756160736084, "learning_rate": 8.777325235540715e-05, "loss": 2.7052000045776365, "memory(GiB)": 77.56, "step": 26540, "token_acc": 0.45141065830721006, "train_speed(iter/s)": 1.440637 }, { "epoch": 1.1372691829827342, "grad_norm": 4.366440296173096, "learning_rate": 8.776884274032309e-05, "loss": 2.685236930847168, "memory(GiB)": 77.56, "step": 26545, "token_acc": 0.42900302114803623, "train_speed(iter/s)": 1.440702 }, { "epoch": 1.1374833983119832, "grad_norm": 5.327270984649658, "learning_rate": 8.776443244102103e-05, "loss": 2.7595027923583983, "memory(GiB)": 77.56, "step": 26550, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.440691 }, { "epoch": 1.1376976136412322, "grad_norm": 4.49923038482666, "learning_rate": 8.776002145758084e-05, "loss": 2.661919593811035, "memory(GiB)": 77.56, "step": 26555, "token_acc": 0.4557377049180328, "train_speed(iter/s)": 1.44074 }, { "epoch": 1.137911828970481, "grad_norm": 3.9664995670318604, "learning_rate": 8.775560979008244e-05, "loss": 2.1485109329223633, "memory(GiB)": 77.56, "step": 26560, "token_acc": 0.5222672064777328, "train_speed(iter/s)": 1.440795 }, { "epoch": 1.13812604429973, "grad_norm": 5.120239734649658, "learning_rate": 8.775119743860576e-05, "loss": 2.3520294189453126, "memory(GiB)": 77.56, "step": 26565, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.440864 }, { "epoch": 1.1383402596289791, "grad_norm": 5.736823081970215, "learning_rate": 8.774678440323072e-05, "loss": 2.43723201751709, "memory(GiB)": 77.56, "step": 26570, "token_acc": 0.43724696356275305, "train_speed(iter/s)": 1.440864 }, { "epoch": 1.138554474958228, "grad_norm": 5.541073322296143, "learning_rate": 8.774237068403727e-05, "loss": 2.471674919128418, "memory(GiB)": 77.56, "step": 26575, "token_acc": 0.47186147186147187, "train_speed(iter/s)": 1.440854 }, { "epoch": 1.138768690287477, "grad_norm": 5.638416290283203, "learning_rate": 8.773795628110539e-05, "loss": 2.578312110900879, "memory(GiB)": 77.56, "step": 26580, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.440763 }, { "epoch": 1.138982905616726, "grad_norm": 4.208754062652588, "learning_rate": 8.7733541194515e-05, "loss": 2.5470470428466796, "memory(GiB)": 77.56, "step": 26585, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.440766 }, { "epoch": 1.1391971209459748, "grad_norm": 3.9446842670440674, "learning_rate": 8.772912542434613e-05, "loss": 2.3783805847167967, "memory(GiB)": 77.56, "step": 26590, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 1.440763 }, { "epoch": 1.1394113362752238, "grad_norm": 4.9133782386779785, "learning_rate": 8.772470897067877e-05, "loss": 2.527267646789551, "memory(GiB)": 77.56, "step": 26595, "token_acc": 0.4735099337748344, "train_speed(iter/s)": 1.440762 }, { "epoch": 1.1396255516044729, "grad_norm": 4.864068031311035, "learning_rate": 8.772029183359289e-05, "loss": 2.545042610168457, "memory(GiB)": 77.56, "step": 26600, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.440747 }, { "epoch": 1.1398397669337217, "grad_norm": 5.588677406311035, "learning_rate": 8.771587401316856e-05, "loss": 2.5636566162109373, "memory(GiB)": 77.56, "step": 26605, "token_acc": 0.5110294117647058, "train_speed(iter/s)": 1.440728 }, { "epoch": 1.1400539822629707, "grad_norm": 5.388134956359863, "learning_rate": 8.771145550948579e-05, "loss": 2.35457763671875, "memory(GiB)": 77.56, "step": 26610, "token_acc": 0.5130718954248366, "train_speed(iter/s)": 1.440672 }, { "epoch": 1.1402681975922198, "grad_norm": 6.36678409576416, "learning_rate": 8.770703632262463e-05, "loss": 2.4657562255859373, "memory(GiB)": 77.56, "step": 26615, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.440678 }, { "epoch": 1.1404824129214686, "grad_norm": 4.061175346374512, "learning_rate": 8.770261645266513e-05, "loss": 2.447623062133789, "memory(GiB)": 77.56, "step": 26620, "token_acc": 0.48299319727891155, "train_speed(iter/s)": 1.440755 }, { "epoch": 1.1406966282507176, "grad_norm": 5.193114280700684, "learning_rate": 8.769819589968737e-05, "loss": 2.6491151809692384, "memory(GiB)": 77.56, "step": 26625, "token_acc": 0.45695364238410596, "train_speed(iter/s)": 1.440811 }, { "epoch": 1.1409108435799666, "grad_norm": 5.166196823120117, "learning_rate": 8.76937746637714e-05, "loss": 2.4386377334594727, "memory(GiB)": 77.56, "step": 26630, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.440814 }, { "epoch": 1.1411250589092155, "grad_norm": 4.047293186187744, "learning_rate": 8.768935274499738e-05, "loss": 2.8470130920410157, "memory(GiB)": 77.56, "step": 26635, "token_acc": 0.4230769230769231, "train_speed(iter/s)": 1.44085 }, { "epoch": 1.1413392742384645, "grad_norm": 5.059890270233154, "learning_rate": 8.768493014344536e-05, "loss": 2.4384864807128905, "memory(GiB)": 77.56, "step": 26640, "token_acc": 0.45514950166112955, "train_speed(iter/s)": 1.440834 }, { "epoch": 1.1415534895677135, "grad_norm": 6.320841312408447, "learning_rate": 8.768050685919548e-05, "loss": 2.3440835952758787, "memory(GiB)": 77.56, "step": 26645, "token_acc": 0.5259515570934256, "train_speed(iter/s)": 1.44085 }, { "epoch": 1.1417677048969623, "grad_norm": 4.210465908050537, "learning_rate": 8.767608289232787e-05, "loss": 2.3900732040405273, "memory(GiB)": 77.56, "step": 26650, "token_acc": 0.4907749077490775, "train_speed(iter/s)": 1.440875 }, { "epoch": 1.1419819202262114, "grad_norm": 5.062823295593262, "learning_rate": 8.767165824292269e-05, "loss": 2.9886711120605467, "memory(GiB)": 77.56, "step": 26655, "token_acc": 0.4206128133704735, "train_speed(iter/s)": 1.440859 }, { "epoch": 1.1421961355554604, "grad_norm": 5.273383617401123, "learning_rate": 8.766723291106007e-05, "loss": 2.667852783203125, "memory(GiB)": 77.56, "step": 26660, "token_acc": 0.43521594684385384, "train_speed(iter/s)": 1.440822 }, { "epoch": 1.1424103508847092, "grad_norm": 6.049825191497803, "learning_rate": 8.76628068968202e-05, "loss": 2.532472038269043, "memory(GiB)": 77.56, "step": 26665, "token_acc": 0.46923076923076923, "train_speed(iter/s)": 1.440897 }, { "epoch": 1.1426245662139582, "grad_norm": 5.55664587020874, "learning_rate": 8.765838020028322e-05, "loss": 2.225531578063965, "memory(GiB)": 77.56, "step": 26670, "token_acc": 0.5362903225806451, "train_speed(iter/s)": 1.440905 }, { "epoch": 1.1428387815432073, "grad_norm": 4.696475982666016, "learning_rate": 8.76539528215294e-05, "loss": 2.5136844635009767, "memory(GiB)": 77.56, "step": 26675, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.440913 }, { "epoch": 1.143052996872456, "grad_norm": 3.7514920234680176, "learning_rate": 8.764952476063887e-05, "loss": 2.6212604522705076, "memory(GiB)": 77.56, "step": 26680, "token_acc": 0.49201277955271566, "train_speed(iter/s)": 1.440954 }, { "epoch": 1.1432672122017051, "grad_norm": 4.394812107086182, "learning_rate": 8.76450960176919e-05, "loss": 2.597858428955078, "memory(GiB)": 77.56, "step": 26685, "token_acc": 0.42105263157894735, "train_speed(iter/s)": 1.440996 }, { "epoch": 1.1434814275309542, "grad_norm": 5.044491291046143, "learning_rate": 8.76406665927687e-05, "loss": 2.298162078857422, "memory(GiB)": 77.56, "step": 26690, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.44101 }, { "epoch": 1.143695642860203, "grad_norm": 5.354703903198242, "learning_rate": 8.763623648594952e-05, "loss": 2.4597751617431642, "memory(GiB)": 77.56, "step": 26695, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.440995 }, { "epoch": 1.143909858189452, "grad_norm": 4.959939479827881, "learning_rate": 8.76318056973146e-05, "loss": 2.407014083862305, "memory(GiB)": 77.56, "step": 26700, "token_acc": 0.4752851711026616, "train_speed(iter/s)": 1.44091 }, { "epoch": 1.144124073518701, "grad_norm": 5.766183376312256, "learning_rate": 8.762737422694423e-05, "loss": 2.5530813217163084, "memory(GiB)": 77.56, "step": 26705, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.440918 }, { "epoch": 1.1443382888479499, "grad_norm": 4.712148666381836, "learning_rate": 8.762294207491865e-05, "loss": 2.7044399261474608, "memory(GiB)": 77.56, "step": 26710, "token_acc": 0.4506578947368421, "train_speed(iter/s)": 1.440902 }, { "epoch": 1.144552504177199, "grad_norm": 5.469447135925293, "learning_rate": 8.761850924131821e-05, "loss": 2.260976028442383, "memory(GiB)": 77.56, "step": 26715, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.440906 }, { "epoch": 1.144766719506448, "grad_norm": 5.0596747398376465, "learning_rate": 8.761407572622318e-05, "loss": 2.7360233306884765, "memory(GiB)": 77.56, "step": 26720, "token_acc": 0.431438127090301, "train_speed(iter/s)": 1.440932 }, { "epoch": 1.1449809348356967, "grad_norm": 5.104813098907471, "learning_rate": 8.760964152971387e-05, "loss": 2.345060348510742, "memory(GiB)": 77.56, "step": 26725, "token_acc": 0.5725806451612904, "train_speed(iter/s)": 1.440908 }, { "epoch": 1.1451951501649458, "grad_norm": 3.7433674335479736, "learning_rate": 8.760520665187064e-05, "loss": 2.3104143142700195, "memory(GiB)": 77.56, "step": 26730, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.440952 }, { "epoch": 1.1454093654941948, "grad_norm": 4.41854190826416, "learning_rate": 8.760077109277381e-05, "loss": 2.550655174255371, "memory(GiB)": 77.56, "step": 26735, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.440994 }, { "epoch": 1.1456235808234436, "grad_norm": 3.809072494506836, "learning_rate": 8.759633485250372e-05, "loss": 2.51839656829834, "memory(GiB)": 77.56, "step": 26740, "token_acc": 0.47634069400630913, "train_speed(iter/s)": 1.44105 }, { "epoch": 1.1458377961526927, "grad_norm": 4.477509021759033, "learning_rate": 8.759189793114077e-05, "loss": 2.411824417114258, "memory(GiB)": 77.56, "step": 26745, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.441096 }, { "epoch": 1.1460520114819417, "grad_norm": 4.557850360870361, "learning_rate": 8.758746032876534e-05, "loss": 2.428873062133789, "memory(GiB)": 77.56, "step": 26750, "token_acc": 0.4652567975830816, "train_speed(iter/s)": 1.441131 }, { "epoch": 1.1462662268111905, "grad_norm": 5.25700044631958, "learning_rate": 8.758302204545779e-05, "loss": 2.540868949890137, "memory(GiB)": 77.56, "step": 26755, "token_acc": 0.4980392156862745, "train_speed(iter/s)": 1.44118 }, { "epoch": 1.1464804421404395, "grad_norm": 4.926212310791016, "learning_rate": 8.757858308129854e-05, "loss": 2.358903503417969, "memory(GiB)": 77.56, "step": 26760, "token_acc": 0.5362318840579711, "train_speed(iter/s)": 1.4412 }, { "epoch": 1.1466946574696886, "grad_norm": 3.690906286239624, "learning_rate": 8.7574143436368e-05, "loss": 2.4899175643920897, "memory(GiB)": 77.56, "step": 26765, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.441261 }, { "epoch": 1.1469088727989374, "grad_norm": 4.500529766082764, "learning_rate": 8.756970311074663e-05, "loss": 2.657613754272461, "memory(GiB)": 77.56, "step": 26770, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.441226 }, { "epoch": 1.1471230881281864, "grad_norm": 5.003650665283203, "learning_rate": 8.756526210451483e-05, "loss": 2.2685012817382812, "memory(GiB)": 77.56, "step": 26775, "token_acc": 0.4908424908424908, "train_speed(iter/s)": 1.44121 }, { "epoch": 1.1473373034574355, "grad_norm": 6.734185695648193, "learning_rate": 8.756082041775307e-05, "loss": 2.4928712844848633, "memory(GiB)": 77.56, "step": 26780, "token_acc": 0.484472049689441, "train_speed(iter/s)": 1.441235 }, { "epoch": 1.1475515187866843, "grad_norm": 4.195174217224121, "learning_rate": 8.75563780505418e-05, "loss": 2.3810924530029296, "memory(GiB)": 77.56, "step": 26785, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.441271 }, { "epoch": 1.1477657341159333, "grad_norm": 5.03382682800293, "learning_rate": 8.755193500296152e-05, "loss": 2.7226097106933596, "memory(GiB)": 77.56, "step": 26790, "token_acc": 0.4695121951219512, "train_speed(iter/s)": 1.441326 }, { "epoch": 1.1479799494451823, "grad_norm": 6.038765907287598, "learning_rate": 8.75474912750927e-05, "loss": 2.545185089111328, "memory(GiB)": 77.56, "step": 26795, "token_acc": 0.45482866043613707, "train_speed(iter/s)": 1.44134 }, { "epoch": 1.1481941647744311, "grad_norm": 4.757031440734863, "learning_rate": 8.754304686701588e-05, "loss": 2.3836559295654296, "memory(GiB)": 77.56, "step": 26800, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.441339 }, { "epoch": 1.1484083801036802, "grad_norm": 4.1283416748046875, "learning_rate": 8.753860177881153e-05, "loss": 2.3828325271606445, "memory(GiB)": 77.56, "step": 26805, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.441269 }, { "epoch": 1.1486225954329292, "grad_norm": 5.796810150146484, "learning_rate": 8.753415601056019e-05, "loss": 2.584481620788574, "memory(GiB)": 77.56, "step": 26810, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.441324 }, { "epoch": 1.148836810762178, "grad_norm": 4.63601016998291, "learning_rate": 8.752970956234242e-05, "loss": 2.8847875595092773, "memory(GiB)": 77.56, "step": 26815, "token_acc": 0.4427710843373494, "train_speed(iter/s)": 1.441333 }, { "epoch": 1.149051026091427, "grad_norm": 4.261287689208984, "learning_rate": 8.752526243423874e-05, "loss": 2.788716506958008, "memory(GiB)": 77.56, "step": 26820, "token_acc": 0.4097222222222222, "train_speed(iter/s)": 1.441364 }, { "epoch": 1.149265241420676, "grad_norm": 4.280457496643066, "learning_rate": 8.752081462632973e-05, "loss": 2.270029067993164, "memory(GiB)": 77.56, "step": 26825, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.441393 }, { "epoch": 1.149479456749925, "grad_norm": 4.7293853759765625, "learning_rate": 8.751636613869599e-05, "loss": 2.5516822814941404, "memory(GiB)": 77.56, "step": 26830, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.441396 }, { "epoch": 1.149693672079174, "grad_norm": 3.4673898220062256, "learning_rate": 8.751191697141805e-05, "loss": 2.4915645599365233, "memory(GiB)": 77.56, "step": 26835, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.44142 }, { "epoch": 1.149907887408423, "grad_norm": 5.59115743637085, "learning_rate": 8.750746712457656e-05, "loss": 2.4356884002685546, "memory(GiB)": 77.56, "step": 26840, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.441437 }, { "epoch": 1.1501221027376718, "grad_norm": 4.838622093200684, "learning_rate": 8.750301659825213e-05, "loss": 2.642098808288574, "memory(GiB)": 77.56, "step": 26845, "token_acc": 0.4222972972972973, "train_speed(iter/s)": 1.441431 }, { "epoch": 1.1503363180669208, "grad_norm": 3.916435718536377, "learning_rate": 8.749856539252537e-05, "loss": 2.4598834991455076, "memory(GiB)": 77.56, "step": 26850, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.441475 }, { "epoch": 1.1505505333961699, "grad_norm": 7.166783332824707, "learning_rate": 8.749411350747691e-05, "loss": 2.483937644958496, "memory(GiB)": 77.56, "step": 26855, "token_acc": 0.48338368580060426, "train_speed(iter/s)": 1.441487 }, { "epoch": 1.1507647487254187, "grad_norm": 5.853246688842773, "learning_rate": 8.74896609431874e-05, "loss": 2.5939865112304688, "memory(GiB)": 77.56, "step": 26860, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.441513 }, { "epoch": 1.1509789640546677, "grad_norm": 5.35648250579834, "learning_rate": 8.748520769973754e-05, "loss": 2.4845726013183596, "memory(GiB)": 77.56, "step": 26865, "token_acc": 0.45936395759717313, "train_speed(iter/s)": 1.441573 }, { "epoch": 1.1511931793839167, "grad_norm": 3.7814955711364746, "learning_rate": 8.748075377720796e-05, "loss": 2.400625228881836, "memory(GiB)": 77.56, "step": 26870, "token_acc": 0.49823321554770317, "train_speed(iter/s)": 1.441652 }, { "epoch": 1.1514073947131656, "grad_norm": 5.260591506958008, "learning_rate": 8.747629917567938e-05, "loss": 2.222347068786621, "memory(GiB)": 77.56, "step": 26875, "token_acc": 0.543859649122807, "train_speed(iter/s)": 1.44167 }, { "epoch": 1.1516216100424146, "grad_norm": 5.286369800567627, "learning_rate": 8.747184389523249e-05, "loss": 2.5411750793457033, "memory(GiB)": 77.56, "step": 26880, "token_acc": 0.4630225080385852, "train_speed(iter/s)": 1.441658 }, { "epoch": 1.1518358253716636, "grad_norm": 4.017187118530273, "learning_rate": 8.746738793594798e-05, "loss": 2.5333734512329102, "memory(GiB)": 77.56, "step": 26885, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.441655 }, { "epoch": 1.1520500407009124, "grad_norm": 4.664718151092529, "learning_rate": 8.746293129790659e-05, "loss": 2.5239028930664062, "memory(GiB)": 77.56, "step": 26890, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.441725 }, { "epoch": 1.1522642560301615, "grad_norm": 4.8622636795043945, "learning_rate": 8.745847398118906e-05, "loss": 2.7554922103881836, "memory(GiB)": 77.56, "step": 26895, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.441782 }, { "epoch": 1.1524784713594105, "grad_norm": 4.44446325302124, "learning_rate": 8.745401598587614e-05, "loss": 2.4982284545898437, "memory(GiB)": 77.56, "step": 26900, "token_acc": 0.4603658536585366, "train_speed(iter/s)": 1.441818 }, { "epoch": 1.1526926866886593, "grad_norm": 4.0920305252075195, "learning_rate": 8.744955731204858e-05, "loss": 2.5126646041870115, "memory(GiB)": 77.56, "step": 26905, "token_acc": 0.48068669527896996, "train_speed(iter/s)": 1.441804 }, { "epoch": 1.1529069020179084, "grad_norm": 4.622930526733398, "learning_rate": 8.744509795978716e-05, "loss": 2.499393844604492, "memory(GiB)": 77.56, "step": 26910, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.441817 }, { "epoch": 1.1531211173471574, "grad_norm": 4.551782608032227, "learning_rate": 8.744063792917267e-05, "loss": 2.312860298156738, "memory(GiB)": 77.56, "step": 26915, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.441846 }, { "epoch": 1.1533353326764062, "grad_norm": 4.52789306640625, "learning_rate": 8.74361772202859e-05, "loss": 2.397283363342285, "memory(GiB)": 77.56, "step": 26920, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.441869 }, { "epoch": 1.1535495480056552, "grad_norm": 5.282530784606934, "learning_rate": 8.743171583320765e-05, "loss": 2.7640592575073244, "memory(GiB)": 77.56, "step": 26925, "token_acc": 0.4054982817869416, "train_speed(iter/s)": 1.441915 }, { "epoch": 1.1537637633349043, "grad_norm": 4.0936174392700195, "learning_rate": 8.742725376801877e-05, "loss": 2.269746017456055, "memory(GiB)": 77.56, "step": 26930, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.441859 }, { "epoch": 1.1539779786641533, "grad_norm": 6.713964462280273, "learning_rate": 8.742279102480007e-05, "loss": 2.378371810913086, "memory(GiB)": 77.56, "step": 26935, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.441897 }, { "epoch": 1.1541921939934021, "grad_norm": 4.747091770172119, "learning_rate": 8.74183276036324e-05, "loss": 2.5782567977905275, "memory(GiB)": 77.56, "step": 26940, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.441926 }, { "epoch": 1.1544064093226512, "grad_norm": 4.940014362335205, "learning_rate": 8.741386350459663e-05, "loss": 2.727750778198242, "memory(GiB)": 77.56, "step": 26945, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.441966 }, { "epoch": 1.1546206246519002, "grad_norm": 7.991199970245361, "learning_rate": 8.740939872777363e-05, "loss": 2.41723575592041, "memory(GiB)": 77.56, "step": 26950, "token_acc": 0.5390070921985816, "train_speed(iter/s)": 1.441961 }, { "epoch": 1.154834839981149, "grad_norm": 5.448914527893066, "learning_rate": 8.740493327324425e-05, "loss": 2.5768009185791017, "memory(GiB)": 77.56, "step": 26955, "token_acc": 0.506896551724138, "train_speed(iter/s)": 1.441974 }, { "epoch": 1.155049055310398, "grad_norm": 5.368537902832031, "learning_rate": 8.740046714108947e-05, "loss": 2.632326126098633, "memory(GiB)": 77.56, "step": 26960, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.441969 }, { "epoch": 1.155263270639647, "grad_norm": 4.7027716636657715, "learning_rate": 8.739600033139011e-05, "loss": 2.731697654724121, "memory(GiB)": 77.56, "step": 26965, "token_acc": 0.46319018404907975, "train_speed(iter/s)": 1.441958 }, { "epoch": 1.1554774859688959, "grad_norm": 3.4945216178894043, "learning_rate": 8.739153284422712e-05, "loss": 2.6790306091308596, "memory(GiB)": 77.56, "step": 26970, "token_acc": 0.4338235294117647, "train_speed(iter/s)": 1.441967 }, { "epoch": 1.155691701298145, "grad_norm": 5.31782865524292, "learning_rate": 8.738706467968145e-05, "loss": 2.2991004943847657, "memory(GiB)": 77.56, "step": 26975, "token_acc": 0.5078740157480315, "train_speed(iter/s)": 1.442004 }, { "epoch": 1.155905916627394, "grad_norm": 4.354224681854248, "learning_rate": 8.738259583783401e-05, "loss": 2.6499013900756836, "memory(GiB)": 77.56, "step": 26980, "token_acc": 0.42810457516339867, "train_speed(iter/s)": 1.442091 }, { "epoch": 1.1561201319566428, "grad_norm": 4.768760681152344, "learning_rate": 8.737812631876581e-05, "loss": 2.5658615112304686, "memory(GiB)": 77.56, "step": 26985, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.442096 }, { "epoch": 1.1563343472858918, "grad_norm": 4.8821330070495605, "learning_rate": 8.737365612255777e-05, "loss": 2.4449913024902346, "memory(GiB)": 77.56, "step": 26990, "token_acc": 0.4724137931034483, "train_speed(iter/s)": 1.442148 }, { "epoch": 1.1565485626151408, "grad_norm": 5.709953308105469, "learning_rate": 8.736918524929088e-05, "loss": 2.568259048461914, "memory(GiB)": 77.56, "step": 26995, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.44214 }, { "epoch": 1.1567627779443896, "grad_norm": 6.190929412841797, "learning_rate": 8.736471369904616e-05, "loss": 2.7937015533447265, "memory(GiB)": 77.56, "step": 27000, "token_acc": 0.4222972972972973, "train_speed(iter/s)": 1.442137 }, { "epoch": 1.1567627779443896, "eval_loss": 2.2168946266174316, "eval_runtime": 14.7579, "eval_samples_per_second": 6.776, "eval_steps_per_second": 6.776, "eval_token_acc": 0.4817320703653586, "step": 27000 }, { "epoch": 1.1569769932736387, "grad_norm": 4.465674877166748, "learning_rate": 8.73602414719046e-05, "loss": 2.444173240661621, "memory(GiB)": 77.56, "step": 27005, "token_acc": 0.4771622934888241, "train_speed(iter/s)": 1.440907 }, { "epoch": 1.1571912086028877, "grad_norm": 4.241729736328125, "learning_rate": 8.735576856794722e-05, "loss": 2.522038459777832, "memory(GiB)": 77.56, "step": 27010, "token_acc": 0.4769736842105263, "train_speed(iter/s)": 1.44094 }, { "epoch": 1.1574054239321365, "grad_norm": 4.941840648651123, "learning_rate": 8.735129498725504e-05, "loss": 2.612467956542969, "memory(GiB)": 77.56, "step": 27015, "token_acc": 0.46258503401360546, "train_speed(iter/s)": 1.440991 }, { "epoch": 1.1576196392613856, "grad_norm": 4.547250270843506, "learning_rate": 8.734682072990912e-05, "loss": 2.6390592575073244, "memory(GiB)": 77.56, "step": 27020, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.441022 }, { "epoch": 1.1578338545906346, "grad_norm": 6.028567790985107, "learning_rate": 8.734234579599049e-05, "loss": 2.4138420104980467, "memory(GiB)": 77.56, "step": 27025, "token_acc": 0.44357976653696496, "train_speed(iter/s)": 1.441108 }, { "epoch": 1.1580480699198834, "grad_norm": 4.942901134490967, "learning_rate": 8.733787018558026e-05, "loss": 2.1670726776123046, "memory(GiB)": 77.56, "step": 27030, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.441179 }, { "epoch": 1.1582622852491324, "grad_norm": 5.489515781402588, "learning_rate": 8.733339389875947e-05, "loss": 2.4094064712524412, "memory(GiB)": 77.56, "step": 27035, "token_acc": 0.5, "train_speed(iter/s)": 1.441249 }, { "epoch": 1.1584765005783815, "grad_norm": 5.404365062713623, "learning_rate": 8.732891693560926e-05, "loss": 2.6728492736816407, "memory(GiB)": 77.56, "step": 27040, "token_acc": 0.48417721518987344, "train_speed(iter/s)": 1.441255 }, { "epoch": 1.1586907159076303, "grad_norm": 5.0062174797058105, "learning_rate": 8.732443929621067e-05, "loss": 2.608126640319824, "memory(GiB)": 77.56, "step": 27045, "token_acc": 0.4553846153846154, "train_speed(iter/s)": 1.441235 }, { "epoch": 1.1589049312368793, "grad_norm": 6.288596153259277, "learning_rate": 8.731996098064485e-05, "loss": 2.3181735992431642, "memory(GiB)": 77.56, "step": 27050, "token_acc": 0.49387755102040815, "train_speed(iter/s)": 1.441304 }, { "epoch": 1.1591191465661284, "grad_norm": 6.4204630851745605, "learning_rate": 8.731548198899295e-05, "loss": 2.454659080505371, "memory(GiB)": 77.56, "step": 27055, "token_acc": 0.4491803278688525, "train_speed(iter/s)": 1.441342 }, { "epoch": 1.1593333618953772, "grad_norm": 5.830802917480469, "learning_rate": 8.731100232133607e-05, "loss": 2.412165069580078, "memory(GiB)": 77.56, "step": 27060, "token_acc": 0.5186567164179104, "train_speed(iter/s)": 1.441317 }, { "epoch": 1.1595475772246262, "grad_norm": 4.659091949462891, "learning_rate": 8.730652197775538e-05, "loss": 2.4410703659057615, "memory(GiB)": 77.56, "step": 27065, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.441329 }, { "epoch": 1.1597617925538752, "grad_norm": 3.963329553604126, "learning_rate": 8.730204095833205e-05, "loss": 2.262119674682617, "memory(GiB)": 77.56, "step": 27070, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.441328 }, { "epoch": 1.159976007883124, "grad_norm": 5.534719944000244, "learning_rate": 8.729755926314727e-05, "loss": 2.466657829284668, "memory(GiB)": 77.56, "step": 27075, "token_acc": 0.5222222222222223, "train_speed(iter/s)": 1.441345 }, { "epoch": 1.160190223212373, "grad_norm": 5.126330852508545, "learning_rate": 8.72930768922822e-05, "loss": 2.2823165893554687, "memory(GiB)": 77.56, "step": 27080, "token_acc": 0.48297213622291024, "train_speed(iter/s)": 1.441375 }, { "epoch": 1.1604044385416221, "grad_norm": 6.151721954345703, "learning_rate": 8.728859384581807e-05, "loss": 2.558652877807617, "memory(GiB)": 77.56, "step": 27085, "token_acc": 0.48606811145510836, "train_speed(iter/s)": 1.441393 }, { "epoch": 1.160618653870871, "grad_norm": 4.088532447814941, "learning_rate": 8.728411012383609e-05, "loss": 2.3615997314453123, "memory(GiB)": 77.56, "step": 27090, "token_acc": 0.48404255319148937, "train_speed(iter/s)": 1.441417 }, { "epoch": 1.16083286920012, "grad_norm": 5.527521133422852, "learning_rate": 8.727962572641746e-05, "loss": 2.6201107025146486, "memory(GiB)": 77.56, "step": 27095, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.441427 }, { "epoch": 1.161047084529369, "grad_norm": 7.4720845222473145, "learning_rate": 8.727514065364344e-05, "loss": 2.6456672668457033, "memory(GiB)": 77.56, "step": 27100, "token_acc": 0.44525547445255476, "train_speed(iter/s)": 1.441444 }, { "epoch": 1.1612612998586178, "grad_norm": 5.867828369140625, "learning_rate": 8.72706549055953e-05, "loss": 2.5866325378417967, "memory(GiB)": 77.56, "step": 27105, "token_acc": 0.4448669201520912, "train_speed(iter/s)": 1.44146 }, { "epoch": 1.1614755151878668, "grad_norm": 6.73767614364624, "learning_rate": 8.726616848235426e-05, "loss": 2.450385856628418, "memory(GiB)": 77.56, "step": 27110, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.441516 }, { "epoch": 1.1616897305171159, "grad_norm": 6.656970977783203, "learning_rate": 8.726168138400163e-05, "loss": 2.6038888931274413, "memory(GiB)": 77.56, "step": 27115, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.441488 }, { "epoch": 1.1619039458463647, "grad_norm": 4.685435771942139, "learning_rate": 8.725719361061868e-05, "loss": 2.834351921081543, "memory(GiB)": 77.56, "step": 27120, "token_acc": 0.4273255813953488, "train_speed(iter/s)": 1.441502 }, { "epoch": 1.1621181611756137, "grad_norm": 5.694653511047363, "learning_rate": 8.725270516228674e-05, "loss": 3.034100341796875, "memory(GiB)": 77.56, "step": 27125, "token_acc": 0.4266666666666667, "train_speed(iter/s)": 1.441477 }, { "epoch": 1.1623323765048628, "grad_norm": 4.075718402862549, "learning_rate": 8.724821603908708e-05, "loss": 2.802223968505859, "memory(GiB)": 77.56, "step": 27130, "token_acc": 0.4405797101449275, "train_speed(iter/s)": 1.441487 }, { "epoch": 1.1625465918341116, "grad_norm": 4.457286834716797, "learning_rate": 8.724372624110105e-05, "loss": 2.566387176513672, "memory(GiB)": 77.56, "step": 27135, "token_acc": 0.4432624113475177, "train_speed(iter/s)": 1.441472 }, { "epoch": 1.1627608071633606, "grad_norm": 3.494842290878296, "learning_rate": 8.723923576840997e-05, "loss": 2.4439855575561524, "memory(GiB)": 77.56, "step": 27140, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.441459 }, { "epoch": 1.1629750224926096, "grad_norm": 3.863755226135254, "learning_rate": 8.723474462109522e-05, "loss": 2.4159778594970702, "memory(GiB)": 77.56, "step": 27145, "token_acc": 0.5149253731343284, "train_speed(iter/s)": 1.441412 }, { "epoch": 1.1631892378218585, "grad_norm": 4.90539026260376, "learning_rate": 8.723025279923813e-05, "loss": 2.7178874969482423, "memory(GiB)": 77.56, "step": 27150, "token_acc": 0.4261168384879725, "train_speed(iter/s)": 1.44139 }, { "epoch": 1.1634034531511075, "grad_norm": 8.530177116394043, "learning_rate": 8.722576030292009e-05, "loss": 2.204135513305664, "memory(GiB)": 77.56, "step": 27155, "token_acc": 0.5381526104417671, "train_speed(iter/s)": 1.441405 }, { "epoch": 1.1636176684803565, "grad_norm": 5.1985602378845215, "learning_rate": 8.72212671322225e-05, "loss": 2.464513969421387, "memory(GiB)": 77.56, "step": 27160, "token_acc": 0.4847328244274809, "train_speed(iter/s)": 1.441423 }, { "epoch": 1.1638318838096053, "grad_norm": 4.844686031341553, "learning_rate": 8.72167732872267e-05, "loss": 2.430177688598633, "memory(GiB)": 77.56, "step": 27165, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.441383 }, { "epoch": 1.1640460991388544, "grad_norm": 4.972643852233887, "learning_rate": 8.721227876801417e-05, "loss": 2.4846261978149413, "memory(GiB)": 77.56, "step": 27170, "token_acc": 0.4536741214057508, "train_speed(iter/s)": 1.441342 }, { "epoch": 1.1642603144681034, "grad_norm": 4.513660430908203, "learning_rate": 8.72077835746663e-05, "loss": 2.504902458190918, "memory(GiB)": 77.56, "step": 27175, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.441379 }, { "epoch": 1.1644745297973522, "grad_norm": 5.896902084350586, "learning_rate": 8.720328770726452e-05, "loss": 2.387655258178711, "memory(GiB)": 77.56, "step": 27180, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.441328 }, { "epoch": 1.1646887451266013, "grad_norm": 4.575392246246338, "learning_rate": 8.719879116589028e-05, "loss": 2.243789863586426, "memory(GiB)": 77.56, "step": 27185, "token_acc": 0.5214521452145214, "train_speed(iter/s)": 1.441397 }, { "epoch": 1.1649029604558503, "grad_norm": 5.380704879760742, "learning_rate": 8.719429395062504e-05, "loss": 2.9992685317993164, "memory(GiB)": 77.56, "step": 27190, "token_acc": 0.42058823529411765, "train_speed(iter/s)": 1.441454 }, { "epoch": 1.165117175785099, "grad_norm": 3.886584758758545, "learning_rate": 8.718979606155029e-05, "loss": 2.4537332534790037, "memory(GiB)": 77.56, "step": 27195, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.441474 }, { "epoch": 1.1653313911143481, "grad_norm": 4.117032527923584, "learning_rate": 8.718529749874749e-05, "loss": 2.2546730041503906, "memory(GiB)": 77.56, "step": 27200, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.441529 }, { "epoch": 1.1655456064435972, "grad_norm": 6.254148006439209, "learning_rate": 8.718079826229813e-05, "loss": 2.5495967864990234, "memory(GiB)": 77.56, "step": 27205, "token_acc": 0.5114754098360655, "train_speed(iter/s)": 1.441546 }, { "epoch": 1.165759821772846, "grad_norm": 4.410702228546143, "learning_rate": 8.717629835228375e-05, "loss": 2.3235519409179686, "memory(GiB)": 77.56, "step": 27210, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.441588 }, { "epoch": 1.165974037102095, "grad_norm": 4.281838893890381, "learning_rate": 8.717179776878583e-05, "loss": 2.635189247131348, "memory(GiB)": 77.56, "step": 27215, "token_acc": 0.4691358024691358, "train_speed(iter/s)": 1.441576 }, { "epoch": 1.166188252431344, "grad_norm": 5.667890548706055, "learning_rate": 8.716729651188595e-05, "loss": 2.604692077636719, "memory(GiB)": 77.56, "step": 27220, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.441617 }, { "epoch": 1.1664024677605929, "grad_norm": 4.3095622062683105, "learning_rate": 8.71627945816656e-05, "loss": 2.71480770111084, "memory(GiB)": 77.56, "step": 27225, "token_acc": 0.4506578947368421, "train_speed(iter/s)": 1.44159 }, { "epoch": 1.166616683089842, "grad_norm": 6.5331549644470215, "learning_rate": 8.715829197820639e-05, "loss": 2.582870101928711, "memory(GiB)": 77.56, "step": 27230, "token_acc": 0.4626334519572954, "train_speed(iter/s)": 1.441538 }, { "epoch": 1.166830898419091, "grad_norm": 6.405350208282471, "learning_rate": 8.715378870158986e-05, "loss": 2.538889694213867, "memory(GiB)": 77.56, "step": 27235, "token_acc": 0.45555555555555555, "train_speed(iter/s)": 1.441466 }, { "epoch": 1.1670451137483397, "grad_norm": 4.921253204345703, "learning_rate": 8.714928475189759e-05, "loss": 2.706508445739746, "memory(GiB)": 77.56, "step": 27240, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.441489 }, { "epoch": 1.1672593290775888, "grad_norm": 5.2867255210876465, "learning_rate": 8.714478012921117e-05, "loss": 2.542100524902344, "memory(GiB)": 77.56, "step": 27245, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.441531 }, { "epoch": 1.1674735444068378, "grad_norm": 6.396088600158691, "learning_rate": 8.714027483361222e-05, "loss": 2.621960258483887, "memory(GiB)": 77.56, "step": 27250, "token_acc": 0.4778481012658228, "train_speed(iter/s)": 1.441606 }, { "epoch": 1.1676877597360866, "grad_norm": 4.936322212219238, "learning_rate": 8.713576886518235e-05, "loss": 2.33414306640625, "memory(GiB)": 77.56, "step": 27255, "token_acc": 0.5426356589147286, "train_speed(iter/s)": 1.441652 }, { "epoch": 1.1679019750653357, "grad_norm": 4.305262565612793, "learning_rate": 8.713126222400319e-05, "loss": 2.306503677368164, "memory(GiB)": 77.56, "step": 27260, "token_acc": 0.5214285714285715, "train_speed(iter/s)": 1.44166 }, { "epoch": 1.1681161903945847, "grad_norm": 5.110226631164551, "learning_rate": 8.712675491015638e-05, "loss": 2.513997268676758, "memory(GiB)": 77.56, "step": 27265, "token_acc": 0.46200607902735563, "train_speed(iter/s)": 1.441597 }, { "epoch": 1.1683304057238335, "grad_norm": 5.511990547180176, "learning_rate": 8.712224692372358e-05, "loss": 2.761313819885254, "memory(GiB)": 77.56, "step": 27270, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.441659 }, { "epoch": 1.1685446210530825, "grad_norm": 4.586798191070557, "learning_rate": 8.711773826478646e-05, "loss": 2.7100988388061524, "memory(GiB)": 77.56, "step": 27275, "token_acc": 0.4337748344370861, "train_speed(iter/s)": 1.441719 }, { "epoch": 1.1687588363823316, "grad_norm": 6.718746185302734, "learning_rate": 8.711322893342668e-05, "loss": 2.5854291915893555, "memory(GiB)": 77.56, "step": 27280, "token_acc": 0.4618320610687023, "train_speed(iter/s)": 1.44174 }, { "epoch": 1.1689730517115804, "grad_norm": 4.221371650695801, "learning_rate": 8.710871892972595e-05, "loss": 2.454153633117676, "memory(GiB)": 77.56, "step": 27285, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.441755 }, { "epoch": 1.1691872670408294, "grad_norm": 3.7791264057159424, "learning_rate": 8.710420825376597e-05, "loss": 2.414768409729004, "memory(GiB)": 77.56, "step": 27290, "token_acc": 0.48942598187311176, "train_speed(iter/s)": 1.44179 }, { "epoch": 1.1694014823700785, "grad_norm": 4.8100152015686035, "learning_rate": 8.709969690562845e-05, "loss": 2.9373796463012694, "memory(GiB)": 77.56, "step": 27295, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.441702 }, { "epoch": 1.1696156976993273, "grad_norm": 6.438279628753662, "learning_rate": 8.709518488539511e-05, "loss": 2.565231132507324, "memory(GiB)": 77.56, "step": 27300, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.441756 }, { "epoch": 1.1698299130285763, "grad_norm": 4.581357002258301, "learning_rate": 8.709067219314771e-05, "loss": 2.4176259994506837, "memory(GiB)": 77.56, "step": 27305, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.441798 }, { "epoch": 1.1700441283578253, "grad_norm": 4.517197608947754, "learning_rate": 8.708615882896798e-05, "loss": 2.441977882385254, "memory(GiB)": 77.56, "step": 27310, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.441801 }, { "epoch": 1.1702583436870742, "grad_norm": 4.801573276519775, "learning_rate": 8.708164479293767e-05, "loss": 2.4015045166015625, "memory(GiB)": 77.56, "step": 27315, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.441789 }, { "epoch": 1.1704725590163232, "grad_norm": 5.074005603790283, "learning_rate": 8.707713008513863e-05, "loss": 2.510340118408203, "memory(GiB)": 77.56, "step": 27320, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.441739 }, { "epoch": 1.1706867743455722, "grad_norm": 4.657685279846191, "learning_rate": 8.707261470565256e-05, "loss": 2.5617416381835936, "memory(GiB)": 77.56, "step": 27325, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.441709 }, { "epoch": 1.170900989674821, "grad_norm": 4.859247207641602, "learning_rate": 8.706809865456131e-05, "loss": 2.5886001586914062, "memory(GiB)": 77.56, "step": 27330, "token_acc": 0.5209125475285171, "train_speed(iter/s)": 1.441639 }, { "epoch": 1.17111520500407, "grad_norm": 4.961772918701172, "learning_rate": 8.706358193194668e-05, "loss": 2.3527488708496094, "memory(GiB)": 77.56, "step": 27335, "token_acc": 0.4724137931034483, "train_speed(iter/s)": 1.441579 }, { "epoch": 1.171329420333319, "grad_norm": 4.1337785720825195, "learning_rate": 8.705906453789049e-05, "loss": 2.345884323120117, "memory(GiB)": 77.56, "step": 27340, "token_acc": 0.49794238683127573, "train_speed(iter/s)": 1.44166 }, { "epoch": 1.171543635662568, "grad_norm": 4.445571422576904, "learning_rate": 8.705454647247458e-05, "loss": 2.6101245880126953, "memory(GiB)": 77.56, "step": 27345, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.441689 }, { "epoch": 1.171757850991817, "grad_norm": 4.270284652709961, "learning_rate": 8.705002773578081e-05, "loss": 2.3811355590820313, "memory(GiB)": 77.56, "step": 27350, "token_acc": 0.5310344827586206, "train_speed(iter/s)": 1.441676 }, { "epoch": 1.171972066321066, "grad_norm": 4.850458145141602, "learning_rate": 8.704550832789103e-05, "loss": 2.973872947692871, "memory(GiB)": 77.56, "step": 27355, "token_acc": 0.4458204334365325, "train_speed(iter/s)": 1.441726 }, { "epoch": 1.172186281650315, "grad_norm": 3.9101269245147705, "learning_rate": 8.704098824888711e-05, "loss": 2.577581596374512, "memory(GiB)": 77.56, "step": 27360, "token_acc": 0.48026315789473684, "train_speed(iter/s)": 1.441795 }, { "epoch": 1.1724004969795638, "grad_norm": 5.0655412673950195, "learning_rate": 8.703646749885095e-05, "loss": 2.536365509033203, "memory(GiB)": 77.56, "step": 27365, "token_acc": 0.46646341463414637, "train_speed(iter/s)": 1.441832 }, { "epoch": 1.1726147123088129, "grad_norm": 4.293900012969971, "learning_rate": 8.703194607786442e-05, "loss": 2.609212303161621, "memory(GiB)": 77.56, "step": 27370, "token_acc": 0.4893048128342246, "train_speed(iter/s)": 1.441793 }, { "epoch": 1.172828927638062, "grad_norm": 5.184179306030273, "learning_rate": 8.702742398600947e-05, "loss": 2.3663558959960938, "memory(GiB)": 77.56, "step": 27375, "token_acc": 0.4774193548387097, "train_speed(iter/s)": 1.441807 }, { "epoch": 1.1730431429673107, "grad_norm": 4.691805362701416, "learning_rate": 8.702290122336798e-05, "loss": 2.697550964355469, "memory(GiB)": 77.56, "step": 27380, "token_acc": 0.42028985507246375, "train_speed(iter/s)": 1.441782 }, { "epoch": 1.1732573582965597, "grad_norm": 4.138439178466797, "learning_rate": 8.701837779002192e-05, "loss": 2.500801849365234, "memory(GiB)": 77.56, "step": 27385, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.441721 }, { "epoch": 1.1734715736258088, "grad_norm": 4.723711967468262, "learning_rate": 8.701385368605322e-05, "loss": 2.8100181579589845, "memory(GiB)": 77.56, "step": 27390, "token_acc": 0.4140127388535032, "train_speed(iter/s)": 1.441806 }, { "epoch": 1.1736857889550576, "grad_norm": 6.031518936157227, "learning_rate": 8.700932891154383e-05, "loss": 2.2398391723632813, "memory(GiB)": 77.56, "step": 27395, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.441884 }, { "epoch": 1.1739000042843066, "grad_norm": 4.962797164916992, "learning_rate": 8.700480346657574e-05, "loss": 2.5756046295166017, "memory(GiB)": 77.56, "step": 27400, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.441885 }, { "epoch": 1.1741142196135557, "grad_norm": 5.7035722732543945, "learning_rate": 8.70002773512309e-05, "loss": 2.5211410522460938, "memory(GiB)": 77.56, "step": 27405, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.441946 }, { "epoch": 1.1743284349428045, "grad_norm": 4.7708892822265625, "learning_rate": 8.699575056559134e-05, "loss": 2.3003841400146485, "memory(GiB)": 77.56, "step": 27410, "token_acc": 0.5205992509363296, "train_speed(iter/s)": 1.442019 }, { "epoch": 1.1745426502720535, "grad_norm": 5.4351630210876465, "learning_rate": 8.699122310973907e-05, "loss": 2.4649078369140627, "memory(GiB)": 77.56, "step": 27415, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.442022 }, { "epoch": 1.1747568656013025, "grad_norm": 5.503630638122559, "learning_rate": 8.698669498375606e-05, "loss": 2.4473007202148436, "memory(GiB)": 77.56, "step": 27420, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.442027 }, { "epoch": 1.1749710809305514, "grad_norm": 3.8551297187805176, "learning_rate": 8.698216618772439e-05, "loss": 2.430198860168457, "memory(GiB)": 77.56, "step": 27425, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 1.442065 }, { "epoch": 1.1751852962598004, "grad_norm": 3.7930805683135986, "learning_rate": 8.69776367217261e-05, "loss": 2.3508865356445314, "memory(GiB)": 77.56, "step": 27430, "token_acc": 0.5230263157894737, "train_speed(iter/s)": 1.44212 }, { "epoch": 1.1753995115890494, "grad_norm": 4.585013389587402, "learning_rate": 8.697310658584321e-05, "loss": 2.5842390060424805, "memory(GiB)": 77.56, "step": 27435, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.442137 }, { "epoch": 1.1756137269182982, "grad_norm": 4.173397541046143, "learning_rate": 8.696857578015783e-05, "loss": 3.0209892272949217, "memory(GiB)": 77.56, "step": 27440, "token_acc": 0.4148606811145511, "train_speed(iter/s)": 1.442165 }, { "epoch": 1.1758279422475473, "grad_norm": 4.019002437591553, "learning_rate": 8.696404430475201e-05, "loss": 2.3971990585327148, "memory(GiB)": 77.56, "step": 27445, "token_acc": 0.45045045045045046, "train_speed(iter/s)": 1.442192 }, { "epoch": 1.1760421575767963, "grad_norm": 5.549796104431152, "learning_rate": 8.695951215970785e-05, "loss": 2.759377288818359, "memory(GiB)": 77.56, "step": 27450, "token_acc": 0.42700729927007297, "train_speed(iter/s)": 1.442127 }, { "epoch": 1.1762563729060451, "grad_norm": 4.445778846740723, "learning_rate": 8.695497934510747e-05, "loss": 2.447062301635742, "memory(GiB)": 77.56, "step": 27455, "token_acc": 0.46060606060606063, "train_speed(iter/s)": 1.442181 }, { "epoch": 1.1764705882352942, "grad_norm": 8.30522632598877, "learning_rate": 8.695044586103296e-05, "loss": 2.5784292221069336, "memory(GiB)": 77.56, "step": 27460, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.442237 }, { "epoch": 1.1766848035645432, "grad_norm": 4.122225284576416, "learning_rate": 8.694591170756647e-05, "loss": 2.432565689086914, "memory(GiB)": 77.56, "step": 27465, "token_acc": 0.4984520123839009, "train_speed(iter/s)": 1.442197 }, { "epoch": 1.176899018893792, "grad_norm": 4.906227111816406, "learning_rate": 8.694137688479012e-05, "loss": 2.5537179946899413, "memory(GiB)": 77.56, "step": 27470, "token_acc": 0.5049833887043189, "train_speed(iter/s)": 1.442297 }, { "epoch": 1.177113234223041, "grad_norm": 4.375225067138672, "learning_rate": 8.693684139278609e-05, "loss": 2.3931270599365235, "memory(GiB)": 77.56, "step": 27475, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.442334 }, { "epoch": 1.17732744955229, "grad_norm": 5.097619533538818, "learning_rate": 8.693230523163652e-05, "loss": 2.782934379577637, "memory(GiB)": 77.56, "step": 27480, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.442375 }, { "epoch": 1.1775416648815389, "grad_norm": 5.939589500427246, "learning_rate": 8.692776840142359e-05, "loss": 2.272409439086914, "memory(GiB)": 77.56, "step": 27485, "token_acc": 0.5148936170212766, "train_speed(iter/s)": 1.442359 }, { "epoch": 1.177755880210788, "grad_norm": 3.857491970062256, "learning_rate": 8.692323090222949e-05, "loss": 2.517545700073242, "memory(GiB)": 77.56, "step": 27490, "token_acc": 0.4691689008042895, "train_speed(iter/s)": 1.442311 }, { "epoch": 1.177970095540037, "grad_norm": 4.787905693054199, "learning_rate": 8.691869273413645e-05, "loss": 2.540113830566406, "memory(GiB)": 77.56, "step": 27495, "token_acc": 0.5061728395061729, "train_speed(iter/s)": 1.442377 }, { "epoch": 1.1781843108692858, "grad_norm": 4.3217082023620605, "learning_rate": 8.691415389722663e-05, "loss": 2.4738290786743162, "memory(GiB)": 77.56, "step": 27500, "token_acc": 0.46387832699619774, "train_speed(iter/s)": 1.442406 }, { "epoch": 1.1781843108692858, "eval_loss": 2.181292772293091, "eval_runtime": 15.071, "eval_samples_per_second": 6.635, "eval_steps_per_second": 6.635, "eval_token_acc": 0.4791386271870794, "step": 27500 }, { "epoch": 1.1783985261985348, "grad_norm": 4.888856410980225, "learning_rate": 8.690961439158229e-05, "loss": 2.4217287063598634, "memory(GiB)": 77.56, "step": 27505, "token_acc": 0.47604790419161674, "train_speed(iter/s)": 1.441203 }, { "epoch": 1.1786127415277838, "grad_norm": 6.293729305267334, "learning_rate": 8.690507421728566e-05, "loss": 2.4635908126831056, "memory(GiB)": 77.56, "step": 27510, "token_acc": 0.5114503816793893, "train_speed(iter/s)": 1.441178 }, { "epoch": 1.1788269568570326, "grad_norm": 4.702425956726074, "learning_rate": 8.690053337441901e-05, "loss": 2.319647026062012, "memory(GiB)": 77.56, "step": 27515, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.441269 }, { "epoch": 1.1790411721862817, "grad_norm": 4.294443130493164, "learning_rate": 8.689599186306457e-05, "loss": 2.5872364044189453, "memory(GiB)": 77.56, "step": 27520, "token_acc": 0.48026315789473684, "train_speed(iter/s)": 1.441277 }, { "epoch": 1.1792553875155307, "grad_norm": 4.025820255279541, "learning_rate": 8.689144968330462e-05, "loss": 2.332527732849121, "memory(GiB)": 77.56, "step": 27525, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.441261 }, { "epoch": 1.1794696028447795, "grad_norm": 6.125864505767822, "learning_rate": 8.688690683522147e-05, "loss": 2.187384033203125, "memory(GiB)": 77.56, "step": 27530, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.441303 }, { "epoch": 1.1796838181740286, "grad_norm": 4.251725196838379, "learning_rate": 8.688236331889739e-05, "loss": 2.6505970001220702, "memory(GiB)": 77.56, "step": 27535, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.441336 }, { "epoch": 1.1798980335032776, "grad_norm": 5.752416133880615, "learning_rate": 8.687781913441469e-05, "loss": 2.47400016784668, "memory(GiB)": 77.56, "step": 27540, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.441299 }, { "epoch": 1.1801122488325264, "grad_norm": 6.580593585968018, "learning_rate": 8.687327428185573e-05, "loss": 2.7571815490722655, "memory(GiB)": 77.56, "step": 27545, "token_acc": 0.44244604316546765, "train_speed(iter/s)": 1.44136 }, { "epoch": 1.1803264641617754, "grad_norm": 8.08622932434082, "learning_rate": 8.686872876130279e-05, "loss": 2.763301467895508, "memory(GiB)": 77.56, "step": 27550, "token_acc": 0.45564516129032256, "train_speed(iter/s)": 1.441431 }, { "epoch": 1.1805406794910245, "grad_norm": 4.863842487335205, "learning_rate": 8.686418257283824e-05, "loss": 2.249067687988281, "memory(GiB)": 77.56, "step": 27555, "token_acc": 0.5378151260504201, "train_speed(iter/s)": 1.441452 }, { "epoch": 1.1807548948202733, "grad_norm": 5.426926612854004, "learning_rate": 8.685963571654445e-05, "loss": 2.4215042114257814, "memory(GiB)": 77.56, "step": 27560, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.441415 }, { "epoch": 1.1809691101495223, "grad_norm": 3.8409945964813232, "learning_rate": 8.685508819250379e-05, "loss": 2.6023340225219727, "memory(GiB)": 77.56, "step": 27565, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.441432 }, { "epoch": 1.1811833254787714, "grad_norm": 4.093282699584961, "learning_rate": 8.685054000079863e-05, "loss": 2.6001235961914064, "memory(GiB)": 77.56, "step": 27570, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.441483 }, { "epoch": 1.1813975408080202, "grad_norm": 5.113375663757324, "learning_rate": 8.684599114151137e-05, "loss": 2.6917396545410157, "memory(GiB)": 77.56, "step": 27575, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.441502 }, { "epoch": 1.1816117561372692, "grad_norm": 5.857832908630371, "learning_rate": 8.684144161472441e-05, "loss": 2.4806318283081055, "memory(GiB)": 77.56, "step": 27580, "token_acc": 0.4461538461538462, "train_speed(iter/s)": 1.441462 }, { "epoch": 1.1818259714665182, "grad_norm": 5.172798156738281, "learning_rate": 8.683689142052018e-05, "loss": 2.2441524505615233, "memory(GiB)": 77.56, "step": 27585, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.441514 }, { "epoch": 1.182040186795767, "grad_norm": 4.691322326660156, "learning_rate": 8.68323405589811e-05, "loss": 2.4120204925537108, "memory(GiB)": 77.56, "step": 27590, "token_acc": 0.43686006825938567, "train_speed(iter/s)": 1.441536 }, { "epoch": 1.182254402125016, "grad_norm": 5.982290744781494, "learning_rate": 8.682778903018962e-05, "loss": 2.453257369995117, "memory(GiB)": 77.56, "step": 27595, "token_acc": 0.46099290780141844, "train_speed(iter/s)": 1.441457 }, { "epoch": 1.1824686174542651, "grad_norm": 5.5276336669921875, "learning_rate": 8.68232368342282e-05, "loss": 2.1503772735595703, "memory(GiB)": 77.56, "step": 27600, "token_acc": 0.5096525096525096, "train_speed(iter/s)": 1.441457 }, { "epoch": 1.182682832783514, "grad_norm": 4.469607830047607, "learning_rate": 8.68186839711793e-05, "loss": 2.3769208908081056, "memory(GiB)": 77.56, "step": 27605, "token_acc": 0.5236486486486487, "train_speed(iter/s)": 1.441469 }, { "epoch": 1.182897048112763, "grad_norm": 5.754303932189941, "learning_rate": 8.68141304411254e-05, "loss": 2.381919288635254, "memory(GiB)": 77.56, "step": 27610, "token_acc": 0.48559670781893005, "train_speed(iter/s)": 1.441424 }, { "epoch": 1.183111263442012, "grad_norm": 7.353774547576904, "learning_rate": 8.680957624414901e-05, "loss": 2.832828140258789, "memory(GiB)": 77.56, "step": 27615, "token_acc": 0.45054945054945056, "train_speed(iter/s)": 1.441461 }, { "epoch": 1.1833254787712608, "grad_norm": 6.191554069519043, "learning_rate": 8.68050213803326e-05, "loss": 2.3928375244140625, "memory(GiB)": 77.56, "step": 27620, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.44149 }, { "epoch": 1.1835396941005099, "grad_norm": 5.659239768981934, "learning_rate": 8.68004658497587e-05, "loss": 2.41302490234375, "memory(GiB)": 77.56, "step": 27625, "token_acc": 0.49377593360995853, "train_speed(iter/s)": 1.44152 }, { "epoch": 1.1837539094297589, "grad_norm": 4.310657501220703, "learning_rate": 8.679590965250984e-05, "loss": 2.7027875900268556, "memory(GiB)": 77.56, "step": 27630, "token_acc": 0.4665012406947891, "train_speed(iter/s)": 1.44146 }, { "epoch": 1.1839681247590077, "grad_norm": 4.37050724029541, "learning_rate": 8.679135278866855e-05, "loss": 2.389889144897461, "memory(GiB)": 77.56, "step": 27635, "token_acc": 0.48828125, "train_speed(iter/s)": 1.441456 }, { "epoch": 1.1841823400882567, "grad_norm": 4.196377277374268, "learning_rate": 8.678679525831742e-05, "loss": 2.388215255737305, "memory(GiB)": 77.56, "step": 27640, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.441494 }, { "epoch": 1.1843965554175058, "grad_norm": 5.027113437652588, "learning_rate": 8.678223706153895e-05, "loss": 2.5079744338989256, "memory(GiB)": 77.56, "step": 27645, "token_acc": 0.5271084337349398, "train_speed(iter/s)": 1.441461 }, { "epoch": 1.1846107707467546, "grad_norm": 6.359491348266602, "learning_rate": 8.677767819841577e-05, "loss": 2.491704750061035, "memory(GiB)": 77.56, "step": 27650, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.441427 }, { "epoch": 1.1848249860760036, "grad_norm": 4.476621627807617, "learning_rate": 8.677311866903044e-05, "loss": 2.7572935104370115, "memory(GiB)": 77.56, "step": 27655, "token_acc": 0.43846153846153846, "train_speed(iter/s)": 1.441452 }, { "epoch": 1.1850392014052527, "grad_norm": 5.465272903442383, "learning_rate": 8.676855847346559e-05, "loss": 2.8429534912109373, "memory(GiB)": 77.56, "step": 27660, "token_acc": 0.4275618374558304, "train_speed(iter/s)": 1.441426 }, { "epoch": 1.1852534167345015, "grad_norm": 5.176022529602051, "learning_rate": 8.676399761180379e-05, "loss": 2.4429162979125976, "memory(GiB)": 77.56, "step": 27665, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.441409 }, { "epoch": 1.1854676320637505, "grad_norm": 4.39204216003418, "learning_rate": 8.67594360841277e-05, "loss": 2.4336334228515626, "memory(GiB)": 77.56, "step": 27670, "token_acc": 0.4477124183006536, "train_speed(iter/s)": 1.44143 }, { "epoch": 1.1856818473929995, "grad_norm": 4.6355133056640625, "learning_rate": 8.675487389051993e-05, "loss": 2.3863924026489256, "memory(GiB)": 77.56, "step": 27675, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.441443 }, { "epoch": 1.1858960627222483, "grad_norm": 6.04000186920166, "learning_rate": 8.675031103106315e-05, "loss": 2.492868423461914, "memory(GiB)": 77.56, "step": 27680, "token_acc": 0.4403292181069959, "train_speed(iter/s)": 1.441436 }, { "epoch": 1.1861102780514974, "grad_norm": 6.974379539489746, "learning_rate": 8.674574750584002e-05, "loss": 2.434065818786621, "memory(GiB)": 77.56, "step": 27685, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.441517 }, { "epoch": 1.1863244933807464, "grad_norm": 5.399651527404785, "learning_rate": 8.674118331493317e-05, "loss": 2.4655824661254884, "memory(GiB)": 77.56, "step": 27690, "token_acc": 0.43521594684385384, "train_speed(iter/s)": 1.441553 }, { "epoch": 1.1865387087099952, "grad_norm": 4.4834442138671875, "learning_rate": 8.673661845842536e-05, "loss": 2.3906164169311523, "memory(GiB)": 77.56, "step": 27695, "token_acc": 0.517799352750809, "train_speed(iter/s)": 1.441595 }, { "epoch": 1.1867529240392443, "grad_norm": 7.482815742492676, "learning_rate": 8.673205293639921e-05, "loss": 2.4204029083251952, "memory(GiB)": 77.56, "step": 27700, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.4416 }, { "epoch": 1.1869671393684933, "grad_norm": 7.446115493774414, "learning_rate": 8.672748674893749e-05, "loss": 2.209854507446289, "memory(GiB)": 77.56, "step": 27705, "token_acc": 0.5290519877675841, "train_speed(iter/s)": 1.441614 }, { "epoch": 1.187181354697742, "grad_norm": 4.794114112854004, "learning_rate": 8.672291989612287e-05, "loss": 2.4992061614990235, "memory(GiB)": 77.56, "step": 27710, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.441661 }, { "epoch": 1.1873955700269911, "grad_norm": 10.642241477966309, "learning_rate": 8.671835237803812e-05, "loss": 2.7294851303100587, "memory(GiB)": 77.56, "step": 27715, "token_acc": 0.4280155642023346, "train_speed(iter/s)": 1.441724 }, { "epoch": 1.1876097853562402, "grad_norm": 6.134695529937744, "learning_rate": 8.671378419476596e-05, "loss": 2.6088140487670897, "memory(GiB)": 77.56, "step": 27720, "token_acc": 0.46449704142011833, "train_speed(iter/s)": 1.4417 }, { "epoch": 1.187824000685489, "grad_norm": 6.8227009773254395, "learning_rate": 8.670921534638918e-05, "loss": 2.5411964416503907, "memory(GiB)": 77.56, "step": 27725, "token_acc": 0.45569620253164556, "train_speed(iter/s)": 1.441652 }, { "epoch": 1.188038216014738, "grad_norm": 5.670330047607422, "learning_rate": 8.670464583299052e-05, "loss": 2.5149412155151367, "memory(GiB)": 77.56, "step": 27730, "token_acc": 0.4530612244897959, "train_speed(iter/s)": 1.441711 }, { "epoch": 1.188252431343987, "grad_norm": 8.834687232971191, "learning_rate": 8.670007565465277e-05, "loss": 2.5030658721923826, "memory(GiB)": 77.56, "step": 27735, "token_acc": 0.4752475247524752, "train_speed(iter/s)": 1.441726 }, { "epoch": 1.1884666466732359, "grad_norm": 12.955204963684082, "learning_rate": 8.669550481145872e-05, "loss": 2.710847282409668, "memory(GiB)": 77.56, "step": 27740, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.441765 }, { "epoch": 1.188680862002485, "grad_norm": 4.122494220733643, "learning_rate": 8.669093330349118e-05, "loss": 2.2909990310668946, "memory(GiB)": 77.56, "step": 27745, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.44183 }, { "epoch": 1.188895077331734, "grad_norm": 4.734692096710205, "learning_rate": 8.668636113083296e-05, "loss": 2.3412214279174806, "memory(GiB)": 77.56, "step": 27750, "token_acc": 0.5323741007194245, "train_speed(iter/s)": 1.441805 }, { "epoch": 1.1891092926609828, "grad_norm": 4.5979132652282715, "learning_rate": 8.66817882935669e-05, "loss": 2.637179946899414, "memory(GiB)": 77.56, "step": 27755, "token_acc": 0.49216300940438873, "train_speed(iter/s)": 1.441782 }, { "epoch": 1.1893235079902318, "grad_norm": 6.253782749176025, "learning_rate": 8.667721479177583e-05, "loss": 2.268264389038086, "memory(GiB)": 77.56, "step": 27760, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.441817 }, { "epoch": 1.1895377233194808, "grad_norm": 7.791936874389648, "learning_rate": 8.66726406255426e-05, "loss": 2.813027191162109, "memory(GiB)": 77.56, "step": 27765, "token_acc": 0.4703703703703704, "train_speed(iter/s)": 1.441829 }, { "epoch": 1.1897519386487296, "grad_norm": 6.149044990539551, "learning_rate": 8.66680657949501e-05, "loss": 2.553750419616699, "memory(GiB)": 77.56, "step": 27770, "token_acc": 0.5139318885448917, "train_speed(iter/s)": 1.441846 }, { "epoch": 1.1899661539779787, "grad_norm": 5.280455112457275, "learning_rate": 8.666349030008118e-05, "loss": 2.625950241088867, "memory(GiB)": 77.56, "step": 27775, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.441874 }, { "epoch": 1.1901803693072277, "grad_norm": 4.8144049644470215, "learning_rate": 8.665891414101874e-05, "loss": 2.4960559844970702, "memory(GiB)": 77.56, "step": 27780, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.441879 }, { "epoch": 1.1903945846364765, "grad_norm": 5.351006031036377, "learning_rate": 8.665433731784568e-05, "loss": 2.6527963638305665, "memory(GiB)": 77.56, "step": 27785, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.441898 }, { "epoch": 1.1906087999657256, "grad_norm": 5.2274580001831055, "learning_rate": 8.664975983064491e-05, "loss": 2.6767696380615233, "memory(GiB)": 77.56, "step": 27790, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.441898 }, { "epoch": 1.1908230152949746, "grad_norm": 5.817531585693359, "learning_rate": 8.664518167949937e-05, "loss": 2.6544605255126954, "memory(GiB)": 77.56, "step": 27795, "token_acc": 0.476038338658147, "train_speed(iter/s)": 1.441875 }, { "epoch": 1.1910372306242234, "grad_norm": 4.971656322479248, "learning_rate": 8.6640602864492e-05, "loss": 2.581356430053711, "memory(GiB)": 77.56, "step": 27800, "token_acc": 0.45484949832775917, "train_speed(iter/s)": 1.441852 }, { "epoch": 1.1912514459534724, "grad_norm": 5.127297401428223, "learning_rate": 8.663602338570571e-05, "loss": 2.68997802734375, "memory(GiB)": 77.56, "step": 27805, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.44186 }, { "epoch": 1.1914656612827215, "grad_norm": 4.589285373687744, "learning_rate": 8.663144324322351e-05, "loss": 2.1073837280273438, "memory(GiB)": 77.56, "step": 27810, "token_acc": 0.5543859649122806, "train_speed(iter/s)": 1.441765 }, { "epoch": 1.1916798766119703, "grad_norm": 5.023262977600098, "learning_rate": 8.662686243712834e-05, "loss": 2.5463430404663088, "memory(GiB)": 77.56, "step": 27815, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.441737 }, { "epoch": 1.1918940919412193, "grad_norm": 5.036184310913086, "learning_rate": 8.662228096750321e-05, "loss": 2.344204139709473, "memory(GiB)": 77.56, "step": 27820, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.441675 }, { "epoch": 1.1921083072704683, "grad_norm": 5.8417463302612305, "learning_rate": 8.661769883443109e-05, "loss": 2.637803077697754, "memory(GiB)": 77.56, "step": 27825, "token_acc": 0.4664179104477612, "train_speed(iter/s)": 1.44169 }, { "epoch": 1.1923225225997172, "grad_norm": 5.606911659240723, "learning_rate": 8.661311603799503e-05, "loss": 2.269527626037598, "memory(GiB)": 77.56, "step": 27830, "token_acc": 0.5203252032520326, "train_speed(iter/s)": 1.44172 }, { "epoch": 1.1925367379289662, "grad_norm": 4.498233795166016, "learning_rate": 8.660853257827801e-05, "loss": 2.674235725402832, "memory(GiB)": 77.56, "step": 27835, "token_acc": 0.4236760124610592, "train_speed(iter/s)": 1.441748 }, { "epoch": 1.1927509532582152, "grad_norm": 5.204937934875488, "learning_rate": 8.660394845536309e-05, "loss": 2.1245370864868165, "memory(GiB)": 77.56, "step": 27840, "token_acc": 0.5382059800664452, "train_speed(iter/s)": 1.441746 }, { "epoch": 1.192965168587464, "grad_norm": 4.371767520904541, "learning_rate": 8.659936366933328e-05, "loss": 2.521686553955078, "memory(GiB)": 77.56, "step": 27845, "token_acc": 0.46348314606741575, "train_speed(iter/s)": 1.441764 }, { "epoch": 1.193179383916713, "grad_norm": 4.029289245605469, "learning_rate": 8.659477822027169e-05, "loss": 2.5883317947387696, "memory(GiB)": 77.56, "step": 27850, "token_acc": 0.43465045592705165, "train_speed(iter/s)": 1.441781 }, { "epoch": 1.1933935992459621, "grad_norm": 3.722578287124634, "learning_rate": 8.659019210826136e-05, "loss": 2.4981433868408205, "memory(GiB)": 77.56, "step": 27855, "token_acc": 0.4536082474226804, "train_speed(iter/s)": 1.441761 }, { "epoch": 1.193607814575211, "grad_norm": 6.433159351348877, "learning_rate": 8.658560533338537e-05, "loss": 2.2970319747924806, "memory(GiB)": 77.56, "step": 27860, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.441797 }, { "epoch": 1.19382202990446, "grad_norm": 4.9400105476379395, "learning_rate": 8.658101789572683e-05, "loss": 2.425587463378906, "memory(GiB)": 77.56, "step": 27865, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.441751 }, { "epoch": 1.194036245233709, "grad_norm": 4.408133029937744, "learning_rate": 8.657642979536882e-05, "loss": 2.490181732177734, "memory(GiB)": 77.56, "step": 27870, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.441763 }, { "epoch": 1.1942504605629578, "grad_norm": 4.0465617179870605, "learning_rate": 8.657184103239448e-05, "loss": 2.584272575378418, "memory(GiB)": 77.56, "step": 27875, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.441789 }, { "epoch": 1.1944646758922068, "grad_norm": 5.069238185882568, "learning_rate": 8.656725160688694e-05, "loss": 2.3231069564819338, "memory(GiB)": 77.56, "step": 27880, "token_acc": 0.48616600790513836, "train_speed(iter/s)": 1.441843 }, { "epoch": 1.1946788912214559, "grad_norm": 4.324473857879639, "learning_rate": 8.656266151892932e-05, "loss": 2.605357360839844, "memory(GiB)": 77.56, "step": 27885, "token_acc": 0.424812030075188, "train_speed(iter/s)": 1.441891 }, { "epoch": 1.1948931065507047, "grad_norm": 3.8083996772766113, "learning_rate": 8.655807076860481e-05, "loss": 3.0749267578125, "memory(GiB)": 77.56, "step": 27890, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.441974 }, { "epoch": 1.1951073218799537, "grad_norm": 4.634095191955566, "learning_rate": 8.655347935599653e-05, "loss": 2.557315444946289, "memory(GiB)": 77.56, "step": 27895, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.442015 }, { "epoch": 1.1953215372092028, "grad_norm": 3.6966121196746826, "learning_rate": 8.654888728118769e-05, "loss": 2.479615592956543, "memory(GiB)": 77.56, "step": 27900, "token_acc": 0.5228070175438596, "train_speed(iter/s)": 1.442035 }, { "epoch": 1.1955357525384516, "grad_norm": 4.629363536834717, "learning_rate": 8.654429454426148e-05, "loss": 2.7010009765625, "memory(GiB)": 77.56, "step": 27905, "token_acc": 0.49049429657794674, "train_speed(iter/s)": 1.442036 }, { "epoch": 1.1957499678677006, "grad_norm": 4.705044269561768, "learning_rate": 8.653970114530107e-05, "loss": 2.6380115509033204, "memory(GiB)": 77.56, "step": 27910, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.442064 }, { "epoch": 1.1959641831969496, "grad_norm": 4.2426910400390625, "learning_rate": 8.65351070843897e-05, "loss": 2.43743953704834, "memory(GiB)": 77.56, "step": 27915, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.442043 }, { "epoch": 1.1961783985261984, "grad_norm": 4.5766215324401855, "learning_rate": 8.653051236161062e-05, "loss": 2.4382553100585938, "memory(GiB)": 77.56, "step": 27920, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.442009 }, { "epoch": 1.1963926138554475, "grad_norm": 4.8841986656188965, "learning_rate": 8.652591697704702e-05, "loss": 2.796978759765625, "memory(GiB)": 77.56, "step": 27925, "token_acc": 0.4195583596214511, "train_speed(iter/s)": 1.442018 }, { "epoch": 1.1966068291846965, "grad_norm": 5.63884973526001, "learning_rate": 8.652132093078217e-05, "loss": 2.701239585876465, "memory(GiB)": 77.56, "step": 27930, "token_acc": 0.44876325088339225, "train_speed(iter/s)": 1.442007 }, { "epoch": 1.1968210445139453, "grad_norm": 6.249762535095215, "learning_rate": 8.651672422289934e-05, "loss": 2.2940149307250977, "memory(GiB)": 77.56, "step": 27935, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.441995 }, { "epoch": 1.1970352598431944, "grad_norm": 4.73747444152832, "learning_rate": 8.65121268534818e-05, "loss": 2.690804100036621, "memory(GiB)": 77.56, "step": 27940, "token_acc": 0.4342105263157895, "train_speed(iter/s)": 1.442011 }, { "epoch": 1.1972494751724434, "grad_norm": 4.564331531524658, "learning_rate": 8.650752882261282e-05, "loss": 2.754376029968262, "memory(GiB)": 77.56, "step": 27945, "token_acc": 0.46875, "train_speed(iter/s)": 1.442009 }, { "epoch": 1.1974636905016922, "grad_norm": 5.374021053314209, "learning_rate": 8.65029301303757e-05, "loss": 2.6044281005859373, "memory(GiB)": 77.56, "step": 27950, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.441993 }, { "epoch": 1.1976779058309412, "grad_norm": 3.782750129699707, "learning_rate": 8.649833077685377e-05, "loss": 2.444538879394531, "memory(GiB)": 77.56, "step": 27955, "token_acc": 0.5476190476190477, "train_speed(iter/s)": 1.442025 }, { "epoch": 1.1978921211601903, "grad_norm": 7.202226638793945, "learning_rate": 8.649373076213035e-05, "loss": 2.836447334289551, "memory(GiB)": 77.56, "step": 27960, "token_acc": 0.4460431654676259, "train_speed(iter/s)": 1.442017 }, { "epoch": 1.198106336489439, "grad_norm": 5.353616237640381, "learning_rate": 8.648913008628874e-05, "loss": 2.3830245971679687, "memory(GiB)": 77.56, "step": 27965, "token_acc": 0.5267175572519084, "train_speed(iter/s)": 1.441956 }, { "epoch": 1.1983205518186881, "grad_norm": 4.24569034576416, "learning_rate": 8.648452874941232e-05, "loss": 2.7241838455200194, "memory(GiB)": 77.56, "step": 27970, "token_acc": 0.4756554307116105, "train_speed(iter/s)": 1.441986 }, { "epoch": 1.1985347671479372, "grad_norm": 6.994162082672119, "learning_rate": 8.647992675158443e-05, "loss": 2.473683547973633, "memory(GiB)": 77.56, "step": 27975, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.442001 }, { "epoch": 1.198748982477186, "grad_norm": 5.098709583282471, "learning_rate": 8.647532409288846e-05, "loss": 2.47924690246582, "memory(GiB)": 77.56, "step": 27980, "token_acc": 0.4890282131661442, "train_speed(iter/s)": 1.441996 }, { "epoch": 1.198963197806435, "grad_norm": 4.81426477432251, "learning_rate": 8.647072077340776e-05, "loss": 2.635319137573242, "memory(GiB)": 77.56, "step": 27985, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.442026 }, { "epoch": 1.199177413135684, "grad_norm": 4.6148881912231445, "learning_rate": 8.646611679322576e-05, "loss": 2.3667037963867186, "memory(GiB)": 77.56, "step": 27990, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.442048 }, { "epoch": 1.1993916284649329, "grad_norm": 5.3459601402282715, "learning_rate": 8.646151215242584e-05, "loss": 2.5292531967163088, "memory(GiB)": 77.56, "step": 27995, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.442065 }, { "epoch": 1.199605843794182, "grad_norm": 5.146407127380371, "learning_rate": 8.645690685109143e-05, "loss": 2.474112129211426, "memory(GiB)": 77.56, "step": 28000, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.441967 }, { "epoch": 1.199605843794182, "eval_loss": 2.4750900268554688, "eval_runtime": 13.7177, "eval_samples_per_second": 7.29, "eval_steps_per_second": 7.29, "eval_token_acc": 0.44711538461538464, "step": 28000 }, { "epoch": 1.199820059123431, "grad_norm": 5.654464244842529, "learning_rate": 8.645230088930593e-05, "loss": 2.449993133544922, "memory(GiB)": 77.56, "step": 28005, "token_acc": 0.45607808340727596, "train_speed(iter/s)": 1.440885 }, { "epoch": 1.2000342744526797, "grad_norm": 7.380916595458984, "learning_rate": 8.644769426715282e-05, "loss": 2.5308441162109374, "memory(GiB)": 77.56, "step": 28010, "token_acc": 0.4806201550387597, "train_speed(iter/s)": 1.440886 }, { "epoch": 1.2002484897819288, "grad_norm": 4.218809604644775, "learning_rate": 8.644308698471553e-05, "loss": 2.3638336181640627, "memory(GiB)": 77.56, "step": 28015, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.440915 }, { "epoch": 1.2004627051111778, "grad_norm": 4.558990955352783, "learning_rate": 8.643847904207755e-05, "loss": 2.814834213256836, "memory(GiB)": 77.56, "step": 28020, "token_acc": 0.43260188087774293, "train_speed(iter/s)": 1.441005 }, { "epoch": 1.2006769204404266, "grad_norm": 3.6906096935272217, "learning_rate": 8.643387043932232e-05, "loss": 2.5152759552001953, "memory(GiB)": 77.56, "step": 28025, "token_acc": 0.4889502762430939, "train_speed(iter/s)": 1.441012 }, { "epoch": 1.2008911357696757, "grad_norm": 5.000164031982422, "learning_rate": 8.642926117653336e-05, "loss": 2.4909133911132812, "memory(GiB)": 77.56, "step": 28030, "token_acc": 0.48627450980392156, "train_speed(iter/s)": 1.440989 }, { "epoch": 1.2011053510989247, "grad_norm": 4.8146586418151855, "learning_rate": 8.642465125379415e-05, "loss": 2.8136323928833007, "memory(GiB)": 77.56, "step": 28035, "token_acc": 0.43909348441926344, "train_speed(iter/s)": 1.441017 }, { "epoch": 1.2013195664281735, "grad_norm": 4.577399730682373, "learning_rate": 8.642004067118824e-05, "loss": 2.327980613708496, "memory(GiB)": 77.56, "step": 28040, "token_acc": 0.5198675496688742, "train_speed(iter/s)": 1.441061 }, { "epoch": 1.2015337817574225, "grad_norm": 5.458673000335693, "learning_rate": 8.64154294287991e-05, "loss": 2.6136566162109376, "memory(GiB)": 77.56, "step": 28045, "token_acc": 0.5, "train_speed(iter/s)": 1.441085 }, { "epoch": 1.2017479970866716, "grad_norm": 4.3608880043029785, "learning_rate": 8.64108175267103e-05, "loss": 2.5459054946899413, "memory(GiB)": 77.56, "step": 28050, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.441053 }, { "epoch": 1.2019622124159204, "grad_norm": 4.632081508636475, "learning_rate": 8.64062049650054e-05, "loss": 2.4002363204956056, "memory(GiB)": 77.56, "step": 28055, "token_acc": 0.5358361774744027, "train_speed(iter/s)": 1.441042 }, { "epoch": 1.2021764277451694, "grad_norm": 5.0971503257751465, "learning_rate": 8.640159174376793e-05, "loss": 2.895140838623047, "memory(GiB)": 77.56, "step": 28060, "token_acc": 0.4037735849056604, "train_speed(iter/s)": 1.44107 }, { "epoch": 1.2023906430744185, "grad_norm": 5.100149154663086, "learning_rate": 8.63969778630815e-05, "loss": 2.2651060104370115, "memory(GiB)": 77.56, "step": 28065, "token_acc": 0.4944649446494465, "train_speed(iter/s)": 1.441088 }, { "epoch": 1.2026048584036673, "grad_norm": 4.2331223487854, "learning_rate": 8.639236332302964e-05, "loss": 2.599004554748535, "memory(GiB)": 77.56, "step": 28070, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.44116 }, { "epoch": 1.2028190737329163, "grad_norm": 5.119237422943115, "learning_rate": 8.638774812369599e-05, "loss": 2.554437255859375, "memory(GiB)": 77.56, "step": 28075, "token_acc": 0.4603658536585366, "train_speed(iter/s)": 1.441205 }, { "epoch": 1.2030332890621653, "grad_norm": 4.890880584716797, "learning_rate": 8.638313226516417e-05, "loss": 2.72982120513916, "memory(GiB)": 77.56, "step": 28080, "token_acc": 0.445993031358885, "train_speed(iter/s)": 1.441175 }, { "epoch": 1.2032475043914141, "grad_norm": 6.190558433532715, "learning_rate": 8.637851574751776e-05, "loss": 2.5672435760498047, "memory(GiB)": 77.56, "step": 28085, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.441195 }, { "epoch": 1.2034617197206632, "grad_norm": 3.8964924812316895, "learning_rate": 8.637389857084041e-05, "loss": 2.7974910736083984, "memory(GiB)": 77.56, "step": 28090, "token_acc": 0.4584615384615385, "train_speed(iter/s)": 1.441154 }, { "epoch": 1.2036759350499122, "grad_norm": 4.55295991897583, "learning_rate": 8.636928073521577e-05, "loss": 2.7204612731933593, "memory(GiB)": 77.56, "step": 28095, "token_acc": 0.4403183023872679, "train_speed(iter/s)": 1.44116 }, { "epoch": 1.203890150379161, "grad_norm": 4.697570323944092, "learning_rate": 8.63646622407275e-05, "loss": 2.5776172637939454, "memory(GiB)": 77.56, "step": 28100, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.441196 }, { "epoch": 1.20410436570841, "grad_norm": 4.943467140197754, "learning_rate": 8.636004308745925e-05, "loss": 2.399165916442871, "memory(GiB)": 77.56, "step": 28105, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.441151 }, { "epoch": 1.204318581037659, "grad_norm": 5.140091419219971, "learning_rate": 8.635542327549471e-05, "loss": 2.7213165283203127, "memory(GiB)": 77.56, "step": 28110, "token_acc": 0.4329896907216495, "train_speed(iter/s)": 1.441198 }, { "epoch": 1.204532796366908, "grad_norm": 4.847456932067871, "learning_rate": 8.635080280491758e-05, "loss": 2.42418212890625, "memory(GiB)": 77.56, "step": 28115, "token_acc": 0.48059701492537316, "train_speed(iter/s)": 1.441213 }, { "epoch": 1.204747011696157, "grad_norm": 4.328200340270996, "learning_rate": 8.634618167581156e-05, "loss": 2.562395477294922, "memory(GiB)": 77.56, "step": 28120, "token_acc": 0.4824561403508772, "train_speed(iter/s)": 1.441221 }, { "epoch": 1.204961227025406, "grad_norm": 5.6406073570251465, "learning_rate": 8.634155988826035e-05, "loss": 2.6292280197143554, "memory(GiB)": 77.56, "step": 28125, "token_acc": 0.4968152866242038, "train_speed(iter/s)": 1.441184 }, { "epoch": 1.2051754423546548, "grad_norm": 5.257379055023193, "learning_rate": 8.633693744234771e-05, "loss": 2.5983179092407225, "memory(GiB)": 77.56, "step": 28130, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.441208 }, { "epoch": 1.2053896576839038, "grad_norm": 5.054792404174805, "learning_rate": 8.633231433815735e-05, "loss": 2.593419075012207, "memory(GiB)": 77.56, "step": 28135, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.441242 }, { "epoch": 1.2056038730131529, "grad_norm": 4.093530178070068, "learning_rate": 8.632769057577304e-05, "loss": 2.6712366104125977, "memory(GiB)": 77.56, "step": 28140, "token_acc": 0.4420289855072464, "train_speed(iter/s)": 1.44127 }, { "epoch": 1.2058180883424017, "grad_norm": 4.249990463256836, "learning_rate": 8.632306615527853e-05, "loss": 2.654378128051758, "memory(GiB)": 77.56, "step": 28145, "token_acc": 0.4608150470219436, "train_speed(iter/s)": 1.441277 }, { "epoch": 1.2060323036716507, "grad_norm": 4.977593898773193, "learning_rate": 8.63184410767576e-05, "loss": 2.525018501281738, "memory(GiB)": 77.56, "step": 28150, "token_acc": 0.46407185628742514, "train_speed(iter/s)": 1.441286 }, { "epoch": 1.2062465190008997, "grad_norm": 5.1998515129089355, "learning_rate": 8.631381534029404e-05, "loss": 2.335630416870117, "memory(GiB)": 77.56, "step": 28155, "token_acc": 0.5059760956175299, "train_speed(iter/s)": 1.441202 }, { "epoch": 1.2064607343301486, "grad_norm": 5.370985984802246, "learning_rate": 8.630918894597166e-05, "loss": 2.2999839782714844, "memory(GiB)": 77.56, "step": 28160, "token_acc": 0.48627450980392156, "train_speed(iter/s)": 1.441192 }, { "epoch": 1.2066749496593976, "grad_norm": 5.30683708190918, "learning_rate": 8.630456189387426e-05, "loss": 2.421610450744629, "memory(GiB)": 77.56, "step": 28165, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.441244 }, { "epoch": 1.2068891649886466, "grad_norm": 12.023375511169434, "learning_rate": 8.629993418408566e-05, "loss": 2.8368083953857424, "memory(GiB)": 77.56, "step": 28170, "token_acc": 0.4939271255060729, "train_speed(iter/s)": 1.441283 }, { "epoch": 1.2071033803178954, "grad_norm": 5.206878662109375, "learning_rate": 8.62953058166897e-05, "loss": 2.3973886489868166, "memory(GiB)": 77.56, "step": 28175, "token_acc": 0.45918367346938777, "train_speed(iter/s)": 1.44133 }, { "epoch": 1.2073175956471445, "grad_norm": 4.301476955413818, "learning_rate": 8.629067679177023e-05, "loss": 2.366256523132324, "memory(GiB)": 77.56, "step": 28180, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.441318 }, { "epoch": 1.2075318109763935, "grad_norm": 6.912600040435791, "learning_rate": 8.62860471094111e-05, "loss": 2.543890380859375, "memory(GiB)": 77.56, "step": 28185, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.441318 }, { "epoch": 1.2077460263056423, "grad_norm": 4.203740119934082, "learning_rate": 8.62814167696962e-05, "loss": 2.4901100158691407, "memory(GiB)": 77.56, "step": 28190, "token_acc": 0.4559748427672956, "train_speed(iter/s)": 1.441281 }, { "epoch": 1.2079602416348914, "grad_norm": 6.206925392150879, "learning_rate": 8.627678577270939e-05, "loss": 2.9209819793701173, "memory(GiB)": 77.56, "step": 28195, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.441294 }, { "epoch": 1.2081744569641404, "grad_norm": 4.495066165924072, "learning_rate": 8.627215411853459e-05, "loss": 2.5599943161010743, "memory(GiB)": 77.56, "step": 28200, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.441317 }, { "epoch": 1.2083886722933892, "grad_norm": 4.902633190155029, "learning_rate": 8.626752180725568e-05, "loss": 2.6431455612182617, "memory(GiB)": 77.56, "step": 28205, "token_acc": 0.4751958224543081, "train_speed(iter/s)": 1.441363 }, { "epoch": 1.2086028876226382, "grad_norm": 4.8289995193481445, "learning_rate": 8.626288883895659e-05, "loss": 2.56491756439209, "memory(GiB)": 77.56, "step": 28210, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.441318 }, { "epoch": 1.2088171029518873, "grad_norm": 4.488157749176025, "learning_rate": 8.625825521372125e-05, "loss": 2.8338550567626952, "memory(GiB)": 77.56, "step": 28215, "token_acc": 0.4451219512195122, "train_speed(iter/s)": 1.441291 }, { "epoch": 1.209031318281136, "grad_norm": 5.5533552169799805, "learning_rate": 8.62536209316336e-05, "loss": 2.3425302505493164, "memory(GiB)": 77.56, "step": 28220, "token_acc": 0.5, "train_speed(iter/s)": 1.441351 }, { "epoch": 1.2092455336103851, "grad_norm": 5.305296421051025, "learning_rate": 8.624898599277762e-05, "loss": 2.6979038238525392, "memory(GiB)": 77.56, "step": 28225, "token_acc": 0.44660194174757284, "train_speed(iter/s)": 1.441319 }, { "epoch": 1.2094597489396341, "grad_norm": 4.448836326599121, "learning_rate": 8.624435039723724e-05, "loss": 2.3078176498413088, "memory(GiB)": 77.56, "step": 28230, "token_acc": 0.5430711610486891, "train_speed(iter/s)": 1.441363 }, { "epoch": 1.209673964268883, "grad_norm": 4.726208209991455, "learning_rate": 8.623971414509644e-05, "loss": 2.6113344192504884, "memory(GiB)": 77.56, "step": 28235, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.441373 }, { "epoch": 1.209888179598132, "grad_norm": 4.682325839996338, "learning_rate": 8.623507723643924e-05, "loss": 2.412847709655762, "memory(GiB)": 77.56, "step": 28240, "token_acc": 0.4820717131474104, "train_speed(iter/s)": 1.44138 }, { "epoch": 1.210102394927381, "grad_norm": 13.473363876342773, "learning_rate": 8.623043967134963e-05, "loss": 2.4895746231079103, "memory(GiB)": 77.56, "step": 28245, "token_acc": 0.509493670886076, "train_speed(iter/s)": 1.44142 }, { "epoch": 1.2103166102566298, "grad_norm": 5.712949752807617, "learning_rate": 8.622580144991159e-05, "loss": 2.6608713150024412, "memory(GiB)": 77.56, "step": 28250, "token_acc": 0.4607142857142857, "train_speed(iter/s)": 1.441444 }, { "epoch": 1.2105308255858789, "grad_norm": 6.19139289855957, "learning_rate": 8.622116257220921e-05, "loss": 2.5623245239257812, "memory(GiB)": 77.56, "step": 28255, "token_acc": 0.4280701754385965, "train_speed(iter/s)": 1.441443 }, { "epoch": 1.210745040915128, "grad_norm": 5.0009074211120605, "learning_rate": 8.621652303832647e-05, "loss": 2.79967041015625, "memory(GiB)": 77.56, "step": 28260, "token_acc": 0.4290909090909091, "train_speed(iter/s)": 1.441461 }, { "epoch": 1.2109592562443767, "grad_norm": 5.7083563804626465, "learning_rate": 8.621188284834745e-05, "loss": 2.3252872467041015, "memory(GiB)": 77.56, "step": 28265, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 1.441466 }, { "epoch": 1.2111734715736258, "grad_norm": 4.7395920753479, "learning_rate": 8.62072420023562e-05, "loss": 2.365593910217285, "memory(GiB)": 77.56, "step": 28270, "token_acc": 0.5138339920948617, "train_speed(iter/s)": 1.441427 }, { "epoch": 1.2113876869028748, "grad_norm": 5.456732749938965, "learning_rate": 8.620260050043678e-05, "loss": 2.5022214889526366, "memory(GiB)": 77.56, "step": 28275, "token_acc": 0.47079037800687284, "train_speed(iter/s)": 1.44145 }, { "epoch": 1.2116019022321236, "grad_norm": 4.1444172859191895, "learning_rate": 8.619795834267331e-05, "loss": 2.7523942947387696, "memory(GiB)": 77.56, "step": 28280, "token_acc": 0.4425087108013937, "train_speed(iter/s)": 1.441552 }, { "epoch": 1.2118161175613726, "grad_norm": 4.467493534088135, "learning_rate": 8.619331552914987e-05, "loss": 2.402785301208496, "memory(GiB)": 77.56, "step": 28285, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.441565 }, { "epoch": 1.2120303328906217, "grad_norm": 5.909074306488037, "learning_rate": 8.618867205995056e-05, "loss": 2.17706298828125, "memory(GiB)": 77.56, "step": 28290, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.441568 }, { "epoch": 1.2122445482198707, "grad_norm": 6.347830295562744, "learning_rate": 8.618402793515949e-05, "loss": 2.550565719604492, "memory(GiB)": 77.56, "step": 28295, "token_acc": 0.47491638795986624, "train_speed(iter/s)": 1.441625 }, { "epoch": 1.2124587635491195, "grad_norm": 4.585397720336914, "learning_rate": 8.617938315486084e-05, "loss": 2.403661346435547, "memory(GiB)": 77.56, "step": 28300, "token_acc": 0.4900398406374502, "train_speed(iter/s)": 1.441584 }, { "epoch": 1.2126729788783686, "grad_norm": 5.238585472106934, "learning_rate": 8.617473771913871e-05, "loss": 2.9338611602783202, "memory(GiB)": 77.56, "step": 28305, "token_acc": 0.421259842519685, "train_speed(iter/s)": 1.44154 }, { "epoch": 1.2128871942076176, "grad_norm": 4.562910556793213, "learning_rate": 8.617009162807727e-05, "loss": 2.6408969879150392, "memory(GiB)": 77.56, "step": 28310, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.44154 }, { "epoch": 1.2131014095368664, "grad_norm": 5.460917949676514, "learning_rate": 8.61654448817607e-05, "loss": 2.3609785079956054, "memory(GiB)": 77.56, "step": 28315, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.441597 }, { "epoch": 1.2133156248661154, "grad_norm": 5.652011394500732, "learning_rate": 8.616079748027317e-05, "loss": 2.647988128662109, "memory(GiB)": 77.56, "step": 28320, "token_acc": 0.4559386973180077, "train_speed(iter/s)": 1.441528 }, { "epoch": 1.2135298401953645, "grad_norm": 7.903143882751465, "learning_rate": 8.615614942369887e-05, "loss": 2.2158252716064455, "memory(GiB)": 77.56, "step": 28325, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.441444 }, { "epoch": 1.2137440555246133, "grad_norm": 4.761894226074219, "learning_rate": 8.6151500712122e-05, "loss": 2.592799758911133, "memory(GiB)": 77.56, "step": 28330, "token_acc": 0.4778761061946903, "train_speed(iter/s)": 1.441475 }, { "epoch": 1.2139582708538623, "grad_norm": 4.6801018714904785, "learning_rate": 8.614685134562679e-05, "loss": 2.719898223876953, "memory(GiB)": 77.56, "step": 28335, "token_acc": 0.4732824427480916, "train_speed(iter/s)": 1.441496 }, { "epoch": 1.2141724861831114, "grad_norm": 4.945274353027344, "learning_rate": 8.614220132429746e-05, "loss": 2.555727958679199, "memory(GiB)": 77.56, "step": 28340, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.441495 }, { "epoch": 1.2143867015123602, "grad_norm": 5.15886116027832, "learning_rate": 8.613755064821824e-05, "loss": 2.8917753219604494, "memory(GiB)": 77.56, "step": 28345, "token_acc": 0.40671641791044777, "train_speed(iter/s)": 1.441518 }, { "epoch": 1.2146009168416092, "grad_norm": 4.952661037445068, "learning_rate": 8.613289931747339e-05, "loss": 2.2962602615356444, "memory(GiB)": 77.56, "step": 28350, "token_acc": 0.5271317829457365, "train_speed(iter/s)": 1.441507 }, { "epoch": 1.2148151321708582, "grad_norm": 4.85758113861084, "learning_rate": 8.612824733214717e-05, "loss": 2.3349756240844726, "memory(GiB)": 77.56, "step": 28355, "token_acc": 0.5, "train_speed(iter/s)": 1.441518 }, { "epoch": 1.215029347500107, "grad_norm": 5.112732887268066, "learning_rate": 8.612359469232387e-05, "loss": 2.2662685394287108, "memory(GiB)": 77.56, "step": 28360, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.441528 }, { "epoch": 1.215243562829356, "grad_norm": 4.784420967102051, "learning_rate": 8.611894139808776e-05, "loss": 2.8001258850097654, "memory(GiB)": 77.56, "step": 28365, "token_acc": 0.4318181818181818, "train_speed(iter/s)": 1.441469 }, { "epoch": 1.2154577781586051, "grad_norm": 6.794675350189209, "learning_rate": 8.611428744952315e-05, "loss": 2.2682168960571287, "memory(GiB)": 77.56, "step": 28370, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.441502 }, { "epoch": 1.215671993487854, "grad_norm": 4.324798583984375, "learning_rate": 8.610963284671433e-05, "loss": 2.441996383666992, "memory(GiB)": 77.56, "step": 28375, "token_acc": 0.4717741935483871, "train_speed(iter/s)": 1.441481 }, { "epoch": 1.215886208817103, "grad_norm": 6.754473686218262, "learning_rate": 8.610497758974566e-05, "loss": 2.261363410949707, "memory(GiB)": 77.56, "step": 28380, "token_acc": 0.4896265560165975, "train_speed(iter/s)": 1.441501 }, { "epoch": 1.216100424146352, "grad_norm": 4.434762954711914, "learning_rate": 8.610032167870144e-05, "loss": 2.4972591400146484, "memory(GiB)": 77.56, "step": 28385, "token_acc": 0.49, "train_speed(iter/s)": 1.441506 }, { "epoch": 1.2163146394756008, "grad_norm": 5.20905065536499, "learning_rate": 8.609566511366603e-05, "loss": 2.5753446578979493, "memory(GiB)": 77.56, "step": 28390, "token_acc": 0.47410358565737054, "train_speed(iter/s)": 1.44156 }, { "epoch": 1.2165288548048498, "grad_norm": 5.327287673950195, "learning_rate": 8.609100789472377e-05, "loss": 2.5779697418212892, "memory(GiB)": 77.56, "step": 28395, "token_acc": 0.49843260188087773, "train_speed(iter/s)": 1.441603 }, { "epoch": 1.2167430701340989, "grad_norm": 7.3156633377075195, "learning_rate": 8.608635002195908e-05, "loss": 2.1596099853515627, "memory(GiB)": 77.56, "step": 28400, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.441565 }, { "epoch": 1.2169572854633477, "grad_norm": 5.743879318237305, "learning_rate": 8.608169149545629e-05, "loss": 2.638324737548828, "memory(GiB)": 77.56, "step": 28405, "token_acc": 0.46875, "train_speed(iter/s)": 1.441563 }, { "epoch": 1.2171715007925967, "grad_norm": 5.754978179931641, "learning_rate": 8.60770323152998e-05, "loss": 2.3123592376708983, "memory(GiB)": 77.56, "step": 28410, "token_acc": 0.5076452599388379, "train_speed(iter/s)": 1.441579 }, { "epoch": 1.2173857161218458, "grad_norm": 5.318662643432617, "learning_rate": 8.607237248157403e-05, "loss": 2.647915840148926, "memory(GiB)": 77.56, "step": 28415, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.441529 }, { "epoch": 1.2175999314510946, "grad_norm": 6.1738505363464355, "learning_rate": 8.606771199436341e-05, "loss": 2.607415771484375, "memory(GiB)": 77.56, "step": 28420, "token_acc": 0.425531914893617, "train_speed(iter/s)": 1.441522 }, { "epoch": 1.2178141467803436, "grad_norm": 5.154613018035889, "learning_rate": 8.606305085375234e-05, "loss": 2.6470447540283204, "memory(GiB)": 77.56, "step": 28425, "token_acc": 0.43956043956043955, "train_speed(iter/s)": 1.441481 }, { "epoch": 1.2180283621095926, "grad_norm": 3.8204874992370605, "learning_rate": 8.605838905982526e-05, "loss": 2.443175506591797, "memory(GiB)": 77.56, "step": 28430, "token_acc": 0.49393939393939396, "train_speed(iter/s)": 1.441441 }, { "epoch": 1.2182425774388415, "grad_norm": 4.8001179695129395, "learning_rate": 8.605372661266667e-05, "loss": 2.530910110473633, "memory(GiB)": 77.56, "step": 28435, "token_acc": 0.46616541353383456, "train_speed(iter/s)": 1.441458 }, { "epoch": 1.2184567927680905, "grad_norm": 4.321229457855225, "learning_rate": 8.604906351236097e-05, "loss": 2.264784812927246, "memory(GiB)": 77.56, "step": 28440, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.441474 }, { "epoch": 1.2186710080973395, "grad_norm": 5.285921096801758, "learning_rate": 8.604439975899269e-05, "loss": 2.804207611083984, "memory(GiB)": 77.56, "step": 28445, "token_acc": 0.44776119402985076, "train_speed(iter/s)": 1.441469 }, { "epoch": 1.2188852234265883, "grad_norm": 4.859533309936523, "learning_rate": 8.60397353526463e-05, "loss": 2.3932064056396483, "memory(GiB)": 77.56, "step": 28450, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.441479 }, { "epoch": 1.2190994387558374, "grad_norm": 6.09795618057251, "learning_rate": 8.603507029340627e-05, "loss": 2.565334510803223, "memory(GiB)": 77.56, "step": 28455, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.441524 }, { "epoch": 1.2193136540850864, "grad_norm": 6.738732814788818, "learning_rate": 8.603040458135715e-05, "loss": 2.468765640258789, "memory(GiB)": 77.56, "step": 28460, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.441507 }, { "epoch": 1.2195278694143352, "grad_norm": 4.478949546813965, "learning_rate": 8.602573821658345e-05, "loss": 2.242085266113281, "memory(GiB)": 77.56, "step": 28465, "token_acc": 0.5144694533762058, "train_speed(iter/s)": 1.441465 }, { "epoch": 1.2197420847435843, "grad_norm": 8.1392183303833, "learning_rate": 8.602107119916971e-05, "loss": 2.906325912475586, "memory(GiB)": 77.56, "step": 28470, "token_acc": 0.4232081911262799, "train_speed(iter/s)": 1.441498 }, { "epoch": 1.2199563000728333, "grad_norm": 4.383642196655273, "learning_rate": 8.601640352920049e-05, "loss": 2.4978002548217773, "memory(GiB)": 77.56, "step": 28475, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.441542 }, { "epoch": 1.220170515402082, "grad_norm": 5.771337509155273, "learning_rate": 8.601173520676031e-05, "loss": 2.406903839111328, "memory(GiB)": 77.56, "step": 28480, "token_acc": 0.5056179775280899, "train_speed(iter/s)": 1.441578 }, { "epoch": 1.2203847307313311, "grad_norm": 5.979637145996094, "learning_rate": 8.600706623193377e-05, "loss": 2.421849822998047, "memory(GiB)": 77.56, "step": 28485, "token_acc": 0.4730290456431535, "train_speed(iter/s)": 1.441614 }, { "epoch": 1.2205989460605802, "grad_norm": 5.303152084350586, "learning_rate": 8.600239660480546e-05, "loss": 2.4490163803100584, "memory(GiB)": 77.56, "step": 28490, "token_acc": 0.4961832061068702, "train_speed(iter/s)": 1.441586 }, { "epoch": 1.220813161389829, "grad_norm": 5.009498596191406, "learning_rate": 8.599772632545995e-05, "loss": 2.367350387573242, "memory(GiB)": 77.56, "step": 28495, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 1.441588 }, { "epoch": 1.221027376719078, "grad_norm": 6.146751403808594, "learning_rate": 8.599305539398186e-05, "loss": 2.753386878967285, "memory(GiB)": 77.56, "step": 28500, "token_acc": 0.44982698961937717, "train_speed(iter/s)": 1.44162 }, { "epoch": 1.221027376719078, "eval_loss": 2.276956558227539, "eval_runtime": 14.061, "eval_samples_per_second": 7.112, "eval_steps_per_second": 7.112, "eval_token_acc": 0.45796737766624845, "step": 28500 }, { "epoch": 1.221241592048327, "grad_norm": 4.489764213562012, "learning_rate": 8.598838381045582e-05, "loss": 2.6436840057373048, "memory(GiB)": 77.56, "step": 28505, "token_acc": 0.4633920296570899, "train_speed(iter/s)": 1.440588 }, { "epoch": 1.2214558073775759, "grad_norm": 5.637967586517334, "learning_rate": 8.598371157496642e-05, "loss": 2.493491554260254, "memory(GiB)": 77.56, "step": 28510, "token_acc": 0.5039370078740157, "train_speed(iter/s)": 1.440562 }, { "epoch": 1.221670022706825, "grad_norm": 6.490506649017334, "learning_rate": 8.597903868759836e-05, "loss": 2.630386734008789, "memory(GiB)": 77.56, "step": 28515, "token_acc": 0.4697802197802198, "train_speed(iter/s)": 1.440604 }, { "epoch": 1.221884238036074, "grad_norm": 4.837250232696533, "learning_rate": 8.597436514843625e-05, "loss": 2.6151580810546875, "memory(GiB)": 77.56, "step": 28520, "token_acc": 0.4641509433962264, "train_speed(iter/s)": 1.440655 }, { "epoch": 1.2220984533653227, "grad_norm": 6.502688884735107, "learning_rate": 8.596969095756478e-05, "loss": 2.8448753356933594, "memory(GiB)": 77.56, "step": 28525, "token_acc": 0.39490445859872614, "train_speed(iter/s)": 1.440695 }, { "epoch": 1.2223126686945718, "grad_norm": 4.855820178985596, "learning_rate": 8.59650161150686e-05, "loss": 2.3215974807739257, "memory(GiB)": 77.56, "step": 28530, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.440581 }, { "epoch": 1.2225268840238208, "grad_norm": 4.5305914878845215, "learning_rate": 8.596034062103242e-05, "loss": 2.51825065612793, "memory(GiB)": 77.56, "step": 28535, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.440541 }, { "epoch": 1.2227410993530696, "grad_norm": 4.3391947746276855, "learning_rate": 8.595566447554093e-05, "loss": 2.5289453506469726, "memory(GiB)": 77.56, "step": 28540, "token_acc": 0.4801587301587302, "train_speed(iter/s)": 1.440539 }, { "epoch": 1.2229553146823187, "grad_norm": 4.805681228637695, "learning_rate": 8.595098767867886e-05, "loss": 2.638051414489746, "memory(GiB)": 77.56, "step": 28545, "token_acc": 0.44551282051282054, "train_speed(iter/s)": 1.440564 }, { "epoch": 1.2231695300115677, "grad_norm": 3.7796120643615723, "learning_rate": 8.594631023053093e-05, "loss": 2.52237491607666, "memory(GiB)": 77.56, "step": 28550, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.440557 }, { "epoch": 1.2233837453408165, "grad_norm": 3.972053050994873, "learning_rate": 8.594163213118185e-05, "loss": 2.4955272674560547, "memory(GiB)": 77.56, "step": 28555, "token_acc": 0.48466257668711654, "train_speed(iter/s)": 1.44056 }, { "epoch": 1.2235979606700655, "grad_norm": 5.100379467010498, "learning_rate": 8.593695338071639e-05, "loss": 2.626585578918457, "memory(GiB)": 77.56, "step": 28560, "token_acc": 0.4733893557422969, "train_speed(iter/s)": 1.440567 }, { "epoch": 1.2238121759993146, "grad_norm": 5.830081939697266, "learning_rate": 8.593227397921932e-05, "loss": 2.6314258575439453, "memory(GiB)": 77.56, "step": 28565, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 1.440621 }, { "epoch": 1.2240263913285634, "grad_norm": 4.231169700622559, "learning_rate": 8.59275939267754e-05, "loss": 2.4682697296142577, "memory(GiB)": 77.56, "step": 28570, "token_acc": 0.5324675324675324, "train_speed(iter/s)": 1.440554 }, { "epoch": 1.2242406066578124, "grad_norm": 4.236193656921387, "learning_rate": 8.59229132234694e-05, "loss": 2.3920560836791993, "memory(GiB)": 77.56, "step": 28575, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.440555 }, { "epoch": 1.2244548219870615, "grad_norm": 6.2157440185546875, "learning_rate": 8.591823186938614e-05, "loss": 2.2324760437011717, "memory(GiB)": 77.56, "step": 28580, "token_acc": 0.513677811550152, "train_speed(iter/s)": 1.440593 }, { "epoch": 1.2246690373163103, "grad_norm": 5.721351146697998, "learning_rate": 8.591354986461042e-05, "loss": 2.4571372985839846, "memory(GiB)": 77.56, "step": 28585, "token_acc": 0.46062992125984253, "train_speed(iter/s)": 1.440644 }, { "epoch": 1.2248832526455593, "grad_norm": 9.207639694213867, "learning_rate": 8.590886720922704e-05, "loss": 2.6439254760742186, "memory(GiB)": 77.56, "step": 28590, "token_acc": 0.40540540540540543, "train_speed(iter/s)": 1.440612 }, { "epoch": 1.2250974679748083, "grad_norm": 4.636092185974121, "learning_rate": 8.590418390332084e-05, "loss": 2.6139123916625975, "memory(GiB)": 77.56, "step": 28595, "token_acc": 0.4523076923076923, "train_speed(iter/s)": 1.440659 }, { "epoch": 1.2253116833040572, "grad_norm": 9.474297523498535, "learning_rate": 8.589949994697669e-05, "loss": 2.553696060180664, "memory(GiB)": 77.56, "step": 28600, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.440722 }, { "epoch": 1.2255258986333062, "grad_norm": 4.777644634246826, "learning_rate": 8.589481534027941e-05, "loss": 2.6511844635009765, "memory(GiB)": 77.56, "step": 28605, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.440744 }, { "epoch": 1.2257401139625552, "grad_norm": 4.623128890991211, "learning_rate": 8.589013008331388e-05, "loss": 2.417831802368164, "memory(GiB)": 77.56, "step": 28610, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.440804 }, { "epoch": 1.225954329291804, "grad_norm": 5.694374084472656, "learning_rate": 8.588544417616497e-05, "loss": 2.4633270263671876, "memory(GiB)": 77.56, "step": 28615, "token_acc": 0.4756554307116105, "train_speed(iter/s)": 1.440837 }, { "epoch": 1.226168544621053, "grad_norm": 4.325733184814453, "learning_rate": 8.588075761891758e-05, "loss": 2.691203308105469, "memory(GiB)": 77.56, "step": 28620, "token_acc": 0.4980544747081712, "train_speed(iter/s)": 1.440783 }, { "epoch": 1.226382759950302, "grad_norm": 5.457977294921875, "learning_rate": 8.587607041165662e-05, "loss": 2.687010383605957, "memory(GiB)": 77.56, "step": 28625, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.440775 }, { "epoch": 1.226596975279551, "grad_norm": 5.60301399230957, "learning_rate": 8.587138255446698e-05, "loss": 2.6495382308959963, "memory(GiB)": 77.56, "step": 28630, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.440725 }, { "epoch": 1.2268111906088, "grad_norm": 4.6819915771484375, "learning_rate": 8.586669404743359e-05, "loss": 2.4744903564453127, "memory(GiB)": 77.56, "step": 28635, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.440686 }, { "epoch": 1.227025405938049, "grad_norm": 6.039593696594238, "learning_rate": 8.586200489064138e-05, "loss": 2.622127151489258, "memory(GiB)": 77.56, "step": 28640, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.440728 }, { "epoch": 1.2272396212672978, "grad_norm": 4.093668460845947, "learning_rate": 8.585731508417533e-05, "loss": 2.571345901489258, "memory(GiB)": 77.56, "step": 28645, "token_acc": 0.4570552147239264, "train_speed(iter/s)": 1.440806 }, { "epoch": 1.2274538365965468, "grad_norm": 4.982674598693848, "learning_rate": 8.585262462812038e-05, "loss": 2.5796134948730467, "memory(GiB)": 77.56, "step": 28650, "token_acc": 0.47249190938511326, "train_speed(iter/s)": 1.440791 }, { "epoch": 1.2276680519257959, "grad_norm": 4.81125545501709, "learning_rate": 8.584793352256149e-05, "loss": 2.326946258544922, "memory(GiB)": 77.56, "step": 28655, "token_acc": 0.503968253968254, "train_speed(iter/s)": 1.440837 }, { "epoch": 1.2278822672550447, "grad_norm": 5.603147983551025, "learning_rate": 8.584324176758367e-05, "loss": 2.6811941146850584, "memory(GiB)": 77.56, "step": 28660, "token_acc": 0.43670886075949367, "train_speed(iter/s)": 1.440839 }, { "epoch": 1.2280964825842937, "grad_norm": 4.917667388916016, "learning_rate": 8.583854936327189e-05, "loss": 2.6356378555297852, "memory(GiB)": 77.56, "step": 28665, "token_acc": 0.501577287066246, "train_speed(iter/s)": 1.440775 }, { "epoch": 1.2283106979135427, "grad_norm": 7.304621696472168, "learning_rate": 8.583385630971118e-05, "loss": 2.55916748046875, "memory(GiB)": 77.56, "step": 28670, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 1.440808 }, { "epoch": 1.2285249132427916, "grad_norm": 5.387117385864258, "learning_rate": 8.582916260698655e-05, "loss": 2.304153823852539, "memory(GiB)": 77.56, "step": 28675, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.44085 }, { "epoch": 1.2287391285720406, "grad_norm": 3.856370687484741, "learning_rate": 8.582446825518302e-05, "loss": 2.2381420135498047, "memory(GiB)": 77.56, "step": 28680, "token_acc": 0.524390243902439, "train_speed(iter/s)": 1.440874 }, { "epoch": 1.2289533439012896, "grad_norm": 4.070316314697266, "learning_rate": 8.581977325438564e-05, "loss": 2.48612060546875, "memory(GiB)": 77.56, "step": 28685, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.440927 }, { "epoch": 1.2291675592305384, "grad_norm": 4.756408214569092, "learning_rate": 8.581507760467945e-05, "loss": 2.7492162704467775, "memory(GiB)": 77.56, "step": 28690, "token_acc": 0.41883116883116883, "train_speed(iter/s)": 1.440974 }, { "epoch": 1.2293817745597875, "grad_norm": 4.741931915283203, "learning_rate": 8.581038130614957e-05, "loss": 2.7649917602539062, "memory(GiB)": 77.56, "step": 28695, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.440958 }, { "epoch": 1.2295959898890365, "grad_norm": 5.995345115661621, "learning_rate": 8.580568435888102e-05, "loss": 2.6990283966064452, "memory(GiB)": 77.56, "step": 28700, "token_acc": 0.42771084337349397, "train_speed(iter/s)": 1.44087 }, { "epoch": 1.2298102052182853, "grad_norm": 6.213626384735107, "learning_rate": 8.580098676295891e-05, "loss": 2.670740509033203, "memory(GiB)": 77.56, "step": 28705, "token_acc": 0.41543026706231456, "train_speed(iter/s)": 1.440881 }, { "epoch": 1.2300244205475344, "grad_norm": 4.130982875823975, "learning_rate": 8.579628851846834e-05, "loss": 2.2273691177368162, "memory(GiB)": 77.56, "step": 28710, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.440919 }, { "epoch": 1.2302386358767834, "grad_norm": 4.355827808380127, "learning_rate": 8.579158962549443e-05, "loss": 2.4503570556640626, "memory(GiB)": 77.56, "step": 28715, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.440942 }, { "epoch": 1.2304528512060324, "grad_norm": 4.211069107055664, "learning_rate": 8.578689008412229e-05, "loss": 2.5717063903808595, "memory(GiB)": 77.56, "step": 28720, "token_acc": 0.48338368580060426, "train_speed(iter/s)": 1.440999 }, { "epoch": 1.2306670665352812, "grad_norm": 5.150546073913574, "learning_rate": 8.578218989443706e-05, "loss": 2.775617790222168, "memory(GiB)": 77.56, "step": 28725, "token_acc": 0.4375, "train_speed(iter/s)": 1.441009 }, { "epoch": 1.2308812818645303, "grad_norm": 5.9000935554504395, "learning_rate": 8.577748905652389e-05, "loss": 2.530570411682129, "memory(GiB)": 77.56, "step": 28730, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.441075 }, { "epoch": 1.2310954971937793, "grad_norm": 4.757238864898682, "learning_rate": 8.577278757046797e-05, "loss": 2.39785099029541, "memory(GiB)": 77.56, "step": 28735, "token_acc": 0.484, "train_speed(iter/s)": 1.441112 }, { "epoch": 1.2313097125230281, "grad_norm": 4.571513652801514, "learning_rate": 8.576808543635443e-05, "loss": 2.4586132049560545, "memory(GiB)": 77.56, "step": 28740, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.441096 }, { "epoch": 1.2315239278522772, "grad_norm": 4.509367942810059, "learning_rate": 8.576338265426846e-05, "loss": 2.420689010620117, "memory(GiB)": 77.56, "step": 28745, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.441122 }, { "epoch": 1.2317381431815262, "grad_norm": 4.456842422485352, "learning_rate": 8.575867922429529e-05, "loss": 2.5845626831054687, "memory(GiB)": 77.56, "step": 28750, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.441171 }, { "epoch": 1.231952358510775, "grad_norm": 4.6104559898376465, "learning_rate": 8.575397514652008e-05, "loss": 2.108664131164551, "memory(GiB)": 77.56, "step": 28755, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.441142 }, { "epoch": 1.232166573840024, "grad_norm": 7.478261470794678, "learning_rate": 8.574927042102807e-05, "loss": 2.7321355819702147, "memory(GiB)": 77.56, "step": 28760, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.441208 }, { "epoch": 1.232380789169273, "grad_norm": 7.4106526374816895, "learning_rate": 8.574456504790451e-05, "loss": 2.5289710998535155, "memory(GiB)": 77.56, "step": 28765, "token_acc": 0.45188284518828453, "train_speed(iter/s)": 1.441262 }, { "epoch": 1.2325950044985219, "grad_norm": 5.109609127044678, "learning_rate": 8.57398590272346e-05, "loss": 2.3794837951660157, "memory(GiB)": 77.56, "step": 28770, "token_acc": 0.48828125, "train_speed(iter/s)": 1.44132 }, { "epoch": 1.232809219827771, "grad_norm": 4.757969379425049, "learning_rate": 8.573515235910364e-05, "loss": 2.7019912719726564, "memory(GiB)": 77.56, "step": 28775, "token_acc": 0.4194528875379939, "train_speed(iter/s)": 1.44133 }, { "epoch": 1.23302343515702, "grad_norm": 4.6256513595581055, "learning_rate": 8.573044504359686e-05, "loss": 2.5817726135253904, "memory(GiB)": 77.56, "step": 28780, "token_acc": 0.46105919003115264, "train_speed(iter/s)": 1.441366 }, { "epoch": 1.2332376504862688, "grad_norm": 4.061574459075928, "learning_rate": 8.572573708079954e-05, "loss": 2.3653993606567383, "memory(GiB)": 77.56, "step": 28785, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.441366 }, { "epoch": 1.2334518658155178, "grad_norm": 5.002175807952881, "learning_rate": 8.5721028470797e-05, "loss": 2.7393707275390624, "memory(GiB)": 77.56, "step": 28790, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.441382 }, { "epoch": 1.2336660811447668, "grad_norm": 5.107406139373779, "learning_rate": 8.571631921367451e-05, "loss": 2.3718990325927733, "memory(GiB)": 77.56, "step": 28795, "token_acc": 0.48828125, "train_speed(iter/s)": 1.441412 }, { "epoch": 1.2338802964740156, "grad_norm": 6.161802291870117, "learning_rate": 8.571160930951738e-05, "loss": 2.694923973083496, "memory(GiB)": 77.56, "step": 28800, "token_acc": 0.42813455657492355, "train_speed(iter/s)": 1.441482 }, { "epoch": 1.2340945118032647, "grad_norm": 4.458662033081055, "learning_rate": 8.570689875841095e-05, "loss": 2.5193561553955077, "memory(GiB)": 77.56, "step": 28805, "token_acc": 0.4696969696969697, "train_speed(iter/s)": 1.441516 }, { "epoch": 1.2343087271325137, "grad_norm": 8.706774711608887, "learning_rate": 8.570218756044058e-05, "loss": 2.3342342376708984, "memory(GiB)": 77.56, "step": 28810, "token_acc": 0.5196850393700787, "train_speed(iter/s)": 1.441513 }, { "epoch": 1.2345229424617625, "grad_norm": 4.946343898773193, "learning_rate": 8.569747571569157e-05, "loss": 2.4420707702636717, "memory(GiB)": 77.56, "step": 28815, "token_acc": 0.5197368421052632, "train_speed(iter/s)": 1.441491 }, { "epoch": 1.2347371577910116, "grad_norm": 5.19757604598999, "learning_rate": 8.569276322424931e-05, "loss": 2.3729801177978516, "memory(GiB)": 77.56, "step": 28820, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.441505 }, { "epoch": 1.2349513731202606, "grad_norm": 4.7357707023620605, "learning_rate": 8.568805008619916e-05, "loss": 2.4615493774414063, "memory(GiB)": 77.56, "step": 28825, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.441519 }, { "epoch": 1.2351655884495094, "grad_norm": 6.024968147277832, "learning_rate": 8.568333630162649e-05, "loss": 2.231813430786133, "memory(GiB)": 77.56, "step": 28830, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.441516 }, { "epoch": 1.2353798037787584, "grad_norm": 4.668166160583496, "learning_rate": 8.567862187061673e-05, "loss": 2.5431495666503907, "memory(GiB)": 77.56, "step": 28835, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.441494 }, { "epoch": 1.2355940191080075, "grad_norm": 5.450082778930664, "learning_rate": 8.567390679325525e-05, "loss": 2.1610176086425783, "memory(GiB)": 77.56, "step": 28840, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.441518 }, { "epoch": 1.2358082344372563, "grad_norm": 5.276432037353516, "learning_rate": 8.56691910696275e-05, "loss": 2.484408950805664, "memory(GiB)": 77.56, "step": 28845, "token_acc": 0.5167095115681234, "train_speed(iter/s)": 1.441467 }, { "epoch": 1.2360224497665053, "grad_norm": 5.5432515144348145, "learning_rate": 8.566447469981888e-05, "loss": 2.7759578704833983, "memory(GiB)": 77.56, "step": 28850, "token_acc": 0.4174174174174174, "train_speed(iter/s)": 1.441514 }, { "epoch": 1.2362366650957544, "grad_norm": 7.759845733642578, "learning_rate": 8.565975768391484e-05, "loss": 2.3879852294921875, "memory(GiB)": 77.56, "step": 28855, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.441519 }, { "epoch": 1.2364508804250032, "grad_norm": 6.212899684906006, "learning_rate": 8.565504002200084e-05, "loss": 2.567481803894043, "memory(GiB)": 77.56, "step": 28860, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.441558 }, { "epoch": 1.2366650957542522, "grad_norm": 5.026437759399414, "learning_rate": 8.565032171416236e-05, "loss": 2.4668468475341796, "memory(GiB)": 77.56, "step": 28865, "token_acc": 0.46200607902735563, "train_speed(iter/s)": 1.441602 }, { "epoch": 1.2368793110835012, "grad_norm": 5.263489723205566, "learning_rate": 8.564560276048483e-05, "loss": 2.433339309692383, "memory(GiB)": 77.56, "step": 28870, "token_acc": 0.47278911564625853, "train_speed(iter/s)": 1.441618 }, { "epoch": 1.23709352641275, "grad_norm": 4.9902801513671875, "learning_rate": 8.56408831610538e-05, "loss": 2.2896575927734375, "memory(GiB)": 77.56, "step": 28875, "token_acc": 0.45864661654135336, "train_speed(iter/s)": 1.441594 }, { "epoch": 1.237307741741999, "grad_norm": 3.9070582389831543, "learning_rate": 8.563616291595473e-05, "loss": 2.371356964111328, "memory(GiB)": 77.56, "step": 28880, "token_acc": 0.4836795252225519, "train_speed(iter/s)": 1.441575 }, { "epoch": 1.2375219570712481, "grad_norm": 4.652554988861084, "learning_rate": 8.563144202527312e-05, "loss": 2.4942867279052736, "memory(GiB)": 77.56, "step": 28885, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.441575 }, { "epoch": 1.237736172400497, "grad_norm": 5.192071914672852, "learning_rate": 8.562672048909453e-05, "loss": 2.924012565612793, "memory(GiB)": 77.56, "step": 28890, "token_acc": 0.42507645259938837, "train_speed(iter/s)": 1.44161 }, { "epoch": 1.237950387729746, "grad_norm": 5.600824356079102, "learning_rate": 8.562199830750447e-05, "loss": 2.4559005737304687, "memory(GiB)": 77.56, "step": 28895, "token_acc": 0.4662576687116564, "train_speed(iter/s)": 1.441595 }, { "epoch": 1.238164603058995, "grad_norm": 4.9247517585754395, "learning_rate": 8.561727548058849e-05, "loss": 2.629568862915039, "memory(GiB)": 77.56, "step": 28900, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.441647 }, { "epoch": 1.2383788183882438, "grad_norm": 6.085575103759766, "learning_rate": 8.561255200843216e-05, "loss": 2.50693359375, "memory(GiB)": 77.56, "step": 28905, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.441647 }, { "epoch": 1.2385930337174929, "grad_norm": 5.520236968994141, "learning_rate": 8.560782789112105e-05, "loss": 2.460283660888672, "memory(GiB)": 77.56, "step": 28910, "token_acc": 0.46332046332046334, "train_speed(iter/s)": 1.441664 }, { "epoch": 1.2388072490467419, "grad_norm": 7.209451675415039, "learning_rate": 8.560310312874074e-05, "loss": 2.617263603210449, "memory(GiB)": 77.56, "step": 28915, "token_acc": 0.45918367346938777, "train_speed(iter/s)": 1.441653 }, { "epoch": 1.2390214643759907, "grad_norm": 4.452922344207764, "learning_rate": 8.55983777213768e-05, "loss": 2.3573495864868166, "memory(GiB)": 77.56, "step": 28920, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.44163 }, { "epoch": 1.2392356797052397, "grad_norm": 4.127779483795166, "learning_rate": 8.559365166911486e-05, "loss": 2.2596084594726564, "memory(GiB)": 77.56, "step": 28925, "token_acc": 0.4983388704318937, "train_speed(iter/s)": 1.44164 }, { "epoch": 1.2394498950344888, "grad_norm": 4.5671820640563965, "learning_rate": 8.558892497204052e-05, "loss": 2.238787078857422, "memory(GiB)": 77.56, "step": 28930, "token_acc": 0.49142857142857144, "train_speed(iter/s)": 1.441667 }, { "epoch": 1.2396641103637376, "grad_norm": 5.462334632873535, "learning_rate": 8.558419763023944e-05, "loss": 2.6892871856689453, "memory(GiB)": 77.56, "step": 28935, "token_acc": 0.43050847457627117, "train_speed(iter/s)": 1.441585 }, { "epoch": 1.2398783256929866, "grad_norm": 4.344351291656494, "learning_rate": 8.557946964379723e-05, "loss": 2.277344512939453, "memory(GiB)": 77.56, "step": 28940, "token_acc": 0.5, "train_speed(iter/s)": 1.441573 }, { "epoch": 1.2400925410222357, "grad_norm": 4.448167324066162, "learning_rate": 8.557474101279955e-05, "loss": 2.6097690582275392, "memory(GiB)": 77.56, "step": 28945, "token_acc": 0.4376899696048632, "train_speed(iter/s)": 1.441542 }, { "epoch": 1.2403067563514845, "grad_norm": 4.940723896026611, "learning_rate": 8.557001173733206e-05, "loss": 2.3075437545776367, "memory(GiB)": 77.56, "step": 28950, "token_acc": 0.5168918918918919, "train_speed(iter/s)": 1.441587 }, { "epoch": 1.2405209716807335, "grad_norm": 4.797855854034424, "learning_rate": 8.556528181748044e-05, "loss": 2.4329502105712892, "memory(GiB)": 77.56, "step": 28955, "token_acc": 0.47230320699708456, "train_speed(iter/s)": 1.44159 }, { "epoch": 1.2407351870099825, "grad_norm": 5.140381813049316, "learning_rate": 8.556055125333039e-05, "loss": 2.5371442794799806, "memory(GiB)": 77.56, "step": 28960, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.441592 }, { "epoch": 1.2409494023392313, "grad_norm": 7.099332332611084, "learning_rate": 8.555582004496758e-05, "loss": 2.264695167541504, "memory(GiB)": 77.56, "step": 28965, "token_acc": 0.4609053497942387, "train_speed(iter/s)": 1.441584 }, { "epoch": 1.2411636176684804, "grad_norm": 5.5887980461120605, "learning_rate": 8.555108819247774e-05, "loss": 2.434825897216797, "memory(GiB)": 77.56, "step": 28970, "token_acc": 0.46794871794871795, "train_speed(iter/s)": 1.441543 }, { "epoch": 1.2413778329977294, "grad_norm": 6.623556137084961, "learning_rate": 8.55463556959466e-05, "loss": 2.302535629272461, "memory(GiB)": 77.56, "step": 28975, "token_acc": 0.5, "train_speed(iter/s)": 1.441469 }, { "epoch": 1.2415920483269782, "grad_norm": 4.901788711547852, "learning_rate": 8.554162255545987e-05, "loss": 2.586678695678711, "memory(GiB)": 77.56, "step": 28980, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.441448 }, { "epoch": 1.2418062636562273, "grad_norm": 4.8584303855896, "learning_rate": 8.55368887711033e-05, "loss": 2.325054740905762, "memory(GiB)": 77.56, "step": 28985, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.441472 }, { "epoch": 1.2420204789854763, "grad_norm": 4.714451789855957, "learning_rate": 8.553215434296268e-05, "loss": 2.535818862915039, "memory(GiB)": 77.56, "step": 28990, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.441525 }, { "epoch": 1.242234694314725, "grad_norm": 8.418222427368164, "learning_rate": 8.552741927112373e-05, "loss": 2.4478073120117188, "memory(GiB)": 77.56, "step": 28995, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.441557 }, { "epoch": 1.2424489096439741, "grad_norm": 5.557422161102295, "learning_rate": 8.552268355567226e-05, "loss": 2.9656723022460936, "memory(GiB)": 77.56, "step": 29000, "token_acc": 0.4041916167664671, "train_speed(iter/s)": 1.441547 }, { "epoch": 1.2424489096439741, "eval_loss": 2.2247400283813477, "eval_runtime": 13.7495, "eval_samples_per_second": 7.273, "eval_steps_per_second": 7.273, "eval_token_acc": 0.4855072463768116, "step": 29000 }, { "epoch": 1.2426631249732232, "grad_norm": 5.335652828216553, "learning_rate": 8.551794719669405e-05, "loss": 2.480691337585449, "memory(GiB)": 77.56, "step": 29005, "token_acc": 0.48810754912099275, "train_speed(iter/s)": 1.44048 }, { "epoch": 1.242877340302472, "grad_norm": 6.0281267166137695, "learning_rate": 8.551321019427491e-05, "loss": 2.332222747802734, "memory(GiB)": 77.56, "step": 29010, "token_acc": 0.5261194029850746, "train_speed(iter/s)": 1.440465 }, { "epoch": 1.243091555631721, "grad_norm": 4.924759864807129, "learning_rate": 8.550847254850065e-05, "loss": 2.4824304580688477, "memory(GiB)": 77.56, "step": 29015, "token_acc": 0.4774774774774775, "train_speed(iter/s)": 1.440537 }, { "epoch": 1.24330577096097, "grad_norm": 4.872472763061523, "learning_rate": 8.550373425945711e-05, "loss": 2.276318359375, "memory(GiB)": 77.56, "step": 29020, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.440624 }, { "epoch": 1.2435199862902189, "grad_norm": 5.124520778656006, "learning_rate": 8.549899532723011e-05, "loss": 2.8170703887939452, "memory(GiB)": 77.56, "step": 29025, "token_acc": 0.4139072847682119, "train_speed(iter/s)": 1.440666 }, { "epoch": 1.243734201619468, "grad_norm": 4.6610002517700195, "learning_rate": 8.549425575190551e-05, "loss": 2.350818634033203, "memory(GiB)": 77.56, "step": 29030, "token_acc": 0.501628664495114, "train_speed(iter/s)": 1.44067 }, { "epoch": 1.243948416948717, "grad_norm": 5.190739154815674, "learning_rate": 8.548951553356917e-05, "loss": 2.732608413696289, "memory(GiB)": 77.56, "step": 29035, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.440715 }, { "epoch": 1.2441626322779658, "grad_norm": 4.973451137542725, "learning_rate": 8.548477467230697e-05, "loss": 2.4846229553222656, "memory(GiB)": 77.56, "step": 29040, "token_acc": 0.47244094488188976, "train_speed(iter/s)": 1.440732 }, { "epoch": 1.2443768476072148, "grad_norm": 5.122106552124023, "learning_rate": 8.548003316820478e-05, "loss": 2.3691707611083985, "memory(GiB)": 77.56, "step": 29045, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.440761 }, { "epoch": 1.2445910629364638, "grad_norm": 3.669872999191284, "learning_rate": 8.547529102134852e-05, "loss": 2.5579437255859374, "memory(GiB)": 77.56, "step": 29050, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.440766 }, { "epoch": 1.2448052782657126, "grad_norm": 5.317296981811523, "learning_rate": 8.547054823182408e-05, "loss": 2.6559097290039064, "memory(GiB)": 77.56, "step": 29055, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.440708 }, { "epoch": 1.2450194935949617, "grad_norm": 4.863055229187012, "learning_rate": 8.546580479971737e-05, "loss": 2.473911666870117, "memory(GiB)": 77.56, "step": 29060, "token_acc": 0.47, "train_speed(iter/s)": 1.440689 }, { "epoch": 1.2452337089242107, "grad_norm": 4.805677890777588, "learning_rate": 8.546106072511435e-05, "loss": 2.504608154296875, "memory(GiB)": 77.56, "step": 29065, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.440693 }, { "epoch": 1.2454479242534595, "grad_norm": 4.455987453460693, "learning_rate": 8.545631600810094e-05, "loss": 2.338959503173828, "memory(GiB)": 77.56, "step": 29070, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.440667 }, { "epoch": 1.2456621395827085, "grad_norm": 4.937370777130127, "learning_rate": 8.545157064876311e-05, "loss": 2.693893814086914, "memory(GiB)": 77.56, "step": 29075, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.440647 }, { "epoch": 1.2458763549119576, "grad_norm": 4.100717544555664, "learning_rate": 8.544682464718684e-05, "loss": 2.204322624206543, "memory(GiB)": 77.56, "step": 29080, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.440619 }, { "epoch": 1.2460905702412064, "grad_norm": 5.114598751068115, "learning_rate": 8.544207800345808e-05, "loss": 2.542263221740723, "memory(GiB)": 77.56, "step": 29085, "token_acc": 0.45625, "train_speed(iter/s)": 1.440653 }, { "epoch": 1.2463047855704554, "grad_norm": 5.612907409667969, "learning_rate": 8.54373307176628e-05, "loss": 2.358996772766113, "memory(GiB)": 77.56, "step": 29090, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.440663 }, { "epoch": 1.2465190008997045, "grad_norm": 4.823172092437744, "learning_rate": 8.543258278988706e-05, "loss": 2.5500766754150392, "memory(GiB)": 77.56, "step": 29095, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.44062 }, { "epoch": 1.2467332162289533, "grad_norm": 5.943562030792236, "learning_rate": 8.542783422021684e-05, "loss": 2.4312461853027343, "memory(GiB)": 77.56, "step": 29100, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.44066 }, { "epoch": 1.2469474315582023, "grad_norm": 6.189527988433838, "learning_rate": 8.542308500873817e-05, "loss": 3.0052860260009764, "memory(GiB)": 77.56, "step": 29105, "token_acc": 0.41472868217054265, "train_speed(iter/s)": 1.440697 }, { "epoch": 1.2471616468874513, "grad_norm": 4.743948936462402, "learning_rate": 8.541833515553707e-05, "loss": 2.497550201416016, "memory(GiB)": 77.56, "step": 29110, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.44067 }, { "epoch": 1.2473758622167002, "grad_norm": 6.074448585510254, "learning_rate": 8.541358466069962e-05, "loss": 2.5804285049438476, "memory(GiB)": 77.56, "step": 29115, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440696 }, { "epoch": 1.2475900775459492, "grad_norm": 5.792123317718506, "learning_rate": 8.540883352431186e-05, "loss": 2.6468175888061523, "memory(GiB)": 77.56, "step": 29120, "token_acc": 0.43636363636363634, "train_speed(iter/s)": 1.440708 }, { "epoch": 1.2478042928751982, "grad_norm": 5.768387317657471, "learning_rate": 8.540408174645986e-05, "loss": 2.383892059326172, "memory(GiB)": 77.56, "step": 29125, "token_acc": 0.515625, "train_speed(iter/s)": 1.440733 }, { "epoch": 1.248018508204447, "grad_norm": 4.049323558807373, "learning_rate": 8.539932932722971e-05, "loss": 2.5525436401367188, "memory(GiB)": 77.56, "step": 29130, "token_acc": 0.4576923076923077, "train_speed(iter/s)": 1.440699 }, { "epoch": 1.248232723533696, "grad_norm": 4.429920673370361, "learning_rate": 8.539457626670752e-05, "loss": 2.5024272918701174, "memory(GiB)": 77.56, "step": 29135, "token_acc": 0.5034965034965035, "train_speed(iter/s)": 1.440677 }, { "epoch": 1.248446938862945, "grad_norm": 4.3035101890563965, "learning_rate": 8.538982256497937e-05, "loss": 2.5994089126586912, "memory(GiB)": 77.56, "step": 29140, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440682 }, { "epoch": 1.248661154192194, "grad_norm": 5.251198768615723, "learning_rate": 8.538506822213136e-05, "loss": 2.7291500091552736, "memory(GiB)": 77.56, "step": 29145, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.440688 }, { "epoch": 1.248875369521443, "grad_norm": 4.390237808227539, "learning_rate": 8.538031323824967e-05, "loss": 2.503163719177246, "memory(GiB)": 77.56, "step": 29150, "token_acc": 0.46062992125984253, "train_speed(iter/s)": 1.440744 }, { "epoch": 1.249089584850692, "grad_norm": 5.376598834991455, "learning_rate": 8.537555761342039e-05, "loss": 2.403537368774414, "memory(GiB)": 77.56, "step": 29155, "token_acc": 0.49794238683127573, "train_speed(iter/s)": 1.440717 }, { "epoch": 1.2493038001799408, "grad_norm": 5.448025226593018, "learning_rate": 8.537080134772973e-05, "loss": 2.554519462585449, "memory(GiB)": 77.56, "step": 29160, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 1.440726 }, { "epoch": 1.2495180155091898, "grad_norm": 5.245378494262695, "learning_rate": 8.536604444126382e-05, "loss": 2.830800247192383, "memory(GiB)": 77.56, "step": 29165, "token_acc": 0.4597014925373134, "train_speed(iter/s)": 1.440802 }, { "epoch": 1.2497322308384389, "grad_norm": 5.468711853027344, "learning_rate": 8.536128689410882e-05, "loss": 2.648890495300293, "memory(GiB)": 77.56, "step": 29170, "token_acc": 0.4701492537313433, "train_speed(iter/s)": 1.440833 }, { "epoch": 1.2499464461676877, "grad_norm": 32.12142562866211, "learning_rate": 8.535652870635094e-05, "loss": 2.5189035415649412, "memory(GiB)": 77.56, "step": 29175, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.440788 }, { "epoch": 1.2501606614969367, "grad_norm": 6.855515956878662, "learning_rate": 8.535176987807639e-05, "loss": 2.553010368347168, "memory(GiB)": 77.56, "step": 29180, "token_acc": 0.5110294117647058, "train_speed(iter/s)": 1.440789 }, { "epoch": 1.2503748768261858, "grad_norm": 3.473304271697998, "learning_rate": 8.534701040937136e-05, "loss": 2.285348320007324, "memory(GiB)": 77.56, "step": 29185, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.440789 }, { "epoch": 1.2505890921554346, "grad_norm": 5.68897819519043, "learning_rate": 8.534225030032208e-05, "loss": 2.392068862915039, "memory(GiB)": 77.56, "step": 29190, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.440777 }, { "epoch": 1.2508033074846836, "grad_norm": 7.997988224029541, "learning_rate": 8.533748955101477e-05, "loss": 2.3846378326416016, "memory(GiB)": 77.56, "step": 29195, "token_acc": 0.48034934497816595, "train_speed(iter/s)": 1.440811 }, { "epoch": 1.2510175228139326, "grad_norm": 4.5932841300964355, "learning_rate": 8.533272816153571e-05, "loss": 2.4555004119873045, "memory(GiB)": 77.56, "step": 29200, "token_acc": 0.4696969696969697, "train_speed(iter/s)": 1.44087 }, { "epoch": 1.2512317381431814, "grad_norm": 5.046009540557861, "learning_rate": 8.532796613197112e-05, "loss": 2.460248565673828, "memory(GiB)": 77.56, "step": 29205, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.440881 }, { "epoch": 1.2514459534724305, "grad_norm": 4.072773456573486, "learning_rate": 8.532320346240728e-05, "loss": 2.6071218490600585, "memory(GiB)": 77.56, "step": 29210, "token_acc": 0.4228571428571429, "train_speed(iter/s)": 1.440893 }, { "epoch": 1.2516601688016795, "grad_norm": 3.9104371070861816, "learning_rate": 8.531844015293047e-05, "loss": 2.4960987091064455, "memory(GiB)": 77.56, "step": 29215, "token_acc": 0.45980707395498394, "train_speed(iter/s)": 1.440927 }, { "epoch": 1.2518743841309283, "grad_norm": 5.324708938598633, "learning_rate": 8.531367620362699e-05, "loss": 2.3529176712036133, "memory(GiB)": 77.56, "step": 29220, "token_acc": 0.5258964143426295, "train_speed(iter/s)": 1.440929 }, { "epoch": 1.2520885994601774, "grad_norm": 5.784875869750977, "learning_rate": 8.530891161458314e-05, "loss": 2.5704120635986327, "memory(GiB)": 77.56, "step": 29225, "token_acc": 0.4708029197080292, "train_speed(iter/s)": 1.440974 }, { "epoch": 1.2523028147894264, "grad_norm": 4.044406890869141, "learning_rate": 8.530414638588525e-05, "loss": 2.2505264282226562, "memory(GiB)": 77.56, "step": 29230, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.441012 }, { "epoch": 1.2525170301186752, "grad_norm": 6.830445289611816, "learning_rate": 8.529938051761961e-05, "loss": 2.8425745010375976, "memory(GiB)": 77.56, "step": 29235, "token_acc": 0.4155844155844156, "train_speed(iter/s)": 1.441034 }, { "epoch": 1.2527312454479242, "grad_norm": 5.800511360168457, "learning_rate": 8.529461400987258e-05, "loss": 2.721932601928711, "memory(GiB)": 77.56, "step": 29240, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.44108 }, { "epoch": 1.2529454607771733, "grad_norm": 6.483523845672607, "learning_rate": 8.528984686273051e-05, "loss": 2.186646270751953, "memory(GiB)": 77.56, "step": 29245, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.441125 }, { "epoch": 1.253159676106422, "grad_norm": 5.5024094581604, "learning_rate": 8.528507907627977e-05, "loss": 2.644719123840332, "memory(GiB)": 77.56, "step": 29250, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 1.441102 }, { "epoch": 1.2533738914356711, "grad_norm": 4.80596923828125, "learning_rate": 8.528031065060669e-05, "loss": 2.0365554809570314, "memory(GiB)": 77.56, "step": 29255, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.441125 }, { "epoch": 1.2535881067649202, "grad_norm": 4.627771377563477, "learning_rate": 8.527554158579772e-05, "loss": 2.210123825073242, "memory(GiB)": 77.56, "step": 29260, "token_acc": 0.5145228215767634, "train_speed(iter/s)": 1.441147 }, { "epoch": 1.253802322094169, "grad_norm": 5.042235851287842, "learning_rate": 8.527077188193921e-05, "loss": 2.6837717056274415, "memory(GiB)": 77.56, "step": 29265, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.441218 }, { "epoch": 1.254016537423418, "grad_norm": 6.353658199310303, "learning_rate": 8.52660015391176e-05, "loss": 2.2099325180053713, "memory(GiB)": 77.56, "step": 29270, "token_acc": 0.516260162601626, "train_speed(iter/s)": 1.441262 }, { "epoch": 1.254230752752667, "grad_norm": 5.837703227996826, "learning_rate": 8.526123055741926e-05, "loss": 2.3983953475952147, "memory(GiB)": 77.56, "step": 29275, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.44126 }, { "epoch": 1.2544449680819159, "grad_norm": 4.5204315185546875, "learning_rate": 8.525645893693067e-05, "loss": 2.6883926391601562, "memory(GiB)": 77.56, "step": 29280, "token_acc": 0.4725609756097561, "train_speed(iter/s)": 1.441311 }, { "epoch": 1.254659183411165, "grad_norm": 5.766561508178711, "learning_rate": 8.525168667773824e-05, "loss": 2.4652259826660154, "memory(GiB)": 77.56, "step": 29285, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.441352 }, { "epoch": 1.254873398740414, "grad_norm": 5.017345428466797, "learning_rate": 8.524691377992844e-05, "loss": 2.6708446502685548, "memory(GiB)": 77.56, "step": 29290, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.441361 }, { "epoch": 1.2550876140696627, "grad_norm": 7.347275733947754, "learning_rate": 8.524214024358775e-05, "loss": 2.7507781982421875, "memory(GiB)": 77.56, "step": 29295, "token_acc": 0.5, "train_speed(iter/s)": 1.441395 }, { "epoch": 1.2553018293989118, "grad_norm": 4.259577751159668, "learning_rate": 8.523736606880261e-05, "loss": 2.4214689254760744, "memory(GiB)": 77.56, "step": 29300, "token_acc": 0.5410447761194029, "train_speed(iter/s)": 1.441437 }, { "epoch": 1.2555160447281608, "grad_norm": 4.1819610595703125, "learning_rate": 8.523259125565954e-05, "loss": 2.726064682006836, "memory(GiB)": 77.56, "step": 29305, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.441436 }, { "epoch": 1.2557302600574096, "grad_norm": 5.189210891723633, "learning_rate": 8.522781580424502e-05, "loss": 2.473117637634277, "memory(GiB)": 77.56, "step": 29310, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 1.441456 }, { "epoch": 1.2559444753866587, "grad_norm": 5.402461528778076, "learning_rate": 8.522303971464557e-05, "loss": 2.6044374465942384, "memory(GiB)": 77.56, "step": 29315, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.441478 }, { "epoch": 1.2561586907159077, "grad_norm": 5.355550289154053, "learning_rate": 8.521826298694773e-05, "loss": 2.649668884277344, "memory(GiB)": 77.56, "step": 29320, "token_acc": 0.4459016393442623, "train_speed(iter/s)": 1.441474 }, { "epoch": 1.2563729060451565, "grad_norm": 5.541404724121094, "learning_rate": 8.5213485621238e-05, "loss": 2.808111000061035, "memory(GiB)": 77.56, "step": 29325, "token_acc": 0.4798206278026906, "train_speed(iter/s)": 1.441497 }, { "epoch": 1.2565871213744055, "grad_norm": 6.287618637084961, "learning_rate": 8.520870761760296e-05, "loss": 2.5784027099609377, "memory(GiB)": 77.56, "step": 29330, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.4415 }, { "epoch": 1.2568013367036546, "grad_norm": 4.258848667144775, "learning_rate": 8.520392897612913e-05, "loss": 2.4917228698730467, "memory(GiB)": 77.56, "step": 29335, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.441495 }, { "epoch": 1.2570155520329034, "grad_norm": 3.6803226470947266, "learning_rate": 8.519914969690313e-05, "loss": 2.457961082458496, "memory(GiB)": 77.56, "step": 29340, "token_acc": 0.5, "train_speed(iter/s)": 1.441541 }, { "epoch": 1.2572297673621524, "grad_norm": 6.132785320281982, "learning_rate": 8.51943697800115e-05, "loss": 2.4769752502441404, "memory(GiB)": 77.56, "step": 29345, "token_acc": 0.4708333333333333, "train_speed(iter/s)": 1.441452 }, { "epoch": 1.2574439826914015, "grad_norm": 4.694347381591797, "learning_rate": 8.518958922554085e-05, "loss": 2.442934036254883, "memory(GiB)": 77.56, "step": 29350, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 1.441451 }, { "epoch": 1.2576581980206503, "grad_norm": 5.689286708831787, "learning_rate": 8.518480803357778e-05, "loss": 2.617264175415039, "memory(GiB)": 77.56, "step": 29355, "token_acc": 0.5119453924914675, "train_speed(iter/s)": 1.441407 }, { "epoch": 1.2578724133498993, "grad_norm": 4.608502388000488, "learning_rate": 8.51800262042089e-05, "loss": 2.3587196350097654, "memory(GiB)": 77.56, "step": 29360, "token_acc": 0.47674418604651164, "train_speed(iter/s)": 1.441402 }, { "epoch": 1.2580866286791483, "grad_norm": 6.740025997161865, "learning_rate": 8.517524373752083e-05, "loss": 2.3277217864990236, "memory(GiB)": 77.56, "step": 29365, "token_acc": 0.45867768595041325, "train_speed(iter/s)": 1.44141 }, { "epoch": 1.2583008440083971, "grad_norm": 4.192903518676758, "learning_rate": 8.517046063360026e-05, "loss": 2.3574275970458984, "memory(GiB)": 77.56, "step": 29370, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.441426 }, { "epoch": 1.2585150593376462, "grad_norm": 5.203413009643555, "learning_rate": 8.516567689253378e-05, "loss": 2.327602767944336, "memory(GiB)": 77.56, "step": 29375, "token_acc": 0.528957528957529, "train_speed(iter/s)": 1.441448 }, { "epoch": 1.2587292746668952, "grad_norm": 5.421355724334717, "learning_rate": 8.516089251440809e-05, "loss": 2.5142133712768553, "memory(GiB)": 77.56, "step": 29380, "token_acc": 0.465625, "train_speed(iter/s)": 1.44142 }, { "epoch": 1.258943489996144, "grad_norm": 4.307124137878418, "learning_rate": 8.515610749930982e-05, "loss": 2.445865821838379, "memory(GiB)": 77.56, "step": 29385, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.44146 }, { "epoch": 1.259157705325393, "grad_norm": 3.540876626968384, "learning_rate": 8.51513218473257e-05, "loss": 2.5506105422973633, "memory(GiB)": 77.56, "step": 29390, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.441413 }, { "epoch": 1.259371920654642, "grad_norm": 4.286125659942627, "learning_rate": 8.514653555854242e-05, "loss": 2.609775161743164, "memory(GiB)": 77.56, "step": 29395, "token_acc": 0.4674922600619195, "train_speed(iter/s)": 1.441396 }, { "epoch": 1.259586135983891, "grad_norm": 5.016783237457275, "learning_rate": 8.514174863304667e-05, "loss": 2.781946563720703, "memory(GiB)": 77.56, "step": 29400, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.441388 }, { "epoch": 1.25980035131314, "grad_norm": 3.8409063816070557, "learning_rate": 8.513696107092517e-05, "loss": 2.5462841033935546, "memory(GiB)": 77.56, "step": 29405, "token_acc": 0.4720496894409938, "train_speed(iter/s)": 1.441419 }, { "epoch": 1.260014566642389, "grad_norm": 5.7038421630859375, "learning_rate": 8.513217287226466e-05, "loss": 2.49387149810791, "memory(GiB)": 77.56, "step": 29410, "token_acc": 0.5036231884057971, "train_speed(iter/s)": 1.441448 }, { "epoch": 1.2602287819716378, "grad_norm": 4.566816329956055, "learning_rate": 8.51273840371519e-05, "loss": 2.2985652923583983, "memory(GiB)": 77.56, "step": 29415, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.441456 }, { "epoch": 1.2604429973008868, "grad_norm": 4.351600646972656, "learning_rate": 8.512259456567362e-05, "loss": 2.575051116943359, "memory(GiB)": 77.56, "step": 29420, "token_acc": 0.5, "train_speed(iter/s)": 1.441473 }, { "epoch": 1.2606572126301359, "grad_norm": 6.6223978996276855, "learning_rate": 8.511780445791659e-05, "loss": 2.7139156341552733, "memory(GiB)": 77.56, "step": 29425, "token_acc": 0.43214285714285716, "train_speed(iter/s)": 1.441527 }, { "epoch": 1.2608714279593847, "grad_norm": 7.783810138702393, "learning_rate": 8.511301371396758e-05, "loss": 2.307796859741211, "memory(GiB)": 77.56, "step": 29430, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 1.441568 }, { "epoch": 1.2610856432886337, "grad_norm": 4.968101501464844, "learning_rate": 8.510822233391338e-05, "loss": 2.2231395721435545, "memory(GiB)": 77.56, "step": 29435, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 1.441603 }, { "epoch": 1.2612998586178827, "grad_norm": 5.41532564163208, "learning_rate": 8.510343031784082e-05, "loss": 2.586380195617676, "memory(GiB)": 77.56, "step": 29440, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.441635 }, { "epoch": 1.2615140739471316, "grad_norm": 4.374895095825195, "learning_rate": 8.509863766583668e-05, "loss": 2.7054927825927733, "memory(GiB)": 77.56, "step": 29445, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.441667 }, { "epoch": 1.2617282892763806, "grad_norm": 4.795865535736084, "learning_rate": 8.50938443779878e-05, "loss": 2.680415153503418, "memory(GiB)": 77.56, "step": 29450, "token_acc": 0.43171806167400884, "train_speed(iter/s)": 1.441669 }, { "epoch": 1.2619425046056296, "grad_norm": 4.2429962158203125, "learning_rate": 8.5089050454381e-05, "loss": 2.6380331039428713, "memory(GiB)": 77.56, "step": 29455, "token_acc": 0.46107784431137727, "train_speed(iter/s)": 1.441721 }, { "epoch": 1.2621567199348784, "grad_norm": 4.403748512268066, "learning_rate": 8.508425589510314e-05, "loss": 2.669622039794922, "memory(GiB)": 77.56, "step": 29460, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.441747 }, { "epoch": 1.2623709352641275, "grad_norm": 4.5235443115234375, "learning_rate": 8.507946070024108e-05, "loss": 2.8870777130126952, "memory(GiB)": 77.56, "step": 29465, "token_acc": 0.39655172413793105, "train_speed(iter/s)": 1.441781 }, { "epoch": 1.2625851505933765, "grad_norm": 5.992175579071045, "learning_rate": 8.507466486988168e-05, "loss": 2.6534969329833986, "memory(GiB)": 77.56, "step": 29470, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.441802 }, { "epoch": 1.2627993659226253, "grad_norm": 5.435794353485107, "learning_rate": 8.50698684041118e-05, "loss": 2.5746421813964844, "memory(GiB)": 77.56, "step": 29475, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.441806 }, { "epoch": 1.2630135812518744, "grad_norm": 4.327826023101807, "learning_rate": 8.506507130301837e-05, "loss": 2.605919075012207, "memory(GiB)": 77.56, "step": 29480, "token_acc": 0.475, "train_speed(iter/s)": 1.441764 }, { "epoch": 1.2632277965811234, "grad_norm": 4.611506938934326, "learning_rate": 8.50602735666883e-05, "loss": 2.632436752319336, "memory(GiB)": 77.56, "step": 29485, "token_acc": 0.4251497005988024, "train_speed(iter/s)": 1.44178 }, { "epoch": 1.2634420119103722, "grad_norm": 4.321711540222168, "learning_rate": 8.505547519520845e-05, "loss": 2.5482723236083986, "memory(GiB)": 77.56, "step": 29490, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.44182 }, { "epoch": 1.2636562272396212, "grad_norm": 4.042707920074463, "learning_rate": 8.50506761886658e-05, "loss": 2.3914003372192383, "memory(GiB)": 77.56, "step": 29495, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.441796 }, { "epoch": 1.2638704425688703, "grad_norm": 4.777688503265381, "learning_rate": 8.504587654714727e-05, "loss": 2.768147277832031, "memory(GiB)": 77.56, "step": 29500, "token_acc": 0.4296028880866426, "train_speed(iter/s)": 1.441835 }, { "epoch": 1.2638704425688703, "eval_loss": 2.2530200481414795, "eval_runtime": 14.0803, "eval_samples_per_second": 7.102, "eval_steps_per_second": 7.102, "eval_token_acc": 0.5006765899864682, "step": 29500 }, { "epoch": 1.264084657898119, "grad_norm": 5.182931423187256, "learning_rate": 8.50410762707398e-05, "loss": 2.4805917739868164, "memory(GiB)": 77.56, "step": 29505, "token_acc": 0.4851024208566108, "train_speed(iter/s)": 1.440771 }, { "epoch": 1.2642988732273681, "grad_norm": 18.61556053161621, "learning_rate": 8.503627535953039e-05, "loss": 2.5214603424072264, "memory(GiB)": 77.56, "step": 29510, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.440787 }, { "epoch": 1.2645130885566171, "grad_norm": 5.857314586639404, "learning_rate": 8.503147381360595e-05, "loss": 2.8611879348754883, "memory(GiB)": 77.56, "step": 29515, "token_acc": 0.42244224422442245, "train_speed(iter/s)": 1.440788 }, { "epoch": 1.264727303885866, "grad_norm": 4.183280944824219, "learning_rate": 8.502667163305353e-05, "loss": 2.587234306335449, "memory(GiB)": 77.56, "step": 29520, "token_acc": 0.41317365269461076, "train_speed(iter/s)": 1.440824 }, { "epoch": 1.264941519215115, "grad_norm": 4.637208938598633, "learning_rate": 8.502186881796008e-05, "loss": 2.1354629516601564, "memory(GiB)": 77.56, "step": 29525, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.440803 }, { "epoch": 1.265155734544364, "grad_norm": 3.8511526584625244, "learning_rate": 8.501706536841263e-05, "loss": 2.7837804794311523, "memory(GiB)": 77.56, "step": 29530, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.440849 }, { "epoch": 1.2653699498736128, "grad_norm": 4.983039855957031, "learning_rate": 8.501226128449818e-05, "loss": 2.6234146118164063, "memory(GiB)": 77.56, "step": 29535, "token_acc": 0.4757834757834758, "train_speed(iter/s)": 1.440798 }, { "epoch": 1.2655841652028619, "grad_norm": 5.084944248199463, "learning_rate": 8.500745656630379e-05, "loss": 2.774045944213867, "memory(GiB)": 77.56, "step": 29540, "token_acc": 0.43252595155709345, "train_speed(iter/s)": 1.440785 }, { "epoch": 1.265798380532111, "grad_norm": 3.617826461791992, "learning_rate": 8.500265121391649e-05, "loss": 2.4186511993408204, "memory(GiB)": 77.56, "step": 29545, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.440786 }, { "epoch": 1.2660125958613597, "grad_norm": 4.013150215148926, "learning_rate": 8.499784522742329e-05, "loss": 2.6996299743652346, "memory(GiB)": 77.56, "step": 29550, "token_acc": 0.46273291925465837, "train_speed(iter/s)": 1.440808 }, { "epoch": 1.2662268111906088, "grad_norm": 5.148481369018555, "learning_rate": 8.499303860691132e-05, "loss": 2.5028945922851564, "memory(GiB)": 77.56, "step": 29555, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.440844 }, { "epoch": 1.2664410265198578, "grad_norm": 4.05296516418457, "learning_rate": 8.498823135246763e-05, "loss": 2.5566755294799806, "memory(GiB)": 77.56, "step": 29560, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.440865 }, { "epoch": 1.2666552418491066, "grad_norm": 5.525784015655518, "learning_rate": 8.49834234641793e-05, "loss": 2.2289154052734377, "memory(GiB)": 77.56, "step": 29565, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.440906 }, { "epoch": 1.2668694571783556, "grad_norm": 4.3178887367248535, "learning_rate": 8.497861494213344e-05, "loss": 2.4543731689453123, "memory(GiB)": 77.56, "step": 29570, "token_acc": 0.5318352059925093, "train_speed(iter/s)": 1.440951 }, { "epoch": 1.2670836725076047, "grad_norm": 5.241393089294434, "learning_rate": 8.497380578641717e-05, "loss": 2.5240528106689455, "memory(GiB)": 77.56, "step": 29575, "token_acc": 0.49044585987261147, "train_speed(iter/s)": 1.440992 }, { "epoch": 1.2672978878368535, "grad_norm": 10.083868026733398, "learning_rate": 8.496899599711759e-05, "loss": 2.7484014511108397, "memory(GiB)": 77.56, "step": 29580, "token_acc": 0.45768025078369906, "train_speed(iter/s)": 1.440965 }, { "epoch": 1.2675121031661025, "grad_norm": 7.475610256195068, "learning_rate": 8.496418557432183e-05, "loss": 2.37349796295166, "memory(GiB)": 77.56, "step": 29585, "token_acc": 0.5, "train_speed(iter/s)": 1.440982 }, { "epoch": 1.2677263184953516, "grad_norm": 5.21260404586792, "learning_rate": 8.495937451811706e-05, "loss": 3.0564125061035154, "memory(GiB)": 77.56, "step": 29590, "token_acc": 0.40625, "train_speed(iter/s)": 1.440962 }, { "epoch": 1.2679405338246004, "grad_norm": 6.1735148429870605, "learning_rate": 8.495456282859043e-05, "loss": 2.9267141342163088, "memory(GiB)": 77.56, "step": 29595, "token_acc": 0.40059347181008903, "train_speed(iter/s)": 1.441033 }, { "epoch": 1.2681547491538494, "grad_norm": 5.1847968101501465, "learning_rate": 8.494975050582909e-05, "loss": 2.7431713104248048, "memory(GiB)": 77.56, "step": 29600, "token_acc": 0.4560669456066946, "train_speed(iter/s)": 1.441048 }, { "epoch": 1.2683689644830984, "grad_norm": 4.126093864440918, "learning_rate": 8.494493754992026e-05, "loss": 2.567129135131836, "memory(GiB)": 77.56, "step": 29605, "token_acc": 0.45517241379310347, "train_speed(iter/s)": 1.44111 }, { "epoch": 1.2685831798123473, "grad_norm": 4.953516006469727, "learning_rate": 8.49401239609511e-05, "loss": 2.479683685302734, "memory(GiB)": 77.56, "step": 29610, "token_acc": 0.4766355140186916, "train_speed(iter/s)": 1.441084 }, { "epoch": 1.2687973951415963, "grad_norm": 5.266232490539551, "learning_rate": 8.49353097390088e-05, "loss": 2.767433929443359, "memory(GiB)": 77.56, "step": 29615, "token_acc": 0.4, "train_speed(iter/s)": 1.441015 }, { "epoch": 1.2690116104708453, "grad_norm": 4.574163436889648, "learning_rate": 8.493049488418061e-05, "loss": 2.4078720092773436, "memory(GiB)": 77.56, "step": 29620, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.441089 }, { "epoch": 1.2692258258000941, "grad_norm": 5.394957065582275, "learning_rate": 8.492567939655371e-05, "loss": 2.544032859802246, "memory(GiB)": 77.56, "step": 29625, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.441139 }, { "epoch": 1.2694400411293432, "grad_norm": 4.767903804779053, "learning_rate": 8.49208632762154e-05, "loss": 2.6711475372314455, "memory(GiB)": 77.56, "step": 29630, "token_acc": 0.4624277456647399, "train_speed(iter/s)": 1.441114 }, { "epoch": 1.2696542564585922, "grad_norm": 4.421485424041748, "learning_rate": 8.491604652325287e-05, "loss": 2.718253326416016, "memory(GiB)": 77.56, "step": 29635, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.441153 }, { "epoch": 1.269868471787841, "grad_norm": 5.698289394378662, "learning_rate": 8.491122913775342e-05, "loss": 3.055817413330078, "memory(GiB)": 77.56, "step": 29640, "token_acc": 0.3904109589041096, "train_speed(iter/s)": 1.441116 }, { "epoch": 1.27008268711709, "grad_norm": 4.952931880950928, "learning_rate": 8.490641111980429e-05, "loss": 2.50198974609375, "memory(GiB)": 77.56, "step": 29645, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.441181 }, { "epoch": 1.270296902446339, "grad_norm": 4.448631763458252, "learning_rate": 8.490159246949278e-05, "loss": 2.729186248779297, "memory(GiB)": 77.56, "step": 29650, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.441163 }, { "epoch": 1.270511117775588, "grad_norm": 4.190755844116211, "learning_rate": 8.489677318690619e-05, "loss": 2.6032169342041014, "memory(GiB)": 77.56, "step": 29655, "token_acc": 0.45565749235474007, "train_speed(iter/s)": 1.441183 }, { "epoch": 1.270725333104837, "grad_norm": 4.64596700668335, "learning_rate": 8.48919532721318e-05, "loss": 2.600752258300781, "memory(GiB)": 77.56, "step": 29660, "token_acc": 0.46794871794871795, "train_speed(iter/s)": 1.441195 }, { "epoch": 1.270939548434086, "grad_norm": 4.705674171447754, "learning_rate": 8.488713272525696e-05, "loss": 2.573043441772461, "memory(GiB)": 77.56, "step": 29665, "token_acc": 0.4253968253968254, "train_speed(iter/s)": 1.441253 }, { "epoch": 1.2711537637633348, "grad_norm": 4.295762062072754, "learning_rate": 8.488231154636899e-05, "loss": 2.5170421600341797, "memory(GiB)": 77.56, "step": 29670, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.44131 }, { "epoch": 1.2713679790925838, "grad_norm": 4.633725643157959, "learning_rate": 8.487748973555523e-05, "loss": 2.4535173416137694, "memory(GiB)": 77.56, "step": 29675, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.441293 }, { "epoch": 1.2715821944218328, "grad_norm": 4.391819953918457, "learning_rate": 8.487266729290299e-05, "loss": 2.348721504211426, "memory(GiB)": 77.56, "step": 29680, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.441354 }, { "epoch": 1.2717964097510817, "grad_norm": 7.077546119689941, "learning_rate": 8.48678442184997e-05, "loss": 2.9757081985473635, "memory(GiB)": 77.56, "step": 29685, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.441361 }, { "epoch": 1.2720106250803307, "grad_norm": 5.971894264221191, "learning_rate": 8.48630205124327e-05, "loss": 2.3883392333984377, "memory(GiB)": 77.56, "step": 29690, "token_acc": 0.5, "train_speed(iter/s)": 1.441363 }, { "epoch": 1.2722248404095797, "grad_norm": 5.288169860839844, "learning_rate": 8.485819617478936e-05, "loss": 2.5470142364501953, "memory(GiB)": 77.56, "step": 29695, "token_acc": 0.48466257668711654, "train_speed(iter/s)": 1.441349 }, { "epoch": 1.2724390557388285, "grad_norm": 5.781827449798584, "learning_rate": 8.485337120565712e-05, "loss": 2.4330787658691406, "memory(GiB)": 77.56, "step": 29700, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.441351 }, { "epoch": 1.2726532710680776, "grad_norm": 6.260162830352783, "learning_rate": 8.484854560512335e-05, "loss": 2.556867790222168, "memory(GiB)": 77.56, "step": 29705, "token_acc": 0.45018450184501846, "train_speed(iter/s)": 1.441306 }, { "epoch": 1.2728674863973266, "grad_norm": 4.832083225250244, "learning_rate": 8.48437193732755e-05, "loss": 2.403263473510742, "memory(GiB)": 77.56, "step": 29710, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.44127 }, { "epoch": 1.2730817017265754, "grad_norm": 4.045101642608643, "learning_rate": 8.483889251020097e-05, "loss": 2.421524429321289, "memory(GiB)": 77.56, "step": 29715, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.441242 }, { "epoch": 1.2732959170558245, "grad_norm": 5.044188022613525, "learning_rate": 8.483406501598724e-05, "loss": 2.3714595794677735, "memory(GiB)": 77.56, "step": 29720, "token_acc": 0.4746376811594203, "train_speed(iter/s)": 1.44126 }, { "epoch": 1.2735101323850735, "grad_norm": 5.459587574005127, "learning_rate": 8.482923689072173e-05, "loss": 2.447173309326172, "memory(GiB)": 77.56, "step": 29725, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.441192 }, { "epoch": 1.2737243477143223, "grad_norm": 4.656224250793457, "learning_rate": 8.482440813449193e-05, "loss": 2.7099254608154295, "memory(GiB)": 77.56, "step": 29730, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.441172 }, { "epoch": 1.2739385630435713, "grad_norm": 4.5779900550842285, "learning_rate": 8.481957874738529e-05, "loss": 2.446799468994141, "memory(GiB)": 77.56, "step": 29735, "token_acc": 0.47076023391812866, "train_speed(iter/s)": 1.441138 }, { "epoch": 1.2741527783728204, "grad_norm": 4.220985412597656, "learning_rate": 8.481474872948933e-05, "loss": 2.419972229003906, "memory(GiB)": 77.56, "step": 29740, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 1.441127 }, { "epoch": 1.2743669937020692, "grad_norm": 6.136692047119141, "learning_rate": 8.480991808089156e-05, "loss": 2.4867759704589845, "memory(GiB)": 77.56, "step": 29745, "token_acc": 0.4789156626506024, "train_speed(iter/s)": 1.441114 }, { "epoch": 1.2745812090313182, "grad_norm": 4.943179607391357, "learning_rate": 8.480508680167945e-05, "loss": 2.5574302673339844, "memory(GiB)": 77.56, "step": 29750, "token_acc": 0.4954128440366973, "train_speed(iter/s)": 1.441157 }, { "epoch": 1.2747954243605673, "grad_norm": 5.318636417388916, "learning_rate": 8.480025489194055e-05, "loss": 2.1998023986816406, "memory(GiB)": 77.56, "step": 29755, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.441148 }, { "epoch": 1.275009639689816, "grad_norm": 5.354988098144531, "learning_rate": 8.479542235176236e-05, "loss": 2.6404611587524416, "memory(GiB)": 77.56, "step": 29760, "token_acc": 0.43944636678200694, "train_speed(iter/s)": 1.441137 }, { "epoch": 1.275223855019065, "grad_norm": 5.203178405761719, "learning_rate": 8.479058918123248e-05, "loss": 2.486052131652832, "memory(GiB)": 77.56, "step": 29765, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.441166 }, { "epoch": 1.2754380703483141, "grad_norm": 6.913606643676758, "learning_rate": 8.478575538043843e-05, "loss": 2.5961864471435545, "memory(GiB)": 77.56, "step": 29770, "token_acc": 0.4549019607843137, "train_speed(iter/s)": 1.44118 }, { "epoch": 1.2756522856775632, "grad_norm": 4.3769001960754395, "learning_rate": 8.47809209494678e-05, "loss": 2.586769676208496, "memory(GiB)": 77.56, "step": 29775, "token_acc": 0.45394736842105265, "train_speed(iter/s)": 1.441162 }, { "epoch": 1.275866501006812, "grad_norm": 4.2965569496154785, "learning_rate": 8.477608588840815e-05, "loss": 2.709580421447754, "memory(GiB)": 77.56, "step": 29780, "token_acc": 0.44385026737967914, "train_speed(iter/s)": 1.441167 }, { "epoch": 1.276080716336061, "grad_norm": 4.180333614349365, "learning_rate": 8.477125019734709e-05, "loss": 2.206060218811035, "memory(GiB)": 77.56, "step": 29785, "token_acc": 0.5541125541125541, "train_speed(iter/s)": 1.441115 }, { "epoch": 1.27629493166531, "grad_norm": 4.147204399108887, "learning_rate": 8.476641387637221e-05, "loss": 2.558124542236328, "memory(GiB)": 77.56, "step": 29790, "token_acc": 0.451505016722408, "train_speed(iter/s)": 1.441146 }, { "epoch": 1.2765091469945589, "grad_norm": 4.534034252166748, "learning_rate": 8.476157692557112e-05, "loss": 2.290990447998047, "memory(GiB)": 77.56, "step": 29795, "token_acc": 0.5144694533762058, "train_speed(iter/s)": 1.441105 }, { "epoch": 1.276723362323808, "grad_norm": 5.902621746063232, "learning_rate": 8.475673934503147e-05, "loss": 2.5384904861450197, "memory(GiB)": 77.56, "step": 29800, "token_acc": 0.46557377049180326, "train_speed(iter/s)": 1.441063 }, { "epoch": 1.276937577653057, "grad_norm": 5.168031692504883, "learning_rate": 8.475190113484088e-05, "loss": 2.5275623321533205, "memory(GiB)": 77.56, "step": 29805, "token_acc": 0.42592592592592593, "train_speed(iter/s)": 1.441055 }, { "epoch": 1.2771517929823057, "grad_norm": 4.34005069732666, "learning_rate": 8.4747062295087e-05, "loss": 2.7979345321655273, "memory(GiB)": 77.56, "step": 29810, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.441146 }, { "epoch": 1.2773660083115548, "grad_norm": 3.788792610168457, "learning_rate": 8.474222282585749e-05, "loss": 2.6363540649414063, "memory(GiB)": 77.56, "step": 29815, "token_acc": 0.45144356955380577, "train_speed(iter/s)": 1.441147 }, { "epoch": 1.2775802236408038, "grad_norm": 4.921347141265869, "learning_rate": 8.473738272724001e-05, "loss": 2.4330434799194336, "memory(GiB)": 77.56, "step": 29820, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.441114 }, { "epoch": 1.2777944389700526, "grad_norm": 4.742170810699463, "learning_rate": 8.473254199932227e-05, "loss": 2.4082475662231446, "memory(GiB)": 77.56, "step": 29825, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.44113 }, { "epoch": 1.2780086542993017, "grad_norm": 5.1656975746154785, "learning_rate": 8.472770064219196e-05, "loss": 2.3235877990722655, "memory(GiB)": 77.56, "step": 29830, "token_acc": 0.5313531353135313, "train_speed(iter/s)": 1.441116 }, { "epoch": 1.2782228696285507, "grad_norm": 3.8671486377716064, "learning_rate": 8.472285865593674e-05, "loss": 2.4731122970581056, "memory(GiB)": 77.56, "step": 29835, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.441104 }, { "epoch": 1.2784370849577995, "grad_norm": 4.296897888183594, "learning_rate": 8.47180160406444e-05, "loss": 2.556016731262207, "memory(GiB)": 77.56, "step": 29840, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 1.441102 }, { "epoch": 1.2786513002870485, "grad_norm": 4.75508975982666, "learning_rate": 8.471317279640261e-05, "loss": 2.7080209732055662, "memory(GiB)": 77.56, "step": 29845, "token_acc": 0.45396825396825397, "train_speed(iter/s)": 1.4411 }, { "epoch": 1.2788655156162976, "grad_norm": 4.338958740234375, "learning_rate": 8.470832892329912e-05, "loss": 2.454414176940918, "memory(GiB)": 77.56, "step": 29850, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.44104 }, { "epoch": 1.2790797309455464, "grad_norm": 5.543600559234619, "learning_rate": 8.470348442142172e-05, "loss": 2.488588333129883, "memory(GiB)": 77.56, "step": 29855, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.441028 }, { "epoch": 1.2792939462747954, "grad_norm": 5.980303764343262, "learning_rate": 8.469863929085813e-05, "loss": 2.561494064331055, "memory(GiB)": 77.56, "step": 29860, "token_acc": 0.4623287671232877, "train_speed(iter/s)": 1.440997 }, { "epoch": 1.2795081616040445, "grad_norm": 4.543352127075195, "learning_rate": 8.469379353169615e-05, "loss": 2.335466003417969, "memory(GiB)": 77.56, "step": 29865, "token_acc": 0.5054151624548736, "train_speed(iter/s)": 1.440972 }, { "epoch": 1.2797223769332933, "grad_norm": 6.944456577301025, "learning_rate": 8.468894714402356e-05, "loss": 3.019215393066406, "memory(GiB)": 77.56, "step": 29870, "token_acc": 0.40390879478827363, "train_speed(iter/s)": 1.441014 }, { "epoch": 1.2799365922625423, "grad_norm": 5.229017734527588, "learning_rate": 8.468410012792813e-05, "loss": 2.2623971939086913, "memory(GiB)": 77.56, "step": 29875, "token_acc": 0.5098684210526315, "train_speed(iter/s)": 1.440983 }, { "epoch": 1.2801508075917913, "grad_norm": 8.276698112487793, "learning_rate": 8.467925248349771e-05, "loss": 2.4009773254394533, "memory(GiB)": 77.56, "step": 29880, "token_acc": 0.49830508474576274, "train_speed(iter/s)": 1.440973 }, { "epoch": 1.2803650229210402, "grad_norm": 5.859376907348633, "learning_rate": 8.467440421082008e-05, "loss": 2.3379093170166017, "memory(GiB)": 77.56, "step": 29885, "token_acc": 0.518796992481203, "train_speed(iter/s)": 1.44096 }, { "epoch": 1.2805792382502892, "grad_norm": 4.952724456787109, "learning_rate": 8.466955530998311e-05, "loss": 2.523929786682129, "memory(GiB)": 77.56, "step": 29890, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.441012 }, { "epoch": 1.2807934535795382, "grad_norm": 5.0092878341674805, "learning_rate": 8.466470578107462e-05, "loss": 2.7064552307128906, "memory(GiB)": 77.56, "step": 29895, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.440983 }, { "epoch": 1.281007668908787, "grad_norm": 4.82404088973999, "learning_rate": 8.465985562418244e-05, "loss": 2.713532066345215, "memory(GiB)": 77.56, "step": 29900, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.440986 }, { "epoch": 1.281221884238036, "grad_norm": 6.226202964782715, "learning_rate": 8.465500483939449e-05, "loss": 2.8195823669433593, "memory(GiB)": 77.56, "step": 29905, "token_acc": 0.46303501945525294, "train_speed(iter/s)": 1.441021 }, { "epoch": 1.281436099567285, "grad_norm": 5.871167182922363, "learning_rate": 8.465015342679861e-05, "loss": 2.382765769958496, "memory(GiB)": 77.56, "step": 29910, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.440985 }, { "epoch": 1.281650314896534, "grad_norm": 5.723329067230225, "learning_rate": 8.46453013864827e-05, "loss": 2.5458908081054688, "memory(GiB)": 77.56, "step": 29915, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.441042 }, { "epoch": 1.281864530225783, "grad_norm": 4.599218845367432, "learning_rate": 8.464044871853465e-05, "loss": 2.552521514892578, "memory(GiB)": 77.56, "step": 29920, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.441029 }, { "epoch": 1.282078745555032, "grad_norm": 4.348132610321045, "learning_rate": 8.463559542304238e-05, "loss": 2.5979137420654297, "memory(GiB)": 77.56, "step": 29925, "token_acc": 0.4638888888888889, "train_speed(iter/s)": 1.441008 }, { "epoch": 1.2822929608842808, "grad_norm": 4.032307147979736, "learning_rate": 8.46307415000938e-05, "loss": 2.4703521728515625, "memory(GiB)": 77.56, "step": 29930, "token_acc": 0.4777327935222672, "train_speed(iter/s)": 1.441061 }, { "epoch": 1.2825071762135298, "grad_norm": 6.230744361877441, "learning_rate": 8.462588694977686e-05, "loss": 2.4588584899902344, "memory(GiB)": 77.56, "step": 29935, "token_acc": 0.4558303886925795, "train_speed(iter/s)": 1.441122 }, { "epoch": 1.2827213915427789, "grad_norm": 6.356720447540283, "learning_rate": 8.46210317721795e-05, "loss": 2.537357711791992, "memory(GiB)": 77.56, "step": 29940, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.441127 }, { "epoch": 1.282935606872028, "grad_norm": 4.601590633392334, "learning_rate": 8.461617596738967e-05, "loss": 2.2179203033447266, "memory(GiB)": 77.56, "step": 29945, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.441155 }, { "epoch": 1.2831498222012767, "grad_norm": 6.738268852233887, "learning_rate": 8.461131953549532e-05, "loss": 2.5237405776977537, "memory(GiB)": 77.56, "step": 29950, "token_acc": 0.4728682170542636, "train_speed(iter/s)": 1.441165 }, { "epoch": 1.2833640375305257, "grad_norm": 4.284161567687988, "learning_rate": 8.460646247658446e-05, "loss": 2.600858688354492, "memory(GiB)": 77.56, "step": 29955, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.441169 }, { "epoch": 1.2835782528597748, "grad_norm": 4.127976894378662, "learning_rate": 8.460160479074508e-05, "loss": 2.69942626953125, "memory(GiB)": 77.56, "step": 29960, "token_acc": 0.44744744744744747, "train_speed(iter/s)": 1.441152 }, { "epoch": 1.2837924681890236, "grad_norm": 5.700314521789551, "learning_rate": 8.459674647806517e-05, "loss": 2.6903078079223635, "memory(GiB)": 77.56, "step": 29965, "token_acc": 0.44314868804664725, "train_speed(iter/s)": 1.441189 }, { "epoch": 1.2840066835182726, "grad_norm": 3.552034616470337, "learning_rate": 8.459188753863273e-05, "loss": 2.807967758178711, "memory(GiB)": 77.56, "step": 29970, "token_acc": 0.445993031358885, "train_speed(iter/s)": 1.441236 }, { "epoch": 1.2842208988475217, "grad_norm": 4.367836952209473, "learning_rate": 8.458702797253581e-05, "loss": 2.3090091705322267, "memory(GiB)": 77.56, "step": 29975, "token_acc": 0.5310077519379846, "train_speed(iter/s)": 1.441271 }, { "epoch": 1.2844351141767705, "grad_norm": 6.73507833480835, "learning_rate": 8.458216777986243e-05, "loss": 2.1440126419067385, "memory(GiB)": 77.56, "step": 29980, "token_acc": 0.5061728395061729, "train_speed(iter/s)": 1.441337 }, { "epoch": 1.2846493295060195, "grad_norm": 4.563814163208008, "learning_rate": 8.457730696070063e-05, "loss": 2.487034797668457, "memory(GiB)": 77.56, "step": 29985, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.441373 }, { "epoch": 1.2848635448352685, "grad_norm": 5.482424736022949, "learning_rate": 8.457244551513848e-05, "loss": 2.790231132507324, "memory(GiB)": 77.56, "step": 29990, "token_acc": 0.44086021505376344, "train_speed(iter/s)": 1.441336 }, { "epoch": 1.2850777601645174, "grad_norm": 5.418969631195068, "learning_rate": 8.456758344326406e-05, "loss": 2.3300899505615233, "memory(GiB)": 77.56, "step": 29995, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.441335 }, { "epoch": 1.2852919754937664, "grad_norm": 4.195066928863525, "learning_rate": 8.456272074516542e-05, "loss": 2.653728485107422, "memory(GiB)": 77.56, "step": 30000, "token_acc": 0.44857142857142857, "train_speed(iter/s)": 1.441371 }, { "epoch": 1.2852919754937664, "eval_loss": 2.228975296020508, "eval_runtime": 14.4878, "eval_samples_per_second": 6.902, "eval_steps_per_second": 6.902, "eval_token_acc": 0.44813278008298757, "step": 30000 }, { "epoch": 1.2855061908230154, "grad_norm": 4.7024993896484375, "learning_rate": 8.45578574209307e-05, "loss": 2.5826650619506837, "memory(GiB)": 77.56, "step": 30005, "token_acc": 0.4550102249488753, "train_speed(iter/s)": 1.440312 }, { "epoch": 1.2857204061522642, "grad_norm": 6.061879634857178, "learning_rate": 8.455299347064795e-05, "loss": 2.5359498977661135, "memory(GiB)": 77.56, "step": 30010, "token_acc": 0.44107744107744107, "train_speed(iter/s)": 1.440321 }, { "epoch": 1.2859346214815133, "grad_norm": 6.374854564666748, "learning_rate": 8.454812889440533e-05, "loss": 2.3168535232543945, "memory(GiB)": 77.56, "step": 30015, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.440292 }, { "epoch": 1.2861488368107623, "grad_norm": 6.208530426025391, "learning_rate": 8.454326369229094e-05, "loss": 2.752610206604004, "memory(GiB)": 77.56, "step": 30020, "token_acc": 0.43989071038251365, "train_speed(iter/s)": 1.440312 }, { "epoch": 1.2863630521400111, "grad_norm": 5.622336387634277, "learning_rate": 8.45383978643929e-05, "loss": 2.6988750457763673, "memory(GiB)": 77.56, "step": 30025, "token_acc": 0.45544554455445546, "train_speed(iter/s)": 1.440287 }, { "epoch": 1.2865772674692602, "grad_norm": 5.669009685516357, "learning_rate": 8.453353141079941e-05, "loss": 2.32655029296875, "memory(GiB)": 77.56, "step": 30030, "token_acc": 0.5486381322957199, "train_speed(iter/s)": 1.440273 }, { "epoch": 1.2867914827985092, "grad_norm": 4.0814208984375, "learning_rate": 8.452866433159859e-05, "loss": 2.4197036743164064, "memory(GiB)": 77.56, "step": 30035, "token_acc": 0.4529616724738676, "train_speed(iter/s)": 1.44032 }, { "epoch": 1.287005698127758, "grad_norm": 5.440887928009033, "learning_rate": 8.452379662687864e-05, "loss": 2.7072935104370117, "memory(GiB)": 77.56, "step": 30040, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.440344 }, { "epoch": 1.287219913457007, "grad_norm": 5.3693976402282715, "learning_rate": 8.45189282967277e-05, "loss": 2.7098106384277343, "memory(GiB)": 77.56, "step": 30045, "token_acc": 0.43656716417910446, "train_speed(iter/s)": 1.440422 }, { "epoch": 1.287434128786256, "grad_norm": 4.537566184997559, "learning_rate": 8.451405934123402e-05, "loss": 2.397121810913086, "memory(GiB)": 77.56, "step": 30050, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.440424 }, { "epoch": 1.2876483441155049, "grad_norm": 4.082806587219238, "learning_rate": 8.450918976048577e-05, "loss": 2.542250061035156, "memory(GiB)": 77.56, "step": 30055, "token_acc": 0.43661971830985913, "train_speed(iter/s)": 1.440433 }, { "epoch": 1.287862559444754, "grad_norm": 4.772249698638916, "learning_rate": 8.450431955457118e-05, "loss": 2.2546098709106444, "memory(GiB)": 77.56, "step": 30060, "token_acc": 0.5110294117647058, "train_speed(iter/s)": 1.440471 }, { "epoch": 1.288076774774003, "grad_norm": 5.119551658630371, "learning_rate": 8.449944872357845e-05, "loss": 2.6211755752563475, "memory(GiB)": 77.56, "step": 30065, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.440482 }, { "epoch": 1.2882909901032518, "grad_norm": 4.743472576141357, "learning_rate": 8.449457726759586e-05, "loss": 2.328533172607422, "memory(GiB)": 77.56, "step": 30070, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.440466 }, { "epoch": 1.2885052054325008, "grad_norm": 4.37932825088501, "learning_rate": 8.448970518671165e-05, "loss": 2.489857864379883, "memory(GiB)": 77.56, "step": 30075, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.44049 }, { "epoch": 1.2887194207617498, "grad_norm": 5.37784481048584, "learning_rate": 8.448483248101408e-05, "loss": 2.578757476806641, "memory(GiB)": 77.56, "step": 30080, "token_acc": 0.46567164179104475, "train_speed(iter/s)": 1.440489 }, { "epoch": 1.2889336360909986, "grad_norm": 4.141708850860596, "learning_rate": 8.447995915059142e-05, "loss": 2.4571609497070312, "memory(GiB)": 77.56, "step": 30085, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 1.440436 }, { "epoch": 1.2891478514202477, "grad_norm": 6.63078498840332, "learning_rate": 8.447508519553194e-05, "loss": 2.537199783325195, "memory(GiB)": 77.56, "step": 30090, "token_acc": 0.43343653250773995, "train_speed(iter/s)": 1.440451 }, { "epoch": 1.2893620667494967, "grad_norm": 3.774007797241211, "learning_rate": 8.447021061592396e-05, "loss": 2.522028923034668, "memory(GiB)": 77.56, "step": 30095, "token_acc": 0.48520710059171596, "train_speed(iter/s)": 1.440489 }, { "epoch": 1.2895762820787455, "grad_norm": 7.373660087585449, "learning_rate": 8.446533541185577e-05, "loss": 2.354354476928711, "memory(GiB)": 77.56, "step": 30100, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.440564 }, { "epoch": 1.2897904974079946, "grad_norm": 5.748693466186523, "learning_rate": 8.44604595834157e-05, "loss": 2.551074981689453, "memory(GiB)": 77.56, "step": 30105, "token_acc": 0.5130434782608696, "train_speed(iter/s)": 1.440572 }, { "epoch": 1.2900047127372436, "grad_norm": 3.907365322113037, "learning_rate": 8.445558313069208e-05, "loss": 2.5525793075561523, "memory(GiB)": 77.56, "step": 30110, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.440581 }, { "epoch": 1.2902189280664924, "grad_norm": 5.5799102783203125, "learning_rate": 8.445070605377326e-05, "loss": 2.7172801971435545, "memory(GiB)": 77.56, "step": 30115, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.44054 }, { "epoch": 1.2904331433957414, "grad_norm": 5.489712238311768, "learning_rate": 8.444582835274758e-05, "loss": 2.4466455459594725, "memory(GiB)": 77.56, "step": 30120, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.440558 }, { "epoch": 1.2906473587249905, "grad_norm": 4.120311737060547, "learning_rate": 8.444095002770341e-05, "loss": 2.492127227783203, "memory(GiB)": 77.56, "step": 30125, "token_acc": 0.4956521739130435, "train_speed(iter/s)": 1.4405 }, { "epoch": 1.2908615740542393, "grad_norm": 5.713088035583496, "learning_rate": 8.44360710787291e-05, "loss": 2.3864126205444336, "memory(GiB)": 77.56, "step": 30130, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.440494 }, { "epoch": 1.2910757893834883, "grad_norm": 4.283507823944092, "learning_rate": 8.443119150591309e-05, "loss": 2.364004707336426, "memory(GiB)": 77.56, "step": 30135, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.440536 }, { "epoch": 1.2912900047127374, "grad_norm": 4.761294364929199, "learning_rate": 8.442631130934372e-05, "loss": 2.564166450500488, "memory(GiB)": 77.56, "step": 30140, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.440507 }, { "epoch": 1.2915042200419862, "grad_norm": 4.866940975189209, "learning_rate": 8.442143048910944e-05, "loss": 2.268455696105957, "memory(GiB)": 77.56, "step": 30145, "token_acc": 0.5042372881355932, "train_speed(iter/s)": 1.440497 }, { "epoch": 1.2917184353712352, "grad_norm": 4.474341869354248, "learning_rate": 8.441654904529866e-05, "loss": 2.220620346069336, "memory(GiB)": 77.56, "step": 30150, "token_acc": 0.5242718446601942, "train_speed(iter/s)": 1.440472 }, { "epoch": 1.2919326507004842, "grad_norm": 5.508834362030029, "learning_rate": 8.441166697799981e-05, "loss": 2.6750415802001952, "memory(GiB)": 77.56, "step": 30155, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.440502 }, { "epoch": 1.292146866029733, "grad_norm": 4.944782733917236, "learning_rate": 8.440678428730131e-05, "loss": 2.588155746459961, "memory(GiB)": 77.56, "step": 30160, "token_acc": 0.49375, "train_speed(iter/s)": 1.440533 }, { "epoch": 1.292361081358982, "grad_norm": 5.445731163024902, "learning_rate": 8.440190097329165e-05, "loss": 2.3746435165405275, "memory(GiB)": 77.56, "step": 30165, "token_acc": 0.5019607843137255, "train_speed(iter/s)": 1.440557 }, { "epoch": 1.2925752966882311, "grad_norm": 6.3063554763793945, "learning_rate": 8.439701703605929e-05, "loss": 2.836256408691406, "memory(GiB)": 77.56, "step": 30170, "token_acc": 0.475, "train_speed(iter/s)": 1.44061 }, { "epoch": 1.29278951201748, "grad_norm": 5.162059783935547, "learning_rate": 8.439213247569269e-05, "loss": 2.5087608337402343, "memory(GiB)": 77.56, "step": 30175, "token_acc": 0.4669260700389105, "train_speed(iter/s)": 1.440571 }, { "epoch": 1.293003727346729, "grad_norm": 9.062597274780273, "learning_rate": 8.438724729228034e-05, "loss": 2.2947444915771484, "memory(GiB)": 77.56, "step": 30180, "token_acc": 0.5, "train_speed(iter/s)": 1.440556 }, { "epoch": 1.293217942675978, "grad_norm": 5.128320217132568, "learning_rate": 8.438236148591076e-05, "loss": 2.704743576049805, "memory(GiB)": 77.56, "step": 30185, "token_acc": 0.4214876033057851, "train_speed(iter/s)": 1.440589 }, { "epoch": 1.2934321580052268, "grad_norm": 4.757826805114746, "learning_rate": 8.437747505667243e-05, "loss": 2.3504001617431642, "memory(GiB)": 77.56, "step": 30190, "token_acc": 0.49480968858131485, "train_speed(iter/s)": 1.440609 }, { "epoch": 1.2936463733344759, "grad_norm": 6.292047500610352, "learning_rate": 8.437258800465392e-05, "loss": 2.2857501983642576, "memory(GiB)": 77.56, "step": 30195, "token_acc": 0.511520737327189, "train_speed(iter/s)": 1.440617 }, { "epoch": 1.2938605886637249, "grad_norm": 5.029705047607422, "learning_rate": 8.436770032994372e-05, "loss": 2.325166702270508, "memory(GiB)": 77.56, "step": 30200, "token_acc": 0.5190839694656488, "train_speed(iter/s)": 1.440672 }, { "epoch": 1.2940748039929737, "grad_norm": 4.224485874176025, "learning_rate": 8.436281203263039e-05, "loss": 2.3237606048583985, "memory(GiB)": 77.56, "step": 30205, "token_acc": 0.45185185185185184, "train_speed(iter/s)": 1.440693 }, { "epoch": 1.2942890193222227, "grad_norm": 4.651941299438477, "learning_rate": 8.435792311280248e-05, "loss": 2.204146957397461, "memory(GiB)": 77.56, "step": 30210, "token_acc": 0.5401929260450161, "train_speed(iter/s)": 1.440679 }, { "epoch": 1.2945032346514718, "grad_norm": 5.682839393615723, "learning_rate": 8.435303357054857e-05, "loss": 2.5758430480957033, "memory(GiB)": 77.56, "step": 30215, "token_acc": 0.4601226993865031, "train_speed(iter/s)": 1.440665 }, { "epoch": 1.2947174499807206, "grad_norm": 5.664210796356201, "learning_rate": 8.434814340595722e-05, "loss": 2.5128652572631838, "memory(GiB)": 77.56, "step": 30220, "token_acc": 0.45569620253164556, "train_speed(iter/s)": 1.44067 }, { "epoch": 1.2949316653099696, "grad_norm": 4.011490821838379, "learning_rate": 8.434325261911705e-05, "loss": 2.680637741088867, "memory(GiB)": 77.56, "step": 30225, "token_acc": 0.4377224199288256, "train_speed(iter/s)": 1.44071 }, { "epoch": 1.2951458806392186, "grad_norm": 4.033299922943115, "learning_rate": 8.433836121011662e-05, "loss": 2.615276908874512, "memory(GiB)": 77.56, "step": 30230, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.440768 }, { "epoch": 1.2953600959684675, "grad_norm": 4.467537879943848, "learning_rate": 8.43334691790446e-05, "loss": 2.5148754119873047, "memory(GiB)": 77.56, "step": 30235, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.440784 }, { "epoch": 1.2955743112977165, "grad_norm": 3.937587261199951, "learning_rate": 8.432857652598954e-05, "loss": 2.328424072265625, "memory(GiB)": 77.56, "step": 30240, "token_acc": 0.5129032258064516, "train_speed(iter/s)": 1.440819 }, { "epoch": 1.2957885266269655, "grad_norm": 5.247279167175293, "learning_rate": 8.432368325104014e-05, "loss": 2.4510190963745115, "memory(GiB)": 77.56, "step": 30245, "token_acc": 0.4980544747081712, "train_speed(iter/s)": 1.440825 }, { "epoch": 1.2960027419562143, "grad_norm": 4.112699508666992, "learning_rate": 8.4318789354285e-05, "loss": 2.8035411834716797, "memory(GiB)": 77.56, "step": 30250, "token_acc": 0.42214532871972316, "train_speed(iter/s)": 1.440855 }, { "epoch": 1.2962169572854634, "grad_norm": 5.823759078979492, "learning_rate": 8.43138948358128e-05, "loss": 2.723115158081055, "memory(GiB)": 77.56, "step": 30255, "token_acc": 0.4485049833887043, "train_speed(iter/s)": 1.440902 }, { "epoch": 1.2964311726147124, "grad_norm": 4.194140434265137, "learning_rate": 8.43089996957122e-05, "loss": 2.4155887603759765, "memory(GiB)": 77.56, "step": 30260, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 1.440961 }, { "epoch": 1.2966453879439612, "grad_norm": 4.884934425354004, "learning_rate": 8.43041039340719e-05, "loss": 2.9081209182739256, "memory(GiB)": 77.56, "step": 30265, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.441037 }, { "epoch": 1.2968596032732103, "grad_norm": 4.536020278930664, "learning_rate": 8.429920755098058e-05, "loss": 2.5895225524902346, "memory(GiB)": 77.56, "step": 30270, "token_acc": 0.438871473354232, "train_speed(iter/s)": 1.441042 }, { "epoch": 1.2970738186024593, "grad_norm": 4.42290735244751, "learning_rate": 8.429431054652695e-05, "loss": 2.05948429107666, "memory(GiB)": 77.56, "step": 30275, "token_acc": 0.5703422053231939, "train_speed(iter/s)": 1.441055 }, { "epoch": 1.297288033931708, "grad_norm": 5.010835647583008, "learning_rate": 8.428941292079968e-05, "loss": 2.635399055480957, "memory(GiB)": 77.56, "step": 30280, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.440991 }, { "epoch": 1.2975022492609571, "grad_norm": 6.494987487792969, "learning_rate": 8.428451467388756e-05, "loss": 2.6263401031494142, "memory(GiB)": 77.56, "step": 30285, "token_acc": 0.4481707317073171, "train_speed(iter/s)": 1.441006 }, { "epoch": 1.2977164645902062, "grad_norm": 5.413225173950195, "learning_rate": 8.427961580587927e-05, "loss": 2.60037841796875, "memory(GiB)": 77.56, "step": 30290, "token_acc": 0.46096654275092935, "train_speed(iter/s)": 1.441013 }, { "epoch": 1.297930679919455, "grad_norm": 4.137669086456299, "learning_rate": 8.427471631686359e-05, "loss": 2.451686477661133, "memory(GiB)": 77.56, "step": 30295, "token_acc": 0.5046153846153846, "train_speed(iter/s)": 1.440965 }, { "epoch": 1.298144895248704, "grad_norm": 4.886295318603516, "learning_rate": 8.426981620692929e-05, "loss": 2.62661075592041, "memory(GiB)": 77.56, "step": 30300, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.440948 }, { "epoch": 1.298359110577953, "grad_norm": 10.413487434387207, "learning_rate": 8.426491547616511e-05, "loss": 2.5587528228759764, "memory(GiB)": 77.56, "step": 30305, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.440991 }, { "epoch": 1.2985733259072019, "grad_norm": 4.705319881439209, "learning_rate": 8.426001412465984e-05, "loss": 2.501504898071289, "memory(GiB)": 77.56, "step": 30310, "token_acc": 0.49809885931558934, "train_speed(iter/s)": 1.440974 }, { "epoch": 1.298787541236451, "grad_norm": 6.307821750640869, "learning_rate": 8.425511215250226e-05, "loss": 2.345410919189453, "memory(GiB)": 77.56, "step": 30315, "token_acc": 0.46484375, "train_speed(iter/s)": 1.440973 }, { "epoch": 1.2990017565657, "grad_norm": 8.535022735595703, "learning_rate": 8.425020955978122e-05, "loss": 2.7694124221801757, "memory(GiB)": 77.56, "step": 30320, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.440985 }, { "epoch": 1.2992159718949488, "grad_norm": 4.92496395111084, "learning_rate": 8.424530634658549e-05, "loss": 2.417336654663086, "memory(GiB)": 77.56, "step": 30325, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.440999 }, { "epoch": 1.2994301872241978, "grad_norm": 5.449120044708252, "learning_rate": 8.424040251300392e-05, "loss": 2.5895715713500977, "memory(GiB)": 77.56, "step": 30330, "token_acc": 0.4756554307116105, "train_speed(iter/s)": 1.441064 }, { "epoch": 1.2996444025534468, "grad_norm": 5.59116268157959, "learning_rate": 8.423549805912532e-05, "loss": 2.591572570800781, "memory(GiB)": 77.56, "step": 30335, "token_acc": 0.44039735099337746, "train_speed(iter/s)": 1.441084 }, { "epoch": 1.2998586178826956, "grad_norm": 5.4039626121521, "learning_rate": 8.423059298503857e-05, "loss": 2.8692398071289062, "memory(GiB)": 77.56, "step": 30340, "token_acc": 0.4171779141104294, "train_speed(iter/s)": 1.441059 }, { "epoch": 1.3000728332119447, "grad_norm": 5.4849982261657715, "learning_rate": 8.422568729083251e-05, "loss": 2.378757858276367, "memory(GiB)": 77.56, "step": 30345, "token_acc": 0.489010989010989, "train_speed(iter/s)": 1.441063 }, { "epoch": 1.3002870485411937, "grad_norm": 5.328290939331055, "learning_rate": 8.422078097659602e-05, "loss": 2.5465137481689455, "memory(GiB)": 77.56, "step": 30350, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.441087 }, { "epoch": 1.3005012638704425, "grad_norm": 4.981322288513184, "learning_rate": 8.4215874042418e-05, "loss": 2.591132926940918, "memory(GiB)": 77.56, "step": 30355, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.441125 }, { "epoch": 1.3007154791996915, "grad_norm": 4.698912143707275, "learning_rate": 8.421096648838728e-05, "loss": 2.6168724060058595, "memory(GiB)": 77.56, "step": 30360, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.441066 }, { "epoch": 1.3009296945289406, "grad_norm": 4.894650936126709, "learning_rate": 8.420605831459285e-05, "loss": 2.451618957519531, "memory(GiB)": 77.56, "step": 30365, "token_acc": 0.45741324921135645, "train_speed(iter/s)": 1.441018 }, { "epoch": 1.3011439098581894, "grad_norm": 4.435378074645996, "learning_rate": 8.420114952112358e-05, "loss": 2.362919235229492, "memory(GiB)": 77.56, "step": 30370, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.441059 }, { "epoch": 1.3013581251874384, "grad_norm": 5.77326774597168, "learning_rate": 8.419624010806839e-05, "loss": 2.655732345581055, "memory(GiB)": 77.56, "step": 30375, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.441133 }, { "epoch": 1.3015723405166875, "grad_norm": 4.066826343536377, "learning_rate": 8.419133007551626e-05, "loss": 2.5782665252685546, "memory(GiB)": 77.56, "step": 30380, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.441145 }, { "epoch": 1.3017865558459363, "grad_norm": 7.82096004486084, "learning_rate": 8.418641942355608e-05, "loss": 2.4948854446411133, "memory(GiB)": 77.56, "step": 30385, "token_acc": 0.4847328244274809, "train_speed(iter/s)": 1.441091 }, { "epoch": 1.3020007711751853, "grad_norm": 5.382538318634033, "learning_rate": 8.418150815227686e-05, "loss": 2.6521175384521483, "memory(GiB)": 77.56, "step": 30390, "token_acc": 0.4622356495468278, "train_speed(iter/s)": 1.441049 }, { "epoch": 1.3022149865044343, "grad_norm": 4.82567834854126, "learning_rate": 8.417659626176754e-05, "loss": 2.624923324584961, "memory(GiB)": 77.56, "step": 30395, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.440993 }, { "epoch": 1.3024292018336832, "grad_norm": 5.103214740753174, "learning_rate": 8.417168375211713e-05, "loss": 2.4815736770629884, "memory(GiB)": 77.56, "step": 30400, "token_acc": 0.4984423676012461, "train_speed(iter/s)": 1.441065 }, { "epoch": 1.3026434171629322, "grad_norm": 5.527584075927734, "learning_rate": 8.41667706234146e-05, "loss": 2.3787628173828126, "memory(GiB)": 77.56, "step": 30405, "token_acc": 0.5103734439834025, "train_speed(iter/s)": 1.441112 }, { "epoch": 1.3028576324921812, "grad_norm": 7.315732002258301, "learning_rate": 8.416185687574898e-05, "loss": 2.5277336120605467, "memory(GiB)": 77.56, "step": 30410, "token_acc": 0.44912280701754387, "train_speed(iter/s)": 1.441159 }, { "epoch": 1.30307184782143, "grad_norm": 3.7005109786987305, "learning_rate": 8.415694250920927e-05, "loss": 2.5848033905029295, "memory(GiB)": 77.56, "step": 30415, "token_acc": 0.45244956772334294, "train_speed(iter/s)": 1.44111 }, { "epoch": 1.303286063150679, "grad_norm": 5.403597354888916, "learning_rate": 8.415202752388451e-05, "loss": 2.568196487426758, "memory(GiB)": 77.56, "step": 30420, "token_acc": 0.5102639296187683, "train_speed(iter/s)": 1.441115 }, { "epoch": 1.303500278479928, "grad_norm": 5.422910213470459, "learning_rate": 8.414711191986374e-05, "loss": 2.562359619140625, "memory(GiB)": 77.56, "step": 30425, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.441098 }, { "epoch": 1.303714493809177, "grad_norm": 8.326611518859863, "learning_rate": 8.4142195697236e-05, "loss": 2.673186683654785, "memory(GiB)": 77.56, "step": 30430, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.441134 }, { "epoch": 1.303928709138426, "grad_norm": 3.7338290214538574, "learning_rate": 8.413727885609036e-05, "loss": 2.2320104598999024, "memory(GiB)": 77.56, "step": 30435, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.441158 }, { "epoch": 1.304142924467675, "grad_norm": 4.473420143127441, "learning_rate": 8.413236139651588e-05, "loss": 2.5908416748046874, "memory(GiB)": 77.56, "step": 30440, "token_acc": 0.44542772861356933, "train_speed(iter/s)": 1.441131 }, { "epoch": 1.3043571397969238, "grad_norm": 7.763184547424316, "learning_rate": 8.412744331860167e-05, "loss": 2.4383451461791994, "memory(GiB)": 77.56, "step": 30445, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.441139 }, { "epoch": 1.3045713551261728, "grad_norm": 5.2608962059021, "learning_rate": 8.412252462243679e-05, "loss": 2.4560430526733397, "memory(GiB)": 77.56, "step": 30450, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.441092 }, { "epoch": 1.3047855704554219, "grad_norm": 5.97885274887085, "learning_rate": 8.411760530811038e-05, "loss": 2.6400531768798827, "memory(GiB)": 77.56, "step": 30455, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.441112 }, { "epoch": 1.3049997857846707, "grad_norm": 3.698084831237793, "learning_rate": 8.411268537571154e-05, "loss": 2.6130773544311525, "memory(GiB)": 77.56, "step": 30460, "token_acc": 0.444743935309973, "train_speed(iter/s)": 1.441154 }, { "epoch": 1.3052140011139197, "grad_norm": 6.4532270431518555, "learning_rate": 8.410776482532941e-05, "loss": 2.73227596282959, "memory(GiB)": 77.56, "step": 30465, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.441224 }, { "epoch": 1.3054282164431688, "grad_norm": 4.533722400665283, "learning_rate": 8.410284365705311e-05, "loss": 2.5809717178344727, "memory(GiB)": 77.56, "step": 30470, "token_acc": 0.4271523178807947, "train_speed(iter/s)": 1.441236 }, { "epoch": 1.3056424317724176, "grad_norm": 5.006070137023926, "learning_rate": 8.409792187097183e-05, "loss": 2.391727828979492, "memory(GiB)": 77.56, "step": 30475, "token_acc": 0.5, "train_speed(iter/s)": 1.441206 }, { "epoch": 1.3058566471016666, "grad_norm": 3.820786714553833, "learning_rate": 8.409299946717469e-05, "loss": 2.6763145446777346, "memory(GiB)": 77.56, "step": 30480, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.441114 }, { "epoch": 1.3060708624309156, "grad_norm": 4.826564788818359, "learning_rate": 8.408807644575089e-05, "loss": 2.90842227935791, "memory(GiB)": 77.56, "step": 30485, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.441163 }, { "epoch": 1.3062850777601644, "grad_norm": 4.939174175262451, "learning_rate": 8.40831528067896e-05, "loss": 2.670140266418457, "memory(GiB)": 77.56, "step": 30490, "token_acc": 0.46494464944649444, "train_speed(iter/s)": 1.441174 }, { "epoch": 1.3064992930894135, "grad_norm": 4.767963409423828, "learning_rate": 8.407822855038004e-05, "loss": 2.3368053436279297, "memory(GiB)": 77.56, "step": 30495, "token_acc": 0.4921875, "train_speed(iter/s)": 1.441151 }, { "epoch": 1.3067135084186625, "grad_norm": 4.725824356079102, "learning_rate": 8.40733036766114e-05, "loss": 2.627589988708496, "memory(GiB)": 77.56, "step": 30500, "token_acc": 0.45878136200716846, "train_speed(iter/s)": 1.441137 }, { "epoch": 1.3067135084186625, "eval_loss": 2.191805601119995, "eval_runtime": 14.4793, "eval_samples_per_second": 6.906, "eval_steps_per_second": 6.906, "eval_token_acc": 0.498567335243553, "step": 30500 }, { "epoch": 1.3069277237479113, "grad_norm": 5.181138038635254, "learning_rate": 8.406837818557289e-05, "loss": 2.4915164947509765, "memory(GiB)": 77.56, "step": 30505, "token_acc": 0.49851924975320827, "train_speed(iter/s)": 1.440041 }, { "epoch": 1.3071419390771604, "grad_norm": 5.8320441246032715, "learning_rate": 8.406345207735375e-05, "loss": 2.4027130126953127, "memory(GiB)": 77.56, "step": 30510, "token_acc": 0.4485049833887043, "train_speed(iter/s)": 1.440057 }, { "epoch": 1.3073561544064094, "grad_norm": 6.1275506019592285, "learning_rate": 8.405852535204323e-05, "loss": 2.534292984008789, "memory(GiB)": 77.56, "step": 30515, "token_acc": 0.4513888888888889, "train_speed(iter/s)": 1.440037 }, { "epoch": 1.3075703697356582, "grad_norm": 5.595847129821777, "learning_rate": 8.405359800973056e-05, "loss": 2.812118721008301, "memory(GiB)": 77.56, "step": 30520, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.440081 }, { "epoch": 1.3077845850649072, "grad_norm": 4.512628078460693, "learning_rate": 8.404867005050504e-05, "loss": 2.545647621154785, "memory(GiB)": 77.56, "step": 30525, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.440091 }, { "epoch": 1.3079988003941563, "grad_norm": 5.215756416320801, "learning_rate": 8.404374147445591e-05, "loss": 2.7159494400024413, "memory(GiB)": 77.56, "step": 30530, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.440092 }, { "epoch": 1.308213015723405, "grad_norm": 6.1765899658203125, "learning_rate": 8.403881228167245e-05, "loss": 2.8459245681762697, "memory(GiB)": 77.56, "step": 30535, "token_acc": 0.4735099337748344, "train_speed(iter/s)": 1.440111 }, { "epoch": 1.3084272310526541, "grad_norm": 5.327668190002441, "learning_rate": 8.4033882472244e-05, "loss": 2.5296817779541017, "memory(GiB)": 77.56, "step": 30540, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.440089 }, { "epoch": 1.3086414463819032, "grad_norm": 4.702518463134766, "learning_rate": 8.402895204625982e-05, "loss": 2.412190628051758, "memory(GiB)": 77.56, "step": 30545, "token_acc": 0.46438746438746437, "train_speed(iter/s)": 1.440101 }, { "epoch": 1.308855661711152, "grad_norm": 4.6268839836120605, "learning_rate": 8.402402100380927e-05, "loss": 2.1747419357299806, "memory(GiB)": 77.56, "step": 30550, "token_acc": 0.5510204081632653, "train_speed(iter/s)": 1.440075 }, { "epoch": 1.309069877040401, "grad_norm": 4.304233074188232, "learning_rate": 8.401908934498166e-05, "loss": 2.7301734924316405, "memory(GiB)": 77.56, "step": 30555, "token_acc": 0.43312101910828027, "train_speed(iter/s)": 1.440124 }, { "epoch": 1.30928409236965, "grad_norm": 4.967772006988525, "learning_rate": 8.401415706986634e-05, "loss": 2.75872917175293, "memory(GiB)": 77.56, "step": 30560, "token_acc": 0.4513888888888889, "train_speed(iter/s)": 1.44013 }, { "epoch": 1.3094983076988989, "grad_norm": 7.840235710144043, "learning_rate": 8.400922417855265e-05, "loss": 2.5490289688110352, "memory(GiB)": 77.56, "step": 30565, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.440087 }, { "epoch": 1.309712523028148, "grad_norm": 4.7686767578125, "learning_rate": 8.400429067112996e-05, "loss": 2.0024553298950196, "memory(GiB)": 77.56, "step": 30570, "token_acc": 0.5471014492753623, "train_speed(iter/s)": 1.440049 }, { "epoch": 1.309926738357397, "grad_norm": 4.6255083084106445, "learning_rate": 8.399935654768765e-05, "loss": 2.648905944824219, "memory(GiB)": 77.56, "step": 30575, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.440083 }, { "epoch": 1.3101409536866457, "grad_norm": 4.743405342102051, "learning_rate": 8.399442180831509e-05, "loss": 2.6169843673706055, "memory(GiB)": 77.56, "step": 30580, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 1.440091 }, { "epoch": 1.3103551690158948, "grad_norm": 5.546091079711914, "learning_rate": 8.398948645310169e-05, "loss": 2.365212821960449, "memory(GiB)": 77.56, "step": 30585, "token_acc": 0.4753521126760563, "train_speed(iter/s)": 1.440082 }, { "epoch": 1.3105693843451438, "grad_norm": 6.177235126495361, "learning_rate": 8.398455048213687e-05, "loss": 2.355448913574219, "memory(GiB)": 77.56, "step": 30590, "token_acc": 0.5079872204472844, "train_speed(iter/s)": 1.440057 }, { "epoch": 1.3107835996743926, "grad_norm": 4.434200286865234, "learning_rate": 8.397961389551003e-05, "loss": 2.444801902770996, "memory(GiB)": 77.56, "step": 30595, "token_acc": 0.4629080118694362, "train_speed(iter/s)": 1.440144 }, { "epoch": 1.3109978150036417, "grad_norm": 4.852989196777344, "learning_rate": 8.39746766933106e-05, "loss": 2.4627485275268555, "memory(GiB)": 77.56, "step": 30600, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.440142 }, { "epoch": 1.3112120303328907, "grad_norm": 4.629029273986816, "learning_rate": 8.396973887562806e-05, "loss": 2.662624168395996, "memory(GiB)": 77.56, "step": 30605, "token_acc": 0.4336283185840708, "train_speed(iter/s)": 1.440146 }, { "epoch": 1.3114262456621395, "grad_norm": 5.103628158569336, "learning_rate": 8.396480044255181e-05, "loss": 2.5142938613891603, "memory(GiB)": 77.56, "step": 30610, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.440155 }, { "epoch": 1.3116404609913885, "grad_norm": 5.212147235870361, "learning_rate": 8.395986139417135e-05, "loss": 2.6481840133666994, "memory(GiB)": 77.56, "step": 30615, "token_acc": 0.44481605351170567, "train_speed(iter/s)": 1.440148 }, { "epoch": 1.3118546763206376, "grad_norm": 5.196325302124023, "learning_rate": 8.395492173057613e-05, "loss": 2.6507835388183594, "memory(GiB)": 77.56, "step": 30620, "token_acc": 0.5016501650165016, "train_speed(iter/s)": 1.440129 }, { "epoch": 1.3120688916498864, "grad_norm": 4.532564640045166, "learning_rate": 8.394998145185566e-05, "loss": 2.4831264495849608, "memory(GiB)": 77.56, "step": 30625, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.440154 }, { "epoch": 1.3122831069791354, "grad_norm": 6.058781147003174, "learning_rate": 8.394504055809944e-05, "loss": 2.5596569061279295, "memory(GiB)": 77.56, "step": 30630, "token_acc": 0.4517133956386293, "train_speed(iter/s)": 1.440196 }, { "epoch": 1.3124973223083845, "grad_norm": 5.254405498504639, "learning_rate": 8.394009904939695e-05, "loss": 2.311823844909668, "memory(GiB)": 77.56, "step": 30635, "token_acc": 0.47079037800687284, "train_speed(iter/s)": 1.440224 }, { "epoch": 1.3127115376376333, "grad_norm": 5.523976802825928, "learning_rate": 8.393515692583774e-05, "loss": 2.5997011184692385, "memory(GiB)": 77.56, "step": 30640, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.440205 }, { "epoch": 1.3129257529668823, "grad_norm": 6.062639236450195, "learning_rate": 8.393021418751132e-05, "loss": 2.713842582702637, "memory(GiB)": 77.56, "step": 30645, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.44027 }, { "epoch": 1.3131399682961313, "grad_norm": 4.327581405639648, "learning_rate": 8.392527083450723e-05, "loss": 2.296024513244629, "memory(GiB)": 77.56, "step": 30650, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.440255 }, { "epoch": 1.3133541836253801, "grad_norm": 4.298564434051514, "learning_rate": 8.392032686691504e-05, "loss": 2.3184110641479494, "memory(GiB)": 77.56, "step": 30655, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.440267 }, { "epoch": 1.3135683989546292, "grad_norm": 4.8839569091796875, "learning_rate": 8.391538228482432e-05, "loss": 2.8416332244873046, "memory(GiB)": 77.56, "step": 30660, "token_acc": 0.4457831325301205, "train_speed(iter/s)": 1.440236 }, { "epoch": 1.3137826142838782, "grad_norm": 5.313823223114014, "learning_rate": 8.391043708832463e-05, "loss": 2.5864084243774412, "memory(GiB)": 77.56, "step": 30665, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 1.440296 }, { "epoch": 1.313996829613127, "grad_norm": 6.0417914390563965, "learning_rate": 8.390549127750558e-05, "loss": 2.6308626174926757, "memory(GiB)": 77.56, "step": 30670, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.440309 }, { "epoch": 1.314211044942376, "grad_norm": 3.9929563999176025, "learning_rate": 8.390054485245672e-05, "loss": 2.1022972106933593, "memory(GiB)": 77.56, "step": 30675, "token_acc": 0.5310344827586206, "train_speed(iter/s)": 1.440302 }, { "epoch": 1.314425260271625, "grad_norm": 5.207853317260742, "learning_rate": 8.38955978132677e-05, "loss": 2.3052059173583985, "memory(GiB)": 77.56, "step": 30680, "token_acc": 0.5226480836236934, "train_speed(iter/s)": 1.440262 }, { "epoch": 1.314639475600874, "grad_norm": 5.290105819702148, "learning_rate": 8.389065016002812e-05, "loss": 2.326431655883789, "memory(GiB)": 77.56, "step": 30685, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 1.440246 }, { "epoch": 1.314853690930123, "grad_norm": 5.975013732910156, "learning_rate": 8.388570189282765e-05, "loss": 2.4550048828125, "memory(GiB)": 77.56, "step": 30690, "token_acc": 0.46096654275092935, "train_speed(iter/s)": 1.440226 }, { "epoch": 1.315067906259372, "grad_norm": 5.328584671020508, "learning_rate": 8.388075301175586e-05, "loss": 2.6362619400024414, "memory(GiB)": 77.56, "step": 30695, "token_acc": 0.4980237154150198, "train_speed(iter/s)": 1.440248 }, { "epoch": 1.3152821215886208, "grad_norm": 5.431981086730957, "learning_rate": 8.387580351690248e-05, "loss": 2.6616044998168946, "memory(GiB)": 77.56, "step": 30700, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.440251 }, { "epoch": 1.3154963369178698, "grad_norm": 6.600261688232422, "learning_rate": 8.387085340835713e-05, "loss": 2.4584869384765624, "memory(GiB)": 77.56, "step": 30705, "token_acc": 0.47794117647058826, "train_speed(iter/s)": 1.440293 }, { "epoch": 1.3157105522471189, "grad_norm": 4.675459384918213, "learning_rate": 8.38659026862095e-05, "loss": 2.3372697830200195, "memory(GiB)": 77.56, "step": 30710, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.440316 }, { "epoch": 1.3159247675763677, "grad_norm": 5.866064548492432, "learning_rate": 8.386095135054927e-05, "loss": 2.628317642211914, "memory(GiB)": 77.56, "step": 30715, "token_acc": 0.496, "train_speed(iter/s)": 1.440316 }, { "epoch": 1.3161389829056167, "grad_norm": 5.0868659019470215, "learning_rate": 8.385599940146613e-05, "loss": 2.426080513000488, "memory(GiB)": 77.56, "step": 30720, "token_acc": 0.5, "train_speed(iter/s)": 1.440264 }, { "epoch": 1.3163531982348657, "grad_norm": 4.216841220855713, "learning_rate": 8.385104683904983e-05, "loss": 2.5281440734863283, "memory(GiB)": 77.56, "step": 30725, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.440275 }, { "epoch": 1.3165674135641146, "grad_norm": 5.248867034912109, "learning_rate": 8.384609366339003e-05, "loss": 2.657305145263672, "memory(GiB)": 77.56, "step": 30730, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.440238 }, { "epoch": 1.3167816288933636, "grad_norm": 5.731428146362305, "learning_rate": 8.38411398745765e-05, "loss": 2.6855926513671875, "memory(GiB)": 77.56, "step": 30735, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.440248 }, { "epoch": 1.3169958442226126, "grad_norm": 5.230101108551025, "learning_rate": 8.383618547269901e-05, "loss": 2.4764209747314454, "memory(GiB)": 77.56, "step": 30740, "token_acc": 0.45907473309608543, "train_speed(iter/s)": 1.440261 }, { "epoch": 1.3172100595518614, "grad_norm": 3.928238868713379, "learning_rate": 8.383123045784724e-05, "loss": 2.663407897949219, "memory(GiB)": 77.56, "step": 30745, "token_acc": 0.4539249146757679, "train_speed(iter/s)": 1.440306 }, { "epoch": 1.3174242748811105, "grad_norm": 5.327571868896484, "learning_rate": 8.382627483011101e-05, "loss": 2.7467296600341795, "memory(GiB)": 77.56, "step": 30750, "token_acc": 0.4368932038834951, "train_speed(iter/s)": 1.440353 }, { "epoch": 1.3176384902103595, "grad_norm": 3.9081926345825195, "learning_rate": 8.382131858958009e-05, "loss": 2.473705291748047, "memory(GiB)": 77.56, "step": 30755, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.44033 }, { "epoch": 1.3178527055396083, "grad_norm": 4.930261135101318, "learning_rate": 8.381636173634425e-05, "loss": 2.481026840209961, "memory(GiB)": 77.56, "step": 30760, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.440364 }, { "epoch": 1.3180669208688574, "grad_norm": 4.128588676452637, "learning_rate": 8.381140427049328e-05, "loss": 2.537253570556641, "memory(GiB)": 77.56, "step": 30765, "token_acc": 0.4582043343653251, "train_speed(iter/s)": 1.440365 }, { "epoch": 1.3182811361981064, "grad_norm": 5.725658893585205, "learning_rate": 8.380644619211702e-05, "loss": 2.3736797332763673, "memory(GiB)": 77.56, "step": 30770, "token_acc": 0.5016949152542373, "train_speed(iter/s)": 1.440327 }, { "epoch": 1.3184953515273552, "grad_norm": 6.728057861328125, "learning_rate": 8.380148750130525e-05, "loss": 2.5051387786865233, "memory(GiB)": 77.56, "step": 30775, "token_acc": 0.4980694980694981, "train_speed(iter/s)": 1.44031 }, { "epoch": 1.3187095668566042, "grad_norm": 5.706851959228516, "learning_rate": 8.379652819814785e-05, "loss": 2.551733207702637, "memory(GiB)": 77.56, "step": 30780, "token_acc": 0.44694533762057875, "train_speed(iter/s)": 1.440297 }, { "epoch": 1.3189237821858533, "grad_norm": 5.804297924041748, "learning_rate": 8.379156828273463e-05, "loss": 2.484577941894531, "memory(GiB)": 77.56, "step": 30785, "token_acc": 0.46875, "train_speed(iter/s)": 1.440245 }, { "epoch": 1.319137997515102, "grad_norm": 5.132512092590332, "learning_rate": 8.378660775515546e-05, "loss": 2.730057144165039, "memory(GiB)": 77.56, "step": 30790, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.440298 }, { "epoch": 1.3193522128443511, "grad_norm": 5.32224702835083, "learning_rate": 8.378164661550019e-05, "loss": 2.4412097930908203, "memory(GiB)": 77.56, "step": 30795, "token_acc": 0.5126353790613718, "train_speed(iter/s)": 1.440278 }, { "epoch": 1.3195664281736001, "grad_norm": 6.233191967010498, "learning_rate": 8.37766848638587e-05, "loss": 2.685907173156738, "memory(GiB)": 77.56, "step": 30800, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.4403 }, { "epoch": 1.319780643502849, "grad_norm": 5.858996868133545, "learning_rate": 8.377172250032086e-05, "loss": 2.5612987518310546, "memory(GiB)": 77.56, "step": 30805, "token_acc": 0.4752475247524752, "train_speed(iter/s)": 1.440257 }, { "epoch": 1.319994858832098, "grad_norm": 5.193116188049316, "learning_rate": 8.376675952497661e-05, "loss": 2.216260528564453, "memory(GiB)": 77.56, "step": 30810, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 1.440288 }, { "epoch": 1.320209074161347, "grad_norm": 5.159361362457275, "learning_rate": 8.376179593791585e-05, "loss": 2.3703945159912108, "memory(GiB)": 77.56, "step": 30815, "token_acc": 0.4495798319327731, "train_speed(iter/s)": 1.440303 }, { "epoch": 1.3204232894905958, "grad_norm": 4.9706807136535645, "learning_rate": 8.375683173922846e-05, "loss": 2.440373992919922, "memory(GiB)": 77.56, "step": 30820, "token_acc": 0.5258064516129032, "train_speed(iter/s)": 1.440323 }, { "epoch": 1.3206375048198449, "grad_norm": 5.335484027862549, "learning_rate": 8.37518669290044e-05, "loss": 2.5746845245361327, "memory(GiB)": 77.56, "step": 30825, "token_acc": 0.4902597402597403, "train_speed(iter/s)": 1.440364 }, { "epoch": 1.320851720149094, "grad_norm": 5.414564609527588, "learning_rate": 8.374690150733362e-05, "loss": 2.8610759735107423, "memory(GiB)": 77.56, "step": 30830, "token_acc": 0.4088235294117647, "train_speed(iter/s)": 1.440363 }, { "epoch": 1.3210659354783427, "grad_norm": 5.210866451263428, "learning_rate": 8.374193547430605e-05, "loss": 2.3322175979614257, "memory(GiB)": 77.56, "step": 30835, "token_acc": 0.47794117647058826, "train_speed(iter/s)": 1.440411 }, { "epoch": 1.3212801508075918, "grad_norm": 4.217972755432129, "learning_rate": 8.373696883001166e-05, "loss": 2.41158447265625, "memory(GiB)": 77.56, "step": 30840, "token_acc": 0.4668769716088328, "train_speed(iter/s)": 1.440374 }, { "epoch": 1.3214943661368408, "grad_norm": 6.352040767669678, "learning_rate": 8.373200157454044e-05, "loss": 2.4626605987548826, "memory(GiB)": 77.56, "step": 30845, "token_acc": 0.5098684210526315, "train_speed(iter/s)": 1.440384 }, { "epoch": 1.3217085814660896, "grad_norm": 5.207592964172363, "learning_rate": 8.372703370798239e-05, "loss": 2.6127735137939454, "memory(GiB)": 77.56, "step": 30850, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.440425 }, { "epoch": 1.3219227967953386, "grad_norm": 5.286231994628906, "learning_rate": 8.372206523042746e-05, "loss": 2.353206825256348, "memory(GiB)": 77.56, "step": 30855, "token_acc": 0.511326860841424, "train_speed(iter/s)": 1.440439 }, { "epoch": 1.3221370121245877, "grad_norm": 6.365725040435791, "learning_rate": 8.371709614196568e-05, "loss": 2.554417037963867, "memory(GiB)": 77.56, "step": 30860, "token_acc": 0.494949494949495, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.3223512274538365, "grad_norm": 5.4799981117248535, "learning_rate": 8.371212644268709e-05, "loss": 2.3065364837646483, "memory(GiB)": 77.56, "step": 30865, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.440512 }, { "epoch": 1.3225654427830855, "grad_norm": 4.490663051605225, "learning_rate": 8.370715613268171e-05, "loss": 2.422890853881836, "memory(GiB)": 77.56, "step": 30870, "token_acc": 0.483271375464684, "train_speed(iter/s)": 1.440524 }, { "epoch": 1.3227796581123346, "grad_norm": 3.842900276184082, "learning_rate": 8.370218521203956e-05, "loss": 2.372048568725586, "memory(GiB)": 77.56, "step": 30875, "token_acc": 0.5080385852090032, "train_speed(iter/s)": 1.440565 }, { "epoch": 1.3229938734415834, "grad_norm": 5.78187894821167, "learning_rate": 8.369721368085072e-05, "loss": 2.3127971649169923, "memory(GiB)": 77.56, "step": 30880, "token_acc": 0.5251798561151079, "train_speed(iter/s)": 1.440593 }, { "epoch": 1.3232080887708324, "grad_norm": 5.000535488128662, "learning_rate": 8.369224153920523e-05, "loss": 2.654227066040039, "memory(GiB)": 77.56, "step": 30885, "token_acc": 0.44569288389513106, "train_speed(iter/s)": 1.440615 }, { "epoch": 1.3234223041000814, "grad_norm": 5.567465782165527, "learning_rate": 8.36872687871932e-05, "loss": 2.454271697998047, "memory(GiB)": 77.56, "step": 30890, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.440639 }, { "epoch": 1.3236365194293302, "grad_norm": 4.689681529998779, "learning_rate": 8.368229542490468e-05, "loss": 2.3774295806884767, "memory(GiB)": 77.56, "step": 30895, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.440637 }, { "epoch": 1.3238507347585793, "grad_norm": 5.641341209411621, "learning_rate": 8.36773214524298e-05, "loss": 2.691015434265137, "memory(GiB)": 77.56, "step": 30900, "token_acc": 0.4359861591695502, "train_speed(iter/s)": 1.44067 }, { "epoch": 1.3240649500878283, "grad_norm": 5.560390472412109, "learning_rate": 8.367234686985862e-05, "loss": 2.8286022186279296, "memory(GiB)": 77.56, "step": 30905, "token_acc": 0.4453125, "train_speed(iter/s)": 1.440639 }, { "epoch": 1.3242791654170771, "grad_norm": 4.860203742980957, "learning_rate": 8.366737167728132e-05, "loss": 2.6245336532592773, "memory(GiB)": 77.56, "step": 30910, "token_acc": 0.4384057971014493, "train_speed(iter/s)": 1.440635 }, { "epoch": 1.3244933807463262, "grad_norm": 4.59218692779541, "learning_rate": 8.366239587478797e-05, "loss": 2.4169998168945312, "memory(GiB)": 77.56, "step": 30915, "token_acc": 0.43641618497109824, "train_speed(iter/s)": 1.440628 }, { "epoch": 1.3247075960755752, "grad_norm": 4.9459757804870605, "learning_rate": 8.365741946246875e-05, "loss": 2.3696567535400392, "memory(GiB)": 77.56, "step": 30920, "token_acc": 0.4566929133858268, "train_speed(iter/s)": 1.440609 }, { "epoch": 1.324921811404824, "grad_norm": 6.135128974914551, "learning_rate": 8.365244244041382e-05, "loss": 2.5217370986938477, "memory(GiB)": 77.56, "step": 30925, "token_acc": 0.49523809523809526, "train_speed(iter/s)": 1.440619 }, { "epoch": 1.325136026734073, "grad_norm": 4.655352592468262, "learning_rate": 8.364746480871331e-05, "loss": 2.4324872970581053, "memory(GiB)": 77.56, "step": 30930, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.440619 }, { "epoch": 1.325350242063322, "grad_norm": 9.142215728759766, "learning_rate": 8.364248656745741e-05, "loss": 2.2758420944213866, "memory(GiB)": 77.56, "step": 30935, "token_acc": 0.49795918367346936, "train_speed(iter/s)": 1.440624 }, { "epoch": 1.325564457392571, "grad_norm": 4.344929218292236, "learning_rate": 8.36375077167363e-05, "loss": 2.3214689254760743, "memory(GiB)": 77.56, "step": 30940, "token_acc": 0.5257731958762887, "train_speed(iter/s)": 1.440613 }, { "epoch": 1.32577867272182, "grad_norm": 5.71087121963501, "learning_rate": 8.36325282566402e-05, "loss": 2.453940200805664, "memory(GiB)": 77.56, "step": 30945, "token_acc": 0.5186567164179104, "train_speed(iter/s)": 1.440659 }, { "epoch": 1.325992888051069, "grad_norm": 5.22177267074585, "learning_rate": 8.36275481872593e-05, "loss": 2.5214260101318358, "memory(GiB)": 77.56, "step": 30950, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.440644 }, { "epoch": 1.3262071033803178, "grad_norm": 4.714686870574951, "learning_rate": 8.362256750868382e-05, "loss": 2.5031557083129883, "memory(GiB)": 77.56, "step": 30955, "token_acc": 0.5050167224080268, "train_speed(iter/s)": 1.440648 }, { "epoch": 1.3264213187095668, "grad_norm": 3.841874361038208, "learning_rate": 8.361758622100399e-05, "loss": 2.238236999511719, "memory(GiB)": 77.56, "step": 30960, "token_acc": 0.5240793201133145, "train_speed(iter/s)": 1.440693 }, { "epoch": 1.3266355340388158, "grad_norm": 5.512101173400879, "learning_rate": 8.361260432431003e-05, "loss": 2.4659755706787108, "memory(GiB)": 77.56, "step": 30965, "token_acc": 0.47678018575851394, "train_speed(iter/s)": 1.440676 }, { "epoch": 1.3268497493680647, "grad_norm": 3.927079677581787, "learning_rate": 8.360762181869224e-05, "loss": 2.7166528701782227, "memory(GiB)": 77.56, "step": 30970, "token_acc": 0.47, "train_speed(iter/s)": 1.440705 }, { "epoch": 1.3270639646973137, "grad_norm": 4.834190845489502, "learning_rate": 8.360263870424083e-05, "loss": 2.770062255859375, "memory(GiB)": 77.56, "step": 30975, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.440701 }, { "epoch": 1.3272781800265627, "grad_norm": 4.216663360595703, "learning_rate": 8.359765498104612e-05, "loss": 2.662898635864258, "memory(GiB)": 77.56, "step": 30980, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.440663 }, { "epoch": 1.3274923953558115, "grad_norm": 5.50461483001709, "learning_rate": 8.359267064919836e-05, "loss": 2.662904167175293, "memory(GiB)": 77.56, "step": 30985, "token_acc": 0.48286604361370716, "train_speed(iter/s)": 1.440637 }, { "epoch": 1.3277066106850606, "grad_norm": 4.458168983459473, "learning_rate": 8.358768570878786e-05, "loss": 2.64044132232666, "memory(GiB)": 77.56, "step": 30990, "token_acc": 0.46546546546546547, "train_speed(iter/s)": 1.440681 }, { "epoch": 1.3279208260143096, "grad_norm": 4.690812110900879, "learning_rate": 8.358270015990492e-05, "loss": 2.4562482833862305, "memory(GiB)": 77.56, "step": 30995, "token_acc": 0.5111940298507462, "train_speed(iter/s)": 1.440666 }, { "epoch": 1.3281350413435584, "grad_norm": 7.401129722595215, "learning_rate": 8.357771400263988e-05, "loss": 2.774020767211914, "memory(GiB)": 77.56, "step": 31000, "token_acc": 0.4338235294117647, "train_speed(iter/s)": 1.4407 }, { "epoch": 1.3281350413435584, "eval_loss": 2.1634511947631836, "eval_runtime": 13.8855, "eval_samples_per_second": 7.202, "eval_steps_per_second": 7.202, "eval_token_acc": 0.48216833095577744, "step": 31000 }, { "epoch": 1.3283492566728075, "grad_norm": 6.538474082946777, "learning_rate": 8.357272723708303e-05, "loss": 2.881578826904297, "memory(GiB)": 77.56, "step": 31005, "token_acc": 0.4655674102812803, "train_speed(iter/s)": 1.439742 }, { "epoch": 1.3285634720020565, "grad_norm": 4.092084884643555, "learning_rate": 8.356773986332475e-05, "loss": 2.7259519577026365, "memory(GiB)": 77.56, "step": 31010, "token_acc": 0.4402332361516035, "train_speed(iter/s)": 1.439772 }, { "epoch": 1.3287776873313053, "grad_norm": 5.630678653717041, "learning_rate": 8.356275188145537e-05, "loss": 2.4747346878051757, "memory(GiB)": 77.56, "step": 31015, "token_acc": 0.4919093851132686, "train_speed(iter/s)": 1.439716 }, { "epoch": 1.3289919026605543, "grad_norm": 4.750728130340576, "learning_rate": 8.355776329156525e-05, "loss": 2.626193618774414, "memory(GiB)": 77.56, "step": 31020, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.439717 }, { "epoch": 1.3292061179898034, "grad_norm": 4.294435977935791, "learning_rate": 8.355277409374477e-05, "loss": 2.298265838623047, "memory(GiB)": 77.56, "step": 31025, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.439788 }, { "epoch": 1.3294203333190522, "grad_norm": 8.400869369506836, "learning_rate": 8.354778428808431e-05, "loss": 2.7615495681762696, "memory(GiB)": 77.56, "step": 31030, "token_acc": 0.44089456869009586, "train_speed(iter/s)": 1.439843 }, { "epoch": 1.3296345486483012, "grad_norm": 4.415702819824219, "learning_rate": 8.354279387467427e-05, "loss": 2.6714847564697264, "memory(GiB)": 77.56, "step": 31035, "token_acc": 0.44126074498567336, "train_speed(iter/s)": 1.439927 }, { "epoch": 1.3298487639775503, "grad_norm": 4.608646869659424, "learning_rate": 8.353780285360505e-05, "loss": 2.430695915222168, "memory(GiB)": 77.56, "step": 31040, "token_acc": 0.5103857566765578, "train_speed(iter/s)": 1.439964 }, { "epoch": 1.330062979306799, "grad_norm": 4.7237725257873535, "learning_rate": 8.353281122496708e-05, "loss": 2.490535926818848, "memory(GiB)": 77.56, "step": 31045, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.439924 }, { "epoch": 1.330277194636048, "grad_norm": 4.348401069641113, "learning_rate": 8.352781898885076e-05, "loss": 2.503744697570801, "memory(GiB)": 77.56, "step": 31050, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.4399 }, { "epoch": 1.3304914099652971, "grad_norm": 5.597479343414307, "learning_rate": 8.352282614534657e-05, "loss": 2.297476577758789, "memory(GiB)": 77.56, "step": 31055, "token_acc": 0.48068669527896996, "train_speed(iter/s)": 1.439909 }, { "epoch": 1.330705625294546, "grad_norm": 4.203425884246826, "learning_rate": 8.351783269454493e-05, "loss": 2.5553943634033205, "memory(GiB)": 77.56, "step": 31060, "token_acc": 0.46283783783783783, "train_speed(iter/s)": 1.439911 }, { "epoch": 1.330919840623795, "grad_norm": 5.708987712860107, "learning_rate": 8.35128386365363e-05, "loss": 2.604556083679199, "memory(GiB)": 77.56, "step": 31065, "token_acc": 0.5, "train_speed(iter/s)": 1.439913 }, { "epoch": 1.331134055953044, "grad_norm": 4.4014129638671875, "learning_rate": 8.350784397141116e-05, "loss": 2.885649871826172, "memory(GiB)": 77.56, "step": 31070, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.439968 }, { "epoch": 1.3313482712822928, "grad_norm": 6.514688014984131, "learning_rate": 8.350284869926e-05, "loss": 2.5280813217163085, "memory(GiB)": 77.56, "step": 31075, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.439979 }, { "epoch": 1.3315624866115419, "grad_norm": 5.240673542022705, "learning_rate": 8.349785282017332e-05, "loss": 2.4918291091918947, "memory(GiB)": 77.56, "step": 31080, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.439985 }, { "epoch": 1.331776701940791, "grad_norm": 4.627394199371338, "learning_rate": 8.34928563342416e-05, "loss": 2.419125556945801, "memory(GiB)": 77.56, "step": 31085, "token_acc": 0.5080906148867314, "train_speed(iter/s)": 1.440046 }, { "epoch": 1.3319909172700397, "grad_norm": 6.271434783935547, "learning_rate": 8.348785924155536e-05, "loss": 2.270029067993164, "memory(GiB)": 77.56, "step": 31090, "token_acc": 0.501577287066246, "train_speed(iter/s)": 1.440052 }, { "epoch": 1.3322051325992887, "grad_norm": 4.423927307128906, "learning_rate": 8.348286154220514e-05, "loss": 2.1954986572265627, "memory(GiB)": 77.56, "step": 31095, "token_acc": 0.5508196721311476, "train_speed(iter/s)": 1.440094 }, { "epoch": 1.3324193479285378, "grad_norm": 4.762854099273682, "learning_rate": 8.347786323628149e-05, "loss": 2.633818817138672, "memory(GiB)": 77.56, "step": 31100, "token_acc": 0.4805194805194805, "train_speed(iter/s)": 1.440139 }, { "epoch": 1.3326335632577866, "grad_norm": 4.1464619636535645, "learning_rate": 8.347286432387493e-05, "loss": 2.3386953353881834, "memory(GiB)": 77.56, "step": 31105, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.440128 }, { "epoch": 1.3328477785870356, "grad_norm": 4.485672473907471, "learning_rate": 8.346786480507603e-05, "loss": 2.4894718170166015, "memory(GiB)": 77.56, "step": 31110, "token_acc": 0.5014925373134328, "train_speed(iter/s)": 1.440113 }, { "epoch": 1.3330619939162847, "grad_norm": 3.9842441082000732, "learning_rate": 8.346286467997536e-05, "loss": 2.7668119430541993, "memory(GiB)": 77.56, "step": 31115, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.440154 }, { "epoch": 1.3332762092455335, "grad_norm": 4.314476013183594, "learning_rate": 8.345786394866352e-05, "loss": 2.3545547485351563, "memory(GiB)": 77.56, "step": 31120, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.440184 }, { "epoch": 1.3334904245747825, "grad_norm": 5.358580589294434, "learning_rate": 8.345286261123108e-05, "loss": 2.2670669555664062, "memory(GiB)": 77.56, "step": 31125, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.440219 }, { "epoch": 1.3337046399040315, "grad_norm": 4.57749080657959, "learning_rate": 8.344786066776865e-05, "loss": 2.4415740966796875, "memory(GiB)": 77.56, "step": 31130, "token_acc": 0.4697986577181208, "train_speed(iter/s)": 1.440265 }, { "epoch": 1.3339188552332806, "grad_norm": 4.659688949584961, "learning_rate": 8.344285811836686e-05, "loss": 2.528257942199707, "memory(GiB)": 77.56, "step": 31135, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.440296 }, { "epoch": 1.3341330705625294, "grad_norm": 4.4153008460998535, "learning_rate": 8.34378549631163e-05, "loss": 2.794023895263672, "memory(GiB)": 77.56, "step": 31140, "token_acc": 0.4129692832764505, "train_speed(iter/s)": 1.440228 }, { "epoch": 1.3343472858917784, "grad_norm": 5.269262790679932, "learning_rate": 8.343285120210764e-05, "loss": 2.438818168640137, "memory(GiB)": 77.56, "step": 31145, "token_acc": 0.48184818481848185, "train_speed(iter/s)": 1.440265 }, { "epoch": 1.3345615012210275, "grad_norm": 4.74795389175415, "learning_rate": 8.342784683543152e-05, "loss": 2.4837642669677735, "memory(GiB)": 77.56, "step": 31150, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.4403 }, { "epoch": 1.3347757165502763, "grad_norm": 7.848414897918701, "learning_rate": 8.342284186317862e-05, "loss": 2.4641300201416017, "memory(GiB)": 77.56, "step": 31155, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.440292 }, { "epoch": 1.3349899318795253, "grad_norm": 4.578596115112305, "learning_rate": 8.341783628543956e-05, "loss": 2.5964681625366213, "memory(GiB)": 77.56, "step": 31160, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.440277 }, { "epoch": 1.3352041472087743, "grad_norm": 5.1937174797058105, "learning_rate": 8.341283010230506e-05, "loss": 2.5030731201171874, "memory(GiB)": 77.56, "step": 31165, "token_acc": 0.4437299035369775, "train_speed(iter/s)": 1.44029 }, { "epoch": 1.3354183625380232, "grad_norm": 5.432137489318848, "learning_rate": 8.340782331386578e-05, "loss": 2.3957469940185545, "memory(GiB)": 77.56, "step": 31170, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.440284 }, { "epoch": 1.3356325778672722, "grad_norm": 5.077791690826416, "learning_rate": 8.340281592021246e-05, "loss": 2.755970001220703, "memory(GiB)": 77.56, "step": 31175, "token_acc": 0.415625, "train_speed(iter/s)": 1.440301 }, { "epoch": 1.3358467931965212, "grad_norm": 3.7372591495513916, "learning_rate": 8.339780792143578e-05, "loss": 2.623807907104492, "memory(GiB)": 77.56, "step": 31180, "token_acc": 0.47112462006079026, "train_speed(iter/s)": 1.44026 }, { "epoch": 1.33606100852577, "grad_norm": 6.986464977264404, "learning_rate": 8.33927993176265e-05, "loss": 2.3935569763183593, "memory(GiB)": 77.56, "step": 31185, "token_acc": 0.5020408163265306, "train_speed(iter/s)": 1.440326 }, { "epoch": 1.336275223855019, "grad_norm": 5.754143714904785, "learning_rate": 8.338779010887532e-05, "loss": 2.685280990600586, "memory(GiB)": 77.56, "step": 31190, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.440249 }, { "epoch": 1.336489439184268, "grad_norm": 5.6976752281188965, "learning_rate": 8.3382780295273e-05, "loss": 2.590597152709961, "memory(GiB)": 77.56, "step": 31195, "token_acc": 0.44329896907216493, "train_speed(iter/s)": 1.440265 }, { "epoch": 1.336703654513517, "grad_norm": 7.17396879196167, "learning_rate": 8.337776987691031e-05, "loss": 2.7046672821044924, "memory(GiB)": 77.56, "step": 31200, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.440273 }, { "epoch": 1.336917869842766, "grad_norm": 4.85002326965332, "learning_rate": 8.3372758853878e-05, "loss": 2.267042541503906, "memory(GiB)": 77.56, "step": 31205, "token_acc": 0.5138339920948617, "train_speed(iter/s)": 1.440339 }, { "epoch": 1.337132085172015, "grad_norm": 8.528756141662598, "learning_rate": 8.336774722626685e-05, "loss": 2.5631269454956054, "memory(GiB)": 77.56, "step": 31210, "token_acc": 0.453781512605042, "train_speed(iter/s)": 1.440348 }, { "epoch": 1.3373463005012638, "grad_norm": 4.431490898132324, "learning_rate": 8.336273499416767e-05, "loss": 2.7103233337402344, "memory(GiB)": 77.56, "step": 31215, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.440391 }, { "epoch": 1.3375605158305128, "grad_norm": 4.47111177444458, "learning_rate": 8.335772215767125e-05, "loss": 2.5284324645996095, "memory(GiB)": 77.56, "step": 31220, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.44047 }, { "epoch": 1.3377747311597619, "grad_norm": 5.26793098449707, "learning_rate": 8.33527087168684e-05, "loss": 2.1152149200439454, "memory(GiB)": 77.56, "step": 31225, "token_acc": 0.5303030303030303, "train_speed(iter/s)": 1.440465 }, { "epoch": 1.3379889464890107, "grad_norm": 3.017547845840454, "learning_rate": 8.334769467184992e-05, "loss": 2.404205894470215, "memory(GiB)": 77.56, "step": 31230, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.440497 }, { "epoch": 1.3382031618182597, "grad_norm": 4.037070274353027, "learning_rate": 8.334268002270671e-05, "loss": 2.6690139770507812, "memory(GiB)": 77.56, "step": 31235, "token_acc": 0.4783950617283951, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.3384173771475087, "grad_norm": 3.340653419494629, "learning_rate": 8.333766476952955e-05, "loss": 2.2853366851806642, "memory(GiB)": 77.56, "step": 31240, "token_acc": 0.5190311418685121, "train_speed(iter/s)": 1.440526 }, { "epoch": 1.3386315924767576, "grad_norm": 4.789028644561768, "learning_rate": 8.333264891240933e-05, "loss": 2.578736686706543, "memory(GiB)": 77.56, "step": 31245, "token_acc": 0.5, "train_speed(iter/s)": 1.440584 }, { "epoch": 1.3388458078060066, "grad_norm": 5.581756591796875, "learning_rate": 8.33276324514369e-05, "loss": 2.5892478942871096, "memory(GiB)": 77.56, "step": 31250, "token_acc": 0.47384615384615386, "train_speed(iter/s)": 1.4406 }, { "epoch": 1.3390600231352556, "grad_norm": 4.968892574310303, "learning_rate": 8.332261538670313e-05, "loss": 2.659806251525879, "memory(GiB)": 77.56, "step": 31255, "token_acc": 0.4740061162079511, "train_speed(iter/s)": 1.440609 }, { "epoch": 1.3392742384645044, "grad_norm": 5.407258033752441, "learning_rate": 8.331759771829893e-05, "loss": 2.676869201660156, "memory(GiB)": 77.56, "step": 31260, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.440628 }, { "epoch": 1.3394884537937535, "grad_norm": 4.9764533042907715, "learning_rate": 8.331257944631521e-05, "loss": 2.3965240478515626, "memory(GiB)": 77.56, "step": 31265, "token_acc": 0.4971751412429379, "train_speed(iter/s)": 1.440591 }, { "epoch": 1.3397026691230025, "grad_norm": 6.027521133422852, "learning_rate": 8.330756057084284e-05, "loss": 2.6725067138671874, "memory(GiB)": 77.56, "step": 31270, "token_acc": 0.4349442379182156, "train_speed(iter/s)": 1.440582 }, { "epoch": 1.3399168844522513, "grad_norm": 4.616183280944824, "learning_rate": 8.330254109197279e-05, "loss": 2.5812007904052736, "memory(GiB)": 77.56, "step": 31275, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.440636 }, { "epoch": 1.3401310997815004, "grad_norm": 4.795625686645508, "learning_rate": 8.329752100979595e-05, "loss": 2.7311847686767576, "memory(GiB)": 77.56, "step": 31280, "token_acc": 0.46229508196721314, "train_speed(iter/s)": 1.440696 }, { "epoch": 1.3403453151107494, "grad_norm": 4.979424476623535, "learning_rate": 8.329250032440329e-05, "loss": 2.5712642669677734, "memory(GiB)": 77.56, "step": 31285, "token_acc": 0.43853820598006643, "train_speed(iter/s)": 1.440716 }, { "epoch": 1.3405595304399982, "grad_norm": 5.80098295211792, "learning_rate": 8.328747903588575e-05, "loss": 2.408363914489746, "memory(GiB)": 77.56, "step": 31290, "token_acc": 0.4618181818181818, "train_speed(iter/s)": 1.440706 }, { "epoch": 1.3407737457692472, "grad_norm": 5.018841743469238, "learning_rate": 8.32824571443343e-05, "loss": 2.5181486129760744, "memory(GiB)": 77.56, "step": 31295, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.44077 }, { "epoch": 1.3409879610984963, "grad_norm": 4.644965648651123, "learning_rate": 8.327743464983992e-05, "loss": 2.5715700149536134, "memory(GiB)": 77.56, "step": 31300, "token_acc": 0.42507645259938837, "train_speed(iter/s)": 1.440799 }, { "epoch": 1.3412021764277453, "grad_norm": 6.5134782791137695, "learning_rate": 8.327241155249361e-05, "loss": 2.4259647369384765, "memory(GiB)": 77.56, "step": 31305, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.440825 }, { "epoch": 1.3414163917569941, "grad_norm": 4.758327007293701, "learning_rate": 8.326738785238633e-05, "loss": 2.514888381958008, "memory(GiB)": 77.56, "step": 31310, "token_acc": 0.480225988700565, "train_speed(iter/s)": 1.440911 }, { "epoch": 1.3416306070862432, "grad_norm": 3.7796146869659424, "learning_rate": 8.326236354960912e-05, "loss": 2.4515625, "memory(GiB)": 77.56, "step": 31315, "token_acc": 0.4713896457765668, "train_speed(iter/s)": 1.440893 }, { "epoch": 1.3418448224154922, "grad_norm": 4.210323810577393, "learning_rate": 8.325733864425299e-05, "loss": 2.2994077682495115, "memory(GiB)": 77.56, "step": 31320, "token_acc": 0.532051282051282, "train_speed(iter/s)": 1.440919 }, { "epoch": 1.342059037744741, "grad_norm": 5.029773235321045, "learning_rate": 8.325231313640898e-05, "loss": 2.553785705566406, "memory(GiB)": 77.56, "step": 31325, "token_acc": 0.47648902821316613, "train_speed(iter/s)": 1.440931 }, { "epoch": 1.34227325307399, "grad_norm": 4.7754902839660645, "learning_rate": 8.32472870261681e-05, "loss": 2.460570526123047, "memory(GiB)": 77.56, "step": 31330, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.440938 }, { "epoch": 1.342487468403239, "grad_norm": 5.723888874053955, "learning_rate": 8.324226031362146e-05, "loss": 2.6780044555664064, "memory(GiB)": 77.56, "step": 31335, "token_acc": 0.4558303886925795, "train_speed(iter/s)": 1.44097 }, { "epoch": 1.3427016837324879, "grad_norm": 4.395916938781738, "learning_rate": 8.323723299886008e-05, "loss": 2.5784963607788085, "memory(GiB)": 77.56, "step": 31340, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.441018 }, { "epoch": 1.342915899061737, "grad_norm": 4.826033592224121, "learning_rate": 8.323220508197505e-05, "loss": 2.4980697631835938, "memory(GiB)": 77.56, "step": 31345, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.441074 }, { "epoch": 1.343130114390986, "grad_norm": 4.233396530151367, "learning_rate": 8.322717656305744e-05, "loss": 2.4995664596557616, "memory(GiB)": 77.56, "step": 31350, "token_acc": 0.4764705882352941, "train_speed(iter/s)": 1.441102 }, { "epoch": 1.3433443297202348, "grad_norm": 5.334184646606445, "learning_rate": 8.322214744219837e-05, "loss": 2.3861522674560547, "memory(GiB)": 77.56, "step": 31355, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.441115 }, { "epoch": 1.3435585450494838, "grad_norm": 5.229465484619141, "learning_rate": 8.32171177194889e-05, "loss": 2.5869619369506838, "memory(GiB)": 77.56, "step": 31360, "token_acc": 0.4744525547445255, "train_speed(iter/s)": 1.441161 }, { "epoch": 1.3437727603787328, "grad_norm": 6.362223148345947, "learning_rate": 8.32120873950202e-05, "loss": 2.609721565246582, "memory(GiB)": 77.56, "step": 31365, "token_acc": 0.4608150470219436, "train_speed(iter/s)": 1.441183 }, { "epoch": 1.3439869757079816, "grad_norm": 4.201914310455322, "learning_rate": 8.320705646888341e-05, "loss": 2.675884819030762, "memory(GiB)": 77.56, "step": 31370, "token_acc": 0.41025641025641024, "train_speed(iter/s)": 1.441198 }, { "epoch": 1.3442011910372307, "grad_norm": 4.556771755218506, "learning_rate": 8.320202494116961e-05, "loss": 2.5075345993041993, "memory(GiB)": 77.56, "step": 31375, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.441211 }, { "epoch": 1.3444154063664797, "grad_norm": 6.09708833694458, "learning_rate": 8.319699281196999e-05, "loss": 2.4620147705078126, "memory(GiB)": 77.56, "step": 31380, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.441208 }, { "epoch": 1.3446296216957285, "grad_norm": 5.378604888916016, "learning_rate": 8.319196008137571e-05, "loss": 2.4251192092895506, "memory(GiB)": 77.56, "step": 31385, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.441224 }, { "epoch": 1.3448438370249776, "grad_norm": 5.7608819007873535, "learning_rate": 8.318692674947793e-05, "loss": 2.4822433471679686, "memory(GiB)": 77.56, "step": 31390, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.441251 }, { "epoch": 1.3450580523542266, "grad_norm": 6.302339553833008, "learning_rate": 8.318189281636784e-05, "loss": 2.61944522857666, "memory(GiB)": 77.56, "step": 31395, "token_acc": 0.46715328467153283, "train_speed(iter/s)": 1.441292 }, { "epoch": 1.3452722676834754, "grad_norm": 4.51309061050415, "learning_rate": 8.317685828213664e-05, "loss": 2.6544521331787108, "memory(GiB)": 77.56, "step": 31400, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.441346 }, { "epoch": 1.3454864830127244, "grad_norm": 4.59455680847168, "learning_rate": 8.317182314687551e-05, "loss": 2.6286258697509766, "memory(GiB)": 77.56, "step": 31405, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.441355 }, { "epoch": 1.3457006983419735, "grad_norm": 5.485098838806152, "learning_rate": 8.316678741067571e-05, "loss": 2.4383277893066406, "memory(GiB)": 77.56, "step": 31410, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.441402 }, { "epoch": 1.3459149136712223, "grad_norm": 5.019629955291748, "learning_rate": 8.316175107362842e-05, "loss": 2.1067447662353516, "memory(GiB)": 77.56, "step": 31415, "token_acc": 0.5359712230215827, "train_speed(iter/s)": 1.441425 }, { "epoch": 1.3461291290004713, "grad_norm": 5.156033039093018, "learning_rate": 8.315671413582493e-05, "loss": 2.608934211730957, "memory(GiB)": 77.56, "step": 31420, "token_acc": 0.4900662251655629, "train_speed(iter/s)": 1.44146 }, { "epoch": 1.3463433443297204, "grad_norm": 4.857155799865723, "learning_rate": 8.315167659735646e-05, "loss": 2.121651840209961, "memory(GiB)": 77.56, "step": 31425, "token_acc": 0.5, "train_speed(iter/s)": 1.441471 }, { "epoch": 1.3465575596589692, "grad_norm": 5.442330837249756, "learning_rate": 8.314663845831425e-05, "loss": 2.4756095886230467, "memory(GiB)": 77.56, "step": 31430, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.44148 }, { "epoch": 1.3467717749882182, "grad_norm": 4.46121072769165, "learning_rate": 8.314159971878958e-05, "loss": 2.612537956237793, "memory(GiB)": 77.56, "step": 31435, "token_acc": 0.475, "train_speed(iter/s)": 1.441459 }, { "epoch": 1.3469859903174672, "grad_norm": 4.219089984893799, "learning_rate": 8.313656037887377e-05, "loss": 2.809228706359863, "memory(GiB)": 77.56, "step": 31440, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.441452 }, { "epoch": 1.347200205646716, "grad_norm": 6.721188068389893, "learning_rate": 8.313152043865806e-05, "loss": 2.7048713684082033, "memory(GiB)": 77.56, "step": 31445, "token_acc": 0.46325878594249204, "train_speed(iter/s)": 1.441488 }, { "epoch": 1.347414420975965, "grad_norm": 7.163428783416748, "learning_rate": 8.312647989823379e-05, "loss": 2.533897590637207, "memory(GiB)": 77.56, "step": 31450, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.441524 }, { "epoch": 1.3476286363052141, "grad_norm": 4.878605842590332, "learning_rate": 8.312143875769227e-05, "loss": 2.642001152038574, "memory(GiB)": 77.56, "step": 31455, "token_acc": 0.420863309352518, "train_speed(iter/s)": 1.441565 }, { "epoch": 1.347842851634463, "grad_norm": 4.353795528411865, "learning_rate": 8.31163970171248e-05, "loss": 2.7180862426757812, "memory(GiB)": 77.56, "step": 31460, "token_acc": 0.43843843843843844, "train_speed(iter/s)": 1.441612 }, { "epoch": 1.348057066963712, "grad_norm": 5.946815490722656, "learning_rate": 8.311135467662275e-05, "loss": 2.5462614059448243, "memory(GiB)": 77.56, "step": 31465, "token_acc": 0.5080645161290323, "train_speed(iter/s)": 1.441666 }, { "epoch": 1.348271282292961, "grad_norm": 4.680395603179932, "learning_rate": 8.310631173627743e-05, "loss": 2.780935287475586, "memory(GiB)": 77.56, "step": 31470, "token_acc": 0.4493927125506073, "train_speed(iter/s)": 1.441695 }, { "epoch": 1.3484854976222098, "grad_norm": 4.769514083862305, "learning_rate": 8.310126819618023e-05, "loss": 2.515646743774414, "memory(GiB)": 77.56, "step": 31475, "token_acc": 0.4744525547445255, "train_speed(iter/s)": 1.441694 }, { "epoch": 1.3486997129514589, "grad_norm": 6.272392749786377, "learning_rate": 8.30962240564225e-05, "loss": 2.8053808212280273, "memory(GiB)": 77.56, "step": 31480, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.441701 }, { "epoch": 1.3489139282807079, "grad_norm": 5.002593517303467, "learning_rate": 8.309117931709563e-05, "loss": 2.5735185623168944, "memory(GiB)": 77.56, "step": 31485, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.441669 }, { "epoch": 1.3491281436099567, "grad_norm": 10.882744789123535, "learning_rate": 8.3086133978291e-05, "loss": 2.4281139373779297, "memory(GiB)": 77.56, "step": 31490, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.441608 }, { "epoch": 1.3493423589392057, "grad_norm": 5.653684616088867, "learning_rate": 8.308108804009999e-05, "loss": 2.6841964721679688, "memory(GiB)": 77.56, "step": 31495, "token_acc": 0.43125, "train_speed(iter/s)": 1.441571 }, { "epoch": 1.3495565742684548, "grad_norm": 5.4192399978637695, "learning_rate": 8.307604150261408e-05, "loss": 2.6143789291381836, "memory(GiB)": 77.56, "step": 31500, "token_acc": 0.46105919003115264, "train_speed(iter/s)": 1.441559 }, { "epoch": 1.3495565742684548, "eval_loss": 2.2718944549560547, "eval_runtime": 14.2635, "eval_samples_per_second": 7.011, "eval_steps_per_second": 7.011, "eval_token_acc": 0.453646477132262, "step": 31500 }, { "epoch": 1.3497707895977036, "grad_norm": 4.021576881408691, "learning_rate": 8.307099436592463e-05, "loss": 2.451487350463867, "memory(GiB)": 77.56, "step": 31505, "token_acc": 0.47302158273381295, "train_speed(iter/s)": 1.440563 }, { "epoch": 1.3499850049269526, "grad_norm": 4.678742408752441, "learning_rate": 8.306594663012308e-05, "loss": 2.7054092407226564, "memory(GiB)": 77.56, "step": 31510, "token_acc": 0.44554455445544555, "train_speed(iter/s)": 1.440572 }, { "epoch": 1.3501992202562016, "grad_norm": 5.106549263000488, "learning_rate": 8.306089829530092e-05, "loss": 2.810398483276367, "memory(GiB)": 77.56, "step": 31515, "token_acc": 0.4407894736842105, "train_speed(iter/s)": 1.440588 }, { "epoch": 1.3504134355854505, "grad_norm": 5.233591079711914, "learning_rate": 8.305584936154956e-05, "loss": 2.6204063415527346, "memory(GiB)": 77.56, "step": 31520, "token_acc": 0.42136498516320475, "train_speed(iter/s)": 1.440582 }, { "epoch": 1.3506276509146995, "grad_norm": 3.6382761001586914, "learning_rate": 8.305079982896047e-05, "loss": 2.325724792480469, "memory(GiB)": 77.56, "step": 31525, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.440537 }, { "epoch": 1.3508418662439485, "grad_norm": 6.607351779937744, "learning_rate": 8.304574969762515e-05, "loss": 2.378327178955078, "memory(GiB)": 77.56, "step": 31530, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.440544 }, { "epoch": 1.3510560815731973, "grad_norm": 4.294212818145752, "learning_rate": 8.304069896763506e-05, "loss": 2.536751556396484, "memory(GiB)": 77.56, "step": 31535, "token_acc": 0.4153846153846154, "train_speed(iter/s)": 1.44058 }, { "epoch": 1.3512702969024464, "grad_norm": 7.424275875091553, "learning_rate": 8.303564763908173e-05, "loss": 2.9064138412475584, "memory(GiB)": 77.56, "step": 31540, "token_acc": 0.43703703703703706, "train_speed(iter/s)": 1.440601 }, { "epoch": 1.3514845122316954, "grad_norm": 5.487633228302002, "learning_rate": 8.303059571205665e-05, "loss": 2.692577934265137, "memory(GiB)": 77.56, "step": 31545, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.440595 }, { "epoch": 1.3516987275609442, "grad_norm": 5.942775249481201, "learning_rate": 8.302554318665135e-05, "loss": 2.3590694427490235, "memory(GiB)": 77.56, "step": 31550, "token_acc": 0.5048231511254019, "train_speed(iter/s)": 1.440581 }, { "epoch": 1.3519129428901933, "grad_norm": 5.499608516693115, "learning_rate": 8.302049006295734e-05, "loss": 2.209274673461914, "memory(GiB)": 77.56, "step": 31555, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.440565 }, { "epoch": 1.3521271582194423, "grad_norm": 3.7635302543640137, "learning_rate": 8.301543634106617e-05, "loss": 2.4393642425537108, "memory(GiB)": 77.56, "step": 31560, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.440563 }, { "epoch": 1.352341373548691, "grad_norm": 4.146806240081787, "learning_rate": 8.301038202106942e-05, "loss": 2.742236518859863, "memory(GiB)": 77.56, "step": 31565, "token_acc": 0.4358288770053476, "train_speed(iter/s)": 1.440581 }, { "epoch": 1.3525555888779401, "grad_norm": 4.635458469390869, "learning_rate": 8.300532710305862e-05, "loss": 2.612195587158203, "memory(GiB)": 77.56, "step": 31570, "token_acc": 0.4483695652173913, "train_speed(iter/s)": 1.440554 }, { "epoch": 1.3527698042071892, "grad_norm": 5.260425567626953, "learning_rate": 8.300027158712537e-05, "loss": 2.4457496643066405, "memory(GiB)": 77.56, "step": 31575, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.44057 }, { "epoch": 1.352984019536438, "grad_norm": 4.975513458251953, "learning_rate": 8.299521547336123e-05, "loss": 2.519771766662598, "memory(GiB)": 77.56, "step": 31580, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.440547 }, { "epoch": 1.353198234865687, "grad_norm": 5.17201566696167, "learning_rate": 8.299015876185782e-05, "loss": 2.3570817947387694, "memory(GiB)": 77.56, "step": 31585, "token_acc": 0.5016393442622951, "train_speed(iter/s)": 1.440504 }, { "epoch": 1.353412450194936, "grad_norm": 5.8714213371276855, "learning_rate": 8.298510145270673e-05, "loss": 2.204788017272949, "memory(GiB)": 77.56, "step": 31590, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.440544 }, { "epoch": 1.3536266655241849, "grad_norm": 3.987436532974243, "learning_rate": 8.298004354599959e-05, "loss": 2.542467498779297, "memory(GiB)": 77.56, "step": 31595, "token_acc": 0.4551282051282051, "train_speed(iter/s)": 1.440516 }, { "epoch": 1.353840880853434, "grad_norm": 7.022082805633545, "learning_rate": 8.297498504182802e-05, "loss": 2.42038631439209, "memory(GiB)": 77.56, "step": 31600, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.440563 }, { "epoch": 1.354055096182683, "grad_norm": 3.850215196609497, "learning_rate": 8.296992594028366e-05, "loss": 2.5717395782470702, "memory(GiB)": 77.56, "step": 31605, "token_acc": 0.46449704142011833, "train_speed(iter/s)": 1.440531 }, { "epoch": 1.3542693115119318, "grad_norm": 6.579049110412598, "learning_rate": 8.296486624145819e-05, "loss": 2.7226057052612305, "memory(GiB)": 77.56, "step": 31610, "token_acc": 0.40059347181008903, "train_speed(iter/s)": 1.440532 }, { "epoch": 1.3544835268411808, "grad_norm": 4.3547139167785645, "learning_rate": 8.295980594544321e-05, "loss": 2.7584354400634767, "memory(GiB)": 77.56, "step": 31615, "token_acc": 0.414985590778098, "train_speed(iter/s)": 1.440592 }, { "epoch": 1.3546977421704298, "grad_norm": 6.598477363586426, "learning_rate": 8.295474505233042e-05, "loss": 2.7339330673217774, "memory(GiB)": 77.56, "step": 31620, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.440617 }, { "epoch": 1.3549119574996786, "grad_norm": 6.258572578430176, "learning_rate": 8.294968356221155e-05, "loss": 2.359242630004883, "memory(GiB)": 77.56, "step": 31625, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.440607 }, { "epoch": 1.3551261728289277, "grad_norm": 3.8502566814422607, "learning_rate": 8.294462147517823e-05, "loss": 2.6734107971191405, "memory(GiB)": 77.56, "step": 31630, "token_acc": 0.46112600536193027, "train_speed(iter/s)": 1.440638 }, { "epoch": 1.3553403881581767, "grad_norm": 4.248429775238037, "learning_rate": 8.293955879132219e-05, "loss": 2.321911430358887, "memory(GiB)": 77.56, "step": 31635, "token_acc": 0.5016949152542373, "train_speed(iter/s)": 1.440628 }, { "epoch": 1.3555546034874255, "grad_norm": 4.681156635284424, "learning_rate": 8.293449551073513e-05, "loss": 2.7754085540771483, "memory(GiB)": 77.56, "step": 31640, "token_acc": 0.4452296819787986, "train_speed(iter/s)": 1.440572 }, { "epoch": 1.3557688188166745, "grad_norm": 6.744724273681641, "learning_rate": 8.29294316335088e-05, "loss": 2.5335769653320312, "memory(GiB)": 77.56, "step": 31645, "token_acc": 0.4551282051282051, "train_speed(iter/s)": 1.440555 }, { "epoch": 1.3559830341459236, "grad_norm": 4.318502426147461, "learning_rate": 8.292436715973493e-05, "loss": 2.8196269989013674, "memory(GiB)": 77.56, "step": 31650, "token_acc": 0.4304635761589404, "train_speed(iter/s)": 1.440608 }, { "epoch": 1.3561972494751724, "grad_norm": 7.155418395996094, "learning_rate": 8.291930208950525e-05, "loss": 2.180579948425293, "memory(GiB)": 77.56, "step": 31655, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.440645 }, { "epoch": 1.3564114648044214, "grad_norm": 4.491590976715088, "learning_rate": 8.291423642291153e-05, "loss": 2.1374509811401365, "memory(GiB)": 77.56, "step": 31660, "token_acc": 0.5587044534412956, "train_speed(iter/s)": 1.440631 }, { "epoch": 1.3566256801336705, "grad_norm": 4.511011123657227, "learning_rate": 8.290917016004556e-05, "loss": 2.2907419204711914, "memory(GiB)": 77.56, "step": 31665, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.44062 }, { "epoch": 1.3568398954629193, "grad_norm": 7.064165115356445, "learning_rate": 8.29041033009991e-05, "loss": 2.507394027709961, "memory(GiB)": 77.56, "step": 31670, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.44066 }, { "epoch": 1.3570541107921683, "grad_norm": 6.257872581481934, "learning_rate": 8.289903584586394e-05, "loss": 2.676230621337891, "memory(GiB)": 77.56, "step": 31675, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.440599 }, { "epoch": 1.3572683261214173, "grad_norm": 4.7414422035217285, "learning_rate": 8.289396779473188e-05, "loss": 2.6043365478515623, "memory(GiB)": 77.56, "step": 31680, "token_acc": 0.48297213622291024, "train_speed(iter/s)": 1.440598 }, { "epoch": 1.3574825414506662, "grad_norm": 3.660566806793213, "learning_rate": 8.288889914769473e-05, "loss": 2.528450775146484, "memory(GiB)": 77.56, "step": 31685, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.440644 }, { "epoch": 1.3576967567799152, "grad_norm": 5.024524211883545, "learning_rate": 8.288382990484432e-05, "loss": 2.401573371887207, "memory(GiB)": 77.56, "step": 31690, "token_acc": 0.527027027027027, "train_speed(iter/s)": 1.440644 }, { "epoch": 1.3579109721091642, "grad_norm": 6.42276668548584, "learning_rate": 8.287876006627248e-05, "loss": 2.7882152557373048, "memory(GiB)": 77.56, "step": 31695, "token_acc": 0.4234527687296417, "train_speed(iter/s)": 1.440642 }, { "epoch": 1.358125187438413, "grad_norm": 6.427697658538818, "learning_rate": 8.287368963207107e-05, "loss": 2.370273399353027, "memory(GiB)": 77.56, "step": 31700, "token_acc": 0.47962382445141066, "train_speed(iter/s)": 1.440681 }, { "epoch": 1.358339402767662, "grad_norm": 5.093068599700928, "learning_rate": 8.286861860233194e-05, "loss": 3.053812789916992, "memory(GiB)": 77.56, "step": 31705, "token_acc": 0.4094955489614243, "train_speed(iter/s)": 1.440685 }, { "epoch": 1.358553618096911, "grad_norm": 4.350356101989746, "learning_rate": 8.286354697714695e-05, "loss": 2.5586700439453125, "memory(GiB)": 77.56, "step": 31710, "token_acc": 0.5302491103202847, "train_speed(iter/s)": 1.440624 }, { "epoch": 1.35876783342616, "grad_norm": 5.138138294219971, "learning_rate": 8.285847475660797e-05, "loss": 2.552874755859375, "memory(GiB)": 77.56, "step": 31715, "token_acc": 0.47183098591549294, "train_speed(iter/s)": 1.440609 }, { "epoch": 1.358982048755409, "grad_norm": 6.08490514755249, "learning_rate": 8.285340194080688e-05, "loss": 2.8570438385009767, "memory(GiB)": 77.56, "step": 31720, "token_acc": 0.41118421052631576, "train_speed(iter/s)": 1.440617 }, { "epoch": 1.359196264084658, "grad_norm": 4.018260478973389, "learning_rate": 8.284832852983562e-05, "loss": 2.285373497009277, "memory(GiB)": 77.56, "step": 31725, "token_acc": 0.5407725321888412, "train_speed(iter/s)": 1.440635 }, { "epoch": 1.3594104794139068, "grad_norm": 6.264819145202637, "learning_rate": 8.284325452378606e-05, "loss": 2.553983688354492, "memory(GiB)": 77.56, "step": 31730, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.440691 }, { "epoch": 1.3596246947431558, "grad_norm": 6.526613712310791, "learning_rate": 8.283817992275014e-05, "loss": 2.6119489669799805, "memory(GiB)": 77.56, "step": 31735, "token_acc": 0.4780058651026393, "train_speed(iter/s)": 1.440719 }, { "epoch": 1.3598389100724049, "grad_norm": 5.595158576965332, "learning_rate": 8.28331047268198e-05, "loss": 2.2213531494140626, "memory(GiB)": 77.56, "step": 31740, "token_acc": 0.5040322580645161, "train_speed(iter/s)": 1.440744 }, { "epoch": 1.3600531254016537, "grad_norm": 4.940277099609375, "learning_rate": 8.282802893608694e-05, "loss": 2.478802490234375, "memory(GiB)": 77.56, "step": 31745, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.440704 }, { "epoch": 1.3602673407309027, "grad_norm": 6.406253337860107, "learning_rate": 8.282295255064356e-05, "loss": 2.4300052642822267, "memory(GiB)": 77.56, "step": 31750, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.440751 }, { "epoch": 1.3604815560601518, "grad_norm": 4.3721208572387695, "learning_rate": 8.28178755705816e-05, "loss": 2.2829425811767576, "memory(GiB)": 77.56, "step": 31755, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.440701 }, { "epoch": 1.3606957713894006, "grad_norm": 4.818572521209717, "learning_rate": 8.281279799599303e-05, "loss": 2.547805404663086, "memory(GiB)": 77.56, "step": 31760, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.440724 }, { "epoch": 1.3609099867186496, "grad_norm": 5.698460578918457, "learning_rate": 8.280771982696985e-05, "loss": 2.5330101013183595, "memory(GiB)": 77.56, "step": 31765, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.440696 }, { "epoch": 1.3611242020478986, "grad_norm": 4.337852478027344, "learning_rate": 8.280264106360405e-05, "loss": 2.480868911743164, "memory(GiB)": 77.56, "step": 31770, "token_acc": 0.4621212121212121, "train_speed(iter/s)": 1.440718 }, { "epoch": 1.3613384173771474, "grad_norm": 6.176379203796387, "learning_rate": 8.279756170598764e-05, "loss": 2.332061195373535, "memory(GiB)": 77.56, "step": 31775, "token_acc": 0.5284810126582279, "train_speed(iter/s)": 1.440725 }, { "epoch": 1.3615526327063965, "grad_norm": 7.281599521636963, "learning_rate": 8.279248175421264e-05, "loss": 2.780141067504883, "memory(GiB)": 77.56, "step": 31780, "token_acc": 0.4119496855345912, "train_speed(iter/s)": 1.440735 }, { "epoch": 1.3617668480356455, "grad_norm": 5.348024845123291, "learning_rate": 8.278740120837106e-05, "loss": 2.6644657135009764, "memory(GiB)": 77.56, "step": 31785, "token_acc": 0.43322475570032576, "train_speed(iter/s)": 1.440758 }, { "epoch": 1.3619810633648943, "grad_norm": 5.615641117095947, "learning_rate": 8.278232006855495e-05, "loss": 2.578746223449707, "memory(GiB)": 77.56, "step": 31790, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.440798 }, { "epoch": 1.3621952786941434, "grad_norm": 4.55649471282959, "learning_rate": 8.277723833485635e-05, "loss": 2.334025764465332, "memory(GiB)": 77.56, "step": 31795, "token_acc": 0.47346938775510206, "train_speed(iter/s)": 1.440785 }, { "epoch": 1.3624094940233924, "grad_norm": 5.463602066040039, "learning_rate": 8.277215600736734e-05, "loss": 2.733611488342285, "memory(GiB)": 77.56, "step": 31800, "token_acc": 0.4290657439446367, "train_speed(iter/s)": 1.440815 }, { "epoch": 1.3626237093526412, "grad_norm": 5.173390865325928, "learning_rate": 8.276707308617999e-05, "loss": 2.8185863494873047, "memory(GiB)": 77.56, "step": 31805, "token_acc": 0.4429530201342282, "train_speed(iter/s)": 1.440855 }, { "epoch": 1.3628379246818902, "grad_norm": 3.8689751625061035, "learning_rate": 8.276198957138636e-05, "loss": 2.4822025299072266, "memory(GiB)": 77.56, "step": 31810, "token_acc": 0.43006993006993005, "train_speed(iter/s)": 1.440893 }, { "epoch": 1.3630521400111393, "grad_norm": 5.2507405281066895, "learning_rate": 8.275690546307854e-05, "loss": 2.4450321197509766, "memory(GiB)": 77.56, "step": 31815, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.440902 }, { "epoch": 1.363266355340388, "grad_norm": 5.465854644775391, "learning_rate": 8.275182076134869e-05, "loss": 2.6255542755126955, "memory(GiB)": 77.56, "step": 31820, "token_acc": 0.43258426966292135, "train_speed(iter/s)": 1.44095 }, { "epoch": 1.3634805706696371, "grad_norm": 8.459576606750488, "learning_rate": 8.274673546628886e-05, "loss": 2.4949108123779298, "memory(GiB)": 77.56, "step": 31825, "token_acc": 0.4952978056426332, "train_speed(iter/s)": 1.440972 }, { "epoch": 1.3636947859988862, "grad_norm": 5.377734661102295, "learning_rate": 8.27416495779912e-05, "loss": 2.4399099349975586, "memory(GiB)": 77.56, "step": 31830, "token_acc": 0.43812709030100333, "train_speed(iter/s)": 1.440939 }, { "epoch": 1.363909001328135, "grad_norm": 5.124488353729248, "learning_rate": 8.273656309654785e-05, "loss": 2.7032752990722657, "memory(GiB)": 77.56, "step": 31835, "token_acc": 0.47112462006079026, "train_speed(iter/s)": 1.440876 }, { "epoch": 1.364123216657384, "grad_norm": 6.172043323516846, "learning_rate": 8.273147602205094e-05, "loss": 2.3730249404907227, "memory(GiB)": 77.56, "step": 31840, "token_acc": 0.5089285714285714, "train_speed(iter/s)": 1.440928 }, { "epoch": 1.364337431986633, "grad_norm": 4.208598613739014, "learning_rate": 8.272638835459263e-05, "loss": 2.2262176513671874, "memory(GiB)": 77.56, "step": 31845, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440898 }, { "epoch": 1.3645516473158819, "grad_norm": 4.994443893432617, "learning_rate": 8.272130009426511e-05, "loss": 2.438750457763672, "memory(GiB)": 77.56, "step": 31850, "token_acc": 0.4591439688715953, "train_speed(iter/s)": 1.440904 }, { "epoch": 1.364765862645131, "grad_norm": 4.585249423980713, "learning_rate": 8.271621124116055e-05, "loss": 2.61342830657959, "memory(GiB)": 77.56, "step": 31855, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.440854 }, { "epoch": 1.36498007797438, "grad_norm": 6.497950077056885, "learning_rate": 8.271112179537113e-05, "loss": 2.631338119506836, "memory(GiB)": 77.56, "step": 31860, "token_acc": 0.4577777777777778, "train_speed(iter/s)": 1.440831 }, { "epoch": 1.3651942933036287, "grad_norm": 4.362639427185059, "learning_rate": 8.270603175698904e-05, "loss": 2.476442337036133, "memory(GiB)": 77.56, "step": 31865, "token_acc": 0.5, "train_speed(iter/s)": 1.440769 }, { "epoch": 1.3654085086328778, "grad_norm": 4.356795787811279, "learning_rate": 8.270094112610651e-05, "loss": 2.068539047241211, "memory(GiB)": 77.56, "step": 31870, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.440764 }, { "epoch": 1.3656227239621268, "grad_norm": 3.883218288421631, "learning_rate": 8.269584990281575e-05, "loss": 2.4610416412353517, "memory(GiB)": 77.56, "step": 31875, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.440735 }, { "epoch": 1.3658369392913756, "grad_norm": 4.142423152923584, "learning_rate": 8.269075808720901e-05, "loss": 2.468758964538574, "memory(GiB)": 77.56, "step": 31880, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.440681 }, { "epoch": 1.3660511546206247, "grad_norm": 5.313052654266357, "learning_rate": 8.268566567937851e-05, "loss": 2.5808610916137695, "memory(GiB)": 77.56, "step": 31885, "token_acc": 0.49337748344370863, "train_speed(iter/s)": 1.440758 }, { "epoch": 1.3662653699498737, "grad_norm": 4.263774394989014, "learning_rate": 8.268057267941652e-05, "loss": 2.6729047775268553, "memory(GiB)": 77.56, "step": 31890, "token_acc": 0.4834710743801653, "train_speed(iter/s)": 1.440725 }, { "epoch": 1.3664795852791225, "grad_norm": 4.951992034912109, "learning_rate": 8.267547908741529e-05, "loss": 2.513370704650879, "memory(GiB)": 77.56, "step": 31895, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.440762 }, { "epoch": 1.3666938006083715, "grad_norm": 6.280810356140137, "learning_rate": 8.267038490346712e-05, "loss": 2.89864501953125, "memory(GiB)": 77.56, "step": 31900, "token_acc": 0.3862815884476534, "train_speed(iter/s)": 1.440807 }, { "epoch": 1.3669080159376206, "grad_norm": 6.688979625701904, "learning_rate": 8.266529012766428e-05, "loss": 2.2513904571533203, "memory(GiB)": 77.56, "step": 31905, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.440804 }, { "epoch": 1.3671222312668694, "grad_norm": 4.840042591094971, "learning_rate": 8.266019476009905e-05, "loss": 2.280426788330078, "memory(GiB)": 77.56, "step": 31910, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.440755 }, { "epoch": 1.3673364465961184, "grad_norm": 4.748971462249756, "learning_rate": 8.265509880086376e-05, "loss": 2.5299015045166016, "memory(GiB)": 77.56, "step": 31915, "token_acc": 0.4527687296416938, "train_speed(iter/s)": 1.440802 }, { "epoch": 1.3675506619253675, "grad_norm": 3.980222225189209, "learning_rate": 8.265000225005073e-05, "loss": 2.3797998428344727, "memory(GiB)": 77.56, "step": 31920, "token_acc": 0.5074183976261127, "train_speed(iter/s)": 1.440833 }, { "epoch": 1.3677648772546163, "grad_norm": 3.3712213039398193, "learning_rate": 8.264490510775227e-05, "loss": 2.6415313720703124, "memory(GiB)": 77.56, "step": 31925, "token_acc": 0.4474327628361858, "train_speed(iter/s)": 1.440814 }, { "epoch": 1.3679790925838653, "grad_norm": 5.953074932098389, "learning_rate": 8.263980737406073e-05, "loss": 2.5097558975219725, "memory(GiB)": 77.56, "step": 31930, "token_acc": 0.4517133956386293, "train_speed(iter/s)": 1.440751 }, { "epoch": 1.3681933079131143, "grad_norm": 5.452756881713867, "learning_rate": 8.263470904906849e-05, "loss": 2.7983936309814452, "memory(GiB)": 77.56, "step": 31935, "token_acc": 0.4458204334365325, "train_speed(iter/s)": 1.440788 }, { "epoch": 1.3684075232423631, "grad_norm": 4.6167802810668945, "learning_rate": 8.262961013286785e-05, "loss": 2.6030508041381837, "memory(GiB)": 77.56, "step": 31940, "token_acc": 0.45132743362831856, "train_speed(iter/s)": 1.440785 }, { "epoch": 1.3686217385716122, "grad_norm": 4.188957691192627, "learning_rate": 8.262451062555123e-05, "loss": 2.559660720825195, "memory(GiB)": 77.56, "step": 31945, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.440781 }, { "epoch": 1.3688359539008612, "grad_norm": 6.492400646209717, "learning_rate": 8.261941052721098e-05, "loss": 2.6256488800048827, "memory(GiB)": 77.56, "step": 31950, "token_acc": 0.4956268221574344, "train_speed(iter/s)": 1.440781 }, { "epoch": 1.36905016923011, "grad_norm": 5.4128899574279785, "learning_rate": 8.261430983793952e-05, "loss": 2.4580759048461913, "memory(GiB)": 77.56, "step": 31955, "token_acc": 0.49185667752442996, "train_speed(iter/s)": 1.440833 }, { "epoch": 1.369264384559359, "grad_norm": 4.393882751464844, "learning_rate": 8.260920855782924e-05, "loss": 2.257247543334961, "memory(GiB)": 77.56, "step": 31960, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.440872 }, { "epoch": 1.369478599888608, "grad_norm": 5.233603477478027, "learning_rate": 8.260410668697255e-05, "loss": 2.361859893798828, "memory(GiB)": 77.56, "step": 31965, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.440922 }, { "epoch": 1.369692815217857, "grad_norm": 5.287154197692871, "learning_rate": 8.25990042254619e-05, "loss": 2.5765869140625, "memory(GiB)": 77.56, "step": 31970, "token_acc": 0.46096654275092935, "train_speed(iter/s)": 1.440892 }, { "epoch": 1.369907030547106, "grad_norm": 5.994129657745361, "learning_rate": 8.25939011733897e-05, "loss": 2.6727779388427733, "memory(GiB)": 77.56, "step": 31975, "token_acc": 0.467680608365019, "train_speed(iter/s)": 1.440849 }, { "epoch": 1.370121245876355, "grad_norm": 4.072033405303955, "learning_rate": 8.25887975308484e-05, "loss": 2.5022659301757812, "memory(GiB)": 77.56, "step": 31980, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.4408 }, { "epoch": 1.3703354612056038, "grad_norm": 4.850002288818359, "learning_rate": 8.258369329793046e-05, "loss": 2.3202320098876954, "memory(GiB)": 77.56, "step": 31985, "token_acc": 0.5427350427350427, "train_speed(iter/s)": 1.440843 }, { "epoch": 1.3705496765348528, "grad_norm": 9.01506233215332, "learning_rate": 8.257858847472836e-05, "loss": 2.5688522338867186, "memory(GiB)": 77.56, "step": 31990, "token_acc": 0.4628975265017668, "train_speed(iter/s)": 1.440856 }, { "epoch": 1.3707638918641019, "grad_norm": 5.303791046142578, "learning_rate": 8.257348306133457e-05, "loss": 1.904374885559082, "memory(GiB)": 77.56, "step": 31995, "token_acc": 0.5689655172413793, "train_speed(iter/s)": 1.440887 }, { "epoch": 1.3709781071933507, "grad_norm": 10.566582679748535, "learning_rate": 8.256837705784157e-05, "loss": 3.1371002197265625, "memory(GiB)": 77.56, "step": 32000, "token_acc": 0.44140625, "train_speed(iter/s)": 1.440863 }, { "epoch": 1.3709781071933507, "eval_loss": 2.3098573684692383, "eval_runtime": 14.5627, "eval_samples_per_second": 6.867, "eval_steps_per_second": 6.867, "eval_token_acc": 0.4304556354916067, "step": 32000 }, { "epoch": 1.3711923225225997, "grad_norm": 5.158335208892822, "learning_rate": 8.256327046434187e-05, "loss": 2.698360633850098, "memory(GiB)": 77.56, "step": 32005, "token_acc": 0.43577981651376146, "train_speed(iter/s)": 1.439853 }, { "epoch": 1.3714065378518487, "grad_norm": 5.739282131195068, "learning_rate": 8.255816328092798e-05, "loss": 2.6402584075927735, "memory(GiB)": 77.56, "step": 32010, "token_acc": 0.4405594405594406, "train_speed(iter/s)": 1.439903 }, { "epoch": 1.3716207531810976, "grad_norm": 6.183213710784912, "learning_rate": 8.255305550769242e-05, "loss": 2.5954963684082033, "memory(GiB)": 77.56, "step": 32015, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.439934 }, { "epoch": 1.3718349685103466, "grad_norm": 5.9565110206604, "learning_rate": 8.254794714472771e-05, "loss": 2.6600975036621093, "memory(GiB)": 77.56, "step": 32020, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.43991 }, { "epoch": 1.3720491838395956, "grad_norm": 6.393780708312988, "learning_rate": 8.254283819212641e-05, "loss": 2.565863037109375, "memory(GiB)": 77.56, "step": 32025, "token_acc": 0.436, "train_speed(iter/s)": 1.439965 }, { "epoch": 1.3722633991688444, "grad_norm": 4.991652488708496, "learning_rate": 8.253772864998108e-05, "loss": 2.6734058380126955, "memory(GiB)": 77.56, "step": 32030, "token_acc": 0.42342342342342343, "train_speed(iter/s)": 1.439969 }, { "epoch": 1.3724776144980935, "grad_norm": 5.401498317718506, "learning_rate": 8.253261851838426e-05, "loss": 2.618927764892578, "memory(GiB)": 77.56, "step": 32035, "token_acc": 0.40797546012269936, "train_speed(iter/s)": 1.439938 }, { "epoch": 1.3726918298273425, "grad_norm": 5.256174087524414, "learning_rate": 8.252750779742855e-05, "loss": 2.5999101638793944, "memory(GiB)": 77.56, "step": 32040, "token_acc": 0.48466257668711654, "train_speed(iter/s)": 1.439991 }, { "epoch": 1.3729060451565913, "grad_norm": 4.309775352478027, "learning_rate": 8.252239648720652e-05, "loss": 2.523781585693359, "memory(GiB)": 77.56, "step": 32045, "token_acc": 0.45821325648414984, "train_speed(iter/s)": 1.440057 }, { "epoch": 1.3731202604858403, "grad_norm": 5.225777626037598, "learning_rate": 8.251728458781077e-05, "loss": 2.583746910095215, "memory(GiB)": 77.56, "step": 32050, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.44008 }, { "epoch": 1.3733344758150894, "grad_norm": 5.951074123382568, "learning_rate": 8.25121720993339e-05, "loss": 2.4495475769042967, "memory(GiB)": 77.56, "step": 32055, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.440102 }, { "epoch": 1.3735486911443382, "grad_norm": 5.441335201263428, "learning_rate": 8.250705902186853e-05, "loss": 2.4885532379150392, "memory(GiB)": 77.56, "step": 32060, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.440132 }, { "epoch": 1.3737629064735872, "grad_norm": 4.366138458251953, "learning_rate": 8.250194535550731e-05, "loss": 2.512118911743164, "memory(GiB)": 77.56, "step": 32065, "token_acc": 0.4735099337748344, "train_speed(iter/s)": 1.440163 }, { "epoch": 1.3739771218028363, "grad_norm": 3.9534366130828857, "learning_rate": 8.249683110034283e-05, "loss": 2.3270990371704103, "memory(GiB)": 77.56, "step": 32070, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.440148 }, { "epoch": 1.374191337132085, "grad_norm": 3.901322364807129, "learning_rate": 8.249171625646779e-05, "loss": 2.7472261428833007, "memory(GiB)": 77.56, "step": 32075, "token_acc": 0.4126984126984127, "train_speed(iter/s)": 1.440082 }, { "epoch": 1.3744055524613341, "grad_norm": 4.305636882781982, "learning_rate": 8.248660082397484e-05, "loss": 2.2574813842773436, "memory(GiB)": 77.56, "step": 32080, "token_acc": 0.5, "train_speed(iter/s)": 1.440114 }, { "epoch": 1.3746197677905831, "grad_norm": 4.8445515632629395, "learning_rate": 8.248148480295662e-05, "loss": 2.6581447601318358, "memory(GiB)": 77.56, "step": 32085, "token_acc": 0.45317220543806647, "train_speed(iter/s)": 1.440171 }, { "epoch": 1.374833983119832, "grad_norm": 4.6322760581970215, "learning_rate": 8.247636819350584e-05, "loss": 2.5092159271240235, "memory(GiB)": 77.56, "step": 32090, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.440181 }, { "epoch": 1.375048198449081, "grad_norm": 4.44414758682251, "learning_rate": 8.24712509957152e-05, "loss": 2.5538555145263673, "memory(GiB)": 77.56, "step": 32095, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 1.440196 }, { "epoch": 1.37526241377833, "grad_norm": 5.26290225982666, "learning_rate": 8.246613320967737e-05, "loss": 2.5447097778320313, "memory(GiB)": 77.56, "step": 32100, "token_acc": 0.45180722891566266, "train_speed(iter/s)": 1.440247 }, { "epoch": 1.3754766291075788, "grad_norm": 6.584343910217285, "learning_rate": 8.246101483548508e-05, "loss": 2.485315704345703, "memory(GiB)": 77.56, "step": 32105, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.440302 }, { "epoch": 1.3756908444368279, "grad_norm": 5.275211811065674, "learning_rate": 8.245589587323106e-05, "loss": 2.497344970703125, "memory(GiB)": 77.56, "step": 32110, "token_acc": 0.42641509433962266, "train_speed(iter/s)": 1.440355 }, { "epoch": 1.375905059766077, "grad_norm": 4.924277305603027, "learning_rate": 8.245077632300803e-05, "loss": 2.176496124267578, "memory(GiB)": 77.56, "step": 32115, "token_acc": 0.4813664596273292, "train_speed(iter/s)": 1.440368 }, { "epoch": 1.3761192750953257, "grad_norm": 4.762798309326172, "learning_rate": 8.244565618490876e-05, "loss": 2.8513729095458986, "memory(GiB)": 77.56, "step": 32120, "token_acc": 0.4355300859598854, "train_speed(iter/s)": 1.440389 }, { "epoch": 1.3763334904245748, "grad_norm": 4.052890777587891, "learning_rate": 8.2440535459026e-05, "loss": 2.328522872924805, "memory(GiB)": 77.56, "step": 32125, "token_acc": 0.5115511551155115, "train_speed(iter/s)": 1.440352 }, { "epoch": 1.3765477057538238, "grad_norm": 3.617204189300537, "learning_rate": 8.24354141454525e-05, "loss": 3.036127471923828, "memory(GiB)": 77.56, "step": 32130, "token_acc": 0.4251497005988024, "train_speed(iter/s)": 1.440358 }, { "epoch": 1.3767619210830726, "grad_norm": 4.177921295166016, "learning_rate": 8.243029224428104e-05, "loss": 2.887272262573242, "memory(GiB)": 77.56, "step": 32135, "token_acc": 0.44722222222222224, "train_speed(iter/s)": 1.440365 }, { "epoch": 1.3769761364123216, "grad_norm": 5.442276477813721, "learning_rate": 8.242516975560441e-05, "loss": 2.3198795318603516, "memory(GiB)": 77.56, "step": 32140, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.440331 }, { "epoch": 1.3771903517415707, "grad_norm": 5.068624496459961, "learning_rate": 8.242004667951542e-05, "loss": 2.5350936889648437, "memory(GiB)": 77.56, "step": 32145, "token_acc": 0.4560810810810811, "train_speed(iter/s)": 1.440311 }, { "epoch": 1.3774045670708195, "grad_norm": 4.242217540740967, "learning_rate": 8.241492301610688e-05, "loss": 2.698986053466797, "memory(GiB)": 77.56, "step": 32150, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.440327 }, { "epoch": 1.3776187824000685, "grad_norm": 4.06760835647583, "learning_rate": 8.240979876547159e-05, "loss": 2.568186378479004, "memory(GiB)": 77.56, "step": 32155, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.440355 }, { "epoch": 1.3778329977293176, "grad_norm": 4.250998020172119, "learning_rate": 8.240467392770239e-05, "loss": 2.386155700683594, "memory(GiB)": 77.56, "step": 32160, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.440397 }, { "epoch": 1.3780472130585664, "grad_norm": 5.284581661224365, "learning_rate": 8.239954850289215e-05, "loss": 2.4334136962890627, "memory(GiB)": 77.56, "step": 32165, "token_acc": 0.4323308270676692, "train_speed(iter/s)": 1.440392 }, { "epoch": 1.3782614283878154, "grad_norm": 3.9874377250671387, "learning_rate": 8.239442249113366e-05, "loss": 2.5073711395263674, "memory(GiB)": 77.56, "step": 32170, "token_acc": 0.45569620253164556, "train_speed(iter/s)": 1.440387 }, { "epoch": 1.3784756437170644, "grad_norm": 4.134307384490967, "learning_rate": 8.238929589251984e-05, "loss": 2.5441864013671873, "memory(GiB)": 77.56, "step": 32175, "token_acc": 0.4713375796178344, "train_speed(iter/s)": 1.440449 }, { "epoch": 1.3786898590463132, "grad_norm": 4.906061172485352, "learning_rate": 8.238416870714354e-05, "loss": 2.6196819305419923, "memory(GiB)": 77.56, "step": 32180, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.440486 }, { "epoch": 1.3789040743755623, "grad_norm": 4.560486316680908, "learning_rate": 8.237904093509763e-05, "loss": 2.4726566314697265, "memory(GiB)": 77.56, "step": 32185, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.3791182897048113, "grad_norm": 3.7004616260528564, "learning_rate": 8.237391257647503e-05, "loss": 2.184162139892578, "memory(GiB)": 77.56, "step": 32190, "token_acc": 0.5254777070063694, "train_speed(iter/s)": 1.440515 }, { "epoch": 1.3793325050340601, "grad_norm": 4.5986127853393555, "learning_rate": 8.236878363136864e-05, "loss": 2.652243423461914, "memory(GiB)": 77.56, "step": 32195, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.440554 }, { "epoch": 1.3795467203633092, "grad_norm": 5.741354465484619, "learning_rate": 8.236365409987136e-05, "loss": 2.4384607315063476, "memory(GiB)": 77.56, "step": 32200, "token_acc": 0.45849802371541504, "train_speed(iter/s)": 1.440583 }, { "epoch": 1.3797609356925582, "grad_norm": 4.578446865081787, "learning_rate": 8.235852398207613e-05, "loss": 2.2881650924682617, "memory(GiB)": 77.56, "step": 32205, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.440618 }, { "epoch": 1.379975151021807, "grad_norm": 4.787532329559326, "learning_rate": 8.235339327807588e-05, "loss": 2.7093544006347656, "memory(GiB)": 77.56, "step": 32210, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440651 }, { "epoch": 1.380189366351056, "grad_norm": 10.675076484680176, "learning_rate": 8.234826198796357e-05, "loss": 2.125292205810547, "memory(GiB)": 77.56, "step": 32215, "token_acc": 0.5423076923076923, "train_speed(iter/s)": 1.440669 }, { "epoch": 1.380403581680305, "grad_norm": 5.266031265258789, "learning_rate": 8.234313011183215e-05, "loss": 2.485264778137207, "memory(GiB)": 77.56, "step": 32220, "token_acc": 0.46254071661237783, "train_speed(iter/s)": 1.440711 }, { "epoch": 1.380617797009554, "grad_norm": 6.192359447479248, "learning_rate": 8.233799764977457e-05, "loss": 2.22994327545166, "memory(GiB)": 77.56, "step": 32225, "token_acc": 0.5636363636363636, "train_speed(iter/s)": 1.440684 }, { "epoch": 1.380832012338803, "grad_norm": 5.870208740234375, "learning_rate": 8.233286460188385e-05, "loss": 2.5577566146850588, "memory(GiB)": 77.56, "step": 32230, "token_acc": 0.4197080291970803, "train_speed(iter/s)": 1.440687 }, { "epoch": 1.381046227668052, "grad_norm": 4.9664530754089355, "learning_rate": 8.232773096825295e-05, "loss": 2.8343246459960936, "memory(GiB)": 77.56, "step": 32235, "token_acc": 0.3935483870967742, "train_speed(iter/s)": 1.440698 }, { "epoch": 1.3812604429973008, "grad_norm": 4.283202648162842, "learning_rate": 8.232259674897486e-05, "loss": 2.66262264251709, "memory(GiB)": 77.56, "step": 32240, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.440743 }, { "epoch": 1.3814746583265498, "grad_norm": 4.331939220428467, "learning_rate": 8.231746194414262e-05, "loss": 2.4501779556274412, "memory(GiB)": 77.56, "step": 32245, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.440704 }, { "epoch": 1.3816888736557988, "grad_norm": 4.636662006378174, "learning_rate": 8.231232655384924e-05, "loss": 2.686531639099121, "memory(GiB)": 77.56, "step": 32250, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.440671 }, { "epoch": 1.3819030889850477, "grad_norm": 5.0237650871276855, "learning_rate": 8.230719057818776e-05, "loss": 2.477691078186035, "memory(GiB)": 77.56, "step": 32255, "token_acc": 0.45918367346938777, "train_speed(iter/s)": 1.440689 }, { "epoch": 1.3821173043142967, "grad_norm": 5.002540588378906, "learning_rate": 8.230205401725122e-05, "loss": 2.4150623321533202, "memory(GiB)": 77.56, "step": 32260, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.440739 }, { "epoch": 1.3823315196435457, "grad_norm": 5.962159633636475, "learning_rate": 8.229691687113266e-05, "loss": 2.570017623901367, "memory(GiB)": 77.56, "step": 32265, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.440804 }, { "epoch": 1.3825457349727945, "grad_norm": 5.570696830749512, "learning_rate": 8.229177913992515e-05, "loss": 2.691242980957031, "memory(GiB)": 77.56, "step": 32270, "token_acc": 0.4732824427480916, "train_speed(iter/s)": 1.440803 }, { "epoch": 1.3827599503020436, "grad_norm": 4.509525775909424, "learning_rate": 8.228664082372177e-05, "loss": 2.468734359741211, "memory(GiB)": 77.56, "step": 32275, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.440742 }, { "epoch": 1.3829741656312926, "grad_norm": 5.3111491203308105, "learning_rate": 8.22815019226156e-05, "loss": 2.464916801452637, "memory(GiB)": 77.56, "step": 32280, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.44077 }, { "epoch": 1.3831883809605414, "grad_norm": 6.811872482299805, "learning_rate": 8.227636243669976e-05, "loss": 2.6973987579345704, "memory(GiB)": 77.56, "step": 32285, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.440776 }, { "epoch": 1.3834025962897905, "grad_norm": 4.771921634674072, "learning_rate": 8.227122236606733e-05, "loss": 2.415981101989746, "memory(GiB)": 77.56, "step": 32290, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.440796 }, { "epoch": 1.3836168116190395, "grad_norm": 4.9913105964660645, "learning_rate": 8.226608171081143e-05, "loss": 2.625484657287598, "memory(GiB)": 77.56, "step": 32295, "token_acc": 0.47157190635451507, "train_speed(iter/s)": 1.440781 }, { "epoch": 1.3838310269482883, "grad_norm": 4.733423709869385, "learning_rate": 8.22609404710252e-05, "loss": 2.5317523956298826, "memory(GiB)": 77.56, "step": 32300, "token_acc": 0.4641638225255973, "train_speed(iter/s)": 1.440845 }, { "epoch": 1.3840452422775373, "grad_norm": 4.963057041168213, "learning_rate": 8.225579864680175e-05, "loss": 2.359029769897461, "memory(GiB)": 77.56, "step": 32305, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.440899 }, { "epoch": 1.3842594576067864, "grad_norm": 4.482953071594238, "learning_rate": 8.225065623823427e-05, "loss": 2.215733528137207, "memory(GiB)": 77.56, "step": 32310, "token_acc": 0.5486111111111112, "train_speed(iter/s)": 1.440909 }, { "epoch": 1.3844736729360352, "grad_norm": 5.477952480316162, "learning_rate": 8.224551324541588e-05, "loss": 2.6172725677490236, "memory(GiB)": 77.56, "step": 32315, "token_acc": 0.4316546762589928, "train_speed(iter/s)": 1.440946 }, { "epoch": 1.3846878882652842, "grad_norm": 5.942564010620117, "learning_rate": 8.224036966843978e-05, "loss": 2.522656059265137, "memory(GiB)": 77.56, "step": 32320, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.440926 }, { "epoch": 1.3849021035945333, "grad_norm": 4.379029750823975, "learning_rate": 8.223522550739913e-05, "loss": 2.588387298583984, "memory(GiB)": 77.56, "step": 32325, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.440923 }, { "epoch": 1.385116318923782, "grad_norm": 4.905298709869385, "learning_rate": 8.223008076238714e-05, "loss": 2.4599098205566405, "memory(GiB)": 77.56, "step": 32330, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.440909 }, { "epoch": 1.385330534253031, "grad_norm": 5.045111656188965, "learning_rate": 8.2224935433497e-05, "loss": 2.6597064971923827, "memory(GiB)": 77.56, "step": 32335, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.440963 }, { "epoch": 1.3855447495822801, "grad_norm": 5.351622581481934, "learning_rate": 8.221978952082192e-05, "loss": 2.9174943923950196, "memory(GiB)": 77.56, "step": 32340, "token_acc": 0.3825503355704698, "train_speed(iter/s)": 1.440917 }, { "epoch": 1.385758964911529, "grad_norm": 4.083662986755371, "learning_rate": 8.221464302445514e-05, "loss": 2.51528263092041, "memory(GiB)": 77.56, "step": 32345, "token_acc": 0.511864406779661, "train_speed(iter/s)": 1.440834 }, { "epoch": 1.385973180240778, "grad_norm": 3.813265323638916, "learning_rate": 8.220949594448987e-05, "loss": 2.3660558700561523, "memory(GiB)": 77.56, "step": 32350, "token_acc": 0.49271137026239065, "train_speed(iter/s)": 1.440826 }, { "epoch": 1.386187395570027, "grad_norm": 4.7663774490356445, "learning_rate": 8.220434828101937e-05, "loss": 2.4540462493896484, "memory(GiB)": 77.56, "step": 32355, "token_acc": 0.476, "train_speed(iter/s)": 1.440817 }, { "epoch": 1.3864016108992758, "grad_norm": 5.493171691894531, "learning_rate": 8.219920003413691e-05, "loss": 2.6703540802001955, "memory(GiB)": 77.56, "step": 32360, "token_acc": 0.47280334728033474, "train_speed(iter/s)": 1.440797 }, { "epoch": 1.3866158262285249, "grad_norm": 4.496729850769043, "learning_rate": 8.219405120393571e-05, "loss": 2.7621109008789064, "memory(GiB)": 77.56, "step": 32365, "token_acc": 0.43213296398891965, "train_speed(iter/s)": 1.44081 }, { "epoch": 1.386830041557774, "grad_norm": 4.299685478210449, "learning_rate": 8.218890179050908e-05, "loss": 2.7615472793579103, "memory(GiB)": 77.56, "step": 32370, "token_acc": 0.4471830985915493, "train_speed(iter/s)": 1.440865 }, { "epoch": 1.3870442568870227, "grad_norm": 5.835668087005615, "learning_rate": 8.21837517939503e-05, "loss": 2.6046798706054686, "memory(GiB)": 77.56, "step": 32375, "token_acc": 0.4470588235294118, "train_speed(iter/s)": 1.440828 }, { "epoch": 1.3872584722162717, "grad_norm": 4.56839656829834, "learning_rate": 8.217860121435267e-05, "loss": 2.5579978942871096, "memory(GiB)": 77.56, "step": 32380, "token_acc": 0.42662116040955633, "train_speed(iter/s)": 1.440853 }, { "epoch": 1.3874726875455208, "grad_norm": 4.46902322769165, "learning_rate": 8.217345005180949e-05, "loss": 2.3457687377929686, "memory(GiB)": 77.56, "step": 32385, "token_acc": 0.47157190635451507, "train_speed(iter/s)": 1.44087 }, { "epoch": 1.3876869028747696, "grad_norm": 5.586753845214844, "learning_rate": 8.216829830641408e-05, "loss": 2.686672592163086, "memory(GiB)": 77.56, "step": 32390, "token_acc": 0.42700729927007297, "train_speed(iter/s)": 1.440869 }, { "epoch": 1.3879011182040186, "grad_norm": 3.91251802444458, "learning_rate": 8.216314597825976e-05, "loss": 2.4910045623779298, "memory(GiB)": 77.56, "step": 32395, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.440875 }, { "epoch": 1.3881153335332677, "grad_norm": 4.722597122192383, "learning_rate": 8.215799306743989e-05, "loss": 2.690944862365723, "memory(GiB)": 77.56, "step": 32400, "token_acc": 0.4419889502762431, "train_speed(iter/s)": 1.440881 }, { "epoch": 1.3883295488625165, "grad_norm": 3.9358503818511963, "learning_rate": 8.21528395740478e-05, "loss": 2.3778587341308595, "memory(GiB)": 77.56, "step": 32405, "token_acc": 0.5346534653465347, "train_speed(iter/s)": 1.440894 }, { "epoch": 1.3885437641917655, "grad_norm": 5.521212577819824, "learning_rate": 8.214768549817687e-05, "loss": 2.614592933654785, "memory(GiB)": 77.56, "step": 32410, "token_acc": 0.43543543543543545, "train_speed(iter/s)": 1.440936 }, { "epoch": 1.3887579795210145, "grad_norm": 4.584962368011475, "learning_rate": 8.214253083992046e-05, "loss": 2.730294418334961, "memory(GiB)": 77.56, "step": 32415, "token_acc": 0.47477744807121663, "train_speed(iter/s)": 1.440973 }, { "epoch": 1.3889721948502634, "grad_norm": 5.215190410614014, "learning_rate": 8.213737559937195e-05, "loss": 2.5338584899902346, "memory(GiB)": 77.56, "step": 32420, "token_acc": 0.49640287769784175, "train_speed(iter/s)": 1.441031 }, { "epoch": 1.3891864101795124, "grad_norm": 4.041851997375488, "learning_rate": 8.213221977662473e-05, "loss": 2.630046272277832, "memory(GiB)": 77.56, "step": 32425, "token_acc": 0.45481927710843373, "train_speed(iter/s)": 1.441107 }, { "epoch": 1.3894006255087614, "grad_norm": 4.636109352111816, "learning_rate": 8.212706337177221e-05, "loss": 2.447910690307617, "memory(GiB)": 77.56, "step": 32430, "token_acc": 0.49, "train_speed(iter/s)": 1.441163 }, { "epoch": 1.3896148408380102, "grad_norm": 4.599270343780518, "learning_rate": 8.21219063849078e-05, "loss": 2.242801856994629, "memory(GiB)": 77.56, "step": 32435, "token_acc": 0.5222672064777328, "train_speed(iter/s)": 1.441111 }, { "epoch": 1.3898290561672593, "grad_norm": 3.935060739517212, "learning_rate": 8.211674881612492e-05, "loss": 2.7042808532714844, "memory(GiB)": 77.56, "step": 32440, "token_acc": 0.45686900958466453, "train_speed(iter/s)": 1.441185 }, { "epoch": 1.3900432714965083, "grad_norm": 4.77193546295166, "learning_rate": 8.211159066551701e-05, "loss": 2.424533653259277, "memory(GiB)": 77.56, "step": 32445, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.441161 }, { "epoch": 1.3902574868257571, "grad_norm": 7.982471466064453, "learning_rate": 8.210643193317751e-05, "loss": 2.8776296615600585, "memory(GiB)": 77.56, "step": 32450, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.44118 }, { "epoch": 1.3904717021550062, "grad_norm": 4.024241924285889, "learning_rate": 8.210127261919987e-05, "loss": 2.800585174560547, "memory(GiB)": 77.56, "step": 32455, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.441163 }, { "epoch": 1.3906859174842552, "grad_norm": 6.202906131744385, "learning_rate": 8.209611272367758e-05, "loss": 2.5529708862304688, "memory(GiB)": 77.56, "step": 32460, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 1.441145 }, { "epoch": 1.390900132813504, "grad_norm": 6.042719841003418, "learning_rate": 8.209095224670409e-05, "loss": 2.1243961334228514, "memory(GiB)": 77.56, "step": 32465, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.441191 }, { "epoch": 1.391114348142753, "grad_norm": 4.469020843505859, "learning_rate": 8.208579118837289e-05, "loss": 2.5956594467163088, "memory(GiB)": 77.56, "step": 32470, "token_acc": 0.5, "train_speed(iter/s)": 1.441226 }, { "epoch": 1.391328563472002, "grad_norm": 5.173557281494141, "learning_rate": 8.208062954877749e-05, "loss": 2.434360694885254, "memory(GiB)": 77.56, "step": 32475, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.44121 }, { "epoch": 1.3915427788012509, "grad_norm": 5.418885707855225, "learning_rate": 8.207546732801139e-05, "loss": 2.5377338409423826, "memory(GiB)": 77.56, "step": 32480, "token_acc": 0.4653846153846154, "train_speed(iter/s)": 1.441251 }, { "epoch": 1.3917569941305, "grad_norm": 5.52482795715332, "learning_rate": 8.207030452616811e-05, "loss": 2.545522689819336, "memory(GiB)": 77.56, "step": 32485, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.441324 }, { "epoch": 1.391971209459749, "grad_norm": 5.534440994262695, "learning_rate": 8.206514114334119e-05, "loss": 2.809461975097656, "memory(GiB)": 77.56, "step": 32490, "token_acc": 0.41580756013745707, "train_speed(iter/s)": 1.441262 }, { "epoch": 1.392185424788998, "grad_norm": 4.601513385772705, "learning_rate": 8.205997717962415e-05, "loss": 2.5163551330566407, "memory(GiB)": 77.56, "step": 32495, "token_acc": 0.5210355987055016, "train_speed(iter/s)": 1.441283 }, { "epoch": 1.3923996401182468, "grad_norm": 3.5171921253204346, "learning_rate": 8.205481263511054e-05, "loss": 2.668793487548828, "memory(GiB)": 77.56, "step": 32500, "token_acc": 0.463768115942029, "train_speed(iter/s)": 1.441309 }, { "epoch": 1.3923996401182468, "eval_loss": 2.177598714828491, "eval_runtime": 14.4616, "eval_samples_per_second": 6.915, "eval_steps_per_second": 6.915, "eval_token_acc": 0.4858681022880215, "step": 32500 }, { "epoch": 1.3926138554474958, "grad_norm": 4.515955924987793, "learning_rate": 8.204964750989393e-05, "loss": 2.348715400695801, "memory(GiB)": 77.56, "step": 32505, "token_acc": 0.4870067372473532, "train_speed(iter/s)": 1.440388 }, { "epoch": 1.3928280707767449, "grad_norm": 4.06764554977417, "learning_rate": 8.204448180406789e-05, "loss": 2.540558624267578, "memory(GiB)": 77.56, "step": 32510, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.440426 }, { "epoch": 1.3930422861059937, "grad_norm": 6.363366603851318, "learning_rate": 8.203931551772602e-05, "loss": 2.4556230545043944, "memory(GiB)": 77.56, "step": 32515, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.440418 }, { "epoch": 1.3932565014352427, "grad_norm": 4.377366065979004, "learning_rate": 8.203414865096188e-05, "loss": 2.5806402206420898, "memory(GiB)": 77.56, "step": 32520, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.440447 }, { "epoch": 1.3934707167644917, "grad_norm": 6.2758708000183105, "learning_rate": 8.20289812038691e-05, "loss": 2.291286087036133, "memory(GiB)": 77.56, "step": 32525, "token_acc": 0.5061224489795918, "train_speed(iter/s)": 1.440475 }, { "epoch": 1.3936849320937406, "grad_norm": 5.970769882202148, "learning_rate": 8.202381317654126e-05, "loss": 2.215692138671875, "memory(GiB)": 77.56, "step": 32530, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.440463 }, { "epoch": 1.3938991474229896, "grad_norm": 4.598732948303223, "learning_rate": 8.201864456907203e-05, "loss": 2.9922607421875, "memory(GiB)": 77.56, "step": 32535, "token_acc": 0.43440233236151604, "train_speed(iter/s)": 1.44049 }, { "epoch": 1.3941133627522386, "grad_norm": 6.370267391204834, "learning_rate": 8.201347538155499e-05, "loss": 2.4286373138427733, "memory(GiB)": 77.56, "step": 32540, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.3943275780814874, "grad_norm": 5.638474464416504, "learning_rate": 8.200830561408382e-05, "loss": 2.421356773376465, "memory(GiB)": 77.56, "step": 32545, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.440468 }, { "epoch": 1.3945417934107365, "grad_norm": 4.560729026794434, "learning_rate": 8.200313526675218e-05, "loss": 2.4186595916748046, "memory(GiB)": 77.56, "step": 32550, "token_acc": 0.4831804281345566, "train_speed(iter/s)": 1.440461 }, { "epoch": 1.3947560087399855, "grad_norm": 6.50902795791626, "learning_rate": 8.199796433965373e-05, "loss": 2.122319221496582, "memory(GiB)": 77.56, "step": 32555, "token_acc": 0.5096153846153846, "train_speed(iter/s)": 1.440477 }, { "epoch": 1.3949702240692343, "grad_norm": 14.841451644897461, "learning_rate": 8.19927928328821e-05, "loss": 2.5114162445068358, "memory(GiB)": 77.56, "step": 32560, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.440447 }, { "epoch": 1.3951844393984834, "grad_norm": 6.572962760925293, "learning_rate": 8.198762074653104e-05, "loss": 2.4489582061767576, "memory(GiB)": 77.56, "step": 32565, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.440457 }, { "epoch": 1.3953986547277324, "grad_norm": 4.776227951049805, "learning_rate": 8.198244808069424e-05, "loss": 2.2974660873413084, "memory(GiB)": 77.56, "step": 32570, "token_acc": 0.5382059800664452, "train_speed(iter/s)": 1.440439 }, { "epoch": 1.3956128700569812, "grad_norm": 4.21104097366333, "learning_rate": 8.197727483546539e-05, "loss": 2.791429328918457, "memory(GiB)": 77.56, "step": 32575, "token_acc": 0.4373259052924791, "train_speed(iter/s)": 1.440494 }, { "epoch": 1.3958270853862302, "grad_norm": 5.432312965393066, "learning_rate": 8.197210101093817e-05, "loss": 2.6320913314819334, "memory(GiB)": 77.56, "step": 32580, "token_acc": 0.446875, "train_speed(iter/s)": 1.440505 }, { "epoch": 1.3960413007154793, "grad_norm": 4.993788719177246, "learning_rate": 8.196692660720638e-05, "loss": 2.4033786773681642, "memory(GiB)": 77.56, "step": 32585, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.440547 }, { "epoch": 1.396255516044728, "grad_norm": 4.767704486846924, "learning_rate": 8.196175162436371e-05, "loss": 2.764363098144531, "memory(GiB)": 77.56, "step": 32590, "token_acc": 0.45, "train_speed(iter/s)": 1.440512 }, { "epoch": 1.3964697313739771, "grad_norm": 5.701756477355957, "learning_rate": 8.195657606250393e-05, "loss": 2.627780532836914, "memory(GiB)": 77.56, "step": 32595, "token_acc": 0.4817073170731707, "train_speed(iter/s)": 1.440545 }, { "epoch": 1.3966839467032262, "grad_norm": 4.9805097579956055, "learning_rate": 8.195139992172081e-05, "loss": 2.1044809341430666, "memory(GiB)": 77.56, "step": 32600, "token_acc": 0.5258620689655172, "train_speed(iter/s)": 1.440588 }, { "epoch": 1.396898162032475, "grad_norm": 4.280782699584961, "learning_rate": 8.19462232021081e-05, "loss": 2.193564605712891, "memory(GiB)": 77.56, "step": 32605, "token_acc": 0.5415282392026578, "train_speed(iter/s)": 1.440626 }, { "epoch": 1.397112377361724, "grad_norm": 4.711809158325195, "learning_rate": 8.19410459037596e-05, "loss": 2.361937141418457, "memory(GiB)": 77.56, "step": 32610, "token_acc": 0.47876447876447875, "train_speed(iter/s)": 1.440659 }, { "epoch": 1.397326592690973, "grad_norm": 4.320030212402344, "learning_rate": 8.193586802676907e-05, "loss": 2.561391830444336, "memory(GiB)": 77.56, "step": 32615, "token_acc": 0.4636363636363636, "train_speed(iter/s)": 1.44065 }, { "epoch": 1.3975408080202218, "grad_norm": 4.283494472503662, "learning_rate": 8.193068957123034e-05, "loss": 2.7902252197265627, "memory(GiB)": 77.56, "step": 32620, "token_acc": 0.4479166666666667, "train_speed(iter/s)": 1.440666 }, { "epoch": 1.3977550233494709, "grad_norm": 5.214541435241699, "learning_rate": 8.192551053723721e-05, "loss": 2.545706558227539, "memory(GiB)": 77.56, "step": 32625, "token_acc": 0.4452296819787986, "train_speed(iter/s)": 1.440653 }, { "epoch": 1.39796923867872, "grad_norm": 5.056772232055664, "learning_rate": 8.192033092488351e-05, "loss": 2.578232192993164, "memory(GiB)": 77.56, "step": 32630, "token_acc": 0.46875, "train_speed(iter/s)": 1.440686 }, { "epoch": 1.3981834540079687, "grad_norm": 5.8353800773620605, "learning_rate": 8.191515073426309e-05, "loss": 2.396602249145508, "memory(GiB)": 77.56, "step": 32635, "token_acc": 0.4195804195804196, "train_speed(iter/s)": 1.440712 }, { "epoch": 1.3983976693372178, "grad_norm": 5.054687023162842, "learning_rate": 8.190996996546975e-05, "loss": 2.968783378601074, "memory(GiB)": 77.56, "step": 32640, "token_acc": 0.4141791044776119, "train_speed(iter/s)": 1.440704 }, { "epoch": 1.3986118846664668, "grad_norm": 5.241652488708496, "learning_rate": 8.19047886185974e-05, "loss": 2.658024024963379, "memory(GiB)": 77.56, "step": 32645, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.440668 }, { "epoch": 1.3988260999957156, "grad_norm": 4.599843502044678, "learning_rate": 8.189960669373987e-05, "loss": 2.4044450759887694, "memory(GiB)": 77.56, "step": 32650, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.440713 }, { "epoch": 1.3990403153249646, "grad_norm": 4.849252700805664, "learning_rate": 8.189442419099101e-05, "loss": 2.7404220581054686, "memory(GiB)": 77.56, "step": 32655, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440777 }, { "epoch": 1.3992545306542137, "grad_norm": 6.469464302062988, "learning_rate": 8.188924111044476e-05, "loss": 2.2504724502563476, "memory(GiB)": 77.56, "step": 32660, "token_acc": 0.5982532751091703, "train_speed(iter/s)": 1.440807 }, { "epoch": 1.3994687459834627, "grad_norm": 4.850805282592773, "learning_rate": 8.188405745219498e-05, "loss": 2.551934814453125, "memory(GiB)": 77.56, "step": 32665, "token_acc": 0.4977578475336323, "train_speed(iter/s)": 1.440771 }, { "epoch": 1.3996829613127115, "grad_norm": 4.660606384277344, "learning_rate": 8.18788732163356e-05, "loss": 2.1702070236206055, "memory(GiB)": 77.56, "step": 32670, "token_acc": 0.5355805243445693, "train_speed(iter/s)": 1.440814 }, { "epoch": 1.3998971766419606, "grad_norm": 4.29949426651001, "learning_rate": 8.187368840296052e-05, "loss": 2.586586570739746, "memory(GiB)": 77.56, "step": 32675, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.44084 }, { "epoch": 1.4001113919712096, "grad_norm": 5.140963554382324, "learning_rate": 8.186850301216368e-05, "loss": 2.3898927688598635, "memory(GiB)": 77.56, "step": 32680, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.440847 }, { "epoch": 1.4003256073004584, "grad_norm": 4.6134033203125, "learning_rate": 8.186331704403902e-05, "loss": 2.4320209503173826, "memory(GiB)": 77.56, "step": 32685, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.440869 }, { "epoch": 1.4005398226297074, "grad_norm": 4.765467166900635, "learning_rate": 8.185813049868045e-05, "loss": 2.3984859466552733, "memory(GiB)": 77.56, "step": 32690, "token_acc": 0.4404432132963989, "train_speed(iter/s)": 1.440899 }, { "epoch": 1.4007540379589565, "grad_norm": 4.149876117706299, "learning_rate": 8.185294337618198e-05, "loss": 2.3167816162109376, "memory(GiB)": 77.56, "step": 32695, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.44092 }, { "epoch": 1.4009682532882053, "grad_norm": 4.961634159088135, "learning_rate": 8.184775567663759e-05, "loss": 2.56131649017334, "memory(GiB)": 77.56, "step": 32700, "token_acc": 0.4847560975609756, "train_speed(iter/s)": 1.440923 }, { "epoch": 1.4011824686174543, "grad_norm": 4.568178653717041, "learning_rate": 8.18425674001412e-05, "loss": 2.2430482864379884, "memory(GiB)": 77.56, "step": 32705, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.440958 }, { "epoch": 1.4013966839467034, "grad_norm": 4.932185649871826, "learning_rate": 8.183737854678684e-05, "loss": 2.4853187561035157, "memory(GiB)": 77.56, "step": 32710, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.44099 }, { "epoch": 1.4016108992759522, "grad_norm": 4.6252217292785645, "learning_rate": 8.183218911666849e-05, "loss": 2.572653579711914, "memory(GiB)": 77.56, "step": 32715, "token_acc": 0.452755905511811, "train_speed(iter/s)": 1.440979 }, { "epoch": 1.4018251146052012, "grad_norm": 5.366158485412598, "learning_rate": 8.182699910988018e-05, "loss": 2.667098045349121, "memory(GiB)": 77.56, "step": 32720, "token_acc": 0.4454828660436137, "train_speed(iter/s)": 1.441022 }, { "epoch": 1.4020393299344502, "grad_norm": 4.5241007804870605, "learning_rate": 8.182180852651592e-05, "loss": 2.3706123352050783, "memory(GiB)": 77.56, "step": 32725, "token_acc": 0.5, "train_speed(iter/s)": 1.441031 }, { "epoch": 1.402253545263699, "grad_norm": 4.363215923309326, "learning_rate": 8.181661736666974e-05, "loss": 2.3623613357543944, "memory(GiB)": 77.56, "step": 32730, "token_acc": 0.5265151515151515, "train_speed(iter/s)": 1.441057 }, { "epoch": 1.402467760592948, "grad_norm": 5.448582172393799, "learning_rate": 8.181142563043572e-05, "loss": 2.347551727294922, "memory(GiB)": 77.56, "step": 32735, "token_acc": 0.5354609929078015, "train_speed(iter/s)": 1.441071 }, { "epoch": 1.4026819759221971, "grad_norm": 5.197177410125732, "learning_rate": 8.180623331790785e-05, "loss": 2.276912498474121, "memory(GiB)": 77.56, "step": 32740, "token_acc": 0.5154185022026432, "train_speed(iter/s)": 1.441112 }, { "epoch": 1.402896191251446, "grad_norm": 5.24609375, "learning_rate": 8.180104042918025e-05, "loss": 2.6191478729248048, "memory(GiB)": 77.56, "step": 32745, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.441086 }, { "epoch": 1.403110406580695, "grad_norm": 6.560999393463135, "learning_rate": 8.179584696434696e-05, "loss": 2.5139934539794924, "memory(GiB)": 77.56, "step": 32750, "token_acc": 0.4701492537313433, "train_speed(iter/s)": 1.441 }, { "epoch": 1.403324621909944, "grad_norm": 4.0000152587890625, "learning_rate": 8.179065292350208e-05, "loss": 2.457881736755371, "memory(GiB)": 77.56, "step": 32755, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.441007 }, { "epoch": 1.4035388372391928, "grad_norm": 6.2242889404296875, "learning_rate": 8.178545830673969e-05, "loss": 2.547452926635742, "memory(GiB)": 77.56, "step": 32760, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.44097 }, { "epoch": 1.4037530525684419, "grad_norm": 4.187480926513672, "learning_rate": 8.178026311415392e-05, "loss": 2.6834732055664063, "memory(GiB)": 77.56, "step": 32765, "token_acc": 0.4447852760736196, "train_speed(iter/s)": 1.440991 }, { "epoch": 1.4039672678976909, "grad_norm": 5.03083610534668, "learning_rate": 8.177506734583886e-05, "loss": 2.808266448974609, "memory(GiB)": 77.56, "step": 32770, "token_acc": 0.4764705882352941, "train_speed(iter/s)": 1.441039 }, { "epoch": 1.4041814832269397, "grad_norm": 5.288457870483398, "learning_rate": 8.176987100188865e-05, "loss": 2.6233535766601563, "memory(GiB)": 77.56, "step": 32775, "token_acc": 0.4312267657992565, "train_speed(iter/s)": 1.441043 }, { "epoch": 1.4043956985561887, "grad_norm": 5.455574989318848, "learning_rate": 8.176467408239743e-05, "loss": 2.259764862060547, "memory(GiB)": 77.56, "step": 32780, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.441033 }, { "epoch": 1.4046099138854378, "grad_norm": 5.3393778800964355, "learning_rate": 8.175947658745934e-05, "loss": 2.104327583312988, "memory(GiB)": 77.56, "step": 32785, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.440987 }, { "epoch": 1.4048241292146866, "grad_norm": 4.289956569671631, "learning_rate": 8.175427851716855e-05, "loss": 2.731656074523926, "memory(GiB)": 77.56, "step": 32790, "token_acc": 0.4573170731707317, "train_speed(iter/s)": 1.441037 }, { "epoch": 1.4050383445439356, "grad_norm": 4.7855329513549805, "learning_rate": 8.17490798716192e-05, "loss": 2.6186155319213866, "memory(GiB)": 77.56, "step": 32795, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.44104 }, { "epoch": 1.4052525598731846, "grad_norm": 7.545797348022461, "learning_rate": 8.174388065090548e-05, "loss": 2.2450260162353515, "memory(GiB)": 77.56, "step": 32800, "token_acc": 0.5473251028806584, "train_speed(iter/s)": 1.441069 }, { "epoch": 1.4054667752024335, "grad_norm": 4.962194442749023, "learning_rate": 8.17386808551216e-05, "loss": 2.2483697891235352, "memory(GiB)": 77.56, "step": 32805, "token_acc": 0.48493975903614456, "train_speed(iter/s)": 1.441069 }, { "epoch": 1.4056809905316825, "grad_norm": 4.5178937911987305, "learning_rate": 8.173348048436174e-05, "loss": 2.231859016418457, "memory(GiB)": 77.56, "step": 32810, "token_acc": 0.5324675324675324, "train_speed(iter/s)": 1.441049 }, { "epoch": 1.4058952058609315, "grad_norm": 4.872297763824463, "learning_rate": 8.17282795387201e-05, "loss": 2.5846189498901366, "memory(GiB)": 77.56, "step": 32815, "token_acc": 0.45394736842105265, "train_speed(iter/s)": 1.441021 }, { "epoch": 1.4061094211901803, "grad_norm": 5.8877129554748535, "learning_rate": 8.172307801829093e-05, "loss": 2.6296873092651367, "memory(GiB)": 77.56, "step": 32820, "token_acc": 0.4628099173553719, "train_speed(iter/s)": 1.441037 }, { "epoch": 1.4063236365194294, "grad_norm": 5.066899299621582, "learning_rate": 8.171787592316842e-05, "loss": 2.498503494262695, "memory(GiB)": 77.56, "step": 32825, "token_acc": 0.49390243902439024, "train_speed(iter/s)": 1.440998 }, { "epoch": 1.4065378518486784, "grad_norm": 7.721127033233643, "learning_rate": 8.171267325344685e-05, "loss": 2.479060745239258, "memory(GiB)": 77.56, "step": 32830, "token_acc": 0.5301724137931034, "train_speed(iter/s)": 1.441068 }, { "epoch": 1.4067520671779272, "grad_norm": 5.018352031707764, "learning_rate": 8.170747000922045e-05, "loss": 2.431307792663574, "memory(GiB)": 77.56, "step": 32835, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.440997 }, { "epoch": 1.4069662825071763, "grad_norm": 6.421723365783691, "learning_rate": 8.17022661905835e-05, "loss": 3.0423044204711913, "memory(GiB)": 77.56, "step": 32840, "token_acc": 0.37987012987012986, "train_speed(iter/s)": 1.441042 }, { "epoch": 1.4071804978364253, "grad_norm": 4.114084243774414, "learning_rate": 8.169706179763023e-05, "loss": 2.1239709854125977, "memory(GiB)": 77.56, "step": 32845, "token_acc": 0.5403508771929825, "train_speed(iter/s)": 1.44102 }, { "epoch": 1.407394713165674, "grad_norm": 4.6386494636535645, "learning_rate": 8.169185683045498e-05, "loss": 2.3217145919799806, "memory(GiB)": 77.56, "step": 32850, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.441017 }, { "epoch": 1.4076089284949231, "grad_norm": 4.626767635345459, "learning_rate": 8.168665128915201e-05, "loss": 2.830036163330078, "memory(GiB)": 77.56, "step": 32855, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.441036 }, { "epoch": 1.4078231438241722, "grad_norm": 5.563905715942383, "learning_rate": 8.168144517381562e-05, "loss": 2.4487443923950196, "memory(GiB)": 77.56, "step": 32860, "token_acc": 0.4735202492211838, "train_speed(iter/s)": 1.441063 }, { "epoch": 1.408037359153421, "grad_norm": 5.357914447784424, "learning_rate": 8.167623848454014e-05, "loss": 2.568429183959961, "memory(GiB)": 77.56, "step": 32865, "token_acc": 0.46785714285714286, "train_speed(iter/s)": 1.441117 }, { "epoch": 1.40825157448267, "grad_norm": 4.455787181854248, "learning_rate": 8.167103122141988e-05, "loss": 2.4503332138061524, "memory(GiB)": 77.56, "step": 32870, "token_acc": 0.46060606060606063, "train_speed(iter/s)": 1.44118 }, { "epoch": 1.408465789811919, "grad_norm": 5.0956573486328125, "learning_rate": 8.166582338454917e-05, "loss": 2.6250228881835938, "memory(GiB)": 77.56, "step": 32875, "token_acc": 0.45, "train_speed(iter/s)": 1.441161 }, { "epoch": 1.4086800051411679, "grad_norm": 4.343917369842529, "learning_rate": 8.166061497402236e-05, "loss": 2.610627365112305, "memory(GiB)": 77.56, "step": 32880, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.441152 }, { "epoch": 1.408894220470417, "grad_norm": 4.916111946105957, "learning_rate": 8.165540598993383e-05, "loss": 2.6820501327514648, "memory(GiB)": 77.56, "step": 32885, "token_acc": 0.44931506849315067, "train_speed(iter/s)": 1.441135 }, { "epoch": 1.409108435799666, "grad_norm": 9.075296401977539, "learning_rate": 8.165019643237792e-05, "loss": 2.5452728271484375, "memory(GiB)": 77.56, "step": 32890, "token_acc": 0.4959677419354839, "train_speed(iter/s)": 1.441151 }, { "epoch": 1.4093226511289147, "grad_norm": 4.417697429656982, "learning_rate": 8.164498630144901e-05, "loss": 2.4722240447998045, "memory(GiB)": 77.56, "step": 32895, "token_acc": 0.47246376811594204, "train_speed(iter/s)": 1.441154 }, { "epoch": 1.4095368664581638, "grad_norm": 5.6175947189331055, "learning_rate": 8.163977559724147e-05, "loss": 2.930836486816406, "memory(GiB)": 77.56, "step": 32900, "token_acc": 0.44272445820433437, "train_speed(iter/s)": 1.441204 }, { "epoch": 1.4097510817874128, "grad_norm": 7.489553451538086, "learning_rate": 8.163456431984975e-05, "loss": 2.776253890991211, "memory(GiB)": 77.56, "step": 32905, "token_acc": 0.4303030303030303, "train_speed(iter/s)": 1.441115 }, { "epoch": 1.4099652971166616, "grad_norm": 4.295735836029053, "learning_rate": 8.162935246936819e-05, "loss": 2.4325775146484374, "memory(GiB)": 77.56, "step": 32910, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.44115 }, { "epoch": 1.4101795124459107, "grad_norm": 3.759232997894287, "learning_rate": 8.162414004589126e-05, "loss": 2.215176010131836, "memory(GiB)": 77.56, "step": 32915, "token_acc": 0.5390334572490706, "train_speed(iter/s)": 1.441196 }, { "epoch": 1.4103937277751597, "grad_norm": 5.306053638458252, "learning_rate": 8.161892704951334e-05, "loss": 2.668818473815918, "memory(GiB)": 77.56, "step": 32920, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.441198 }, { "epoch": 1.4106079431044085, "grad_norm": 5.074545383453369, "learning_rate": 8.161371348032893e-05, "loss": 2.4916919708251952, "memory(GiB)": 77.56, "step": 32925, "token_acc": 0.4456140350877193, "train_speed(iter/s)": 1.441155 }, { "epoch": 1.4108221584336575, "grad_norm": 5.683573246002197, "learning_rate": 8.160849933843241e-05, "loss": 2.3398935317993166, "memory(GiB)": 77.56, "step": 32930, "token_acc": 0.5175097276264592, "train_speed(iter/s)": 1.441137 }, { "epoch": 1.4110363737629066, "grad_norm": 4.769052505493164, "learning_rate": 8.16032846239183e-05, "loss": 2.4615779876708985, "memory(GiB)": 77.56, "step": 32935, "token_acc": 0.4789272030651341, "train_speed(iter/s)": 1.441115 }, { "epoch": 1.4112505890921554, "grad_norm": 4.571069240570068, "learning_rate": 8.159806933688105e-05, "loss": 2.540538024902344, "memory(GiB)": 77.56, "step": 32940, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.441116 }, { "epoch": 1.4114648044214044, "grad_norm": 5.50850248336792, "learning_rate": 8.159285347741513e-05, "loss": 2.632329559326172, "memory(GiB)": 77.56, "step": 32945, "token_acc": 0.4703703703703704, "train_speed(iter/s)": 1.441137 }, { "epoch": 1.4116790197506535, "grad_norm": 5.724573135375977, "learning_rate": 8.1587637045615e-05, "loss": 2.593389320373535, "memory(GiB)": 77.56, "step": 32950, "token_acc": 0.46546546546546547, "train_speed(iter/s)": 1.441098 }, { "epoch": 1.4118932350799023, "grad_norm": 5.0343523025512695, "learning_rate": 8.158242004157522e-05, "loss": 2.5726451873779297, "memory(GiB)": 77.56, "step": 32955, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.441094 }, { "epoch": 1.4121074504091513, "grad_norm": 4.84677791595459, "learning_rate": 8.157720246539026e-05, "loss": 2.5698471069335938, "memory(GiB)": 77.56, "step": 32960, "token_acc": 0.4552238805970149, "train_speed(iter/s)": 1.441055 }, { "epoch": 1.4123216657384003, "grad_norm": 4.837327003479004, "learning_rate": 8.157198431715466e-05, "loss": 2.5950782775878904, "memory(GiB)": 77.56, "step": 32965, "token_acc": 0.48, "train_speed(iter/s)": 1.44109 }, { "epoch": 1.4125358810676492, "grad_norm": 3.9810128211975098, "learning_rate": 8.156676559696294e-05, "loss": 2.3519641876220705, "memory(GiB)": 77.56, "step": 32970, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.441078 }, { "epoch": 1.4127500963968982, "grad_norm": 4.564142227172852, "learning_rate": 8.156154630490968e-05, "loss": 2.6326671600341798, "memory(GiB)": 77.56, "step": 32975, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.441076 }, { "epoch": 1.4129643117261472, "grad_norm": 3.8186042308807373, "learning_rate": 8.155632644108937e-05, "loss": 2.6206520080566404, "memory(GiB)": 77.56, "step": 32980, "token_acc": 0.49693251533742333, "train_speed(iter/s)": 1.441023 }, { "epoch": 1.413178527055396, "grad_norm": 4.897378921508789, "learning_rate": 8.155110600559661e-05, "loss": 2.395633316040039, "memory(GiB)": 77.56, "step": 32985, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.441093 }, { "epoch": 1.413392742384645, "grad_norm": 4.5664753913879395, "learning_rate": 8.154588499852598e-05, "loss": 2.5076459884643554, "memory(GiB)": 77.56, "step": 32990, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.441117 }, { "epoch": 1.413606957713894, "grad_norm": 4.726290702819824, "learning_rate": 8.154066341997204e-05, "loss": 2.6111370086669923, "memory(GiB)": 77.56, "step": 32995, "token_acc": 0.44787644787644787, "train_speed(iter/s)": 1.441114 }, { "epoch": 1.413821173043143, "grad_norm": 5.217080116271973, "learning_rate": 8.15354412700294e-05, "loss": 2.572787857055664, "memory(GiB)": 77.56, "step": 33000, "token_acc": 0.5170940170940171, "train_speed(iter/s)": 1.441136 }, { "epoch": 1.413821173043143, "eval_loss": 2.155951499938965, "eval_runtime": 14.7972, "eval_samples_per_second": 6.758, "eval_steps_per_second": 6.758, "eval_token_acc": 0.4605263157894737, "step": 33000 }, { "epoch": 1.414035388372392, "grad_norm": 4.319518566131592, "learning_rate": 8.153021854879266e-05, "loss": 2.4434165954589844, "memory(GiB)": 77.56, "step": 33005, "token_acc": 0.4681647940074906, "train_speed(iter/s)": 1.440084 }, { "epoch": 1.414249603701641, "grad_norm": 5.58959436416626, "learning_rate": 8.152499525635643e-05, "loss": 2.5024646759033202, "memory(GiB)": 77.56, "step": 33010, "token_acc": 0.42105263157894735, "train_speed(iter/s)": 1.440127 }, { "epoch": 1.4144638190308898, "grad_norm": 7.674812316894531, "learning_rate": 8.151977139281534e-05, "loss": 2.3606868743896485, "memory(GiB)": 77.56, "step": 33015, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.440153 }, { "epoch": 1.4146780343601388, "grad_norm": 4.988476753234863, "learning_rate": 8.151454695826402e-05, "loss": 2.613323974609375, "memory(GiB)": 77.56, "step": 33020, "token_acc": 0.4542124542124542, "train_speed(iter/s)": 1.440207 }, { "epoch": 1.4148922496893879, "grad_norm": 4.285540580749512, "learning_rate": 8.150932195279711e-05, "loss": 2.3553529739379884, "memory(GiB)": 77.56, "step": 33025, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.440218 }, { "epoch": 1.4151064650186367, "grad_norm": 5.295781135559082, "learning_rate": 8.150409637650928e-05, "loss": 2.2589305877685546, "memory(GiB)": 77.56, "step": 33030, "token_acc": 0.5281385281385281, "train_speed(iter/s)": 1.440193 }, { "epoch": 1.4153206803478857, "grad_norm": 4.3361921310424805, "learning_rate": 8.14988702294952e-05, "loss": 2.5230279922485352, "memory(GiB)": 77.56, "step": 33035, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.440169 }, { "epoch": 1.4155348956771348, "grad_norm": 4.904618740081787, "learning_rate": 8.149364351184953e-05, "loss": 2.4007289886474608, "memory(GiB)": 77.56, "step": 33040, "token_acc": 0.494949494949495, "train_speed(iter/s)": 1.440135 }, { "epoch": 1.4157491110063836, "grad_norm": 4.181363105773926, "learning_rate": 8.148841622366699e-05, "loss": 2.517375946044922, "memory(GiB)": 77.56, "step": 33045, "token_acc": 0.4579710144927536, "train_speed(iter/s)": 1.440093 }, { "epoch": 1.4159633263356326, "grad_norm": 5.231570243835449, "learning_rate": 8.148318836504221e-05, "loss": 2.3261022567749023, "memory(GiB)": 77.56, "step": 33050, "token_acc": 0.5050505050505051, "train_speed(iter/s)": 1.440134 }, { "epoch": 1.4161775416648816, "grad_norm": 5.6807379722595215, "learning_rate": 8.147795993606997e-05, "loss": 2.480241394042969, "memory(GiB)": 77.56, "step": 33055, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.440165 }, { "epoch": 1.4163917569941304, "grad_norm": 4.6538004875183105, "learning_rate": 8.147273093684493e-05, "loss": 2.358638954162598, "memory(GiB)": 77.56, "step": 33060, "token_acc": 0.5291970802919708, "train_speed(iter/s)": 1.440204 }, { "epoch": 1.4166059723233795, "grad_norm": 4.609269142150879, "learning_rate": 8.146750136746187e-05, "loss": 2.7514999389648436, "memory(GiB)": 77.56, "step": 33065, "token_acc": 0.4192546583850932, "train_speed(iter/s)": 1.440227 }, { "epoch": 1.4168201876526285, "grad_norm": 5.511987686157227, "learning_rate": 8.14622712280155e-05, "loss": 2.4519065856933593, "memory(GiB)": 77.56, "step": 33070, "token_acc": 0.4730290456431535, "train_speed(iter/s)": 1.440255 }, { "epoch": 1.4170344029818773, "grad_norm": 4.408252716064453, "learning_rate": 8.145704051860056e-05, "loss": 2.5872501373291015, "memory(GiB)": 77.56, "step": 33075, "token_acc": 0.4855072463768116, "train_speed(iter/s)": 1.440206 }, { "epoch": 1.4172486183111264, "grad_norm": 4.394386291503906, "learning_rate": 8.145180923931184e-05, "loss": 2.4042551040649416, "memory(GiB)": 77.56, "step": 33080, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 1.440248 }, { "epoch": 1.4174628336403754, "grad_norm": 5.2622389793396, "learning_rate": 8.144657739024408e-05, "loss": 2.613916778564453, "memory(GiB)": 77.56, "step": 33085, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.440314 }, { "epoch": 1.4176770489696242, "grad_norm": 3.2025272846221924, "learning_rate": 8.144134497149207e-05, "loss": 2.5696949005126952, "memory(GiB)": 77.56, "step": 33090, "token_acc": 0.4879518072289157, "train_speed(iter/s)": 1.440379 }, { "epoch": 1.4178912642988732, "grad_norm": 5.146648406982422, "learning_rate": 8.14361119831506e-05, "loss": 2.5399837493896484, "memory(GiB)": 77.56, "step": 33095, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.440416 }, { "epoch": 1.4181054796281223, "grad_norm": 4.936645984649658, "learning_rate": 8.143087842531447e-05, "loss": 2.317360687255859, "memory(GiB)": 77.56, "step": 33100, "token_acc": 0.5159010600706714, "train_speed(iter/s)": 1.44048 }, { "epoch": 1.418319694957371, "grad_norm": 4.946645259857178, "learning_rate": 8.142564429807851e-05, "loss": 2.5198184967041017, "memory(GiB)": 77.56, "step": 33105, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.440442 }, { "epoch": 1.4185339102866201, "grad_norm": 8.604920387268066, "learning_rate": 8.142040960153749e-05, "loss": 2.4088836669921876, "memory(GiB)": 77.56, "step": 33110, "token_acc": 0.47345132743362833, "train_speed(iter/s)": 1.440396 }, { "epoch": 1.4187481256158692, "grad_norm": 3.7186927795410156, "learning_rate": 8.141517433578632e-05, "loss": 2.599583053588867, "memory(GiB)": 77.56, "step": 33115, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.440343 }, { "epoch": 1.418962340945118, "grad_norm": 6.1346845626831055, "learning_rate": 8.140993850091977e-05, "loss": 2.4547145843505858, "memory(GiB)": 77.56, "step": 33120, "token_acc": 0.453416149068323, "train_speed(iter/s)": 1.440381 }, { "epoch": 1.419176556274367, "grad_norm": 3.691970109939575, "learning_rate": 8.140470209703273e-05, "loss": 2.4248403549194335, "memory(GiB)": 77.56, "step": 33125, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.440413 }, { "epoch": 1.419390771603616, "grad_norm": 4.270996570587158, "learning_rate": 8.139946512422003e-05, "loss": 2.259629249572754, "memory(GiB)": 77.56, "step": 33130, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.44043 }, { "epoch": 1.4196049869328649, "grad_norm": 4.1327033042907715, "learning_rate": 8.139422758257658e-05, "loss": 2.228342819213867, "memory(GiB)": 77.56, "step": 33135, "token_acc": 0.5047923322683706, "train_speed(iter/s)": 1.440342 }, { "epoch": 1.4198192022621139, "grad_norm": 5.204843521118164, "learning_rate": 8.138898947219724e-05, "loss": 2.8300018310546875, "memory(GiB)": 77.56, "step": 33140, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.440369 }, { "epoch": 1.420033417591363, "grad_norm": 6.835001468658447, "learning_rate": 8.138375079317693e-05, "loss": 2.3787372589111326, "memory(GiB)": 77.56, "step": 33145, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.440396 }, { "epoch": 1.4202476329206117, "grad_norm": 6.716424465179443, "learning_rate": 8.137851154561051e-05, "loss": 2.496069145202637, "memory(GiB)": 77.56, "step": 33150, "token_acc": 0.4536082474226804, "train_speed(iter/s)": 1.440445 }, { "epoch": 1.4204618482498608, "grad_norm": 4.852542877197266, "learning_rate": 8.137327172959294e-05, "loss": 2.508481979370117, "memory(GiB)": 77.56, "step": 33155, "token_acc": 0.4758364312267658, "train_speed(iter/s)": 1.440472 }, { "epoch": 1.4206760635791098, "grad_norm": 4.746988773345947, "learning_rate": 8.136803134521912e-05, "loss": 2.7087955474853516, "memory(GiB)": 77.56, "step": 33160, "token_acc": 0.44904458598726116, "train_speed(iter/s)": 1.440505 }, { "epoch": 1.4208902789083586, "grad_norm": 4.003751277923584, "learning_rate": 8.136279039258397e-05, "loss": 2.5319950103759767, "memory(GiB)": 77.56, "step": 33165, "token_acc": 0.471875, "train_speed(iter/s)": 1.440556 }, { "epoch": 1.4211044942376077, "grad_norm": 5.177649974822998, "learning_rate": 8.135754887178246e-05, "loss": 2.4351833343505858, "memory(GiB)": 77.56, "step": 33170, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.440596 }, { "epoch": 1.4213187095668567, "grad_norm": 4.527562141418457, "learning_rate": 8.135230678290953e-05, "loss": 2.467445945739746, "memory(GiB)": 77.56, "step": 33175, "token_acc": 0.4709480122324159, "train_speed(iter/s)": 1.440629 }, { "epoch": 1.4215329248961055, "grad_norm": 4.752240180969238, "learning_rate": 8.134706412606014e-05, "loss": 2.393367385864258, "memory(GiB)": 77.56, "step": 33180, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.440594 }, { "epoch": 1.4217471402253545, "grad_norm": 6.1346540451049805, "learning_rate": 8.134182090132931e-05, "loss": 2.874431037902832, "memory(GiB)": 77.56, "step": 33185, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.440578 }, { "epoch": 1.4219613555546036, "grad_norm": 4.7812652587890625, "learning_rate": 8.133657710881197e-05, "loss": 2.514013671875, "memory(GiB)": 77.56, "step": 33190, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.440607 }, { "epoch": 1.4221755708838524, "grad_norm": 5.245537757873535, "learning_rate": 8.133133274860316e-05, "loss": 2.1866266250610353, "memory(GiB)": 77.56, "step": 33195, "token_acc": 0.524, "train_speed(iter/s)": 1.440653 }, { "epoch": 1.4223897862131014, "grad_norm": 5.887220859527588, "learning_rate": 8.132608782079785e-05, "loss": 2.630213737487793, "memory(GiB)": 77.56, "step": 33200, "token_acc": 0.4420731707317073, "train_speed(iter/s)": 1.440677 }, { "epoch": 1.4226040015423504, "grad_norm": 4.989991664886475, "learning_rate": 8.132084232549107e-05, "loss": 2.6953182220458984, "memory(GiB)": 77.56, "step": 33205, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 1.440721 }, { "epoch": 1.4228182168715993, "grad_norm": 4.984920501708984, "learning_rate": 8.131559626277784e-05, "loss": 2.6882429122924805, "memory(GiB)": 77.56, "step": 33210, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 1.440663 }, { "epoch": 1.4230324322008483, "grad_norm": 4.190038681030273, "learning_rate": 8.131034963275324e-05, "loss": 2.522857093811035, "memory(GiB)": 77.56, "step": 33215, "token_acc": 0.47413793103448276, "train_speed(iter/s)": 1.440659 }, { "epoch": 1.4232466475300973, "grad_norm": 5.334013938903809, "learning_rate": 8.130510243551227e-05, "loss": 2.3469905853271484, "memory(GiB)": 77.56, "step": 33220, "token_acc": 0.4623287671232877, "train_speed(iter/s)": 1.44067 }, { "epoch": 1.4234608628593461, "grad_norm": 4.802310466766357, "learning_rate": 8.129985467115e-05, "loss": 2.711520195007324, "memory(GiB)": 77.56, "step": 33225, "token_acc": 0.4635036496350365, "train_speed(iter/s)": 1.440702 }, { "epoch": 1.4236750781885952, "grad_norm": 6.179851531982422, "learning_rate": 8.12946063397615e-05, "loss": 2.578750419616699, "memory(GiB)": 77.56, "step": 33230, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.4407 }, { "epoch": 1.4238892935178442, "grad_norm": 4.649274826049805, "learning_rate": 8.128935744144186e-05, "loss": 2.459977722167969, "memory(GiB)": 77.56, "step": 33235, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.440694 }, { "epoch": 1.424103508847093, "grad_norm": 7.516111373901367, "learning_rate": 8.128410797628615e-05, "loss": 2.461636543273926, "memory(GiB)": 77.56, "step": 33240, "token_acc": 0.4908424908424908, "train_speed(iter/s)": 1.440705 }, { "epoch": 1.424317724176342, "grad_norm": 3.97896409034729, "learning_rate": 8.127885794438947e-05, "loss": 2.478633499145508, "memory(GiB)": 77.56, "step": 33245, "token_acc": 0.48881789137380194, "train_speed(iter/s)": 1.440735 }, { "epoch": 1.424531939505591, "grad_norm": 4.637205600738525, "learning_rate": 8.127360734584695e-05, "loss": 2.559110641479492, "memory(GiB)": 77.56, "step": 33250, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.440746 }, { "epoch": 1.42474615483484, "grad_norm": 3.8903262615203857, "learning_rate": 8.126835618075368e-05, "loss": 2.4842920303344727, "memory(GiB)": 77.56, "step": 33255, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.440734 }, { "epoch": 1.424960370164089, "grad_norm": 4.428865432739258, "learning_rate": 8.126310444920482e-05, "loss": 2.4514419555664064, "memory(GiB)": 77.56, "step": 33260, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.440746 }, { "epoch": 1.425174585493338, "grad_norm": 5.462167739868164, "learning_rate": 8.12578521512955e-05, "loss": 2.2549291610717774, "memory(GiB)": 77.56, "step": 33265, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.440782 }, { "epoch": 1.4253888008225868, "grad_norm": 6.441885471343994, "learning_rate": 8.125259928712085e-05, "loss": 2.4167686462402345, "memory(GiB)": 77.56, "step": 33270, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.440776 }, { "epoch": 1.4256030161518358, "grad_norm": 5.584488391876221, "learning_rate": 8.124734585677606e-05, "loss": 2.8530174255371095, "memory(GiB)": 77.56, "step": 33275, "token_acc": 0.40625, "train_speed(iter/s)": 1.440778 }, { "epoch": 1.4258172314810849, "grad_norm": 3.919590473175049, "learning_rate": 8.124209186035627e-05, "loss": 2.5603870391845702, "memory(GiB)": 77.56, "step": 33280, "token_acc": 0.5082508250825083, "train_speed(iter/s)": 1.440757 }, { "epoch": 1.4260314468103337, "grad_norm": 6.487667560577393, "learning_rate": 8.12368372979567e-05, "loss": 2.7708879470825196, "memory(GiB)": 77.56, "step": 33285, "token_acc": 0.4471830985915493, "train_speed(iter/s)": 1.440758 }, { "epoch": 1.4262456621395827, "grad_norm": 4.309908390045166, "learning_rate": 8.12315821696725e-05, "loss": 2.498444747924805, "memory(GiB)": 77.56, "step": 33290, "token_acc": 0.4337748344370861, "train_speed(iter/s)": 1.440776 }, { "epoch": 1.4264598774688317, "grad_norm": 4.262087821960449, "learning_rate": 8.122632647559891e-05, "loss": 2.708254814147949, "memory(GiB)": 77.56, "step": 33295, "token_acc": 0.45425867507886436, "train_speed(iter/s)": 1.4408 }, { "epoch": 1.4266740927980806, "grad_norm": 4.266010284423828, "learning_rate": 8.122107021583112e-05, "loss": 2.502963066101074, "memory(GiB)": 77.56, "step": 33300, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.440785 }, { "epoch": 1.4268883081273296, "grad_norm": 3.634632110595703, "learning_rate": 8.121581339046433e-05, "loss": 2.5300493240356445, "memory(GiB)": 77.56, "step": 33305, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.440799 }, { "epoch": 1.4271025234565786, "grad_norm": 4.688873291015625, "learning_rate": 8.121055599959382e-05, "loss": 2.2768619537353514, "memory(GiB)": 77.56, "step": 33310, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.440785 }, { "epoch": 1.4273167387858274, "grad_norm": 3.5685460567474365, "learning_rate": 8.120529804331482e-05, "loss": 2.3236207962036133, "memory(GiB)": 77.56, "step": 33315, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.440739 }, { "epoch": 1.4275309541150765, "grad_norm": 5.878200054168701, "learning_rate": 8.120003952172255e-05, "loss": 2.494319534301758, "memory(GiB)": 77.56, "step": 33320, "token_acc": 0.4607142857142857, "train_speed(iter/s)": 1.440738 }, { "epoch": 1.4277451694443255, "grad_norm": 5.4714508056640625, "learning_rate": 8.119478043491232e-05, "loss": 2.4719051361083983, "memory(GiB)": 77.56, "step": 33325, "token_acc": 0.5308641975308642, "train_speed(iter/s)": 1.440722 }, { "epoch": 1.4279593847735743, "grad_norm": 5.3441243171691895, "learning_rate": 8.118952078297936e-05, "loss": 2.7524444580078127, "memory(GiB)": 77.56, "step": 33330, "token_acc": 0.4388059701492537, "train_speed(iter/s)": 1.440764 }, { "epoch": 1.4281736001028233, "grad_norm": 6.843552589416504, "learning_rate": 8.118426056601897e-05, "loss": 2.4353818893432617, "memory(GiB)": 77.56, "step": 33335, "token_acc": 0.502092050209205, "train_speed(iter/s)": 1.440708 }, { "epoch": 1.4283878154320724, "grad_norm": 4.470619201660156, "learning_rate": 8.117899978412646e-05, "loss": 2.3507705688476563, "memory(GiB)": 77.56, "step": 33340, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 1.440726 }, { "epoch": 1.4286020307613212, "grad_norm": 5.78504753112793, "learning_rate": 8.11737384373971e-05, "loss": 2.5856754302978517, "memory(GiB)": 77.56, "step": 33345, "token_acc": 0.4708029197080292, "train_speed(iter/s)": 1.440715 }, { "epoch": 1.4288162460905702, "grad_norm": 5.120090007781982, "learning_rate": 8.116847652592626e-05, "loss": 2.3425918579101563, "memory(GiB)": 77.56, "step": 33350, "token_acc": 0.5191489361702127, "train_speed(iter/s)": 1.440728 }, { "epoch": 1.4290304614198193, "grad_norm": 4.703099727630615, "learning_rate": 8.11632140498092e-05, "loss": 2.572285842895508, "memory(GiB)": 77.56, "step": 33355, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.440767 }, { "epoch": 1.429244676749068, "grad_norm": 5.022765636444092, "learning_rate": 8.115795100914129e-05, "loss": 2.5505302429199217, "memory(GiB)": 77.56, "step": 33360, "token_acc": 0.46586345381526106, "train_speed(iter/s)": 1.440705 }, { "epoch": 1.4294588920783171, "grad_norm": 6.761358261108398, "learning_rate": 8.115268740401787e-05, "loss": 2.746389389038086, "memory(GiB)": 77.56, "step": 33365, "token_acc": 0.5104602510460251, "train_speed(iter/s)": 1.440666 }, { "epoch": 1.4296731074075661, "grad_norm": 4.111597537994385, "learning_rate": 8.114742323453431e-05, "loss": 2.6209781646728514, "memory(GiB)": 77.56, "step": 33370, "token_acc": 0.45791245791245794, "train_speed(iter/s)": 1.440675 }, { "epoch": 1.429887322736815, "grad_norm": 5.42866325378418, "learning_rate": 8.114215850078595e-05, "loss": 2.26412353515625, "memory(GiB)": 77.56, "step": 33375, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.440706 }, { "epoch": 1.430101538066064, "grad_norm": 4.153022289276123, "learning_rate": 8.113689320286817e-05, "loss": 2.988064193725586, "memory(GiB)": 77.56, "step": 33380, "token_acc": 0.446875, "train_speed(iter/s)": 1.440737 }, { "epoch": 1.430315753395313, "grad_norm": 4.300074100494385, "learning_rate": 8.113162734087636e-05, "loss": 2.5837913513183595, "memory(GiB)": 77.56, "step": 33385, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.440804 }, { "epoch": 1.4305299687245618, "grad_norm": 5.344752788543701, "learning_rate": 8.112636091490591e-05, "loss": 2.658455657958984, "memory(GiB)": 77.56, "step": 33390, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.440818 }, { "epoch": 1.4307441840538109, "grad_norm": 6.4714765548706055, "learning_rate": 8.112109392505225e-05, "loss": 2.6288711547851564, "memory(GiB)": 77.56, "step": 33395, "token_acc": 0.4868913857677903, "train_speed(iter/s)": 1.440838 }, { "epoch": 1.43095839938306, "grad_norm": 5.565735816955566, "learning_rate": 8.111582637141078e-05, "loss": 2.7491548538208006, "memory(GiB)": 77.56, "step": 33400, "token_acc": 0.44921875, "train_speed(iter/s)": 1.440848 }, { "epoch": 1.4311726147123087, "grad_norm": 5.314371109008789, "learning_rate": 8.11105582540769e-05, "loss": 2.509299468994141, "memory(GiB)": 77.56, "step": 33405, "token_acc": 0.5361842105263158, "train_speed(iter/s)": 1.440834 }, { "epoch": 1.4313868300415578, "grad_norm": 5.368152618408203, "learning_rate": 8.11052895731461e-05, "loss": 2.621249961853027, "memory(GiB)": 77.56, "step": 33410, "token_acc": 0.476, "train_speed(iter/s)": 1.440815 }, { "epoch": 1.4316010453708068, "grad_norm": 5.408423900604248, "learning_rate": 8.11000203287138e-05, "loss": 2.6001571655273437, "memory(GiB)": 77.56, "step": 33415, "token_acc": 0.4479495268138801, "train_speed(iter/s)": 1.440862 }, { "epoch": 1.4318152607000556, "grad_norm": 4.377597332000732, "learning_rate": 8.109475052087543e-05, "loss": 2.71426944732666, "memory(GiB)": 77.56, "step": 33420, "token_acc": 0.4377358490566038, "train_speed(iter/s)": 1.440901 }, { "epoch": 1.4320294760293046, "grad_norm": 4.818597316741943, "learning_rate": 8.108948014972652e-05, "loss": 2.6134973526000977, "memory(GiB)": 77.56, "step": 33425, "token_acc": 0.4291044776119403, "train_speed(iter/s)": 1.440959 }, { "epoch": 1.4322436913585537, "grad_norm": 3.9246184825897217, "learning_rate": 8.108420921536248e-05, "loss": 2.654636573791504, "memory(GiB)": 77.56, "step": 33430, "token_acc": 0.47896440129449835, "train_speed(iter/s)": 1.440968 }, { "epoch": 1.4324579066878025, "grad_norm": 4.747319221496582, "learning_rate": 8.107893771787885e-05, "loss": 2.6029773712158204, "memory(GiB)": 77.56, "step": 33435, "token_acc": 0.43234323432343236, "train_speed(iter/s)": 1.440959 }, { "epoch": 1.4326721220170515, "grad_norm": 5.151585578918457, "learning_rate": 8.107366565737112e-05, "loss": 2.3758398056030274, "memory(GiB)": 77.56, "step": 33440, "token_acc": 0.5251798561151079, "train_speed(iter/s)": 1.440962 }, { "epoch": 1.4328863373463006, "grad_norm": 5.947368144989014, "learning_rate": 8.106839303393476e-05, "loss": 2.5448598861694336, "memory(GiB)": 77.56, "step": 33445, "token_acc": 0.4495114006514658, "train_speed(iter/s)": 1.440992 }, { "epoch": 1.4331005526755494, "grad_norm": 5.890367031097412, "learning_rate": 8.106311984766535e-05, "loss": 2.8148525238037108, "memory(GiB)": 77.56, "step": 33450, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.441044 }, { "epoch": 1.4333147680047984, "grad_norm": 5.796880722045898, "learning_rate": 8.105784609865835e-05, "loss": 2.650727081298828, "memory(GiB)": 77.56, "step": 33455, "token_acc": 0.4901185770750988, "train_speed(iter/s)": 1.441091 }, { "epoch": 1.4335289833340474, "grad_norm": 4.480291843414307, "learning_rate": 8.105257178700935e-05, "loss": 2.385071563720703, "memory(GiB)": 77.56, "step": 33460, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.44112 }, { "epoch": 1.4337431986632962, "grad_norm": 4.908984661102295, "learning_rate": 8.104729691281387e-05, "loss": 3.141388702392578, "memory(GiB)": 77.56, "step": 33465, "token_acc": 0.38509316770186336, "train_speed(iter/s)": 1.441105 }, { "epoch": 1.4339574139925453, "grad_norm": 6.158671855926514, "learning_rate": 8.10420214761675e-05, "loss": 2.772718048095703, "memory(GiB)": 77.56, "step": 33470, "token_acc": 0.4222873900293255, "train_speed(iter/s)": 1.44111 }, { "epoch": 1.4341716293217943, "grad_norm": 4.42659330368042, "learning_rate": 8.103674547716577e-05, "loss": 2.409808349609375, "memory(GiB)": 77.56, "step": 33475, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.441106 }, { "epoch": 1.4343858446510431, "grad_norm": 8.199884414672852, "learning_rate": 8.10314689159043e-05, "loss": 2.42923641204834, "memory(GiB)": 77.56, "step": 33480, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.441059 }, { "epoch": 1.4346000599802922, "grad_norm": 5.221634387969971, "learning_rate": 8.102619179247866e-05, "loss": 2.596410369873047, "memory(GiB)": 77.56, "step": 33485, "token_acc": 0.45808383233532934, "train_speed(iter/s)": 1.441096 }, { "epoch": 1.4348142753095412, "grad_norm": 5.489154815673828, "learning_rate": 8.102091410698445e-05, "loss": 2.6875179290771483, "memory(GiB)": 77.56, "step": 33490, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.441123 }, { "epoch": 1.43502849063879, "grad_norm": 3.84725284576416, "learning_rate": 8.101563585951728e-05, "loss": 2.7381092071533204, "memory(GiB)": 77.56, "step": 33495, "token_acc": 0.4570552147239264, "train_speed(iter/s)": 1.441149 }, { "epoch": 1.435242705968039, "grad_norm": 4.275827407836914, "learning_rate": 8.101035705017277e-05, "loss": 2.7338294982910156, "memory(GiB)": 77.56, "step": 33500, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.441155 }, { "epoch": 1.435242705968039, "eval_loss": 2.3013715744018555, "eval_runtime": 14.4887, "eval_samples_per_second": 6.902, "eval_steps_per_second": 6.902, "eval_token_acc": 0.4812760055478502, "step": 33500 }, { "epoch": 1.435456921297288, "grad_norm": 4.167738914489746, "learning_rate": 8.100507767904653e-05, "loss": 2.466640853881836, "memory(GiB)": 77.56, "step": 33505, "token_acc": 0.4852240228789323, "train_speed(iter/s)": 1.440202 }, { "epoch": 1.435671136626537, "grad_norm": 5.381674766540527, "learning_rate": 8.099979774623425e-05, "loss": 2.276072311401367, "memory(GiB)": 77.56, "step": 33510, "token_acc": 0.5246636771300448, "train_speed(iter/s)": 1.440171 }, { "epoch": 1.435885351955786, "grad_norm": 6.185641765594482, "learning_rate": 8.099451725183154e-05, "loss": 2.246568298339844, "memory(GiB)": 77.56, "step": 33515, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.440168 }, { "epoch": 1.436099567285035, "grad_norm": 5.230942249298096, "learning_rate": 8.098923619593409e-05, "loss": 2.6886354446411134, "memory(GiB)": 77.56, "step": 33520, "token_acc": 0.5145631067961165, "train_speed(iter/s)": 1.440159 }, { "epoch": 1.4363137826142838, "grad_norm": 4.901467323303223, "learning_rate": 8.098395457863755e-05, "loss": 2.153408241271973, "memory(GiB)": 77.56, "step": 33525, "token_acc": 0.5289256198347108, "train_speed(iter/s)": 1.440173 }, { "epoch": 1.4365279979435328, "grad_norm": 4.4969682693481445, "learning_rate": 8.097867240003761e-05, "loss": 2.5498191833496096, "memory(GiB)": 77.56, "step": 33530, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.440214 }, { "epoch": 1.4367422132727818, "grad_norm": 4.1007561683654785, "learning_rate": 8.097338966022993e-05, "loss": 2.7983366012573243, "memory(GiB)": 77.56, "step": 33535, "token_acc": 0.4366576819407008, "train_speed(iter/s)": 1.44024 }, { "epoch": 1.4369564286020307, "grad_norm": 6.27601957321167, "learning_rate": 8.096810635931026e-05, "loss": 2.673727035522461, "memory(GiB)": 77.56, "step": 33540, "token_acc": 0.4358974358974359, "train_speed(iter/s)": 1.440259 }, { "epoch": 1.4371706439312797, "grad_norm": 4.566864013671875, "learning_rate": 8.09628224973743e-05, "loss": 2.391421318054199, "memory(GiB)": 77.56, "step": 33545, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.440217 }, { "epoch": 1.4373848592605287, "grad_norm": 4.772444248199463, "learning_rate": 8.095753807451777e-05, "loss": 2.640765380859375, "memory(GiB)": 77.56, "step": 33550, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.440207 }, { "epoch": 1.4375990745897775, "grad_norm": 4.421515464782715, "learning_rate": 8.095225309083638e-05, "loss": 2.428546333312988, "memory(GiB)": 77.56, "step": 33555, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.440212 }, { "epoch": 1.4378132899190266, "grad_norm": 4.157040596008301, "learning_rate": 8.094696754642591e-05, "loss": 2.5101404190063477, "memory(GiB)": 77.56, "step": 33560, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.44025 }, { "epoch": 1.4380275052482756, "grad_norm": 5.607004642486572, "learning_rate": 8.094168144138208e-05, "loss": 2.6087614059448243, "memory(GiB)": 77.56, "step": 33565, "token_acc": 0.5097276264591439, "train_speed(iter/s)": 1.440266 }, { "epoch": 1.4382417205775244, "grad_norm": 6.247413635253906, "learning_rate": 8.093639477580066e-05, "loss": 2.7758199691772463, "memory(GiB)": 77.56, "step": 33570, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.440255 }, { "epoch": 1.4384559359067735, "grad_norm": 4.286452293395996, "learning_rate": 8.093110754977744e-05, "loss": 2.545228958129883, "memory(GiB)": 77.56, "step": 33575, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.440269 }, { "epoch": 1.4386701512360225, "grad_norm": 4.8186564445495605, "learning_rate": 8.092581976340819e-05, "loss": 2.361837387084961, "memory(GiB)": 77.56, "step": 33580, "token_acc": 0.5110294117647058, "train_speed(iter/s)": 1.440284 }, { "epoch": 1.4388843665652713, "grad_norm": 4.426604747772217, "learning_rate": 8.09205314167887e-05, "loss": 2.558700752258301, "memory(GiB)": 77.56, "step": 33585, "token_acc": 0.45787545787545786, "train_speed(iter/s)": 1.440293 }, { "epoch": 1.4390985818945203, "grad_norm": 6.376096248626709, "learning_rate": 8.091524251001476e-05, "loss": 2.339290428161621, "memory(GiB)": 77.56, "step": 33590, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.440313 }, { "epoch": 1.4393127972237694, "grad_norm": 10.38262939453125, "learning_rate": 8.090995304318224e-05, "loss": 3.107723426818848, "memory(GiB)": 77.56, "step": 33595, "token_acc": 0.4064748201438849, "train_speed(iter/s)": 1.440254 }, { "epoch": 1.4395270125530182, "grad_norm": 5.614473342895508, "learning_rate": 8.090466301638688e-05, "loss": 2.6898096084594725, "memory(GiB)": 77.56, "step": 33600, "token_acc": 0.4127906976744186, "train_speed(iter/s)": 1.440248 }, { "epoch": 1.4397412278822672, "grad_norm": 6.232783794403076, "learning_rate": 8.089937242972459e-05, "loss": 2.3672855377197264, "memory(GiB)": 77.56, "step": 33605, "token_acc": 0.5203252032520326, "train_speed(iter/s)": 1.440292 }, { "epoch": 1.4399554432115163, "grad_norm": 6.109330177307129, "learning_rate": 8.089408128329118e-05, "loss": 2.4628347396850585, "memory(GiB)": 77.56, "step": 33610, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 1.440275 }, { "epoch": 1.440169658540765, "grad_norm": 3.8369626998901367, "learning_rate": 8.088878957718249e-05, "loss": 2.786264419555664, "memory(GiB)": 77.56, "step": 33615, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.440301 }, { "epoch": 1.440383873870014, "grad_norm": 5.966667175292969, "learning_rate": 8.088349731149441e-05, "loss": 2.650204658508301, "memory(GiB)": 77.56, "step": 33620, "token_acc": 0.44015444015444016, "train_speed(iter/s)": 1.440333 }, { "epoch": 1.4405980891992631, "grad_norm": 4.497265815734863, "learning_rate": 8.087820448632282e-05, "loss": 2.65279483795166, "memory(GiB)": 77.56, "step": 33625, "token_acc": 0.4804804804804805, "train_speed(iter/s)": 1.440357 }, { "epoch": 1.440812304528512, "grad_norm": 4.815916538238525, "learning_rate": 8.087291110176355e-05, "loss": 2.3260820388793944, "memory(GiB)": 77.56, "step": 33630, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.440377 }, { "epoch": 1.441026519857761, "grad_norm": 4.615348815917969, "learning_rate": 8.086761715791258e-05, "loss": 2.050550079345703, "memory(GiB)": 77.56, "step": 33635, "token_acc": 0.5330578512396694, "train_speed(iter/s)": 1.440399 }, { "epoch": 1.44124073518701, "grad_norm": 6.801609516143799, "learning_rate": 8.086232265486576e-05, "loss": 2.616620635986328, "memory(GiB)": 77.56, "step": 33640, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.440388 }, { "epoch": 1.4414549505162588, "grad_norm": 5.370550155639648, "learning_rate": 8.085702759271899e-05, "loss": 2.580626678466797, "memory(GiB)": 77.56, "step": 33645, "token_acc": 0.4515235457063712, "train_speed(iter/s)": 1.440395 }, { "epoch": 1.4416691658455079, "grad_norm": 4.841226577758789, "learning_rate": 8.085173197156824e-05, "loss": 2.6208080291748046, "memory(GiB)": 77.56, "step": 33650, "token_acc": 0.42474916387959866, "train_speed(iter/s)": 1.440408 }, { "epoch": 1.441883381174757, "grad_norm": 4.60963773727417, "learning_rate": 8.084643579150941e-05, "loss": 2.538687896728516, "memory(GiB)": 77.56, "step": 33655, "token_acc": 0.4828897338403042, "train_speed(iter/s)": 1.440344 }, { "epoch": 1.4420975965040057, "grad_norm": 5.9481306076049805, "learning_rate": 8.084113905263846e-05, "loss": 2.4066253662109376, "memory(GiB)": 77.56, "step": 33660, "token_acc": 0.5037878787878788, "train_speed(iter/s)": 1.440364 }, { "epoch": 1.4423118118332547, "grad_norm": 6.224748134613037, "learning_rate": 8.083584175505134e-05, "loss": 2.3255426406860353, "memory(GiB)": 77.56, "step": 33665, "token_acc": 0.49586776859504134, "train_speed(iter/s)": 1.440435 }, { "epoch": 1.4425260271625038, "grad_norm": 5.452847003936768, "learning_rate": 8.083054389884405e-05, "loss": 2.5483604431152345, "memory(GiB)": 77.56, "step": 33670, "token_acc": 0.472, "train_speed(iter/s)": 1.4404 }, { "epoch": 1.4427402424917526, "grad_norm": 4.419780731201172, "learning_rate": 8.082524548411252e-05, "loss": 2.5582042694091798, "memory(GiB)": 77.56, "step": 33675, "token_acc": 0.46546546546546547, "train_speed(iter/s)": 1.440401 }, { "epoch": 1.4429544578210016, "grad_norm": 6.002676486968994, "learning_rate": 8.081994651095273e-05, "loss": 2.4643436431884767, "memory(GiB)": 77.56, "step": 33680, "token_acc": 0.46686746987951805, "train_speed(iter/s)": 1.44037 }, { "epoch": 1.4431686731502507, "grad_norm": 4.702930450439453, "learning_rate": 8.081464697946072e-05, "loss": 2.578280448913574, "memory(GiB)": 77.56, "step": 33685, "token_acc": 0.46742209631728043, "train_speed(iter/s)": 1.440331 }, { "epoch": 1.4433828884794995, "grad_norm": 5.671988487243652, "learning_rate": 8.080934688973248e-05, "loss": 2.604653549194336, "memory(GiB)": 77.56, "step": 33690, "token_acc": 0.4588607594936709, "train_speed(iter/s)": 1.440353 }, { "epoch": 1.4435971038087485, "grad_norm": 4.508692741394043, "learning_rate": 8.0804046241864e-05, "loss": 2.448164939880371, "memory(GiB)": 77.56, "step": 33695, "token_acc": 0.4881889763779528, "train_speed(iter/s)": 1.440331 }, { "epoch": 1.4438113191379975, "grad_norm": 5.394710063934326, "learning_rate": 8.079874503595133e-05, "loss": 2.5911964416503905, "memory(GiB)": 77.56, "step": 33700, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.440293 }, { "epoch": 1.4440255344672464, "grad_norm": 5.142947673797607, "learning_rate": 8.079344327209051e-05, "loss": 2.55928955078125, "memory(GiB)": 77.56, "step": 33705, "token_acc": 0.4515235457063712, "train_speed(iter/s)": 1.440301 }, { "epoch": 1.4442397497964954, "grad_norm": 4.448504447937012, "learning_rate": 8.078814095037758e-05, "loss": 2.5307140350341797, "memory(GiB)": 77.56, "step": 33710, "token_acc": 0.4859154929577465, "train_speed(iter/s)": 1.44031 }, { "epoch": 1.4444539651257444, "grad_norm": 5.810717582702637, "learning_rate": 8.078283807090858e-05, "loss": 2.7283184051513674, "memory(GiB)": 77.56, "step": 33715, "token_acc": 0.4457831325301205, "train_speed(iter/s)": 1.440318 }, { "epoch": 1.4446681804549932, "grad_norm": 4.456352710723877, "learning_rate": 8.077753463377962e-05, "loss": 2.76055965423584, "memory(GiB)": 77.56, "step": 33720, "token_acc": 0.444794952681388, "train_speed(iter/s)": 1.4404 }, { "epoch": 1.4448823957842423, "grad_norm": 5.7959184646606445, "learning_rate": 8.07722306390867e-05, "loss": 2.332366371154785, "memory(GiB)": 77.56, "step": 33725, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 1.440414 }, { "epoch": 1.4450966111134913, "grad_norm": 6.644979476928711, "learning_rate": 8.0766926086926e-05, "loss": 2.443693161010742, "memory(GiB)": 77.56, "step": 33730, "token_acc": 0.4854014598540146, "train_speed(iter/s)": 1.440367 }, { "epoch": 1.4453108264427401, "grad_norm": 4.913311958312988, "learning_rate": 8.076162097739356e-05, "loss": 2.470724105834961, "memory(GiB)": 77.56, "step": 33735, "token_acc": 0.4783950617283951, "train_speed(iter/s)": 1.440351 }, { "epoch": 1.4455250417719891, "grad_norm": 5.47266149520874, "learning_rate": 8.07563153105855e-05, "loss": 2.40041561126709, "memory(GiB)": 77.56, "step": 33740, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.44034 }, { "epoch": 1.4457392571012382, "grad_norm": 5.7080078125, "learning_rate": 8.075100908659793e-05, "loss": 3.122731399536133, "memory(GiB)": 77.56, "step": 33745, "token_acc": 0.4129032258064516, "train_speed(iter/s)": 1.440365 }, { "epoch": 1.445953472430487, "grad_norm": 4.789261817932129, "learning_rate": 8.074570230552698e-05, "loss": 2.2112518310546876, "memory(GiB)": 77.56, "step": 33750, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 1.44038 }, { "epoch": 1.446167687759736, "grad_norm": 5.958988666534424, "learning_rate": 8.074039496746881e-05, "loss": 2.8660274505615235, "memory(GiB)": 77.56, "step": 33755, "token_acc": 0.4250871080139373, "train_speed(iter/s)": 1.440428 }, { "epoch": 1.446381903088985, "grad_norm": 4.641246795654297, "learning_rate": 8.073508707251953e-05, "loss": 2.675687789916992, "memory(GiB)": 77.56, "step": 33760, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.440476 }, { "epoch": 1.4465961184182339, "grad_norm": 4.817697525024414, "learning_rate": 8.072977862077532e-05, "loss": 2.4350305557250977, "memory(GiB)": 77.56, "step": 33765, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.440417 }, { "epoch": 1.446810333747483, "grad_norm": 4.625699996948242, "learning_rate": 8.072446961233236e-05, "loss": 2.404711151123047, "memory(GiB)": 77.56, "step": 33770, "token_acc": 0.4602510460251046, "train_speed(iter/s)": 1.440367 }, { "epoch": 1.447024549076732, "grad_norm": 4.81039571762085, "learning_rate": 8.07191600472868e-05, "loss": 2.301344108581543, "memory(GiB)": 77.56, "step": 33775, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.440397 }, { "epoch": 1.4472387644059808, "grad_norm": 4.775697231292725, "learning_rate": 8.071384992573482e-05, "loss": 2.5201833724975584, "memory(GiB)": 77.56, "step": 33780, "token_acc": 0.47041420118343197, "train_speed(iter/s)": 1.440404 }, { "epoch": 1.4474529797352298, "grad_norm": 6.714394569396973, "learning_rate": 8.070853924777266e-05, "loss": 2.4296825408935545, "memory(GiB)": 77.56, "step": 33785, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.440403 }, { "epoch": 1.4476671950644788, "grad_norm": 4.200636386871338, "learning_rate": 8.070322801349649e-05, "loss": 2.6939130783081056, "memory(GiB)": 77.56, "step": 33790, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.440429 }, { "epoch": 1.4478814103937276, "grad_norm": 6.712575435638428, "learning_rate": 8.069791622300255e-05, "loss": 2.3527122497558595, "memory(GiB)": 77.56, "step": 33795, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.440389 }, { "epoch": 1.4480956257229767, "grad_norm": 8.108104705810547, "learning_rate": 8.069260387638705e-05, "loss": 2.6927162170410157, "memory(GiB)": 77.56, "step": 33800, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.440373 }, { "epoch": 1.4483098410522257, "grad_norm": 4.842827796936035, "learning_rate": 8.068729097374626e-05, "loss": 2.4400306701660157, "memory(GiB)": 77.56, "step": 33805, "token_acc": 0.470404984423676, "train_speed(iter/s)": 1.440392 }, { "epoch": 1.4485240563814745, "grad_norm": 4.508891582489014, "learning_rate": 8.068197751517638e-05, "loss": 2.4819536209106445, "memory(GiB)": 77.56, "step": 33810, "token_acc": 0.432, "train_speed(iter/s)": 1.440415 }, { "epoch": 1.4487382717107236, "grad_norm": 5.302507400512695, "learning_rate": 8.067666350077372e-05, "loss": 2.615432548522949, "memory(GiB)": 77.56, "step": 33815, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.440389 }, { "epoch": 1.4489524870399726, "grad_norm": 4.059510231018066, "learning_rate": 8.067134893063452e-05, "loss": 2.618773651123047, "memory(GiB)": 77.56, "step": 33820, "token_acc": 0.44591029023746703, "train_speed(iter/s)": 1.440401 }, { "epoch": 1.4491667023692214, "grad_norm": 4.536221027374268, "learning_rate": 8.066603380485505e-05, "loss": 2.615183639526367, "memory(GiB)": 77.56, "step": 33825, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.440448 }, { "epoch": 1.4493809176984704, "grad_norm": 4.616865158081055, "learning_rate": 8.066071812353162e-05, "loss": 2.4374961853027344, "memory(GiB)": 77.56, "step": 33830, "token_acc": 0.47470817120622566, "train_speed(iter/s)": 1.440433 }, { "epoch": 1.4495951330277195, "grad_norm": 4.955784320831299, "learning_rate": 8.06554018867605e-05, "loss": 2.2806678771972657, "memory(GiB)": 77.56, "step": 33835, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.44038 }, { "epoch": 1.4498093483569683, "grad_norm": 4.6067609786987305, "learning_rate": 8.065008509463805e-05, "loss": 2.401715850830078, "memory(GiB)": 77.56, "step": 33840, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.440372 }, { "epoch": 1.4500235636862173, "grad_norm": 5.793132781982422, "learning_rate": 8.064476774726054e-05, "loss": 2.485592269897461, "memory(GiB)": 77.56, "step": 33845, "token_acc": 0.4748427672955975, "train_speed(iter/s)": 1.440347 }, { "epoch": 1.4502377790154664, "grad_norm": 4.4255523681640625, "learning_rate": 8.063944984472431e-05, "loss": 2.5242336273193358, "memory(GiB)": 77.56, "step": 33850, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.440307 }, { "epoch": 1.4504519943447154, "grad_norm": 4.728428840637207, "learning_rate": 8.063413138712572e-05, "loss": 2.763438415527344, "memory(GiB)": 77.56, "step": 33855, "token_acc": 0.4637223974763407, "train_speed(iter/s)": 1.440372 }, { "epoch": 1.4506662096739642, "grad_norm": 4.184111595153809, "learning_rate": 8.062881237456108e-05, "loss": 2.104880714416504, "memory(GiB)": 77.56, "step": 33860, "token_acc": 0.49387755102040815, "train_speed(iter/s)": 1.440404 }, { "epoch": 1.4508804250032132, "grad_norm": 4.85262393951416, "learning_rate": 8.06234928071268e-05, "loss": 2.3321254730224608, "memory(GiB)": 77.56, "step": 33865, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.440425 }, { "epoch": 1.4510946403324623, "grad_norm": 5.855126857757568, "learning_rate": 8.061817268491919e-05, "loss": 2.693760108947754, "memory(GiB)": 77.56, "step": 33870, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 1.440413 }, { "epoch": 1.451308855661711, "grad_norm": 4.04247522354126, "learning_rate": 8.061285200803467e-05, "loss": 2.3854419708251955, "memory(GiB)": 77.56, "step": 33875, "token_acc": 0.5, "train_speed(iter/s)": 1.440419 }, { "epoch": 1.4515230709909601, "grad_norm": 5.262057304382324, "learning_rate": 8.060753077656964e-05, "loss": 2.469320297241211, "memory(GiB)": 77.56, "step": 33880, "token_acc": 0.4230769230769231, "train_speed(iter/s)": 1.440386 }, { "epoch": 1.4517372863202092, "grad_norm": 5.64385986328125, "learning_rate": 8.060220899062045e-05, "loss": 2.6018207550048826, "memory(GiB)": 77.56, "step": 33885, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.440383 }, { "epoch": 1.451951501649458, "grad_norm": 5.172167778015137, "learning_rate": 8.059688665028355e-05, "loss": 2.4049049377441407, "memory(GiB)": 77.56, "step": 33890, "token_acc": 0.49174917491749176, "train_speed(iter/s)": 1.44039 }, { "epoch": 1.452165716978707, "grad_norm": 4.447646141052246, "learning_rate": 8.059156375565535e-05, "loss": 2.5212480545043947, "memory(GiB)": 77.56, "step": 33895, "token_acc": 0.4548736462093863, "train_speed(iter/s)": 1.440356 }, { "epoch": 1.452379932307956, "grad_norm": 4.732133388519287, "learning_rate": 8.058624030683226e-05, "loss": 2.435433578491211, "memory(GiB)": 77.56, "step": 33900, "token_acc": 0.49707602339181284, "train_speed(iter/s)": 1.440351 }, { "epoch": 1.4525941476372048, "grad_norm": 5.007026195526123, "learning_rate": 8.058091630391076e-05, "loss": 2.590679931640625, "memory(GiB)": 77.56, "step": 33905, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.44036 }, { "epoch": 1.4528083629664539, "grad_norm": 4.960092067718506, "learning_rate": 8.057559174698725e-05, "loss": 2.620238494873047, "memory(GiB)": 77.56, "step": 33910, "token_acc": 0.4267241379310345, "train_speed(iter/s)": 1.440333 }, { "epoch": 1.453022578295703, "grad_norm": 5.670502185821533, "learning_rate": 8.05702666361582e-05, "loss": 2.4999961853027344, "memory(GiB)": 77.56, "step": 33915, "token_acc": 0.445578231292517, "train_speed(iter/s)": 1.440369 }, { "epoch": 1.4532367936249517, "grad_norm": 5.052982330322266, "learning_rate": 8.056494097152013e-05, "loss": 2.5328283309936523, "memory(GiB)": 77.56, "step": 33920, "token_acc": 0.4299363057324841, "train_speed(iter/s)": 1.440408 }, { "epoch": 1.4534510089542008, "grad_norm": 4.490011692047119, "learning_rate": 8.055961475316947e-05, "loss": 2.695473289489746, "memory(GiB)": 77.56, "step": 33925, "token_acc": 0.4858156028368794, "train_speed(iter/s)": 1.440443 }, { "epoch": 1.4536652242834498, "grad_norm": 4.542452335357666, "learning_rate": 8.055428798120272e-05, "loss": 2.7107307434082033, "memory(GiB)": 77.56, "step": 33930, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.440468 }, { "epoch": 1.4538794396126986, "grad_norm": 4.045337200164795, "learning_rate": 8.054896065571638e-05, "loss": 2.320671272277832, "memory(GiB)": 77.56, "step": 33935, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.44046 }, { "epoch": 1.4540936549419476, "grad_norm": 4.828177452087402, "learning_rate": 8.054363277680695e-05, "loss": 2.5693618774414064, "memory(GiB)": 77.56, "step": 33940, "token_acc": 0.4768211920529801, "train_speed(iter/s)": 1.440465 }, { "epoch": 1.4543078702711967, "grad_norm": 4.6465301513671875, "learning_rate": 8.053830434457097e-05, "loss": 2.666273498535156, "memory(GiB)": 77.56, "step": 33945, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.440514 }, { "epoch": 1.4545220856004455, "grad_norm": 5.002664089202881, "learning_rate": 8.053297535910496e-05, "loss": 3.001188850402832, "memory(GiB)": 77.56, "step": 33950, "token_acc": 0.43034055727554177, "train_speed(iter/s)": 1.440543 }, { "epoch": 1.4547363009296945, "grad_norm": 3.7473056316375732, "learning_rate": 8.052764582050544e-05, "loss": 2.404936408996582, "memory(GiB)": 77.56, "step": 33955, "token_acc": 0.49415204678362573, "train_speed(iter/s)": 1.440532 }, { "epoch": 1.4549505162589436, "grad_norm": 5.8957366943359375, "learning_rate": 8.052231572886902e-05, "loss": 2.5693374633789063, "memory(GiB)": 77.56, "step": 33960, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.440585 }, { "epoch": 1.4551647315881924, "grad_norm": 4.121309280395508, "learning_rate": 8.051698508429219e-05, "loss": 2.52559871673584, "memory(GiB)": 77.56, "step": 33965, "token_acc": 0.504950495049505, "train_speed(iter/s)": 1.440589 }, { "epoch": 1.4553789469174414, "grad_norm": 4.470658302307129, "learning_rate": 8.051165388687154e-05, "loss": 2.7387414932250977, "memory(GiB)": 77.56, "step": 33970, "token_acc": 0.4523076923076923, "train_speed(iter/s)": 1.440595 }, { "epoch": 1.4555931622466904, "grad_norm": 5.512742042541504, "learning_rate": 8.050632213670368e-05, "loss": 2.5898693084716795, "memory(GiB)": 77.56, "step": 33975, "token_acc": 0.46332046332046334, "train_speed(iter/s)": 1.440625 }, { "epoch": 1.4558073775759393, "grad_norm": 7.265410900115967, "learning_rate": 8.050098983388516e-05, "loss": 2.705134391784668, "memory(GiB)": 77.56, "step": 33980, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.440646 }, { "epoch": 1.4560215929051883, "grad_norm": 5.042665958404541, "learning_rate": 8.04956569785126e-05, "loss": 2.427484321594238, "memory(GiB)": 77.56, "step": 33985, "token_acc": 0.5184049079754601, "train_speed(iter/s)": 1.440612 }, { "epoch": 1.4562358082344373, "grad_norm": 5.753913402557373, "learning_rate": 8.049032357068263e-05, "loss": 2.8000616073608398, "memory(GiB)": 77.56, "step": 33990, "token_acc": 0.44525547445255476, "train_speed(iter/s)": 1.440589 }, { "epoch": 1.4564500235636861, "grad_norm": 4.5002827644348145, "learning_rate": 8.048498961049182e-05, "loss": 2.869167137145996, "memory(GiB)": 77.56, "step": 33995, "token_acc": 0.4269005847953216, "train_speed(iter/s)": 1.440571 }, { "epoch": 1.4566642388929352, "grad_norm": 4.313014507293701, "learning_rate": 8.047965509803684e-05, "loss": 2.503142547607422, "memory(GiB)": 77.56, "step": 34000, "token_acc": 0.4725274725274725, "train_speed(iter/s)": 1.440535 }, { "epoch": 1.4566642388929352, "eval_loss": 2.207979440689087, "eval_runtime": 13.8409, "eval_samples_per_second": 7.225, "eval_steps_per_second": 7.225, "eval_token_acc": 0.46065808297567956, "step": 34000 }, { "epoch": 1.4568784542221842, "grad_norm": 3.5770678520202637, "learning_rate": 8.04743200334143e-05, "loss": 2.4147817611694338, "memory(GiB)": 77.56, "step": 34005, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.439678 }, { "epoch": 1.457092669551433, "grad_norm": 4.975116729736328, "learning_rate": 8.046898441672087e-05, "loss": 2.618977165222168, "memory(GiB)": 77.56, "step": 34010, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.439692 }, { "epoch": 1.457306884880682, "grad_norm": 4.206860542297363, "learning_rate": 8.046364824805321e-05, "loss": 2.3614410400390624, "memory(GiB)": 77.56, "step": 34015, "token_acc": 0.49050632911392406, "train_speed(iter/s)": 1.439701 }, { "epoch": 1.457521100209931, "grad_norm": 4.989882946014404, "learning_rate": 8.045831152750798e-05, "loss": 2.7668922424316404, "memory(GiB)": 77.56, "step": 34020, "token_acc": 0.45, "train_speed(iter/s)": 1.439718 }, { "epoch": 1.4577353155391801, "grad_norm": 5.06628942489624, "learning_rate": 8.045297425518188e-05, "loss": 2.5968347549438477, "memory(GiB)": 77.56, "step": 34025, "token_acc": 0.4962121212121212, "train_speed(iter/s)": 1.439756 }, { "epoch": 1.457949530868429, "grad_norm": 8.473918914794922, "learning_rate": 8.044763643117157e-05, "loss": 2.5514623641967775, "memory(GiB)": 77.56, "step": 34030, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.439795 }, { "epoch": 1.458163746197678, "grad_norm": 4.81611442565918, "learning_rate": 8.044229805557377e-05, "loss": 2.508599853515625, "memory(GiB)": 77.56, "step": 34035, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.439836 }, { "epoch": 1.458377961526927, "grad_norm": 5.073397159576416, "learning_rate": 8.043695912848517e-05, "loss": 2.504368209838867, "memory(GiB)": 77.56, "step": 34040, "token_acc": 0.493006993006993, "train_speed(iter/s)": 1.439839 }, { "epoch": 1.4585921768561758, "grad_norm": 7.687406063079834, "learning_rate": 8.043161965000252e-05, "loss": 2.450765037536621, "memory(GiB)": 77.56, "step": 34045, "token_acc": 0.45703125, "train_speed(iter/s)": 1.439883 }, { "epoch": 1.4588063921854248, "grad_norm": 4.375649452209473, "learning_rate": 8.042627962022252e-05, "loss": 3.0150527954101562, "memory(GiB)": 77.56, "step": 34050, "token_acc": 0.4061433447098976, "train_speed(iter/s)": 1.439857 }, { "epoch": 1.4590206075146739, "grad_norm": 3.875495433807373, "learning_rate": 8.042093903924193e-05, "loss": 2.516194152832031, "memory(GiB)": 77.56, "step": 34055, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 1.439857 }, { "epoch": 1.4592348228439227, "grad_norm": 5.264041900634766, "learning_rate": 8.04155979071575e-05, "loss": 2.3918903350830076, "memory(GiB)": 77.56, "step": 34060, "token_acc": 0.47410358565737054, "train_speed(iter/s)": 1.439868 }, { "epoch": 1.4594490381731717, "grad_norm": 4.1805291175842285, "learning_rate": 8.041025622406596e-05, "loss": 2.6414152145385743, "memory(GiB)": 77.56, "step": 34065, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.439872 }, { "epoch": 1.4596632535024208, "grad_norm": 4.5763840675354, "learning_rate": 8.040491399006412e-05, "loss": 2.211083984375, "memory(GiB)": 77.56, "step": 34070, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.439862 }, { "epoch": 1.4598774688316696, "grad_norm": 5.411908149719238, "learning_rate": 8.039957120524872e-05, "loss": 2.4264049530029297, "memory(GiB)": 77.56, "step": 34075, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 1.439893 }, { "epoch": 1.4600916841609186, "grad_norm": 4.275822639465332, "learning_rate": 8.03942278697166e-05, "loss": 2.6225685119628905, "memory(GiB)": 77.56, "step": 34080, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.439885 }, { "epoch": 1.4603058994901676, "grad_norm": 6.225549221038818, "learning_rate": 8.03888839835645e-05, "loss": 2.6890220642089844, "memory(GiB)": 77.56, "step": 34085, "token_acc": 0.4553072625698324, "train_speed(iter/s)": 1.439887 }, { "epoch": 1.4605201148194165, "grad_norm": 4.766172885894775, "learning_rate": 8.038353954688928e-05, "loss": 2.6291620254516603, "memory(GiB)": 77.56, "step": 34090, "token_acc": 0.4754601226993865, "train_speed(iter/s)": 1.439892 }, { "epoch": 1.4607343301486655, "grad_norm": 4.639849662780762, "learning_rate": 8.037819455978774e-05, "loss": 2.659840965270996, "memory(GiB)": 77.56, "step": 34095, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.439871 }, { "epoch": 1.4609485454779145, "grad_norm": 5.083648204803467, "learning_rate": 8.037284902235669e-05, "loss": 2.4212902069091795, "memory(GiB)": 77.56, "step": 34100, "token_acc": 0.4701195219123506, "train_speed(iter/s)": 1.439914 }, { "epoch": 1.4611627608071633, "grad_norm": 3.767834424972534, "learning_rate": 8.036750293469302e-05, "loss": 2.3488174438476563, "memory(GiB)": 77.56, "step": 34105, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.439948 }, { "epoch": 1.4613769761364124, "grad_norm": 5.209688186645508, "learning_rate": 8.036215629689352e-05, "loss": 2.3675933837890626, "memory(GiB)": 77.56, "step": 34110, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.439971 }, { "epoch": 1.4615911914656614, "grad_norm": 3.7660396099090576, "learning_rate": 8.035680910905508e-05, "loss": 2.0199440002441404, "memory(GiB)": 77.56, "step": 34115, "token_acc": 0.5400696864111498, "train_speed(iter/s)": 1.440005 }, { "epoch": 1.4618054067949102, "grad_norm": 4.780405044555664, "learning_rate": 8.035146137127458e-05, "loss": 2.6689279556274412, "memory(GiB)": 77.56, "step": 34120, "token_acc": 0.49337748344370863, "train_speed(iter/s)": 1.439987 }, { "epoch": 1.4620196221241593, "grad_norm": 4.523158073425293, "learning_rate": 8.034611308364888e-05, "loss": 2.525370788574219, "memory(GiB)": 77.56, "step": 34125, "token_acc": 0.4641638225255973, "train_speed(iter/s)": 1.440022 }, { "epoch": 1.4622338374534083, "grad_norm": 4.344518661499023, "learning_rate": 8.034076424627486e-05, "loss": 2.5201141357421877, "memory(GiB)": 77.56, "step": 34130, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.440066 }, { "epoch": 1.462448052782657, "grad_norm": 4.241621971130371, "learning_rate": 8.033541485924945e-05, "loss": 2.5197959899902345, "memory(GiB)": 77.56, "step": 34135, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.440021 }, { "epoch": 1.4626622681119061, "grad_norm": 4.401252269744873, "learning_rate": 8.033006492266952e-05, "loss": 2.6376155853271483, "memory(GiB)": 77.56, "step": 34140, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.439984 }, { "epoch": 1.4628764834411552, "grad_norm": 5.189802646636963, "learning_rate": 8.032471443663203e-05, "loss": 2.576529312133789, "memory(GiB)": 77.56, "step": 34145, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.440004 }, { "epoch": 1.463090698770404, "grad_norm": 5.572022914886475, "learning_rate": 8.03193634012339e-05, "loss": 2.6915861129760743, "memory(GiB)": 77.56, "step": 34150, "token_acc": 0.4479166666666667, "train_speed(iter/s)": 1.440023 }, { "epoch": 1.463304914099653, "grad_norm": 4.5874481201171875, "learning_rate": 8.031401181657206e-05, "loss": 2.855754089355469, "memory(GiB)": 77.56, "step": 34155, "token_acc": 0.45660377358490567, "train_speed(iter/s)": 1.440019 }, { "epoch": 1.463519129428902, "grad_norm": 5.819094181060791, "learning_rate": 8.030865968274344e-05, "loss": 2.8100864410400392, "memory(GiB)": 77.56, "step": 34160, "token_acc": 0.43157894736842106, "train_speed(iter/s)": 1.439986 }, { "epoch": 1.4637333447581509, "grad_norm": 7.241195201873779, "learning_rate": 8.030330699984503e-05, "loss": 2.7724287033081056, "memory(GiB)": 77.56, "step": 34165, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.439998 }, { "epoch": 1.4639475600874, "grad_norm": 5.7527995109558105, "learning_rate": 8.029795376797377e-05, "loss": 2.797378158569336, "memory(GiB)": 77.56, "step": 34170, "token_acc": 0.41839762611275966, "train_speed(iter/s)": 1.440013 }, { "epoch": 1.464161775416649, "grad_norm": 5.099137306213379, "learning_rate": 8.029259998722667e-05, "loss": 2.5382545471191404, "memory(GiB)": 77.56, "step": 34175, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.440041 }, { "epoch": 1.4643759907458977, "grad_norm": 3.8338465690612793, "learning_rate": 8.028724565770071e-05, "loss": 2.2501338958740233, "memory(GiB)": 77.56, "step": 34180, "token_acc": 0.5286624203821656, "train_speed(iter/s)": 1.440113 }, { "epoch": 1.4645902060751468, "grad_norm": 5.26433801651001, "learning_rate": 8.028189077949288e-05, "loss": 2.733415412902832, "memory(GiB)": 77.56, "step": 34185, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.440124 }, { "epoch": 1.4648044214043958, "grad_norm": 5.47374963760376, "learning_rate": 8.027653535270019e-05, "loss": 2.495956039428711, "memory(GiB)": 77.56, "step": 34190, "token_acc": 0.4774193548387097, "train_speed(iter/s)": 1.440126 }, { "epoch": 1.4650186367336446, "grad_norm": 5.073783874511719, "learning_rate": 8.027117937741966e-05, "loss": 2.509799575805664, "memory(GiB)": 77.56, "step": 34195, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.440103 }, { "epoch": 1.4652328520628937, "grad_norm": 4.862172603607178, "learning_rate": 8.026582285374832e-05, "loss": 2.429007911682129, "memory(GiB)": 77.56, "step": 34200, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.440133 }, { "epoch": 1.4654470673921427, "grad_norm": 5.415630340576172, "learning_rate": 8.02604657817832e-05, "loss": 2.4519561767578124, "memory(GiB)": 77.56, "step": 34205, "token_acc": 0.5170278637770898, "train_speed(iter/s)": 1.440148 }, { "epoch": 1.4656612827213915, "grad_norm": 4.675723075866699, "learning_rate": 8.025510816162137e-05, "loss": 2.3500009536743165, "memory(GiB)": 77.56, "step": 34210, "token_acc": 0.5130434782608696, "train_speed(iter/s)": 1.440096 }, { "epoch": 1.4658754980506405, "grad_norm": 6.152647972106934, "learning_rate": 8.024974999335985e-05, "loss": 2.4933956146240233, "memory(GiB)": 77.56, "step": 34215, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.440128 }, { "epoch": 1.4660897133798896, "grad_norm": 4.551023960113525, "learning_rate": 8.024439127709575e-05, "loss": 2.600601387023926, "memory(GiB)": 77.56, "step": 34220, "token_acc": 0.4290322580645161, "train_speed(iter/s)": 1.44012 }, { "epoch": 1.4663039287091384, "grad_norm": 5.539081573486328, "learning_rate": 8.023903201292613e-05, "loss": 2.3947078704833986, "memory(GiB)": 77.56, "step": 34225, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.440142 }, { "epoch": 1.4665181440383874, "grad_norm": 6.135366439819336, "learning_rate": 8.023367220094809e-05, "loss": 2.3898338317871093, "memory(GiB)": 77.56, "step": 34230, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.440204 }, { "epoch": 1.4667323593676365, "grad_norm": 5.819205284118652, "learning_rate": 8.02283118412587e-05, "loss": 2.4377296447753904, "memory(GiB)": 77.56, "step": 34235, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.440243 }, { "epoch": 1.4669465746968853, "grad_norm": 4.292762279510498, "learning_rate": 8.022295093395509e-05, "loss": 2.4075897216796873, "memory(GiB)": 77.56, "step": 34240, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.440261 }, { "epoch": 1.4671607900261343, "grad_norm": 5.881063461303711, "learning_rate": 8.021758947913436e-05, "loss": 2.4541038513183593, "memory(GiB)": 77.56, "step": 34245, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.440211 }, { "epoch": 1.4673750053553833, "grad_norm": 5.276733875274658, "learning_rate": 8.021222747689367e-05, "loss": 2.558882713317871, "memory(GiB)": 77.56, "step": 34250, "token_acc": 0.4875, "train_speed(iter/s)": 1.440236 }, { "epoch": 1.4675892206846322, "grad_norm": 7.2614336013793945, "learning_rate": 8.020686492733011e-05, "loss": 2.5538002014160157, "memory(GiB)": 77.56, "step": 34255, "token_acc": 0.4432624113475177, "train_speed(iter/s)": 1.44026 }, { "epoch": 1.4678034360138812, "grad_norm": 5.199793815612793, "learning_rate": 8.02015018305409e-05, "loss": 2.4433361053466798, "memory(GiB)": 77.56, "step": 34260, "token_acc": 0.5207547169811321, "train_speed(iter/s)": 1.440248 }, { "epoch": 1.4680176513431302, "grad_norm": 5.548585891723633, "learning_rate": 8.019613818662312e-05, "loss": 2.35800838470459, "memory(GiB)": 77.56, "step": 34265, "token_acc": 0.5, "train_speed(iter/s)": 1.440284 }, { "epoch": 1.468231866672379, "grad_norm": 3.831486701965332, "learning_rate": 8.019077399567398e-05, "loss": 2.6825544357299806, "memory(GiB)": 77.56, "step": 34270, "token_acc": 0.4637223974763407, "train_speed(iter/s)": 1.440324 }, { "epoch": 1.468446082001628, "grad_norm": 4.019141674041748, "learning_rate": 8.018540925779064e-05, "loss": 2.5327083587646486, "memory(GiB)": 77.56, "step": 34275, "token_acc": 0.4563380281690141, "train_speed(iter/s)": 1.440299 }, { "epoch": 1.468660297330877, "grad_norm": 4.661750316619873, "learning_rate": 8.018004397307031e-05, "loss": 2.8629390716552736, "memory(GiB)": 77.56, "step": 34280, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.440337 }, { "epoch": 1.468874512660126, "grad_norm": 5.158905506134033, "learning_rate": 8.017467814161015e-05, "loss": 2.487758445739746, "memory(GiB)": 77.56, "step": 34285, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.440325 }, { "epoch": 1.469088727989375, "grad_norm": 5.842710018157959, "learning_rate": 8.016931176350741e-05, "loss": 2.309349250793457, "memory(GiB)": 77.56, "step": 34290, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.440307 }, { "epoch": 1.469302943318624, "grad_norm": 4.598109245300293, "learning_rate": 8.016394483885928e-05, "loss": 2.6250984191894533, "memory(GiB)": 77.56, "step": 34295, "token_acc": 0.4984126984126984, "train_speed(iter/s)": 1.44036 }, { "epoch": 1.4695171586478728, "grad_norm": 5.496702671051025, "learning_rate": 8.015857736776299e-05, "loss": 2.464499282836914, "memory(GiB)": 77.56, "step": 34300, "token_acc": 0.4784172661870504, "train_speed(iter/s)": 1.440343 }, { "epoch": 1.4697313739771218, "grad_norm": 6.301141262054443, "learning_rate": 8.015320935031579e-05, "loss": 2.6534072875976564, "memory(GiB)": 77.56, "step": 34305, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.440374 }, { "epoch": 1.4699455893063709, "grad_norm": 5.152554512023926, "learning_rate": 8.014784078661491e-05, "loss": 2.890163612365723, "memory(GiB)": 77.56, "step": 34310, "token_acc": 0.42657342657342656, "train_speed(iter/s)": 1.440373 }, { "epoch": 1.4701598046356197, "grad_norm": 4.3119072914123535, "learning_rate": 8.01424716767576e-05, "loss": 2.4026954650878904, "memory(GiB)": 77.56, "step": 34315, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 1.440382 }, { "epoch": 1.4703740199648687, "grad_norm": 6.448112964630127, "learning_rate": 8.013710202084115e-05, "loss": 2.0851724624633787, "memory(GiB)": 77.56, "step": 34320, "token_acc": 0.5364806866952789, "train_speed(iter/s)": 1.440353 }, { "epoch": 1.4705882352941178, "grad_norm": 4.141831398010254, "learning_rate": 8.013173181896283e-05, "loss": 2.7041042327880858, "memory(GiB)": 77.56, "step": 34325, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.440367 }, { "epoch": 1.4708024506233666, "grad_norm": 5.361547470092773, "learning_rate": 8.012636107121992e-05, "loss": 2.039295959472656, "memory(GiB)": 77.56, "step": 34330, "token_acc": 0.5165289256198347, "train_speed(iter/s)": 1.440322 }, { "epoch": 1.4710166659526156, "grad_norm": 5.071845054626465, "learning_rate": 8.012098977770971e-05, "loss": 2.9382888793945314, "memory(GiB)": 77.56, "step": 34335, "token_acc": 0.4311594202898551, "train_speed(iter/s)": 1.440334 }, { "epoch": 1.4712308812818646, "grad_norm": 6.151006698608398, "learning_rate": 8.011561793852953e-05, "loss": 2.5644079208374024, "memory(GiB)": 77.56, "step": 34340, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.440351 }, { "epoch": 1.4714450966111134, "grad_norm": 3.985409736633301, "learning_rate": 8.011024555377667e-05, "loss": 2.2681888580322265, "memory(GiB)": 77.56, "step": 34345, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.440325 }, { "epoch": 1.4716593119403625, "grad_norm": 4.982699394226074, "learning_rate": 8.010487262354847e-05, "loss": 2.3488372802734374, "memory(GiB)": 77.56, "step": 34350, "token_acc": 0.5190839694656488, "train_speed(iter/s)": 1.440413 }, { "epoch": 1.4718735272696115, "grad_norm": 4.712373733520508, "learning_rate": 8.009949914794226e-05, "loss": 2.4499528884887694, "memory(GiB)": 77.56, "step": 34355, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.440408 }, { "epoch": 1.4720877425988603, "grad_norm": 6.626208782196045, "learning_rate": 8.009412512705539e-05, "loss": 2.5087371826171876, "memory(GiB)": 77.56, "step": 34360, "token_acc": 0.4370629370629371, "train_speed(iter/s)": 1.440386 }, { "epoch": 1.4723019579281094, "grad_norm": 4.252260684967041, "learning_rate": 8.00887505609852e-05, "loss": 2.5764888763427733, "memory(GiB)": 77.56, "step": 34365, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.440402 }, { "epoch": 1.4725161732573584, "grad_norm": 5.475830078125, "learning_rate": 8.008337544982909e-05, "loss": 2.4365913391113283, "memory(GiB)": 77.56, "step": 34370, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.440406 }, { "epoch": 1.4727303885866072, "grad_norm": 8.2151460647583, "learning_rate": 8.00779997936844e-05, "loss": 2.3214399337768556, "memory(GiB)": 77.56, "step": 34375, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.440429 }, { "epoch": 1.4729446039158562, "grad_norm": 5.181869029998779, "learning_rate": 8.007262359264852e-05, "loss": 2.286582183837891, "memory(GiB)": 77.56, "step": 34380, "token_acc": 0.49337748344370863, "train_speed(iter/s)": 1.44041 }, { "epoch": 1.4731588192451053, "grad_norm": 4.501758098602295, "learning_rate": 8.006724684681888e-05, "loss": 2.3877639770507812, "memory(GiB)": 77.56, "step": 34385, "token_acc": 0.45724907063197023, "train_speed(iter/s)": 1.440394 }, { "epoch": 1.473373034574354, "grad_norm": 4.29125452041626, "learning_rate": 8.006186955629282e-05, "loss": 2.359326171875, "memory(GiB)": 77.56, "step": 34390, "token_acc": 0.5, "train_speed(iter/s)": 1.440405 }, { "epoch": 1.4735872499036031, "grad_norm": 4.701378345489502, "learning_rate": 8.005649172116782e-05, "loss": 2.644342803955078, "memory(GiB)": 77.56, "step": 34395, "token_acc": 0.43, "train_speed(iter/s)": 1.440341 }, { "epoch": 1.4738014652328522, "grad_norm": 5.0990800857543945, "learning_rate": 8.005111334154127e-05, "loss": 2.750717544555664, "memory(GiB)": 77.56, "step": 34400, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440329 }, { "epoch": 1.474015680562101, "grad_norm": 4.347292900085449, "learning_rate": 8.004573441751062e-05, "loss": 2.541716766357422, "memory(GiB)": 77.56, "step": 34405, "token_acc": 0.4813664596273292, "train_speed(iter/s)": 1.440306 }, { "epoch": 1.47422989589135, "grad_norm": 6.2339348793029785, "learning_rate": 8.00403549491733e-05, "loss": 2.711593246459961, "memory(GiB)": 77.56, "step": 34410, "token_acc": 0.44528301886792454, "train_speed(iter/s)": 1.440317 }, { "epoch": 1.474444111220599, "grad_norm": 5.4573235511779785, "learning_rate": 8.003497493662678e-05, "loss": 2.347819137573242, "memory(GiB)": 77.56, "step": 34415, "token_acc": 0.48534201954397393, "train_speed(iter/s)": 1.44032 }, { "epoch": 1.4746583265498479, "grad_norm": 6.5493693351745605, "learning_rate": 8.002959437996849e-05, "loss": 2.5783565521240233, "memory(GiB)": 77.56, "step": 34420, "token_acc": 0.4849624060150376, "train_speed(iter/s)": 1.440303 }, { "epoch": 1.4748725418790969, "grad_norm": 5.091011047363281, "learning_rate": 8.002421327929592e-05, "loss": 2.512987518310547, "memory(GiB)": 77.56, "step": 34425, "token_acc": 0.4533333333333333, "train_speed(iter/s)": 1.440272 }, { "epoch": 1.475086757208346, "grad_norm": 5.855741024017334, "learning_rate": 8.001883163470661e-05, "loss": 2.4178829193115234, "memory(GiB)": 77.56, "step": 34430, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.440228 }, { "epoch": 1.4753009725375947, "grad_norm": 5.49994421005249, "learning_rate": 8.001344944629796e-05, "loss": 2.5528759002685546, "memory(GiB)": 77.56, "step": 34435, "token_acc": 0.4940239043824701, "train_speed(iter/s)": 1.440271 }, { "epoch": 1.4755151878668438, "grad_norm": 5.74098539352417, "learning_rate": 8.000806671416754e-05, "loss": 2.475512886047363, "memory(GiB)": 77.56, "step": 34440, "token_acc": 0.49224806201550386, "train_speed(iter/s)": 1.440302 }, { "epoch": 1.4757294031960928, "grad_norm": 4.453502178192139, "learning_rate": 8.000268343841283e-05, "loss": 2.661197471618652, "memory(GiB)": 77.56, "step": 34445, "token_acc": 0.4065040650406504, "train_speed(iter/s)": 1.440334 }, { "epoch": 1.4759436185253416, "grad_norm": 4.792104244232178, "learning_rate": 7.999729961913139e-05, "loss": 2.5993507385253904, "memory(GiB)": 77.56, "step": 34450, "token_acc": 0.48299319727891155, "train_speed(iter/s)": 1.440383 }, { "epoch": 1.4761578338545907, "grad_norm": 5.920977592468262, "learning_rate": 7.999191525642069e-05, "loss": 2.9311634063720704, "memory(GiB)": 77.56, "step": 34455, "token_acc": 0.4148148148148148, "train_speed(iter/s)": 1.440431 }, { "epoch": 1.4763720491838397, "grad_norm": 4.11167049407959, "learning_rate": 7.998653035037834e-05, "loss": 2.637973976135254, "memory(GiB)": 77.56, "step": 34460, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.44042 }, { "epoch": 1.4765862645130885, "grad_norm": 5.867785453796387, "learning_rate": 7.998114490110185e-05, "loss": 2.431184196472168, "memory(GiB)": 77.56, "step": 34465, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.440384 }, { "epoch": 1.4768004798423375, "grad_norm": 4.38999080657959, "learning_rate": 7.997575890868879e-05, "loss": 2.4927196502685547, "memory(GiB)": 77.56, "step": 34470, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.440414 }, { "epoch": 1.4770146951715866, "grad_norm": 5.949841022491455, "learning_rate": 7.997037237323675e-05, "loss": 2.3301158905029298, "memory(GiB)": 77.56, "step": 34475, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.440382 }, { "epoch": 1.4772289105008354, "grad_norm": 3.831998586654663, "learning_rate": 7.99649852948433e-05, "loss": 2.8143234252929688, "memory(GiB)": 77.56, "step": 34480, "token_acc": 0.4046997389033943, "train_speed(iter/s)": 1.440376 }, { "epoch": 1.4774431258300844, "grad_norm": 4.94537353515625, "learning_rate": 7.995959767360604e-05, "loss": 2.339773178100586, "memory(GiB)": 77.56, "step": 34485, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.440435 }, { "epoch": 1.4776573411593334, "grad_norm": 6.495689868927002, "learning_rate": 7.995420950962254e-05, "loss": 2.6484256744384767, "memory(GiB)": 77.56, "step": 34490, "token_acc": 0.5, "train_speed(iter/s)": 1.440399 }, { "epoch": 1.4778715564885823, "grad_norm": 4.667993068695068, "learning_rate": 7.994882080299044e-05, "loss": 2.440870666503906, "memory(GiB)": 77.56, "step": 34495, "token_acc": 0.5058823529411764, "train_speed(iter/s)": 1.440427 }, { "epoch": 1.4780857718178313, "grad_norm": 6.729407787322998, "learning_rate": 7.994343155380737e-05, "loss": 2.530316925048828, "memory(GiB)": 77.56, "step": 34500, "token_acc": 0.46558704453441296, "train_speed(iter/s)": 1.440394 }, { "epoch": 1.4780857718178313, "eval_loss": 2.260627508163452, "eval_runtime": 14.256, "eval_samples_per_second": 7.015, "eval_steps_per_second": 7.015, "eval_token_acc": 0.46853146853146854, "step": 34500 }, { "epoch": 1.4782999871470803, "grad_norm": 5.972782611846924, "learning_rate": 7.993804176217093e-05, "loss": 2.6705720901489256, "memory(GiB)": 77.56, "step": 34505, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.439537 }, { "epoch": 1.4785142024763291, "grad_norm": 5.555943965911865, "learning_rate": 7.993265142817881e-05, "loss": 2.364592933654785, "memory(GiB)": 77.56, "step": 34510, "token_acc": 0.5058823529411764, "train_speed(iter/s)": 1.43956 }, { "epoch": 1.4787284178055782, "grad_norm": 4.481245994567871, "learning_rate": 7.99272605519286e-05, "loss": 2.3735366821289063, "memory(GiB)": 77.56, "step": 34515, "token_acc": 0.445578231292517, "train_speed(iter/s)": 1.439557 }, { "epoch": 1.4789426331348272, "grad_norm": 5.238328456878662, "learning_rate": 7.9921869133518e-05, "loss": 2.487208938598633, "memory(GiB)": 77.56, "step": 34520, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.439509 }, { "epoch": 1.479156848464076, "grad_norm": 5.508119106292725, "learning_rate": 7.991647717304467e-05, "loss": 2.756937026977539, "memory(GiB)": 77.56, "step": 34525, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.439488 }, { "epoch": 1.479371063793325, "grad_norm": 4.803337574005127, "learning_rate": 7.99110846706063e-05, "loss": 2.5147260665893554, "memory(GiB)": 77.56, "step": 34530, "token_acc": 0.5, "train_speed(iter/s)": 1.439493 }, { "epoch": 1.479585279122574, "grad_norm": 5.185335159301758, "learning_rate": 7.990569162630057e-05, "loss": 2.4062711715698244, "memory(GiB)": 77.56, "step": 34535, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 1.439423 }, { "epoch": 1.479799494451823, "grad_norm": 5.346336364746094, "learning_rate": 7.990029804022518e-05, "loss": 2.4619701385498045, "memory(GiB)": 77.56, "step": 34540, "token_acc": 0.4924924924924925, "train_speed(iter/s)": 1.439417 }, { "epoch": 1.480013709781072, "grad_norm": 5.0395965576171875, "learning_rate": 7.989490391247784e-05, "loss": 2.1731245040893556, "memory(GiB)": 77.56, "step": 34545, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.439383 }, { "epoch": 1.480227925110321, "grad_norm": 4.642335891723633, "learning_rate": 7.988950924315628e-05, "loss": 2.566513442993164, "memory(GiB)": 77.56, "step": 34550, "token_acc": 0.5032258064516129, "train_speed(iter/s)": 1.439449 }, { "epoch": 1.4804421404395698, "grad_norm": 4.903952121734619, "learning_rate": 7.988411403235823e-05, "loss": 2.508007049560547, "memory(GiB)": 77.56, "step": 34555, "token_acc": 0.4716417910447761, "train_speed(iter/s)": 1.43951 }, { "epoch": 1.4806563557688188, "grad_norm": 3.999094009399414, "learning_rate": 7.987871828018141e-05, "loss": 2.6111896514892576, "memory(GiB)": 77.56, "step": 34560, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.439534 }, { "epoch": 1.4808705710980679, "grad_norm": 5.334130764007568, "learning_rate": 7.987332198672356e-05, "loss": 2.630277633666992, "memory(GiB)": 77.56, "step": 34565, "token_acc": 0.43944636678200694, "train_speed(iter/s)": 1.439545 }, { "epoch": 1.4810847864273167, "grad_norm": 5.522465705871582, "learning_rate": 7.986792515208248e-05, "loss": 2.4380071640014647, "memory(GiB)": 77.56, "step": 34570, "token_acc": 0.483739837398374, "train_speed(iter/s)": 1.439587 }, { "epoch": 1.4812990017565657, "grad_norm": 4.230618476867676, "learning_rate": 7.986252777635592e-05, "loss": 2.4863521575927736, "memory(GiB)": 77.56, "step": 34575, "token_acc": 0.4684931506849315, "train_speed(iter/s)": 1.439575 }, { "epoch": 1.4815132170858147, "grad_norm": 4.854522705078125, "learning_rate": 7.985712985964164e-05, "loss": 2.2017822265625, "memory(GiB)": 77.56, "step": 34580, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.439599 }, { "epoch": 1.4817274324150636, "grad_norm": 6.0103020668029785, "learning_rate": 7.985173140203745e-05, "loss": 2.481979179382324, "memory(GiB)": 77.56, "step": 34585, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.439589 }, { "epoch": 1.4819416477443126, "grad_norm": 4.527801036834717, "learning_rate": 7.984633240364116e-05, "loss": 2.6746572494506835, "memory(GiB)": 77.56, "step": 34590, "token_acc": 0.46865671641791046, "train_speed(iter/s)": 1.439564 }, { "epoch": 1.4821558630735616, "grad_norm": 6.16262674331665, "learning_rate": 7.984093286455055e-05, "loss": 2.4438610076904297, "memory(GiB)": 77.56, "step": 34595, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.439606 }, { "epoch": 1.4823700784028104, "grad_norm": 5.266121864318848, "learning_rate": 7.983553278486344e-05, "loss": 2.8454837799072266, "memory(GiB)": 77.56, "step": 34600, "token_acc": 0.4310850439882698, "train_speed(iter/s)": 1.43964 }, { "epoch": 1.4825842937320595, "grad_norm": 5.369907379150391, "learning_rate": 7.983013216467768e-05, "loss": 2.5199420928955076, "memory(GiB)": 77.56, "step": 34605, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.439642 }, { "epoch": 1.4827985090613085, "grad_norm": 8.589921951293945, "learning_rate": 7.982473100409107e-05, "loss": 2.506466102600098, "memory(GiB)": 77.56, "step": 34610, "token_acc": 0.4575645756457565, "train_speed(iter/s)": 1.439691 }, { "epoch": 1.4830127243905573, "grad_norm": 5.31126070022583, "learning_rate": 7.981932930320149e-05, "loss": 2.659066390991211, "memory(GiB)": 77.56, "step": 34615, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.439699 }, { "epoch": 1.4832269397198063, "grad_norm": 5.200987815856934, "learning_rate": 7.98139270621068e-05, "loss": 2.4870738983154297, "memory(GiB)": 77.56, "step": 34620, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.439713 }, { "epoch": 1.4834411550490554, "grad_norm": 6.352973937988281, "learning_rate": 7.980852428090484e-05, "loss": 2.5357471466064454, "memory(GiB)": 77.56, "step": 34625, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.439723 }, { "epoch": 1.4836553703783042, "grad_norm": 5.85141134262085, "learning_rate": 7.980312095969351e-05, "loss": 2.693460464477539, "memory(GiB)": 77.56, "step": 34630, "token_acc": 0.45819397993311034, "train_speed(iter/s)": 1.439776 }, { "epoch": 1.4838695857075532, "grad_norm": 4.918947219848633, "learning_rate": 7.979771709857066e-05, "loss": 2.4680059432983397, "memory(GiB)": 77.56, "step": 34635, "token_acc": 0.45483870967741935, "train_speed(iter/s)": 1.439838 }, { "epoch": 1.4840838010368023, "grad_norm": 4.078084945678711, "learning_rate": 7.979231269763425e-05, "loss": 2.4555105209350585, "memory(GiB)": 77.56, "step": 34640, "token_acc": 0.5117845117845118, "train_speed(iter/s)": 1.439827 }, { "epoch": 1.484298016366051, "grad_norm": 4.0401177406311035, "learning_rate": 7.978690775698213e-05, "loss": 2.245615768432617, "memory(GiB)": 77.56, "step": 34645, "token_acc": 0.5387323943661971, "train_speed(iter/s)": 1.439797 }, { "epoch": 1.4845122316953, "grad_norm": 5.768013000488281, "learning_rate": 7.978150227671223e-05, "loss": 2.546351432800293, "memory(GiB)": 77.56, "step": 34650, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.439806 }, { "epoch": 1.4847264470245491, "grad_norm": 5.7937235832214355, "learning_rate": 7.977609625692248e-05, "loss": 2.3059606552124023, "memory(GiB)": 77.56, "step": 34655, "token_acc": 0.515748031496063, "train_speed(iter/s)": 1.439813 }, { "epoch": 1.484940662353798, "grad_norm": 4.96259069442749, "learning_rate": 7.977068969771083e-05, "loss": 2.3303560256958007, "memory(GiB)": 77.56, "step": 34660, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.439808 }, { "epoch": 1.485154877683047, "grad_norm": 4.985846042633057, "learning_rate": 7.97652825991752e-05, "loss": 2.5304471969604494, "memory(GiB)": 77.56, "step": 34665, "token_acc": 0.44879518072289154, "train_speed(iter/s)": 1.439825 }, { "epoch": 1.485369093012296, "grad_norm": 4.745893955230713, "learning_rate": 7.975987496141354e-05, "loss": 2.5578933715820313, "memory(GiB)": 77.56, "step": 34670, "token_acc": 0.5130111524163569, "train_speed(iter/s)": 1.439821 }, { "epoch": 1.4855833083415448, "grad_norm": 5.999105930328369, "learning_rate": 7.975446678452384e-05, "loss": 2.607408142089844, "memory(GiB)": 77.56, "step": 34675, "token_acc": 0.45723684210526316, "train_speed(iter/s)": 1.439876 }, { "epoch": 1.4857975236707939, "grad_norm": 8.275153160095215, "learning_rate": 7.974905806860407e-05, "loss": 2.397511672973633, "memory(GiB)": 77.56, "step": 34680, "token_acc": 0.4744525547445255, "train_speed(iter/s)": 1.439828 }, { "epoch": 1.486011739000043, "grad_norm": 3.7610960006713867, "learning_rate": 7.974364881375218e-05, "loss": 2.3691444396972656, "memory(GiB)": 77.56, "step": 34685, "token_acc": 0.5155038759689923, "train_speed(iter/s)": 1.439815 }, { "epoch": 1.4862259543292917, "grad_norm": 4.706734657287598, "learning_rate": 7.97382390200662e-05, "loss": 2.685099983215332, "memory(GiB)": 77.56, "step": 34690, "token_acc": 0.44594594594594594, "train_speed(iter/s)": 1.439811 }, { "epoch": 1.4864401696585408, "grad_norm": 4.949123859405518, "learning_rate": 7.973282868764413e-05, "loss": 2.124616241455078, "memory(GiB)": 77.56, "step": 34695, "token_acc": 0.5020746887966805, "train_speed(iter/s)": 1.4398 }, { "epoch": 1.4866543849877898, "grad_norm": 5.066686630249023, "learning_rate": 7.972741781658399e-05, "loss": 2.717406463623047, "memory(GiB)": 77.56, "step": 34700, "token_acc": 0.44333333333333336, "train_speed(iter/s)": 1.439807 }, { "epoch": 1.4868686003170386, "grad_norm": 5.711490631103516, "learning_rate": 7.972200640698377e-05, "loss": 2.4084260940551756, "memory(GiB)": 77.56, "step": 34705, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.439799 }, { "epoch": 1.4870828156462876, "grad_norm": 4.470431327819824, "learning_rate": 7.971659445894152e-05, "loss": 2.8833309173583985, "memory(GiB)": 77.56, "step": 34710, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.43983 }, { "epoch": 1.4872970309755367, "grad_norm": 4.8883209228515625, "learning_rate": 7.97111819725553e-05, "loss": 2.4811426162719727, "memory(GiB)": 77.56, "step": 34715, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.439869 }, { "epoch": 1.4875112463047855, "grad_norm": 4.814483642578125, "learning_rate": 7.970576894792314e-05, "loss": 2.6018882751464845, "memory(GiB)": 77.56, "step": 34720, "token_acc": 0.4440993788819876, "train_speed(iter/s)": 1.439942 }, { "epoch": 1.4877254616340345, "grad_norm": 7.101289749145508, "learning_rate": 7.97003553851431e-05, "loss": 2.6370811462402344, "memory(GiB)": 77.56, "step": 34725, "token_acc": 0.4558303886925795, "train_speed(iter/s)": 1.439979 }, { "epoch": 1.4879396769632836, "grad_norm": 5.749527454376221, "learning_rate": 7.969494128431327e-05, "loss": 2.659012222290039, "memory(GiB)": 77.56, "step": 34730, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.439976 }, { "epoch": 1.4881538922925324, "grad_norm": 5.493963718414307, "learning_rate": 7.968952664553172e-05, "loss": 2.4715549468994142, "memory(GiB)": 77.56, "step": 34735, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.439972 }, { "epoch": 1.4883681076217814, "grad_norm": 5.084977149963379, "learning_rate": 7.968411146889656e-05, "loss": 2.2956264495849608, "memory(GiB)": 77.56, "step": 34740, "token_acc": 0.523972602739726, "train_speed(iter/s)": 1.439997 }, { "epoch": 1.4885823229510304, "grad_norm": 6.531550407409668, "learning_rate": 7.967869575450587e-05, "loss": 2.8708534240722656, "memory(GiB)": 77.56, "step": 34745, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.440054 }, { "epoch": 1.4887965382802792, "grad_norm": 4.791491508483887, "learning_rate": 7.967327950245775e-05, "loss": 2.3526893615722657, "memory(GiB)": 77.56, "step": 34750, "token_acc": 0.4541984732824427, "train_speed(iter/s)": 1.440083 }, { "epoch": 1.4890107536095283, "grad_norm": 4.072079658508301, "learning_rate": 7.966786271285034e-05, "loss": 2.3518356323242187, "memory(GiB)": 77.56, "step": 34755, "token_acc": 0.4921875, "train_speed(iter/s)": 1.44009 }, { "epoch": 1.4892249689387773, "grad_norm": 4.942510604858398, "learning_rate": 7.966244538578177e-05, "loss": 2.668620300292969, "memory(GiB)": 77.56, "step": 34760, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.44011 }, { "epoch": 1.4894391842680261, "grad_norm": 6.58004903793335, "learning_rate": 7.965702752135018e-05, "loss": 2.452613639831543, "memory(GiB)": 77.56, "step": 34765, "token_acc": 0.4959016393442623, "train_speed(iter/s)": 1.440126 }, { "epoch": 1.4896533995972752, "grad_norm": 6.290943145751953, "learning_rate": 7.965160911965371e-05, "loss": 2.2688323974609377, "memory(GiB)": 77.56, "step": 34770, "token_acc": 0.5, "train_speed(iter/s)": 1.440159 }, { "epoch": 1.4898676149265242, "grad_norm": 4.718799591064453, "learning_rate": 7.964619018079054e-05, "loss": 2.085108757019043, "memory(GiB)": 77.56, "step": 34775, "token_acc": 0.5351170568561873, "train_speed(iter/s)": 1.44017 }, { "epoch": 1.490081830255773, "grad_norm": 5.204291820526123, "learning_rate": 7.964077070485881e-05, "loss": 2.4292119979858398, "memory(GiB)": 77.56, "step": 34780, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.440205 }, { "epoch": 1.490296045585022, "grad_norm": 6.061275482177734, "learning_rate": 7.963535069195671e-05, "loss": 2.5785268783569335, "memory(GiB)": 77.56, "step": 34785, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.440192 }, { "epoch": 1.490510260914271, "grad_norm": 4.104846477508545, "learning_rate": 7.962993014218243e-05, "loss": 2.636448097229004, "memory(GiB)": 77.56, "step": 34790, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.440179 }, { "epoch": 1.49072447624352, "grad_norm": 4.877756595611572, "learning_rate": 7.962450905563418e-05, "loss": 2.6018190383911133, "memory(GiB)": 77.56, "step": 34795, "token_acc": 0.44089456869009586, "train_speed(iter/s)": 1.440181 }, { "epoch": 1.490938691572769, "grad_norm": 5.892388820648193, "learning_rate": 7.961908743241016e-05, "loss": 2.557236671447754, "memory(GiB)": 77.56, "step": 34800, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.440163 }, { "epoch": 1.491152906902018, "grad_norm": 4.26841926574707, "learning_rate": 7.961366527260858e-05, "loss": 2.4700206756591796, "memory(GiB)": 77.56, "step": 34805, "token_acc": 0.41522491349480967, "train_speed(iter/s)": 1.440198 }, { "epoch": 1.4913671222312668, "grad_norm": 7.467026233673096, "learning_rate": 7.960824257632768e-05, "loss": 2.7604074478149414, "memory(GiB)": 77.56, "step": 34810, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.440161 }, { "epoch": 1.4915813375605158, "grad_norm": 4.229203701019287, "learning_rate": 7.960281934366568e-05, "loss": 2.5058860778808594, "memory(GiB)": 77.56, "step": 34815, "token_acc": 0.49673202614379086, "train_speed(iter/s)": 1.440148 }, { "epoch": 1.4917955528897648, "grad_norm": 5.706217288970947, "learning_rate": 7.959739557472085e-05, "loss": 2.591847038269043, "memory(GiB)": 77.56, "step": 34820, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.440175 }, { "epoch": 1.4920097682190137, "grad_norm": 4.805039405822754, "learning_rate": 7.959197126959142e-05, "loss": 2.5985092163085937, "memory(GiB)": 77.56, "step": 34825, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.440167 }, { "epoch": 1.4922239835482627, "grad_norm": 5.608213424682617, "learning_rate": 7.958654642837569e-05, "loss": 2.520285415649414, "memory(GiB)": 77.56, "step": 34830, "token_acc": 0.46779661016949153, "train_speed(iter/s)": 1.440204 }, { "epoch": 1.4924381988775117, "grad_norm": 4.211819171905518, "learning_rate": 7.95811210511719e-05, "loss": 2.406662940979004, "memory(GiB)": 77.56, "step": 34835, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.440226 }, { "epoch": 1.4926524142067605, "grad_norm": 5.518600940704346, "learning_rate": 7.957569513807836e-05, "loss": 2.479241943359375, "memory(GiB)": 77.56, "step": 34840, "token_acc": 0.4831804281345566, "train_speed(iter/s)": 1.440185 }, { "epoch": 1.4928666295360096, "grad_norm": 4.9143805503845215, "learning_rate": 7.957026868919334e-05, "loss": 2.5975696563720705, "memory(GiB)": 77.56, "step": 34845, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.440223 }, { "epoch": 1.4930808448652586, "grad_norm": 4.8514180183410645, "learning_rate": 7.95648417046152e-05, "loss": 2.597644805908203, "memory(GiB)": 77.56, "step": 34850, "token_acc": 0.4564102564102564, "train_speed(iter/s)": 1.44019 }, { "epoch": 1.4932950601945074, "grad_norm": 5.553345203399658, "learning_rate": 7.955941418444221e-05, "loss": 2.5125654220581053, "memory(GiB)": 77.56, "step": 34855, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.44022 }, { "epoch": 1.4935092755237565, "grad_norm": 5.789570331573486, "learning_rate": 7.955398612877269e-05, "loss": 2.646173095703125, "memory(GiB)": 77.56, "step": 34860, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.44023 }, { "epoch": 1.4937234908530055, "grad_norm": 5.425500392913818, "learning_rate": 7.954855753770499e-05, "loss": 2.269179916381836, "memory(GiB)": 77.56, "step": 34865, "token_acc": 0.496, "train_speed(iter/s)": 1.440224 }, { "epoch": 1.4939377061822543, "grad_norm": 4.963842391967773, "learning_rate": 7.954312841133744e-05, "loss": 2.6351694107055663, "memory(GiB)": 77.56, "step": 34870, "token_acc": 0.4659090909090909, "train_speed(iter/s)": 1.440215 }, { "epoch": 1.4941519215115033, "grad_norm": 5.984636306762695, "learning_rate": 7.953769874976842e-05, "loss": 2.2100454330444337, "memory(GiB)": 77.56, "step": 34875, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.440262 }, { "epoch": 1.4943661368407524, "grad_norm": 4.549845218658447, "learning_rate": 7.953226855309628e-05, "loss": 2.672037887573242, "memory(GiB)": 77.56, "step": 34880, "token_acc": 0.5305343511450382, "train_speed(iter/s)": 1.440258 }, { "epoch": 1.4945803521700012, "grad_norm": 5.547774791717529, "learning_rate": 7.952683782141939e-05, "loss": 2.5649818420410155, "memory(GiB)": 77.56, "step": 34885, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.440247 }, { "epoch": 1.4947945674992502, "grad_norm": 4.829455375671387, "learning_rate": 7.952140655483613e-05, "loss": 2.3834651947021483, "memory(GiB)": 77.56, "step": 34890, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 1.440248 }, { "epoch": 1.4950087828284992, "grad_norm": 5.101635456085205, "learning_rate": 7.95159747534449e-05, "loss": 2.51883544921875, "memory(GiB)": 77.56, "step": 34895, "token_acc": 0.4506172839506173, "train_speed(iter/s)": 1.440311 }, { "epoch": 1.495222998157748, "grad_norm": 5.088320732116699, "learning_rate": 7.95105424173441e-05, "loss": 2.4319252014160155, "memory(GiB)": 77.56, "step": 34900, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.440337 }, { "epoch": 1.495437213486997, "grad_norm": 4.86480188369751, "learning_rate": 7.950510954663213e-05, "loss": 2.3498004913330077, "memory(GiB)": 77.56, "step": 34905, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.44032 }, { "epoch": 1.4956514288162461, "grad_norm": 5.9228129386901855, "learning_rate": 7.949967614140744e-05, "loss": 2.537355422973633, "memory(GiB)": 77.56, "step": 34910, "token_acc": 0.493006993006993, "train_speed(iter/s)": 1.440327 }, { "epoch": 1.495865644145495, "grad_norm": 4.44220495223999, "learning_rate": 7.949424220176843e-05, "loss": 2.6062253952026366, "memory(GiB)": 77.56, "step": 34915, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 1.440337 }, { "epoch": 1.496079859474744, "grad_norm": 4.177424430847168, "learning_rate": 7.948880772781356e-05, "loss": 2.566104507446289, "memory(GiB)": 77.56, "step": 34920, "token_acc": 0.4265232974910394, "train_speed(iter/s)": 1.440374 }, { "epoch": 1.496294074803993, "grad_norm": 4.657687187194824, "learning_rate": 7.948337271964128e-05, "loss": 2.241402816772461, "memory(GiB)": 77.56, "step": 34925, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 1.440378 }, { "epoch": 1.4965082901332418, "grad_norm": 4.103175163269043, "learning_rate": 7.947793717735003e-05, "loss": 2.112751579284668, "memory(GiB)": 77.56, "step": 34930, "token_acc": 0.5302325581395348, "train_speed(iter/s)": 1.440396 }, { "epoch": 1.4967225054624909, "grad_norm": 4.832241058349609, "learning_rate": 7.947250110103832e-05, "loss": 2.7344244003295897, "memory(GiB)": 77.56, "step": 34935, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.440402 }, { "epoch": 1.49693672079174, "grad_norm": 5.227599620819092, "learning_rate": 7.946706449080459e-05, "loss": 2.242207336425781, "memory(GiB)": 77.56, "step": 34940, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.440413 }, { "epoch": 1.4971509361209887, "grad_norm": 5.069202423095703, "learning_rate": 7.946162734674734e-05, "loss": 2.9288259506225587, "memory(GiB)": 77.56, "step": 34945, "token_acc": 0.41823899371069184, "train_speed(iter/s)": 1.440475 }, { "epoch": 1.4973651514502377, "grad_norm": 4.369980335235596, "learning_rate": 7.945618966896508e-05, "loss": 2.5908447265625, "memory(GiB)": 77.56, "step": 34950, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.440484 }, { "epoch": 1.4975793667794868, "grad_norm": 4.850968837738037, "learning_rate": 7.945075145755632e-05, "loss": 2.5278106689453126, "memory(GiB)": 77.56, "step": 34955, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.440506 }, { "epoch": 1.4977935821087356, "grad_norm": 5.642793655395508, "learning_rate": 7.944531271261955e-05, "loss": 2.642150115966797, "memory(GiB)": 77.56, "step": 34960, "token_acc": 0.42704626334519574, "train_speed(iter/s)": 1.440522 }, { "epoch": 1.4980077974379846, "grad_norm": 7.260000228881836, "learning_rate": 7.943987343425335e-05, "loss": 2.6825935363769533, "memory(GiB)": 77.56, "step": 34965, "token_acc": 0.49809885931558934, "train_speed(iter/s)": 1.440541 }, { "epoch": 1.4982220127672337, "grad_norm": 4.408665180206299, "learning_rate": 7.94344336225562e-05, "loss": 2.4678133010864256, "memory(GiB)": 77.56, "step": 34970, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.440556 }, { "epoch": 1.4984362280964825, "grad_norm": 4.447878360748291, "learning_rate": 7.942899327762668e-05, "loss": 2.5756969451904297, "memory(GiB)": 77.56, "step": 34975, "token_acc": 0.45604395604395603, "train_speed(iter/s)": 1.440545 }, { "epoch": 1.4986504434257315, "grad_norm": 3.8783018589019775, "learning_rate": 7.942355239956332e-05, "loss": 2.423862838745117, "memory(GiB)": 77.56, "step": 34980, "token_acc": 0.4983922829581994, "train_speed(iter/s)": 1.440563 }, { "epoch": 1.4988646587549805, "grad_norm": 3.93540358543396, "learning_rate": 7.941811098846472e-05, "loss": 2.3538373947143554, "memory(GiB)": 77.56, "step": 34985, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.440603 }, { "epoch": 1.4990788740842294, "grad_norm": 6.256972312927246, "learning_rate": 7.941266904442945e-05, "loss": 2.3601333618164064, "memory(GiB)": 77.56, "step": 34990, "token_acc": 0.49173553719008267, "train_speed(iter/s)": 1.440618 }, { "epoch": 1.4992930894134784, "grad_norm": 4.560388565063477, "learning_rate": 7.94072265675561e-05, "loss": 2.5219305038452147, "memory(GiB)": 77.56, "step": 34995, "token_acc": 0.4924924924924925, "train_speed(iter/s)": 1.440577 }, { "epoch": 1.4995073047427274, "grad_norm": 5.435134410858154, "learning_rate": 7.940178355794324e-05, "loss": 2.223490524291992, "memory(GiB)": 77.56, "step": 35000, "token_acc": 0.5220883534136547, "train_speed(iter/s)": 1.440476 }, { "epoch": 1.4995073047427274, "eval_loss": 1.9680900573730469, "eval_runtime": 14.4282, "eval_samples_per_second": 6.931, "eval_steps_per_second": 6.931, "eval_token_acc": 0.5168831168831168, "step": 35000 }, { "epoch": 1.4997215200719762, "grad_norm": 6.762289047241211, "learning_rate": 7.93963400156895e-05, "loss": 2.2263851165771484, "memory(GiB)": 77.56, "step": 35005, "token_acc": 0.5137963843958135, "train_speed(iter/s)": 1.439615 }, { "epoch": 1.4999357354012253, "grad_norm": 4.437307834625244, "learning_rate": 7.939089594089347e-05, "loss": 2.2932712554931642, "memory(GiB)": 77.56, "step": 35010, "token_acc": 0.5155807365439093, "train_speed(iter/s)": 1.439636 }, { "epoch": 1.5001499507304743, "grad_norm": 5.13985538482666, "learning_rate": 7.93854513336538e-05, "loss": 2.381512260437012, "memory(GiB)": 77.56, "step": 35015, "token_acc": 0.5, "train_speed(iter/s)": 1.439668 }, { "epoch": 1.5003641660597231, "grad_norm": 5.981245517730713, "learning_rate": 7.93800061940691e-05, "loss": 2.6373035430908205, "memory(GiB)": 77.56, "step": 35020, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.439734 }, { "epoch": 1.5005783813889721, "grad_norm": 4.933376789093018, "learning_rate": 7.937456052223804e-05, "loss": 2.458828353881836, "memory(GiB)": 77.56, "step": 35025, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.439745 }, { "epoch": 1.5007925967182212, "grad_norm": 5.496727466583252, "learning_rate": 7.936911431825926e-05, "loss": 2.8117338180541993, "memory(GiB)": 77.56, "step": 35030, "token_acc": 0.4272151898734177, "train_speed(iter/s)": 1.439765 }, { "epoch": 1.50100681204747, "grad_norm": 4.272097110748291, "learning_rate": 7.936366758223142e-05, "loss": 2.379751777648926, "memory(GiB)": 77.56, "step": 35035, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.439743 }, { "epoch": 1.501221027376719, "grad_norm": 4.517671585083008, "learning_rate": 7.935822031425319e-05, "loss": 2.4862644195556642, "memory(GiB)": 77.56, "step": 35040, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.439798 }, { "epoch": 1.501435242705968, "grad_norm": 4.858052730560303, "learning_rate": 7.935277251442326e-05, "loss": 2.133636474609375, "memory(GiB)": 77.56, "step": 35045, "token_acc": 0.5275862068965518, "train_speed(iter/s)": 1.439829 }, { "epoch": 1.5016494580352169, "grad_norm": 4.567692279815674, "learning_rate": 7.934732418284035e-05, "loss": 2.324208068847656, "memory(GiB)": 77.56, "step": 35050, "token_acc": 0.528, "train_speed(iter/s)": 1.439877 }, { "epoch": 1.501863673364466, "grad_norm": 5.690674304962158, "learning_rate": 7.93418753196031e-05, "loss": 2.8742040634155273, "memory(GiB)": 77.56, "step": 35055, "token_acc": 0.3952702702702703, "train_speed(iter/s)": 1.439897 }, { "epoch": 1.502077888693715, "grad_norm": 5.509589195251465, "learning_rate": 7.933642592481026e-05, "loss": 2.378958892822266, "memory(GiB)": 77.56, "step": 35060, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.439867 }, { "epoch": 1.5022921040229638, "grad_norm": 4.702286720275879, "learning_rate": 7.933097599856054e-05, "loss": 2.2223737716674803, "memory(GiB)": 77.56, "step": 35065, "token_acc": 0.516260162601626, "train_speed(iter/s)": 1.439901 }, { "epoch": 1.5025063193522128, "grad_norm": 5.72991418838501, "learning_rate": 7.932552554095268e-05, "loss": 2.482608413696289, "memory(GiB)": 77.56, "step": 35070, "token_acc": 0.47844827586206895, "train_speed(iter/s)": 1.439903 }, { "epoch": 1.5027205346814618, "grad_norm": 7.016448497772217, "learning_rate": 7.932007455208542e-05, "loss": 2.734857749938965, "memory(GiB)": 77.56, "step": 35075, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.439913 }, { "epoch": 1.5029347500107106, "grad_norm": 6.139796733856201, "learning_rate": 7.93146230320575e-05, "loss": 2.5799448013305666, "memory(GiB)": 77.56, "step": 35080, "token_acc": 0.4962962962962963, "train_speed(iter/s)": 1.43996 }, { "epoch": 1.5031489653399597, "grad_norm": 9.221030235290527, "learning_rate": 7.93091709809677e-05, "loss": 2.4534591674804687, "memory(GiB)": 77.56, "step": 35085, "token_acc": 0.47572815533980584, "train_speed(iter/s)": 1.440016 }, { "epoch": 1.5033631806692087, "grad_norm": 5.759133338928223, "learning_rate": 7.930371839891475e-05, "loss": 2.545137405395508, "memory(GiB)": 77.56, "step": 35090, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.440015 }, { "epoch": 1.5035773959984575, "grad_norm": 7.849509239196777, "learning_rate": 7.929826528599746e-05, "loss": 2.778175926208496, "memory(GiB)": 77.56, "step": 35095, "token_acc": 0.45774647887323944, "train_speed(iter/s)": 1.440003 }, { "epoch": 1.5037916113277066, "grad_norm": 5.452881336212158, "learning_rate": 7.929281164231461e-05, "loss": 2.6373451232910154, "memory(GiB)": 77.56, "step": 35100, "token_acc": 0.44571428571428573, "train_speed(iter/s)": 1.440053 }, { "epoch": 1.5040058266569556, "grad_norm": 4.941193580627441, "learning_rate": 7.928735746796501e-05, "loss": 2.6310327529907225, "memory(GiB)": 77.56, "step": 35105, "token_acc": 0.4217252396166134, "train_speed(iter/s)": 1.440096 }, { "epoch": 1.5042200419862044, "grad_norm": 5.028214931488037, "learning_rate": 7.928190276304744e-05, "loss": 2.471433639526367, "memory(GiB)": 77.56, "step": 35110, "token_acc": 0.4596774193548387, "train_speed(iter/s)": 1.440027 }, { "epoch": 1.5044342573154537, "grad_norm": 5.106569290161133, "learning_rate": 7.927644752766074e-05, "loss": 2.7555688858032226, "memory(GiB)": 77.56, "step": 35115, "token_acc": 0.46757679180887374, "train_speed(iter/s)": 1.44003 }, { "epoch": 1.5046484726447025, "grad_norm": 7.169212818145752, "learning_rate": 7.927099176190374e-05, "loss": 2.8109527587890626, "memory(GiB)": 77.56, "step": 35120, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.440062 }, { "epoch": 1.5048626879739513, "grad_norm": 5.62183952331543, "learning_rate": 7.926553546587525e-05, "loss": 2.6057254791259767, "memory(GiB)": 77.56, "step": 35125, "token_acc": 0.4901315789473684, "train_speed(iter/s)": 1.440087 }, { "epoch": 1.5050769033032005, "grad_norm": 5.260104179382324, "learning_rate": 7.926007863967413e-05, "loss": 2.4580989837646485, "memory(GiB)": 77.56, "step": 35130, "token_acc": 0.4763779527559055, "train_speed(iter/s)": 1.440145 }, { "epoch": 1.5052911186324494, "grad_norm": 4.021641731262207, "learning_rate": 7.925462128339925e-05, "loss": 2.5604425430297852, "memory(GiB)": 77.56, "step": 35135, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.44018 }, { "epoch": 1.5055053339616982, "grad_norm": 6.118619441986084, "learning_rate": 7.924916339714945e-05, "loss": 2.6336822509765625, "memory(GiB)": 77.56, "step": 35140, "token_acc": 0.4740740740740741, "train_speed(iter/s)": 1.440221 }, { "epoch": 1.5057195492909474, "grad_norm": 4.927286624908447, "learning_rate": 7.924370498102363e-05, "loss": 2.7585710525512694, "memory(GiB)": 77.56, "step": 35145, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.440217 }, { "epoch": 1.5059337646201962, "grad_norm": 4.069596290588379, "learning_rate": 7.923824603512065e-05, "loss": 2.2800817489624023, "memory(GiB)": 77.56, "step": 35150, "token_acc": 0.5092936802973977, "train_speed(iter/s)": 1.440217 }, { "epoch": 1.506147979949445, "grad_norm": 3.937621593475342, "learning_rate": 7.923278655953943e-05, "loss": 2.326829528808594, "memory(GiB)": 77.56, "step": 35155, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.440218 }, { "epoch": 1.5063621952786943, "grad_norm": 7.246119022369385, "learning_rate": 7.922732655437884e-05, "loss": 2.4844532012939453, "memory(GiB)": 77.56, "step": 35160, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.440163 }, { "epoch": 1.5065764106079431, "grad_norm": 5.87959098815918, "learning_rate": 7.922186601973782e-05, "loss": 2.432872772216797, "memory(GiB)": 77.56, "step": 35165, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.44019 }, { "epoch": 1.506790625937192, "grad_norm": 4.126819610595703, "learning_rate": 7.921640495571529e-05, "loss": 2.5085987091064452, "memory(GiB)": 77.56, "step": 35170, "token_acc": 0.4537037037037037, "train_speed(iter/s)": 1.440222 }, { "epoch": 1.5070048412664412, "grad_norm": 4.535497188568115, "learning_rate": 7.921094336241017e-05, "loss": 2.5078815460205077, "memory(GiB)": 77.56, "step": 35175, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440221 }, { "epoch": 1.50721905659569, "grad_norm": 5.656325340270996, "learning_rate": 7.920548123992142e-05, "loss": 2.6849136352539062, "memory(GiB)": 77.56, "step": 35180, "token_acc": 0.4111111111111111, "train_speed(iter/s)": 1.44019 }, { "epoch": 1.5074332719249388, "grad_norm": 4.733944892883301, "learning_rate": 7.920001858834796e-05, "loss": 2.6267629623413087, "memory(GiB)": 77.56, "step": 35185, "token_acc": 0.4803921568627451, "train_speed(iter/s)": 1.440232 }, { "epoch": 1.507647487254188, "grad_norm": 4.708016395568848, "learning_rate": 7.91945554077888e-05, "loss": 2.660777473449707, "memory(GiB)": 77.56, "step": 35190, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.44021 }, { "epoch": 1.5078617025834369, "grad_norm": 6.860857963562012, "learning_rate": 7.918909169834285e-05, "loss": 2.7101951599121095, "memory(GiB)": 77.56, "step": 35195, "token_acc": 0.4359861591695502, "train_speed(iter/s)": 1.440166 }, { "epoch": 1.5080759179126857, "grad_norm": 4.5236077308654785, "learning_rate": 7.918362746010914e-05, "loss": 2.3776504516601564, "memory(GiB)": 77.56, "step": 35200, "token_acc": 0.4273743016759777, "train_speed(iter/s)": 1.440204 }, { "epoch": 1.508290133241935, "grad_norm": 4.508399486541748, "learning_rate": 7.917816269318665e-05, "loss": 2.7005031585693358, "memory(GiB)": 77.56, "step": 35205, "token_acc": 0.4296875, "train_speed(iter/s)": 1.440252 }, { "epoch": 1.5085043485711838, "grad_norm": 5.647885322570801, "learning_rate": 7.917269739767434e-05, "loss": 2.3213151931762694, "memory(GiB)": 77.56, "step": 35210, "token_acc": 0.4452296819787986, "train_speed(iter/s)": 1.440213 }, { "epoch": 1.5087185639004326, "grad_norm": 4.938968181610107, "learning_rate": 7.916723157367129e-05, "loss": 2.5326332092285155, "memory(GiB)": 77.56, "step": 35215, "token_acc": 0.4588235294117647, "train_speed(iter/s)": 1.440186 }, { "epoch": 1.5089327792296818, "grad_norm": 6.525058269500732, "learning_rate": 7.916176522127645e-05, "loss": 2.8699703216552734, "memory(GiB)": 77.56, "step": 35220, "token_acc": 0.4603658536585366, "train_speed(iter/s)": 1.440176 }, { "epoch": 1.5091469945589306, "grad_norm": 4.628689289093018, "learning_rate": 7.91562983405889e-05, "loss": 2.640371322631836, "memory(GiB)": 77.56, "step": 35225, "token_acc": 0.47808764940239046, "train_speed(iter/s)": 1.440213 }, { "epoch": 1.5093612098881795, "grad_norm": 5.026602745056152, "learning_rate": 7.915083093170764e-05, "loss": 2.506536102294922, "memory(GiB)": 77.56, "step": 35230, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.440211 }, { "epoch": 1.5095754252174287, "grad_norm": 5.8378682136535645, "learning_rate": 7.914536299473173e-05, "loss": 2.594832420349121, "memory(GiB)": 77.56, "step": 35235, "token_acc": 0.5020242914979757, "train_speed(iter/s)": 1.440212 }, { "epoch": 1.5097896405466775, "grad_norm": 6.31693696975708, "learning_rate": 7.913989452976023e-05, "loss": 2.743829345703125, "memory(GiB)": 77.56, "step": 35240, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.440226 }, { "epoch": 1.5100038558759263, "grad_norm": 6.887179851531982, "learning_rate": 7.913442553689221e-05, "loss": 2.4985849380493166, "memory(GiB)": 77.56, "step": 35245, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.440242 }, { "epoch": 1.5102180712051756, "grad_norm": 5.0936479568481445, "learning_rate": 7.912895601622675e-05, "loss": 2.3697273254394533, "memory(GiB)": 77.56, "step": 35250, "token_acc": 0.47183098591549294, "train_speed(iter/s)": 1.440247 }, { "epoch": 1.5104322865344244, "grad_norm": 4.387964725494385, "learning_rate": 7.912348596786291e-05, "loss": 2.3688579559326173, "memory(GiB)": 77.56, "step": 35255, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 1.440239 }, { "epoch": 1.5106465018636732, "grad_norm": 5.024767875671387, "learning_rate": 7.91180153918998e-05, "loss": 2.3366846084594726, "memory(GiB)": 77.56, "step": 35260, "token_acc": 0.5060606060606061, "train_speed(iter/s)": 1.440245 }, { "epoch": 1.5108607171929225, "grad_norm": 6.093329906463623, "learning_rate": 7.911254428843654e-05, "loss": 2.6157827377319336, "memory(GiB)": 77.56, "step": 35265, "token_acc": 0.45791245791245794, "train_speed(iter/s)": 1.440281 }, { "epoch": 1.5110749325221713, "grad_norm": 5.016539573669434, "learning_rate": 7.910707265757222e-05, "loss": 2.4375516891479494, "memory(GiB)": 77.56, "step": 35270, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440228 }, { "epoch": 1.51128914785142, "grad_norm": 4.038543224334717, "learning_rate": 7.910160049940598e-05, "loss": 2.3681718826293947, "memory(GiB)": 77.56, "step": 35275, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.440259 }, { "epoch": 1.5115033631806694, "grad_norm": 5.901710033416748, "learning_rate": 7.909612781403695e-05, "loss": 2.5667343139648438, "memory(GiB)": 77.56, "step": 35280, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.440284 }, { "epoch": 1.5117175785099182, "grad_norm": 4.946591377258301, "learning_rate": 7.909065460156427e-05, "loss": 2.403861427307129, "memory(GiB)": 77.56, "step": 35285, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.440205 }, { "epoch": 1.511931793839167, "grad_norm": 4.404177665710449, "learning_rate": 7.90851808620871e-05, "loss": 2.2176586151123048, "memory(GiB)": 77.56, "step": 35290, "token_acc": 0.525096525096525, "train_speed(iter/s)": 1.440216 }, { "epoch": 1.5121460091684162, "grad_norm": 5.203978061676025, "learning_rate": 7.907970659570457e-05, "loss": 2.570529556274414, "memory(GiB)": 77.56, "step": 35295, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.440203 }, { "epoch": 1.512360224497665, "grad_norm": 4.523471832275391, "learning_rate": 7.90742318025159e-05, "loss": 2.179259490966797, "memory(GiB)": 77.56, "step": 35300, "token_acc": 0.5210727969348659, "train_speed(iter/s)": 1.440208 }, { "epoch": 1.5125744398269139, "grad_norm": 5.4604268074035645, "learning_rate": 7.906875648262022e-05, "loss": 2.560937690734863, "memory(GiB)": 77.56, "step": 35305, "token_acc": 0.4668769716088328, "train_speed(iter/s)": 1.440171 }, { "epoch": 1.5127886551561631, "grad_norm": 4.289139270782471, "learning_rate": 7.906328063611677e-05, "loss": 2.7063177108764647, "memory(GiB)": 77.56, "step": 35310, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440144 }, { "epoch": 1.513002870485412, "grad_norm": 4.89067268371582, "learning_rate": 7.905780426310472e-05, "loss": 2.5784698486328126, "memory(GiB)": 77.56, "step": 35315, "token_acc": 0.4555160142348754, "train_speed(iter/s)": 1.440125 }, { "epoch": 1.5132170858146607, "grad_norm": 6.2356367111206055, "learning_rate": 7.905232736368328e-05, "loss": 2.7093389511108397, "memory(GiB)": 77.56, "step": 35320, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.440081 }, { "epoch": 1.51343130114391, "grad_norm": 6.2283549308776855, "learning_rate": 7.90468499379517e-05, "loss": 2.101117706298828, "memory(GiB)": 77.56, "step": 35325, "token_acc": 0.52734375, "train_speed(iter/s)": 1.440056 }, { "epoch": 1.5136455164731588, "grad_norm": 6.189523220062256, "learning_rate": 7.904137198600917e-05, "loss": 2.468982124328613, "memory(GiB)": 77.56, "step": 35330, "token_acc": 0.45695364238410596, "train_speed(iter/s)": 1.440079 }, { "epoch": 1.5138597318024076, "grad_norm": 4.029015064239502, "learning_rate": 7.903589350795495e-05, "loss": 2.6178512573242188, "memory(GiB)": 77.56, "step": 35335, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.440105 }, { "epoch": 1.5140739471316569, "grad_norm": 4.273472309112549, "learning_rate": 7.903041450388828e-05, "loss": 2.716708946228027, "memory(GiB)": 77.56, "step": 35340, "token_acc": 0.45544554455445546, "train_speed(iter/s)": 1.440098 }, { "epoch": 1.5142881624609057, "grad_norm": 4.4855499267578125, "learning_rate": 7.902493497390842e-05, "loss": 2.6940485000610352, "memory(GiB)": 77.56, "step": 35345, "token_acc": 0.4495677233429395, "train_speed(iter/s)": 1.440123 }, { "epoch": 1.5145023777901545, "grad_norm": 7.15872049331665, "learning_rate": 7.901945491811462e-05, "loss": 2.457971954345703, "memory(GiB)": 77.56, "step": 35350, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.440157 }, { "epoch": 1.5147165931194038, "grad_norm": 4.4815850257873535, "learning_rate": 7.901397433660617e-05, "loss": 2.6729719161987306, "memory(GiB)": 77.56, "step": 35355, "token_acc": 0.4357142857142857, "train_speed(iter/s)": 1.440152 }, { "epoch": 1.5149308084486526, "grad_norm": 5.4969401359558105, "learning_rate": 7.900849322948239e-05, "loss": 2.6149808883666994, "memory(GiB)": 77.56, "step": 35360, "token_acc": 0.42574257425742573, "train_speed(iter/s)": 1.440162 }, { "epoch": 1.5151450237779014, "grad_norm": 3.847578287124634, "learning_rate": 7.900301159684251e-05, "loss": 2.53448600769043, "memory(GiB)": 77.56, "step": 35365, "token_acc": 0.4608695652173913, "train_speed(iter/s)": 1.440207 }, { "epoch": 1.5153592391071506, "grad_norm": 5.528143882751465, "learning_rate": 7.899752943878589e-05, "loss": 2.507093048095703, "memory(GiB)": 77.56, "step": 35370, "token_acc": 0.5084175084175084, "train_speed(iter/s)": 1.440203 }, { "epoch": 1.5155734544363995, "grad_norm": 5.079574108123779, "learning_rate": 7.89920467554118e-05, "loss": 2.6101160049438477, "memory(GiB)": 77.56, "step": 35375, "token_acc": 0.4647058823529412, "train_speed(iter/s)": 1.440212 }, { "epoch": 1.5157876697656483, "grad_norm": 8.060430526733398, "learning_rate": 7.898656354681961e-05, "loss": 2.6383169174194334, "memory(GiB)": 77.56, "step": 35380, "token_acc": 0.4646017699115044, "train_speed(iter/s)": 1.440223 }, { "epoch": 1.5160018850948975, "grad_norm": 5.734468936920166, "learning_rate": 7.89810798131086e-05, "loss": 2.4155319213867186, "memory(GiB)": 77.56, "step": 35385, "token_acc": 0.4681647940074906, "train_speed(iter/s)": 1.440259 }, { "epoch": 1.5162161004241463, "grad_norm": 4.224002838134766, "learning_rate": 7.897559555437817e-05, "loss": 2.6890220642089844, "memory(GiB)": 77.56, "step": 35390, "token_acc": 0.46567164179104475, "train_speed(iter/s)": 1.440261 }, { "epoch": 1.5164303157533952, "grad_norm": 4.405418872833252, "learning_rate": 7.897011077072763e-05, "loss": 2.3410049438476563, "memory(GiB)": 77.56, "step": 35395, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.440256 }, { "epoch": 1.5166445310826444, "grad_norm": 5.095930099487305, "learning_rate": 7.896462546225637e-05, "loss": 2.3978046417236327, "memory(GiB)": 77.56, "step": 35400, "token_acc": 0.4628099173553719, "train_speed(iter/s)": 1.440244 }, { "epoch": 1.5168587464118932, "grad_norm": 3.97068190574646, "learning_rate": 7.895913962906374e-05, "loss": 2.5652626037597654, "memory(GiB)": 77.56, "step": 35405, "token_acc": 0.43558282208588955, "train_speed(iter/s)": 1.440243 }, { "epoch": 1.517072961741142, "grad_norm": 4.979217052459717, "learning_rate": 7.895365327124912e-05, "loss": 2.497342109680176, "memory(GiB)": 77.56, "step": 35410, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.440209 }, { "epoch": 1.5172871770703913, "grad_norm": 4.717621803283691, "learning_rate": 7.894816638891194e-05, "loss": 2.6755285263061523, "memory(GiB)": 77.56, "step": 35415, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.440165 }, { "epoch": 1.51750139239964, "grad_norm": 5.189795970916748, "learning_rate": 7.894267898215155e-05, "loss": 2.686911392211914, "memory(GiB)": 77.56, "step": 35420, "token_acc": 0.43304843304843305, "train_speed(iter/s)": 1.440204 }, { "epoch": 1.517715607728889, "grad_norm": 5.874298095703125, "learning_rate": 7.893719105106739e-05, "loss": 2.5686975479125977, "memory(GiB)": 77.56, "step": 35425, "token_acc": 0.4783950617283951, "train_speed(iter/s)": 1.440185 }, { "epoch": 1.5179298230581382, "grad_norm": 4.797722339630127, "learning_rate": 7.893170259575886e-05, "loss": 2.5290069580078125, "memory(GiB)": 77.56, "step": 35430, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.440168 }, { "epoch": 1.518144038387387, "grad_norm": 6.44780969619751, "learning_rate": 7.892621361632539e-05, "loss": 2.531484031677246, "memory(GiB)": 77.56, "step": 35435, "token_acc": 0.4590643274853801, "train_speed(iter/s)": 1.440167 }, { "epoch": 1.5183582537166358, "grad_norm": 4.254237174987793, "learning_rate": 7.892072411286644e-05, "loss": 2.2918649673461915, "memory(GiB)": 77.56, "step": 35440, "token_acc": 0.5230125523012552, "train_speed(iter/s)": 1.440174 }, { "epoch": 1.518572469045885, "grad_norm": 5.859513759613037, "learning_rate": 7.891523408548143e-05, "loss": 2.5549551010131837, "memory(GiB)": 77.56, "step": 35445, "token_acc": 0.4555160142348754, "train_speed(iter/s)": 1.440191 }, { "epoch": 1.5187866843751339, "grad_norm": 5.328099250793457, "learning_rate": 7.890974353426984e-05, "loss": 2.576326370239258, "memory(GiB)": 77.56, "step": 35450, "token_acc": 0.4591439688715953, "train_speed(iter/s)": 1.44022 }, { "epoch": 1.519000899704383, "grad_norm": 5.957167625427246, "learning_rate": 7.890425245933113e-05, "loss": 2.236961555480957, "memory(GiB)": 77.56, "step": 35455, "token_acc": 0.5404411764705882, "train_speed(iter/s)": 1.440247 }, { "epoch": 1.519215115033632, "grad_norm": 5.930410385131836, "learning_rate": 7.889876086076475e-05, "loss": 2.690696907043457, "memory(GiB)": 77.56, "step": 35460, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.440215 }, { "epoch": 1.5194293303628807, "grad_norm": 4.081855297088623, "learning_rate": 7.889326873867022e-05, "loss": 2.475139617919922, "memory(GiB)": 77.56, "step": 35465, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.440259 }, { "epoch": 1.5196435456921298, "grad_norm": 5.731164455413818, "learning_rate": 7.888777609314704e-05, "loss": 2.626582145690918, "memory(GiB)": 77.56, "step": 35470, "token_acc": 0.4504792332268371, "train_speed(iter/s)": 1.440251 }, { "epoch": 1.5198577610213788, "grad_norm": 4.737641334533691, "learning_rate": 7.888228292429468e-05, "loss": 2.4544776916503905, "memory(GiB)": 77.56, "step": 35475, "token_acc": 0.4691358024691358, "train_speed(iter/s)": 1.440265 }, { "epoch": 1.5200719763506276, "grad_norm": 5.118046760559082, "learning_rate": 7.887678923221267e-05, "loss": 2.8044885635375976, "memory(GiB)": 77.56, "step": 35480, "token_acc": 0.45, "train_speed(iter/s)": 1.440322 }, { "epoch": 1.5202861916798767, "grad_norm": 4.17631721496582, "learning_rate": 7.887129501700055e-05, "loss": 2.554097557067871, "memory(GiB)": 77.56, "step": 35485, "token_acc": 0.4574780058651026, "train_speed(iter/s)": 1.440339 }, { "epoch": 1.5205004070091257, "grad_norm": 4.571279525756836, "learning_rate": 7.886580027875782e-05, "loss": 2.3724308013916016, "memory(GiB)": 77.56, "step": 35490, "token_acc": 0.5062761506276151, "train_speed(iter/s)": 1.440294 }, { "epoch": 1.5207146223383745, "grad_norm": 5.154933929443359, "learning_rate": 7.886030501758404e-05, "loss": 2.3690290451049805, "memory(GiB)": 77.56, "step": 35495, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 1.440298 }, { "epoch": 1.5209288376676235, "grad_norm": 5.929644584655762, "learning_rate": 7.885480923357878e-05, "loss": 2.553443717956543, "memory(GiB)": 77.56, "step": 35500, "token_acc": 0.46865671641791046, "train_speed(iter/s)": 1.440292 }, { "epoch": 1.5209288376676235, "eval_loss": 2.2393977642059326, "eval_runtime": 14.1273, "eval_samples_per_second": 7.079, "eval_steps_per_second": 7.079, "eval_token_acc": 0.47717231222385864, "step": 35500 }, { "epoch": 1.5211430529968726, "grad_norm": 4.536977291107178, "learning_rate": 7.884931292684157e-05, "loss": 2.4208919525146486, "memory(GiB)": 77.56, "step": 35505, "token_acc": 0.4687179487179487, "train_speed(iter/s)": 1.439368 }, { "epoch": 1.5213572683261214, "grad_norm": 4.821561336517334, "learning_rate": 7.884381609747198e-05, "loss": 2.2888898849487305, "memory(GiB)": 77.56, "step": 35510, "token_acc": 0.4803921568627451, "train_speed(iter/s)": 1.439342 }, { "epoch": 1.5215714836553704, "grad_norm": 7.081754684448242, "learning_rate": 7.883831874556962e-05, "loss": 2.5779401779174806, "memory(GiB)": 77.56, "step": 35515, "token_acc": 0.44765342960288806, "train_speed(iter/s)": 1.439322 }, { "epoch": 1.5217856989846195, "grad_norm": 6.1978044509887695, "learning_rate": 7.883282087123407e-05, "loss": 2.5102333068847655, "memory(GiB)": 77.56, "step": 35520, "token_acc": 0.5019607843137255, "train_speed(iter/s)": 1.43935 }, { "epoch": 1.5219999143138683, "grad_norm": 4.620543003082275, "learning_rate": 7.88273224745649e-05, "loss": 2.7689218521118164, "memory(GiB)": 77.56, "step": 35525, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.439354 }, { "epoch": 1.5222141296431173, "grad_norm": 5.6832780838012695, "learning_rate": 7.882182355566177e-05, "loss": 2.5620773315429686, "memory(GiB)": 77.56, "step": 35530, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.439371 }, { "epoch": 1.5224283449723663, "grad_norm": 5.037816047668457, "learning_rate": 7.881632411462424e-05, "loss": 2.644948387145996, "memory(GiB)": 77.56, "step": 35535, "token_acc": 0.4405594405594406, "train_speed(iter/s)": 1.43939 }, { "epoch": 1.5226425603016152, "grad_norm": 5.584239482879639, "learning_rate": 7.881082415155198e-05, "loss": 2.460175132751465, "memory(GiB)": 77.56, "step": 35540, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.439385 }, { "epoch": 1.5228567756308642, "grad_norm": 4.644760608673096, "learning_rate": 7.880532366654462e-05, "loss": 2.578045463562012, "memory(GiB)": 77.56, "step": 35545, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.439428 }, { "epoch": 1.5230709909601132, "grad_norm": 4.966503620147705, "learning_rate": 7.879982265970178e-05, "loss": 2.2969470977783204, "memory(GiB)": 77.56, "step": 35550, "token_acc": 0.48398576512455516, "train_speed(iter/s)": 1.439469 }, { "epoch": 1.523285206289362, "grad_norm": 5.454427719116211, "learning_rate": 7.879432113112316e-05, "loss": 2.5244773864746093, "memory(GiB)": 77.56, "step": 35555, "token_acc": 0.48120300751879697, "train_speed(iter/s)": 1.439459 }, { "epoch": 1.523499421618611, "grad_norm": 5.873898029327393, "learning_rate": 7.87888190809084e-05, "loss": 2.512242317199707, "memory(GiB)": 77.56, "step": 35560, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.439463 }, { "epoch": 1.52371363694786, "grad_norm": 4.819453239440918, "learning_rate": 7.878331650915716e-05, "loss": 2.350246810913086, "memory(GiB)": 77.56, "step": 35565, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.439451 }, { "epoch": 1.523927852277109, "grad_norm": 4.41420841217041, "learning_rate": 7.877781341596915e-05, "loss": 2.604685401916504, "memory(GiB)": 77.56, "step": 35570, "token_acc": 0.434375, "train_speed(iter/s)": 1.439494 }, { "epoch": 1.524142067606358, "grad_norm": 5.763274669647217, "learning_rate": 7.877230980144404e-05, "loss": 2.310897636413574, "memory(GiB)": 77.56, "step": 35575, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.439488 }, { "epoch": 1.524356282935607, "grad_norm": 5.697576522827148, "learning_rate": 7.876680566568157e-05, "loss": 2.5009170532226563, "memory(GiB)": 77.56, "step": 35580, "token_acc": 0.49201277955271566, "train_speed(iter/s)": 1.439464 }, { "epoch": 1.5245704982648558, "grad_norm": 7.941682815551758, "learning_rate": 7.876130100878142e-05, "loss": 2.696417808532715, "memory(GiB)": 77.56, "step": 35585, "token_acc": 0.43656716417910446, "train_speed(iter/s)": 1.439488 }, { "epoch": 1.5247847135941048, "grad_norm": 6.363534927368164, "learning_rate": 7.875579583084331e-05, "loss": 2.6718107223510743, "memory(GiB)": 77.56, "step": 35590, "token_acc": 0.46204620462046203, "train_speed(iter/s)": 1.439528 }, { "epoch": 1.5249989289233539, "grad_norm": 5.607141017913818, "learning_rate": 7.8750290131967e-05, "loss": 3.0673961639404297, "memory(GiB)": 77.56, "step": 35595, "token_acc": 0.4163934426229508, "train_speed(iter/s)": 1.439542 }, { "epoch": 1.5252131442526027, "grad_norm": 6.786413669586182, "learning_rate": 7.874478391225221e-05, "loss": 2.569788360595703, "memory(GiB)": 77.56, "step": 35600, "token_acc": 0.519434628975265, "train_speed(iter/s)": 1.439549 }, { "epoch": 1.5254273595818517, "grad_norm": 4.321645736694336, "learning_rate": 7.87392771717987e-05, "loss": 2.5054037094116213, "memory(GiB)": 77.56, "step": 35605, "token_acc": 0.4407894736842105, "train_speed(iter/s)": 1.439562 }, { "epoch": 1.5256415749111008, "grad_norm": 4.681307792663574, "learning_rate": 7.873376991070623e-05, "loss": 2.707870674133301, "memory(GiB)": 77.56, "step": 35610, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.439535 }, { "epoch": 1.5258557902403496, "grad_norm": 4.680245399475098, "learning_rate": 7.872826212907453e-05, "loss": 2.8780704498291017, "memory(GiB)": 77.56, "step": 35615, "token_acc": 0.4332129963898917, "train_speed(iter/s)": 1.439556 }, { "epoch": 1.5260700055695986, "grad_norm": 4.478838920593262, "learning_rate": 7.872275382700345e-05, "loss": 2.4710350036621094, "memory(GiB)": 77.56, "step": 35620, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.439574 }, { "epoch": 1.5262842208988476, "grad_norm": 5.685272693634033, "learning_rate": 7.871724500459272e-05, "loss": 2.4026954650878904, "memory(GiB)": 77.56, "step": 35625, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.439578 }, { "epoch": 1.5264984362280964, "grad_norm": 4.8700761795043945, "learning_rate": 7.871173566194217e-05, "loss": 2.677492141723633, "memory(GiB)": 77.56, "step": 35630, "token_acc": 0.4774774774774775, "train_speed(iter/s)": 1.439578 }, { "epoch": 1.5267126515573455, "grad_norm": 5.006597518920898, "learning_rate": 7.870622579915158e-05, "loss": 2.3438543319702148, "memory(GiB)": 77.56, "step": 35635, "token_acc": 0.5131578947368421, "train_speed(iter/s)": 1.439587 }, { "epoch": 1.5269268668865945, "grad_norm": 5.039852619171143, "learning_rate": 7.870071541632078e-05, "loss": 2.71852970123291, "memory(GiB)": 77.56, "step": 35640, "token_acc": 0.46439628482972134, "train_speed(iter/s)": 1.439588 }, { "epoch": 1.5271410822158433, "grad_norm": 6.308069705963135, "learning_rate": 7.869520451354961e-05, "loss": 2.248679351806641, "memory(GiB)": 77.56, "step": 35645, "token_acc": 0.531496062992126, "train_speed(iter/s)": 1.439566 }, { "epoch": 1.5273552975450924, "grad_norm": 5.813370227813721, "learning_rate": 7.868969309093788e-05, "loss": 2.7753889083862306, "memory(GiB)": 77.56, "step": 35650, "token_acc": 0.43944636678200694, "train_speed(iter/s)": 1.439597 }, { "epoch": 1.5275695128743414, "grad_norm": 4.956195831298828, "learning_rate": 7.868418114858545e-05, "loss": 2.3649627685546877, "memory(GiB)": 77.56, "step": 35655, "token_acc": 0.42379182156133827, "train_speed(iter/s)": 1.43962 }, { "epoch": 1.5277837282035902, "grad_norm": 4.613227844238281, "learning_rate": 7.867866868659218e-05, "loss": 2.7416263580322267, "memory(GiB)": 77.56, "step": 35660, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.439625 }, { "epoch": 1.5279979435328392, "grad_norm": 4.017142295837402, "learning_rate": 7.867315570505792e-05, "loss": 2.4690214157104493, "memory(GiB)": 77.56, "step": 35665, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.439674 }, { "epoch": 1.5282121588620883, "grad_norm": 5.85869836807251, "learning_rate": 7.866764220408255e-05, "loss": 2.5865554809570312, "memory(GiB)": 77.56, "step": 35670, "token_acc": 0.4559748427672956, "train_speed(iter/s)": 1.439706 }, { "epoch": 1.528426374191337, "grad_norm": 5.94870138168335, "learning_rate": 7.866212818376594e-05, "loss": 2.4458690643310548, "memory(GiB)": 77.56, "step": 35675, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.439699 }, { "epoch": 1.5286405895205861, "grad_norm": 4.3266448974609375, "learning_rate": 7.8656613644208e-05, "loss": 2.4299875259399415, "memory(GiB)": 77.56, "step": 35680, "token_acc": 0.4828897338403042, "train_speed(iter/s)": 1.439735 }, { "epoch": 1.5288548048498352, "grad_norm": 4.098716735839844, "learning_rate": 7.865109858550862e-05, "loss": 2.627867317199707, "memory(GiB)": 77.56, "step": 35685, "token_acc": 0.4450402144772118, "train_speed(iter/s)": 1.439787 }, { "epoch": 1.529069020179084, "grad_norm": 4.418527603149414, "learning_rate": 7.86455830077677e-05, "loss": 2.79116153717041, "memory(GiB)": 77.56, "step": 35690, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.439788 }, { "epoch": 1.529283235508333, "grad_norm": 5.227472305297852, "learning_rate": 7.864006691108519e-05, "loss": 2.8565208435058596, "memory(GiB)": 77.56, "step": 35695, "token_acc": 0.4054878048780488, "train_speed(iter/s)": 1.439796 }, { "epoch": 1.529497450837582, "grad_norm": 5.767401695251465, "learning_rate": 7.863455029556098e-05, "loss": 2.7329170227050783, "memory(GiB)": 77.56, "step": 35700, "token_acc": 0.45674740484429066, "train_speed(iter/s)": 1.439845 }, { "epoch": 1.5297116661668309, "grad_norm": 5.61146879196167, "learning_rate": 7.862903316129504e-05, "loss": 2.4084505081176757, "memory(GiB)": 77.56, "step": 35705, "token_acc": 0.45126353790613716, "train_speed(iter/s)": 1.439898 }, { "epoch": 1.5299258814960799, "grad_norm": 4.584047794342041, "learning_rate": 7.862351550838732e-05, "loss": 2.3794958114624025, "memory(GiB)": 77.56, "step": 35710, "token_acc": 0.4817073170731707, "train_speed(iter/s)": 1.439864 }, { "epoch": 1.530140096825329, "grad_norm": 5.686636924743652, "learning_rate": 7.861799733693775e-05, "loss": 2.7293888092041017, "memory(GiB)": 77.56, "step": 35715, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.439888 }, { "epoch": 1.5303543121545777, "grad_norm": 4.240833759307861, "learning_rate": 7.86124786470463e-05, "loss": 2.501619338989258, "memory(GiB)": 77.56, "step": 35720, "token_acc": 0.46449704142011833, "train_speed(iter/s)": 1.439918 }, { "epoch": 1.5305685274838268, "grad_norm": 4.446664810180664, "learning_rate": 7.8606959438813e-05, "loss": 2.2656715393066404, "memory(GiB)": 77.56, "step": 35725, "token_acc": 0.5037037037037037, "train_speed(iter/s)": 1.439929 }, { "epoch": 1.5307827428130758, "grad_norm": 4.771551609039307, "learning_rate": 7.860143971233776e-05, "loss": 2.5883575439453126, "memory(GiB)": 77.56, "step": 35730, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.439985 }, { "epoch": 1.5309969581423246, "grad_norm": 5.456704139709473, "learning_rate": 7.859591946772062e-05, "loss": 2.0569992065429688, "memory(GiB)": 77.56, "step": 35735, "token_acc": 0.5803921568627451, "train_speed(iter/s)": 1.44002 }, { "epoch": 1.5312111734715736, "grad_norm": 4.773999214172363, "learning_rate": 7.859039870506158e-05, "loss": 2.556158447265625, "memory(GiB)": 77.56, "step": 35740, "token_acc": 0.4709141274238227, "train_speed(iter/s)": 1.440046 }, { "epoch": 1.5314253888008227, "grad_norm": 5.565639019012451, "learning_rate": 7.858487742446063e-05, "loss": 2.3145166397094727, "memory(GiB)": 77.56, "step": 35745, "token_acc": 0.5298804780876494, "train_speed(iter/s)": 1.439999 }, { "epoch": 1.5316396041300715, "grad_norm": 4.366349220275879, "learning_rate": 7.857935562601783e-05, "loss": 2.548398017883301, "memory(GiB)": 77.56, "step": 35750, "token_acc": 0.4174174174174174, "train_speed(iter/s)": 1.43998 }, { "epoch": 1.5318538194593205, "grad_norm": 5.73050594329834, "learning_rate": 7.857383330983319e-05, "loss": 2.599017333984375, "memory(GiB)": 77.56, "step": 35755, "token_acc": 0.4306784660766962, "train_speed(iter/s)": 1.439999 }, { "epoch": 1.5320680347885696, "grad_norm": 6.19191837310791, "learning_rate": 7.856831047600674e-05, "loss": 2.465973663330078, "memory(GiB)": 77.56, "step": 35760, "token_acc": 0.47410358565737054, "train_speed(iter/s)": 1.440026 }, { "epoch": 1.5322822501178184, "grad_norm": 4.285295486450195, "learning_rate": 7.856278712463856e-05, "loss": 2.5737236022949217, "memory(GiB)": 77.56, "step": 35765, "token_acc": 0.45686900958466453, "train_speed(iter/s)": 1.440002 }, { "epoch": 1.5324964654470674, "grad_norm": 6.242623329162598, "learning_rate": 7.855726325582869e-05, "loss": 2.522172546386719, "memory(GiB)": 77.56, "step": 35770, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.440018 }, { "epoch": 1.5327106807763164, "grad_norm": 6.120961666107178, "learning_rate": 7.855173886967722e-05, "loss": 2.373886489868164, "memory(GiB)": 77.56, "step": 35775, "token_acc": 0.4664429530201342, "train_speed(iter/s)": 1.440049 }, { "epoch": 1.5329248961055653, "grad_norm": 4.496016025543213, "learning_rate": 7.854621396628421e-05, "loss": 2.491876411437988, "memory(GiB)": 77.56, "step": 35780, "token_acc": 0.4527687296416938, "train_speed(iter/s)": 1.440049 }, { "epoch": 1.5331391114348143, "grad_norm": 5.415960311889648, "learning_rate": 7.854068854574976e-05, "loss": 2.5605112075805665, "memory(GiB)": 77.56, "step": 35785, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.440039 }, { "epoch": 1.5333533267640633, "grad_norm": 4.093324184417725, "learning_rate": 7.853516260817395e-05, "loss": 2.450044631958008, "memory(GiB)": 77.56, "step": 35790, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.440036 }, { "epoch": 1.5335675420933121, "grad_norm": 4.74154806137085, "learning_rate": 7.85296361536569e-05, "loss": 2.6434085845947264, "memory(GiB)": 77.56, "step": 35795, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.440034 }, { "epoch": 1.5337817574225612, "grad_norm": 4.665964126586914, "learning_rate": 7.852410918229874e-05, "loss": 2.134696388244629, "memory(GiB)": 77.56, "step": 35800, "token_acc": 0.55859375, "train_speed(iter/s)": 1.440049 }, { "epoch": 1.5339959727518102, "grad_norm": 4.803917407989502, "learning_rate": 7.851858169419959e-05, "loss": 2.351963424682617, "memory(GiB)": 77.56, "step": 35805, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.440058 }, { "epoch": 1.534210188081059, "grad_norm": 5.831637382507324, "learning_rate": 7.851305368945956e-05, "loss": 2.5339468002319334, "memory(GiB)": 77.56, "step": 35810, "token_acc": 0.4721189591078067, "train_speed(iter/s)": 1.439996 }, { "epoch": 1.534424403410308, "grad_norm": 5.5182204246521, "learning_rate": 7.850752516817883e-05, "loss": 2.585394859313965, "memory(GiB)": 77.56, "step": 35815, "token_acc": 0.48562300319488816, "train_speed(iter/s)": 1.440056 }, { "epoch": 1.534638618739557, "grad_norm": 3.996464967727661, "learning_rate": 7.850199613045754e-05, "loss": 2.4121829986572267, "memory(GiB)": 77.56, "step": 35820, "token_acc": 0.5044776119402985, "train_speed(iter/s)": 1.44005 }, { "epoch": 1.534852834068806, "grad_norm": 4.608769416809082, "learning_rate": 7.849646657639585e-05, "loss": 2.4529977798461915, "memory(GiB)": 77.56, "step": 35825, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.440036 }, { "epoch": 1.535067049398055, "grad_norm": 5.1034440994262695, "learning_rate": 7.849093650609392e-05, "loss": 2.3742450714111327, "memory(GiB)": 77.56, "step": 35830, "token_acc": 0.5254901960784314, "train_speed(iter/s)": 1.440017 }, { "epoch": 1.535281264727304, "grad_norm": 4.757763862609863, "learning_rate": 7.848540591965197e-05, "loss": 2.565390396118164, "memory(GiB)": 77.56, "step": 35835, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.440023 }, { "epoch": 1.5354954800565528, "grad_norm": 5.375235080718994, "learning_rate": 7.847987481717017e-05, "loss": 2.7975418090820314, "memory(GiB)": 77.56, "step": 35840, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.440018 }, { "epoch": 1.5357096953858018, "grad_norm": 4.377996921539307, "learning_rate": 7.847434319874871e-05, "loss": 2.5620803833007812, "memory(GiB)": 77.56, "step": 35845, "token_acc": 0.4891640866873065, "train_speed(iter/s)": 1.440081 }, { "epoch": 1.5359239107150509, "grad_norm": 5.474483966827393, "learning_rate": 7.846881106448781e-05, "loss": 2.30145149230957, "memory(GiB)": 77.56, "step": 35850, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.440093 }, { "epoch": 1.5361381260442997, "grad_norm": 6.444149494171143, "learning_rate": 7.846327841448771e-05, "loss": 2.3971038818359376, "memory(GiB)": 77.56, "step": 35855, "token_acc": 0.49146757679180886, "train_speed(iter/s)": 1.440106 }, { "epoch": 1.5363523413735487, "grad_norm": 4.7963948249816895, "learning_rate": 7.84577452488486e-05, "loss": 2.344122123718262, "memory(GiB)": 77.56, "step": 35860, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 1.440089 }, { "epoch": 1.5365665567027977, "grad_norm": 4.2550578117370605, "learning_rate": 7.845221156767076e-05, "loss": 2.6109573364257814, "memory(GiB)": 77.56, "step": 35865, "token_acc": 0.4339622641509434, "train_speed(iter/s)": 1.440114 }, { "epoch": 1.5367807720320465, "grad_norm": 6.8269267082214355, "learning_rate": 7.84466773710544e-05, "loss": 2.2013324737548827, "memory(GiB)": 77.56, "step": 35870, "token_acc": 0.47035573122529645, "train_speed(iter/s)": 1.440161 }, { "epoch": 1.5369949873612956, "grad_norm": 6.0858073234558105, "learning_rate": 7.844114265909979e-05, "loss": 2.5771881103515626, "memory(GiB)": 77.56, "step": 35875, "token_acc": 0.4843205574912892, "train_speed(iter/s)": 1.440177 }, { "epoch": 1.5372092026905446, "grad_norm": 4.231525421142578, "learning_rate": 7.843560743190721e-05, "loss": 2.3409427642822265, "memory(GiB)": 77.56, "step": 35880, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.440189 }, { "epoch": 1.5374234180197934, "grad_norm": 6.729775905609131, "learning_rate": 7.843007168957693e-05, "loss": 2.240573692321777, "memory(GiB)": 77.56, "step": 35885, "token_acc": 0.539568345323741, "train_speed(iter/s)": 1.440185 }, { "epoch": 1.5376376333490425, "grad_norm": 3.6165997982025146, "learning_rate": 7.842453543220924e-05, "loss": 2.641560935974121, "memory(GiB)": 77.56, "step": 35890, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.44024 }, { "epoch": 1.5378518486782915, "grad_norm": 5.708844184875488, "learning_rate": 7.841899865990441e-05, "loss": 2.79778938293457, "memory(GiB)": 77.56, "step": 35895, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.440237 }, { "epoch": 1.5380660640075403, "grad_norm": 4.266225814819336, "learning_rate": 7.841346137276276e-05, "loss": 2.532614517211914, "memory(GiB)": 77.56, "step": 35900, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.440234 }, { "epoch": 1.5382802793367893, "grad_norm": 5.179828643798828, "learning_rate": 7.84079235708846e-05, "loss": 2.3186885833740236, "memory(GiB)": 77.56, "step": 35905, "token_acc": 0.5311355311355311, "train_speed(iter/s)": 1.440252 }, { "epoch": 1.5384944946660384, "grad_norm": 4.616061687469482, "learning_rate": 7.840238525437027e-05, "loss": 2.5573942184448244, "memory(GiB)": 77.56, "step": 35910, "token_acc": 0.47988505747126436, "train_speed(iter/s)": 1.440279 }, { "epoch": 1.5387087099952872, "grad_norm": 5.172558307647705, "learning_rate": 7.839684642332008e-05, "loss": 2.4401458740234374, "memory(GiB)": 77.56, "step": 35915, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 1.440276 }, { "epoch": 1.5389229253245362, "grad_norm": 4.917933464050293, "learning_rate": 7.839130707783438e-05, "loss": 2.329453468322754, "memory(GiB)": 77.56, "step": 35920, "token_acc": 0.5080385852090032, "train_speed(iter/s)": 1.440267 }, { "epoch": 1.5391371406537853, "grad_norm": 5.331910133361816, "learning_rate": 7.838576721801351e-05, "loss": 2.6115036010742188, "memory(GiB)": 77.56, "step": 35925, "token_acc": 0.44594594594594594, "train_speed(iter/s)": 1.440266 }, { "epoch": 1.539351355983034, "grad_norm": 4.201162815093994, "learning_rate": 7.838022684395786e-05, "loss": 2.461620330810547, "memory(GiB)": 77.56, "step": 35930, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.44029 }, { "epoch": 1.539565571312283, "grad_norm": 4.826604843139648, "learning_rate": 7.837468595576777e-05, "loss": 2.5974220275878905, "memory(GiB)": 77.56, "step": 35935, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.440323 }, { "epoch": 1.5397797866415321, "grad_norm": 5.460199356079102, "learning_rate": 7.836914455354362e-05, "loss": 2.337574005126953, "memory(GiB)": 77.56, "step": 35940, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.440357 }, { "epoch": 1.539994001970781, "grad_norm": 4.470697402954102, "learning_rate": 7.83636026373858e-05, "loss": 2.454970359802246, "memory(GiB)": 77.56, "step": 35945, "token_acc": 0.4407894736842105, "train_speed(iter/s)": 1.440409 }, { "epoch": 1.54020821730003, "grad_norm": 6.884347915649414, "learning_rate": 7.835806020739472e-05, "loss": 2.413157653808594, "memory(GiB)": 77.56, "step": 35950, "token_acc": 0.4261168384879725, "train_speed(iter/s)": 1.440452 }, { "epoch": 1.540422432629279, "grad_norm": 4.889522552490234, "learning_rate": 7.835251726367078e-05, "loss": 2.2992578506469727, "memory(GiB)": 77.56, "step": 35955, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.440406 }, { "epoch": 1.5406366479585278, "grad_norm": 6.5905632972717285, "learning_rate": 7.834697380631438e-05, "loss": 2.235517883300781, "memory(GiB)": 77.56, "step": 35960, "token_acc": 0.5441176470588235, "train_speed(iter/s)": 1.440435 }, { "epoch": 1.5408508632877769, "grad_norm": 5.198739051818848, "learning_rate": 7.834142983542597e-05, "loss": 2.703260040283203, "memory(GiB)": 77.56, "step": 35965, "token_acc": 0.501577287066246, "train_speed(iter/s)": 1.440417 }, { "epoch": 1.541065078617026, "grad_norm": 4.178027629852295, "learning_rate": 7.833588535110597e-05, "loss": 2.3758218765258787, "memory(GiB)": 77.56, "step": 35970, "token_acc": 0.4854014598540146, "train_speed(iter/s)": 1.440427 }, { "epoch": 1.5412792939462747, "grad_norm": 5.941470146179199, "learning_rate": 7.833034035345483e-05, "loss": 2.4637609481811524, "memory(GiB)": 77.56, "step": 35975, "token_acc": 0.44404332129963897, "train_speed(iter/s)": 1.440426 }, { "epoch": 1.5414935092755238, "grad_norm": 5.588265419006348, "learning_rate": 7.832479484257298e-05, "loss": 2.5613609313964845, "memory(GiB)": 77.56, "step": 35980, "token_acc": 0.41580756013745707, "train_speed(iter/s)": 1.440441 }, { "epoch": 1.5417077246047728, "grad_norm": 6.106418609619141, "learning_rate": 7.831924881856092e-05, "loss": 2.515755271911621, "memory(GiB)": 77.56, "step": 35985, "token_acc": 0.4815950920245399, "train_speed(iter/s)": 1.440454 }, { "epoch": 1.5419219399340216, "grad_norm": 4.166738986968994, "learning_rate": 7.831370228151909e-05, "loss": 2.3808338165283205, "memory(GiB)": 77.56, "step": 35990, "token_acc": 0.5152439024390244, "train_speed(iter/s)": 1.440458 }, { "epoch": 1.5421361552632706, "grad_norm": 4.873399257659912, "learning_rate": 7.830815523154799e-05, "loss": 2.7323768615722654, "memory(GiB)": 77.56, "step": 35995, "token_acc": 0.4520123839009288, "train_speed(iter/s)": 1.44048 }, { "epoch": 1.5423503705925197, "grad_norm": 4.752975940704346, "learning_rate": 7.83026076687481e-05, "loss": 2.5931865692138674, "memory(GiB)": 77.56, "step": 36000, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.440489 }, { "epoch": 1.5423503705925197, "eval_loss": 2.1286087036132812, "eval_runtime": 14.9947, "eval_samples_per_second": 6.669, "eval_steps_per_second": 6.669, "eval_token_acc": 0.48885793871866295, "step": 36000 }, { "epoch": 1.5425645859217685, "grad_norm": 4.597265243530273, "learning_rate": 7.829705959321993e-05, "loss": 2.367877388000488, "memory(GiB)": 77.56, "step": 36005, "token_acc": 0.4871060171919771, "train_speed(iter/s)": 1.439519 }, { "epoch": 1.5427788012510175, "grad_norm": 5.153852939605713, "learning_rate": 7.829151100506396e-05, "loss": 2.6542198181152346, "memory(GiB)": 77.56, "step": 36010, "token_acc": 0.46283783783783783, "train_speed(iter/s)": 1.4395 }, { "epoch": 1.5429930165802666, "grad_norm": 4.836722373962402, "learning_rate": 7.828596190438075e-05, "loss": 2.182339096069336, "memory(GiB)": 77.56, "step": 36015, "token_acc": 0.4980237154150198, "train_speed(iter/s)": 1.439547 }, { "epoch": 1.5432072319095154, "grad_norm": 4.521052360534668, "learning_rate": 7.828041229127079e-05, "loss": 2.690706253051758, "memory(GiB)": 77.56, "step": 36020, "token_acc": 0.43434343434343436, "train_speed(iter/s)": 1.439584 }, { "epoch": 1.5434214472387644, "grad_norm": 5.721269607543945, "learning_rate": 7.827486216583465e-05, "loss": 2.8337615966796874, "memory(GiB)": 77.56, "step": 36025, "token_acc": 0.4538653366583541, "train_speed(iter/s)": 1.439596 }, { "epoch": 1.5436356625680134, "grad_norm": 6.398462295532227, "learning_rate": 7.826931152817283e-05, "loss": 2.1992380142211916, "memory(GiB)": 77.56, "step": 36030, "token_acc": 0.56, "train_speed(iter/s)": 1.439617 }, { "epoch": 1.5438498778972622, "grad_norm": 5.473598957061768, "learning_rate": 7.826376037838594e-05, "loss": 2.2730775833129884, "memory(GiB)": 77.56, "step": 36035, "token_acc": 0.5, "train_speed(iter/s)": 1.439618 }, { "epoch": 1.5440640932265113, "grad_norm": 4.58029842376709, "learning_rate": 7.82582087165745e-05, "loss": 2.492129325866699, "memory(GiB)": 77.56, "step": 36040, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.439678 }, { "epoch": 1.5442783085557603, "grad_norm": 5.405659198760986, "learning_rate": 7.82526565428391e-05, "loss": 2.7293840408325196, "memory(GiB)": 77.56, "step": 36045, "token_acc": 0.43010752688172044, "train_speed(iter/s)": 1.439638 }, { "epoch": 1.5444925238850091, "grad_norm": 5.566967010498047, "learning_rate": 7.824710385728033e-05, "loss": 3.030938911437988, "memory(GiB)": 77.56, "step": 36050, "token_acc": 0.4276729559748428, "train_speed(iter/s)": 1.439674 }, { "epoch": 1.5447067392142582, "grad_norm": 4.404112339019775, "learning_rate": 7.824155065999879e-05, "loss": 2.689794158935547, "memory(GiB)": 77.56, "step": 36055, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.439698 }, { "epoch": 1.5449209545435072, "grad_norm": 4.441066741943359, "learning_rate": 7.823599695109504e-05, "loss": 2.8341550827026367, "memory(GiB)": 77.56, "step": 36060, "token_acc": 0.41642228739002934, "train_speed(iter/s)": 1.439693 }, { "epoch": 1.545135169872756, "grad_norm": 5.663236141204834, "learning_rate": 7.823044273066975e-05, "loss": 2.3890533447265625, "memory(GiB)": 77.56, "step": 36065, "token_acc": 0.4980392156862745, "train_speed(iter/s)": 1.439656 }, { "epoch": 1.545349385202005, "grad_norm": 3.5448763370513916, "learning_rate": 7.822488799882348e-05, "loss": 2.541372299194336, "memory(GiB)": 77.56, "step": 36070, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.439685 }, { "epoch": 1.545563600531254, "grad_norm": 4.116077423095703, "learning_rate": 7.821933275565691e-05, "loss": 2.534458351135254, "memory(GiB)": 77.56, "step": 36075, "token_acc": 0.50390625, "train_speed(iter/s)": 1.439678 }, { "epoch": 1.545777815860503, "grad_norm": 7.806304931640625, "learning_rate": 7.821377700127063e-05, "loss": 2.5961685180664062, "memory(GiB)": 77.56, "step": 36080, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.43972 }, { "epoch": 1.545992031189752, "grad_norm": 5.884232044219971, "learning_rate": 7.820822073576534e-05, "loss": 2.6835458755493162, "memory(GiB)": 77.56, "step": 36085, "token_acc": 0.4701492537313433, "train_speed(iter/s)": 1.43968 }, { "epoch": 1.546206246519001, "grad_norm": 3.6564464569091797, "learning_rate": 7.820266395924164e-05, "loss": 2.40456485748291, "memory(GiB)": 77.56, "step": 36090, "token_acc": 0.46932515337423314, "train_speed(iter/s)": 1.439647 }, { "epoch": 1.5464204618482498, "grad_norm": 4.301235198974609, "learning_rate": 7.819710667180024e-05, "loss": 2.34564323425293, "memory(GiB)": 77.56, "step": 36095, "token_acc": 0.5045592705167173, "train_speed(iter/s)": 1.439667 }, { "epoch": 1.5466346771774988, "grad_norm": 4.588385105133057, "learning_rate": 7.819154887354182e-05, "loss": 2.4966814041137697, "memory(GiB)": 77.56, "step": 36100, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.439681 }, { "epoch": 1.5468488925067478, "grad_norm": 4.888988494873047, "learning_rate": 7.818599056456701e-05, "loss": 2.5890830993652343, "memory(GiB)": 77.56, "step": 36105, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.439718 }, { "epoch": 1.5470631078359967, "grad_norm": 6.6023993492126465, "learning_rate": 7.818043174497656e-05, "loss": 2.503236770629883, "memory(GiB)": 77.56, "step": 36110, "token_acc": 0.4847457627118644, "train_speed(iter/s)": 1.439756 }, { "epoch": 1.5472773231652457, "grad_norm": 4.630427837371826, "learning_rate": 7.817487241487115e-05, "loss": 2.526523208618164, "memory(GiB)": 77.56, "step": 36115, "token_acc": 0.5015974440894568, "train_speed(iter/s)": 1.439782 }, { "epoch": 1.5474915384944947, "grad_norm": 5.061960220336914, "learning_rate": 7.816931257435151e-05, "loss": 2.577986717224121, "memory(GiB)": 77.56, "step": 36120, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.439797 }, { "epoch": 1.5477057538237435, "grad_norm": 5.558644771575928, "learning_rate": 7.816375222351833e-05, "loss": 2.602009391784668, "memory(GiB)": 77.56, "step": 36125, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.43983 }, { "epoch": 1.5479199691529926, "grad_norm": 4.5223565101623535, "learning_rate": 7.815819136247238e-05, "loss": 2.6625133514404298, "memory(GiB)": 77.56, "step": 36130, "token_acc": 0.45110410094637227, "train_speed(iter/s)": 1.439803 }, { "epoch": 1.5481341844822416, "grad_norm": 4.3092851638793945, "learning_rate": 7.815262999131436e-05, "loss": 2.432361030578613, "memory(GiB)": 77.56, "step": 36135, "token_acc": 0.5202702702702703, "train_speed(iter/s)": 1.439807 }, { "epoch": 1.5483483998114904, "grad_norm": 5.107344150543213, "learning_rate": 7.814706811014504e-05, "loss": 2.4219039916992187, "memory(GiB)": 77.56, "step": 36140, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 1.439782 }, { "epoch": 1.5485626151407395, "grad_norm": 4.754781246185303, "learning_rate": 7.814150571906517e-05, "loss": 2.3867931365966797, "memory(GiB)": 77.56, "step": 36145, "token_acc": 0.515625, "train_speed(iter/s)": 1.43981 }, { "epoch": 1.5487768304699885, "grad_norm": 4.836016654968262, "learning_rate": 7.813594281817555e-05, "loss": 2.7338991165161133, "memory(GiB)": 77.56, "step": 36150, "token_acc": 0.4489795918367347, "train_speed(iter/s)": 1.439849 }, { "epoch": 1.5489910457992373, "grad_norm": 4.735340595245361, "learning_rate": 7.813037940757692e-05, "loss": 2.7531713485717773, "memory(GiB)": 77.56, "step": 36155, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.439827 }, { "epoch": 1.5492052611284863, "grad_norm": 5.50546407699585, "learning_rate": 7.812481548737007e-05, "loss": 2.559846115112305, "memory(GiB)": 77.56, "step": 36160, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 1.439863 }, { "epoch": 1.5494194764577354, "grad_norm": 4.673547744750977, "learning_rate": 7.811925105765581e-05, "loss": 2.452170753479004, "memory(GiB)": 77.56, "step": 36165, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.439864 }, { "epoch": 1.5496336917869842, "grad_norm": 4.878746509552002, "learning_rate": 7.811368611853493e-05, "loss": 2.666554832458496, "memory(GiB)": 77.56, "step": 36170, "token_acc": 0.4744525547445255, "train_speed(iter/s)": 1.439876 }, { "epoch": 1.5498479071162332, "grad_norm": 4.516483306884766, "learning_rate": 7.810812067010827e-05, "loss": 2.686572265625, "memory(GiB)": 77.56, "step": 36175, "token_acc": 0.4295774647887324, "train_speed(iter/s)": 1.439919 }, { "epoch": 1.5500621224454822, "grad_norm": 4.7003912925720215, "learning_rate": 7.810255471247663e-05, "loss": 2.6671051025390624, "memory(GiB)": 77.56, "step": 36180, "token_acc": 0.47019867549668876, "train_speed(iter/s)": 1.43996 }, { "epoch": 1.550276337774731, "grad_norm": 4.506163120269775, "learning_rate": 7.809698824574085e-05, "loss": 2.5628196716308596, "memory(GiB)": 77.56, "step": 36185, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.440004 }, { "epoch": 1.55049055310398, "grad_norm": 4.055752277374268, "learning_rate": 7.809142127000176e-05, "loss": 2.416562080383301, "memory(GiB)": 77.56, "step": 36190, "token_acc": 0.4662756598240469, "train_speed(iter/s)": 1.440031 }, { "epoch": 1.5507047684332291, "grad_norm": 4.566366672515869, "learning_rate": 7.808585378536024e-05, "loss": 2.5232059478759767, "memory(GiB)": 77.56, "step": 36195, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.44006 }, { "epoch": 1.550918983762478, "grad_norm": 5.101639747619629, "learning_rate": 7.808028579191711e-05, "loss": 2.614002227783203, "memory(GiB)": 77.56, "step": 36200, "token_acc": 0.45045045045045046, "train_speed(iter/s)": 1.44008 }, { "epoch": 1.551133199091727, "grad_norm": 5.844062805175781, "learning_rate": 7.807471728977327e-05, "loss": 2.3080062866210938, "memory(GiB)": 77.56, "step": 36205, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.440079 }, { "epoch": 1.551347414420976, "grad_norm": 3.9902467727661133, "learning_rate": 7.80691482790296e-05, "loss": 2.4026275634765626, "memory(GiB)": 77.56, "step": 36210, "token_acc": 0.46348314606741575, "train_speed(iter/s)": 1.440096 }, { "epoch": 1.5515616297502248, "grad_norm": 5.366555213928223, "learning_rate": 7.806357875978698e-05, "loss": 2.6843694686889648, "memory(GiB)": 77.56, "step": 36215, "token_acc": 0.42996742671009774, "train_speed(iter/s)": 1.440122 }, { "epoch": 1.5517758450794739, "grad_norm": 7.945376396179199, "learning_rate": 7.805800873214628e-05, "loss": 2.5908931732177733, "memory(GiB)": 77.56, "step": 36220, "token_acc": 0.463768115942029, "train_speed(iter/s)": 1.440121 }, { "epoch": 1.551990060408723, "grad_norm": 5.47683048248291, "learning_rate": 7.805243819620845e-05, "loss": 2.338287925720215, "memory(GiB)": 77.56, "step": 36225, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.440076 }, { "epoch": 1.5522042757379717, "grad_norm": 4.751999855041504, "learning_rate": 7.80468671520744e-05, "loss": 2.6259727478027344, "memory(GiB)": 77.56, "step": 36230, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.440115 }, { "epoch": 1.5524184910672207, "grad_norm": 5.376540660858154, "learning_rate": 7.804129559984501e-05, "loss": 2.642112159729004, "memory(GiB)": 77.56, "step": 36235, "token_acc": 0.46846846846846846, "train_speed(iter/s)": 1.440112 }, { "epoch": 1.5526327063964698, "grad_norm": 5.526229381561279, "learning_rate": 7.803572353962126e-05, "loss": 2.418209457397461, "memory(GiB)": 77.56, "step": 36240, "token_acc": 0.45182724252491696, "train_speed(iter/s)": 1.440135 }, { "epoch": 1.5528469217257186, "grad_norm": 6.461812496185303, "learning_rate": 7.80301509715041e-05, "loss": 2.478191947937012, "memory(GiB)": 77.56, "step": 36245, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.440169 }, { "epoch": 1.5530611370549676, "grad_norm": 4.604681015014648, "learning_rate": 7.802457789559443e-05, "loss": 2.3935449600219725, "memory(GiB)": 77.56, "step": 36250, "token_acc": 0.47038327526132406, "train_speed(iter/s)": 1.440211 }, { "epoch": 1.5532753523842167, "grad_norm": 4.748660087585449, "learning_rate": 7.801900431199325e-05, "loss": 2.4280540466308596, "memory(GiB)": 77.56, "step": 36255, "token_acc": 0.4962121212121212, "train_speed(iter/s)": 1.440221 }, { "epoch": 1.5534895677134655, "grad_norm": 4.3995161056518555, "learning_rate": 7.801343022080152e-05, "loss": 2.305587577819824, "memory(GiB)": 77.56, "step": 36260, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.440228 }, { "epoch": 1.5537037830427145, "grad_norm": 5.565771579742432, "learning_rate": 7.800785562212025e-05, "loss": 2.4840831756591797, "memory(GiB)": 77.56, "step": 36265, "token_acc": 0.462406015037594, "train_speed(iter/s)": 1.440214 }, { "epoch": 1.5539179983719635, "grad_norm": 3.6793878078460693, "learning_rate": 7.800228051605036e-05, "loss": 2.393549346923828, "memory(GiB)": 77.56, "step": 36270, "token_acc": 0.48589341692789967, "train_speed(iter/s)": 1.440207 }, { "epoch": 1.5541322137012124, "grad_norm": 4.280629634857178, "learning_rate": 7.799670490269291e-05, "loss": 2.7498884201049805, "memory(GiB)": 77.56, "step": 36275, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.440273 }, { "epoch": 1.5543464290304614, "grad_norm": 4.478820323944092, "learning_rate": 7.799112878214889e-05, "loss": 2.5493656158447267, "memory(GiB)": 77.56, "step": 36280, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.440292 }, { "epoch": 1.5545606443597104, "grad_norm": 5.53695011138916, "learning_rate": 7.798555215451932e-05, "loss": 2.5075130462646484, "memory(GiB)": 77.56, "step": 36285, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.44031 }, { "epoch": 1.5547748596889592, "grad_norm": 5.313154697418213, "learning_rate": 7.797997501990522e-05, "loss": 2.4900819778442385, "memory(GiB)": 77.56, "step": 36290, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.44034 }, { "epoch": 1.5549890750182083, "grad_norm": 4.965277671813965, "learning_rate": 7.797439737840761e-05, "loss": 2.4642307281494142, "memory(GiB)": 77.56, "step": 36295, "token_acc": 0.4600760456273764, "train_speed(iter/s)": 1.440352 }, { "epoch": 1.5552032903474573, "grad_norm": 5.644978046417236, "learning_rate": 7.796881923012755e-05, "loss": 2.5070371627807617, "memory(GiB)": 77.56, "step": 36300, "token_acc": 0.46382978723404256, "train_speed(iter/s)": 1.44033 }, { "epoch": 1.5554175056767061, "grad_norm": 4.894263744354248, "learning_rate": 7.796324057516611e-05, "loss": 2.4139293670654296, "memory(GiB)": 77.56, "step": 36305, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.440339 }, { "epoch": 1.5556317210059551, "grad_norm": 5.888087749481201, "learning_rate": 7.795766141362432e-05, "loss": 2.6148553848266602, "memory(GiB)": 77.56, "step": 36310, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.440305 }, { "epoch": 1.5558459363352042, "grad_norm": 7.530528545379639, "learning_rate": 7.795208174560326e-05, "loss": 2.598414993286133, "memory(GiB)": 77.56, "step": 36315, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.440313 }, { "epoch": 1.556060151664453, "grad_norm": 4.82913875579834, "learning_rate": 7.794650157120405e-05, "loss": 2.1892459869384764, "memory(GiB)": 77.56, "step": 36320, "token_acc": 0.5340909090909091, "train_speed(iter/s)": 1.440292 }, { "epoch": 1.556274366993702, "grad_norm": 4.564405918121338, "learning_rate": 7.794092089052774e-05, "loss": 2.514790725708008, "memory(GiB)": 77.56, "step": 36325, "token_acc": 0.5066225165562914, "train_speed(iter/s)": 1.440281 }, { "epoch": 1.556488582322951, "grad_norm": 4.016900062561035, "learning_rate": 7.793533970367542e-05, "loss": 2.4102779388427735, "memory(GiB)": 77.56, "step": 36330, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.440339 }, { "epoch": 1.5567027976521999, "grad_norm": 5.928051471710205, "learning_rate": 7.792975801074822e-05, "loss": 2.5120061874389648, "memory(GiB)": 77.56, "step": 36335, "token_acc": 0.45878136200716846, "train_speed(iter/s)": 1.440304 }, { "epoch": 1.556917012981449, "grad_norm": 5.336033821105957, "learning_rate": 7.792417581184725e-05, "loss": 2.4910240173339844, "memory(GiB)": 77.56, "step": 36340, "token_acc": 0.47633136094674555, "train_speed(iter/s)": 1.440342 }, { "epoch": 1.557131228310698, "grad_norm": 6.072386741638184, "learning_rate": 7.791859310707366e-05, "loss": 2.4716182708740235, "memory(GiB)": 77.56, "step": 36345, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.440371 }, { "epoch": 1.5573454436399468, "grad_norm": 4.771454811096191, "learning_rate": 7.791300989652853e-05, "loss": 2.791448211669922, "memory(GiB)": 77.56, "step": 36350, "token_acc": 0.45110410094637227, "train_speed(iter/s)": 1.440411 }, { "epoch": 1.5575596589691958, "grad_norm": 6.639774322509766, "learning_rate": 7.790742618031307e-05, "loss": 2.591164779663086, "memory(GiB)": 77.56, "step": 36355, "token_acc": 0.42955326460481097, "train_speed(iter/s)": 1.440454 }, { "epoch": 1.5577738742984448, "grad_norm": 4.969061851501465, "learning_rate": 7.79018419585284e-05, "loss": 2.256228446960449, "memory(GiB)": 77.56, "step": 36360, "token_acc": 0.5423076923076923, "train_speed(iter/s)": 1.440444 }, { "epoch": 1.5579880896276936, "grad_norm": 5.595006942749023, "learning_rate": 7.78962572312757e-05, "loss": 2.6134761810302733, "memory(GiB)": 77.56, "step": 36365, "token_acc": 0.5016611295681063, "train_speed(iter/s)": 1.440414 }, { "epoch": 1.5582023049569427, "grad_norm": 4.773839950561523, "learning_rate": 7.78906719986561e-05, "loss": 2.3932146072387694, "memory(GiB)": 77.56, "step": 36370, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.440429 }, { "epoch": 1.5584165202861917, "grad_norm": 4.176609516143799, "learning_rate": 7.788508626077084e-05, "loss": 2.5070213317871093, "memory(GiB)": 77.56, "step": 36375, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.440411 }, { "epoch": 1.5586307356154405, "grad_norm": 5.443298816680908, "learning_rate": 7.787950001772108e-05, "loss": 2.4655118942260743, "memory(GiB)": 77.56, "step": 36380, "token_acc": 0.5085910652920962, "train_speed(iter/s)": 1.440429 }, { "epoch": 1.5588449509446896, "grad_norm": 4.701830863952637, "learning_rate": 7.7873913269608e-05, "loss": 2.3925262451171876, "memory(GiB)": 77.56, "step": 36385, "token_acc": 0.5035211267605634, "train_speed(iter/s)": 1.440401 }, { "epoch": 1.5590591662739386, "grad_norm": 6.884942054748535, "learning_rate": 7.786832601653287e-05, "loss": 2.774361419677734, "memory(GiB)": 77.56, "step": 36390, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.440403 }, { "epoch": 1.5592733816031874, "grad_norm": 5.308371543884277, "learning_rate": 7.786273825859684e-05, "loss": 2.405905914306641, "memory(GiB)": 77.56, "step": 36395, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.44045 }, { "epoch": 1.5594875969324364, "grad_norm": 4.704966068267822, "learning_rate": 7.78571499959012e-05, "loss": 2.5021812438964846, "memory(GiB)": 77.56, "step": 36400, "token_acc": 0.49673202614379086, "train_speed(iter/s)": 1.440444 }, { "epoch": 1.5597018122616855, "grad_norm": 4.557310104370117, "learning_rate": 7.785156122854713e-05, "loss": 2.3835289001464846, "memory(GiB)": 77.56, "step": 36405, "token_acc": 0.52, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.5599160275909343, "grad_norm": 4.9684553146362305, "learning_rate": 7.784597195663593e-05, "loss": 2.506352996826172, "memory(GiB)": 77.56, "step": 36410, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.440518 }, { "epoch": 1.5601302429201833, "grad_norm": 5.420516014099121, "learning_rate": 7.78403821802688e-05, "loss": 2.4266332626342773, "memory(GiB)": 77.56, "step": 36415, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.440486 }, { "epoch": 1.5603444582494324, "grad_norm": 4.634389877319336, "learning_rate": 7.783479189954704e-05, "loss": 2.4806692123413088, "memory(GiB)": 77.56, "step": 36420, "token_acc": 0.48264984227129337, "train_speed(iter/s)": 1.4405 }, { "epoch": 1.5605586735786812, "grad_norm": 4.9568071365356445, "learning_rate": 7.78292011145719e-05, "loss": 2.3620481491088867, "memory(GiB)": 77.56, "step": 36425, "token_acc": 0.4934640522875817, "train_speed(iter/s)": 1.440537 }, { "epoch": 1.5607728889079302, "grad_norm": 4.540524005889893, "learning_rate": 7.782360982544469e-05, "loss": 2.5699893951416017, "memory(GiB)": 77.56, "step": 36430, "token_acc": 0.44554455445544555, "train_speed(iter/s)": 1.440565 }, { "epoch": 1.5609871042371792, "grad_norm": 5.17749547958374, "learning_rate": 7.781801803226669e-05, "loss": 2.657927894592285, "memory(GiB)": 77.56, "step": 36435, "token_acc": 0.4492307692307692, "train_speed(iter/s)": 1.440589 }, { "epoch": 1.561201319566428, "grad_norm": 5.044076919555664, "learning_rate": 7.781242573513918e-05, "loss": 2.407718849182129, "memory(GiB)": 77.56, "step": 36440, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.440634 }, { "epoch": 1.561415534895677, "grad_norm": 7.126518726348877, "learning_rate": 7.780683293416352e-05, "loss": 2.568478012084961, "memory(GiB)": 77.56, "step": 36445, "token_acc": 0.5015974440894568, "train_speed(iter/s)": 1.440654 }, { "epoch": 1.5616297502249261, "grad_norm": 4.720685958862305, "learning_rate": 7.780123962944097e-05, "loss": 2.646109771728516, "memory(GiB)": 77.56, "step": 36450, "token_acc": 0.44077134986225897, "train_speed(iter/s)": 1.440701 }, { "epoch": 1.561843965554175, "grad_norm": 5.71162748336792, "learning_rate": 7.779564582107289e-05, "loss": 2.2690031051635744, "memory(GiB)": 77.56, "step": 36455, "token_acc": 0.540268456375839, "train_speed(iter/s)": 1.440715 }, { "epoch": 1.562058180883424, "grad_norm": 5.233338832855225, "learning_rate": 7.779005150916061e-05, "loss": 2.619420051574707, "memory(GiB)": 77.56, "step": 36460, "token_acc": 0.464, "train_speed(iter/s)": 1.440758 }, { "epoch": 1.562272396212673, "grad_norm": 4.673311233520508, "learning_rate": 7.778445669380548e-05, "loss": 2.5446447372436523, "memory(GiB)": 77.56, "step": 36465, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.44077 }, { "epoch": 1.5624866115419218, "grad_norm": 6.547863006591797, "learning_rate": 7.777886137510885e-05, "loss": 2.765332794189453, "memory(GiB)": 77.56, "step": 36470, "token_acc": 0.43272727272727274, "train_speed(iter/s)": 1.440766 }, { "epoch": 1.562700826871171, "grad_norm": 4.185351371765137, "learning_rate": 7.777326555317209e-05, "loss": 2.795298957824707, "memory(GiB)": 77.56, "step": 36475, "token_acc": 0.42628205128205127, "train_speed(iter/s)": 1.440779 }, { "epoch": 1.5629150422004199, "grad_norm": 10.662088394165039, "learning_rate": 7.776766922809658e-05, "loss": 2.4162115097045898, "memory(GiB)": 77.56, "step": 36480, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.440775 }, { "epoch": 1.5631292575296687, "grad_norm": 5.266146659851074, "learning_rate": 7.776207239998368e-05, "loss": 2.4303817749023438, "memory(GiB)": 77.56, "step": 36485, "token_acc": 0.4981549815498155, "train_speed(iter/s)": 1.440822 }, { "epoch": 1.563343472858918, "grad_norm": 6.210820198059082, "learning_rate": 7.775647506893481e-05, "loss": 2.5056489944458007, "memory(GiB)": 77.56, "step": 36490, "token_acc": 0.46405228758169936, "train_speed(iter/s)": 1.440864 }, { "epoch": 1.5635576881881668, "grad_norm": 5.352214813232422, "learning_rate": 7.775087723505133e-05, "loss": 2.8598777770996096, "memory(GiB)": 77.56, "step": 36495, "token_acc": 0.41597796143250687, "train_speed(iter/s)": 1.440884 }, { "epoch": 1.5637719035174156, "grad_norm": 5.598677635192871, "learning_rate": 7.774527889843471e-05, "loss": 2.8109331130981445, "memory(GiB)": 77.56, "step": 36500, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.440888 }, { "epoch": 1.5637719035174156, "eval_loss": 2.434030055999756, "eval_runtime": 13.3766, "eval_samples_per_second": 7.476, "eval_steps_per_second": 7.476, "eval_token_acc": 0.4772413793103448, "step": 36500 }, { "epoch": 1.5639861188466648, "grad_norm": 4.27955436706543, "learning_rate": 7.773968005918631e-05, "loss": 2.3682605743408205, "memory(GiB)": 77.56, "step": 36505, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.440061 }, { "epoch": 1.5642003341759136, "grad_norm": 5.812290191650391, "learning_rate": 7.77340807174076e-05, "loss": 2.694140815734863, "memory(GiB)": 77.56, "step": 36510, "token_acc": 0.4685714285714286, "train_speed(iter/s)": 1.440051 }, { "epoch": 1.5644145495051625, "grad_norm": 4.478960990905762, "learning_rate": 7.77284808732e-05, "loss": 2.4436033248901365, "memory(GiB)": 77.56, "step": 36515, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.440059 }, { "epoch": 1.5646287648344117, "grad_norm": 6.991723537445068, "learning_rate": 7.772288052666494e-05, "loss": 2.541650390625, "memory(GiB)": 77.56, "step": 36520, "token_acc": 0.4414715719063545, "train_speed(iter/s)": 1.440037 }, { "epoch": 1.5648429801636605, "grad_norm": 5.616585731506348, "learning_rate": 7.771727967790393e-05, "loss": 2.5818508148193358, "memory(GiB)": 77.56, "step": 36525, "token_acc": 0.40492957746478875, "train_speed(iter/s)": 1.440072 }, { "epoch": 1.5650571954929093, "grad_norm": 5.703603267669678, "learning_rate": 7.771167832701835e-05, "loss": 2.3734350204467773, "memory(GiB)": 77.56, "step": 36530, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.440074 }, { "epoch": 1.5652714108221586, "grad_norm": 4.5755414962768555, "learning_rate": 7.770607647410975e-05, "loss": 2.5518375396728517, "memory(GiB)": 77.56, "step": 36535, "token_acc": 0.453416149068323, "train_speed(iter/s)": 1.440087 }, { "epoch": 1.5654856261514074, "grad_norm": 7.752872943878174, "learning_rate": 7.770047411927958e-05, "loss": 2.6195579528808595, "memory(GiB)": 77.56, "step": 36540, "token_acc": 0.47079037800687284, "train_speed(iter/s)": 1.440057 }, { "epoch": 1.5656998414806562, "grad_norm": 4.654615879058838, "learning_rate": 7.769487126262934e-05, "loss": 2.446702766418457, "memory(GiB)": 77.56, "step": 36545, "token_acc": 0.5, "train_speed(iter/s)": 1.440033 }, { "epoch": 1.5659140568099055, "grad_norm": 5.9365715980529785, "learning_rate": 7.768926790426052e-05, "loss": 2.5963687896728516, "memory(GiB)": 77.56, "step": 36550, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.440033 }, { "epoch": 1.5661282721391543, "grad_norm": 5.438448429107666, "learning_rate": 7.768366404427464e-05, "loss": 2.5643732070922853, "memory(GiB)": 77.56, "step": 36555, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.440063 }, { "epoch": 1.566342487468403, "grad_norm": 5.707797527313232, "learning_rate": 7.767805968277322e-05, "loss": 2.591194725036621, "memory(GiB)": 77.56, "step": 36560, "token_acc": 0.4131054131054131, "train_speed(iter/s)": 1.440102 }, { "epoch": 1.5665567027976524, "grad_norm": 5.371236324310303, "learning_rate": 7.767245481985777e-05, "loss": 2.7137020111083983, "memory(GiB)": 77.56, "step": 36565, "token_acc": 0.4354243542435424, "train_speed(iter/s)": 1.440126 }, { "epoch": 1.5667709181269012, "grad_norm": 4.493969440460205, "learning_rate": 7.766684945562986e-05, "loss": 2.3718088150024412, "memory(GiB)": 77.56, "step": 36570, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.440162 }, { "epoch": 1.56698513345615, "grad_norm": 4.753102779388428, "learning_rate": 7.7661243590191e-05, "loss": 2.6996337890625, "memory(GiB)": 77.56, "step": 36575, "token_acc": 0.45396825396825397, "train_speed(iter/s)": 1.440123 }, { "epoch": 1.5671993487853992, "grad_norm": 5.907541751861572, "learning_rate": 7.765563722364278e-05, "loss": 2.4066436767578123, "memory(GiB)": 77.56, "step": 36580, "token_acc": 0.5369649805447471, "train_speed(iter/s)": 1.440123 }, { "epoch": 1.567413564114648, "grad_norm": 5.512517929077148, "learning_rate": 7.765003035608676e-05, "loss": 2.658072280883789, "memory(GiB)": 77.56, "step": 36585, "token_acc": 0.46715328467153283, "train_speed(iter/s)": 1.440146 }, { "epoch": 1.5676277794438969, "grad_norm": 4.607879161834717, "learning_rate": 7.764442298762448e-05, "loss": 2.757480430603027, "memory(GiB)": 77.56, "step": 36590, "token_acc": 0.4185303514376997, "train_speed(iter/s)": 1.440148 }, { "epoch": 1.5678419947731461, "grad_norm": 4.5528645515441895, "learning_rate": 7.763881511835754e-05, "loss": 2.548477554321289, "memory(GiB)": 77.56, "step": 36595, "token_acc": 0.47540983606557374, "train_speed(iter/s)": 1.440204 }, { "epoch": 1.568056210102395, "grad_norm": 4.772520542144775, "learning_rate": 7.763320674838756e-05, "loss": 2.4298946380615236, "memory(GiB)": 77.56, "step": 36600, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.440134 }, { "epoch": 1.5682704254316437, "grad_norm": 5.726729869842529, "learning_rate": 7.76275978778161e-05, "loss": 2.635542869567871, "memory(GiB)": 77.56, "step": 36605, "token_acc": 0.48, "train_speed(iter/s)": 1.440181 }, { "epoch": 1.568484640760893, "grad_norm": 4.152476787567139, "learning_rate": 7.762198850674478e-05, "loss": 2.5780864715576173, "memory(GiB)": 77.56, "step": 36610, "token_acc": 0.43234323432343236, "train_speed(iter/s)": 1.440193 }, { "epoch": 1.5686988560901418, "grad_norm": 5.2029709815979, "learning_rate": 7.761637863527524e-05, "loss": 2.6104827880859376, "memory(GiB)": 77.56, "step": 36615, "token_acc": 0.44274809160305345, "train_speed(iter/s)": 1.440192 }, { "epoch": 1.5689130714193906, "grad_norm": 4.238569259643555, "learning_rate": 7.761076826350911e-05, "loss": 2.3544130325317383, "memory(GiB)": 77.56, "step": 36620, "token_acc": 0.5209790209790209, "train_speed(iter/s)": 1.440194 }, { "epoch": 1.5691272867486399, "grad_norm": 4.985941410064697, "learning_rate": 7.760515739154798e-05, "loss": 2.8340641021728517, "memory(GiB)": 77.56, "step": 36625, "token_acc": 0.45544554455445546, "train_speed(iter/s)": 1.440245 }, { "epoch": 1.5693415020778887, "grad_norm": 4.0189313888549805, "learning_rate": 7.759954601949355e-05, "loss": 2.4815437316894533, "memory(GiB)": 77.56, "step": 36630, "token_acc": 0.5259938837920489, "train_speed(iter/s)": 1.440238 }, { "epoch": 1.5695557174071375, "grad_norm": 4.385519027709961, "learning_rate": 7.759393414744747e-05, "loss": 2.5908279418945312, "memory(GiB)": 77.56, "step": 36635, "token_acc": 0.46963562753036436, "train_speed(iter/s)": 1.440237 }, { "epoch": 1.5697699327363868, "grad_norm": 7.434064865112305, "learning_rate": 7.758832177551136e-05, "loss": 2.553477478027344, "memory(GiB)": 77.56, "step": 36640, "token_acc": 0.5328185328185329, "train_speed(iter/s)": 1.440221 }, { "epoch": 1.5699841480656356, "grad_norm": 6.4770402908325195, "learning_rate": 7.758270890378691e-05, "loss": 2.514266014099121, "memory(GiB)": 77.56, "step": 36645, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.440182 }, { "epoch": 1.5701983633948844, "grad_norm": 4.651577949523926, "learning_rate": 7.757709553237584e-05, "loss": 2.2985984802246096, "memory(GiB)": 77.56, "step": 36650, "token_acc": 0.5381526104417671, "train_speed(iter/s)": 1.440192 }, { "epoch": 1.5704125787241336, "grad_norm": 8.120918273925781, "learning_rate": 7.757148166137981e-05, "loss": 2.3862056732177734, "memory(GiB)": 77.56, "step": 36655, "token_acc": 0.5095785440613027, "train_speed(iter/s)": 1.440209 }, { "epoch": 1.5706267940533825, "grad_norm": 5.80086612701416, "learning_rate": 7.756586729090052e-05, "loss": 2.622327995300293, "memory(GiB)": 77.56, "step": 36660, "token_acc": 0.43260188087774293, "train_speed(iter/s)": 1.440226 }, { "epoch": 1.5708410093826313, "grad_norm": 3.806565523147583, "learning_rate": 7.756025242103969e-05, "loss": 2.404244041442871, "memory(GiB)": 77.56, "step": 36665, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.440241 }, { "epoch": 1.5710552247118805, "grad_norm": 4.028979778289795, "learning_rate": 7.755463705189902e-05, "loss": 2.401742935180664, "memory(GiB)": 77.56, "step": 36670, "token_acc": 0.5, "train_speed(iter/s)": 1.440255 }, { "epoch": 1.5712694400411293, "grad_norm": 6.583826065063477, "learning_rate": 7.754902118358027e-05, "loss": 2.2654064178466795, "memory(GiB)": 77.56, "step": 36675, "token_acc": 0.5056603773584906, "train_speed(iter/s)": 1.440269 }, { "epoch": 1.5714836553703782, "grad_norm": 4.303641319274902, "learning_rate": 7.754340481618514e-05, "loss": 2.709935188293457, "memory(GiB)": 77.56, "step": 36680, "token_acc": 0.4568345323741007, "train_speed(iter/s)": 1.440287 }, { "epoch": 1.5716978706996274, "grad_norm": 6.314344882965088, "learning_rate": 7.753778794981541e-05, "loss": 2.5506317138671877, "memory(GiB)": 77.56, "step": 36685, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.440324 }, { "epoch": 1.5719120860288762, "grad_norm": 5.979341506958008, "learning_rate": 7.753217058457281e-05, "loss": 2.784853935241699, "memory(GiB)": 77.56, "step": 36690, "token_acc": 0.41118421052631576, "train_speed(iter/s)": 1.44035 }, { "epoch": 1.572126301358125, "grad_norm": 5.207551956176758, "learning_rate": 7.75265527205591e-05, "loss": 2.4602256774902345, "memory(GiB)": 77.56, "step": 36695, "token_acc": 0.4785276073619632, "train_speed(iter/s)": 1.440334 }, { "epoch": 1.5723405166873743, "grad_norm": 4.366931915283203, "learning_rate": 7.752093435787611e-05, "loss": 2.306169891357422, "memory(GiB)": 77.56, "step": 36700, "token_acc": 0.4820717131474104, "train_speed(iter/s)": 1.440357 }, { "epoch": 1.572554732016623, "grad_norm": 5.995160102844238, "learning_rate": 7.751531549662553e-05, "loss": 2.157869338989258, "memory(GiB)": 77.56, "step": 36705, "token_acc": 0.5418326693227091, "train_speed(iter/s)": 1.440393 }, { "epoch": 1.572768947345872, "grad_norm": 5.33212947845459, "learning_rate": 7.750969613690923e-05, "loss": 2.617536926269531, "memory(GiB)": 77.56, "step": 36710, "token_acc": 0.4584837545126354, "train_speed(iter/s)": 1.440398 }, { "epoch": 1.5729831626751212, "grad_norm": 4.435813903808594, "learning_rate": 7.750407627882897e-05, "loss": 2.7557437896728514, "memory(GiB)": 77.56, "step": 36715, "token_acc": 0.4289855072463768, "train_speed(iter/s)": 1.440371 }, { "epoch": 1.57319737800437, "grad_norm": 5.804585933685303, "learning_rate": 7.749845592248654e-05, "loss": 2.260823440551758, "memory(GiB)": 77.56, "step": 36720, "token_acc": 0.5354330708661418, "train_speed(iter/s)": 1.440395 }, { "epoch": 1.5734115933336188, "grad_norm": 4.43022346496582, "learning_rate": 7.749283506798382e-05, "loss": 2.3514469146728514, "memory(GiB)": 77.56, "step": 36725, "token_acc": 0.5179282868525896, "train_speed(iter/s)": 1.440409 }, { "epoch": 1.573625808662868, "grad_norm": 6.361043453216553, "learning_rate": 7.748721371542258e-05, "loss": 2.709575080871582, "memory(GiB)": 77.56, "step": 36730, "token_acc": 0.40168539325842695, "train_speed(iter/s)": 1.440419 }, { "epoch": 1.5738400239921169, "grad_norm": 4.544958591461182, "learning_rate": 7.748159186490469e-05, "loss": 2.4843658447265624, "memory(GiB)": 77.56, "step": 36735, "token_acc": 0.49691358024691357, "train_speed(iter/s)": 1.440473 }, { "epoch": 1.5740542393213657, "grad_norm": 6.759220123291016, "learning_rate": 7.747596951653198e-05, "loss": 2.5847455978393556, "memory(GiB)": 77.56, "step": 36740, "token_acc": 0.44039735099337746, "train_speed(iter/s)": 1.440469 }, { "epoch": 1.574268454650615, "grad_norm": 6.662232398986816, "learning_rate": 7.747034667040632e-05, "loss": 2.4833652496337892, "memory(GiB)": 77.56, "step": 36745, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.440438 }, { "epoch": 1.5744826699798637, "grad_norm": 4.657716751098633, "learning_rate": 7.746472332662955e-05, "loss": 2.6565189361572266, "memory(GiB)": 77.56, "step": 36750, "token_acc": 0.4303030303030303, "train_speed(iter/s)": 1.44037 }, { "epoch": 1.5746968853091126, "grad_norm": 5.314342498779297, "learning_rate": 7.745909948530355e-05, "loss": 2.6523664474487303, "memory(GiB)": 77.56, "step": 36755, "token_acc": 0.4600760456273764, "train_speed(iter/s)": 1.440391 }, { "epoch": 1.5749111006383618, "grad_norm": 10.755656242370605, "learning_rate": 7.745347514653021e-05, "loss": 2.4207244873046876, "memory(GiB)": 77.56, "step": 36760, "token_acc": 0.5261194029850746, "train_speed(iter/s)": 1.440426 }, { "epoch": 1.5751253159676106, "grad_norm": 6.14573860168457, "learning_rate": 7.74478503104114e-05, "loss": 2.6705108642578126, "memory(GiB)": 77.56, "step": 36765, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.44046 }, { "epoch": 1.5753395312968594, "grad_norm": 5.781991481781006, "learning_rate": 7.744222497704904e-05, "loss": 2.738389778137207, "memory(GiB)": 77.56, "step": 36770, "token_acc": 0.45357142857142857, "train_speed(iter/s)": 1.440452 }, { "epoch": 1.5755537466261087, "grad_norm": 7.469683647155762, "learning_rate": 7.743659914654504e-05, "loss": 2.799342727661133, "memory(GiB)": 77.56, "step": 36775, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.440475 }, { "epoch": 1.5757679619553575, "grad_norm": 5.489133358001709, "learning_rate": 7.743097281900131e-05, "loss": 2.57935791015625, "memory(GiB)": 77.56, "step": 36780, "token_acc": 0.44648318042813456, "train_speed(iter/s)": 1.440493 }, { "epoch": 1.5759821772846063, "grad_norm": 5.444052696228027, "learning_rate": 7.742534599451978e-05, "loss": 2.4359161376953127, "memory(GiB)": 77.56, "step": 36785, "token_acc": 0.46546546546546547, "train_speed(iter/s)": 1.440507 }, { "epoch": 1.5761963926138556, "grad_norm": 6.023439407348633, "learning_rate": 7.741971867320237e-05, "loss": 2.485719108581543, "memory(GiB)": 77.56, "step": 36790, "token_acc": 0.4699248120300752, "train_speed(iter/s)": 1.440523 }, { "epoch": 1.5764106079431044, "grad_norm": 4.417028427124023, "learning_rate": 7.741409085515103e-05, "loss": 2.799840545654297, "memory(GiB)": 77.56, "step": 36795, "token_acc": 0.4344569288389513, "train_speed(iter/s)": 1.440569 }, { "epoch": 1.5766248232723532, "grad_norm": 4.624118328094482, "learning_rate": 7.740846254046772e-05, "loss": 2.6378807067871093, "memory(GiB)": 77.56, "step": 36800, "token_acc": 0.4147727272727273, "train_speed(iter/s)": 1.440559 }, { "epoch": 1.5768390386016025, "grad_norm": 4.87163782119751, "learning_rate": 7.74028337292544e-05, "loss": 2.703288459777832, "memory(GiB)": 77.56, "step": 36805, "token_acc": 0.4440677966101695, "train_speed(iter/s)": 1.440602 }, { "epoch": 1.5770532539308513, "grad_norm": 4.161083698272705, "learning_rate": 7.739720442161305e-05, "loss": 2.446087646484375, "memory(GiB)": 77.56, "step": 36810, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440603 }, { "epoch": 1.5772674692601003, "grad_norm": 4.400523662567139, "learning_rate": 7.739157461764564e-05, "loss": 2.5423789978027345, "memory(GiB)": 77.56, "step": 36815, "token_acc": 0.49240121580547114, "train_speed(iter/s)": 1.440575 }, { "epoch": 1.5774816845893493, "grad_norm": 4.913524150848389, "learning_rate": 7.738594431745415e-05, "loss": 2.7994462966918947, "memory(GiB)": 77.56, "step": 36820, "token_acc": 0.4529616724738676, "train_speed(iter/s)": 1.440551 }, { "epoch": 1.5776958999185982, "grad_norm": 4.85852575302124, "learning_rate": 7.738031352114059e-05, "loss": 2.4869850158691404, "memory(GiB)": 77.56, "step": 36825, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.440579 }, { "epoch": 1.5779101152478472, "grad_norm": 5.648110389709473, "learning_rate": 7.737468222880697e-05, "loss": 2.9752532958984377, "memory(GiB)": 77.56, "step": 36830, "token_acc": 0.40181268882175225, "train_speed(iter/s)": 1.440608 }, { "epoch": 1.5781243305770962, "grad_norm": 5.381869316101074, "learning_rate": 7.736905044055531e-05, "loss": 2.3726613998413084, "memory(GiB)": 77.56, "step": 36835, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.440592 }, { "epoch": 1.578338545906345, "grad_norm": 4.67322301864624, "learning_rate": 7.736341815648761e-05, "loss": 2.458905029296875, "memory(GiB)": 77.56, "step": 36840, "token_acc": 0.5088495575221239, "train_speed(iter/s)": 1.4406 }, { "epoch": 1.578552761235594, "grad_norm": 4.657018184661865, "learning_rate": 7.735778537670594e-05, "loss": 2.5954566955566407, "memory(GiB)": 77.56, "step": 36845, "token_acc": 0.4651898734177215, "train_speed(iter/s)": 1.440571 }, { "epoch": 1.578766976564843, "grad_norm": 4.841012954711914, "learning_rate": 7.73521521013123e-05, "loss": 2.2852815628051757, "memory(GiB)": 77.56, "step": 36850, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.440566 }, { "epoch": 1.578981191894092, "grad_norm": 5.37290620803833, "learning_rate": 7.734651833040879e-05, "loss": 2.6469724655151365, "memory(GiB)": 77.56, "step": 36855, "token_acc": 0.4565826330532213, "train_speed(iter/s)": 1.44059 }, { "epoch": 1.579195407223341, "grad_norm": 5.4792256355285645, "learning_rate": 7.734088406409744e-05, "loss": 2.4698020935058596, "memory(GiB)": 77.56, "step": 36860, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.440585 }, { "epoch": 1.57940962255259, "grad_norm": 7.3147358894348145, "learning_rate": 7.733524930248032e-05, "loss": 2.678775405883789, "memory(GiB)": 77.56, "step": 36865, "token_acc": 0.44244604316546765, "train_speed(iter/s)": 1.440553 }, { "epoch": 1.5796238378818388, "grad_norm": 4.623478412628174, "learning_rate": 7.732961404565953e-05, "loss": 2.620505523681641, "memory(GiB)": 77.56, "step": 36870, "token_acc": 0.4715447154471545, "train_speed(iter/s)": 1.440502 }, { "epoch": 1.5798380532110878, "grad_norm": 6.907497882843018, "learning_rate": 7.732397829373713e-05, "loss": 2.7558525085449217, "memory(GiB)": 77.56, "step": 36875, "token_acc": 0.4358108108108108, "train_speed(iter/s)": 1.440506 }, { "epoch": 1.5800522685403369, "grad_norm": 3.7904212474823, "learning_rate": 7.731834204681522e-05, "loss": 2.2528480529785155, "memory(GiB)": 77.56, "step": 36880, "token_acc": 0.5232198142414861, "train_speed(iter/s)": 1.440499 }, { "epoch": 1.5802664838695857, "grad_norm": 3.7137997150421143, "learning_rate": 7.731270530499595e-05, "loss": 2.8632713317871095, "memory(GiB)": 77.56, "step": 36885, "token_acc": 0.4157608695652174, "train_speed(iter/s)": 1.440531 }, { "epoch": 1.5804806991988347, "grad_norm": 6.536667823791504, "learning_rate": 7.73070680683814e-05, "loss": 2.6188026428222657, "memory(GiB)": 77.56, "step": 36890, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.44055 }, { "epoch": 1.5806949145280837, "grad_norm": 4.936042785644531, "learning_rate": 7.730143033707367e-05, "loss": 2.4435157775878906, "memory(GiB)": 77.56, "step": 36895, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.440537 }, { "epoch": 1.5809091298573326, "grad_norm": 5.053994178771973, "learning_rate": 7.729579211117492e-05, "loss": 2.538191223144531, "memory(GiB)": 77.56, "step": 36900, "token_acc": 0.47147147147147145, "train_speed(iter/s)": 1.440549 }, { "epoch": 1.5811233451865816, "grad_norm": 4.722520351409912, "learning_rate": 7.729015339078731e-05, "loss": 2.4507837295532227, "memory(GiB)": 77.56, "step": 36905, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.440552 }, { "epoch": 1.5813375605158306, "grad_norm": 5.030966281890869, "learning_rate": 7.728451417601297e-05, "loss": 2.3558704376220705, "memory(GiB)": 77.56, "step": 36910, "token_acc": 0.4900398406374502, "train_speed(iter/s)": 1.440558 }, { "epoch": 1.5815517758450794, "grad_norm": 4.461679458618164, "learning_rate": 7.727887446695405e-05, "loss": 2.3357072830200196, "memory(GiB)": 77.56, "step": 36915, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.440559 }, { "epoch": 1.5817659911743285, "grad_norm": 4.808989524841309, "learning_rate": 7.727323426371272e-05, "loss": 2.5729959487915037, "memory(GiB)": 77.56, "step": 36920, "token_acc": 0.4186046511627907, "train_speed(iter/s)": 1.440565 }, { "epoch": 1.5819802065035775, "grad_norm": 4.341703414916992, "learning_rate": 7.726759356639119e-05, "loss": 2.516896438598633, "memory(GiB)": 77.56, "step": 36925, "token_acc": 0.47038327526132406, "train_speed(iter/s)": 1.440602 }, { "epoch": 1.5821944218328263, "grad_norm": 4.133007049560547, "learning_rate": 7.726195237509162e-05, "loss": 2.483944320678711, "memory(GiB)": 77.56, "step": 36930, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.440615 }, { "epoch": 1.5824086371620754, "grad_norm": 4.523614406585693, "learning_rate": 7.725631068991618e-05, "loss": 2.527436065673828, "memory(GiB)": 77.56, "step": 36935, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.440631 }, { "epoch": 1.5826228524913244, "grad_norm": 6.269434928894043, "learning_rate": 7.725066851096714e-05, "loss": 2.2298418045043946, "memory(GiB)": 77.56, "step": 36940, "token_acc": 0.5309734513274337, "train_speed(iter/s)": 1.440639 }, { "epoch": 1.5828370678205732, "grad_norm": 5.9857940673828125, "learning_rate": 7.724502583834665e-05, "loss": 2.4994083404541017, "memory(GiB)": 77.56, "step": 36945, "token_acc": 0.4404332129963899, "train_speed(iter/s)": 1.44069 }, { "epoch": 1.5830512831498222, "grad_norm": 4.439908504486084, "learning_rate": 7.723938267215698e-05, "loss": 2.365422821044922, "memory(GiB)": 77.56, "step": 36950, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.440707 }, { "epoch": 1.5832654984790713, "grad_norm": 7.594779014587402, "learning_rate": 7.723373901250032e-05, "loss": 2.384699058532715, "memory(GiB)": 77.56, "step": 36955, "token_acc": 0.47388059701492535, "train_speed(iter/s)": 1.44075 }, { "epoch": 1.58347971380832, "grad_norm": 4.222997188568115, "learning_rate": 7.722809485947895e-05, "loss": 2.5913795471191405, "memory(GiB)": 77.56, "step": 36960, "token_acc": 0.4696132596685083, "train_speed(iter/s)": 1.440814 }, { "epoch": 1.5836939291375691, "grad_norm": 5.102694988250732, "learning_rate": 7.72224502131951e-05, "loss": 2.9261005401611326, "memory(GiB)": 77.56, "step": 36965, "token_acc": 0.41237113402061853, "train_speed(iter/s)": 1.440781 }, { "epoch": 1.5839081444668182, "grad_norm": 5.263223171234131, "learning_rate": 7.721680507375102e-05, "loss": 2.398607063293457, "memory(GiB)": 77.56, "step": 36970, "token_acc": 0.4702549575070821, "train_speed(iter/s)": 1.440761 }, { "epoch": 1.584122359796067, "grad_norm": 5.182840347290039, "learning_rate": 7.721115944124897e-05, "loss": 2.590830421447754, "memory(GiB)": 77.56, "step": 36975, "token_acc": 0.4351145038167939, "train_speed(iter/s)": 1.440786 }, { "epoch": 1.584336575125316, "grad_norm": 5.2019805908203125, "learning_rate": 7.720551331579126e-05, "loss": 2.2415897369384767, "memory(GiB)": 77.56, "step": 36980, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.440753 }, { "epoch": 1.584550790454565, "grad_norm": 6.639218330383301, "learning_rate": 7.719986669748013e-05, "loss": 2.4859216690063475, "memory(GiB)": 77.56, "step": 36985, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.440779 }, { "epoch": 1.5847650057838139, "grad_norm": 4.705714702606201, "learning_rate": 7.719421958641794e-05, "loss": 2.3174625396728517, "memory(GiB)": 77.56, "step": 36990, "token_acc": 0.5232558139534884, "train_speed(iter/s)": 1.440825 }, { "epoch": 1.5849792211130629, "grad_norm": 4.4780378341674805, "learning_rate": 7.718857198270692e-05, "loss": 2.438267135620117, "memory(GiB)": 77.56, "step": 36995, "token_acc": 0.4542857142857143, "train_speed(iter/s)": 1.440823 }, { "epoch": 1.585193436442312, "grad_norm": 5.398425579071045, "learning_rate": 7.718292388644943e-05, "loss": 2.4314022064208984, "memory(GiB)": 77.56, "step": 37000, "token_acc": 0.48, "train_speed(iter/s)": 1.440812 }, { "epoch": 1.585193436442312, "eval_loss": 2.4495582580566406, "eval_runtime": 13.9402, "eval_samples_per_second": 7.174, "eval_steps_per_second": 7.174, "eval_token_acc": 0.45521472392638035, "step": 37000 }, { "epoch": 1.5854076517715607, "grad_norm": 4.466859340667725, "learning_rate": 7.717727529774777e-05, "loss": 2.8922637939453124, "memory(GiB)": 77.56, "step": 37005, "token_acc": 0.45254833040421794, "train_speed(iter/s)": 1.440029 }, { "epoch": 1.5856218671008098, "grad_norm": 4.917857646942139, "learning_rate": 7.717162621670427e-05, "loss": 2.544180679321289, "memory(GiB)": 77.56, "step": 37010, "token_acc": 0.4489795918367347, "train_speed(iter/s)": 1.440025 }, { "epoch": 1.5858360824300588, "grad_norm": 6.8532819747924805, "learning_rate": 7.716597664342127e-05, "loss": 2.503630447387695, "memory(GiB)": 77.56, "step": 37015, "token_acc": 0.4770992366412214, "train_speed(iter/s)": 1.440038 }, { "epoch": 1.5860502977593076, "grad_norm": 5.015791416168213, "learning_rate": 7.716032657800113e-05, "loss": 2.4324644088745115, "memory(GiB)": 77.56, "step": 37020, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.440057 }, { "epoch": 1.5862645130885566, "grad_norm": 3.747614860534668, "learning_rate": 7.715467602054618e-05, "loss": 2.057494354248047, "memory(GiB)": 77.56, "step": 37025, "token_acc": 0.521551724137931, "train_speed(iter/s)": 1.440075 }, { "epoch": 1.5864787284178057, "grad_norm": 4.066998481750488, "learning_rate": 7.714902497115881e-05, "loss": 2.9439250946044924, "memory(GiB)": 77.56, "step": 37030, "token_acc": 0.41454545454545455, "train_speed(iter/s)": 1.440057 }, { "epoch": 1.5866929437470545, "grad_norm": 7.244368553161621, "learning_rate": 7.714337342994139e-05, "loss": 2.2538969039916994, "memory(GiB)": 77.56, "step": 37035, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.440105 }, { "epoch": 1.5869071590763035, "grad_norm": 5.2623701095581055, "learning_rate": 7.71377213969963e-05, "loss": 2.591645050048828, "memory(GiB)": 77.56, "step": 37040, "token_acc": 0.4584527220630373, "train_speed(iter/s)": 1.440115 }, { "epoch": 1.5871213744055526, "grad_norm": 4.656369209289551, "learning_rate": 7.713206887242592e-05, "loss": 2.5355520248413086, "memory(GiB)": 77.56, "step": 37045, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.44013 }, { "epoch": 1.5873355897348014, "grad_norm": 5.218441009521484, "learning_rate": 7.712641585633265e-05, "loss": 2.663298225402832, "memory(GiB)": 77.56, "step": 37050, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.440172 }, { "epoch": 1.5875498050640504, "grad_norm": 5.425658702850342, "learning_rate": 7.712076234881893e-05, "loss": 2.7011905670166017, "memory(GiB)": 77.56, "step": 37055, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.440215 }, { "epoch": 1.5877640203932994, "grad_norm": 3.627558946609497, "learning_rate": 7.711510834998714e-05, "loss": 2.633744239807129, "memory(GiB)": 77.56, "step": 37060, "token_acc": 0.44011976047904194, "train_speed(iter/s)": 1.440237 }, { "epoch": 1.5879782357225483, "grad_norm": 5.2490057945251465, "learning_rate": 7.710945385993975e-05, "loss": 2.6325763702392577, "memory(GiB)": 77.56, "step": 37065, "token_acc": 0.47540983606557374, "train_speed(iter/s)": 1.440258 }, { "epoch": 1.5881924510517973, "grad_norm": 4.766805171966553, "learning_rate": 7.710379887877917e-05, "loss": 2.7200057983398436, "memory(GiB)": 77.56, "step": 37070, "token_acc": 0.4322766570605187, "train_speed(iter/s)": 1.440314 }, { "epoch": 1.5884066663810463, "grad_norm": 5.99770975112915, "learning_rate": 7.709814340660784e-05, "loss": 2.4199867248535156, "memory(GiB)": 77.56, "step": 37075, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.440341 }, { "epoch": 1.5886208817102951, "grad_norm": 5.70997428894043, "learning_rate": 7.709248744352822e-05, "loss": 2.1313186645507813, "memory(GiB)": 77.56, "step": 37080, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.440287 }, { "epoch": 1.5888350970395442, "grad_norm": 5.822385311126709, "learning_rate": 7.708683098964275e-05, "loss": 2.676686668395996, "memory(GiB)": 77.56, "step": 37085, "token_acc": 0.470404984423676, "train_speed(iter/s)": 1.44033 }, { "epoch": 1.5890493123687932, "grad_norm": 4.615672588348389, "learning_rate": 7.708117404505397e-05, "loss": 2.7133426666259766, "memory(GiB)": 77.56, "step": 37090, "token_acc": 0.4836795252225519, "train_speed(iter/s)": 1.440328 }, { "epoch": 1.589263527698042, "grad_norm": 3.886209011077881, "learning_rate": 7.707551660986429e-05, "loss": 2.5003583908081053, "memory(GiB)": 77.56, "step": 37095, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.440338 }, { "epoch": 1.589477743027291, "grad_norm": 4.47598123550415, "learning_rate": 7.706985868417624e-05, "loss": 2.4446216583251954, "memory(GiB)": 77.56, "step": 37100, "token_acc": 0.48828125, "train_speed(iter/s)": 1.440392 }, { "epoch": 1.58969195835654, "grad_norm": 6.72584342956543, "learning_rate": 7.706420026809232e-05, "loss": 2.746270751953125, "memory(GiB)": 77.56, "step": 37105, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.440368 }, { "epoch": 1.589906173685789, "grad_norm": 5.002017974853516, "learning_rate": 7.7058541361715e-05, "loss": 2.6312305450439455, "memory(GiB)": 77.56, "step": 37110, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.440383 }, { "epoch": 1.590120389015038, "grad_norm": 4.537996768951416, "learning_rate": 7.705288196514682e-05, "loss": 2.2976896286010744, "memory(GiB)": 77.56, "step": 37115, "token_acc": 0.5206611570247934, "train_speed(iter/s)": 1.440404 }, { "epoch": 1.590334604344287, "grad_norm": 4.235039234161377, "learning_rate": 7.70472220784903e-05, "loss": 2.727490997314453, "memory(GiB)": 77.56, "step": 37120, "token_acc": 0.434375, "train_speed(iter/s)": 1.440385 }, { "epoch": 1.5905488196735358, "grad_norm": 6.362739086151123, "learning_rate": 7.704156170184801e-05, "loss": 2.4436103820800783, "memory(GiB)": 77.56, "step": 37125, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.440422 }, { "epoch": 1.5907630350027848, "grad_norm": 5.3750081062316895, "learning_rate": 7.703590083532244e-05, "loss": 2.4182289123535154, "memory(GiB)": 77.56, "step": 37130, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.440456 }, { "epoch": 1.5909772503320339, "grad_norm": 3.729543447494507, "learning_rate": 7.703023947901618e-05, "loss": 2.382952117919922, "memory(GiB)": 77.56, "step": 37135, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.440457 }, { "epoch": 1.5911914656612827, "grad_norm": 4.52062463760376, "learning_rate": 7.702457763303177e-05, "loss": 2.5480127334594727, "memory(GiB)": 77.56, "step": 37140, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.5914056809905317, "grad_norm": 5.185436725616455, "learning_rate": 7.701891529747178e-05, "loss": 2.525675964355469, "memory(GiB)": 77.56, "step": 37145, "token_acc": 0.47, "train_speed(iter/s)": 1.44056 }, { "epoch": 1.5916198963197807, "grad_norm": 5.667872905731201, "learning_rate": 7.70132524724388e-05, "loss": 2.530457878112793, "memory(GiB)": 77.56, "step": 37150, "token_acc": 0.5, "train_speed(iter/s)": 1.440559 }, { "epoch": 1.5918341116490295, "grad_norm": 3.952483892440796, "learning_rate": 7.70075891580354e-05, "loss": 2.4947715759277345, "memory(GiB)": 77.56, "step": 37155, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.440569 }, { "epoch": 1.5920483269782786, "grad_norm": 4.715612888336182, "learning_rate": 7.70019253543642e-05, "loss": 2.5695564270019533, "memory(GiB)": 77.56, "step": 37160, "token_acc": 0.49859943977591037, "train_speed(iter/s)": 1.44055 }, { "epoch": 1.5922625423075276, "grad_norm": 4.617494583129883, "learning_rate": 7.699626106152778e-05, "loss": 2.283112144470215, "memory(GiB)": 77.56, "step": 37165, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.440578 }, { "epoch": 1.5924767576367764, "grad_norm": 5.440049171447754, "learning_rate": 7.699059627962877e-05, "loss": 2.57884521484375, "memory(GiB)": 77.56, "step": 37170, "token_acc": 0.484251968503937, "train_speed(iter/s)": 1.44055 }, { "epoch": 1.5926909729660255, "grad_norm": 5.097413063049316, "learning_rate": 7.698493100876979e-05, "loss": 2.749297332763672, "memory(GiB)": 77.56, "step": 37175, "token_acc": 0.45483870967741935, "train_speed(iter/s)": 1.440583 }, { "epoch": 1.5929051882952745, "grad_norm": 4.4065752029418945, "learning_rate": 7.697926524905348e-05, "loss": 2.29630126953125, "memory(GiB)": 77.56, "step": 37180, "token_acc": 0.5033112582781457, "train_speed(iter/s)": 1.440615 }, { "epoch": 1.5931194036245233, "grad_norm": 4.410834312438965, "learning_rate": 7.697359900058245e-05, "loss": 2.6966730117797852, "memory(GiB)": 77.56, "step": 37185, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.440611 }, { "epoch": 1.5933336189537723, "grad_norm": 6.140238285064697, "learning_rate": 7.696793226345939e-05, "loss": 3.168558692932129, "memory(GiB)": 77.56, "step": 37190, "token_acc": 0.4336283185840708, "train_speed(iter/s)": 1.440654 }, { "epoch": 1.5935478342830214, "grad_norm": 4.942981243133545, "learning_rate": 7.696226503778694e-05, "loss": 2.480720329284668, "memory(GiB)": 77.56, "step": 37195, "token_acc": 0.4672364672364672, "train_speed(iter/s)": 1.440671 }, { "epoch": 1.5937620496122702, "grad_norm": 3.73089861869812, "learning_rate": 7.695659732366774e-05, "loss": 2.577046775817871, "memory(GiB)": 77.56, "step": 37200, "token_acc": 0.5, "train_speed(iter/s)": 1.440642 }, { "epoch": 1.5939762649415192, "grad_norm": 6.346792221069336, "learning_rate": 7.695092912120452e-05, "loss": 2.663411331176758, "memory(GiB)": 77.56, "step": 37205, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.440648 }, { "epoch": 1.5941904802707683, "grad_norm": 5.228428363800049, "learning_rate": 7.694526043049995e-05, "loss": 2.688984489440918, "memory(GiB)": 77.56, "step": 37210, "token_acc": 0.4467455621301775, "train_speed(iter/s)": 1.440592 }, { "epoch": 1.594404695600017, "grad_norm": 3.9997048377990723, "learning_rate": 7.693959125165666e-05, "loss": 2.43411922454834, "memory(GiB)": 77.56, "step": 37215, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.440577 }, { "epoch": 1.594618910929266, "grad_norm": 7.999871730804443, "learning_rate": 7.693392158477745e-05, "loss": 2.6133049011230467, "memory(GiB)": 77.56, "step": 37220, "token_acc": 0.4688427299703264, "train_speed(iter/s)": 1.440552 }, { "epoch": 1.5948331262585151, "grad_norm": 6.031138896942139, "learning_rate": 7.692825142996498e-05, "loss": 2.4414501190185547, "memory(GiB)": 77.56, "step": 37225, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.44055 }, { "epoch": 1.595047341587764, "grad_norm": 5.841692924499512, "learning_rate": 7.692258078732196e-05, "loss": 2.834478759765625, "memory(GiB)": 77.56, "step": 37230, "token_acc": 0.4440789473684211, "train_speed(iter/s)": 1.440571 }, { "epoch": 1.595261556917013, "grad_norm": 6.838806629180908, "learning_rate": 7.69169096569511e-05, "loss": 2.682520294189453, "memory(GiB)": 77.56, "step": 37235, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.440575 }, { "epoch": 1.595475772246262, "grad_norm": 6.088563919067383, "learning_rate": 7.691123803895523e-05, "loss": 2.2300233840942383, "memory(GiB)": 77.56, "step": 37240, "token_acc": 0.5335968379446641, "train_speed(iter/s)": 1.440518 }, { "epoch": 1.5956899875755108, "grad_norm": 4.609649658203125, "learning_rate": 7.6905565933437e-05, "loss": 2.4188270568847656, "memory(GiB)": 77.56, "step": 37245, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 1.440499 }, { "epoch": 1.5959042029047599, "grad_norm": 3.5053088665008545, "learning_rate": 7.689989334049923e-05, "loss": 2.451124572753906, "memory(GiB)": 77.56, "step": 37250, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 1.440535 }, { "epoch": 1.596118418234009, "grad_norm": 5.890113830566406, "learning_rate": 7.689422026024464e-05, "loss": 2.8184814453125, "memory(GiB)": 77.56, "step": 37255, "token_acc": 0.4626334519572954, "train_speed(iter/s)": 1.440544 }, { "epoch": 1.5963326335632577, "grad_norm": 5.272008419036865, "learning_rate": 7.688854669277604e-05, "loss": 2.3964380264282226, "memory(GiB)": 77.56, "step": 37260, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.440588 }, { "epoch": 1.5965468488925068, "grad_norm": 5.0255022048950195, "learning_rate": 7.688287263819617e-05, "loss": 2.526440238952637, "memory(GiB)": 77.56, "step": 37265, "token_acc": 0.4876325088339223, "train_speed(iter/s)": 1.440604 }, { "epoch": 1.5967610642217558, "grad_norm": 5.126718997955322, "learning_rate": 7.687719809660785e-05, "loss": 2.237118148803711, "memory(GiB)": 77.56, "step": 37270, "token_acc": 0.5136186770428015, "train_speed(iter/s)": 1.440564 }, { "epoch": 1.5969752795510046, "grad_norm": 4.850007057189941, "learning_rate": 7.687152306811388e-05, "loss": 2.7526214599609373, "memory(GiB)": 77.56, "step": 37275, "token_acc": 0.4713375796178344, "train_speed(iter/s)": 1.440571 }, { "epoch": 1.5971894948802536, "grad_norm": 4.5587687492370605, "learning_rate": 7.686584755281708e-05, "loss": 2.5548351287841795, "memory(GiB)": 77.56, "step": 37280, "token_acc": 0.43465045592705165, "train_speed(iter/s)": 1.440578 }, { "epoch": 1.5974037102095027, "grad_norm": 4.840272903442383, "learning_rate": 7.686017155082021e-05, "loss": 2.474538803100586, "memory(GiB)": 77.56, "step": 37285, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.440607 }, { "epoch": 1.5976179255387515, "grad_norm": 7.790668964385986, "learning_rate": 7.685449506222616e-05, "loss": 2.541324806213379, "memory(GiB)": 77.56, "step": 37290, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.440612 }, { "epoch": 1.5978321408680005, "grad_norm": 5.4809346199035645, "learning_rate": 7.684881808713774e-05, "loss": 2.4288475036621096, "memory(GiB)": 77.56, "step": 37295, "token_acc": 0.44936708860759494, "train_speed(iter/s)": 1.440603 }, { "epoch": 1.5980463561972496, "grad_norm": 6.058802127838135, "learning_rate": 7.684314062565779e-05, "loss": 2.76653995513916, "memory(GiB)": 77.56, "step": 37300, "token_acc": 0.41237113402061853, "train_speed(iter/s)": 1.440612 }, { "epoch": 1.5982605715264984, "grad_norm": 4.5335693359375, "learning_rate": 7.683746267788916e-05, "loss": 2.386663627624512, "memory(GiB)": 77.56, "step": 37305, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.440648 }, { "epoch": 1.5984747868557474, "grad_norm": 5.054959774017334, "learning_rate": 7.683178424393472e-05, "loss": 2.4031471252441405, "memory(GiB)": 77.56, "step": 37310, "token_acc": 0.5036764705882353, "train_speed(iter/s)": 1.440678 }, { "epoch": 1.5986890021849964, "grad_norm": 6.449455738067627, "learning_rate": 7.682610532389734e-05, "loss": 2.6527030944824217, "memory(GiB)": 77.56, "step": 37315, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.440739 }, { "epoch": 1.5989032175142452, "grad_norm": 5.217899799346924, "learning_rate": 7.68204259178799e-05, "loss": 2.2669368743896485, "memory(GiB)": 77.56, "step": 37320, "token_acc": 0.4777327935222672, "train_speed(iter/s)": 1.440771 }, { "epoch": 1.5991174328434943, "grad_norm": 4.740453720092773, "learning_rate": 7.681474602598529e-05, "loss": 2.6832502365112303, "memory(GiB)": 77.56, "step": 37325, "token_acc": 0.4180327868852459, "train_speed(iter/s)": 1.440794 }, { "epoch": 1.5993316481727433, "grad_norm": 4.466330051422119, "learning_rate": 7.68090656483164e-05, "loss": 2.578377532958984, "memory(GiB)": 77.56, "step": 37330, "token_acc": 0.4340836012861736, "train_speed(iter/s)": 1.440785 }, { "epoch": 1.5995458635019921, "grad_norm": 4.0943779945373535, "learning_rate": 7.680338478497613e-05, "loss": 2.4776931762695313, "memory(GiB)": 77.56, "step": 37335, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.440776 }, { "epoch": 1.5997600788312412, "grad_norm": 5.398914337158203, "learning_rate": 7.679770343606741e-05, "loss": 2.6147722244262694, "memory(GiB)": 77.56, "step": 37340, "token_acc": 0.4681647940074906, "train_speed(iter/s)": 1.440797 }, { "epoch": 1.5999742941604902, "grad_norm": 5.088338851928711, "learning_rate": 7.679202160169314e-05, "loss": 2.4712217330932615, "memory(GiB)": 77.56, "step": 37345, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.440776 }, { "epoch": 1.600188509489739, "grad_norm": 5.90064811706543, "learning_rate": 7.678633928195628e-05, "loss": 2.511794662475586, "memory(GiB)": 77.56, "step": 37350, "token_acc": 0.4955223880597015, "train_speed(iter/s)": 1.440775 }, { "epoch": 1.600402724818988, "grad_norm": 6.789758205413818, "learning_rate": 7.678065647695975e-05, "loss": 2.4820667266845704, "memory(GiB)": 77.56, "step": 37355, "token_acc": 0.4584837545126354, "train_speed(iter/s)": 1.440778 }, { "epoch": 1.600616940148237, "grad_norm": 7.3594160079956055, "learning_rate": 7.67749731868065e-05, "loss": 2.6745500564575195, "memory(GiB)": 77.56, "step": 37360, "token_acc": 0.4699248120300752, "train_speed(iter/s)": 1.440806 }, { "epoch": 1.600831155477486, "grad_norm": 5.078739166259766, "learning_rate": 7.676928941159951e-05, "loss": 2.4518218994140626, "memory(GiB)": 77.56, "step": 37365, "token_acc": 0.50920245398773, "train_speed(iter/s)": 1.440839 }, { "epoch": 1.601045370806735, "grad_norm": 4.971065521240234, "learning_rate": 7.676360515144172e-05, "loss": 2.2999269485473635, "memory(GiB)": 77.56, "step": 37370, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.440816 }, { "epoch": 1.601259586135984, "grad_norm": 5.04205322265625, "learning_rate": 7.675792040643611e-05, "loss": 2.50014705657959, "memory(GiB)": 77.56, "step": 37375, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.440805 }, { "epoch": 1.6014738014652328, "grad_norm": 4.661972999572754, "learning_rate": 7.675223517668569e-05, "loss": 2.547068977355957, "memory(GiB)": 77.56, "step": 37380, "token_acc": 0.44366197183098594, "train_speed(iter/s)": 1.440857 }, { "epoch": 1.6016880167944818, "grad_norm": 6.270064353942871, "learning_rate": 7.67465494622934e-05, "loss": 2.709405517578125, "memory(GiB)": 77.56, "step": 37385, "token_acc": 0.439873417721519, "train_speed(iter/s)": 1.440878 }, { "epoch": 1.6019022321237308, "grad_norm": 8.083171844482422, "learning_rate": 7.67408632633623e-05, "loss": 2.40850830078125, "memory(GiB)": 77.56, "step": 37390, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.440829 }, { "epoch": 1.6021164474529797, "grad_norm": 4.941572189331055, "learning_rate": 7.673517657999538e-05, "loss": 2.4612800598144533, "memory(GiB)": 77.56, "step": 37395, "token_acc": 0.4781021897810219, "train_speed(iter/s)": 1.440854 }, { "epoch": 1.6023306627822287, "grad_norm": 7.851924419403076, "learning_rate": 7.672948941229565e-05, "loss": 2.303112602233887, "memory(GiB)": 77.56, "step": 37400, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.440879 }, { "epoch": 1.6025448781114777, "grad_norm": 6.726553440093994, "learning_rate": 7.672380176036615e-05, "loss": 2.5341365814208983, "memory(GiB)": 77.56, "step": 37405, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.440932 }, { "epoch": 1.6027590934407265, "grad_norm": 4.743828773498535, "learning_rate": 7.671811362430992e-05, "loss": 2.3696929931640627, "memory(GiB)": 77.56, "step": 37410, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.440963 }, { "epoch": 1.6029733087699756, "grad_norm": 3.8743042945861816, "learning_rate": 7.671242500422998e-05, "loss": 2.38269157409668, "memory(GiB)": 77.56, "step": 37415, "token_acc": 0.4940828402366864, "train_speed(iter/s)": 1.441007 }, { "epoch": 1.6031875240992246, "grad_norm": 4.833866119384766, "learning_rate": 7.670673590022939e-05, "loss": 2.354164886474609, "memory(GiB)": 77.56, "step": 37420, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.441027 }, { "epoch": 1.6034017394284734, "grad_norm": 4.122615814208984, "learning_rate": 7.670104631241126e-05, "loss": 2.5714073181152344, "memory(GiB)": 77.56, "step": 37425, "token_acc": 0.48502994011976047, "train_speed(iter/s)": 1.441018 }, { "epoch": 1.6036159547577225, "grad_norm": 5.98354959487915, "learning_rate": 7.66953562408786e-05, "loss": 2.5156038284301756, "memory(GiB)": 77.56, "step": 37430, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.440981 }, { "epoch": 1.6038301700869715, "grad_norm": 5.067171573638916, "learning_rate": 7.668966568573455e-05, "loss": 2.483203887939453, "memory(GiB)": 77.56, "step": 37435, "token_acc": 0.50625, "train_speed(iter/s)": 1.440999 }, { "epoch": 1.6040443854162203, "grad_norm": 9.094625473022461, "learning_rate": 7.668397464708214e-05, "loss": 2.5751674652099608, "memory(GiB)": 77.56, "step": 37440, "token_acc": 0.4588607594936709, "train_speed(iter/s)": 1.440993 }, { "epoch": 1.6042586007454693, "grad_norm": 4.778010845184326, "learning_rate": 7.667828312502452e-05, "loss": 2.664686584472656, "memory(GiB)": 77.56, "step": 37445, "token_acc": 0.4402730375426621, "train_speed(iter/s)": 1.441024 }, { "epoch": 1.6044728160747184, "grad_norm": 3.4660446643829346, "learning_rate": 7.667259111966476e-05, "loss": 2.4331796646118162, "memory(GiB)": 77.56, "step": 37450, "token_acc": 0.4935897435897436, "train_speed(iter/s)": 1.441056 }, { "epoch": 1.6046870314039672, "grad_norm": 4.855658531188965, "learning_rate": 7.6666898631106e-05, "loss": 2.3552242279052735, "memory(GiB)": 77.56, "step": 37455, "token_acc": 0.5369649805447471, "train_speed(iter/s)": 1.441061 }, { "epoch": 1.6049012467332162, "grad_norm": 4.5516533851623535, "learning_rate": 7.666120565945135e-05, "loss": 2.7862833023071287, "memory(GiB)": 77.56, "step": 37460, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.44107 }, { "epoch": 1.6051154620624652, "grad_norm": 4.8243536949157715, "learning_rate": 7.665551220480395e-05, "loss": 2.5050823211669924, "memory(GiB)": 77.56, "step": 37465, "token_acc": 0.5036764705882353, "train_speed(iter/s)": 1.441077 }, { "epoch": 1.605329677391714, "grad_norm": 3.3661677837371826, "learning_rate": 7.664981826726695e-05, "loss": 2.2978546142578127, "memory(GiB)": 77.56, "step": 37470, "token_acc": 0.48360655737704916, "train_speed(iter/s)": 1.441071 }, { "epoch": 1.605543892720963, "grad_norm": 4.803250312805176, "learning_rate": 7.664412384694348e-05, "loss": 2.3432857513427736, "memory(GiB)": 77.56, "step": 37475, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.441083 }, { "epoch": 1.6057581080502121, "grad_norm": 4.503689765930176, "learning_rate": 7.663842894393672e-05, "loss": 2.4688488006591798, "memory(GiB)": 77.56, "step": 37480, "token_acc": 0.49185667752442996, "train_speed(iter/s)": 1.441109 }, { "epoch": 1.605972323379461, "grad_norm": 4.941961765289307, "learning_rate": 7.663273355834984e-05, "loss": 2.3893104553222657, "memory(GiB)": 77.56, "step": 37485, "token_acc": 0.4962962962962963, "train_speed(iter/s)": 1.441081 }, { "epoch": 1.60618653870871, "grad_norm": 6.212223052978516, "learning_rate": 7.662703769028599e-05, "loss": 2.3285255432128906, "memory(GiB)": 77.56, "step": 37490, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.441111 }, { "epoch": 1.606400754037959, "grad_norm": 5.007437705993652, "learning_rate": 7.662134133984838e-05, "loss": 2.505349349975586, "memory(GiB)": 77.56, "step": 37495, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.441141 }, { "epoch": 1.6066149693672078, "grad_norm": 4.935401439666748, "learning_rate": 7.66156445071402e-05, "loss": 2.5389307022094725, "memory(GiB)": 77.56, "step": 37500, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.441172 }, { "epoch": 1.6066149693672078, "eval_loss": 2.4067347049713135, "eval_runtime": 14.7178, "eval_samples_per_second": 6.794, "eval_steps_per_second": 6.794, "eval_token_acc": 0.47804878048780486, "step": 37500 }, { "epoch": 1.6068291846964569, "grad_norm": 4.806564807891846, "learning_rate": 7.660994719226464e-05, "loss": 2.657512664794922, "memory(GiB)": 77.56, "step": 37505, "token_acc": 0.47214854111405835, "train_speed(iter/s)": 1.440324 }, { "epoch": 1.607043400025706, "grad_norm": 5.706747055053711, "learning_rate": 7.660424939532494e-05, "loss": 2.6950347900390623, "memory(GiB)": 77.56, "step": 37510, "token_acc": 0.4163934426229508, "train_speed(iter/s)": 1.440341 }, { "epoch": 1.6072576153549547, "grad_norm": 6.18770170211792, "learning_rate": 7.65985511164243e-05, "loss": 2.7094188690185548, "memory(GiB)": 77.56, "step": 37515, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.440335 }, { "epoch": 1.6074718306842037, "grad_norm": 4.365080833435059, "learning_rate": 7.659285235566596e-05, "loss": 2.657796096801758, "memory(GiB)": 77.56, "step": 37520, "token_acc": 0.4649350649350649, "train_speed(iter/s)": 1.440347 }, { "epoch": 1.6076860460134528, "grad_norm": 5.248068809509277, "learning_rate": 7.658715311315314e-05, "loss": 2.4294240951538084, "memory(GiB)": 77.56, "step": 37525, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.440401 }, { "epoch": 1.6079002613427016, "grad_norm": 5.749895095825195, "learning_rate": 7.658145338898912e-05, "loss": 2.715859031677246, "memory(GiB)": 77.56, "step": 37530, "token_acc": 0.4336283185840708, "train_speed(iter/s)": 1.440404 }, { "epoch": 1.6081144766719506, "grad_norm": 4.384402275085449, "learning_rate": 7.657575318327712e-05, "loss": 2.6229156494140624, "memory(GiB)": 77.56, "step": 37535, "token_acc": 0.4440677966101695, "train_speed(iter/s)": 1.440413 }, { "epoch": 1.6083286920011997, "grad_norm": 4.644188404083252, "learning_rate": 7.657005249612044e-05, "loss": 2.7625591278076174, "memory(GiB)": 77.56, "step": 37540, "token_acc": 0.4498567335243553, "train_speed(iter/s)": 1.440456 }, { "epoch": 1.6085429073304485, "grad_norm": 4.940188884735107, "learning_rate": 7.65643513276223e-05, "loss": 2.4074514389038084, "memory(GiB)": 77.56, "step": 37545, "token_acc": 0.43050847457627117, "train_speed(iter/s)": 1.44049 }, { "epoch": 1.6087571226596975, "grad_norm": 5.767033100128174, "learning_rate": 7.655864967788605e-05, "loss": 2.867677116394043, "memory(GiB)": 77.56, "step": 37550, "token_acc": 0.42136498516320475, "train_speed(iter/s)": 1.440489 }, { "epoch": 1.6089713379889465, "grad_norm": 5.383480072021484, "learning_rate": 7.655294754701494e-05, "loss": 2.480497360229492, "memory(GiB)": 77.56, "step": 37555, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 1.440502 }, { "epoch": 1.6091855533181953, "grad_norm": 5.815268516540527, "learning_rate": 7.654724493511227e-05, "loss": 2.8430389404296874, "memory(GiB)": 77.56, "step": 37560, "token_acc": 0.4175084175084175, "train_speed(iter/s)": 1.44051 }, { "epoch": 1.6093997686474444, "grad_norm": 5.3116865158081055, "learning_rate": 7.654154184228137e-05, "loss": 2.5249053955078127, "memory(GiB)": 77.56, "step": 37565, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.440471 }, { "epoch": 1.6096139839766934, "grad_norm": 5.164255619049072, "learning_rate": 7.653583826862552e-05, "loss": 2.545930099487305, "memory(GiB)": 77.56, "step": 37570, "token_acc": 0.49, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.6098281993059422, "grad_norm": 6.061285972595215, "learning_rate": 7.653013421424806e-05, "loss": 2.430738830566406, "memory(GiB)": 77.56, "step": 37575, "token_acc": 0.5127118644067796, "train_speed(iter/s)": 1.44046 }, { "epoch": 1.6100424146351913, "grad_norm": 5.412038326263428, "learning_rate": 7.652442967925236e-05, "loss": 2.7430561065673826, "memory(GiB)": 77.56, "step": 37580, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.440422 }, { "epoch": 1.6102566299644403, "grad_norm": 6.383668899536133, "learning_rate": 7.651872466374172e-05, "loss": 2.31632137298584, "memory(GiB)": 77.56, "step": 37585, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.44043 }, { "epoch": 1.6104708452936891, "grad_norm": 8.428053855895996, "learning_rate": 7.65130191678195e-05, "loss": 2.372853469848633, "memory(GiB)": 77.56, "step": 37590, "token_acc": 0.5280898876404494, "train_speed(iter/s)": 1.440393 }, { "epoch": 1.6106850606229381, "grad_norm": 4.508504867553711, "learning_rate": 7.650731319158908e-05, "loss": 2.4179462432861327, "memory(GiB)": 77.56, "step": 37595, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.440321 }, { "epoch": 1.6108992759521872, "grad_norm": 5.605442047119141, "learning_rate": 7.650160673515381e-05, "loss": 2.5736928939819337, "memory(GiB)": 77.56, "step": 37600, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.440341 }, { "epoch": 1.611113491281436, "grad_norm": 5.4860076904296875, "learning_rate": 7.649589979861706e-05, "loss": 2.276582145690918, "memory(GiB)": 77.56, "step": 37605, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.440328 }, { "epoch": 1.611327706610685, "grad_norm": 4.896621227264404, "learning_rate": 7.649019238208226e-05, "loss": 2.5030078887939453, "memory(GiB)": 77.56, "step": 37610, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.440361 }, { "epoch": 1.611541921939934, "grad_norm": 4.020619869232178, "learning_rate": 7.648448448565275e-05, "loss": 2.4690114974975588, "memory(GiB)": 77.56, "step": 37615, "token_acc": 0.46875, "train_speed(iter/s)": 1.440395 }, { "epoch": 1.6117561372691829, "grad_norm": 4.35134744644165, "learning_rate": 7.647877610943197e-05, "loss": 2.580672264099121, "memory(GiB)": 77.56, "step": 37620, "token_acc": 0.4880952380952381, "train_speed(iter/s)": 1.440442 }, { "epoch": 1.611970352598432, "grad_norm": 3.719632387161255, "learning_rate": 7.647306725352332e-05, "loss": 2.273370361328125, "memory(GiB)": 77.56, "step": 37625, "token_acc": 0.5304878048780488, "train_speed(iter/s)": 1.440464 }, { "epoch": 1.612184567927681, "grad_norm": 5.32607889175415, "learning_rate": 7.646735791803024e-05, "loss": 2.459147644042969, "memory(GiB)": 77.56, "step": 37630, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 1.440492 }, { "epoch": 1.6123987832569298, "grad_norm": 5.236227512359619, "learning_rate": 7.646164810305611e-05, "loss": 2.3994359970092773, "memory(GiB)": 77.56, "step": 37635, "token_acc": 0.5039370078740157, "train_speed(iter/s)": 1.44049 }, { "epoch": 1.6126129985861788, "grad_norm": 4.731742858886719, "learning_rate": 7.645593780870442e-05, "loss": 2.6422397613525392, "memory(GiB)": 77.56, "step": 37640, "token_acc": 0.4790996784565916, "train_speed(iter/s)": 1.440471 }, { "epoch": 1.6128272139154278, "grad_norm": 4.112747669219971, "learning_rate": 7.645022703507858e-05, "loss": 2.458250045776367, "memory(GiB)": 77.56, "step": 37645, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.440508 }, { "epoch": 1.6130414292446766, "grad_norm": 4.678101539611816, "learning_rate": 7.64445157822821e-05, "loss": 2.2416994094848635, "memory(GiB)": 77.56, "step": 37650, "token_acc": 0.5251798561151079, "train_speed(iter/s)": 1.44046 }, { "epoch": 1.6132556445739257, "grad_norm": 4.763542175292969, "learning_rate": 7.643880405041838e-05, "loss": 2.4603944778442384, "memory(GiB)": 77.56, "step": 37655, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.440421 }, { "epoch": 1.6134698599031747, "grad_norm": 4.984938621520996, "learning_rate": 7.643309183959094e-05, "loss": 2.6370798110961915, "memory(GiB)": 77.56, "step": 37660, "token_acc": 0.46407185628742514, "train_speed(iter/s)": 1.440453 }, { "epoch": 1.6136840752324235, "grad_norm": 3.9210801124572754, "learning_rate": 7.642737914990324e-05, "loss": 2.6241277694702148, "memory(GiB)": 77.56, "step": 37665, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.440412 }, { "epoch": 1.6138982905616726, "grad_norm": 5.629879951477051, "learning_rate": 7.642166598145876e-05, "loss": 2.545711135864258, "memory(GiB)": 77.56, "step": 37670, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.440435 }, { "epoch": 1.6141125058909216, "grad_norm": 5.128633499145508, "learning_rate": 7.641595233436104e-05, "loss": 2.3216808319091795, "memory(GiB)": 77.56, "step": 37675, "token_acc": 0.4701492537313433, "train_speed(iter/s)": 1.440439 }, { "epoch": 1.6143267212201704, "grad_norm": 5.640380382537842, "learning_rate": 7.641023820871353e-05, "loss": 2.8596923828125, "memory(GiB)": 77.56, "step": 37680, "token_acc": 0.44482758620689655, "train_speed(iter/s)": 1.440496 }, { "epoch": 1.6145409365494194, "grad_norm": 5.271227836608887, "learning_rate": 7.640452360461982e-05, "loss": 2.4152158737182616, "memory(GiB)": 77.56, "step": 37685, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.440495 }, { "epoch": 1.6147551518786685, "grad_norm": 5.2296295166015625, "learning_rate": 7.639880852218338e-05, "loss": 2.613365364074707, "memory(GiB)": 77.56, "step": 37690, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.440515 }, { "epoch": 1.6149693672079173, "grad_norm": 4.282989978790283, "learning_rate": 7.639309296150775e-05, "loss": 2.387594985961914, "memory(GiB)": 77.56, "step": 37695, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.440548 }, { "epoch": 1.6151835825371663, "grad_norm": 4.745231628417969, "learning_rate": 7.638737692269649e-05, "loss": 2.5518789291381836, "memory(GiB)": 77.56, "step": 37700, "token_acc": 0.44814814814814813, "train_speed(iter/s)": 1.440564 }, { "epoch": 1.6153977978664154, "grad_norm": 5.613280773162842, "learning_rate": 7.638166040585314e-05, "loss": 2.257324981689453, "memory(GiB)": 77.56, "step": 37705, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.440525 }, { "epoch": 1.6156120131956642, "grad_norm": 5.070335388183594, "learning_rate": 7.637594341108127e-05, "loss": 2.4958402633666994, "memory(GiB)": 77.56, "step": 37710, "token_acc": 0.47706422018348627, "train_speed(iter/s)": 1.440492 }, { "epoch": 1.6158262285249132, "grad_norm": 4.498345851898193, "learning_rate": 7.637022593848444e-05, "loss": 2.5327690124511717, "memory(GiB)": 77.56, "step": 37715, "token_acc": 0.471875, "train_speed(iter/s)": 1.440455 }, { "epoch": 1.6160404438541622, "grad_norm": 4.5153422355651855, "learning_rate": 7.636450798816624e-05, "loss": 2.7252254486083984, "memory(GiB)": 77.56, "step": 37720, "token_acc": 0.4490861618798956, "train_speed(iter/s)": 1.44047 }, { "epoch": 1.616254659183411, "grad_norm": 4.830429553985596, "learning_rate": 7.635878956023023e-05, "loss": 2.3023822784423826, "memory(GiB)": 77.56, "step": 37725, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.440426 }, { "epoch": 1.61646887451266, "grad_norm": 5.323904037475586, "learning_rate": 7.635307065478003e-05, "loss": 2.6148488998413084, "memory(GiB)": 77.56, "step": 37730, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.4404 }, { "epoch": 1.6166830898419091, "grad_norm": 5.403785705566406, "learning_rate": 7.634735127191922e-05, "loss": 2.47650032043457, "memory(GiB)": 77.56, "step": 37735, "token_acc": 0.45255474452554745, "train_speed(iter/s)": 1.440416 }, { "epoch": 1.616897305171158, "grad_norm": 4.429103851318359, "learning_rate": 7.634163141175144e-05, "loss": 2.313262176513672, "memory(GiB)": 77.56, "step": 37740, "token_acc": 0.4765625, "train_speed(iter/s)": 1.440421 }, { "epoch": 1.617111520500407, "grad_norm": 5.550984859466553, "learning_rate": 7.633591107438029e-05, "loss": 2.4353511810302733, "memory(GiB)": 77.56, "step": 37745, "token_acc": 0.4653846153846154, "train_speed(iter/s)": 1.440435 }, { "epoch": 1.617325735829656, "grad_norm": 4.2899932861328125, "learning_rate": 7.63301902599094e-05, "loss": 2.345765495300293, "memory(GiB)": 77.56, "step": 37750, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.440467 }, { "epoch": 1.6175399511589048, "grad_norm": 5.171411991119385, "learning_rate": 7.632446896844243e-05, "loss": 2.5676803588867188, "memory(GiB)": 77.56, "step": 37755, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.440463 }, { "epoch": 1.6177541664881538, "grad_norm": 7.154988765716553, "learning_rate": 7.631874720008301e-05, "loss": 2.253861999511719, "memory(GiB)": 77.56, "step": 37760, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.440488 }, { "epoch": 1.6179683818174029, "grad_norm": 4.189392566680908, "learning_rate": 7.631302495493478e-05, "loss": 2.5879135131835938, "memory(GiB)": 77.56, "step": 37765, "token_acc": 0.4537313432835821, "train_speed(iter/s)": 1.440483 }, { "epoch": 1.6181825971466517, "grad_norm": 4.117321491241455, "learning_rate": 7.630730223310143e-05, "loss": 2.268657684326172, "memory(GiB)": 77.56, "step": 37770, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.440484 }, { "epoch": 1.6183968124759007, "grad_norm": 5.277060508728027, "learning_rate": 7.630157903468663e-05, "loss": 2.2288436889648438, "memory(GiB)": 77.56, "step": 37775, "token_acc": 0.5304347826086957, "train_speed(iter/s)": 1.440486 }, { "epoch": 1.6186110278051498, "grad_norm": 4.592144966125488, "learning_rate": 7.629585535979402e-05, "loss": 2.6815914154052733, "memory(GiB)": 77.56, "step": 37780, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.440443 }, { "epoch": 1.6188252431343986, "grad_norm": 6.084299087524414, "learning_rate": 7.629013120852736e-05, "loss": 2.34604549407959, "memory(GiB)": 77.56, "step": 37785, "token_acc": 0.5315985130111525, "train_speed(iter/s)": 1.44041 }, { "epoch": 1.6190394584636476, "grad_norm": 5.6197309494018555, "learning_rate": 7.628440658099029e-05, "loss": 2.512620735168457, "memory(GiB)": 77.56, "step": 37790, "token_acc": 0.4548872180451128, "train_speed(iter/s)": 1.440386 }, { "epoch": 1.6192536737928966, "grad_norm": 5.779651641845703, "learning_rate": 7.627868147728654e-05, "loss": 2.5897064208984375, "memory(GiB)": 77.56, "step": 37795, "token_acc": 0.4460431654676259, "train_speed(iter/s)": 1.440375 }, { "epoch": 1.6194678891221455, "grad_norm": 4.179642200469971, "learning_rate": 7.627295589751982e-05, "loss": 2.5370941162109375, "memory(GiB)": 77.56, "step": 37800, "token_acc": 0.4855072463768116, "train_speed(iter/s)": 1.440402 }, { "epoch": 1.6196821044513945, "grad_norm": 7.664151668548584, "learning_rate": 7.626722984179387e-05, "loss": 2.5881237030029296, "memory(GiB)": 77.56, "step": 37805, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.440388 }, { "epoch": 1.6198963197806435, "grad_norm": 4.311489105224609, "learning_rate": 7.62615033102124e-05, "loss": 2.3895198822021486, "memory(GiB)": 77.56, "step": 37810, "token_acc": 0.509493670886076, "train_speed(iter/s)": 1.44041 }, { "epoch": 1.6201105351098923, "grad_norm": 4.460899353027344, "learning_rate": 7.625577630287918e-05, "loss": 2.559486961364746, "memory(GiB)": 77.56, "step": 37815, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.440409 }, { "epoch": 1.6203247504391414, "grad_norm": 4.621767997741699, "learning_rate": 7.625004881989792e-05, "loss": 2.77893180847168, "memory(GiB)": 77.56, "step": 37820, "token_acc": 0.4360655737704918, "train_speed(iter/s)": 1.440408 }, { "epoch": 1.6205389657683904, "grad_norm": 5.098792552947998, "learning_rate": 7.624432086137241e-05, "loss": 2.751088333129883, "memory(GiB)": 77.56, "step": 37825, "token_acc": 0.4426751592356688, "train_speed(iter/s)": 1.440423 }, { "epoch": 1.6207531810976392, "grad_norm": 4.143022060394287, "learning_rate": 7.623859242740642e-05, "loss": 2.7381216049194337, "memory(GiB)": 77.56, "step": 37830, "token_acc": 0.4721189591078067, "train_speed(iter/s)": 1.440403 }, { "epoch": 1.6209673964268885, "grad_norm": 6.091585636138916, "learning_rate": 7.62328635181037e-05, "loss": 2.7297189712524412, "memory(GiB)": 77.56, "step": 37835, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.440406 }, { "epoch": 1.6211816117561373, "grad_norm": 5.44978141784668, "learning_rate": 7.622713413356806e-05, "loss": 2.7423160552978514, "memory(GiB)": 77.56, "step": 37840, "token_acc": 0.38699690402476783, "train_speed(iter/s)": 1.440403 }, { "epoch": 1.621395827085386, "grad_norm": 5.1050944328308105, "learning_rate": 7.622140427390327e-05, "loss": 2.499595069885254, "memory(GiB)": 77.56, "step": 37845, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.440431 }, { "epoch": 1.6216100424146354, "grad_norm": 4.863409042358398, "learning_rate": 7.621567393921315e-05, "loss": 2.842523765563965, "memory(GiB)": 77.56, "step": 37850, "token_acc": 0.4620938628158845, "train_speed(iter/s)": 1.440465 }, { "epoch": 1.6218242577438842, "grad_norm": 6.070392608642578, "learning_rate": 7.62099431296015e-05, "loss": 2.9843233108520506, "memory(GiB)": 77.56, "step": 37855, "token_acc": 0.38869257950530034, "train_speed(iter/s)": 1.440519 }, { "epoch": 1.622038473073133, "grad_norm": 4.390357494354248, "learning_rate": 7.620421184517216e-05, "loss": 2.491388702392578, "memory(GiB)": 77.56, "step": 37860, "token_acc": 0.46518105849582175, "train_speed(iter/s)": 1.440545 }, { "epoch": 1.6222526884023822, "grad_norm": 4.784060955047607, "learning_rate": 7.61984800860289e-05, "loss": 2.348118782043457, "memory(GiB)": 77.56, "step": 37865, "token_acc": 0.4635036496350365, "train_speed(iter/s)": 1.440521 }, { "epoch": 1.622466903731631, "grad_norm": 5.561811447143555, "learning_rate": 7.619274785227564e-05, "loss": 2.6775644302368162, "memory(GiB)": 77.56, "step": 37870, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.440527 }, { "epoch": 1.6226811190608799, "grad_norm": 5.449887752532959, "learning_rate": 7.618701514401618e-05, "loss": 2.7351711273193358, "memory(GiB)": 77.56, "step": 37875, "token_acc": 0.4416058394160584, "train_speed(iter/s)": 1.440554 }, { "epoch": 1.6228953343901291, "grad_norm": 5.843167304992676, "learning_rate": 7.618128196135435e-05, "loss": 2.4031232833862304, "memory(GiB)": 77.56, "step": 37880, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.440484 }, { "epoch": 1.623109549719378, "grad_norm": 5.339305400848389, "learning_rate": 7.617554830439406e-05, "loss": 2.3922922134399416, "memory(GiB)": 77.56, "step": 37885, "token_acc": 0.5214521452145214, "train_speed(iter/s)": 1.440501 }, { "epoch": 1.6233237650486267, "grad_norm": 7.193346977233887, "learning_rate": 7.616981417323914e-05, "loss": 2.7266597747802734, "memory(GiB)": 77.56, "step": 37890, "token_acc": 0.48046875, "train_speed(iter/s)": 1.440524 }, { "epoch": 1.623537980377876, "grad_norm": 6.7550435066223145, "learning_rate": 7.61640795679935e-05, "loss": 2.408618354797363, "memory(GiB)": 77.56, "step": 37895, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.440537 }, { "epoch": 1.6237521957071248, "grad_norm": 5.425740718841553, "learning_rate": 7.6158344488761e-05, "loss": 2.6974472045898437, "memory(GiB)": 77.56, "step": 37900, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.440556 }, { "epoch": 1.6239664110363736, "grad_norm": 4.923129081726074, "learning_rate": 7.615260893564556e-05, "loss": 2.2135669708251955, "memory(GiB)": 77.56, "step": 37905, "token_acc": 0.5275080906148867, "train_speed(iter/s)": 1.44055 }, { "epoch": 1.6241806263656229, "grad_norm": 5.52205228805542, "learning_rate": 7.614687290875107e-05, "loss": 2.7369285583496095, "memory(GiB)": 77.56, "step": 37910, "token_acc": 0.4406779661016949, "train_speed(iter/s)": 1.440511 }, { "epoch": 1.6243948416948717, "grad_norm": 4.848884105682373, "learning_rate": 7.614113640818145e-05, "loss": 2.658232498168945, "memory(GiB)": 77.56, "step": 37915, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.4405 }, { "epoch": 1.6246090570241205, "grad_norm": 4.7700018882751465, "learning_rate": 7.613539943404064e-05, "loss": 2.562948226928711, "memory(GiB)": 77.56, "step": 37920, "token_acc": 0.44565217391304346, "train_speed(iter/s)": 1.440544 }, { "epoch": 1.6248232723533698, "grad_norm": 5.361043930053711, "learning_rate": 7.612966198643254e-05, "loss": 2.4788240432739257, "memory(GiB)": 77.56, "step": 37925, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.440516 }, { "epoch": 1.6250374876826186, "grad_norm": 4.615484714508057, "learning_rate": 7.612392406546109e-05, "loss": 2.5205101013183593, "memory(GiB)": 77.56, "step": 37930, "token_acc": 0.4967532467532468, "train_speed(iter/s)": 1.440487 }, { "epoch": 1.6252517030118674, "grad_norm": 5.751092433929443, "learning_rate": 7.611818567123025e-05, "loss": 2.799050521850586, "memory(GiB)": 77.56, "step": 37935, "token_acc": 0.4452054794520548, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.6254659183411166, "grad_norm": 6.117849826812744, "learning_rate": 7.6112446803844e-05, "loss": 2.5702619552612305, "memory(GiB)": 77.56, "step": 37940, "token_acc": 0.47634069400630913, "train_speed(iter/s)": 1.440508 }, { "epoch": 1.6256801336703655, "grad_norm": 5.259683609008789, "learning_rate": 7.610670746340626e-05, "loss": 2.4230731964111327, "memory(GiB)": 77.56, "step": 37945, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.440516 }, { "epoch": 1.6258943489996143, "grad_norm": 4.160335063934326, "learning_rate": 7.610096765002104e-05, "loss": 2.6499282836914064, "memory(GiB)": 77.56, "step": 37950, "token_acc": 0.4507936507936508, "train_speed(iter/s)": 1.440505 }, { "epoch": 1.6261085643288635, "grad_norm": 3.890566110610962, "learning_rate": 7.609522736379229e-05, "loss": 2.342402458190918, "memory(GiB)": 77.56, "step": 37955, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.440525 }, { "epoch": 1.6263227796581123, "grad_norm": 9.271947860717773, "learning_rate": 7.608948660482403e-05, "loss": 2.3067720413208006, "memory(GiB)": 77.56, "step": 37960, "token_acc": 0.4796747967479675, "train_speed(iter/s)": 1.440469 }, { "epoch": 1.6265369949873612, "grad_norm": 4.786581039428711, "learning_rate": 7.608374537322024e-05, "loss": 2.410334587097168, "memory(GiB)": 77.56, "step": 37965, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.440478 }, { "epoch": 1.6267512103166104, "grad_norm": 4.84458589553833, "learning_rate": 7.607800366908493e-05, "loss": 2.341404914855957, "memory(GiB)": 77.56, "step": 37970, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.440465 }, { "epoch": 1.6269654256458592, "grad_norm": 5.3732590675354, "learning_rate": 7.607226149252212e-05, "loss": 2.887238311767578, "memory(GiB)": 77.56, "step": 37975, "token_acc": 0.43109540636042404, "train_speed(iter/s)": 1.440482 }, { "epoch": 1.627179640975108, "grad_norm": 5.549344062805176, "learning_rate": 7.606651884363585e-05, "loss": 2.504510498046875, "memory(GiB)": 77.56, "step": 37980, "token_acc": 0.475, "train_speed(iter/s)": 1.440495 }, { "epoch": 1.6273938563043573, "grad_norm": 6.415650367736816, "learning_rate": 7.606077572253012e-05, "loss": 2.535602569580078, "memory(GiB)": 77.56, "step": 37985, "token_acc": 0.4432624113475177, "train_speed(iter/s)": 1.440516 }, { "epoch": 1.627608071633606, "grad_norm": 4.586345195770264, "learning_rate": 7.6055032129309e-05, "loss": 2.4179458618164062, "memory(GiB)": 77.56, "step": 37990, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 1.440533 }, { "epoch": 1.627822286962855, "grad_norm": 5.557847499847412, "learning_rate": 7.604928806407654e-05, "loss": 2.7187625885009767, "memory(GiB)": 77.56, "step": 37995, "token_acc": 0.444015444015444, "train_speed(iter/s)": 1.440572 }, { "epoch": 1.6280365022921042, "grad_norm": 5.628841876983643, "learning_rate": 7.604354352693677e-05, "loss": 2.528227996826172, "memory(GiB)": 77.56, "step": 38000, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.440593 }, { "epoch": 1.6280365022921042, "eval_loss": 2.230398654937744, "eval_runtime": 14.3111, "eval_samples_per_second": 6.988, "eval_steps_per_second": 6.988, "eval_token_acc": 0.4927536231884058, "step": 38000 }, { "epoch": 1.628250717621353, "grad_norm": 3.8304436206817627, "learning_rate": 7.60377985179938e-05, "loss": 2.668830680847168, "memory(GiB)": 77.56, "step": 38005, "token_acc": 0.4680105170902717, "train_speed(iter/s)": 1.439731 }, { "epoch": 1.6284649329506018, "grad_norm": 3.7807648181915283, "learning_rate": 7.603205303735166e-05, "loss": 2.268621826171875, "memory(GiB)": 77.56, "step": 38010, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.439728 }, { "epoch": 1.628679148279851, "grad_norm": 4.056646823883057, "learning_rate": 7.602630708511448e-05, "loss": 2.4808622360229493, "memory(GiB)": 77.56, "step": 38015, "token_acc": 0.4835820895522388, "train_speed(iter/s)": 1.439775 }, { "epoch": 1.6288933636090999, "grad_norm": 4.447537899017334, "learning_rate": 7.602056066138633e-05, "loss": 2.279632568359375, "memory(GiB)": 77.56, "step": 38020, "token_acc": 0.4907749077490775, "train_speed(iter/s)": 1.439799 }, { "epoch": 1.6291075789383487, "grad_norm": 5.137543678283691, "learning_rate": 7.601481376627131e-05, "loss": 2.5810993194580076, "memory(GiB)": 77.56, "step": 38025, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.439769 }, { "epoch": 1.629321794267598, "grad_norm": 4.957739353179932, "learning_rate": 7.600906639987352e-05, "loss": 2.527983856201172, "memory(GiB)": 77.56, "step": 38030, "token_acc": 0.5073529411764706, "train_speed(iter/s)": 1.439712 }, { "epoch": 1.6295360095968467, "grad_norm": 4.1089935302734375, "learning_rate": 7.600331856229712e-05, "loss": 2.464487838745117, "memory(GiB)": 77.56, "step": 38035, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.439744 }, { "epoch": 1.6297502249260956, "grad_norm": 6.254332065582275, "learning_rate": 7.59975702536462e-05, "loss": 2.3078657150268556, "memory(GiB)": 77.56, "step": 38040, "token_acc": 0.5447470817120622, "train_speed(iter/s)": 1.439765 }, { "epoch": 1.6299644402553448, "grad_norm": 4.900516033172607, "learning_rate": 7.599182147402491e-05, "loss": 2.5902278900146483, "memory(GiB)": 77.56, "step": 38045, "token_acc": 0.47634069400630913, "train_speed(iter/s)": 1.439725 }, { "epoch": 1.6301786555845936, "grad_norm": 5.00094747543335, "learning_rate": 7.598607222353739e-05, "loss": 2.278514862060547, "memory(GiB)": 77.56, "step": 38050, "token_acc": 0.5131086142322098, "train_speed(iter/s)": 1.439729 }, { "epoch": 1.6303928709138424, "grad_norm": 7.132684230804443, "learning_rate": 7.598032250228779e-05, "loss": 2.2978532791137694, "memory(GiB)": 77.56, "step": 38055, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.439763 }, { "epoch": 1.6306070862430917, "grad_norm": 5.430882453918457, "learning_rate": 7.597457231038028e-05, "loss": 2.6715049743652344, "memory(GiB)": 77.56, "step": 38060, "token_acc": 0.44569288389513106, "train_speed(iter/s)": 1.43979 }, { "epoch": 1.6308213015723405, "grad_norm": 6.83610200881958, "learning_rate": 7.596882164791903e-05, "loss": 2.5615875244140627, "memory(GiB)": 77.56, "step": 38065, "token_acc": 0.44, "train_speed(iter/s)": 1.439755 }, { "epoch": 1.6310355169015893, "grad_norm": 5.552457332611084, "learning_rate": 7.596307051500821e-05, "loss": 2.6205591201782226, "memory(GiB)": 77.56, "step": 38070, "token_acc": 0.4410112359550562, "train_speed(iter/s)": 1.439781 }, { "epoch": 1.6312497322308386, "grad_norm": 5.539562225341797, "learning_rate": 7.595731891175202e-05, "loss": 2.1188907623291016, "memory(GiB)": 77.56, "step": 38075, "token_acc": 0.5524193548387096, "train_speed(iter/s)": 1.439802 }, { "epoch": 1.6314639475600874, "grad_norm": 4.295923709869385, "learning_rate": 7.595156683825463e-05, "loss": 2.400032806396484, "memory(GiB)": 77.56, "step": 38080, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.439766 }, { "epoch": 1.6316781628893362, "grad_norm": 5.152463912963867, "learning_rate": 7.594581429462026e-05, "loss": 2.635820770263672, "memory(GiB)": 77.56, "step": 38085, "token_acc": 0.48286604361370716, "train_speed(iter/s)": 1.439808 }, { "epoch": 1.6318923782185855, "grad_norm": 6.016039848327637, "learning_rate": 7.594006128095314e-05, "loss": 2.6588924407958983, "memory(GiB)": 77.56, "step": 38090, "token_acc": 0.427536231884058, "train_speed(iter/s)": 1.439872 }, { "epoch": 1.6321065935478343, "grad_norm": 5.937152862548828, "learning_rate": 7.593430779735749e-05, "loss": 2.530279541015625, "memory(GiB)": 77.56, "step": 38095, "token_acc": 0.4921135646687697, "train_speed(iter/s)": 1.439901 }, { "epoch": 1.632320808877083, "grad_norm": 4.711450576782227, "learning_rate": 7.592855384393752e-05, "loss": 2.4118274688720702, "memory(GiB)": 77.56, "step": 38100, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.439916 }, { "epoch": 1.6325350242063323, "grad_norm": 7.2043609619140625, "learning_rate": 7.592279942079746e-05, "loss": 2.615461730957031, "memory(GiB)": 77.56, "step": 38105, "token_acc": 0.4858156028368794, "train_speed(iter/s)": 1.43993 }, { "epoch": 1.6327492395355812, "grad_norm": 4.363980770111084, "learning_rate": 7.591704452804157e-05, "loss": 2.2706024169921877, "memory(GiB)": 77.56, "step": 38110, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.439904 }, { "epoch": 1.63296345486483, "grad_norm": 3.774569511413574, "learning_rate": 7.591128916577412e-05, "loss": 2.411854553222656, "memory(GiB)": 77.56, "step": 38115, "token_acc": 0.5197568389057751, "train_speed(iter/s)": 1.439926 }, { "epoch": 1.6331776701940792, "grad_norm": 5.024695873260498, "learning_rate": 7.590553333409934e-05, "loss": 2.5799734115600588, "memory(GiB)": 77.56, "step": 38120, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.439918 }, { "epoch": 1.633391885523328, "grad_norm": 4.98157262802124, "learning_rate": 7.589977703312152e-05, "loss": 2.3855262756347657, "memory(GiB)": 77.56, "step": 38125, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.43993 }, { "epoch": 1.6336061008525768, "grad_norm": 6.1562724113464355, "learning_rate": 7.589402026294497e-05, "loss": 2.4294706344604493, "memory(GiB)": 77.56, "step": 38130, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.43996 }, { "epoch": 1.633820316181826, "grad_norm": 4.540532112121582, "learning_rate": 7.588826302367394e-05, "loss": 2.669905090332031, "memory(GiB)": 77.56, "step": 38135, "token_acc": 0.432258064516129, "train_speed(iter/s)": 1.439953 }, { "epoch": 1.634034531511075, "grad_norm": 5.373117923736572, "learning_rate": 7.588250531541274e-05, "loss": 2.767827796936035, "memory(GiB)": 77.56, "step": 38140, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.439919 }, { "epoch": 1.6342487468403237, "grad_norm": 4.0611677169799805, "learning_rate": 7.587674713826567e-05, "loss": 2.810939979553223, "memory(GiB)": 77.56, "step": 38145, "token_acc": 0.4331395348837209, "train_speed(iter/s)": 1.439934 }, { "epoch": 1.634462962169573, "grad_norm": 6.725802421569824, "learning_rate": 7.587098849233707e-05, "loss": 2.441546821594238, "memory(GiB)": 77.56, "step": 38150, "token_acc": 0.4830188679245283, "train_speed(iter/s)": 1.439945 }, { "epoch": 1.6346771774988218, "grad_norm": 4.110783576965332, "learning_rate": 7.586522937773123e-05, "loss": 2.278568077087402, "memory(GiB)": 77.56, "step": 38155, "token_acc": 0.5190311418685121, "train_speed(iter/s)": 1.439903 }, { "epoch": 1.6348913928280706, "grad_norm": 5.280030250549316, "learning_rate": 7.58594697945525e-05, "loss": 2.4644393920898438, "memory(GiB)": 77.56, "step": 38160, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.439901 }, { "epoch": 1.6351056081573199, "grad_norm": 4.459488868713379, "learning_rate": 7.585370974290521e-05, "loss": 2.884593963623047, "memory(GiB)": 77.56, "step": 38165, "token_acc": 0.44694533762057875, "train_speed(iter/s)": 1.439897 }, { "epoch": 1.6353198234865687, "grad_norm": 5.270105838775635, "learning_rate": 7.584794922289371e-05, "loss": 2.3009801864624024, "memory(GiB)": 77.56, "step": 38170, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.439888 }, { "epoch": 1.6355340388158177, "grad_norm": 6.147409915924072, "learning_rate": 7.584218823462238e-05, "loss": 2.55008544921875, "memory(GiB)": 77.56, "step": 38175, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.439818 }, { "epoch": 1.6357482541450667, "grad_norm": 4.9222846031188965, "learning_rate": 7.583642677819557e-05, "loss": 2.828347396850586, "memory(GiB)": 77.56, "step": 38180, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.439814 }, { "epoch": 1.6359624694743156, "grad_norm": 4.484988212585449, "learning_rate": 7.583066485371764e-05, "loss": 2.1581960678100587, "memory(GiB)": 77.56, "step": 38185, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.439828 }, { "epoch": 1.6361766848035646, "grad_norm": 4.387702465057373, "learning_rate": 7.582490246129299e-05, "loss": 2.6517467498779297, "memory(GiB)": 77.56, "step": 38190, "token_acc": 0.4551971326164875, "train_speed(iter/s)": 1.439864 }, { "epoch": 1.6363909001328136, "grad_norm": 3.7711524963378906, "learning_rate": 7.5819139601026e-05, "loss": 2.3045331954956056, "memory(GiB)": 77.56, "step": 38195, "token_acc": 0.5374149659863946, "train_speed(iter/s)": 1.439875 }, { "epoch": 1.6366051154620624, "grad_norm": 4.847904682159424, "learning_rate": 7.581337627302107e-05, "loss": 2.541432571411133, "memory(GiB)": 77.56, "step": 38200, "token_acc": 0.42435424354243545, "train_speed(iter/s)": 1.439916 }, { "epoch": 1.6368193307913115, "grad_norm": 4.269672870635986, "learning_rate": 7.580761247738264e-05, "loss": 2.569334030151367, "memory(GiB)": 77.56, "step": 38205, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.439947 }, { "epoch": 1.6370335461205605, "grad_norm": 4.63272762298584, "learning_rate": 7.580184821421508e-05, "loss": 2.4549869537353515, "memory(GiB)": 77.56, "step": 38210, "token_acc": 0.48348348348348347, "train_speed(iter/s)": 1.439939 }, { "epoch": 1.6372477614498093, "grad_norm": 5.64998197555542, "learning_rate": 7.579608348362284e-05, "loss": 2.4259469985961912, "memory(GiB)": 77.56, "step": 38215, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.439962 }, { "epoch": 1.6374619767790584, "grad_norm": 7.6386847496032715, "learning_rate": 7.579031828571035e-05, "loss": 2.329135513305664, "memory(GiB)": 77.56, "step": 38220, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.439932 }, { "epoch": 1.6376761921083074, "grad_norm": 5.141870498657227, "learning_rate": 7.578455262058204e-05, "loss": 2.6745733261108398, "memory(GiB)": 77.56, "step": 38225, "token_acc": 0.43597560975609756, "train_speed(iter/s)": 1.439968 }, { "epoch": 1.6378904074375562, "grad_norm": 4.4243268966674805, "learning_rate": 7.577878648834241e-05, "loss": 2.4369958877563476, "memory(GiB)": 77.56, "step": 38230, "token_acc": 0.4781021897810219, "train_speed(iter/s)": 1.43998 }, { "epoch": 1.6381046227668052, "grad_norm": 4.563777446746826, "learning_rate": 7.577301988909583e-05, "loss": 2.731121635437012, "memory(GiB)": 77.56, "step": 38235, "token_acc": 0.4709480122324159, "train_speed(iter/s)": 1.439983 }, { "epoch": 1.6383188380960543, "grad_norm": 4.9535675048828125, "learning_rate": 7.576725282294683e-05, "loss": 2.455893707275391, "memory(GiB)": 77.56, "step": 38240, "token_acc": 0.5029761904761905, "train_speed(iter/s)": 1.439996 }, { "epoch": 1.638533053425303, "grad_norm": 5.448330879211426, "learning_rate": 7.576148528999989e-05, "loss": 2.8143096923828126, "memory(GiB)": 77.56, "step": 38245, "token_acc": 0.4412811387900356, "train_speed(iter/s)": 1.439993 }, { "epoch": 1.6387472687545521, "grad_norm": 6.893537998199463, "learning_rate": 7.575571729035948e-05, "loss": 2.2737138748168944, "memory(GiB)": 77.56, "step": 38250, "token_acc": 0.5772357723577236, "train_speed(iter/s)": 1.440021 }, { "epoch": 1.6389614840838012, "grad_norm": 4.84488582611084, "learning_rate": 7.574994882413007e-05, "loss": 2.4641876220703125, "memory(GiB)": 77.56, "step": 38255, "token_acc": 0.4820359281437126, "train_speed(iter/s)": 1.44001 }, { "epoch": 1.63917569941305, "grad_norm": 4.760132789611816, "learning_rate": 7.57441798914162e-05, "loss": 2.5985626220703124, "memory(GiB)": 77.56, "step": 38260, "token_acc": 0.42679127725856697, "train_speed(iter/s)": 1.44002 }, { "epoch": 1.639389914742299, "grad_norm": 7.235931396484375, "learning_rate": 7.573841049232236e-05, "loss": 2.6217987060546877, "memory(GiB)": 77.56, "step": 38265, "token_acc": 0.4227129337539432, "train_speed(iter/s)": 1.44009 }, { "epoch": 1.639604130071548, "grad_norm": 5.110458850860596, "learning_rate": 7.573264062695304e-05, "loss": 2.593521499633789, "memory(GiB)": 77.56, "step": 38270, "token_acc": 0.4727272727272727, "train_speed(iter/s)": 1.440108 }, { "epoch": 1.6398183454007969, "grad_norm": 4.615773677825928, "learning_rate": 7.572687029541283e-05, "loss": 2.462036895751953, "memory(GiB)": 77.56, "step": 38275, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.440049 }, { "epoch": 1.6400325607300459, "grad_norm": 4.727434158325195, "learning_rate": 7.572109949780624e-05, "loss": 2.449875259399414, "memory(GiB)": 77.56, "step": 38280, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.44009 }, { "epoch": 1.640246776059295, "grad_norm": 4.6819047927856445, "learning_rate": 7.571532823423777e-05, "loss": 2.401778221130371, "memory(GiB)": 77.56, "step": 38285, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.440032 }, { "epoch": 1.6404609913885437, "grad_norm": 4.814586162567139, "learning_rate": 7.570955650481202e-05, "loss": 2.6406890869140627, "memory(GiB)": 77.56, "step": 38290, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.44001 }, { "epoch": 1.6406752067177928, "grad_norm": 5.465790748596191, "learning_rate": 7.570378430963355e-05, "loss": 2.783735466003418, "memory(GiB)": 77.56, "step": 38295, "token_acc": 0.44565217391304346, "train_speed(iter/s)": 1.440068 }, { "epoch": 1.6408894220470418, "grad_norm": 4.85542631149292, "learning_rate": 7.56980116488069e-05, "loss": 2.239752769470215, "memory(GiB)": 77.56, "step": 38300, "token_acc": 0.525, "train_speed(iter/s)": 1.440086 }, { "epoch": 1.6411036373762906, "grad_norm": 5.765982151031494, "learning_rate": 7.569223852243666e-05, "loss": 2.829648971557617, "memory(GiB)": 77.56, "step": 38305, "token_acc": 0.43884892086330934, "train_speed(iter/s)": 1.440099 }, { "epoch": 1.6413178527055396, "grad_norm": 6.030619144439697, "learning_rate": 7.568646493062742e-05, "loss": 2.4924142837524412, "memory(GiB)": 77.56, "step": 38310, "token_acc": 0.475, "train_speed(iter/s)": 1.440103 }, { "epoch": 1.6415320680347887, "grad_norm": 4.4107561111450195, "learning_rate": 7.568069087348377e-05, "loss": 2.5773033142089843, "memory(GiB)": 77.56, "step": 38315, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.440078 }, { "epoch": 1.6417462833640375, "grad_norm": 5.170173645019531, "learning_rate": 7.567491635111033e-05, "loss": 2.4269287109375, "memory(GiB)": 77.56, "step": 38320, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 1.440132 }, { "epoch": 1.6419604986932865, "grad_norm": 6.318161964416504, "learning_rate": 7.566914136361168e-05, "loss": 2.3796905517578124, "memory(GiB)": 77.56, "step": 38325, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.44014 }, { "epoch": 1.6421747140225356, "grad_norm": 4.667794227600098, "learning_rate": 7.566336591109245e-05, "loss": 2.5578704833984376, "memory(GiB)": 77.56, "step": 38330, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.440155 }, { "epoch": 1.6423889293517844, "grad_norm": 5.906712532043457, "learning_rate": 7.565758999365728e-05, "loss": 2.472130012512207, "memory(GiB)": 77.56, "step": 38335, "token_acc": 0.4770992366412214, "train_speed(iter/s)": 1.440074 }, { "epoch": 1.6426031446810334, "grad_norm": 4.678191661834717, "learning_rate": 7.56518136114108e-05, "loss": 2.568874740600586, "memory(GiB)": 77.56, "step": 38340, "token_acc": 0.49185667752442996, "train_speed(iter/s)": 1.440087 }, { "epoch": 1.6428173600102824, "grad_norm": 4.495857238769531, "learning_rate": 7.564603676445765e-05, "loss": 2.4461050033569336, "memory(GiB)": 77.56, "step": 38345, "token_acc": 0.4717514124293785, "train_speed(iter/s)": 1.440071 }, { "epoch": 1.6430315753395313, "grad_norm": 4.390526294708252, "learning_rate": 7.56402594529025e-05, "loss": 2.401481819152832, "memory(GiB)": 77.56, "step": 38350, "token_acc": 0.512280701754386, "train_speed(iter/s)": 1.440063 }, { "epoch": 1.6432457906687803, "grad_norm": 5.889782905578613, "learning_rate": 7.563448167684996e-05, "loss": 2.2590831756591796, "memory(GiB)": 77.56, "step": 38355, "token_acc": 0.5622641509433962, "train_speed(iter/s)": 1.440017 }, { "epoch": 1.6434600059980293, "grad_norm": 4.997591972351074, "learning_rate": 7.562870343640478e-05, "loss": 2.0358966827392577, "memory(GiB)": 77.56, "step": 38360, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.440009 }, { "epoch": 1.6436742213272781, "grad_norm": 6.388982772827148, "learning_rate": 7.562292473167158e-05, "loss": 2.380128860473633, "memory(GiB)": 77.56, "step": 38365, "token_acc": 0.5642023346303502, "train_speed(iter/s)": 1.440038 }, { "epoch": 1.6438884366565272, "grad_norm": 5.350227355957031, "learning_rate": 7.561714556275505e-05, "loss": 2.584260940551758, "memory(GiB)": 77.56, "step": 38370, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.439995 }, { "epoch": 1.6441026519857762, "grad_norm": 5.289626121520996, "learning_rate": 7.561136592975993e-05, "loss": 2.5791215896606445, "memory(GiB)": 77.56, "step": 38375, "token_acc": 0.44654088050314467, "train_speed(iter/s)": 1.43994 }, { "epoch": 1.644316867315025, "grad_norm": 5.083733558654785, "learning_rate": 7.560558583279085e-05, "loss": 2.2939130783081056, "memory(GiB)": 77.56, "step": 38380, "token_acc": 0.5347222222222222, "train_speed(iter/s)": 1.439968 }, { "epoch": 1.644531082644274, "grad_norm": 4.437898635864258, "learning_rate": 7.559980527195259e-05, "loss": 2.107828903198242, "memory(GiB)": 77.56, "step": 38385, "token_acc": 0.5431654676258992, "train_speed(iter/s)": 1.43989 }, { "epoch": 1.644745297973523, "grad_norm": 4.620337009429932, "learning_rate": 7.559402424734982e-05, "loss": 2.696457290649414, "memory(GiB)": 77.56, "step": 38390, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.439868 }, { "epoch": 1.644959513302772, "grad_norm": 4.596342086791992, "learning_rate": 7.558824275908732e-05, "loss": 2.35351619720459, "memory(GiB)": 77.56, "step": 38395, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.439897 }, { "epoch": 1.645173728632021, "grad_norm": 7.431744575500488, "learning_rate": 7.558246080726978e-05, "loss": 2.629029655456543, "memory(GiB)": 77.56, "step": 38400, "token_acc": 0.5215686274509804, "train_speed(iter/s)": 1.439876 }, { "epoch": 1.64538794396127, "grad_norm": 7.818251132965088, "learning_rate": 7.557667839200198e-05, "loss": 2.2264585494995117, "memory(GiB)": 77.56, "step": 38405, "token_acc": 0.49224806201550386, "train_speed(iter/s)": 1.439827 }, { "epoch": 1.6456021592905188, "grad_norm": 4.419662952423096, "learning_rate": 7.557089551338865e-05, "loss": 2.3536327362060545, "memory(GiB)": 77.56, "step": 38410, "token_acc": 0.4919093851132686, "train_speed(iter/s)": 1.439792 }, { "epoch": 1.6458163746197678, "grad_norm": 4.097082138061523, "learning_rate": 7.556511217153455e-05, "loss": 2.526063919067383, "memory(GiB)": 77.56, "step": 38415, "token_acc": 0.49, "train_speed(iter/s)": 1.439816 }, { "epoch": 1.6460305899490169, "grad_norm": 4.693516254425049, "learning_rate": 7.555932836654447e-05, "loss": 2.4798213958740236, "memory(GiB)": 77.56, "step": 38420, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 1.439884 }, { "epoch": 1.6462448052782657, "grad_norm": 4.578656196594238, "learning_rate": 7.555354409852318e-05, "loss": 2.335084915161133, "memory(GiB)": 77.56, "step": 38425, "token_acc": 0.5171339563862928, "train_speed(iter/s)": 1.439919 }, { "epoch": 1.6464590206075147, "grad_norm": 8.257400512695312, "learning_rate": 7.554775936757545e-05, "loss": 2.684035301208496, "memory(GiB)": 77.56, "step": 38430, "token_acc": 0.46504559270516715, "train_speed(iter/s)": 1.439949 }, { "epoch": 1.6466732359367637, "grad_norm": 4.201570987701416, "learning_rate": 7.554197417380613e-05, "loss": 2.5370201110839843, "memory(GiB)": 77.56, "step": 38435, "token_acc": 0.4401294498381877, "train_speed(iter/s)": 1.439971 }, { "epoch": 1.6468874512660125, "grad_norm": 3.7691662311553955, "learning_rate": 7.553618851731996e-05, "loss": 2.388266754150391, "memory(GiB)": 77.56, "step": 38440, "token_acc": 0.4914772727272727, "train_speed(iter/s)": 1.439979 }, { "epoch": 1.6471016665952616, "grad_norm": 4.353640556335449, "learning_rate": 7.553040239822179e-05, "loss": 2.6990001678466795, "memory(GiB)": 77.56, "step": 38445, "token_acc": 0.4659090909090909, "train_speed(iter/s)": 1.44001 }, { "epoch": 1.6473158819245106, "grad_norm": 5.954075336456299, "learning_rate": 7.552461581661643e-05, "loss": 2.7798843383789062, "memory(GiB)": 77.56, "step": 38450, "token_acc": 0.453416149068323, "train_speed(iter/s)": 1.440027 }, { "epoch": 1.6475300972537594, "grad_norm": 4.807617664337158, "learning_rate": 7.55188287726087e-05, "loss": 2.469046211242676, "memory(GiB)": 77.56, "step": 38455, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.439986 }, { "epoch": 1.6477443125830085, "grad_norm": 5.687557220458984, "learning_rate": 7.551304126630345e-05, "loss": 2.815443229675293, "memory(GiB)": 77.56, "step": 38460, "token_acc": 0.42105263157894735, "train_speed(iter/s)": 1.439979 }, { "epoch": 1.6479585279122575, "grad_norm": 5.092597961425781, "learning_rate": 7.550725329780555e-05, "loss": 2.835649108886719, "memory(GiB)": 77.56, "step": 38465, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.440005 }, { "epoch": 1.6481727432415063, "grad_norm": 4.26067590713501, "learning_rate": 7.550146486721981e-05, "loss": 2.2664522171020507, "memory(GiB)": 77.56, "step": 38470, "token_acc": 0.4862068965517241, "train_speed(iter/s)": 1.439995 }, { "epoch": 1.6483869585707553, "grad_norm": 4.944790363311768, "learning_rate": 7.54956759746511e-05, "loss": 2.7993053436279296, "memory(GiB)": 77.56, "step": 38475, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.439973 }, { "epoch": 1.6486011739000044, "grad_norm": 3.6839516162872314, "learning_rate": 7.548988662020433e-05, "loss": 2.6350074768066407, "memory(GiB)": 77.56, "step": 38480, "token_acc": 0.4830287206266319, "train_speed(iter/s)": 1.439994 }, { "epoch": 1.6488153892292532, "grad_norm": 4.4718523025512695, "learning_rate": 7.548409680398433e-05, "loss": 2.329062271118164, "memory(GiB)": 77.56, "step": 38485, "token_acc": 0.4749034749034749, "train_speed(iter/s)": 1.440016 }, { "epoch": 1.6490296045585022, "grad_norm": 5.5267333984375, "learning_rate": 7.547830652609601e-05, "loss": 2.7577997207641602, "memory(GiB)": 77.56, "step": 38490, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.43998 }, { "epoch": 1.6492438198877513, "grad_norm": 6.347685813903809, "learning_rate": 7.547251578664427e-05, "loss": 2.4713376998901366, "memory(GiB)": 77.56, "step": 38495, "token_acc": 0.5, "train_speed(iter/s)": 1.439957 }, { "epoch": 1.649458035217, "grad_norm": 4.946099281311035, "learning_rate": 7.546672458573402e-05, "loss": 2.0630680084228517, "memory(GiB)": 77.56, "step": 38500, "token_acc": 0.5801886792452831, "train_speed(iter/s)": 1.43996 }, { "epoch": 1.649458035217, "eval_loss": 2.434729814529419, "eval_runtime": 15.0419, "eval_samples_per_second": 6.648, "eval_steps_per_second": 6.648, "eval_token_acc": 0.43757725587144625, "step": 38500 }, { "epoch": 1.649672250546249, "grad_norm": 5.806351661682129, "learning_rate": 7.546093292347016e-05, "loss": 2.335239028930664, "memory(GiB)": 77.56, "step": 38505, "token_acc": 0.445970695970696, "train_speed(iter/s)": 1.439066 }, { "epoch": 1.6498864658754981, "grad_norm": 8.821775436401367, "learning_rate": 7.545514079995762e-05, "loss": 2.767985534667969, "memory(GiB)": 77.56, "step": 38510, "token_acc": 0.484, "train_speed(iter/s)": 1.439032 }, { "epoch": 1.650100681204747, "grad_norm": 5.096793174743652, "learning_rate": 7.544934821530132e-05, "loss": 2.5442153930664064, "memory(GiB)": 77.56, "step": 38515, "token_acc": 0.4784172661870504, "train_speed(iter/s)": 1.439059 }, { "epoch": 1.650314896533996, "grad_norm": 4.561770915985107, "learning_rate": 7.544355516960621e-05, "loss": 2.6692693710327147, "memory(GiB)": 77.56, "step": 38520, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.439042 }, { "epoch": 1.650529111863245, "grad_norm": 5.6831769943237305, "learning_rate": 7.543776166297723e-05, "loss": 2.511821746826172, "memory(GiB)": 77.56, "step": 38525, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.439069 }, { "epoch": 1.6507433271924938, "grad_norm": 7.861547946929932, "learning_rate": 7.543196769551931e-05, "loss": 2.3939016342163084, "memory(GiB)": 77.56, "step": 38530, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.439128 }, { "epoch": 1.6509575425217429, "grad_norm": 3.8708057403564453, "learning_rate": 7.542617326733747e-05, "loss": 2.6797657012939453, "memory(GiB)": 77.56, "step": 38535, "token_acc": 0.46715328467153283, "train_speed(iter/s)": 1.439034 }, { "epoch": 1.651171757850992, "grad_norm": 4.398819446563721, "learning_rate": 7.542037837853664e-05, "loss": 2.5842761993408203, "memory(GiB)": 77.56, "step": 38540, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.439059 }, { "epoch": 1.6513859731802407, "grad_norm": 4.454665184020996, "learning_rate": 7.541458302922179e-05, "loss": 2.436616897583008, "memory(GiB)": 77.56, "step": 38545, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.439032 }, { "epoch": 1.6516001885094898, "grad_norm": 5.831594944000244, "learning_rate": 7.540878721949796e-05, "loss": 2.6067119598388673, "memory(GiB)": 77.56, "step": 38550, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.439052 }, { "epoch": 1.6518144038387388, "grad_norm": 6.809323310852051, "learning_rate": 7.54029909494701e-05, "loss": 2.7003225326538085, "memory(GiB)": 77.56, "step": 38555, "token_acc": 0.39755351681957185, "train_speed(iter/s)": 1.439106 }, { "epoch": 1.6520286191679876, "grad_norm": 4.65964937210083, "learning_rate": 7.539719421924322e-05, "loss": 2.321906852722168, "memory(GiB)": 77.56, "step": 38560, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.439127 }, { "epoch": 1.6522428344972366, "grad_norm": 4.276027202606201, "learning_rate": 7.539139702892235e-05, "loss": 2.284992790222168, "memory(GiB)": 77.56, "step": 38565, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.439097 }, { "epoch": 1.6524570498264857, "grad_norm": 5.731977939605713, "learning_rate": 7.538559937861251e-05, "loss": 2.26531982421875, "memory(GiB)": 77.56, "step": 38570, "token_acc": 0.5230125523012552, "train_speed(iter/s)": 1.4391 }, { "epoch": 1.6526712651557345, "grad_norm": 5.116015434265137, "learning_rate": 7.53798012684187e-05, "loss": 2.5026100158691404, "memory(GiB)": 77.56, "step": 38575, "token_acc": 0.4881656804733728, "train_speed(iter/s)": 1.439141 }, { "epoch": 1.6528854804849835, "grad_norm": 5.174070835113525, "learning_rate": 7.537400269844601e-05, "loss": 2.5735965728759767, "memory(GiB)": 77.56, "step": 38580, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.439061 }, { "epoch": 1.6530996958142326, "grad_norm": 4.859527111053467, "learning_rate": 7.536820366879946e-05, "loss": 2.5687503814697266, "memory(GiB)": 77.56, "step": 38585, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.438995 }, { "epoch": 1.6533139111434814, "grad_norm": 4.956950664520264, "learning_rate": 7.536240417958409e-05, "loss": 2.701076316833496, "memory(GiB)": 77.56, "step": 38590, "token_acc": 0.42724458204334365, "train_speed(iter/s)": 1.43905 }, { "epoch": 1.6535281264727304, "grad_norm": 6.706456184387207, "learning_rate": 7.535660423090498e-05, "loss": 2.4777870178222656, "memory(GiB)": 77.56, "step": 38595, "token_acc": 0.5170940170940171, "train_speed(iter/s)": 1.439057 }, { "epoch": 1.6537423418019794, "grad_norm": 4.761669635772705, "learning_rate": 7.535080382286718e-05, "loss": 2.692028045654297, "memory(GiB)": 77.56, "step": 38600, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 1.43902 }, { "epoch": 1.6539565571312282, "grad_norm": 6.3677520751953125, "learning_rate": 7.534500295557581e-05, "loss": 2.6435422897338867, "memory(GiB)": 77.56, "step": 38605, "token_acc": 0.4470588235294118, "train_speed(iter/s)": 1.438996 }, { "epoch": 1.6541707724604773, "grad_norm": 6.014742374420166, "learning_rate": 7.533920162913592e-05, "loss": 2.396424674987793, "memory(GiB)": 77.56, "step": 38610, "token_acc": 0.5228758169934641, "train_speed(iter/s)": 1.438997 }, { "epoch": 1.6543849877897263, "grad_norm": 5.6689581871032715, "learning_rate": 7.533339984365265e-05, "loss": 2.421131706237793, "memory(GiB)": 77.56, "step": 38615, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.439014 }, { "epoch": 1.6545992031189751, "grad_norm": 4.582613945007324, "learning_rate": 7.532759759923105e-05, "loss": 2.5647411346435547, "memory(GiB)": 77.56, "step": 38620, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.439036 }, { "epoch": 1.6548134184482242, "grad_norm": 5.181107044219971, "learning_rate": 7.532179489597626e-05, "loss": 2.500691223144531, "memory(GiB)": 77.56, "step": 38625, "token_acc": 0.5, "train_speed(iter/s)": 1.439077 }, { "epoch": 1.6550276337774732, "grad_norm": 9.546935081481934, "learning_rate": 7.531599173399342e-05, "loss": 2.7980955123901365, "memory(GiB)": 77.56, "step": 38630, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.439114 }, { "epoch": 1.655241849106722, "grad_norm": 3.835843563079834, "learning_rate": 7.531018811338764e-05, "loss": 2.670639419555664, "memory(GiB)": 77.56, "step": 38635, "token_acc": 0.4862068965517241, "train_speed(iter/s)": 1.439121 }, { "epoch": 1.655456064435971, "grad_norm": 5.509315490722656, "learning_rate": 7.530438403426403e-05, "loss": 2.517391014099121, "memory(GiB)": 77.56, "step": 38640, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.439153 }, { "epoch": 1.65567027976522, "grad_norm": 5.535069942474365, "learning_rate": 7.529857949672778e-05, "loss": 2.7095333099365235, "memory(GiB)": 77.56, "step": 38645, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.439158 }, { "epoch": 1.6558844950944689, "grad_norm": 7.169466495513916, "learning_rate": 7.529277450088405e-05, "loss": 2.417226028442383, "memory(GiB)": 77.56, "step": 38650, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.439155 }, { "epoch": 1.656098710423718, "grad_norm": 4.585759162902832, "learning_rate": 7.528696904683797e-05, "loss": 2.039259910583496, "memory(GiB)": 77.56, "step": 38655, "token_acc": 0.5546218487394958, "train_speed(iter/s)": 1.439169 }, { "epoch": 1.656312925752967, "grad_norm": 4.929565906524658, "learning_rate": 7.528116313469473e-05, "loss": 2.458763313293457, "memory(GiB)": 77.56, "step": 38660, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.439206 }, { "epoch": 1.6565271410822158, "grad_norm": 4.293598651885986, "learning_rate": 7.52753567645595e-05, "loss": 2.4261655807495117, "memory(GiB)": 77.56, "step": 38665, "token_acc": 0.4485049833887043, "train_speed(iter/s)": 1.439233 }, { "epoch": 1.6567413564114648, "grad_norm": 6.021685600280762, "learning_rate": 7.526954993653747e-05, "loss": 2.4622310638427733, "memory(GiB)": 77.56, "step": 38670, "token_acc": 0.4657534246575342, "train_speed(iter/s)": 1.439278 }, { "epoch": 1.6569555717407138, "grad_norm": 3.9898576736450195, "learning_rate": 7.526374265073384e-05, "loss": 2.586587905883789, "memory(GiB)": 77.56, "step": 38675, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.439276 }, { "epoch": 1.6571697870699627, "grad_norm": 4.119765281677246, "learning_rate": 7.525793490725381e-05, "loss": 2.7099660873413085, "memory(GiB)": 77.56, "step": 38680, "token_acc": 0.46397694524495675, "train_speed(iter/s)": 1.439286 }, { "epoch": 1.6573840023992117, "grad_norm": 4.754705429077148, "learning_rate": 7.525212670620261e-05, "loss": 2.5766674041748048, "memory(GiB)": 77.56, "step": 38685, "token_acc": 0.483739837398374, "train_speed(iter/s)": 1.439322 }, { "epoch": 1.6575982177284607, "grad_norm": 4.713914394378662, "learning_rate": 7.524631804768543e-05, "loss": 2.9598012924194337, "memory(GiB)": 77.56, "step": 38690, "token_acc": 0.4463087248322148, "train_speed(iter/s)": 1.439345 }, { "epoch": 1.6578124330577095, "grad_norm": 5.343391418457031, "learning_rate": 7.524050893180752e-05, "loss": 2.514819526672363, "memory(GiB)": 77.56, "step": 38695, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.439354 }, { "epoch": 1.6580266483869586, "grad_norm": 5.777933597564697, "learning_rate": 7.523469935867411e-05, "loss": 2.5260684967041014, "memory(GiB)": 77.56, "step": 38700, "token_acc": 0.42902208201892744, "train_speed(iter/s)": 1.439348 }, { "epoch": 1.6582408637162076, "grad_norm": 4.930129051208496, "learning_rate": 7.522888932839045e-05, "loss": 2.4257545471191406, "memory(GiB)": 77.56, "step": 38705, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.439353 }, { "epoch": 1.6584550790454564, "grad_norm": 4.957555770874023, "learning_rate": 7.52230788410618e-05, "loss": 2.6519437789916993, "memory(GiB)": 77.56, "step": 38710, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.439384 }, { "epoch": 1.6586692943747054, "grad_norm": 5.4925007820129395, "learning_rate": 7.52172678967934e-05, "loss": 2.5465240478515625, "memory(GiB)": 77.56, "step": 38715, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.439379 }, { "epoch": 1.6588835097039545, "grad_norm": 5.156786918640137, "learning_rate": 7.521145649569054e-05, "loss": 2.633748435974121, "memory(GiB)": 77.56, "step": 38720, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.43942 }, { "epoch": 1.6590977250332033, "grad_norm": 6.931817531585693, "learning_rate": 7.520564463785851e-05, "loss": 2.6720380783081055, "memory(GiB)": 77.56, "step": 38725, "token_acc": 0.4280701754385965, "train_speed(iter/s)": 1.439383 }, { "epoch": 1.6593119403624523, "grad_norm": 5.221216201782227, "learning_rate": 7.519983232340258e-05, "loss": 2.3263933181762697, "memory(GiB)": 77.56, "step": 38730, "token_acc": 0.49429657794676807, "train_speed(iter/s)": 1.439357 }, { "epoch": 1.6595261556917014, "grad_norm": 6.273737907409668, "learning_rate": 7.519401955242803e-05, "loss": 2.4995725631713865, "memory(GiB)": 77.56, "step": 38735, "token_acc": 0.4978165938864629, "train_speed(iter/s)": 1.43939 }, { "epoch": 1.6597403710209502, "grad_norm": 5.082566261291504, "learning_rate": 7.518820632504021e-05, "loss": 2.552480125427246, "memory(GiB)": 77.56, "step": 38740, "token_acc": 0.4604904632152589, "train_speed(iter/s)": 1.439362 }, { "epoch": 1.6599545863501992, "grad_norm": 4.399657726287842, "learning_rate": 7.518239264134439e-05, "loss": 2.3195480346679687, "memory(GiB)": 77.56, "step": 38745, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.439407 }, { "epoch": 1.6601688016794482, "grad_norm": 5.266880989074707, "learning_rate": 7.51765785014459e-05, "loss": 2.695078468322754, "memory(GiB)": 77.56, "step": 38750, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.439442 }, { "epoch": 1.660383017008697, "grad_norm": 7.205869674682617, "learning_rate": 7.517076390545007e-05, "loss": 2.627971076965332, "memory(GiB)": 77.56, "step": 38755, "token_acc": 0.47604790419161674, "train_speed(iter/s)": 1.439456 }, { "epoch": 1.660597232337946, "grad_norm": 5.870373725891113, "learning_rate": 7.516494885346223e-05, "loss": 2.6181596755981444, "memory(GiB)": 77.56, "step": 38760, "token_acc": 0.44404332129963897, "train_speed(iter/s)": 1.439481 }, { "epoch": 1.6608114476671951, "grad_norm": 5.468871116638184, "learning_rate": 7.515913334558778e-05, "loss": 2.3515453338623047, "memory(GiB)": 77.56, "step": 38765, "token_acc": 0.49812734082397003, "train_speed(iter/s)": 1.43949 }, { "epoch": 1.661025662996444, "grad_norm": 4.76951789855957, "learning_rate": 7.515331738193199e-05, "loss": 2.6942441940307615, "memory(GiB)": 77.56, "step": 38770, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.439486 }, { "epoch": 1.661239878325693, "grad_norm": 6.500946521759033, "learning_rate": 7.514750096260026e-05, "loss": 2.636835479736328, "memory(GiB)": 77.56, "step": 38775, "token_acc": 0.4430379746835443, "train_speed(iter/s)": 1.439481 }, { "epoch": 1.661454093654942, "grad_norm": 6.966447830200195, "learning_rate": 7.514168408769798e-05, "loss": 2.415618896484375, "memory(GiB)": 77.56, "step": 38780, "token_acc": 0.4653179190751445, "train_speed(iter/s)": 1.439532 }, { "epoch": 1.6616683089841908, "grad_norm": 4.5493083000183105, "learning_rate": 7.513586675733049e-05, "loss": 2.31470947265625, "memory(GiB)": 77.56, "step": 38785, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.439529 }, { "epoch": 1.6618825243134399, "grad_norm": 4.842322826385498, "learning_rate": 7.51300489716032e-05, "loss": 2.6461009979248047, "memory(GiB)": 77.56, "step": 38790, "token_acc": 0.4358108108108108, "train_speed(iter/s)": 1.439518 }, { "epoch": 1.662096739642689, "grad_norm": 5.590895175933838, "learning_rate": 7.51242307306215e-05, "loss": 2.746798515319824, "memory(GiB)": 77.56, "step": 38795, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.439568 }, { "epoch": 1.6623109549719377, "grad_norm": 5.710389137268066, "learning_rate": 7.511841203449079e-05, "loss": 2.606972885131836, "memory(GiB)": 77.56, "step": 38800, "token_acc": 0.46715328467153283, "train_speed(iter/s)": 1.439585 }, { "epoch": 1.6625251703011867, "grad_norm": 5.101905345916748, "learning_rate": 7.511259288331649e-05, "loss": 2.4387020111083983, "memory(GiB)": 77.56, "step": 38805, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.439534 }, { "epoch": 1.6627393856304358, "grad_norm": 6.93477725982666, "learning_rate": 7.510677327720401e-05, "loss": 2.5429611206054688, "memory(GiB)": 77.56, "step": 38810, "token_acc": 0.44666666666666666, "train_speed(iter/s)": 1.439528 }, { "epoch": 1.6629536009596846, "grad_norm": 4.492660999298096, "learning_rate": 7.510095321625878e-05, "loss": 2.3240121841430663, "memory(GiB)": 77.56, "step": 38815, "token_acc": 0.4979919678714859, "train_speed(iter/s)": 1.439546 }, { "epoch": 1.6631678162889336, "grad_norm": 3.215885639190674, "learning_rate": 7.509513270058624e-05, "loss": 2.4668886184692385, "memory(GiB)": 77.56, "step": 38820, "token_acc": 0.5018181818181818, "train_speed(iter/s)": 1.439583 }, { "epoch": 1.6633820316181827, "grad_norm": 4.944400787353516, "learning_rate": 7.50893117302918e-05, "loss": 2.5129093170166015, "memory(GiB)": 77.56, "step": 38825, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.439581 }, { "epoch": 1.6635962469474315, "grad_norm": 6.282962322235107, "learning_rate": 7.508349030548099e-05, "loss": 2.451171875, "memory(GiB)": 77.56, "step": 38830, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.439536 }, { "epoch": 1.6638104622766805, "grad_norm": 7.855851650238037, "learning_rate": 7.507766842625918e-05, "loss": 2.6627079010009767, "memory(GiB)": 77.56, "step": 38835, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.439547 }, { "epoch": 1.6640246776059295, "grad_norm": 5.464412689208984, "learning_rate": 7.50718460927319e-05, "loss": 2.3441877365112305, "memory(GiB)": 77.56, "step": 38840, "token_acc": 0.4919093851132686, "train_speed(iter/s)": 1.439553 }, { "epoch": 1.6642388929351783, "grad_norm": 4.975652694702148, "learning_rate": 7.506602330500462e-05, "loss": 2.6074970245361326, "memory(GiB)": 77.56, "step": 38845, "token_acc": 0.44200626959247646, "train_speed(iter/s)": 1.439609 }, { "epoch": 1.6644531082644274, "grad_norm": 5.119026184082031, "learning_rate": 7.506020006318279e-05, "loss": 2.2321935653686524, "memory(GiB)": 77.56, "step": 38850, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.439597 }, { "epoch": 1.6646673235936764, "grad_norm": 4.505013465881348, "learning_rate": 7.505437636737196e-05, "loss": 2.5621501922607424, "memory(GiB)": 77.56, "step": 38855, "token_acc": 0.4662576687116564, "train_speed(iter/s)": 1.439573 }, { "epoch": 1.6648815389229252, "grad_norm": 6.21051025390625, "learning_rate": 7.504855221767757e-05, "loss": 2.8910472869873045, "memory(GiB)": 77.56, "step": 38860, "token_acc": 0.41379310344827586, "train_speed(iter/s)": 1.439576 }, { "epoch": 1.6650957542521743, "grad_norm": 5.030711650848389, "learning_rate": 7.504272761420517e-05, "loss": 2.554920768737793, "memory(GiB)": 77.56, "step": 38865, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.439508 }, { "epoch": 1.6653099695814233, "grad_norm": 3.88623046875, "learning_rate": 7.503690255706026e-05, "loss": 2.198586082458496, "memory(GiB)": 77.56, "step": 38870, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 1.439468 }, { "epoch": 1.6655241849106721, "grad_norm": 4.617269515991211, "learning_rate": 7.503107704634838e-05, "loss": 2.6439130783081053, "memory(GiB)": 77.56, "step": 38875, "token_acc": 0.4575342465753425, "train_speed(iter/s)": 1.439458 }, { "epoch": 1.6657384002399211, "grad_norm": 6.865589618682861, "learning_rate": 7.502525108217506e-05, "loss": 2.1496143341064453, "memory(GiB)": 77.56, "step": 38880, "token_acc": 0.5587044534412956, "train_speed(iter/s)": 1.439486 }, { "epoch": 1.6659526155691702, "grad_norm": 5.5975022315979, "learning_rate": 7.501942466464584e-05, "loss": 2.6099880218505858, "memory(GiB)": 77.56, "step": 38885, "token_acc": 0.4529616724738676, "train_speed(iter/s)": 1.439506 }, { "epoch": 1.666166830898419, "grad_norm": 4.411645412445068, "learning_rate": 7.501359779386627e-05, "loss": 2.645598602294922, "memory(GiB)": 77.56, "step": 38890, "token_acc": 0.44805194805194803, "train_speed(iter/s)": 1.43954 }, { "epoch": 1.666381046227668, "grad_norm": 3.435889482498169, "learning_rate": 7.500777046994192e-05, "loss": 2.1572608947753906, "memory(GiB)": 77.56, "step": 38895, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.439553 }, { "epoch": 1.666595261556917, "grad_norm": 5.93528413772583, "learning_rate": 7.500194269297833e-05, "loss": 2.5333736419677733, "memory(GiB)": 77.56, "step": 38900, "token_acc": 0.494949494949495, "train_speed(iter/s)": 1.43958 }, { "epoch": 1.6668094768861659, "grad_norm": 4.217938423156738, "learning_rate": 7.49961144630811e-05, "loss": 2.4473541259765623, "memory(GiB)": 77.56, "step": 38905, "token_acc": 0.5075075075075075, "train_speed(iter/s)": 1.439609 }, { "epoch": 1.667023692215415, "grad_norm": 4.780902862548828, "learning_rate": 7.499028578035581e-05, "loss": 2.6170782089233398, "memory(GiB)": 77.56, "step": 38910, "token_acc": 0.4169014084507042, "train_speed(iter/s)": 1.439605 }, { "epoch": 1.667237907544664, "grad_norm": 6.853539943695068, "learning_rate": 7.498445664490807e-05, "loss": 2.4274330139160156, "memory(GiB)": 77.56, "step": 38915, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.439635 }, { "epoch": 1.6674521228739128, "grad_norm": 4.190219879150391, "learning_rate": 7.497862705684345e-05, "loss": 2.6190475463867187, "memory(GiB)": 77.56, "step": 38920, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.439677 }, { "epoch": 1.6676663382031618, "grad_norm": 7.996579170227051, "learning_rate": 7.497279701626756e-05, "loss": 2.4458694458007812, "memory(GiB)": 77.56, "step": 38925, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.439673 }, { "epoch": 1.6678805535324108, "grad_norm": 4.658965110778809, "learning_rate": 7.496696652328603e-05, "loss": 2.435190200805664, "memory(GiB)": 77.56, "step": 38930, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 1.439674 }, { "epoch": 1.6680947688616596, "grad_norm": 4.744532108306885, "learning_rate": 7.496113557800446e-05, "loss": 2.380868148803711, "memory(GiB)": 77.56, "step": 38935, "token_acc": 0.5230263157894737, "train_speed(iter/s)": 1.439667 }, { "epoch": 1.6683089841909087, "grad_norm": 4.757541179656982, "learning_rate": 7.495530418052855e-05, "loss": 2.550774574279785, "memory(GiB)": 77.56, "step": 38940, "token_acc": 0.46, "train_speed(iter/s)": 1.439651 }, { "epoch": 1.6685231995201577, "grad_norm": 6.012129783630371, "learning_rate": 7.494947233096385e-05, "loss": 2.455175018310547, "memory(GiB)": 77.56, "step": 38945, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.439648 }, { "epoch": 1.6687374148494065, "grad_norm": 4.727448463439941, "learning_rate": 7.494364002941608e-05, "loss": 2.6159389495849608, "memory(GiB)": 77.56, "step": 38950, "token_acc": 0.4613003095975232, "train_speed(iter/s)": 1.439674 }, { "epoch": 1.6689516301786556, "grad_norm": 4.4705400466918945, "learning_rate": 7.493780727599086e-05, "loss": 2.592789649963379, "memory(GiB)": 77.56, "step": 38955, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.439693 }, { "epoch": 1.6691658455079046, "grad_norm": 5.9246673583984375, "learning_rate": 7.493197407079389e-05, "loss": 2.5767799377441407, "memory(GiB)": 77.56, "step": 38960, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.439669 }, { "epoch": 1.6693800608371534, "grad_norm": 5.711269378662109, "learning_rate": 7.492614041393079e-05, "loss": 2.86291561126709, "memory(GiB)": 77.56, "step": 38965, "token_acc": 0.45, "train_speed(iter/s)": 1.439595 }, { "epoch": 1.6695942761664024, "grad_norm": 8.412878036499023, "learning_rate": 7.492030630550728e-05, "loss": 2.722490119934082, "memory(GiB)": 77.56, "step": 38970, "token_acc": 0.45980707395498394, "train_speed(iter/s)": 1.439604 }, { "epoch": 1.6698084914956515, "grad_norm": 5.96693754196167, "learning_rate": 7.491447174562906e-05, "loss": 2.6551231384277343, "memory(GiB)": 77.56, "step": 38975, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.439614 }, { "epoch": 1.6700227068249003, "grad_norm": 4.954509735107422, "learning_rate": 7.49086367344018e-05, "loss": 2.6989574432373047, "memory(GiB)": 77.56, "step": 38980, "token_acc": 0.4653846153846154, "train_speed(iter/s)": 1.439582 }, { "epoch": 1.6702369221541493, "grad_norm": 5.199118614196777, "learning_rate": 7.490280127193122e-05, "loss": 2.872369575500488, "memory(GiB)": 77.56, "step": 38985, "token_acc": 0.4310850439882698, "train_speed(iter/s)": 1.439565 }, { "epoch": 1.6704511374833984, "grad_norm": 3.8215460777282715, "learning_rate": 7.489696535832305e-05, "loss": 2.401051902770996, "memory(GiB)": 77.56, "step": 38990, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.439551 }, { "epoch": 1.6706653528126472, "grad_norm": 3.881770372390747, "learning_rate": 7.489112899368298e-05, "loss": 2.625141906738281, "memory(GiB)": 77.56, "step": 38995, "token_acc": 0.45592705167173253, "train_speed(iter/s)": 1.439566 }, { "epoch": 1.6708795681418962, "grad_norm": 3.871721029281616, "learning_rate": 7.488529217811676e-05, "loss": 2.5486787796020507, "memory(GiB)": 77.56, "step": 39000, "token_acc": 0.44545454545454544, "train_speed(iter/s)": 1.439577 }, { "epoch": 1.6708795681418962, "eval_loss": 2.2289645671844482, "eval_runtime": 14.8591, "eval_samples_per_second": 6.73, "eval_steps_per_second": 6.73, "eval_token_acc": 0.47736093143596375, "step": 39000 }, { "epoch": 1.6710937834711452, "grad_norm": 4.947300910949707, "learning_rate": 7.487945491173012e-05, "loss": 2.4421831130981446, "memory(GiB)": 77.56, "step": 39005, "token_acc": 0.4818702290076336, "train_speed(iter/s)": 1.438725 }, { "epoch": 1.671307998800394, "grad_norm": 4.8534393310546875, "learning_rate": 7.487361719462883e-05, "loss": 2.932672882080078, "memory(GiB)": 77.56, "step": 39010, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.43875 }, { "epoch": 1.671522214129643, "grad_norm": 5.711973190307617, "learning_rate": 7.486777902691864e-05, "loss": 2.5695960998535154, "memory(GiB)": 77.56, "step": 39015, "token_acc": 0.46932515337423314, "train_speed(iter/s)": 1.438754 }, { "epoch": 1.6717364294588921, "grad_norm": 4.816310405731201, "learning_rate": 7.48619404087053e-05, "loss": 2.316717338562012, "memory(GiB)": 77.56, "step": 39020, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.43876 }, { "epoch": 1.671950644788141, "grad_norm": 4.691830635070801, "learning_rate": 7.485610134009458e-05, "loss": 2.816037178039551, "memory(GiB)": 77.56, "step": 39025, "token_acc": 0.4620938628158845, "train_speed(iter/s)": 1.438812 }, { "epoch": 1.67216486011739, "grad_norm": 5.378674507141113, "learning_rate": 7.485026182119225e-05, "loss": 2.6296865463256838, "memory(GiB)": 77.56, "step": 39030, "token_acc": 0.45857988165680474, "train_speed(iter/s)": 1.438815 }, { "epoch": 1.672379075446639, "grad_norm": 4.636214256286621, "learning_rate": 7.484442185210414e-05, "loss": 2.6497900009155275, "memory(GiB)": 77.56, "step": 39035, "token_acc": 0.42244224422442245, "train_speed(iter/s)": 1.438835 }, { "epoch": 1.6725932907758878, "grad_norm": 4.445847511291504, "learning_rate": 7.483858143293602e-05, "loss": 2.561076354980469, "memory(GiB)": 77.56, "step": 39040, "token_acc": 0.479108635097493, "train_speed(iter/s)": 1.438861 }, { "epoch": 1.6728075061051368, "grad_norm": 5.082719326019287, "learning_rate": 7.483274056379368e-05, "loss": 2.3797388076782227, "memory(GiB)": 77.56, "step": 39045, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.438885 }, { "epoch": 1.6730217214343859, "grad_norm": 7.136971473693848, "learning_rate": 7.482689924478297e-05, "loss": 2.4964004516601563, "memory(GiB)": 77.56, "step": 39050, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 1.438901 }, { "epoch": 1.6732359367636347, "grad_norm": 5.671760559082031, "learning_rate": 7.482105747600968e-05, "loss": 2.2216861724853514, "memory(GiB)": 77.56, "step": 39055, "token_acc": 0.49624060150375937, "train_speed(iter/s)": 1.438887 }, { "epoch": 1.6734501520928837, "grad_norm": 5.063310146331787, "learning_rate": 7.481521525757966e-05, "loss": 2.498670768737793, "memory(GiB)": 77.56, "step": 39060, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.438897 }, { "epoch": 1.6736643674221328, "grad_norm": 4.345708847045898, "learning_rate": 7.480937258959872e-05, "loss": 2.5834796905517576, "memory(GiB)": 77.56, "step": 39065, "token_acc": 0.4495677233429395, "train_speed(iter/s)": 1.438895 }, { "epoch": 1.6738785827513816, "grad_norm": 4.285645484924316, "learning_rate": 7.480352947217274e-05, "loss": 2.7373275756835938, "memory(GiB)": 77.56, "step": 39070, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.438887 }, { "epoch": 1.6740927980806306, "grad_norm": 4.04364538192749, "learning_rate": 7.479768590540755e-05, "loss": 2.2201259613037108, "memory(GiB)": 77.56, "step": 39075, "token_acc": 0.5423728813559322, "train_speed(iter/s)": 1.438871 }, { "epoch": 1.6743070134098796, "grad_norm": 5.461197376251221, "learning_rate": 7.479184188940901e-05, "loss": 2.398779106140137, "memory(GiB)": 77.56, "step": 39080, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.438845 }, { "epoch": 1.6745212287391285, "grad_norm": 5.962795734405518, "learning_rate": 7.4785997424283e-05, "loss": 2.826894760131836, "memory(GiB)": 77.56, "step": 39085, "token_acc": 0.43312101910828027, "train_speed(iter/s)": 1.438841 }, { "epoch": 1.6747354440683775, "grad_norm": 5.444334983825684, "learning_rate": 7.47801525101354e-05, "loss": 2.699898910522461, "memory(GiB)": 77.56, "step": 39090, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.438875 }, { "epoch": 1.6749496593976265, "grad_norm": 5.573398590087891, "learning_rate": 7.477430714707208e-05, "loss": 2.547859764099121, "memory(GiB)": 77.56, "step": 39095, "token_acc": 0.4899598393574297, "train_speed(iter/s)": 1.43886 }, { "epoch": 1.6751638747268753, "grad_norm": 6.221633434295654, "learning_rate": 7.476846133519896e-05, "loss": 2.221527671813965, "memory(GiB)": 77.56, "step": 39100, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.438907 }, { "epoch": 1.6753780900561244, "grad_norm": 5.120943546295166, "learning_rate": 7.476261507462194e-05, "loss": 2.5731740951538087, "memory(GiB)": 77.56, "step": 39105, "token_acc": 0.4377224199288256, "train_speed(iter/s)": 1.438932 }, { "epoch": 1.6755923053853734, "grad_norm": 4.545486927032471, "learning_rate": 7.47567683654469e-05, "loss": 2.459206390380859, "memory(GiB)": 77.56, "step": 39110, "token_acc": 0.5016949152542373, "train_speed(iter/s)": 1.43892 }, { "epoch": 1.6758065207146222, "grad_norm": 4.305181503295898, "learning_rate": 7.475092120777978e-05, "loss": 2.2530704498291017, "memory(GiB)": 77.56, "step": 39115, "token_acc": 0.5, "train_speed(iter/s)": 1.438917 }, { "epoch": 1.6760207360438713, "grad_norm": 5.2314910888671875, "learning_rate": 7.47450736017265e-05, "loss": 2.3604949951171874, "memory(GiB)": 77.56, "step": 39120, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.438889 }, { "epoch": 1.6762349513731203, "grad_norm": 5.725355625152588, "learning_rate": 7.4739225547393e-05, "loss": 2.1574092864990235, "memory(GiB)": 77.56, "step": 39125, "token_acc": 0.5255474452554745, "train_speed(iter/s)": 1.4389 }, { "epoch": 1.676449166702369, "grad_norm": 6.635068416595459, "learning_rate": 7.473337704488523e-05, "loss": 2.669292449951172, "memory(GiB)": 77.56, "step": 39130, "token_acc": 0.47696476964769646, "train_speed(iter/s)": 1.438859 }, { "epoch": 1.6766633820316181, "grad_norm": 5.123763561248779, "learning_rate": 7.472752809430913e-05, "loss": 2.5801183700561525, "memory(GiB)": 77.56, "step": 39135, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.438833 }, { "epoch": 1.6768775973608672, "grad_norm": 5.092013835906982, "learning_rate": 7.472167869577066e-05, "loss": 2.5641399383544923, "memory(GiB)": 77.56, "step": 39140, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.43881 }, { "epoch": 1.677091812690116, "grad_norm": 5.053287506103516, "learning_rate": 7.471582884937579e-05, "loss": 2.685615348815918, "memory(GiB)": 77.56, "step": 39145, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.438857 }, { "epoch": 1.677306028019365, "grad_norm": 4.772220611572266, "learning_rate": 7.470997855523049e-05, "loss": 2.6306638717651367, "memory(GiB)": 77.56, "step": 39150, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.438861 }, { "epoch": 1.677520243348614, "grad_norm": 6.864222049713135, "learning_rate": 7.470412781344075e-05, "loss": 3.0099872589111327, "memory(GiB)": 77.56, "step": 39155, "token_acc": 0.4139072847682119, "train_speed(iter/s)": 1.438888 }, { "epoch": 1.6777344586778629, "grad_norm": 4.832400321960449, "learning_rate": 7.469827662411257e-05, "loss": 2.3876165390014648, "memory(GiB)": 77.56, "step": 39160, "token_acc": 0.483271375464684, "train_speed(iter/s)": 1.438897 }, { "epoch": 1.677948674007112, "grad_norm": 4.6121745109558105, "learning_rate": 7.469242498735193e-05, "loss": 2.6444320678710938, "memory(GiB)": 77.56, "step": 39165, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.43893 }, { "epoch": 1.678162889336361, "grad_norm": 6.902131080627441, "learning_rate": 7.468657290326486e-05, "loss": 2.5464151382446287, "memory(GiB)": 77.56, "step": 39170, "token_acc": 0.47115384615384615, "train_speed(iter/s)": 1.438877 }, { "epoch": 1.6783771046656097, "grad_norm": 5.684139728546143, "learning_rate": 7.468072037195735e-05, "loss": 2.4755186080932616, "memory(GiB)": 77.56, "step": 39175, "token_acc": 0.48606811145510836, "train_speed(iter/s)": 1.438901 }, { "epoch": 1.6785913199948588, "grad_norm": 6.832084655761719, "learning_rate": 7.467486739353545e-05, "loss": 2.6017223358154298, "memory(GiB)": 77.56, "step": 39180, "token_acc": 0.4429065743944637, "train_speed(iter/s)": 1.438948 }, { "epoch": 1.6788055353241078, "grad_norm": 5.121083736419678, "learning_rate": 7.466901396810517e-05, "loss": 2.4421270370483397, "memory(GiB)": 77.56, "step": 39185, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.438935 }, { "epoch": 1.6790197506533566, "grad_norm": 4.9516119956970215, "learning_rate": 7.466316009577258e-05, "loss": 2.643556594848633, "memory(GiB)": 77.56, "step": 39190, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.438908 }, { "epoch": 1.6792339659826059, "grad_norm": 7.260333061218262, "learning_rate": 7.465730577664368e-05, "loss": 2.1878664016723635, "memory(GiB)": 77.56, "step": 39195, "token_acc": 0.5608856088560885, "train_speed(iter/s)": 1.438898 }, { "epoch": 1.6794481813118547, "grad_norm": 4.869746685028076, "learning_rate": 7.465145101082458e-05, "loss": 2.7983097076416015, "memory(GiB)": 77.56, "step": 39200, "token_acc": 0.42955326460481097, "train_speed(iter/s)": 1.438926 }, { "epoch": 1.6796623966411035, "grad_norm": 6.569705963134766, "learning_rate": 7.464559579842132e-05, "loss": 2.4519372940063477, "memory(GiB)": 77.56, "step": 39205, "token_acc": 0.4669260700389105, "train_speed(iter/s)": 1.438975 }, { "epoch": 1.6798766119703528, "grad_norm": 4.4712419509887695, "learning_rate": 7.463974013953995e-05, "loss": 2.294918441772461, "memory(GiB)": 77.56, "step": 39210, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.438927 }, { "epoch": 1.6800908272996016, "grad_norm": 4.814571380615234, "learning_rate": 7.463388403428659e-05, "loss": 2.29299259185791, "memory(GiB)": 77.56, "step": 39215, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.438916 }, { "epoch": 1.6803050426288504, "grad_norm": 4.6567606925964355, "learning_rate": 7.46280274827673e-05, "loss": 2.447309875488281, "memory(GiB)": 77.56, "step": 39220, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.438955 }, { "epoch": 1.6805192579580996, "grad_norm": 3.4669249057769775, "learning_rate": 7.46221704850882e-05, "loss": 2.331422805786133, "memory(GiB)": 77.56, "step": 39225, "token_acc": 0.46439628482972134, "train_speed(iter/s)": 1.438946 }, { "epoch": 1.6807334732873485, "grad_norm": 5.413876056671143, "learning_rate": 7.461631304135538e-05, "loss": 2.4734737396240236, "memory(GiB)": 77.56, "step": 39230, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.438957 }, { "epoch": 1.6809476886165973, "grad_norm": 4.175632953643799, "learning_rate": 7.461045515167497e-05, "loss": 2.620469856262207, "memory(GiB)": 77.56, "step": 39235, "token_acc": 0.4508670520231214, "train_speed(iter/s)": 1.438989 }, { "epoch": 1.6811619039458465, "grad_norm": 5.209411144256592, "learning_rate": 7.460459681615305e-05, "loss": 2.537337875366211, "memory(GiB)": 77.56, "step": 39240, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.438999 }, { "epoch": 1.6813761192750953, "grad_norm": 6.1368327140808105, "learning_rate": 7.45987380348958e-05, "loss": 2.5748977661132812, "memory(GiB)": 77.56, "step": 39245, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.438999 }, { "epoch": 1.6815903346043442, "grad_norm": 5.22243070602417, "learning_rate": 7.459287880800933e-05, "loss": 2.320934295654297, "memory(GiB)": 77.56, "step": 39250, "token_acc": 0.4855072463768116, "train_speed(iter/s)": 1.438998 }, { "epoch": 1.6818045499335934, "grad_norm": 6.507482051849365, "learning_rate": 7.458701913559978e-05, "loss": 2.7442821502685546, "memory(GiB)": 77.56, "step": 39255, "token_acc": 0.4498567335243553, "train_speed(iter/s)": 1.439028 }, { "epoch": 1.6820187652628422, "grad_norm": 5.577680587768555, "learning_rate": 7.458115901777334e-05, "loss": 2.246876907348633, "memory(GiB)": 77.56, "step": 39260, "token_acc": 0.5259259259259259, "train_speed(iter/s)": 1.43905 }, { "epoch": 1.682232980592091, "grad_norm": 5.186855792999268, "learning_rate": 7.457529845463612e-05, "loss": 2.264640045166016, "memory(GiB)": 77.56, "step": 39265, "token_acc": 0.4876325088339223, "train_speed(iter/s)": 1.439095 }, { "epoch": 1.6824471959213403, "grad_norm": 8.413525581359863, "learning_rate": 7.456943744629433e-05, "loss": 2.429824447631836, "memory(GiB)": 77.56, "step": 39270, "token_acc": 0.527972027972028, "train_speed(iter/s)": 1.439138 }, { "epoch": 1.682661411250589, "grad_norm": 4.4329142570495605, "learning_rate": 7.456357599285413e-05, "loss": 2.4337596893310547, "memory(GiB)": 77.56, "step": 39275, "token_acc": 0.515527950310559, "train_speed(iter/s)": 1.439193 }, { "epoch": 1.682875626579838, "grad_norm": 5.147960186004639, "learning_rate": 7.455771409442171e-05, "loss": 2.7660747528076173, "memory(GiB)": 77.56, "step": 39280, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.439236 }, { "epoch": 1.6830898419090872, "grad_norm": 4.473089218139648, "learning_rate": 7.455185175110325e-05, "loss": 2.6357538223266603, "memory(GiB)": 77.56, "step": 39285, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.439216 }, { "epoch": 1.683304057238336, "grad_norm": 4.885560035705566, "learning_rate": 7.454598896300498e-05, "loss": 2.568756103515625, "memory(GiB)": 77.56, "step": 39290, "token_acc": 0.43613707165109034, "train_speed(iter/s)": 1.439266 }, { "epoch": 1.6835182725675848, "grad_norm": 4.351996421813965, "learning_rate": 7.454012573023308e-05, "loss": 2.9108325958251955, "memory(GiB)": 77.56, "step": 39295, "token_acc": 0.44, "train_speed(iter/s)": 1.439277 }, { "epoch": 1.683732487896834, "grad_norm": 4.397448539733887, "learning_rate": 7.453426205289379e-05, "loss": 2.2326358795166015, "memory(GiB)": 77.56, "step": 39300, "token_acc": 0.5326460481099656, "train_speed(iter/s)": 1.439266 }, { "epoch": 1.6839467032260829, "grad_norm": 4.59770393371582, "learning_rate": 7.452839793109332e-05, "loss": 2.5102912902832033, "memory(GiB)": 77.56, "step": 39305, "token_acc": 0.47648902821316613, "train_speed(iter/s)": 1.439299 }, { "epoch": 1.6841609185553317, "grad_norm": 5.023507118225098, "learning_rate": 7.452253336493791e-05, "loss": 2.489552688598633, "memory(GiB)": 77.56, "step": 39310, "token_acc": 0.45738636363636365, "train_speed(iter/s)": 1.43933 }, { "epoch": 1.684375133884581, "grad_norm": 4.828036785125732, "learning_rate": 7.451666835453382e-05, "loss": 2.7237260818481444, "memory(GiB)": 77.56, "step": 39315, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.439292 }, { "epoch": 1.6845893492138297, "grad_norm": 4.740131378173828, "learning_rate": 7.451080289998729e-05, "loss": 2.5221656799316405, "memory(GiB)": 77.56, "step": 39320, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.439307 }, { "epoch": 1.6848035645430786, "grad_norm": 5.324163436889648, "learning_rate": 7.450493700140454e-05, "loss": 2.758205604553223, "memory(GiB)": 77.56, "step": 39325, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.439352 }, { "epoch": 1.6850177798723278, "grad_norm": 5.364895343780518, "learning_rate": 7.44990706588919e-05, "loss": 2.2419387817382814, "memory(GiB)": 77.56, "step": 39330, "token_acc": 0.5043859649122807, "train_speed(iter/s)": 1.439385 }, { "epoch": 1.6852319952015766, "grad_norm": 4.1193060874938965, "learning_rate": 7.44932038725556e-05, "loss": 2.604151153564453, "memory(GiB)": 77.56, "step": 39335, "token_acc": 0.46264367816091956, "train_speed(iter/s)": 1.43941 }, { "epoch": 1.6854462105308254, "grad_norm": 5.7761640548706055, "learning_rate": 7.448733664250191e-05, "loss": 2.223978805541992, "memory(GiB)": 77.56, "step": 39340, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.439443 }, { "epoch": 1.6856604258600747, "grad_norm": 5.1349287033081055, "learning_rate": 7.448146896883719e-05, "loss": 2.589311218261719, "memory(GiB)": 77.56, "step": 39345, "token_acc": 0.4401294498381877, "train_speed(iter/s)": 1.439478 }, { "epoch": 1.6858746411893235, "grad_norm": 4.3070244789123535, "learning_rate": 7.447560085166768e-05, "loss": 2.571135711669922, "memory(GiB)": 77.56, "step": 39350, "token_acc": 0.5, "train_speed(iter/s)": 1.439477 }, { "epoch": 1.6860888565185723, "grad_norm": 5.659824848175049, "learning_rate": 7.446973229109969e-05, "loss": 2.485415458679199, "memory(GiB)": 77.56, "step": 39355, "token_acc": 0.5041322314049587, "train_speed(iter/s)": 1.439532 }, { "epoch": 1.6863030718478216, "grad_norm": 5.9419708251953125, "learning_rate": 7.446386328723955e-05, "loss": 2.428858757019043, "memory(GiB)": 77.56, "step": 39360, "token_acc": 0.48220064724919093, "train_speed(iter/s)": 1.439542 }, { "epoch": 1.6865172871770704, "grad_norm": 5.576047897338867, "learning_rate": 7.445799384019358e-05, "loss": 2.317838668823242, "memory(GiB)": 77.56, "step": 39365, "token_acc": 0.5228070175438596, "train_speed(iter/s)": 1.439578 }, { "epoch": 1.6867315025063192, "grad_norm": 6.139689922332764, "learning_rate": 7.44521239500681e-05, "loss": 2.2850807189941404, "memory(GiB)": 77.56, "step": 39370, "token_acc": 0.5104602510460251, "train_speed(iter/s)": 1.439626 }, { "epoch": 1.6869457178355685, "grad_norm": 5.615159511566162, "learning_rate": 7.444625361696948e-05, "loss": 2.4922197341918944, "memory(GiB)": 77.56, "step": 39375, "token_acc": 0.4858156028368794, "train_speed(iter/s)": 1.439616 }, { "epoch": 1.6871599331648173, "grad_norm": 5.553599834442139, "learning_rate": 7.444038284100401e-05, "loss": 2.4360607147216795, "memory(GiB)": 77.56, "step": 39380, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.439638 }, { "epoch": 1.687374148494066, "grad_norm": 5.104459762573242, "learning_rate": 7.44345116222781e-05, "loss": 2.6728410720825195, "memory(GiB)": 77.56, "step": 39385, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.43966 }, { "epoch": 1.6875883638233153, "grad_norm": 5.416928768157959, "learning_rate": 7.442863996089809e-05, "loss": 2.7254703521728514, "memory(GiB)": 77.56, "step": 39390, "token_acc": 0.48823529411764705, "train_speed(iter/s)": 1.439696 }, { "epoch": 1.6878025791525642, "grad_norm": 6.580688953399658, "learning_rate": 7.442276785697035e-05, "loss": 2.766031837463379, "memory(GiB)": 77.56, "step": 39395, "token_acc": 0.4520547945205479, "train_speed(iter/s)": 1.43972 }, { "epoch": 1.688016794481813, "grad_norm": 4.260105609893799, "learning_rate": 7.441689531060125e-05, "loss": 2.2400495529174806, "memory(GiB)": 77.56, "step": 39400, "token_acc": 0.4895397489539749, "train_speed(iter/s)": 1.439733 }, { "epoch": 1.6882310098110622, "grad_norm": 5.144050121307373, "learning_rate": 7.441102232189721e-05, "loss": 2.6564083099365234, "memory(GiB)": 77.56, "step": 39405, "token_acc": 0.4309859154929577, "train_speed(iter/s)": 1.439759 }, { "epoch": 1.688445225140311, "grad_norm": 4.711074352264404, "learning_rate": 7.440514889096457e-05, "loss": 2.779418182373047, "memory(GiB)": 77.56, "step": 39410, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.439794 }, { "epoch": 1.6886594404695598, "grad_norm": 4.433008193969727, "learning_rate": 7.439927501790978e-05, "loss": 2.5604726791381838, "memory(GiB)": 77.56, "step": 39415, "token_acc": 0.49783549783549785, "train_speed(iter/s)": 1.439789 }, { "epoch": 1.688873655798809, "grad_norm": 4.9867095947265625, "learning_rate": 7.439340070283923e-05, "loss": 2.6040271759033202, "memory(GiB)": 77.56, "step": 39420, "token_acc": 0.4290322580645161, "train_speed(iter/s)": 1.439812 }, { "epoch": 1.689087871128058, "grad_norm": 5.791916847229004, "learning_rate": 7.438752594585935e-05, "loss": 2.4432435989379884, "memory(GiB)": 77.56, "step": 39425, "token_acc": 0.4555984555984556, "train_speed(iter/s)": 1.439826 }, { "epoch": 1.6893020864573067, "grad_norm": 4.592531204223633, "learning_rate": 7.438165074707655e-05, "loss": 2.6076065063476563, "memory(GiB)": 77.56, "step": 39430, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.439866 }, { "epoch": 1.689516301786556, "grad_norm": 5.351303577423096, "learning_rate": 7.437577510659729e-05, "loss": 2.603732872009277, "memory(GiB)": 77.56, "step": 39435, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.439861 }, { "epoch": 1.6897305171158048, "grad_norm": 8.990999221801758, "learning_rate": 7.436989902452798e-05, "loss": 2.7704092025756837, "memory(GiB)": 77.56, "step": 39440, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.439894 }, { "epoch": 1.6899447324450536, "grad_norm": 4.653515815734863, "learning_rate": 7.43640225009751e-05, "loss": 2.4889642715454103, "memory(GiB)": 77.56, "step": 39445, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.439928 }, { "epoch": 1.6901589477743029, "grad_norm": 6.005542278289795, "learning_rate": 7.43581455360451e-05, "loss": 2.432036209106445, "memory(GiB)": 77.56, "step": 39450, "token_acc": 0.45980707395498394, "train_speed(iter/s)": 1.439932 }, { "epoch": 1.6903731631035517, "grad_norm": 4.6243414878845215, "learning_rate": 7.435226812984443e-05, "loss": 2.7409540176391602, "memory(GiB)": 77.56, "step": 39455, "token_acc": 0.4311377245508982, "train_speed(iter/s)": 1.439952 }, { "epoch": 1.6905873784328005, "grad_norm": 4.375554084777832, "learning_rate": 7.434639028247959e-05, "loss": 2.603215217590332, "memory(GiB)": 77.56, "step": 39460, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.439958 }, { "epoch": 1.6908015937620497, "grad_norm": 4.447576522827148, "learning_rate": 7.434051199405705e-05, "loss": 2.4326690673828124, "memory(GiB)": 77.56, "step": 39465, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.439984 }, { "epoch": 1.6910158090912986, "grad_norm": 7.526741981506348, "learning_rate": 7.43346332646833e-05, "loss": 2.82733211517334, "memory(GiB)": 77.56, "step": 39470, "token_acc": 0.44482758620689655, "train_speed(iter/s)": 1.440025 }, { "epoch": 1.6912300244205474, "grad_norm": 6.270843982696533, "learning_rate": 7.432875409446483e-05, "loss": 2.868579864501953, "memory(GiB)": 77.56, "step": 39475, "token_acc": 0.3835125448028674, "train_speed(iter/s)": 1.440028 }, { "epoch": 1.6914442397497966, "grad_norm": 4.88505220413208, "learning_rate": 7.432287448350819e-05, "loss": 2.5317960739135743, "memory(GiB)": 77.56, "step": 39480, "token_acc": 0.46956521739130436, "train_speed(iter/s)": 1.440041 }, { "epoch": 1.6916584550790454, "grad_norm": 5.6399617195129395, "learning_rate": 7.431699443191982e-05, "loss": 2.3505857467651365, "memory(GiB)": 77.56, "step": 39485, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.440067 }, { "epoch": 1.6918726704082943, "grad_norm": 4.655662536621094, "learning_rate": 7.431111393980631e-05, "loss": 2.5938844680786133, "memory(GiB)": 77.56, "step": 39490, "token_acc": 0.5020080321285141, "train_speed(iter/s)": 1.440058 }, { "epoch": 1.6920868857375435, "grad_norm": 4.296602249145508, "learning_rate": 7.430523300727416e-05, "loss": 2.563298225402832, "memory(GiB)": 77.56, "step": 39495, "token_acc": 0.4725609756097561, "train_speed(iter/s)": 1.440086 }, { "epoch": 1.6923011010667923, "grad_norm": 5.97561502456665, "learning_rate": 7.429935163442991e-05, "loss": 2.477747344970703, "memory(GiB)": 77.56, "step": 39500, "token_acc": 0.45255474452554745, "train_speed(iter/s)": 1.440135 }, { "epoch": 1.6923011010667923, "eval_loss": 2.2470016479492188, "eval_runtime": 14.3308, "eval_samples_per_second": 6.978, "eval_steps_per_second": 6.978, "eval_token_acc": 0.46788990825688076, "step": 39500 }, { "epoch": 1.6925153163960411, "grad_norm": 5.674051761627197, "learning_rate": 7.429346982138013e-05, "loss": 2.5348369598388674, "memory(GiB)": 77.56, "step": 39505, "token_acc": 0.47433460076045625, "train_speed(iter/s)": 1.439335 }, { "epoch": 1.6927295317252904, "grad_norm": 4.425677299499512, "learning_rate": 7.428758756823134e-05, "loss": 2.1379732131958007, "memory(GiB)": 77.56, "step": 39510, "token_acc": 0.5115511551155115, "train_speed(iter/s)": 1.439319 }, { "epoch": 1.6929437470545392, "grad_norm": 5.7284464836120605, "learning_rate": 7.42817048750901e-05, "loss": 2.4620323181152344, "memory(GiB)": 77.56, "step": 39515, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.439248 }, { "epoch": 1.693157962383788, "grad_norm": 4.829771041870117, "learning_rate": 7.427582174206303e-05, "loss": 2.7509206771850585, "memory(GiB)": 77.56, "step": 39520, "token_acc": 0.44256756756756754, "train_speed(iter/s)": 1.439248 }, { "epoch": 1.6933721777130373, "grad_norm": 4.4447503089904785, "learning_rate": 7.426993816925665e-05, "loss": 2.5167118072509767, "memory(GiB)": 77.56, "step": 39525, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.439279 }, { "epoch": 1.693586393042286, "grad_norm": 4.798361778259277, "learning_rate": 7.426405415677758e-05, "loss": 2.266404151916504, "memory(GiB)": 77.56, "step": 39530, "token_acc": 0.5246478873239436, "train_speed(iter/s)": 1.439294 }, { "epoch": 1.6938006083715351, "grad_norm": 3.9646053314208984, "learning_rate": 7.425816970473241e-05, "loss": 2.3794809341430665, "memory(GiB)": 77.56, "step": 39535, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.439347 }, { "epoch": 1.6940148237007842, "grad_norm": 4.556328773498535, "learning_rate": 7.425228481322775e-05, "loss": 2.628735160827637, "memory(GiB)": 77.56, "step": 39540, "token_acc": 0.4690265486725664, "train_speed(iter/s)": 1.439364 }, { "epoch": 1.694229039030033, "grad_norm": 4.042719841003418, "learning_rate": 7.424639948237019e-05, "loss": 2.338087463378906, "memory(GiB)": 77.56, "step": 39545, "token_acc": 0.49415204678362573, "train_speed(iter/s)": 1.439369 }, { "epoch": 1.694443254359282, "grad_norm": 4.656288146972656, "learning_rate": 7.424051371226636e-05, "loss": 2.8349714279174805, "memory(GiB)": 77.56, "step": 39550, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.439385 }, { "epoch": 1.694657469688531, "grad_norm": 4.094668388366699, "learning_rate": 7.423462750302289e-05, "loss": 2.197116470336914, "memory(GiB)": 77.56, "step": 39555, "token_acc": 0.5265151515151515, "train_speed(iter/s)": 1.439429 }, { "epoch": 1.6948716850177798, "grad_norm": 5.453276634216309, "learning_rate": 7.42287408547464e-05, "loss": 2.4897510528564455, "memory(GiB)": 77.56, "step": 39560, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.439489 }, { "epoch": 1.6950859003470289, "grad_norm": 4.060357570648193, "learning_rate": 7.422285376754354e-05, "loss": 2.645904541015625, "memory(GiB)": 77.56, "step": 39565, "token_acc": 0.4555160142348754, "train_speed(iter/s)": 1.439545 }, { "epoch": 1.695300115676278, "grad_norm": 3.969604730606079, "learning_rate": 7.421696624152098e-05, "loss": 2.4718791961669924, "memory(GiB)": 77.56, "step": 39570, "token_acc": 0.4935897435897436, "train_speed(iter/s)": 1.439606 }, { "epoch": 1.6955143310055267, "grad_norm": 4.247460842132568, "learning_rate": 7.421107827678533e-05, "loss": 2.7582586288452147, "memory(GiB)": 77.56, "step": 39575, "token_acc": 0.44072948328267475, "train_speed(iter/s)": 1.439621 }, { "epoch": 1.6957285463347758, "grad_norm": 5.384565353393555, "learning_rate": 7.420518987344331e-05, "loss": 2.465864562988281, "memory(GiB)": 77.56, "step": 39580, "token_acc": 0.47639484978540775, "train_speed(iter/s)": 1.439645 }, { "epoch": 1.6959427616640248, "grad_norm": 7.9196367263793945, "learning_rate": 7.419930103160155e-05, "loss": 2.4264804840087892, "memory(GiB)": 77.56, "step": 39585, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.439655 }, { "epoch": 1.6961569769932736, "grad_norm": 5.056312084197998, "learning_rate": 7.419341175136677e-05, "loss": 2.530421829223633, "memory(GiB)": 77.56, "step": 39590, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.439663 }, { "epoch": 1.6963711923225226, "grad_norm": 5.398013114929199, "learning_rate": 7.418752203284563e-05, "loss": 2.5332481384277346, "memory(GiB)": 77.56, "step": 39595, "token_acc": 0.5137254901960784, "train_speed(iter/s)": 1.439694 }, { "epoch": 1.6965854076517717, "grad_norm": 4.751768112182617, "learning_rate": 7.418163187614485e-05, "loss": 2.705938148498535, "memory(GiB)": 77.56, "step": 39600, "token_acc": 0.4028776978417266, "train_speed(iter/s)": 1.439715 }, { "epoch": 1.6967996229810205, "grad_norm": 3.972363233566284, "learning_rate": 7.41757412813711e-05, "loss": 2.5165117263793944, "memory(GiB)": 77.56, "step": 39605, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.439745 }, { "epoch": 1.6970138383102695, "grad_norm": 4.075873851776123, "learning_rate": 7.416985024863115e-05, "loss": 2.6509849548339846, "memory(GiB)": 77.56, "step": 39610, "token_acc": 0.4375, "train_speed(iter/s)": 1.439744 }, { "epoch": 1.6972280536395186, "grad_norm": 5.2881760597229, "learning_rate": 7.416395877803166e-05, "loss": 2.3910858154296877, "memory(GiB)": 77.56, "step": 39615, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.439757 }, { "epoch": 1.6974422689687674, "grad_norm": 4.936835289001465, "learning_rate": 7.415806686967941e-05, "loss": 2.311636734008789, "memory(GiB)": 77.56, "step": 39620, "token_acc": 0.5343511450381679, "train_speed(iter/s)": 1.439776 }, { "epoch": 1.6976564842980164, "grad_norm": 4.937396049499512, "learning_rate": 7.41521745236811e-05, "loss": 2.7444005966186524, "memory(GiB)": 77.56, "step": 39625, "token_acc": 0.4270557029177719, "train_speed(iter/s)": 1.439779 }, { "epoch": 1.6978706996272654, "grad_norm": 6.1698503494262695, "learning_rate": 7.414628174014351e-05, "loss": 3.0836111068725587, "memory(GiB)": 77.56, "step": 39630, "token_acc": 0.4124629080118694, "train_speed(iter/s)": 1.439819 }, { "epoch": 1.6980849149565143, "grad_norm": 5.699939250946045, "learning_rate": 7.414038851917335e-05, "loss": 2.6164871215820313, "memory(GiB)": 77.56, "step": 39635, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.439798 }, { "epoch": 1.6982991302857633, "grad_norm": 5.500759124755859, "learning_rate": 7.413449486087742e-05, "loss": 2.416820526123047, "memory(GiB)": 77.56, "step": 39640, "token_acc": 0.5120967741935484, "train_speed(iter/s)": 1.439776 }, { "epoch": 1.6985133456150123, "grad_norm": 5.812740325927734, "learning_rate": 7.412860076536247e-05, "loss": 2.7059539794921874, "memory(GiB)": 77.56, "step": 39645, "token_acc": 0.4204081632653061, "train_speed(iter/s)": 1.43978 }, { "epoch": 1.6987275609442611, "grad_norm": 5.377533912658691, "learning_rate": 7.41227062327353e-05, "loss": 2.600864601135254, "memory(GiB)": 77.56, "step": 39650, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.439821 }, { "epoch": 1.6989417762735102, "grad_norm": 5.6966657638549805, "learning_rate": 7.411681126310268e-05, "loss": 3.090333366394043, "memory(GiB)": 77.56, "step": 39655, "token_acc": 0.41368078175895767, "train_speed(iter/s)": 1.43977 }, { "epoch": 1.6991559916027592, "grad_norm": 4.18341588973999, "learning_rate": 7.411091585657139e-05, "loss": 2.3464168548583983, "memory(GiB)": 77.56, "step": 39660, "token_acc": 0.49173553719008267, "train_speed(iter/s)": 1.439775 }, { "epoch": 1.699370206932008, "grad_norm": 5.513996601104736, "learning_rate": 7.410502001324824e-05, "loss": 2.5262365341186523, "memory(GiB)": 77.56, "step": 39665, "token_acc": 0.48534201954397393, "train_speed(iter/s)": 1.439782 }, { "epoch": 1.699584422261257, "grad_norm": 4.893082618713379, "learning_rate": 7.409912373324003e-05, "loss": 2.4036827087402344, "memory(GiB)": 77.56, "step": 39670, "token_acc": 0.5210843373493976, "train_speed(iter/s)": 1.439826 }, { "epoch": 1.699798637590506, "grad_norm": 6.081119060516357, "learning_rate": 7.409322701665359e-05, "loss": 2.4552143096923826, "memory(GiB)": 77.56, "step": 39675, "token_acc": 0.4898785425101215, "train_speed(iter/s)": 1.439838 }, { "epoch": 1.700012852919755, "grad_norm": 5.193746089935303, "learning_rate": 7.408732986359576e-05, "loss": 2.6363288879394533, "memory(GiB)": 77.56, "step": 39680, "token_acc": 0.4517133956386293, "train_speed(iter/s)": 1.439823 }, { "epoch": 1.700227068249004, "grad_norm": 5.900773048400879, "learning_rate": 7.408143227417334e-05, "loss": 2.8062456130981444, "memory(GiB)": 77.56, "step": 39685, "token_acc": 0.45686900958466453, "train_speed(iter/s)": 1.439822 }, { "epoch": 1.700441283578253, "grad_norm": 5.177570819854736, "learning_rate": 7.407553424849318e-05, "loss": 2.4233646392822266, "memory(GiB)": 77.56, "step": 39690, "token_acc": 0.4849624060150376, "train_speed(iter/s)": 1.439781 }, { "epoch": 1.7006554989075018, "grad_norm": 6.550286293029785, "learning_rate": 7.406963578666214e-05, "loss": 2.6196516036987303, "memory(GiB)": 77.56, "step": 39695, "token_acc": 0.48, "train_speed(iter/s)": 1.439823 }, { "epoch": 1.7008697142367508, "grad_norm": 4.333806991577148, "learning_rate": 7.406373688878706e-05, "loss": 2.631243133544922, "memory(GiB)": 77.56, "step": 39700, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.439828 }, { "epoch": 1.7010839295659999, "grad_norm": 5.391918182373047, "learning_rate": 7.405783755497482e-05, "loss": 2.8438425064086914, "memory(GiB)": 77.56, "step": 39705, "token_acc": 0.40599455040871935, "train_speed(iter/s)": 1.439851 }, { "epoch": 1.7012981448952487, "grad_norm": 5.443723201751709, "learning_rate": 7.405193778533229e-05, "loss": 2.687221145629883, "memory(GiB)": 77.56, "step": 39710, "token_acc": 0.46332046332046334, "train_speed(iter/s)": 1.4399 }, { "epoch": 1.7015123602244977, "grad_norm": 6.031450271606445, "learning_rate": 7.404603757996634e-05, "loss": 2.714067268371582, "memory(GiB)": 77.56, "step": 39715, "token_acc": 0.47129909365558914, "train_speed(iter/s)": 1.439878 }, { "epoch": 1.7017265755537467, "grad_norm": 5.732964992523193, "learning_rate": 7.404013693898385e-05, "loss": 2.3155542373657227, "memory(GiB)": 77.56, "step": 39720, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.439912 }, { "epoch": 1.7019407908829955, "grad_norm": 5.318905353546143, "learning_rate": 7.403423586249174e-05, "loss": 2.4612009048461916, "memory(GiB)": 77.56, "step": 39725, "token_acc": 0.5078740157480315, "train_speed(iter/s)": 1.43989 }, { "epoch": 1.7021550062122446, "grad_norm": 7.824594020843506, "learning_rate": 7.402833435059691e-05, "loss": 2.730356216430664, "memory(GiB)": 77.56, "step": 39730, "token_acc": 0.462882096069869, "train_speed(iter/s)": 1.439872 }, { "epoch": 1.7023692215414936, "grad_norm": 5.203568458557129, "learning_rate": 7.402243240340624e-05, "loss": 2.5023773193359373, "memory(GiB)": 77.56, "step": 39735, "token_acc": 0.45016077170418006, "train_speed(iter/s)": 1.43989 }, { "epoch": 1.7025834368707424, "grad_norm": 6.08251428604126, "learning_rate": 7.401653002102669e-05, "loss": 2.444143295288086, "memory(GiB)": 77.56, "step": 39740, "token_acc": 0.4959677419354839, "train_speed(iter/s)": 1.439855 }, { "epoch": 1.7027976521999915, "grad_norm": 4.962358474731445, "learning_rate": 7.401062720356516e-05, "loss": 2.8301889419555666, "memory(GiB)": 77.56, "step": 39745, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.439861 }, { "epoch": 1.7030118675292405, "grad_norm": 8.405646324157715, "learning_rate": 7.400472395112861e-05, "loss": 2.5224157333374024, "memory(GiB)": 77.56, "step": 39750, "token_acc": 0.4820846905537459, "train_speed(iter/s)": 1.439844 }, { "epoch": 1.7032260828584893, "grad_norm": 5.012209892272949, "learning_rate": 7.399882026382395e-05, "loss": 2.705532455444336, "memory(GiB)": 77.56, "step": 39755, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.4398 }, { "epoch": 1.7034402981877383, "grad_norm": 4.887112617492676, "learning_rate": 7.399291614175815e-05, "loss": 2.334407424926758, "memory(GiB)": 77.56, "step": 39760, "token_acc": 0.5064935064935064, "train_speed(iter/s)": 1.439839 }, { "epoch": 1.7036545135169874, "grad_norm": 4.952052593231201, "learning_rate": 7.398701158503818e-05, "loss": 2.404689407348633, "memory(GiB)": 77.56, "step": 39765, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.439852 }, { "epoch": 1.7038687288462362, "grad_norm": 4.783664703369141, "learning_rate": 7.398110659377098e-05, "loss": 2.403533363342285, "memory(GiB)": 77.56, "step": 39770, "token_acc": 0.471875, "train_speed(iter/s)": 1.439838 }, { "epoch": 1.7040829441754852, "grad_norm": 4.58333158493042, "learning_rate": 7.397520116806354e-05, "loss": 2.5737075805664062, "memory(GiB)": 77.56, "step": 39775, "token_acc": 0.44285714285714284, "train_speed(iter/s)": 1.439844 }, { "epoch": 1.7042971595047343, "grad_norm": 6.006550312042236, "learning_rate": 7.396929530802286e-05, "loss": 2.465286636352539, "memory(GiB)": 77.56, "step": 39780, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.439834 }, { "epoch": 1.704511374833983, "grad_norm": 7.101477146148682, "learning_rate": 7.396338901375588e-05, "loss": 2.5579864501953127, "memory(GiB)": 77.56, "step": 39785, "token_acc": 0.48134328358208955, "train_speed(iter/s)": 1.439898 }, { "epoch": 1.704725590163232, "grad_norm": 3.6944642066955566, "learning_rate": 7.395748228536964e-05, "loss": 2.457639694213867, "memory(GiB)": 77.56, "step": 39790, "token_acc": 0.4742857142857143, "train_speed(iter/s)": 1.43987 }, { "epoch": 1.7049398054924811, "grad_norm": 6.27808141708374, "learning_rate": 7.395157512297114e-05, "loss": 2.722454071044922, "memory(GiB)": 77.56, "step": 39795, "token_acc": 0.4633431085043988, "train_speed(iter/s)": 1.439889 }, { "epoch": 1.70515402082173, "grad_norm": 4.822300910949707, "learning_rate": 7.394566752666739e-05, "loss": 2.625094985961914, "memory(GiB)": 77.56, "step": 39800, "token_acc": 0.4794952681388013, "train_speed(iter/s)": 1.439868 }, { "epoch": 1.705368236150979, "grad_norm": 4.021412372589111, "learning_rate": 7.39397594965654e-05, "loss": 2.633833312988281, "memory(GiB)": 77.56, "step": 39805, "token_acc": 0.47074468085106386, "train_speed(iter/s)": 1.439901 }, { "epoch": 1.705582451480228, "grad_norm": 4.1343207359313965, "learning_rate": 7.393385103277222e-05, "loss": 2.423944854736328, "memory(GiB)": 77.56, "step": 39810, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.439913 }, { "epoch": 1.7057966668094768, "grad_norm": 5.7742509841918945, "learning_rate": 7.392794213539486e-05, "loss": 2.581664276123047, "memory(GiB)": 77.56, "step": 39815, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.439875 }, { "epoch": 1.7060108821387259, "grad_norm": 6.122373104095459, "learning_rate": 7.392203280454039e-05, "loss": 2.482340431213379, "memory(GiB)": 77.56, "step": 39820, "token_acc": 0.48562300319488816, "train_speed(iter/s)": 1.43989 }, { "epoch": 1.706225097467975, "grad_norm": 6.736767292022705, "learning_rate": 7.391612304031585e-05, "loss": 2.3063446044921876, "memory(GiB)": 77.56, "step": 39825, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.439861 }, { "epoch": 1.7064393127972237, "grad_norm": 7.398369789123535, "learning_rate": 7.391021284282833e-05, "loss": 2.6417974472045898, "memory(GiB)": 77.56, "step": 39830, "token_acc": 0.42412451361867703, "train_speed(iter/s)": 1.439877 }, { "epoch": 1.7066535281264728, "grad_norm": 4.481542587280273, "learning_rate": 7.390430221218485e-05, "loss": 2.402405548095703, "memory(GiB)": 77.56, "step": 39835, "token_acc": 0.48031496062992124, "train_speed(iter/s)": 1.439926 }, { "epoch": 1.7068677434557218, "grad_norm": 5.377448081970215, "learning_rate": 7.389839114849253e-05, "loss": 2.545908737182617, "memory(GiB)": 77.56, "step": 39840, "token_acc": 0.450199203187251, "train_speed(iter/s)": 1.439923 }, { "epoch": 1.7070819587849706, "grad_norm": 5.170248985290527, "learning_rate": 7.389247965185841e-05, "loss": 2.706021308898926, "memory(GiB)": 77.56, "step": 39845, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.439898 }, { "epoch": 1.7072961741142196, "grad_norm": 4.069697380065918, "learning_rate": 7.388656772238964e-05, "loss": 2.29998893737793, "memory(GiB)": 77.56, "step": 39850, "token_acc": 0.5194805194805194, "train_speed(iter/s)": 1.439912 }, { "epoch": 1.7075103894434687, "grad_norm": 4.980340957641602, "learning_rate": 7.388065536019327e-05, "loss": 2.6758806228637697, "memory(GiB)": 77.56, "step": 39855, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.439862 }, { "epoch": 1.7077246047727175, "grad_norm": 4.526433944702148, "learning_rate": 7.387474256537642e-05, "loss": 2.6746925354003905, "memory(GiB)": 77.56, "step": 39860, "token_acc": 0.42138364779874216, "train_speed(iter/s)": 1.439892 }, { "epoch": 1.7079388201019665, "grad_norm": 4.018715858459473, "learning_rate": 7.386882933804621e-05, "loss": 2.4466697692871096, "memory(GiB)": 77.56, "step": 39865, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.439906 }, { "epoch": 1.7081530354312155, "grad_norm": 7.17388391494751, "learning_rate": 7.386291567830978e-05, "loss": 2.3986154556274415, "memory(GiB)": 77.56, "step": 39870, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.439916 }, { "epoch": 1.7083672507604644, "grad_norm": 6.041895389556885, "learning_rate": 7.385700158627424e-05, "loss": 2.636348342895508, "memory(GiB)": 77.56, "step": 39875, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.439911 }, { "epoch": 1.7085814660897134, "grad_norm": 4.120418071746826, "learning_rate": 7.385108706204675e-05, "loss": 2.544120025634766, "memory(GiB)": 77.56, "step": 39880, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.439883 }, { "epoch": 1.7087956814189624, "grad_norm": 5.060544013977051, "learning_rate": 7.384517210573444e-05, "loss": 2.36533203125, "memory(GiB)": 77.56, "step": 39885, "token_acc": 0.5083056478405316, "train_speed(iter/s)": 1.439903 }, { "epoch": 1.7090098967482112, "grad_norm": 5.357654571533203, "learning_rate": 7.383925671744446e-05, "loss": 2.6864748001098633, "memory(GiB)": 77.56, "step": 39890, "token_acc": 0.4606741573033708, "train_speed(iter/s)": 1.439915 }, { "epoch": 1.7092241120774603, "grad_norm": 4.374958515167236, "learning_rate": 7.383334089728398e-05, "loss": 2.5595218658447267, "memory(GiB)": 77.56, "step": 39895, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.439934 }, { "epoch": 1.7094383274067093, "grad_norm": 4.889717102050781, "learning_rate": 7.382742464536017e-05, "loss": 2.3687376022338866, "memory(GiB)": 77.56, "step": 39900, "token_acc": 0.4536082474226804, "train_speed(iter/s)": 1.439981 }, { "epoch": 1.7096525427359581, "grad_norm": 4.771450519561768, "learning_rate": 7.382150796178022e-05, "loss": 2.6416671752929686, "memory(GiB)": 77.56, "step": 39905, "token_acc": 0.46200607902735563, "train_speed(iter/s)": 1.439971 }, { "epoch": 1.7098667580652072, "grad_norm": 4.958680152893066, "learning_rate": 7.38155908466513e-05, "loss": 2.5969444274902345, "memory(GiB)": 77.56, "step": 39910, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.439984 }, { "epoch": 1.7100809733944562, "grad_norm": 4.731327533721924, "learning_rate": 7.380967330008063e-05, "loss": 2.5432605743408203, "memory(GiB)": 77.56, "step": 39915, "token_acc": 0.47674418604651164, "train_speed(iter/s)": 1.439984 }, { "epoch": 1.710295188723705, "grad_norm": 4.742739677429199, "learning_rate": 7.380375532217537e-05, "loss": 2.3297481536865234, "memory(GiB)": 77.56, "step": 39920, "token_acc": 0.4983277591973244, "train_speed(iter/s)": 1.439959 }, { "epoch": 1.710509404052954, "grad_norm": 5.272616863250732, "learning_rate": 7.379783691304276e-05, "loss": 2.6115653991699217, "memory(GiB)": 77.56, "step": 39925, "token_acc": 0.44776119402985076, "train_speed(iter/s)": 1.439977 }, { "epoch": 1.710723619382203, "grad_norm": 5.22698450088501, "learning_rate": 7.379191807279002e-05, "loss": 2.2420799255371096, "memory(GiB)": 77.56, "step": 39930, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.439987 }, { "epoch": 1.7109378347114519, "grad_norm": 4.384657859802246, "learning_rate": 7.378599880152434e-05, "loss": 2.440205764770508, "memory(GiB)": 77.56, "step": 39935, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.440019 }, { "epoch": 1.711152050040701, "grad_norm": 5.922700881958008, "learning_rate": 7.3780079099353e-05, "loss": 2.2233476638793945, "memory(GiB)": 77.56, "step": 39940, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.439979 }, { "epoch": 1.71136626536995, "grad_norm": 4.690421104431152, "learning_rate": 7.377415896638322e-05, "loss": 3.0306575775146483, "memory(GiB)": 77.56, "step": 39945, "token_acc": 0.43729903536977494, "train_speed(iter/s)": 1.43998 }, { "epoch": 1.7115804806991988, "grad_norm": 6.880755424499512, "learning_rate": 7.376823840272224e-05, "loss": 2.5882944107055663, "memory(GiB)": 77.56, "step": 39950, "token_acc": 0.4485981308411215, "train_speed(iter/s)": 1.439973 }, { "epoch": 1.7117946960284478, "grad_norm": 4.755744934082031, "learning_rate": 7.376231740847734e-05, "loss": 2.4759830474853515, "memory(GiB)": 77.56, "step": 39955, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.439989 }, { "epoch": 1.7120089113576968, "grad_norm": 4.4547438621521, "learning_rate": 7.375639598375574e-05, "loss": 2.6666980743408204, "memory(GiB)": 77.56, "step": 39960, "token_acc": 0.4318936877076412, "train_speed(iter/s)": 1.439978 }, { "epoch": 1.7122231266869457, "grad_norm": 6.396081447601318, "learning_rate": 7.375047412866476e-05, "loss": 2.7680437088012697, "memory(GiB)": 77.56, "step": 39965, "token_acc": 0.4522058823529412, "train_speed(iter/s)": 1.439976 }, { "epoch": 1.7124373420161947, "grad_norm": 4.523647308349609, "learning_rate": 7.374455184331167e-05, "loss": 2.577466583251953, "memory(GiB)": 77.56, "step": 39970, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.440005 }, { "epoch": 1.7126515573454437, "grad_norm": 4.375260829925537, "learning_rate": 7.373862912780373e-05, "loss": 2.586697578430176, "memory(GiB)": 77.56, "step": 39975, "token_acc": 0.4875, "train_speed(iter/s)": 1.440041 }, { "epoch": 1.7128657726746925, "grad_norm": 4.126951217651367, "learning_rate": 7.373270598224827e-05, "loss": 2.713718223571777, "memory(GiB)": 77.56, "step": 39980, "token_acc": 0.44410876132930516, "train_speed(iter/s)": 1.44005 }, { "epoch": 1.7130799880039416, "grad_norm": 5.868197917938232, "learning_rate": 7.372678240675256e-05, "loss": 2.084040641784668, "memory(GiB)": 77.56, "step": 39985, "token_acc": 0.5331010452961672, "train_speed(iter/s)": 1.440071 }, { "epoch": 1.7132942033331906, "grad_norm": 4.283258438110352, "learning_rate": 7.372085840142394e-05, "loss": 2.444026756286621, "memory(GiB)": 77.56, "step": 39990, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.440107 }, { "epoch": 1.7135084186624394, "grad_norm": 6.181144714355469, "learning_rate": 7.371493396636973e-05, "loss": 2.659521293640137, "memory(GiB)": 77.56, "step": 39995, "token_acc": 0.4537313432835821, "train_speed(iter/s)": 1.440104 }, { "epoch": 1.7137226339916884, "grad_norm": 4.1230926513671875, "learning_rate": 7.370900910169723e-05, "loss": 2.811714744567871, "memory(GiB)": 77.56, "step": 40000, "token_acc": 0.44610778443113774, "train_speed(iter/s)": 1.44013 }, { "epoch": 1.7137226339916884, "eval_loss": 2.1421358585357666, "eval_runtime": 13.9435, "eval_samples_per_second": 7.172, "eval_steps_per_second": 7.172, "eval_token_acc": 0.48042168674698793, "step": 40000 }, { "epoch": 1.7139368493209375, "grad_norm": 4.4093241691589355, "learning_rate": 7.370308380751378e-05, "loss": 2.7694704055786135, "memory(GiB)": 77.56, "step": 40005, "token_acc": 0.4678714859437751, "train_speed(iter/s)": 1.439385 }, { "epoch": 1.7141510646501863, "grad_norm": 4.222255229949951, "learning_rate": 7.369715808392675e-05, "loss": 2.4819902420043944, "memory(GiB)": 77.56, "step": 40010, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.439422 }, { "epoch": 1.7143652799794353, "grad_norm": 5.060547828674316, "learning_rate": 7.369123193104347e-05, "loss": 2.6443323135375976, "memory(GiB)": 77.56, "step": 40015, "token_acc": 0.45484949832775917, "train_speed(iter/s)": 1.439434 }, { "epoch": 1.7145794953086844, "grad_norm": 6.641648292541504, "learning_rate": 7.368530534897128e-05, "loss": 2.2589042663574217, "memory(GiB)": 77.56, "step": 40020, "token_acc": 0.49812734082397003, "train_speed(iter/s)": 1.43945 }, { "epoch": 1.7147937106379332, "grad_norm": 4.530861854553223, "learning_rate": 7.36793783378176e-05, "loss": 2.8286441802978515, "memory(GiB)": 77.56, "step": 40025, "token_acc": 0.4258064516129032, "train_speed(iter/s)": 1.439499 }, { "epoch": 1.7150079259671822, "grad_norm": 9.142610549926758, "learning_rate": 7.367345089768976e-05, "loss": 2.3285446166992188, "memory(GiB)": 77.56, "step": 40030, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.439499 }, { "epoch": 1.7152221412964312, "grad_norm": 11.068404197692871, "learning_rate": 7.366752302869513e-05, "loss": 2.5566738128662108, "memory(GiB)": 77.56, "step": 40035, "token_acc": 0.48534201954397393, "train_speed(iter/s)": 1.439493 }, { "epoch": 1.71543635662568, "grad_norm": 6.662464141845703, "learning_rate": 7.366159473094112e-05, "loss": 2.384919548034668, "memory(GiB)": 77.56, "step": 40040, "token_acc": 0.4962121212121212, "train_speed(iter/s)": 1.439531 }, { "epoch": 1.715650571954929, "grad_norm": 4.910244941711426, "learning_rate": 7.365566600453513e-05, "loss": 2.5101667404174806, "memory(GiB)": 77.56, "step": 40045, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.439539 }, { "epoch": 1.7158647872841781, "grad_norm": 5.121800422668457, "learning_rate": 7.364973684958456e-05, "loss": 2.5433162689208983, "memory(GiB)": 77.56, "step": 40050, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.439487 }, { "epoch": 1.716079002613427, "grad_norm": 4.069670677185059, "learning_rate": 7.364380726619681e-05, "loss": 2.6185279846191407, "memory(GiB)": 77.56, "step": 40055, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.439513 }, { "epoch": 1.716293217942676, "grad_norm": 4.56881046295166, "learning_rate": 7.363787725447935e-05, "loss": 2.3854040145874023, "memory(GiB)": 77.56, "step": 40060, "token_acc": 0.4946524064171123, "train_speed(iter/s)": 1.439536 }, { "epoch": 1.716507433271925, "grad_norm": 6.28679895401001, "learning_rate": 7.363194681453953e-05, "loss": 2.667953109741211, "memory(GiB)": 77.56, "step": 40065, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.439571 }, { "epoch": 1.7167216486011738, "grad_norm": 5.500514507293701, "learning_rate": 7.362601594648484e-05, "loss": 2.5200653076171875, "memory(GiB)": 77.56, "step": 40070, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.439627 }, { "epoch": 1.7169358639304229, "grad_norm": 8.16668701171875, "learning_rate": 7.36200846504227e-05, "loss": 2.5562793731689455, "memory(GiB)": 77.56, "step": 40075, "token_acc": 0.45396825396825397, "train_speed(iter/s)": 1.439633 }, { "epoch": 1.717150079259672, "grad_norm": 5.220269680023193, "learning_rate": 7.361415292646057e-05, "loss": 2.1251943588256834, "memory(GiB)": 77.56, "step": 40080, "token_acc": 0.5131578947368421, "train_speed(iter/s)": 1.439639 }, { "epoch": 1.7173642945889207, "grad_norm": 5.581764221191406, "learning_rate": 7.360822077470592e-05, "loss": 2.601914978027344, "memory(GiB)": 77.56, "step": 40085, "token_acc": 0.4244604316546763, "train_speed(iter/s)": 1.439678 }, { "epoch": 1.7175785099181697, "grad_norm": 4.447869300842285, "learning_rate": 7.360228819526621e-05, "loss": 2.872735595703125, "memory(GiB)": 77.56, "step": 40090, "token_acc": 0.41297935103244837, "train_speed(iter/s)": 1.439709 }, { "epoch": 1.7177927252474188, "grad_norm": 5.055079936981201, "learning_rate": 7.35963551882489e-05, "loss": 2.4841999053955077, "memory(GiB)": 77.56, "step": 40095, "token_acc": 0.4144486692015209, "train_speed(iter/s)": 1.439765 }, { "epoch": 1.7180069405766676, "grad_norm": 4.955320835113525, "learning_rate": 7.359042175376148e-05, "loss": 2.6695268630981444, "memory(GiB)": 77.56, "step": 40100, "token_acc": 0.4590643274853801, "train_speed(iter/s)": 1.439755 }, { "epoch": 1.7182211559059166, "grad_norm": 5.731344223022461, "learning_rate": 7.358448789191142e-05, "loss": 2.4094270706176757, "memory(GiB)": 77.56, "step": 40105, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.439749 }, { "epoch": 1.7184353712351657, "grad_norm": 4.8525800704956055, "learning_rate": 7.357855360280626e-05, "loss": 2.6583824157714844, "memory(GiB)": 77.56, "step": 40110, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.439766 }, { "epoch": 1.7186495865644145, "grad_norm": 4.381115913391113, "learning_rate": 7.357261888655349e-05, "loss": 2.791868209838867, "memory(GiB)": 77.56, "step": 40115, "token_acc": 0.45652173913043476, "train_speed(iter/s)": 1.439771 }, { "epoch": 1.7188638018936635, "grad_norm": 5.044618129730225, "learning_rate": 7.35666837432606e-05, "loss": 2.681040954589844, "memory(GiB)": 77.56, "step": 40120, "token_acc": 0.43820224719101125, "train_speed(iter/s)": 1.439776 }, { "epoch": 1.7190780172229125, "grad_norm": 5.593702793121338, "learning_rate": 7.356074817303513e-05, "loss": 2.636488342285156, "memory(GiB)": 77.56, "step": 40125, "token_acc": 0.4434250764525994, "train_speed(iter/s)": 1.439741 }, { "epoch": 1.7192922325521613, "grad_norm": 4.197584629058838, "learning_rate": 7.355481217598461e-05, "loss": 2.3843318939208986, "memory(GiB)": 77.56, "step": 40130, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.439757 }, { "epoch": 1.7195064478814104, "grad_norm": 6.246951580047607, "learning_rate": 7.354887575221656e-05, "loss": 2.4831682205200196, "memory(GiB)": 77.56, "step": 40135, "token_acc": 0.5014749262536873, "train_speed(iter/s)": 1.439778 }, { "epoch": 1.7197206632106594, "grad_norm": 5.10792875289917, "learning_rate": 7.354293890183854e-05, "loss": 2.4761425018310548, "memory(GiB)": 77.56, "step": 40140, "token_acc": 0.44375, "train_speed(iter/s)": 1.439806 }, { "epoch": 1.7199348785399082, "grad_norm": 5.021482944488525, "learning_rate": 7.353700162495811e-05, "loss": 2.6779758453369142, "memory(GiB)": 77.56, "step": 40145, "token_acc": 0.45724907063197023, "train_speed(iter/s)": 1.439777 }, { "epoch": 1.7201490938691573, "grad_norm": 6.702967166900635, "learning_rate": 7.35310639216828e-05, "loss": 2.804609680175781, "memory(GiB)": 77.56, "step": 40150, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.439798 }, { "epoch": 1.7203633091984063, "grad_norm": 4.085918426513672, "learning_rate": 7.35251257921202e-05, "loss": 2.5090551376342773, "memory(GiB)": 77.56, "step": 40155, "token_acc": 0.5038759689922481, "train_speed(iter/s)": 1.439851 }, { "epoch": 1.7205775245276551, "grad_norm": 5.104355335235596, "learning_rate": 7.351918723637788e-05, "loss": 2.4670711517333985, "memory(GiB)": 77.56, "step": 40160, "token_acc": 0.4671814671814672, "train_speed(iter/s)": 1.439836 }, { "epoch": 1.7207917398569041, "grad_norm": 4.603713512420654, "learning_rate": 7.35132482545634e-05, "loss": 2.6015100479125977, "memory(GiB)": 77.56, "step": 40165, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.439837 }, { "epoch": 1.7210059551861532, "grad_norm": 5.6582841873168945, "learning_rate": 7.350730884678441e-05, "loss": 2.083356475830078, "memory(GiB)": 77.56, "step": 40170, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.439864 }, { "epoch": 1.721220170515402, "grad_norm": 4.597603797912598, "learning_rate": 7.350136901314845e-05, "loss": 2.605609321594238, "memory(GiB)": 77.56, "step": 40175, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.439897 }, { "epoch": 1.721434385844651, "grad_norm": 4.485691070556641, "learning_rate": 7.349542875376312e-05, "loss": 2.3316486358642576, "memory(GiB)": 77.56, "step": 40180, "token_acc": 0.49554896142433236, "train_speed(iter/s)": 1.439915 }, { "epoch": 1.7216486011739, "grad_norm": 5.283058166503906, "learning_rate": 7.348948806873609e-05, "loss": 2.434284782409668, "memory(GiB)": 77.56, "step": 40185, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.439935 }, { "epoch": 1.7218628165031489, "grad_norm": 3.7487587928771973, "learning_rate": 7.348354695817493e-05, "loss": 2.2351150512695312, "memory(GiB)": 77.56, "step": 40190, "token_acc": 0.5301587301587302, "train_speed(iter/s)": 1.43996 }, { "epoch": 1.722077031832398, "grad_norm": 5.850497722625732, "learning_rate": 7.347760542218729e-05, "loss": 2.428093147277832, "memory(GiB)": 77.56, "step": 40195, "token_acc": 0.47035573122529645, "train_speed(iter/s)": 1.439974 }, { "epoch": 1.722291247161647, "grad_norm": 6.800174236297607, "learning_rate": 7.347166346088081e-05, "loss": 2.807633972167969, "memory(GiB)": 77.56, "step": 40200, "token_acc": 0.4141791044776119, "train_speed(iter/s)": 1.440031 }, { "epoch": 1.7225054624908958, "grad_norm": 5.8759002685546875, "learning_rate": 7.346572107436313e-05, "loss": 2.7058908462524416, "memory(GiB)": 77.56, "step": 40205, "token_acc": 0.46745562130177515, "train_speed(iter/s)": 1.440019 }, { "epoch": 1.7227196778201448, "grad_norm": 5.107700347900391, "learning_rate": 7.345977826274189e-05, "loss": 2.612692451477051, "memory(GiB)": 77.56, "step": 40210, "token_acc": 0.49008498583569404, "train_speed(iter/s)": 1.440046 }, { "epoch": 1.7229338931493938, "grad_norm": 4.888387680053711, "learning_rate": 7.345383502612477e-05, "loss": 2.2837772369384766, "memory(GiB)": 77.56, "step": 40215, "token_acc": 0.4867924528301887, "train_speed(iter/s)": 1.44 }, { "epoch": 1.7231481084786426, "grad_norm": 5.610747337341309, "learning_rate": 7.344789136461942e-05, "loss": 2.562638854980469, "memory(GiB)": 77.56, "step": 40220, "token_acc": 0.5, "train_speed(iter/s)": 1.440013 }, { "epoch": 1.7233623238078917, "grad_norm": 4.89550256729126, "learning_rate": 7.344194727833354e-05, "loss": 2.660519027709961, "memory(GiB)": 77.56, "step": 40225, "token_acc": 0.44074074074074077, "train_speed(iter/s)": 1.440051 }, { "epoch": 1.7235765391371407, "grad_norm": 5.416114807128906, "learning_rate": 7.343600276737477e-05, "loss": 2.574527931213379, "memory(GiB)": 77.56, "step": 40230, "token_acc": 0.476, "train_speed(iter/s)": 1.440067 }, { "epoch": 1.7237907544663895, "grad_norm": 4.884213924407959, "learning_rate": 7.343005783185085e-05, "loss": 2.445771408081055, "memory(GiB)": 77.56, "step": 40235, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.440073 }, { "epoch": 1.7240049697956386, "grad_norm": 4.314901351928711, "learning_rate": 7.342411247186942e-05, "loss": 2.433149719238281, "memory(GiB)": 77.56, "step": 40240, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.440096 }, { "epoch": 1.7242191851248876, "grad_norm": 4.202539920806885, "learning_rate": 7.341816668753825e-05, "loss": 2.4923582077026367, "memory(GiB)": 77.56, "step": 40245, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 1.440119 }, { "epoch": 1.7244334004541364, "grad_norm": 4.560611724853516, "learning_rate": 7.341222047896501e-05, "loss": 2.562312126159668, "memory(GiB)": 77.56, "step": 40250, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.440164 }, { "epoch": 1.7246476157833854, "grad_norm": 5.547539234161377, "learning_rate": 7.340627384625743e-05, "loss": 2.4742088317871094, "memory(GiB)": 77.56, "step": 40255, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 1.440186 }, { "epoch": 1.7248618311126345, "grad_norm": 6.14340353012085, "learning_rate": 7.340032678952325e-05, "loss": 2.6248313903808596, "memory(GiB)": 77.56, "step": 40260, "token_acc": 0.4820359281437126, "train_speed(iter/s)": 1.440201 }, { "epoch": 1.7250760464418833, "grad_norm": 4.854167938232422, "learning_rate": 7.339437930887018e-05, "loss": 2.7901308059692385, "memory(GiB)": 77.56, "step": 40265, "token_acc": 0.42244224422442245, "train_speed(iter/s)": 1.440242 }, { "epoch": 1.7252902617711323, "grad_norm": 4.3600239753723145, "learning_rate": 7.338843140440601e-05, "loss": 2.5119213104248046, "memory(GiB)": 77.56, "step": 40270, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.440244 }, { "epoch": 1.7255044771003814, "grad_norm": 4.788198471069336, "learning_rate": 7.338248307623844e-05, "loss": 2.515062141418457, "memory(GiB)": 77.56, "step": 40275, "token_acc": 0.46785714285714286, "train_speed(iter/s)": 1.440237 }, { "epoch": 1.7257186924296302, "grad_norm": 6.201361656188965, "learning_rate": 7.337653432447527e-05, "loss": 2.6870807647705077, "memory(GiB)": 77.56, "step": 40280, "token_acc": 0.45565749235474007, "train_speed(iter/s)": 1.440215 }, { "epoch": 1.7259329077588792, "grad_norm": 8.603616714477539, "learning_rate": 7.337058514922425e-05, "loss": 2.7269718170166017, "memory(GiB)": 77.56, "step": 40285, "token_acc": 0.41603053435114506, "train_speed(iter/s)": 1.440221 }, { "epoch": 1.7261471230881282, "grad_norm": 4.739927768707275, "learning_rate": 7.336463555059316e-05, "loss": 2.577268600463867, "memory(GiB)": 77.56, "step": 40290, "token_acc": 0.45695364238410596, "train_speed(iter/s)": 1.440273 }, { "epoch": 1.726361338417377, "grad_norm": 4.7373576164245605, "learning_rate": 7.335868552868979e-05, "loss": 2.313774299621582, "memory(GiB)": 77.56, "step": 40295, "token_acc": 0.5140562248995983, "train_speed(iter/s)": 1.440318 }, { "epoch": 1.726575553746626, "grad_norm": 5.206615924835205, "learning_rate": 7.335273508362189e-05, "loss": 2.613463592529297, "memory(GiB)": 77.56, "step": 40300, "token_acc": 0.5, "train_speed(iter/s)": 1.440332 }, { "epoch": 1.7267897690758751, "grad_norm": 4.584021091461182, "learning_rate": 7.334678421549731e-05, "loss": 2.2635498046875, "memory(GiB)": 77.56, "step": 40305, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.440335 }, { "epoch": 1.727003984405124, "grad_norm": 6.164045810699463, "learning_rate": 7.334083292442382e-05, "loss": 2.4893268585205077, "memory(GiB)": 77.56, "step": 40310, "token_acc": 0.5100401606425703, "train_speed(iter/s)": 1.440352 }, { "epoch": 1.727218199734373, "grad_norm": 5.9391584396362305, "learning_rate": 7.333488121050925e-05, "loss": 2.485063362121582, "memory(GiB)": 77.56, "step": 40315, "token_acc": 0.46814404432132967, "train_speed(iter/s)": 1.440309 }, { "epoch": 1.727432415063622, "grad_norm": 5.095292091369629, "learning_rate": 7.332892907386142e-05, "loss": 2.3862890243530273, "memory(GiB)": 77.56, "step": 40320, "token_acc": 0.49480968858131485, "train_speed(iter/s)": 1.44027 }, { "epoch": 1.7276466303928708, "grad_norm": 3.871943712234497, "learning_rate": 7.332297651458815e-05, "loss": 2.4703323364257814, "memory(GiB)": 77.56, "step": 40325, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.440295 }, { "epoch": 1.7278608457221198, "grad_norm": 4.925197124481201, "learning_rate": 7.33170235327973e-05, "loss": 2.485773468017578, "memory(GiB)": 77.56, "step": 40330, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.440274 }, { "epoch": 1.7280750610513689, "grad_norm": 5.479426860809326, "learning_rate": 7.331107012859667e-05, "loss": 2.6705928802490235, "memory(GiB)": 77.56, "step": 40335, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440299 }, { "epoch": 1.7282892763806177, "grad_norm": 4.657426834106445, "learning_rate": 7.330511630209414e-05, "loss": 2.5968000411987306, "memory(GiB)": 77.56, "step": 40340, "token_acc": 0.4686192468619247, "train_speed(iter/s)": 1.440307 }, { "epoch": 1.7285034917098667, "grad_norm": 6.10496711730957, "learning_rate": 7.329916205339757e-05, "loss": 2.439530944824219, "memory(GiB)": 77.56, "step": 40345, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.440339 }, { "epoch": 1.7287177070391158, "grad_norm": 5.403059959411621, "learning_rate": 7.329320738261484e-05, "loss": 2.4468931198120116, "memory(GiB)": 77.56, "step": 40350, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.440343 }, { "epoch": 1.7289319223683646, "grad_norm": 7.664263725280762, "learning_rate": 7.32872522898538e-05, "loss": 2.4133537292480467, "memory(GiB)": 77.56, "step": 40355, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.440344 }, { "epoch": 1.7291461376976136, "grad_norm": 4.846518516540527, "learning_rate": 7.328129677522234e-05, "loss": 2.357892608642578, "memory(GiB)": 77.56, "step": 40360, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.440393 }, { "epoch": 1.7293603530268626, "grad_norm": 7.284416198730469, "learning_rate": 7.327534083882832e-05, "loss": 2.574083709716797, "memory(GiB)": 77.56, "step": 40365, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.440409 }, { "epoch": 1.7295745683561115, "grad_norm": 5.343515396118164, "learning_rate": 7.32693844807797e-05, "loss": 2.256794738769531, "memory(GiB)": 77.56, "step": 40370, "token_acc": 0.49173553719008267, "train_speed(iter/s)": 1.440428 }, { "epoch": 1.7297887836853605, "grad_norm": 4.493984222412109, "learning_rate": 7.326342770118434e-05, "loss": 2.4861026763916017, "memory(GiB)": 77.56, "step": 40375, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.440417 }, { "epoch": 1.7300029990146095, "grad_norm": 5.836231708526611, "learning_rate": 7.325747050015017e-05, "loss": 2.248910903930664, "memory(GiB)": 77.56, "step": 40380, "token_acc": 0.5140562248995983, "train_speed(iter/s)": 1.44042 }, { "epoch": 1.7302172143438583, "grad_norm": 5.4983320236206055, "learning_rate": 7.325151287778509e-05, "loss": 2.727415084838867, "memory(GiB)": 77.56, "step": 40385, "token_acc": 0.43205574912891986, "train_speed(iter/s)": 1.440394 }, { "epoch": 1.7304314296731074, "grad_norm": 4.418530464172363, "learning_rate": 7.324555483419707e-05, "loss": 2.9007171630859374, "memory(GiB)": 77.56, "step": 40390, "token_acc": 0.40804597701149425, "train_speed(iter/s)": 1.440392 }, { "epoch": 1.7306456450023564, "grad_norm": 4.905223846435547, "learning_rate": 7.323959636949398e-05, "loss": 2.598267364501953, "memory(GiB)": 77.56, "step": 40395, "token_acc": 0.45514950166112955, "train_speed(iter/s)": 1.440425 }, { "epoch": 1.7308598603316052, "grad_norm": 6.1761016845703125, "learning_rate": 7.323363748378384e-05, "loss": 2.7709102630615234, "memory(GiB)": 77.56, "step": 40400, "token_acc": 0.43157894736842106, "train_speed(iter/s)": 1.440375 }, { "epoch": 1.7310740756608543, "grad_norm": 4.547285079956055, "learning_rate": 7.322767817717454e-05, "loss": 2.4007984161376954, "memory(GiB)": 77.56, "step": 40405, "token_acc": 0.44594594594594594, "train_speed(iter/s)": 1.440407 }, { "epoch": 1.7312882909901033, "grad_norm": 4.873405933380127, "learning_rate": 7.322171844977407e-05, "loss": 2.385569763183594, "memory(GiB)": 77.56, "step": 40410, "token_acc": 0.5170278637770898, "train_speed(iter/s)": 1.440398 }, { "epoch": 1.731502506319352, "grad_norm": 5.374251842498779, "learning_rate": 7.321575830169037e-05, "loss": 2.63199520111084, "memory(GiB)": 77.56, "step": 40415, "token_acc": 0.4371069182389937, "train_speed(iter/s)": 1.440382 }, { "epoch": 1.7317167216486011, "grad_norm": 5.448232173919678, "learning_rate": 7.320979773303144e-05, "loss": 2.6868612289428713, "memory(GiB)": 77.56, "step": 40420, "token_acc": 0.4573643410852713, "train_speed(iter/s)": 1.440409 }, { "epoch": 1.7319309369778502, "grad_norm": 5.896923542022705, "learning_rate": 7.320383674390525e-05, "loss": 2.3270648956298827, "memory(GiB)": 77.56, "step": 40425, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.440434 }, { "epoch": 1.732145152307099, "grad_norm": 4.544893741607666, "learning_rate": 7.319787533441981e-05, "loss": 2.6786264419555663, "memory(GiB)": 77.56, "step": 40430, "token_acc": 0.4495114006514658, "train_speed(iter/s)": 1.440451 }, { "epoch": 1.732359367636348, "grad_norm": 4.721642017364502, "learning_rate": 7.319191350468308e-05, "loss": 2.3792957305908202, "memory(GiB)": 77.56, "step": 40435, "token_acc": 0.49145299145299143, "train_speed(iter/s)": 1.440477 }, { "epoch": 1.732573582965597, "grad_norm": 4.844539642333984, "learning_rate": 7.318595125480308e-05, "loss": 2.4523681640625, "memory(GiB)": 77.56, "step": 40440, "token_acc": 0.4536082474226804, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.7327877982948459, "grad_norm": 6.116079330444336, "learning_rate": 7.317998858488783e-05, "loss": 2.4160694122314452, "memory(GiB)": 77.56, "step": 40445, "token_acc": 0.5065502183406113, "train_speed(iter/s)": 1.440436 }, { "epoch": 1.733002013624095, "grad_norm": 4.909404754638672, "learning_rate": 7.317402549504533e-05, "loss": 2.4019872665405275, "memory(GiB)": 77.56, "step": 40450, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.440445 }, { "epoch": 1.733216228953344, "grad_norm": 4.96981143951416, "learning_rate": 7.316806198538362e-05, "loss": 2.250261688232422, "memory(GiB)": 77.56, "step": 40455, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.440455 }, { "epoch": 1.7334304442825927, "grad_norm": 4.455904960632324, "learning_rate": 7.316209805601074e-05, "loss": 2.551084518432617, "memory(GiB)": 77.56, "step": 40460, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.7336446596118418, "grad_norm": 5.141857147216797, "learning_rate": 7.315613370703473e-05, "loss": 2.539170837402344, "memory(GiB)": 77.56, "step": 40465, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.440488 }, { "epoch": 1.7338588749410908, "grad_norm": 3.741856098175049, "learning_rate": 7.315016893856363e-05, "loss": 2.7437156677246093, "memory(GiB)": 77.56, "step": 40470, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.44047 }, { "epoch": 1.7340730902703396, "grad_norm": 5.1214213371276855, "learning_rate": 7.31442037507055e-05, "loss": 2.3721084594726562, "memory(GiB)": 77.56, "step": 40475, "token_acc": 0.5134328358208955, "train_speed(iter/s)": 1.440447 }, { "epoch": 1.7342873055995887, "grad_norm": 6.05742883682251, "learning_rate": 7.313823814356841e-05, "loss": 2.6238195419311525, "memory(GiB)": 77.56, "step": 40480, "token_acc": 0.4645390070921986, "train_speed(iter/s)": 1.44047 }, { "epoch": 1.7345015209288377, "grad_norm": 4.813930988311768, "learning_rate": 7.313227211726044e-05, "loss": 2.563539505004883, "memory(GiB)": 77.56, "step": 40485, "token_acc": 0.46875, "train_speed(iter/s)": 1.4405 }, { "epoch": 1.7347157362580865, "grad_norm": 4.328474044799805, "learning_rate": 7.312630567188965e-05, "loss": 2.696954536437988, "memory(GiB)": 77.56, "step": 40490, "token_acc": 0.4397163120567376, "train_speed(iter/s)": 1.440528 }, { "epoch": 1.7349299515873355, "grad_norm": 4.531674385070801, "learning_rate": 7.312033880756415e-05, "loss": 2.4875301361083983, "memory(GiB)": 77.56, "step": 40495, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.440489 }, { "epoch": 1.7351441669165846, "grad_norm": 4.905977249145508, "learning_rate": 7.311437152439201e-05, "loss": 2.4641048431396486, "memory(GiB)": 77.56, "step": 40500, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.440519 }, { "epoch": 1.7351441669165846, "eval_loss": 2.252992630004883, "eval_runtime": 14.9239, "eval_samples_per_second": 6.701, "eval_steps_per_second": 6.701, "eval_token_acc": 0.46740638002773927, "step": 40500 }, { "epoch": 1.7353583822458334, "grad_norm": 4.80511999130249, "learning_rate": 7.310840382248136e-05, "loss": 2.5491262435913087, "memory(GiB)": 77.56, "step": 40505, "token_acc": 0.468063872255489, "train_speed(iter/s)": 1.439743 }, { "epoch": 1.7355725975750824, "grad_norm": 6.622159004211426, "learning_rate": 7.310243570194029e-05, "loss": 2.644375228881836, "memory(GiB)": 77.56, "step": 40510, "token_acc": 0.462406015037594, "train_speed(iter/s)": 1.43978 }, { "epoch": 1.7357868129043315, "grad_norm": 5.362545013427734, "learning_rate": 7.309646716287692e-05, "loss": 2.366105079650879, "memory(GiB)": 77.56, "step": 40515, "token_acc": 0.5075987841945289, "train_speed(iter/s)": 1.439795 }, { "epoch": 1.7360010282335803, "grad_norm": 5.001947402954102, "learning_rate": 7.30904982053994e-05, "loss": 2.302295112609863, "memory(GiB)": 77.56, "step": 40520, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.439772 }, { "epoch": 1.7362152435628293, "grad_norm": 4.6217041015625, "learning_rate": 7.308452882961582e-05, "loss": 2.5398284912109377, "memory(GiB)": 77.56, "step": 40525, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.439747 }, { "epoch": 1.7364294588920783, "grad_norm": 4.250296592712402, "learning_rate": 7.307855903563435e-05, "loss": 2.555429458618164, "memory(GiB)": 77.56, "step": 40530, "token_acc": 0.46, "train_speed(iter/s)": 1.439796 }, { "epoch": 1.7366436742213271, "grad_norm": 6.470465660095215, "learning_rate": 7.307258882356314e-05, "loss": 2.4705730438232423, "memory(GiB)": 77.56, "step": 40535, "token_acc": 0.4981132075471698, "train_speed(iter/s)": 1.439741 }, { "epoch": 1.7368578895505762, "grad_norm": 4.547605514526367, "learning_rate": 7.306661819351033e-05, "loss": 2.268496513366699, "memory(GiB)": 77.56, "step": 40540, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.439733 }, { "epoch": 1.7370721048798252, "grad_norm": 5.024112701416016, "learning_rate": 7.306064714558412e-05, "loss": 2.406391143798828, "memory(GiB)": 77.56, "step": 40545, "token_acc": 0.48623853211009177, "train_speed(iter/s)": 1.439655 }, { "epoch": 1.737286320209074, "grad_norm": 6.351881980895996, "learning_rate": 7.305467567989263e-05, "loss": 2.446776580810547, "memory(GiB)": 77.56, "step": 40550, "token_acc": 0.5053763440860215, "train_speed(iter/s)": 1.439664 }, { "epoch": 1.7375005355383233, "grad_norm": 6.156836032867432, "learning_rate": 7.304870379654407e-05, "loss": 2.5387594223022463, "memory(GiB)": 77.56, "step": 40555, "token_acc": 0.4696969696969697, "train_speed(iter/s)": 1.439714 }, { "epoch": 1.737714750867572, "grad_norm": 4.673503875732422, "learning_rate": 7.304273149564662e-05, "loss": 2.4603137969970703, "memory(GiB)": 77.56, "step": 40560, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.43975 }, { "epoch": 1.737928966196821, "grad_norm": 5.059421539306641, "learning_rate": 7.303675877730848e-05, "loss": 2.2620702743530274, "memory(GiB)": 77.56, "step": 40565, "token_acc": 0.4820359281437126, "train_speed(iter/s)": 1.439795 }, { "epoch": 1.7381431815260702, "grad_norm": 4.947644233703613, "learning_rate": 7.303078564163783e-05, "loss": 2.7470006942749023, "memory(GiB)": 77.56, "step": 40570, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.439811 }, { "epoch": 1.738357396855319, "grad_norm": 5.241748809814453, "learning_rate": 7.30248120887429e-05, "loss": 2.7889389038085937, "memory(GiB)": 77.56, "step": 40575, "token_acc": 0.42105263157894735, "train_speed(iter/s)": 1.439828 }, { "epoch": 1.7385716121845678, "grad_norm": 4.976478099822998, "learning_rate": 7.301883811873191e-05, "loss": 2.6292093276977537, "memory(GiB)": 77.56, "step": 40580, "token_acc": 0.47440273037542663, "train_speed(iter/s)": 1.43985 }, { "epoch": 1.738785827513817, "grad_norm": 5.1577677726745605, "learning_rate": 7.301286373171305e-05, "loss": 2.3530048370361327, "memory(GiB)": 77.56, "step": 40585, "token_acc": 0.5, "train_speed(iter/s)": 1.439832 }, { "epoch": 1.7390000428430659, "grad_norm": 6.54850435256958, "learning_rate": 7.30068889277946e-05, "loss": 2.4994314193725584, "memory(GiB)": 77.56, "step": 40590, "token_acc": 0.47491638795986624, "train_speed(iter/s)": 1.439789 }, { "epoch": 1.7392142581723147, "grad_norm": 6.911805629730225, "learning_rate": 7.300091370708476e-05, "loss": 2.661682891845703, "memory(GiB)": 77.56, "step": 40595, "token_acc": 0.48, "train_speed(iter/s)": 1.439832 }, { "epoch": 1.739428473501564, "grad_norm": 5.412624835968018, "learning_rate": 7.29949380696918e-05, "loss": 2.5546459197998046, "memory(GiB)": 77.56, "step": 40600, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.439827 }, { "epoch": 1.7396426888308127, "grad_norm": 7.305591106414795, "learning_rate": 7.298896201572396e-05, "loss": 2.5734928131103514, "memory(GiB)": 77.56, "step": 40605, "token_acc": 0.47577092511013214, "train_speed(iter/s)": 1.439839 }, { "epoch": 1.7398569041600616, "grad_norm": 3.903895616531372, "learning_rate": 7.298298554528952e-05, "loss": 2.597462844848633, "memory(GiB)": 77.56, "step": 40610, "token_acc": 0.4817073170731707, "train_speed(iter/s)": 1.43983 }, { "epoch": 1.7400711194893108, "grad_norm": 4.521829605102539, "learning_rate": 7.297700865849672e-05, "loss": 2.549844169616699, "memory(GiB)": 77.56, "step": 40615, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.439827 }, { "epoch": 1.7402853348185596, "grad_norm": 6.840327739715576, "learning_rate": 7.297103135545387e-05, "loss": 2.335257720947266, "memory(GiB)": 77.56, "step": 40620, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.439856 }, { "epoch": 1.7404995501478084, "grad_norm": 8.244111061096191, "learning_rate": 7.296505363626922e-05, "loss": 2.7429428100585938, "memory(GiB)": 77.56, "step": 40625, "token_acc": 0.475, "train_speed(iter/s)": 1.439786 }, { "epoch": 1.7407137654770577, "grad_norm": 4.791749954223633, "learning_rate": 7.29590755010511e-05, "loss": 2.3054740905761717, "memory(GiB)": 77.56, "step": 40630, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.439785 }, { "epoch": 1.7409279808063065, "grad_norm": 5.039892196655273, "learning_rate": 7.295309694990777e-05, "loss": 2.410167694091797, "memory(GiB)": 77.56, "step": 40635, "token_acc": 0.5041666666666667, "train_speed(iter/s)": 1.439787 }, { "epoch": 1.7411421961355553, "grad_norm": 5.623873710632324, "learning_rate": 7.294711798294758e-05, "loss": 2.36167049407959, "memory(GiB)": 77.56, "step": 40640, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.439802 }, { "epoch": 1.7413564114648046, "grad_norm": 7.293215751647949, "learning_rate": 7.29411386002788e-05, "loss": 2.4238561630249023, "memory(GiB)": 77.56, "step": 40645, "token_acc": 0.5064377682403434, "train_speed(iter/s)": 1.439811 }, { "epoch": 1.7415706267940534, "grad_norm": 3.7725753784179688, "learning_rate": 7.293515880200978e-05, "loss": 2.543780708312988, "memory(GiB)": 77.56, "step": 40650, "token_acc": 0.47941176470588237, "train_speed(iter/s)": 1.43979 }, { "epoch": 1.7417848421233022, "grad_norm": 4.8384881019592285, "learning_rate": 7.292917858824885e-05, "loss": 2.6027462005615236, "memory(GiB)": 77.56, "step": 40655, "token_acc": 0.44518272425249167, "train_speed(iter/s)": 1.439787 }, { "epoch": 1.7419990574525515, "grad_norm": 4.457609176635742, "learning_rate": 7.292319795910434e-05, "loss": 2.5626333236694334, "memory(GiB)": 77.56, "step": 40660, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.439737 }, { "epoch": 1.7422132727818003, "grad_norm": 6.2772064208984375, "learning_rate": 7.29172169146846e-05, "loss": 2.378091049194336, "memory(GiB)": 77.56, "step": 40665, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.439735 }, { "epoch": 1.742427488111049, "grad_norm": 4.154864311218262, "learning_rate": 7.291123545509796e-05, "loss": 2.60838623046875, "memory(GiB)": 77.56, "step": 40670, "token_acc": 0.4575757575757576, "train_speed(iter/s)": 1.439747 }, { "epoch": 1.7426417034402983, "grad_norm": 4.296637058258057, "learning_rate": 7.290525358045279e-05, "loss": 2.543315887451172, "memory(GiB)": 77.56, "step": 40675, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.439745 }, { "epoch": 1.7428559187695472, "grad_norm": 4.667581081390381, "learning_rate": 7.289927129085749e-05, "loss": 2.6319957733154298, "memory(GiB)": 77.56, "step": 40680, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.439765 }, { "epoch": 1.743070134098796, "grad_norm": 4.641863822937012, "learning_rate": 7.28932885864204e-05, "loss": 2.4675020217895507, "memory(GiB)": 77.56, "step": 40685, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.439802 }, { "epoch": 1.7432843494280452, "grad_norm": 6.663127422332764, "learning_rate": 7.28873054672499e-05, "loss": 2.5931159973144533, "memory(GiB)": 77.56, "step": 40690, "token_acc": 0.5, "train_speed(iter/s)": 1.439792 }, { "epoch": 1.743498564757294, "grad_norm": 6.932421684265137, "learning_rate": 7.288132193345443e-05, "loss": 2.7125431060791017, "memory(GiB)": 77.56, "step": 40695, "token_acc": 0.4606413994169096, "train_speed(iter/s)": 1.439785 }, { "epoch": 1.7437127800865428, "grad_norm": 6.435955047607422, "learning_rate": 7.287533798514231e-05, "loss": 2.5684553146362306, "memory(GiB)": 77.56, "step": 40700, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.439777 }, { "epoch": 1.743926995415792, "grad_norm": 4.493941307067871, "learning_rate": 7.2869353622422e-05, "loss": 2.286090087890625, "memory(GiB)": 77.56, "step": 40705, "token_acc": 0.4978165938864629, "train_speed(iter/s)": 1.439828 }, { "epoch": 1.744141210745041, "grad_norm": 6.2130913734436035, "learning_rate": 7.286336884540189e-05, "loss": 2.1310964584350587, "memory(GiB)": 77.56, "step": 40710, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.439855 }, { "epoch": 1.7443554260742897, "grad_norm": 4.717353820800781, "learning_rate": 7.28573836541904e-05, "loss": 2.5709308624267577, "memory(GiB)": 77.56, "step": 40715, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.439865 }, { "epoch": 1.744569641403539, "grad_norm": 5.047801971435547, "learning_rate": 7.285139804889598e-05, "loss": 2.5895423889160156, "memory(GiB)": 77.56, "step": 40720, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.43985 }, { "epoch": 1.7447838567327878, "grad_norm": 6.486758708953857, "learning_rate": 7.284541202962704e-05, "loss": 2.865602493286133, "memory(GiB)": 77.56, "step": 40725, "token_acc": 0.421259842519685, "train_speed(iter/s)": 1.439873 }, { "epoch": 1.7449980720620366, "grad_norm": 5.440725803375244, "learning_rate": 7.283942559649202e-05, "loss": 2.575438690185547, "memory(GiB)": 77.56, "step": 40730, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.439857 }, { "epoch": 1.7452122873912859, "grad_norm": 5.528493404388428, "learning_rate": 7.283343874959941e-05, "loss": 2.5446495056152343, "memory(GiB)": 77.56, "step": 40735, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.439841 }, { "epoch": 1.7454265027205347, "grad_norm": 3.583773612976074, "learning_rate": 7.282745148905759e-05, "loss": 2.553518867492676, "memory(GiB)": 77.56, "step": 40740, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.439836 }, { "epoch": 1.7456407180497835, "grad_norm": 4.390019416809082, "learning_rate": 7.28214638149751e-05, "loss": 2.5765850067138674, "memory(GiB)": 77.56, "step": 40745, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.439827 }, { "epoch": 1.7458549333790327, "grad_norm": 4.111855983734131, "learning_rate": 7.28154757274604e-05, "loss": 2.502342414855957, "memory(GiB)": 77.56, "step": 40750, "token_acc": 0.5, "train_speed(iter/s)": 1.439828 }, { "epoch": 1.7460691487082816, "grad_norm": 4.50808048248291, "learning_rate": 7.280948722662194e-05, "loss": 2.421772766113281, "memory(GiB)": 77.56, "step": 40755, "token_acc": 0.45871559633027525, "train_speed(iter/s)": 1.439864 }, { "epoch": 1.7462833640375304, "grad_norm": 5.665604114532471, "learning_rate": 7.280349831256821e-05, "loss": 2.6266101837158202, "memory(GiB)": 77.56, "step": 40760, "token_acc": 0.47183098591549294, "train_speed(iter/s)": 1.439902 }, { "epoch": 1.7464975793667796, "grad_norm": 3.7740471363067627, "learning_rate": 7.279750898540774e-05, "loss": 2.191357421875, "memory(GiB)": 77.56, "step": 40765, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.439876 }, { "epoch": 1.7467117946960284, "grad_norm": 6.237283706665039, "learning_rate": 7.279151924524899e-05, "loss": 2.4772674560546877, "memory(GiB)": 77.56, "step": 40770, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.439898 }, { "epoch": 1.7469260100252773, "grad_norm": 4.670551776885986, "learning_rate": 7.278552909220049e-05, "loss": 2.4850196838378906, "memory(GiB)": 77.56, "step": 40775, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.439908 }, { "epoch": 1.7471402253545265, "grad_norm": 4.446109294891357, "learning_rate": 7.277953852637076e-05, "loss": 2.4417701721191407, "memory(GiB)": 77.56, "step": 40780, "token_acc": 0.45151515151515154, "train_speed(iter/s)": 1.439901 }, { "epoch": 1.7473544406837753, "grad_norm": 4.435632228851318, "learning_rate": 7.277354754786832e-05, "loss": 2.644916534423828, "memory(GiB)": 77.56, "step": 40785, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.439934 }, { "epoch": 1.7475686560130241, "grad_norm": 4.747522354125977, "learning_rate": 7.27675561568017e-05, "loss": 2.4676158905029295, "memory(GiB)": 77.56, "step": 40790, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.43992 }, { "epoch": 1.7477828713422734, "grad_norm": 4.9789018630981445, "learning_rate": 7.276156435327946e-05, "loss": 2.4871990203857424, "memory(GiB)": 77.56, "step": 40795, "token_acc": 0.4867924528301887, "train_speed(iter/s)": 1.439914 }, { "epoch": 1.7479970866715222, "grad_norm": 5.806180477142334, "learning_rate": 7.27555721374101e-05, "loss": 2.245492935180664, "memory(GiB)": 77.56, "step": 40800, "token_acc": 0.5061728395061729, "train_speed(iter/s)": 1.439923 }, { "epoch": 1.748211302000771, "grad_norm": 5.388152599334717, "learning_rate": 7.274957950930223e-05, "loss": 2.5515552520751954, "memory(GiB)": 77.56, "step": 40805, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.439936 }, { "epoch": 1.7484255173300203, "grad_norm": 5.357909202575684, "learning_rate": 7.274358646906438e-05, "loss": 2.761534881591797, "memory(GiB)": 77.56, "step": 40810, "token_acc": 0.4553314121037464, "train_speed(iter/s)": 1.439951 }, { "epoch": 1.748639732659269, "grad_norm": 3.8138606548309326, "learning_rate": 7.273759301680511e-05, "loss": 2.443224334716797, "memory(GiB)": 77.56, "step": 40815, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.439959 }, { "epoch": 1.748853947988518, "grad_norm": 4.7785444259643555, "learning_rate": 7.273159915263303e-05, "loss": 2.490665245056152, "memory(GiB)": 77.56, "step": 40820, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.439958 }, { "epoch": 1.7490681633177672, "grad_norm": 5.238621711730957, "learning_rate": 7.27256048766567e-05, "loss": 2.3670316696166993, "memory(GiB)": 77.56, "step": 40825, "token_acc": 0.5477178423236515, "train_speed(iter/s)": 1.439971 }, { "epoch": 1.749282378647016, "grad_norm": 4.694814205169678, "learning_rate": 7.271961018898473e-05, "loss": 2.3893117904663086, "memory(GiB)": 77.56, "step": 40830, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.439967 }, { "epoch": 1.7494965939762648, "grad_norm": 7.3936848640441895, "learning_rate": 7.27136150897257e-05, "loss": 2.376717758178711, "memory(GiB)": 77.56, "step": 40835, "token_acc": 0.5125, "train_speed(iter/s)": 1.439947 }, { "epoch": 1.749710809305514, "grad_norm": 5.1302103996276855, "learning_rate": 7.270761957898823e-05, "loss": 2.715974044799805, "memory(GiB)": 77.56, "step": 40840, "token_acc": 0.41403508771929826, "train_speed(iter/s)": 1.439969 }, { "epoch": 1.7499250246347628, "grad_norm": 4.127560615539551, "learning_rate": 7.270162365688092e-05, "loss": 2.2237516403198243, "memory(GiB)": 77.56, "step": 40845, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.439949 }, { "epoch": 1.7501392399640117, "grad_norm": 9.205788612365723, "learning_rate": 7.269562732351241e-05, "loss": 2.6029834747314453, "memory(GiB)": 77.56, "step": 40850, "token_acc": 0.4978165938864629, "train_speed(iter/s)": 1.439966 }, { "epoch": 1.750353455293261, "grad_norm": 5.841698169708252, "learning_rate": 7.268963057899132e-05, "loss": 2.5094099044799805, "memory(GiB)": 77.56, "step": 40855, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.439963 }, { "epoch": 1.7505676706225097, "grad_norm": 4.756931304931641, "learning_rate": 7.268363342342628e-05, "loss": 2.6328990936279295, "memory(GiB)": 77.56, "step": 40860, "token_acc": 0.4551083591331269, "train_speed(iter/s)": 1.43999 }, { "epoch": 1.7507818859517585, "grad_norm": 4.951889514923096, "learning_rate": 7.267763585692595e-05, "loss": 2.719615364074707, "memory(GiB)": 77.56, "step": 40865, "token_acc": 0.4646153846153846, "train_speed(iter/s)": 1.440017 }, { "epoch": 1.7509961012810078, "grad_norm": 3.907123565673828, "learning_rate": 7.267163787959898e-05, "loss": 2.297977256774902, "memory(GiB)": 77.56, "step": 40870, "token_acc": 0.5206611570247934, "train_speed(iter/s)": 1.440033 }, { "epoch": 1.7512103166102566, "grad_norm": 5.126631736755371, "learning_rate": 7.2665639491554e-05, "loss": 2.6837745666503907, "memory(GiB)": 77.56, "step": 40875, "token_acc": 0.42058823529411765, "train_speed(iter/s)": 1.440065 }, { "epoch": 1.7514245319395054, "grad_norm": 5.0975751876831055, "learning_rate": 7.265964069289972e-05, "loss": 2.415487861633301, "memory(GiB)": 77.56, "step": 40880, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.44007 }, { "epoch": 1.7516387472687547, "grad_norm": 7.13967752456665, "learning_rate": 7.265364148374478e-05, "loss": 2.5960275650024416, "memory(GiB)": 77.56, "step": 40885, "token_acc": 0.45768025078369906, "train_speed(iter/s)": 1.440095 }, { "epoch": 1.7518529625980035, "grad_norm": 4.449210166931152, "learning_rate": 7.264764186419788e-05, "loss": 2.753332328796387, "memory(GiB)": 77.56, "step": 40890, "token_acc": 0.41637010676156583, "train_speed(iter/s)": 1.440158 }, { "epoch": 1.7520671779272525, "grad_norm": 6.141135215759277, "learning_rate": 7.26416418343677e-05, "loss": 2.5775833129882812, "memory(GiB)": 77.56, "step": 40895, "token_acc": 0.43820224719101125, "train_speed(iter/s)": 1.440184 }, { "epoch": 1.7522813932565016, "grad_norm": 5.39845609664917, "learning_rate": 7.263564139436294e-05, "loss": 2.3636911392211912, "memory(GiB)": 77.56, "step": 40900, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.440214 }, { "epoch": 1.7524956085857504, "grad_norm": 4.237954139709473, "learning_rate": 7.26296405442923e-05, "loss": 2.444438934326172, "memory(GiB)": 77.56, "step": 40905, "token_acc": 0.5014925373134328, "train_speed(iter/s)": 1.440243 }, { "epoch": 1.7527098239149994, "grad_norm": 4.258236408233643, "learning_rate": 7.26236392842645e-05, "loss": 2.5143136978149414, "memory(GiB)": 77.56, "step": 40910, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.440267 }, { "epoch": 1.7529240392442484, "grad_norm": 5.368627071380615, "learning_rate": 7.261763761438824e-05, "loss": 2.27318115234375, "memory(GiB)": 77.56, "step": 40915, "token_acc": 0.51, "train_speed(iter/s)": 1.44027 }, { "epoch": 1.7531382545734973, "grad_norm": 6.429764747619629, "learning_rate": 7.261163553477226e-05, "loss": 2.5919261932373048, "memory(GiB)": 77.56, "step": 40920, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.440282 }, { "epoch": 1.7533524699027463, "grad_norm": 6.166162014007568, "learning_rate": 7.26056330455253e-05, "loss": 2.3379310607910155, "memory(GiB)": 77.56, "step": 40925, "token_acc": 0.5148148148148148, "train_speed(iter/s)": 1.440318 }, { "epoch": 1.7535666852319953, "grad_norm": 5.194517612457275, "learning_rate": 7.259963014675608e-05, "loss": 2.538084030151367, "memory(GiB)": 77.56, "step": 40930, "token_acc": 0.4584615384615385, "train_speed(iter/s)": 1.440355 }, { "epoch": 1.7537809005612441, "grad_norm": 4.538455963134766, "learning_rate": 7.259362683857336e-05, "loss": 2.637641716003418, "memory(GiB)": 77.56, "step": 40935, "token_acc": 0.4690909090909091, "train_speed(iter/s)": 1.440388 }, { "epoch": 1.7539951158904932, "grad_norm": 4.2257184982299805, "learning_rate": 7.258762312108591e-05, "loss": 2.419940185546875, "memory(GiB)": 77.56, "step": 40940, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.440354 }, { "epoch": 1.7542093312197422, "grad_norm": 5.460875988006592, "learning_rate": 7.258161899440246e-05, "loss": 2.3541849136352537, "memory(GiB)": 77.56, "step": 40945, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.44034 }, { "epoch": 1.754423546548991, "grad_norm": 6.190771102905273, "learning_rate": 7.257561445863182e-05, "loss": 2.557106781005859, "memory(GiB)": 77.56, "step": 40950, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.440363 }, { "epoch": 1.75463776187824, "grad_norm": 5.101850986480713, "learning_rate": 7.256960951388274e-05, "loss": 2.0622623443603514, "memory(GiB)": 77.56, "step": 40955, "token_acc": 0.5691699604743083, "train_speed(iter/s)": 1.440361 }, { "epoch": 1.754851977207489, "grad_norm": 3.9329185485839844, "learning_rate": 7.2563604160264e-05, "loss": 2.7492170333862305, "memory(GiB)": 77.56, "step": 40960, "token_acc": 0.4983922829581994, "train_speed(iter/s)": 1.440382 }, { "epoch": 1.755066192536738, "grad_norm": 4.481163024902344, "learning_rate": 7.25575983978844e-05, "loss": 2.3399169921875, "memory(GiB)": 77.56, "step": 40965, "token_acc": 0.5016835016835017, "train_speed(iter/s)": 1.440381 }, { "epoch": 1.755280407865987, "grad_norm": 5.865595817565918, "learning_rate": 7.255159222685277e-05, "loss": 2.3756961822509766, "memory(GiB)": 77.56, "step": 40970, "token_acc": 0.43986254295532645, "train_speed(iter/s)": 1.440405 }, { "epoch": 1.755494623195236, "grad_norm": 5.64993953704834, "learning_rate": 7.254558564727786e-05, "loss": 2.5981561660766603, "memory(GiB)": 77.56, "step": 40975, "token_acc": 0.44516129032258067, "train_speed(iter/s)": 1.440419 }, { "epoch": 1.7557088385244848, "grad_norm": 4.915028095245361, "learning_rate": 7.253957865926854e-05, "loss": 2.503937911987305, "memory(GiB)": 77.56, "step": 40980, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.440413 }, { "epoch": 1.7559230538537338, "grad_norm": 5.909773826599121, "learning_rate": 7.253357126293361e-05, "loss": 2.189768600463867, "memory(GiB)": 77.56, "step": 40985, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.44042 }, { "epoch": 1.7561372691829829, "grad_norm": 4.224458694458008, "learning_rate": 7.252756345838187e-05, "loss": 2.4832731246948243, "memory(GiB)": 77.56, "step": 40990, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.440475 }, { "epoch": 1.7563514845122317, "grad_norm": 5.250662803649902, "learning_rate": 7.252155524572222e-05, "loss": 2.818433952331543, "memory(GiB)": 77.56, "step": 40995, "token_acc": 0.445993031358885, "train_speed(iter/s)": 1.440457 }, { "epoch": 1.7565656998414807, "grad_norm": 4.951468467712402, "learning_rate": 7.251554662506346e-05, "loss": 2.559613037109375, "memory(GiB)": 77.56, "step": 41000, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.440462 }, { "epoch": 1.7565656998414807, "eval_loss": 2.1070873737335205, "eval_runtime": 13.8064, "eval_samples_per_second": 7.243, "eval_steps_per_second": 7.243, "eval_token_acc": 0.502283105022831, "step": 41000 }, { "epoch": 1.7567799151707297, "grad_norm": 5.524142742156982, "learning_rate": 7.250953759651443e-05, "loss": 2.285837173461914, "memory(GiB)": 77.56, "step": 41005, "token_acc": 0.49739311783107404, "train_speed(iter/s)": 1.439688 }, { "epoch": 1.7569941304999785, "grad_norm": 5.306302070617676, "learning_rate": 7.250352816018402e-05, "loss": 2.702947425842285, "memory(GiB)": 77.56, "step": 41010, "token_acc": 0.4539877300613497, "train_speed(iter/s)": 1.439664 }, { "epoch": 1.7572083458292276, "grad_norm": 4.793148040771484, "learning_rate": 7.24975183161811e-05, "loss": 2.4987409591674803, "memory(GiB)": 77.56, "step": 41015, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.439657 }, { "epoch": 1.7574225611584766, "grad_norm": 4.849996566772461, "learning_rate": 7.249150806461453e-05, "loss": 2.0000219345092773, "memory(GiB)": 77.56, "step": 41020, "token_acc": 0.5860655737704918, "train_speed(iter/s)": 1.439639 }, { "epoch": 1.7576367764877254, "grad_norm": 5.711615562438965, "learning_rate": 7.248549740559319e-05, "loss": 2.657586669921875, "memory(GiB)": 77.56, "step": 41025, "token_acc": 0.48125, "train_speed(iter/s)": 1.439641 }, { "epoch": 1.7578509918169745, "grad_norm": 5.536566257476807, "learning_rate": 7.247948633922597e-05, "loss": 2.314360809326172, "memory(GiB)": 77.56, "step": 41030, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.439653 }, { "epoch": 1.7580652071462235, "grad_norm": 4.747581958770752, "learning_rate": 7.247347486562177e-05, "loss": 2.78458251953125, "memory(GiB)": 77.56, "step": 41035, "token_acc": 0.4527687296416938, "train_speed(iter/s)": 1.439661 }, { "epoch": 1.7582794224754723, "grad_norm": 5.256016731262207, "learning_rate": 7.246746298488949e-05, "loss": 2.5093849182128904, "memory(GiB)": 77.56, "step": 41040, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.439646 }, { "epoch": 1.7584936378047213, "grad_norm": 4.9303131103515625, "learning_rate": 7.246145069713804e-05, "loss": 2.653642272949219, "memory(GiB)": 77.56, "step": 41045, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.439685 }, { "epoch": 1.7587078531339704, "grad_norm": 5.3557233810424805, "learning_rate": 7.245543800247634e-05, "loss": 2.302123260498047, "memory(GiB)": 77.56, "step": 41050, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.439664 }, { "epoch": 1.7589220684632192, "grad_norm": 5.448339939117432, "learning_rate": 7.244942490101332e-05, "loss": 2.6585994720458985, "memory(GiB)": 77.56, "step": 41055, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.439636 }, { "epoch": 1.7591362837924682, "grad_norm": 4.330277442932129, "learning_rate": 7.24434113928579e-05, "loss": 2.4772113800048827, "memory(GiB)": 77.56, "step": 41060, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.439667 }, { "epoch": 1.7593504991217173, "grad_norm": 5.223570823669434, "learning_rate": 7.243739747811903e-05, "loss": 2.4174676895141602, "memory(GiB)": 77.56, "step": 41065, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.439694 }, { "epoch": 1.759564714450966, "grad_norm": 4.613308429718018, "learning_rate": 7.243138315690567e-05, "loss": 2.6890644073486327, "memory(GiB)": 77.56, "step": 41070, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.439736 }, { "epoch": 1.759778929780215, "grad_norm": 4.7630462646484375, "learning_rate": 7.242536842932675e-05, "loss": 2.692647171020508, "memory(GiB)": 77.56, "step": 41075, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.439749 }, { "epoch": 1.7599931451094641, "grad_norm": 4.018398284912109, "learning_rate": 7.241935329549125e-05, "loss": 2.9527984619140626, "memory(GiB)": 77.56, "step": 41080, "token_acc": 0.3961218836565097, "train_speed(iter/s)": 1.439776 }, { "epoch": 1.760207360438713, "grad_norm": 4.387391567230225, "learning_rate": 7.241333775550813e-05, "loss": 2.555002975463867, "memory(GiB)": 77.56, "step": 41085, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.43978 }, { "epoch": 1.760421575767962, "grad_norm": 4.582672595977783, "learning_rate": 7.240732180948637e-05, "loss": 2.626940155029297, "memory(GiB)": 77.56, "step": 41090, "token_acc": 0.44886363636363635, "train_speed(iter/s)": 1.439762 }, { "epoch": 1.760635791097211, "grad_norm": 5.070387840270996, "learning_rate": 7.240130545753496e-05, "loss": 2.461312675476074, "memory(GiB)": 77.56, "step": 41095, "token_acc": 0.46540880503144655, "train_speed(iter/s)": 1.439801 }, { "epoch": 1.7608500064264598, "grad_norm": 6.758989334106445, "learning_rate": 7.239528869976288e-05, "loss": 2.7175647735595705, "memory(GiB)": 77.56, "step": 41100, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.439832 }, { "epoch": 1.7610642217557089, "grad_norm": 5.756537437438965, "learning_rate": 7.238927153627914e-05, "loss": 2.6489498138427736, "memory(GiB)": 77.56, "step": 41105, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.43984 }, { "epoch": 1.761278437084958, "grad_norm": 5.375679969787598, "learning_rate": 7.238325396719275e-05, "loss": 2.4181610107421876, "memory(GiB)": 77.56, "step": 41110, "token_acc": 0.5, "train_speed(iter/s)": 1.439825 }, { "epoch": 1.7614926524142067, "grad_norm": 5.2676873207092285, "learning_rate": 7.23772359926127e-05, "loss": 2.7358272552490233, "memory(GiB)": 77.56, "step": 41115, "token_acc": 0.45652173913043476, "train_speed(iter/s)": 1.439848 }, { "epoch": 1.7617068677434558, "grad_norm": 4.4719767570495605, "learning_rate": 7.237121761264805e-05, "loss": 2.507393646240234, "memory(GiB)": 77.56, "step": 41120, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.439867 }, { "epoch": 1.7619210830727048, "grad_norm": 3.822356939315796, "learning_rate": 7.23651988274078e-05, "loss": 2.652963066101074, "memory(GiB)": 77.56, "step": 41125, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.439877 }, { "epoch": 1.7621352984019536, "grad_norm": 6.476071357727051, "learning_rate": 7.235917963700098e-05, "loss": 2.8798969268798826, "memory(GiB)": 77.56, "step": 41130, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.439907 }, { "epoch": 1.7623495137312026, "grad_norm": 4.881564140319824, "learning_rate": 7.235316004153667e-05, "loss": 2.4644643783569338, "memory(GiB)": 77.56, "step": 41135, "token_acc": 0.47854785478547857, "train_speed(iter/s)": 1.439933 }, { "epoch": 1.7625637290604517, "grad_norm": 4.694281101226807, "learning_rate": 7.234714004112388e-05, "loss": 2.473439025878906, "memory(GiB)": 77.56, "step": 41140, "token_acc": 0.4387755102040816, "train_speed(iter/s)": 1.439956 }, { "epoch": 1.7627779443897005, "grad_norm": 4.466538906097412, "learning_rate": 7.234111963587169e-05, "loss": 2.450484848022461, "memory(GiB)": 77.56, "step": 41145, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.439966 }, { "epoch": 1.7629921597189495, "grad_norm": 3.744291305541992, "learning_rate": 7.233509882588914e-05, "loss": 2.535540771484375, "memory(GiB)": 77.56, "step": 41150, "token_acc": 0.468586387434555, "train_speed(iter/s)": 1.439992 }, { "epoch": 1.7632063750481985, "grad_norm": 5.230288982391357, "learning_rate": 7.232907761128535e-05, "loss": 2.702992248535156, "memory(GiB)": 77.56, "step": 41155, "token_acc": 0.46607669616519176, "train_speed(iter/s)": 1.439998 }, { "epoch": 1.7634205903774474, "grad_norm": 5.842164516448975, "learning_rate": 7.232305599216938e-05, "loss": 2.5083232879638673, "memory(GiB)": 77.56, "step": 41160, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.439991 }, { "epoch": 1.7636348057066964, "grad_norm": 5.75670862197876, "learning_rate": 7.231703396865029e-05, "loss": 2.2387208938598633, "memory(GiB)": 77.56, "step": 41165, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.439951 }, { "epoch": 1.7638490210359454, "grad_norm": 5.6118998527526855, "learning_rate": 7.23110115408372e-05, "loss": 2.2752613067626952, "memory(GiB)": 77.56, "step": 41170, "token_acc": 0.49814126394052044, "train_speed(iter/s)": 1.439923 }, { "epoch": 1.7640632363651942, "grad_norm": 4.402761459350586, "learning_rate": 7.23049887088392e-05, "loss": 2.512615776062012, "memory(GiB)": 77.56, "step": 41175, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.439958 }, { "epoch": 1.7642774516944433, "grad_norm": 4.5815653800964355, "learning_rate": 7.22989654727654e-05, "loss": 2.5429006576538087, "memory(GiB)": 77.56, "step": 41180, "token_acc": 0.5146579804560261, "train_speed(iter/s)": 1.43988 }, { "epoch": 1.7644916670236923, "grad_norm": 5.889723777770996, "learning_rate": 7.229294183272495e-05, "loss": 2.3612640380859373, "memory(GiB)": 77.56, "step": 41185, "token_acc": 0.4847328244274809, "train_speed(iter/s)": 1.439876 }, { "epoch": 1.7647058823529411, "grad_norm": 4.5917887687683105, "learning_rate": 7.228691778882693e-05, "loss": 2.602444076538086, "memory(GiB)": 77.56, "step": 41190, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.439844 }, { "epoch": 1.7649200976821902, "grad_norm": 5.886419773101807, "learning_rate": 7.228089334118047e-05, "loss": 2.5652881622314454, "memory(GiB)": 77.56, "step": 41195, "token_acc": 0.44807121661721067, "train_speed(iter/s)": 1.439868 }, { "epoch": 1.7651343130114392, "grad_norm": 4.75514554977417, "learning_rate": 7.227486848989475e-05, "loss": 2.4506084442138674, "memory(GiB)": 77.56, "step": 41200, "token_acc": 0.465625, "train_speed(iter/s)": 1.439886 }, { "epoch": 1.765348528340688, "grad_norm": 6.189976215362549, "learning_rate": 7.226884323507887e-05, "loss": 2.3964977264404297, "memory(GiB)": 77.56, "step": 41205, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.439918 }, { "epoch": 1.765562743669937, "grad_norm": 4.561886310577393, "learning_rate": 7.226281757684204e-05, "loss": 2.515126037597656, "memory(GiB)": 77.56, "step": 41210, "token_acc": 0.4427710843373494, "train_speed(iter/s)": 1.439932 }, { "epoch": 1.765776958999186, "grad_norm": 4.111982345581055, "learning_rate": 7.225679151529337e-05, "loss": 2.322934150695801, "memory(GiB)": 77.56, "step": 41215, "token_acc": 0.496875, "train_speed(iter/s)": 1.439923 }, { "epoch": 1.7659911743284349, "grad_norm": 6.711462497711182, "learning_rate": 7.225076505054202e-05, "loss": 2.3757816314697267, "memory(GiB)": 77.56, "step": 41220, "token_acc": 0.48497854077253216, "train_speed(iter/s)": 1.439916 }, { "epoch": 1.766205389657684, "grad_norm": 7.27276086807251, "learning_rate": 7.224473818269721e-05, "loss": 2.670970153808594, "memory(GiB)": 77.56, "step": 41225, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.439933 }, { "epoch": 1.766419604986933, "grad_norm": 4.742341041564941, "learning_rate": 7.22387109118681e-05, "loss": 2.542632293701172, "memory(GiB)": 77.56, "step": 41230, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.439888 }, { "epoch": 1.7666338203161818, "grad_norm": 3.990126848220825, "learning_rate": 7.223268323816386e-05, "loss": 2.285426139831543, "memory(GiB)": 77.56, "step": 41235, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.439871 }, { "epoch": 1.7668480356454308, "grad_norm": 4.492398738861084, "learning_rate": 7.222665516169373e-05, "loss": 2.734975814819336, "memory(GiB)": 77.56, "step": 41240, "token_acc": 0.43302180685358255, "train_speed(iter/s)": 1.439904 }, { "epoch": 1.7670622509746798, "grad_norm": 5.796911239624023, "learning_rate": 7.22206266825669e-05, "loss": 2.61248836517334, "memory(GiB)": 77.56, "step": 41245, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.439914 }, { "epoch": 1.7672764663039287, "grad_norm": 6.271621227264404, "learning_rate": 7.221459780089255e-05, "loss": 2.5292232513427733, "memory(GiB)": 77.56, "step": 41250, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.43988 }, { "epoch": 1.7674906816331777, "grad_norm": 3.8992645740509033, "learning_rate": 7.220856851677994e-05, "loss": 2.5859981536865235, "memory(GiB)": 77.56, "step": 41255, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.439914 }, { "epoch": 1.7677048969624267, "grad_norm": 6.706957817077637, "learning_rate": 7.220253883033827e-05, "loss": 2.9659212112426756, "memory(GiB)": 77.56, "step": 41260, "token_acc": 0.4251968503937008, "train_speed(iter/s)": 1.439925 }, { "epoch": 1.7679191122916755, "grad_norm": 5.120011806488037, "learning_rate": 7.219650874167675e-05, "loss": 2.4343017578125, "memory(GiB)": 77.56, "step": 41265, "token_acc": 0.47278911564625853, "train_speed(iter/s)": 1.439921 }, { "epoch": 1.7681333276209246, "grad_norm": 4.392596244812012, "learning_rate": 7.21904782509047e-05, "loss": 2.519309425354004, "memory(GiB)": 77.56, "step": 41270, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.43995 }, { "epoch": 1.7683475429501736, "grad_norm": 5.016544818878174, "learning_rate": 7.218444735813132e-05, "loss": 2.6991399765014648, "memory(GiB)": 77.56, "step": 41275, "token_acc": 0.47843137254901963, "train_speed(iter/s)": 1.439997 }, { "epoch": 1.7685617582794224, "grad_norm": 5.663881301879883, "learning_rate": 7.217841606346584e-05, "loss": 2.2779376983642576, "memory(GiB)": 77.56, "step": 41280, "token_acc": 0.47555555555555556, "train_speed(iter/s)": 1.440032 }, { "epoch": 1.7687759736086714, "grad_norm": 5.234511375427246, "learning_rate": 7.217238436701756e-05, "loss": 2.5336509704589845, "memory(GiB)": 77.56, "step": 41285, "token_acc": 0.4618181818181818, "train_speed(iter/s)": 1.440049 }, { "epoch": 1.7689901889379205, "grad_norm": 5.388389587402344, "learning_rate": 7.216635226889572e-05, "loss": 2.098532867431641, "memory(GiB)": 77.56, "step": 41290, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.440087 }, { "epoch": 1.7692044042671693, "grad_norm": 5.6218037605285645, "learning_rate": 7.216031976920963e-05, "loss": 2.6895313262939453, "memory(GiB)": 77.56, "step": 41295, "token_acc": 0.4542124542124542, "train_speed(iter/s)": 1.440099 }, { "epoch": 1.7694186195964183, "grad_norm": 4.102258682250977, "learning_rate": 7.215428686806855e-05, "loss": 2.801017189025879, "memory(GiB)": 77.56, "step": 41300, "token_acc": 0.4006024096385542, "train_speed(iter/s)": 1.440121 }, { "epoch": 1.7696328349256674, "grad_norm": 8.6804780960083, "learning_rate": 7.214825356558181e-05, "loss": 2.545354461669922, "memory(GiB)": 77.56, "step": 41305, "token_acc": 0.49603174603174605, "train_speed(iter/s)": 1.440125 }, { "epoch": 1.7698470502549162, "grad_norm": 4.438647270202637, "learning_rate": 7.214221986185865e-05, "loss": 2.752881622314453, "memory(GiB)": 77.56, "step": 41310, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.440144 }, { "epoch": 1.7700612655841652, "grad_norm": 4.487037181854248, "learning_rate": 7.213618575700841e-05, "loss": 2.467152214050293, "memory(GiB)": 77.56, "step": 41315, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.440163 }, { "epoch": 1.7702754809134142, "grad_norm": 4.7058000564575195, "learning_rate": 7.21301512511404e-05, "loss": 2.692424774169922, "memory(GiB)": 77.56, "step": 41320, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.440195 }, { "epoch": 1.770489696242663, "grad_norm": 6.961008071899414, "learning_rate": 7.212411634436396e-05, "loss": 2.5186532974243163, "memory(GiB)": 77.56, "step": 41325, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.44022 }, { "epoch": 1.770703911571912, "grad_norm": 5.930924892425537, "learning_rate": 7.21180810367884e-05, "loss": 2.546651268005371, "memory(GiB)": 77.56, "step": 41330, "token_acc": 0.43870967741935485, "train_speed(iter/s)": 1.440207 }, { "epoch": 1.7709181269011611, "grad_norm": 5.790921688079834, "learning_rate": 7.211204532852302e-05, "loss": 2.636322021484375, "memory(GiB)": 77.56, "step": 41335, "token_acc": 0.44272445820433437, "train_speed(iter/s)": 1.440262 }, { "epoch": 1.77113234223041, "grad_norm": 5.1165947914123535, "learning_rate": 7.21060092196772e-05, "loss": 2.778389739990234, "memory(GiB)": 77.56, "step": 41340, "token_acc": 0.43023255813953487, "train_speed(iter/s)": 1.440224 }, { "epoch": 1.771346557559659, "grad_norm": 4.746934413909912, "learning_rate": 7.209997271036031e-05, "loss": 2.438471221923828, "memory(GiB)": 77.56, "step": 41345, "token_acc": 0.5182186234817814, "train_speed(iter/s)": 1.440279 }, { "epoch": 1.771560772888908, "grad_norm": 5.822700500488281, "learning_rate": 7.209393580068167e-05, "loss": 2.619981575012207, "memory(GiB)": 77.56, "step": 41350, "token_acc": 0.4506172839506173, "train_speed(iter/s)": 1.440272 }, { "epoch": 1.7717749882181568, "grad_norm": 6.012607574462891, "learning_rate": 7.208789849075065e-05, "loss": 2.787109375, "memory(GiB)": 77.56, "step": 41355, "token_acc": 0.432258064516129, "train_speed(iter/s)": 1.440295 }, { "epoch": 1.7719892035474059, "grad_norm": 7.867917060852051, "learning_rate": 7.208186078067665e-05, "loss": 2.358205223083496, "memory(GiB)": 77.56, "step": 41360, "token_acc": 0.5064516129032258, "train_speed(iter/s)": 1.440278 }, { "epoch": 1.772203418876655, "grad_norm": 3.890453577041626, "learning_rate": 7.2075822670569e-05, "loss": 2.6740558624267576, "memory(GiB)": 77.56, "step": 41365, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.44031 }, { "epoch": 1.7724176342059037, "grad_norm": 4.7364020347595215, "learning_rate": 7.206978416053714e-05, "loss": 2.524542236328125, "memory(GiB)": 77.56, "step": 41370, "token_acc": 0.46825396825396826, "train_speed(iter/s)": 1.440328 }, { "epoch": 1.7726318495351527, "grad_norm": 5.364123821258545, "learning_rate": 7.206374525069041e-05, "loss": 2.6369821548461916, "memory(GiB)": 77.56, "step": 41375, "token_acc": 0.44086021505376344, "train_speed(iter/s)": 1.440343 }, { "epoch": 1.7728460648644018, "grad_norm": 4.983763694763184, "learning_rate": 7.205770594113826e-05, "loss": 2.583917808532715, "memory(GiB)": 77.56, "step": 41380, "token_acc": 0.43853820598006643, "train_speed(iter/s)": 1.440356 }, { "epoch": 1.7730602801936506, "grad_norm": 4.361377716064453, "learning_rate": 7.205166623199007e-05, "loss": 2.505217170715332, "memory(GiB)": 77.56, "step": 41385, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.44036 }, { "epoch": 1.7732744955228996, "grad_norm": 6.533573150634766, "learning_rate": 7.204562612335526e-05, "loss": 2.74563045501709, "memory(GiB)": 77.56, "step": 41390, "token_acc": 0.470404984423676, "train_speed(iter/s)": 1.440389 }, { "epoch": 1.7734887108521487, "grad_norm": 5.561290264129639, "learning_rate": 7.203958561534324e-05, "loss": 2.4945323944091795, "memory(GiB)": 77.56, "step": 41395, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.440428 }, { "epoch": 1.7737029261813975, "grad_norm": 6.526377201080322, "learning_rate": 7.203354470806348e-05, "loss": 2.537253189086914, "memory(GiB)": 77.56, "step": 41400, "token_acc": 0.5065502183406113, "train_speed(iter/s)": 1.440464 }, { "epoch": 1.7739171415106465, "grad_norm": 4.192866802215576, "learning_rate": 7.202750340162536e-05, "loss": 2.664411926269531, "memory(GiB)": 77.56, "step": 41405, "token_acc": 0.4786096256684492, "train_speed(iter/s)": 1.440498 }, { "epoch": 1.7741313568398955, "grad_norm": 9.008928298950195, "learning_rate": 7.202146169613835e-05, "loss": 2.7312032699584963, "memory(GiB)": 77.56, "step": 41410, "token_acc": 0.44816053511705684, "train_speed(iter/s)": 1.440502 }, { "epoch": 1.7743455721691443, "grad_norm": 4.682097911834717, "learning_rate": 7.201541959171191e-05, "loss": 2.8470258712768555, "memory(GiB)": 77.56, "step": 41415, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.440545 }, { "epoch": 1.7745597874983934, "grad_norm": 5.219699859619141, "learning_rate": 7.200937708845552e-05, "loss": 2.5159173965454102, "memory(GiB)": 77.56, "step": 41420, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.440569 }, { "epoch": 1.7747740028276424, "grad_norm": 5.969017505645752, "learning_rate": 7.200333418647859e-05, "loss": 2.7637874603271486, "memory(GiB)": 77.56, "step": 41425, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.440536 }, { "epoch": 1.7749882181568912, "grad_norm": 4.169270038604736, "learning_rate": 7.199729088589063e-05, "loss": 2.386057662963867, "memory(GiB)": 77.56, "step": 41430, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.440499 }, { "epoch": 1.7752024334861403, "grad_norm": 6.329133987426758, "learning_rate": 7.199124718680111e-05, "loss": 2.8182960510253907, "memory(GiB)": 77.56, "step": 41435, "token_acc": 0.4088050314465409, "train_speed(iter/s)": 1.440465 }, { "epoch": 1.7754166488153893, "grad_norm": 5.599954128265381, "learning_rate": 7.198520308931951e-05, "loss": 2.5343799591064453, "memory(GiB)": 77.56, "step": 41440, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.440493 }, { "epoch": 1.775630864144638, "grad_norm": 4.3669328689575195, "learning_rate": 7.197915859355534e-05, "loss": 2.615229606628418, "memory(GiB)": 77.56, "step": 41445, "token_acc": 0.44299674267100975, "train_speed(iter/s)": 1.440514 }, { "epoch": 1.7758450794738871, "grad_norm": 6.3242998123168945, "learning_rate": 7.197311369961809e-05, "loss": 2.278392219543457, "memory(GiB)": 77.56, "step": 41450, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.440549 }, { "epoch": 1.7760592948031362, "grad_norm": 5.017621040344238, "learning_rate": 7.196706840761727e-05, "loss": 2.62106876373291, "memory(GiB)": 77.56, "step": 41455, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.440545 }, { "epoch": 1.776273510132385, "grad_norm": 4.113110065460205, "learning_rate": 7.196102271766241e-05, "loss": 2.9667633056640623, "memory(GiB)": 77.56, "step": 41460, "token_acc": 0.40607734806629836, "train_speed(iter/s)": 1.440548 }, { "epoch": 1.776487725461634, "grad_norm": 6.209756374359131, "learning_rate": 7.1954976629863e-05, "loss": 2.7616878509521485, "memory(GiB)": 77.56, "step": 41465, "token_acc": 0.4497991967871486, "train_speed(iter/s)": 1.44056 }, { "epoch": 1.776701940790883, "grad_norm": 6.022635459899902, "learning_rate": 7.194893014432862e-05, "loss": 2.424945068359375, "memory(GiB)": 77.56, "step": 41470, "token_acc": 0.45244956772334294, "train_speed(iter/s)": 1.440586 }, { "epoch": 1.7769161561201319, "grad_norm": 5.682319164276123, "learning_rate": 7.194288326116876e-05, "loss": 2.392338180541992, "memory(GiB)": 77.56, "step": 41475, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.440625 }, { "epoch": 1.777130371449381, "grad_norm": 7.37538480758667, "learning_rate": 7.1936835980493e-05, "loss": 2.602642059326172, "memory(GiB)": 77.56, "step": 41480, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.440636 }, { "epoch": 1.77734458677863, "grad_norm": 5.151170253753662, "learning_rate": 7.193078830241087e-05, "loss": 2.957985496520996, "memory(GiB)": 77.56, "step": 41485, "token_acc": 0.4501510574018127, "train_speed(iter/s)": 1.440684 }, { "epoch": 1.7775588021078788, "grad_norm": 5.186006546020508, "learning_rate": 7.192474022703194e-05, "loss": 2.5356969833374023, "memory(GiB)": 77.56, "step": 41490, "token_acc": 0.519298245614035, "train_speed(iter/s)": 1.440696 }, { "epoch": 1.7777730174371278, "grad_norm": 4.7131571769714355, "learning_rate": 7.191869175446576e-05, "loss": 2.312872886657715, "memory(GiB)": 77.56, "step": 41495, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.440708 }, { "epoch": 1.7779872327663768, "grad_norm": 4.934205532073975, "learning_rate": 7.191264288482194e-05, "loss": 2.5455230712890624, "memory(GiB)": 77.56, "step": 41500, "token_acc": 0.47474747474747475, "train_speed(iter/s)": 1.440694 }, { "epoch": 1.7779872327663768, "eval_loss": 2.125981330871582, "eval_runtime": 14.0156, "eval_samples_per_second": 7.135, "eval_steps_per_second": 7.135, "eval_token_acc": 0.48404255319148937, "step": 41500 }, { "epoch": 1.7782014480956256, "grad_norm": 6.567460536956787, "learning_rate": 7.190659361821003e-05, "loss": 2.3286039352416994, "memory(GiB)": 77.56, "step": 41505, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.439977 }, { "epoch": 1.7784156634248747, "grad_norm": 5.059206962585449, "learning_rate": 7.190054395473962e-05, "loss": 2.5230005264282225, "memory(GiB)": 77.56, "step": 41510, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.439934 }, { "epoch": 1.7786298787541237, "grad_norm": 4.963253021240234, "learning_rate": 7.189449389452032e-05, "loss": 2.5037582397460936, "memory(GiB)": 77.56, "step": 41515, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.439951 }, { "epoch": 1.7788440940833725, "grad_norm": 3.9160563945770264, "learning_rate": 7.188844343766173e-05, "loss": 2.3822832107543945, "memory(GiB)": 77.56, "step": 41520, "token_acc": 0.5201612903225806, "train_speed(iter/s)": 1.439973 }, { "epoch": 1.7790583094126216, "grad_norm": 4.47765588760376, "learning_rate": 7.188239258427343e-05, "loss": 2.6160921096801757, "memory(GiB)": 77.56, "step": 41525, "token_acc": 0.43125, "train_speed(iter/s)": 1.439957 }, { "epoch": 1.7792725247418706, "grad_norm": 5.558020114898682, "learning_rate": 7.187634133446507e-05, "loss": 2.605325126647949, "memory(GiB)": 77.56, "step": 41530, "token_acc": 0.4598337950138504, "train_speed(iter/s)": 1.440001 }, { "epoch": 1.7794867400711194, "grad_norm": 4.489594459533691, "learning_rate": 7.18702896883463e-05, "loss": 2.896640396118164, "memory(GiB)": 77.56, "step": 41535, "token_acc": 0.37942122186495175, "train_speed(iter/s)": 1.439965 }, { "epoch": 1.7797009554003684, "grad_norm": 6.1399407386779785, "learning_rate": 7.186423764602668e-05, "loss": 2.248468208312988, "memory(GiB)": 77.56, "step": 41540, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.439997 }, { "epoch": 1.7799151707296175, "grad_norm": 5.654233455657959, "learning_rate": 7.185818520761589e-05, "loss": 2.2671958923339846, "memory(GiB)": 77.56, "step": 41545, "token_acc": 0.4979757085020243, "train_speed(iter/s)": 1.440014 }, { "epoch": 1.7801293860588663, "grad_norm": 5.686004638671875, "learning_rate": 7.185213237322358e-05, "loss": 2.4533288955688475, "memory(GiB)": 77.56, "step": 41550, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.440038 }, { "epoch": 1.7803436013881153, "grad_norm": 4.564158916473389, "learning_rate": 7.184607914295937e-05, "loss": 2.3960342407226562, "memory(GiB)": 77.56, "step": 41555, "token_acc": 0.45955882352941174, "train_speed(iter/s)": 1.440092 }, { "epoch": 1.7805578167173644, "grad_norm": 4.506593227386475, "learning_rate": 7.184002551693296e-05, "loss": 2.554522895812988, "memory(GiB)": 77.56, "step": 41560, "token_acc": 0.4738955823293173, "train_speed(iter/s)": 1.440085 }, { "epoch": 1.7807720320466132, "grad_norm": 4.5854058265686035, "learning_rate": 7.183397149525401e-05, "loss": 2.3290922164916994, "memory(GiB)": 77.56, "step": 41565, "token_acc": 0.5130111524163569, "train_speed(iter/s)": 1.440122 }, { "epoch": 1.7809862473758622, "grad_norm": 5.484707832336426, "learning_rate": 7.182791707803216e-05, "loss": 2.6286540985107423, "memory(GiB)": 77.56, "step": 41570, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.440149 }, { "epoch": 1.7812004627051112, "grad_norm": 4.956236839294434, "learning_rate": 7.182186226537714e-05, "loss": 2.6242177963256834, "memory(GiB)": 77.56, "step": 41575, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 1.440163 }, { "epoch": 1.78141467803436, "grad_norm": 5.367851257324219, "learning_rate": 7.18158070573986e-05, "loss": 2.675528717041016, "memory(GiB)": 77.56, "step": 41580, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.440154 }, { "epoch": 1.781628893363609, "grad_norm": 5.175403118133545, "learning_rate": 7.180975145420625e-05, "loss": 2.335260581970215, "memory(GiB)": 77.56, "step": 41585, "token_acc": 0.5040983606557377, "train_speed(iter/s)": 1.4402 }, { "epoch": 1.7818431086928581, "grad_norm": 4.378537178039551, "learning_rate": 7.180369545590981e-05, "loss": 2.374713134765625, "memory(GiB)": 77.56, "step": 41590, "token_acc": 0.5245398773006135, "train_speed(iter/s)": 1.440215 }, { "epoch": 1.782057324022107, "grad_norm": 4.559616565704346, "learning_rate": 7.179763906261896e-05, "loss": 2.3027820587158203, "memory(GiB)": 77.56, "step": 41595, "token_acc": 0.4714828897338403, "train_speed(iter/s)": 1.440221 }, { "epoch": 1.782271539351356, "grad_norm": 6.5058088302612305, "learning_rate": 7.179158227444343e-05, "loss": 2.5365753173828125, "memory(GiB)": 77.56, "step": 41600, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 1.440225 }, { "epoch": 1.782485754680605, "grad_norm": 4.407220363616943, "learning_rate": 7.178552509149294e-05, "loss": 2.428637886047363, "memory(GiB)": 77.56, "step": 41605, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.440222 }, { "epoch": 1.7826999700098538, "grad_norm": 3.9212772846221924, "learning_rate": 7.177946751387723e-05, "loss": 2.4099021911621095, "memory(GiB)": 77.56, "step": 41610, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.440227 }, { "epoch": 1.7829141853391028, "grad_norm": 4.980103015899658, "learning_rate": 7.177340954170604e-05, "loss": 2.5643218994140624, "memory(GiB)": 77.56, "step": 41615, "token_acc": 0.47843137254901963, "train_speed(iter/s)": 1.440228 }, { "epoch": 1.7831284006683519, "grad_norm": 4.563261985778809, "learning_rate": 7.176735117508911e-05, "loss": 2.676917839050293, "memory(GiB)": 77.56, "step": 41620, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.440258 }, { "epoch": 1.7833426159976007, "grad_norm": 4.64477014541626, "learning_rate": 7.176129241413619e-05, "loss": 2.514088439941406, "memory(GiB)": 77.56, "step": 41625, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.440298 }, { "epoch": 1.7835568313268497, "grad_norm": 7.422196865081787, "learning_rate": 7.175523325895705e-05, "loss": 2.656286430358887, "memory(GiB)": 77.56, "step": 41630, "token_acc": 0.4172661870503597, "train_speed(iter/s)": 1.440277 }, { "epoch": 1.7837710466560988, "grad_norm": 6.116430282592773, "learning_rate": 7.174917370966145e-05, "loss": 2.299342727661133, "memory(GiB)": 77.56, "step": 41635, "token_acc": 0.5019607843137255, "train_speed(iter/s)": 1.440262 }, { "epoch": 1.7839852619853476, "grad_norm": 4.1989216804504395, "learning_rate": 7.174311376635916e-05, "loss": 2.3616992950439455, "memory(GiB)": 77.56, "step": 41640, "token_acc": 0.5207667731629393, "train_speed(iter/s)": 1.44022 }, { "epoch": 1.7841994773145966, "grad_norm": 4.709729194641113, "learning_rate": 7.173705342915998e-05, "loss": 2.6079713821411135, "memory(GiB)": 77.56, "step": 41645, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.440177 }, { "epoch": 1.7844136926438456, "grad_norm": 5.738956451416016, "learning_rate": 7.173099269817368e-05, "loss": 2.477339172363281, "memory(GiB)": 77.56, "step": 41650, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.440173 }, { "epoch": 1.7846279079730945, "grad_norm": 5.236964225769043, "learning_rate": 7.172493157351006e-05, "loss": 2.8129898071289063, "memory(GiB)": 77.56, "step": 41655, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.440219 }, { "epoch": 1.7848421233023435, "grad_norm": 4.651280879974365, "learning_rate": 7.171887005527893e-05, "loss": 2.2527658462524416, "memory(GiB)": 77.56, "step": 41660, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.440241 }, { "epoch": 1.7850563386315925, "grad_norm": 10.790109634399414, "learning_rate": 7.17128081435901e-05, "loss": 2.887628746032715, "memory(GiB)": 77.56, "step": 41665, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.440296 }, { "epoch": 1.7852705539608413, "grad_norm": 3.976048231124878, "learning_rate": 7.170674583855335e-05, "loss": 2.5121671676635744, "memory(GiB)": 77.56, "step": 41670, "token_acc": 0.4620253164556962, "train_speed(iter/s)": 1.440316 }, { "epoch": 1.7854847692900904, "grad_norm": 4.391224384307861, "learning_rate": 7.170068314027855e-05, "loss": 2.365525817871094, "memory(GiB)": 77.56, "step": 41675, "token_acc": 0.49390243902439024, "train_speed(iter/s)": 1.440309 }, { "epoch": 1.7856989846193394, "grad_norm": 4.24528169631958, "learning_rate": 7.169462004887555e-05, "loss": 2.632581329345703, "memory(GiB)": 77.56, "step": 41680, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.440329 }, { "epoch": 1.7859131999485882, "grad_norm": 4.256743907928467, "learning_rate": 7.168855656445412e-05, "loss": 2.4761331558227537, "memory(GiB)": 77.56, "step": 41685, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.440352 }, { "epoch": 1.7861274152778372, "grad_norm": 4.573760509490967, "learning_rate": 7.168249268712415e-05, "loss": 2.4129867553710938, "memory(GiB)": 77.56, "step": 41690, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.440385 }, { "epoch": 1.7863416306070863, "grad_norm": 4.989323139190674, "learning_rate": 7.16764284169955e-05, "loss": 2.4528703689575195, "memory(GiB)": 77.56, "step": 41695, "token_acc": 0.470404984423676, "train_speed(iter/s)": 1.44042 }, { "epoch": 1.786555845936335, "grad_norm": 5.008612632751465, "learning_rate": 7.167036375417801e-05, "loss": 2.6893356323242186, "memory(GiB)": 77.56, "step": 41700, "token_acc": 0.44370860927152317, "train_speed(iter/s)": 1.440422 }, { "epoch": 1.7867700612655841, "grad_norm": 5.087997913360596, "learning_rate": 7.166429869878154e-05, "loss": 2.5272586822509764, "memory(GiB)": 77.56, "step": 41705, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.440457 }, { "epoch": 1.7869842765948332, "grad_norm": 4.831918239593506, "learning_rate": 7.165823325091599e-05, "loss": 2.5077728271484374, "memory(GiB)": 77.56, "step": 41710, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.440468 }, { "epoch": 1.787198491924082, "grad_norm": 4.737553119659424, "learning_rate": 7.165216741069122e-05, "loss": 2.6069902420043944, "memory(GiB)": 77.56, "step": 41715, "token_acc": 0.4602076124567474, "train_speed(iter/s)": 1.44048 }, { "epoch": 1.787412707253331, "grad_norm": 6.522765159606934, "learning_rate": 7.164610117821713e-05, "loss": 2.6605192184448243, "memory(GiB)": 77.56, "step": 41720, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.440524 }, { "epoch": 1.78762692258258, "grad_norm": 11.286214828491211, "learning_rate": 7.16400345536036e-05, "loss": 2.541108512878418, "memory(GiB)": 77.56, "step": 41725, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.440519 }, { "epoch": 1.7878411379118289, "grad_norm": 5.008591651916504, "learning_rate": 7.163396753696057e-05, "loss": 2.5759857177734373, "memory(GiB)": 77.56, "step": 41730, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.440478 }, { "epoch": 1.788055353241078, "grad_norm": 5.311080455780029, "learning_rate": 7.162790012839791e-05, "loss": 2.367471694946289, "memory(GiB)": 77.56, "step": 41735, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.440469 }, { "epoch": 1.788269568570327, "grad_norm": 4.494590759277344, "learning_rate": 7.162183232802554e-05, "loss": 2.9353370666503906, "memory(GiB)": 77.56, "step": 41740, "token_acc": 0.38622754491017963, "train_speed(iter/s)": 1.440453 }, { "epoch": 1.7884837838995757, "grad_norm": 4.419131278991699, "learning_rate": 7.161576413595339e-05, "loss": 2.0296464920043946, "memory(GiB)": 77.56, "step": 41745, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 1.44043 }, { "epoch": 1.7886979992288248, "grad_norm": 4.305533409118652, "learning_rate": 7.160969555229142e-05, "loss": 2.7163219451904297, "memory(GiB)": 77.56, "step": 41750, "token_acc": 0.44376899696048633, "train_speed(iter/s)": 1.440439 }, { "epoch": 1.7889122145580738, "grad_norm": 7.3529181480407715, "learning_rate": 7.160362657714953e-05, "loss": 2.3863460540771486, "memory(GiB)": 77.56, "step": 41755, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.4404 }, { "epoch": 1.7891264298873226, "grad_norm": 5.84220027923584, "learning_rate": 7.159755721063768e-05, "loss": 2.4685157775878905, "memory(GiB)": 77.56, "step": 41760, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 1.440374 }, { "epoch": 1.7893406452165717, "grad_norm": 5.567548751831055, "learning_rate": 7.159148745286582e-05, "loss": 2.1888689041137694, "memory(GiB)": 77.56, "step": 41765, "token_acc": 0.5458515283842795, "train_speed(iter/s)": 1.440381 }, { "epoch": 1.7895548605458207, "grad_norm": 4.820566177368164, "learning_rate": 7.158541730394391e-05, "loss": 2.381830596923828, "memory(GiB)": 77.56, "step": 41770, "token_acc": 0.46616541353383456, "train_speed(iter/s)": 1.440405 }, { "epoch": 1.7897690758750695, "grad_norm": 4.206056594848633, "learning_rate": 7.157934676398192e-05, "loss": 2.5655515670776365, "memory(GiB)": 77.56, "step": 41775, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.440404 }, { "epoch": 1.7899832912043185, "grad_norm": 8.237789154052734, "learning_rate": 7.157327583308981e-05, "loss": 2.410350227355957, "memory(GiB)": 77.56, "step": 41780, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.44041 }, { "epoch": 1.7901975065335676, "grad_norm": 5.038957118988037, "learning_rate": 7.156720451137759e-05, "loss": 2.3720062255859373, "memory(GiB)": 77.56, "step": 41785, "token_acc": 0.4854014598540146, "train_speed(iter/s)": 1.440394 }, { "epoch": 1.7904117218628164, "grad_norm": 6.162631511688232, "learning_rate": 7.156113279895522e-05, "loss": 2.8040966033935546, "memory(GiB)": 77.56, "step": 41790, "token_acc": 0.43944636678200694, "train_speed(iter/s)": 1.440451 }, { "epoch": 1.7906259371920654, "grad_norm": 4.644979000091553, "learning_rate": 7.15550606959327e-05, "loss": 2.8054466247558594, "memory(GiB)": 77.56, "step": 41795, "token_acc": 0.43962848297213625, "train_speed(iter/s)": 1.440484 }, { "epoch": 1.7908401525213145, "grad_norm": 4.512179851531982, "learning_rate": 7.154898820242003e-05, "loss": 2.775253486633301, "memory(GiB)": 77.56, "step": 41800, "token_acc": 0.436426116838488, "train_speed(iter/s)": 1.44051 }, { "epoch": 1.7910543678505633, "grad_norm": 8.885587692260742, "learning_rate": 7.154291531852723e-05, "loss": 2.411014938354492, "memory(GiB)": 77.56, "step": 41805, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.44056 }, { "epoch": 1.7912685831798123, "grad_norm": 5.238372802734375, "learning_rate": 7.153684204436433e-05, "loss": 2.8068750381469725, "memory(GiB)": 77.56, "step": 41810, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.440613 }, { "epoch": 1.7914827985090613, "grad_norm": 6.922854423522949, "learning_rate": 7.153076838004129e-05, "loss": 2.2262115478515625, "memory(GiB)": 77.56, "step": 41815, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.440606 }, { "epoch": 1.7916970138383101, "grad_norm": 4.267703056335449, "learning_rate": 7.152469432566822e-05, "loss": 2.6742364883422853, "memory(GiB)": 77.56, "step": 41820, "token_acc": 0.43909348441926344, "train_speed(iter/s)": 1.440627 }, { "epoch": 1.7919112291675592, "grad_norm": 5.662759780883789, "learning_rate": 7.151861988135511e-05, "loss": 2.392784309387207, "memory(GiB)": 77.56, "step": 41825, "token_acc": 0.5107692307692308, "train_speed(iter/s)": 1.440655 }, { "epoch": 1.7921254444968082, "grad_norm": 5.835724353790283, "learning_rate": 7.151254504721201e-05, "loss": 2.859352684020996, "memory(GiB)": 77.56, "step": 41830, "token_acc": 0.40264026402640263, "train_speed(iter/s)": 1.440679 }, { "epoch": 1.792339659826057, "grad_norm": 6.028143882751465, "learning_rate": 7.150646982334897e-05, "loss": 2.3542957305908203, "memory(GiB)": 77.56, "step": 41835, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.440731 }, { "epoch": 1.792553875155306, "grad_norm": 3.758532762527466, "learning_rate": 7.150039420987606e-05, "loss": 2.462593650817871, "memory(GiB)": 77.56, "step": 41840, "token_acc": 0.48059701492537316, "train_speed(iter/s)": 1.440762 }, { "epoch": 1.792768090484555, "grad_norm": 5.808251857757568, "learning_rate": 7.149431820690335e-05, "loss": 2.4858678817749023, "memory(GiB)": 77.56, "step": 41845, "token_acc": 0.5134099616858238, "train_speed(iter/s)": 1.440738 }, { "epoch": 1.792982305813804, "grad_norm": 4.783618927001953, "learning_rate": 7.14882418145409e-05, "loss": 2.5948272705078126, "memory(GiB)": 77.56, "step": 41850, "token_acc": 0.47398843930635837, "train_speed(iter/s)": 1.440779 }, { "epoch": 1.793196521143053, "grad_norm": 5.319774627685547, "learning_rate": 7.148216503289878e-05, "loss": 2.621656036376953, "memory(GiB)": 77.56, "step": 41855, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440815 }, { "epoch": 1.793410736472302, "grad_norm": 5.4388298988342285, "learning_rate": 7.147608786208709e-05, "loss": 2.6802688598632813, "memory(GiB)": 77.56, "step": 41860, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.440878 }, { "epoch": 1.7936249518015508, "grad_norm": 4.757205486297607, "learning_rate": 7.147001030221594e-05, "loss": 2.3458911895751955, "memory(GiB)": 77.56, "step": 41865, "token_acc": 0.47038327526132406, "train_speed(iter/s)": 1.440917 }, { "epoch": 1.7938391671307998, "grad_norm": 4.485103130340576, "learning_rate": 7.146393235339539e-05, "loss": 2.5630115509033202, "memory(GiB)": 77.56, "step": 41870, "token_acc": 0.4391891891891892, "train_speed(iter/s)": 1.440913 }, { "epoch": 1.7940533824600489, "grad_norm": 4.268906116485596, "learning_rate": 7.145785401573559e-05, "loss": 2.498132514953613, "memory(GiB)": 77.56, "step": 41875, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.440871 }, { "epoch": 1.7942675977892977, "grad_norm": 4.961365699768066, "learning_rate": 7.145177528934663e-05, "loss": 2.7005792617797852, "memory(GiB)": 77.56, "step": 41880, "token_acc": 0.4246153846153846, "train_speed(iter/s)": 1.440893 }, { "epoch": 1.7944818131185467, "grad_norm": 4.200442314147949, "learning_rate": 7.144569617433863e-05, "loss": 2.4139476776123048, "memory(GiB)": 77.56, "step": 41885, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.440933 }, { "epoch": 1.7946960284477957, "grad_norm": 4.790828704833984, "learning_rate": 7.143961667082173e-05, "loss": 2.6316465377807616, "memory(GiB)": 77.56, "step": 41890, "token_acc": 0.46835443037974683, "train_speed(iter/s)": 1.440961 }, { "epoch": 1.7949102437770446, "grad_norm": 5.2089128494262695, "learning_rate": 7.143353677890607e-05, "loss": 2.5275337219238283, "memory(GiB)": 77.56, "step": 41895, "token_acc": 0.4392857142857143, "train_speed(iter/s)": 1.440965 }, { "epoch": 1.7951244591062938, "grad_norm": 5.263805389404297, "learning_rate": 7.142745649870177e-05, "loss": 2.684798240661621, "memory(GiB)": 77.56, "step": 41900, "token_acc": 0.41637010676156583, "train_speed(iter/s)": 1.441001 }, { "epoch": 1.7953386744355426, "grad_norm": 5.7521281242370605, "learning_rate": 7.142137583031901e-05, "loss": 2.4712223052978515, "memory(GiB)": 77.56, "step": 41905, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.44102 }, { "epoch": 1.7955528897647914, "grad_norm": 3.884425163269043, "learning_rate": 7.141529477386792e-05, "loss": 2.5200998306274416, "memory(GiB)": 77.56, "step": 41910, "token_acc": 0.4664179104477612, "train_speed(iter/s)": 1.44102 }, { "epoch": 1.7957671050940407, "grad_norm": 4.386074542999268, "learning_rate": 7.140921332945868e-05, "loss": 2.3738903045654296, "memory(GiB)": 77.56, "step": 41915, "token_acc": 0.49193548387096775, "train_speed(iter/s)": 1.441015 }, { "epoch": 1.7959813204232895, "grad_norm": 4.672444820404053, "learning_rate": 7.140313149720145e-05, "loss": 2.617329216003418, "memory(GiB)": 77.56, "step": 41920, "token_acc": 0.4670487106017192, "train_speed(iter/s)": 1.441004 }, { "epoch": 1.7961955357525383, "grad_norm": 4.702471733093262, "learning_rate": 7.139704927720644e-05, "loss": 2.57045841217041, "memory(GiB)": 77.56, "step": 41925, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.441046 }, { "epoch": 1.7964097510817876, "grad_norm": 4.501942157745361, "learning_rate": 7.139096666958378e-05, "loss": 2.451796531677246, "memory(GiB)": 77.56, "step": 41930, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.441074 }, { "epoch": 1.7966239664110364, "grad_norm": 3.9743943214416504, "learning_rate": 7.13848836744437e-05, "loss": 2.465041160583496, "memory(GiB)": 77.56, "step": 41935, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.441066 }, { "epoch": 1.7968381817402852, "grad_norm": 6.735315799713135, "learning_rate": 7.137880029189641e-05, "loss": 2.5690181732177733, "memory(GiB)": 77.56, "step": 41940, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.441031 }, { "epoch": 1.7970523970695345, "grad_norm": 4.932923793792725, "learning_rate": 7.137271652205208e-05, "loss": 2.447648811340332, "memory(GiB)": 77.56, "step": 41945, "token_acc": 0.4743083003952569, "train_speed(iter/s)": 1.441081 }, { "epoch": 1.7972666123987833, "grad_norm": 6.011246681213379, "learning_rate": 7.136663236502095e-05, "loss": 2.8310478210449217, "memory(GiB)": 77.56, "step": 41950, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.441048 }, { "epoch": 1.797480827728032, "grad_norm": 4.117939472198486, "learning_rate": 7.136054782091322e-05, "loss": 2.1531585693359374, "memory(GiB)": 77.56, "step": 41955, "token_acc": 0.5574468085106383, "train_speed(iter/s)": 1.44108 }, { "epoch": 1.7976950430572813, "grad_norm": 4.647158145904541, "learning_rate": 7.135446288983914e-05, "loss": 2.5094648361206056, "memory(GiB)": 77.56, "step": 41960, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.441053 }, { "epoch": 1.7979092583865302, "grad_norm": 4.708792686462402, "learning_rate": 7.134837757190891e-05, "loss": 2.7043376922607423, "memory(GiB)": 77.56, "step": 41965, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.441059 }, { "epoch": 1.798123473715779, "grad_norm": 4.866854190826416, "learning_rate": 7.134229186723282e-05, "loss": 2.5221385955810547, "memory(GiB)": 77.56, "step": 41970, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.44096 }, { "epoch": 1.7983376890450282, "grad_norm": 4.450485706329346, "learning_rate": 7.133620577592108e-05, "loss": 2.3979970932006838, "memory(GiB)": 77.56, "step": 41975, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.440947 }, { "epoch": 1.798551904374277, "grad_norm": 5.129952907562256, "learning_rate": 7.133011929808398e-05, "loss": 2.607725143432617, "memory(GiB)": 77.56, "step": 41980, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 1.440961 }, { "epoch": 1.7987661197035258, "grad_norm": 4.724880695343018, "learning_rate": 7.132403243383173e-05, "loss": 2.680074691772461, "memory(GiB)": 77.56, "step": 41985, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440967 }, { "epoch": 1.798980335032775, "grad_norm": 5.4733452796936035, "learning_rate": 7.131794518327463e-05, "loss": 2.4858415603637694, "memory(GiB)": 77.56, "step": 41990, "token_acc": 0.4697986577181208, "train_speed(iter/s)": 1.440939 }, { "epoch": 1.799194550362024, "grad_norm": 5.619987487792969, "learning_rate": 7.131185754652299e-05, "loss": 2.543406677246094, "memory(GiB)": 77.56, "step": 41995, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440909 }, { "epoch": 1.7994087656912727, "grad_norm": 4.996868133544922, "learning_rate": 7.130576952368701e-05, "loss": 2.8067474365234375, "memory(GiB)": 77.56, "step": 42000, "token_acc": 0.4160839160839161, "train_speed(iter/s)": 1.440903 }, { "epoch": 1.7994087656912727, "eval_loss": 2.3193671703338623, "eval_runtime": 14.288, "eval_samples_per_second": 6.999, "eval_steps_per_second": 6.999, "eval_token_acc": 0.45690834473324216, "step": 42000 }, { "epoch": 1.799622981020522, "grad_norm": 6.263830661773682, "learning_rate": 7.129968111487706e-05, "loss": 2.1180793762207033, "memory(GiB)": 77.56, "step": 42005, "token_acc": 0.46576663452266154, "train_speed(iter/s)": 1.440158 }, { "epoch": 1.7998371963497708, "grad_norm": 4.130348205566406, "learning_rate": 7.12935923202034e-05, "loss": 2.449118423461914, "memory(GiB)": 77.56, "step": 42010, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 1.440176 }, { "epoch": 1.8000514116790196, "grad_norm": 14.074151039123535, "learning_rate": 7.128750313977633e-05, "loss": 2.5615009307861327, "memory(GiB)": 77.56, "step": 42015, "token_acc": 0.44805194805194803, "train_speed(iter/s)": 1.440206 }, { "epoch": 1.8002656270082689, "grad_norm": 4.594374656677246, "learning_rate": 7.128141357370618e-05, "loss": 2.6220375061035157, "memory(GiB)": 77.56, "step": 42020, "token_acc": 0.47076023391812866, "train_speed(iter/s)": 1.440201 }, { "epoch": 1.8004798423375177, "grad_norm": 4.808896541595459, "learning_rate": 7.127532362210327e-05, "loss": 2.4455310821533205, "memory(GiB)": 77.56, "step": 42025, "token_acc": 0.48417721518987344, "train_speed(iter/s)": 1.440207 }, { "epoch": 1.8006940576667665, "grad_norm": 6.303009510040283, "learning_rate": 7.12692332850779e-05, "loss": 2.5016260147094727, "memory(GiB)": 77.56, "step": 42030, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.44025 }, { "epoch": 1.8009082729960157, "grad_norm": 5.437148094177246, "learning_rate": 7.126314256274042e-05, "loss": 2.4528053283691404, "memory(GiB)": 77.56, "step": 42035, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.440278 }, { "epoch": 1.8011224883252646, "grad_norm": 4.603950023651123, "learning_rate": 7.125705145520117e-05, "loss": 2.4632890701293944, "memory(GiB)": 77.56, "step": 42040, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.440308 }, { "epoch": 1.8013367036545134, "grad_norm": 5.031585693359375, "learning_rate": 7.125095996257048e-05, "loss": 2.8365667343139647, "memory(GiB)": 77.56, "step": 42045, "token_acc": 0.39568345323741005, "train_speed(iter/s)": 1.440293 }, { "epoch": 1.8015509189837626, "grad_norm": 4.871204853057861, "learning_rate": 7.124486808495873e-05, "loss": 2.459613800048828, "memory(GiB)": 77.56, "step": 42050, "token_acc": 0.5109717868338558, "train_speed(iter/s)": 1.440257 }, { "epoch": 1.8017651343130114, "grad_norm": 4.5483012199401855, "learning_rate": 7.123877582247626e-05, "loss": 2.7137245178222655, "memory(GiB)": 77.56, "step": 42055, "token_acc": 0.4064516129032258, "train_speed(iter/s)": 1.44028 }, { "epoch": 1.8019793496422603, "grad_norm": 6.571033477783203, "learning_rate": 7.123268317523344e-05, "loss": 2.58399658203125, "memory(GiB)": 77.56, "step": 42060, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 1.440281 }, { "epoch": 1.8021935649715095, "grad_norm": 5.01151180267334, "learning_rate": 7.122659014334065e-05, "loss": 2.4359710693359373, "memory(GiB)": 77.56, "step": 42065, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.440336 }, { "epoch": 1.8024077803007583, "grad_norm": 5.244607448577881, "learning_rate": 7.122049672690828e-05, "loss": 2.1736881256103517, "memory(GiB)": 77.56, "step": 42070, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.440384 }, { "epoch": 1.8026219956300071, "grad_norm": 4.934427738189697, "learning_rate": 7.121440292604667e-05, "loss": 2.603476333618164, "memory(GiB)": 77.56, "step": 42075, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440426 }, { "epoch": 1.8028362109592564, "grad_norm": 4.66689920425415, "learning_rate": 7.120830874086627e-05, "loss": 2.590445899963379, "memory(GiB)": 77.56, "step": 42080, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 1.440455 }, { "epoch": 1.8030504262885052, "grad_norm": 4.61002779006958, "learning_rate": 7.120221417147747e-05, "loss": 2.3773166656494142, "memory(GiB)": 77.56, "step": 42085, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.440472 }, { "epoch": 1.803264641617754, "grad_norm": 4.537388801574707, "learning_rate": 7.119611921799065e-05, "loss": 2.6261821746826173, "memory(GiB)": 77.56, "step": 42090, "token_acc": 0.4523076923076923, "train_speed(iter/s)": 1.44046 }, { "epoch": 1.8034788569470033, "grad_norm": 4.249468803405762, "learning_rate": 7.119002388051627e-05, "loss": 2.7216285705566405, "memory(GiB)": 77.56, "step": 42095, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.440517 }, { "epoch": 1.803693072276252, "grad_norm": 7.35615348815918, "learning_rate": 7.118392815916472e-05, "loss": 2.5203794479370116, "memory(GiB)": 77.56, "step": 42100, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.440523 }, { "epoch": 1.803907287605501, "grad_norm": 5.2889509201049805, "learning_rate": 7.117783205404644e-05, "loss": 2.2091312408447266, "memory(GiB)": 77.56, "step": 42105, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.440566 }, { "epoch": 1.8041215029347502, "grad_norm": 4.824329853057861, "learning_rate": 7.117173556527187e-05, "loss": 2.512018013000488, "memory(GiB)": 77.56, "step": 42110, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.440585 }, { "epoch": 1.804335718263999, "grad_norm": 5.606828212738037, "learning_rate": 7.116563869295145e-05, "loss": 2.3335639953613283, "memory(GiB)": 77.56, "step": 42115, "token_acc": 0.5, "train_speed(iter/s)": 1.440573 }, { "epoch": 1.8045499335932478, "grad_norm": 4.261765003204346, "learning_rate": 7.11595414371956e-05, "loss": 2.5213661193847656, "memory(GiB)": 77.56, "step": 42120, "token_acc": 0.4879518072289157, "train_speed(iter/s)": 1.440586 }, { "epoch": 1.804764148922497, "grad_norm": 3.8963773250579834, "learning_rate": 7.115344379811485e-05, "loss": 2.9007081985473633, "memory(GiB)": 77.56, "step": 42125, "token_acc": 0.4243243243243243, "train_speed(iter/s)": 1.440591 }, { "epoch": 1.8049783642517458, "grad_norm": 7.668780326843262, "learning_rate": 7.11473457758196e-05, "loss": 2.4790472030639648, "memory(GiB)": 77.56, "step": 42130, "token_acc": 0.4752851711026616, "train_speed(iter/s)": 1.440609 }, { "epoch": 1.8051925795809947, "grad_norm": 5.161706924438477, "learning_rate": 7.114124737042035e-05, "loss": 2.7120758056640626, "memory(GiB)": 77.56, "step": 42135, "token_acc": 0.42, "train_speed(iter/s)": 1.44061 }, { "epoch": 1.805406794910244, "grad_norm": 7.514952182769775, "learning_rate": 7.113514858202758e-05, "loss": 2.463833236694336, "memory(GiB)": 77.56, "step": 42140, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.440563 }, { "epoch": 1.8056210102394927, "grad_norm": 4.837766647338867, "learning_rate": 7.112904941075175e-05, "loss": 2.703973579406738, "memory(GiB)": 77.56, "step": 42145, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.440602 }, { "epoch": 1.8058352255687415, "grad_norm": 6.498230457305908, "learning_rate": 7.112294985670337e-05, "loss": 2.2992202758789064, "memory(GiB)": 77.56, "step": 42150, "token_acc": 0.522633744855967, "train_speed(iter/s)": 1.440602 }, { "epoch": 1.8060494408979908, "grad_norm": 3.9114623069763184, "learning_rate": 7.111684991999295e-05, "loss": 2.6470157623291017, "memory(GiB)": 77.56, "step": 42155, "token_acc": 0.4696485623003195, "train_speed(iter/s)": 1.440622 }, { "epoch": 1.8062636562272396, "grad_norm": 6.802840709686279, "learning_rate": 7.111074960073098e-05, "loss": 2.750494956970215, "memory(GiB)": 77.56, "step": 42160, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 1.440627 }, { "epoch": 1.8064778715564884, "grad_norm": 4.773049831390381, "learning_rate": 7.110464889902796e-05, "loss": 2.2988460540771483, "memory(GiB)": 77.56, "step": 42165, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.440666 }, { "epoch": 1.8066920868857377, "grad_norm": 6.8202924728393555, "learning_rate": 7.109854781499446e-05, "loss": 2.542022132873535, "memory(GiB)": 77.56, "step": 42170, "token_acc": 0.47854785478547857, "train_speed(iter/s)": 1.440672 }, { "epoch": 1.8069063022149865, "grad_norm": 4.792782783508301, "learning_rate": 7.109244634874096e-05, "loss": 2.8129718780517576, "memory(GiB)": 77.56, "step": 42175, "token_acc": 0.4576923076923077, "train_speed(iter/s)": 1.440709 }, { "epoch": 1.8071205175442353, "grad_norm": 5.579444885253906, "learning_rate": 7.108634450037802e-05, "loss": 2.370299530029297, "memory(GiB)": 77.56, "step": 42180, "token_acc": 0.4980392156862745, "train_speed(iter/s)": 1.440714 }, { "epoch": 1.8073347328734846, "grad_norm": 4.456747055053711, "learning_rate": 7.108024227001615e-05, "loss": 2.6092477798461915, "memory(GiB)": 77.56, "step": 42185, "token_acc": 0.43389830508474575, "train_speed(iter/s)": 1.440728 }, { "epoch": 1.8075489482027334, "grad_norm": 5.074801921844482, "learning_rate": 7.10741396577659e-05, "loss": 2.525266647338867, "memory(GiB)": 77.56, "step": 42190, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.440734 }, { "epoch": 1.8077631635319822, "grad_norm": 5.257549285888672, "learning_rate": 7.106803666373787e-05, "loss": 2.597431182861328, "memory(GiB)": 77.56, "step": 42195, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.44073 }, { "epoch": 1.8079773788612314, "grad_norm": 6.119453430175781, "learning_rate": 7.10619332880426e-05, "loss": 2.4286149978637694, "memory(GiB)": 77.56, "step": 42200, "token_acc": 0.5105633802816901, "train_speed(iter/s)": 1.440764 }, { "epoch": 1.8081915941904803, "grad_norm": 5.287391662597656, "learning_rate": 7.105582953079063e-05, "loss": 2.454452133178711, "memory(GiB)": 77.56, "step": 42205, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.440738 }, { "epoch": 1.808405809519729, "grad_norm": 4.636159896850586, "learning_rate": 7.104972539209256e-05, "loss": 2.6267990112304687, "memory(GiB)": 77.56, "step": 42210, "token_acc": 0.4495114006514658, "train_speed(iter/s)": 1.440755 }, { "epoch": 1.8086200248489783, "grad_norm": 3.9708337783813477, "learning_rate": 7.104362087205898e-05, "loss": 2.505854606628418, "memory(GiB)": 77.56, "step": 42215, "token_acc": 0.47041420118343197, "train_speed(iter/s)": 1.440711 }, { "epoch": 1.8088342401782271, "grad_norm": 4.682599067687988, "learning_rate": 7.103751597080045e-05, "loss": 2.7869260787963865, "memory(GiB)": 77.56, "step": 42220, "token_acc": 0.4338235294117647, "train_speed(iter/s)": 1.440726 }, { "epoch": 1.809048455507476, "grad_norm": 5.472672462463379, "learning_rate": 7.103141068842759e-05, "loss": 2.3040191650390627, "memory(GiB)": 77.56, "step": 42225, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.44069 }, { "epoch": 1.8092626708367252, "grad_norm": 6.776150226593018, "learning_rate": 7.1025305025051e-05, "loss": 2.4268171310424806, "memory(GiB)": 77.56, "step": 42230, "token_acc": 0.5146443514644351, "train_speed(iter/s)": 1.440698 }, { "epoch": 1.809476886165974, "grad_norm": 4.946876525878906, "learning_rate": 7.101919898078128e-05, "loss": 2.7492752075195312, "memory(GiB)": 77.56, "step": 42235, "token_acc": 0.46686746987951805, "train_speed(iter/s)": 1.440678 }, { "epoch": 1.809691101495223, "grad_norm": 4.6621294021606445, "learning_rate": 7.101309255572905e-05, "loss": 2.518558311462402, "memory(GiB)": 77.56, "step": 42240, "token_acc": 0.4745222929936306, "train_speed(iter/s)": 1.440681 }, { "epoch": 1.809905316824472, "grad_norm": 6.335580825805664, "learning_rate": 7.100698575000496e-05, "loss": 2.7184579849243162, "memory(GiB)": 77.56, "step": 42245, "token_acc": 0.4452054794520548, "train_speed(iter/s)": 1.440694 }, { "epoch": 1.810119532153721, "grad_norm": 5.230987071990967, "learning_rate": 7.10008785637196e-05, "loss": 2.328630065917969, "memory(GiB)": 77.56, "step": 42250, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.440652 }, { "epoch": 1.81033374748297, "grad_norm": 6.371697425842285, "learning_rate": 7.099477099698361e-05, "loss": 2.582619285583496, "memory(GiB)": 77.56, "step": 42255, "token_acc": 0.45739910313901344, "train_speed(iter/s)": 1.440671 }, { "epoch": 1.810547962812219, "grad_norm": 8.896299362182617, "learning_rate": 7.098866304990767e-05, "loss": 2.2750127792358397, "memory(GiB)": 77.56, "step": 42260, "token_acc": 0.5152838427947598, "train_speed(iter/s)": 1.440688 }, { "epoch": 1.8107621781414678, "grad_norm": 4.576548099517822, "learning_rate": 7.09825547226024e-05, "loss": 2.2459184646606447, "memory(GiB)": 77.56, "step": 42265, "token_acc": 0.4715447154471545, "train_speed(iter/s)": 1.440688 }, { "epoch": 1.8109763934707168, "grad_norm": 5.128814220428467, "learning_rate": 7.097644601517848e-05, "loss": 2.376708984375, "memory(GiB)": 77.56, "step": 42270, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.440689 }, { "epoch": 1.8111906087999659, "grad_norm": 4.97381591796875, "learning_rate": 7.097033692774653e-05, "loss": 2.457427978515625, "memory(GiB)": 77.56, "step": 42275, "token_acc": 0.46774193548387094, "train_speed(iter/s)": 1.440707 }, { "epoch": 1.8114048241292147, "grad_norm": 4.62233829498291, "learning_rate": 7.09642274604173e-05, "loss": 2.306803512573242, "memory(GiB)": 77.56, "step": 42280, "token_acc": 0.5, "train_speed(iter/s)": 1.440719 }, { "epoch": 1.8116190394584637, "grad_norm": 5.273478031158447, "learning_rate": 7.09581176133014e-05, "loss": 2.4258193969726562, "memory(GiB)": 77.56, "step": 42285, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.440749 }, { "epoch": 1.8118332547877127, "grad_norm": 6.072991371154785, "learning_rate": 7.095200738650953e-05, "loss": 2.5939872741699217, "memory(GiB)": 77.56, "step": 42290, "token_acc": 0.4342105263157895, "train_speed(iter/s)": 1.440716 }, { "epoch": 1.8120474701169615, "grad_norm": 5.440391540527344, "learning_rate": 7.094589678015242e-05, "loss": 2.393227195739746, "memory(GiB)": 77.56, "step": 42295, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.440729 }, { "epoch": 1.8122616854462106, "grad_norm": 5.05177640914917, "learning_rate": 7.093978579434072e-05, "loss": 2.6129196166992186, "memory(GiB)": 77.56, "step": 42300, "token_acc": 0.47634069400630913, "train_speed(iter/s)": 1.440736 }, { "epoch": 1.8124759007754596, "grad_norm": 4.825653553009033, "learning_rate": 7.093367442918515e-05, "loss": 2.6763320922851563, "memory(GiB)": 77.56, "step": 42305, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.440759 }, { "epoch": 1.8126901161047084, "grad_norm": 4.48107385635376, "learning_rate": 7.092756268479646e-05, "loss": 2.4609107971191406, "memory(GiB)": 77.56, "step": 42310, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.440765 }, { "epoch": 1.8129043314339575, "grad_norm": 4.72641134262085, "learning_rate": 7.092145056128532e-05, "loss": 2.511740493774414, "memory(GiB)": 77.56, "step": 42315, "token_acc": 0.4341317365269461, "train_speed(iter/s)": 1.440751 }, { "epoch": 1.8131185467632065, "grad_norm": 4.278848648071289, "learning_rate": 7.091533805876247e-05, "loss": 2.2098731994628906, "memory(GiB)": 77.56, "step": 42320, "token_acc": 0.553030303030303, "train_speed(iter/s)": 1.440725 }, { "epoch": 1.8133327620924553, "grad_norm": 5.661434650421143, "learning_rate": 7.090922517733867e-05, "loss": 2.554936981201172, "memory(GiB)": 77.56, "step": 42325, "token_acc": 0.424812030075188, "train_speed(iter/s)": 1.440728 }, { "epoch": 1.8135469774217043, "grad_norm": 5.949657440185547, "learning_rate": 7.090311191712464e-05, "loss": 2.510664367675781, "memory(GiB)": 77.56, "step": 42330, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.440738 }, { "epoch": 1.8137611927509534, "grad_norm": 6.50044584274292, "learning_rate": 7.08969982782311e-05, "loss": 2.4037689208984374, "memory(GiB)": 77.56, "step": 42335, "token_acc": 0.5363984674329502, "train_speed(iter/s)": 1.44077 }, { "epoch": 1.8139754080802022, "grad_norm": 5.013998031616211, "learning_rate": 7.089088426076885e-05, "loss": 2.6579734802246096, "memory(GiB)": 77.56, "step": 42340, "token_acc": 0.4725274725274725, "train_speed(iter/s)": 1.44077 }, { "epoch": 1.8141896234094512, "grad_norm": 4.483895778656006, "learning_rate": 7.088476986484865e-05, "loss": 2.281387710571289, "memory(GiB)": 77.56, "step": 42345, "token_acc": 0.5019305019305019, "train_speed(iter/s)": 1.440812 }, { "epoch": 1.8144038387387003, "grad_norm": 4.825186252593994, "learning_rate": 7.087865509058124e-05, "loss": 2.730850601196289, "memory(GiB)": 77.56, "step": 42350, "token_acc": 0.43288590604026844, "train_speed(iter/s)": 1.440823 }, { "epoch": 1.814618054067949, "grad_norm": 4.690332889556885, "learning_rate": 7.087253993807741e-05, "loss": 2.4748687744140625, "memory(GiB)": 77.56, "step": 42355, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.440795 }, { "epoch": 1.814832269397198, "grad_norm": 5.284695148468018, "learning_rate": 7.086642440744794e-05, "loss": 2.4146251678466797, "memory(GiB)": 77.56, "step": 42360, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.440824 }, { "epoch": 1.8150464847264471, "grad_norm": 5.020896911621094, "learning_rate": 7.08603084988036e-05, "loss": 2.4790555953979494, "memory(GiB)": 77.56, "step": 42365, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.440807 }, { "epoch": 1.815260700055696, "grad_norm": 5.374149322509766, "learning_rate": 7.085419221225522e-05, "loss": 2.7598379135131834, "memory(GiB)": 77.56, "step": 42370, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.440795 }, { "epoch": 1.815474915384945, "grad_norm": 4.4034600257873535, "learning_rate": 7.084807554791358e-05, "loss": 2.6706916809082033, "memory(GiB)": 77.56, "step": 42375, "token_acc": 0.428125, "train_speed(iter/s)": 1.440817 }, { "epoch": 1.815689130714194, "grad_norm": 3.9676151275634766, "learning_rate": 7.08419585058895e-05, "loss": 2.5571022033691406, "memory(GiB)": 77.56, "step": 42380, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.440856 }, { "epoch": 1.8159033460434428, "grad_norm": 5.746986389160156, "learning_rate": 7.083584108629379e-05, "loss": 2.4747262954711915, "memory(GiB)": 77.56, "step": 42385, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.440847 }, { "epoch": 1.8161175613726919, "grad_norm": 6.064831733703613, "learning_rate": 7.082972328923726e-05, "loss": 2.4341896057128904, "memory(GiB)": 77.56, "step": 42390, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.440861 }, { "epoch": 1.816331776701941, "grad_norm": 9.76562213897705, "learning_rate": 7.082360511483077e-05, "loss": 2.770310974121094, "memory(GiB)": 77.56, "step": 42395, "token_acc": 0.4227129337539432, "train_speed(iter/s)": 1.440855 }, { "epoch": 1.8165459920311897, "grad_norm": 5.372458457946777, "learning_rate": 7.081748656318514e-05, "loss": 2.2854185104370117, "memory(GiB)": 77.56, "step": 42400, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.440816 }, { "epoch": 1.8167602073604388, "grad_norm": 4.555784702301025, "learning_rate": 7.081136763441119e-05, "loss": 2.782064437866211, "memory(GiB)": 77.56, "step": 42405, "token_acc": 0.44660194174757284, "train_speed(iter/s)": 1.440793 }, { "epoch": 1.8169744226896878, "grad_norm": 4.1589155197143555, "learning_rate": 7.080524832861982e-05, "loss": 2.4405752182006837, "memory(GiB)": 77.56, "step": 42410, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 1.440764 }, { "epoch": 1.8171886380189366, "grad_norm": 4.964533805847168, "learning_rate": 7.079912864592185e-05, "loss": 2.1579416275024412, "memory(GiB)": 77.56, "step": 42415, "token_acc": 0.5703703703703704, "train_speed(iter/s)": 1.440742 }, { "epoch": 1.8174028533481856, "grad_norm": 4.621743202209473, "learning_rate": 7.079300858642814e-05, "loss": 2.334096145629883, "memory(GiB)": 77.56, "step": 42420, "token_acc": 0.48659003831417624, "train_speed(iter/s)": 1.44073 }, { "epoch": 1.8176170686774347, "grad_norm": 4.793374061584473, "learning_rate": 7.078688815024959e-05, "loss": 2.5243562698364257, "memory(GiB)": 77.56, "step": 42425, "token_acc": 0.4472573839662447, "train_speed(iter/s)": 1.440763 }, { "epoch": 1.8178312840066835, "grad_norm": 6.596546173095703, "learning_rate": 7.078076733749706e-05, "loss": 2.651638412475586, "memory(GiB)": 77.56, "step": 42430, "token_acc": 0.45791245791245794, "train_speed(iter/s)": 1.440778 }, { "epoch": 1.8180454993359325, "grad_norm": 5.211883544921875, "learning_rate": 7.077464614828142e-05, "loss": 2.6267276763916017, "memory(GiB)": 77.56, "step": 42435, "token_acc": 0.46175637393767704, "train_speed(iter/s)": 1.440809 }, { "epoch": 1.8182597146651815, "grad_norm": 5.329939842224121, "learning_rate": 7.07685245827136e-05, "loss": 2.4424423217773437, "memory(GiB)": 77.56, "step": 42440, "token_acc": 0.4740740740740741, "train_speed(iter/s)": 1.440836 }, { "epoch": 1.8184739299944304, "grad_norm": 3.719423294067383, "learning_rate": 7.076240264090446e-05, "loss": 2.7285940170288088, "memory(GiB)": 77.56, "step": 42445, "token_acc": 0.4461538461538462, "train_speed(iter/s)": 1.440835 }, { "epoch": 1.8186881453236794, "grad_norm": 4.613279819488525, "learning_rate": 7.075628032296491e-05, "loss": 2.375505828857422, "memory(GiB)": 77.56, "step": 42450, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.440848 }, { "epoch": 1.8189023606529284, "grad_norm": 4.4954938888549805, "learning_rate": 7.07501576290059e-05, "loss": 2.356053924560547, "memory(GiB)": 77.56, "step": 42455, "token_acc": 0.5182186234817814, "train_speed(iter/s)": 1.440787 }, { "epoch": 1.8191165759821772, "grad_norm": 4.431429386138916, "learning_rate": 7.074403455913829e-05, "loss": 2.754829216003418, "memory(GiB)": 77.56, "step": 42460, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.440799 }, { "epoch": 1.8193307913114263, "grad_norm": 4.924394130706787, "learning_rate": 7.073791111347305e-05, "loss": 2.286001205444336, "memory(GiB)": 77.56, "step": 42465, "token_acc": 0.5287356321839081, "train_speed(iter/s)": 1.440832 }, { "epoch": 1.8195450066406753, "grad_norm": 4.5436859130859375, "learning_rate": 7.07317872921211e-05, "loss": 2.4529966354370116, "memory(GiB)": 77.56, "step": 42470, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.440838 }, { "epoch": 1.8197592219699241, "grad_norm": 5.282332420349121, "learning_rate": 7.072566309519338e-05, "loss": 2.5372215270996095, "memory(GiB)": 77.56, "step": 42475, "token_acc": 0.47346938775510206, "train_speed(iter/s)": 1.440849 }, { "epoch": 1.8199734372991732, "grad_norm": 5.669687271118164, "learning_rate": 7.071953852280081e-05, "loss": 2.5251893997192383, "memory(GiB)": 77.56, "step": 42480, "token_acc": 0.4625, "train_speed(iter/s)": 1.440871 }, { "epoch": 1.8201876526284222, "grad_norm": 4.748483180999756, "learning_rate": 7.071341357505437e-05, "loss": 2.6501842498779298, "memory(GiB)": 77.56, "step": 42485, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.440895 }, { "epoch": 1.820401867957671, "grad_norm": 4.6338114738464355, "learning_rate": 7.070728825206501e-05, "loss": 2.8831497192382813, "memory(GiB)": 77.56, "step": 42490, "token_acc": 0.445859872611465, "train_speed(iter/s)": 1.440922 }, { "epoch": 1.82061608328692, "grad_norm": 6.064342498779297, "learning_rate": 7.070116255394371e-05, "loss": 2.340175819396973, "memory(GiB)": 77.56, "step": 42495, "token_acc": 0.5032894736842105, "train_speed(iter/s)": 1.440962 }, { "epoch": 1.820830298616169, "grad_norm": 5.834266662597656, "learning_rate": 7.069503648080143e-05, "loss": 2.5555051803588866, "memory(GiB)": 77.56, "step": 42500, "token_acc": 0.471875, "train_speed(iter/s)": 1.440978 }, { "epoch": 1.820830298616169, "eval_loss": 2.2241339683532715, "eval_runtime": 14.1675, "eval_samples_per_second": 7.058, "eval_steps_per_second": 7.058, "eval_token_acc": 0.47468354430379744, "step": 42500 }, { "epoch": 1.8210445139454179, "grad_norm": 6.313887596130371, "learning_rate": 7.068891003274915e-05, "loss": 2.353474807739258, "memory(GiB)": 77.56, "step": 42505, "token_acc": 0.4801812004530011, "train_speed(iter/s)": 1.440276 }, { "epoch": 1.821258729274667, "grad_norm": 5.908014297485352, "learning_rate": 7.068278320989785e-05, "loss": 2.248538017272949, "memory(GiB)": 77.56, "step": 42510, "token_acc": 0.5165289256198347, "train_speed(iter/s)": 1.440285 }, { "epoch": 1.821472944603916, "grad_norm": 4.993363857269287, "learning_rate": 7.067665601235853e-05, "loss": 2.569161033630371, "memory(GiB)": 77.56, "step": 42515, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.440284 }, { "epoch": 1.8216871599331648, "grad_norm": 5.7084221839904785, "learning_rate": 7.067052844024219e-05, "loss": 2.5540475845336914, "memory(GiB)": 77.56, "step": 42520, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.440283 }, { "epoch": 1.8219013752624138, "grad_norm": 4.787469863891602, "learning_rate": 7.066440049365983e-05, "loss": 2.291056823730469, "memory(GiB)": 77.56, "step": 42525, "token_acc": 0.5285714285714286, "train_speed(iter/s)": 1.44027 }, { "epoch": 1.8221155905916628, "grad_norm": 4.9950642585754395, "learning_rate": 7.06582721727225e-05, "loss": 2.5157814025878906, "memory(GiB)": 77.56, "step": 42530, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.440273 }, { "epoch": 1.8223298059209116, "grad_norm": 5.4237542152404785, "learning_rate": 7.065214347754115e-05, "loss": 2.4063838958740233, "memory(GiB)": 77.56, "step": 42535, "token_acc": 0.49429657794676807, "train_speed(iter/s)": 1.440227 }, { "epoch": 1.8225440212501607, "grad_norm": 6.043896198272705, "learning_rate": 7.064601440822688e-05, "loss": 2.2234670639038088, "memory(GiB)": 77.56, "step": 42540, "token_acc": 0.5152542372881356, "train_speed(iter/s)": 1.440256 }, { "epoch": 1.8227582365794097, "grad_norm": 5.785392761230469, "learning_rate": 7.063988496489067e-05, "loss": 2.357148551940918, "memory(GiB)": 77.56, "step": 42545, "token_acc": 0.5191082802547771, "train_speed(iter/s)": 1.440286 }, { "epoch": 1.8229724519086585, "grad_norm": 4.614426136016846, "learning_rate": 7.06337551476436e-05, "loss": 2.3619728088378906, "memory(GiB)": 77.56, "step": 42550, "token_acc": 0.5354609929078015, "train_speed(iter/s)": 1.440311 }, { "epoch": 1.8231866672379076, "grad_norm": 5.35098934173584, "learning_rate": 7.062762495659669e-05, "loss": 2.5314332962036135, "memory(GiB)": 77.56, "step": 42555, "token_acc": 0.46258503401360546, "train_speed(iter/s)": 1.440301 }, { "epoch": 1.8234008825671566, "grad_norm": 8.009761810302734, "learning_rate": 7.062149439186101e-05, "loss": 2.5030033111572267, "memory(GiB)": 77.56, "step": 42560, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.440293 }, { "epoch": 1.8236150978964054, "grad_norm": 4.992061614990234, "learning_rate": 7.06153634535476e-05, "loss": 2.441164016723633, "memory(GiB)": 77.56, "step": 42565, "token_acc": 0.4603658536585366, "train_speed(iter/s)": 1.440297 }, { "epoch": 1.8238293132256544, "grad_norm": 5.6058502197265625, "learning_rate": 7.060923214176756e-05, "loss": 2.9603830337524415, "memory(GiB)": 77.56, "step": 42570, "token_acc": 0.40764331210191085, "train_speed(iter/s)": 1.440257 }, { "epoch": 1.8240435285549035, "grad_norm": 4.6703643798828125, "learning_rate": 7.060310045663193e-05, "loss": 2.729035568237305, "memory(GiB)": 77.56, "step": 42575, "token_acc": 0.4329268292682927, "train_speed(iter/s)": 1.440283 }, { "epoch": 1.8242577438841523, "grad_norm": 4.325722694396973, "learning_rate": 7.059696839825182e-05, "loss": 2.428070068359375, "memory(GiB)": 77.56, "step": 42580, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.440309 }, { "epoch": 1.8244719592134013, "grad_norm": 4.568417549133301, "learning_rate": 7.05908359667383e-05, "loss": 2.5685184478759764, "memory(GiB)": 77.56, "step": 42585, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.440317 }, { "epoch": 1.8246861745426504, "grad_norm": 5.907657623291016, "learning_rate": 7.058470316220248e-05, "loss": 2.3318790435791015, "memory(GiB)": 77.56, "step": 42590, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.440301 }, { "epoch": 1.8249003898718992, "grad_norm": 6.304762840270996, "learning_rate": 7.057856998475543e-05, "loss": 2.5876407623291016, "memory(GiB)": 77.56, "step": 42595, "token_acc": 0.5059288537549407, "train_speed(iter/s)": 1.440326 }, { "epoch": 1.8251146052011482, "grad_norm": 5.732647895812988, "learning_rate": 7.05724364345083e-05, "loss": 2.6107913970947267, "memory(GiB)": 77.56, "step": 42600, "token_acc": 0.45016077170418006, "train_speed(iter/s)": 1.440323 }, { "epoch": 1.8253288205303972, "grad_norm": 5.199191093444824, "learning_rate": 7.056630251157219e-05, "loss": 2.3107242584228516, "memory(GiB)": 77.56, "step": 42605, "token_acc": 0.506578947368421, "train_speed(iter/s)": 1.44035 }, { "epoch": 1.825543035859646, "grad_norm": 4.308810710906982, "learning_rate": 7.05601682160582e-05, "loss": 2.695077133178711, "memory(GiB)": 77.56, "step": 42610, "token_acc": 0.4565826330532213, "train_speed(iter/s)": 1.440358 }, { "epoch": 1.825757251188895, "grad_norm": 6.556538105010986, "learning_rate": 7.055403354807749e-05, "loss": 2.414827346801758, "memory(GiB)": 77.56, "step": 42615, "token_acc": 0.501577287066246, "train_speed(iter/s)": 1.440343 }, { "epoch": 1.8259714665181441, "grad_norm": 6.080617427825928, "learning_rate": 7.054789850774118e-05, "loss": 2.4803916931152346, "memory(GiB)": 77.56, "step": 42620, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.440339 }, { "epoch": 1.826185681847393, "grad_norm": 6.105247974395752, "learning_rate": 7.054176309516041e-05, "loss": 2.6499397277832033, "memory(GiB)": 77.56, "step": 42625, "token_acc": 0.4808259587020649, "train_speed(iter/s)": 1.440349 }, { "epoch": 1.826399897176642, "grad_norm": 6.8638014793396, "learning_rate": 7.053562731044632e-05, "loss": 2.483260726928711, "memory(GiB)": 77.56, "step": 42630, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.440385 }, { "epoch": 1.826614112505891, "grad_norm": 4.559618949890137, "learning_rate": 7.05294911537101e-05, "loss": 2.4516197204589845, "memory(GiB)": 77.56, "step": 42635, "token_acc": 0.4517133956386293, "train_speed(iter/s)": 1.440405 }, { "epoch": 1.8268283278351398, "grad_norm": 6.369504928588867, "learning_rate": 7.052335462506285e-05, "loss": 2.7407154083251952, "memory(GiB)": 77.56, "step": 42640, "token_acc": 0.44047619047619047, "train_speed(iter/s)": 1.440392 }, { "epoch": 1.8270425431643889, "grad_norm": 5.443140029907227, "learning_rate": 7.051721772461582e-05, "loss": 2.403109550476074, "memory(GiB)": 77.56, "step": 42645, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.440407 }, { "epoch": 1.827256758493638, "grad_norm": 5.081888675689697, "learning_rate": 7.051108045248014e-05, "loss": 2.6135513305664064, "memory(GiB)": 77.56, "step": 42650, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.440404 }, { "epoch": 1.8274709738228867, "grad_norm": 6.575129985809326, "learning_rate": 7.050494280876697e-05, "loss": 2.4567901611328127, "memory(GiB)": 77.56, "step": 42655, "token_acc": 0.5063829787234042, "train_speed(iter/s)": 1.44042 }, { "epoch": 1.8276851891521357, "grad_norm": 4.810214042663574, "learning_rate": 7.049880479358754e-05, "loss": 2.452205276489258, "memory(GiB)": 77.56, "step": 42660, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.440411 }, { "epoch": 1.8278994044813848, "grad_norm": 5.769571781158447, "learning_rate": 7.049266640705304e-05, "loss": 2.678585433959961, "memory(GiB)": 77.56, "step": 42665, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440399 }, { "epoch": 1.8281136198106336, "grad_norm": 4.388237476348877, "learning_rate": 7.048652764927466e-05, "loss": 2.7901378631591798, "memory(GiB)": 77.56, "step": 42670, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.440421 }, { "epoch": 1.8283278351398826, "grad_norm": 4.146549701690674, "learning_rate": 7.04803885203636e-05, "loss": 2.524386978149414, "memory(GiB)": 77.56, "step": 42675, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.440461 }, { "epoch": 1.8285420504691317, "grad_norm": 3.608217239379883, "learning_rate": 7.047424902043111e-05, "loss": 2.617009162902832, "memory(GiB)": 77.56, "step": 42680, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.440473 }, { "epoch": 1.8287562657983805, "grad_norm": 5.381430625915527, "learning_rate": 7.046810914958839e-05, "loss": 2.31790828704834, "memory(GiB)": 77.56, "step": 42685, "token_acc": 0.4701195219123506, "train_speed(iter/s)": 1.440489 }, { "epoch": 1.8289704811276295, "grad_norm": 5.573160648345947, "learning_rate": 7.046196890794666e-05, "loss": 2.4803380966186523, "memory(GiB)": 77.56, "step": 42690, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.8291846964568785, "grad_norm": 6.720072269439697, "learning_rate": 7.045582829561718e-05, "loss": 2.4793569564819338, "memory(GiB)": 77.56, "step": 42695, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.440516 }, { "epoch": 1.8293989117861273, "grad_norm": 5.089569568634033, "learning_rate": 7.044968731271118e-05, "loss": 2.31717529296875, "memory(GiB)": 77.56, "step": 42700, "token_acc": 0.4981549815498155, "train_speed(iter/s)": 1.440509 }, { "epoch": 1.8296131271153764, "grad_norm": 5.226766109466553, "learning_rate": 7.04435459593399e-05, "loss": 2.63138484954834, "memory(GiB)": 77.56, "step": 42705, "token_acc": 0.45878136200716846, "train_speed(iter/s)": 1.440499 }, { "epoch": 1.8298273424446254, "grad_norm": 5.256031513214111, "learning_rate": 7.04374042356146e-05, "loss": 2.8003908157348634, "memory(GiB)": 77.56, "step": 42710, "token_acc": 0.4387755102040816, "train_speed(iter/s)": 1.440458 }, { "epoch": 1.8300415577738742, "grad_norm": 4.400367259979248, "learning_rate": 7.043126214164657e-05, "loss": 2.4033090591430666, "memory(GiB)": 77.56, "step": 42715, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 1.440485 }, { "epoch": 1.8302557731031233, "grad_norm": 6.759885311126709, "learning_rate": 7.042511967754707e-05, "loss": 2.7107378005981446, "memory(GiB)": 77.56, "step": 42720, "token_acc": 0.45569620253164556, "train_speed(iter/s)": 1.440511 }, { "epoch": 1.8304699884323723, "grad_norm": 6.6062116622924805, "learning_rate": 7.041897684342734e-05, "loss": 2.503848648071289, "memory(GiB)": 77.56, "step": 42725, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.440518 }, { "epoch": 1.830684203761621, "grad_norm": 7.270866870880127, "learning_rate": 7.04128336393987e-05, "loss": 2.848384666442871, "memory(GiB)": 77.56, "step": 42730, "token_acc": 0.4232081911262799, "train_speed(iter/s)": 1.440544 }, { "epoch": 1.8308984190908701, "grad_norm": 4.867170810699463, "learning_rate": 7.040669006557245e-05, "loss": 2.3126014709472655, "memory(GiB)": 77.56, "step": 42735, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.440524 }, { "epoch": 1.8311126344201192, "grad_norm": 5.4053635597229, "learning_rate": 7.040054612205982e-05, "loss": 2.4727672576904296, "memory(GiB)": 77.56, "step": 42740, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.440515 }, { "epoch": 1.831326849749368, "grad_norm": 5.161674976348877, "learning_rate": 7.03944018089722e-05, "loss": 2.333534812927246, "memory(GiB)": 77.56, "step": 42745, "token_acc": 0.47019867549668876, "train_speed(iter/s)": 1.44052 }, { "epoch": 1.831541065078617, "grad_norm": 4.686703205108643, "learning_rate": 7.038825712642085e-05, "loss": 2.735204315185547, "memory(GiB)": 77.56, "step": 42750, "token_acc": 0.4078014184397163, "train_speed(iter/s)": 1.440562 }, { "epoch": 1.831755280407866, "grad_norm": 4.866014003753662, "learning_rate": 7.03821120745171e-05, "loss": 2.665049362182617, "memory(GiB)": 77.56, "step": 42755, "token_acc": 0.4533333333333333, "train_speed(iter/s)": 1.440578 }, { "epoch": 1.8319694957371149, "grad_norm": 4.805840969085693, "learning_rate": 7.037596665337226e-05, "loss": 2.5504104614257814, "memory(GiB)": 77.56, "step": 42760, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.440597 }, { "epoch": 1.832183711066364, "grad_norm": 5.937466621398926, "learning_rate": 7.036982086309769e-05, "loss": 2.3927824020385744, "memory(GiB)": 77.56, "step": 42765, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.440654 }, { "epoch": 1.832397926395613, "grad_norm": 6.452016830444336, "learning_rate": 7.036367470380467e-05, "loss": 2.840751838684082, "memory(GiB)": 77.56, "step": 42770, "token_acc": 0.44876325088339225, "train_speed(iter/s)": 1.440628 }, { "epoch": 1.8326121417248618, "grad_norm": 7.551222324371338, "learning_rate": 7.03575281756046e-05, "loss": 2.674056816101074, "memory(GiB)": 77.56, "step": 42775, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.440636 }, { "epoch": 1.8328263570541108, "grad_norm": 7.136316776275635, "learning_rate": 7.03513812786088e-05, "loss": 2.59435977935791, "memory(GiB)": 77.56, "step": 42780, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.44062 }, { "epoch": 1.8330405723833598, "grad_norm": 4.6389899253845215, "learning_rate": 7.034523401292866e-05, "loss": 2.7101425170898437, "memory(GiB)": 77.56, "step": 42785, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.440627 }, { "epoch": 1.8332547877126086, "grad_norm": 3.842972993850708, "learning_rate": 7.03390863786755e-05, "loss": 2.4858264923095703, "memory(GiB)": 77.56, "step": 42790, "token_acc": 0.49002849002849, "train_speed(iter/s)": 1.440653 }, { "epoch": 1.8334690030418577, "grad_norm": 5.817306041717529, "learning_rate": 7.033293837596073e-05, "loss": 2.576206588745117, "memory(GiB)": 77.56, "step": 42795, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.440662 }, { "epoch": 1.8336832183711067, "grad_norm": 6.170565605163574, "learning_rate": 7.032679000489568e-05, "loss": 2.592821502685547, "memory(GiB)": 77.56, "step": 42800, "token_acc": 0.45144356955380577, "train_speed(iter/s)": 1.440668 }, { "epoch": 1.8338974337003555, "grad_norm": 5.738097190856934, "learning_rate": 7.032064126559179e-05, "loss": 2.663206100463867, "memory(GiB)": 77.56, "step": 42805, "token_acc": 0.44868035190615835, "train_speed(iter/s)": 1.440678 }, { "epoch": 1.8341116490296046, "grad_norm": 5.772311687469482, "learning_rate": 7.031449215816041e-05, "loss": 2.6962120056152346, "memory(GiB)": 77.56, "step": 42810, "token_acc": 0.4280936454849498, "train_speed(iter/s)": 1.440685 }, { "epoch": 1.8343258643588536, "grad_norm": 5.5379509925842285, "learning_rate": 7.030834268271294e-05, "loss": 2.4956586837768553, "memory(GiB)": 77.56, "step": 42815, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.440683 }, { "epoch": 1.8345400796881024, "grad_norm": 4.719763278961182, "learning_rate": 7.030219283936079e-05, "loss": 2.643565368652344, "memory(GiB)": 77.56, "step": 42820, "token_acc": 0.4485294117647059, "train_speed(iter/s)": 1.440694 }, { "epoch": 1.8347542950173514, "grad_norm": 4.409863471984863, "learning_rate": 7.029604262821539e-05, "loss": 2.5345787048339843, "memory(GiB)": 77.56, "step": 42825, "token_acc": 0.4810126582278481, "train_speed(iter/s)": 1.440697 }, { "epoch": 1.8349685103466005, "grad_norm": 5.3105268478393555, "learning_rate": 7.028989204938812e-05, "loss": 2.3162155151367188, "memory(GiB)": 77.56, "step": 42830, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.440737 }, { "epoch": 1.8351827256758493, "grad_norm": 5.830991268157959, "learning_rate": 7.028374110299044e-05, "loss": 2.385612678527832, "memory(GiB)": 77.56, "step": 42835, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.440776 }, { "epoch": 1.8353969410050983, "grad_norm": 6.850379467010498, "learning_rate": 7.027758978913374e-05, "loss": 2.410422134399414, "memory(GiB)": 77.56, "step": 42840, "token_acc": 0.4858156028368794, "train_speed(iter/s)": 1.44078 }, { "epoch": 1.8356111563343473, "grad_norm": 4.204717636108398, "learning_rate": 7.02714381079295e-05, "loss": 2.3584640502929686, "memory(GiB)": 77.56, "step": 42845, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 1.440822 }, { "epoch": 1.8358253716635962, "grad_norm": 3.996864080429077, "learning_rate": 7.026528605948913e-05, "loss": 2.572443962097168, "memory(GiB)": 77.56, "step": 42850, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.440809 }, { "epoch": 1.8360395869928452, "grad_norm": 4.448183536529541, "learning_rate": 7.025913364392409e-05, "loss": 2.6070602416992186, "memory(GiB)": 77.56, "step": 42855, "token_acc": 0.44510385756676557, "train_speed(iter/s)": 1.440774 }, { "epoch": 1.8362538023220942, "grad_norm": 6.315032482147217, "learning_rate": 7.025298086134586e-05, "loss": 2.8378742218017576, "memory(GiB)": 77.56, "step": 42860, "token_acc": 0.425, "train_speed(iter/s)": 1.44075 }, { "epoch": 1.836468017651343, "grad_norm": 5.0336761474609375, "learning_rate": 7.024682771186586e-05, "loss": 2.6184921264648438, "memory(GiB)": 77.56, "step": 42865, "token_acc": 0.484251968503937, "train_speed(iter/s)": 1.440768 }, { "epoch": 1.836682232980592, "grad_norm": 5.010344982147217, "learning_rate": 7.02406741955956e-05, "loss": 2.6325130462646484, "memory(GiB)": 77.56, "step": 42870, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.440763 }, { "epoch": 1.8368964483098411, "grad_norm": 5.454922199249268, "learning_rate": 7.023452031264653e-05, "loss": 2.5520700454711913, "memory(GiB)": 77.56, "step": 42875, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.440767 }, { "epoch": 1.83711066363909, "grad_norm": 5.264733791351318, "learning_rate": 7.022836606313015e-05, "loss": 2.335470199584961, "memory(GiB)": 77.56, "step": 42880, "token_acc": 0.535593220338983, "train_speed(iter/s)": 1.440748 }, { "epoch": 1.837324878968339, "grad_norm": 4.933237552642822, "learning_rate": 7.022221144715793e-05, "loss": 2.393748474121094, "memory(GiB)": 77.56, "step": 42885, "token_acc": 0.4470588235294118, "train_speed(iter/s)": 1.440753 }, { "epoch": 1.837539094297588, "grad_norm": 6.662442207336426, "learning_rate": 7.021605646484137e-05, "loss": 2.7910985946655273, "memory(GiB)": 77.56, "step": 42890, "token_acc": 0.4368932038834951, "train_speed(iter/s)": 1.440682 }, { "epoch": 1.8377533096268368, "grad_norm": 5.211727142333984, "learning_rate": 7.020990111629202e-05, "loss": 2.9527698516845704, "memory(GiB)": 77.56, "step": 42895, "token_acc": 0.4421768707482993, "train_speed(iter/s)": 1.440671 }, { "epoch": 1.8379675249560858, "grad_norm": 5.574032783508301, "learning_rate": 7.020374540162132e-05, "loss": 2.2710084915161133, "memory(GiB)": 77.56, "step": 42900, "token_acc": 0.49407114624505927, "train_speed(iter/s)": 1.440718 }, { "epoch": 1.8381817402853349, "grad_norm": 5.917013168334961, "learning_rate": 7.019758932094083e-05, "loss": 2.2579360961914063, "memory(GiB)": 77.56, "step": 42905, "token_acc": 0.5194805194805194, "train_speed(iter/s)": 1.440726 }, { "epoch": 1.8383959556145837, "grad_norm": 4.261195659637451, "learning_rate": 7.019143287436207e-05, "loss": 2.517148017883301, "memory(GiB)": 77.56, "step": 42910, "token_acc": 0.5078369905956113, "train_speed(iter/s)": 1.440739 }, { "epoch": 1.8386101709438327, "grad_norm": 5.687638282775879, "learning_rate": 7.018527606199657e-05, "loss": 2.4009639739990236, "memory(GiB)": 77.56, "step": 42915, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.44071 }, { "epoch": 1.8388243862730818, "grad_norm": 5.801491737365723, "learning_rate": 7.017911888395584e-05, "loss": 2.4040607452392577, "memory(GiB)": 77.56, "step": 42920, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.440677 }, { "epoch": 1.8390386016023306, "grad_norm": 5.284247398376465, "learning_rate": 7.017296134035145e-05, "loss": 2.453633117675781, "memory(GiB)": 77.56, "step": 42925, "token_acc": 0.5205479452054794, "train_speed(iter/s)": 1.440694 }, { "epoch": 1.8392528169315796, "grad_norm": 6.3654465675354, "learning_rate": 7.016680343129493e-05, "loss": 2.527718734741211, "memory(GiB)": 77.56, "step": 42930, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.440744 }, { "epoch": 1.8394670322608286, "grad_norm": 4.958274841308594, "learning_rate": 7.016064515689787e-05, "loss": 2.4775012969970702, "memory(GiB)": 77.56, "step": 42935, "token_acc": 0.4900662251655629, "train_speed(iter/s)": 1.440736 }, { "epoch": 1.8396812475900775, "grad_norm": 5.551571846008301, "learning_rate": 7.015448651727179e-05, "loss": 2.655436325073242, "memory(GiB)": 77.56, "step": 42940, "token_acc": 0.4430769230769231, "train_speed(iter/s)": 1.44069 }, { "epoch": 1.8398954629193265, "grad_norm": 4.376008033752441, "learning_rate": 7.01483275125283e-05, "loss": 2.537699890136719, "memory(GiB)": 77.56, "step": 42945, "token_acc": 0.49174917491749176, "train_speed(iter/s)": 1.440726 }, { "epoch": 1.8401096782485755, "grad_norm": 5.936407089233398, "learning_rate": 7.014216814277894e-05, "loss": 2.7400318145751954, "memory(GiB)": 77.56, "step": 42950, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.440742 }, { "epoch": 1.8403238935778243, "grad_norm": 5.693716049194336, "learning_rate": 7.01360084081353e-05, "loss": 2.706703948974609, "memory(GiB)": 77.56, "step": 42955, "token_acc": 0.4244604316546763, "train_speed(iter/s)": 1.440753 }, { "epoch": 1.8405381089070734, "grad_norm": 6.113903999328613, "learning_rate": 7.0129848308709e-05, "loss": 2.658347320556641, "memory(GiB)": 77.56, "step": 42960, "token_acc": 0.45058139534883723, "train_speed(iter/s)": 1.440785 }, { "epoch": 1.8407523242363224, "grad_norm": 5.083117961883545, "learning_rate": 7.012368784461161e-05, "loss": 2.833583641052246, "memory(GiB)": 77.56, "step": 42965, "token_acc": 0.4233128834355828, "train_speed(iter/s)": 1.440824 }, { "epoch": 1.8409665395655712, "grad_norm": 4.786596775054932, "learning_rate": 7.011752701595471e-05, "loss": 2.790046501159668, "memory(GiB)": 77.56, "step": 42970, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.440831 }, { "epoch": 1.8411807548948202, "grad_norm": 5.824346542358398, "learning_rate": 7.011136582284998e-05, "loss": 2.5863201141357424, "memory(GiB)": 77.56, "step": 42975, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.44084 }, { "epoch": 1.8413949702240693, "grad_norm": 4.7220845222473145, "learning_rate": 7.010520426540897e-05, "loss": 2.336712646484375, "memory(GiB)": 77.56, "step": 42980, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 1.440843 }, { "epoch": 1.841609185553318, "grad_norm": 5.535294532775879, "learning_rate": 7.009904234374332e-05, "loss": 2.5778785705566407, "memory(GiB)": 77.56, "step": 42985, "token_acc": 0.4674329501915709, "train_speed(iter/s)": 1.440867 }, { "epoch": 1.8418234008825671, "grad_norm": 6.33097505569458, "learning_rate": 7.009288005796469e-05, "loss": 2.712126541137695, "memory(GiB)": 77.56, "step": 42990, "token_acc": 0.41132075471698115, "train_speed(iter/s)": 1.440898 }, { "epoch": 1.8420376162118162, "grad_norm": 4.346415996551514, "learning_rate": 7.008671740818466e-05, "loss": 2.378527069091797, "memory(GiB)": 77.56, "step": 42995, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.440894 }, { "epoch": 1.842251831541065, "grad_norm": 4.51521110534668, "learning_rate": 7.008055439451491e-05, "loss": 2.4386518478393553, "memory(GiB)": 77.56, "step": 43000, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.440871 }, { "epoch": 1.842251831541065, "eval_loss": 2.1833853721618652, "eval_runtime": 14.0243, "eval_samples_per_second": 7.131, "eval_steps_per_second": 7.131, "eval_token_acc": 0.4574468085106383, "step": 43000 }, { "epoch": 1.842466046870314, "grad_norm": 4.630211353302002, "learning_rate": 7.007439101706708e-05, "loss": 2.5586383819580076, "memory(GiB)": 77.56, "step": 43005, "token_acc": 0.46735751295336786, "train_speed(iter/s)": 1.440152 }, { "epoch": 1.842680262199563, "grad_norm": 4.667478084564209, "learning_rate": 7.006822727595284e-05, "loss": 2.679239273071289, "memory(GiB)": 77.56, "step": 43010, "token_acc": 0.44, "train_speed(iter/s)": 1.440158 }, { "epoch": 1.8428944775288119, "grad_norm": 4.515958309173584, "learning_rate": 7.00620631712838e-05, "loss": 2.890361785888672, "memory(GiB)": 77.56, "step": 43015, "token_acc": 0.38613861386138615, "train_speed(iter/s)": 1.44016 }, { "epoch": 1.843108692858061, "grad_norm": 5.206225872039795, "learning_rate": 7.00558987031717e-05, "loss": 2.6854297637939455, "memory(GiB)": 77.56, "step": 43020, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.440175 }, { "epoch": 1.84332290818731, "grad_norm": 4.903097152709961, "learning_rate": 7.004973387172818e-05, "loss": 2.7004928588867188, "memory(GiB)": 77.56, "step": 43025, "token_acc": 0.44405594405594406, "train_speed(iter/s)": 1.440198 }, { "epoch": 1.8435371235165587, "grad_norm": 4.881291389465332, "learning_rate": 7.00435686770649e-05, "loss": 2.544137954711914, "memory(GiB)": 77.56, "step": 43030, "token_acc": 0.44931506849315067, "train_speed(iter/s)": 1.440199 }, { "epoch": 1.8437513388458078, "grad_norm": 5.065155982971191, "learning_rate": 7.003740311929358e-05, "loss": 2.2918020248413087, "memory(GiB)": 77.56, "step": 43035, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.440155 }, { "epoch": 1.8439655541750568, "grad_norm": 3.8211097717285156, "learning_rate": 7.003123719852592e-05, "loss": 2.2812437057495116, "memory(GiB)": 77.56, "step": 43040, "token_acc": 0.5, "train_speed(iter/s)": 1.440169 }, { "epoch": 1.8441797695043056, "grad_norm": 5.353897571563721, "learning_rate": 7.00250709148736e-05, "loss": 2.7209804534912108, "memory(GiB)": 77.56, "step": 43045, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.440079 }, { "epoch": 1.8443939848335547, "grad_norm": 4.515683174133301, "learning_rate": 7.001890426844833e-05, "loss": 2.576288604736328, "memory(GiB)": 77.56, "step": 43050, "token_acc": 0.4391891891891892, "train_speed(iter/s)": 1.440098 }, { "epoch": 1.8446082001628037, "grad_norm": 4.109330654144287, "learning_rate": 7.001273725936184e-05, "loss": 2.634905242919922, "memory(GiB)": 77.56, "step": 43055, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.440117 }, { "epoch": 1.8448224154920525, "grad_norm": 4.0266265869140625, "learning_rate": 7.000656988772583e-05, "loss": 2.897775077819824, "memory(GiB)": 77.56, "step": 43060, "token_acc": 0.43641618497109824, "train_speed(iter/s)": 1.440127 }, { "epoch": 1.8450366308213015, "grad_norm": 4.472598552703857, "learning_rate": 7.000040215365205e-05, "loss": 2.56237735748291, "memory(GiB)": 77.56, "step": 43065, "token_acc": 0.49034749034749037, "train_speed(iter/s)": 1.440151 }, { "epoch": 1.8452508461505506, "grad_norm": 4.262383460998535, "learning_rate": 6.999423405725221e-05, "loss": 2.7779729843139647, "memory(GiB)": 77.56, "step": 43070, "token_acc": 0.40425531914893614, "train_speed(iter/s)": 1.44018 }, { "epoch": 1.8454650614797994, "grad_norm": 6.024571418762207, "learning_rate": 6.998806559863806e-05, "loss": 2.6618925094604493, "memory(GiB)": 77.56, "step": 43075, "token_acc": 0.44565217391304346, "train_speed(iter/s)": 1.440183 }, { "epoch": 1.8456792768090484, "grad_norm": 5.0662455558776855, "learning_rate": 6.998189677792138e-05, "loss": 2.6739423751831053, "memory(GiB)": 77.56, "step": 43080, "token_acc": 0.4888268156424581, "train_speed(iter/s)": 1.440187 }, { "epoch": 1.8458934921382975, "grad_norm": 6.7062530517578125, "learning_rate": 6.997572759521386e-05, "loss": 3.038888931274414, "memory(GiB)": 77.56, "step": 43085, "token_acc": 0.3944020356234097, "train_speed(iter/s)": 1.440234 }, { "epoch": 1.8461077074675463, "grad_norm": 6.608982086181641, "learning_rate": 6.996955805062732e-05, "loss": 2.4786960601806642, "memory(GiB)": 77.56, "step": 43090, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 1.440245 }, { "epoch": 1.8463219227967953, "grad_norm": 3.9080896377563477, "learning_rate": 6.99633881442735e-05, "loss": 2.4747425079345704, "memory(GiB)": 77.56, "step": 43095, "token_acc": 0.467966573816156, "train_speed(iter/s)": 1.440181 }, { "epoch": 1.8465361381260443, "grad_norm": 4.1277031898498535, "learning_rate": 6.995721787626417e-05, "loss": 2.425519561767578, "memory(GiB)": 77.56, "step": 43100, "token_acc": 0.5506072874493927, "train_speed(iter/s)": 1.440183 }, { "epoch": 1.8467503534552931, "grad_norm": 4.736281871795654, "learning_rate": 6.995104724671112e-05, "loss": 2.764559745788574, "memory(GiB)": 77.56, "step": 43105, "token_acc": 0.4297994269340974, "train_speed(iter/s)": 1.440191 }, { "epoch": 1.8469645687845422, "grad_norm": 4.871133327484131, "learning_rate": 6.994487625572613e-05, "loss": 2.4187595367431642, "memory(GiB)": 77.56, "step": 43110, "token_acc": 0.46956521739130436, "train_speed(iter/s)": 1.440197 }, { "epoch": 1.8471787841137912, "grad_norm": 4.730271816253662, "learning_rate": 6.993870490342099e-05, "loss": 2.6471309661865234, "memory(GiB)": 77.56, "step": 43115, "token_acc": 0.446875, "train_speed(iter/s)": 1.440177 }, { "epoch": 1.84739299944304, "grad_norm": 4.323908805847168, "learning_rate": 6.993253318990753e-05, "loss": 2.4310916900634765, "memory(GiB)": 77.56, "step": 43120, "token_acc": 0.46835443037974683, "train_speed(iter/s)": 1.440178 }, { "epoch": 1.847607214772289, "grad_norm": 5.497196674346924, "learning_rate": 6.992636111529752e-05, "loss": 2.4413120269775392, "memory(GiB)": 77.56, "step": 43125, "token_acc": 0.476, "train_speed(iter/s)": 1.440191 }, { "epoch": 1.847821430101538, "grad_norm": 5.502140045166016, "learning_rate": 6.992018867970278e-05, "loss": 2.975293731689453, "memory(GiB)": 77.56, "step": 43130, "token_acc": 0.4146341463414634, "train_speed(iter/s)": 1.440179 }, { "epoch": 1.848035645430787, "grad_norm": 4.811458587646484, "learning_rate": 6.991401588323514e-05, "loss": 2.208480453491211, "memory(GiB)": 77.56, "step": 43135, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.440182 }, { "epoch": 1.848249860760036, "grad_norm": 4.557877540588379, "learning_rate": 6.990784272600643e-05, "loss": 2.1829463958740236, "memory(GiB)": 77.56, "step": 43140, "token_acc": 0.5149253731343284, "train_speed(iter/s)": 1.440151 }, { "epoch": 1.848464076089285, "grad_norm": 4.922790050506592, "learning_rate": 6.990166920812847e-05, "loss": 2.4981847763061524, "memory(GiB)": 77.56, "step": 43145, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.440178 }, { "epoch": 1.8486782914185338, "grad_norm": 4.187565803527832, "learning_rate": 6.989549532971309e-05, "loss": 3.0007017135620115, "memory(GiB)": 77.56, "step": 43150, "token_acc": 0.43, "train_speed(iter/s)": 1.440214 }, { "epoch": 1.8488925067477828, "grad_norm": 7.6599555015563965, "learning_rate": 6.988932109087216e-05, "loss": 2.5249298095703123, "memory(GiB)": 77.56, "step": 43155, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.440219 }, { "epoch": 1.8491067220770319, "grad_norm": 8.369895935058594, "learning_rate": 6.988314649171751e-05, "loss": 2.2094568252563476, "memory(GiB)": 77.56, "step": 43160, "token_acc": 0.491869918699187, "train_speed(iter/s)": 1.440265 }, { "epoch": 1.8493209374062807, "grad_norm": 3.8235225677490234, "learning_rate": 6.987697153236102e-05, "loss": 2.48919677734375, "memory(GiB)": 77.56, "step": 43165, "token_acc": 0.5, "train_speed(iter/s)": 1.440288 }, { "epoch": 1.8495351527355297, "grad_norm": 3.86279559135437, "learning_rate": 6.987079621291455e-05, "loss": 2.2730331420898438, "memory(GiB)": 77.56, "step": 43170, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.44032 }, { "epoch": 1.8497493680647787, "grad_norm": 5.171674728393555, "learning_rate": 6.986462053348996e-05, "loss": 2.55029296875, "memory(GiB)": 77.56, "step": 43175, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.440328 }, { "epoch": 1.8499635833940276, "grad_norm": 4.2251811027526855, "learning_rate": 6.985844449419913e-05, "loss": 2.502684211730957, "memory(GiB)": 77.56, "step": 43180, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.440346 }, { "epoch": 1.8501777987232766, "grad_norm": 4.920688152313232, "learning_rate": 6.985226809515395e-05, "loss": 2.8009572982788087, "memory(GiB)": 77.56, "step": 43185, "token_acc": 0.40836012861736337, "train_speed(iter/s)": 1.440366 }, { "epoch": 1.8503920140525256, "grad_norm": 5.436327934265137, "learning_rate": 6.98460913364663e-05, "loss": 2.579128456115723, "memory(GiB)": 77.56, "step": 43190, "token_acc": 0.5, "train_speed(iter/s)": 1.440367 }, { "epoch": 1.8506062293817744, "grad_norm": 5.891781330108643, "learning_rate": 6.983991421824811e-05, "loss": 2.523370933532715, "memory(GiB)": 77.56, "step": 43195, "token_acc": 0.510548523206751, "train_speed(iter/s)": 1.440379 }, { "epoch": 1.8508204447110235, "grad_norm": 4.855978012084961, "learning_rate": 6.983373674061126e-05, "loss": 2.462619972229004, "memory(GiB)": 77.56, "step": 43200, "token_acc": 0.5, "train_speed(iter/s)": 1.440401 }, { "epoch": 1.8510346600402725, "grad_norm": 4.836047649383545, "learning_rate": 6.982755890366766e-05, "loss": 2.6963569641113283, "memory(GiB)": 77.56, "step": 43205, "token_acc": 0.4552238805970149, "train_speed(iter/s)": 1.440406 }, { "epoch": 1.8512488753695213, "grad_norm": 4.0195231437683105, "learning_rate": 6.982138070752923e-05, "loss": 2.649702453613281, "memory(GiB)": 77.56, "step": 43210, "token_acc": 0.4905149051490515, "train_speed(iter/s)": 1.440429 }, { "epoch": 1.8514630906987704, "grad_norm": 5.107478141784668, "learning_rate": 6.981520215230788e-05, "loss": 2.583074188232422, "memory(GiB)": 77.56, "step": 43215, "token_acc": 0.4962121212121212, "train_speed(iter/s)": 1.44047 }, { "epoch": 1.8516773060280194, "grad_norm": 4.943504333496094, "learning_rate": 6.980902323811557e-05, "loss": 2.4854690551757814, "memory(GiB)": 77.56, "step": 43220, "token_acc": 0.47470817120622566, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.8518915213572682, "grad_norm": 3.9420807361602783, "learning_rate": 6.980284396506421e-05, "loss": 2.6313228607177734, "memory(GiB)": 77.56, "step": 43225, "token_acc": 0.48, "train_speed(iter/s)": 1.440502 }, { "epoch": 1.8521057366865172, "grad_norm": 6.036388874053955, "learning_rate": 6.979666433326577e-05, "loss": 2.7429330825805662, "memory(GiB)": 77.56, "step": 43230, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.440509 }, { "epoch": 1.8523199520157663, "grad_norm": 6.380568504333496, "learning_rate": 6.979048434283218e-05, "loss": 2.595009994506836, "memory(GiB)": 77.56, "step": 43235, "token_acc": 0.4208754208754209, "train_speed(iter/s)": 1.440563 }, { "epoch": 1.852534167345015, "grad_norm": 4.630065441131592, "learning_rate": 6.978430399387541e-05, "loss": 2.4058111190795897, "memory(GiB)": 77.56, "step": 43240, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.44053 }, { "epoch": 1.8527483826742641, "grad_norm": 4.022320747375488, "learning_rate": 6.97781232865074e-05, "loss": 2.441282272338867, "memory(GiB)": 77.56, "step": 43245, "token_acc": 0.49166666666666664, "train_speed(iter/s)": 1.440477 }, { "epoch": 1.8529625980035132, "grad_norm": 4.732589244842529, "learning_rate": 6.977194222084013e-05, "loss": 2.520916748046875, "memory(GiB)": 77.56, "step": 43250, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.440478 }, { "epoch": 1.853176813332762, "grad_norm": 5.01125431060791, "learning_rate": 6.97657607969856e-05, "loss": 2.6507028579711913, "memory(GiB)": 77.56, "step": 43255, "token_acc": 0.4459016393442623, "train_speed(iter/s)": 1.440489 }, { "epoch": 1.8533910286620112, "grad_norm": 4.769619464874268, "learning_rate": 6.975957901505574e-05, "loss": 2.265201377868652, "memory(GiB)": 77.56, "step": 43260, "token_acc": 0.5409836065573771, "train_speed(iter/s)": 1.44052 }, { "epoch": 1.85360524399126, "grad_norm": 4.287899971008301, "learning_rate": 6.97533968751626e-05, "loss": 2.485826110839844, "memory(GiB)": 77.56, "step": 43265, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.440519 }, { "epoch": 1.8538194593205088, "grad_norm": 5.929051876068115, "learning_rate": 6.974721437741813e-05, "loss": 2.472885322570801, "memory(GiB)": 77.56, "step": 43270, "token_acc": 0.5176470588235295, "train_speed(iter/s)": 1.440488 }, { "epoch": 1.854033674649758, "grad_norm": 4.998896598815918, "learning_rate": 6.974103152193434e-05, "loss": 2.297836685180664, "memory(GiB)": 77.56, "step": 43275, "token_acc": 0.5390625, "train_speed(iter/s)": 1.440517 }, { "epoch": 1.854247889979007, "grad_norm": 5.720040798187256, "learning_rate": 6.973484830882326e-05, "loss": 2.609451675415039, "memory(GiB)": 77.56, "step": 43280, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.440492 }, { "epoch": 1.8544621053082557, "grad_norm": 4.926914691925049, "learning_rate": 6.97286647381969e-05, "loss": 2.4503416061401366, "memory(GiB)": 77.56, "step": 43285, "token_acc": 0.472, "train_speed(iter/s)": 1.440517 }, { "epoch": 1.854676320637505, "grad_norm": 6.048356533050537, "learning_rate": 6.972248081016724e-05, "loss": 2.396846580505371, "memory(GiB)": 77.56, "step": 43290, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.440521 }, { "epoch": 1.8548905359667538, "grad_norm": 4.139334201812744, "learning_rate": 6.971629652484635e-05, "loss": 2.3883827209472654, "memory(GiB)": 77.56, "step": 43295, "token_acc": 0.5, "train_speed(iter/s)": 1.440521 }, { "epoch": 1.8551047512960026, "grad_norm": 4.844492435455322, "learning_rate": 6.971011188234628e-05, "loss": 2.719435119628906, "memory(GiB)": 77.56, "step": 43300, "token_acc": 0.4676470588235294, "train_speed(iter/s)": 1.440539 }, { "epoch": 1.8553189666252519, "grad_norm": 5.669625282287598, "learning_rate": 6.9703926882779e-05, "loss": 2.4841148376464846, "memory(GiB)": 77.56, "step": 43305, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 1.440533 }, { "epoch": 1.8555331819545007, "grad_norm": 6.216773986816406, "learning_rate": 6.969774152625664e-05, "loss": 2.0978271484375, "memory(GiB)": 77.56, "step": 43310, "token_acc": 0.5103734439834025, "train_speed(iter/s)": 1.440557 }, { "epoch": 1.8557473972837495, "grad_norm": 5.27077579498291, "learning_rate": 6.969155581289119e-05, "loss": 2.484029006958008, "memory(GiB)": 77.56, "step": 43315, "token_acc": 0.49794238683127573, "train_speed(iter/s)": 1.440545 }, { "epoch": 1.8559616126129987, "grad_norm": 5.679560661315918, "learning_rate": 6.968536974279475e-05, "loss": 2.672632026672363, "memory(GiB)": 77.56, "step": 43320, "token_acc": 0.4177215189873418, "train_speed(iter/s)": 1.440582 }, { "epoch": 1.8561758279422476, "grad_norm": 3.9813849925994873, "learning_rate": 6.967918331607937e-05, "loss": 2.484807014465332, "memory(GiB)": 77.56, "step": 43325, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.440592 }, { "epoch": 1.8563900432714964, "grad_norm": 5.795016288757324, "learning_rate": 6.967299653285711e-05, "loss": 2.193868637084961, "memory(GiB)": 77.56, "step": 43330, "token_acc": 0.453125, "train_speed(iter/s)": 1.440593 }, { "epoch": 1.8566042586007456, "grad_norm": 4.828737258911133, "learning_rate": 6.966680939324006e-05, "loss": 2.525767707824707, "memory(GiB)": 77.56, "step": 43335, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.440615 }, { "epoch": 1.8568184739299944, "grad_norm": 5.104550838470459, "learning_rate": 6.966062189734033e-05, "loss": 2.5883762359619142, "memory(GiB)": 77.56, "step": 43340, "token_acc": 0.42671009771986973, "train_speed(iter/s)": 1.44063 }, { "epoch": 1.8570326892592433, "grad_norm": 4.25006628036499, "learning_rate": 6.965443404526998e-05, "loss": 2.370433044433594, "memory(GiB)": 77.56, "step": 43345, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.440611 }, { "epoch": 1.8572469045884925, "grad_norm": 4.21995735168457, "learning_rate": 6.964824583714111e-05, "loss": 2.446165084838867, "memory(GiB)": 77.56, "step": 43350, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.440623 }, { "epoch": 1.8574611199177413, "grad_norm": 4.266446113586426, "learning_rate": 6.964205727306586e-05, "loss": 2.655420112609863, "memory(GiB)": 77.56, "step": 43355, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.440629 }, { "epoch": 1.8576753352469901, "grad_norm": 4.508521556854248, "learning_rate": 6.963586835315629e-05, "loss": 2.5085685729980467, "memory(GiB)": 77.56, "step": 43360, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.440635 }, { "epoch": 1.8578895505762394, "grad_norm": 4.173177719116211, "learning_rate": 6.962967907752455e-05, "loss": 2.577441596984863, "memory(GiB)": 77.56, "step": 43365, "token_acc": 0.4668769716088328, "train_speed(iter/s)": 1.440649 }, { "epoch": 1.8581037659054882, "grad_norm": 4.742574214935303, "learning_rate": 6.962348944628276e-05, "loss": 2.2691776275634767, "memory(GiB)": 77.56, "step": 43370, "token_acc": 0.48563218390804597, "train_speed(iter/s)": 1.44067 }, { "epoch": 1.858317981234737, "grad_norm": 6.976690292358398, "learning_rate": 6.961729945954307e-05, "loss": 2.68206787109375, "memory(GiB)": 77.56, "step": 43375, "token_acc": 0.4610169491525424, "train_speed(iter/s)": 1.440676 }, { "epoch": 1.8585321965639863, "grad_norm": 5.838136196136475, "learning_rate": 6.961110911741757e-05, "loss": 2.4876346588134766, "memory(GiB)": 77.56, "step": 43380, "token_acc": 0.4570552147239264, "train_speed(iter/s)": 1.440655 }, { "epoch": 1.858746411893235, "grad_norm": 4.771737575531006, "learning_rate": 6.960491842001846e-05, "loss": 2.4716320037841797, "memory(GiB)": 77.56, "step": 43385, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.440704 }, { "epoch": 1.858960627222484, "grad_norm": 6.208278656005859, "learning_rate": 6.959872736745784e-05, "loss": 2.4613088607788085, "memory(GiB)": 77.56, "step": 43390, "token_acc": 0.5, "train_speed(iter/s)": 1.4407 }, { "epoch": 1.8591748425517332, "grad_norm": 4.553323268890381, "learning_rate": 6.95925359598479e-05, "loss": 2.575738525390625, "memory(GiB)": 77.56, "step": 43395, "token_acc": 0.4682080924855491, "train_speed(iter/s)": 1.44075 }, { "epoch": 1.859389057880982, "grad_norm": 4.627508640289307, "learning_rate": 6.95863441973008e-05, "loss": 2.4983247756958007, "memory(GiB)": 77.56, "step": 43400, "token_acc": 0.46062992125984253, "train_speed(iter/s)": 1.440773 }, { "epoch": 1.8596032732102308, "grad_norm": 6.814741611480713, "learning_rate": 6.958015207992867e-05, "loss": 2.373073387145996, "memory(GiB)": 77.56, "step": 43405, "token_acc": 0.4509090909090909, "train_speed(iter/s)": 1.440789 }, { "epoch": 1.85981748853948, "grad_norm": 5.269682884216309, "learning_rate": 6.957395960784374e-05, "loss": 2.832663154602051, "memory(GiB)": 77.56, "step": 43410, "token_acc": 0.43656716417910446, "train_speed(iter/s)": 1.440806 }, { "epoch": 1.8600317038687288, "grad_norm": 5.185050964355469, "learning_rate": 6.956776678115817e-05, "loss": 2.564066696166992, "memory(GiB)": 77.56, "step": 43415, "token_acc": 0.4360655737704918, "train_speed(iter/s)": 1.440859 }, { "epoch": 1.8602459191979777, "grad_norm": 4.240634918212891, "learning_rate": 6.956157359998414e-05, "loss": 2.3420948028564452, "memory(GiB)": 77.56, "step": 43420, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.44086 }, { "epoch": 1.860460134527227, "grad_norm": 6.516208648681641, "learning_rate": 6.955538006443386e-05, "loss": 2.530893898010254, "memory(GiB)": 77.56, "step": 43425, "token_acc": 0.4775510204081633, "train_speed(iter/s)": 1.440844 }, { "epoch": 1.8606743498564757, "grad_norm": 4.886407375335693, "learning_rate": 6.954918617461952e-05, "loss": 2.5489376068115233, "memory(GiB)": 77.56, "step": 43430, "token_acc": 0.49, "train_speed(iter/s)": 1.440846 }, { "epoch": 1.8608885651857245, "grad_norm": 5.054930686950684, "learning_rate": 6.954299193065334e-05, "loss": 2.8205469131469725, "memory(GiB)": 77.56, "step": 43435, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.44083 }, { "epoch": 1.8611027805149738, "grad_norm": 4.837130069732666, "learning_rate": 6.953679733264753e-05, "loss": 2.5725114822387694, "memory(GiB)": 77.56, "step": 43440, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.440827 }, { "epoch": 1.8613169958442226, "grad_norm": 5.3324503898620605, "learning_rate": 6.95306023807143e-05, "loss": 2.5039031982421873, "memory(GiB)": 77.56, "step": 43445, "token_acc": 0.4897119341563786, "train_speed(iter/s)": 1.440839 }, { "epoch": 1.8615312111734714, "grad_norm": 4.094564437866211, "learning_rate": 6.952440707496589e-05, "loss": 2.612009620666504, "memory(GiB)": 77.56, "step": 43450, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.440869 }, { "epoch": 1.8617454265027207, "grad_norm": 4.587902069091797, "learning_rate": 6.951821141551455e-05, "loss": 2.530088424682617, "memory(GiB)": 77.56, "step": 43455, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.44089 }, { "epoch": 1.8619596418319695, "grad_norm": 5.007899284362793, "learning_rate": 6.951201540247249e-05, "loss": 2.371768760681152, "memory(GiB)": 77.56, "step": 43460, "token_acc": 0.5064102564102564, "train_speed(iter/s)": 1.440896 }, { "epoch": 1.8621738571612183, "grad_norm": 4.972559452056885, "learning_rate": 6.950581903595194e-05, "loss": 2.7083065032958986, "memory(GiB)": 77.56, "step": 43465, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.440931 }, { "epoch": 1.8623880724904676, "grad_norm": 4.13785457611084, "learning_rate": 6.949962231606522e-05, "loss": 2.3697992324829102, "memory(GiB)": 77.56, "step": 43470, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.440895 }, { "epoch": 1.8626022878197164, "grad_norm": 4.663508892059326, "learning_rate": 6.949342524292453e-05, "loss": 2.1297985076904298, "memory(GiB)": 77.56, "step": 43475, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.4409 }, { "epoch": 1.8628165031489652, "grad_norm": 4.668050765991211, "learning_rate": 6.948722781664215e-05, "loss": 2.5469255447387695, "memory(GiB)": 77.56, "step": 43480, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.440853 }, { "epoch": 1.8630307184782144, "grad_norm": 4.261608123779297, "learning_rate": 6.948103003733036e-05, "loss": 2.676869773864746, "memory(GiB)": 77.56, "step": 43485, "token_acc": 0.414985590778098, "train_speed(iter/s)": 1.440859 }, { "epoch": 1.8632449338074633, "grad_norm": 5.871016025543213, "learning_rate": 6.947483190510144e-05, "loss": 2.5678970336914064, "memory(GiB)": 77.56, "step": 43490, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.440849 }, { "epoch": 1.863459149136712, "grad_norm": 5.957498550415039, "learning_rate": 6.946863342006768e-05, "loss": 2.7108821868896484, "memory(GiB)": 77.56, "step": 43495, "token_acc": 0.471976401179941, "train_speed(iter/s)": 1.440827 }, { "epoch": 1.8636733644659613, "grad_norm": 8.545435905456543, "learning_rate": 6.946243458234135e-05, "loss": 2.5306770324707033, "memory(GiB)": 77.56, "step": 43500, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.44086 }, { "epoch": 1.8636733644659613, "eval_loss": 2.236259698867798, "eval_runtime": 13.7364, "eval_samples_per_second": 7.28, "eval_steps_per_second": 7.28, "eval_token_acc": 0.4917541229385307, "step": 43500 }, { "epoch": 1.8638875797952101, "grad_norm": 3.7839760780334473, "learning_rate": 6.945623539203477e-05, "loss": 2.7957149505615235, "memory(GiB)": 77.56, "step": 43505, "token_acc": 0.477319587628866, "train_speed(iter/s)": 1.440141 }, { "epoch": 1.864101795124459, "grad_norm": 11.7059965133667, "learning_rate": 6.945003584926021e-05, "loss": 2.688629150390625, "memory(GiB)": 77.56, "step": 43510, "token_acc": 0.4261168384879725, "train_speed(iter/s)": 1.440121 }, { "epoch": 1.8643160104537082, "grad_norm": 5.366820812225342, "learning_rate": 6.944383595413003e-05, "loss": 2.7319902420043944, "memory(GiB)": 77.56, "step": 43515, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.440143 }, { "epoch": 1.864530225782957, "grad_norm": 4.514744281768799, "learning_rate": 6.943763570675652e-05, "loss": 2.6621368408203123, "memory(GiB)": 77.56, "step": 43520, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.440183 }, { "epoch": 1.8647444411122058, "grad_norm": 5.274441242218018, "learning_rate": 6.943143510725199e-05, "loss": 2.4516849517822266, "memory(GiB)": 77.56, "step": 43525, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.440136 }, { "epoch": 1.864958656441455, "grad_norm": 5.445152759552002, "learning_rate": 6.942523415572882e-05, "loss": 2.5138612747192384, "memory(GiB)": 77.56, "step": 43530, "token_acc": 0.5032258064516129, "train_speed(iter/s)": 1.440151 }, { "epoch": 1.865172871770704, "grad_norm": 7.207469463348389, "learning_rate": 6.941903285229927e-05, "loss": 2.7002941131591798, "memory(GiB)": 77.56, "step": 43535, "token_acc": 0.44776119402985076, "train_speed(iter/s)": 1.440097 }, { "epoch": 1.8653870870999527, "grad_norm": 3.9434337615966797, "learning_rate": 6.941283119707575e-05, "loss": 2.586966705322266, "memory(GiB)": 77.56, "step": 43540, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.440082 }, { "epoch": 1.865601302429202, "grad_norm": 5.480777263641357, "learning_rate": 6.940662919017057e-05, "loss": 2.4758493423461916, "memory(GiB)": 77.56, "step": 43545, "token_acc": 0.5053763440860215, "train_speed(iter/s)": 1.440086 }, { "epoch": 1.8658155177584508, "grad_norm": 4.904507637023926, "learning_rate": 6.940042683169609e-05, "loss": 2.491636848449707, "memory(GiB)": 77.56, "step": 43550, "token_acc": 0.5080385852090032, "train_speed(iter/s)": 1.440107 }, { "epoch": 1.8660297330876996, "grad_norm": 5.072583198547363, "learning_rate": 6.939422412176468e-05, "loss": 2.304940032958984, "memory(GiB)": 77.56, "step": 43555, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.440113 }, { "epoch": 1.8662439484169489, "grad_norm": 7.3964643478393555, "learning_rate": 6.938802106048873e-05, "loss": 2.4718551635742188, "memory(GiB)": 77.56, "step": 43560, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.440127 }, { "epoch": 1.8664581637461977, "grad_norm": 4.174238681793213, "learning_rate": 6.938181764798058e-05, "loss": 2.856204605102539, "memory(GiB)": 77.56, "step": 43565, "token_acc": 0.49221183800623053, "train_speed(iter/s)": 1.440179 }, { "epoch": 1.8666723790754465, "grad_norm": 5.73566198348999, "learning_rate": 6.93756138843526e-05, "loss": 2.6958240509033202, "memory(GiB)": 77.56, "step": 43570, "token_acc": 0.4117647058823529, "train_speed(iter/s)": 1.440113 }, { "epoch": 1.8668865944046957, "grad_norm": 5.127898216247559, "learning_rate": 6.936940976971724e-05, "loss": 2.5017662048339844, "memory(GiB)": 77.56, "step": 43575, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.440146 }, { "epoch": 1.8671008097339445, "grad_norm": 4.297633171081543, "learning_rate": 6.936320530418681e-05, "loss": 2.6987648010253906, "memory(GiB)": 77.56, "step": 43580, "token_acc": 0.46439628482972134, "train_speed(iter/s)": 1.440151 }, { "epoch": 1.8673150250631934, "grad_norm": 5.929309844970703, "learning_rate": 6.935700048787379e-05, "loss": 2.6566654205322267, "memory(GiB)": 77.56, "step": 43585, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.44016 }, { "epoch": 1.8675292403924426, "grad_norm": 5.273189067840576, "learning_rate": 6.935079532089052e-05, "loss": 2.542314147949219, "memory(GiB)": 77.56, "step": 43590, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.440165 }, { "epoch": 1.8677434557216914, "grad_norm": 6.269961833953857, "learning_rate": 6.934458980334944e-05, "loss": 2.670563507080078, "memory(GiB)": 77.56, "step": 43595, "token_acc": 0.44106463878326996, "train_speed(iter/s)": 1.440175 }, { "epoch": 1.8679576710509405, "grad_norm": 4.557129859924316, "learning_rate": 6.933838393536299e-05, "loss": 2.5225624084472655, "memory(GiB)": 77.56, "step": 43600, "token_acc": 0.48011363636363635, "train_speed(iter/s)": 1.440165 }, { "epoch": 1.8681718863801895, "grad_norm": 4.752987861633301, "learning_rate": 6.933217771704356e-05, "loss": 2.380077362060547, "memory(GiB)": 77.56, "step": 43605, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.44017 }, { "epoch": 1.8683861017094383, "grad_norm": 4.288316249847412, "learning_rate": 6.932597114850359e-05, "loss": 2.9540658950805665, "memory(GiB)": 77.56, "step": 43610, "token_acc": 0.41317365269461076, "train_speed(iter/s)": 1.440146 }, { "epoch": 1.8686003170386873, "grad_norm": 5.97092342376709, "learning_rate": 6.931976422985554e-05, "loss": 2.494019317626953, "memory(GiB)": 77.56, "step": 43615, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.440171 }, { "epoch": 1.8688145323679364, "grad_norm": 5.262835502624512, "learning_rate": 6.931355696121183e-05, "loss": 2.2457048416137697, "memory(GiB)": 77.56, "step": 43620, "token_acc": 0.5259259259259259, "train_speed(iter/s)": 1.440156 }, { "epoch": 1.8690287476971852, "grad_norm": 5.038191318511963, "learning_rate": 6.930734934268491e-05, "loss": 2.470929718017578, "memory(GiB)": 77.56, "step": 43625, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.440107 }, { "epoch": 1.8692429630264342, "grad_norm": 6.955289363861084, "learning_rate": 6.930114137438725e-05, "loss": 2.6117916107177734, "memory(GiB)": 77.56, "step": 43630, "token_acc": 0.43686006825938567, "train_speed(iter/s)": 1.440112 }, { "epoch": 1.8694571783556833, "grad_norm": 5.306889057159424, "learning_rate": 6.92949330564313e-05, "loss": 2.4621322631835936, "memory(GiB)": 77.56, "step": 43635, "token_acc": 0.47265625, "train_speed(iter/s)": 1.440119 }, { "epoch": 1.869671393684932, "grad_norm": 5.303523063659668, "learning_rate": 6.928872438892956e-05, "loss": 2.3270092010498047, "memory(GiB)": 77.56, "step": 43640, "token_acc": 0.4983277591973244, "train_speed(iter/s)": 1.440118 }, { "epoch": 1.869885609014181, "grad_norm": 3.8588778972625732, "learning_rate": 6.928251537199446e-05, "loss": 2.6262319564819334, "memory(GiB)": 77.56, "step": 43645, "token_acc": 0.4386422976501306, "train_speed(iter/s)": 1.440142 }, { "epoch": 1.8700998243434301, "grad_norm": 4.734883785247803, "learning_rate": 6.927630600573851e-05, "loss": 2.570682716369629, "memory(GiB)": 77.56, "step": 43650, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.440157 }, { "epoch": 1.870314039672679, "grad_norm": 4.64441442489624, "learning_rate": 6.927009629027421e-05, "loss": 2.217654800415039, "memory(GiB)": 77.56, "step": 43655, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.440162 }, { "epoch": 1.870528255001928, "grad_norm": 6.077085494995117, "learning_rate": 6.926388622571403e-05, "loss": 2.426427459716797, "memory(GiB)": 77.56, "step": 43660, "token_acc": 0.4962121212121212, "train_speed(iter/s)": 1.440191 }, { "epoch": 1.870742470331177, "grad_norm": 4.232110977172852, "learning_rate": 6.925767581217046e-05, "loss": 2.2968572616577148, "memory(GiB)": 77.56, "step": 43665, "token_acc": 0.516728624535316, "train_speed(iter/s)": 1.440222 }, { "epoch": 1.8709566856604258, "grad_norm": 3.608761787414551, "learning_rate": 6.925146504975606e-05, "loss": 2.242470932006836, "memory(GiB)": 77.56, "step": 43670, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 1.440212 }, { "epoch": 1.8711709009896749, "grad_norm": 5.739598751068115, "learning_rate": 6.92452539385833e-05, "loss": 2.247125434875488, "memory(GiB)": 77.56, "step": 43675, "token_acc": 0.5053763440860215, "train_speed(iter/s)": 1.440188 }, { "epoch": 1.871385116318924, "grad_norm": 7.395709037780762, "learning_rate": 6.92390424787647e-05, "loss": 2.343984603881836, "memory(GiB)": 77.56, "step": 43680, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.440166 }, { "epoch": 1.8715993316481727, "grad_norm": 4.652615547180176, "learning_rate": 6.923283067041282e-05, "loss": 2.400879669189453, "memory(GiB)": 77.56, "step": 43685, "token_acc": 0.5021097046413502, "train_speed(iter/s)": 1.440169 }, { "epoch": 1.8718135469774217, "grad_norm": 7.7594146728515625, "learning_rate": 6.922661851364016e-05, "loss": 2.393063545227051, "memory(GiB)": 77.56, "step": 43690, "token_acc": 0.4197080291970803, "train_speed(iter/s)": 1.440181 }, { "epoch": 1.8720277623066708, "grad_norm": 5.022670269012451, "learning_rate": 6.922040600855926e-05, "loss": 2.5054611206054687, "memory(GiB)": 77.56, "step": 43695, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.440162 }, { "epoch": 1.8722419776359196, "grad_norm": 5.376389503479004, "learning_rate": 6.921419315528268e-05, "loss": 2.703806686401367, "memory(GiB)": 77.56, "step": 43700, "token_acc": 0.44569288389513106, "train_speed(iter/s)": 1.440207 }, { "epoch": 1.8724561929651686, "grad_norm": 6.1688761711120605, "learning_rate": 6.920797995392297e-05, "loss": 2.550992965698242, "memory(GiB)": 77.56, "step": 43705, "token_acc": 0.41638225255972694, "train_speed(iter/s)": 1.440213 }, { "epoch": 1.8726704082944177, "grad_norm": 4.517484188079834, "learning_rate": 6.920176640459268e-05, "loss": 2.6653520584106447, "memory(GiB)": 77.56, "step": 43710, "token_acc": 0.4613259668508287, "train_speed(iter/s)": 1.44022 }, { "epoch": 1.8728846236236665, "grad_norm": 4.672995567321777, "learning_rate": 6.919555250740439e-05, "loss": 2.6150575637817384, "memory(GiB)": 77.56, "step": 43715, "token_acc": 0.4440993788819876, "train_speed(iter/s)": 1.440217 }, { "epoch": 1.8730988389529155, "grad_norm": 8.136906623840332, "learning_rate": 6.918933826247065e-05, "loss": 2.590870666503906, "memory(GiB)": 77.56, "step": 43720, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.440225 }, { "epoch": 1.8733130542821645, "grad_norm": 6.360536575317383, "learning_rate": 6.918312366990405e-05, "loss": 2.6951112747192383, "memory(GiB)": 77.56, "step": 43725, "token_acc": 0.43234323432343236, "train_speed(iter/s)": 1.440246 }, { "epoch": 1.8735272696114134, "grad_norm": 4.464033126831055, "learning_rate": 6.917690872981717e-05, "loss": 2.6590789794921874, "memory(GiB)": 77.56, "step": 43730, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.44024 }, { "epoch": 1.8737414849406624, "grad_norm": 4.957088947296143, "learning_rate": 6.917069344232258e-05, "loss": 2.2770597457885744, "memory(GiB)": 77.56, "step": 43735, "token_acc": 0.5210727969348659, "train_speed(iter/s)": 1.440277 }, { "epoch": 1.8739557002699114, "grad_norm": 6.839737892150879, "learning_rate": 6.916447780753291e-05, "loss": 2.5905824661254884, "memory(GiB)": 77.56, "step": 43740, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.440221 }, { "epoch": 1.8741699155991602, "grad_norm": 4.120622158050537, "learning_rate": 6.915826182556075e-05, "loss": 2.385251998901367, "memory(GiB)": 77.56, "step": 43745, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.440217 }, { "epoch": 1.8743841309284093, "grad_norm": 8.104998588562012, "learning_rate": 6.915204549651871e-05, "loss": 2.60362548828125, "memory(GiB)": 77.56, "step": 43750, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.440196 }, { "epoch": 1.8745983462576583, "grad_norm": 4.279297828674316, "learning_rate": 6.914582882051938e-05, "loss": 2.6403614044189454, "memory(GiB)": 77.56, "step": 43755, "token_acc": 0.44299674267100975, "train_speed(iter/s)": 1.440199 }, { "epoch": 1.8748125615869071, "grad_norm": 6.763187885284424, "learning_rate": 6.913961179767543e-05, "loss": 2.311290740966797, "memory(GiB)": 77.56, "step": 43760, "token_acc": 0.53515625, "train_speed(iter/s)": 1.44022 }, { "epoch": 1.8750267769161562, "grad_norm": 6.343144416809082, "learning_rate": 6.913339442809942e-05, "loss": 2.5140384674072265, "memory(GiB)": 77.56, "step": 43765, "token_acc": 0.4980237154150198, "train_speed(iter/s)": 1.440263 }, { "epoch": 1.8752409922454052, "grad_norm": 4.878091335296631, "learning_rate": 6.912717671190407e-05, "loss": 2.728389358520508, "memory(GiB)": 77.56, "step": 43770, "token_acc": 0.4970414201183432, "train_speed(iter/s)": 1.440311 }, { "epoch": 1.875455207574654, "grad_norm": 5.390368461608887, "learning_rate": 6.912095864920193e-05, "loss": 2.3072967529296875, "memory(GiB)": 77.56, "step": 43775, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.440308 }, { "epoch": 1.875669422903903, "grad_norm": 4.705453395843506, "learning_rate": 6.911474024010569e-05, "loss": 2.5538101196289062, "memory(GiB)": 77.56, "step": 43780, "token_acc": 0.4815950920245399, "train_speed(iter/s)": 1.440295 }, { "epoch": 1.875883638233152, "grad_norm": 5.005934715270996, "learning_rate": 6.910852148472802e-05, "loss": 2.5337921142578126, "memory(GiB)": 77.56, "step": 43785, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.440297 }, { "epoch": 1.8760978535624009, "grad_norm": 4.590370178222656, "learning_rate": 6.910230238318157e-05, "loss": 2.1834234237670898, "memory(GiB)": 77.56, "step": 43790, "token_acc": 0.5409252669039146, "train_speed(iter/s)": 1.440322 }, { "epoch": 1.87631206889165, "grad_norm": 4.759270191192627, "learning_rate": 6.909608293557895e-05, "loss": 2.4583187103271484, "memory(GiB)": 77.56, "step": 43795, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.440292 }, { "epoch": 1.876526284220899, "grad_norm": 4.1492085456848145, "learning_rate": 6.908986314203289e-05, "loss": 2.724493980407715, "memory(GiB)": 77.56, "step": 43800, "token_acc": 0.42507645259938837, "train_speed(iter/s)": 1.440286 }, { "epoch": 1.8767404995501478, "grad_norm": 4.204627513885498, "learning_rate": 6.908364300265607e-05, "loss": 2.418387603759766, "memory(GiB)": 77.56, "step": 43805, "token_acc": 0.46099290780141844, "train_speed(iter/s)": 1.440293 }, { "epoch": 1.8769547148793968, "grad_norm": 5.65638542175293, "learning_rate": 6.907742251756114e-05, "loss": 2.375998306274414, "memory(GiB)": 77.56, "step": 43810, "token_acc": 0.5444839857651246, "train_speed(iter/s)": 1.44027 }, { "epoch": 1.8771689302086458, "grad_norm": 3.6777758598327637, "learning_rate": 6.907120168686079e-05, "loss": 2.1566015243530274, "memory(GiB)": 77.56, "step": 43815, "token_acc": 0.5214007782101168, "train_speed(iter/s)": 1.440262 }, { "epoch": 1.8773831455378946, "grad_norm": 5.5765790939331055, "learning_rate": 6.906498051066776e-05, "loss": 2.375243377685547, "memory(GiB)": 77.56, "step": 43820, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.440283 }, { "epoch": 1.8775973608671437, "grad_norm": 4.720075607299805, "learning_rate": 6.90587589890947e-05, "loss": 2.635455322265625, "memory(GiB)": 77.56, "step": 43825, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.440304 }, { "epoch": 1.8778115761963927, "grad_norm": 5.195075035095215, "learning_rate": 6.905253712225436e-05, "loss": 2.779643249511719, "memory(GiB)": 77.56, "step": 43830, "token_acc": 0.4351145038167939, "train_speed(iter/s)": 1.440226 }, { "epoch": 1.8780257915256415, "grad_norm": 3.975156545639038, "learning_rate": 6.904631491025945e-05, "loss": 2.816888427734375, "memory(GiB)": 77.56, "step": 43835, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.440253 }, { "epoch": 1.8782400068548906, "grad_norm": 4.649935245513916, "learning_rate": 6.904009235322265e-05, "loss": 2.6185672760009764, "memory(GiB)": 77.56, "step": 43840, "token_acc": 0.43670886075949367, "train_speed(iter/s)": 1.440252 }, { "epoch": 1.8784542221841396, "grad_norm": 6.371175289154053, "learning_rate": 6.903386945125673e-05, "loss": 2.7402807235717774, "memory(GiB)": 77.56, "step": 43845, "token_acc": 0.45121951219512196, "train_speed(iter/s)": 1.440244 }, { "epoch": 1.8786684375133884, "grad_norm": 5.791367530822754, "learning_rate": 6.90276462044744e-05, "loss": 2.267729949951172, "memory(GiB)": 77.56, "step": 43850, "token_acc": 0.5147540983606558, "train_speed(iter/s)": 1.440224 }, { "epoch": 1.8788826528426374, "grad_norm": 6.075738906860352, "learning_rate": 6.90214226129884e-05, "loss": 2.1494869232177733, "memory(GiB)": 77.56, "step": 43855, "token_acc": 0.5021645021645021, "train_speed(iter/s)": 1.440254 }, { "epoch": 1.8790968681718865, "grad_norm": 5.984035968780518, "learning_rate": 6.901519867691151e-05, "loss": 2.7059022903442385, "memory(GiB)": 77.56, "step": 43860, "token_acc": 0.44981412639405205, "train_speed(iter/s)": 1.440264 }, { "epoch": 1.8793110835011353, "grad_norm": 4.501453399658203, "learning_rate": 6.900897439635646e-05, "loss": 2.630645751953125, "memory(GiB)": 77.56, "step": 43865, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.440267 }, { "epoch": 1.8795252988303843, "grad_norm": 5.558224678039551, "learning_rate": 6.900274977143599e-05, "loss": 2.4991966247558595, "memory(GiB)": 77.56, "step": 43870, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.440291 }, { "epoch": 1.8797395141596334, "grad_norm": 4.4733757972717285, "learning_rate": 6.89965248022629e-05, "loss": 2.2572797775268554, "memory(GiB)": 77.56, "step": 43875, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.44025 }, { "epoch": 1.8799537294888822, "grad_norm": 4.89325475692749, "learning_rate": 6.899029948894993e-05, "loss": 2.7734336853027344, "memory(GiB)": 77.56, "step": 43880, "token_acc": 0.43884892086330934, "train_speed(iter/s)": 1.440247 }, { "epoch": 1.8801679448181312, "grad_norm": 7.597643852233887, "learning_rate": 6.898407383160985e-05, "loss": 2.5331729888916015, "memory(GiB)": 77.56, "step": 43885, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.440274 }, { "epoch": 1.8803821601473802, "grad_norm": 4.127545356750488, "learning_rate": 6.89778478303555e-05, "loss": 2.31573486328125, "memory(GiB)": 77.56, "step": 43890, "token_acc": 0.49201277955271566, "train_speed(iter/s)": 1.440292 }, { "epoch": 1.880596375476629, "grad_norm": 7.149901390075684, "learning_rate": 6.897162148529963e-05, "loss": 2.661090278625488, "memory(GiB)": 77.56, "step": 43895, "token_acc": 0.44370860927152317, "train_speed(iter/s)": 1.440327 }, { "epoch": 1.880810590805878, "grad_norm": 6.092045783996582, "learning_rate": 6.896539479655501e-05, "loss": 2.7160852432250975, "memory(GiB)": 77.56, "step": 43900, "token_acc": 0.426056338028169, "train_speed(iter/s)": 1.44033 }, { "epoch": 1.8810248061351271, "grad_norm": 5.126265048980713, "learning_rate": 6.89591677642345e-05, "loss": 2.7058479309082033, "memory(GiB)": 77.56, "step": 43905, "token_acc": 0.4766355140186916, "train_speed(iter/s)": 1.440329 }, { "epoch": 1.881239021464376, "grad_norm": 5.003473281860352, "learning_rate": 6.895294038845087e-05, "loss": 2.278493118286133, "memory(GiB)": 77.56, "step": 43910, "token_acc": 0.49850746268656715, "train_speed(iter/s)": 1.440315 }, { "epoch": 1.881453236793625, "grad_norm": 4.984987735748291, "learning_rate": 6.894671266931697e-05, "loss": 2.4715904235839843, "memory(GiB)": 77.56, "step": 43915, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.440358 }, { "epoch": 1.881667452122874, "grad_norm": 4.243618488311768, "learning_rate": 6.894048460694557e-05, "loss": 2.678019332885742, "memory(GiB)": 77.56, "step": 43920, "token_acc": 0.42990654205607476, "train_speed(iter/s)": 1.440369 }, { "epoch": 1.8818816674521228, "grad_norm": 4.783170223236084, "learning_rate": 6.893425620144952e-05, "loss": 2.2035579681396484, "memory(GiB)": 77.56, "step": 43925, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.44037 }, { "epoch": 1.8820958827813719, "grad_norm": 4.1528754234313965, "learning_rate": 6.892802745294165e-05, "loss": 2.6755613327026366, "memory(GiB)": 77.56, "step": 43930, "token_acc": 0.4131944444444444, "train_speed(iter/s)": 1.440378 }, { "epoch": 1.8823100981106209, "grad_norm": 9.812792778015137, "learning_rate": 6.892179836153483e-05, "loss": 2.6304904937744142, "memory(GiB)": 77.56, "step": 43935, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.440382 }, { "epoch": 1.8825243134398697, "grad_norm": 5.402609825134277, "learning_rate": 6.891556892734188e-05, "loss": 2.4052761077880858, "memory(GiB)": 77.56, "step": 43940, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.440394 }, { "epoch": 1.8827385287691187, "grad_norm": 5.9622979164123535, "learning_rate": 6.890933915047565e-05, "loss": 2.1981311798095704, "memory(GiB)": 77.56, "step": 43945, "token_acc": 0.5236363636363637, "train_speed(iter/s)": 1.440424 }, { "epoch": 1.8829527440983678, "grad_norm": 5.090926647186279, "learning_rate": 6.8903109031049e-05, "loss": 2.7230417251586916, "memory(GiB)": 77.56, "step": 43950, "token_acc": 0.4307692307692308, "train_speed(iter/s)": 1.440456 }, { "epoch": 1.8831669594276166, "grad_norm": 5.329853534698486, "learning_rate": 6.88968785691748e-05, "loss": 2.1352123260498046, "memory(GiB)": 77.56, "step": 43955, "token_acc": 0.5245901639344263, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.8833811747568656, "grad_norm": 5.130246162414551, "learning_rate": 6.88906477649659e-05, "loss": 2.2296253204345704, "memory(GiB)": 77.56, "step": 43960, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.440496 }, { "epoch": 1.8835953900861147, "grad_norm": 4.083864212036133, "learning_rate": 6.888441661853523e-05, "loss": 2.340218734741211, "memory(GiB)": 77.56, "step": 43965, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.440489 }, { "epoch": 1.8838096054153635, "grad_norm": 5.540615558624268, "learning_rate": 6.887818512999562e-05, "loss": 2.4362293243408204, "memory(GiB)": 77.56, "step": 43970, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.440506 }, { "epoch": 1.8840238207446125, "grad_norm": 4.939748764038086, "learning_rate": 6.887195329945997e-05, "loss": 2.4583242416381834, "memory(GiB)": 77.56, "step": 43975, "token_acc": 0.4778761061946903, "train_speed(iter/s)": 1.440511 }, { "epoch": 1.8842380360738615, "grad_norm": 6.531278133392334, "learning_rate": 6.88657211270412e-05, "loss": 2.2615076065063477, "memory(GiB)": 77.56, "step": 43980, "token_acc": 0.5150375939849624, "train_speed(iter/s)": 1.44051 }, { "epoch": 1.8844522514031103, "grad_norm": 4.485387325286865, "learning_rate": 6.885948861285219e-05, "loss": 2.597138595581055, "memory(GiB)": 77.56, "step": 43985, "token_acc": 0.4702194357366771, "train_speed(iter/s)": 1.440501 }, { "epoch": 1.8846664667323594, "grad_norm": 5.031183242797852, "learning_rate": 6.885325575700584e-05, "loss": 2.4078433990478514, "memory(GiB)": 77.56, "step": 43990, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.44049 }, { "epoch": 1.8848806820616084, "grad_norm": 5.190521717071533, "learning_rate": 6.884702255961508e-05, "loss": 2.76110954284668, "memory(GiB)": 77.56, "step": 43995, "token_acc": 0.43465045592705165, "train_speed(iter/s)": 1.440474 }, { "epoch": 1.8850948973908572, "grad_norm": 5.4947590827941895, "learning_rate": 6.884078902079283e-05, "loss": 2.3637725830078127, "memory(GiB)": 77.56, "step": 44000, "token_acc": 0.5148148148148148, "train_speed(iter/s)": 1.440485 }, { "epoch": 1.8850948973908572, "eval_loss": 2.4237220287323, "eval_runtime": 14.7483, "eval_samples_per_second": 6.78, "eval_steps_per_second": 6.78, "eval_token_acc": 0.44549763033175355, "step": 44000 }, { "epoch": 1.8853091127201063, "grad_norm": 6.161308765411377, "learning_rate": 6.8834555140652e-05, "loss": 2.1780561447143554, "memory(GiB)": 77.56, "step": 44005, "token_acc": 0.45576407506702415, "train_speed(iter/s)": 1.439736 }, { "epoch": 1.8855233280493553, "grad_norm": 4.318418502807617, "learning_rate": 6.882832091930556e-05, "loss": 2.5019474029541016, "memory(GiB)": 77.56, "step": 44010, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.439723 }, { "epoch": 1.885737543378604, "grad_norm": 4.363212585449219, "learning_rate": 6.882208635686641e-05, "loss": 2.362237739562988, "memory(GiB)": 77.56, "step": 44015, "token_acc": 0.4896265560165975, "train_speed(iter/s)": 1.439713 }, { "epoch": 1.8859517587078531, "grad_norm": 5.084024429321289, "learning_rate": 6.881585145344751e-05, "loss": 2.341076469421387, "memory(GiB)": 77.56, "step": 44020, "token_acc": 0.5137931034482759, "train_speed(iter/s)": 1.439738 }, { "epoch": 1.8861659740371022, "grad_norm": 3.8032565116882324, "learning_rate": 6.88096162091618e-05, "loss": 2.510670471191406, "memory(GiB)": 77.56, "step": 44025, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.439706 }, { "epoch": 1.886380189366351, "grad_norm": 7.438320159912109, "learning_rate": 6.880338062412228e-05, "loss": 2.864375114440918, "memory(GiB)": 77.56, "step": 44030, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.439719 }, { "epoch": 1.8865944046956, "grad_norm": 5.129100322723389, "learning_rate": 6.879714469844185e-05, "loss": 2.4701171875, "memory(GiB)": 77.56, "step": 44035, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.439747 }, { "epoch": 1.886808620024849, "grad_norm": 4.694314002990723, "learning_rate": 6.879090843223353e-05, "loss": 2.3149692535400392, "memory(GiB)": 77.56, "step": 44040, "token_acc": 0.5305343511450382, "train_speed(iter/s)": 1.439737 }, { "epoch": 1.8870228353540979, "grad_norm": 4.69293212890625, "learning_rate": 6.878467182561026e-05, "loss": 2.3349449157714846, "memory(GiB)": 77.56, "step": 44045, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.439731 }, { "epoch": 1.887237050683347, "grad_norm": 5.171730041503906, "learning_rate": 6.877843487868505e-05, "loss": 2.3221935272216796, "memory(GiB)": 77.56, "step": 44050, "token_acc": 0.5259259259259259, "train_speed(iter/s)": 1.439745 }, { "epoch": 1.887451266012596, "grad_norm": 5.316656589508057, "learning_rate": 6.877219759157087e-05, "loss": 2.308821678161621, "memory(GiB)": 77.56, "step": 44055, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.439757 }, { "epoch": 1.8876654813418448, "grad_norm": 5.144650936126709, "learning_rate": 6.876595996438072e-05, "loss": 2.5875526428222657, "memory(GiB)": 77.56, "step": 44060, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.439792 }, { "epoch": 1.8878796966710938, "grad_norm": 3.9537158012390137, "learning_rate": 6.875972199722761e-05, "loss": 2.427333450317383, "memory(GiB)": 77.56, "step": 44065, "token_acc": 0.5020242914979757, "train_speed(iter/s)": 1.439811 }, { "epoch": 1.8880939120003428, "grad_norm": 4.662796974182129, "learning_rate": 6.875348369022452e-05, "loss": 2.361859130859375, "memory(GiB)": 77.56, "step": 44070, "token_acc": 0.5, "train_speed(iter/s)": 1.43982 }, { "epoch": 1.8883081273295916, "grad_norm": 5.868584632873535, "learning_rate": 6.87472450434845e-05, "loss": 2.578470802307129, "memory(GiB)": 77.56, "step": 44075, "token_acc": 0.49615384615384617, "train_speed(iter/s)": 1.439828 }, { "epoch": 1.8885223426588407, "grad_norm": 4.635788917541504, "learning_rate": 6.874100605712054e-05, "loss": 2.407032012939453, "memory(GiB)": 77.56, "step": 44080, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.439819 }, { "epoch": 1.8887365579880897, "grad_norm": 7.193100929260254, "learning_rate": 6.873476673124567e-05, "loss": 2.6361345291137694, "memory(GiB)": 77.56, "step": 44085, "token_acc": 0.4378698224852071, "train_speed(iter/s)": 1.439769 }, { "epoch": 1.8889507733173385, "grad_norm": 5.188387870788574, "learning_rate": 6.872852706597295e-05, "loss": 2.548508071899414, "memory(GiB)": 77.56, "step": 44090, "token_acc": 0.4430769230769231, "train_speed(iter/s)": 1.439805 }, { "epoch": 1.8891649886465876, "grad_norm": 5.298627853393555, "learning_rate": 6.872228706141537e-05, "loss": 2.3667545318603516, "memory(GiB)": 77.56, "step": 44095, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.439811 }, { "epoch": 1.8893792039758366, "grad_norm": 6.749616622924805, "learning_rate": 6.8716046717686e-05, "loss": 2.5558544158935548, "memory(GiB)": 77.56, "step": 44100, "token_acc": 0.5131578947368421, "train_speed(iter/s)": 1.439784 }, { "epoch": 1.8895934193050854, "grad_norm": 4.484576225280762, "learning_rate": 6.87098060348979e-05, "loss": 2.1556907653808595, "memory(GiB)": 77.56, "step": 44105, "token_acc": 0.5073529411764706, "train_speed(iter/s)": 1.439789 }, { "epoch": 1.8898076346343344, "grad_norm": 4.849059104919434, "learning_rate": 6.87035650131641e-05, "loss": 2.708654022216797, "memory(GiB)": 77.56, "step": 44110, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.439797 }, { "epoch": 1.8900218499635835, "grad_norm": 4.918495178222656, "learning_rate": 6.869732365259767e-05, "loss": 2.398085021972656, "memory(GiB)": 77.56, "step": 44115, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.439855 }, { "epoch": 1.8902360652928323, "grad_norm": 4.631740093231201, "learning_rate": 6.869108195331169e-05, "loss": 2.5333696365356446, "memory(GiB)": 77.56, "step": 44120, "token_acc": 0.4703703703703704, "train_speed(iter/s)": 1.439861 }, { "epoch": 1.8904502806220813, "grad_norm": 4.902470588684082, "learning_rate": 6.868483991541923e-05, "loss": 2.482940673828125, "memory(GiB)": 77.56, "step": 44125, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.439893 }, { "epoch": 1.8906644959513303, "grad_norm": 4.570365905761719, "learning_rate": 6.867859753903336e-05, "loss": 2.6711803436279298, "memory(GiB)": 77.56, "step": 44130, "token_acc": 0.445859872611465, "train_speed(iter/s)": 1.439893 }, { "epoch": 1.8908787112805792, "grad_norm": 4.237510681152344, "learning_rate": 6.867235482426719e-05, "loss": 2.4270206451416017, "memory(GiB)": 77.56, "step": 44135, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.4399 }, { "epoch": 1.8910929266098282, "grad_norm": 5.2100300788879395, "learning_rate": 6.866611177123377e-05, "loss": 2.335957336425781, "memory(GiB)": 77.56, "step": 44140, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.439898 }, { "epoch": 1.8913071419390772, "grad_norm": 5.797184467315674, "learning_rate": 6.865986838004624e-05, "loss": 2.575522613525391, "memory(GiB)": 77.56, "step": 44145, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.439912 }, { "epoch": 1.891521357268326, "grad_norm": 7.417253017425537, "learning_rate": 6.865362465081768e-05, "loss": 2.2161609649658205, "memory(GiB)": 77.56, "step": 44150, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.439931 }, { "epoch": 1.891735572597575, "grad_norm": 5.7866950035095215, "learning_rate": 6.864738058366122e-05, "loss": 2.4511606216430666, "memory(GiB)": 77.56, "step": 44155, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.439964 }, { "epoch": 1.8919497879268241, "grad_norm": 3.83992600440979, "learning_rate": 6.864113617868997e-05, "loss": 2.60604190826416, "memory(GiB)": 77.56, "step": 44160, "token_acc": 0.49271137026239065, "train_speed(iter/s)": 1.44 }, { "epoch": 1.892164003256073, "grad_norm": 5.51364278793335, "learning_rate": 6.863489143601705e-05, "loss": 2.4766021728515626, "memory(GiB)": 77.56, "step": 44165, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.440022 }, { "epoch": 1.892378218585322, "grad_norm": 8.00137996673584, "learning_rate": 6.862864635575559e-05, "loss": 2.514451026916504, "memory(GiB)": 77.56, "step": 44170, "token_acc": 0.44921875, "train_speed(iter/s)": 1.440023 }, { "epoch": 1.892592433914571, "grad_norm": 7.786540985107422, "learning_rate": 6.862240093801873e-05, "loss": 2.3569799423217774, "memory(GiB)": 77.56, "step": 44175, "token_acc": 0.4645390070921986, "train_speed(iter/s)": 1.440023 }, { "epoch": 1.8928066492438198, "grad_norm": 4.392272472381592, "learning_rate": 6.861615518291962e-05, "loss": 2.4845392227172853, "memory(GiB)": 77.56, "step": 44180, "token_acc": 0.4680232558139535, "train_speed(iter/s)": 1.440018 }, { "epoch": 1.8930208645730688, "grad_norm": 4.896578788757324, "learning_rate": 6.860990909057137e-05, "loss": 2.821159553527832, "memory(GiB)": 77.56, "step": 44185, "token_acc": 0.4416058394160584, "train_speed(iter/s)": 1.440043 }, { "epoch": 1.8932350799023179, "grad_norm": 5.9553680419921875, "learning_rate": 6.860366266108716e-05, "loss": 2.5175575256347655, "memory(GiB)": 77.56, "step": 44190, "token_acc": 0.46946564885496184, "train_speed(iter/s)": 1.440054 }, { "epoch": 1.8934492952315667, "grad_norm": 6.024968147277832, "learning_rate": 6.859741589458015e-05, "loss": 2.520068550109863, "memory(GiB)": 77.56, "step": 44195, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.440073 }, { "epoch": 1.8936635105608157, "grad_norm": 5.330214977264404, "learning_rate": 6.859116879116352e-05, "loss": 2.6640974044799806, "memory(GiB)": 77.56, "step": 44200, "token_acc": 0.4901315789473684, "train_speed(iter/s)": 1.440076 }, { "epoch": 1.8938777258900648, "grad_norm": 7.271003723144531, "learning_rate": 6.858492135095043e-05, "loss": 2.7508787155151366, "memory(GiB)": 77.56, "step": 44205, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.440047 }, { "epoch": 1.8940919412193136, "grad_norm": 4.811175346374512, "learning_rate": 6.857867357405404e-05, "loss": 2.1135034561157227, "memory(GiB)": 77.56, "step": 44210, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.440073 }, { "epoch": 1.8943061565485626, "grad_norm": 4.746237754821777, "learning_rate": 6.857242546058756e-05, "loss": 2.4407325744628907, "memory(GiB)": 77.56, "step": 44215, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 1.440103 }, { "epoch": 1.8945203718778116, "grad_norm": 5.387736797332764, "learning_rate": 6.856617701066416e-05, "loss": 2.5141532897949217, "memory(GiB)": 77.56, "step": 44220, "token_acc": 0.48026315789473684, "train_speed(iter/s)": 1.440124 }, { "epoch": 1.8947345872070604, "grad_norm": 4.8732523918151855, "learning_rate": 6.855992822439705e-05, "loss": 2.4990543365478515, "memory(GiB)": 77.56, "step": 44225, "token_acc": 0.45874587458745875, "train_speed(iter/s)": 1.440127 }, { "epoch": 1.8949488025363095, "grad_norm": 5.696702003479004, "learning_rate": 6.855367910189942e-05, "loss": 2.3242656707763674, "memory(GiB)": 77.56, "step": 44230, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.440165 }, { "epoch": 1.8951630178655585, "grad_norm": 5.067339897155762, "learning_rate": 6.854742964328451e-05, "loss": 2.559010696411133, "memory(GiB)": 77.56, "step": 44235, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.440151 }, { "epoch": 1.8953772331948073, "grad_norm": 4.362419605255127, "learning_rate": 6.854117984866552e-05, "loss": 2.544628715515137, "memory(GiB)": 77.56, "step": 44240, "token_acc": 0.4938650306748466, "train_speed(iter/s)": 1.44011 }, { "epoch": 1.8955914485240564, "grad_norm": 5.868187427520752, "learning_rate": 6.853492971815564e-05, "loss": 2.6103952407836912, "memory(GiB)": 77.56, "step": 44245, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.440115 }, { "epoch": 1.8958056638533054, "grad_norm": 4.793186187744141, "learning_rate": 6.852867925186814e-05, "loss": 2.580684280395508, "memory(GiB)": 77.56, "step": 44250, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.440097 }, { "epoch": 1.8960198791825542, "grad_norm": 4.180619716644287, "learning_rate": 6.85224284499162e-05, "loss": 2.662502479553223, "memory(GiB)": 77.56, "step": 44255, "token_acc": 0.44192634560906513, "train_speed(iter/s)": 1.440116 }, { "epoch": 1.8962340945118032, "grad_norm": 4.831403732299805, "learning_rate": 6.851617731241312e-05, "loss": 2.5747108459472656, "memory(GiB)": 77.56, "step": 44260, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.440132 }, { "epoch": 1.8964483098410523, "grad_norm": 4.796936988830566, "learning_rate": 6.850992583947212e-05, "loss": 2.830059814453125, "memory(GiB)": 77.56, "step": 44265, "token_acc": 0.44368600682593856, "train_speed(iter/s)": 1.440171 }, { "epoch": 1.896662525170301, "grad_norm": 7.206109523773193, "learning_rate": 6.850367403120645e-05, "loss": 2.4320091247558593, "memory(GiB)": 77.56, "step": 44270, "token_acc": 0.47157190635451507, "train_speed(iter/s)": 1.440134 }, { "epoch": 1.8968767404995501, "grad_norm": 5.738765239715576, "learning_rate": 6.849742188772936e-05, "loss": 2.4546533584594727, "memory(GiB)": 77.56, "step": 44275, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.440147 }, { "epoch": 1.8970909558287992, "grad_norm": 4.917630672454834, "learning_rate": 6.849116940915412e-05, "loss": 2.0927761077880858, "memory(GiB)": 77.56, "step": 44280, "token_acc": 0.5222222222222223, "train_speed(iter/s)": 1.440198 }, { "epoch": 1.897305171158048, "grad_norm": 5.305604457855225, "learning_rate": 6.8484916595594e-05, "loss": 2.3930181503295898, "memory(GiB)": 77.56, "step": 44285, "token_acc": 0.5206896551724138, "train_speed(iter/s)": 1.440187 }, { "epoch": 1.897519386487297, "grad_norm": 4.421180725097656, "learning_rate": 6.847866344716228e-05, "loss": 2.3185192108154298, "memory(GiB)": 77.56, "step": 44290, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.440168 }, { "epoch": 1.897733601816546, "grad_norm": 5.534777641296387, "learning_rate": 6.847240996397224e-05, "loss": 2.5180152893066405, "memory(GiB)": 77.56, "step": 44295, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.440204 }, { "epoch": 1.8979478171457949, "grad_norm": 4.705270767211914, "learning_rate": 6.846615614613716e-05, "loss": 2.5731258392333984, "memory(GiB)": 77.56, "step": 44300, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.440219 }, { "epoch": 1.898162032475044, "grad_norm": 5.615041255950928, "learning_rate": 6.845990199377033e-05, "loss": 2.8154468536376953, "memory(GiB)": 77.56, "step": 44305, "token_acc": 0.4648318042813456, "train_speed(iter/s)": 1.440226 }, { "epoch": 1.898376247804293, "grad_norm": 4.8558807373046875, "learning_rate": 6.845364750698507e-05, "loss": 2.760580062866211, "memory(GiB)": 77.56, "step": 44310, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.440278 }, { "epoch": 1.8985904631335417, "grad_norm": 5.47043514251709, "learning_rate": 6.844739268589467e-05, "loss": 2.5955120086669923, "memory(GiB)": 77.56, "step": 44315, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.440226 }, { "epoch": 1.8988046784627908, "grad_norm": 5.321701526641846, "learning_rate": 6.844113753061246e-05, "loss": 2.2472354888916017, "memory(GiB)": 77.56, "step": 44320, "token_acc": 0.5201793721973094, "train_speed(iter/s)": 1.440235 }, { "epoch": 1.8990188937920398, "grad_norm": 4.491479396820068, "learning_rate": 6.843488204125174e-05, "loss": 2.733559989929199, "memory(GiB)": 77.56, "step": 44325, "token_acc": 0.42990654205607476, "train_speed(iter/s)": 1.440212 }, { "epoch": 1.8992331091212886, "grad_norm": 6.029450416564941, "learning_rate": 6.842862621792581e-05, "loss": 2.443553161621094, "memory(GiB)": 77.56, "step": 44330, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.440222 }, { "epoch": 1.8994473244505377, "grad_norm": 5.856253623962402, "learning_rate": 6.842237006074805e-05, "loss": 2.851237106323242, "memory(GiB)": 77.56, "step": 44335, "token_acc": 0.43465045592705165, "train_speed(iter/s)": 1.440278 }, { "epoch": 1.8996615397797867, "grad_norm": 4.865426063537598, "learning_rate": 6.841611356983179e-05, "loss": 2.3552343368530275, "memory(GiB)": 77.56, "step": 44340, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.440245 }, { "epoch": 1.8998757551090355, "grad_norm": 4.678223133087158, "learning_rate": 6.840985674529033e-05, "loss": 2.636866569519043, "memory(GiB)": 77.56, "step": 44345, "token_acc": 0.5054151624548736, "train_speed(iter/s)": 1.440282 }, { "epoch": 1.9000899704382845, "grad_norm": 5.230689525604248, "learning_rate": 6.840359958723705e-05, "loss": 2.7309587478637694, "memory(GiB)": 77.56, "step": 44350, "token_acc": 0.41324921135646686, "train_speed(iter/s)": 1.440322 }, { "epoch": 1.9003041857675336, "grad_norm": 6.695852756500244, "learning_rate": 6.839734209578532e-05, "loss": 2.299959182739258, "memory(GiB)": 77.56, "step": 44355, "token_acc": 0.45110410094637227, "train_speed(iter/s)": 1.440365 }, { "epoch": 1.9005184010967824, "grad_norm": 4.074918270111084, "learning_rate": 6.839108427104846e-05, "loss": 2.2946331024169924, "memory(GiB)": 77.56, "step": 44360, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.440405 }, { "epoch": 1.9007326164260314, "grad_norm": 6.127739906311035, "learning_rate": 6.838482611313985e-05, "loss": 2.290567970275879, "memory(GiB)": 77.56, "step": 44365, "token_acc": 0.544, "train_speed(iter/s)": 1.440436 }, { "epoch": 1.9009468317552805, "grad_norm": 5.002657413482666, "learning_rate": 6.83785676221729e-05, "loss": 2.466518211364746, "memory(GiB)": 77.56, "step": 44370, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.440453 }, { "epoch": 1.9011610470845293, "grad_norm": 5.262711048126221, "learning_rate": 6.837230879826093e-05, "loss": 2.222349166870117, "memory(GiB)": 77.56, "step": 44375, "token_acc": 0.43389830508474575, "train_speed(iter/s)": 1.440419 }, { "epoch": 1.9013752624137783, "grad_norm": 6.165261745452881, "learning_rate": 6.836604964151737e-05, "loss": 2.5008384704589846, "memory(GiB)": 77.56, "step": 44380, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.440446 }, { "epoch": 1.9015894777430273, "grad_norm": 5.602413654327393, "learning_rate": 6.835979015205558e-05, "loss": 2.4265506744384764, "memory(GiB)": 77.56, "step": 44385, "token_acc": 0.5375, "train_speed(iter/s)": 1.440423 }, { "epoch": 1.9018036930722761, "grad_norm": 5.043893814086914, "learning_rate": 6.835353032998896e-05, "loss": 2.2805496215820313, "memory(GiB)": 77.56, "step": 44390, "token_acc": 0.528957528957529, "train_speed(iter/s)": 1.44041 }, { "epoch": 1.9020179084015252, "grad_norm": 4.740059852600098, "learning_rate": 6.834727017543094e-05, "loss": 2.356195068359375, "memory(GiB)": 77.56, "step": 44395, "token_acc": 0.5, "train_speed(iter/s)": 1.440415 }, { "epoch": 1.9022321237307742, "grad_norm": 5.851634979248047, "learning_rate": 6.83410096884949e-05, "loss": 2.5868406295776367, "memory(GiB)": 77.56, "step": 44400, "token_acc": 0.4901185770750988, "train_speed(iter/s)": 1.440429 }, { "epoch": 1.902446339060023, "grad_norm": 5.274365425109863, "learning_rate": 6.833474886929425e-05, "loss": 2.489689254760742, "memory(GiB)": 77.56, "step": 44405, "token_acc": 0.4759036144578313, "train_speed(iter/s)": 1.440437 }, { "epoch": 1.902660554389272, "grad_norm": 4.019134044647217, "learning_rate": 6.832848771794245e-05, "loss": 2.27807502746582, "memory(GiB)": 77.56, "step": 44410, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.440452 }, { "epoch": 1.902874769718521, "grad_norm": 4.189376354217529, "learning_rate": 6.83222262345529e-05, "loss": 2.325949859619141, "memory(GiB)": 77.56, "step": 44415, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.440407 }, { "epoch": 1.90308898504777, "grad_norm": 5.913931369781494, "learning_rate": 6.831596441923902e-05, "loss": 2.5688007354736326, "memory(GiB)": 77.56, "step": 44420, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.440388 }, { "epoch": 1.903303200377019, "grad_norm": 4.257114410400391, "learning_rate": 6.830970227211427e-05, "loss": 2.7230743408203124, "memory(GiB)": 77.56, "step": 44425, "token_acc": 0.4735202492211838, "train_speed(iter/s)": 1.440421 }, { "epoch": 1.903517415706268, "grad_norm": 5.527200698852539, "learning_rate": 6.830343979329208e-05, "loss": 2.335757827758789, "memory(GiB)": 77.56, "step": 44430, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.440458 }, { "epoch": 1.9037316310355168, "grad_norm": 4.908282279968262, "learning_rate": 6.829717698288591e-05, "loss": 2.3787445068359374, "memory(GiB)": 77.56, "step": 44435, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.440469 }, { "epoch": 1.9039458463647658, "grad_norm": 4.689885139465332, "learning_rate": 6.829091384100922e-05, "loss": 2.2938655853271483, "memory(GiB)": 77.56, "step": 44440, "token_acc": 0.43866171003717475, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.9041600616940149, "grad_norm": 4.703327655792236, "learning_rate": 6.828465036777548e-05, "loss": 2.5733800888061524, "memory(GiB)": 77.56, "step": 44445, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.440459 }, { "epoch": 1.9043742770232637, "grad_norm": 6.948575973510742, "learning_rate": 6.827838656329812e-05, "loss": 2.6196332931518556, "memory(GiB)": 77.56, "step": 44450, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.440433 }, { "epoch": 1.9045884923525127, "grad_norm": 6.885674476623535, "learning_rate": 6.827212242769065e-05, "loss": 2.4800430297851563, "memory(GiB)": 77.56, "step": 44455, "token_acc": 0.475, "train_speed(iter/s)": 1.440456 }, { "epoch": 1.9048027076817617, "grad_norm": 5.4360737800598145, "learning_rate": 6.826585796106654e-05, "loss": 2.589427947998047, "memory(GiB)": 77.56, "step": 44460, "token_acc": 0.4659090909090909, "train_speed(iter/s)": 1.440458 }, { "epoch": 1.9050169230110106, "grad_norm": 4.815432071685791, "learning_rate": 6.82595931635393e-05, "loss": 2.551029586791992, "memory(GiB)": 77.56, "step": 44465, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 1.440482 }, { "epoch": 1.9052311383402596, "grad_norm": 7.217994689941406, "learning_rate": 6.825332803522238e-05, "loss": 2.3544147491455076, "memory(GiB)": 77.56, "step": 44470, "token_acc": 0.5271966527196653, "train_speed(iter/s)": 1.440491 }, { "epoch": 1.9054453536695086, "grad_norm": 4.317183494567871, "learning_rate": 6.82470625762293e-05, "loss": 2.2244842529296873, "memory(GiB)": 77.56, "step": 44475, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.440452 }, { "epoch": 1.9056595689987574, "grad_norm": 5.077231407165527, "learning_rate": 6.824079678667357e-05, "loss": 2.70748291015625, "memory(GiB)": 77.56, "step": 44480, "token_acc": 0.45112781954887216, "train_speed(iter/s)": 1.440423 }, { "epoch": 1.9058737843280065, "grad_norm": 5.023132801055908, "learning_rate": 6.82345306666687e-05, "loss": 2.3879493713378905, "memory(GiB)": 77.56, "step": 44485, "token_acc": 0.47564469914040114, "train_speed(iter/s)": 1.440471 }, { "epoch": 1.9060879996572555, "grad_norm": 5.956859588623047, "learning_rate": 6.822826421632819e-05, "loss": 2.4765167236328125, "memory(GiB)": 77.56, "step": 44490, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.440481 }, { "epoch": 1.9063022149865043, "grad_norm": 4.102790355682373, "learning_rate": 6.822199743576558e-05, "loss": 2.5086177825927733, "memory(GiB)": 77.56, "step": 44495, "token_acc": 0.49079754601226994, "train_speed(iter/s)": 1.440487 }, { "epoch": 1.9065164303157534, "grad_norm": 4.397863388061523, "learning_rate": 6.82157303250944e-05, "loss": 2.4902156829833983, "memory(GiB)": 77.56, "step": 44500, "token_acc": 0.4708029197080292, "train_speed(iter/s)": 1.440514 }, { "epoch": 1.9065164303157534, "eval_loss": 2.4332528114318848, "eval_runtime": 14.922, "eval_samples_per_second": 6.702, "eval_steps_per_second": 6.702, "eval_token_acc": 0.46124523506988563, "step": 44500 }, { "epoch": 1.9067306456450024, "grad_norm": 4.878571033477783, "learning_rate": 6.820946288442816e-05, "loss": 2.7795650482177736, "memory(GiB)": 77.56, "step": 44505, "token_acc": 0.45056867891513563, "train_speed(iter/s)": 1.439757 }, { "epoch": 1.9069448609742512, "grad_norm": 5.015744686126709, "learning_rate": 6.820319511388043e-05, "loss": 2.7151039123535154, "memory(GiB)": 77.56, "step": 44510, "token_acc": 0.449685534591195, "train_speed(iter/s)": 1.439787 }, { "epoch": 1.9071590763035002, "grad_norm": 4.074020862579346, "learning_rate": 6.819692701356474e-05, "loss": 2.9155553817749023, "memory(GiB)": 77.56, "step": 44515, "token_acc": 0.40119760479041916, "train_speed(iter/s)": 1.439799 }, { "epoch": 1.9073732916327493, "grad_norm": 3.7463245391845703, "learning_rate": 6.819065858359464e-05, "loss": 2.5465557098388674, "memory(GiB)": 77.56, "step": 44520, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.439793 }, { "epoch": 1.907587506961998, "grad_norm": 7.250163555145264, "learning_rate": 6.818438982408371e-05, "loss": 2.690298080444336, "memory(GiB)": 77.56, "step": 44525, "token_acc": 0.4542124542124542, "train_speed(iter/s)": 1.439795 }, { "epoch": 1.9078017222912471, "grad_norm": 5.25191593170166, "learning_rate": 6.81781207351455e-05, "loss": 2.4210437774658202, "memory(GiB)": 77.56, "step": 44530, "token_acc": 0.49640287769784175, "train_speed(iter/s)": 1.439841 }, { "epoch": 1.9080159376204961, "grad_norm": 4.25998592376709, "learning_rate": 6.817185131689356e-05, "loss": 2.5990951538085936, "memory(GiB)": 77.56, "step": 44535, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.439863 }, { "epoch": 1.908230152949745, "grad_norm": 4.700165748596191, "learning_rate": 6.816558156944151e-05, "loss": 2.2443944931030275, "memory(GiB)": 77.56, "step": 44540, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.43989 }, { "epoch": 1.908444368278994, "grad_norm": 5.219594955444336, "learning_rate": 6.81593114929029e-05, "loss": 2.5784843444824217, "memory(GiB)": 77.56, "step": 44545, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.439917 }, { "epoch": 1.908658583608243, "grad_norm": 4.59681510925293, "learning_rate": 6.815304108739133e-05, "loss": 2.6447582244873047, "memory(GiB)": 77.56, "step": 44550, "token_acc": 0.43636363636363634, "train_speed(iter/s)": 1.439908 }, { "epoch": 1.9088727989374918, "grad_norm": 4.393628120422363, "learning_rate": 6.814677035302038e-05, "loss": 2.506500244140625, "memory(GiB)": 77.56, "step": 44555, "token_acc": 0.4459016393442623, "train_speed(iter/s)": 1.439903 }, { "epoch": 1.9090870142667409, "grad_norm": 5.056570529937744, "learning_rate": 6.814049928990369e-05, "loss": 2.478056526184082, "memory(GiB)": 77.56, "step": 44560, "token_acc": 0.4511784511784512, "train_speed(iter/s)": 1.439933 }, { "epoch": 1.90930122959599, "grad_norm": 5.622750282287598, "learning_rate": 6.81342278981548e-05, "loss": 2.6171329498291014, "memory(GiB)": 77.56, "step": 44565, "token_acc": 0.41810344827586204, "train_speed(iter/s)": 1.439965 }, { "epoch": 1.9095154449252387, "grad_norm": 5.701539993286133, "learning_rate": 6.812795617788739e-05, "loss": 2.4851621627807616, "memory(GiB)": 77.56, "step": 44570, "token_acc": 0.46048109965635736, "train_speed(iter/s)": 1.439923 }, { "epoch": 1.9097296602544878, "grad_norm": 5.992129802703857, "learning_rate": 6.812168412921504e-05, "loss": 2.4025535583496094, "memory(GiB)": 77.56, "step": 44575, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.439913 }, { "epoch": 1.9099438755837368, "grad_norm": 3.5778889656066895, "learning_rate": 6.811541175225138e-05, "loss": 2.5711830139160154, "memory(GiB)": 77.56, "step": 44580, "token_acc": 0.4564459930313589, "train_speed(iter/s)": 1.439928 }, { "epoch": 1.9101580909129856, "grad_norm": 4.192580223083496, "learning_rate": 6.810913904711004e-05, "loss": 2.684924507141113, "memory(GiB)": 77.56, "step": 44585, "token_acc": 0.4169381107491857, "train_speed(iter/s)": 1.439939 }, { "epoch": 1.9103723062422346, "grad_norm": 4.966233730316162, "learning_rate": 6.810286601390466e-05, "loss": 2.288302421569824, "memory(GiB)": 77.56, "step": 44590, "token_acc": 0.4880952380952381, "train_speed(iter/s)": 1.439929 }, { "epoch": 1.9105865215714837, "grad_norm": 5.189723968505859, "learning_rate": 6.809659265274888e-05, "loss": 2.3654727935791016, "memory(GiB)": 77.56, "step": 44595, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.439974 }, { "epoch": 1.9108007369007325, "grad_norm": 4.194835186004639, "learning_rate": 6.809031896375636e-05, "loss": 2.6107873916625977, "memory(GiB)": 77.56, "step": 44600, "token_acc": 0.4314868804664723, "train_speed(iter/s)": 1.439951 }, { "epoch": 1.9110149522299815, "grad_norm": 5.426981449127197, "learning_rate": 6.808404494704073e-05, "loss": 2.34908447265625, "memory(GiB)": 77.56, "step": 44605, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.439938 }, { "epoch": 1.9112291675592306, "grad_norm": 5.184592247009277, "learning_rate": 6.807777060271566e-05, "loss": 2.452161407470703, "memory(GiB)": 77.56, "step": 44610, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.439973 }, { "epoch": 1.9114433828884794, "grad_norm": 4.717799186706543, "learning_rate": 6.807149593089484e-05, "loss": 2.276875686645508, "memory(GiB)": 77.56, "step": 44615, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.439971 }, { "epoch": 1.9116575982177286, "grad_norm": 4.707108974456787, "learning_rate": 6.806522093169189e-05, "loss": 2.6007253646850588, "memory(GiB)": 77.56, "step": 44620, "token_acc": 0.5021645021645021, "train_speed(iter/s)": 1.440014 }, { "epoch": 1.9118718135469774, "grad_norm": 4.245858669281006, "learning_rate": 6.805894560522051e-05, "loss": 2.5515480041503906, "memory(GiB)": 77.56, "step": 44625, "token_acc": 0.4700854700854701, "train_speed(iter/s)": 1.440041 }, { "epoch": 1.9120860288762263, "grad_norm": 5.026432514190674, "learning_rate": 6.805266995159442e-05, "loss": 2.5758060455322265, "memory(GiB)": 77.56, "step": 44630, "token_acc": 0.49244712990936557, "train_speed(iter/s)": 1.440058 }, { "epoch": 1.9123002442054755, "grad_norm": 4.200397491455078, "learning_rate": 6.804639397092726e-05, "loss": 2.635577583312988, "memory(GiB)": 77.56, "step": 44635, "token_acc": 0.48909657320872274, "train_speed(iter/s)": 1.440074 }, { "epoch": 1.9125144595347243, "grad_norm": 3.5709409713745117, "learning_rate": 6.804011766333276e-05, "loss": 2.190346908569336, "memory(GiB)": 77.56, "step": 44640, "token_acc": 0.5206896551724138, "train_speed(iter/s)": 1.440056 }, { "epoch": 1.9127286748639731, "grad_norm": 5.625542640686035, "learning_rate": 6.80338410289246e-05, "loss": 2.4494842529296874, "memory(GiB)": 77.56, "step": 44645, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.440024 }, { "epoch": 1.9129428901932224, "grad_norm": 6.4972381591796875, "learning_rate": 6.802756406781648e-05, "loss": 2.2170576095581054, "memory(GiB)": 77.56, "step": 44650, "token_acc": 0.5328467153284672, "train_speed(iter/s)": 1.440061 }, { "epoch": 1.9131571055224712, "grad_norm": 5.853265762329102, "learning_rate": 6.802128678012214e-05, "loss": 2.648603630065918, "memory(GiB)": 77.56, "step": 44655, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.440045 }, { "epoch": 1.91337132085172, "grad_norm": 4.965744972229004, "learning_rate": 6.801500916595527e-05, "loss": 2.413798522949219, "memory(GiB)": 77.56, "step": 44660, "token_acc": 0.4984423676012461, "train_speed(iter/s)": 1.440072 }, { "epoch": 1.9135855361809693, "grad_norm": 6.027616024017334, "learning_rate": 6.800873122542962e-05, "loss": 2.6154855728149413, "memory(GiB)": 77.56, "step": 44665, "token_acc": 0.46325878594249204, "train_speed(iter/s)": 1.440053 }, { "epoch": 1.913799751510218, "grad_norm": 5.317164897918701, "learning_rate": 6.80024529586589e-05, "loss": 2.3016637802124023, "memory(GiB)": 77.56, "step": 44670, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.440071 }, { "epoch": 1.914013966839467, "grad_norm": 4.941979885101318, "learning_rate": 6.799617436575688e-05, "loss": 2.164947509765625, "memory(GiB)": 77.56, "step": 44675, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.440117 }, { "epoch": 1.9142281821687162, "grad_norm": 5.081755638122559, "learning_rate": 6.798989544683725e-05, "loss": 2.5052539825439455, "memory(GiB)": 77.56, "step": 44680, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 1.440124 }, { "epoch": 1.914442397497965, "grad_norm": 5.270116806030273, "learning_rate": 6.798361620201382e-05, "loss": 2.524852752685547, "memory(GiB)": 77.56, "step": 44685, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 1.440104 }, { "epoch": 1.9146566128272138, "grad_norm": 4.910960674285889, "learning_rate": 6.797733663140028e-05, "loss": 2.371072769165039, "memory(GiB)": 77.56, "step": 44690, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.440074 }, { "epoch": 1.914870828156463, "grad_norm": 4.892165184020996, "learning_rate": 6.797105673511044e-05, "loss": 2.616036796569824, "memory(GiB)": 77.56, "step": 44695, "token_acc": 0.46, "train_speed(iter/s)": 1.440134 }, { "epoch": 1.9150850434857118, "grad_norm": 5.687769412994385, "learning_rate": 6.796477651325805e-05, "loss": 2.6887134552001952, "memory(GiB)": 77.56, "step": 44700, "token_acc": 0.4954128440366973, "train_speed(iter/s)": 1.44012 }, { "epoch": 1.9152992588149607, "grad_norm": 5.398184776306152, "learning_rate": 6.795849596595686e-05, "loss": 2.6722476959228514, "memory(GiB)": 77.56, "step": 44705, "token_acc": 0.4426751592356688, "train_speed(iter/s)": 1.440126 }, { "epoch": 1.91551347414421, "grad_norm": 4.448758125305176, "learning_rate": 6.79522150933207e-05, "loss": 2.4870203018188475, "memory(GiB)": 77.56, "step": 44710, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.440139 }, { "epoch": 1.9157276894734587, "grad_norm": 4.954132556915283, "learning_rate": 6.794593389546331e-05, "loss": 2.6410064697265625, "memory(GiB)": 77.56, "step": 44715, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.440164 }, { "epoch": 1.9159419048027075, "grad_norm": 7.54559326171875, "learning_rate": 6.793965237249848e-05, "loss": 2.5560977935791014, "memory(GiB)": 77.56, "step": 44720, "token_acc": 0.46886446886446886, "train_speed(iter/s)": 1.440165 }, { "epoch": 1.9161561201319568, "grad_norm": 4.980578899383545, "learning_rate": 6.793337052454005e-05, "loss": 2.514068603515625, "memory(GiB)": 77.56, "step": 44725, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.440173 }, { "epoch": 1.9163703354612056, "grad_norm": 5.284402370452881, "learning_rate": 6.792708835170177e-05, "loss": 2.4049118041992186, "memory(GiB)": 77.56, "step": 44730, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.440161 }, { "epoch": 1.9165845507904544, "grad_norm": 4.475897789001465, "learning_rate": 6.792080585409746e-05, "loss": 2.47657470703125, "memory(GiB)": 77.56, "step": 44735, "token_acc": 0.5, "train_speed(iter/s)": 1.440192 }, { "epoch": 1.9167987661197037, "grad_norm": 3.9399573802948, "learning_rate": 6.791452303184094e-05, "loss": 2.5199440002441404, "memory(GiB)": 77.56, "step": 44740, "token_acc": 0.486404833836858, "train_speed(iter/s)": 1.440182 }, { "epoch": 1.9170129814489525, "grad_norm": 4.361889362335205, "learning_rate": 6.790823988504606e-05, "loss": 2.7094736099243164, "memory(GiB)": 77.56, "step": 44745, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.440194 }, { "epoch": 1.9172271967782013, "grad_norm": 5.997969627380371, "learning_rate": 6.790195641382658e-05, "loss": 2.7188392639160157, "memory(GiB)": 77.56, "step": 44750, "token_acc": 0.43567251461988304, "train_speed(iter/s)": 1.440185 }, { "epoch": 1.9174414121074506, "grad_norm": 5.397094249725342, "learning_rate": 6.789567261829639e-05, "loss": 2.5017984390258787, "memory(GiB)": 77.56, "step": 44755, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.44011 }, { "epoch": 1.9176556274366994, "grad_norm": 6.921408176422119, "learning_rate": 6.788938849856929e-05, "loss": 2.719912528991699, "memory(GiB)": 77.56, "step": 44760, "token_acc": 0.46204620462046203, "train_speed(iter/s)": 1.440135 }, { "epoch": 1.9178698427659482, "grad_norm": 4.085766315460205, "learning_rate": 6.788310405475915e-05, "loss": 2.15233154296875, "memory(GiB)": 77.56, "step": 44765, "token_acc": 0.5215686274509804, "train_speed(iter/s)": 1.440155 }, { "epoch": 1.9180840580951974, "grad_norm": 5.353226184844971, "learning_rate": 6.78768192869798e-05, "loss": 2.401826858520508, "memory(GiB)": 77.56, "step": 44770, "token_acc": 0.5287769784172662, "train_speed(iter/s)": 1.440182 }, { "epoch": 1.9182982734244463, "grad_norm": 4.965895175933838, "learning_rate": 6.787053419534508e-05, "loss": 2.5608749389648438, "memory(GiB)": 77.56, "step": 44775, "token_acc": 0.5, "train_speed(iter/s)": 1.440166 }, { "epoch": 1.918512488753695, "grad_norm": 4.790609359741211, "learning_rate": 6.786424877996887e-05, "loss": 2.4662988662719725, "memory(GiB)": 77.56, "step": 44780, "token_acc": 0.47289156626506024, "train_speed(iter/s)": 1.440193 }, { "epoch": 1.9187267040829443, "grad_norm": 5.190618515014648, "learning_rate": 6.785796304096506e-05, "loss": 2.4805007934570313, "memory(GiB)": 77.56, "step": 44785, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.440178 }, { "epoch": 1.9189409194121931, "grad_norm": 4.605311393737793, "learning_rate": 6.78516769784475e-05, "loss": 2.8391439437866213, "memory(GiB)": 77.56, "step": 44790, "token_acc": 0.4578313253012048, "train_speed(iter/s)": 1.440224 }, { "epoch": 1.919155134741442, "grad_norm": 3.971388578414917, "learning_rate": 6.784539059253004e-05, "loss": 2.6653919219970703, "memory(GiB)": 77.56, "step": 44795, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.440247 }, { "epoch": 1.9193693500706912, "grad_norm": 3.805058479309082, "learning_rate": 6.783910388332661e-05, "loss": 2.362849807739258, "memory(GiB)": 77.56, "step": 44800, "token_acc": 0.45723684210526316, "train_speed(iter/s)": 1.440273 }, { "epoch": 1.91958356539994, "grad_norm": 4.532949924468994, "learning_rate": 6.783281685095105e-05, "loss": 2.414392280578613, "memory(GiB)": 77.56, "step": 44805, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.440274 }, { "epoch": 1.9197977807291888, "grad_norm": 4.745820999145508, "learning_rate": 6.78265294955173e-05, "loss": 2.460069274902344, "memory(GiB)": 77.56, "step": 44810, "token_acc": 0.5134099616858238, "train_speed(iter/s)": 1.44026 }, { "epoch": 1.920011996058438, "grad_norm": 4.696557998657227, "learning_rate": 6.782024181713925e-05, "loss": 2.4688270568847654, "memory(GiB)": 77.56, "step": 44815, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.440289 }, { "epoch": 1.920226211387687, "grad_norm": 4.4076247215271, "learning_rate": 6.781395381593082e-05, "loss": 2.496187210083008, "memory(GiB)": 77.56, "step": 44820, "token_acc": 0.4367469879518072, "train_speed(iter/s)": 1.440298 }, { "epoch": 1.9204404267169357, "grad_norm": 5.914099216461182, "learning_rate": 6.780766549200587e-05, "loss": 2.5455934524536135, "memory(GiB)": 77.56, "step": 44825, "token_acc": 0.4948805460750853, "train_speed(iter/s)": 1.440291 }, { "epoch": 1.920654642046185, "grad_norm": 4.998441219329834, "learning_rate": 6.780137684547838e-05, "loss": 2.6923410415649416, "memory(GiB)": 77.56, "step": 44830, "token_acc": 0.4166666666666667, "train_speed(iter/s)": 1.440326 }, { "epoch": 1.9208688573754338, "grad_norm": 4.390291690826416, "learning_rate": 6.779508787646222e-05, "loss": 2.4865606307983397, "memory(GiB)": 77.56, "step": 44835, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.440353 }, { "epoch": 1.9210830727046826, "grad_norm": 7.698446273803711, "learning_rate": 6.778879858507138e-05, "loss": 2.5965694427490233, "memory(GiB)": 77.56, "step": 44840, "token_acc": 0.45018450184501846, "train_speed(iter/s)": 1.440373 }, { "epoch": 1.9212972880339318, "grad_norm": 5.172274112701416, "learning_rate": 6.778250897141976e-05, "loss": 2.5993576049804688, "memory(GiB)": 77.56, "step": 44845, "token_acc": 0.4429065743944637, "train_speed(iter/s)": 1.440407 }, { "epoch": 1.9215115033631807, "grad_norm": 4.587586402893066, "learning_rate": 6.777621903562132e-05, "loss": 2.297383499145508, "memory(GiB)": 77.56, "step": 44850, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.440424 }, { "epoch": 1.9217257186924295, "grad_norm": 3.7478761672973633, "learning_rate": 6.776992877778997e-05, "loss": 2.829428482055664, "memory(GiB)": 77.56, "step": 44855, "token_acc": 0.42810457516339867, "train_speed(iter/s)": 1.440488 }, { "epoch": 1.9219399340216787, "grad_norm": 5.2397685050964355, "learning_rate": 6.77636381980397e-05, "loss": 2.568630409240723, "memory(GiB)": 77.56, "step": 44860, "token_acc": 0.4746376811594203, "train_speed(iter/s)": 1.440509 }, { "epoch": 1.9221541493509275, "grad_norm": 9.00521469116211, "learning_rate": 6.775734729648447e-05, "loss": 2.752686309814453, "memory(GiB)": 77.56, "step": 44865, "token_acc": 0.47035573122529645, "train_speed(iter/s)": 1.440503 }, { "epoch": 1.9223683646801764, "grad_norm": 5.078679084777832, "learning_rate": 6.775105607323822e-05, "loss": 2.4538951873779298, "memory(GiB)": 77.56, "step": 44870, "token_acc": 0.48, "train_speed(iter/s)": 1.440493 }, { "epoch": 1.9225825800094256, "grad_norm": 3.5641767978668213, "learning_rate": 6.774476452841496e-05, "loss": 2.509532165527344, "memory(GiB)": 77.56, "step": 44875, "token_acc": 0.4592833876221498, "train_speed(iter/s)": 1.440483 }, { "epoch": 1.9227967953386744, "grad_norm": 4.824163913726807, "learning_rate": 6.773847266212863e-05, "loss": 2.3499183654785156, "memory(GiB)": 77.56, "step": 44880, "token_acc": 0.4942084942084942, "train_speed(iter/s)": 1.440487 }, { "epoch": 1.9230110106679232, "grad_norm": 4.4063286781311035, "learning_rate": 6.773218047449323e-05, "loss": 2.404673767089844, "memory(GiB)": 77.56, "step": 44885, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.440493 }, { "epoch": 1.9232252259971725, "grad_norm": 4.887564182281494, "learning_rate": 6.772588796562274e-05, "loss": 2.6430288314819337, "memory(GiB)": 77.56, "step": 44890, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.440511 }, { "epoch": 1.9234394413264213, "grad_norm": 4.752852439880371, "learning_rate": 6.771959513563116e-05, "loss": 2.498763847351074, "memory(GiB)": 77.56, "step": 44895, "token_acc": 0.4459016393442623, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.9236536566556701, "grad_norm": 4.58977746963501, "learning_rate": 6.77133019846325e-05, "loss": 2.3855897903442385, "memory(GiB)": 77.56, "step": 44900, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.440514 }, { "epoch": 1.9238678719849194, "grad_norm": 6.020086288452148, "learning_rate": 6.770700851274077e-05, "loss": 2.8141883850097655, "memory(GiB)": 77.56, "step": 44905, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.440546 }, { "epoch": 1.9240820873141682, "grad_norm": 3.6862447261810303, "learning_rate": 6.770071472006994e-05, "loss": 2.6929437637329103, "memory(GiB)": 77.56, "step": 44910, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.440548 }, { "epoch": 1.924296302643417, "grad_norm": 4.549350738525391, "learning_rate": 6.769442060673409e-05, "loss": 2.507354736328125, "memory(GiB)": 77.56, "step": 44915, "token_acc": 0.4513888888888889, "train_speed(iter/s)": 1.440546 }, { "epoch": 1.9245105179726663, "grad_norm": 6.182398796081543, "learning_rate": 6.768812617284719e-05, "loss": 2.4904422760009766, "memory(GiB)": 77.56, "step": 44920, "token_acc": 0.48338368580060426, "train_speed(iter/s)": 1.440578 }, { "epoch": 1.924724733301915, "grad_norm": 5.885412693023682, "learning_rate": 6.76818314185233e-05, "loss": 2.3250518798828126, "memory(GiB)": 77.56, "step": 44925, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.440568 }, { "epoch": 1.9249389486311639, "grad_norm": 4.711713790893555, "learning_rate": 6.767553634387645e-05, "loss": 2.893124580383301, "memory(GiB)": 77.56, "step": 44930, "token_acc": 0.4228395061728395, "train_speed(iter/s)": 1.440578 }, { "epoch": 1.9251531639604131, "grad_norm": 3.9892210960388184, "learning_rate": 6.766924094902067e-05, "loss": 2.800054168701172, "memory(GiB)": 77.56, "step": 44935, "token_acc": 0.44648318042813456, "train_speed(iter/s)": 1.440585 }, { "epoch": 1.925367379289662, "grad_norm": 5.91239595413208, "learning_rate": 6.766294523407002e-05, "loss": 2.2573455810546874, "memory(GiB)": 77.56, "step": 44940, "token_acc": 0.5039370078740157, "train_speed(iter/s)": 1.440607 }, { "epoch": 1.9255815946189108, "grad_norm": 5.328665733337402, "learning_rate": 6.765664919913856e-05, "loss": 2.2851661682128905, "memory(GiB)": 77.56, "step": 44945, "token_acc": 0.491869918699187, "train_speed(iter/s)": 1.440603 }, { "epoch": 1.92579580994816, "grad_norm": 6.843301296234131, "learning_rate": 6.765035284434031e-05, "loss": 2.48330078125, "memory(GiB)": 77.56, "step": 44950, "token_acc": 0.4491017964071856, "train_speed(iter/s)": 1.440594 }, { "epoch": 1.9260100252774088, "grad_norm": 5.048739910125732, "learning_rate": 6.764405616978938e-05, "loss": 2.105299377441406, "memory(GiB)": 77.56, "step": 44955, "token_acc": 0.509090909090909, "train_speed(iter/s)": 1.440618 }, { "epoch": 1.9262242406066579, "grad_norm": 4.746590614318848, "learning_rate": 6.763775917559982e-05, "loss": 2.259245681762695, "memory(GiB)": 77.56, "step": 44960, "token_acc": 0.5278810408921933, "train_speed(iter/s)": 1.440628 }, { "epoch": 1.926438455935907, "grad_norm": 5.139850616455078, "learning_rate": 6.76314618618857e-05, "loss": 2.58447265625, "memory(GiB)": 77.56, "step": 44965, "token_acc": 0.4389438943894389, "train_speed(iter/s)": 1.440682 }, { "epoch": 1.9266526712651557, "grad_norm": 6.878082275390625, "learning_rate": 6.76251642287611e-05, "loss": 2.3195016860961912, "memory(GiB)": 77.56, "step": 44970, "token_acc": 0.5077881619937694, "train_speed(iter/s)": 1.440702 }, { "epoch": 1.9268668865944047, "grad_norm": 5.152690410614014, "learning_rate": 6.761886627634013e-05, "loss": 2.4266490936279297, "memory(GiB)": 77.56, "step": 44975, "token_acc": 0.4898785425101215, "train_speed(iter/s)": 1.440721 }, { "epoch": 1.9270811019236538, "grad_norm": 6.772485733032227, "learning_rate": 6.761256800473686e-05, "loss": 2.4867437362670897, "memory(GiB)": 77.56, "step": 44980, "token_acc": 0.4420289855072464, "train_speed(iter/s)": 1.440716 }, { "epoch": 1.9272953172529026, "grad_norm": 6.258435249328613, "learning_rate": 6.760626941406541e-05, "loss": 2.5491941452026365, "memory(GiB)": 77.56, "step": 44985, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.440747 }, { "epoch": 1.9275095325821516, "grad_norm": 7.090802192687988, "learning_rate": 6.759997050443987e-05, "loss": 2.7601268768310545, "memory(GiB)": 77.56, "step": 44990, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 1.440739 }, { "epoch": 1.9277237479114007, "grad_norm": 4.356286525726318, "learning_rate": 6.759367127597436e-05, "loss": 2.4777576446533205, "memory(GiB)": 77.56, "step": 44995, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.440747 }, { "epoch": 1.9279379632406495, "grad_norm": 4.196362495422363, "learning_rate": 6.758737172878298e-05, "loss": 2.5145463943481445, "memory(GiB)": 77.56, "step": 45000, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.440746 }, { "epoch": 1.9279379632406495, "eval_loss": 2.272648572921753, "eval_runtime": 13.8851, "eval_samples_per_second": 7.202, "eval_steps_per_second": 7.202, "eval_token_acc": 0.4802547770700637, "step": 45000 }, { "epoch": 1.9281521785698985, "grad_norm": 4.798551559448242, "learning_rate": 6.758107186297987e-05, "loss": 2.6651744842529297, "memory(GiB)": 77.56, "step": 45005, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.440106 }, { "epoch": 1.9283663938991475, "grad_norm": 6.122073173522949, "learning_rate": 6.757477167867914e-05, "loss": 2.641393280029297, "memory(GiB)": 77.56, "step": 45010, "token_acc": 0.4526627218934911, "train_speed(iter/s)": 1.440144 }, { "epoch": 1.9285806092283964, "grad_norm": 7.3333563804626465, "learning_rate": 6.756847117599495e-05, "loss": 2.451889991760254, "memory(GiB)": 77.56, "step": 45015, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 1.440179 }, { "epoch": 1.9287948245576454, "grad_norm": 4.334657669067383, "learning_rate": 6.756217035504142e-05, "loss": 2.4974742889404298, "memory(GiB)": 77.56, "step": 45020, "token_acc": 0.4887640449438202, "train_speed(iter/s)": 1.440185 }, { "epoch": 1.9290090398868944, "grad_norm": 5.305824279785156, "learning_rate": 6.755586921593268e-05, "loss": 2.381949615478516, "memory(GiB)": 77.56, "step": 45025, "token_acc": 0.49262536873156343, "train_speed(iter/s)": 1.440152 }, { "epoch": 1.9292232552161432, "grad_norm": 5.9110212326049805, "learning_rate": 6.754956775878294e-05, "loss": 2.5106342315673826, "memory(GiB)": 77.56, "step": 45030, "token_acc": 0.44879518072289154, "train_speed(iter/s)": 1.440186 }, { "epoch": 1.9294374705453923, "grad_norm": 5.255375385284424, "learning_rate": 6.754326598370628e-05, "loss": 2.6439294815063477, "memory(GiB)": 77.56, "step": 45035, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.440199 }, { "epoch": 1.9296516858746413, "grad_norm": 6.076241970062256, "learning_rate": 6.75369638908169e-05, "loss": 2.4158645629882813, "memory(GiB)": 77.56, "step": 45040, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.440223 }, { "epoch": 1.9298659012038901, "grad_norm": 4.477628707885742, "learning_rate": 6.7530661480229e-05, "loss": 2.7268402099609377, "memory(GiB)": 77.56, "step": 45045, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.440238 }, { "epoch": 1.9300801165331392, "grad_norm": 5.206740856170654, "learning_rate": 6.75243587520567e-05, "loss": 2.4599994659423827, "memory(GiB)": 77.56, "step": 45050, "token_acc": 0.4717741935483871, "train_speed(iter/s)": 1.440253 }, { "epoch": 1.9302943318623882, "grad_norm": 4.140765190124512, "learning_rate": 6.751805570641421e-05, "loss": 2.6211193084716795, "memory(GiB)": 77.56, "step": 45055, "token_acc": 0.4435483870967742, "train_speed(iter/s)": 1.44029 }, { "epoch": 1.930508547191637, "grad_norm": 4.071769714355469, "learning_rate": 6.75117523434157e-05, "loss": 2.774671936035156, "memory(GiB)": 77.56, "step": 45060, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.440309 }, { "epoch": 1.930722762520886, "grad_norm": 8.741419792175293, "learning_rate": 6.750544866317536e-05, "loss": 2.596161460876465, "memory(GiB)": 77.56, "step": 45065, "token_acc": 0.46875, "train_speed(iter/s)": 1.440266 }, { "epoch": 1.930936977850135, "grad_norm": 4.923036098480225, "learning_rate": 6.749914466580741e-05, "loss": 2.7943634033203124, "memory(GiB)": 77.56, "step": 45070, "token_acc": 0.43304843304843305, "train_speed(iter/s)": 1.44027 }, { "epoch": 1.9311511931793839, "grad_norm": 4.8703293800354, "learning_rate": 6.749284035142604e-05, "loss": 2.632495880126953, "memory(GiB)": 77.56, "step": 45075, "token_acc": 0.4867924528301887, "train_speed(iter/s)": 1.440285 }, { "epoch": 1.931365408508633, "grad_norm": 5.802798271179199, "learning_rate": 6.748653572014546e-05, "loss": 2.656983757019043, "memory(GiB)": 77.56, "step": 45080, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.44025 }, { "epoch": 1.931579623837882, "grad_norm": 3.893988847732544, "learning_rate": 6.748023077207988e-05, "loss": 2.6880258560180663, "memory(GiB)": 77.56, "step": 45085, "token_acc": 0.4592833876221498, "train_speed(iter/s)": 1.440248 }, { "epoch": 1.9317938391671308, "grad_norm": 3.6687610149383545, "learning_rate": 6.747392550734352e-05, "loss": 2.22630615234375, "memory(GiB)": 77.56, "step": 45090, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.440234 }, { "epoch": 1.9320080544963798, "grad_norm": 7.48464298248291, "learning_rate": 6.74676199260506e-05, "loss": 2.7770549774169924, "memory(GiB)": 77.56, "step": 45095, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.440266 }, { "epoch": 1.9322222698256288, "grad_norm": 5.711358547210693, "learning_rate": 6.746131402831537e-05, "loss": 2.421975517272949, "memory(GiB)": 77.56, "step": 45100, "token_acc": 0.49795918367346936, "train_speed(iter/s)": 1.440316 }, { "epoch": 1.9324364851548776, "grad_norm": 5.555703639984131, "learning_rate": 6.745500781425205e-05, "loss": 2.902573013305664, "memory(GiB)": 77.56, "step": 45105, "token_acc": 0.45141065830721006, "train_speed(iter/s)": 1.440321 }, { "epoch": 1.9326507004841267, "grad_norm": 4.670094966888428, "learning_rate": 6.74487012839749e-05, "loss": 2.3642473220825195, "memory(GiB)": 77.56, "step": 45110, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.440276 }, { "epoch": 1.9328649158133757, "grad_norm": 5.587475776672363, "learning_rate": 6.744239443759813e-05, "loss": 2.6430273056030273, "memory(GiB)": 77.56, "step": 45115, "token_acc": 0.5, "train_speed(iter/s)": 1.440217 }, { "epoch": 1.9330791311426245, "grad_norm": 5.119340896606445, "learning_rate": 6.743608727523605e-05, "loss": 2.14676456451416, "memory(GiB)": 77.56, "step": 45120, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.440203 }, { "epoch": 1.9332933464718736, "grad_norm": 5.657715797424316, "learning_rate": 6.742977979700287e-05, "loss": 2.621632957458496, "memory(GiB)": 77.56, "step": 45125, "token_acc": 0.5, "train_speed(iter/s)": 1.440213 }, { "epoch": 1.9335075618011226, "grad_norm": 5.945922374725342, "learning_rate": 6.74234720030129e-05, "loss": 2.449910354614258, "memory(GiB)": 77.56, "step": 45130, "token_acc": 0.5205047318611987, "train_speed(iter/s)": 1.440203 }, { "epoch": 1.9337217771303714, "grad_norm": 8.12584114074707, "learning_rate": 6.741716389338039e-05, "loss": 2.1450578689575197, "memory(GiB)": 77.56, "step": 45135, "token_acc": 0.5283842794759825, "train_speed(iter/s)": 1.440231 }, { "epoch": 1.9339359924596204, "grad_norm": 4.752293586730957, "learning_rate": 6.74108554682196e-05, "loss": 2.821733093261719, "memory(GiB)": 77.56, "step": 45140, "token_acc": 0.4148148148148148, "train_speed(iter/s)": 1.440244 }, { "epoch": 1.9341502077888695, "grad_norm": 7.279443740844727, "learning_rate": 6.740454672764484e-05, "loss": 2.858756256103516, "memory(GiB)": 77.56, "step": 45145, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.44027 }, { "epoch": 1.9343644231181183, "grad_norm": 6.130493640899658, "learning_rate": 6.739823767177039e-05, "loss": 2.5429866790771483, "memory(GiB)": 77.56, "step": 45150, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.440241 }, { "epoch": 1.9345786384473673, "grad_norm": 8.559672355651855, "learning_rate": 6.739192830071054e-05, "loss": 2.274237632751465, "memory(GiB)": 77.56, "step": 45155, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.440258 }, { "epoch": 1.9347928537766164, "grad_norm": 5.867745399475098, "learning_rate": 6.73856186145796e-05, "loss": 2.4978317260742187, "memory(GiB)": 77.56, "step": 45160, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 1.440277 }, { "epoch": 1.9350070691058652, "grad_norm": 4.435779094696045, "learning_rate": 6.737930861349186e-05, "loss": 2.5070775985717773, "memory(GiB)": 77.56, "step": 45165, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.440217 }, { "epoch": 1.9352212844351142, "grad_norm": 4.044161319732666, "learning_rate": 6.737299829756165e-05, "loss": 2.232552909851074, "memory(GiB)": 77.56, "step": 45170, "token_acc": 0.4901315789473684, "train_speed(iter/s)": 1.440213 }, { "epoch": 1.9354354997643632, "grad_norm": 4.500838756561279, "learning_rate": 6.736668766690328e-05, "loss": 2.3945945739746093, "memory(GiB)": 77.56, "step": 45175, "token_acc": 0.44574780058651026, "train_speed(iter/s)": 1.44025 }, { "epoch": 1.935649715093612, "grad_norm": 4.921265602111816, "learning_rate": 6.736037672163107e-05, "loss": 2.016653060913086, "memory(GiB)": 77.56, "step": 45180, "token_acc": 0.5225563909774437, "train_speed(iter/s)": 1.44026 }, { "epoch": 1.935863930422861, "grad_norm": 5.894145488739014, "learning_rate": 6.735406546185934e-05, "loss": 2.7764585494995115, "memory(GiB)": 77.56, "step": 45185, "token_acc": 0.46484375, "train_speed(iter/s)": 1.440254 }, { "epoch": 1.9360781457521101, "grad_norm": 6.673031330108643, "learning_rate": 6.734775388770245e-05, "loss": 2.493510437011719, "memory(GiB)": 77.56, "step": 45190, "token_acc": 0.4483870967741935, "train_speed(iter/s)": 1.440279 }, { "epoch": 1.936292361081359, "grad_norm": 6.107112884521484, "learning_rate": 6.734144199927472e-05, "loss": 2.3875770568847656, "memory(GiB)": 77.56, "step": 45195, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.440264 }, { "epoch": 1.936506576410608, "grad_norm": 5.024647235870361, "learning_rate": 6.73351297966905e-05, "loss": 2.5815534591674805, "memory(GiB)": 77.56, "step": 45200, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 1.440236 }, { "epoch": 1.936720791739857, "grad_norm": 3.971710443496704, "learning_rate": 6.732881728006414e-05, "loss": 2.6231557846069338, "memory(GiB)": 77.56, "step": 45205, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.440277 }, { "epoch": 1.9369350070691058, "grad_norm": 4.264815807342529, "learning_rate": 6.732250444951e-05, "loss": 2.3905750274658204, "memory(GiB)": 77.56, "step": 45210, "token_acc": 0.506896551724138, "train_speed(iter/s)": 1.440307 }, { "epoch": 1.9371492223983549, "grad_norm": 4.220180988311768, "learning_rate": 6.731619130514243e-05, "loss": 2.6343719482421877, "memory(GiB)": 77.56, "step": 45215, "token_acc": 0.4486301369863014, "train_speed(iter/s)": 1.440281 }, { "epoch": 1.9373634377276039, "grad_norm": 6.077444553375244, "learning_rate": 6.730987784707584e-05, "loss": 2.263299560546875, "memory(GiB)": 77.56, "step": 45220, "token_acc": 0.5278969957081545, "train_speed(iter/s)": 1.440303 }, { "epoch": 1.9375776530568527, "grad_norm": 5.1404314041137695, "learning_rate": 6.730356407542456e-05, "loss": 2.5684535980224608, "memory(GiB)": 77.56, "step": 45225, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.440346 }, { "epoch": 1.9377918683861017, "grad_norm": 6.451222896575928, "learning_rate": 6.729724999030297e-05, "loss": 2.6400772094726563, "memory(GiB)": 77.56, "step": 45230, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.44032 }, { "epoch": 1.9380060837153508, "grad_norm": 7.853918552398682, "learning_rate": 6.729093559182549e-05, "loss": 2.437779998779297, "memory(GiB)": 77.56, "step": 45235, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.44028 }, { "epoch": 1.9382202990445996, "grad_norm": 4.741629123687744, "learning_rate": 6.728462088010647e-05, "loss": 2.8410581588745116, "memory(GiB)": 77.56, "step": 45240, "token_acc": 0.44542772861356933, "train_speed(iter/s)": 1.440296 }, { "epoch": 1.9384345143738486, "grad_norm": 6.1778178215026855, "learning_rate": 6.727830585526035e-05, "loss": 2.702621841430664, "memory(GiB)": 77.56, "step": 45245, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.440314 }, { "epoch": 1.9386487297030977, "grad_norm": 4.489249229431152, "learning_rate": 6.727199051740151e-05, "loss": 2.316697692871094, "memory(GiB)": 77.56, "step": 45250, "token_acc": 0.5265306122448979, "train_speed(iter/s)": 1.440355 }, { "epoch": 1.9388629450323465, "grad_norm": 4.341643333435059, "learning_rate": 6.726567486664434e-05, "loss": 2.388902473449707, "memory(GiB)": 77.56, "step": 45255, "token_acc": 0.4716417910447761, "train_speed(iter/s)": 1.440368 }, { "epoch": 1.9390771603615955, "grad_norm": 4.554438591003418, "learning_rate": 6.725935890310328e-05, "loss": 2.7564958572387694, "memory(GiB)": 77.56, "step": 45260, "token_acc": 0.419672131147541, "train_speed(iter/s)": 1.440417 }, { "epoch": 1.9392913756908445, "grad_norm": 5.8008198738098145, "learning_rate": 6.725304262689275e-05, "loss": 2.707223129272461, "memory(GiB)": 77.56, "step": 45265, "token_acc": 0.42452830188679247, "train_speed(iter/s)": 1.440425 }, { "epoch": 1.9395055910200933, "grad_norm": 7.180793762207031, "learning_rate": 6.724672603812717e-05, "loss": 2.512354278564453, "memory(GiB)": 77.56, "step": 45270, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.44045 }, { "epoch": 1.9397198063493424, "grad_norm": 4.4878973960876465, "learning_rate": 6.724040913692098e-05, "loss": 2.7086597442626954, "memory(GiB)": 77.56, "step": 45275, "token_acc": 0.46099290780141844, "train_speed(iter/s)": 1.440477 }, { "epoch": 1.9399340216785914, "grad_norm": 5.306338310241699, "learning_rate": 6.723409192338859e-05, "loss": 2.6740139007568358, "memory(GiB)": 77.56, "step": 45280, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.440466 }, { "epoch": 1.9401482370078402, "grad_norm": 6.707126140594482, "learning_rate": 6.722777439764446e-05, "loss": 2.289545249938965, "memory(GiB)": 77.56, "step": 45285, "token_acc": 0.5280898876404494, "train_speed(iter/s)": 1.440469 }, { "epoch": 1.9403624523370893, "grad_norm": 7.800013542175293, "learning_rate": 6.722145655980304e-05, "loss": 2.498686408996582, "memory(GiB)": 77.56, "step": 45290, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.44045 }, { "epoch": 1.9405766676663383, "grad_norm": 5.396202564239502, "learning_rate": 6.721513840997878e-05, "loss": 2.5861148834228516, "memory(GiB)": 77.56, "step": 45295, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.440417 }, { "epoch": 1.940790882995587, "grad_norm": 5.635911464691162, "learning_rate": 6.720881994828612e-05, "loss": 2.375554656982422, "memory(GiB)": 77.56, "step": 45300, "token_acc": 0.5098814229249012, "train_speed(iter/s)": 1.440463 }, { "epoch": 1.9410050983248361, "grad_norm": 4.748110771179199, "learning_rate": 6.720250117483956e-05, "loss": 2.6421573638916014, "memory(GiB)": 77.56, "step": 45305, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.440456 }, { "epoch": 1.9412193136540852, "grad_norm": 3.5759501457214355, "learning_rate": 6.719618208975355e-05, "loss": 2.7423301696777345, "memory(GiB)": 77.56, "step": 45310, "token_acc": 0.4565826330532213, "train_speed(iter/s)": 1.440464 }, { "epoch": 1.941433528983334, "grad_norm": 5.056948661804199, "learning_rate": 6.718986269314257e-05, "loss": 2.43087158203125, "memory(GiB)": 77.56, "step": 45315, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.440469 }, { "epoch": 1.941647744312583, "grad_norm": 3.962507724761963, "learning_rate": 6.718354298512111e-05, "loss": 2.5097015380859373, "memory(GiB)": 77.56, "step": 45320, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.440485 }, { "epoch": 1.941861959641832, "grad_norm": 4.559586524963379, "learning_rate": 6.717722296580364e-05, "loss": 2.702666091918945, "memory(GiB)": 77.56, "step": 45325, "token_acc": 0.476038338658147, "train_speed(iter/s)": 1.440475 }, { "epoch": 1.9420761749710809, "grad_norm": 5.075676441192627, "learning_rate": 6.717090263530466e-05, "loss": 2.6583606719970705, "memory(GiB)": 77.56, "step": 45330, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.440498 }, { "epoch": 1.94229039030033, "grad_norm": 5.0208516120910645, "learning_rate": 6.716458199373868e-05, "loss": 2.1088844299316407, "memory(GiB)": 77.56, "step": 45335, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.44052 }, { "epoch": 1.942504605629579, "grad_norm": 5.868277549743652, "learning_rate": 6.71582610412202e-05, "loss": 2.520858955383301, "memory(GiB)": 77.56, "step": 45340, "token_acc": 0.5, "train_speed(iter/s)": 1.440534 }, { "epoch": 1.9427188209588278, "grad_norm": 4.251816749572754, "learning_rate": 6.71519397778637e-05, "loss": 2.189457130432129, "memory(GiB)": 77.56, "step": 45345, "token_acc": 0.5198412698412699, "train_speed(iter/s)": 1.4405 }, { "epoch": 1.9429330362880768, "grad_norm": 10.378301620483398, "learning_rate": 6.714561820378375e-05, "loss": 2.5810359954833983, "memory(GiB)": 77.56, "step": 45350, "token_acc": 0.45723684210526316, "train_speed(iter/s)": 1.440525 }, { "epoch": 1.9431472516173258, "grad_norm": 5.028270721435547, "learning_rate": 6.713929631909483e-05, "loss": 2.4318796157836915, "memory(GiB)": 77.56, "step": 45355, "token_acc": 0.5062240663900415, "train_speed(iter/s)": 1.440524 }, { "epoch": 1.9433614669465746, "grad_norm": 5.1753249168396, "learning_rate": 6.713297412391148e-05, "loss": 2.6329109191894533, "memory(GiB)": 77.56, "step": 45360, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.440544 }, { "epoch": 1.9435756822758237, "grad_norm": 4.587521076202393, "learning_rate": 6.712665161834822e-05, "loss": 2.2890296936035157, "memory(GiB)": 77.56, "step": 45365, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.440558 }, { "epoch": 1.9437898976050727, "grad_norm": 3.887960433959961, "learning_rate": 6.712032880251962e-05, "loss": 2.903134346008301, "memory(GiB)": 77.56, "step": 45370, "token_acc": 0.43389830508474575, "train_speed(iter/s)": 1.44056 }, { "epoch": 1.9440041129343215, "grad_norm": 4.430891990661621, "learning_rate": 6.711400567654019e-05, "loss": 2.468527984619141, "memory(GiB)": 77.56, "step": 45375, "token_acc": 0.4809384164222874, "train_speed(iter/s)": 1.440573 }, { "epoch": 1.9442183282635705, "grad_norm": 4.241114139556885, "learning_rate": 6.710768224052451e-05, "loss": 2.6923124313354494, "memory(GiB)": 77.56, "step": 45380, "token_acc": 0.42136498516320475, "train_speed(iter/s)": 1.440584 }, { "epoch": 1.9444325435928196, "grad_norm": 3.6667704582214355, "learning_rate": 6.710135849458709e-05, "loss": 2.2617155075073243, "memory(GiB)": 77.56, "step": 45385, "token_acc": 0.5131964809384164, "train_speed(iter/s)": 1.440599 }, { "epoch": 1.9446467589220684, "grad_norm": 7.395628929138184, "learning_rate": 6.709503443884255e-05, "loss": 2.6145151138305662, "memory(GiB)": 77.56, "step": 45390, "token_acc": 0.5038461538461538, "train_speed(iter/s)": 1.440579 }, { "epoch": 1.9448609742513174, "grad_norm": 4.459097862243652, "learning_rate": 6.708871007340541e-05, "loss": 2.515476417541504, "memory(GiB)": 77.56, "step": 45395, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.440598 }, { "epoch": 1.9450751895805665, "grad_norm": 4.505022048950195, "learning_rate": 6.708238539839025e-05, "loss": 2.3406009674072266, "memory(GiB)": 77.56, "step": 45400, "token_acc": 0.5015105740181269, "train_speed(iter/s)": 1.440618 }, { "epoch": 1.9452894049098153, "grad_norm": 6.577566623687744, "learning_rate": 6.707606041391165e-05, "loss": 2.7817224502563476, "memory(GiB)": 77.56, "step": 45405, "token_acc": 0.41198501872659177, "train_speed(iter/s)": 1.440643 }, { "epoch": 1.9455036202390643, "grad_norm": 4.262021064758301, "learning_rate": 6.706973512008421e-05, "loss": 2.657246398925781, "memory(GiB)": 77.56, "step": 45410, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.440658 }, { "epoch": 1.9457178355683133, "grad_norm": 4.189906597137451, "learning_rate": 6.70634095170225e-05, "loss": 2.511100959777832, "memory(GiB)": 77.56, "step": 45415, "token_acc": 0.4541984732824427, "train_speed(iter/s)": 1.440674 }, { "epoch": 1.9459320508975622, "grad_norm": 4.1230149269104, "learning_rate": 6.705708360484111e-05, "loss": 2.6832096099853517, "memory(GiB)": 77.56, "step": 45420, "token_acc": 0.47440273037542663, "train_speed(iter/s)": 1.440676 }, { "epoch": 1.9461462662268112, "grad_norm": 5.072758197784424, "learning_rate": 6.705075738365468e-05, "loss": 2.613764762878418, "memory(GiB)": 77.56, "step": 45425, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.440597 }, { "epoch": 1.9463604815560602, "grad_norm": 4.666149139404297, "learning_rate": 6.704443085357777e-05, "loss": 2.439645004272461, "memory(GiB)": 77.56, "step": 45430, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.440635 }, { "epoch": 1.946574696885309, "grad_norm": 12.834710121154785, "learning_rate": 6.7038104014725e-05, "loss": 2.3617557525634765, "memory(GiB)": 77.56, "step": 45435, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.440632 }, { "epoch": 1.946788912214558, "grad_norm": 4.6653008460998535, "learning_rate": 6.703177686721098e-05, "loss": 2.675905227661133, "memory(GiB)": 77.56, "step": 45440, "token_acc": 0.5, "train_speed(iter/s)": 1.440643 }, { "epoch": 1.947003127543807, "grad_norm": 7.687556743621826, "learning_rate": 6.702544941115036e-05, "loss": 2.4108144760131838, "memory(GiB)": 77.56, "step": 45445, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.440635 }, { "epoch": 1.947217342873056, "grad_norm": 4.9494123458862305, "learning_rate": 6.701912164665776e-05, "loss": 2.415040969848633, "memory(GiB)": 77.56, "step": 45450, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.440602 }, { "epoch": 1.947431558202305, "grad_norm": 6.328458309173584, "learning_rate": 6.70127935738478e-05, "loss": 2.236003303527832, "memory(GiB)": 77.56, "step": 45455, "token_acc": 0.5427509293680297, "train_speed(iter/s)": 1.440557 }, { "epoch": 1.947645773531554, "grad_norm": 4.472757339477539, "learning_rate": 6.700646519283513e-05, "loss": 2.6073421478271483, "memory(GiB)": 77.56, "step": 45460, "token_acc": 0.4907749077490775, "train_speed(iter/s)": 1.44059 }, { "epoch": 1.9478599888608028, "grad_norm": 6.233323097229004, "learning_rate": 6.700013650373438e-05, "loss": 2.3979076385498046, "memory(GiB)": 77.56, "step": 45465, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.440606 }, { "epoch": 1.9480742041900518, "grad_norm": 4.534661769866943, "learning_rate": 6.699380750666023e-05, "loss": 2.769176483154297, "memory(GiB)": 77.56, "step": 45470, "token_acc": 0.46200607902735563, "train_speed(iter/s)": 1.440627 }, { "epoch": 1.9482884195193009, "grad_norm": 4.951756000518799, "learning_rate": 6.698747820172728e-05, "loss": 2.579067611694336, "memory(GiB)": 77.56, "step": 45475, "token_acc": 0.4161490683229814, "train_speed(iter/s)": 1.440667 }, { "epoch": 1.9485026348485497, "grad_norm": 4.349664688110352, "learning_rate": 6.698114858905024e-05, "loss": 2.4291372299194336, "memory(GiB)": 77.56, "step": 45480, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.440673 }, { "epoch": 1.9487168501777987, "grad_norm": 4.892009258270264, "learning_rate": 6.697481866874377e-05, "loss": 2.6150508880615235, "memory(GiB)": 77.56, "step": 45485, "token_acc": 0.471875, "train_speed(iter/s)": 1.440728 }, { "epoch": 1.9489310655070478, "grad_norm": 4.527082920074463, "learning_rate": 6.696848844092254e-05, "loss": 2.3411970138549805, "memory(GiB)": 77.56, "step": 45490, "token_acc": 0.4440894568690096, "train_speed(iter/s)": 1.440684 }, { "epoch": 1.9491452808362966, "grad_norm": 4.689889430999756, "learning_rate": 6.696215790570122e-05, "loss": 2.333414077758789, "memory(GiB)": 77.56, "step": 45495, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.440675 }, { "epoch": 1.9493594961655456, "grad_norm": 4.693789958953857, "learning_rate": 6.69558270631945e-05, "loss": 2.806296539306641, "memory(GiB)": 77.56, "step": 45500, "token_acc": 0.39457831325301207, "train_speed(iter/s)": 1.440636 }, { "epoch": 1.9493594961655456, "eval_loss": 2.266148328781128, "eval_runtime": 13.4619, "eval_samples_per_second": 7.428, "eval_steps_per_second": 7.428, "eval_token_acc": 0.4514285714285714, "step": 45500 }, { "epoch": 1.9495737114947946, "grad_norm": 5.273241996765137, "learning_rate": 6.694949591351706e-05, "loss": 2.4877269744873045, "memory(GiB)": 77.56, "step": 45505, "token_acc": 0.44410569105691056, "train_speed(iter/s)": 1.44 }, { "epoch": 1.9497879268240434, "grad_norm": 5.049350738525391, "learning_rate": 6.694316445678363e-05, "loss": 2.6786537170410156, "memory(GiB)": 77.56, "step": 45510, "token_acc": 0.44029850746268656, "train_speed(iter/s)": 1.44 }, { "epoch": 1.9500021421532925, "grad_norm": 4.017052173614502, "learning_rate": 6.693683269310884e-05, "loss": 2.529633331298828, "memory(GiB)": 77.56, "step": 45515, "token_acc": 0.5089605734767025, "train_speed(iter/s)": 1.439979 }, { "epoch": 1.9502163574825415, "grad_norm": 4.5068159103393555, "learning_rate": 6.693050062260747e-05, "loss": 2.8497472763061524, "memory(GiB)": 77.56, "step": 45520, "token_acc": 0.44876325088339225, "train_speed(iter/s)": 1.439973 }, { "epoch": 1.9504305728117903, "grad_norm": 4.838365077972412, "learning_rate": 6.692416824539418e-05, "loss": 2.4935396194458006, "memory(GiB)": 77.56, "step": 45525, "token_acc": 0.5033112582781457, "train_speed(iter/s)": 1.439976 }, { "epoch": 1.9506447881410394, "grad_norm": 5.676723480224609, "learning_rate": 6.69178355615837e-05, "loss": 2.3777151107788086, "memory(GiB)": 77.56, "step": 45530, "token_acc": 0.4957627118644068, "train_speed(iter/s)": 1.439993 }, { "epoch": 1.9508590034702884, "grad_norm": 6.021706581115723, "learning_rate": 6.691150257129077e-05, "loss": 2.5612789154052735, "memory(GiB)": 77.56, "step": 45535, "token_acc": 0.4701195219123506, "train_speed(iter/s)": 1.440017 }, { "epoch": 1.9510732187995372, "grad_norm": 5.214600563049316, "learning_rate": 6.690516927463012e-05, "loss": 2.411444664001465, "memory(GiB)": 77.56, "step": 45540, "token_acc": 0.4735202492211838, "train_speed(iter/s)": 1.440065 }, { "epoch": 1.9512874341287862, "grad_norm": 4.718339920043945, "learning_rate": 6.689883567171646e-05, "loss": 2.5888877868652345, "memory(GiB)": 77.56, "step": 45545, "token_acc": 0.4651898734177215, "train_speed(iter/s)": 1.440084 }, { "epoch": 1.9515016494580353, "grad_norm": 4.600925922393799, "learning_rate": 6.689250176266452e-05, "loss": 2.4571081161499024, "memory(GiB)": 77.56, "step": 45550, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.440073 }, { "epoch": 1.951715864787284, "grad_norm": 4.214907169342041, "learning_rate": 6.688616754758908e-05, "loss": 2.537423324584961, "memory(GiB)": 77.56, "step": 45555, "token_acc": 0.4727272727272727, "train_speed(iter/s)": 1.44006 }, { "epoch": 1.9519300801165331, "grad_norm": 4.54862117767334, "learning_rate": 6.687983302660487e-05, "loss": 2.7397626876831054, "memory(GiB)": 77.56, "step": 45560, "token_acc": 0.42765273311897106, "train_speed(iter/s)": 1.44004 }, { "epoch": 1.9521442954457822, "grad_norm": 4.2088518142700195, "learning_rate": 6.687349819982667e-05, "loss": 2.4314159393310546, "memory(GiB)": 77.56, "step": 45565, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.440037 }, { "epoch": 1.952358510775031, "grad_norm": 5.273055076599121, "learning_rate": 6.686716306736921e-05, "loss": 2.3553470611572265, "memory(GiB)": 77.56, "step": 45570, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.440011 }, { "epoch": 1.95257272610428, "grad_norm": 6.307897567749023, "learning_rate": 6.686082762934725e-05, "loss": 2.4227832794189452, "memory(GiB)": 77.56, "step": 45575, "token_acc": 0.48638132295719844, "train_speed(iter/s)": 1.440018 }, { "epoch": 1.952786941433529, "grad_norm": 5.8458170890808105, "learning_rate": 6.68544918858756e-05, "loss": 2.771864318847656, "memory(GiB)": 77.56, "step": 45580, "token_acc": 0.4192546583850932, "train_speed(iter/s)": 1.440036 }, { "epoch": 1.9530011567627779, "grad_norm": 6.395783424377441, "learning_rate": 6.684815583706902e-05, "loss": 2.36529598236084, "memory(GiB)": 77.56, "step": 45585, "token_acc": 0.5182186234817814, "train_speed(iter/s)": 1.439999 }, { "epoch": 1.953215372092027, "grad_norm": 5.270660400390625, "learning_rate": 6.684181948304228e-05, "loss": 2.234212303161621, "memory(GiB)": 77.56, "step": 45590, "token_acc": 0.5035211267605634, "train_speed(iter/s)": 1.44003 }, { "epoch": 1.953429587421276, "grad_norm": 4.634381294250488, "learning_rate": 6.683548282391019e-05, "loss": 2.3774932861328124, "memory(GiB)": 77.56, "step": 45595, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.440023 }, { "epoch": 1.9536438027505247, "grad_norm": 5.826134204864502, "learning_rate": 6.682914585978754e-05, "loss": 2.585467529296875, "memory(GiB)": 77.56, "step": 45600, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.440062 }, { "epoch": 1.9538580180797738, "grad_norm": 3.282994508743286, "learning_rate": 6.682280859078911e-05, "loss": 2.6191328048706053, "memory(GiB)": 77.56, "step": 45605, "token_acc": 0.5015384615384615, "train_speed(iter/s)": 1.440039 }, { "epoch": 1.9540722334090228, "grad_norm": 5.450883865356445, "learning_rate": 6.681647101702973e-05, "loss": 2.727987861633301, "memory(GiB)": 77.56, "step": 45610, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.440079 }, { "epoch": 1.9542864487382716, "grad_norm": 4.7945146560668945, "learning_rate": 6.681013313862421e-05, "loss": 2.798582077026367, "memory(GiB)": 77.56, "step": 45615, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.44001 }, { "epoch": 1.9545006640675207, "grad_norm": 3.914283275604248, "learning_rate": 6.680379495568735e-05, "loss": 2.5355690002441404, "memory(GiB)": 77.56, "step": 45620, "token_acc": 0.4542483660130719, "train_speed(iter/s)": 1.440041 }, { "epoch": 1.9547148793967697, "grad_norm": 4.840810298919678, "learning_rate": 6.679745646833397e-05, "loss": 2.332847023010254, "memory(GiB)": 77.56, "step": 45625, "token_acc": 0.49836065573770494, "train_speed(iter/s)": 1.440015 }, { "epoch": 1.9549290947260185, "grad_norm": 6.350103378295898, "learning_rate": 6.679111767667894e-05, "loss": 2.43646240234375, "memory(GiB)": 77.56, "step": 45630, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.440006 }, { "epoch": 1.9551433100552675, "grad_norm": 5.065173625946045, "learning_rate": 6.678477858083703e-05, "loss": 2.835140037536621, "memory(GiB)": 77.56, "step": 45635, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.440022 }, { "epoch": 1.9553575253845166, "grad_norm": 5.095966815948486, "learning_rate": 6.677843918092312e-05, "loss": 2.516127014160156, "memory(GiB)": 77.56, "step": 45640, "token_acc": 0.486013986013986, "train_speed(iter/s)": 1.440008 }, { "epoch": 1.9555717407137654, "grad_norm": 4.866915702819824, "learning_rate": 6.677209947705205e-05, "loss": 2.562136650085449, "memory(GiB)": 77.56, "step": 45645, "token_acc": 0.43416370106761565, "train_speed(iter/s)": 1.440012 }, { "epoch": 1.9557859560430144, "grad_norm": 4.1509175300598145, "learning_rate": 6.676575946933867e-05, "loss": 2.1394287109375, "memory(GiB)": 77.56, "step": 45650, "token_acc": 0.5096774193548387, "train_speed(iter/s)": 1.440026 }, { "epoch": 1.9560001713722635, "grad_norm": 4.327695369720459, "learning_rate": 6.675941915789783e-05, "loss": 2.4458009719848635, "memory(GiB)": 77.56, "step": 45655, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.440004 }, { "epoch": 1.9562143867015123, "grad_norm": 6.655233860015869, "learning_rate": 6.675307854284438e-05, "loss": 2.729585647583008, "memory(GiB)": 77.56, "step": 45660, "token_acc": 0.4067796610169492, "train_speed(iter/s)": 1.440007 }, { "epoch": 1.9564286020307613, "grad_norm": 4.3718767166137695, "learning_rate": 6.67467376242932e-05, "loss": 2.3149494171142577, "memory(GiB)": 77.56, "step": 45665, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.440002 }, { "epoch": 1.9566428173600103, "grad_norm": 6.582820892333984, "learning_rate": 6.674039640235917e-05, "loss": 2.5831912994384765, "memory(GiB)": 77.56, "step": 45670, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.439984 }, { "epoch": 1.9568570326892591, "grad_norm": 4.774419784545898, "learning_rate": 6.673405487715713e-05, "loss": 2.264492988586426, "memory(GiB)": 77.56, "step": 45675, "token_acc": 0.5210084033613446, "train_speed(iter/s)": 1.439973 }, { "epoch": 1.9570712480185082, "grad_norm": 5.1712646484375, "learning_rate": 6.6727713048802e-05, "loss": 2.4001861572265626, "memory(GiB)": 77.56, "step": 45680, "token_acc": 0.5207547169811321, "train_speed(iter/s)": 1.439989 }, { "epoch": 1.9572854633477572, "grad_norm": 6.647982120513916, "learning_rate": 6.672137091740867e-05, "loss": 2.4587112426757813, "memory(GiB)": 77.56, "step": 45685, "token_acc": 0.45255474452554745, "train_speed(iter/s)": 1.44003 }, { "epoch": 1.957499678677006, "grad_norm": 5.97927713394165, "learning_rate": 6.671502848309198e-05, "loss": 2.41509952545166, "memory(GiB)": 77.56, "step": 45690, "token_acc": 0.4980392156862745, "train_speed(iter/s)": 1.440066 }, { "epoch": 1.957713894006255, "grad_norm": 4.979121685028076, "learning_rate": 6.670868574596691e-05, "loss": 2.54388313293457, "memory(GiB)": 77.56, "step": 45695, "token_acc": 0.44140625, "train_speed(iter/s)": 1.440047 }, { "epoch": 1.957928109335504, "grad_norm": 4.9567365646362305, "learning_rate": 6.670234270614832e-05, "loss": 2.8766857147216798, "memory(GiB)": 77.56, "step": 45700, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.440045 }, { "epoch": 1.958142324664753, "grad_norm": 4.630826950073242, "learning_rate": 6.66959993637511e-05, "loss": 2.474800682067871, "memory(GiB)": 77.56, "step": 45705, "token_acc": 0.5288461538461539, "train_speed(iter/s)": 1.440044 }, { "epoch": 1.958356539994002, "grad_norm": 4.426127910614014, "learning_rate": 6.668965571889021e-05, "loss": 2.7333507537841797, "memory(GiB)": 77.56, "step": 45710, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.440064 }, { "epoch": 1.958570755323251, "grad_norm": 5.418478965759277, "learning_rate": 6.668331177168055e-05, "loss": 2.374101257324219, "memory(GiB)": 77.56, "step": 45715, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.44007 }, { "epoch": 1.9587849706524998, "grad_norm": 5.040560245513916, "learning_rate": 6.667696752223703e-05, "loss": 2.7069873809814453, "memory(GiB)": 77.56, "step": 45720, "token_acc": 0.44981412639405205, "train_speed(iter/s)": 1.440092 }, { "epoch": 1.9589991859817488, "grad_norm": 5.640760898590088, "learning_rate": 6.667062297067462e-05, "loss": 2.40421142578125, "memory(GiB)": 77.56, "step": 45725, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.440105 }, { "epoch": 1.9592134013109979, "grad_norm": 3.948404312133789, "learning_rate": 6.666427811710824e-05, "loss": 2.652470016479492, "memory(GiB)": 77.56, "step": 45730, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.440129 }, { "epoch": 1.9594276166402467, "grad_norm": 6.179126739501953, "learning_rate": 6.66579329616528e-05, "loss": 2.852280616760254, "memory(GiB)": 77.56, "step": 45735, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.44018 }, { "epoch": 1.9596418319694957, "grad_norm": 4.214303016662598, "learning_rate": 6.66515875044233e-05, "loss": 2.3850189208984376, "memory(GiB)": 77.56, "step": 45740, "token_acc": 0.53125, "train_speed(iter/s)": 1.440183 }, { "epoch": 1.9598560472987447, "grad_norm": 4.51772928237915, "learning_rate": 6.664524174553467e-05, "loss": 2.7407501220703123, "memory(GiB)": 77.56, "step": 45745, "token_acc": 0.4570552147239264, "train_speed(iter/s)": 1.44019 }, { "epoch": 1.9600702626279936, "grad_norm": 5.619811058044434, "learning_rate": 6.663889568510186e-05, "loss": 2.686910057067871, "memory(GiB)": 77.56, "step": 45750, "token_acc": 0.41694915254237286, "train_speed(iter/s)": 1.440207 }, { "epoch": 1.9602844779572426, "grad_norm": 3.596792221069336, "learning_rate": 6.663254932323986e-05, "loss": 2.729270362854004, "memory(GiB)": 77.56, "step": 45755, "token_acc": 0.43217665615141954, "train_speed(iter/s)": 1.440221 }, { "epoch": 1.9604986932864916, "grad_norm": 5.189215183258057, "learning_rate": 6.662620266006361e-05, "loss": 2.6603809356689454, "memory(GiB)": 77.56, "step": 45760, "token_acc": 0.4701086956521739, "train_speed(iter/s)": 1.440246 }, { "epoch": 1.9607129086157404, "grad_norm": 5.653921604156494, "learning_rate": 6.661985569568811e-05, "loss": 2.4463336944580076, "memory(GiB)": 77.56, "step": 45765, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 1.440266 }, { "epoch": 1.9609271239449895, "grad_norm": 5.984095096588135, "learning_rate": 6.661350843022833e-05, "loss": 2.3911781311035156, "memory(GiB)": 77.56, "step": 45770, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.440303 }, { "epoch": 1.9611413392742385, "grad_norm": 5.861817836761475, "learning_rate": 6.660716086379927e-05, "loss": 2.728754425048828, "memory(GiB)": 77.56, "step": 45775, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.440315 }, { "epoch": 1.9613555546034873, "grad_norm": 5.163695335388184, "learning_rate": 6.660081299651591e-05, "loss": 2.8481945037841796, "memory(GiB)": 77.56, "step": 45780, "token_acc": 0.46273291925465837, "train_speed(iter/s)": 1.440364 }, { "epoch": 1.9615697699327364, "grad_norm": 5.630147933959961, "learning_rate": 6.659446482849325e-05, "loss": 2.5242502212524416, "memory(GiB)": 77.56, "step": 45785, "token_acc": 0.460431654676259, "train_speed(iter/s)": 1.440358 }, { "epoch": 1.9617839852619854, "grad_norm": 4.404461860656738, "learning_rate": 6.658811635984629e-05, "loss": 2.657027816772461, "memory(GiB)": 77.56, "step": 45790, "token_acc": 0.4664179104477612, "train_speed(iter/s)": 1.440401 }, { "epoch": 1.9619982005912342, "grad_norm": 4.369134426116943, "learning_rate": 6.658176759069004e-05, "loss": 2.7699539184570314, "memory(GiB)": 77.56, "step": 45795, "token_acc": 0.48360655737704916, "train_speed(iter/s)": 1.44043 }, { "epoch": 1.9622124159204832, "grad_norm": 4.9915924072265625, "learning_rate": 6.657541852113953e-05, "loss": 2.6560897827148438, "memory(GiB)": 77.56, "step": 45800, "token_acc": 0.423841059602649, "train_speed(iter/s)": 1.440389 }, { "epoch": 1.9624266312497323, "grad_norm": 8.534985542297363, "learning_rate": 6.656906915130975e-05, "loss": 2.514673614501953, "memory(GiB)": 77.56, "step": 45805, "token_acc": 0.46638655462184875, "train_speed(iter/s)": 1.440415 }, { "epoch": 1.962640846578981, "grad_norm": 6.863739013671875, "learning_rate": 6.656271948131573e-05, "loss": 2.553104019165039, "memory(GiB)": 77.56, "step": 45810, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.440385 }, { "epoch": 1.9628550619082301, "grad_norm": 6.4225897789001465, "learning_rate": 6.655636951127254e-05, "loss": 2.747235107421875, "memory(GiB)": 77.56, "step": 45815, "token_acc": 0.4342105263157895, "train_speed(iter/s)": 1.440399 }, { "epoch": 1.9630692772374791, "grad_norm": 5.554860591888428, "learning_rate": 6.655001924129515e-05, "loss": 2.8411582946777343, "memory(GiB)": 77.56, "step": 45820, "token_acc": 0.4261744966442953, "train_speed(iter/s)": 1.440388 }, { "epoch": 1.963283492566728, "grad_norm": 5.233529567718506, "learning_rate": 6.654366867149865e-05, "loss": 2.360663414001465, "memory(GiB)": 77.56, "step": 45825, "token_acc": 0.5, "train_speed(iter/s)": 1.44039 }, { "epoch": 1.963497707895977, "grad_norm": 4.994997024536133, "learning_rate": 6.653731780199807e-05, "loss": 2.7178909301757814, "memory(GiB)": 77.56, "step": 45830, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.440354 }, { "epoch": 1.963711923225226, "grad_norm": 4.927798271179199, "learning_rate": 6.653096663290847e-05, "loss": 2.3548952102661134, "memory(GiB)": 77.56, "step": 45835, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.440372 }, { "epoch": 1.9639261385544748, "grad_norm": 5.804313659667969, "learning_rate": 6.652461516434492e-05, "loss": 2.7635629653930662, "memory(GiB)": 77.56, "step": 45840, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.440402 }, { "epoch": 1.9641403538837239, "grad_norm": 5.223865032196045, "learning_rate": 6.651826339642243e-05, "loss": 2.681183433532715, "memory(GiB)": 77.56, "step": 45845, "token_acc": 0.4316546762589928, "train_speed(iter/s)": 1.440386 }, { "epoch": 1.964354569212973, "grad_norm": 5.4697794914245605, "learning_rate": 6.651191132925612e-05, "loss": 2.288090133666992, "memory(GiB)": 77.56, "step": 45850, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.440419 }, { "epoch": 1.9645687845422217, "grad_norm": 4.830199718475342, "learning_rate": 6.650555896296104e-05, "loss": 2.339397430419922, "memory(GiB)": 77.56, "step": 45855, "token_acc": 0.5042016806722689, "train_speed(iter/s)": 1.440422 }, { "epoch": 1.9647829998714708, "grad_norm": 5.951051712036133, "learning_rate": 6.649920629765229e-05, "loss": 2.4759355545043946, "memory(GiB)": 77.56, "step": 45860, "token_acc": 0.45925925925925926, "train_speed(iter/s)": 1.440462 }, { "epoch": 1.9649972152007198, "grad_norm": 5.6724677085876465, "learning_rate": 6.649285333344492e-05, "loss": 2.5050195693969726, "memory(GiB)": 77.56, "step": 45865, "token_acc": 0.46963562753036436, "train_speed(iter/s)": 1.440456 }, { "epoch": 1.9652114305299686, "grad_norm": 5.25377893447876, "learning_rate": 6.648650007045407e-05, "loss": 2.4837507247924804, "memory(GiB)": 77.56, "step": 45870, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.440458 }, { "epoch": 1.9654256458592176, "grad_norm": 5.423166751861572, "learning_rate": 6.648014650879479e-05, "loss": 2.893759346008301, "memory(GiB)": 77.56, "step": 45875, "token_acc": 0.4607142857142857, "train_speed(iter/s)": 1.440504 }, { "epoch": 1.9656398611884667, "grad_norm": 4.335208892822266, "learning_rate": 6.64737926485822e-05, "loss": 2.390877532958984, "memory(GiB)": 77.56, "step": 45880, "token_acc": 0.527972027972028, "train_speed(iter/s)": 1.440481 }, { "epoch": 1.9658540765177155, "grad_norm": 4.426405429840088, "learning_rate": 6.646743848993139e-05, "loss": 2.643740653991699, "memory(GiB)": 77.56, "step": 45885, "token_acc": 0.4417910447761194, "train_speed(iter/s)": 1.440457 }, { "epoch": 1.9660682918469645, "grad_norm": 5.99837589263916, "learning_rate": 6.64610840329575e-05, "loss": 2.7230987548828125, "memory(GiB)": 77.56, "step": 45890, "token_acc": 0.4153846153846154, "train_speed(iter/s)": 1.440498 }, { "epoch": 1.9662825071762136, "grad_norm": 6.710351943969727, "learning_rate": 6.64547292777756e-05, "loss": 2.7149799346923826, "memory(GiB)": 77.56, "step": 45895, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.440515 }, { "epoch": 1.9664967225054624, "grad_norm": 5.052811145782471, "learning_rate": 6.644837422450087e-05, "loss": 2.3122949600219727, "memory(GiB)": 77.56, "step": 45900, "token_acc": 0.48120300751879697, "train_speed(iter/s)": 1.440496 }, { "epoch": 1.9667109378347114, "grad_norm": 6.387402057647705, "learning_rate": 6.644201887324841e-05, "loss": 2.6619235992431642, "memory(GiB)": 77.56, "step": 45905, "token_acc": 0.423728813559322, "train_speed(iter/s)": 1.440487 }, { "epoch": 1.9669251531639604, "grad_norm": 6.813206672668457, "learning_rate": 6.643566322413336e-05, "loss": 2.4210391998291017, "memory(GiB)": 77.56, "step": 45910, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.440483 }, { "epoch": 1.9671393684932093, "grad_norm": 6.530525207519531, "learning_rate": 6.642930727727085e-05, "loss": 2.380424690246582, "memory(GiB)": 77.56, "step": 45915, "token_acc": 0.4796747967479675, "train_speed(iter/s)": 1.440503 }, { "epoch": 1.9673535838224583, "grad_norm": 4.187199592590332, "learning_rate": 6.642295103277602e-05, "loss": 2.4211254119873047, "memory(GiB)": 77.56, "step": 45920, "token_acc": 0.4774011299435028, "train_speed(iter/s)": 1.440496 }, { "epoch": 1.9675677991517073, "grad_norm": 4.4053955078125, "learning_rate": 6.641659449076401e-05, "loss": 2.7990097045898437, "memory(GiB)": 77.56, "step": 45925, "token_acc": 0.4608433734939759, "train_speed(iter/s)": 1.440497 }, { "epoch": 1.9677820144809561, "grad_norm": 4.436799049377441, "learning_rate": 6.641023765135001e-05, "loss": 2.544133758544922, "memory(GiB)": 77.56, "step": 45930, "token_acc": 0.46607669616519176, "train_speed(iter/s)": 1.440473 }, { "epoch": 1.9679962298102052, "grad_norm": 4.896063327789307, "learning_rate": 6.640388051464917e-05, "loss": 2.408317756652832, "memory(GiB)": 77.56, "step": 45935, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.440482 }, { "epoch": 1.9682104451394542, "grad_norm": 4.4914469718933105, "learning_rate": 6.639752308077662e-05, "loss": 2.615721321105957, "memory(GiB)": 77.56, "step": 45940, "token_acc": 0.47560975609756095, "train_speed(iter/s)": 1.440503 }, { "epoch": 1.968424660468703, "grad_norm": 6.709591865539551, "learning_rate": 6.639116534984758e-05, "loss": 2.486857604980469, "memory(GiB)": 77.56, "step": 45945, "token_acc": 0.4632352941176471, "train_speed(iter/s)": 1.440523 }, { "epoch": 1.968638875797952, "grad_norm": 4.1377973556518555, "learning_rate": 6.63848073219772e-05, "loss": 2.4100162506103517, "memory(GiB)": 77.56, "step": 45950, "token_acc": 0.43859649122807015, "train_speed(iter/s)": 1.440494 }, { "epoch": 1.968853091127201, "grad_norm": 4.820364952087402, "learning_rate": 6.637844899728065e-05, "loss": 2.901133728027344, "memory(GiB)": 77.56, "step": 45955, "token_acc": 0.44280442804428044, "train_speed(iter/s)": 1.440497 }, { "epoch": 1.96906730645645, "grad_norm": 4.0486907958984375, "learning_rate": 6.637209037587315e-05, "loss": 2.6329875946044923, "memory(GiB)": 77.56, "step": 45960, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.440504 }, { "epoch": 1.969281521785699, "grad_norm": 5.785236835479736, "learning_rate": 6.636573145786987e-05, "loss": 2.518662452697754, "memory(GiB)": 77.56, "step": 45965, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.440511 }, { "epoch": 1.969495737114948, "grad_norm": 5.312324047088623, "learning_rate": 6.635937224338601e-05, "loss": 2.66689453125, "memory(GiB)": 77.56, "step": 45970, "token_acc": 0.43234323432343236, "train_speed(iter/s)": 1.440531 }, { "epoch": 1.9697099524441968, "grad_norm": 5.460351467132568, "learning_rate": 6.635301273253678e-05, "loss": 2.398164749145508, "memory(GiB)": 77.56, "step": 45975, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.969924167773446, "grad_norm": 9.177876472473145, "learning_rate": 6.634665292543737e-05, "loss": 2.5278142929077148, "memory(GiB)": 77.56, "step": 45980, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.4405 }, { "epoch": 1.9701383831026948, "grad_norm": 4.011397361755371, "learning_rate": 6.634029282220303e-05, "loss": 2.349286651611328, "memory(GiB)": 77.56, "step": 45985, "token_acc": 0.5300353356890459, "train_speed(iter/s)": 1.440511 }, { "epoch": 1.9703525984319437, "grad_norm": 4.624520778656006, "learning_rate": 6.633393242294894e-05, "loss": 2.2925954818725587, "memory(GiB)": 77.56, "step": 45990, "token_acc": 0.5050505050505051, "train_speed(iter/s)": 1.440517 }, { "epoch": 1.970566813761193, "grad_norm": 6.456453800201416, "learning_rate": 6.632757172779034e-05, "loss": 2.5786991119384766, "memory(GiB)": 77.56, "step": 45995, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.440515 }, { "epoch": 1.9707810290904417, "grad_norm": 4.135759353637695, "learning_rate": 6.632121073684246e-05, "loss": 2.1136852264404298, "memory(GiB)": 77.56, "step": 46000, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.440504 }, { "epoch": 1.9707810290904417, "eval_loss": 2.287435531616211, "eval_runtime": 15.0359, "eval_samples_per_second": 6.651, "eval_steps_per_second": 6.651, "eval_token_acc": 0.4641509433962264, "step": 46000 }, { "epoch": 1.9709952444196905, "grad_norm": 5.086953639984131, "learning_rate": 6.631484945022055e-05, "loss": 2.330047035217285, "memory(GiB)": 77.56, "step": 46005, "token_acc": 0.4670433145009416, "train_speed(iter/s)": 1.439799 }, { "epoch": 1.9712094597489398, "grad_norm": 5.451361656188965, "learning_rate": 6.630848786803983e-05, "loss": 2.3972442626953123, "memory(GiB)": 77.56, "step": 46010, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.439792 }, { "epoch": 1.9714236750781886, "grad_norm": 4.9217987060546875, "learning_rate": 6.630212599041556e-05, "loss": 2.8109764099121093, "memory(GiB)": 77.56, "step": 46015, "token_acc": 0.4186746987951807, "train_speed(iter/s)": 1.439835 }, { "epoch": 1.9716378904074374, "grad_norm": 7.312126636505127, "learning_rate": 6.629576381746299e-05, "loss": 2.473282051086426, "memory(GiB)": 77.56, "step": 46020, "token_acc": 0.43636363636363634, "train_speed(iter/s)": 1.439861 }, { "epoch": 1.9718521057366867, "grad_norm": 4.677431583404541, "learning_rate": 6.628940134929736e-05, "loss": 2.624419403076172, "memory(GiB)": 77.56, "step": 46025, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.439862 }, { "epoch": 1.9720663210659355, "grad_norm": 4.504249572753906, "learning_rate": 6.628303858603395e-05, "loss": 2.371323013305664, "memory(GiB)": 77.56, "step": 46030, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.439853 }, { "epoch": 1.9722805363951843, "grad_norm": 6.488310813903809, "learning_rate": 6.627667552778802e-05, "loss": 2.247083282470703, "memory(GiB)": 77.56, "step": 46035, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.439868 }, { "epoch": 1.9724947517244336, "grad_norm": 5.536812782287598, "learning_rate": 6.627031217467484e-05, "loss": 2.5022125244140625, "memory(GiB)": 77.56, "step": 46040, "token_acc": 0.4641509433962264, "train_speed(iter/s)": 1.439881 }, { "epoch": 1.9727089670536824, "grad_norm": 4.00999116897583, "learning_rate": 6.626394852680968e-05, "loss": 2.453029441833496, "memory(GiB)": 77.56, "step": 46045, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.43988 }, { "epoch": 1.9729231823829312, "grad_norm": 6.29418420791626, "learning_rate": 6.625758458430785e-05, "loss": 2.559286880493164, "memory(GiB)": 77.56, "step": 46050, "token_acc": 0.486013986013986, "train_speed(iter/s)": 1.439873 }, { "epoch": 1.9731373977121804, "grad_norm": 5.518522262573242, "learning_rate": 6.625122034728463e-05, "loss": 2.4911861419677734, "memory(GiB)": 77.56, "step": 46055, "token_acc": 0.4746376811594203, "train_speed(iter/s)": 1.439881 }, { "epoch": 1.9733516130414293, "grad_norm": 4.570826530456543, "learning_rate": 6.624485581585531e-05, "loss": 2.5234804153442383, "memory(GiB)": 77.56, "step": 46060, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.439864 }, { "epoch": 1.973565828370678, "grad_norm": 4.213891506195068, "learning_rate": 6.623849099013517e-05, "loss": 2.6506858825683595, "memory(GiB)": 77.56, "step": 46065, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.439861 }, { "epoch": 1.9737800436999273, "grad_norm": 4.141619682312012, "learning_rate": 6.623212587023955e-05, "loss": 2.401409149169922, "memory(GiB)": 77.56, "step": 46070, "token_acc": 0.5220588235294118, "train_speed(iter/s)": 1.439875 }, { "epoch": 1.9739942590291761, "grad_norm": 4.475203990936279, "learning_rate": 6.622576045628374e-05, "loss": 2.4911773681640623, "memory(GiB)": 77.56, "step": 46075, "token_acc": 0.44280442804428044, "train_speed(iter/s)": 1.439902 }, { "epoch": 1.974208474358425, "grad_norm": 5.4392523765563965, "learning_rate": 6.621939474838304e-05, "loss": 2.2502946853637695, "memory(GiB)": 77.56, "step": 46080, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 1.439919 }, { "epoch": 1.9744226896876742, "grad_norm": 5.93950891494751, "learning_rate": 6.621302874665282e-05, "loss": 2.4226354598999023, "memory(GiB)": 77.56, "step": 46085, "token_acc": 0.4921135646687697, "train_speed(iter/s)": 1.439942 }, { "epoch": 1.974636905016923, "grad_norm": 4.538694858551025, "learning_rate": 6.620666245120836e-05, "loss": 2.8146881103515624, "memory(GiB)": 77.56, "step": 46090, "token_acc": 0.44360902255639095, "train_speed(iter/s)": 1.439967 }, { "epoch": 1.9748511203461718, "grad_norm": 4.988295078277588, "learning_rate": 6.620029586216499e-05, "loss": 2.610568618774414, "memory(GiB)": 77.56, "step": 46095, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.439988 }, { "epoch": 1.975065335675421, "grad_norm": 6.159144401550293, "learning_rate": 6.61939289796381e-05, "loss": 2.6839128494262696, "memory(GiB)": 77.56, "step": 46100, "token_acc": 0.4308176100628931, "train_speed(iter/s)": 1.440016 }, { "epoch": 1.97527955100467, "grad_norm": 4.465036392211914, "learning_rate": 6.618756180374295e-05, "loss": 2.7416873931884767, "memory(GiB)": 77.56, "step": 46105, "token_acc": 0.4560260586319218, "train_speed(iter/s)": 1.440026 }, { "epoch": 1.9754937663339187, "grad_norm": 5.041161060333252, "learning_rate": 6.618119433459496e-05, "loss": 2.0445932388305663, "memory(GiB)": 77.56, "step": 46110, "token_acc": 0.53125, "train_speed(iter/s)": 1.440032 }, { "epoch": 1.975707981663168, "grad_norm": 5.3165507316589355, "learning_rate": 6.617482657230945e-05, "loss": 2.613577461242676, "memory(GiB)": 77.56, "step": 46115, "token_acc": 0.4276094276094276, "train_speed(iter/s)": 1.44003 }, { "epoch": 1.9759221969924168, "grad_norm": 5.273176670074463, "learning_rate": 6.61684585170018e-05, "loss": 2.156299591064453, "memory(GiB)": 77.56, "step": 46120, "token_acc": 0.5330739299610895, "train_speed(iter/s)": 1.440046 }, { "epoch": 1.9761364123216656, "grad_norm": 4.864031791687012, "learning_rate": 6.616209016878733e-05, "loss": 2.547792434692383, "memory(GiB)": 77.56, "step": 46125, "token_acc": 0.46254071661237783, "train_speed(iter/s)": 1.440065 }, { "epoch": 1.9763506276509148, "grad_norm": 5.276404857635498, "learning_rate": 6.615572152778147e-05, "loss": 2.778274154663086, "memory(GiB)": 77.56, "step": 46130, "token_acc": 0.4494047619047619, "train_speed(iter/s)": 1.440042 }, { "epoch": 1.9765648429801637, "grad_norm": 4.446139335632324, "learning_rate": 6.614935259409954e-05, "loss": 2.7039743423461915, "memory(GiB)": 77.56, "step": 46135, "token_acc": 0.4421052631578947, "train_speed(iter/s)": 1.440036 }, { "epoch": 1.9767790583094125, "grad_norm": 4.300412654876709, "learning_rate": 6.614298336785692e-05, "loss": 2.3138809204101562, "memory(GiB)": 77.56, "step": 46140, "token_acc": 0.5193548387096775, "train_speed(iter/s)": 1.440062 }, { "epoch": 1.9769932736386617, "grad_norm": 6.478123664855957, "learning_rate": 6.613661384916902e-05, "loss": 2.727131652832031, "memory(GiB)": 77.56, "step": 46145, "token_acc": 0.4520547945205479, "train_speed(iter/s)": 1.440065 }, { "epoch": 1.9772074889679105, "grad_norm": 5.043158531188965, "learning_rate": 6.613024403815125e-05, "loss": 2.651078224182129, "memory(GiB)": 77.56, "step": 46150, "token_acc": 0.4276315789473684, "train_speed(iter/s)": 1.440081 }, { "epoch": 1.9774217042971594, "grad_norm": 5.35750675201416, "learning_rate": 6.612387393491894e-05, "loss": 2.6064208984375, "memory(GiB)": 77.56, "step": 46155, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.440057 }, { "epoch": 1.9776359196264086, "grad_norm": 5.122040748596191, "learning_rate": 6.611750353958757e-05, "loss": 2.5336179733276367, "memory(GiB)": 77.56, "step": 46160, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.440087 }, { "epoch": 1.9778501349556574, "grad_norm": 4.25455379486084, "learning_rate": 6.611113285227247e-05, "loss": 2.698091506958008, "memory(GiB)": 77.56, "step": 46165, "token_acc": 0.4523076923076923, "train_speed(iter/s)": 1.440104 }, { "epoch": 1.9780643502849062, "grad_norm": 5.582869052886963, "learning_rate": 6.610476187308909e-05, "loss": 2.8125026702880858, "memory(GiB)": 77.56, "step": 46170, "token_acc": 0.43034055727554177, "train_speed(iter/s)": 1.440137 }, { "epoch": 1.9782785656141555, "grad_norm": 4.765628814697266, "learning_rate": 6.609839060215287e-05, "loss": 2.7479183197021486, "memory(GiB)": 77.56, "step": 46175, "token_acc": 0.4307228915662651, "train_speed(iter/s)": 1.440108 }, { "epoch": 1.9784927809434043, "grad_norm": 4.945771217346191, "learning_rate": 6.609201903957916e-05, "loss": 2.6540557861328127, "memory(GiB)": 77.56, "step": 46180, "token_acc": 0.44412607449856734, "train_speed(iter/s)": 1.440084 }, { "epoch": 1.9787069962726531, "grad_norm": 5.673392295837402, "learning_rate": 6.608564718548344e-05, "loss": 2.469636344909668, "memory(GiB)": 77.56, "step": 46185, "token_acc": 0.4881889763779528, "train_speed(iter/s)": 1.440104 }, { "epoch": 1.9789212116019024, "grad_norm": 4.77620792388916, "learning_rate": 6.607927503998116e-05, "loss": 2.4937023162841796, "memory(GiB)": 77.56, "step": 46190, "token_acc": 0.4980237154150198, "train_speed(iter/s)": 1.440085 }, { "epoch": 1.9791354269311512, "grad_norm": 4.5593976974487305, "learning_rate": 6.607290260318771e-05, "loss": 2.2752307891845702, "memory(GiB)": 77.56, "step": 46195, "token_acc": 0.5369649805447471, "train_speed(iter/s)": 1.440107 }, { "epoch": 1.9793496422604, "grad_norm": 5.7640533447265625, "learning_rate": 6.606652987521855e-05, "loss": 3.032878303527832, "memory(GiB)": 77.56, "step": 46200, "token_acc": 0.42748091603053434, "train_speed(iter/s)": 1.44013 }, { "epoch": 1.9795638575896493, "grad_norm": 6.39599609375, "learning_rate": 6.606015685618913e-05, "loss": 2.6303007125854494, "memory(GiB)": 77.56, "step": 46205, "token_acc": 0.4423076923076923, "train_speed(iter/s)": 1.440143 }, { "epoch": 1.979778072918898, "grad_norm": 4.649415493011475, "learning_rate": 6.60537835462149e-05, "loss": 2.34720401763916, "memory(GiB)": 77.56, "step": 46210, "token_acc": 0.5198863636363636, "train_speed(iter/s)": 1.440158 }, { "epoch": 1.9799922882481469, "grad_norm": 10.375418663024902, "learning_rate": 6.604740994541133e-05, "loss": 2.4568550109863283, "memory(GiB)": 77.56, "step": 46215, "token_acc": 0.5275229357798165, "train_speed(iter/s)": 1.440164 }, { "epoch": 1.9802065035773961, "grad_norm": 6.978662014007568, "learning_rate": 6.604103605389387e-05, "loss": 2.3641246795654296, "memory(GiB)": 77.56, "step": 46220, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.440145 }, { "epoch": 1.980420718906645, "grad_norm": 4.827588081359863, "learning_rate": 6.603466187177799e-05, "loss": 2.5591302871704102, "memory(GiB)": 77.56, "step": 46225, "token_acc": 0.5137614678899083, "train_speed(iter/s)": 1.440178 }, { "epoch": 1.9806349342358938, "grad_norm": 5.898464679718018, "learning_rate": 6.602828739917919e-05, "loss": 2.604831314086914, "memory(GiB)": 77.56, "step": 46230, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.440205 }, { "epoch": 1.980849149565143, "grad_norm": 6.191739082336426, "learning_rate": 6.602191263621291e-05, "loss": 2.2673355102539063, "memory(GiB)": 77.56, "step": 46235, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 1.440233 }, { "epoch": 1.9810633648943918, "grad_norm": 5.841427803039551, "learning_rate": 6.601553758299465e-05, "loss": 2.1395694732666017, "memory(GiB)": 77.56, "step": 46240, "token_acc": 0.5375, "train_speed(iter/s)": 1.440255 }, { "epoch": 1.9812775802236406, "grad_norm": 5.003666400909424, "learning_rate": 6.60091622396399e-05, "loss": 2.442990303039551, "memory(GiB)": 77.56, "step": 46245, "token_acc": 0.4542483660130719, "train_speed(iter/s)": 1.44028 }, { "epoch": 1.98149179555289, "grad_norm": 5.166260719299316, "learning_rate": 6.600278660626418e-05, "loss": 2.660910224914551, "memory(GiB)": 77.56, "step": 46250, "token_acc": 0.434640522875817, "train_speed(iter/s)": 1.440311 }, { "epoch": 1.9817060108821387, "grad_norm": 3.8498947620391846, "learning_rate": 6.599641068298295e-05, "loss": 2.4890729904174806, "memory(GiB)": 77.56, "step": 46255, "token_acc": 0.5055762081784386, "train_speed(iter/s)": 1.440317 }, { "epoch": 1.9819202262113875, "grad_norm": 5.755703926086426, "learning_rate": 6.599003446991174e-05, "loss": 2.2098081588745115, "memory(GiB)": 77.56, "step": 46260, "token_acc": 0.5633802816901409, "train_speed(iter/s)": 1.440349 }, { "epoch": 1.9821344415406368, "grad_norm": 4.347459316253662, "learning_rate": 6.598365796716606e-05, "loss": 2.7876510620117188, "memory(GiB)": 77.56, "step": 46265, "token_acc": 0.46439628482972134, "train_speed(iter/s)": 1.440376 }, { "epoch": 1.9823486568698856, "grad_norm": 4.963770866394043, "learning_rate": 6.597728117486141e-05, "loss": 2.793642044067383, "memory(GiB)": 77.56, "step": 46270, "token_acc": 0.42452830188679247, "train_speed(iter/s)": 1.440392 }, { "epoch": 1.9825628721991344, "grad_norm": 4.950620174407959, "learning_rate": 6.597090409311334e-05, "loss": 2.7416378021240235, "memory(GiB)": 77.56, "step": 46275, "token_acc": 0.46443514644351463, "train_speed(iter/s)": 1.440389 }, { "epoch": 1.9827770875283837, "grad_norm": 4.158836364746094, "learning_rate": 6.596452672203736e-05, "loss": 2.542458915710449, "memory(GiB)": 77.56, "step": 46280, "token_acc": 0.5375375375375375, "train_speed(iter/s)": 1.440383 }, { "epoch": 1.9829913028576325, "grad_norm": 4.593012809753418, "learning_rate": 6.595814906174898e-05, "loss": 2.7064571380615234, "memory(GiB)": 77.56, "step": 46285, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.440391 }, { "epoch": 1.9832055181868813, "grad_norm": 5.085115909576416, "learning_rate": 6.595177111236377e-05, "loss": 2.5841949462890623, "memory(GiB)": 77.56, "step": 46290, "token_acc": 0.49795918367346936, "train_speed(iter/s)": 1.440381 }, { "epoch": 1.9834197335161305, "grad_norm": 5.26007604598999, "learning_rate": 6.594539287399727e-05, "loss": 2.5472856521606446, "memory(GiB)": 77.56, "step": 46295, "token_acc": 0.47904191616766467, "train_speed(iter/s)": 1.440422 }, { "epoch": 1.9836339488453794, "grad_norm": 3.1587576866149902, "learning_rate": 6.593901434676502e-05, "loss": 2.3253494262695313, "memory(GiB)": 77.56, "step": 46300, "token_acc": 0.49, "train_speed(iter/s)": 1.440427 }, { "epoch": 1.9838481641746282, "grad_norm": 5.255431175231934, "learning_rate": 6.593263553078257e-05, "loss": 2.7006940841674805, "memory(GiB)": 77.56, "step": 46305, "token_acc": 0.4557377049180328, "train_speed(iter/s)": 1.440448 }, { "epoch": 1.9840623795038774, "grad_norm": 4.174441337585449, "learning_rate": 6.592625642616546e-05, "loss": 2.4668142318725588, "memory(GiB)": 77.56, "step": 46310, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.44044 }, { "epoch": 1.9842765948331262, "grad_norm": 4.879791736602783, "learning_rate": 6.591987703302931e-05, "loss": 2.3057186126708986, "memory(GiB)": 77.56, "step": 46315, "token_acc": 0.5371900826446281, "train_speed(iter/s)": 1.440457 }, { "epoch": 1.9844908101623753, "grad_norm": 5.802064895629883, "learning_rate": 6.591349735148964e-05, "loss": 2.4005640029907225, "memory(GiB)": 77.56, "step": 46320, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.440468 }, { "epoch": 1.9847050254916243, "grad_norm": 4.675039291381836, "learning_rate": 6.590711738166204e-05, "loss": 2.950305938720703, "memory(GiB)": 77.56, "step": 46325, "token_acc": 0.4539877300613497, "train_speed(iter/s)": 1.440488 }, { "epoch": 1.9849192408208731, "grad_norm": 5.377086639404297, "learning_rate": 6.590073712366207e-05, "loss": 2.6158636093139647, "memory(GiB)": 77.56, "step": 46330, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.440499 }, { "epoch": 1.9851334561501222, "grad_norm": 5.898637294769287, "learning_rate": 6.589435657760535e-05, "loss": 2.5784420013427733, "memory(GiB)": 77.56, "step": 46335, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.440519 }, { "epoch": 1.9853476714793712, "grad_norm": 3.5971996784210205, "learning_rate": 6.588797574360744e-05, "loss": 2.2648946762084963, "memory(GiB)": 77.56, "step": 46340, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.440515 }, { "epoch": 1.98556188680862, "grad_norm": 5.021591663360596, "learning_rate": 6.588159462178396e-05, "loss": 2.5986915588378907, "memory(GiB)": 77.56, "step": 46345, "token_acc": 0.4472843450479233, "train_speed(iter/s)": 1.44049 }, { "epoch": 1.985776102137869, "grad_norm": 5.158310890197754, "learning_rate": 6.587521321225048e-05, "loss": 2.4552642822265627, "memory(GiB)": 77.56, "step": 46350, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.440495 }, { "epoch": 1.985990317467118, "grad_norm": 3.8151841163635254, "learning_rate": 6.586883151512263e-05, "loss": 2.422525405883789, "memory(GiB)": 77.56, "step": 46355, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.440463 }, { "epoch": 1.9862045327963669, "grad_norm": 4.906080722808838, "learning_rate": 6.586244953051602e-05, "loss": 2.466356086730957, "memory(GiB)": 77.56, "step": 46360, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.440479 }, { "epoch": 1.986418748125616, "grad_norm": 5.610723495483398, "learning_rate": 6.585606725854623e-05, "loss": 2.7259952545166017, "memory(GiB)": 77.56, "step": 46365, "token_acc": 0.4128113879003559, "train_speed(iter/s)": 1.440505 }, { "epoch": 1.986632963454865, "grad_norm": 6.600398540496826, "learning_rate": 6.58496846993289e-05, "loss": 2.4627485275268555, "memory(GiB)": 77.56, "step": 46370, "token_acc": 0.45110410094637227, "train_speed(iter/s)": 1.440499 }, { "epoch": 1.9868471787841138, "grad_norm": 4.842108726501465, "learning_rate": 6.58433018529797e-05, "loss": 2.661446762084961, "memory(GiB)": 77.56, "step": 46375, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.4405 }, { "epoch": 1.9870613941133628, "grad_norm": 4.814848899841309, "learning_rate": 6.583691871961422e-05, "loss": 2.904281997680664, "memory(GiB)": 77.56, "step": 46380, "token_acc": 0.44672131147540983, "train_speed(iter/s)": 1.440488 }, { "epoch": 1.9872756094426118, "grad_norm": 4.1047282218933105, "learning_rate": 6.583053529934807e-05, "loss": 2.713669776916504, "memory(GiB)": 77.56, "step": 46385, "token_acc": 0.45733788395904434, "train_speed(iter/s)": 1.440481 }, { "epoch": 1.9874898247718606, "grad_norm": 7.023497104644775, "learning_rate": 6.582415159229695e-05, "loss": 2.583456039428711, "memory(GiB)": 77.56, "step": 46390, "token_acc": 0.4880239520958084, "train_speed(iter/s)": 1.440512 }, { "epoch": 1.9877040401011097, "grad_norm": 5.183415412902832, "learning_rate": 6.581776759857647e-05, "loss": 2.384836769104004, "memory(GiB)": 77.56, "step": 46395, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.440503 }, { "epoch": 1.9879182554303587, "grad_norm": 4.711422920227051, "learning_rate": 6.58113833183023e-05, "loss": 2.1625141143798827, "memory(GiB)": 77.56, "step": 46400, "token_acc": 0.5254777070063694, "train_speed(iter/s)": 1.440517 }, { "epoch": 1.9881324707596075, "grad_norm": 4.9224138259887695, "learning_rate": 6.580499875159008e-05, "loss": 2.6314754486083984, "memory(GiB)": 77.56, "step": 46405, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.440526 }, { "epoch": 1.9883466860888566, "grad_norm": 4.872269630432129, "learning_rate": 6.579861389855548e-05, "loss": 2.784394454956055, "memory(GiB)": 77.56, "step": 46410, "token_acc": 0.43209876543209874, "train_speed(iter/s)": 1.440507 }, { "epoch": 1.9885609014181056, "grad_norm": 5.029921054840088, "learning_rate": 6.579222875931417e-05, "loss": 2.4003747940063476, "memory(GiB)": 77.56, "step": 46415, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.440521 }, { "epoch": 1.9887751167473544, "grad_norm": 5.879994869232178, "learning_rate": 6.578584333398184e-05, "loss": 2.546307373046875, "memory(GiB)": 77.56, "step": 46420, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.440541 }, { "epoch": 1.9889893320766034, "grad_norm": 4.8974385261535645, "learning_rate": 6.577945762267412e-05, "loss": 2.4312274932861326, "memory(GiB)": 77.56, "step": 46425, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.440506 }, { "epoch": 1.9892035474058525, "grad_norm": 5.272725582122803, "learning_rate": 6.577307162550675e-05, "loss": 2.1904491424560546, "memory(GiB)": 77.56, "step": 46430, "token_acc": 0.5209790209790209, "train_speed(iter/s)": 1.440513 }, { "epoch": 1.9894177627351013, "grad_norm": 5.513241767883301, "learning_rate": 6.576668534259536e-05, "loss": 2.6153263092041015, "memory(GiB)": 77.56, "step": 46435, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.44049 }, { "epoch": 1.9896319780643503, "grad_norm": 5.99629020690918, "learning_rate": 6.57602987740557e-05, "loss": 2.6848064422607423, "memory(GiB)": 77.56, "step": 46440, "token_acc": 0.42805755395683454, "train_speed(iter/s)": 1.440497 }, { "epoch": 1.9898461933935994, "grad_norm": 5.065510272979736, "learning_rate": 6.575391192000342e-05, "loss": 2.973910331726074, "memory(GiB)": 77.56, "step": 46445, "token_acc": 0.4341317365269461, "train_speed(iter/s)": 1.440501 }, { "epoch": 1.9900604087228482, "grad_norm": 6.09493350982666, "learning_rate": 6.574752478055428e-05, "loss": 2.3773401260375975, "memory(GiB)": 77.56, "step": 46450, "token_acc": 0.4528985507246377, "train_speed(iter/s)": 1.440502 }, { "epoch": 1.9902746240520972, "grad_norm": 4.089011192321777, "learning_rate": 6.574113735582393e-05, "loss": 2.4532121658325194, "memory(GiB)": 77.56, "step": 46455, "token_acc": 0.4596774193548387, "train_speed(iter/s)": 1.440466 }, { "epoch": 1.9904888393813462, "grad_norm": 4.827455043792725, "learning_rate": 6.573474964592813e-05, "loss": 2.708393859863281, "memory(GiB)": 77.56, "step": 46460, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.440492 }, { "epoch": 1.990703054710595, "grad_norm": 4.492869853973389, "learning_rate": 6.572836165098258e-05, "loss": 2.262235641479492, "memory(GiB)": 77.56, "step": 46465, "token_acc": 0.5585585585585585, "train_speed(iter/s)": 1.440494 }, { "epoch": 1.990917270039844, "grad_norm": 5.896934509277344, "learning_rate": 6.572197337110299e-05, "loss": 2.6777055740356444, "memory(GiB)": 77.56, "step": 46470, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.440486 }, { "epoch": 1.9911314853690931, "grad_norm": 4.937989234924316, "learning_rate": 6.57155848064051e-05, "loss": 2.379093360900879, "memory(GiB)": 77.56, "step": 46475, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.440516 }, { "epoch": 1.991345700698342, "grad_norm": 4.8116536140441895, "learning_rate": 6.570919595700467e-05, "loss": 2.436779022216797, "memory(GiB)": 77.56, "step": 46480, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.440512 }, { "epoch": 1.991559916027591, "grad_norm": 4.81565523147583, "learning_rate": 6.570280682301739e-05, "loss": 2.193369674682617, "memory(GiB)": 77.56, "step": 46485, "token_acc": 0.5375939849624061, "train_speed(iter/s)": 1.440544 }, { "epoch": 1.99177413135684, "grad_norm": 5.239287853240967, "learning_rate": 6.569641740455905e-05, "loss": 2.5726573944091795, "memory(GiB)": 77.56, "step": 46490, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.440547 }, { "epoch": 1.9919883466860888, "grad_norm": 4.939355850219727, "learning_rate": 6.569002770174539e-05, "loss": 2.665151596069336, "memory(GiB)": 77.56, "step": 46495, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 1.440543 }, { "epoch": 1.9922025620153379, "grad_norm": 5.027363300323486, "learning_rate": 6.568363771469214e-05, "loss": 2.6751869201660154, "memory(GiB)": 77.56, "step": 46500, "token_acc": 0.45121951219512196, "train_speed(iter/s)": 1.440558 }, { "epoch": 1.9922025620153379, "eval_loss": 2.4219417572021484, "eval_runtime": 14.8868, "eval_samples_per_second": 6.717, "eval_steps_per_second": 6.717, "eval_token_acc": 0.4422843256379101, "step": 46500 }, { "epoch": 1.9924167773445869, "grad_norm": 5.112144470214844, "learning_rate": 6.567724744351512e-05, "loss": 2.4460798263549806, "memory(GiB)": 77.56, "step": 46505, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.439871 }, { "epoch": 1.9926309926738357, "grad_norm": 6.163054466247559, "learning_rate": 6.567085688833001e-05, "loss": 2.5283781051635743, "memory(GiB)": 77.56, "step": 46510, "token_acc": 0.4952681388012618, "train_speed(iter/s)": 1.439868 }, { "epoch": 1.9928452080030847, "grad_norm": 5.4156060218811035, "learning_rate": 6.566446604925264e-05, "loss": 2.5319400787353517, "memory(GiB)": 77.56, "step": 46515, "token_acc": 0.471875, "train_speed(iter/s)": 1.439869 }, { "epoch": 1.9930594233323338, "grad_norm": 5.574679374694824, "learning_rate": 6.56580749263988e-05, "loss": 2.220624542236328, "memory(GiB)": 77.56, "step": 46520, "token_acc": 0.5, "train_speed(iter/s)": 1.439875 }, { "epoch": 1.9932736386615826, "grad_norm": 3.827199935913086, "learning_rate": 6.565168351988422e-05, "loss": 2.7762617111206054, "memory(GiB)": 77.56, "step": 46525, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.439861 }, { "epoch": 1.9934878539908316, "grad_norm": 4.552662372589111, "learning_rate": 6.564529182982471e-05, "loss": 2.3779298782348635, "memory(GiB)": 77.56, "step": 46530, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.439852 }, { "epoch": 1.9937020693200806, "grad_norm": 6.504910469055176, "learning_rate": 6.563889985633608e-05, "loss": 2.4095617294311524, "memory(GiB)": 77.56, "step": 46535, "token_acc": 0.48360655737704916, "train_speed(iter/s)": 1.439831 }, { "epoch": 1.9939162846493295, "grad_norm": 6.695801734924316, "learning_rate": 6.56325075995341e-05, "loss": 2.5215282440185547, "memory(GiB)": 77.56, "step": 46540, "token_acc": 0.460431654676259, "train_speed(iter/s)": 1.439846 }, { "epoch": 1.9941304999785785, "grad_norm": 5.770375728607178, "learning_rate": 6.562611505953457e-05, "loss": 2.6792497634887695, "memory(GiB)": 77.56, "step": 46545, "token_acc": 0.44569288389513106, "train_speed(iter/s)": 1.439812 }, { "epoch": 1.9943447153078275, "grad_norm": 7.105743408203125, "learning_rate": 6.561972223645331e-05, "loss": 2.4988286972045897, "memory(GiB)": 77.56, "step": 46550, "token_acc": 0.5143769968051118, "train_speed(iter/s)": 1.439842 }, { "epoch": 1.9945589306370763, "grad_norm": 5.650620460510254, "learning_rate": 6.561332913040613e-05, "loss": 2.7292051315307617, "memory(GiB)": 77.56, "step": 46555, "token_acc": 0.45692883895131087, "train_speed(iter/s)": 1.439828 }, { "epoch": 1.9947731459663254, "grad_norm": 7.187072277069092, "learning_rate": 6.560693574150885e-05, "loss": 2.366291046142578, "memory(GiB)": 77.56, "step": 46560, "token_acc": 0.47876447876447875, "train_speed(iter/s)": 1.439812 }, { "epoch": 1.9949873612955744, "grad_norm": 5.516972541809082, "learning_rate": 6.560054206987729e-05, "loss": 2.5241676330566407, "memory(GiB)": 77.56, "step": 46565, "token_acc": 0.4506578947368421, "train_speed(iter/s)": 1.439795 }, { "epoch": 1.9952015766248232, "grad_norm": 3.9918289184570312, "learning_rate": 6.559414811562728e-05, "loss": 2.5528985977172853, "memory(GiB)": 77.56, "step": 46570, "token_acc": 0.44047619047619047, "train_speed(iter/s)": 1.439792 }, { "epoch": 1.9954157919540723, "grad_norm": 8.8845796585083, "learning_rate": 6.558775387887463e-05, "loss": 2.7382272720336913, "memory(GiB)": 77.56, "step": 46575, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.439833 }, { "epoch": 1.9956300072833213, "grad_norm": 5.179216384887695, "learning_rate": 6.558135935973521e-05, "loss": 2.7244104385375976, "memory(GiB)": 77.56, "step": 46580, "token_acc": 0.44015444015444016, "train_speed(iter/s)": 1.439867 }, { "epoch": 1.99584422261257, "grad_norm": 5.242931365966797, "learning_rate": 6.557496455832485e-05, "loss": 2.298200798034668, "memory(GiB)": 77.56, "step": 46585, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.439901 }, { "epoch": 1.9960584379418191, "grad_norm": 5.2678542137146, "learning_rate": 6.556856947475937e-05, "loss": 2.557473373413086, "memory(GiB)": 77.56, "step": 46590, "token_acc": 0.42024539877300615, "train_speed(iter/s)": 1.439922 }, { "epoch": 1.9962726532710682, "grad_norm": 5.593785285949707, "learning_rate": 6.55621741091547e-05, "loss": 2.929859733581543, "memory(GiB)": 77.56, "step": 46595, "token_acc": 0.40522875816993464, "train_speed(iter/s)": 1.439924 }, { "epoch": 1.996486868600317, "grad_norm": 5.375649929046631, "learning_rate": 6.55557784616266e-05, "loss": 2.4373357772827147, "memory(GiB)": 77.56, "step": 46600, "token_acc": 0.48942598187311176, "train_speed(iter/s)": 1.439965 }, { "epoch": 1.996701083929566, "grad_norm": 3.787369728088379, "learning_rate": 6.554938253229102e-05, "loss": 2.5138132095336916, "memory(GiB)": 77.56, "step": 46605, "token_acc": 0.4627831715210356, "train_speed(iter/s)": 1.439975 }, { "epoch": 1.996915299258815, "grad_norm": 5.040591239929199, "learning_rate": 6.554298632126376e-05, "loss": 2.2719184875488283, "memory(GiB)": 77.56, "step": 46610, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.439991 }, { "epoch": 1.9971295145880639, "grad_norm": 4.640641689300537, "learning_rate": 6.553658982866073e-05, "loss": 2.4173145294189453, "memory(GiB)": 77.56, "step": 46615, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.440022 }, { "epoch": 1.997343729917313, "grad_norm": 5.445117950439453, "learning_rate": 6.55301930545978e-05, "loss": 2.614707183837891, "memory(GiB)": 77.56, "step": 46620, "token_acc": 0.46405228758169936, "train_speed(iter/s)": 1.440057 }, { "epoch": 1.997557945246562, "grad_norm": 4.248864650726318, "learning_rate": 6.552379599919087e-05, "loss": 2.593882751464844, "memory(GiB)": 77.56, "step": 46625, "token_acc": 0.46757679180887374, "train_speed(iter/s)": 1.440062 }, { "epoch": 1.9977721605758108, "grad_norm": 4.769877910614014, "learning_rate": 6.551739866255578e-05, "loss": 2.4515695571899414, "memory(GiB)": 77.56, "step": 46630, "token_acc": 0.5101214574898786, "train_speed(iter/s)": 1.440088 }, { "epoch": 1.9979863759050598, "grad_norm": 6.324184894561768, "learning_rate": 6.551100104480849e-05, "loss": 2.6700746536254885, "memory(GiB)": 77.56, "step": 46635, "token_acc": 0.41423948220064727, "train_speed(iter/s)": 1.440113 }, { "epoch": 1.9982005912343088, "grad_norm": 4.573021411895752, "learning_rate": 6.550460314606486e-05, "loss": 2.392468070983887, "memory(GiB)": 77.56, "step": 46640, "token_acc": 0.49185667752442996, "train_speed(iter/s)": 1.440117 }, { "epoch": 1.9984148065635576, "grad_norm": 4.98002290725708, "learning_rate": 6.549820496644078e-05, "loss": 2.700870323181152, "memory(GiB)": 77.56, "step": 46645, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.440129 }, { "epoch": 1.9986290218928067, "grad_norm": 4.0348405838012695, "learning_rate": 6.54918065060522e-05, "loss": 2.7478660583496093, "memory(GiB)": 77.56, "step": 46650, "token_acc": 0.49, "train_speed(iter/s)": 1.440095 }, { "epoch": 1.9988432372220557, "grad_norm": 5.0965704917907715, "learning_rate": 6.5485407765015e-05, "loss": 2.3058755874633787, "memory(GiB)": 77.56, "step": 46655, "token_acc": 0.5399239543726235, "train_speed(iter/s)": 1.440079 }, { "epoch": 1.9990574525513045, "grad_norm": 4.812645435333252, "learning_rate": 6.54790087434451e-05, "loss": 2.449415588378906, "memory(GiB)": 77.56, "step": 46660, "token_acc": 0.4641638225255973, "train_speed(iter/s)": 1.440069 }, { "epoch": 1.9992716678805535, "grad_norm": 6.063690185546875, "learning_rate": 6.547260944145845e-05, "loss": 2.7162961959838867, "memory(GiB)": 77.56, "step": 46665, "token_acc": 0.4388059701492537, "train_speed(iter/s)": 1.440062 }, { "epoch": 1.9994858832098026, "grad_norm": 5.127791404724121, "learning_rate": 6.546620985917097e-05, "loss": 2.6050086975097657, "memory(GiB)": 77.56, "step": 46670, "token_acc": 0.45896656534954405, "train_speed(iter/s)": 1.440071 }, { "epoch": 1.9997000985390514, "grad_norm": 3.860870599746704, "learning_rate": 6.545980999669859e-05, "loss": 2.809811019897461, "memory(GiB)": 77.56, "step": 46675, "token_acc": 0.4745222929936306, "train_speed(iter/s)": 1.440085 }, { "epoch": 1.9999143138683004, "grad_norm": 6.640521049499512, "learning_rate": 6.545340985415726e-05, "loss": 2.9207460403442385, "memory(GiB)": 77.56, "step": 46680, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.440069 }, { "epoch": 2.0001285291975495, "grad_norm": 4.379486560821533, "learning_rate": 6.54470094316629e-05, "loss": 2.554909896850586, "memory(GiB)": 77.56, "step": 46685, "token_acc": 0.4921875, "train_speed(iter/s)": 1.440098 }, { "epoch": 2.0003427445267983, "grad_norm": 6.2013163566589355, "learning_rate": 6.544060872933146e-05, "loss": 2.7635061264038088, "memory(GiB)": 77.56, "step": 46690, "token_acc": 0.44039735099337746, "train_speed(iter/s)": 1.440079 }, { "epoch": 2.000556959856047, "grad_norm": 5.286303520202637, "learning_rate": 6.543420774727892e-05, "loss": 2.4695693969726564, "memory(GiB)": 77.56, "step": 46695, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.44007 }, { "epoch": 2.0007711751852963, "grad_norm": 4.1759138107299805, "learning_rate": 6.542780648562124e-05, "loss": 2.6401100158691406, "memory(GiB)": 77.56, "step": 46700, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.440058 }, { "epoch": 2.000985390514545, "grad_norm": 4.321265697479248, "learning_rate": 6.542140494447435e-05, "loss": 2.462251091003418, "memory(GiB)": 77.56, "step": 46705, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.44008 }, { "epoch": 2.001199605843794, "grad_norm": 4.87285852432251, "learning_rate": 6.541500312395427e-05, "loss": 2.2234607696533204, "memory(GiB)": 77.56, "step": 46710, "token_acc": 0.5137254901960784, "train_speed(iter/s)": 1.440097 }, { "epoch": 2.0014138211730432, "grad_norm": 4.688236236572266, "learning_rate": 6.540860102417693e-05, "loss": 2.3051342010498046, "memory(GiB)": 77.56, "step": 46715, "token_acc": 0.5362903225806451, "train_speed(iter/s)": 1.440146 }, { "epoch": 2.001628036502292, "grad_norm": 4.412413120269775, "learning_rate": 6.540219864525835e-05, "loss": 2.5773540496826173, "memory(GiB)": 77.56, "step": 46720, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.440168 }, { "epoch": 2.001842251831541, "grad_norm": 6.4903950691223145, "learning_rate": 6.539579598731447e-05, "loss": 2.61322021484375, "memory(GiB)": 77.56, "step": 46725, "token_acc": 0.4879518072289157, "train_speed(iter/s)": 1.440158 }, { "epoch": 2.00205646716079, "grad_norm": 5.339513778686523, "learning_rate": 6.53893930504613e-05, "loss": 2.4109384536743166, "memory(GiB)": 77.56, "step": 46730, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.44018 }, { "epoch": 2.002270682490039, "grad_norm": 4.855136394500732, "learning_rate": 6.538298983481484e-05, "loss": 2.5464282989501954, "memory(GiB)": 77.56, "step": 46735, "token_acc": 0.4307692307692308, "train_speed(iter/s)": 1.44017 }, { "epoch": 2.0024848978192877, "grad_norm": 6.4732255935668945, "learning_rate": 6.53765863404911e-05, "loss": 2.2858535766601564, "memory(GiB)": 77.56, "step": 46740, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.440203 }, { "epoch": 2.002699113148537, "grad_norm": 5.127806186676025, "learning_rate": 6.537018256760606e-05, "loss": 2.6056758880615236, "memory(GiB)": 77.56, "step": 46745, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.440238 }, { "epoch": 2.002913328477786, "grad_norm": 5.070985317230225, "learning_rate": 6.536377851627577e-05, "loss": 2.4653038024902343, "memory(GiB)": 77.56, "step": 46750, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 1.440261 }, { "epoch": 2.0031275438070346, "grad_norm": 5.667211055755615, "learning_rate": 6.53573741866162e-05, "loss": 2.7156482696533204, "memory(GiB)": 77.56, "step": 46755, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.440316 }, { "epoch": 2.003341759136284, "grad_norm": 4.165454864501953, "learning_rate": 6.53509695787434e-05, "loss": 2.5176471710205077, "memory(GiB)": 77.56, "step": 46760, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.44034 }, { "epoch": 2.0035559744655327, "grad_norm": 5.048638820648193, "learning_rate": 6.534456469277337e-05, "loss": 2.4855140686035155, "memory(GiB)": 77.56, "step": 46765, "token_acc": 0.4766081871345029, "train_speed(iter/s)": 1.440328 }, { "epoch": 2.0037701897947815, "grad_norm": 4.935610771179199, "learning_rate": 6.533815952882216e-05, "loss": 2.1477176666259767, "memory(GiB)": 77.56, "step": 46770, "token_acc": 0.5325670498084292, "train_speed(iter/s)": 1.440306 }, { "epoch": 2.0039844051240308, "grad_norm": 4.180975914001465, "learning_rate": 6.53317540870058e-05, "loss": 2.447718620300293, "memory(GiB)": 77.56, "step": 46775, "token_acc": 0.5029940119760479, "train_speed(iter/s)": 1.440293 }, { "epoch": 2.0041986204532796, "grad_norm": 5.904799938201904, "learning_rate": 6.532534836744035e-05, "loss": 2.5206777572631838, "memory(GiB)": 77.56, "step": 46780, "token_acc": 0.4713375796178344, "train_speed(iter/s)": 1.440279 }, { "epoch": 2.0044128357825284, "grad_norm": 6.350618839263916, "learning_rate": 6.531894237024183e-05, "loss": 2.4447546005249023, "memory(GiB)": 77.56, "step": 46785, "token_acc": 0.5221843003412969, "train_speed(iter/s)": 1.44029 }, { "epoch": 2.0046270511117776, "grad_norm": 4.7659831047058105, "learning_rate": 6.53125360955263e-05, "loss": 2.087943267822266, "memory(GiB)": 77.56, "step": 46790, "token_acc": 0.5512367491166078, "train_speed(iter/s)": 1.44033 }, { "epoch": 2.0048412664410264, "grad_norm": 5.327122211456299, "learning_rate": 6.530612954340981e-05, "loss": 2.4332603454589843, "memory(GiB)": 77.56, "step": 46795, "token_acc": 0.5141955835962145, "train_speed(iter/s)": 1.440345 }, { "epoch": 2.0050554817702753, "grad_norm": 4.535036563873291, "learning_rate": 6.529972271400844e-05, "loss": 2.4609336853027344, "memory(GiB)": 77.56, "step": 46800, "token_acc": 0.49079754601226994, "train_speed(iter/s)": 1.440373 }, { "epoch": 2.0052696970995245, "grad_norm": 5.071470260620117, "learning_rate": 6.529331560743821e-05, "loss": 2.5745059967041017, "memory(GiB)": 77.56, "step": 46805, "token_acc": 0.46438746438746437, "train_speed(iter/s)": 1.44034 }, { "epoch": 2.0054839124287733, "grad_norm": 5.618152141571045, "learning_rate": 6.528690822381523e-05, "loss": 2.6603822708129883, "memory(GiB)": 77.56, "step": 46810, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.440361 }, { "epoch": 2.0056981277580226, "grad_norm": 5.464741230010986, "learning_rate": 6.528050056325558e-05, "loss": 2.4364561080932616, "memory(GiB)": 77.56, "step": 46815, "token_acc": 0.4564459930313589, "train_speed(iter/s)": 1.440396 }, { "epoch": 2.0059123430872714, "grad_norm": 5.506270408630371, "learning_rate": 6.527409262587533e-05, "loss": 2.46496639251709, "memory(GiB)": 77.56, "step": 46820, "token_acc": 0.50625, "train_speed(iter/s)": 1.440398 }, { "epoch": 2.00612655841652, "grad_norm": 7.7618560791015625, "learning_rate": 6.526768441179056e-05, "loss": 2.516340446472168, "memory(GiB)": 77.56, "step": 46825, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.440392 }, { "epoch": 2.0063407737457695, "grad_norm": 4.279303550720215, "learning_rate": 6.526127592111737e-05, "loss": 2.6124666213989256, "memory(GiB)": 77.56, "step": 46830, "token_acc": 0.45821325648414984, "train_speed(iter/s)": 1.440416 }, { "epoch": 2.0065549890750183, "grad_norm": 6.336496829986572, "learning_rate": 6.525486715397183e-05, "loss": 2.3455944061279297, "memory(GiB)": 77.56, "step": 46835, "token_acc": 0.5381818181818182, "train_speed(iter/s)": 1.440421 }, { "epoch": 2.006769204404267, "grad_norm": 5.114360809326172, "learning_rate": 6.524845811047009e-05, "loss": 2.3283687591552735, "memory(GiB)": 77.56, "step": 46840, "token_acc": 0.5179282868525896, "train_speed(iter/s)": 1.440416 }, { "epoch": 2.0069834197335163, "grad_norm": 6.016843795776367, "learning_rate": 6.52420487907282e-05, "loss": 2.154891014099121, "memory(GiB)": 77.56, "step": 46845, "token_acc": 0.5276752767527675, "train_speed(iter/s)": 1.440417 }, { "epoch": 2.007197635062765, "grad_norm": 4.609485626220703, "learning_rate": 6.52356391948623e-05, "loss": 2.2795175552368163, "memory(GiB)": 77.56, "step": 46850, "token_acc": 0.5444839857651246, "train_speed(iter/s)": 1.44042 }, { "epoch": 2.007411850392014, "grad_norm": 6.186790943145752, "learning_rate": 6.522922932298852e-05, "loss": 2.7255102157592774, "memory(GiB)": 77.56, "step": 46855, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.44045 }, { "epoch": 2.0076260657212632, "grad_norm": 4.760570526123047, "learning_rate": 6.522281917522295e-05, "loss": 2.4536609649658203, "memory(GiB)": 77.56, "step": 46860, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.440465 }, { "epoch": 2.007840281050512, "grad_norm": 4.162820816040039, "learning_rate": 6.521640875168175e-05, "loss": 2.3813400268554688, "memory(GiB)": 77.56, "step": 46865, "token_acc": 0.4959785522788204, "train_speed(iter/s)": 1.440492 }, { "epoch": 2.008054496379761, "grad_norm": 6.800650596618652, "learning_rate": 6.520999805248102e-05, "loss": 2.5655967712402346, "memory(GiB)": 77.56, "step": 46870, "token_acc": 0.4810126582278481, "train_speed(iter/s)": 1.440518 }, { "epoch": 2.00826871170901, "grad_norm": 6.280663967132568, "learning_rate": 6.520358707773691e-05, "loss": 2.3437000274658204, "memory(GiB)": 77.56, "step": 46875, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.440473 }, { "epoch": 2.008482927038259, "grad_norm": 5.3110432624816895, "learning_rate": 6.519717582756554e-05, "loss": 2.357120323181152, "memory(GiB)": 77.56, "step": 46880, "token_acc": 0.503125, "train_speed(iter/s)": 1.440469 }, { "epoch": 2.0086971423675077, "grad_norm": 3.271852731704712, "learning_rate": 6.519076430208308e-05, "loss": 2.3375322341918947, "memory(GiB)": 77.56, "step": 46885, "token_acc": 0.5, "train_speed(iter/s)": 1.440502 }, { "epoch": 2.008911357696757, "grad_norm": 5.670036792755127, "learning_rate": 6.518435250140569e-05, "loss": 2.4166229248046873, "memory(GiB)": 77.56, "step": 46890, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.440503 }, { "epoch": 2.009125573026006, "grad_norm": 4.191525459289551, "learning_rate": 6.517794042564951e-05, "loss": 2.3124170303344727, "memory(GiB)": 77.56, "step": 46895, "token_acc": 0.5255972696245734, "train_speed(iter/s)": 1.440523 }, { "epoch": 2.0093397883552546, "grad_norm": 5.3819708824157715, "learning_rate": 6.51715280749307e-05, "loss": 2.7740522384643556, "memory(GiB)": 77.56, "step": 46900, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.440492 }, { "epoch": 2.009554003684504, "grad_norm": 5.581313610076904, "learning_rate": 6.516511544936542e-05, "loss": 2.2341598510742187, "memory(GiB)": 77.56, "step": 46905, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.440451 }, { "epoch": 2.0097682190137527, "grad_norm": 5.02405309677124, "learning_rate": 6.515870254906984e-05, "loss": 2.3327259063720702, "memory(GiB)": 77.56, "step": 46910, "token_acc": 0.4720496894409938, "train_speed(iter/s)": 1.440477 }, { "epoch": 2.0099824343430015, "grad_norm": 4.830881595611572, "learning_rate": 6.515228937416015e-05, "loss": 2.210921287536621, "memory(GiB)": 77.56, "step": 46915, "token_acc": 0.5255474452554745, "train_speed(iter/s)": 1.440443 }, { "epoch": 2.0101966496722508, "grad_norm": 5.542373180389404, "learning_rate": 6.514587592475252e-05, "loss": 2.493315315246582, "memory(GiB)": 77.56, "step": 46920, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.440466 }, { "epoch": 2.0104108650014996, "grad_norm": 6.550668239593506, "learning_rate": 6.513946220096316e-05, "loss": 2.410256195068359, "memory(GiB)": 77.56, "step": 46925, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.440476 }, { "epoch": 2.0106250803307484, "grad_norm": 5.8648457527160645, "learning_rate": 6.513304820290822e-05, "loss": 2.301552581787109, "memory(GiB)": 77.56, "step": 46930, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.440436 }, { "epoch": 2.0108392956599976, "grad_norm": 5.672832012176514, "learning_rate": 6.512663393070392e-05, "loss": 2.343380355834961, "memory(GiB)": 77.56, "step": 46935, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.44045 }, { "epoch": 2.0110535109892465, "grad_norm": 5.217822074890137, "learning_rate": 6.512021938446646e-05, "loss": 2.420007514953613, "memory(GiB)": 77.56, "step": 46940, "token_acc": 0.46747967479674796, "train_speed(iter/s)": 1.440478 }, { "epoch": 2.0112677263184953, "grad_norm": 4.892837047576904, "learning_rate": 6.511380456431204e-05, "loss": 2.4698442459106444, "memory(GiB)": 77.56, "step": 46945, "token_acc": 0.47904191616766467, "train_speed(iter/s)": 1.44047 }, { "epoch": 2.0114819416477445, "grad_norm": 4.919934272766113, "learning_rate": 6.510738947035687e-05, "loss": 2.6515804290771485, "memory(GiB)": 77.56, "step": 46950, "token_acc": 0.4727272727272727, "train_speed(iter/s)": 1.440483 }, { "epoch": 2.0116961569769933, "grad_norm": 6.32347297668457, "learning_rate": 6.510097410271717e-05, "loss": 2.543160820007324, "memory(GiB)": 77.56, "step": 46955, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.440459 }, { "epoch": 2.011910372306242, "grad_norm": 5.247535705566406, "learning_rate": 6.509455846150916e-05, "loss": 2.0123329162597656, "memory(GiB)": 77.56, "step": 46960, "token_acc": 0.5529411764705883, "train_speed(iter/s)": 1.440479 }, { "epoch": 2.0121245876354914, "grad_norm": 3.964775323867798, "learning_rate": 6.508814254684906e-05, "loss": 2.4224483489990236, "memory(GiB)": 77.56, "step": 46965, "token_acc": 0.46048109965635736, "train_speed(iter/s)": 1.440466 }, { "epoch": 2.01233880296474, "grad_norm": 7.648169994354248, "learning_rate": 6.50817263588531e-05, "loss": 2.4166791915893553, "memory(GiB)": 77.56, "step": 46970, "token_acc": 0.5158730158730159, "train_speed(iter/s)": 1.440471 }, { "epoch": 2.012553018293989, "grad_norm": 4.846028804779053, "learning_rate": 6.507530989763754e-05, "loss": 2.4553701400756838, "memory(GiB)": 77.56, "step": 46975, "token_acc": 0.42378048780487804, "train_speed(iter/s)": 1.440472 }, { "epoch": 2.0127672336232383, "grad_norm": 7.003790855407715, "learning_rate": 6.506889316331858e-05, "loss": 2.522560882568359, "memory(GiB)": 77.56, "step": 46980, "token_acc": 0.48046875, "train_speed(iter/s)": 1.440494 }, { "epoch": 2.012981448952487, "grad_norm": 5.9567060470581055, "learning_rate": 6.50624761560125e-05, "loss": 2.231857681274414, "memory(GiB)": 77.56, "step": 46985, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.440499 }, { "epoch": 2.013195664281736, "grad_norm": 4.712182521820068, "learning_rate": 6.505605887583552e-05, "loss": 2.4869033813476564, "memory(GiB)": 77.56, "step": 46990, "token_acc": 0.46557377049180326, "train_speed(iter/s)": 1.440436 }, { "epoch": 2.013409879610985, "grad_norm": 4.36808967590332, "learning_rate": 6.50496413229039e-05, "loss": 2.0867469787597654, "memory(GiB)": 77.56, "step": 46995, "token_acc": 0.5632183908045977, "train_speed(iter/s)": 1.440444 }, { "epoch": 2.013624094940234, "grad_norm": 5.934143543243408, "learning_rate": 6.504322349733393e-05, "loss": 2.640213203430176, "memory(GiB)": 77.56, "step": 47000, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.440445 }, { "epoch": 2.013624094940234, "eval_loss": 2.2116146087646484, "eval_runtime": 14.3282, "eval_samples_per_second": 6.979, "eval_steps_per_second": 6.979, "eval_token_acc": 0.4889807162534435, "step": 47000 }, { "epoch": 2.013838310269483, "grad_norm": 4.928823947906494, "learning_rate": 6.503680539924184e-05, "loss": 2.2399688720703126, "memory(GiB)": 77.56, "step": 47005, "token_acc": 0.49753208292201384, "train_speed(iter/s)": 1.439758 }, { "epoch": 2.014052525598732, "grad_norm": 5.141942977905273, "learning_rate": 6.503038702874394e-05, "loss": 2.4522708892822265, "memory(GiB)": 77.56, "step": 47010, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.439704 }, { "epoch": 2.014266740927981, "grad_norm": 5.53627872467041, "learning_rate": 6.502396838595646e-05, "loss": 2.5175106048583986, "memory(GiB)": 77.56, "step": 47015, "token_acc": 0.4674922600619195, "train_speed(iter/s)": 1.439738 }, { "epoch": 2.0144809562572297, "grad_norm": 7.602707862854004, "learning_rate": 6.50175494709957e-05, "loss": 2.413856315612793, "memory(GiB)": 77.56, "step": 47020, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.43971 }, { "epoch": 2.014695171586479, "grad_norm": 5.674644470214844, "learning_rate": 6.501113028397793e-05, "loss": 2.4417697906494142, "memory(GiB)": 77.56, "step": 47025, "token_acc": 0.5259259259259259, "train_speed(iter/s)": 1.439694 }, { "epoch": 2.0149093869157277, "grad_norm": 5.88431978225708, "learning_rate": 6.500471082501948e-05, "loss": 2.6259002685546875, "memory(GiB)": 77.56, "step": 47030, "token_acc": 0.4152823920265781, "train_speed(iter/s)": 1.439696 }, { "epoch": 2.0151236022449766, "grad_norm": 7.305669784545898, "learning_rate": 6.49982910942366e-05, "loss": 2.230219268798828, "memory(GiB)": 77.56, "step": 47035, "token_acc": 0.4891640866873065, "train_speed(iter/s)": 1.439662 }, { "epoch": 2.015337817574226, "grad_norm": 4.789058685302734, "learning_rate": 6.499187109174561e-05, "loss": 2.5457658767700195, "memory(GiB)": 77.56, "step": 47040, "token_acc": 0.496875, "train_speed(iter/s)": 1.439626 }, { "epoch": 2.0155520329034746, "grad_norm": 4.733511924743652, "learning_rate": 6.498545081766282e-05, "loss": 2.266896438598633, "memory(GiB)": 77.56, "step": 47045, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.439608 }, { "epoch": 2.0157662482327234, "grad_norm": 3.543917417526245, "learning_rate": 6.497903027210453e-05, "loss": 2.7028133392333986, "memory(GiB)": 77.56, "step": 47050, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.439643 }, { "epoch": 2.0159804635619727, "grad_norm": 5.878790855407715, "learning_rate": 6.497260945518706e-05, "loss": 2.4566299438476564, "memory(GiB)": 77.56, "step": 47055, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.439678 }, { "epoch": 2.0161946788912215, "grad_norm": 4.004664897918701, "learning_rate": 6.496618836702672e-05, "loss": 2.1462165832519533, "memory(GiB)": 77.56, "step": 47060, "token_acc": 0.5, "train_speed(iter/s)": 1.439684 }, { "epoch": 2.0164088942204703, "grad_norm": 4.003391265869141, "learning_rate": 6.49597670077398e-05, "loss": 2.233612823486328, "memory(GiB)": 77.56, "step": 47065, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.439638 }, { "epoch": 2.0166231095497196, "grad_norm": 4.659119606018066, "learning_rate": 6.495334537744271e-05, "loss": 2.246729850769043, "memory(GiB)": 77.56, "step": 47070, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.439648 }, { "epoch": 2.0168373248789684, "grad_norm": 5.30164098739624, "learning_rate": 6.494692347625174e-05, "loss": 2.328721618652344, "memory(GiB)": 77.56, "step": 47075, "token_acc": 0.5606694560669456, "train_speed(iter/s)": 1.439615 }, { "epoch": 2.017051540208217, "grad_norm": 6.037069320678711, "learning_rate": 6.494050130428323e-05, "loss": 2.4070837020874025, "memory(GiB)": 77.56, "step": 47080, "token_acc": 0.477124183006536, "train_speed(iter/s)": 1.439617 }, { "epoch": 2.0172657555374665, "grad_norm": 4.738967418670654, "learning_rate": 6.493407886165351e-05, "loss": 2.786213684082031, "memory(GiB)": 77.56, "step": 47085, "token_acc": 0.4673202614379085, "train_speed(iter/s)": 1.439654 }, { "epoch": 2.0174799708667153, "grad_norm": 6.387697219848633, "learning_rate": 6.492765614847896e-05, "loss": 2.393421173095703, "memory(GiB)": 77.56, "step": 47090, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.439688 }, { "epoch": 2.017694186195964, "grad_norm": 6.092655658721924, "learning_rate": 6.492123316487589e-05, "loss": 2.5422796249389648, "memory(GiB)": 77.56, "step": 47095, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.439684 }, { "epoch": 2.0179084015252133, "grad_norm": 5.014275550842285, "learning_rate": 6.49148099109607e-05, "loss": 2.3574142456054688, "memory(GiB)": 77.56, "step": 47100, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.439658 }, { "epoch": 2.018122616854462, "grad_norm": 5.569258689880371, "learning_rate": 6.490838638684973e-05, "loss": 2.5010326385498045, "memory(GiB)": 77.56, "step": 47105, "token_acc": 0.44565217391304346, "train_speed(iter/s)": 1.439626 }, { "epoch": 2.018336832183711, "grad_norm": 4.812034606933594, "learning_rate": 6.490196259265934e-05, "loss": 2.405223846435547, "memory(GiB)": 77.56, "step": 47110, "token_acc": 0.46439628482972134, "train_speed(iter/s)": 1.43962 }, { "epoch": 2.01855104751296, "grad_norm": 5.988837242126465, "learning_rate": 6.489553852850596e-05, "loss": 2.651731491088867, "memory(GiB)": 77.56, "step": 47115, "token_acc": 0.4536741214057508, "train_speed(iter/s)": 1.43963 }, { "epoch": 2.018765262842209, "grad_norm": 4.13277006149292, "learning_rate": 6.488911419450591e-05, "loss": 2.352150726318359, "memory(GiB)": 77.56, "step": 47120, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 1.439639 }, { "epoch": 2.018979478171458, "grad_norm": 5.49845552444458, "learning_rate": 6.488268959077557e-05, "loss": 2.3887655258178713, "memory(GiB)": 77.56, "step": 47125, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.439638 }, { "epoch": 2.019193693500707, "grad_norm": 6.266018867492676, "learning_rate": 6.487626471743137e-05, "loss": 2.502766418457031, "memory(GiB)": 77.56, "step": 47130, "token_acc": 0.4479166666666667, "train_speed(iter/s)": 1.439647 }, { "epoch": 2.019407908829956, "grad_norm": 4.618208408355713, "learning_rate": 6.486983957458967e-05, "loss": 2.315196228027344, "memory(GiB)": 77.56, "step": 47135, "token_acc": 0.5, "train_speed(iter/s)": 1.439657 }, { "epoch": 2.0196221241592047, "grad_norm": 4.724485397338867, "learning_rate": 6.486341416236687e-05, "loss": 2.527619743347168, "memory(GiB)": 77.56, "step": 47140, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.439693 }, { "epoch": 2.019836339488454, "grad_norm": 6.78851318359375, "learning_rate": 6.485698848087939e-05, "loss": 2.1528024673461914, "memory(GiB)": 77.56, "step": 47145, "token_acc": 0.552901023890785, "train_speed(iter/s)": 1.439742 }, { "epoch": 2.020050554817703, "grad_norm": 6.4735188484191895, "learning_rate": 6.485056253024362e-05, "loss": 2.553872299194336, "memory(GiB)": 77.56, "step": 47150, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.439741 }, { "epoch": 2.0202647701469516, "grad_norm": 4.260812282562256, "learning_rate": 6.484413631057599e-05, "loss": 2.3631893157958985, "memory(GiB)": 77.56, "step": 47155, "token_acc": 0.5, "train_speed(iter/s)": 1.439731 }, { "epoch": 2.020478985476201, "grad_norm": 4.051983833312988, "learning_rate": 6.48377098219929e-05, "loss": 2.3516508102416993, "memory(GiB)": 77.56, "step": 47160, "token_acc": 0.5168195718654435, "train_speed(iter/s)": 1.439717 }, { "epoch": 2.0206932008054497, "grad_norm": 4.177536964416504, "learning_rate": 6.483128306461078e-05, "loss": 2.4834449768066404, "memory(GiB)": 77.56, "step": 47165, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.439707 }, { "epoch": 2.0209074161346985, "grad_norm": 5.729572296142578, "learning_rate": 6.482485603854604e-05, "loss": 2.5365907669067385, "memory(GiB)": 77.56, "step": 47170, "token_acc": 0.451505016722408, "train_speed(iter/s)": 1.439704 }, { "epoch": 2.0211216314639477, "grad_norm": 10.399534225463867, "learning_rate": 6.481842874391514e-05, "loss": 2.585718536376953, "memory(GiB)": 77.56, "step": 47175, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.439709 }, { "epoch": 2.0213358467931966, "grad_norm": 6.627134323120117, "learning_rate": 6.481200118083449e-05, "loss": 2.3237384796142577, "memory(GiB)": 77.56, "step": 47180, "token_acc": 0.48264984227129337, "train_speed(iter/s)": 1.439725 }, { "epoch": 2.0215500621224454, "grad_norm": 4.1366167068481445, "learning_rate": 6.480557334942055e-05, "loss": 2.5460943222045898, "memory(GiB)": 77.56, "step": 47185, "token_acc": 0.4517133956386293, "train_speed(iter/s)": 1.439753 }, { "epoch": 2.0217642774516946, "grad_norm": 7.300133228302002, "learning_rate": 6.479914524978978e-05, "loss": 2.6619863510131836, "memory(GiB)": 77.56, "step": 47190, "token_acc": 0.44224422442244227, "train_speed(iter/s)": 1.439769 }, { "epoch": 2.0219784927809434, "grad_norm": 5.019187927246094, "learning_rate": 6.479271688205858e-05, "loss": 2.5283918380737305, "memory(GiB)": 77.56, "step": 47195, "token_acc": 0.509090909090909, "train_speed(iter/s)": 1.439776 }, { "epoch": 2.0221927081101922, "grad_norm": 5.677464485168457, "learning_rate": 6.478628824634346e-05, "loss": 2.6112157821655275, "memory(GiB)": 77.56, "step": 47200, "token_acc": 0.49382716049382713, "train_speed(iter/s)": 1.439766 }, { "epoch": 2.0224069234394415, "grad_norm": 3.9665565490722656, "learning_rate": 6.477985934276085e-05, "loss": 2.4552179336547852, "memory(GiB)": 77.56, "step": 47205, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 1.439774 }, { "epoch": 2.0226211387686903, "grad_norm": 6.540243625640869, "learning_rate": 6.477343017142722e-05, "loss": 2.491834259033203, "memory(GiB)": 77.56, "step": 47210, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.439777 }, { "epoch": 2.022835354097939, "grad_norm": 7.2349467277526855, "learning_rate": 6.476700073245904e-05, "loss": 1.9539417266845702, "memory(GiB)": 77.56, "step": 47215, "token_acc": 0.5700934579439252, "train_speed(iter/s)": 1.439774 }, { "epoch": 2.0230495694271884, "grad_norm": 4.291737079620361, "learning_rate": 6.47605710259728e-05, "loss": 2.1844161987304687, "memory(GiB)": 77.56, "step": 47220, "token_acc": 0.535593220338983, "train_speed(iter/s)": 1.439792 }, { "epoch": 2.023263784756437, "grad_norm": 4.990181922912598, "learning_rate": 6.475414105208497e-05, "loss": 2.6633190155029296, "memory(GiB)": 77.56, "step": 47225, "token_acc": 0.45132743362831856, "train_speed(iter/s)": 1.43978 }, { "epoch": 2.023478000085686, "grad_norm": 4.123158931732178, "learning_rate": 6.474771081091202e-05, "loss": 2.2368452072143556, "memory(GiB)": 77.56, "step": 47230, "token_acc": 0.5031645569620253, "train_speed(iter/s)": 1.439784 }, { "epoch": 2.0236922154149353, "grad_norm": 5.408724308013916, "learning_rate": 6.474128030257047e-05, "loss": 2.443004608154297, "memory(GiB)": 77.56, "step": 47235, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.439821 }, { "epoch": 2.023906430744184, "grad_norm": 6.242146015167236, "learning_rate": 6.473484952717679e-05, "loss": 2.7575626373291016, "memory(GiB)": 77.56, "step": 47240, "token_acc": 0.4542483660130719, "train_speed(iter/s)": 1.439843 }, { "epoch": 2.024120646073433, "grad_norm": 5.340526103973389, "learning_rate": 6.472841848484749e-05, "loss": 2.4209598541259765, "memory(GiB)": 77.56, "step": 47245, "token_acc": 0.4778761061946903, "train_speed(iter/s)": 1.439856 }, { "epoch": 2.024334861402682, "grad_norm": 5.6754608154296875, "learning_rate": 6.472198717569907e-05, "loss": 2.383496856689453, "memory(GiB)": 77.56, "step": 47250, "token_acc": 0.48598130841121495, "train_speed(iter/s)": 1.439918 }, { "epoch": 2.024549076731931, "grad_norm": 6.373967170715332, "learning_rate": 6.471555559984804e-05, "loss": 2.695840835571289, "memory(GiB)": 77.56, "step": 47255, "token_acc": 0.4409722222222222, "train_speed(iter/s)": 1.439945 }, { "epoch": 2.0247632920611798, "grad_norm": 4.937572479248047, "learning_rate": 6.470912375741093e-05, "loss": 2.2277603149414062, "memory(GiB)": 77.56, "step": 47260, "token_acc": 0.5059880239520959, "train_speed(iter/s)": 1.43994 }, { "epoch": 2.024977507390429, "grad_norm": 3.9886159896850586, "learning_rate": 6.470269164850424e-05, "loss": 2.7752742767333984, "memory(GiB)": 77.56, "step": 47265, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.439963 }, { "epoch": 2.025191722719678, "grad_norm": 5.3319220542907715, "learning_rate": 6.469625927324448e-05, "loss": 2.64019775390625, "memory(GiB)": 77.56, "step": 47270, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.439975 }, { "epoch": 2.0254059380489267, "grad_norm": 7.283320903778076, "learning_rate": 6.468982663174822e-05, "loss": 2.2063600540161135, "memory(GiB)": 77.56, "step": 47275, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.440025 }, { "epoch": 2.025620153378176, "grad_norm": 4.875926971435547, "learning_rate": 6.468339372413196e-05, "loss": 2.5367393493652344, "memory(GiB)": 77.56, "step": 47280, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.440015 }, { "epoch": 2.0258343687074247, "grad_norm": 5.891599655151367, "learning_rate": 6.467696055051225e-05, "loss": 2.403213882446289, "memory(GiB)": 77.56, "step": 47285, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.440022 }, { "epoch": 2.0260485840366735, "grad_norm": 6.47664213180542, "learning_rate": 6.467052711100563e-05, "loss": 2.37048225402832, "memory(GiB)": 77.56, "step": 47290, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.439986 }, { "epoch": 2.026262799365923, "grad_norm": 6.752597808837891, "learning_rate": 6.466409340572864e-05, "loss": 2.0826969146728516, "memory(GiB)": 77.56, "step": 47295, "token_acc": 0.5498154981549815, "train_speed(iter/s)": 1.439992 }, { "epoch": 2.0264770146951716, "grad_norm": 5.42634916305542, "learning_rate": 6.465765943479785e-05, "loss": 2.737038803100586, "memory(GiB)": 77.56, "step": 47300, "token_acc": 0.4340277777777778, "train_speed(iter/s)": 1.440002 }, { "epoch": 2.0266912300244204, "grad_norm": 4.91648006439209, "learning_rate": 6.465122519832982e-05, "loss": 2.392622375488281, "memory(GiB)": 77.56, "step": 47305, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.440012 }, { "epoch": 2.0269054453536697, "grad_norm": 5.535400867462158, "learning_rate": 6.464479069644109e-05, "loss": 2.3067462921142576, "memory(GiB)": 77.56, "step": 47310, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.439989 }, { "epoch": 2.0271196606829185, "grad_norm": 4.595633506774902, "learning_rate": 6.463835592924825e-05, "loss": 2.758593940734863, "memory(GiB)": 77.56, "step": 47315, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.440038 }, { "epoch": 2.0273338760121673, "grad_norm": 6.745107650756836, "learning_rate": 6.463192089686786e-05, "loss": 2.5533119201660157, "memory(GiB)": 77.56, "step": 47320, "token_acc": 0.4664179104477612, "train_speed(iter/s)": 1.440074 }, { "epoch": 2.0275480913414166, "grad_norm": 4.424008369445801, "learning_rate": 6.462548559941648e-05, "loss": 2.306426429748535, "memory(GiB)": 77.56, "step": 47325, "token_acc": 0.49145299145299143, "train_speed(iter/s)": 1.440073 }, { "epoch": 2.0277623066706654, "grad_norm": 5.835791110992432, "learning_rate": 6.461905003701072e-05, "loss": 2.314540481567383, "memory(GiB)": 77.56, "step": 47330, "token_acc": 0.5423076923076923, "train_speed(iter/s)": 1.440085 }, { "epoch": 2.027976521999914, "grad_norm": 6.8580403327941895, "learning_rate": 6.461261420976716e-05, "loss": 2.4569332122802736, "memory(GiB)": 77.56, "step": 47335, "token_acc": 0.5036231884057971, "train_speed(iter/s)": 1.440101 }, { "epoch": 2.0281907373291634, "grad_norm": 6.78116512298584, "learning_rate": 6.460617811780238e-05, "loss": 2.277758026123047, "memory(GiB)": 77.56, "step": 47340, "token_acc": 0.48909657320872274, "train_speed(iter/s)": 1.440116 }, { "epoch": 2.0284049526584123, "grad_norm": 5.802482604980469, "learning_rate": 6.459974176123299e-05, "loss": 2.4579164505004885, "memory(GiB)": 77.56, "step": 47345, "token_acc": 0.5280898876404494, "train_speed(iter/s)": 1.440088 }, { "epoch": 2.028619167987661, "grad_norm": 5.421518325805664, "learning_rate": 6.459330514017556e-05, "loss": 2.506378746032715, "memory(GiB)": 77.56, "step": 47350, "token_acc": 0.4468864468864469, "train_speed(iter/s)": 1.440084 }, { "epoch": 2.0288333833169103, "grad_norm": 4.904119968414307, "learning_rate": 6.458686825474674e-05, "loss": 2.421441078186035, "memory(GiB)": 77.56, "step": 47355, "token_acc": 0.47384615384615386, "train_speed(iter/s)": 1.440041 }, { "epoch": 2.029047598646159, "grad_norm": 6.118224620819092, "learning_rate": 6.45804311050631e-05, "loss": 2.717330551147461, "memory(GiB)": 77.56, "step": 47360, "token_acc": 0.47214076246334313, "train_speed(iter/s)": 1.440061 }, { "epoch": 2.029261813975408, "grad_norm": 3.993043899536133, "learning_rate": 6.457399369124129e-05, "loss": 2.1258781433105467, "memory(GiB)": 77.56, "step": 47365, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.440068 }, { "epoch": 2.029476029304657, "grad_norm": 4.975916862487793, "learning_rate": 6.456755601339789e-05, "loss": 2.434589385986328, "memory(GiB)": 77.56, "step": 47370, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.440074 }, { "epoch": 2.029690244633906, "grad_norm": 6.480383396148682, "learning_rate": 6.456111807164958e-05, "loss": 2.5920232772827148, "memory(GiB)": 77.56, "step": 47375, "token_acc": 0.4551282051282051, "train_speed(iter/s)": 1.440092 }, { "epoch": 2.029904459963155, "grad_norm": 5.017740726470947, "learning_rate": 6.455467986611293e-05, "loss": 2.416194534301758, "memory(GiB)": 77.56, "step": 47380, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.440103 }, { "epoch": 2.030118675292404, "grad_norm": 5.090701103210449, "learning_rate": 6.45482413969046e-05, "loss": 2.193351936340332, "memory(GiB)": 77.56, "step": 47385, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.440083 }, { "epoch": 2.030332890621653, "grad_norm": 6.160708427429199, "learning_rate": 6.454180266414125e-05, "loss": 2.466547966003418, "memory(GiB)": 77.56, "step": 47390, "token_acc": 0.4493927125506073, "train_speed(iter/s)": 1.440049 }, { "epoch": 2.0305471059509017, "grad_norm": 6.454678535461426, "learning_rate": 6.453536366793949e-05, "loss": 2.6635141372680664, "memory(GiB)": 77.56, "step": 47395, "token_acc": 0.484375, "train_speed(iter/s)": 1.440049 }, { "epoch": 2.030761321280151, "grad_norm": 5.459461688995361, "learning_rate": 6.452892440841597e-05, "loss": 2.288252067565918, "memory(GiB)": 77.56, "step": 47400, "token_acc": 0.484251968503937, "train_speed(iter/s)": 1.44003 }, { "epoch": 2.0309755366094, "grad_norm": 5.283269882202148, "learning_rate": 6.452248488568738e-05, "loss": 2.5599859237670897, "memory(GiB)": 77.56, "step": 47405, "token_acc": 0.41643835616438357, "train_speed(iter/s)": 1.440044 }, { "epoch": 2.0311897519386486, "grad_norm": 5.3509440422058105, "learning_rate": 6.451604509987033e-05, "loss": 2.2912080764770506, "memory(GiB)": 77.56, "step": 47410, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.440014 }, { "epoch": 2.031403967267898, "grad_norm": 5.202113151550293, "learning_rate": 6.450960505108153e-05, "loss": 2.434415817260742, "memory(GiB)": 77.56, "step": 47415, "token_acc": 0.47398843930635837, "train_speed(iter/s)": 1.439984 }, { "epoch": 2.0316181825971467, "grad_norm": 4.731122016906738, "learning_rate": 6.450316473943763e-05, "loss": 2.546631622314453, "memory(GiB)": 77.56, "step": 47420, "token_acc": 0.49523809523809526, "train_speed(iter/s)": 1.439986 }, { "epoch": 2.0318323979263955, "grad_norm": 8.232008934020996, "learning_rate": 6.449672416505526e-05, "loss": 2.2981239318847657, "memory(GiB)": 77.56, "step": 47425, "token_acc": 0.5033112582781457, "train_speed(iter/s)": 1.439969 }, { "epoch": 2.0320466132556447, "grad_norm": 6.969511032104492, "learning_rate": 6.449028332805117e-05, "loss": 2.6346397399902344, "memory(GiB)": 77.56, "step": 47430, "token_acc": 0.44715447154471544, "train_speed(iter/s)": 1.440005 }, { "epoch": 2.0322608285848935, "grad_norm": 4.844028472900391, "learning_rate": 6.448384222854197e-05, "loss": 2.2274232864379884, "memory(GiB)": 77.56, "step": 47435, "token_acc": 0.5153374233128835, "train_speed(iter/s)": 1.440007 }, { "epoch": 2.0324750439141424, "grad_norm": 7.408126354217529, "learning_rate": 6.447740086664439e-05, "loss": 2.150931549072266, "memory(GiB)": 77.56, "step": 47440, "token_acc": 0.5145228215767634, "train_speed(iter/s)": 1.440031 }, { "epoch": 2.0326892592433916, "grad_norm": 5.0446295738220215, "learning_rate": 6.447095924247512e-05, "loss": 2.460962104797363, "memory(GiB)": 77.56, "step": 47445, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.440036 }, { "epoch": 2.0329034745726404, "grad_norm": 6.160946846008301, "learning_rate": 6.446451735615085e-05, "loss": 2.326716995239258, "memory(GiB)": 77.56, "step": 47450, "token_acc": 0.5251141552511416, "train_speed(iter/s)": 1.440031 }, { "epoch": 2.0331176899018892, "grad_norm": 6.735740661621094, "learning_rate": 6.445807520778828e-05, "loss": 2.156563568115234, "memory(GiB)": 77.56, "step": 47455, "token_acc": 0.5062240663900415, "train_speed(iter/s)": 1.44002 }, { "epoch": 2.0333319052311385, "grad_norm": 5.7068634033203125, "learning_rate": 6.445163279750412e-05, "loss": 2.3455982208251953, "memory(GiB)": 77.56, "step": 47460, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 1.440049 }, { "epoch": 2.0335461205603873, "grad_norm": 5.048141956329346, "learning_rate": 6.444519012541507e-05, "loss": 2.0552072525024414, "memory(GiB)": 77.56, "step": 47465, "token_acc": 0.5811320754716981, "train_speed(iter/s)": 1.440062 }, { "epoch": 2.033760335889636, "grad_norm": 7.297991752624512, "learning_rate": 6.443874719163782e-05, "loss": 2.100653076171875, "memory(GiB)": 77.56, "step": 47470, "token_acc": 0.5337837837837838, "train_speed(iter/s)": 1.440073 }, { "epoch": 2.0339745512188854, "grad_norm": 3.8732073307037354, "learning_rate": 6.443230399628916e-05, "loss": 2.36053409576416, "memory(GiB)": 77.56, "step": 47475, "token_acc": 0.48360655737704916, "train_speed(iter/s)": 1.44005 }, { "epoch": 2.034188766548134, "grad_norm": 5.332747459411621, "learning_rate": 6.442586053948576e-05, "loss": 2.2015209197998047, "memory(GiB)": 77.56, "step": 47480, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 1.440057 }, { "epoch": 2.034402981877383, "grad_norm": 6.69398832321167, "learning_rate": 6.441941682134436e-05, "loss": 2.2048450469970704, "memory(GiB)": 77.56, "step": 47485, "token_acc": 0.545816733067729, "train_speed(iter/s)": 1.440108 }, { "epoch": 2.0346171972066323, "grad_norm": 5.170960903167725, "learning_rate": 6.441297284198172e-05, "loss": 2.4763031005859375, "memory(GiB)": 77.56, "step": 47490, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.440151 }, { "epoch": 2.034831412535881, "grad_norm": 5.75846529006958, "learning_rate": 6.440652860151452e-05, "loss": 2.4016122817993164, "memory(GiB)": 77.56, "step": 47495, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.440143 }, { "epoch": 2.03504562786513, "grad_norm": 6.008525371551514, "learning_rate": 6.440008410005956e-05, "loss": 2.5113031387329103, "memory(GiB)": 77.56, "step": 47500, "token_acc": 0.44516129032258067, "train_speed(iter/s)": 1.440184 }, { "epoch": 2.03504562786513, "eval_loss": 2.163577079772949, "eval_runtime": 14.1428, "eval_samples_per_second": 7.071, "eval_steps_per_second": 7.071, "eval_token_acc": 0.46774193548387094, "step": 47500 }, { "epoch": 2.035259843194379, "grad_norm": 4.006160736083984, "learning_rate": 6.439363933773358e-05, "loss": 2.2517932891845702, "memory(GiB)": 77.56, "step": 47505, "token_acc": 0.474025974025974, "train_speed(iter/s)": 1.43954 }, { "epoch": 2.035474058523628, "grad_norm": 5.914662837982178, "learning_rate": 6.43871943146533e-05, "loss": 2.436318588256836, "memory(GiB)": 77.56, "step": 47510, "token_acc": 0.46647230320699706, "train_speed(iter/s)": 1.439531 }, { "epoch": 2.0356882738528768, "grad_norm": 6.909361839294434, "learning_rate": 6.43807490309355e-05, "loss": 2.337209701538086, "memory(GiB)": 77.56, "step": 47515, "token_acc": 0.525691699604743, "train_speed(iter/s)": 1.439505 }, { "epoch": 2.035902489182126, "grad_norm": 4.693491458892822, "learning_rate": 6.437430348669695e-05, "loss": 2.4208454132080077, "memory(GiB)": 77.56, "step": 47520, "token_acc": 0.4568345323741007, "train_speed(iter/s)": 1.439497 }, { "epoch": 2.036116704511375, "grad_norm": 4.709084510803223, "learning_rate": 6.436785768205442e-05, "loss": 2.5848079681396485, "memory(GiB)": 77.56, "step": 47525, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.439499 }, { "epoch": 2.0363309198406236, "grad_norm": 5.230836391448975, "learning_rate": 6.436141161712465e-05, "loss": 2.376507568359375, "memory(GiB)": 77.56, "step": 47530, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.439485 }, { "epoch": 2.036545135169873, "grad_norm": 4.986629962921143, "learning_rate": 6.435496529202446e-05, "loss": 2.282478904724121, "memory(GiB)": 77.56, "step": 47535, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.439509 }, { "epoch": 2.0367593504991217, "grad_norm": 5.57295560836792, "learning_rate": 6.434851870687058e-05, "loss": 2.5561681747436524, "memory(GiB)": 77.56, "step": 47540, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.439482 }, { "epoch": 2.0369735658283705, "grad_norm": 4.689891338348389, "learning_rate": 6.434207186177982e-05, "loss": 2.5525936126708983, "memory(GiB)": 77.56, "step": 47545, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.439512 }, { "epoch": 2.03718778115762, "grad_norm": 4.075557231903076, "learning_rate": 6.4335624756869e-05, "loss": 2.3793817520141602, "memory(GiB)": 77.56, "step": 47550, "token_acc": 0.5286195286195287, "train_speed(iter/s)": 1.439511 }, { "epoch": 2.0374019964868686, "grad_norm": 4.648125648498535, "learning_rate": 6.432917739225486e-05, "loss": 2.1042709350585938, "memory(GiB)": 77.56, "step": 47555, "token_acc": 0.5546558704453441, "train_speed(iter/s)": 1.439502 }, { "epoch": 2.0376162118161174, "grad_norm": 4.895267009735107, "learning_rate": 6.432272976805425e-05, "loss": 2.394881820678711, "memory(GiB)": 77.56, "step": 47560, "token_acc": 0.486013986013986, "train_speed(iter/s)": 1.439491 }, { "epoch": 2.0378304271453667, "grad_norm": 4.482232093811035, "learning_rate": 6.431628188438397e-05, "loss": 2.2670919418334963, "memory(GiB)": 77.56, "step": 47565, "token_acc": 0.5311203319502075, "train_speed(iter/s)": 1.439503 }, { "epoch": 2.0380446424746155, "grad_norm": 4.007987976074219, "learning_rate": 6.430983374136078e-05, "loss": 2.51898307800293, "memory(GiB)": 77.56, "step": 47570, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.439537 }, { "epoch": 2.0382588578038643, "grad_norm": 5.164368152618408, "learning_rate": 6.430338533910155e-05, "loss": 2.272294044494629, "memory(GiB)": 77.56, "step": 47575, "token_acc": 0.4962962962962963, "train_speed(iter/s)": 1.439562 }, { "epoch": 2.0384730731331135, "grad_norm": 4.738786220550537, "learning_rate": 6.429693667772308e-05, "loss": 2.6083396911621093, "memory(GiB)": 77.56, "step": 47580, "token_acc": 0.4585635359116022, "train_speed(iter/s)": 1.439587 }, { "epoch": 2.0386872884623624, "grad_norm": 6.589414596557617, "learning_rate": 6.429048775734216e-05, "loss": 2.5532367706298826, "memory(GiB)": 77.56, "step": 47585, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.439585 }, { "epoch": 2.038901503791611, "grad_norm": 5.052786827087402, "learning_rate": 6.428403857807568e-05, "loss": 2.48738956451416, "memory(GiB)": 77.56, "step": 47590, "token_acc": 0.4646153846153846, "train_speed(iter/s)": 1.439579 }, { "epoch": 2.0391157191208604, "grad_norm": 5.194866180419922, "learning_rate": 6.427758914004044e-05, "loss": 2.323424530029297, "memory(GiB)": 77.56, "step": 47595, "token_acc": 0.5, "train_speed(iter/s)": 1.439577 }, { "epoch": 2.0393299344501092, "grad_norm": 4.9435834884643555, "learning_rate": 6.427113944335326e-05, "loss": 2.5700027465820314, "memory(GiB)": 77.56, "step": 47600, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.439584 }, { "epoch": 2.039544149779358, "grad_norm": 4.893200874328613, "learning_rate": 6.426468948813102e-05, "loss": 2.213448905944824, "memory(GiB)": 77.56, "step": 47605, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.439617 }, { "epoch": 2.0397583651086073, "grad_norm": 5.314486503601074, "learning_rate": 6.425823927449055e-05, "loss": 2.1358619689941407, "memory(GiB)": 77.56, "step": 47610, "token_acc": 0.5149501661129569, "train_speed(iter/s)": 1.439604 }, { "epoch": 2.039972580437856, "grad_norm": 5.000269412994385, "learning_rate": 6.425178880254869e-05, "loss": 2.2250171661376954, "memory(GiB)": 77.56, "step": 47615, "token_acc": 0.4871060171919771, "train_speed(iter/s)": 1.439622 }, { "epoch": 2.040186795767105, "grad_norm": 4.7385454177856445, "learning_rate": 6.424533807242229e-05, "loss": 2.5986610412597657, "memory(GiB)": 77.56, "step": 47620, "token_acc": 0.48632218844984804, "train_speed(iter/s)": 1.439641 }, { "epoch": 2.040401011096354, "grad_norm": 5.4596266746521, "learning_rate": 6.423888708422825e-05, "loss": 2.2574934005737304, "memory(GiB)": 77.56, "step": 47625, "token_acc": 0.5451127819548872, "train_speed(iter/s)": 1.439624 }, { "epoch": 2.040615226425603, "grad_norm": 4.448336124420166, "learning_rate": 6.42324358380834e-05, "loss": 2.230917549133301, "memory(GiB)": 77.56, "step": 47630, "token_acc": 0.4847457627118644, "train_speed(iter/s)": 1.439644 }, { "epoch": 2.040829441754852, "grad_norm": 7.835684776306152, "learning_rate": 6.422598433410465e-05, "loss": 2.2535871505737304, "memory(GiB)": 77.56, "step": 47635, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.439631 }, { "epoch": 2.041043657084101, "grad_norm": 7.610886096954346, "learning_rate": 6.421953257240882e-05, "loss": 2.476526641845703, "memory(GiB)": 77.56, "step": 47640, "token_acc": 0.47509578544061304, "train_speed(iter/s)": 1.439638 }, { "epoch": 2.04125787241335, "grad_norm": 4.8699445724487305, "learning_rate": 6.421308055311284e-05, "loss": 2.4349857330322267, "memory(GiB)": 77.56, "step": 47645, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 1.439652 }, { "epoch": 2.0414720877425987, "grad_norm": 4.174794673919678, "learning_rate": 6.420662827633358e-05, "loss": 2.4949317932128907, "memory(GiB)": 77.56, "step": 47650, "token_acc": 0.49221183800623053, "train_speed(iter/s)": 1.439645 }, { "epoch": 2.041686303071848, "grad_norm": 5.558699131011963, "learning_rate": 6.420017574218788e-05, "loss": 2.332351875305176, "memory(GiB)": 77.56, "step": 47655, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.439644 }, { "epoch": 2.0419005184010968, "grad_norm": 8.018543243408203, "learning_rate": 6.41937229507927e-05, "loss": 2.3778156280517577, "memory(GiB)": 77.56, "step": 47660, "token_acc": 0.509090909090909, "train_speed(iter/s)": 1.439623 }, { "epoch": 2.0421147337303456, "grad_norm": 4.871335506439209, "learning_rate": 6.418726990226491e-05, "loss": 2.40634708404541, "memory(GiB)": 77.56, "step": 47665, "token_acc": 0.5510204081632653, "train_speed(iter/s)": 1.439667 }, { "epoch": 2.042328949059595, "grad_norm": 5.5760369300842285, "learning_rate": 6.418081659672142e-05, "loss": 2.422725868225098, "memory(GiB)": 77.56, "step": 47670, "token_acc": 0.4868913857677903, "train_speed(iter/s)": 1.439702 }, { "epoch": 2.0425431643888436, "grad_norm": 6.696816444396973, "learning_rate": 6.417436303427914e-05, "loss": 2.6715274810791017, "memory(GiB)": 77.56, "step": 47675, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.439727 }, { "epoch": 2.0427573797180925, "grad_norm": 4.811416149139404, "learning_rate": 6.416790921505498e-05, "loss": 2.5192203521728516, "memory(GiB)": 77.56, "step": 47680, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.439735 }, { "epoch": 2.0429715950473417, "grad_norm": 5.809756755828857, "learning_rate": 6.416145513916584e-05, "loss": 2.5253982543945312, "memory(GiB)": 77.56, "step": 47685, "token_acc": 0.44126984126984126, "train_speed(iter/s)": 1.439759 }, { "epoch": 2.0431858103765905, "grad_norm": 6.040470123291016, "learning_rate": 6.415500080672866e-05, "loss": 2.423827362060547, "memory(GiB)": 77.56, "step": 47690, "token_acc": 0.4582210242587601, "train_speed(iter/s)": 1.439735 }, { "epoch": 2.0434000257058393, "grad_norm": 4.674708843231201, "learning_rate": 6.414854621786037e-05, "loss": 2.4663007736206053, "memory(GiB)": 77.56, "step": 47695, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.439764 }, { "epoch": 2.0436142410350886, "grad_norm": 5.48889684677124, "learning_rate": 6.414209137267788e-05, "loss": 2.5493513107299806, "memory(GiB)": 77.56, "step": 47700, "token_acc": 0.4437299035369775, "train_speed(iter/s)": 1.439794 }, { "epoch": 2.0438284563643374, "grad_norm": 4.745237350463867, "learning_rate": 6.413563627129815e-05, "loss": 2.6456165313720703, "memory(GiB)": 77.56, "step": 47705, "token_acc": 0.4469914040114613, "train_speed(iter/s)": 1.439783 }, { "epoch": 2.044042671693586, "grad_norm": 4.328041076660156, "learning_rate": 6.41291809138381e-05, "loss": 2.3519237518310545, "memory(GiB)": 77.56, "step": 47710, "token_acc": 0.5031645569620253, "train_speed(iter/s)": 1.439789 }, { "epoch": 2.0442568870228355, "grad_norm": 5.482819557189941, "learning_rate": 6.412272530041469e-05, "loss": 2.2070241928100587, "memory(GiB)": 77.56, "step": 47715, "token_acc": 0.5038759689922481, "train_speed(iter/s)": 1.439748 }, { "epoch": 2.0444711023520843, "grad_norm": 4.37467622756958, "learning_rate": 6.411626943114486e-05, "loss": 2.417240524291992, "memory(GiB)": 77.56, "step": 47720, "token_acc": 0.47041420118343197, "train_speed(iter/s)": 1.439758 }, { "epoch": 2.044685317681333, "grad_norm": 4.715269088745117, "learning_rate": 6.410981330614558e-05, "loss": 2.2460214614868166, "memory(GiB)": 77.56, "step": 47725, "token_acc": 0.5653846153846154, "train_speed(iter/s)": 1.43977 }, { "epoch": 2.0448995330105824, "grad_norm": 4.7994890213012695, "learning_rate": 6.410335692553376e-05, "loss": 2.4280099868774414, "memory(GiB)": 77.56, "step": 47730, "token_acc": 0.4744525547445255, "train_speed(iter/s)": 1.439776 }, { "epoch": 2.045113748339831, "grad_norm": 7.1618170738220215, "learning_rate": 6.409690028942643e-05, "loss": 2.367707443237305, "memory(GiB)": 77.56, "step": 47735, "token_acc": 0.5282392026578073, "train_speed(iter/s)": 1.439705 }, { "epoch": 2.04532796366908, "grad_norm": 4.732382774353027, "learning_rate": 6.409044339794052e-05, "loss": 2.410608100891113, "memory(GiB)": 77.56, "step": 47740, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.439682 }, { "epoch": 2.0455421789983292, "grad_norm": 4.81704568862915, "learning_rate": 6.408398625119301e-05, "loss": 2.5306934356689452, "memory(GiB)": 77.56, "step": 47745, "token_acc": 0.4707792207792208, "train_speed(iter/s)": 1.439695 }, { "epoch": 2.045756394327578, "grad_norm": 4.810451984405518, "learning_rate": 6.407752884930089e-05, "loss": 2.3730548858642577, "memory(GiB)": 77.56, "step": 47750, "token_acc": 0.4984423676012461, "train_speed(iter/s)": 1.439709 }, { "epoch": 2.045970609656827, "grad_norm": 4.718359470367432, "learning_rate": 6.407107119238111e-05, "loss": 2.451678466796875, "memory(GiB)": 77.56, "step": 47755, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.439742 }, { "epoch": 2.046184824986076, "grad_norm": 3.8796544075012207, "learning_rate": 6.40646132805507e-05, "loss": 2.5709644317626954, "memory(GiB)": 77.56, "step": 47760, "token_acc": 0.45625, "train_speed(iter/s)": 1.439702 }, { "epoch": 2.046399040315325, "grad_norm": 6.005451679229736, "learning_rate": 6.405815511392659e-05, "loss": 2.2278003692626953, "memory(GiB)": 77.56, "step": 47765, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.439735 }, { "epoch": 2.0466132556445737, "grad_norm": 5.36748743057251, "learning_rate": 6.405169669262583e-05, "loss": 2.668032646179199, "memory(GiB)": 77.56, "step": 47770, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.439726 }, { "epoch": 2.046827470973823, "grad_norm": 5.313390254974365, "learning_rate": 6.40452380167654e-05, "loss": 2.3970668792724608, "memory(GiB)": 77.56, "step": 47775, "token_acc": 0.5369649805447471, "train_speed(iter/s)": 1.439719 }, { "epoch": 2.047041686303072, "grad_norm": 4.933276653289795, "learning_rate": 6.403877908646232e-05, "loss": 2.3330368041992187, "memory(GiB)": 77.56, "step": 47780, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.439697 }, { "epoch": 2.0472559016323206, "grad_norm": 4.451749324798584, "learning_rate": 6.403231990183358e-05, "loss": 2.3309856414794923, "memory(GiB)": 77.56, "step": 47785, "token_acc": 0.494475138121547, "train_speed(iter/s)": 1.439704 }, { "epoch": 2.04747011696157, "grad_norm": 4.890246868133545, "learning_rate": 6.40258604629962e-05, "loss": 2.3902399063110353, "memory(GiB)": 77.56, "step": 47790, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.439678 }, { "epoch": 2.0476843322908187, "grad_norm": 6.390333652496338, "learning_rate": 6.401940077006721e-05, "loss": 2.5157808303833007, "memory(GiB)": 77.56, "step": 47795, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.439674 }, { "epoch": 2.0478985476200675, "grad_norm": 5.395419120788574, "learning_rate": 6.401294082316361e-05, "loss": 2.6448734283447264, "memory(GiB)": 77.56, "step": 47800, "token_acc": 0.46944444444444444, "train_speed(iter/s)": 1.439676 }, { "epoch": 2.0481127629493168, "grad_norm": 6.820577621459961, "learning_rate": 6.400648062240243e-05, "loss": 2.202439308166504, "memory(GiB)": 77.56, "step": 47805, "token_acc": 0.5442622950819672, "train_speed(iter/s)": 1.439688 }, { "epoch": 2.0483269782785656, "grad_norm": 5.206563472747803, "learning_rate": 6.400002016790074e-05, "loss": 2.2871850967407226, "memory(GiB)": 77.56, "step": 47810, "token_acc": 0.4984709480122324, "train_speed(iter/s)": 1.439684 }, { "epoch": 2.0485411936078144, "grad_norm": 5.819280624389648, "learning_rate": 6.399355945977554e-05, "loss": 2.2564260482788088, "memory(GiB)": 77.56, "step": 47815, "token_acc": 0.53, "train_speed(iter/s)": 1.439673 }, { "epoch": 2.0487554089370636, "grad_norm": 5.17987585067749, "learning_rate": 6.39870984981439e-05, "loss": 2.349131965637207, "memory(GiB)": 77.56, "step": 47820, "token_acc": 0.5311355311355311, "train_speed(iter/s)": 1.439713 }, { "epoch": 2.0489696242663125, "grad_norm": 5.169327735900879, "learning_rate": 6.398063728312283e-05, "loss": 2.422494888305664, "memory(GiB)": 77.56, "step": 47825, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.439681 }, { "epoch": 2.0491838395955613, "grad_norm": 8.17672348022461, "learning_rate": 6.39741758148294e-05, "loss": 2.0970438003540037, "memory(GiB)": 77.56, "step": 47830, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.439668 }, { "epoch": 2.0493980549248105, "grad_norm": 5.379665851593018, "learning_rate": 6.396771409338068e-05, "loss": 2.691663360595703, "memory(GiB)": 77.56, "step": 47835, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.439681 }, { "epoch": 2.0496122702540593, "grad_norm": 6.921357154846191, "learning_rate": 6.39612521188937e-05, "loss": 2.5036855697631837, "memory(GiB)": 77.56, "step": 47840, "token_acc": 0.49337748344370863, "train_speed(iter/s)": 1.439695 }, { "epoch": 2.049826485583308, "grad_norm": 7.54922342300415, "learning_rate": 6.395478989148556e-05, "loss": 2.3679580688476562, "memory(GiB)": 77.56, "step": 47845, "token_acc": 0.47633136094674555, "train_speed(iter/s)": 1.439655 }, { "epoch": 2.0500407009125574, "grad_norm": 5.228775501251221, "learning_rate": 6.39483274112733e-05, "loss": 2.2357929229736326, "memory(GiB)": 77.56, "step": 47850, "token_acc": 0.5232974910394266, "train_speed(iter/s)": 1.439684 }, { "epoch": 2.0502549162418062, "grad_norm": 4.800392150878906, "learning_rate": 6.3941864678374e-05, "loss": 2.427528953552246, "memory(GiB)": 77.56, "step": 47855, "token_acc": 0.46827794561933533, "train_speed(iter/s)": 1.439704 }, { "epoch": 2.050469131571055, "grad_norm": 4.886074066162109, "learning_rate": 6.393540169290475e-05, "loss": 2.254031181335449, "memory(GiB)": 77.56, "step": 47860, "token_acc": 0.5374149659863946, "train_speed(iter/s)": 1.439692 }, { "epoch": 2.0506833469003043, "grad_norm": 5.674720287322998, "learning_rate": 6.392893845498262e-05, "loss": 2.5538101196289062, "memory(GiB)": 77.56, "step": 47865, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.439687 }, { "epoch": 2.050897562229553, "grad_norm": 6.776648998260498, "learning_rate": 6.392247496472472e-05, "loss": 2.4056728363037108, "memory(GiB)": 77.56, "step": 47870, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.439671 }, { "epoch": 2.051111777558802, "grad_norm": 7.172952175140381, "learning_rate": 6.391601122224811e-05, "loss": 2.341914939880371, "memory(GiB)": 77.56, "step": 47875, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.439658 }, { "epoch": 2.051325992888051, "grad_norm": 6.251924514770508, "learning_rate": 6.39095472276699e-05, "loss": 2.646720123291016, "memory(GiB)": 77.56, "step": 47880, "token_acc": 0.4492307692307692, "train_speed(iter/s)": 1.439682 }, { "epoch": 2.0515402082173, "grad_norm": 4.300154685974121, "learning_rate": 6.39030829811072e-05, "loss": 2.4142295837402346, "memory(GiB)": 77.56, "step": 47885, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.439722 }, { "epoch": 2.051754423546549, "grad_norm": 5.152960300445557, "learning_rate": 6.38966184826771e-05, "loss": 2.3202417373657225, "memory(GiB)": 77.56, "step": 47890, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.439708 }, { "epoch": 2.051968638875798, "grad_norm": 7.4465179443359375, "learning_rate": 6.389015373249674e-05, "loss": 2.7647781372070312, "memory(GiB)": 77.56, "step": 47895, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.439733 }, { "epoch": 2.052182854205047, "grad_norm": 5.134638786315918, "learning_rate": 6.388368873068321e-05, "loss": 2.5917724609375, "memory(GiB)": 77.56, "step": 47900, "token_acc": 0.4820846905537459, "train_speed(iter/s)": 1.439756 }, { "epoch": 2.0523970695342957, "grad_norm": 4.606905937194824, "learning_rate": 6.387722347735362e-05, "loss": 2.1808450698852537, "memory(GiB)": 77.56, "step": 47905, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.439783 }, { "epoch": 2.052611284863545, "grad_norm": 5.044373035430908, "learning_rate": 6.387075797262513e-05, "loss": 2.676270294189453, "memory(GiB)": 77.56, "step": 47910, "token_acc": 0.44074074074074077, "train_speed(iter/s)": 1.439789 }, { "epoch": 2.0528255001927938, "grad_norm": 5.0664591789245605, "learning_rate": 6.386429221661483e-05, "loss": 2.8006845474243165, "memory(GiB)": 77.56, "step": 47915, "token_acc": 0.4318936877076412, "train_speed(iter/s)": 1.439793 }, { "epoch": 2.0530397155220426, "grad_norm": 7.783517360687256, "learning_rate": 6.385782620943986e-05, "loss": 2.3821617126464845, "memory(GiB)": 77.56, "step": 47920, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.439799 }, { "epoch": 2.053253930851292, "grad_norm": 4.918902397155762, "learning_rate": 6.385135995121739e-05, "loss": 2.245688629150391, "memory(GiB)": 77.56, "step": 47925, "token_acc": 0.5222929936305732, "train_speed(iter/s)": 1.439806 }, { "epoch": 2.0534681461805406, "grad_norm": 5.0280351638793945, "learning_rate": 6.384489344206455e-05, "loss": 2.2529088973999025, "memory(GiB)": 77.56, "step": 47930, "token_acc": 0.5164473684210527, "train_speed(iter/s)": 1.43981 }, { "epoch": 2.0536823615097894, "grad_norm": 5.787678241729736, "learning_rate": 6.383842668209845e-05, "loss": 2.4657352447509764, "memory(GiB)": 77.56, "step": 47935, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.439805 }, { "epoch": 2.0538965768390387, "grad_norm": 5.114063262939453, "learning_rate": 6.383195967143629e-05, "loss": 2.274463081359863, "memory(GiB)": 77.56, "step": 47940, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.43982 }, { "epoch": 2.0541107921682875, "grad_norm": 5.533620834350586, "learning_rate": 6.38254924101952e-05, "loss": 2.320192337036133, "memory(GiB)": 77.56, "step": 47945, "token_acc": 0.5278969957081545, "train_speed(iter/s)": 1.439854 }, { "epoch": 2.0543250074975363, "grad_norm": 4.123553276062012, "learning_rate": 6.381902489849233e-05, "loss": 2.285234832763672, "memory(GiB)": 77.56, "step": 47950, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.439856 }, { "epoch": 2.0545392228267856, "grad_norm": 8.138646125793457, "learning_rate": 6.381255713644488e-05, "loss": 2.4138904571533204, "memory(GiB)": 77.56, "step": 47955, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.439861 }, { "epoch": 2.0547534381560344, "grad_norm": 4.103034973144531, "learning_rate": 6.380608912416999e-05, "loss": 2.406050682067871, "memory(GiB)": 77.56, "step": 47960, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.439842 }, { "epoch": 2.054967653485283, "grad_norm": 5.271156311035156, "learning_rate": 6.379962086178485e-05, "loss": 2.463789176940918, "memory(GiB)": 77.56, "step": 47965, "token_acc": 0.49508196721311476, "train_speed(iter/s)": 1.439869 }, { "epoch": 2.0551818688145325, "grad_norm": 5.393589019775391, "learning_rate": 6.379315234940664e-05, "loss": 2.263259696960449, "memory(GiB)": 77.56, "step": 47970, "token_acc": 0.531986531986532, "train_speed(iter/s)": 1.439865 }, { "epoch": 2.0553960841437813, "grad_norm": 7.665388584136963, "learning_rate": 6.378668358715252e-05, "loss": 2.3992361068725585, "memory(GiB)": 77.56, "step": 47975, "token_acc": 0.4983277591973244, "train_speed(iter/s)": 1.439877 }, { "epoch": 2.05561029947303, "grad_norm": 6.8578338623046875, "learning_rate": 6.378021457513971e-05, "loss": 2.3825326919555665, "memory(GiB)": 77.56, "step": 47980, "token_acc": 0.48, "train_speed(iter/s)": 1.439876 }, { "epoch": 2.0558245148022793, "grad_norm": 4.834499359130859, "learning_rate": 6.377374531348538e-05, "loss": 2.1360877990722655, "memory(GiB)": 77.56, "step": 47985, "token_acc": 0.531986531986532, "train_speed(iter/s)": 1.439868 }, { "epoch": 2.056038730131528, "grad_norm": 4.213587760925293, "learning_rate": 6.376727580230671e-05, "loss": 2.2522083282470704, "memory(GiB)": 77.56, "step": 47990, "token_acc": 0.5375375375375375, "train_speed(iter/s)": 1.439845 }, { "epoch": 2.056252945460777, "grad_norm": 7.45883846282959, "learning_rate": 6.376080604172096e-05, "loss": 2.5970041275024416, "memory(GiB)": 77.56, "step": 47995, "token_acc": 0.4280821917808219, "train_speed(iter/s)": 1.439829 }, { "epoch": 2.0564671607900262, "grad_norm": 4.569416522979736, "learning_rate": 6.375433603184528e-05, "loss": 2.4430402755737304, "memory(GiB)": 77.56, "step": 48000, "token_acc": 0.4781420765027322, "train_speed(iter/s)": 1.439867 }, { "epoch": 2.0564671607900262, "eval_loss": 2.2916102409362793, "eval_runtime": 13.9451, "eval_samples_per_second": 7.171, "eval_steps_per_second": 7.171, "eval_token_acc": 0.45955882352941174, "step": 48000 }, { "epoch": 2.056681376119275, "grad_norm": 4.693818092346191, "learning_rate": 6.37478657727969e-05, "loss": 2.2714290618896484, "memory(GiB)": 77.56, "step": 48005, "token_acc": 0.4605377276669558, "train_speed(iter/s)": 1.439278 }, { "epoch": 2.056895591448524, "grad_norm": 5.718568325042725, "learning_rate": 6.374139526469304e-05, "loss": 2.1184856414794924, "memory(GiB)": 77.56, "step": 48010, "token_acc": 0.5287356321839081, "train_speed(iter/s)": 1.439207 }, { "epoch": 2.057109806777773, "grad_norm": 6.2125935554504395, "learning_rate": 6.373492450765093e-05, "loss": 2.271101951599121, "memory(GiB)": 77.56, "step": 48015, "token_acc": 0.5131195335276968, "train_speed(iter/s)": 1.439228 }, { "epoch": 2.057324022107022, "grad_norm": 5.286134719848633, "learning_rate": 6.372845350178776e-05, "loss": 2.4091039657592774, "memory(GiB)": 77.56, "step": 48020, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.439274 }, { "epoch": 2.0575382374362707, "grad_norm": 5.216477870941162, "learning_rate": 6.372198224722077e-05, "loss": 2.2434417724609377, "memory(GiB)": 77.56, "step": 48025, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.439289 }, { "epoch": 2.05775245276552, "grad_norm": 4.990546226501465, "learning_rate": 6.37155107440672e-05, "loss": 2.143111801147461, "memory(GiB)": 77.56, "step": 48030, "token_acc": 0.5601659751037344, "train_speed(iter/s)": 1.4393 }, { "epoch": 2.057966668094769, "grad_norm": 4.992410182952881, "learning_rate": 6.370903899244429e-05, "loss": 2.544847869873047, "memory(GiB)": 77.56, "step": 48035, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.439272 }, { "epoch": 2.0581808834240176, "grad_norm": 5.420811653137207, "learning_rate": 6.370256699246929e-05, "loss": 2.363251304626465, "memory(GiB)": 77.56, "step": 48040, "token_acc": 0.4789156626506024, "train_speed(iter/s)": 1.439275 }, { "epoch": 2.058395098753267, "grad_norm": 4.947816848754883, "learning_rate": 6.369609474425944e-05, "loss": 2.317617416381836, "memory(GiB)": 77.56, "step": 48045, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 1.439291 }, { "epoch": 2.0586093140825157, "grad_norm": 4.4387335777282715, "learning_rate": 6.368962224793195e-05, "loss": 2.631804275512695, "memory(GiB)": 77.56, "step": 48050, "token_acc": 0.4394904458598726, "train_speed(iter/s)": 1.439295 }, { "epoch": 2.0588235294117645, "grad_norm": 3.9364869594573975, "learning_rate": 6.368314950360415e-05, "loss": 2.2472118377685546, "memory(GiB)": 77.56, "step": 48055, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.439269 }, { "epoch": 2.0590377447410138, "grad_norm": 6.670133590698242, "learning_rate": 6.367667651139324e-05, "loss": 2.486860466003418, "memory(GiB)": 77.56, "step": 48060, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.439273 }, { "epoch": 2.0592519600702626, "grad_norm": 6.60884952545166, "learning_rate": 6.367020327141651e-05, "loss": 2.564601707458496, "memory(GiB)": 77.56, "step": 48065, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.439268 }, { "epoch": 2.0594661753995114, "grad_norm": 4.956416130065918, "learning_rate": 6.366372978379122e-05, "loss": 2.4545724868774412, "memory(GiB)": 77.56, "step": 48070, "token_acc": 0.4985507246376812, "train_speed(iter/s)": 1.439277 }, { "epoch": 2.0596803907287606, "grad_norm": 4.4853081703186035, "learning_rate": 6.365725604863466e-05, "loss": 2.598590850830078, "memory(GiB)": 77.56, "step": 48075, "token_acc": 0.5250836120401338, "train_speed(iter/s)": 1.439305 }, { "epoch": 2.0598946060580094, "grad_norm": 4.909699440002441, "learning_rate": 6.365078206606408e-05, "loss": 2.706365203857422, "memory(GiB)": 77.56, "step": 48080, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.439309 }, { "epoch": 2.0601088213872583, "grad_norm": 8.361443519592285, "learning_rate": 6.364430783619679e-05, "loss": 2.3414520263671874, "memory(GiB)": 77.56, "step": 48085, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.43931 }, { "epoch": 2.0603230367165075, "grad_norm": 6.474850177764893, "learning_rate": 6.363783335915005e-05, "loss": 2.5392520904541014, "memory(GiB)": 77.56, "step": 48090, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.439341 }, { "epoch": 2.0605372520457563, "grad_norm": 6.0981011390686035, "learning_rate": 6.36313586350412e-05, "loss": 2.15423526763916, "memory(GiB)": 77.56, "step": 48095, "token_acc": 0.5335968379446641, "train_speed(iter/s)": 1.439349 }, { "epoch": 2.060751467375005, "grad_norm": 5.5894083976745605, "learning_rate": 6.362488366398746e-05, "loss": 2.670083999633789, "memory(GiB)": 77.56, "step": 48100, "token_acc": 0.4409448818897638, "train_speed(iter/s)": 1.439382 }, { "epoch": 2.0609656827042544, "grad_norm": 5.242283344268799, "learning_rate": 6.361840844610619e-05, "loss": 2.7428956985473634, "memory(GiB)": 77.56, "step": 48105, "token_acc": 0.4441340782122905, "train_speed(iter/s)": 1.43941 }, { "epoch": 2.061179898033503, "grad_norm": 6.626741409301758, "learning_rate": 6.361193298151466e-05, "loss": 2.204768180847168, "memory(GiB)": 77.56, "step": 48110, "token_acc": 0.562992125984252, "train_speed(iter/s)": 1.439376 }, { "epoch": 2.061394113362752, "grad_norm": 4.67233419418335, "learning_rate": 6.360545727033023e-05, "loss": 2.4921072006225584, "memory(GiB)": 77.56, "step": 48115, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.439373 }, { "epoch": 2.0616083286920013, "grad_norm": 5.033863067626953, "learning_rate": 6.359898131267014e-05, "loss": 2.411849784851074, "memory(GiB)": 77.56, "step": 48120, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.439405 }, { "epoch": 2.06182254402125, "grad_norm": 6.651470184326172, "learning_rate": 6.359250510865177e-05, "loss": 2.49924373626709, "memory(GiB)": 77.56, "step": 48125, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.439419 }, { "epoch": 2.062036759350499, "grad_norm": 5.903689861297607, "learning_rate": 6.358602865839242e-05, "loss": 2.346999740600586, "memory(GiB)": 77.56, "step": 48130, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.4394 }, { "epoch": 2.062250974679748, "grad_norm": 5.883366107940674, "learning_rate": 6.357955196200939e-05, "loss": 2.416558265686035, "memory(GiB)": 77.56, "step": 48135, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.439395 }, { "epoch": 2.062465190008997, "grad_norm": 6.490179061889648, "learning_rate": 6.357307501962007e-05, "loss": 2.1859272003173826, "memory(GiB)": 77.56, "step": 48140, "token_acc": 0.4894894894894895, "train_speed(iter/s)": 1.439415 }, { "epoch": 2.062679405338246, "grad_norm": 5.267295837402344, "learning_rate": 6.356659783134174e-05, "loss": 2.303095817565918, "memory(GiB)": 77.56, "step": 48145, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.439407 }, { "epoch": 2.062893620667495, "grad_norm": 4.725438594818115, "learning_rate": 6.356012039729177e-05, "loss": 2.503006362915039, "memory(GiB)": 77.56, "step": 48150, "token_acc": 0.48589341692789967, "train_speed(iter/s)": 1.439425 }, { "epoch": 2.063107835996744, "grad_norm": 5.783420085906982, "learning_rate": 6.35536427175875e-05, "loss": 2.253975486755371, "memory(GiB)": 77.56, "step": 48155, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.439433 }, { "epoch": 2.0633220513259927, "grad_norm": 4.804093360900879, "learning_rate": 6.354716479234629e-05, "loss": 2.283300018310547, "memory(GiB)": 77.56, "step": 48160, "token_acc": 0.5410447761194029, "train_speed(iter/s)": 1.439453 }, { "epoch": 2.063536266655242, "grad_norm": 5.716576099395752, "learning_rate": 6.354068662168546e-05, "loss": 2.600575828552246, "memory(GiB)": 77.56, "step": 48165, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.439441 }, { "epoch": 2.0637504819844907, "grad_norm": 4.64011287689209, "learning_rate": 6.353420820572242e-05, "loss": 2.1770219802856445, "memory(GiB)": 77.56, "step": 48170, "token_acc": 0.5575539568345323, "train_speed(iter/s)": 1.439442 }, { "epoch": 2.0639646973137395, "grad_norm": 5.3839826583862305, "learning_rate": 6.352772954457447e-05, "loss": 2.416168975830078, "memory(GiB)": 77.56, "step": 48175, "token_acc": 0.49429657794676807, "train_speed(iter/s)": 1.439394 }, { "epoch": 2.064178912642989, "grad_norm": 6.550118923187256, "learning_rate": 6.3521250638359e-05, "loss": 2.33520393371582, "memory(GiB)": 77.56, "step": 48180, "token_acc": 0.5221843003412969, "train_speed(iter/s)": 1.439329 }, { "epoch": 2.0643931279722376, "grad_norm": 5.122165679931641, "learning_rate": 6.351477148719343e-05, "loss": 2.526004600524902, "memory(GiB)": 77.56, "step": 48185, "token_acc": 0.46647230320699706, "train_speed(iter/s)": 1.439329 }, { "epoch": 2.0646073433014864, "grad_norm": 5.075963497161865, "learning_rate": 6.350829209119509e-05, "loss": 2.615765380859375, "memory(GiB)": 77.56, "step": 48190, "token_acc": 0.44402985074626866, "train_speed(iter/s)": 1.439318 }, { "epoch": 2.0648215586307357, "grad_norm": 5.452728748321533, "learning_rate": 6.350181245048135e-05, "loss": 2.231444549560547, "memory(GiB)": 77.56, "step": 48195, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.43934 }, { "epoch": 2.0650357739599845, "grad_norm": 6.405764102935791, "learning_rate": 6.349533256516962e-05, "loss": 2.572401428222656, "memory(GiB)": 77.56, "step": 48200, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.439344 }, { "epoch": 2.0652499892892333, "grad_norm": 4.045449733734131, "learning_rate": 6.348885243537728e-05, "loss": 2.498097038269043, "memory(GiB)": 77.56, "step": 48205, "token_acc": 0.46258503401360546, "train_speed(iter/s)": 1.43935 }, { "epoch": 2.0654642046184826, "grad_norm": 5.261559963226318, "learning_rate": 6.348237206122172e-05, "loss": 2.444377899169922, "memory(GiB)": 77.56, "step": 48210, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.439381 }, { "epoch": 2.0656784199477314, "grad_norm": 5.384941577911377, "learning_rate": 6.347589144282034e-05, "loss": 2.262255096435547, "memory(GiB)": 77.56, "step": 48215, "token_acc": 0.5321428571428571, "train_speed(iter/s)": 1.439341 }, { "epoch": 2.06589263527698, "grad_norm": 7.680131435394287, "learning_rate": 6.346941058029055e-05, "loss": 2.614405059814453, "memory(GiB)": 77.56, "step": 48220, "token_acc": 0.5, "train_speed(iter/s)": 1.439336 }, { "epoch": 2.0661068506062295, "grad_norm": 4.867196559906006, "learning_rate": 6.346292947374975e-05, "loss": 2.2359460830688476, "memory(GiB)": 77.56, "step": 48225, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.439385 }, { "epoch": 2.0663210659354783, "grad_norm": 5.427309513092041, "learning_rate": 6.345644812331538e-05, "loss": 2.4799144744873045, "memory(GiB)": 77.56, "step": 48230, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.439368 }, { "epoch": 2.066535281264727, "grad_norm": 5.110914707183838, "learning_rate": 6.344996652910479e-05, "loss": 2.3211389541625977, "memory(GiB)": 77.56, "step": 48235, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.439359 }, { "epoch": 2.0667494965939763, "grad_norm": 7.088987827301025, "learning_rate": 6.344348469123545e-05, "loss": 2.7160480499267576, "memory(GiB)": 77.56, "step": 48240, "token_acc": 0.47440273037542663, "train_speed(iter/s)": 1.43938 }, { "epoch": 2.066963711923225, "grad_norm": 4.280320644378662, "learning_rate": 6.343700260982479e-05, "loss": 2.2539031982421873, "memory(GiB)": 77.56, "step": 48245, "token_acc": 0.5, "train_speed(iter/s)": 1.439394 }, { "epoch": 2.0671779272524744, "grad_norm": 4.63068962097168, "learning_rate": 6.343052028499019e-05, "loss": 2.3382373809814454, "memory(GiB)": 77.56, "step": 48250, "token_acc": 0.5347985347985348, "train_speed(iter/s)": 1.439388 }, { "epoch": 2.067392142581723, "grad_norm": 5.030412673950195, "learning_rate": 6.342403771684916e-05, "loss": 2.321604919433594, "memory(GiB)": 77.56, "step": 48255, "token_acc": 0.5037037037037037, "train_speed(iter/s)": 1.439361 }, { "epoch": 2.067606357910972, "grad_norm": 4.675687313079834, "learning_rate": 6.341755490551906e-05, "loss": 2.3080495834350585, "memory(GiB)": 77.56, "step": 48260, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.439355 }, { "epoch": 2.0678205732402213, "grad_norm": 5.396175861358643, "learning_rate": 6.341107185111737e-05, "loss": 2.2108844757080077, "memory(GiB)": 77.56, "step": 48265, "token_acc": 0.5570032573289903, "train_speed(iter/s)": 1.439359 }, { "epoch": 2.06803478856947, "grad_norm": 7.695731163024902, "learning_rate": 6.340458855376156e-05, "loss": 2.3261009216308595, "memory(GiB)": 77.56, "step": 48270, "token_acc": 0.5365853658536586, "train_speed(iter/s)": 1.439347 }, { "epoch": 2.068249003898719, "grad_norm": 5.517941474914551, "learning_rate": 6.339810501356903e-05, "loss": 2.0953414916992186, "memory(GiB)": 77.56, "step": 48275, "token_acc": 0.5743801652892562, "train_speed(iter/s)": 1.439317 }, { "epoch": 2.068463219227968, "grad_norm": 4.48099946975708, "learning_rate": 6.339162123065727e-05, "loss": 2.293553352355957, "memory(GiB)": 77.56, "step": 48280, "token_acc": 0.5328467153284672, "train_speed(iter/s)": 1.439318 }, { "epoch": 2.068677434557217, "grad_norm": 6.049262523651123, "learning_rate": 6.338513720514371e-05, "loss": 2.1654096603393556, "memory(GiB)": 77.56, "step": 48285, "token_acc": 0.5607142857142857, "train_speed(iter/s)": 1.439314 }, { "epoch": 2.068891649886466, "grad_norm": 5.270562171936035, "learning_rate": 6.337865293714584e-05, "loss": 2.644285774230957, "memory(GiB)": 77.56, "step": 48290, "token_acc": 0.45180722891566266, "train_speed(iter/s)": 1.439299 }, { "epoch": 2.069105865215715, "grad_norm": 5.95337438583374, "learning_rate": 6.337216842678114e-05, "loss": 2.506064224243164, "memory(GiB)": 77.56, "step": 48295, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.439294 }, { "epoch": 2.069320080544964, "grad_norm": 5.2512993812561035, "learning_rate": 6.336568367416706e-05, "loss": 2.4757450103759764, "memory(GiB)": 77.56, "step": 48300, "token_acc": 0.47232472324723246, "train_speed(iter/s)": 1.439302 }, { "epoch": 2.0695342958742127, "grad_norm": 6.013294219970703, "learning_rate": 6.335919867942107e-05, "loss": 2.5989797592163084, "memory(GiB)": 77.56, "step": 48305, "token_acc": 0.44765342960288806, "train_speed(iter/s)": 1.4393 }, { "epoch": 2.069748511203462, "grad_norm": 5.916513919830322, "learning_rate": 6.335271344266066e-05, "loss": 2.4197723388671877, "memory(GiB)": 77.56, "step": 48310, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.439324 }, { "epoch": 2.0699627265327107, "grad_norm": 5.032732963562012, "learning_rate": 6.334622796400333e-05, "loss": 2.382965087890625, "memory(GiB)": 77.56, "step": 48315, "token_acc": 0.484, "train_speed(iter/s)": 1.439356 }, { "epoch": 2.0701769418619596, "grad_norm": 5.509790897369385, "learning_rate": 6.333974224356656e-05, "loss": 2.4743606567382814, "memory(GiB)": 77.56, "step": 48320, "token_acc": 0.4818181818181818, "train_speed(iter/s)": 1.439346 }, { "epoch": 2.070391157191209, "grad_norm": 6.152044296264648, "learning_rate": 6.333325628146785e-05, "loss": 2.601472854614258, "memory(GiB)": 77.56, "step": 48325, "token_acc": 0.4880952380952381, "train_speed(iter/s)": 1.439381 }, { "epoch": 2.0706053725204576, "grad_norm": 5.371792316436768, "learning_rate": 6.332677007782468e-05, "loss": 2.3854007720947266, "memory(GiB)": 77.56, "step": 48330, "token_acc": 0.475, "train_speed(iter/s)": 1.439348 }, { "epoch": 2.0708195878497064, "grad_norm": 6.3165740966796875, "learning_rate": 6.33202836327546e-05, "loss": 2.7836769104003904, "memory(GiB)": 77.56, "step": 48335, "token_acc": 0.4605809128630705, "train_speed(iter/s)": 1.439319 }, { "epoch": 2.0710338031789557, "grad_norm": 5.348511695861816, "learning_rate": 6.331379694637504e-05, "loss": 2.050336456298828, "memory(GiB)": 77.56, "step": 48340, "token_acc": 0.5205479452054794, "train_speed(iter/s)": 1.439325 }, { "epoch": 2.0712480185082045, "grad_norm": 8.009041786193848, "learning_rate": 6.33073100188036e-05, "loss": 2.5793895721435547, "memory(GiB)": 77.56, "step": 48345, "token_acc": 0.43342776203966005, "train_speed(iter/s)": 1.439348 }, { "epoch": 2.0714622338374533, "grad_norm": 5.634845733642578, "learning_rate": 6.330082285015773e-05, "loss": 2.254982566833496, "memory(GiB)": 77.56, "step": 48350, "token_acc": 0.5311203319502075, "train_speed(iter/s)": 1.439348 }, { "epoch": 2.0716764491667026, "grad_norm": 5.762125015258789, "learning_rate": 6.329433544055499e-05, "loss": 2.1113458633422852, "memory(GiB)": 77.56, "step": 48355, "token_acc": 0.5, "train_speed(iter/s)": 1.439349 }, { "epoch": 2.0718906644959514, "grad_norm": 5.638726234436035, "learning_rate": 6.328784779011288e-05, "loss": 2.404056167602539, "memory(GiB)": 77.56, "step": 48360, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.43931 }, { "epoch": 2.0721048798252, "grad_norm": 4.821144104003906, "learning_rate": 6.328135989894896e-05, "loss": 2.169768524169922, "memory(GiB)": 77.56, "step": 48365, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.43929 }, { "epoch": 2.0723190951544495, "grad_norm": 5.7469916343688965, "learning_rate": 6.327487176718075e-05, "loss": 2.2687667846679687, "memory(GiB)": 77.56, "step": 48370, "token_acc": 0.5363984674329502, "train_speed(iter/s)": 1.439267 }, { "epoch": 2.0725333104836983, "grad_norm": 4.575311183929443, "learning_rate": 6.326838339492577e-05, "loss": 2.321147918701172, "memory(GiB)": 77.56, "step": 48375, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 1.439289 }, { "epoch": 2.072747525812947, "grad_norm": 6.307142734527588, "learning_rate": 6.326189478230157e-05, "loss": 2.359288787841797, "memory(GiB)": 77.56, "step": 48380, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.439281 }, { "epoch": 2.0729617411421963, "grad_norm": 4.568991661071777, "learning_rate": 6.325540592942571e-05, "loss": 2.618624687194824, "memory(GiB)": 77.56, "step": 48385, "token_acc": 0.46394984326018807, "train_speed(iter/s)": 1.439293 }, { "epoch": 2.073175956471445, "grad_norm": 5.1485724449157715, "learning_rate": 6.324891683641576e-05, "loss": 2.348969650268555, "memory(GiB)": 77.56, "step": 48390, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.439279 }, { "epoch": 2.073390171800694, "grad_norm": 6.932342052459717, "learning_rate": 6.324242750338922e-05, "loss": 2.2965444564819335, "memory(GiB)": 77.56, "step": 48395, "token_acc": 0.5264900662251656, "train_speed(iter/s)": 1.439252 }, { "epoch": 2.073604387129943, "grad_norm": 5.213193416595459, "learning_rate": 6.323593793046371e-05, "loss": 2.507697868347168, "memory(GiB)": 77.56, "step": 48400, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.439205 }, { "epoch": 2.073818602459192, "grad_norm": 6.838024616241455, "learning_rate": 6.322944811775676e-05, "loss": 2.2458683013916017, "memory(GiB)": 77.56, "step": 48405, "token_acc": 0.525, "train_speed(iter/s)": 1.439204 }, { "epoch": 2.074032817788441, "grad_norm": 6.040023326873779, "learning_rate": 6.322295806538594e-05, "loss": 2.5360385894775392, "memory(GiB)": 77.56, "step": 48410, "token_acc": 0.5103448275862069, "train_speed(iter/s)": 1.439233 }, { "epoch": 2.07424703311769, "grad_norm": 6.14741325378418, "learning_rate": 6.321646777346883e-05, "loss": 2.425777053833008, "memory(GiB)": 77.56, "step": 48415, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.439245 }, { "epoch": 2.074461248446939, "grad_norm": 6.541590213775635, "learning_rate": 6.320997724212304e-05, "loss": 2.2811193466186523, "memory(GiB)": 77.56, "step": 48420, "token_acc": 0.5290322580645161, "train_speed(iter/s)": 1.439245 }, { "epoch": 2.0746754637761877, "grad_norm": 5.597967147827148, "learning_rate": 6.320348647146608e-05, "loss": 2.3456005096435546, "memory(GiB)": 77.56, "step": 48425, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.439247 }, { "epoch": 2.074889679105437, "grad_norm": 4.073702812194824, "learning_rate": 6.319699546161559e-05, "loss": 2.480477142333984, "memory(GiB)": 77.56, "step": 48430, "token_acc": 0.4839650145772595, "train_speed(iter/s)": 1.439246 }, { "epoch": 2.075103894434686, "grad_norm": 5.629161357879639, "learning_rate": 6.319050421268915e-05, "loss": 2.342411422729492, "memory(GiB)": 77.56, "step": 48435, "token_acc": 0.5014577259475219, "train_speed(iter/s)": 1.439286 }, { "epoch": 2.0753181097639346, "grad_norm": 5.1040568351745605, "learning_rate": 6.318401272480435e-05, "loss": 2.461376190185547, "memory(GiB)": 77.56, "step": 48440, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.439307 }, { "epoch": 2.075532325093184, "grad_norm": 4.828313827514648, "learning_rate": 6.31775209980788e-05, "loss": 2.3649301528930664, "memory(GiB)": 77.56, "step": 48445, "token_acc": 0.48026315789473684, "train_speed(iter/s)": 1.439319 }, { "epoch": 2.0757465404224327, "grad_norm": 4.40201473236084, "learning_rate": 6.317102903263008e-05, "loss": 2.528664207458496, "memory(GiB)": 77.56, "step": 48450, "token_acc": 0.48517520215633425, "train_speed(iter/s)": 1.439345 }, { "epoch": 2.0759607557516815, "grad_norm": 5.237856864929199, "learning_rate": 6.316453682857582e-05, "loss": 2.1672229766845703, "memory(GiB)": 77.56, "step": 48455, "token_acc": 0.5314685314685315, "train_speed(iter/s)": 1.439361 }, { "epoch": 2.0761749710809307, "grad_norm": 5.823591709136963, "learning_rate": 6.315804438603363e-05, "loss": 2.324350357055664, "memory(GiB)": 77.56, "step": 48460, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 1.439391 }, { "epoch": 2.0763891864101796, "grad_norm": 5.161983489990234, "learning_rate": 6.315155170512111e-05, "loss": 2.345977783203125, "memory(GiB)": 77.56, "step": 48465, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.439408 }, { "epoch": 2.0766034017394284, "grad_norm": 4.3810200691223145, "learning_rate": 6.31450587859559e-05, "loss": 2.633358192443848, "memory(GiB)": 77.56, "step": 48470, "token_acc": 0.4326923076923077, "train_speed(iter/s)": 1.439439 }, { "epoch": 2.0768176170686776, "grad_norm": 4.507627964019775, "learning_rate": 6.313856562865564e-05, "loss": 2.285697174072266, "memory(GiB)": 77.56, "step": 48475, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.439398 }, { "epoch": 2.0770318323979264, "grad_norm": 5.572567939758301, "learning_rate": 6.313207223333793e-05, "loss": 2.541775703430176, "memory(GiB)": 77.56, "step": 48480, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.439397 }, { "epoch": 2.0772460477271752, "grad_norm": 6.2764997482299805, "learning_rate": 6.312557860012039e-05, "loss": 2.255630683898926, "memory(GiB)": 77.56, "step": 48485, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.439452 }, { "epoch": 2.0774602630564245, "grad_norm": 5.716599941253662, "learning_rate": 6.31190847291207e-05, "loss": 2.53277645111084, "memory(GiB)": 77.56, "step": 48490, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.43945 }, { "epoch": 2.0776744783856733, "grad_norm": 5.113265514373779, "learning_rate": 6.311259062045647e-05, "loss": 2.8372575759887697, "memory(GiB)": 77.56, "step": 48495, "token_acc": 0.4117647058823529, "train_speed(iter/s)": 1.439471 }, { "epoch": 2.077888693714922, "grad_norm": 4.587961673736572, "learning_rate": 6.310609627424537e-05, "loss": 2.313138961791992, "memory(GiB)": 77.56, "step": 48500, "token_acc": 0.47540983606557374, "train_speed(iter/s)": 1.439487 }, { "epoch": 2.077888693714922, "eval_loss": 2.0567266941070557, "eval_runtime": 14.6253, "eval_samples_per_second": 6.837, "eval_steps_per_second": 6.837, "eval_token_acc": 0.4861294583883752, "step": 48500 }, { "epoch": 2.0781029090441714, "grad_norm": 5.349452495574951, "learning_rate": 6.309960169060504e-05, "loss": 2.59619255065918, "memory(GiB)": 77.56, "step": 48505, "token_acc": 0.4685185185185185, "train_speed(iter/s)": 1.438787 }, { "epoch": 2.07831712437342, "grad_norm": 3.8574986457824707, "learning_rate": 6.309310686965313e-05, "loss": 2.4214851379394533, "memory(GiB)": 77.56, "step": 48510, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.438761 }, { "epoch": 2.078531339702669, "grad_norm": 4.647470474243164, "learning_rate": 6.308661181150733e-05, "loss": 2.2915735244750977, "memory(GiB)": 77.56, "step": 48515, "token_acc": 0.5, "train_speed(iter/s)": 1.438761 }, { "epoch": 2.0787455550319183, "grad_norm": 5.418434143066406, "learning_rate": 6.308011651628525e-05, "loss": 2.5039791107177733, "memory(GiB)": 77.56, "step": 48520, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.438792 }, { "epoch": 2.078959770361167, "grad_norm": 5.092519760131836, "learning_rate": 6.30736209841046e-05, "loss": 2.503418731689453, "memory(GiB)": 77.56, "step": 48525, "token_acc": 0.4766355140186916, "train_speed(iter/s)": 1.438809 }, { "epoch": 2.079173985690416, "grad_norm": 4.155118465423584, "learning_rate": 6.306712521508306e-05, "loss": 2.1027830123901365, "memory(GiB)": 77.56, "step": 48530, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.438828 }, { "epoch": 2.079388201019665, "grad_norm": 6.210097789764404, "learning_rate": 6.306062920933829e-05, "loss": 2.4000572204589843, "memory(GiB)": 77.56, "step": 48535, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.438836 }, { "epoch": 2.079602416348914, "grad_norm": 5.417062759399414, "learning_rate": 6.305413296698795e-05, "loss": 2.4895183563232424, "memory(GiB)": 77.56, "step": 48540, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.438841 }, { "epoch": 2.0798166316781628, "grad_norm": 5.2669196128845215, "learning_rate": 6.304763648814974e-05, "loss": 2.216512680053711, "memory(GiB)": 77.56, "step": 48545, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.438851 }, { "epoch": 2.080030847007412, "grad_norm": 5.809610843658447, "learning_rate": 6.304113977294137e-05, "loss": 2.299217414855957, "memory(GiB)": 77.56, "step": 48550, "token_acc": 0.4423963133640553, "train_speed(iter/s)": 1.438865 }, { "epoch": 2.080245062336661, "grad_norm": 8.177706718444824, "learning_rate": 6.30346428214805e-05, "loss": 2.2650827407836913, "memory(GiB)": 77.56, "step": 48555, "token_acc": 0.5193798449612403, "train_speed(iter/s)": 1.438903 }, { "epoch": 2.0804592776659097, "grad_norm": 5.567712783813477, "learning_rate": 6.302814563388487e-05, "loss": 2.455456352233887, "memory(GiB)": 77.56, "step": 48560, "token_acc": 0.4539877300613497, "train_speed(iter/s)": 1.438902 }, { "epoch": 2.080673492995159, "grad_norm": 4.803197860717773, "learning_rate": 6.302164821027217e-05, "loss": 2.2091701507568358, "memory(GiB)": 77.56, "step": 48565, "token_acc": 0.5341880341880342, "train_speed(iter/s)": 1.438889 }, { "epoch": 2.0808877083244077, "grad_norm": 6.3997368812561035, "learning_rate": 6.301515055076007e-05, "loss": 2.2463455200195312, "memory(GiB)": 77.56, "step": 48570, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.438931 }, { "epoch": 2.0811019236536565, "grad_norm": 5.555520534515381, "learning_rate": 6.300865265546632e-05, "loss": 2.7440542221069335, "memory(GiB)": 77.56, "step": 48575, "token_acc": 0.43131868131868134, "train_speed(iter/s)": 1.438944 }, { "epoch": 2.081316138982906, "grad_norm": 5.450163841247559, "learning_rate": 6.300215452450862e-05, "loss": 2.409371566772461, "memory(GiB)": 77.56, "step": 48580, "token_acc": 0.49514563106796117, "train_speed(iter/s)": 1.438966 }, { "epoch": 2.0815303543121546, "grad_norm": 5.4445672035217285, "learning_rate": 6.29956561580047e-05, "loss": 2.444070053100586, "memory(GiB)": 77.56, "step": 48585, "token_acc": 0.48328267477203646, "train_speed(iter/s)": 1.438989 }, { "epoch": 2.0817445696414034, "grad_norm": 4.557222843170166, "learning_rate": 6.298915755607228e-05, "loss": 2.245400238037109, "memory(GiB)": 77.56, "step": 48590, "token_acc": 0.5426621160409556, "train_speed(iter/s)": 1.438981 }, { "epoch": 2.0819587849706527, "grad_norm": 4.618313312530518, "learning_rate": 6.298265871882908e-05, "loss": 2.2743204116821287, "memory(GiB)": 77.56, "step": 48595, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.438986 }, { "epoch": 2.0821730002999015, "grad_norm": 4.655125617980957, "learning_rate": 6.297615964639283e-05, "loss": 2.1316608428955077, "memory(GiB)": 77.56, "step": 48600, "token_acc": 0.5641025641025641, "train_speed(iter/s)": 1.438966 }, { "epoch": 2.0823872156291503, "grad_norm": 6.335999011993408, "learning_rate": 6.29696603388813e-05, "loss": 2.473579216003418, "memory(GiB)": 77.56, "step": 48605, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.438968 }, { "epoch": 2.0826014309583996, "grad_norm": 7.8644514083862305, "learning_rate": 6.296316079641218e-05, "loss": 2.8764577865600587, "memory(GiB)": 77.56, "step": 48610, "token_acc": 0.4422535211267606, "train_speed(iter/s)": 1.438956 }, { "epoch": 2.0828156462876484, "grad_norm": 6.0232462882995605, "learning_rate": 6.295666101910325e-05, "loss": 2.3139520645141602, "memory(GiB)": 77.56, "step": 48615, "token_acc": 0.48638132295719844, "train_speed(iter/s)": 1.438943 }, { "epoch": 2.083029861616897, "grad_norm": 8.493483543395996, "learning_rate": 6.295016100707226e-05, "loss": 2.5897979736328125, "memory(GiB)": 77.56, "step": 48620, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.438938 }, { "epoch": 2.0832440769461464, "grad_norm": 5.3446269035339355, "learning_rate": 6.294366076043695e-05, "loss": 2.4428714752197265, "memory(GiB)": 77.56, "step": 48625, "token_acc": 0.4692982456140351, "train_speed(iter/s)": 1.438893 }, { "epoch": 2.0834582922753953, "grad_norm": 6.484088897705078, "learning_rate": 6.293716027931507e-05, "loss": 2.476865768432617, "memory(GiB)": 77.56, "step": 48630, "token_acc": 0.45936395759717313, "train_speed(iter/s)": 1.438919 }, { "epoch": 2.083672507604644, "grad_norm": 5.911398410797119, "learning_rate": 6.293065956382442e-05, "loss": 2.558656692504883, "memory(GiB)": 77.56, "step": 48635, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.438903 }, { "epoch": 2.0838867229338933, "grad_norm": 4.09732723236084, "learning_rate": 6.292415861408273e-05, "loss": 2.475154685974121, "memory(GiB)": 77.56, "step": 48640, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.43889 }, { "epoch": 2.084100938263142, "grad_norm": 4.679836750030518, "learning_rate": 6.291765743020779e-05, "loss": 2.631758689880371, "memory(GiB)": 77.56, "step": 48645, "token_acc": 0.48562300319488816, "train_speed(iter/s)": 1.438941 }, { "epoch": 2.084315153592391, "grad_norm": 4.731095314025879, "learning_rate": 6.291115601231736e-05, "loss": 2.4554443359375, "memory(GiB)": 77.56, "step": 48650, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.438936 }, { "epoch": 2.08452936892164, "grad_norm": 9.38693618774414, "learning_rate": 6.290465436052921e-05, "loss": 2.5012958526611326, "memory(GiB)": 77.56, "step": 48655, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.438957 }, { "epoch": 2.084743584250889, "grad_norm": 6.419809818267822, "learning_rate": 6.289815247496117e-05, "loss": 2.483686828613281, "memory(GiB)": 77.56, "step": 48660, "token_acc": 0.5, "train_speed(iter/s)": 1.438948 }, { "epoch": 2.084957799580138, "grad_norm": 5.986326694488525, "learning_rate": 6.289165035573098e-05, "loss": 2.166529655456543, "memory(GiB)": 77.56, "step": 48665, "token_acc": 0.5465587044534413, "train_speed(iter/s)": 1.438939 }, { "epoch": 2.085172014909387, "grad_norm": 4.805379867553711, "learning_rate": 6.288514800295647e-05, "loss": 2.393752670288086, "memory(GiB)": 77.56, "step": 48670, "token_acc": 0.47307692307692306, "train_speed(iter/s)": 1.438961 }, { "epoch": 2.085386230238636, "grad_norm": 4.750682353973389, "learning_rate": 6.287864541675542e-05, "loss": 2.4145248413085936, "memory(GiB)": 77.56, "step": 48675, "token_acc": 0.519298245614035, "train_speed(iter/s)": 1.439011 }, { "epoch": 2.0856004455678847, "grad_norm": 5.7357707023620605, "learning_rate": 6.287214259724559e-05, "loss": 2.3581592559814455, "memory(GiB)": 77.56, "step": 48680, "token_acc": 0.5196850393700787, "train_speed(iter/s)": 1.439039 }, { "epoch": 2.085814660897134, "grad_norm": 5.493290424346924, "learning_rate": 6.286563954454485e-05, "loss": 2.458401679992676, "memory(GiB)": 77.56, "step": 48685, "token_acc": 0.4749034749034749, "train_speed(iter/s)": 1.439082 }, { "epoch": 2.0860288762263828, "grad_norm": 7.9068732261657715, "learning_rate": 6.285913625877097e-05, "loss": 2.5358303070068358, "memory(GiB)": 77.56, "step": 48690, "token_acc": 0.5, "train_speed(iter/s)": 1.439099 }, { "epoch": 2.0862430915556316, "grad_norm": 4.675535678863525, "learning_rate": 6.285263274004179e-05, "loss": 2.4378604888916016, "memory(GiB)": 77.56, "step": 48695, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.439113 }, { "epoch": 2.086457306884881, "grad_norm": 4.346827983856201, "learning_rate": 6.284612898847508e-05, "loss": 2.3036895751953126, "memory(GiB)": 77.56, "step": 48700, "token_acc": 0.5076335877862596, "train_speed(iter/s)": 1.439104 }, { "epoch": 2.0866715222141297, "grad_norm": 6.296469211578369, "learning_rate": 6.283962500418872e-05, "loss": 2.703408050537109, "memory(GiB)": 77.56, "step": 48705, "token_acc": 0.47384615384615386, "train_speed(iter/s)": 1.439131 }, { "epoch": 2.0868857375433785, "grad_norm": 5.321903228759766, "learning_rate": 6.28331207873005e-05, "loss": 2.334235954284668, "memory(GiB)": 77.56, "step": 48710, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.439125 }, { "epoch": 2.0870999528726277, "grad_norm": 5.126958847045898, "learning_rate": 6.282661633792826e-05, "loss": 2.7237152099609374, "memory(GiB)": 77.56, "step": 48715, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.439151 }, { "epoch": 2.0873141682018765, "grad_norm": 4.70961332321167, "learning_rate": 6.282011165618984e-05, "loss": 2.456793785095215, "memory(GiB)": 77.56, "step": 48720, "token_acc": 0.4692737430167598, "train_speed(iter/s)": 1.439172 }, { "epoch": 2.0875283835311254, "grad_norm": 5.794946670532227, "learning_rate": 6.281360674220305e-05, "loss": 2.369644355773926, "memory(GiB)": 77.56, "step": 48725, "token_acc": 0.5015479876160991, "train_speed(iter/s)": 1.439184 }, { "epoch": 2.0877425988603746, "grad_norm": 4.590949535369873, "learning_rate": 6.280710159608578e-05, "loss": 2.6604488372802733, "memory(GiB)": 77.56, "step": 48730, "token_acc": 0.47876447876447875, "train_speed(iter/s)": 1.439186 }, { "epoch": 2.0879568141896234, "grad_norm": 4.3121018409729, "learning_rate": 6.280059621795583e-05, "loss": 2.250263786315918, "memory(GiB)": 77.56, "step": 48735, "token_acc": 0.513595166163142, "train_speed(iter/s)": 1.43914 }, { "epoch": 2.0881710295188722, "grad_norm": 5.6718902587890625, "learning_rate": 6.279409060793109e-05, "loss": 2.4860706329345703, "memory(GiB)": 77.56, "step": 48740, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.439169 }, { "epoch": 2.0883852448481215, "grad_norm": 4.941648006439209, "learning_rate": 6.278758476612937e-05, "loss": 2.503778266906738, "memory(GiB)": 77.56, "step": 48745, "token_acc": 0.47157190635451507, "train_speed(iter/s)": 1.439161 }, { "epoch": 2.0885994601773703, "grad_norm": 5.030585765838623, "learning_rate": 6.278107869266859e-05, "loss": 2.29956169128418, "memory(GiB)": 77.56, "step": 48750, "token_acc": 0.5313432835820896, "train_speed(iter/s)": 1.439115 }, { "epoch": 2.088813675506619, "grad_norm": 5.252609729766846, "learning_rate": 6.277457238766655e-05, "loss": 2.123383331298828, "memory(GiB)": 77.56, "step": 48755, "token_acc": 0.5394190871369294, "train_speed(iter/s)": 1.439125 }, { "epoch": 2.0890278908358684, "grad_norm": 5.799145221710205, "learning_rate": 6.276806585124116e-05, "loss": 2.483392906188965, "memory(GiB)": 77.56, "step": 48760, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 1.43912 }, { "epoch": 2.089242106165117, "grad_norm": 4.487954139709473, "learning_rate": 6.276155908351027e-05, "loss": 2.4665964126586912, "memory(GiB)": 77.56, "step": 48765, "token_acc": 0.49174917491749176, "train_speed(iter/s)": 1.439105 }, { "epoch": 2.089456321494366, "grad_norm": 5.483849048614502, "learning_rate": 6.275505208459178e-05, "loss": 2.424612808227539, "memory(GiB)": 77.56, "step": 48770, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.439124 }, { "epoch": 2.0896705368236153, "grad_norm": 6.1339030265808105, "learning_rate": 6.274854485460355e-05, "loss": 2.3976306915283203, "memory(GiB)": 77.56, "step": 48775, "token_acc": 0.5112540192926045, "train_speed(iter/s)": 1.439125 }, { "epoch": 2.089884752152864, "grad_norm": 4.765876770019531, "learning_rate": 6.274203739366347e-05, "loss": 2.3025442123413087, "memory(GiB)": 77.56, "step": 48780, "token_acc": 0.5076335877862596, "train_speed(iter/s)": 1.439123 }, { "epoch": 2.090098967482113, "grad_norm": 6.810534954071045, "learning_rate": 6.273552970188942e-05, "loss": 2.316607666015625, "memory(GiB)": 77.56, "step": 48785, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.439161 }, { "epoch": 2.090313182811362, "grad_norm": 5.791784763336182, "learning_rate": 6.272902177939933e-05, "loss": 2.345306396484375, "memory(GiB)": 77.56, "step": 48790, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.439161 }, { "epoch": 2.090527398140611, "grad_norm": 4.690967082977295, "learning_rate": 6.272251362631107e-05, "loss": 2.3964767456054688, "memory(GiB)": 77.56, "step": 48795, "token_acc": 0.5, "train_speed(iter/s)": 1.439175 }, { "epoch": 2.0907416134698598, "grad_norm": 5.468694686889648, "learning_rate": 6.27160052427425e-05, "loss": 2.508285140991211, "memory(GiB)": 77.56, "step": 48800, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.439164 }, { "epoch": 2.090955828799109, "grad_norm": 4.597681045532227, "learning_rate": 6.27094966288116e-05, "loss": 2.254531478881836, "memory(GiB)": 77.56, "step": 48805, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.439188 }, { "epoch": 2.091170044128358, "grad_norm": 5.0382537841796875, "learning_rate": 6.270298778463624e-05, "loss": 2.6489679336547853, "memory(GiB)": 77.56, "step": 48810, "token_acc": 0.5021097046413502, "train_speed(iter/s)": 1.439214 }, { "epoch": 2.0913842594576066, "grad_norm": 5.8131489753723145, "learning_rate": 6.269647871033432e-05, "loss": 2.5048757553100587, "memory(GiB)": 77.56, "step": 48815, "token_acc": 0.4769736842105263, "train_speed(iter/s)": 1.439188 }, { "epoch": 2.091598474786856, "grad_norm": 4.653921127319336, "learning_rate": 6.26899694060238e-05, "loss": 2.340727615356445, "memory(GiB)": 77.56, "step": 48820, "token_acc": 0.47604790419161674, "train_speed(iter/s)": 1.439173 }, { "epoch": 2.0918126901161047, "grad_norm": 4.954652309417725, "learning_rate": 6.26834598718226e-05, "loss": 2.1758090972900392, "memory(GiB)": 77.56, "step": 48825, "token_acc": 0.5229357798165137, "train_speed(iter/s)": 1.439192 }, { "epoch": 2.0920269054453535, "grad_norm": 6.207099437713623, "learning_rate": 6.267695010784859e-05, "loss": 2.5750011444091796, "memory(GiB)": 77.56, "step": 48830, "token_acc": 0.4479166666666667, "train_speed(iter/s)": 1.439211 }, { "epoch": 2.092241120774603, "grad_norm": 5.960329055786133, "learning_rate": 6.267044011421975e-05, "loss": 2.672381782531738, "memory(GiB)": 77.56, "step": 48835, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.43922 }, { "epoch": 2.0924553361038516, "grad_norm": 4.969362258911133, "learning_rate": 6.2663929891054e-05, "loss": 2.771834945678711, "memory(GiB)": 77.56, "step": 48840, "token_acc": 0.4461538461538462, "train_speed(iter/s)": 1.439231 }, { "epoch": 2.0926695514331004, "grad_norm": 5.441162586212158, "learning_rate": 6.265741943846926e-05, "loss": 2.423880958557129, "memory(GiB)": 77.56, "step": 48845, "token_acc": 0.5241379310344828, "train_speed(iter/s)": 1.439245 }, { "epoch": 2.0928837667623497, "grad_norm": 8.320979118347168, "learning_rate": 6.265090875658353e-05, "loss": 2.3598764419555662, "memory(GiB)": 77.56, "step": 48850, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.439272 }, { "epoch": 2.0930979820915985, "grad_norm": 5.704608917236328, "learning_rate": 6.264439784551472e-05, "loss": 2.1590160369873046, "memory(GiB)": 77.56, "step": 48855, "token_acc": 0.5323741007194245, "train_speed(iter/s)": 1.439272 }, { "epoch": 2.0933121974208473, "grad_norm": 5.506229877471924, "learning_rate": 6.263788670538075e-05, "loss": 2.025997352600098, "memory(GiB)": 77.56, "step": 48860, "token_acc": 0.6032388663967612, "train_speed(iter/s)": 1.4393 }, { "epoch": 2.0935264127500965, "grad_norm": 4.564820289611816, "learning_rate": 6.263137533629963e-05, "loss": 2.3568538665771483, "memory(GiB)": 77.56, "step": 48865, "token_acc": 0.5412541254125413, "train_speed(iter/s)": 1.439319 }, { "epoch": 2.0937406280793454, "grad_norm": 5.8116960525512695, "learning_rate": 6.262486373838929e-05, "loss": 2.299452781677246, "memory(GiB)": 77.56, "step": 48870, "token_acc": 0.5401459854014599, "train_speed(iter/s)": 1.439344 }, { "epoch": 2.093954843408594, "grad_norm": 3.955968141555786, "learning_rate": 6.261835191176769e-05, "loss": 2.393795204162598, "memory(GiB)": 77.56, "step": 48875, "token_acc": 0.5, "train_speed(iter/s)": 1.43937 }, { "epoch": 2.0941690587378434, "grad_norm": 5.866372585296631, "learning_rate": 6.261183985655281e-05, "loss": 2.3203685760498045, "memory(GiB)": 77.56, "step": 48880, "token_acc": 0.5372549019607843, "train_speed(iter/s)": 1.439379 }, { "epoch": 2.0943832740670922, "grad_norm": 4.214962959289551, "learning_rate": 6.260532757286264e-05, "loss": 2.567597579956055, "memory(GiB)": 77.56, "step": 48885, "token_acc": 0.4525993883792049, "train_speed(iter/s)": 1.439383 }, { "epoch": 2.094597489396341, "grad_norm": 4.494652271270752, "learning_rate": 6.259881506081512e-05, "loss": 2.3942338943481447, "memory(GiB)": 77.56, "step": 48890, "token_acc": 0.46440677966101696, "train_speed(iter/s)": 1.439355 }, { "epoch": 2.0948117047255903, "grad_norm": 6.093865871429443, "learning_rate": 6.259230232052826e-05, "loss": 2.6650035858154295, "memory(GiB)": 77.56, "step": 48895, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.439344 }, { "epoch": 2.095025920054839, "grad_norm": 5.515293121337891, "learning_rate": 6.258578935212e-05, "loss": 2.2267967224121095, "memory(GiB)": 77.56, "step": 48900, "token_acc": 0.5414201183431953, "train_speed(iter/s)": 1.439379 }, { "epoch": 2.095240135384088, "grad_norm": 5.29227876663208, "learning_rate": 6.257927615570839e-05, "loss": 2.3304256439208983, "memory(GiB)": 77.56, "step": 48905, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.43939 }, { "epoch": 2.095454350713337, "grad_norm": 6.785962104797363, "learning_rate": 6.257276273141139e-05, "loss": 2.422073173522949, "memory(GiB)": 77.56, "step": 48910, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.439353 }, { "epoch": 2.095668566042586, "grad_norm": 4.055444240570068, "learning_rate": 6.256624907934699e-05, "loss": 2.2771997451782227, "memory(GiB)": 77.56, "step": 48915, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.439364 }, { "epoch": 2.095882781371835, "grad_norm": 5.131942272186279, "learning_rate": 6.25597351996332e-05, "loss": 2.3919715881347656, "memory(GiB)": 77.56, "step": 48920, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.439383 }, { "epoch": 2.096096996701084, "grad_norm": 7.323632717132568, "learning_rate": 6.255322109238803e-05, "loss": 2.4990726470947267, "memory(GiB)": 77.56, "step": 48925, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.439387 }, { "epoch": 2.096311212030333, "grad_norm": 5.591567516326904, "learning_rate": 6.254670675772947e-05, "loss": 2.7983993530273437, "memory(GiB)": 77.56, "step": 48930, "token_acc": 0.4186746987951807, "train_speed(iter/s)": 1.439409 }, { "epoch": 2.0965254273595817, "grad_norm": 4.745422840118408, "learning_rate": 6.254019219577556e-05, "loss": 2.853195381164551, "memory(GiB)": 77.56, "step": 48935, "token_acc": 0.4680232558139535, "train_speed(iter/s)": 1.439413 }, { "epoch": 2.096739642688831, "grad_norm": 4.464240074157715, "learning_rate": 6.253367740664431e-05, "loss": 2.533761405944824, "memory(GiB)": 77.56, "step": 48940, "token_acc": 0.501628664495114, "train_speed(iter/s)": 1.439435 }, { "epoch": 2.0969538580180798, "grad_norm": 7.061866760253906, "learning_rate": 6.252716239045372e-05, "loss": 2.2278575897216797, "memory(GiB)": 77.56, "step": 48945, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.439458 }, { "epoch": 2.0971680733473286, "grad_norm": 4.807161331176758, "learning_rate": 6.252064714732185e-05, "loss": 2.4195178985595702, "memory(GiB)": 77.56, "step": 48950, "token_acc": 0.5031847133757962, "train_speed(iter/s)": 1.439442 }, { "epoch": 2.097382288676578, "grad_norm": 7.726494789123535, "learning_rate": 6.25141316773667e-05, "loss": 2.4144935607910156, "memory(GiB)": 77.56, "step": 48955, "token_acc": 0.486646884272997, "train_speed(iter/s)": 1.439454 }, { "epoch": 2.0975965040058266, "grad_norm": 5.9279465675354, "learning_rate": 6.250761598070632e-05, "loss": 2.538133239746094, "memory(GiB)": 77.56, "step": 48960, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.439462 }, { "epoch": 2.0978107193350755, "grad_norm": 5.672407627105713, "learning_rate": 6.250110005745874e-05, "loss": 2.5083023071289063, "memory(GiB)": 77.56, "step": 48965, "token_acc": 0.49612403100775193, "train_speed(iter/s)": 1.439445 }, { "epoch": 2.0980249346643247, "grad_norm": 5.007399082183838, "learning_rate": 6.249458390774201e-05, "loss": 2.5312580108642577, "memory(GiB)": 77.56, "step": 48970, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.439436 }, { "epoch": 2.0982391499935735, "grad_norm": 8.42153263092041, "learning_rate": 6.248806753167417e-05, "loss": 2.308869743347168, "memory(GiB)": 77.56, "step": 48975, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.439417 }, { "epoch": 2.0984533653228223, "grad_norm": 4.837903022766113, "learning_rate": 6.248155092937326e-05, "loss": 2.3988525390625, "memory(GiB)": 77.56, "step": 48980, "token_acc": 0.4981132075471698, "train_speed(iter/s)": 1.439427 }, { "epoch": 2.0986675806520716, "grad_norm": 4.984959602355957, "learning_rate": 6.247503410095737e-05, "loss": 2.5160776138305665, "memory(GiB)": 77.56, "step": 48985, "token_acc": 0.45652173913043476, "train_speed(iter/s)": 1.439413 }, { "epoch": 2.0988817959813204, "grad_norm": 5.135047435760498, "learning_rate": 6.246851704654451e-05, "loss": 2.214380645751953, "memory(GiB)": 77.56, "step": 48990, "token_acc": 0.5655172413793104, "train_speed(iter/s)": 1.439426 }, { "epoch": 2.099096011310569, "grad_norm": 6.444039821624756, "learning_rate": 6.246199976625277e-05, "loss": 2.469023323059082, "memory(GiB)": 77.56, "step": 48995, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.439444 }, { "epoch": 2.0993102266398185, "grad_norm": 4.798272132873535, "learning_rate": 6.245548226020024e-05, "loss": 2.409999465942383, "memory(GiB)": 77.56, "step": 49000, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.439477 }, { "epoch": 2.0993102266398185, "eval_loss": 2.222402811050415, "eval_runtime": 13.8381, "eval_samples_per_second": 7.226, "eval_steps_per_second": 7.226, "eval_token_acc": 0.461212976022567, "step": 49000 }, { "epoch": 2.0995244419690673, "grad_norm": 6.333439826965332, "learning_rate": 6.244896452850494e-05, "loss": 2.6709468841552733, "memory(GiB)": 77.56, "step": 49005, "token_acc": 0.456, "train_speed(iter/s)": 1.438812 }, { "epoch": 2.099738657298316, "grad_norm": 4.671489715576172, "learning_rate": 6.244244657128498e-05, "loss": 2.3175464630126954, "memory(GiB)": 77.56, "step": 49010, "token_acc": 0.5355648535564853, "train_speed(iter/s)": 1.438861 }, { "epoch": 2.0999528726275654, "grad_norm": 5.851550579071045, "learning_rate": 6.243592838865842e-05, "loss": 2.3232490539550783, "memory(GiB)": 77.56, "step": 49015, "token_acc": 0.504, "train_speed(iter/s)": 1.438884 }, { "epoch": 2.100167087956814, "grad_norm": 6.119845390319824, "learning_rate": 6.242940998074336e-05, "loss": 2.5297639846801756, "memory(GiB)": 77.56, "step": 49020, "token_acc": 0.46441947565543074, "train_speed(iter/s)": 1.438855 }, { "epoch": 2.100381303286063, "grad_norm": 5.294820785522461, "learning_rate": 6.242289134765788e-05, "loss": 2.530585289001465, "memory(GiB)": 77.56, "step": 49025, "token_acc": 0.49382716049382713, "train_speed(iter/s)": 1.438855 }, { "epoch": 2.1005955186153122, "grad_norm": 5.15852689743042, "learning_rate": 6.241637248952006e-05, "loss": 2.457562255859375, "memory(GiB)": 77.56, "step": 49030, "token_acc": 0.47865853658536583, "train_speed(iter/s)": 1.438875 }, { "epoch": 2.100809733944561, "grad_norm": 6.752788543701172, "learning_rate": 6.2409853406448e-05, "loss": 2.4850791931152343, "memory(GiB)": 77.56, "step": 49035, "token_acc": 0.45394736842105265, "train_speed(iter/s)": 1.438904 }, { "epoch": 2.10102394927381, "grad_norm": 3.656010389328003, "learning_rate": 6.240333409855983e-05, "loss": 2.648000717163086, "memory(GiB)": 77.56, "step": 49040, "token_acc": 0.4653179190751445, "train_speed(iter/s)": 1.438951 }, { "epoch": 2.101238164603059, "grad_norm": 5.753542423248291, "learning_rate": 6.239681456597361e-05, "loss": 1.9628711700439454, "memory(GiB)": 77.56, "step": 49045, "token_acc": 0.5206611570247934, "train_speed(iter/s)": 1.438956 }, { "epoch": 2.101452379932308, "grad_norm": 4.903069019317627, "learning_rate": 6.239029480880747e-05, "loss": 2.4458799362182617, "memory(GiB)": 77.56, "step": 49050, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.438933 }, { "epoch": 2.1016665952615567, "grad_norm": 4.62379789352417, "learning_rate": 6.238377482717951e-05, "loss": 2.2904151916503905, "memory(GiB)": 77.56, "step": 49055, "token_acc": 0.5353159851301115, "train_speed(iter/s)": 1.43897 }, { "epoch": 2.101880810590806, "grad_norm": 4.9703049659729, "learning_rate": 6.237725462120784e-05, "loss": 2.2056434631347654, "memory(GiB)": 77.56, "step": 49060, "token_acc": 0.5387453874538746, "train_speed(iter/s)": 1.438942 }, { "epoch": 2.102095025920055, "grad_norm": 5.410557746887207, "learning_rate": 6.237073419101061e-05, "loss": 2.29724063873291, "memory(GiB)": 77.56, "step": 49065, "token_acc": 0.5407166123778502, "train_speed(iter/s)": 1.438857 }, { "epoch": 2.1023092412493036, "grad_norm": 4.390076637268066, "learning_rate": 6.236421353670592e-05, "loss": 2.665765380859375, "memory(GiB)": 77.56, "step": 49070, "token_acc": 0.46835443037974683, "train_speed(iter/s)": 1.43883 }, { "epoch": 2.102523456578553, "grad_norm": 5.287171840667725, "learning_rate": 6.235769265841191e-05, "loss": 2.5007225036621095, "memory(GiB)": 77.56, "step": 49075, "token_acc": 0.45569620253164556, "train_speed(iter/s)": 1.43886 }, { "epoch": 2.1027376719078017, "grad_norm": 6.390717029571533, "learning_rate": 6.235117155624671e-05, "loss": 2.561604690551758, "memory(GiB)": 77.56, "step": 49080, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.438871 }, { "epoch": 2.1029518872370505, "grad_norm": 6.165492534637451, "learning_rate": 6.234465023032844e-05, "loss": 2.133884620666504, "memory(GiB)": 77.56, "step": 49085, "token_acc": 0.5323741007194245, "train_speed(iter/s)": 1.43889 }, { "epoch": 2.1031661025662998, "grad_norm": 4.798530578613281, "learning_rate": 6.233812868077525e-05, "loss": 2.6677425384521483, "memory(GiB)": 77.56, "step": 49090, "token_acc": 0.47381546134663344, "train_speed(iter/s)": 1.438923 }, { "epoch": 2.1033803178955486, "grad_norm": 5.187280654907227, "learning_rate": 6.233160690770528e-05, "loss": 2.5310846328735352, "memory(GiB)": 77.56, "step": 49095, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.438904 }, { "epoch": 2.1035945332247974, "grad_norm": 5.205384731292725, "learning_rate": 6.23250849112367e-05, "loss": 2.503949737548828, "memory(GiB)": 77.56, "step": 49100, "token_acc": 0.4670487106017192, "train_speed(iter/s)": 1.43889 }, { "epoch": 2.1038087485540466, "grad_norm": 3.6947855949401855, "learning_rate": 6.231856269148762e-05, "loss": 2.215028762817383, "memory(GiB)": 77.56, "step": 49105, "token_acc": 0.5046153846153846, "train_speed(iter/s)": 1.438888 }, { "epoch": 2.1040229638832955, "grad_norm": 5.002501010894775, "learning_rate": 6.231204024857624e-05, "loss": 2.3441003799438476, "memory(GiB)": 77.56, "step": 49110, "token_acc": 0.5268817204301075, "train_speed(iter/s)": 1.438917 }, { "epoch": 2.1042371792125443, "grad_norm": 5.260482311248779, "learning_rate": 6.23055175826207e-05, "loss": 2.4012269973754883, "memory(GiB)": 77.56, "step": 49115, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.438922 }, { "epoch": 2.1044513945417935, "grad_norm": 7.0244221687316895, "learning_rate": 6.229899469373917e-05, "loss": 2.4538755416870117, "memory(GiB)": 77.56, "step": 49120, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.438931 }, { "epoch": 2.1046656098710423, "grad_norm": 5.103029727935791, "learning_rate": 6.229247158204981e-05, "loss": 2.3814722061157227, "memory(GiB)": 77.56, "step": 49125, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.43891 }, { "epoch": 2.104879825200291, "grad_norm": 5.0265703201293945, "learning_rate": 6.228594824767078e-05, "loss": 2.51201171875, "memory(GiB)": 77.56, "step": 49130, "token_acc": 0.46616541353383456, "train_speed(iter/s)": 1.438912 }, { "epoch": 2.1050940405295404, "grad_norm": 6.083210468292236, "learning_rate": 6.227942469072027e-05, "loss": 2.340061378479004, "memory(GiB)": 77.56, "step": 49135, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.438914 }, { "epoch": 2.1053082558587892, "grad_norm": 5.102031707763672, "learning_rate": 6.227290091131648e-05, "loss": 2.9406700134277344, "memory(GiB)": 77.56, "step": 49140, "token_acc": 0.4141791044776119, "train_speed(iter/s)": 1.438934 }, { "epoch": 2.105522471188038, "grad_norm": 5.642523765563965, "learning_rate": 6.22663769095776e-05, "loss": 2.439640426635742, "memory(GiB)": 77.56, "step": 49145, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.438937 }, { "epoch": 2.1057366865172873, "grad_norm": 5.119693756103516, "learning_rate": 6.225985268562175e-05, "loss": 2.68221492767334, "memory(GiB)": 77.56, "step": 49150, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.438952 }, { "epoch": 2.105950901846536, "grad_norm": 6.095335483551025, "learning_rate": 6.22533282395672e-05, "loss": 2.130111312866211, "memory(GiB)": 77.56, "step": 49155, "token_acc": 0.5241157556270096, "train_speed(iter/s)": 1.438974 }, { "epoch": 2.106165117175785, "grad_norm": 6.140216827392578, "learning_rate": 6.22468035715321e-05, "loss": 2.455864906311035, "memory(GiB)": 77.56, "step": 49160, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.438923 }, { "epoch": 2.106379332505034, "grad_norm": 5.604785919189453, "learning_rate": 6.224027868163467e-05, "loss": 2.2982151031494142, "memory(GiB)": 77.56, "step": 49165, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.438932 }, { "epoch": 2.106593547834283, "grad_norm": 5.83743953704834, "learning_rate": 6.223375356999311e-05, "loss": 2.18133544921875, "memory(GiB)": 77.56, "step": 49170, "token_acc": 0.5410447761194029, "train_speed(iter/s)": 1.438963 }, { "epoch": 2.106807763163532, "grad_norm": 3.9852590560913086, "learning_rate": 6.222722823672562e-05, "loss": 2.4130273818969727, "memory(GiB)": 77.56, "step": 49175, "token_acc": 0.47161572052401746, "train_speed(iter/s)": 1.438952 }, { "epoch": 2.107021978492781, "grad_norm": 5.204185485839844, "learning_rate": 6.222070268195041e-05, "loss": 2.62127799987793, "memory(GiB)": 77.56, "step": 49180, "token_acc": 0.48559670781893005, "train_speed(iter/s)": 1.438993 }, { "epoch": 2.10723619382203, "grad_norm": 5.959209442138672, "learning_rate": 6.221417690578574e-05, "loss": 2.514321517944336, "memory(GiB)": 77.56, "step": 49185, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.439038 }, { "epoch": 2.1074504091512787, "grad_norm": 5.792915344238281, "learning_rate": 6.220765090834977e-05, "loss": 2.6759796142578125, "memory(GiB)": 77.56, "step": 49190, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.439096 }, { "epoch": 2.107664624480528, "grad_norm": 6.1992573738098145, "learning_rate": 6.220112468976076e-05, "loss": 2.231016159057617, "memory(GiB)": 77.56, "step": 49195, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.439102 }, { "epoch": 2.1078788398097767, "grad_norm": 5.198278427124023, "learning_rate": 6.219459825013694e-05, "loss": 2.3910972595214846, "memory(GiB)": 77.56, "step": 49200, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.439099 }, { "epoch": 2.1080930551390256, "grad_norm": 6.3471574783325195, "learning_rate": 6.218807158959652e-05, "loss": 2.1711345672607423, "memory(GiB)": 77.56, "step": 49205, "token_acc": 0.5055762081784386, "train_speed(iter/s)": 1.439134 }, { "epoch": 2.108307270468275, "grad_norm": 4.638065338134766, "learning_rate": 6.218154470825775e-05, "loss": 2.402248764038086, "memory(GiB)": 77.56, "step": 49210, "token_acc": 0.4364820846905538, "train_speed(iter/s)": 1.439128 }, { "epoch": 2.1085214857975236, "grad_norm": 5.397317409515381, "learning_rate": 6.217501760623889e-05, "loss": 2.3646902084350585, "memory(GiB)": 77.56, "step": 49215, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.439085 }, { "epoch": 2.1087357011267724, "grad_norm": 5.058228492736816, "learning_rate": 6.216849028365815e-05, "loss": 2.475019836425781, "memory(GiB)": 77.56, "step": 49220, "token_acc": 0.49823321554770317, "train_speed(iter/s)": 1.439093 }, { "epoch": 2.1089499164560217, "grad_norm": 5.189786911010742, "learning_rate": 6.216196274063379e-05, "loss": 2.4395408630371094, "memory(GiB)": 77.56, "step": 49225, "token_acc": 0.4758364312267658, "train_speed(iter/s)": 1.439108 }, { "epoch": 2.1091641317852705, "grad_norm": 5.8689680099487305, "learning_rate": 6.215543497728407e-05, "loss": 2.4147130966186525, "memory(GiB)": 77.56, "step": 49230, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.439149 }, { "epoch": 2.1093783471145193, "grad_norm": 7.903217792510986, "learning_rate": 6.214890699372724e-05, "loss": 2.2708404541015623, "memory(GiB)": 77.56, "step": 49235, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.439149 }, { "epoch": 2.1095925624437686, "grad_norm": 4.991354465484619, "learning_rate": 6.214237879008157e-05, "loss": 2.434276580810547, "memory(GiB)": 77.56, "step": 49240, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.439168 }, { "epoch": 2.1098067777730174, "grad_norm": 6.885368824005127, "learning_rate": 6.213585036646531e-05, "loss": 2.2261051177978515, "memory(GiB)": 77.56, "step": 49245, "token_acc": 0.4984520123839009, "train_speed(iter/s)": 1.439165 }, { "epoch": 2.110020993102266, "grad_norm": 8.591289520263672, "learning_rate": 6.212932172299674e-05, "loss": 2.4186332702636717, "memory(GiB)": 77.56, "step": 49250, "token_acc": 0.49356223175965663, "train_speed(iter/s)": 1.439122 }, { "epoch": 2.1102352084315155, "grad_norm": 6.000884056091309, "learning_rate": 6.212279285979412e-05, "loss": 2.32342643737793, "memory(GiB)": 77.56, "step": 49255, "token_acc": 0.528052805280528, "train_speed(iter/s)": 1.439132 }, { "epoch": 2.1104494237607643, "grad_norm": 5.541501998901367, "learning_rate": 6.211626377697575e-05, "loss": 2.3411909103393556, "memory(GiB)": 77.56, "step": 49260, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.439131 }, { "epoch": 2.110663639090013, "grad_norm": 4.93800687789917, "learning_rate": 6.210973447465988e-05, "loss": 2.314703178405762, "memory(GiB)": 77.56, "step": 49265, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.439127 }, { "epoch": 2.1108778544192623, "grad_norm": 4.915526390075684, "learning_rate": 6.210320495296484e-05, "loss": 2.8657541275024414, "memory(GiB)": 77.56, "step": 49270, "token_acc": 0.40498442367601245, "train_speed(iter/s)": 1.439128 }, { "epoch": 2.111092069748511, "grad_norm": 5.895676136016846, "learning_rate": 6.209667521200886e-05, "loss": 2.4640504837036135, "memory(GiB)": 77.56, "step": 49275, "token_acc": 0.44363636363636366, "train_speed(iter/s)": 1.43912 }, { "epoch": 2.11130628507776, "grad_norm": 5.05529260635376, "learning_rate": 6.209014525191025e-05, "loss": 2.178257179260254, "memory(GiB)": 77.56, "step": 49280, "token_acc": 0.5193548387096775, "train_speed(iter/s)": 1.439096 }, { "epoch": 2.1115205004070092, "grad_norm": 6.255785942077637, "learning_rate": 6.208361507278735e-05, "loss": 2.2371715545654296, "memory(GiB)": 77.56, "step": 49285, "token_acc": 0.5508771929824562, "train_speed(iter/s)": 1.439044 }, { "epoch": 2.111734715736258, "grad_norm": 5.389954566955566, "learning_rate": 6.207708467475842e-05, "loss": 2.3669509887695312, "memory(GiB)": 77.56, "step": 49290, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.43905 }, { "epoch": 2.111948931065507, "grad_norm": 4.614659309387207, "learning_rate": 6.207055405794176e-05, "loss": 2.2462226867675783, "memory(GiB)": 77.56, "step": 49295, "token_acc": 0.48923076923076925, "train_speed(iter/s)": 1.439043 }, { "epoch": 2.112163146394756, "grad_norm": 4.930994987487793, "learning_rate": 6.20640232224557e-05, "loss": 2.4049846649169924, "memory(GiB)": 77.56, "step": 49300, "token_acc": 0.5084175084175084, "train_speed(iter/s)": 1.439023 }, { "epoch": 2.112377361724005, "grad_norm": 4.987551689147949, "learning_rate": 6.205749216841855e-05, "loss": 2.333563232421875, "memory(GiB)": 77.56, "step": 49305, "token_acc": 0.4911660777385159, "train_speed(iter/s)": 1.439001 }, { "epoch": 2.1125915770532537, "grad_norm": 5.033623695373535, "learning_rate": 6.20509608959486e-05, "loss": 2.192428398132324, "memory(GiB)": 77.56, "step": 49310, "token_acc": 0.537117903930131, "train_speed(iter/s)": 1.439017 }, { "epoch": 2.112805792382503, "grad_norm": 5.2437663078308105, "learning_rate": 6.20444294051642e-05, "loss": 2.475846862792969, "memory(GiB)": 77.56, "step": 49315, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.43904 }, { "epoch": 2.113020007711752, "grad_norm": 8.206925392150879, "learning_rate": 6.203789769618365e-05, "loss": 2.491926574707031, "memory(GiB)": 77.56, "step": 49320, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.439009 }, { "epoch": 2.1132342230410006, "grad_norm": 5.43293571472168, "learning_rate": 6.203136576912529e-05, "loss": 2.3664140701293945, "memory(GiB)": 77.56, "step": 49325, "token_acc": 0.5490196078431373, "train_speed(iter/s)": 1.439008 }, { "epoch": 2.11344843837025, "grad_norm": 5.0613908767700195, "learning_rate": 6.202483362410748e-05, "loss": 2.609480285644531, "memory(GiB)": 77.56, "step": 49330, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.438993 }, { "epoch": 2.1136626536994987, "grad_norm": 5.422504425048828, "learning_rate": 6.20183012612485e-05, "loss": 2.3954341888427733, "memory(GiB)": 77.56, "step": 49335, "token_acc": 0.4576923076923077, "train_speed(iter/s)": 1.438998 }, { "epoch": 2.1138768690287475, "grad_norm": 4.421570777893066, "learning_rate": 6.201176868066674e-05, "loss": 2.3289791107177735, "memory(GiB)": 77.56, "step": 49340, "token_acc": 0.486013986013986, "train_speed(iter/s)": 1.439025 }, { "epoch": 2.1140910843579968, "grad_norm": 5.543737888336182, "learning_rate": 6.20052358824805e-05, "loss": 2.5759489059448244, "memory(GiB)": 77.56, "step": 49345, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.439019 }, { "epoch": 2.1143052996872456, "grad_norm": 6.646541595458984, "learning_rate": 6.199870286680817e-05, "loss": 2.3740303039550783, "memory(GiB)": 77.56, "step": 49350, "token_acc": 0.4977168949771689, "train_speed(iter/s)": 1.439042 }, { "epoch": 2.1145195150164944, "grad_norm": 5.940126419067383, "learning_rate": 6.199216963376806e-05, "loss": 2.2782978057861327, "memory(GiB)": 77.56, "step": 49355, "token_acc": 0.4970588235294118, "train_speed(iter/s)": 1.439068 }, { "epoch": 2.1147337303457436, "grad_norm": 5.637457847595215, "learning_rate": 6.198563618347857e-05, "loss": 2.5789794921875, "memory(GiB)": 77.56, "step": 49360, "token_acc": 0.45819397993311034, "train_speed(iter/s)": 1.43909 }, { "epoch": 2.1149479456749924, "grad_norm": 6.14561653137207, "learning_rate": 6.197910251605803e-05, "loss": 2.1523054122924803, "memory(GiB)": 77.56, "step": 49365, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.439064 }, { "epoch": 2.1151621610042413, "grad_norm": 5.5074286460876465, "learning_rate": 6.19725686316248e-05, "loss": 2.470820426940918, "memory(GiB)": 77.56, "step": 49370, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.439053 }, { "epoch": 2.1153763763334905, "grad_norm": 6.865657329559326, "learning_rate": 6.196603453029728e-05, "loss": 2.365422821044922, "memory(GiB)": 77.56, "step": 49375, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.439026 }, { "epoch": 2.1155905916627393, "grad_norm": 6.0406599044799805, "learning_rate": 6.19595002121938e-05, "loss": 2.446185493469238, "memory(GiB)": 77.56, "step": 49380, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.439008 }, { "epoch": 2.115804806991988, "grad_norm": 5.8405680656433105, "learning_rate": 6.195296567743277e-05, "loss": 2.678752899169922, "memory(GiB)": 77.56, "step": 49385, "token_acc": 0.48221343873517786, "train_speed(iter/s)": 1.439013 }, { "epoch": 2.1160190223212374, "grad_norm": 6.350005626678467, "learning_rate": 6.194643092613254e-05, "loss": 2.4396173477172853, "memory(GiB)": 77.56, "step": 49390, "token_acc": 0.45051194539249145, "train_speed(iter/s)": 1.43904 }, { "epoch": 2.116233237650486, "grad_norm": 5.962313175201416, "learning_rate": 6.193989595841151e-05, "loss": 2.490930366516113, "memory(GiB)": 77.56, "step": 49395, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.438995 }, { "epoch": 2.116447452979735, "grad_norm": 4.057485580444336, "learning_rate": 6.193336077438807e-05, "loss": 2.2414833068847657, "memory(GiB)": 77.56, "step": 49400, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.439013 }, { "epoch": 2.1166616683089843, "grad_norm": 6.552286624908447, "learning_rate": 6.192682537418061e-05, "loss": 2.437358856201172, "memory(GiB)": 77.56, "step": 49405, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.438998 }, { "epoch": 2.116875883638233, "grad_norm": 6.112018585205078, "learning_rate": 6.19202897579075e-05, "loss": 2.4001922607421875, "memory(GiB)": 77.56, "step": 49410, "token_acc": 0.52734375, "train_speed(iter/s)": 1.439001 }, { "epoch": 2.117090098967482, "grad_norm": 7.433084011077881, "learning_rate": 6.191375392568718e-05, "loss": 2.3160715103149414, "memory(GiB)": 77.56, "step": 49415, "token_acc": 0.5037878787878788, "train_speed(iter/s)": 1.439015 }, { "epoch": 2.117304314296731, "grad_norm": 5.40130615234375, "learning_rate": 6.190721787763801e-05, "loss": 2.536113166809082, "memory(GiB)": 77.56, "step": 49420, "token_acc": 0.4709480122324159, "train_speed(iter/s)": 1.439017 }, { "epoch": 2.11751852962598, "grad_norm": 5.7429280281066895, "learning_rate": 6.190068161387844e-05, "loss": 2.4757179260253905, "memory(GiB)": 77.56, "step": 49425, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.439046 }, { "epoch": 2.117732744955229, "grad_norm": 4.870671272277832, "learning_rate": 6.189414513452685e-05, "loss": 2.4686864852905273, "memory(GiB)": 77.56, "step": 49430, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.439083 }, { "epoch": 2.117946960284478, "grad_norm": 5.639690399169922, "learning_rate": 6.188760843970166e-05, "loss": 2.4599655151367186, "memory(GiB)": 77.56, "step": 49435, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.439084 }, { "epoch": 2.118161175613727, "grad_norm": 4.582322120666504, "learning_rate": 6.188107152952129e-05, "loss": 2.2679817199707033, "memory(GiB)": 77.56, "step": 49440, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.439066 }, { "epoch": 2.1183753909429757, "grad_norm": 5.99381160736084, "learning_rate": 6.187453440410418e-05, "loss": 2.381431770324707, "memory(GiB)": 77.56, "step": 49445, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.439078 }, { "epoch": 2.118589606272225, "grad_norm": 3.7383217811584473, "learning_rate": 6.186799706356872e-05, "loss": 2.377843475341797, "memory(GiB)": 77.56, "step": 49450, "token_acc": 0.4755043227665706, "train_speed(iter/s)": 1.439052 }, { "epoch": 2.1188038216014737, "grad_norm": 4.4830098152160645, "learning_rate": 6.186145950803337e-05, "loss": 2.103716278076172, "memory(GiB)": 77.56, "step": 49455, "token_acc": 0.5266903914590747, "train_speed(iter/s)": 1.439036 }, { "epoch": 2.1190180369307225, "grad_norm": 6.509287357330322, "learning_rate": 6.185492173761655e-05, "loss": 2.693792533874512, "memory(GiB)": 77.56, "step": 49460, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.439072 }, { "epoch": 2.119232252259972, "grad_norm": 6.000534534454346, "learning_rate": 6.184838375243671e-05, "loss": 2.4342653274536135, "memory(GiB)": 77.56, "step": 49465, "token_acc": 0.4981549815498155, "train_speed(iter/s)": 1.439063 }, { "epoch": 2.1194464675892206, "grad_norm": 4.785678863525391, "learning_rate": 6.184184555261227e-05, "loss": 2.715536689758301, "memory(GiB)": 77.56, "step": 49470, "token_acc": 0.4413793103448276, "train_speed(iter/s)": 1.438981 }, { "epoch": 2.11966068291847, "grad_norm": 6.359776020050049, "learning_rate": 6.18353071382617e-05, "loss": 2.6815032958984375, "memory(GiB)": 77.56, "step": 49475, "token_acc": 0.40498442367601245, "train_speed(iter/s)": 1.438998 }, { "epoch": 2.1198748982477187, "grad_norm": 3.8615801334381104, "learning_rate": 6.182876850950344e-05, "loss": 2.593334197998047, "memory(GiB)": 77.56, "step": 49480, "token_acc": 0.4375, "train_speed(iter/s)": 1.438969 }, { "epoch": 2.1200891135769675, "grad_norm": 5.856441497802734, "learning_rate": 6.182222966645593e-05, "loss": 2.129928207397461, "memory(GiB)": 77.56, "step": 49485, "token_acc": 0.5451127819548872, "train_speed(iter/s)": 1.438958 }, { "epoch": 2.1203033289062168, "grad_norm": 4.4080424308776855, "learning_rate": 6.181569060923765e-05, "loss": 2.4936798095703123, "memory(GiB)": 77.56, "step": 49490, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 1.438949 }, { "epoch": 2.1205175442354656, "grad_norm": 5.6301164627075195, "learning_rate": 6.180915133796705e-05, "loss": 2.132694053649902, "memory(GiB)": 77.56, "step": 49495, "token_acc": 0.5445859872611465, "train_speed(iter/s)": 1.438956 }, { "epoch": 2.1207317595647144, "grad_norm": 6.104939937591553, "learning_rate": 6.180261185276259e-05, "loss": 2.311728286743164, "memory(GiB)": 77.56, "step": 49500, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.438969 }, { "epoch": 2.1207317595647144, "eval_loss": 2.243361473083496, "eval_runtime": 14.0098, "eval_samples_per_second": 7.138, "eval_steps_per_second": 7.138, "eval_token_acc": 0.4934876989869754, "step": 49500 }, { "epoch": 2.1209459748939636, "grad_norm": 6.549467086791992, "learning_rate": 6.179607215374274e-05, "loss": 2.3752391815185545, "memory(GiB)": 77.56, "step": 49505, "token_acc": 0.4994652406417112, "train_speed(iter/s)": 1.438322 }, { "epoch": 2.1211601902232124, "grad_norm": 5.327155590057373, "learning_rate": 6.178953224102599e-05, "loss": 2.3207958221435545, "memory(GiB)": 77.56, "step": 49510, "token_acc": 0.4623287671232877, "train_speed(iter/s)": 1.438342 }, { "epoch": 2.1213744055524613, "grad_norm": 6.941089153289795, "learning_rate": 6.178299211473081e-05, "loss": 2.448649215698242, "memory(GiB)": 77.56, "step": 49515, "token_acc": 0.5220588235294118, "train_speed(iter/s)": 1.438358 }, { "epoch": 2.1215886208817105, "grad_norm": 6.1966376304626465, "learning_rate": 6.177645177497566e-05, "loss": 2.5064371109008787, "memory(GiB)": 77.56, "step": 49520, "token_acc": 0.5153374233128835, "train_speed(iter/s)": 1.438369 }, { "epoch": 2.1218028362109593, "grad_norm": 6.111876010894775, "learning_rate": 6.176991122187904e-05, "loss": 2.3103778839111326, "memory(GiB)": 77.56, "step": 49525, "token_acc": 0.5398550724637681, "train_speed(iter/s)": 1.43832 }, { "epoch": 2.122017051540208, "grad_norm": 5.384545803070068, "learning_rate": 6.176337045555944e-05, "loss": 2.4785642623901367, "memory(GiB)": 77.56, "step": 49530, "token_acc": 0.5015384615384615, "train_speed(iter/s)": 1.438349 }, { "epoch": 2.1222312668694574, "grad_norm": 5.431389808654785, "learning_rate": 6.175682947613534e-05, "loss": 2.4664621353149414, "memory(GiB)": 77.56, "step": 49535, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.43837 }, { "epoch": 2.122445482198706, "grad_norm": 5.330932140350342, "learning_rate": 6.175028828372527e-05, "loss": 2.87606258392334, "memory(GiB)": 77.56, "step": 49540, "token_acc": 0.434640522875817, "train_speed(iter/s)": 1.438388 }, { "epoch": 2.122659697527955, "grad_norm": 7.038581371307373, "learning_rate": 6.174374687844769e-05, "loss": 2.4076940536499025, "memory(GiB)": 77.56, "step": 49545, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.438408 }, { "epoch": 2.1228739128572043, "grad_norm": 4.5212507247924805, "learning_rate": 6.173720526042112e-05, "loss": 2.0131912231445312, "memory(GiB)": 77.56, "step": 49550, "token_acc": 0.5261194029850746, "train_speed(iter/s)": 1.438396 }, { "epoch": 2.123088128186453, "grad_norm": 4.725074291229248, "learning_rate": 6.173066342976405e-05, "loss": 2.1782270431518556, "memory(GiB)": 77.56, "step": 49555, "token_acc": 0.5490909090909091, "train_speed(iter/s)": 1.438429 }, { "epoch": 2.123302343515702, "grad_norm": 8.704798698425293, "learning_rate": 6.172412138659504e-05, "loss": 2.2778594970703123, "memory(GiB)": 77.56, "step": 49560, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.438455 }, { "epoch": 2.123516558844951, "grad_norm": 6.381246089935303, "learning_rate": 6.171757913103255e-05, "loss": 2.361995887756348, "memory(GiB)": 77.56, "step": 49565, "token_acc": 0.4623287671232877, "train_speed(iter/s)": 1.438465 }, { "epoch": 2.1237307741742, "grad_norm": 5.788327217102051, "learning_rate": 6.171103666319514e-05, "loss": 2.3147878646850586, "memory(GiB)": 77.56, "step": 49570, "token_acc": 0.4808259587020649, "train_speed(iter/s)": 1.438503 }, { "epoch": 2.123944989503449, "grad_norm": 4.380969047546387, "learning_rate": 6.17044939832013e-05, "loss": 2.3820045471191404, "memory(GiB)": 77.56, "step": 49575, "token_acc": 0.5080906148867314, "train_speed(iter/s)": 1.438477 }, { "epoch": 2.124159204832698, "grad_norm": 5.585661888122559, "learning_rate": 6.169795109116957e-05, "loss": 2.5776718139648436, "memory(GiB)": 77.56, "step": 49580, "token_acc": 0.46923076923076923, "train_speed(iter/s)": 1.438464 }, { "epoch": 2.124373420161947, "grad_norm": 7.426137924194336, "learning_rate": 6.169140798721847e-05, "loss": 2.4600198745727537, "memory(GiB)": 77.56, "step": 49585, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.438492 }, { "epoch": 2.1245876354911957, "grad_norm": 4.669302940368652, "learning_rate": 6.168486467146658e-05, "loss": 2.5644407272338867, "memory(GiB)": 77.56, "step": 49590, "token_acc": 0.4520547945205479, "train_speed(iter/s)": 1.438441 }, { "epoch": 2.124801850820445, "grad_norm": 5.455381393432617, "learning_rate": 6.167832114403238e-05, "loss": 2.2322168350219727, "memory(GiB)": 77.56, "step": 49595, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.438465 }, { "epoch": 2.1250160661496937, "grad_norm": 5.240556240081787, "learning_rate": 6.167177740503444e-05, "loss": 2.333906364440918, "memory(GiB)": 77.56, "step": 49600, "token_acc": 0.519298245614035, "train_speed(iter/s)": 1.438493 }, { "epoch": 2.1252302814789426, "grad_norm": 4.892724514007568, "learning_rate": 6.166523345459132e-05, "loss": 2.451043701171875, "memory(GiB)": 77.56, "step": 49605, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.438495 }, { "epoch": 2.125444496808192, "grad_norm": 5.030633449554443, "learning_rate": 6.165868929282155e-05, "loss": 2.3205389022827148, "memory(GiB)": 77.56, "step": 49610, "token_acc": 0.4824561403508772, "train_speed(iter/s)": 1.438474 }, { "epoch": 2.1256587121374406, "grad_norm": 4.008750915527344, "learning_rate": 6.165214491984367e-05, "loss": 2.527475929260254, "memory(GiB)": 77.56, "step": 49615, "token_acc": 0.46756756756756757, "train_speed(iter/s)": 1.438483 }, { "epoch": 2.1258729274666894, "grad_norm": 4.7009687423706055, "learning_rate": 6.164560033577626e-05, "loss": 2.470628547668457, "memory(GiB)": 77.56, "step": 49620, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.438497 }, { "epoch": 2.1260871427959387, "grad_norm": 4.111307621002197, "learning_rate": 6.163905554073787e-05, "loss": 2.073130416870117, "memory(GiB)": 77.56, "step": 49625, "token_acc": 0.5418326693227091, "train_speed(iter/s)": 1.438493 }, { "epoch": 2.1263013581251875, "grad_norm": 6.109747886657715, "learning_rate": 6.16325105348471e-05, "loss": 2.5180421829223634, "memory(GiB)": 77.56, "step": 49630, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.438515 }, { "epoch": 2.1265155734544363, "grad_norm": 5.7932538986206055, "learning_rate": 6.162596531822247e-05, "loss": 2.5330270767211913, "memory(GiB)": 77.56, "step": 49635, "token_acc": 0.434375, "train_speed(iter/s)": 1.438521 }, { "epoch": 2.1267297887836856, "grad_norm": 5.666923999786377, "learning_rate": 6.161941989098256e-05, "loss": 2.4763904571533204, "memory(GiB)": 77.56, "step": 49640, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.438489 }, { "epoch": 2.1269440041129344, "grad_norm": 4.667091369628906, "learning_rate": 6.161287425324597e-05, "loss": 2.394595146179199, "memory(GiB)": 77.56, "step": 49645, "token_acc": 0.48589341692789967, "train_speed(iter/s)": 1.438525 }, { "epoch": 2.127158219442183, "grad_norm": 5.370915412902832, "learning_rate": 6.160632840513127e-05, "loss": 2.3501203536987303, "memory(GiB)": 77.56, "step": 49650, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.438525 }, { "epoch": 2.1273724347714325, "grad_norm": 5.324522018432617, "learning_rate": 6.159978234675704e-05, "loss": 2.243113136291504, "memory(GiB)": 77.56, "step": 49655, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438543 }, { "epoch": 2.1275866501006813, "grad_norm": 5.306806564331055, "learning_rate": 6.159323607824188e-05, "loss": 2.176255798339844, "memory(GiB)": 77.56, "step": 49660, "token_acc": 0.5282258064516129, "train_speed(iter/s)": 1.438496 }, { "epoch": 2.12780086542993, "grad_norm": 5.817471027374268, "learning_rate": 6.158668959970437e-05, "loss": 2.46315975189209, "memory(GiB)": 77.56, "step": 49665, "token_acc": 0.48348348348348347, "train_speed(iter/s)": 1.438487 }, { "epoch": 2.1280150807591793, "grad_norm": 4.693881511688232, "learning_rate": 6.158014291126311e-05, "loss": 2.2356786727905273, "memory(GiB)": 77.56, "step": 49670, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.438474 }, { "epoch": 2.128229296088428, "grad_norm": 5.200475215911865, "learning_rate": 6.15735960130367e-05, "loss": 2.420536422729492, "memory(GiB)": 77.56, "step": 49675, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.438492 }, { "epoch": 2.128443511417677, "grad_norm": 5.993760108947754, "learning_rate": 6.156704890514372e-05, "loss": 2.2368011474609375, "memory(GiB)": 77.56, "step": 49680, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.438526 }, { "epoch": 2.128657726746926, "grad_norm": 5.373075008392334, "learning_rate": 6.156050158770282e-05, "loss": 2.2701204299926756, "memory(GiB)": 77.56, "step": 49685, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.438545 }, { "epoch": 2.128871942076175, "grad_norm": 3.6746273040771484, "learning_rate": 6.155395406083257e-05, "loss": 2.7333545684814453, "memory(GiB)": 77.56, "step": 49690, "token_acc": 0.4293785310734463, "train_speed(iter/s)": 1.43851 }, { "epoch": 2.129086157405424, "grad_norm": 4.016968250274658, "learning_rate": 6.154740632465162e-05, "loss": 2.507294464111328, "memory(GiB)": 77.56, "step": 49695, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.438507 }, { "epoch": 2.129300372734673, "grad_norm": 6.886704444885254, "learning_rate": 6.154085837927857e-05, "loss": 2.233457565307617, "memory(GiB)": 77.56, "step": 49700, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.438516 }, { "epoch": 2.129514588063922, "grad_norm": 4.527353286743164, "learning_rate": 6.153431022483205e-05, "loss": 2.5117422103881837, "memory(GiB)": 77.56, "step": 49705, "token_acc": 0.5053763440860215, "train_speed(iter/s)": 1.438547 }, { "epoch": 2.1297288033931707, "grad_norm": 6.1918625831604, "learning_rate": 6.152776186143067e-05, "loss": 2.406984329223633, "memory(GiB)": 77.56, "step": 49710, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.438546 }, { "epoch": 2.12994301872242, "grad_norm": 4.611532688140869, "learning_rate": 6.152121328919307e-05, "loss": 2.220952606201172, "memory(GiB)": 77.56, "step": 49715, "token_acc": 0.5344129554655871, "train_speed(iter/s)": 1.438572 }, { "epoch": 2.130157234051669, "grad_norm": 4.31071138381958, "learning_rate": 6.15146645082379e-05, "loss": 2.1495359420776365, "memory(GiB)": 77.56, "step": 49720, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.438562 }, { "epoch": 2.1303714493809176, "grad_norm": 6.270028114318848, "learning_rate": 6.150811551868377e-05, "loss": 2.477192687988281, "memory(GiB)": 77.56, "step": 49725, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438544 }, { "epoch": 2.130585664710167, "grad_norm": 7.548161506652832, "learning_rate": 6.15015663206493e-05, "loss": 2.397829055786133, "memory(GiB)": 77.56, "step": 49730, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.438569 }, { "epoch": 2.1307998800394157, "grad_norm": 5.996013641357422, "learning_rate": 6.149501691425321e-05, "loss": 2.490370178222656, "memory(GiB)": 77.56, "step": 49735, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.438567 }, { "epoch": 2.1310140953686645, "grad_norm": 5.669908046722412, "learning_rate": 6.148846729961409e-05, "loss": 2.2922748565673827, "memory(GiB)": 77.56, "step": 49740, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.438574 }, { "epoch": 2.1312283106979137, "grad_norm": 5.151236534118652, "learning_rate": 6.148191747685061e-05, "loss": 2.2669656753540037, "memory(GiB)": 77.56, "step": 49745, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.438589 }, { "epoch": 2.1314425260271626, "grad_norm": 5.144192695617676, "learning_rate": 6.147536744608143e-05, "loss": 2.5211219787597656, "memory(GiB)": 77.56, "step": 49750, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.438557 }, { "epoch": 2.1316567413564114, "grad_norm": 4.636655807495117, "learning_rate": 6.146881720742519e-05, "loss": 2.3756664276123045, "memory(GiB)": 77.56, "step": 49755, "token_acc": 0.52, "train_speed(iter/s)": 1.43855 }, { "epoch": 2.1318709566856606, "grad_norm": 5.384429454803467, "learning_rate": 6.146226676100058e-05, "loss": 2.387874794006348, "memory(GiB)": 77.56, "step": 49760, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.438536 }, { "epoch": 2.1320851720149094, "grad_norm": 6.880685806274414, "learning_rate": 6.145571610692624e-05, "loss": 2.2560420989990235, "memory(GiB)": 77.56, "step": 49765, "token_acc": 0.5233333333333333, "train_speed(iter/s)": 1.438511 }, { "epoch": 2.1322993873441582, "grad_norm": 7.248161792755127, "learning_rate": 6.144916524532086e-05, "loss": 2.4489181518554686, "memory(GiB)": 77.56, "step": 49770, "token_acc": 0.47244094488188976, "train_speed(iter/s)": 1.438508 }, { "epoch": 2.1325136026734075, "grad_norm": 5.082353591918945, "learning_rate": 6.144261417630313e-05, "loss": 2.3674482345581054, "memory(GiB)": 77.56, "step": 49775, "token_acc": 0.512987012987013, "train_speed(iter/s)": 1.438538 }, { "epoch": 2.1327278180026563, "grad_norm": 5.5833892822265625, "learning_rate": 6.143606289999169e-05, "loss": 2.5191928863525392, "memory(GiB)": 77.56, "step": 49780, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.132942033331905, "grad_norm": 4.669332504272461, "learning_rate": 6.142951141650527e-05, "loss": 2.293395233154297, "memory(GiB)": 77.56, "step": 49785, "token_acc": 0.5202702702702703, "train_speed(iter/s)": 1.438561 }, { "epoch": 2.1331562486611544, "grad_norm": 5.875611782073975, "learning_rate": 6.14229597259625e-05, "loss": 2.6193126678466796, "memory(GiB)": 77.56, "step": 49790, "token_acc": 0.4772117962466488, "train_speed(iter/s)": 1.438588 }, { "epoch": 2.133370463990403, "grad_norm": 4.320123195648193, "learning_rate": 6.141640782848211e-05, "loss": 2.146860122680664, "memory(GiB)": 77.56, "step": 49795, "token_acc": 0.5634920634920635, "train_speed(iter/s)": 1.438616 }, { "epoch": 2.133584679319652, "grad_norm": 5.646020889282227, "learning_rate": 6.140985572418276e-05, "loss": 2.3647481918334963, "memory(GiB)": 77.56, "step": 49800, "token_acc": 0.4854014598540146, "train_speed(iter/s)": 1.438618 }, { "epoch": 2.1337988946489013, "grad_norm": 6.506009101867676, "learning_rate": 6.14033034131832e-05, "loss": 2.6079477310180663, "memory(GiB)": 77.56, "step": 49805, "token_acc": 0.4375, "train_speed(iter/s)": 1.438647 }, { "epoch": 2.13401310997815, "grad_norm": 5.5941853523254395, "learning_rate": 6.13967508956021e-05, "loss": 2.3892213821411135, "memory(GiB)": 77.56, "step": 49810, "token_acc": 0.4837758112094395, "train_speed(iter/s)": 1.438648 }, { "epoch": 2.134227325307399, "grad_norm": 4.989762306213379, "learning_rate": 6.139019817155815e-05, "loss": 2.0783016204833986, "memory(GiB)": 77.56, "step": 49815, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 1.438676 }, { "epoch": 2.134441540636648, "grad_norm": 6.979779243469238, "learning_rate": 6.13836452411701e-05, "loss": 2.5517240524291993, "memory(GiB)": 77.56, "step": 49820, "token_acc": 0.45724907063197023, "train_speed(iter/s)": 1.438704 }, { "epoch": 2.134655755965897, "grad_norm": 5.5464582443237305, "learning_rate": 6.137709210455661e-05, "loss": 2.4006580352783202, "memory(GiB)": 77.56, "step": 49825, "token_acc": 0.5124653739612188, "train_speed(iter/s)": 1.438699 }, { "epoch": 2.1348699712951458, "grad_norm": 6.28291130065918, "learning_rate": 6.137053876183644e-05, "loss": 2.1528804779052733, "memory(GiB)": 77.56, "step": 49830, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438698 }, { "epoch": 2.135084186624395, "grad_norm": 5.401803493499756, "learning_rate": 6.136398521312829e-05, "loss": 2.4813947677612305, "memory(GiB)": 77.56, "step": 49835, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.135298401953644, "grad_norm": 6.829617977142334, "learning_rate": 6.135743145855088e-05, "loss": 2.7040019989013673, "memory(GiB)": 77.56, "step": 49840, "token_acc": 0.46048109965635736, "train_speed(iter/s)": 1.438646 }, { "epoch": 2.1355126172828927, "grad_norm": 6.5611138343811035, "learning_rate": 6.135087749822296e-05, "loss": 2.496912384033203, "memory(GiB)": 77.56, "step": 49845, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.438662 }, { "epoch": 2.135726832612142, "grad_norm": 4.983579158782959, "learning_rate": 6.134432333226324e-05, "loss": 2.4228458404541016, "memory(GiB)": 77.56, "step": 49850, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.438684 }, { "epoch": 2.1359410479413907, "grad_norm": 7.406137943267822, "learning_rate": 6.133776896079045e-05, "loss": 2.5163949966430663, "memory(GiB)": 77.56, "step": 49855, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.438704 }, { "epoch": 2.1361552632706395, "grad_norm": 6.890848159790039, "learning_rate": 6.133121438392336e-05, "loss": 2.358189010620117, "memory(GiB)": 77.56, "step": 49860, "token_acc": 0.4790874524714829, "train_speed(iter/s)": 1.438727 }, { "epoch": 2.136369478599889, "grad_norm": 5.278731822967529, "learning_rate": 6.132465960178069e-05, "loss": 2.1852678298950194, "memory(GiB)": 77.56, "step": 49865, "token_acc": 0.5409252669039146, "train_speed(iter/s)": 1.438713 }, { "epoch": 2.1365836939291376, "grad_norm": 6.262353420257568, "learning_rate": 6.131810461448118e-05, "loss": 2.576774787902832, "memory(GiB)": 77.56, "step": 49870, "token_acc": 0.5036764705882353, "train_speed(iter/s)": 1.438726 }, { "epoch": 2.1367979092583864, "grad_norm": 6.995584011077881, "learning_rate": 6.131154942214356e-05, "loss": 2.5000476837158203, "memory(GiB)": 77.56, "step": 49875, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.438716 }, { "epoch": 2.1370121245876357, "grad_norm": 5.0994343757629395, "learning_rate": 6.130499402488665e-05, "loss": 2.340886116027832, "memory(GiB)": 77.56, "step": 49880, "token_acc": 0.519163763066202, "train_speed(iter/s)": 1.438685 }, { "epoch": 2.1372263399168845, "grad_norm": 3.9778294563293457, "learning_rate": 6.129843842282915e-05, "loss": 2.0544652938842773, "memory(GiB)": 77.56, "step": 49885, "token_acc": 0.5673469387755102, "train_speed(iter/s)": 1.438663 }, { "epoch": 2.1374405552461333, "grad_norm": 4.39173698425293, "learning_rate": 6.129188261608985e-05, "loss": 2.2172895431518556, "memory(GiB)": 77.56, "step": 49890, "token_acc": 0.5399239543726235, "train_speed(iter/s)": 1.438672 }, { "epoch": 2.1376547705753826, "grad_norm": 6.487553119659424, "learning_rate": 6.12853266047875e-05, "loss": 2.509355354309082, "memory(GiB)": 77.56, "step": 49895, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.438663 }, { "epoch": 2.1378689859046314, "grad_norm": 5.5855488777160645, "learning_rate": 6.127877038904087e-05, "loss": 2.5908069610595703, "memory(GiB)": 77.56, "step": 49900, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.438651 }, { "epoch": 2.13808320123388, "grad_norm": 6.047486305236816, "learning_rate": 6.127221396896876e-05, "loss": 2.8822341918945313, "memory(GiB)": 77.56, "step": 49905, "token_acc": 0.4623955431754875, "train_speed(iter/s)": 1.438647 }, { "epoch": 2.1382974165631294, "grad_norm": 5.487802028656006, "learning_rate": 6.126565734468987e-05, "loss": 2.446123504638672, "memory(GiB)": 77.56, "step": 49910, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.438655 }, { "epoch": 2.1385116318923783, "grad_norm": 5.983086585998535, "learning_rate": 6.125910051632305e-05, "loss": 2.622086524963379, "memory(GiB)": 77.56, "step": 49915, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.138725847221627, "grad_norm": 4.839722633361816, "learning_rate": 6.125254348398708e-05, "loss": 2.53487548828125, "memory(GiB)": 77.56, "step": 49920, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438665 }, { "epoch": 2.1389400625508763, "grad_norm": 5.114991188049316, "learning_rate": 6.124598624780071e-05, "loss": 2.6296823501586912, "memory(GiB)": 77.56, "step": 49925, "token_acc": 0.42382271468144045, "train_speed(iter/s)": 1.438671 }, { "epoch": 2.139154277880125, "grad_norm": 5.7839837074279785, "learning_rate": 6.123942880788276e-05, "loss": 2.483086585998535, "memory(GiB)": 77.56, "step": 49930, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.438701 }, { "epoch": 2.139368493209374, "grad_norm": 4.412889003753662, "learning_rate": 6.123287116435201e-05, "loss": 2.650364875793457, "memory(GiB)": 77.56, "step": 49935, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.438735 }, { "epoch": 2.139582708538623, "grad_norm": 6.163586616516113, "learning_rate": 6.122631331732726e-05, "loss": 2.525053024291992, "memory(GiB)": 77.56, "step": 49940, "token_acc": 0.4525316455696203, "train_speed(iter/s)": 1.438765 }, { "epoch": 2.139796923867872, "grad_norm": 6.026120662689209, "learning_rate": 6.121975526692731e-05, "loss": 2.1024471282958985, "memory(GiB)": 77.56, "step": 49945, "token_acc": 0.5707762557077626, "train_speed(iter/s)": 1.438785 }, { "epoch": 2.140011139197121, "grad_norm": 5.671998977661133, "learning_rate": 6.121319701327097e-05, "loss": 2.443772315979004, "memory(GiB)": 77.56, "step": 49950, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.438811 }, { "epoch": 2.14022535452637, "grad_norm": 5.431333065032959, "learning_rate": 6.120663855647706e-05, "loss": 2.2522510528564452, "memory(GiB)": 77.56, "step": 49955, "token_acc": 0.5036764705882353, "train_speed(iter/s)": 1.438827 }, { "epoch": 2.140439569855619, "grad_norm": 6.220781326293945, "learning_rate": 6.120007989666437e-05, "loss": 2.1643314361572266, "memory(GiB)": 77.56, "step": 49960, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.438809 }, { "epoch": 2.1406537851848677, "grad_norm": 6.45214319229126, "learning_rate": 6.119352103395172e-05, "loss": 2.625122833251953, "memory(GiB)": 77.56, "step": 49965, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.438782 }, { "epoch": 2.140868000514117, "grad_norm": 5.414968967437744, "learning_rate": 6.118696196845793e-05, "loss": 2.4727462768554687, "memory(GiB)": 77.56, "step": 49970, "token_acc": 0.4452296819787986, "train_speed(iter/s)": 1.438804 }, { "epoch": 2.1410822158433658, "grad_norm": 4.752712726593018, "learning_rate": 6.118040270030185e-05, "loss": 2.5828969955444334, "memory(GiB)": 77.56, "step": 49975, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.438824 }, { "epoch": 2.1412964311726146, "grad_norm": 4.766554355621338, "learning_rate": 6.117384322960228e-05, "loss": 2.353059005737305, "memory(GiB)": 77.56, "step": 49980, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.438827 }, { "epoch": 2.141510646501864, "grad_norm": 5.561649322509766, "learning_rate": 6.116728355647805e-05, "loss": 2.318983459472656, "memory(GiB)": 77.56, "step": 49985, "token_acc": 0.484, "train_speed(iter/s)": 1.438846 }, { "epoch": 2.1417248618311127, "grad_norm": 4.975546836853027, "learning_rate": 6.1160723681048e-05, "loss": 2.4256845474243165, "memory(GiB)": 77.56, "step": 49990, "token_acc": 0.4948805460750853, "train_speed(iter/s)": 1.438847 }, { "epoch": 2.1419390771603615, "grad_norm": 4.863134384155273, "learning_rate": 6.115416360343099e-05, "loss": 2.343097686767578, "memory(GiB)": 77.56, "step": 49995, "token_acc": 0.5350553505535055, "train_speed(iter/s)": 1.438854 }, { "epoch": 2.1421532924896107, "grad_norm": 5.984571933746338, "learning_rate": 6.114760332374582e-05, "loss": 2.2845407485961915, "memory(GiB)": 77.56, "step": 50000, "token_acc": 0.5310077519379846, "train_speed(iter/s)": 1.438858 }, { "epoch": 2.1421532924896107, "eval_loss": 2.2455437183380127, "eval_runtime": 15.7782, "eval_samples_per_second": 6.338, "eval_steps_per_second": 6.338, "eval_token_acc": 0.4928571428571429, "step": 50000 }, { "epoch": 2.1423675078188595, "grad_norm": 5.6391921043396, "learning_rate": 6.114104284211139e-05, "loss": 2.2938869476318358, "memory(GiB)": 77.56, "step": 50005, "token_acc": 0.49537512846865367, "train_speed(iter/s)": 1.438212 }, { "epoch": 2.1425817231481084, "grad_norm": 7.031339168548584, "learning_rate": 6.11344821586465e-05, "loss": 2.5754207611083983, "memory(GiB)": 77.56, "step": 50010, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.438225 }, { "epoch": 2.1427959384773576, "grad_norm": 5.62471866607666, "learning_rate": 6.112792127347001e-05, "loss": 2.482496452331543, "memory(GiB)": 77.56, "step": 50015, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.438206 }, { "epoch": 2.1430101538066064, "grad_norm": 4.942071914672852, "learning_rate": 6.112136018670079e-05, "loss": 2.132308578491211, "memory(GiB)": 77.56, "step": 50020, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 1.438202 }, { "epoch": 2.1432243691358552, "grad_norm": 4.971907138824463, "learning_rate": 6.111479889845772e-05, "loss": 2.2501907348632812, "memory(GiB)": 77.56, "step": 50025, "token_acc": 0.515748031496063, "train_speed(iter/s)": 1.438185 }, { "epoch": 2.1434385844651045, "grad_norm": 11.107024192810059, "learning_rate": 6.110823740885962e-05, "loss": 2.597299003601074, "memory(GiB)": 77.56, "step": 50030, "token_acc": 0.4984894259818731, "train_speed(iter/s)": 1.438138 }, { "epoch": 2.1436527997943533, "grad_norm": 5.977148532867432, "learning_rate": 6.110167571802538e-05, "loss": 2.653176116943359, "memory(GiB)": 77.56, "step": 50035, "token_acc": 0.44108761329305135, "train_speed(iter/s)": 1.43817 }, { "epoch": 2.143867015123602, "grad_norm": 6.594445705413818, "learning_rate": 6.109511382607388e-05, "loss": 2.425271415710449, "memory(GiB)": 77.56, "step": 50040, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.438165 }, { "epoch": 2.1440812304528514, "grad_norm": 5.630459308624268, "learning_rate": 6.108855173312397e-05, "loss": 2.4234024047851563, "memory(GiB)": 77.56, "step": 50045, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.438168 }, { "epoch": 2.1442954457821, "grad_norm": 5.08636474609375, "learning_rate": 6.108198943929457e-05, "loss": 2.39666633605957, "memory(GiB)": 77.56, "step": 50050, "token_acc": 0.5306859205776173, "train_speed(iter/s)": 1.438143 }, { "epoch": 2.144509661111349, "grad_norm": 4.75391149520874, "learning_rate": 6.107542694470452e-05, "loss": 2.2972488403320312, "memory(GiB)": 77.56, "step": 50055, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.438133 }, { "epoch": 2.1447238764405983, "grad_norm": 5.415741920471191, "learning_rate": 6.10688642494727e-05, "loss": 2.722870635986328, "memory(GiB)": 77.56, "step": 50060, "token_acc": 0.46458923512747874, "train_speed(iter/s)": 1.438167 }, { "epoch": 2.144938091769847, "grad_norm": 5.590670108795166, "learning_rate": 6.106230135371804e-05, "loss": 2.4435903549194338, "memory(GiB)": 77.56, "step": 50065, "token_acc": 0.4941860465116279, "train_speed(iter/s)": 1.438164 }, { "epoch": 2.145152307099096, "grad_norm": 5.37048864364624, "learning_rate": 6.105573825755942e-05, "loss": 2.6174755096435547, "memory(GiB)": 77.56, "step": 50070, "token_acc": 0.4641975308641975, "train_speed(iter/s)": 1.438178 }, { "epoch": 2.145366522428345, "grad_norm": 4.424235820770264, "learning_rate": 6.104917496111574e-05, "loss": 2.1999488830566407, "memory(GiB)": 77.56, "step": 50075, "token_acc": 0.5, "train_speed(iter/s)": 1.438176 }, { "epoch": 2.145580737757594, "grad_norm": 5.697660446166992, "learning_rate": 6.104261146450588e-05, "loss": 2.5041751861572266, "memory(GiB)": 77.56, "step": 50080, "token_acc": 0.4738372093023256, "train_speed(iter/s)": 1.438163 }, { "epoch": 2.1457949530868428, "grad_norm": 5.865833282470703, "learning_rate": 6.103604776784872e-05, "loss": 2.5066795349121094, "memory(GiB)": 77.56, "step": 50085, "token_acc": 0.46496815286624205, "train_speed(iter/s)": 1.438146 }, { "epoch": 2.146009168416092, "grad_norm": 5.480571269989014, "learning_rate": 6.102948387126325e-05, "loss": 2.1818000793457033, "memory(GiB)": 77.56, "step": 50090, "token_acc": 0.512280701754386, "train_speed(iter/s)": 1.438126 }, { "epoch": 2.146223383745341, "grad_norm": 6.711512565612793, "learning_rate": 6.10229197748683e-05, "loss": 2.623927688598633, "memory(GiB)": 77.56, "step": 50095, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.43817 }, { "epoch": 2.1464375990745896, "grad_norm": 4.391927719116211, "learning_rate": 6.101635547878285e-05, "loss": 2.3694786071777343, "memory(GiB)": 77.56, "step": 50100, "token_acc": 0.46112600536193027, "train_speed(iter/s)": 1.438212 }, { "epoch": 2.146651814403839, "grad_norm": 5.752460956573486, "learning_rate": 6.100979098312576e-05, "loss": 2.518646240234375, "memory(GiB)": 77.56, "step": 50105, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.438243 }, { "epoch": 2.1468660297330877, "grad_norm": 5.81132173538208, "learning_rate": 6.100322628801599e-05, "loss": 2.3162628173828126, "memory(GiB)": 77.56, "step": 50110, "token_acc": 0.5149253731343284, "train_speed(iter/s)": 1.43828 }, { "epoch": 2.1470802450623365, "grad_norm": 5.15714168548584, "learning_rate": 6.0996661393572454e-05, "loss": 2.4816158294677733, "memory(GiB)": 77.56, "step": 50115, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.438317 }, { "epoch": 2.147294460391586, "grad_norm": 9.179927825927734, "learning_rate": 6.099009629991408e-05, "loss": 2.149810791015625, "memory(GiB)": 77.56, "step": 50120, "token_acc": 0.5665236051502146, "train_speed(iter/s)": 1.438319 }, { "epoch": 2.1475086757208346, "grad_norm": 5.6330695152282715, "learning_rate": 6.098353100715981e-05, "loss": 2.1747529983520506, "memory(GiB)": 77.56, "step": 50125, "token_acc": 0.5053763440860215, "train_speed(iter/s)": 1.438327 }, { "epoch": 2.1477228910500834, "grad_norm": 4.714677333831787, "learning_rate": 6.0976965515428554e-05, "loss": 2.393399238586426, "memory(GiB)": 77.56, "step": 50130, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.438341 }, { "epoch": 2.1479371063793327, "grad_norm": 4.813422203063965, "learning_rate": 6.097039982483927e-05, "loss": 2.2043251037597655, "memory(GiB)": 77.56, "step": 50135, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.438363 }, { "epoch": 2.1481513217085815, "grad_norm": 5.016312122344971, "learning_rate": 6.0963833935510916e-05, "loss": 2.3490016937255858, "memory(GiB)": 77.56, "step": 50140, "token_acc": 0.49466192170818507, "train_speed(iter/s)": 1.438369 }, { "epoch": 2.1483655370378303, "grad_norm": 4.861772060394287, "learning_rate": 6.0957267847562414e-05, "loss": 2.557559776306152, "memory(GiB)": 77.56, "step": 50145, "token_acc": 0.4553846153846154, "train_speed(iter/s)": 1.43839 }, { "epoch": 2.1485797523670795, "grad_norm": 4.889413356781006, "learning_rate": 6.095070156111274e-05, "loss": 2.385935401916504, "memory(GiB)": 77.56, "step": 50150, "token_acc": 0.4421052631578947, "train_speed(iter/s)": 1.438378 }, { "epoch": 2.1487939676963284, "grad_norm": 5.498384475708008, "learning_rate": 6.094413507628084e-05, "loss": 2.437391662597656, "memory(GiB)": 77.56, "step": 50155, "token_acc": 0.4733893557422969, "train_speed(iter/s)": 1.438388 }, { "epoch": 2.149008183025577, "grad_norm": 5.706067085266113, "learning_rate": 6.093756839318565e-05, "loss": 2.2755405426025392, "memory(GiB)": 77.56, "step": 50160, "token_acc": 0.5234657039711191, "train_speed(iter/s)": 1.438411 }, { "epoch": 2.1492223983548264, "grad_norm": 6.289199352264404, "learning_rate": 6.093100151194615e-05, "loss": 2.3919721603393556, "memory(GiB)": 77.56, "step": 50165, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.438425 }, { "epoch": 2.1494366136840752, "grad_norm": 5.646265029907227, "learning_rate": 6.09244344326813e-05, "loss": 2.469646453857422, "memory(GiB)": 77.56, "step": 50170, "token_acc": 0.49390243902439024, "train_speed(iter/s)": 1.438412 }, { "epoch": 2.149650829013324, "grad_norm": 5.634893894195557, "learning_rate": 6.091786715551008e-05, "loss": 2.32348747253418, "memory(GiB)": 77.56, "step": 50175, "token_acc": 0.5019305019305019, "train_speed(iter/s)": 1.438388 }, { "epoch": 2.1498650443425733, "grad_norm": 6.0155463218688965, "learning_rate": 6.091129968055146e-05, "loss": 2.5108104705810548, "memory(GiB)": 77.56, "step": 50180, "token_acc": 0.5145631067961165, "train_speed(iter/s)": 1.438377 }, { "epoch": 2.150079259671822, "grad_norm": 5.3230109214782715, "learning_rate": 6.09047320079244e-05, "loss": 2.3352073669433593, "memory(GiB)": 77.56, "step": 50185, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.438349 }, { "epoch": 2.150293475001071, "grad_norm": 5.95028018951416, "learning_rate": 6.0898164137747893e-05, "loss": 2.077684783935547, "memory(GiB)": 77.56, "step": 50190, "token_acc": 0.5226480836236934, "train_speed(iter/s)": 1.438344 }, { "epoch": 2.15050769033032, "grad_norm": 7.436186790466309, "learning_rate": 6.089159607014092e-05, "loss": 2.4042360305786135, "memory(GiB)": 77.56, "step": 50195, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.438344 }, { "epoch": 2.150721905659569, "grad_norm": 4.912622451782227, "learning_rate": 6.0885027805222484e-05, "loss": 2.5977706909179688, "memory(GiB)": 77.56, "step": 50200, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.438324 }, { "epoch": 2.150936120988818, "grad_norm": 6.073189735412598, "learning_rate": 6.0878459343111517e-05, "loss": 2.28847599029541, "memory(GiB)": 77.56, "step": 50205, "token_acc": 0.4756554307116105, "train_speed(iter/s)": 1.438322 }, { "epoch": 2.151150336318067, "grad_norm": 6.330996990203857, "learning_rate": 6.087189068392709e-05, "loss": 2.3986082077026367, "memory(GiB)": 77.56, "step": 50210, "token_acc": 0.5089605734767025, "train_speed(iter/s)": 1.438328 }, { "epoch": 2.151364551647316, "grad_norm": 8.425272941589355, "learning_rate": 6.0865321827788154e-05, "loss": 2.206868553161621, "memory(GiB)": 77.56, "step": 50215, "token_acc": 0.5, "train_speed(iter/s)": 1.438324 }, { "epoch": 2.1515787669765647, "grad_norm": 5.978322505950928, "learning_rate": 6.085875277481372e-05, "loss": 2.1524681091308593, "memory(GiB)": 77.56, "step": 50220, "token_acc": 0.5152542372881356, "train_speed(iter/s)": 1.438322 }, { "epoch": 2.151792982305814, "grad_norm": 4.493298053741455, "learning_rate": 6.08521835251228e-05, "loss": 2.3736846923828123, "memory(GiB)": 77.56, "step": 50225, "token_acc": 0.5085910652920962, "train_speed(iter/s)": 1.438332 }, { "epoch": 2.1520071976350628, "grad_norm": 7.0440521240234375, "learning_rate": 6.084561407883438e-05, "loss": 2.971976089477539, "memory(GiB)": 77.56, "step": 50230, "token_acc": 0.39841688654353563, "train_speed(iter/s)": 1.438345 }, { "epoch": 2.1522214129643116, "grad_norm": 5.06019401550293, "learning_rate": 6.083904443606751e-05, "loss": 2.262111854553223, "memory(GiB)": 77.56, "step": 50235, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.438372 }, { "epoch": 2.152435628293561, "grad_norm": 3.6124343872070312, "learning_rate": 6.083247459694117e-05, "loss": 2.3987369537353516, "memory(GiB)": 77.56, "step": 50240, "token_acc": 0.4968152866242038, "train_speed(iter/s)": 1.438404 }, { "epoch": 2.1526498436228096, "grad_norm": 4.730318069458008, "learning_rate": 6.0825904561574374e-05, "loss": 2.4393558502197266, "memory(GiB)": 77.56, "step": 50245, "token_acc": 0.4954682779456193, "train_speed(iter/s)": 1.438423 }, { "epoch": 2.1528640589520585, "grad_norm": 7.20909309387207, "learning_rate": 6.081933433008617e-05, "loss": 2.157692718505859, "memory(GiB)": 77.56, "step": 50250, "token_acc": 0.5222551928783383, "train_speed(iter/s)": 1.438445 }, { "epoch": 2.1530782742813077, "grad_norm": 9.454322814941406, "learning_rate": 6.081276390259559e-05, "loss": 2.418796920776367, "memory(GiB)": 77.56, "step": 50255, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.43846 }, { "epoch": 2.1532924896105565, "grad_norm": 5.906976699829102, "learning_rate": 6.0806193279221634e-05, "loss": 2.2224166870117186, "memory(GiB)": 77.56, "step": 50260, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.43846 }, { "epoch": 2.1535067049398053, "grad_norm": 7.277247905731201, "learning_rate": 6.079962246008336e-05, "loss": 2.634247970581055, "memory(GiB)": 77.56, "step": 50265, "token_acc": 0.44983818770226536, "train_speed(iter/s)": 1.438491 }, { "epoch": 2.1537209202690546, "grad_norm": 5.129485130310059, "learning_rate": 6.07930514452998e-05, "loss": 2.341689682006836, "memory(GiB)": 77.56, "step": 50270, "token_acc": 0.5304054054054054, "train_speed(iter/s)": 1.438496 }, { "epoch": 2.1539351355983034, "grad_norm": 5.050314426422119, "learning_rate": 6.0786480234989976e-05, "loss": 2.390590476989746, "memory(GiB)": 77.56, "step": 50275, "token_acc": 0.44398340248962653, "train_speed(iter/s)": 1.438507 }, { "epoch": 2.154149350927552, "grad_norm": 5.15317440032959, "learning_rate": 6.0779908829272936e-05, "loss": 2.0559864044189453, "memory(GiB)": 77.56, "step": 50280, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.438518 }, { "epoch": 2.1543635662568015, "grad_norm": 5.07619047164917, "learning_rate": 6.077333722826775e-05, "loss": 2.4736354827880858, "memory(GiB)": 77.56, "step": 50285, "token_acc": 0.4921259842519685, "train_speed(iter/s)": 1.438502 }, { "epoch": 2.1545777815860503, "grad_norm": 7.170868396759033, "learning_rate": 6.076676543209344e-05, "loss": 2.4416608810424805, "memory(GiB)": 77.56, "step": 50290, "token_acc": 0.532051282051282, "train_speed(iter/s)": 1.438548 }, { "epoch": 2.154791996915299, "grad_norm": 6.5216851234436035, "learning_rate": 6.07601934408691e-05, "loss": 2.1830108642578123, "memory(GiB)": 77.56, "step": 50295, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.438531 }, { "epoch": 2.1550062122445484, "grad_norm": 5.386215686798096, "learning_rate": 6.075362125471374e-05, "loss": 2.333147430419922, "memory(GiB)": 77.56, "step": 50300, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.438535 }, { "epoch": 2.155220427573797, "grad_norm": 4.919002532958984, "learning_rate": 6.0747048873746446e-05, "loss": 2.3592729568481445, "memory(GiB)": 77.56, "step": 50305, "token_acc": 0.5267175572519084, "train_speed(iter/s)": 1.438569 }, { "epoch": 2.155434642903046, "grad_norm": 5.851461887359619, "learning_rate": 6.074047629808629e-05, "loss": 2.5147979736328123, "memory(GiB)": 77.56, "step": 50310, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.438575 }, { "epoch": 2.1556488582322952, "grad_norm": 5.775365352630615, "learning_rate": 6.073390352785232e-05, "loss": 2.216945838928223, "memory(GiB)": 77.56, "step": 50315, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.43859 }, { "epoch": 2.155863073561544, "grad_norm": 5.318002700805664, "learning_rate": 6.0727330563163624e-05, "loss": 2.3767499923706055, "memory(GiB)": 77.56, "step": 50320, "token_acc": 0.4492753623188406, "train_speed(iter/s)": 1.438621 }, { "epoch": 2.156077288890793, "grad_norm": 5.274514198303223, "learning_rate": 6.072075740413926e-05, "loss": 2.6518791198730467, "memory(GiB)": 77.56, "step": 50325, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.438629 }, { "epoch": 2.156291504220042, "grad_norm": 5.2085185050964355, "learning_rate": 6.071418405089834e-05, "loss": 2.455303955078125, "memory(GiB)": 77.56, "step": 50330, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.438639 }, { "epoch": 2.156505719549291, "grad_norm": 4.752186298370361, "learning_rate": 6.070761050355991e-05, "loss": 2.59373779296875, "memory(GiB)": 77.56, "step": 50335, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.438665 }, { "epoch": 2.1567199348785397, "grad_norm": 6.599400520324707, "learning_rate": 6.070103676224308e-05, "loss": 2.4299339294433593, "memory(GiB)": 77.56, "step": 50340, "token_acc": 0.459214501510574, "train_speed(iter/s)": 1.438631 }, { "epoch": 2.156934150207789, "grad_norm": 4.464208602905273, "learning_rate": 6.069446282706692e-05, "loss": 2.295475387573242, "memory(GiB)": 77.56, "step": 50345, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.43865 }, { "epoch": 2.157148365537038, "grad_norm": 6.727293014526367, "learning_rate": 6.068788869815054e-05, "loss": 2.5154487609863283, "memory(GiB)": 77.56, "step": 50350, "token_acc": 0.4900662251655629, "train_speed(iter/s)": 1.43868 }, { "epoch": 2.1573625808662866, "grad_norm": 6.149333477020264, "learning_rate": 6.068131437561303e-05, "loss": 2.3686569213867186, "memory(GiB)": 77.56, "step": 50355, "token_acc": 0.47244094488188976, "train_speed(iter/s)": 1.438691 }, { "epoch": 2.157576796195536, "grad_norm": 4.366853713989258, "learning_rate": 6.067473985957349e-05, "loss": 2.402776336669922, "memory(GiB)": 77.56, "step": 50360, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.43871 }, { "epoch": 2.1577910115247847, "grad_norm": 4.948853492736816, "learning_rate": 6.066816515015101e-05, "loss": 2.26220703125, "memory(GiB)": 77.56, "step": 50365, "token_acc": 0.46863468634686345, "train_speed(iter/s)": 1.43868 }, { "epoch": 2.1580052268540335, "grad_norm": 6.735204219818115, "learning_rate": 6.0661590247464736e-05, "loss": 2.371020698547363, "memory(GiB)": 77.56, "step": 50370, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.438633 }, { "epoch": 2.1582194421832828, "grad_norm": 5.736177444458008, "learning_rate": 6.065501515163374e-05, "loss": 2.5300086975097655, "memory(GiB)": 77.56, "step": 50375, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.438652 }, { "epoch": 2.1584336575125316, "grad_norm": 7.273033142089844, "learning_rate": 6.064843986277715e-05, "loss": 2.9673542022705077, "memory(GiB)": 77.56, "step": 50380, "token_acc": 0.4222972972972973, "train_speed(iter/s)": 1.438659 }, { "epoch": 2.1586478728417804, "grad_norm": 4.439969062805176, "learning_rate": 6.064186438101409e-05, "loss": 2.4679986953735353, "memory(GiB)": 77.56, "step": 50385, "token_acc": 0.46835443037974683, "train_speed(iter/s)": 1.438654 }, { "epoch": 2.1588620881710296, "grad_norm": 4.850871562957764, "learning_rate": 6.063528870646367e-05, "loss": 2.368244171142578, "memory(GiB)": 77.56, "step": 50390, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.438617 }, { "epoch": 2.1590763035002785, "grad_norm": 5.8203043937683105, "learning_rate": 6.0628712839245005e-05, "loss": 2.4497161865234376, "memory(GiB)": 77.56, "step": 50395, "token_acc": 0.5052264808362369, "train_speed(iter/s)": 1.438639 }, { "epoch": 2.1592905188295273, "grad_norm": 6.4210734367370605, "learning_rate": 6.0622136779477254e-05, "loss": 2.456562614440918, "memory(GiB)": 77.56, "step": 50400, "token_acc": 0.4953560371517028, "train_speed(iter/s)": 1.438671 }, { "epoch": 2.1595047341587765, "grad_norm": 4.872335910797119, "learning_rate": 6.0615560527279514e-05, "loss": 2.2009693145751954, "memory(GiB)": 77.56, "step": 50405, "token_acc": 0.534375, "train_speed(iter/s)": 1.438699 }, { "epoch": 2.1597189494880253, "grad_norm": 4.752109527587891, "learning_rate": 6.060898408277096e-05, "loss": 2.5521974563598633, "memory(GiB)": 77.56, "step": 50410, "token_acc": 0.47230320699708456, "train_speed(iter/s)": 1.43873 }, { "epoch": 2.159933164817274, "grad_norm": 4.424752712249756, "learning_rate": 6.06024074460707e-05, "loss": 2.467576026916504, "memory(GiB)": 77.56, "step": 50415, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 1.438714 }, { "epoch": 2.1601473801465234, "grad_norm": 4.672728538513184, "learning_rate": 6.059583061729787e-05, "loss": 2.4791168212890624, "memory(GiB)": 77.56, "step": 50420, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.438702 }, { "epoch": 2.1603615954757722, "grad_norm": 5.194528102874756, "learning_rate": 6.058925359657164e-05, "loss": 2.430134963989258, "memory(GiB)": 77.56, "step": 50425, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.438705 }, { "epoch": 2.160575810805021, "grad_norm": 4.794474124908447, "learning_rate": 6.058267638401114e-05, "loss": 2.731005859375, "memory(GiB)": 77.56, "step": 50430, "token_acc": 0.4522058823529412, "train_speed(iter/s)": 1.438734 }, { "epoch": 2.1607900261342703, "grad_norm": 5.209463119506836, "learning_rate": 6.057609897973552e-05, "loss": 2.6820161819458006, "memory(GiB)": 77.56, "step": 50435, "token_acc": 0.437984496124031, "train_speed(iter/s)": 1.438703 }, { "epoch": 2.161004241463519, "grad_norm": 4.501694679260254, "learning_rate": 6.056952138386397e-05, "loss": 2.462152099609375, "memory(GiB)": 77.56, "step": 50440, "token_acc": 0.4260355029585799, "train_speed(iter/s)": 1.438705 }, { "epoch": 2.161218456792768, "grad_norm": 5.322403430938721, "learning_rate": 6.056294359651562e-05, "loss": 2.1826005935668946, "memory(GiB)": 77.56, "step": 50445, "token_acc": 0.5755395683453237, "train_speed(iter/s)": 1.438692 }, { "epoch": 2.161432672122017, "grad_norm": 6.76984977722168, "learning_rate": 6.0556365617809615e-05, "loss": 2.304361343383789, "memory(GiB)": 77.56, "step": 50450, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 1.438684 }, { "epoch": 2.161646887451266, "grad_norm": 4.454043865203857, "learning_rate": 6.0549787447865166e-05, "loss": 2.467548370361328, "memory(GiB)": 77.56, "step": 50455, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.438677 }, { "epoch": 2.161861102780515, "grad_norm": 6.561191558837891, "learning_rate": 6.0543209086801434e-05, "loss": 2.564452362060547, "memory(GiB)": 77.56, "step": 50460, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.438681 }, { "epoch": 2.162075318109764, "grad_norm": 5.802851676940918, "learning_rate": 6.053663053473754e-05, "loss": 2.1944671630859376, "memory(GiB)": 77.56, "step": 50465, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.438719 }, { "epoch": 2.162289533439013, "grad_norm": 5.064087390899658, "learning_rate": 6.053005179179273e-05, "loss": 2.431778144836426, "memory(GiB)": 77.56, "step": 50470, "token_acc": 0.48, "train_speed(iter/s)": 1.438746 }, { "epoch": 2.1625037487682617, "grad_norm": 8.673851013183594, "learning_rate": 6.052347285808615e-05, "loss": 2.200387954711914, "memory(GiB)": 77.56, "step": 50475, "token_acc": 0.5322033898305085, "train_speed(iter/s)": 1.438779 }, { "epoch": 2.162717964097511, "grad_norm": 7.30638313293457, "learning_rate": 6.051689373373698e-05, "loss": 2.772796058654785, "memory(GiB)": 77.56, "step": 50480, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.438763 }, { "epoch": 2.1629321794267597, "grad_norm": 5.439511775970459, "learning_rate": 6.0510314418864413e-05, "loss": 2.2394184112548827, "memory(GiB)": 77.56, "step": 50485, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.438774 }, { "epoch": 2.1631463947560086, "grad_norm": 5.40219259262085, "learning_rate": 6.050373491358764e-05, "loss": 2.2772876739501955, "memory(GiB)": 77.56, "step": 50490, "token_acc": 0.5364238410596026, "train_speed(iter/s)": 1.438742 }, { "epoch": 2.163360610085258, "grad_norm": 5.540431976318359, "learning_rate": 6.049715521802587e-05, "loss": 2.6008733749389648, "memory(GiB)": 77.56, "step": 50495, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.438718 }, { "epoch": 2.1635748254145066, "grad_norm": 5.823878765106201, "learning_rate": 6.0490575332298274e-05, "loss": 2.362906646728516, "memory(GiB)": 77.56, "step": 50500, "token_acc": 0.48598130841121495, "train_speed(iter/s)": 1.438739 }, { "epoch": 2.1635748254145066, "eval_loss": 2.3349204063415527, "eval_runtime": 15.0511, "eval_samples_per_second": 6.644, "eval_steps_per_second": 6.644, "eval_token_acc": 0.4614427860696517, "step": 50500 }, { "epoch": 2.1637890407437554, "grad_norm": 5.63339376449585, "learning_rate": 6.048399525652406e-05, "loss": 2.63114013671875, "memory(GiB)": 77.56, "step": 50505, "token_acc": 0.46698564593301434, "train_speed(iter/s)": 1.438087 }, { "epoch": 2.1640032560730047, "grad_norm": 7.447271823883057, "learning_rate": 6.0477414990822444e-05, "loss": 2.4979364395141603, "memory(GiB)": 77.56, "step": 50510, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.43811 }, { "epoch": 2.1642174714022535, "grad_norm": 5.807193279266357, "learning_rate": 6.0470834535312636e-05, "loss": 2.468741226196289, "memory(GiB)": 77.56, "step": 50515, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.438122 }, { "epoch": 2.1644316867315023, "grad_norm": 5.4508867263793945, "learning_rate": 6.046425389011382e-05, "loss": 2.6488554000854494, "memory(GiB)": 77.56, "step": 50520, "token_acc": 0.4461538461538462, "train_speed(iter/s)": 1.438102 }, { "epoch": 2.1646459020607516, "grad_norm": 5.475977897644043, "learning_rate": 6.045767305534524e-05, "loss": 2.4003440856933596, "memory(GiB)": 77.56, "step": 50525, "token_acc": 0.48, "train_speed(iter/s)": 1.438108 }, { "epoch": 2.1648601173900004, "grad_norm": 9.473495483398438, "learning_rate": 6.045109203112611e-05, "loss": 2.3714120864868162, "memory(GiB)": 77.56, "step": 50530, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.438129 }, { "epoch": 2.165074332719249, "grad_norm": 5.357614994049072, "learning_rate": 6.044451081757563e-05, "loss": 2.2716957092285157, "memory(GiB)": 77.56, "step": 50535, "token_acc": 0.5347222222222222, "train_speed(iter/s)": 1.438134 }, { "epoch": 2.1652885480484985, "grad_norm": 8.74575138092041, "learning_rate": 6.043792941481303e-05, "loss": 2.482444190979004, "memory(GiB)": 77.56, "step": 50540, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.438118 }, { "epoch": 2.1655027633777473, "grad_norm": 5.383188247680664, "learning_rate": 6.0431347822957574e-05, "loss": 2.466129684448242, "memory(GiB)": 77.56, "step": 50545, "token_acc": 0.5, "train_speed(iter/s)": 1.438138 }, { "epoch": 2.165716978706996, "grad_norm": 5.982137203216553, "learning_rate": 6.042476604212844e-05, "loss": 2.315919876098633, "memory(GiB)": 77.56, "step": 50550, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.438158 }, { "epoch": 2.1659311940362453, "grad_norm": 5.913150787353516, "learning_rate": 6.041818407244492e-05, "loss": 2.41861515045166, "memory(GiB)": 77.56, "step": 50555, "token_acc": 0.4847457627118644, "train_speed(iter/s)": 1.438199 }, { "epoch": 2.166145409365494, "grad_norm": 4.576161861419678, "learning_rate": 6.0411601914026205e-05, "loss": 2.2976364135742187, "memory(GiB)": 77.56, "step": 50560, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.438211 }, { "epoch": 2.166359624694743, "grad_norm": 5.48606538772583, "learning_rate": 6.040501956699155e-05, "loss": 2.5144866943359374, "memory(GiB)": 77.56, "step": 50565, "token_acc": 0.4603658536585366, "train_speed(iter/s)": 1.438231 }, { "epoch": 2.1665738400239922, "grad_norm": 5.112666130065918, "learning_rate": 6.039843703146022e-05, "loss": 2.504762649536133, "memory(GiB)": 77.56, "step": 50570, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.438194 }, { "epoch": 2.166788055353241, "grad_norm": 5.325357913970947, "learning_rate": 6.039185430755143e-05, "loss": 2.5218271255493163, "memory(GiB)": 77.56, "step": 50575, "token_acc": 0.4489051094890511, "train_speed(iter/s)": 1.438199 }, { "epoch": 2.16700227068249, "grad_norm": 4.974330425262451, "learning_rate": 6.038527139538445e-05, "loss": 2.3442657470703123, "memory(GiB)": 77.56, "step": 50580, "token_acc": 0.5076452599388379, "train_speed(iter/s)": 1.438234 }, { "epoch": 2.167216486011739, "grad_norm": 5.249184608459473, "learning_rate": 6.0378688295078556e-05, "loss": 2.2773159027099608, "memory(GiB)": 77.56, "step": 50585, "token_acc": 0.5125, "train_speed(iter/s)": 1.43826 }, { "epoch": 2.167430701340988, "grad_norm": 5.24116849899292, "learning_rate": 6.037210500675298e-05, "loss": 2.8669221878051756, "memory(GiB)": 77.56, "step": 50590, "token_acc": 0.41494845360824745, "train_speed(iter/s)": 1.43826 }, { "epoch": 2.1676449166702367, "grad_norm": 5.492655277252197, "learning_rate": 6.036552153052698e-05, "loss": 2.393923759460449, "memory(GiB)": 77.56, "step": 50595, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.438275 }, { "epoch": 2.167859131999486, "grad_norm": 5.0687055587768555, "learning_rate": 6.035893786651985e-05, "loss": 2.402924156188965, "memory(GiB)": 77.56, "step": 50600, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.438293 }, { "epoch": 2.168073347328735, "grad_norm": 6.7476396560668945, "learning_rate": 6.035235401485084e-05, "loss": 2.2508779525756837, "memory(GiB)": 77.56, "step": 50605, "token_acc": 0.5320754716981132, "train_speed(iter/s)": 1.438331 }, { "epoch": 2.1682875626579836, "grad_norm": 5.155445575714111, "learning_rate": 6.034576997563921e-05, "loss": 2.6313072204589845, "memory(GiB)": 77.56, "step": 50610, "token_acc": 0.4564459930313589, "train_speed(iter/s)": 1.438352 }, { "epoch": 2.168501777987233, "grad_norm": 5.576014995574951, "learning_rate": 6.0339185749004265e-05, "loss": 2.3745452880859377, "memory(GiB)": 77.56, "step": 50615, "token_acc": 0.5109717868338558, "train_speed(iter/s)": 1.438366 }, { "epoch": 2.1687159933164817, "grad_norm": 5.3393049240112305, "learning_rate": 6.033260133506528e-05, "loss": 2.7261726379394533, "memory(GiB)": 77.56, "step": 50620, "token_acc": 0.438871473354232, "train_speed(iter/s)": 1.438346 }, { "epoch": 2.1689302086457305, "grad_norm": 4.502142429351807, "learning_rate": 6.03260167339415e-05, "loss": 2.405743408203125, "memory(GiB)": 77.56, "step": 50625, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438366 }, { "epoch": 2.1691444239749798, "grad_norm": 4.860012531280518, "learning_rate": 6.031943194575227e-05, "loss": 2.5214256286621093, "memory(GiB)": 77.56, "step": 50630, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 1.43839 }, { "epoch": 2.1693586393042286, "grad_norm": 7.530333042144775, "learning_rate": 6.031284697061683e-05, "loss": 2.290510559082031, "memory(GiB)": 77.56, "step": 50635, "token_acc": 0.4911242603550296, "train_speed(iter/s)": 1.438414 }, { "epoch": 2.1695728546334774, "grad_norm": 7.081682205200195, "learning_rate": 6.030626180865451e-05, "loss": 2.4867294311523436, "memory(GiB)": 77.56, "step": 50640, "token_acc": 0.4380664652567976, "train_speed(iter/s)": 1.438391 }, { "epoch": 2.1697870699627266, "grad_norm": 5.617022514343262, "learning_rate": 6.029967645998459e-05, "loss": 2.5478572845458984, "memory(GiB)": 77.56, "step": 50645, "token_acc": 0.5126050420168067, "train_speed(iter/s)": 1.438383 }, { "epoch": 2.1700012852919754, "grad_norm": 5.664225101470947, "learning_rate": 6.0293090924726346e-05, "loss": 2.430221748352051, "memory(GiB)": 77.56, "step": 50650, "token_acc": 0.5, "train_speed(iter/s)": 1.438393 }, { "epoch": 2.1702155006212243, "grad_norm": 7.483113765716553, "learning_rate": 6.028650520299912e-05, "loss": 2.2885259628295898, "memory(GiB)": 77.56, "step": 50655, "token_acc": 0.5451388888888888, "train_speed(iter/s)": 1.438394 }, { "epoch": 2.1704297159504735, "grad_norm": 6.605968475341797, "learning_rate": 6.0279919294922206e-05, "loss": 2.8503719329833985, "memory(GiB)": 77.56, "step": 50660, "token_acc": 0.4475806451612903, "train_speed(iter/s)": 1.438375 }, { "epoch": 2.1706439312797223, "grad_norm": 5.806797981262207, "learning_rate": 6.02733332006149e-05, "loss": 2.3515445709228517, "memory(GiB)": 77.56, "step": 50665, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438405 }, { "epoch": 2.170858146608971, "grad_norm": 6.882453918457031, "learning_rate": 6.026674692019654e-05, "loss": 2.3562469482421875, "memory(GiB)": 77.56, "step": 50670, "token_acc": 0.5056603773584906, "train_speed(iter/s)": 1.438394 }, { "epoch": 2.1710723619382204, "grad_norm": 5.481173515319824, "learning_rate": 6.0260160453786416e-05, "loss": 2.620244598388672, "memory(GiB)": 77.56, "step": 50675, "token_acc": 0.4779874213836478, "train_speed(iter/s)": 1.438356 }, { "epoch": 2.171286577267469, "grad_norm": 6.531098365783691, "learning_rate": 6.025357380150387e-05, "loss": 2.4211912155151367, "memory(GiB)": 77.56, "step": 50680, "token_acc": 0.5272108843537415, "train_speed(iter/s)": 1.438332 }, { "epoch": 2.171500792596718, "grad_norm": 6.274392604827881, "learning_rate": 6.02469869634682e-05, "loss": 2.415005111694336, "memory(GiB)": 77.56, "step": 50685, "token_acc": 0.49279538904899134, "train_speed(iter/s)": 1.438361 }, { "epoch": 2.1717150079259673, "grad_norm": 5.343069553375244, "learning_rate": 6.0240399939798766e-05, "loss": 2.479130744934082, "memory(GiB)": 77.56, "step": 50690, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.438364 }, { "epoch": 2.171929223255216, "grad_norm": 5.851415634155273, "learning_rate": 6.023381273061487e-05, "loss": 2.395462417602539, "memory(GiB)": 77.56, "step": 50695, "token_acc": 0.4901185770750988, "train_speed(iter/s)": 1.438332 }, { "epoch": 2.172143438584465, "grad_norm": 7.054051876068115, "learning_rate": 6.0227225336035866e-05, "loss": 2.254651641845703, "memory(GiB)": 77.56, "step": 50700, "token_acc": 0.5340136054421769, "train_speed(iter/s)": 1.438325 }, { "epoch": 2.172357653913714, "grad_norm": 5.3499345779418945, "learning_rate": 6.022063775618107e-05, "loss": 2.645551300048828, "memory(GiB)": 77.56, "step": 50705, "token_acc": 0.4368932038834951, "train_speed(iter/s)": 1.438301 }, { "epoch": 2.172571869242963, "grad_norm": 4.720377445220947, "learning_rate": 6.0214049991169844e-05, "loss": 2.3728002548217773, "memory(GiB)": 77.56, "step": 50710, "token_acc": 0.5029585798816568, "train_speed(iter/s)": 1.438307 }, { "epoch": 2.172786084572212, "grad_norm": 5.480470657348633, "learning_rate": 6.0207462041121524e-05, "loss": 2.592828369140625, "memory(GiB)": 77.56, "step": 50715, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.438292 }, { "epoch": 2.173000299901461, "grad_norm": 6.5919718742370605, "learning_rate": 6.0200873906155455e-05, "loss": 2.308619499206543, "memory(GiB)": 77.56, "step": 50720, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.438327 }, { "epoch": 2.17321451523071, "grad_norm": 5.305961608886719, "learning_rate": 6.0194285586390955e-05, "loss": 2.450691986083984, "memory(GiB)": 77.56, "step": 50725, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.438314 }, { "epoch": 2.1734287305599587, "grad_norm": 5.529691696166992, "learning_rate": 6.0187697081947434e-05, "loss": 2.3037506103515626, "memory(GiB)": 77.56, "step": 50730, "token_acc": 0.5287356321839081, "train_speed(iter/s)": 1.438332 }, { "epoch": 2.173642945889208, "grad_norm": 5.136481285095215, "learning_rate": 6.0181108392944216e-05, "loss": 2.4477033615112305, "memory(GiB)": 77.56, "step": 50735, "token_acc": 0.48502994011976047, "train_speed(iter/s)": 1.438358 }, { "epoch": 2.1738571612184567, "grad_norm": 6.894428253173828, "learning_rate": 6.017451951950067e-05, "loss": 2.6204248428344727, "memory(GiB)": 77.56, "step": 50740, "token_acc": 0.47580645161290325, "train_speed(iter/s)": 1.438369 }, { "epoch": 2.1740713765477055, "grad_norm": 6.091804504394531, "learning_rate": 6.0167930461736165e-05, "loss": 2.5189701080322267, "memory(GiB)": 77.56, "step": 50745, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.438399 }, { "epoch": 2.174285591876955, "grad_norm": 5.559927940368652, "learning_rate": 6.016134121977006e-05, "loss": 2.2092323303222656, "memory(GiB)": 77.56, "step": 50750, "token_acc": 0.5642023346303502, "train_speed(iter/s)": 1.43841 }, { "epoch": 2.1744998072062036, "grad_norm": 5.553183555603027, "learning_rate": 6.015475179372173e-05, "loss": 2.5787242889404296, "memory(GiB)": 77.56, "step": 50755, "token_acc": 0.45353159851301117, "train_speed(iter/s)": 1.438394 }, { "epoch": 2.1747140225354524, "grad_norm": 5.36630916595459, "learning_rate": 6.0148162183710534e-05, "loss": 2.3338794708251953, "memory(GiB)": 77.56, "step": 50760, "token_acc": 0.5173745173745173, "train_speed(iter/s)": 1.438408 }, { "epoch": 2.1749282378647017, "grad_norm": 5.367036819458008, "learning_rate": 6.014157238985587e-05, "loss": 2.266572380065918, "memory(GiB)": 77.56, "step": 50765, "token_acc": 0.5291970802919708, "train_speed(iter/s)": 1.43838 }, { "epoch": 2.1751424531939505, "grad_norm": 6.967119216918945, "learning_rate": 6.0134982412277095e-05, "loss": 2.2577402114868166, "memory(GiB)": 77.56, "step": 50770, "token_acc": 0.5068027210884354, "train_speed(iter/s)": 1.438372 }, { "epoch": 2.1753566685231993, "grad_norm": 6.080873966217041, "learning_rate": 6.0128392251093624e-05, "loss": 2.2987483978271483, "memory(GiB)": 77.56, "step": 50775, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.438372 }, { "epoch": 2.1755708838524486, "grad_norm": 8.971758842468262, "learning_rate": 6.012180190642481e-05, "loss": 2.63602237701416, "memory(GiB)": 77.56, "step": 50780, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.438379 }, { "epoch": 2.1757850991816974, "grad_norm": 6.279210567474365, "learning_rate": 6.011521137839007e-05, "loss": 2.659720230102539, "memory(GiB)": 77.56, "step": 50785, "token_acc": 0.4421768707482993, "train_speed(iter/s)": 1.438369 }, { "epoch": 2.175999314510946, "grad_norm": 6.57822322845459, "learning_rate": 6.0108620667108794e-05, "loss": 2.3965044021606445, "memory(GiB)": 77.56, "step": 50790, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.438377 }, { "epoch": 2.1762135298401954, "grad_norm": 5.5568647384643555, "learning_rate": 6.010202977270035e-05, "loss": 2.5202722549438477, "memory(GiB)": 77.56, "step": 50795, "token_acc": 0.4981684981684982, "train_speed(iter/s)": 1.438402 }, { "epoch": 2.1764277451694443, "grad_norm": 5.045445919036865, "learning_rate": 6.009543869528417e-05, "loss": 2.3829681396484377, "memory(GiB)": 77.56, "step": 50800, "token_acc": 0.5175718849840255, "train_speed(iter/s)": 1.43842 }, { "epoch": 2.176641960498693, "grad_norm": 4.594754219055176, "learning_rate": 6.008884743497966e-05, "loss": 2.480429267883301, "memory(GiB)": 77.56, "step": 50805, "token_acc": 0.45977011494252873, "train_speed(iter/s)": 1.438422 }, { "epoch": 2.1768561758279423, "grad_norm": 5.978048324584961, "learning_rate": 6.00822559919062e-05, "loss": 2.185072135925293, "memory(GiB)": 77.56, "step": 50810, "token_acc": 0.5245283018867924, "train_speed(iter/s)": 1.438439 }, { "epoch": 2.177070391157191, "grad_norm": 6.25172758102417, "learning_rate": 6.007566436618321e-05, "loss": 2.718180465698242, "memory(GiB)": 77.56, "step": 50815, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.438403 }, { "epoch": 2.17728460648644, "grad_norm": 6.709914684295654, "learning_rate": 6.006907255793013e-05, "loss": 2.6451622009277345, "memory(GiB)": 77.56, "step": 50820, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.438417 }, { "epoch": 2.177498821815689, "grad_norm": 4.794750690460205, "learning_rate": 6.006248056726634e-05, "loss": 2.0978515625, "memory(GiB)": 77.56, "step": 50825, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.438434 }, { "epoch": 2.177713037144938, "grad_norm": 5.17275333404541, "learning_rate": 6.005588839431129e-05, "loss": 2.4510494232177735, "memory(GiB)": 77.56, "step": 50830, "token_acc": 0.465625, "train_speed(iter/s)": 1.438478 }, { "epoch": 2.177927252474187, "grad_norm": 5.74208927154541, "learning_rate": 6.0049296039184364e-05, "loss": 2.42938346862793, "memory(GiB)": 77.56, "step": 50835, "token_acc": 0.5, "train_speed(iter/s)": 1.438458 }, { "epoch": 2.178141467803436, "grad_norm": 5.702775955200195, "learning_rate": 6.0042703502005015e-05, "loss": 2.3877586364746093, "memory(GiB)": 77.56, "step": 50840, "token_acc": 0.48742138364779874, "train_speed(iter/s)": 1.438488 }, { "epoch": 2.178355683132685, "grad_norm": 6.729453086853027, "learning_rate": 6.00361107828927e-05, "loss": 2.3723974227905273, "memory(GiB)": 77.56, "step": 50845, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.438503 }, { "epoch": 2.1785698984619337, "grad_norm": 6.154310703277588, "learning_rate": 6.00295178819668e-05, "loss": 2.5813852310180665, "memory(GiB)": 77.56, "step": 50850, "token_acc": 0.48, "train_speed(iter/s)": 1.438527 }, { "epoch": 2.178784113791183, "grad_norm": 7.639402389526367, "learning_rate": 6.002292479934678e-05, "loss": 2.7543163299560547, "memory(GiB)": 77.56, "step": 50855, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 1.438522 }, { "epoch": 2.178998329120432, "grad_norm": 5.869032859802246, "learning_rate": 6.0016331535152084e-05, "loss": 2.389212226867676, "memory(GiB)": 77.56, "step": 50860, "token_acc": 0.44573643410852715, "train_speed(iter/s)": 1.438556 }, { "epoch": 2.1792125444496806, "grad_norm": 4.972807884216309, "learning_rate": 6.000973808950214e-05, "loss": 2.421988105773926, "memory(GiB)": 77.56, "step": 50865, "token_acc": 0.525679758308157, "train_speed(iter/s)": 1.43857 }, { "epoch": 2.17942675977893, "grad_norm": 6.6491522789001465, "learning_rate": 6.000314446251638e-05, "loss": 2.5481613159179686, "memory(GiB)": 77.56, "step": 50870, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.438578 }, { "epoch": 2.1796409751081787, "grad_norm": 5.59084939956665, "learning_rate": 5.99965506543143e-05, "loss": 2.310906410217285, "memory(GiB)": 77.56, "step": 50875, "token_acc": 0.49508196721311476, "train_speed(iter/s)": 1.438587 }, { "epoch": 2.1798551904374275, "grad_norm": 6.800754547119141, "learning_rate": 5.9989956665015324e-05, "loss": 2.799560546875, "memory(GiB)": 77.56, "step": 50880, "token_acc": 0.4158415841584158, "train_speed(iter/s)": 1.43857 }, { "epoch": 2.1800694057666767, "grad_norm": 4.771638870239258, "learning_rate": 5.99833624947389e-05, "loss": 2.548948287963867, "memory(GiB)": 77.56, "step": 50885, "token_acc": 0.5095057034220533, "train_speed(iter/s)": 1.438593 }, { "epoch": 2.1802836210959256, "grad_norm": 6.158051490783691, "learning_rate": 5.997676814360451e-05, "loss": 2.431191825866699, "memory(GiB)": 77.56, "step": 50890, "token_acc": 0.46886446886446886, "train_speed(iter/s)": 1.438612 }, { "epoch": 2.1804978364251744, "grad_norm": 6.7467803955078125, "learning_rate": 5.9970173611731616e-05, "loss": 2.714176559448242, "memory(GiB)": 77.56, "step": 50895, "token_acc": 0.43109540636042404, "train_speed(iter/s)": 1.43861 }, { "epoch": 2.1807120517544236, "grad_norm": 5.072213172912598, "learning_rate": 5.996357889923965e-05, "loss": 2.329721450805664, "memory(GiB)": 77.56, "step": 50900, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.438567 }, { "epoch": 2.1809262670836724, "grad_norm": 5.626143932342529, "learning_rate": 5.995698400624813e-05, "loss": 2.12532901763916, "memory(GiB)": 77.56, "step": 50905, "token_acc": 0.5059760956175299, "train_speed(iter/s)": 1.438556 }, { "epoch": 2.1811404824129212, "grad_norm": 5.469083786010742, "learning_rate": 5.995038893287648e-05, "loss": 2.5412452697753904, "memory(GiB)": 77.56, "step": 50910, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.438609 }, { "epoch": 2.1813546977421705, "grad_norm": 5.214121341705322, "learning_rate": 5.994379367924421e-05, "loss": 2.3003984451293946, "memory(GiB)": 77.56, "step": 50915, "token_acc": 0.532, "train_speed(iter/s)": 1.438636 }, { "epoch": 2.1815689130714193, "grad_norm": 5.522370338439941, "learning_rate": 5.993719824547079e-05, "loss": 2.4568899154663084, "memory(GiB)": 77.56, "step": 50920, "token_acc": 0.4483870967741935, "train_speed(iter/s)": 1.438631 }, { "epoch": 2.181783128400668, "grad_norm": 6.3648858070373535, "learning_rate": 5.9930602631675705e-05, "loss": 2.533412551879883, "memory(GiB)": 77.56, "step": 50925, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.438619 }, { "epoch": 2.1819973437299174, "grad_norm": 6.041980266571045, "learning_rate": 5.992400683797843e-05, "loss": 2.4639968872070312, "memory(GiB)": 77.56, "step": 50930, "token_acc": 0.5177304964539007, "train_speed(iter/s)": 1.438611 }, { "epoch": 2.182211559059166, "grad_norm": 6.086566925048828, "learning_rate": 5.991741086449848e-05, "loss": 2.2928543090820312, "memory(GiB)": 77.56, "step": 50935, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.438578 }, { "epoch": 2.182425774388415, "grad_norm": 5.191893577575684, "learning_rate": 5.991081471135531e-05, "loss": 2.32578182220459, "memory(GiB)": 77.56, "step": 50940, "token_acc": 0.4959349593495935, "train_speed(iter/s)": 1.438599 }, { "epoch": 2.1826399897176643, "grad_norm": 5.302101135253906, "learning_rate": 5.990421837866843e-05, "loss": 2.479727935791016, "memory(GiB)": 77.56, "step": 50945, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.438617 }, { "epoch": 2.182854205046913, "grad_norm": 5.075984954833984, "learning_rate": 5.989762186655736e-05, "loss": 2.6966833114624023, "memory(GiB)": 77.56, "step": 50950, "token_acc": 0.44931506849315067, "train_speed(iter/s)": 1.438643 }, { "epoch": 2.183068420376162, "grad_norm": 7.5072340965271, "learning_rate": 5.989102517514158e-05, "loss": 2.3890151977539062, "memory(GiB)": 77.56, "step": 50955, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.438639 }, { "epoch": 2.183282635705411, "grad_norm": 4.678971767425537, "learning_rate": 5.9884428304540595e-05, "loss": 2.442661094665527, "memory(GiB)": 77.56, "step": 50960, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.438662 }, { "epoch": 2.18349685103466, "grad_norm": 5.9851460456848145, "learning_rate": 5.987783125487394e-05, "loss": 2.358523368835449, "memory(GiB)": 77.56, "step": 50965, "token_acc": 0.5109170305676856, "train_speed(iter/s)": 1.438674 }, { "epoch": 2.1837110663639088, "grad_norm": 4.827874660491943, "learning_rate": 5.987123402626108e-05, "loss": 2.4888254165649415, "memory(GiB)": 77.56, "step": 50970, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.438703 }, { "epoch": 2.183925281693158, "grad_norm": 5.580270767211914, "learning_rate": 5.986463661882157e-05, "loss": 2.5833593368530274, "memory(GiB)": 77.56, "step": 50975, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.438712 }, { "epoch": 2.184139497022407, "grad_norm": 6.3668718338012695, "learning_rate": 5.985803903267491e-05, "loss": 2.554352569580078, "memory(GiB)": 77.56, "step": 50980, "token_acc": 0.45, "train_speed(iter/s)": 1.438718 }, { "epoch": 2.1843537123516557, "grad_norm": 6.56332540512085, "learning_rate": 5.985144126794061e-05, "loss": 2.382846641540527, "memory(GiB)": 77.56, "step": 50985, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.438677 }, { "epoch": 2.184567927680905, "grad_norm": 6.39066743850708, "learning_rate": 5.984484332473823e-05, "loss": 2.6113391876220704, "memory(GiB)": 77.56, "step": 50990, "token_acc": 0.4664179104477612, "train_speed(iter/s)": 1.438653 }, { "epoch": 2.1847821430101537, "grad_norm": 5.2281413078308105, "learning_rate": 5.983824520318728e-05, "loss": 2.4021209716796874, "memory(GiB)": 77.56, "step": 50995, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.438669 }, { "epoch": 2.1849963583394025, "grad_norm": 4.64334774017334, "learning_rate": 5.983164690340727e-05, "loss": 2.5635854721069338, "memory(GiB)": 77.56, "step": 51000, "token_acc": 0.44984802431610943, "train_speed(iter/s)": 1.438714 }, { "epoch": 2.1849963583394025, "eval_loss": 2.136322021484375, "eval_runtime": 14.4488, "eval_samples_per_second": 6.921, "eval_steps_per_second": 6.921, "eval_token_acc": 0.4740853658536585, "step": 51000 }, { "epoch": 2.185210573668652, "grad_norm": 5.247880935668945, "learning_rate": 5.982504842551777e-05, "loss": 2.575246810913086, "memory(GiB)": 77.56, "step": 51005, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.438101 }, { "epoch": 2.1854247889979006, "grad_norm": 5.502328395843506, "learning_rate": 5.981844976963831e-05, "loss": 2.43762149810791, "memory(GiB)": 77.56, "step": 51010, "token_acc": 0.4668769716088328, "train_speed(iter/s)": 1.43808 }, { "epoch": 2.1856390043271494, "grad_norm": 5.834925651550293, "learning_rate": 5.981185093588839e-05, "loss": 2.573842239379883, "memory(GiB)": 77.56, "step": 51015, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.438091 }, { "epoch": 2.1858532196563987, "grad_norm": 5.116877555847168, "learning_rate": 5.980525192438761e-05, "loss": 2.3351455688476563, "memory(GiB)": 77.56, "step": 51020, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.438095 }, { "epoch": 2.1860674349856475, "grad_norm": 4.3217902183532715, "learning_rate": 5.979865273525549e-05, "loss": 2.5266494750976562, "memory(GiB)": 77.56, "step": 51025, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.438082 }, { "epoch": 2.1862816503148967, "grad_norm": 5.159086227416992, "learning_rate": 5.9792053368611565e-05, "loss": 2.587189483642578, "memory(GiB)": 77.56, "step": 51030, "token_acc": 0.498567335243553, "train_speed(iter/s)": 1.438114 }, { "epoch": 2.1864958656441456, "grad_norm": 6.224012851715088, "learning_rate": 5.978545382457543e-05, "loss": 2.5629861831665037, "memory(GiB)": 77.56, "step": 51035, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.43814 }, { "epoch": 2.1867100809733944, "grad_norm": 6.302873611450195, "learning_rate": 5.977885410326661e-05, "loss": 2.541942596435547, "memory(GiB)": 77.56, "step": 51040, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.438169 }, { "epoch": 2.1869242963026436, "grad_norm": 8.17940616607666, "learning_rate": 5.977225420480468e-05, "loss": 2.209524726867676, "memory(GiB)": 77.56, "step": 51045, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.438195 }, { "epoch": 2.1871385116318924, "grad_norm": 5.27204704284668, "learning_rate": 5.97656541293092e-05, "loss": 2.1242889404296874, "memory(GiB)": 77.56, "step": 51050, "token_acc": 0.5285171102661597, "train_speed(iter/s)": 1.438221 }, { "epoch": 2.1873527269611412, "grad_norm": 6.283603668212891, "learning_rate": 5.975905387689973e-05, "loss": 2.445973777770996, "memory(GiB)": 77.56, "step": 51055, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.438217 }, { "epoch": 2.1875669422903905, "grad_norm": 4.831215858459473, "learning_rate": 5.9752453447695834e-05, "loss": 2.3898681640625, "memory(GiB)": 77.56, "step": 51060, "token_acc": 0.5117845117845118, "train_speed(iter/s)": 1.438232 }, { "epoch": 2.1877811576196393, "grad_norm": 5.528214931488037, "learning_rate": 5.974585284181712e-05, "loss": 2.543265533447266, "memory(GiB)": 77.56, "step": 51065, "token_acc": 0.43613707165109034, "train_speed(iter/s)": 1.438243 }, { "epoch": 2.187995372948888, "grad_norm": 4.38115930557251, "learning_rate": 5.973925205938311e-05, "loss": 2.4410064697265623, "memory(GiB)": 77.56, "step": 51070, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.438262 }, { "epoch": 2.1882095882781374, "grad_norm": 5.574254989624023, "learning_rate": 5.973265110051344e-05, "loss": 2.2609447479248046, "memory(GiB)": 77.56, "step": 51075, "token_acc": 0.5346820809248555, "train_speed(iter/s)": 1.438231 }, { "epoch": 2.188423803607386, "grad_norm": 5.398826599121094, "learning_rate": 5.9726049965327656e-05, "loss": 2.464238929748535, "memory(GiB)": 77.56, "step": 51080, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.43827 }, { "epoch": 2.188638018936635, "grad_norm": 5.23271369934082, "learning_rate": 5.9719448653945344e-05, "loss": 2.221196746826172, "memory(GiB)": 77.56, "step": 51085, "token_acc": 0.5652173913043478, "train_speed(iter/s)": 1.438283 }, { "epoch": 2.1888522342658843, "grad_norm": 5.4251909255981445, "learning_rate": 5.9712847166486105e-05, "loss": 2.4523088455200197, "memory(GiB)": 77.56, "step": 51090, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.438289 }, { "epoch": 2.189066449595133, "grad_norm": 4.862835884094238, "learning_rate": 5.9706245503069534e-05, "loss": 2.3679885864257812, "memory(GiB)": 77.56, "step": 51095, "token_acc": 0.5170068027210885, "train_speed(iter/s)": 1.438284 }, { "epoch": 2.189280664924382, "grad_norm": 6.4454121589660645, "learning_rate": 5.9699643663815205e-05, "loss": 2.5079633712768556, "memory(GiB)": 77.56, "step": 51100, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.438266 }, { "epoch": 2.189494880253631, "grad_norm": 5.596668720245361, "learning_rate": 5.969304164884275e-05, "loss": 2.5075233459472654, "memory(GiB)": 77.56, "step": 51105, "token_acc": 0.45182724252491696, "train_speed(iter/s)": 1.438302 }, { "epoch": 2.18970909558288, "grad_norm": 5.3330230712890625, "learning_rate": 5.968643945827176e-05, "loss": 2.5485298156738283, "memory(GiB)": 77.56, "step": 51110, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.438302 }, { "epoch": 2.1899233109121288, "grad_norm": 4.852079391479492, "learning_rate": 5.9679837092221815e-05, "loss": 2.387131690979004, "memory(GiB)": 77.56, "step": 51115, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.438276 }, { "epoch": 2.190137526241378, "grad_norm": 4.6014580726623535, "learning_rate": 5.967323455081255e-05, "loss": 2.265755271911621, "memory(GiB)": 77.56, "step": 51120, "token_acc": 0.5040650406504065, "train_speed(iter/s)": 1.438286 }, { "epoch": 2.190351741570627, "grad_norm": 6.057758808135986, "learning_rate": 5.966663183416357e-05, "loss": 2.3789167404174805, "memory(GiB)": 77.56, "step": 51125, "token_acc": 0.5, "train_speed(iter/s)": 1.438306 }, { "epoch": 2.1905659568998757, "grad_norm": 6.111120700836182, "learning_rate": 5.966002894239446e-05, "loss": 2.5509174346923826, "memory(GiB)": 77.56, "step": 51130, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.438354 }, { "epoch": 2.190780172229125, "grad_norm": 5.215373516082764, "learning_rate": 5.965342587562489e-05, "loss": 2.3354923248291017, "memory(GiB)": 77.56, "step": 51135, "token_acc": 0.4956772334293948, "train_speed(iter/s)": 1.438385 }, { "epoch": 2.1909943875583737, "grad_norm": 5.302890777587891, "learning_rate": 5.9646822633974454e-05, "loss": 2.4579946517944338, "memory(GiB)": 77.56, "step": 51140, "token_acc": 0.5, "train_speed(iter/s)": 1.438338 }, { "epoch": 2.1912086028876225, "grad_norm": 7.1614556312561035, "learning_rate": 5.964021921756277e-05, "loss": 2.5008249282836914, "memory(GiB)": 77.56, "step": 51145, "token_acc": 0.5, "train_speed(iter/s)": 1.438354 }, { "epoch": 2.191422818216872, "grad_norm": 5.565342903137207, "learning_rate": 5.963361562650946e-05, "loss": 2.4689800262451174, "memory(GiB)": 77.56, "step": 51150, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 1.438352 }, { "epoch": 2.1916370335461206, "grad_norm": 4.475517272949219, "learning_rate": 5.962701186093419e-05, "loss": 2.5771085739135744, "memory(GiB)": 77.56, "step": 51155, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.438374 }, { "epoch": 2.1918512488753694, "grad_norm": 5.476842880249023, "learning_rate": 5.962040792095656e-05, "loss": 2.450410079956055, "memory(GiB)": 77.56, "step": 51160, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.438389 }, { "epoch": 2.1920654642046187, "grad_norm": 6.096722602844238, "learning_rate": 5.96138038066962e-05, "loss": 2.377358627319336, "memory(GiB)": 77.56, "step": 51165, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.438406 }, { "epoch": 2.1922796795338675, "grad_norm": 6.062929153442383, "learning_rate": 5.960719951827278e-05, "loss": 2.460158920288086, "memory(GiB)": 77.56, "step": 51170, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 1.438379 }, { "epoch": 2.1924938948631163, "grad_norm": 5.080893039703369, "learning_rate": 5.960059505580593e-05, "loss": 2.382518196105957, "memory(GiB)": 77.56, "step": 51175, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.438372 }, { "epoch": 2.1927081101923656, "grad_norm": 6.196957111358643, "learning_rate": 5.9593990419415294e-05, "loss": 2.1168500900268556, "memory(GiB)": 77.56, "step": 51180, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.43839 }, { "epoch": 2.1929223255216144, "grad_norm": 7.209712982177734, "learning_rate": 5.9587385609220516e-05, "loss": 2.4931427001953126, "memory(GiB)": 77.56, "step": 51185, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.438404 }, { "epoch": 2.193136540850863, "grad_norm": 5.172015190124512, "learning_rate": 5.958078062534126e-05, "loss": 2.451609802246094, "memory(GiB)": 77.56, "step": 51190, "token_acc": 0.4983277591973244, "train_speed(iter/s)": 1.438421 }, { "epoch": 2.1933507561801124, "grad_norm": 7.0667901039123535, "learning_rate": 5.957417546789717e-05, "loss": 2.3835845947265626, "memory(GiB)": 77.56, "step": 51195, "token_acc": 0.5405405405405406, "train_speed(iter/s)": 1.438419 }, { "epoch": 2.1935649715093612, "grad_norm": 6.388824939727783, "learning_rate": 5.956757013700791e-05, "loss": 2.527954864501953, "memory(GiB)": 77.56, "step": 51200, "token_acc": 0.5150214592274678, "train_speed(iter/s)": 1.438443 }, { "epoch": 2.19377918683861, "grad_norm": 6.017939567565918, "learning_rate": 5.956096463279314e-05, "loss": 2.4327980041503907, "memory(GiB)": 77.56, "step": 51205, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.43848 }, { "epoch": 2.1939934021678593, "grad_norm": 5.817434310913086, "learning_rate": 5.955435895537253e-05, "loss": 2.4561895370483398, "memory(GiB)": 77.56, "step": 51210, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.438491 }, { "epoch": 2.194207617497108, "grad_norm": 4.726731777191162, "learning_rate": 5.9547753104865746e-05, "loss": 2.4411893844604493, "memory(GiB)": 77.56, "step": 51215, "token_acc": 0.4816326530612245, "train_speed(iter/s)": 1.438498 }, { "epoch": 2.194421832826357, "grad_norm": 8.09035587310791, "learning_rate": 5.954114708139247e-05, "loss": 2.5014892578125, "memory(GiB)": 77.56, "step": 51220, "token_acc": 0.453125, "train_speed(iter/s)": 1.438489 }, { "epoch": 2.194636048155606, "grad_norm": 5.522891521453857, "learning_rate": 5.953454088507236e-05, "loss": 2.330046272277832, "memory(GiB)": 77.56, "step": 51225, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.438498 }, { "epoch": 2.194850263484855, "grad_norm": 6.340538501739502, "learning_rate": 5.952793451602507e-05, "loss": 2.178668975830078, "memory(GiB)": 77.56, "step": 51230, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.438496 }, { "epoch": 2.195064478814104, "grad_norm": 5.124255657196045, "learning_rate": 5.9521327974370334e-05, "loss": 2.484984588623047, "memory(GiB)": 77.56, "step": 51235, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.438513 }, { "epoch": 2.195278694143353, "grad_norm": 7.406636714935303, "learning_rate": 5.95147212602278e-05, "loss": 2.5710737228393556, "memory(GiB)": 77.56, "step": 51240, "token_acc": 0.4476744186046512, "train_speed(iter/s)": 1.438484 }, { "epoch": 2.195492909472602, "grad_norm": 5.909469127655029, "learning_rate": 5.950811437371716e-05, "loss": 2.448769760131836, "memory(GiB)": 77.56, "step": 51245, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.438464 }, { "epoch": 2.1957071248018507, "grad_norm": 5.257267475128174, "learning_rate": 5.950150731495813e-05, "loss": 2.4028879165649415, "memory(GiB)": 77.56, "step": 51250, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.438479 }, { "epoch": 2.1959213401311, "grad_norm": 4.758358478546143, "learning_rate": 5.949490008407037e-05, "loss": 2.5705322265625, "memory(GiB)": 77.56, "step": 51255, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.438516 }, { "epoch": 2.1961355554603488, "grad_norm": 5.432770729064941, "learning_rate": 5.9488292681173585e-05, "loss": 2.188041114807129, "memory(GiB)": 77.56, "step": 51260, "token_acc": 0.5475285171102662, "train_speed(iter/s)": 1.438521 }, { "epoch": 2.1963497707895976, "grad_norm": 4.859992504119873, "learning_rate": 5.948168510638748e-05, "loss": 2.528558921813965, "memory(GiB)": 77.56, "step": 51265, "token_acc": 0.4830188679245283, "train_speed(iter/s)": 1.438527 }, { "epoch": 2.196563986118847, "grad_norm": 6.28976583480835, "learning_rate": 5.9475077359831766e-05, "loss": 2.1902523040771484, "memory(GiB)": 77.56, "step": 51270, "token_acc": 0.5451388888888888, "train_speed(iter/s)": 1.438523 }, { "epoch": 2.1967782014480957, "grad_norm": 4.740793228149414, "learning_rate": 5.9468469441626116e-05, "loss": 2.7493324279785156, "memory(GiB)": 77.56, "step": 51275, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.438523 }, { "epoch": 2.1969924167773445, "grad_norm": 6.288125514984131, "learning_rate": 5.946186135189027e-05, "loss": 2.7736921310424805, "memory(GiB)": 77.56, "step": 51280, "token_acc": 0.4843205574912892, "train_speed(iter/s)": 1.438521 }, { "epoch": 2.1972066321065937, "grad_norm": 6.039641857147217, "learning_rate": 5.945525309074393e-05, "loss": 2.4235759735107423, "memory(GiB)": 77.56, "step": 51285, "token_acc": 0.4948805460750853, "train_speed(iter/s)": 1.438526 }, { "epoch": 2.1974208474358425, "grad_norm": 5.337045669555664, "learning_rate": 5.944864465830681e-05, "loss": 2.7233800888061523, "memory(GiB)": 77.56, "step": 51290, "token_acc": 0.48348348348348347, "train_speed(iter/s)": 1.438566 }, { "epoch": 2.1976350627650914, "grad_norm": 6.917486667633057, "learning_rate": 5.944203605469863e-05, "loss": 2.5187665939331056, "memory(GiB)": 77.56, "step": 51295, "token_acc": 0.475, "train_speed(iter/s)": 1.43859 }, { "epoch": 2.1978492780943406, "grad_norm": 4.805302143096924, "learning_rate": 5.943542728003911e-05, "loss": 2.369694137573242, "memory(GiB)": 77.56, "step": 51300, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.438604 }, { "epoch": 2.1980634934235894, "grad_norm": 5.587535858154297, "learning_rate": 5.9428818334447976e-05, "loss": 2.416248893737793, "memory(GiB)": 77.56, "step": 51305, "token_acc": 0.47232472324723246, "train_speed(iter/s)": 1.438618 }, { "epoch": 2.1982777087528382, "grad_norm": 5.735699653625488, "learning_rate": 5.9422209218044956e-05, "loss": 2.4903430938720703, "memory(GiB)": 77.56, "step": 51310, "token_acc": 0.4868804664723032, "train_speed(iter/s)": 1.438651 }, { "epoch": 2.1984919240820875, "grad_norm": 5.903115749359131, "learning_rate": 5.941559993094976e-05, "loss": 2.28411808013916, "memory(GiB)": 77.56, "step": 51315, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.1987061394113363, "grad_norm": 4.929615497589111, "learning_rate": 5.9408990473282145e-05, "loss": 2.1097713470458985, "memory(GiB)": 77.56, "step": 51320, "token_acc": 0.5433333333333333, "train_speed(iter/s)": 1.43864 }, { "epoch": 2.198920354740585, "grad_norm": 5.301702499389648, "learning_rate": 5.940238084516184e-05, "loss": 2.680929183959961, "memory(GiB)": 77.56, "step": 51325, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.438613 }, { "epoch": 2.1991345700698344, "grad_norm": 5.708707809448242, "learning_rate": 5.9395771046708594e-05, "loss": 2.2984172821044924, "memory(GiB)": 77.56, "step": 51330, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.438589 }, { "epoch": 2.199348785399083, "grad_norm": 5.186180114746094, "learning_rate": 5.9389161078042143e-05, "loss": 2.2402292251586915, "memory(GiB)": 77.56, "step": 51335, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.438577 }, { "epoch": 2.199563000728332, "grad_norm": 4.457876205444336, "learning_rate": 5.9382550939282234e-05, "loss": 2.462636184692383, "memory(GiB)": 77.56, "step": 51340, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.1997772160575813, "grad_norm": 4.908180236816406, "learning_rate": 5.9375940630548597e-05, "loss": 2.3931446075439453, "memory(GiB)": 77.56, "step": 51345, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.43856 }, { "epoch": 2.19999143138683, "grad_norm": 5.65999174118042, "learning_rate": 5.9369330151961e-05, "loss": 2.395417594909668, "memory(GiB)": 77.56, "step": 51350, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.438559 }, { "epoch": 2.200205646716079, "grad_norm": 5.591470241546631, "learning_rate": 5.9362719503639216e-05, "loss": 2.4952018737792967, "memory(GiB)": 77.56, "step": 51355, "token_acc": 0.4420731707317073, "train_speed(iter/s)": 1.438561 }, { "epoch": 2.200419862045328, "grad_norm": 4.321208953857422, "learning_rate": 5.9356108685702974e-05, "loss": 2.672606658935547, "memory(GiB)": 77.56, "step": 51360, "token_acc": 0.49375, "train_speed(iter/s)": 1.438555 }, { "epoch": 2.200634077374577, "grad_norm": 4.107922554016113, "learning_rate": 5.934949769827205e-05, "loss": 2.3642757415771483, "memory(GiB)": 77.56, "step": 51365, "token_acc": 0.5015673981191222, "train_speed(iter/s)": 1.438568 }, { "epoch": 2.2008482927038258, "grad_norm": 4.427626132965088, "learning_rate": 5.9342886541466204e-05, "loss": 2.5454174041748048, "memory(GiB)": 77.56, "step": 51370, "token_acc": 0.5, "train_speed(iter/s)": 1.438583 }, { "epoch": 2.201062508033075, "grad_norm": 5.962900638580322, "learning_rate": 5.93362752154052e-05, "loss": 2.4481891632080077, "memory(GiB)": 77.56, "step": 51375, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.438593 }, { "epoch": 2.201276723362324, "grad_norm": 5.660308361053467, "learning_rate": 5.9329663720208826e-05, "loss": 2.493613433837891, "memory(GiB)": 77.56, "step": 51380, "token_acc": 0.44569288389513106, "train_speed(iter/s)": 1.438574 }, { "epoch": 2.2014909386915726, "grad_norm": 5.489604473114014, "learning_rate": 5.932305205599683e-05, "loss": 2.7598297119140627, "memory(GiB)": 77.56, "step": 51385, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.438576 }, { "epoch": 2.201705154020822, "grad_norm": 5.936769008636475, "learning_rate": 5.931644022288899e-05, "loss": 2.371737480163574, "memory(GiB)": 77.56, "step": 51390, "token_acc": 0.45089285714285715, "train_speed(iter/s)": 1.438559 }, { "epoch": 2.2019193693500707, "grad_norm": 4.960988521575928, "learning_rate": 5.9309828221005115e-05, "loss": 2.4654918670654298, "memory(GiB)": 77.56, "step": 51395, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.438543 }, { "epoch": 2.2021335846793195, "grad_norm": 6.509946346282959, "learning_rate": 5.930321605046496e-05, "loss": 2.7539703369140627, "memory(GiB)": 77.56, "step": 51400, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.43856 }, { "epoch": 2.202347800008569, "grad_norm": 5.776191234588623, "learning_rate": 5.9296603711388324e-05, "loss": 2.575841522216797, "memory(GiB)": 77.56, "step": 51405, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.438594 }, { "epoch": 2.2025620153378176, "grad_norm": 6.656833171844482, "learning_rate": 5.928999120389499e-05, "loss": 2.5711742401123048, "memory(GiB)": 77.56, "step": 51410, "token_acc": 0.47706422018348627, "train_speed(iter/s)": 1.43859 }, { "epoch": 2.2027762306670664, "grad_norm": 4.375270366668701, "learning_rate": 5.928337852810475e-05, "loss": 2.601535606384277, "memory(GiB)": 77.56, "step": 51415, "token_acc": 0.452991452991453, "train_speed(iter/s)": 1.438576 }, { "epoch": 2.2029904459963157, "grad_norm": 4.985630989074707, "learning_rate": 5.927676568413739e-05, "loss": 2.402043342590332, "memory(GiB)": 77.56, "step": 51420, "token_acc": 0.48220064724919093, "train_speed(iter/s)": 1.438544 }, { "epoch": 2.2032046613255645, "grad_norm": 5.533224105834961, "learning_rate": 5.9270152672112725e-05, "loss": 2.276856231689453, "memory(GiB)": 77.56, "step": 51425, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.43859 }, { "epoch": 2.2034188766548133, "grad_norm": 5.177923202514648, "learning_rate": 5.9263539492150557e-05, "loss": 2.5078428268432615, "memory(GiB)": 77.56, "step": 51430, "token_acc": 0.4907749077490775, "train_speed(iter/s)": 1.438578 }, { "epoch": 2.2036330919840625, "grad_norm": 5.188407897949219, "learning_rate": 5.9256926144370663e-05, "loss": 2.50731201171875, "memory(GiB)": 77.56, "step": 51435, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.438539 }, { "epoch": 2.2038473073133114, "grad_norm": 8.031177520751953, "learning_rate": 5.9250312628892877e-05, "loss": 2.6311120986938477, "memory(GiB)": 77.56, "step": 51440, "token_acc": 0.4784172661870504, "train_speed(iter/s)": 1.438546 }, { "epoch": 2.20406152264256, "grad_norm": 4.27857780456543, "learning_rate": 5.9243698945837014e-05, "loss": 2.434372329711914, "memory(GiB)": 77.56, "step": 51445, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.438559 }, { "epoch": 2.2042757379718094, "grad_norm": 5.051950454711914, "learning_rate": 5.923708509532284e-05, "loss": 2.433517646789551, "memory(GiB)": 77.56, "step": 51450, "token_acc": 0.46996466431095407, "train_speed(iter/s)": 1.438559 }, { "epoch": 2.2044899533010582, "grad_norm": 5.938570022583008, "learning_rate": 5.923047107747024e-05, "loss": 2.32891731262207, "memory(GiB)": 77.56, "step": 51455, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.438583 }, { "epoch": 2.204704168630307, "grad_norm": 5.0678324699401855, "learning_rate": 5.9223856892398975e-05, "loss": 2.3945594787597657, "memory(GiB)": 77.56, "step": 51460, "token_acc": 0.5201238390092879, "train_speed(iter/s)": 1.438617 }, { "epoch": 2.2049183839595563, "grad_norm": 5.1532158851623535, "learning_rate": 5.921724254022889e-05, "loss": 2.245484161376953, "memory(GiB)": 77.56, "step": 51465, "token_acc": 0.49557522123893805, "train_speed(iter/s)": 1.438589 }, { "epoch": 2.205132599288805, "grad_norm": 6.8095808029174805, "learning_rate": 5.921062802107982e-05, "loss": 2.64456844329834, "memory(GiB)": 77.56, "step": 51470, "token_acc": 0.48046875, "train_speed(iter/s)": 1.43861 }, { "epoch": 2.205346814618054, "grad_norm": 6.624456882476807, "learning_rate": 5.920401333507157e-05, "loss": 2.413959503173828, "memory(GiB)": 77.56, "step": 51475, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.438627 }, { "epoch": 2.205561029947303, "grad_norm": 5.202213764190674, "learning_rate": 5.9197398482324e-05, "loss": 2.625692939758301, "memory(GiB)": 77.56, "step": 51480, "token_acc": 0.4707792207792208, "train_speed(iter/s)": 1.43867 }, { "epoch": 2.205775245276552, "grad_norm": 5.417222023010254, "learning_rate": 5.919078346295693e-05, "loss": 2.4275264739990234, "memory(GiB)": 77.56, "step": 51485, "token_acc": 0.4735202492211838, "train_speed(iter/s)": 1.43869 }, { "epoch": 2.205989460605801, "grad_norm": 5.424493789672852, "learning_rate": 5.918416827709018e-05, "loss": 2.1249998092651365, "memory(GiB)": 77.56, "step": 51490, "token_acc": 0.5241379310344828, "train_speed(iter/s)": 1.438659 }, { "epoch": 2.20620367593505, "grad_norm": 5.903954029083252, "learning_rate": 5.917755292484361e-05, "loss": 2.3350547790527343, "memory(GiB)": 77.56, "step": 51495, "token_acc": 0.5118343195266272, "train_speed(iter/s)": 1.438679 }, { "epoch": 2.206417891264299, "grad_norm": 4.104979038238525, "learning_rate": 5.917093740633707e-05, "loss": 2.266367721557617, "memory(GiB)": 77.56, "step": 51500, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.438671 }, { "epoch": 2.206417891264299, "eval_loss": 2.3244543075561523, "eval_runtime": 14.2381, "eval_samples_per_second": 7.023, "eval_steps_per_second": 7.023, "eval_token_acc": 0.4762532981530343, "step": 51500 }, { "epoch": 2.2066321065935477, "grad_norm": 4.286702632904053, "learning_rate": 5.916432172169038e-05, "loss": 2.5060062408447266, "memory(GiB)": 77.56, "step": 51505, "token_acc": 0.4753623188405797, "train_speed(iter/s)": 1.438069 }, { "epoch": 2.206846321922797, "grad_norm": 6.096134185791016, "learning_rate": 5.9157705871023426e-05, "loss": 2.478118133544922, "memory(GiB)": 77.56, "step": 51510, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.438036 }, { "epoch": 2.2070605372520458, "grad_norm": 4.494735240936279, "learning_rate": 5.915108985445603e-05, "loss": 2.555459403991699, "memory(GiB)": 77.56, "step": 51515, "token_acc": 0.4759036144578313, "train_speed(iter/s)": 1.438014 }, { "epoch": 2.2072747525812946, "grad_norm": 6.113365650177002, "learning_rate": 5.914447367210805e-05, "loss": 2.418491172790527, "memory(GiB)": 77.56, "step": 51520, "token_acc": 0.5477031802120141, "train_speed(iter/s)": 1.438002 }, { "epoch": 2.207488967910544, "grad_norm": 6.196977138519287, "learning_rate": 5.913785732409937e-05, "loss": 2.4084068298339845, "memory(GiB)": 77.56, "step": 51525, "token_acc": 0.488, "train_speed(iter/s)": 1.438027 }, { "epoch": 2.2077031832397926, "grad_norm": 6.043222427368164, "learning_rate": 5.913124081054981e-05, "loss": 1.9862640380859375, "memory(GiB)": 77.56, "step": 51530, "token_acc": 0.5401785714285714, "train_speed(iter/s)": 1.438077 }, { "epoch": 2.2079173985690415, "grad_norm": 4.764535427093506, "learning_rate": 5.912462413157926e-05, "loss": 2.5969568252563477, "memory(GiB)": 77.56, "step": 51535, "token_acc": 0.43823529411764706, "train_speed(iter/s)": 1.438091 }, { "epoch": 2.2081316138982907, "grad_norm": 5.156094074249268, "learning_rate": 5.91180072873076e-05, "loss": 2.5567049026489257, "memory(GiB)": 77.56, "step": 51540, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438104 }, { "epoch": 2.2083458292275395, "grad_norm": 4.799515247344971, "learning_rate": 5.9111390277854675e-05, "loss": 2.700315475463867, "memory(GiB)": 77.56, "step": 51545, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438136 }, { "epoch": 2.2085600445567883, "grad_norm": 4.821591377258301, "learning_rate": 5.910477310334036e-05, "loss": 2.553404426574707, "memory(GiB)": 77.56, "step": 51550, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.438164 }, { "epoch": 2.2087742598860376, "grad_norm": 6.490218162536621, "learning_rate": 5.9098155763884554e-05, "loss": 2.213795852661133, "memory(GiB)": 77.56, "step": 51555, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.438178 }, { "epoch": 2.2089884752152864, "grad_norm": 6.389596939086914, "learning_rate": 5.909153825960711e-05, "loss": 2.5539928436279298, "memory(GiB)": 77.56, "step": 51560, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438155 }, { "epoch": 2.209202690544535, "grad_norm": 6.180983543395996, "learning_rate": 5.908492059062794e-05, "loss": 2.3557964324951173, "memory(GiB)": 77.56, "step": 51565, "token_acc": 0.5109170305676856, "train_speed(iter/s)": 1.438161 }, { "epoch": 2.2094169058737845, "grad_norm": 5.574581623077393, "learning_rate": 5.907830275706689e-05, "loss": 2.4755584716796877, "memory(GiB)": 77.56, "step": 51570, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.438156 }, { "epoch": 2.2096311212030333, "grad_norm": 4.493891716003418, "learning_rate": 5.907168475904388e-05, "loss": 2.510402488708496, "memory(GiB)": 77.56, "step": 51575, "token_acc": 0.5146579804560261, "train_speed(iter/s)": 1.438189 }, { "epoch": 2.209845336532282, "grad_norm": 5.137444972991943, "learning_rate": 5.906506659667878e-05, "loss": 2.4336313247680663, "memory(GiB)": 77.56, "step": 51580, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.438213 }, { "epoch": 2.2100595518615314, "grad_norm": 6.816127777099609, "learning_rate": 5.905844827009151e-05, "loss": 1.9409049987792968, "memory(GiB)": 77.56, "step": 51585, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 1.438235 }, { "epoch": 2.21027376719078, "grad_norm": 5.381969451904297, "learning_rate": 5.905182977940195e-05, "loss": 2.7115360260009767, "memory(GiB)": 77.56, "step": 51590, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.438228 }, { "epoch": 2.210487982520029, "grad_norm": 6.080172061920166, "learning_rate": 5.904521112472999e-05, "loss": 2.5088022232055662, "memory(GiB)": 77.56, "step": 51595, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.438211 }, { "epoch": 2.2107021978492782, "grad_norm": 4.577946662902832, "learning_rate": 5.903859230619556e-05, "loss": 2.7236812591552733, "memory(GiB)": 77.56, "step": 51600, "token_acc": 0.4752475247524752, "train_speed(iter/s)": 1.438202 }, { "epoch": 2.210916413178527, "grad_norm": 5.804032802581787, "learning_rate": 5.903197332391853e-05, "loss": 2.7177486419677734, "memory(GiB)": 77.56, "step": 51605, "token_acc": 0.4200626959247649, "train_speed(iter/s)": 1.438225 }, { "epoch": 2.211130628507776, "grad_norm": 5.156857967376709, "learning_rate": 5.902535417801884e-05, "loss": 2.487660598754883, "memory(GiB)": 77.56, "step": 51610, "token_acc": 0.49266862170087977, "train_speed(iter/s)": 1.438213 }, { "epoch": 2.211344843837025, "grad_norm": 3.858626365661621, "learning_rate": 5.901873486861641e-05, "loss": 2.305619812011719, "memory(GiB)": 77.56, "step": 51615, "token_acc": 0.5140562248995983, "train_speed(iter/s)": 1.438194 }, { "epoch": 2.211559059166274, "grad_norm": 4.362866401672363, "learning_rate": 5.901211539583111e-05, "loss": 2.546542167663574, "memory(GiB)": 77.56, "step": 51620, "token_acc": 0.47129909365558914, "train_speed(iter/s)": 1.438222 }, { "epoch": 2.2117732744955227, "grad_norm": 5.550348281860352, "learning_rate": 5.900549575978291e-05, "loss": 2.7103553771972657, "memory(GiB)": 77.56, "step": 51625, "token_acc": 0.41358024691358025, "train_speed(iter/s)": 1.438227 }, { "epoch": 2.211987489824772, "grad_norm": 7.457678318023682, "learning_rate": 5.899887596059171e-05, "loss": 2.773662567138672, "memory(GiB)": 77.56, "step": 51630, "token_acc": 0.4503105590062112, "train_speed(iter/s)": 1.438234 }, { "epoch": 2.212201705154021, "grad_norm": 5.990923881530762, "learning_rate": 5.899225599837741e-05, "loss": 2.7374265670776365, "memory(GiB)": 77.56, "step": 51635, "token_acc": 0.43537414965986393, "train_speed(iter/s)": 1.438209 }, { "epoch": 2.2124159204832696, "grad_norm": 5.70811653137207, "learning_rate": 5.8985635873259956e-05, "loss": 2.471603775024414, "memory(GiB)": 77.56, "step": 51640, "token_acc": 0.46726190476190477, "train_speed(iter/s)": 1.438214 }, { "epoch": 2.212630135812519, "grad_norm": 4.653250217437744, "learning_rate": 5.8979015585359296e-05, "loss": 2.6476161956787108, "memory(GiB)": 77.56, "step": 51645, "token_acc": 0.4369747899159664, "train_speed(iter/s)": 1.438204 }, { "epoch": 2.2128443511417677, "grad_norm": 5.632009029388428, "learning_rate": 5.897239513479532e-05, "loss": 2.2976472854614256, "memory(GiB)": 77.56, "step": 51650, "token_acc": 0.4674329501915709, "train_speed(iter/s)": 1.438238 }, { "epoch": 2.2130585664710165, "grad_norm": 5.307003974914551, "learning_rate": 5.896577452168801e-05, "loss": 2.5293455123901367, "memory(GiB)": 77.56, "step": 51655, "token_acc": 0.48909657320872274, "train_speed(iter/s)": 1.438277 }, { "epoch": 2.2132727818002658, "grad_norm": 4.1953125, "learning_rate": 5.8959153746157294e-05, "loss": 2.3829559326171874, "memory(GiB)": 77.56, "step": 51660, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.43826 }, { "epoch": 2.2134869971295146, "grad_norm": 5.508793830871582, "learning_rate": 5.895253280832308e-05, "loss": 2.4228404998779296, "memory(GiB)": 77.56, "step": 51665, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438262 }, { "epoch": 2.2137012124587634, "grad_norm": 5.665297508239746, "learning_rate": 5.894591170830536e-05, "loss": 2.4385372161865235, "memory(GiB)": 77.56, "step": 51670, "token_acc": 0.5, "train_speed(iter/s)": 1.438291 }, { "epoch": 2.2139154277880126, "grad_norm": 5.090893268585205, "learning_rate": 5.893929044622404e-05, "loss": 2.1795680999755858, "memory(GiB)": 77.56, "step": 51675, "token_acc": 0.5359477124183006, "train_speed(iter/s)": 1.438304 }, { "epoch": 2.2141296431172615, "grad_norm": 4.7929887771606445, "learning_rate": 5.8932669022199095e-05, "loss": 2.09849853515625, "memory(GiB)": 77.56, "step": 51680, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.438289 }, { "epoch": 2.2143438584465103, "grad_norm": 6.889700889587402, "learning_rate": 5.892604743635045e-05, "loss": 2.566457748413086, "memory(GiB)": 77.56, "step": 51685, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.438322 }, { "epoch": 2.2145580737757595, "grad_norm": 5.650369167327881, "learning_rate": 5.891942568879811e-05, "loss": 2.613294792175293, "memory(GiB)": 77.56, "step": 51690, "token_acc": 0.47333333333333333, "train_speed(iter/s)": 1.438343 }, { "epoch": 2.2147722891050083, "grad_norm": 4.616402626037598, "learning_rate": 5.8912803779662e-05, "loss": 2.2758647918701174, "memory(GiB)": 77.56, "step": 51695, "token_acc": 0.47257383966244726, "train_speed(iter/s)": 1.438365 }, { "epoch": 2.214986504434257, "grad_norm": 6.8922810554504395, "learning_rate": 5.890618170906208e-05, "loss": 2.462801933288574, "memory(GiB)": 77.56, "step": 51700, "token_acc": 0.4769736842105263, "train_speed(iter/s)": 1.438396 }, { "epoch": 2.2152007197635064, "grad_norm": 4.460061073303223, "learning_rate": 5.889955947711834e-05, "loss": 2.406911087036133, "memory(GiB)": 77.56, "step": 51705, "token_acc": 0.5037878787878788, "train_speed(iter/s)": 1.438404 }, { "epoch": 2.215414935092755, "grad_norm": 6.122963905334473, "learning_rate": 5.8892937083950704e-05, "loss": 2.2896242141723633, "memory(GiB)": 77.56, "step": 51710, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.438421 }, { "epoch": 2.215629150422004, "grad_norm": 4.520376682281494, "learning_rate": 5.8886314529679196e-05, "loss": 2.5812835693359375, "memory(GiB)": 77.56, "step": 51715, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.438388 }, { "epoch": 2.2158433657512533, "grad_norm": 4.933167934417725, "learning_rate": 5.8879691814423744e-05, "loss": 2.4599193572998046, "memory(GiB)": 77.56, "step": 51720, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.438407 }, { "epoch": 2.216057581080502, "grad_norm": 4.681436538696289, "learning_rate": 5.8873068938304355e-05, "loss": 2.5470909118652343, "memory(GiB)": 77.56, "step": 51725, "token_acc": 0.4489051094890511, "train_speed(iter/s)": 1.438426 }, { "epoch": 2.216271796409751, "grad_norm": 5.931581974029541, "learning_rate": 5.8866445901441e-05, "loss": 2.547966194152832, "memory(GiB)": 77.56, "step": 51730, "token_acc": 0.4708029197080292, "train_speed(iter/s)": 1.43845 }, { "epoch": 2.216486011739, "grad_norm": 6.11769437789917, "learning_rate": 5.885982270395366e-05, "loss": 2.3104244232177735, "memory(GiB)": 77.56, "step": 51735, "token_acc": 0.5205992509363296, "train_speed(iter/s)": 1.43843 }, { "epoch": 2.216700227068249, "grad_norm": 4.562624931335449, "learning_rate": 5.885319934596233e-05, "loss": 2.6275177001953125, "memory(GiB)": 77.56, "step": 51740, "token_acc": 0.44807121661721067, "train_speed(iter/s)": 1.438447 }, { "epoch": 2.216914442397498, "grad_norm": 5.758816242218018, "learning_rate": 5.884657582758698e-05, "loss": 2.2551509857177736, "memory(GiB)": 77.56, "step": 51745, "token_acc": 0.5230125523012552, "train_speed(iter/s)": 1.43847 }, { "epoch": 2.217128657726747, "grad_norm": 6.4284467697143555, "learning_rate": 5.8839952148947594e-05, "loss": 2.382111740112305, "memory(GiB)": 77.56, "step": 51750, "token_acc": 0.5183946488294314, "train_speed(iter/s)": 1.438486 }, { "epoch": 2.217342873055996, "grad_norm": 5.3027143478393555, "learning_rate": 5.8833328310164215e-05, "loss": 2.307453155517578, "memory(GiB)": 77.56, "step": 51755, "token_acc": 0.5648854961832062, "train_speed(iter/s)": 1.438507 }, { "epoch": 2.2175570883852447, "grad_norm": 7.027342796325684, "learning_rate": 5.882670431135677e-05, "loss": 2.6029586791992188, "memory(GiB)": 77.56, "step": 51760, "token_acc": 0.4329896907216495, "train_speed(iter/s)": 1.438504 }, { "epoch": 2.217771303714494, "grad_norm": 4.677776336669922, "learning_rate": 5.882008015264532e-05, "loss": 2.6335948944091796, "memory(GiB)": 77.56, "step": 51765, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.438501 }, { "epoch": 2.2179855190437427, "grad_norm": 5.307129383087158, "learning_rate": 5.8813455834149846e-05, "loss": 2.38995304107666, "memory(GiB)": 77.56, "step": 51770, "token_acc": 0.4703389830508475, "train_speed(iter/s)": 1.438518 }, { "epoch": 2.2181997343729916, "grad_norm": 5.592504501342773, "learning_rate": 5.880683135599034e-05, "loss": 2.632982635498047, "memory(GiB)": 77.56, "step": 51775, "token_acc": 0.4177215189873418, "train_speed(iter/s)": 1.438551 }, { "epoch": 2.218413949702241, "grad_norm": 7.6745524406433105, "learning_rate": 5.880020671828683e-05, "loss": 2.3372930526733398, "memory(GiB)": 77.56, "step": 51780, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.438552 }, { "epoch": 2.2186281650314896, "grad_norm": 7.014909744262695, "learning_rate": 5.879358192115932e-05, "loss": 2.380252647399902, "memory(GiB)": 77.56, "step": 51785, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.438534 }, { "epoch": 2.2188423803607384, "grad_norm": 4.42019510269165, "learning_rate": 5.8786956964727834e-05, "loss": 2.44866828918457, "memory(GiB)": 77.56, "step": 51790, "token_acc": 0.5100864553314121, "train_speed(iter/s)": 1.438503 }, { "epoch": 2.2190565956899877, "grad_norm": 5.494714736938477, "learning_rate": 5.878033184911236e-05, "loss": 2.292796516418457, "memory(GiB)": 77.56, "step": 51795, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.438519 }, { "epoch": 2.2192708110192365, "grad_norm": 6.420318603515625, "learning_rate": 5.877370657443294e-05, "loss": 2.430602264404297, "memory(GiB)": 77.56, "step": 51800, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.438522 }, { "epoch": 2.2194850263484853, "grad_norm": 4.252730846405029, "learning_rate": 5.876708114080961e-05, "loss": 2.567221832275391, "memory(GiB)": 77.56, "step": 51805, "token_acc": 0.4670487106017192, "train_speed(iter/s)": 1.438525 }, { "epoch": 2.2196992416777346, "grad_norm": 4.965419769287109, "learning_rate": 5.876045554836237e-05, "loss": 2.1858219146728515, "memory(GiB)": 77.56, "step": 51810, "token_acc": 0.556420233463035, "train_speed(iter/s)": 1.438544 }, { "epoch": 2.2199134570069834, "grad_norm": 4.175282001495361, "learning_rate": 5.875382979721127e-05, "loss": 2.6881969451904295, "memory(GiB)": 77.56, "step": 51815, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.220127672336232, "grad_norm": 6.368675708770752, "learning_rate": 5.874720388747632e-05, "loss": 2.541495132446289, "memory(GiB)": 77.56, "step": 51820, "token_acc": 0.4711111111111111, "train_speed(iter/s)": 1.438543 }, { "epoch": 2.2203418876654815, "grad_norm": 5.018083095550537, "learning_rate": 5.874057781927756e-05, "loss": 2.4667293548583986, "memory(GiB)": 77.56, "step": 51825, "token_acc": 0.440625, "train_speed(iter/s)": 1.438538 }, { "epoch": 2.2205561029947303, "grad_norm": 5.82100248336792, "learning_rate": 5.8733951592735045e-05, "loss": 2.636016273498535, "memory(GiB)": 77.56, "step": 51830, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.438536 }, { "epoch": 2.220770318323979, "grad_norm": 6.133962631225586, "learning_rate": 5.8727325207968806e-05, "loss": 2.5272632598876954, "memory(GiB)": 77.56, "step": 51835, "token_acc": 0.4349315068493151, "train_speed(iter/s)": 1.438523 }, { "epoch": 2.2209845336532283, "grad_norm": 5.133821964263916, "learning_rate": 5.872069866509887e-05, "loss": 2.3145822525024413, "memory(GiB)": 77.56, "step": 51840, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.438529 }, { "epoch": 2.221198748982477, "grad_norm": 10.58356761932373, "learning_rate": 5.871407196424532e-05, "loss": 2.1660438537597657, "memory(GiB)": 77.56, "step": 51845, "token_acc": 0.4854771784232365, "train_speed(iter/s)": 1.438528 }, { "epoch": 2.221412964311726, "grad_norm": 5.412009239196777, "learning_rate": 5.870744510552817e-05, "loss": 2.1739456176757814, "memory(GiB)": 77.56, "step": 51850, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.438559 }, { "epoch": 2.2216271796409752, "grad_norm": 6.137373447418213, "learning_rate": 5.8700818089067474e-05, "loss": 2.479261779785156, "memory(GiB)": 77.56, "step": 51855, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.438569 }, { "epoch": 2.221841394970224, "grad_norm": 6.139931678771973, "learning_rate": 5.8694190914983317e-05, "loss": 2.496595001220703, "memory(GiB)": 77.56, "step": 51860, "token_acc": 0.46215139442231074, "train_speed(iter/s)": 1.438573 }, { "epoch": 2.222055610299473, "grad_norm": 7.457913398742676, "learning_rate": 5.868756358339572e-05, "loss": 2.512097930908203, "memory(GiB)": 77.56, "step": 51865, "token_acc": 0.49693251533742333, "train_speed(iter/s)": 1.438601 }, { "epoch": 2.222269825628722, "grad_norm": 5.847672462463379, "learning_rate": 5.8680936094424754e-05, "loss": 2.334166717529297, "memory(GiB)": 77.56, "step": 51870, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.438625 }, { "epoch": 2.222484040957971, "grad_norm": 4.2275071144104, "learning_rate": 5.8674308448190506e-05, "loss": 2.639978790283203, "memory(GiB)": 77.56, "step": 51875, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.438606 }, { "epoch": 2.2226982562872197, "grad_norm": 5.724484920501709, "learning_rate": 5.8667680644813005e-05, "loss": 2.419038009643555, "memory(GiB)": 77.56, "step": 51880, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 1.438609 }, { "epoch": 2.222912471616469, "grad_norm": 5.660974979400635, "learning_rate": 5.8661052684412354e-05, "loss": 2.2676227569580076, "memory(GiB)": 77.56, "step": 51885, "token_acc": 0.4578313253012048, "train_speed(iter/s)": 1.438634 }, { "epoch": 2.223126686945718, "grad_norm": 6.126853942871094, "learning_rate": 5.86544245671086e-05, "loss": 2.6737537384033203, "memory(GiB)": 77.56, "step": 51890, "token_acc": 0.4189723320158103, "train_speed(iter/s)": 1.438643 }, { "epoch": 2.2233409022749666, "grad_norm": 6.544710636138916, "learning_rate": 5.8647796293021826e-05, "loss": 2.8202558517456056, "memory(GiB)": 77.56, "step": 51895, "token_acc": 0.44609665427509293, "train_speed(iter/s)": 1.438644 }, { "epoch": 2.223555117604216, "grad_norm": 6.707851409912109, "learning_rate": 5.864116786227212e-05, "loss": 2.3110851287841796, "memory(GiB)": 77.56, "step": 51900, "token_acc": 0.49843260188087773, "train_speed(iter/s)": 1.438658 }, { "epoch": 2.2237693329334647, "grad_norm": 5.187964916229248, "learning_rate": 5.863453927497954e-05, "loss": 2.4777355194091797, "memory(GiB)": 77.56, "step": 51905, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.438642 }, { "epoch": 2.2239835482627135, "grad_norm": 4.107360363006592, "learning_rate": 5.8627910531264176e-05, "loss": 2.625115966796875, "memory(GiB)": 77.56, "step": 51910, "token_acc": 0.4619883040935672, "train_speed(iter/s)": 1.438635 }, { "epoch": 2.2241977635919628, "grad_norm": 5.4470720291137695, "learning_rate": 5.862128163124613e-05, "loss": 2.4041315078735352, "memory(GiB)": 77.56, "step": 51915, "token_acc": 0.5202702702702703, "train_speed(iter/s)": 1.438623 }, { "epoch": 2.2244119789212116, "grad_norm": 6.143822193145752, "learning_rate": 5.861465257504548e-05, "loss": 2.435796356201172, "memory(GiB)": 77.56, "step": 51920, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.438612 }, { "epoch": 2.2246261942504604, "grad_norm": 10.512497901916504, "learning_rate": 5.8608023362782316e-05, "loss": 2.5388055801391602, "memory(GiB)": 77.56, "step": 51925, "token_acc": 0.4536082474226804, "train_speed(iter/s)": 1.438643 }, { "epoch": 2.2248404095797096, "grad_norm": 7.161112308502197, "learning_rate": 5.8601393994576734e-05, "loss": 2.444790267944336, "memory(GiB)": 77.56, "step": 51930, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.438609 }, { "epoch": 2.2250546249089584, "grad_norm": 6.528739929199219, "learning_rate": 5.859476447054884e-05, "loss": 2.480727767944336, "memory(GiB)": 77.56, "step": 51935, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.438618 }, { "epoch": 2.2252688402382073, "grad_norm": 4.665379047393799, "learning_rate": 5.8588134790818707e-05, "loss": 2.0166717529296876, "memory(GiB)": 77.56, "step": 51940, "token_acc": 0.5393258426966292, "train_speed(iter/s)": 1.438603 }, { "epoch": 2.2254830555674565, "grad_norm": 7.0894694328308105, "learning_rate": 5.858150495550646e-05, "loss": 2.2285882949829103, "memory(GiB)": 77.56, "step": 51945, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.438625 }, { "epoch": 2.2256972708967053, "grad_norm": 5.141116619110107, "learning_rate": 5.857487496473221e-05, "loss": 2.2541152954101564, "memory(GiB)": 77.56, "step": 51950, "token_acc": 0.5432098765432098, "train_speed(iter/s)": 1.438622 }, { "epoch": 2.225911486225954, "grad_norm": 6.123048305511475, "learning_rate": 5.856824481861605e-05, "loss": 2.383865165710449, "memory(GiB)": 77.56, "step": 51955, "token_acc": 0.4627831715210356, "train_speed(iter/s)": 1.438603 }, { "epoch": 2.2261257015552034, "grad_norm": 7.0820441246032715, "learning_rate": 5.85616145172781e-05, "loss": 2.54730339050293, "memory(GiB)": 77.56, "step": 51960, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.438598 }, { "epoch": 2.226339916884452, "grad_norm": 6.379467010498047, "learning_rate": 5.855498406083847e-05, "loss": 2.458445739746094, "memory(GiB)": 77.56, "step": 51965, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.438636 }, { "epoch": 2.226554132213701, "grad_norm": 6.872741222381592, "learning_rate": 5.854835344941727e-05, "loss": 2.640496826171875, "memory(GiB)": 77.56, "step": 51970, "token_acc": 0.43416370106761565, "train_speed(iter/s)": 1.438631 }, { "epoch": 2.2267683475429503, "grad_norm": 5.836672306060791, "learning_rate": 5.854172268313465e-05, "loss": 2.2867881774902346, "memory(GiB)": 77.56, "step": 51975, "token_acc": 0.5547445255474452, "train_speed(iter/s)": 1.438619 }, { "epoch": 2.226982562872199, "grad_norm": 5.967794895172119, "learning_rate": 5.8535091762110695e-05, "loss": 2.4145483016967773, "memory(GiB)": 77.56, "step": 51980, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.438599 }, { "epoch": 2.227196778201448, "grad_norm": 6.1902899742126465, "learning_rate": 5.852846068646554e-05, "loss": 2.2402278900146486, "memory(GiB)": 77.56, "step": 51985, "token_acc": 0.5, "train_speed(iter/s)": 1.438626 }, { "epoch": 2.227410993530697, "grad_norm": 5.117874622344971, "learning_rate": 5.8521829456319334e-05, "loss": 2.5297452926635744, "memory(GiB)": 77.56, "step": 51990, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.43861 }, { "epoch": 2.227625208859946, "grad_norm": 6.199989318847656, "learning_rate": 5.851519807179219e-05, "loss": 2.2609928131103514, "memory(GiB)": 77.56, "step": 51995, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.438601 }, { "epoch": 2.227839424189195, "grad_norm": 6.211108207702637, "learning_rate": 5.850856653300424e-05, "loss": 2.674707794189453, "memory(GiB)": 77.56, "step": 52000, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.438565 }, { "epoch": 2.227839424189195, "eval_loss": 2.3591954708099365, "eval_runtime": 14.591, "eval_samples_per_second": 6.854, "eval_steps_per_second": 6.854, "eval_token_acc": 0.4573248407643312, "step": 52000 }, { "epoch": 2.228053639518444, "grad_norm": 4.987310886383057, "learning_rate": 5.850193484007563e-05, "loss": 2.3738496780395506, "memory(GiB)": 77.56, "step": 52005, "token_acc": 0.46702317290552586, "train_speed(iter/s)": 1.437957 }, { "epoch": 2.228267854847693, "grad_norm": 7.356802940368652, "learning_rate": 5.849530299312649e-05, "loss": 2.2208412170410154, "memory(GiB)": 77.56, "step": 52010, "token_acc": 0.524822695035461, "train_speed(iter/s)": 1.437989 }, { "epoch": 2.2284820701769417, "grad_norm": 4.580575942993164, "learning_rate": 5.848867099227696e-05, "loss": 2.5155324935913086, "memory(GiB)": 77.56, "step": 52015, "token_acc": 0.4887459807073955, "train_speed(iter/s)": 1.438001 }, { "epoch": 2.228696285506191, "grad_norm": 5.334826946258545, "learning_rate": 5.848203883764721e-05, "loss": 2.5269001007080076, "memory(GiB)": 77.56, "step": 52020, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.438045 }, { "epoch": 2.2289105008354397, "grad_norm": 5.351432800292969, "learning_rate": 5.8475406529357356e-05, "loss": 2.78003044128418, "memory(GiB)": 77.56, "step": 52025, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.438051 }, { "epoch": 2.2291247161646885, "grad_norm": 5.526770114898682, "learning_rate": 5.8468774067527575e-05, "loss": 2.8215051651000977, "memory(GiB)": 77.56, "step": 52030, "token_acc": 0.3993610223642173, "train_speed(iter/s)": 1.438029 }, { "epoch": 2.229338931493938, "grad_norm": 5.490999698638916, "learning_rate": 5.8462141452277995e-05, "loss": 2.418069839477539, "memory(GiB)": 77.56, "step": 52035, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.438045 }, { "epoch": 2.2295531468231866, "grad_norm": 7.099259853363037, "learning_rate": 5.845550868372879e-05, "loss": 2.4227771759033203, "memory(GiB)": 77.56, "step": 52040, "token_acc": 0.4632352941176471, "train_speed(iter/s)": 1.438081 }, { "epoch": 2.2297673621524354, "grad_norm": 6.0998735427856445, "learning_rate": 5.844887576200012e-05, "loss": 2.3925891876220704, "memory(GiB)": 77.56, "step": 52045, "token_acc": 0.5122699386503068, "train_speed(iter/s)": 1.438078 }, { "epoch": 2.2299815774816847, "grad_norm": 4.469804286956787, "learning_rate": 5.844224268721214e-05, "loss": 2.340729904174805, "memory(GiB)": 77.56, "step": 52050, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.438053 }, { "epoch": 2.2301957928109335, "grad_norm": 4.652035713195801, "learning_rate": 5.843560945948499e-05, "loss": 2.371796989440918, "memory(GiB)": 77.56, "step": 52055, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.438051 }, { "epoch": 2.2304100081401823, "grad_norm": 5.386318206787109, "learning_rate": 5.8428976078938877e-05, "loss": 2.4027633666992188, "memory(GiB)": 77.56, "step": 52060, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.438054 }, { "epoch": 2.2306242234694316, "grad_norm": 5.4634270668029785, "learning_rate": 5.842234254569396e-05, "loss": 2.408706855773926, "memory(GiB)": 77.56, "step": 52065, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.438047 }, { "epoch": 2.2308384387986804, "grad_norm": 4.541078090667725, "learning_rate": 5.84157088598704e-05, "loss": 2.631957244873047, "memory(GiB)": 77.56, "step": 52070, "token_acc": 0.444794952681388, "train_speed(iter/s)": 1.438056 }, { "epoch": 2.231052654127929, "grad_norm": 6.064865589141846, "learning_rate": 5.840907502158839e-05, "loss": 2.4243370056152345, "memory(GiB)": 77.56, "step": 52075, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.438034 }, { "epoch": 2.2312668694571784, "grad_norm": 3.8478260040283203, "learning_rate": 5.84024410309681e-05, "loss": 2.5084304809570312, "memory(GiB)": 77.56, "step": 52080, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.438011 }, { "epoch": 2.2314810847864273, "grad_norm": 4.559310436248779, "learning_rate": 5.839580688812969e-05, "loss": 2.4697303771972656, "memory(GiB)": 77.56, "step": 52085, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.438013 }, { "epoch": 2.231695300115676, "grad_norm": 6.388387680053711, "learning_rate": 5.8389172593193365e-05, "loss": 2.638285827636719, "memory(GiB)": 77.56, "step": 52090, "token_acc": 0.46006389776357826, "train_speed(iter/s)": 1.438005 }, { "epoch": 2.2319095154449253, "grad_norm": 4.712356090545654, "learning_rate": 5.838253814627932e-05, "loss": 2.4830198287963867, "memory(GiB)": 77.56, "step": 52095, "token_acc": 0.5244755244755245, "train_speed(iter/s)": 1.438023 }, { "epoch": 2.232123730774174, "grad_norm": 6.152145862579346, "learning_rate": 5.8375903547507724e-05, "loss": 2.402608299255371, "memory(GiB)": 77.56, "step": 52100, "token_acc": 0.4785992217898833, "train_speed(iter/s)": 1.438046 }, { "epoch": 2.232337946103423, "grad_norm": 5.250881195068359, "learning_rate": 5.836926879699879e-05, "loss": 2.586806297302246, "memory(GiB)": 77.56, "step": 52105, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.438059 }, { "epoch": 2.232552161432672, "grad_norm": 7.42572021484375, "learning_rate": 5.83626338948727e-05, "loss": 2.5476520538330076, "memory(GiB)": 77.56, "step": 52110, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.438051 }, { "epoch": 2.232766376761921, "grad_norm": 4.829923152923584, "learning_rate": 5.835599884124964e-05, "loss": 2.460332489013672, "memory(GiB)": 77.56, "step": 52115, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.438046 }, { "epoch": 2.2329805920911703, "grad_norm": 5.859281063079834, "learning_rate": 5.8349363636249835e-05, "loss": 2.6722280502319338, "memory(GiB)": 77.56, "step": 52120, "token_acc": 0.4479166666666667, "train_speed(iter/s)": 1.438059 }, { "epoch": 2.233194807420419, "grad_norm": 4.803500175476074, "learning_rate": 5.834272827999345e-05, "loss": 2.4590518951416014, "memory(GiB)": 77.56, "step": 52125, "token_acc": 0.47278911564625853, "train_speed(iter/s)": 1.438067 }, { "epoch": 2.233409022749668, "grad_norm": 4.993882656097412, "learning_rate": 5.833609277260074e-05, "loss": 2.3638330459594727, "memory(GiB)": 77.56, "step": 52130, "token_acc": 0.504, "train_speed(iter/s)": 1.438076 }, { "epoch": 2.233623238078917, "grad_norm": 6.235511302947998, "learning_rate": 5.8329457114191886e-05, "loss": 2.392010498046875, "memory(GiB)": 77.56, "step": 52135, "token_acc": 0.5047619047619047, "train_speed(iter/s)": 1.438097 }, { "epoch": 2.233837453408166, "grad_norm": 6.040309906005859, "learning_rate": 5.832282130488711e-05, "loss": 2.4152313232421876, "memory(GiB)": 77.56, "step": 52140, "token_acc": 0.5158730158730159, "train_speed(iter/s)": 1.438114 }, { "epoch": 2.234051668737415, "grad_norm": 5.303132057189941, "learning_rate": 5.8316185344806596e-05, "loss": 2.158448600769043, "memory(GiB)": 77.56, "step": 52145, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.438134 }, { "epoch": 2.234265884066664, "grad_norm": 5.439749240875244, "learning_rate": 5.8309549234070605e-05, "loss": 2.461848831176758, "memory(GiB)": 77.56, "step": 52150, "token_acc": 0.4911660777385159, "train_speed(iter/s)": 1.438136 }, { "epoch": 2.234480099395913, "grad_norm": 4.345194339752197, "learning_rate": 5.8302912972799315e-05, "loss": 2.5911281585693358, "memory(GiB)": 77.56, "step": 52155, "token_acc": 0.48324022346368717, "train_speed(iter/s)": 1.438113 }, { "epoch": 2.2346943147251617, "grad_norm": 5.080658912658691, "learning_rate": 5.8296276561112985e-05, "loss": 2.3790657043457033, "memory(GiB)": 77.56, "step": 52160, "token_acc": 0.49429657794676807, "train_speed(iter/s)": 1.438143 }, { "epoch": 2.234908530054411, "grad_norm": 4.814274787902832, "learning_rate": 5.828963999913182e-05, "loss": 2.2174882888793945, "memory(GiB)": 77.56, "step": 52165, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.438142 }, { "epoch": 2.2351227453836597, "grad_norm": 4.737116813659668, "learning_rate": 5.8283003286976035e-05, "loss": 2.3507022857666016, "memory(GiB)": 77.56, "step": 52170, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.438137 }, { "epoch": 2.2353369607129085, "grad_norm": 5.720367908477783, "learning_rate": 5.827636642476589e-05, "loss": 2.209674072265625, "memory(GiB)": 77.56, "step": 52175, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 1.438184 }, { "epoch": 2.235551176042158, "grad_norm": 5.919825077056885, "learning_rate": 5.826972941262161e-05, "loss": 2.558554840087891, "memory(GiB)": 77.56, "step": 52180, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.438194 }, { "epoch": 2.2357653913714066, "grad_norm": 5.511066913604736, "learning_rate": 5.826309225066341e-05, "loss": 2.4316606521606445, "memory(GiB)": 77.56, "step": 52185, "token_acc": 0.48, "train_speed(iter/s)": 1.438209 }, { "epoch": 2.2359796067006554, "grad_norm": 4.484046459197998, "learning_rate": 5.825645493901155e-05, "loss": 2.3739242553710938, "memory(GiB)": 77.56, "step": 52190, "token_acc": 0.4723926380368098, "train_speed(iter/s)": 1.438246 }, { "epoch": 2.2361938220299047, "grad_norm": 5.770759105682373, "learning_rate": 5.824981747778626e-05, "loss": 2.4203216552734377, "memory(GiB)": 77.56, "step": 52195, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438259 }, { "epoch": 2.2364080373591535, "grad_norm": 4.978071689605713, "learning_rate": 5.824317986710778e-05, "loss": 2.4877138137817383, "memory(GiB)": 77.56, "step": 52200, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.438261 }, { "epoch": 2.2366222526884023, "grad_norm": 5.100183486938477, "learning_rate": 5.823654210709637e-05, "loss": 2.5518028259277346, "memory(GiB)": 77.56, "step": 52205, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.438285 }, { "epoch": 2.2368364680176516, "grad_norm": 6.134519577026367, "learning_rate": 5.8229904197872284e-05, "loss": 2.7520109176635743, "memory(GiB)": 77.56, "step": 52210, "token_acc": 0.4272151898734177, "train_speed(iter/s)": 1.438244 }, { "epoch": 2.2370506833469004, "grad_norm": 5.845499038696289, "learning_rate": 5.822326613955574e-05, "loss": 2.6151599884033203, "memory(GiB)": 77.56, "step": 52215, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.438244 }, { "epoch": 2.237264898676149, "grad_norm": 5.837034702301025, "learning_rate": 5.821662793226704e-05, "loss": 2.2620176315307616, "memory(GiB)": 77.56, "step": 52220, "token_acc": 0.547244094488189, "train_speed(iter/s)": 1.438238 }, { "epoch": 2.2374791140053985, "grad_norm": 5.221806049346924, "learning_rate": 5.820998957612641e-05, "loss": 2.3668664932250976, "memory(GiB)": 77.56, "step": 52225, "token_acc": 0.47470817120622566, "train_speed(iter/s)": 1.438253 }, { "epoch": 2.2376933293346473, "grad_norm": 4.990957260131836, "learning_rate": 5.820335107125412e-05, "loss": 2.4245464324951174, "memory(GiB)": 77.56, "step": 52230, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.438241 }, { "epoch": 2.237907544663896, "grad_norm": 4.406238079071045, "learning_rate": 5.819671241777043e-05, "loss": 2.5115472793579103, "memory(GiB)": 77.56, "step": 52235, "token_acc": 0.44666666666666666, "train_speed(iter/s)": 1.438236 }, { "epoch": 2.2381217599931453, "grad_norm": 5.33854341506958, "learning_rate": 5.819007361579558e-05, "loss": 2.3464244842529296, "memory(GiB)": 77.56, "step": 52240, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.438265 }, { "epoch": 2.238335975322394, "grad_norm": 4.555159568786621, "learning_rate": 5.818343466544989e-05, "loss": 2.3547286987304688, "memory(GiB)": 77.56, "step": 52245, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.438254 }, { "epoch": 2.238550190651643, "grad_norm": 6.115626335144043, "learning_rate": 5.8176795566853606e-05, "loss": 2.206055450439453, "memory(GiB)": 77.56, "step": 52250, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.438235 }, { "epoch": 2.238764405980892, "grad_norm": 6.728278160095215, "learning_rate": 5.817015632012699e-05, "loss": 2.427275848388672, "memory(GiB)": 77.56, "step": 52255, "token_acc": 0.4641509433962264, "train_speed(iter/s)": 1.438241 }, { "epoch": 2.238978621310141, "grad_norm": 5.419593811035156, "learning_rate": 5.816351692539033e-05, "loss": 2.164683723449707, "memory(GiB)": 77.56, "step": 52260, "token_acc": 0.5634920634920635, "train_speed(iter/s)": 1.438281 }, { "epoch": 2.23919283663939, "grad_norm": 4.266914367675781, "learning_rate": 5.81568773827639e-05, "loss": 2.502466583251953, "memory(GiB)": 77.56, "step": 52265, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.4383 }, { "epoch": 2.239407051968639, "grad_norm": 4.814389228820801, "learning_rate": 5.815023769236798e-05, "loss": 2.730708122253418, "memory(GiB)": 77.56, "step": 52270, "token_acc": 0.4780058651026393, "train_speed(iter/s)": 1.438289 }, { "epoch": 2.239621267297888, "grad_norm": 5.215169906616211, "learning_rate": 5.814359785432286e-05, "loss": 2.356830596923828, "memory(GiB)": 77.56, "step": 52275, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.438281 }, { "epoch": 2.2398354826271367, "grad_norm": 6.906881809234619, "learning_rate": 5.8136957868748844e-05, "loss": 2.4186727523803713, "memory(GiB)": 77.56, "step": 52280, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.438296 }, { "epoch": 2.240049697956386, "grad_norm": 5.0663981437683105, "learning_rate": 5.813031773576618e-05, "loss": 2.3654603958129883, "memory(GiB)": 77.56, "step": 52285, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.438264 }, { "epoch": 2.240263913285635, "grad_norm": 6.552148818969727, "learning_rate": 5.81236774554952e-05, "loss": 2.625449371337891, "memory(GiB)": 77.56, "step": 52290, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.438269 }, { "epoch": 2.2404781286148836, "grad_norm": 4.879304885864258, "learning_rate": 5.811703702805618e-05, "loss": 2.2694137573242186, "memory(GiB)": 77.56, "step": 52295, "token_acc": 0.5191082802547771, "train_speed(iter/s)": 1.438279 }, { "epoch": 2.240692343944133, "grad_norm": 4.904812335968018, "learning_rate": 5.811039645356941e-05, "loss": 2.1864694595336913, "memory(GiB)": 77.56, "step": 52300, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.438285 }, { "epoch": 2.2409065592733817, "grad_norm": 6.095534324645996, "learning_rate": 5.810375573215521e-05, "loss": 2.602367401123047, "memory(GiB)": 77.56, "step": 52305, "token_acc": 0.4984520123839009, "train_speed(iter/s)": 1.438298 }, { "epoch": 2.2411207746026305, "grad_norm": 6.377011299133301, "learning_rate": 5.809711486393388e-05, "loss": 2.2531185150146484, "memory(GiB)": 77.56, "step": 52310, "token_acc": 0.5103734439834025, "train_speed(iter/s)": 1.438313 }, { "epoch": 2.2413349899318797, "grad_norm": 5.126463413238525, "learning_rate": 5.8090473849025685e-05, "loss": 2.141797637939453, "memory(GiB)": 77.56, "step": 52315, "token_acc": 0.5182186234817814, "train_speed(iter/s)": 1.438314 }, { "epoch": 2.2415492052611286, "grad_norm": 8.175921440124512, "learning_rate": 5.8083832687551e-05, "loss": 2.5346071243286135, "memory(GiB)": 77.56, "step": 52320, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.438315 }, { "epoch": 2.2417634205903774, "grad_norm": 9.176375389099121, "learning_rate": 5.807719137963009e-05, "loss": 2.480098915100098, "memory(GiB)": 77.56, "step": 52325, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.438336 }, { "epoch": 2.2419776359196266, "grad_norm": 5.458570957183838, "learning_rate": 5.807054992538328e-05, "loss": 2.5901315689086912, "memory(GiB)": 77.56, "step": 52330, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.43835 }, { "epoch": 2.2421918512488754, "grad_norm": 5.865347385406494, "learning_rate": 5.806390832493089e-05, "loss": 2.528288650512695, "memory(GiB)": 77.56, "step": 52335, "token_acc": 0.4847328244274809, "train_speed(iter/s)": 1.438351 }, { "epoch": 2.2424060665781242, "grad_norm": 5.235307693481445, "learning_rate": 5.805726657839324e-05, "loss": 2.5595371246337892, "memory(GiB)": 77.56, "step": 52340, "token_acc": 0.4293785310734463, "train_speed(iter/s)": 1.438354 }, { "epoch": 2.2426202819073735, "grad_norm": 5.518535137176514, "learning_rate": 5.805062468589064e-05, "loss": 2.1073171615600588, "memory(GiB)": 77.56, "step": 52345, "token_acc": 0.5382059800664452, "train_speed(iter/s)": 1.43831 }, { "epoch": 2.2428344972366223, "grad_norm": 5.909546375274658, "learning_rate": 5.8043982647543426e-05, "loss": 2.144661712646484, "memory(GiB)": 77.56, "step": 52350, "token_acc": 0.5687732342007435, "train_speed(iter/s)": 1.438324 }, { "epoch": 2.243048712565871, "grad_norm": 5.499137878417969, "learning_rate": 5.803734046347192e-05, "loss": 2.4163820266723635, "memory(GiB)": 77.56, "step": 52355, "token_acc": 0.5178571428571429, "train_speed(iter/s)": 1.438327 }, { "epoch": 2.2432629278951204, "grad_norm": 5.9609055519104, "learning_rate": 5.8030698133796445e-05, "loss": 2.4384990692138673, "memory(GiB)": 77.56, "step": 52360, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.438336 }, { "epoch": 2.243477143224369, "grad_norm": 6.5359930992126465, "learning_rate": 5.802405565863735e-05, "loss": 2.4144208908081053, "memory(GiB)": 77.56, "step": 52365, "token_acc": 0.4977973568281938, "train_speed(iter/s)": 1.438328 }, { "epoch": 2.243691358553618, "grad_norm": 9.94322681427002, "learning_rate": 5.8017413038114965e-05, "loss": 2.3920829772949217, "memory(GiB)": 77.56, "step": 52370, "token_acc": 0.5267175572519084, "train_speed(iter/s)": 1.438345 }, { "epoch": 2.2439055738828673, "grad_norm": 4.635645866394043, "learning_rate": 5.8010770272349615e-05, "loss": 2.440418243408203, "memory(GiB)": 77.56, "step": 52375, "token_acc": 0.49221183800623053, "train_speed(iter/s)": 1.438394 }, { "epoch": 2.244119789212116, "grad_norm": 4.728506565093994, "learning_rate": 5.8004127361461644e-05, "loss": 2.575151824951172, "memory(GiB)": 77.56, "step": 52380, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.438367 }, { "epoch": 2.244334004541365, "grad_norm": 5.283487796783447, "learning_rate": 5.799748430557139e-05, "loss": 2.39417724609375, "memory(GiB)": 77.56, "step": 52385, "token_acc": 0.5313531353135313, "train_speed(iter/s)": 1.438366 }, { "epoch": 2.244548219870614, "grad_norm": 5.265110969543457, "learning_rate": 5.799084110479921e-05, "loss": 2.6639053344726564, "memory(GiB)": 77.56, "step": 52390, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.438358 }, { "epoch": 2.244762435199863, "grad_norm": 6.173926830291748, "learning_rate": 5.798419775926546e-05, "loss": 2.472491455078125, "memory(GiB)": 77.56, "step": 52395, "token_acc": 0.4582043343653251, "train_speed(iter/s)": 1.438358 }, { "epoch": 2.2449766505291118, "grad_norm": 4.308659076690674, "learning_rate": 5.7977554269090475e-05, "loss": 2.3466522216796877, "memory(GiB)": 77.56, "step": 52400, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.438393 }, { "epoch": 2.245190865858361, "grad_norm": 5.5615715980529785, "learning_rate": 5.7970910634394594e-05, "loss": 2.4914134979248046, "memory(GiB)": 77.56, "step": 52405, "token_acc": 0.4866666666666667, "train_speed(iter/s)": 1.438418 }, { "epoch": 2.24540508118761, "grad_norm": 6.581422805786133, "learning_rate": 5.796426685529821e-05, "loss": 2.561817932128906, "memory(GiB)": 77.56, "step": 52410, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.438405 }, { "epoch": 2.2456192965168587, "grad_norm": 4.762694835662842, "learning_rate": 5.795762293192164e-05, "loss": 2.320370674133301, "memory(GiB)": 77.56, "step": 52415, "token_acc": 0.49038461538461536, "train_speed(iter/s)": 1.438419 }, { "epoch": 2.245833511846108, "grad_norm": 5.242889404296875, "learning_rate": 5.7950978864385286e-05, "loss": 2.396922302246094, "memory(GiB)": 77.56, "step": 52420, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.438438 }, { "epoch": 2.2460477271753567, "grad_norm": 4.6974968910217285, "learning_rate": 5.7944334652809485e-05, "loss": 1.9638542175292968, "memory(GiB)": 77.56, "step": 52425, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438455 }, { "epoch": 2.2462619425046055, "grad_norm": 5.275105953216553, "learning_rate": 5.7937690297314594e-05, "loss": 2.6029062271118164, "memory(GiB)": 77.56, "step": 52430, "token_acc": 0.4507042253521127, "train_speed(iter/s)": 1.438447 }, { "epoch": 2.246476157833855, "grad_norm": 8.600444793701172, "learning_rate": 5.793104579802102e-05, "loss": 2.369175338745117, "memory(GiB)": 77.56, "step": 52435, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.43843 }, { "epoch": 2.2466903731631036, "grad_norm": 4.77775239944458, "learning_rate": 5.79244011550491e-05, "loss": 2.4584651947021485, "memory(GiB)": 77.56, "step": 52440, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 1.438442 }, { "epoch": 2.2469045884923524, "grad_norm": 6.812375545501709, "learning_rate": 5.7917756368519217e-05, "loss": 2.665845489501953, "memory(GiB)": 77.56, "step": 52445, "token_acc": 0.45357142857142857, "train_speed(iter/s)": 1.438461 }, { "epoch": 2.2471188038216017, "grad_norm": 6.556591033935547, "learning_rate": 5.7911111438551754e-05, "loss": 2.3507244110107424, "memory(GiB)": 77.56, "step": 52450, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.438473 }, { "epoch": 2.2473330191508505, "grad_norm": 5.057736873626709, "learning_rate": 5.7904466365267097e-05, "loss": 2.29601936340332, "memory(GiB)": 77.56, "step": 52455, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.438488 }, { "epoch": 2.2475472344800993, "grad_norm": 6.139328479766846, "learning_rate": 5.789782114878559e-05, "loss": 2.318193244934082, "memory(GiB)": 77.56, "step": 52460, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.43849 }, { "epoch": 2.2477614498093486, "grad_norm": 5.3854289054870605, "learning_rate": 5.789117578922767e-05, "loss": 2.544055938720703, "memory(GiB)": 77.56, "step": 52465, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.438501 }, { "epoch": 2.2479756651385974, "grad_norm": 7.402945518493652, "learning_rate": 5.7884530286713687e-05, "loss": 2.385725212097168, "memory(GiB)": 77.56, "step": 52470, "token_acc": 0.45182724252491696, "train_speed(iter/s)": 1.438477 }, { "epoch": 2.248189880467846, "grad_norm": 5.561273097991943, "learning_rate": 5.787788464136403e-05, "loss": 2.3282379150390624, "memory(GiB)": 77.56, "step": 52475, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.438488 }, { "epoch": 2.2484040957970954, "grad_norm": 7.599419116973877, "learning_rate": 5.787123885329913e-05, "loss": 2.2904230117797852, "memory(GiB)": 77.56, "step": 52480, "token_acc": 0.4826254826254826, "train_speed(iter/s)": 1.43844 }, { "epoch": 2.2486183111263442, "grad_norm": 5.4341607093811035, "learning_rate": 5.786459292263934e-05, "loss": 2.273422431945801, "memory(GiB)": 77.56, "step": 52485, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.438443 }, { "epoch": 2.248832526455593, "grad_norm": 5.609556674957275, "learning_rate": 5.785794684950506e-05, "loss": 2.547252082824707, "memory(GiB)": 77.56, "step": 52490, "token_acc": 0.4797507788161994, "train_speed(iter/s)": 1.438445 }, { "epoch": 2.2490467417848423, "grad_norm": 4.623444557189941, "learning_rate": 5.7851300634016724e-05, "loss": 2.6107341766357424, "memory(GiB)": 77.56, "step": 52495, "token_acc": 0.438953488372093, "train_speed(iter/s)": 1.438411 }, { "epoch": 2.249260957114091, "grad_norm": 7.347949981689453, "learning_rate": 5.784465427629469e-05, "loss": 2.8286808013916014, "memory(GiB)": 77.56, "step": 52500, "token_acc": 0.43853820598006643, "train_speed(iter/s)": 1.438421 }, { "epoch": 2.249260957114091, "eval_loss": 2.180190324783325, "eval_runtime": 14.1531, "eval_samples_per_second": 7.066, "eval_steps_per_second": 7.066, "eval_token_acc": 0.4816326530612245, "step": 52500 }, { "epoch": 2.24947517244334, "grad_norm": 6.299923896789551, "learning_rate": 5.783800777645939e-05, "loss": 2.5754859924316404, "memory(GiB)": 77.56, "step": 52505, "token_acc": 0.47147147147147145, "train_speed(iter/s)": 1.437837 }, { "epoch": 2.249689387772589, "grad_norm": 4.94052791595459, "learning_rate": 5.783136113463125e-05, "loss": 2.3202953338623047, "memory(GiB)": 77.56, "step": 52510, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.437875 }, { "epoch": 2.249903603101838, "grad_norm": 6.667887210845947, "learning_rate": 5.782471435093063e-05, "loss": 2.8009078979492186, "memory(GiB)": 77.56, "step": 52515, "token_acc": 0.4199395770392749, "train_speed(iter/s)": 1.437895 }, { "epoch": 2.250117818431087, "grad_norm": 5.917612552642822, "learning_rate": 5.7818067425477976e-05, "loss": 2.840686798095703, "memory(GiB)": 77.56, "step": 52520, "token_acc": 0.43790849673202614, "train_speed(iter/s)": 1.437917 }, { "epoch": 2.250332033760336, "grad_norm": 6.140458106994629, "learning_rate": 5.781142035839371e-05, "loss": 2.2187252044677734, "memory(GiB)": 77.56, "step": 52525, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.437918 }, { "epoch": 2.250546249089585, "grad_norm": 4.763411045074463, "learning_rate": 5.7804773149798216e-05, "loss": 2.2728418350219726, "memory(GiB)": 77.56, "step": 52530, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.437943 }, { "epoch": 2.2507604644188337, "grad_norm": 5.218469619750977, "learning_rate": 5.7798125799811944e-05, "loss": 2.6449504852294923, "memory(GiB)": 77.56, "step": 52535, "token_acc": 0.45864661654135336, "train_speed(iter/s)": 1.437951 }, { "epoch": 2.250974679748083, "grad_norm": 9.126242637634277, "learning_rate": 5.77914783085553e-05, "loss": 2.4818254470825196, "memory(GiB)": 77.56, "step": 52540, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.437945 }, { "epoch": 2.2511888950773318, "grad_norm": 5.561603546142578, "learning_rate": 5.778483067614874e-05, "loss": 2.635569953918457, "memory(GiB)": 77.56, "step": 52545, "token_acc": 0.4554140127388535, "train_speed(iter/s)": 1.437942 }, { "epoch": 2.2514031104065806, "grad_norm": 4.865551948547363, "learning_rate": 5.7778182902712644e-05, "loss": 2.3304779052734377, "memory(GiB)": 77.56, "step": 52550, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.437922 }, { "epoch": 2.25161732573583, "grad_norm": 5.232170104980469, "learning_rate": 5.777153498836748e-05, "loss": 2.522089385986328, "memory(GiB)": 77.56, "step": 52555, "token_acc": 0.46218487394957986, "train_speed(iter/s)": 1.437931 }, { "epoch": 2.2518315410650787, "grad_norm": 13.78891372680664, "learning_rate": 5.776488693323366e-05, "loss": 2.43114013671875, "memory(GiB)": 77.56, "step": 52560, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.437941 }, { "epoch": 2.2520457563943275, "grad_norm": 8.460672378540039, "learning_rate": 5.775823873743165e-05, "loss": 2.300827217102051, "memory(GiB)": 77.56, "step": 52565, "token_acc": 0.5822222222222222, "train_speed(iter/s)": 1.437977 }, { "epoch": 2.2522599717235767, "grad_norm": 6.5112690925598145, "learning_rate": 5.775159040108185e-05, "loss": 2.1495765686035155, "memory(GiB)": 77.56, "step": 52570, "token_acc": 0.5033112582781457, "train_speed(iter/s)": 1.437979 }, { "epoch": 2.2524741870528255, "grad_norm": 4.030750274658203, "learning_rate": 5.7744941924304716e-05, "loss": 2.4456748962402344, "memory(GiB)": 77.56, "step": 52575, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.437965 }, { "epoch": 2.2526884023820744, "grad_norm": 5.141251087188721, "learning_rate": 5.77382933072207e-05, "loss": 2.512584686279297, "memory(GiB)": 77.56, "step": 52580, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.437971 }, { "epoch": 2.2529026177113236, "grad_norm": 4.528842449188232, "learning_rate": 5.773164454995026e-05, "loss": 2.2243797302246096, "memory(GiB)": 77.56, "step": 52585, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.437983 }, { "epoch": 2.2531168330405724, "grad_norm": 6.190828323364258, "learning_rate": 5.77249956526138e-05, "loss": 2.2826152801513673, "memory(GiB)": 77.56, "step": 52590, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.437955 }, { "epoch": 2.2533310483698212, "grad_norm": 4.875882625579834, "learning_rate": 5.7718346615331806e-05, "loss": 2.585957145690918, "memory(GiB)": 77.56, "step": 52595, "token_acc": 0.5249169435215947, "train_speed(iter/s)": 1.437971 }, { "epoch": 2.2535452636990705, "grad_norm": 6.09178352355957, "learning_rate": 5.771169743822473e-05, "loss": 2.415875244140625, "memory(GiB)": 77.56, "step": 52600, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.437938 }, { "epoch": 2.2537594790283193, "grad_norm": 6.055955410003662, "learning_rate": 5.770504812141301e-05, "loss": 2.4428234100341797, "memory(GiB)": 77.56, "step": 52605, "token_acc": 0.51985559566787, "train_speed(iter/s)": 1.437952 }, { "epoch": 2.253973694357568, "grad_norm": 4.312917232513428, "learning_rate": 5.7698398665017104e-05, "loss": 2.5624116897583007, "memory(GiB)": 77.56, "step": 52610, "token_acc": 0.49859943977591037, "train_speed(iter/s)": 1.437931 }, { "epoch": 2.2541879096868174, "grad_norm": 4.953600883483887, "learning_rate": 5.7691749069157505e-05, "loss": 2.6711801528930663, "memory(GiB)": 77.56, "step": 52615, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.437944 }, { "epoch": 2.254402125016066, "grad_norm": 9.856009483337402, "learning_rate": 5.768509933395465e-05, "loss": 2.1410646438598633, "memory(GiB)": 77.56, "step": 52620, "token_acc": 0.5209125475285171, "train_speed(iter/s)": 1.437929 }, { "epoch": 2.254616340345315, "grad_norm": 5.957549571990967, "learning_rate": 5.7678449459529015e-05, "loss": 2.4958564758300783, "memory(GiB)": 77.56, "step": 52625, "token_acc": 0.4254658385093168, "train_speed(iter/s)": 1.437948 }, { "epoch": 2.2548305556745643, "grad_norm": 6.903100490570068, "learning_rate": 5.7671799446001075e-05, "loss": 2.2911500930786133, "memory(GiB)": 77.56, "step": 52630, "token_acc": 0.49823321554770317, "train_speed(iter/s)": 1.437946 }, { "epoch": 2.255044771003813, "grad_norm": 5.000712871551514, "learning_rate": 5.766514929349126e-05, "loss": 2.5998674392700196, "memory(GiB)": 77.56, "step": 52635, "token_acc": 0.4555984555984556, "train_speed(iter/s)": 1.437909 }, { "epoch": 2.255258986333062, "grad_norm": 4.973024368286133, "learning_rate": 5.7658499002120104e-05, "loss": 2.53555908203125, "memory(GiB)": 77.56, "step": 52640, "token_acc": 0.49107142857142855, "train_speed(iter/s)": 1.437917 }, { "epoch": 2.255473201662311, "grad_norm": 4.722836494445801, "learning_rate": 5.765184857200804e-05, "loss": 2.5097539901733397, "memory(GiB)": 77.56, "step": 52645, "token_acc": 0.5107033639143731, "train_speed(iter/s)": 1.437951 }, { "epoch": 2.25568741699156, "grad_norm": 5.660754680633545, "learning_rate": 5.764519800327556e-05, "loss": 2.1958438873291017, "memory(GiB)": 77.56, "step": 52650, "token_acc": 0.5261437908496732, "train_speed(iter/s)": 1.437897 }, { "epoch": 2.2559016323208088, "grad_norm": 5.0157623291015625, "learning_rate": 5.7638547296043154e-05, "loss": 2.4194915771484373, "memory(GiB)": 77.56, "step": 52655, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.437857 }, { "epoch": 2.256115847650058, "grad_norm": 5.594944477081299, "learning_rate": 5.76318964504313e-05, "loss": 2.4303096771240233, "memory(GiB)": 77.56, "step": 52660, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 1.437877 }, { "epoch": 2.256330062979307, "grad_norm": 6.9311418533325195, "learning_rate": 5.7625245466560474e-05, "loss": 2.4899091720581055, "memory(GiB)": 77.56, "step": 52665, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.437886 }, { "epoch": 2.2565442783085556, "grad_norm": 7.694872856140137, "learning_rate": 5.761859434455118e-05, "loss": 2.1439022064208983, "memory(GiB)": 77.56, "step": 52670, "token_acc": 0.5387755102040817, "train_speed(iter/s)": 1.437927 }, { "epoch": 2.256758493637805, "grad_norm": 7.846240043640137, "learning_rate": 5.761194308452389e-05, "loss": 2.2495460510253906, "memory(GiB)": 77.56, "step": 52675, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.437941 }, { "epoch": 2.2569727089670537, "grad_norm": 4.7755255699157715, "learning_rate": 5.760529168659912e-05, "loss": 2.3682403564453125, "memory(GiB)": 77.56, "step": 52680, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.437945 }, { "epoch": 2.2571869242963025, "grad_norm": 4.929680347442627, "learning_rate": 5.759864015089735e-05, "loss": 2.440260887145996, "memory(GiB)": 77.56, "step": 52685, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.437963 }, { "epoch": 2.257401139625552, "grad_norm": 6.361323356628418, "learning_rate": 5.7591988477539104e-05, "loss": 2.2576662063598634, "memory(GiB)": 77.56, "step": 52690, "token_acc": 0.5150501672240803, "train_speed(iter/s)": 1.437998 }, { "epoch": 2.2576153549548006, "grad_norm": 6.733607769012451, "learning_rate": 5.758533666664485e-05, "loss": 2.329274559020996, "memory(GiB)": 77.56, "step": 52695, "token_acc": 0.5078864353312302, "train_speed(iter/s)": 1.438028 }, { "epoch": 2.2578295702840494, "grad_norm": 4.716760635375977, "learning_rate": 5.757868471833512e-05, "loss": 2.097896957397461, "memory(GiB)": 77.56, "step": 52700, "token_acc": 0.5415162454873647, "train_speed(iter/s)": 1.438059 }, { "epoch": 2.2580437856132987, "grad_norm": 5.216626167297363, "learning_rate": 5.757203263273039e-05, "loss": 2.395073890686035, "memory(GiB)": 77.56, "step": 52705, "token_acc": 0.4462025316455696, "train_speed(iter/s)": 1.438092 }, { "epoch": 2.2582580009425475, "grad_norm": 9.615227699279785, "learning_rate": 5.756538040995119e-05, "loss": 2.5581438064575197, "memory(GiB)": 77.56, "step": 52710, "token_acc": 0.48348348348348347, "train_speed(iter/s)": 1.438096 }, { "epoch": 2.2584722162717963, "grad_norm": 7.783900737762451, "learning_rate": 5.7558728050118036e-05, "loss": 2.490049362182617, "memory(GiB)": 77.56, "step": 52715, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.438101 }, { "epoch": 2.2586864316010455, "grad_norm": 4.691930770874023, "learning_rate": 5.755207555335142e-05, "loss": 2.5530912399291994, "memory(GiB)": 77.56, "step": 52720, "token_acc": 0.5070821529745042, "train_speed(iter/s)": 1.438071 }, { "epoch": 2.2589006469302944, "grad_norm": 5.784709453582764, "learning_rate": 5.7545422919771874e-05, "loss": 2.3213569641113283, "memory(GiB)": 77.56, "step": 52725, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.438079 }, { "epoch": 2.259114862259543, "grad_norm": 4.669852256774902, "learning_rate": 5.753877014949992e-05, "loss": 2.0753929138183596, "memory(GiB)": 77.56, "step": 52730, "token_acc": 0.5253164556962026, "train_speed(iter/s)": 1.4381 }, { "epoch": 2.2593290775887924, "grad_norm": 4.522634029388428, "learning_rate": 5.753211724265606e-05, "loss": 2.205304718017578, "memory(GiB)": 77.56, "step": 52735, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.43807 }, { "epoch": 2.2595432929180412, "grad_norm": 5.349833965301514, "learning_rate": 5.7525464199360844e-05, "loss": 2.4013298034667967, "memory(GiB)": 77.56, "step": 52740, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.438057 }, { "epoch": 2.25975750824729, "grad_norm": 5.460079669952393, "learning_rate": 5.751881101973479e-05, "loss": 2.4229297637939453, "memory(GiB)": 77.56, "step": 52745, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.438064 }, { "epoch": 2.2599717235765393, "grad_norm": 5.337129592895508, "learning_rate": 5.75121577038984e-05, "loss": 2.470130729675293, "memory(GiB)": 77.56, "step": 52750, "token_acc": 0.5037037037037037, "train_speed(iter/s)": 1.438047 }, { "epoch": 2.260185938905788, "grad_norm": 6.955590724945068, "learning_rate": 5.750550425197224e-05, "loss": 2.163749885559082, "memory(GiB)": 77.56, "step": 52755, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.438049 }, { "epoch": 2.260400154235037, "grad_norm": 4.7066216468811035, "learning_rate": 5.749885066407683e-05, "loss": 2.409752082824707, "memory(GiB)": 77.56, "step": 52760, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.43805 }, { "epoch": 2.260614369564286, "grad_norm": 5.495039463043213, "learning_rate": 5.74921969403327e-05, "loss": 2.5739295959472654, "memory(GiB)": 77.56, "step": 52765, "token_acc": 0.5, "train_speed(iter/s)": 1.438029 }, { "epoch": 2.260828584893535, "grad_norm": 5.087043762207031, "learning_rate": 5.74855430808604e-05, "loss": 2.327122688293457, "memory(GiB)": 77.56, "step": 52770, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438064 }, { "epoch": 2.261042800222784, "grad_norm": 5.553941249847412, "learning_rate": 5.7478889085780476e-05, "loss": 2.530124473571777, "memory(GiB)": 77.56, "step": 52775, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.438091 }, { "epoch": 2.261257015552033, "grad_norm": 6.690070629119873, "learning_rate": 5.7472234955213435e-05, "loss": 2.708115577697754, "memory(GiB)": 77.56, "step": 52780, "token_acc": 0.4368932038834951, "train_speed(iter/s)": 1.438096 }, { "epoch": 2.261471230881282, "grad_norm": 5.2283220291137695, "learning_rate": 5.7465580689279864e-05, "loss": 2.3802984237670897, "memory(GiB)": 77.56, "step": 52785, "token_acc": 0.47876447876447875, "train_speed(iter/s)": 1.438116 }, { "epoch": 2.2616854462105307, "grad_norm": 5.123905658721924, "learning_rate": 5.745892628810029e-05, "loss": 2.2786844253540037, "memory(GiB)": 77.56, "step": 52790, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 1.438114 }, { "epoch": 2.26189966153978, "grad_norm": 6.019182205200195, "learning_rate": 5.745227175179526e-05, "loss": 2.4592945098876955, "memory(GiB)": 77.56, "step": 52795, "token_acc": 0.4624277456647399, "train_speed(iter/s)": 1.438064 }, { "epoch": 2.2621138768690288, "grad_norm": 6.499453544616699, "learning_rate": 5.744561708048536e-05, "loss": 2.9474178314208985, "memory(GiB)": 77.56, "step": 52800, "token_acc": 0.4134275618374558, "train_speed(iter/s)": 1.438063 }, { "epoch": 2.2623280921982776, "grad_norm": 5.409016132354736, "learning_rate": 5.743896227429111e-05, "loss": 2.3522409439086913, "memory(GiB)": 77.56, "step": 52805, "token_acc": 0.5053475935828877, "train_speed(iter/s)": 1.438085 }, { "epoch": 2.262542307527527, "grad_norm": 4.505898475646973, "learning_rate": 5.743230733333307e-05, "loss": 2.6855234146118163, "memory(GiB)": 77.56, "step": 52810, "token_acc": 0.4375, "train_speed(iter/s)": 1.438077 }, { "epoch": 2.2627565228567756, "grad_norm": 5.21828031539917, "learning_rate": 5.7425652257731834e-05, "loss": 2.4964494705200195, "memory(GiB)": 77.56, "step": 52815, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.438112 }, { "epoch": 2.2629707381860245, "grad_norm": 6.4684319496154785, "learning_rate": 5.741899704760791e-05, "loss": 2.5311044692993163, "memory(GiB)": 77.56, "step": 52820, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.43813 }, { "epoch": 2.2631849535152737, "grad_norm": 4.992574691772461, "learning_rate": 5.741234170308193e-05, "loss": 2.5661808013916017, "memory(GiB)": 77.56, "step": 52825, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 1.438159 }, { "epoch": 2.2633991688445225, "grad_norm": 6.602748870849609, "learning_rate": 5.74056862242744e-05, "loss": 2.5633052825927733, "memory(GiB)": 77.56, "step": 52830, "token_acc": 0.5097402597402597, "train_speed(iter/s)": 1.438152 }, { "epoch": 2.2636133841737713, "grad_norm": 7.113536357879639, "learning_rate": 5.7399030611305913e-05, "loss": 2.1402908325195313, "memory(GiB)": 77.56, "step": 52835, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.438172 }, { "epoch": 2.2638275995030206, "grad_norm": 6.32389497756958, "learning_rate": 5.739237486429707e-05, "loss": 2.4931793212890625, "memory(GiB)": 77.56, "step": 52840, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.438167 }, { "epoch": 2.2640418148322694, "grad_norm": 5.483723163604736, "learning_rate": 5.738571898336842e-05, "loss": 2.538656997680664, "memory(GiB)": 77.56, "step": 52845, "token_acc": 0.4744744744744745, "train_speed(iter/s)": 1.438162 }, { "epoch": 2.264256030161518, "grad_norm": 5.115888595581055, "learning_rate": 5.737906296864053e-05, "loss": 2.2663623809814455, "memory(GiB)": 77.56, "step": 52850, "token_acc": 0.5634920634920635, "train_speed(iter/s)": 1.438138 }, { "epoch": 2.2644702454907675, "grad_norm": 5.4134297370910645, "learning_rate": 5.737240682023399e-05, "loss": 2.4715688705444334, "memory(GiB)": 77.56, "step": 52855, "token_acc": 0.5103448275862069, "train_speed(iter/s)": 1.43816 }, { "epoch": 2.2646844608200163, "grad_norm": 4.665398120880127, "learning_rate": 5.73657505382694e-05, "loss": 2.578397750854492, "memory(GiB)": 77.56, "step": 52860, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.438154 }, { "epoch": 2.264898676149265, "grad_norm": 5.527158737182617, "learning_rate": 5.735909412286731e-05, "loss": 2.3311594009399412, "memory(GiB)": 77.56, "step": 52865, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.43814 }, { "epoch": 2.2651128914785144, "grad_norm": 4.6213698387146, "learning_rate": 5.735243757414833e-05, "loss": 2.382043647766113, "memory(GiB)": 77.56, "step": 52870, "token_acc": 0.4391891891891892, "train_speed(iter/s)": 1.438129 }, { "epoch": 2.265327106807763, "grad_norm": 4.389588832855225, "learning_rate": 5.734578089223306e-05, "loss": 2.744268608093262, "memory(GiB)": 77.56, "step": 52875, "token_acc": 0.4262734584450402, "train_speed(iter/s)": 1.438129 }, { "epoch": 2.265541322137012, "grad_norm": 5.95206356048584, "learning_rate": 5.7339124077242066e-05, "loss": 2.5268640518188477, "memory(GiB)": 77.56, "step": 52880, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.438124 }, { "epoch": 2.2657555374662612, "grad_norm": 5.782310962677002, "learning_rate": 5.7332467129295964e-05, "loss": 2.2619747161865233, "memory(GiB)": 77.56, "step": 52885, "token_acc": 0.5280898876404494, "train_speed(iter/s)": 1.438146 }, { "epoch": 2.26596975279551, "grad_norm": 5.011475086212158, "learning_rate": 5.732581004851534e-05, "loss": 2.434720993041992, "memory(GiB)": 77.56, "step": 52890, "token_acc": 0.45849802371541504, "train_speed(iter/s)": 1.438157 }, { "epoch": 2.266183968124759, "grad_norm": 4.993654251098633, "learning_rate": 5.731915283502079e-05, "loss": 2.340412139892578, "memory(GiB)": 77.56, "step": 52895, "token_acc": 0.47388059701492535, "train_speed(iter/s)": 1.438186 }, { "epoch": 2.266398183454008, "grad_norm": 4.088520050048828, "learning_rate": 5.731249548893291e-05, "loss": 2.396742057800293, "memory(GiB)": 77.56, "step": 52900, "token_acc": 0.5222929936305732, "train_speed(iter/s)": 1.438204 }, { "epoch": 2.266612398783257, "grad_norm": 4.804503917694092, "learning_rate": 5.730583801037234e-05, "loss": 2.3832279205322267, "memory(GiB)": 77.56, "step": 52905, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.438205 }, { "epoch": 2.2668266141125057, "grad_norm": 4.973869800567627, "learning_rate": 5.729918039945963e-05, "loss": 2.6502588272094725, "memory(GiB)": 77.56, "step": 52910, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.438232 }, { "epoch": 2.267040829441755, "grad_norm": 5.373922824859619, "learning_rate": 5.729252265631545e-05, "loss": 2.3283412933349608, "memory(GiB)": 77.56, "step": 52915, "token_acc": 0.5017064846416383, "train_speed(iter/s)": 1.43823 }, { "epoch": 2.267255044771004, "grad_norm": 5.5015997886657715, "learning_rate": 5.728586478106037e-05, "loss": 2.685133934020996, "memory(GiB)": 77.56, "step": 52920, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.438252 }, { "epoch": 2.2674692601002526, "grad_norm": 7.28200626373291, "learning_rate": 5.727920677381501e-05, "loss": 2.42260684967041, "memory(GiB)": 77.56, "step": 52925, "token_acc": 0.5150375939849624, "train_speed(iter/s)": 1.438278 }, { "epoch": 2.267683475429502, "grad_norm": 6.621476650238037, "learning_rate": 5.72725486347e-05, "loss": 2.368482208251953, "memory(GiB)": 77.56, "step": 52930, "token_acc": 0.48606811145510836, "train_speed(iter/s)": 1.438279 }, { "epoch": 2.2678976907587507, "grad_norm": 5.441981792449951, "learning_rate": 5.726589036383594e-05, "loss": 2.7897417068481447, "memory(GiB)": 77.56, "step": 52935, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.438277 }, { "epoch": 2.2681119060879995, "grad_norm": 6.121260643005371, "learning_rate": 5.725923196134345e-05, "loss": 2.4538955688476562, "memory(GiB)": 77.56, "step": 52940, "token_acc": 0.46757679180887374, "train_speed(iter/s)": 1.438291 }, { "epoch": 2.2683261214172488, "grad_norm": 5.303750991821289, "learning_rate": 5.725257342734318e-05, "loss": 2.3754655838012697, "memory(GiB)": 77.56, "step": 52945, "token_acc": 0.4725274725274725, "train_speed(iter/s)": 1.438289 }, { "epoch": 2.2685403367464976, "grad_norm": 5.178130626678467, "learning_rate": 5.7245914761955744e-05, "loss": 2.293476867675781, "memory(GiB)": 77.56, "step": 52950, "token_acc": 0.5098684210526315, "train_speed(iter/s)": 1.43832 }, { "epoch": 2.2687545520757464, "grad_norm": 4.849772930145264, "learning_rate": 5.7239255965301755e-05, "loss": 2.196510124206543, "memory(GiB)": 77.56, "step": 52955, "token_acc": 0.5622317596566524, "train_speed(iter/s)": 1.438353 }, { "epoch": 2.2689687674049956, "grad_norm": 7.176445960998535, "learning_rate": 5.723259703750186e-05, "loss": 2.5891637802124023, "memory(GiB)": 77.56, "step": 52960, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.438364 }, { "epoch": 2.2691829827342445, "grad_norm": 5.645754814147949, "learning_rate": 5.722593797867667e-05, "loss": 2.2313116073608397, "memory(GiB)": 77.56, "step": 52965, "token_acc": 0.5724381625441696, "train_speed(iter/s)": 1.438373 }, { "epoch": 2.2693971980634933, "grad_norm": 4.929851531982422, "learning_rate": 5.721927878894685e-05, "loss": 2.5327665328979494, "memory(GiB)": 77.56, "step": 52970, "token_acc": 0.4820846905537459, "train_speed(iter/s)": 1.438423 }, { "epoch": 2.2696114133927425, "grad_norm": 4.901386737823486, "learning_rate": 5.721261946843302e-05, "loss": 2.5152469635009767, "memory(GiB)": 77.56, "step": 52975, "token_acc": 0.45426829268292684, "train_speed(iter/s)": 1.438445 }, { "epoch": 2.2698256287219913, "grad_norm": 8.600111961364746, "learning_rate": 5.72059600172558e-05, "loss": 2.6597412109375, "memory(GiB)": 77.56, "step": 52980, "token_acc": 0.5074183976261127, "train_speed(iter/s)": 1.438463 }, { "epoch": 2.27003984405124, "grad_norm": 4.8522419929504395, "learning_rate": 5.7199300435535884e-05, "loss": 2.4320371627807615, "memory(GiB)": 77.56, "step": 52985, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.438494 }, { "epoch": 2.2702540593804894, "grad_norm": 5.510733127593994, "learning_rate": 5.7192640723393874e-05, "loss": 2.4039257049560545, "memory(GiB)": 77.56, "step": 52990, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.438528 }, { "epoch": 2.270468274709738, "grad_norm": 3.5769388675689697, "learning_rate": 5.7185980880950426e-05, "loss": 2.498982238769531, "memory(GiB)": 77.56, "step": 52995, "token_acc": 0.4581151832460733, "train_speed(iter/s)": 1.438508 }, { "epoch": 2.270682490038987, "grad_norm": 7.290960311889648, "learning_rate": 5.717932090832621e-05, "loss": 2.399822807312012, "memory(GiB)": 77.56, "step": 53000, "token_acc": 0.5063694267515924, "train_speed(iter/s)": 1.438521 }, { "epoch": 2.270682490038987, "eval_loss": 2.3298373222351074, "eval_runtime": 14.6176, "eval_samples_per_second": 6.841, "eval_steps_per_second": 6.841, "eval_token_acc": 0.45588235294117646, "step": 53000 }, { "epoch": 2.2708967053682363, "grad_norm": 5.588569641113281, "learning_rate": 5.7172660805641855e-05, "loss": 2.29166316986084, "memory(GiB)": 77.56, "step": 53005, "token_acc": 0.47610294117647056, "train_speed(iter/s)": 1.437919 }, { "epoch": 2.271110920697485, "grad_norm": 5.524135112762451, "learning_rate": 5.716600057301802e-05, "loss": 2.526597023010254, "memory(GiB)": 77.56, "step": 53010, "token_acc": 0.4460431654676259, "train_speed(iter/s)": 1.437924 }, { "epoch": 2.271325136026734, "grad_norm": 6.729628562927246, "learning_rate": 5.7159340210575355e-05, "loss": 2.395654296875, "memory(GiB)": 77.56, "step": 53015, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.437938 }, { "epoch": 2.271539351355983, "grad_norm": 7.5427422523498535, "learning_rate": 5.715267971843453e-05, "loss": 2.386401557922363, "memory(GiB)": 77.56, "step": 53020, "token_acc": 0.4511784511784512, "train_speed(iter/s)": 1.437968 }, { "epoch": 2.271753566685232, "grad_norm": 5.130465984344482, "learning_rate": 5.71460190967162e-05, "loss": 2.367441177368164, "memory(GiB)": 77.56, "step": 53025, "token_acc": 0.5056179775280899, "train_speed(iter/s)": 1.437992 }, { "epoch": 2.271967782014481, "grad_norm": 4.470834732055664, "learning_rate": 5.713935834554104e-05, "loss": 2.4068119049072267, "memory(GiB)": 77.56, "step": 53030, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.437991 }, { "epoch": 2.27218199734373, "grad_norm": 7.210778713226318, "learning_rate": 5.713269746502971e-05, "loss": 2.5451162338256834, "memory(GiB)": 77.56, "step": 53035, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.43801 }, { "epoch": 2.272396212672979, "grad_norm": 5.885176181793213, "learning_rate": 5.7126036455302855e-05, "loss": 2.252902030944824, "memory(GiB)": 77.56, "step": 53040, "token_acc": 0.5129032258064516, "train_speed(iter/s)": 1.43803 }, { "epoch": 2.2726104280022277, "grad_norm": 6.645816326141357, "learning_rate": 5.7119375316481175e-05, "loss": 2.2691341400146485, "memory(GiB)": 77.56, "step": 53045, "token_acc": 0.52734375, "train_speed(iter/s)": 1.438077 }, { "epoch": 2.272824643331477, "grad_norm": 4.853824138641357, "learning_rate": 5.7112714048685324e-05, "loss": 2.2286312103271486, "memory(GiB)": 77.56, "step": 53050, "token_acc": 0.5241157556270096, "train_speed(iter/s)": 1.438072 }, { "epoch": 2.2730388586607257, "grad_norm": 6.165210247039795, "learning_rate": 5.710605265203599e-05, "loss": 2.574686813354492, "memory(GiB)": 77.56, "step": 53055, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.43808 }, { "epoch": 2.2732530739899746, "grad_norm": 6.995044231414795, "learning_rate": 5.7099391126653855e-05, "loss": 2.667811965942383, "memory(GiB)": 77.56, "step": 53060, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.438094 }, { "epoch": 2.273467289319224, "grad_norm": 5.122948169708252, "learning_rate": 5.709272947265959e-05, "loss": 2.2270503997802735, "memory(GiB)": 77.56, "step": 53065, "token_acc": 0.5876288659793815, "train_speed(iter/s)": 1.438112 }, { "epoch": 2.2736815046484726, "grad_norm": 6.087159156799316, "learning_rate": 5.708606769017386e-05, "loss": 2.7900115966796877, "memory(GiB)": 77.56, "step": 53070, "token_acc": 0.4380664652567976, "train_speed(iter/s)": 1.438079 }, { "epoch": 2.2738957199777214, "grad_norm": 5.669824600219727, "learning_rate": 5.707940577931739e-05, "loss": 2.3901580810546874, "memory(GiB)": 77.56, "step": 53075, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.438068 }, { "epoch": 2.2741099353069707, "grad_norm": 5.867022514343262, "learning_rate": 5.707274374021082e-05, "loss": 2.693343925476074, "memory(GiB)": 77.56, "step": 53080, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.438082 }, { "epoch": 2.2743241506362195, "grad_norm": 4.98362922668457, "learning_rate": 5.706608157297488e-05, "loss": 2.6642425537109373, "memory(GiB)": 77.56, "step": 53085, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.438117 }, { "epoch": 2.2745383659654683, "grad_norm": 5.155498027801514, "learning_rate": 5.705941927773023e-05, "loss": 2.536794662475586, "memory(GiB)": 77.56, "step": 53090, "token_acc": 0.51171875, "train_speed(iter/s)": 1.438148 }, { "epoch": 2.2747525812947176, "grad_norm": 6.072821140289307, "learning_rate": 5.70527568545976e-05, "loss": 2.4132221221923826, "memory(GiB)": 77.56, "step": 53095, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.438142 }, { "epoch": 2.2749667966239664, "grad_norm": 5.548638820648193, "learning_rate": 5.704609430369764e-05, "loss": 2.2902013778686525, "memory(GiB)": 77.56, "step": 53100, "token_acc": 0.5338645418326693, "train_speed(iter/s)": 1.438136 }, { "epoch": 2.275181011953215, "grad_norm": 4.359196186065674, "learning_rate": 5.703943162515109e-05, "loss": 2.485875129699707, "memory(GiB)": 77.56, "step": 53105, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.438167 }, { "epoch": 2.2753952272824645, "grad_norm": 5.7630157470703125, "learning_rate": 5.703276881907861e-05, "loss": 2.366191101074219, "memory(GiB)": 77.56, "step": 53110, "token_acc": 0.523972602739726, "train_speed(iter/s)": 1.438167 }, { "epoch": 2.2756094426117133, "grad_norm": 5.62446403503418, "learning_rate": 5.7026105885600946e-05, "loss": 2.3020889282226564, "memory(GiB)": 77.56, "step": 53115, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.438175 }, { "epoch": 2.275823657940962, "grad_norm": 6.563262462615967, "learning_rate": 5.701944282483877e-05, "loss": 2.3872055053710937, "memory(GiB)": 77.56, "step": 53120, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 1.438187 }, { "epoch": 2.2760378732702113, "grad_norm": 5.30178165435791, "learning_rate": 5.7012779636912804e-05, "loss": 2.105153465270996, "memory(GiB)": 77.56, "step": 53125, "token_acc": 0.5285171102661597, "train_speed(iter/s)": 1.438217 }, { "epoch": 2.27625208859946, "grad_norm": 5.890365123748779, "learning_rate": 5.700611632194377e-05, "loss": 2.4600345611572267, "memory(GiB)": 77.56, "step": 53130, "token_acc": 0.45348837209302323, "train_speed(iter/s)": 1.438225 }, { "epoch": 2.276466303928709, "grad_norm": 4.762717247009277, "learning_rate": 5.699945288005235e-05, "loss": 2.375034713745117, "memory(GiB)": 77.56, "step": 53135, "token_acc": 0.44336569579288027, "train_speed(iter/s)": 1.438251 }, { "epoch": 2.2766805192579582, "grad_norm": 6.028038024902344, "learning_rate": 5.699278931135929e-05, "loss": 2.4622344970703125, "memory(GiB)": 77.56, "step": 53140, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.438238 }, { "epoch": 2.276894734587207, "grad_norm": 8.230001449584961, "learning_rate": 5.6986125615985285e-05, "loss": 2.2831315994262695, "memory(GiB)": 77.56, "step": 53145, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.438251 }, { "epoch": 2.277108949916456, "grad_norm": 6.929459095001221, "learning_rate": 5.6979461794051056e-05, "loss": 2.4925300598144533, "memory(GiB)": 77.56, "step": 53150, "token_acc": 0.445993031358885, "train_speed(iter/s)": 1.438234 }, { "epoch": 2.277323165245705, "grad_norm": 5.7662224769592285, "learning_rate": 5.6972797845677325e-05, "loss": 2.0039365768432615, "memory(GiB)": 77.56, "step": 53155, "token_acc": 0.5354330708661418, "train_speed(iter/s)": 1.438252 }, { "epoch": 2.277537380574954, "grad_norm": 4.668109893798828, "learning_rate": 5.696613377098484e-05, "loss": 2.0935928344726564, "memory(GiB)": 77.56, "step": 53160, "token_acc": 0.549645390070922, "train_speed(iter/s)": 1.438257 }, { "epoch": 2.2777515959042027, "grad_norm": 8.795716285705566, "learning_rate": 5.695946957009429e-05, "loss": 2.2457677841186525, "memory(GiB)": 77.56, "step": 53165, "token_acc": 0.4871060171919771, "train_speed(iter/s)": 1.438251 }, { "epoch": 2.277965811233452, "grad_norm": 6.286344528198242, "learning_rate": 5.6952805243126416e-05, "loss": 2.5730365753173827, "memory(GiB)": 77.56, "step": 53170, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.438264 }, { "epoch": 2.278180026562701, "grad_norm": 6.191279411315918, "learning_rate": 5.694614079020197e-05, "loss": 2.0139749526977537, "memory(GiB)": 77.56, "step": 53175, "token_acc": 0.5425531914893617, "train_speed(iter/s)": 1.438263 }, { "epoch": 2.2783942418919496, "grad_norm": 7.085164546966553, "learning_rate": 5.6939476211441664e-05, "loss": 2.4792854309082033, "memory(GiB)": 77.56, "step": 53180, "token_acc": 0.5158730158730159, "train_speed(iter/s)": 1.438251 }, { "epoch": 2.278608457221199, "grad_norm": 8.756878852844238, "learning_rate": 5.6932811506966236e-05, "loss": 2.576771545410156, "memory(GiB)": 77.56, "step": 53185, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.438256 }, { "epoch": 2.2788226725504477, "grad_norm": 4.465390682220459, "learning_rate": 5.692614667689642e-05, "loss": 2.4743396759033205, "memory(GiB)": 77.56, "step": 53190, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.438251 }, { "epoch": 2.2790368878796965, "grad_norm": 5.911646366119385, "learning_rate": 5.6919481721352954e-05, "loss": 2.3559106826782226, "memory(GiB)": 77.56, "step": 53195, "token_acc": 0.5, "train_speed(iter/s)": 1.438262 }, { "epoch": 2.2792511032089458, "grad_norm": 4.425171375274658, "learning_rate": 5.691281664045659e-05, "loss": 2.4223958969116213, "memory(GiB)": 77.56, "step": 53200, "token_acc": 0.49523809523809526, "train_speed(iter/s)": 1.438277 }, { "epoch": 2.2794653185381946, "grad_norm": 5.114206790924072, "learning_rate": 5.690615143432807e-05, "loss": 2.3967639923095705, "memory(GiB)": 77.56, "step": 53205, "token_acc": 0.508695652173913, "train_speed(iter/s)": 1.438255 }, { "epoch": 2.2796795338674434, "grad_norm": 6.245949745178223, "learning_rate": 5.689948610308815e-05, "loss": 2.6908233642578123, "memory(GiB)": 77.56, "step": 53210, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.43829 }, { "epoch": 2.2798937491966926, "grad_norm": 5.675736904144287, "learning_rate": 5.689282064685755e-05, "loss": 2.213981819152832, "memory(GiB)": 77.56, "step": 53215, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.43829 }, { "epoch": 2.2801079645259414, "grad_norm": 6.717597961425781, "learning_rate": 5.688615506575704e-05, "loss": 2.7104873657226562, "memory(GiB)": 77.56, "step": 53220, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.438329 }, { "epoch": 2.2803221798551903, "grad_norm": 5.387569427490234, "learning_rate": 5.687948935990737e-05, "loss": 2.4388608932495117, "memory(GiB)": 77.56, "step": 53225, "token_acc": 0.49538461538461537, "train_speed(iter/s)": 1.438356 }, { "epoch": 2.2805363951844395, "grad_norm": 6.406753063201904, "learning_rate": 5.687282352942931e-05, "loss": 2.838492012023926, "memory(GiB)": 77.56, "step": 53230, "token_acc": 0.4524590163934426, "train_speed(iter/s)": 1.43837 }, { "epoch": 2.2807506105136883, "grad_norm": 5.576780319213867, "learning_rate": 5.686615757444359e-05, "loss": 2.5071510314941405, "memory(GiB)": 77.56, "step": 53235, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.438328 }, { "epoch": 2.280964825842937, "grad_norm": 6.09677267074585, "learning_rate": 5.685949149507099e-05, "loss": 2.8217147827148437, "memory(GiB)": 77.56, "step": 53240, "token_acc": 0.44715447154471544, "train_speed(iter/s)": 1.438306 }, { "epoch": 2.2811790411721864, "grad_norm": 6.976754188537598, "learning_rate": 5.6852825291432264e-05, "loss": 2.2277990341186524, "memory(GiB)": 77.56, "step": 53245, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.438325 }, { "epoch": 2.281393256501435, "grad_norm": 5.044142246246338, "learning_rate": 5.684615896364819e-05, "loss": 2.4598684310913086, "memory(GiB)": 77.56, "step": 53250, "token_acc": 0.5182724252491694, "train_speed(iter/s)": 1.438303 }, { "epoch": 2.281607471830684, "grad_norm": 6.840868949890137, "learning_rate": 5.683949251183951e-05, "loss": 2.270075225830078, "memory(GiB)": 77.56, "step": 53255, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.438319 }, { "epoch": 2.2818216871599333, "grad_norm": 6.621948719024658, "learning_rate": 5.683282593612702e-05, "loss": 2.5731340408325196, "memory(GiB)": 77.56, "step": 53260, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.438351 }, { "epoch": 2.282035902489182, "grad_norm": 5.110633373260498, "learning_rate": 5.682615923663147e-05, "loss": 2.6413650512695312, "memory(GiB)": 77.56, "step": 53265, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.438365 }, { "epoch": 2.282250117818431, "grad_norm": 5.838475227355957, "learning_rate": 5.681949241347364e-05, "loss": 2.316972351074219, "memory(GiB)": 77.56, "step": 53270, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.438351 }, { "epoch": 2.28246433314768, "grad_norm": 4.8606414794921875, "learning_rate": 5.681282546677431e-05, "loss": 2.6067874908447264, "memory(GiB)": 77.56, "step": 53275, "token_acc": 0.4520123839009288, "train_speed(iter/s)": 1.438325 }, { "epoch": 2.282678548476929, "grad_norm": 6.489525318145752, "learning_rate": 5.6806158396654264e-05, "loss": 2.2418067932128904, "memory(GiB)": 77.56, "step": 53280, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.438362 }, { "epoch": 2.282892763806178, "grad_norm": 5.963095664978027, "learning_rate": 5.679949120323426e-05, "loss": 2.815349006652832, "memory(GiB)": 77.56, "step": 53285, "token_acc": 0.44144144144144143, "train_speed(iter/s)": 1.438403 }, { "epoch": 2.283106979135427, "grad_norm": 4.6756205558776855, "learning_rate": 5.6792823886635104e-05, "loss": 2.2031063079833983, "memory(GiB)": 77.56, "step": 53290, "token_acc": 0.5220338983050847, "train_speed(iter/s)": 1.438429 }, { "epoch": 2.283321194464676, "grad_norm": 5.816607475280762, "learning_rate": 5.678615644697758e-05, "loss": 2.2711997985839845, "memory(GiB)": 77.56, "step": 53295, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.438452 }, { "epoch": 2.2835354097939247, "grad_norm": 6.056549072265625, "learning_rate": 5.6779488884382446e-05, "loss": 2.533121109008789, "memory(GiB)": 77.56, "step": 53300, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.438469 }, { "epoch": 2.283749625123174, "grad_norm": 5.472777366638184, "learning_rate": 5.677282119897053e-05, "loss": 2.2757379531860353, "memory(GiB)": 77.56, "step": 53305, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.438497 }, { "epoch": 2.2839638404524227, "grad_norm": 7.357724189758301, "learning_rate": 5.6766153390862585e-05, "loss": 2.488249588012695, "memory(GiB)": 77.56, "step": 53310, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.438483 }, { "epoch": 2.2841780557816715, "grad_norm": 5.0392045974731445, "learning_rate": 5.675948546017943e-05, "loss": 2.2134862899780274, "memory(GiB)": 77.56, "step": 53315, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 1.438481 }, { "epoch": 2.284392271110921, "grad_norm": 4.15414571762085, "learning_rate": 5.6752817407041855e-05, "loss": 2.2173389434814452, "memory(GiB)": 77.56, "step": 53320, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.438507 }, { "epoch": 2.2846064864401696, "grad_norm": 5.964130401611328, "learning_rate": 5.674614923157068e-05, "loss": 2.273541259765625, "memory(GiB)": 77.56, "step": 53325, "token_acc": 0.5175097276264592, "train_speed(iter/s)": 1.438465 }, { "epoch": 2.2848207017694184, "grad_norm": 5.023801803588867, "learning_rate": 5.6739480933886655e-05, "loss": 2.452076721191406, "memory(GiB)": 77.56, "step": 53330, "token_acc": 0.4748427672955975, "train_speed(iter/s)": 1.438479 }, { "epoch": 2.2850349170986677, "grad_norm": 4.366186618804932, "learning_rate": 5.6732812514110624e-05, "loss": 2.5886425018310546, "memory(GiB)": 77.56, "step": 53335, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.438489 }, { "epoch": 2.2852491324279165, "grad_norm": 4.92492151260376, "learning_rate": 5.672614397236337e-05, "loss": 2.34570426940918, "memory(GiB)": 77.56, "step": 53340, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.438499 }, { "epoch": 2.2854633477571653, "grad_norm": 5.4748125076293945, "learning_rate": 5.671947530876571e-05, "loss": 2.4210758209228516, "memory(GiB)": 77.56, "step": 53345, "token_acc": 0.477124183006536, "train_speed(iter/s)": 1.438506 }, { "epoch": 2.2856775630864146, "grad_norm": 7.903741359710693, "learning_rate": 5.671280652343846e-05, "loss": 2.6645380020141602, "memory(GiB)": 77.56, "step": 53350, "token_acc": 0.45703125, "train_speed(iter/s)": 1.438504 }, { "epoch": 2.2858917784156634, "grad_norm": 5.288139820098877, "learning_rate": 5.670613761650242e-05, "loss": 2.6764604568481447, "memory(GiB)": 77.56, "step": 53355, "token_acc": 0.46285714285714286, "train_speed(iter/s)": 1.438525 }, { "epoch": 2.286105993744912, "grad_norm": 5.179421901702881, "learning_rate": 5.6699468588078394e-05, "loss": 2.383032035827637, "memory(GiB)": 77.56, "step": 53360, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438487 }, { "epoch": 2.2863202090741614, "grad_norm": 5.836581230163574, "learning_rate": 5.669279943828723e-05, "loss": 2.3547563552856445, "memory(GiB)": 77.56, "step": 53365, "token_acc": 0.5016611295681063, "train_speed(iter/s)": 1.438511 }, { "epoch": 2.2865344244034103, "grad_norm": 4.6228227615356445, "learning_rate": 5.6686130167249706e-05, "loss": 2.406845474243164, "memory(GiB)": 77.56, "step": 53370, "token_acc": 0.44368600682593856, "train_speed(iter/s)": 1.438545 }, { "epoch": 2.286748639732659, "grad_norm": 5.463676929473877, "learning_rate": 5.6679460775086676e-05, "loss": 2.379409980773926, "memory(GiB)": 77.56, "step": 53375, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.438565 }, { "epoch": 2.2869628550619083, "grad_norm": 5.259365558624268, "learning_rate": 5.667279126191896e-05, "loss": 2.4757621765136717, "memory(GiB)": 77.56, "step": 53380, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 1.438585 }, { "epoch": 2.287177070391157, "grad_norm": 4.919700622558594, "learning_rate": 5.666612162786734e-05, "loss": 2.513957214355469, "memory(GiB)": 77.56, "step": 53385, "token_acc": 0.49372384937238495, "train_speed(iter/s)": 1.438641 }, { "epoch": 2.287391285720406, "grad_norm": 5.996565341949463, "learning_rate": 5.665945187305268e-05, "loss": 2.4554615020751953, "memory(GiB)": 77.56, "step": 53390, "token_acc": 0.4847457627118644, "train_speed(iter/s)": 1.438634 }, { "epoch": 2.287605501049655, "grad_norm": 6.9908061027526855, "learning_rate": 5.6652781997595814e-05, "loss": 2.5153329849243162, "memory(GiB)": 77.56, "step": 53395, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438654 }, { "epoch": 2.287819716378904, "grad_norm": 4.970003604888916, "learning_rate": 5.6646112001617554e-05, "loss": 2.6753328323364256, "memory(GiB)": 77.56, "step": 53400, "token_acc": 0.4585635359116022, "train_speed(iter/s)": 1.438685 }, { "epoch": 2.288033931708153, "grad_norm": 6.306012153625488, "learning_rate": 5.663944188523875e-05, "loss": 2.233827590942383, "memory(GiB)": 77.56, "step": 53405, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.438693 }, { "epoch": 2.288248147037402, "grad_norm": 4.609481334686279, "learning_rate": 5.663277164858023e-05, "loss": 2.8375007629394533, "memory(GiB)": 77.56, "step": 53410, "token_acc": 0.4478114478114478, "train_speed(iter/s)": 1.438714 }, { "epoch": 2.288462362366651, "grad_norm": 5.747157096862793, "learning_rate": 5.662610129176281e-05, "loss": 2.1186235427856444, "memory(GiB)": 77.56, "step": 53415, "token_acc": 0.5433962264150943, "train_speed(iter/s)": 1.438695 }, { "epoch": 2.2886765776958997, "grad_norm": 6.112963676452637, "learning_rate": 5.661943081490736e-05, "loss": 2.583347511291504, "memory(GiB)": 77.56, "step": 53420, "token_acc": 0.47440273037542663, "train_speed(iter/s)": 1.43868 }, { "epoch": 2.288890793025149, "grad_norm": 6.028528213500977, "learning_rate": 5.661276021813472e-05, "loss": 2.379299354553223, "memory(GiB)": 77.56, "step": 53425, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.438716 }, { "epoch": 2.289105008354398, "grad_norm": 6.210702896118164, "learning_rate": 5.6606089501565704e-05, "loss": 2.4029861450195313, "memory(GiB)": 77.56, "step": 53430, "token_acc": 0.5033112582781457, "train_speed(iter/s)": 1.438736 }, { "epoch": 2.2893192236836466, "grad_norm": 6.106517791748047, "learning_rate": 5.65994186653212e-05, "loss": 2.525979423522949, "memory(GiB)": 77.56, "step": 53435, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.438762 }, { "epoch": 2.289533439012896, "grad_norm": 7.56550407409668, "learning_rate": 5.659274770952204e-05, "loss": 2.4921316146850585, "memory(GiB)": 77.56, "step": 53440, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.438775 }, { "epoch": 2.2897476543421447, "grad_norm": 4.833075523376465, "learning_rate": 5.658607663428905e-05, "loss": 2.3022693634033202, "memory(GiB)": 77.56, "step": 53445, "token_acc": 0.551829268292683, "train_speed(iter/s)": 1.438787 }, { "epoch": 2.2899618696713935, "grad_norm": 5.692952632904053, "learning_rate": 5.6579405439743114e-05, "loss": 2.270130157470703, "memory(GiB)": 77.56, "step": 53450, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.438816 }, { "epoch": 2.2901760850006427, "grad_norm": 7.176535606384277, "learning_rate": 5.6572734126005076e-05, "loss": 2.3104724884033203, "memory(GiB)": 77.56, "step": 53455, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.438808 }, { "epoch": 2.2903903003298915, "grad_norm": 3.950342893600464, "learning_rate": 5.656606269319579e-05, "loss": 2.116921806335449, "memory(GiB)": 77.56, "step": 53460, "token_acc": 0.5228070175438596, "train_speed(iter/s)": 1.438794 }, { "epoch": 2.2906045156591404, "grad_norm": 5.201592445373535, "learning_rate": 5.6559391141436126e-05, "loss": 2.420242691040039, "memory(GiB)": 77.56, "step": 53465, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 1.438822 }, { "epoch": 2.2908187309883896, "grad_norm": 5.9489030838012695, "learning_rate": 5.655271947084694e-05, "loss": 2.26196231842041, "memory(GiB)": 77.56, "step": 53470, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.438803 }, { "epoch": 2.2910329463176384, "grad_norm": 6.96991491317749, "learning_rate": 5.654604768154909e-05, "loss": 2.6189952850341798, "memory(GiB)": 77.56, "step": 53475, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.438813 }, { "epoch": 2.2912471616468872, "grad_norm": 4.714742183685303, "learning_rate": 5.6539375773663436e-05, "loss": 2.2030481338500976, "memory(GiB)": 77.56, "step": 53480, "token_acc": 0.5463917525773195, "train_speed(iter/s)": 1.43878 }, { "epoch": 2.2914613769761365, "grad_norm": 4.81325626373291, "learning_rate": 5.653270374731087e-05, "loss": 2.750800895690918, "memory(GiB)": 77.56, "step": 53485, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.438783 }, { "epoch": 2.2916755923053853, "grad_norm": 5.346724033355713, "learning_rate": 5.652603160261224e-05, "loss": 2.5518165588378907, "memory(GiB)": 77.56, "step": 53490, "token_acc": 0.4880952380952381, "train_speed(iter/s)": 1.438761 }, { "epoch": 2.291889807634634, "grad_norm": 5.134522914886475, "learning_rate": 5.651935933968843e-05, "loss": 2.2650596618652346, "memory(GiB)": 77.56, "step": 53495, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.438746 }, { "epoch": 2.2921040229638834, "grad_norm": 4.741995811462402, "learning_rate": 5.65126869586603e-05, "loss": 2.345151901245117, "memory(GiB)": 77.56, "step": 53500, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.438763 }, { "epoch": 2.2921040229638834, "eval_loss": 2.2262728214263916, "eval_runtime": 13.6477, "eval_samples_per_second": 7.327, "eval_steps_per_second": 7.327, "eval_token_acc": 0.48289473684210527, "step": 53500 }, { "epoch": 2.292318238293132, "grad_norm": 4.873902320861816, "learning_rate": 5.650601445964874e-05, "loss": 2.298366355895996, "memory(GiB)": 77.56, "step": 53505, "token_acc": 0.47879359095193214, "train_speed(iter/s)": 1.438183 }, { "epoch": 2.292532453622381, "grad_norm": 4.590123176574707, "learning_rate": 5.6499341842774637e-05, "loss": 2.370613670349121, "memory(GiB)": 77.56, "step": 53510, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.438192 }, { "epoch": 2.2927466689516303, "grad_norm": 5.4586615562438965, "learning_rate": 5.649266910815885e-05, "loss": 2.103523063659668, "memory(GiB)": 77.56, "step": 53515, "token_acc": 0.49224806201550386, "train_speed(iter/s)": 1.438202 }, { "epoch": 2.292960884280879, "grad_norm": 5.6876420974731445, "learning_rate": 5.648599625592227e-05, "loss": 2.237200927734375, "memory(GiB)": 77.56, "step": 53520, "token_acc": 0.5, "train_speed(iter/s)": 1.438218 }, { "epoch": 2.293175099610128, "grad_norm": 4.383730411529541, "learning_rate": 5.647932328618579e-05, "loss": 2.3002960205078127, "memory(GiB)": 77.56, "step": 53525, "token_acc": 0.5084175084175084, "train_speed(iter/s)": 1.438247 }, { "epoch": 2.293389314939377, "grad_norm": 5.919870376586914, "learning_rate": 5.647265019907027e-05, "loss": 2.341591644287109, "memory(GiB)": 77.56, "step": 53530, "token_acc": 0.504885993485342, "train_speed(iter/s)": 1.438284 }, { "epoch": 2.293603530268626, "grad_norm": 6.525994300842285, "learning_rate": 5.646597699469665e-05, "loss": 2.644367790222168, "memory(GiB)": 77.56, "step": 53535, "token_acc": 0.4551971326164875, "train_speed(iter/s)": 1.438278 }, { "epoch": 2.2938177455978748, "grad_norm": 5.621082305908203, "learning_rate": 5.645930367318577e-05, "loss": 2.302459716796875, "memory(GiB)": 77.56, "step": 53540, "token_acc": 0.5413533834586466, "train_speed(iter/s)": 1.438281 }, { "epoch": 2.294031960927124, "grad_norm": 4.449276447296143, "learning_rate": 5.645263023465854e-05, "loss": 2.276961326599121, "memory(GiB)": 77.56, "step": 53545, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.438298 }, { "epoch": 2.294246176256373, "grad_norm": 4.564559459686279, "learning_rate": 5.644595667923589e-05, "loss": 2.425223541259766, "memory(GiB)": 77.56, "step": 53550, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.438305 }, { "epoch": 2.2944603915856217, "grad_norm": 5.750235080718994, "learning_rate": 5.643928300703867e-05, "loss": 2.1752178192138674, "memory(GiB)": 77.56, "step": 53555, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.438322 }, { "epoch": 2.294674606914871, "grad_norm": 3.7093615531921387, "learning_rate": 5.643260921818779e-05, "loss": 2.690829849243164, "memory(GiB)": 77.56, "step": 53560, "token_acc": 0.48220064724919093, "train_speed(iter/s)": 1.438337 }, { "epoch": 2.2948888222441197, "grad_norm": 4.8255934715271, "learning_rate": 5.642593531280418e-05, "loss": 2.6003559112548826, "memory(GiB)": 77.56, "step": 53565, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.438345 }, { "epoch": 2.2951030375733685, "grad_norm": 5.6724042892456055, "learning_rate": 5.6419261291008696e-05, "loss": 2.417842483520508, "memory(GiB)": 77.56, "step": 53570, "token_acc": 0.52, "train_speed(iter/s)": 1.438361 }, { "epoch": 2.295317252902618, "grad_norm": 5.824110507965088, "learning_rate": 5.641258715292228e-05, "loss": 2.454248046875, "memory(GiB)": 77.56, "step": 53575, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 1.438337 }, { "epoch": 2.2955314682318666, "grad_norm": 4.26109504699707, "learning_rate": 5.640591289866585e-05, "loss": 1.9606668472290039, "memory(GiB)": 77.56, "step": 53580, "token_acc": 0.5708154506437768, "train_speed(iter/s)": 1.43837 }, { "epoch": 2.2957456835611154, "grad_norm": 7.147060394287109, "learning_rate": 5.6399238528360285e-05, "loss": 2.374354362487793, "memory(GiB)": 77.56, "step": 53585, "token_acc": 0.5131964809384164, "train_speed(iter/s)": 1.438362 }, { "epoch": 2.2959598988903647, "grad_norm": 5.928290843963623, "learning_rate": 5.6392564042126514e-05, "loss": 2.417148208618164, "memory(GiB)": 77.56, "step": 53590, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.438366 }, { "epoch": 2.2961741142196135, "grad_norm": 4.662358283996582, "learning_rate": 5.638588944008546e-05, "loss": 2.7058742523193358, "memory(GiB)": 77.56, "step": 53595, "token_acc": 0.48598130841121495, "train_speed(iter/s)": 1.438382 }, { "epoch": 2.2963883295488623, "grad_norm": 4.462774276733398, "learning_rate": 5.637921472235801e-05, "loss": 2.414849281311035, "memory(GiB)": 77.56, "step": 53600, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.438381 }, { "epoch": 2.2966025448781116, "grad_norm": 5.896153926849365, "learning_rate": 5.63725398890651e-05, "loss": 2.1580427169799803, "memory(GiB)": 77.56, "step": 53605, "token_acc": 0.5020080321285141, "train_speed(iter/s)": 1.438382 }, { "epoch": 2.2968167602073604, "grad_norm": 5.841938018798828, "learning_rate": 5.636586494032766e-05, "loss": 2.544931983947754, "memory(GiB)": 77.56, "step": 53610, "token_acc": 0.49079754601226994, "train_speed(iter/s)": 1.438367 }, { "epoch": 2.297030975536609, "grad_norm": 6.33357048034668, "learning_rate": 5.635918987626661e-05, "loss": 2.297098731994629, "memory(GiB)": 77.56, "step": 53615, "token_acc": 0.5, "train_speed(iter/s)": 1.438385 }, { "epoch": 2.2972451908658584, "grad_norm": 5.119426727294922, "learning_rate": 5.6352514697002865e-05, "loss": 2.330405616760254, "memory(GiB)": 77.56, "step": 53620, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.43839 }, { "epoch": 2.2974594061951072, "grad_norm": 5.214448928833008, "learning_rate": 5.6345839402657364e-05, "loss": 2.562021827697754, "memory(GiB)": 77.56, "step": 53625, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.438415 }, { "epoch": 2.297673621524356, "grad_norm": 6.473047733306885, "learning_rate": 5.633916399335102e-05, "loss": 2.627433204650879, "memory(GiB)": 77.56, "step": 53630, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.438396 }, { "epoch": 2.2978878368536053, "grad_norm": 4.60115385055542, "learning_rate": 5.633248846920478e-05, "loss": 2.236454391479492, "memory(GiB)": 77.56, "step": 53635, "token_acc": 0.5346153846153846, "train_speed(iter/s)": 1.438404 }, { "epoch": 2.298102052182854, "grad_norm": 5.444350242614746, "learning_rate": 5.632581283033956e-05, "loss": 2.5599668502807615, "memory(GiB)": 77.56, "step": 53640, "token_acc": 0.46703296703296704, "train_speed(iter/s)": 1.438377 }, { "epoch": 2.298316267512103, "grad_norm": 5.982178688049316, "learning_rate": 5.631913707687632e-05, "loss": 2.057514953613281, "memory(GiB)": 77.56, "step": 53645, "token_acc": 0.5444444444444444, "train_speed(iter/s)": 1.438383 }, { "epoch": 2.298530482841352, "grad_norm": 5.037120819091797, "learning_rate": 5.6312461208935964e-05, "loss": 2.0735027313232424, "memory(GiB)": 77.56, "step": 53650, "token_acc": 0.5284552845528455, "train_speed(iter/s)": 1.438401 }, { "epoch": 2.298744698170601, "grad_norm": 4.352396011352539, "learning_rate": 5.630578522663947e-05, "loss": 2.585700035095215, "memory(GiB)": 77.56, "step": 53655, "token_acc": 0.46646341463414637, "train_speed(iter/s)": 1.43841 }, { "epoch": 2.29895891349985, "grad_norm": 4.839389801025391, "learning_rate": 5.629910913010775e-05, "loss": 2.2515895843505858, "memory(GiB)": 77.56, "step": 53660, "token_acc": 0.5014164305949008, "train_speed(iter/s)": 1.43845 }, { "epoch": 2.299173128829099, "grad_norm": 5.365807056427002, "learning_rate": 5.6292432919461777e-05, "loss": 2.233770751953125, "memory(GiB)": 77.56, "step": 53665, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.438456 }, { "epoch": 2.299387344158348, "grad_norm": 5.616290092468262, "learning_rate": 5.628575659482247e-05, "loss": 2.2652149200439453, "memory(GiB)": 77.56, "step": 53670, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.438461 }, { "epoch": 2.2996015594875967, "grad_norm": 5.762845039367676, "learning_rate": 5.627908015631078e-05, "loss": 2.7932287216186524, "memory(GiB)": 77.56, "step": 53675, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.438453 }, { "epoch": 2.299815774816846, "grad_norm": 4.533267974853516, "learning_rate": 5.6272403604047663e-05, "loss": 2.4930322647094725, "memory(GiB)": 77.56, "step": 53680, "token_acc": 0.49869451697127937, "train_speed(iter/s)": 1.438464 }, { "epoch": 2.3000299901460948, "grad_norm": 6.187613010406494, "learning_rate": 5.626572693815407e-05, "loss": 2.6859153747558593, "memory(GiB)": 77.56, "step": 53685, "token_acc": 0.45484949832775917, "train_speed(iter/s)": 1.438468 }, { "epoch": 2.3002442054753436, "grad_norm": 7.417507171630859, "learning_rate": 5.625905015875096e-05, "loss": 2.473193359375, "memory(GiB)": 77.56, "step": 53690, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.438487 }, { "epoch": 2.300458420804593, "grad_norm": 6.058342456817627, "learning_rate": 5.625237326595929e-05, "loss": 2.296754837036133, "memory(GiB)": 77.56, "step": 53695, "token_acc": 0.5264705882352941, "train_speed(iter/s)": 1.43852 }, { "epoch": 2.3006726361338417, "grad_norm": 6.117280006408691, "learning_rate": 5.624569625990002e-05, "loss": 2.3953039169311525, "memory(GiB)": 77.56, "step": 53700, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.438517 }, { "epoch": 2.3008868514630905, "grad_norm": 4.356379985809326, "learning_rate": 5.623901914069407e-05, "loss": 2.2769742965698243, "memory(GiB)": 77.56, "step": 53705, "token_acc": 0.5082508250825083, "train_speed(iter/s)": 1.438535 }, { "epoch": 2.3011010667923397, "grad_norm": 4.994988918304443, "learning_rate": 5.623234190846247e-05, "loss": 2.143342971801758, "memory(GiB)": 77.56, "step": 53710, "token_acc": 0.5338345864661654, "train_speed(iter/s)": 1.438528 }, { "epoch": 2.3013152821215885, "grad_norm": 5.715671539306641, "learning_rate": 5.6225664563326134e-05, "loss": 2.5589460372924804, "memory(GiB)": 77.56, "step": 53715, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.438521 }, { "epoch": 2.3015294974508373, "grad_norm": 6.478508472442627, "learning_rate": 5.621898710540604e-05, "loss": 2.2853019714355467, "memory(GiB)": 77.56, "step": 53720, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.438538 }, { "epoch": 2.3017437127800866, "grad_norm": 6.701432228088379, "learning_rate": 5.621230953482317e-05, "loss": 2.1565780639648438, "memory(GiB)": 77.56, "step": 53725, "token_acc": 0.4957983193277311, "train_speed(iter/s)": 1.43856 }, { "epoch": 2.3019579281093354, "grad_norm": 5.394850730895996, "learning_rate": 5.620563185169848e-05, "loss": 2.4948917388916017, "memory(GiB)": 77.56, "step": 53730, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.438552 }, { "epoch": 2.3021721434385842, "grad_norm": 5.482250213623047, "learning_rate": 5.6198954056152954e-05, "loss": 2.3999359130859377, "memory(GiB)": 77.56, "step": 53735, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.438563 }, { "epoch": 2.3023863587678335, "grad_norm": 5.089041709899902, "learning_rate": 5.6192276148307556e-05, "loss": 2.28002815246582, "memory(GiB)": 77.56, "step": 53740, "token_acc": 0.5425531914893617, "train_speed(iter/s)": 1.4386 }, { "epoch": 2.3026005740970823, "grad_norm": 4.6152873039245605, "learning_rate": 5.618559812828327e-05, "loss": 2.472471237182617, "memory(GiB)": 77.56, "step": 53745, "token_acc": 0.46757679180887374, "train_speed(iter/s)": 1.438587 }, { "epoch": 2.302814789426331, "grad_norm": 5.413963794708252, "learning_rate": 5.6178919996201064e-05, "loss": 2.601222610473633, "memory(GiB)": 77.56, "step": 53750, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.438612 }, { "epoch": 2.3030290047555804, "grad_norm": 4.750934600830078, "learning_rate": 5.617224175218193e-05, "loss": 2.4426929473876955, "memory(GiB)": 77.56, "step": 53755, "token_acc": 0.4635036496350365, "train_speed(iter/s)": 1.438614 }, { "epoch": 2.303243220084829, "grad_norm": 5.06614351272583, "learning_rate": 5.616556339634686e-05, "loss": 2.2192386627197265, "memory(GiB)": 77.56, "step": 53760, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.438604 }, { "epoch": 2.303457435414078, "grad_norm": 5.729245662689209, "learning_rate": 5.61588849288168e-05, "loss": 2.384457015991211, "memory(GiB)": 77.56, "step": 53765, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.438607 }, { "epoch": 2.3036716507433272, "grad_norm": 6.668264865875244, "learning_rate": 5.6152206349712776e-05, "loss": 2.093819236755371, "memory(GiB)": 77.56, "step": 53770, "token_acc": 0.5675675675675675, "train_speed(iter/s)": 1.438601 }, { "epoch": 2.303885866072576, "grad_norm": 6.468653678894043, "learning_rate": 5.614552765915575e-05, "loss": 2.3463817596435548, "memory(GiB)": 77.56, "step": 53775, "token_acc": 0.523972602739726, "train_speed(iter/s)": 1.438624 }, { "epoch": 2.304100081401825, "grad_norm": 4.567581653594971, "learning_rate": 5.613884885726675e-05, "loss": 2.3168701171875, "memory(GiB)": 77.56, "step": 53780, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 1.438617 }, { "epoch": 2.304314296731074, "grad_norm": 4.7725510597229, "learning_rate": 5.6132169944166735e-05, "loss": 2.2093624114990233, "memory(GiB)": 77.56, "step": 53785, "token_acc": 0.5307443365695793, "train_speed(iter/s)": 1.43864 }, { "epoch": 2.304528512060323, "grad_norm": 5.204513072967529, "learning_rate": 5.6125490919976696e-05, "loss": 2.4235612869262697, "memory(GiB)": 77.56, "step": 53790, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.438676 }, { "epoch": 2.3047427273895718, "grad_norm": 6.216206073760986, "learning_rate": 5.611881178481765e-05, "loss": 2.1947036743164063, "memory(GiB)": 77.56, "step": 53795, "token_acc": 0.5491071428571429, "train_speed(iter/s)": 1.438685 }, { "epoch": 2.304956942718821, "grad_norm": 6.209000110626221, "learning_rate": 5.611213253881059e-05, "loss": 2.575878715515137, "memory(GiB)": 77.56, "step": 53800, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.438691 }, { "epoch": 2.30517115804807, "grad_norm": 4.774401664733887, "learning_rate": 5.610545318207652e-05, "loss": 2.487649154663086, "memory(GiB)": 77.56, "step": 53805, "token_acc": 0.4621212121212121, "train_speed(iter/s)": 1.438695 }, { "epoch": 2.3053853733773186, "grad_norm": 5.229221820831299, "learning_rate": 5.609877371473643e-05, "loss": 2.395768737792969, "memory(GiB)": 77.56, "step": 53810, "token_acc": 0.4942084942084942, "train_speed(iter/s)": 1.438689 }, { "epoch": 2.305599588706568, "grad_norm": 5.733492374420166, "learning_rate": 5.6092094136911344e-05, "loss": 2.225937080383301, "memory(GiB)": 77.56, "step": 53815, "token_acc": 0.48627450980392156, "train_speed(iter/s)": 1.438713 }, { "epoch": 2.3058138040358167, "grad_norm": 5.653471946716309, "learning_rate": 5.608541444872224e-05, "loss": 2.3452863693237305, "memory(GiB)": 77.56, "step": 53820, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.43871 }, { "epoch": 2.3060280193650655, "grad_norm": 7.246216297149658, "learning_rate": 5.607873465029017e-05, "loss": 2.464826202392578, "memory(GiB)": 77.56, "step": 53825, "token_acc": 0.5276752767527675, "train_speed(iter/s)": 1.438745 }, { "epoch": 2.3062422346943148, "grad_norm": 4.9488372802734375, "learning_rate": 5.607205474173609e-05, "loss": 2.3327274322509766, "memory(GiB)": 77.56, "step": 53830, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.438777 }, { "epoch": 2.3064564500235636, "grad_norm": 5.391012191772461, "learning_rate": 5.606537472318105e-05, "loss": 2.6019224166870116, "memory(GiB)": 77.56, "step": 53835, "token_acc": 0.4548611111111111, "train_speed(iter/s)": 1.438783 }, { "epoch": 2.3066706653528124, "grad_norm": 4.587630271911621, "learning_rate": 5.605869459474608e-05, "loss": 2.727021408081055, "memory(GiB)": 77.56, "step": 53840, "token_acc": 0.5, "train_speed(iter/s)": 1.438802 }, { "epoch": 2.3068848806820617, "grad_norm": 5.3732709884643555, "learning_rate": 5.6052014356552166e-05, "loss": 2.4661779403686523, "memory(GiB)": 77.56, "step": 53845, "token_acc": 0.45528455284552843, "train_speed(iter/s)": 1.43882 }, { "epoch": 2.3070990960113105, "grad_norm": 5.235447406768799, "learning_rate": 5.6045334008720316e-05, "loss": 2.272469902038574, "memory(GiB)": 77.56, "step": 53850, "token_acc": 0.5206611570247934, "train_speed(iter/s)": 1.438855 }, { "epoch": 2.3073133113405593, "grad_norm": 8.965286254882812, "learning_rate": 5.603865355137159e-05, "loss": 2.526051330566406, "memory(GiB)": 77.56, "step": 53855, "token_acc": 0.46567164179104475, "train_speed(iter/s)": 1.438882 }, { "epoch": 2.3075275266698085, "grad_norm": 5.878705024719238, "learning_rate": 5.603197298462697e-05, "loss": 2.586753082275391, "memory(GiB)": 77.56, "step": 53860, "token_acc": 0.4689265536723164, "train_speed(iter/s)": 1.43888 }, { "epoch": 2.3077417419990573, "grad_norm": 8.354619026184082, "learning_rate": 5.60252923086075e-05, "loss": 2.4950727462768554, "memory(GiB)": 77.56, "step": 53865, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.438901 }, { "epoch": 2.3079559573283066, "grad_norm": 5.443455219268799, "learning_rate": 5.601861152343423e-05, "loss": 2.5206817626953124, "memory(GiB)": 77.56, "step": 53870, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.438952 }, { "epoch": 2.3081701726575554, "grad_norm": 11.195306777954102, "learning_rate": 5.601193062922816e-05, "loss": 2.527469253540039, "memory(GiB)": 77.56, "step": 53875, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.438967 }, { "epoch": 2.3083843879868042, "grad_norm": 5.233806133270264, "learning_rate": 5.600524962611032e-05, "loss": 2.5457073211669923, "memory(GiB)": 77.56, "step": 53880, "token_acc": 0.46254071661237783, "train_speed(iter/s)": 1.438978 }, { "epoch": 2.3085986033160535, "grad_norm": 5.601393699645996, "learning_rate": 5.5998568514201754e-05, "loss": 2.3417791366577148, "memory(GiB)": 77.56, "step": 53885, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.439 }, { "epoch": 2.3088128186453023, "grad_norm": 5.6394805908203125, "learning_rate": 5.599188729362349e-05, "loss": 2.4958471298217773, "memory(GiB)": 77.56, "step": 53890, "token_acc": 0.478134110787172, "train_speed(iter/s)": 1.439003 }, { "epoch": 2.309027033974551, "grad_norm": 9.374964714050293, "learning_rate": 5.598520596449657e-05, "loss": 2.617146682739258, "memory(GiB)": 77.56, "step": 53895, "token_acc": 0.4264705882352941, "train_speed(iter/s)": 1.439027 }, { "epoch": 2.3092412493038004, "grad_norm": 5.910741806030273, "learning_rate": 5.597852452694202e-05, "loss": 2.5937618255615233, "memory(GiB)": 77.56, "step": 53900, "token_acc": 0.4907749077490775, "train_speed(iter/s)": 1.439026 }, { "epoch": 2.309455464633049, "grad_norm": 5.780880928039551, "learning_rate": 5.5971842981080905e-05, "loss": 2.5331489562988283, "memory(GiB)": 77.56, "step": 53905, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.439029 }, { "epoch": 2.309669679962298, "grad_norm": 7.117316722869873, "learning_rate": 5.5965161327034234e-05, "loss": 2.3115524291992187, "memory(GiB)": 77.56, "step": 53910, "token_acc": 0.5014925373134328, "train_speed(iter/s)": 1.439065 }, { "epoch": 2.3098838952915473, "grad_norm": 7.297818660736084, "learning_rate": 5.595847956492308e-05, "loss": 2.189962387084961, "memory(GiB)": 77.56, "step": 53915, "token_acc": 0.511864406779661, "train_speed(iter/s)": 1.439028 }, { "epoch": 2.310098110620796, "grad_norm": 5.504382610321045, "learning_rate": 5.595179769486848e-05, "loss": 2.1856462478637697, "memory(GiB)": 77.56, "step": 53920, "token_acc": 0.5748987854251012, "train_speed(iter/s)": 1.439026 }, { "epoch": 2.310312325950045, "grad_norm": 5.445213794708252, "learning_rate": 5.5945115716991484e-05, "loss": 2.5255428314208985, "memory(GiB)": 77.56, "step": 53925, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.439028 }, { "epoch": 2.310526541279294, "grad_norm": 7.495448112487793, "learning_rate": 5.5938433631413145e-05, "loss": 2.3789600372314452, "memory(GiB)": 77.56, "step": 53930, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.439017 }, { "epoch": 2.310740756608543, "grad_norm": 6.696556091308594, "learning_rate": 5.5931751438254486e-05, "loss": 2.7472387313842774, "memory(GiB)": 77.56, "step": 53935, "token_acc": 0.4398496240601504, "train_speed(iter/s)": 1.439029 }, { "epoch": 2.3109549719377918, "grad_norm": 6.056976318359375, "learning_rate": 5.59250691376366e-05, "loss": 2.533992958068848, "memory(GiB)": 77.56, "step": 53940, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.438985 }, { "epoch": 2.311169187267041, "grad_norm": 6.147352695465088, "learning_rate": 5.5918386729680535e-05, "loss": 2.479195785522461, "memory(GiB)": 77.56, "step": 53945, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.439008 }, { "epoch": 2.31138340259629, "grad_norm": 5.396799087524414, "learning_rate": 5.591170421450733e-05, "loss": 2.662628173828125, "memory(GiB)": 77.56, "step": 53950, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.439011 }, { "epoch": 2.3115976179255386, "grad_norm": 7.313360214233398, "learning_rate": 5.590502159223807e-05, "loss": 2.657781982421875, "memory(GiB)": 77.56, "step": 53955, "token_acc": 0.4409937888198758, "train_speed(iter/s)": 1.439032 }, { "epoch": 2.311811833254788, "grad_norm": 6.940757751464844, "learning_rate": 5.58983388629938e-05, "loss": 2.4785959243774416, "memory(GiB)": 77.56, "step": 53960, "token_acc": 0.5020408163265306, "train_speed(iter/s)": 1.439033 }, { "epoch": 2.3120260485840367, "grad_norm": 4.621875762939453, "learning_rate": 5.589165602689559e-05, "loss": 2.2554958343505858, "memory(GiB)": 77.56, "step": 53965, "token_acc": 0.5330882352941176, "train_speed(iter/s)": 1.439049 }, { "epoch": 2.3122402639132855, "grad_norm": 5.698229789733887, "learning_rate": 5.588497308406451e-05, "loss": 2.238688278198242, "memory(GiB)": 77.56, "step": 53970, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.438991 }, { "epoch": 2.3124544792425348, "grad_norm": 5.020229816436768, "learning_rate": 5.5878290034621593e-05, "loss": 2.2700672149658203, "memory(GiB)": 77.56, "step": 53975, "token_acc": 0.4695121951219512, "train_speed(iter/s)": 1.438962 }, { "epoch": 2.3126686945717836, "grad_norm": 6.037504196166992, "learning_rate": 5.5871606878687956e-05, "loss": 2.140109634399414, "memory(GiB)": 77.56, "step": 53980, "token_acc": 0.5463917525773195, "train_speed(iter/s)": 1.438961 }, { "epoch": 2.3128829099010324, "grad_norm": 5.255722522735596, "learning_rate": 5.586492361638466e-05, "loss": 2.4218265533447267, "memory(GiB)": 77.56, "step": 53985, "token_acc": 0.453125, "train_speed(iter/s)": 1.438954 }, { "epoch": 2.3130971252302817, "grad_norm": 7.433424949645996, "learning_rate": 5.585824024783277e-05, "loss": 2.3656471252441404, "memory(GiB)": 77.56, "step": 53990, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.438958 }, { "epoch": 2.3133113405595305, "grad_norm": 5.4691972732543945, "learning_rate": 5.585155677315336e-05, "loss": 2.3481334686279296, "memory(GiB)": 77.56, "step": 53995, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.438952 }, { "epoch": 2.3135255558887793, "grad_norm": 8.643807411193848, "learning_rate": 5.5844873192467506e-05, "loss": 2.1556068420410157, "memory(GiB)": 77.56, "step": 54000, "token_acc": 0.549645390070922, "train_speed(iter/s)": 1.438956 }, { "epoch": 2.3135255558887793, "eval_loss": 2.3864502906799316, "eval_runtime": 14.6584, "eval_samples_per_second": 6.822, "eval_steps_per_second": 6.822, "eval_token_acc": 0.47003994673768307, "step": 54000 }, { "epoch": 2.3137397712180285, "grad_norm": 4.805451393127441, "learning_rate": 5.583818950589629e-05, "loss": 2.264748764038086, "memory(GiB)": 77.56, "step": 54005, "token_acc": 0.4810964083175803, "train_speed(iter/s)": 1.438351 }, { "epoch": 2.3139539865472774, "grad_norm": 5.354040145874023, "learning_rate": 5.583150571356079e-05, "loss": 2.4373064041137695, "memory(GiB)": 77.56, "step": 54010, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.438399 }, { "epoch": 2.314168201876526, "grad_norm": 5.284209251403809, "learning_rate": 5.5824821815582104e-05, "loss": 2.2593448638916014, "memory(GiB)": 77.56, "step": 54015, "token_acc": 0.5512367491166078, "train_speed(iter/s)": 1.438422 }, { "epoch": 2.3143824172057754, "grad_norm": 4.4956278800964355, "learning_rate": 5.5818137812081306e-05, "loss": 2.4375816345214845, "memory(GiB)": 77.56, "step": 54020, "token_acc": 0.5029585798816568, "train_speed(iter/s)": 1.438436 }, { "epoch": 2.3145966325350242, "grad_norm": 8.219070434570312, "learning_rate": 5.581145370317948e-05, "loss": 2.1817834854125975, "memory(GiB)": 77.56, "step": 54025, "token_acc": 0.5, "train_speed(iter/s)": 1.438432 }, { "epoch": 2.314810847864273, "grad_norm": 5.6516594886779785, "learning_rate": 5.5804769488997724e-05, "loss": 2.3674079895019533, "memory(GiB)": 77.56, "step": 54030, "token_acc": 0.5220125786163522, "train_speed(iter/s)": 1.438455 }, { "epoch": 2.3150250631935223, "grad_norm": 5.184658527374268, "learning_rate": 5.579808516965711e-05, "loss": 2.2186534881591795, "memory(GiB)": 77.56, "step": 54035, "token_acc": 0.49375, "train_speed(iter/s)": 1.438493 }, { "epoch": 2.315239278522771, "grad_norm": 5.937741756439209, "learning_rate": 5.579140074527877e-05, "loss": 2.2813316345214845, "memory(GiB)": 77.56, "step": 54040, "token_acc": 0.524, "train_speed(iter/s)": 1.438491 }, { "epoch": 2.31545349385202, "grad_norm": 4.384599685668945, "learning_rate": 5.578471621598376e-05, "loss": 2.4732698440551757, "memory(GiB)": 77.56, "step": 54045, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.43849 }, { "epoch": 2.315667709181269, "grad_norm": 5.9085774421691895, "learning_rate": 5.5778031581893175e-05, "loss": 2.7172569274902343, "memory(GiB)": 77.56, "step": 54050, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.438466 }, { "epoch": 2.315881924510518, "grad_norm": 6.28882360458374, "learning_rate": 5.5771346843128124e-05, "loss": 2.4833419799804686, "memory(GiB)": 77.56, "step": 54055, "token_acc": 0.4507936507936508, "train_speed(iter/s)": 1.438494 }, { "epoch": 2.316096139839767, "grad_norm": 7.162354946136475, "learning_rate": 5.576466199980973e-05, "loss": 2.6254657745361327, "memory(GiB)": 77.56, "step": 54060, "token_acc": 0.4351851851851852, "train_speed(iter/s)": 1.438504 }, { "epoch": 2.316310355169016, "grad_norm": 5.696046352386475, "learning_rate": 5.575797705205907e-05, "loss": 2.5408376693725585, "memory(GiB)": 77.56, "step": 54065, "token_acc": 0.4524590163934426, "train_speed(iter/s)": 1.438521 }, { "epoch": 2.316524570498265, "grad_norm": 5.709765911102295, "learning_rate": 5.5751291999997256e-05, "loss": 2.3990570068359376, "memory(GiB)": 77.56, "step": 54070, "token_acc": 0.49846153846153846, "train_speed(iter/s)": 1.438515 }, { "epoch": 2.3167387858275137, "grad_norm": 5.901362419128418, "learning_rate": 5.574460684374541e-05, "loss": 2.7339168548583985, "memory(GiB)": 77.56, "step": 54075, "token_acc": 0.4475920679886686, "train_speed(iter/s)": 1.438525 }, { "epoch": 2.316953001156763, "grad_norm": 5.384810447692871, "learning_rate": 5.573792158342459e-05, "loss": 2.2162403106689452, "memory(GiB)": 77.56, "step": 54080, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.438516 }, { "epoch": 2.3171672164860118, "grad_norm": 4.498585224151611, "learning_rate": 5.573123621915595e-05, "loss": 2.4023914337158203, "memory(GiB)": 77.56, "step": 54085, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.438502 }, { "epoch": 2.3173814318152606, "grad_norm": 7.0708699226379395, "learning_rate": 5.572455075106059e-05, "loss": 2.5315731048583983, "memory(GiB)": 77.56, "step": 54090, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.438511 }, { "epoch": 2.31759564714451, "grad_norm": 6.007331371307373, "learning_rate": 5.5717865179259624e-05, "loss": 2.358246994018555, "memory(GiB)": 77.56, "step": 54095, "token_acc": 0.477124183006536, "train_speed(iter/s)": 1.438482 }, { "epoch": 2.3178098624737586, "grad_norm": 6.702929973602295, "learning_rate": 5.571117950387417e-05, "loss": 2.501856803894043, "memory(GiB)": 77.56, "step": 54100, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.438499 }, { "epoch": 2.3180240778030075, "grad_norm": 5.773906707763672, "learning_rate": 5.570449372502534e-05, "loss": 2.597435188293457, "memory(GiB)": 77.56, "step": 54105, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438501 }, { "epoch": 2.3182382931322567, "grad_norm": 5.605736255645752, "learning_rate": 5.5697807842834245e-05, "loss": 2.514650344848633, "memory(GiB)": 77.56, "step": 54110, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.438524 }, { "epoch": 2.3184525084615055, "grad_norm": 5.441324710845947, "learning_rate": 5.5691121857422034e-05, "loss": 2.143595886230469, "memory(GiB)": 77.56, "step": 54115, "token_acc": 0.5042016806722689, "train_speed(iter/s)": 1.438566 }, { "epoch": 2.3186667237907543, "grad_norm": 5.025784015655518, "learning_rate": 5.568443576890979e-05, "loss": 2.489344596862793, "memory(GiB)": 77.56, "step": 54120, "token_acc": 0.4794520547945205, "train_speed(iter/s)": 1.438548 }, { "epoch": 2.3188809391200036, "grad_norm": 6.301708698272705, "learning_rate": 5.567774957741867e-05, "loss": 2.4384708404541016, "memory(GiB)": 77.56, "step": 54125, "token_acc": 0.4309210526315789, "train_speed(iter/s)": 1.438567 }, { "epoch": 2.3190951544492524, "grad_norm": 5.335469722747803, "learning_rate": 5.56710632830698e-05, "loss": 1.951573944091797, "memory(GiB)": 77.56, "step": 54130, "token_acc": 0.576, "train_speed(iter/s)": 1.438565 }, { "epoch": 2.319309369778501, "grad_norm": 5.782993316650391, "learning_rate": 5.5664376885984296e-05, "loss": 2.064581871032715, "memory(GiB)": 77.56, "step": 54135, "token_acc": 0.5271966527196653, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.3195235851077505, "grad_norm": 5.268125534057617, "learning_rate": 5.565769038628328e-05, "loss": 2.3723209381103514, "memory(GiB)": 77.56, "step": 54140, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.438558 }, { "epoch": 2.3197378004369993, "grad_norm": 8.005107879638672, "learning_rate": 5.5651003784087904e-05, "loss": 2.663521575927734, "memory(GiB)": 77.56, "step": 54145, "token_acc": 0.4728682170542636, "train_speed(iter/s)": 1.438569 }, { "epoch": 2.319952015766248, "grad_norm": 4.887722969055176, "learning_rate": 5.5644317079519296e-05, "loss": 2.5326534271240235, "memory(GiB)": 77.56, "step": 54150, "token_acc": 0.4724137931034483, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.3201662310954974, "grad_norm": 5.536297798156738, "learning_rate": 5.5637630272698584e-05, "loss": 2.433104705810547, "memory(GiB)": 77.56, "step": 54155, "token_acc": 0.4707692307692308, "train_speed(iter/s)": 1.438535 }, { "epoch": 2.320380446424746, "grad_norm": 5.90949010848999, "learning_rate": 5.56309433637469e-05, "loss": 2.585856056213379, "memory(GiB)": 77.56, "step": 54160, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.438522 }, { "epoch": 2.320594661753995, "grad_norm": 5.377594470977783, "learning_rate": 5.562425635278541e-05, "loss": 2.2658287048339845, "memory(GiB)": 77.56, "step": 54165, "token_acc": 0.4797507788161994, "train_speed(iter/s)": 1.43852 }, { "epoch": 2.3208088770832442, "grad_norm": 5.158398151397705, "learning_rate": 5.561756923993523e-05, "loss": 2.622241401672363, "memory(GiB)": 77.56, "step": 54170, "token_acc": 0.45151515151515154, "train_speed(iter/s)": 1.438526 }, { "epoch": 2.321023092412493, "grad_norm": 6.190441131591797, "learning_rate": 5.561088202531752e-05, "loss": 2.3491865158081056, "memory(GiB)": 77.56, "step": 54175, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.438532 }, { "epoch": 2.321237307741742, "grad_norm": 7.089932441711426, "learning_rate": 5.5604194709053424e-05, "loss": 2.508444595336914, "memory(GiB)": 77.56, "step": 54180, "token_acc": 0.5134099616858238, "train_speed(iter/s)": 1.438576 }, { "epoch": 2.321451523070991, "grad_norm": 7.774629592895508, "learning_rate": 5.5597507291264087e-05, "loss": 2.468191909790039, "memory(GiB)": 77.56, "step": 54185, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.438607 }, { "epoch": 2.32166573840024, "grad_norm": 6.656830310821533, "learning_rate": 5.559081977207065e-05, "loss": 2.3286041259765624, "memory(GiB)": 77.56, "step": 54190, "token_acc": 0.4661016949152542, "train_speed(iter/s)": 1.438622 }, { "epoch": 2.3218799537294887, "grad_norm": 6.327261924743652, "learning_rate": 5.558413215159425e-05, "loss": 2.519520950317383, "memory(GiB)": 77.56, "step": 54195, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 1.438628 }, { "epoch": 2.322094169058738, "grad_norm": 5.310946941375732, "learning_rate": 5.557744442995607e-05, "loss": 2.695193862915039, "memory(GiB)": 77.56, "step": 54200, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.438631 }, { "epoch": 2.322308384387987, "grad_norm": 9.110607147216797, "learning_rate": 5.5570756607277256e-05, "loss": 2.407896041870117, "memory(GiB)": 77.56, "step": 54205, "token_acc": 0.5539568345323741, "train_speed(iter/s)": 1.438627 }, { "epoch": 2.3225225997172356, "grad_norm": 4.725812911987305, "learning_rate": 5.556406868367895e-05, "loss": 2.484501075744629, "memory(GiB)": 77.56, "step": 54210, "token_acc": 0.4815950920245399, "train_speed(iter/s)": 1.438608 }, { "epoch": 2.322736815046485, "grad_norm": 6.263838291168213, "learning_rate": 5.555738065928233e-05, "loss": 2.5613006591796874, "memory(GiB)": 77.56, "step": 54215, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.438631 }, { "epoch": 2.3229510303757337, "grad_norm": 6.026326656341553, "learning_rate": 5.555069253420855e-05, "loss": 2.2715780258178713, "memory(GiB)": 77.56, "step": 54220, "token_acc": 0.5433333333333333, "train_speed(iter/s)": 1.43864 }, { "epoch": 2.3231652457049825, "grad_norm": 5.113918781280518, "learning_rate": 5.554400430857876e-05, "loss": 2.2215293884277343, "memory(GiB)": 77.56, "step": 54225, "token_acc": 0.5284810126582279, "train_speed(iter/s)": 1.438674 }, { "epoch": 2.3233794610342318, "grad_norm": 4.733516216278076, "learning_rate": 5.5537315982514135e-05, "loss": 2.378024101257324, "memory(GiB)": 77.56, "step": 54230, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.438675 }, { "epoch": 2.3235936763634806, "grad_norm": 4.240628242492676, "learning_rate": 5.5530627556135826e-05, "loss": 2.7475404739379883, "memory(GiB)": 77.56, "step": 54235, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.438685 }, { "epoch": 2.3238078916927294, "grad_norm": 7.404336929321289, "learning_rate": 5.552393902956502e-05, "loss": 2.373859977722168, "memory(GiB)": 77.56, "step": 54240, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.438728 }, { "epoch": 2.3240221070219786, "grad_norm": 5.088076114654541, "learning_rate": 5.551725040292288e-05, "loss": 2.458535003662109, "memory(GiB)": 77.56, "step": 54245, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.438709 }, { "epoch": 2.3242363223512275, "grad_norm": 5.420531749725342, "learning_rate": 5.551056167633058e-05, "loss": 2.508696937561035, "memory(GiB)": 77.56, "step": 54250, "token_acc": 0.48562300319488816, "train_speed(iter/s)": 1.438727 }, { "epoch": 2.3244505376804763, "grad_norm": 5.287556171417236, "learning_rate": 5.550387284990927e-05, "loss": 2.508671760559082, "memory(GiB)": 77.56, "step": 54255, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.438717 }, { "epoch": 2.3246647530097255, "grad_norm": 4.1852946281433105, "learning_rate": 5.549718392378014e-05, "loss": 2.6818719863891602, "memory(GiB)": 77.56, "step": 54260, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.438751 }, { "epoch": 2.3248789683389743, "grad_norm": 4.860667705535889, "learning_rate": 5.549049489806437e-05, "loss": 2.311024856567383, "memory(GiB)": 77.56, "step": 54265, "token_acc": 0.48881789137380194, "train_speed(iter/s)": 1.438779 }, { "epoch": 2.325093183668223, "grad_norm": 6.025043487548828, "learning_rate": 5.5483805772883133e-05, "loss": 2.485648536682129, "memory(GiB)": 77.56, "step": 54270, "token_acc": 0.48732394366197185, "train_speed(iter/s)": 1.438812 }, { "epoch": 2.3253073989974724, "grad_norm": 5.138810634613037, "learning_rate": 5.547711654835761e-05, "loss": 2.5689886093139647, "memory(GiB)": 77.56, "step": 54275, "token_acc": 0.4794007490636704, "train_speed(iter/s)": 1.438815 }, { "epoch": 2.325521614326721, "grad_norm": 5.626759052276611, "learning_rate": 5.5470427224609e-05, "loss": 2.79862060546875, "memory(GiB)": 77.56, "step": 54280, "token_acc": 0.4654255319148936, "train_speed(iter/s)": 1.438768 }, { "epoch": 2.32573582965597, "grad_norm": 4.4945478439331055, "learning_rate": 5.5463737801758443e-05, "loss": 2.5011838912963866, "memory(GiB)": 77.56, "step": 54285, "token_acc": 0.4752851711026616, "train_speed(iter/s)": 1.438747 }, { "epoch": 2.3259500449852193, "grad_norm": 6.036630153656006, "learning_rate": 5.545704827992717e-05, "loss": 2.304207611083984, "memory(GiB)": 77.56, "step": 54290, "token_acc": 0.518796992481203, "train_speed(iter/s)": 1.438728 }, { "epoch": 2.326164260314468, "grad_norm": 5.231227397918701, "learning_rate": 5.5450358659236336e-05, "loss": 2.514471435546875, "memory(GiB)": 77.56, "step": 54295, "token_acc": 0.46387832699619774, "train_speed(iter/s)": 1.438719 }, { "epoch": 2.326378475643717, "grad_norm": 4.567483425140381, "learning_rate": 5.5443668939807156e-05, "loss": 2.3522150039672853, "memory(GiB)": 77.56, "step": 54300, "token_acc": 0.5064516129032258, "train_speed(iter/s)": 1.438719 }, { "epoch": 2.326592690972966, "grad_norm": 5.361208915710449, "learning_rate": 5.5436979121760804e-05, "loss": 2.5086477279663084, "memory(GiB)": 77.56, "step": 54305, "token_acc": 0.5126582278481012, "train_speed(iter/s)": 1.438687 }, { "epoch": 2.326806906302215, "grad_norm": 5.930521011352539, "learning_rate": 5.543028920521846e-05, "loss": 2.255126190185547, "memory(GiB)": 77.56, "step": 54310, "token_acc": 0.5, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.327021121631464, "grad_norm": 5.941011905670166, "learning_rate": 5.542359919030133e-05, "loss": 2.325210952758789, "memory(GiB)": 77.56, "step": 54315, "token_acc": 0.49691358024691357, "train_speed(iter/s)": 1.438641 }, { "epoch": 2.327235336960713, "grad_norm": 5.84672737121582, "learning_rate": 5.541690907713063e-05, "loss": 2.555367279052734, "memory(GiB)": 77.56, "step": 54320, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.438663 }, { "epoch": 2.327449552289962, "grad_norm": 7.087546348571777, "learning_rate": 5.5410218865827534e-05, "loss": 2.2478092193603514, "memory(GiB)": 77.56, "step": 54325, "token_acc": 0.536, "train_speed(iter/s)": 1.438684 }, { "epoch": 2.3276637676192107, "grad_norm": 4.922154903411865, "learning_rate": 5.540352855651325e-05, "loss": 2.2991874694824217, "memory(GiB)": 77.56, "step": 54330, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.438691 }, { "epoch": 2.32787798294846, "grad_norm": 4.60272741317749, "learning_rate": 5.539683814930898e-05, "loss": 2.4507495880126955, "memory(GiB)": 77.56, "step": 54335, "token_acc": 0.5315985130111525, "train_speed(iter/s)": 1.438688 }, { "epoch": 2.3280921982777087, "grad_norm": 4.2224931716918945, "learning_rate": 5.539014764433592e-05, "loss": 2.5097467422485353, "memory(GiB)": 77.56, "step": 54340, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.438691 }, { "epoch": 2.3283064136069576, "grad_norm": 4.0915751457214355, "learning_rate": 5.538345704171527e-05, "loss": 1.945652961730957, "memory(GiB)": 77.56, "step": 54345, "token_acc": 0.5351170568561873, "train_speed(iter/s)": 1.438678 }, { "epoch": 2.328520628936207, "grad_norm": 5.094549655914307, "learning_rate": 5.537676634156827e-05, "loss": 2.4953020095825194, "memory(GiB)": 77.56, "step": 54350, "token_acc": 0.49696969696969695, "train_speed(iter/s)": 1.438678 }, { "epoch": 2.3287348442654556, "grad_norm": 4.522862911224365, "learning_rate": 5.537007554401608e-05, "loss": 2.5908123016357423, "memory(GiB)": 77.56, "step": 54355, "token_acc": 0.45734597156398105, "train_speed(iter/s)": 1.438708 }, { "epoch": 2.3289490595947044, "grad_norm": 5.012341499328613, "learning_rate": 5.536338464917995e-05, "loss": 2.46358699798584, "memory(GiB)": 77.56, "step": 54360, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.438717 }, { "epoch": 2.3291632749239537, "grad_norm": 5.816551208496094, "learning_rate": 5.535669365718107e-05, "loss": 2.1036087036132813, "memory(GiB)": 77.56, "step": 54365, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.438754 }, { "epoch": 2.3293774902532025, "grad_norm": 6.4626030921936035, "learning_rate": 5.535000256814066e-05, "loss": 2.3061901092529298, "memory(GiB)": 77.56, "step": 54370, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 1.438707 }, { "epoch": 2.3295917055824513, "grad_norm": 5.194098472595215, "learning_rate": 5.534331138217993e-05, "loss": 2.339299774169922, "memory(GiB)": 77.56, "step": 54375, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.438729 }, { "epoch": 2.3298059209117006, "grad_norm": 5.277906894683838, "learning_rate": 5.53366200994201e-05, "loss": 2.381866455078125, "memory(GiB)": 77.56, "step": 54380, "token_acc": 0.49693251533742333, "train_speed(iter/s)": 1.438709 }, { "epoch": 2.3300201362409494, "grad_norm": 4.18302583694458, "learning_rate": 5.532992871998239e-05, "loss": 2.344228744506836, "memory(GiB)": 77.56, "step": 54385, "token_acc": 0.5249169435215947, "train_speed(iter/s)": 1.438741 }, { "epoch": 2.330234351570198, "grad_norm": 4.742477893829346, "learning_rate": 5.532323724398804e-05, "loss": 2.58557243347168, "memory(GiB)": 77.56, "step": 54390, "token_acc": 0.4735099337748344, "train_speed(iter/s)": 1.438737 }, { "epoch": 2.3304485668994475, "grad_norm": 8.05929946899414, "learning_rate": 5.531654567155824e-05, "loss": 2.248756217956543, "memory(GiB)": 77.56, "step": 54395, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438754 }, { "epoch": 2.3306627822286963, "grad_norm": 3.982360601425171, "learning_rate": 5.530985400281422e-05, "loss": 2.4308753967285157, "memory(GiB)": 77.56, "step": 54400, "token_acc": 0.5016501650165016, "train_speed(iter/s)": 1.438787 }, { "epoch": 2.330876997557945, "grad_norm": 7.862496376037598, "learning_rate": 5.530316223787723e-05, "loss": 2.3568853378295898, "memory(GiB)": 77.56, "step": 54405, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.438724 }, { "epoch": 2.3310912128871943, "grad_norm": 6.096340179443359, "learning_rate": 5.529647037686847e-05, "loss": 2.3952186584472654, "memory(GiB)": 77.56, "step": 54410, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.438751 }, { "epoch": 2.331305428216443, "grad_norm": 5.931663990020752, "learning_rate": 5.528977841990919e-05, "loss": 2.5866153717041014, "memory(GiB)": 77.56, "step": 54415, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438757 }, { "epoch": 2.331519643545692, "grad_norm": 4.925633907318115, "learning_rate": 5.5283086367120594e-05, "loss": 2.492633819580078, "memory(GiB)": 77.56, "step": 54420, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.438751 }, { "epoch": 2.3317338588749412, "grad_norm": 4.742899417877197, "learning_rate": 5.527639421862395e-05, "loss": 2.5154991149902344, "memory(GiB)": 77.56, "step": 54425, "token_acc": 0.45695364238410596, "train_speed(iter/s)": 1.438777 }, { "epoch": 2.33194807420419, "grad_norm": 4.619522571563721, "learning_rate": 5.526970197454047e-05, "loss": 2.53272705078125, "memory(GiB)": 77.56, "step": 54430, "token_acc": 0.49848024316109424, "train_speed(iter/s)": 1.438776 }, { "epoch": 2.332162289533439, "grad_norm": 4.009924411773682, "learning_rate": 5.52630096349914e-05, "loss": 2.7802892684936524, "memory(GiB)": 77.56, "step": 54435, "token_acc": 0.44140625, "train_speed(iter/s)": 1.438774 }, { "epoch": 2.332376504862688, "grad_norm": 5.807552337646484, "learning_rate": 5.525631720009796e-05, "loss": 2.6330240249633787, "memory(GiB)": 77.56, "step": 54440, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.438798 }, { "epoch": 2.332590720191937, "grad_norm": 4.5297040939331055, "learning_rate": 5.524962466998142e-05, "loss": 2.5877969741821287, "memory(GiB)": 77.56, "step": 54445, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.438786 }, { "epoch": 2.3328049355211857, "grad_norm": 5.650115013122559, "learning_rate": 5.5242932044762994e-05, "loss": 2.4883726119995115, "memory(GiB)": 77.56, "step": 54450, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.438794 }, { "epoch": 2.333019150850435, "grad_norm": 5.331174850463867, "learning_rate": 5.523623932456394e-05, "loss": 2.5523576736450195, "memory(GiB)": 77.56, "step": 54455, "token_acc": 0.4855072463768116, "train_speed(iter/s)": 1.438801 }, { "epoch": 2.333233366179684, "grad_norm": 5.889141082763672, "learning_rate": 5.522954650950549e-05, "loss": 2.586617088317871, "memory(GiB)": 77.56, "step": 54460, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.438793 }, { "epoch": 2.3334475815089326, "grad_norm": 6.986037731170654, "learning_rate": 5.522285359970891e-05, "loss": 2.318703460693359, "memory(GiB)": 77.56, "step": 54465, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.438781 }, { "epoch": 2.333661796838182, "grad_norm": 5.666684150695801, "learning_rate": 5.521616059529543e-05, "loss": 2.489665985107422, "memory(GiB)": 77.56, "step": 54470, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.438814 }, { "epoch": 2.3338760121674307, "grad_norm": 4.9587788581848145, "learning_rate": 5.5209467496386315e-05, "loss": 2.844677543640137, "memory(GiB)": 77.56, "step": 54475, "token_acc": 0.4367469879518072, "train_speed(iter/s)": 1.438818 }, { "epoch": 2.3340902274966795, "grad_norm": 6.0969462394714355, "learning_rate": 5.5202774303102814e-05, "loss": 2.2558618545532227, "memory(GiB)": 77.56, "step": 54480, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.438814 }, { "epoch": 2.3343044428259287, "grad_norm": 5.694857120513916, "learning_rate": 5.519608101556616e-05, "loss": 2.535897636413574, "memory(GiB)": 77.56, "step": 54485, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.438826 }, { "epoch": 2.3345186581551776, "grad_norm": 5.296054840087891, "learning_rate": 5.518938763389764e-05, "loss": 2.5851299285888674, "memory(GiB)": 77.56, "step": 54490, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.438858 }, { "epoch": 2.3347328734844264, "grad_norm": 4.489163875579834, "learning_rate": 5.5182694158218476e-05, "loss": 2.463297462463379, "memory(GiB)": 77.56, "step": 54495, "token_acc": 0.498371335504886, "train_speed(iter/s)": 1.43885 }, { "epoch": 2.3349470888136756, "grad_norm": 5.169692039489746, "learning_rate": 5.5176000588649946e-05, "loss": 2.5896942138671877, "memory(GiB)": 77.56, "step": 54500, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.438851 }, { "epoch": 2.3349470888136756, "eval_loss": 2.114393472671509, "eval_runtime": 14.5742, "eval_samples_per_second": 6.861, "eval_steps_per_second": 6.861, "eval_token_acc": 0.49039692701664533, "step": 54500 }, { "epoch": 2.3351613041429244, "grad_norm": 5.3127522468566895, "learning_rate": 5.5169306925313324e-05, "loss": 2.268874168395996, "memory(GiB)": 77.56, "step": 54505, "token_acc": 0.49150943396226415, "train_speed(iter/s)": 1.438283 }, { "epoch": 2.3353755194721733, "grad_norm": 6.650249004364014, "learning_rate": 5.5162613168329856e-05, "loss": 2.3926322937011717, "memory(GiB)": 77.56, "step": 54510, "token_acc": 0.5160142348754448, "train_speed(iter/s)": 1.438293 }, { "epoch": 2.3355897348014225, "grad_norm": 5.038234710693359, "learning_rate": 5.5155919317820806e-05, "loss": 2.2601802825927733, "memory(GiB)": 77.56, "step": 54515, "token_acc": 0.5275590551181102, "train_speed(iter/s)": 1.43826 }, { "epoch": 2.3358039501306713, "grad_norm": 5.188998222351074, "learning_rate": 5.514922537390744e-05, "loss": 2.011192512512207, "memory(GiB)": 77.56, "step": 54520, "token_acc": 0.5362318840579711, "train_speed(iter/s)": 1.438245 }, { "epoch": 2.33601816545992, "grad_norm": 4.637081146240234, "learning_rate": 5.5142531336711034e-05, "loss": 2.211981773376465, "memory(GiB)": 77.56, "step": 54525, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438247 }, { "epoch": 2.3362323807891694, "grad_norm": 5.339656829833984, "learning_rate": 5.513583720635283e-05, "loss": 2.3581382751464846, "memory(GiB)": 77.56, "step": 54530, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.43827 }, { "epoch": 2.336446596118418, "grad_norm": 10.428476333618164, "learning_rate": 5.512914298295413e-05, "loss": 2.3907444000244142, "memory(GiB)": 77.56, "step": 54535, "token_acc": 0.5033333333333333, "train_speed(iter/s)": 1.438244 }, { "epoch": 2.336660811447667, "grad_norm": 4.768646717071533, "learning_rate": 5.51224486666362e-05, "loss": 2.534077262878418, "memory(GiB)": 77.56, "step": 54540, "token_acc": 0.4837758112094395, "train_speed(iter/s)": 1.438268 }, { "epoch": 2.3368750267769163, "grad_norm": 5.592228412628174, "learning_rate": 5.511575425752029e-05, "loss": 2.669137954711914, "memory(GiB)": 77.56, "step": 54545, "token_acc": 0.43272727272727274, "train_speed(iter/s)": 1.438253 }, { "epoch": 2.337089242106165, "grad_norm": 5.471279144287109, "learning_rate": 5.510905975572771e-05, "loss": 2.3823238372802735, "memory(GiB)": 77.56, "step": 54550, "token_acc": 0.5031847133757962, "train_speed(iter/s)": 1.438215 }, { "epoch": 2.337303457435414, "grad_norm": 6.329859256744385, "learning_rate": 5.51023651613797e-05, "loss": 2.399275779724121, "memory(GiB)": 77.56, "step": 54555, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.438215 }, { "epoch": 2.337517672764663, "grad_norm": 5.307493686676025, "learning_rate": 5.5095670474597574e-05, "loss": 2.191426658630371, "memory(GiB)": 77.56, "step": 54560, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.43823 }, { "epoch": 2.337731888093912, "grad_norm": 5.246840000152588, "learning_rate": 5.508897569550259e-05, "loss": 2.470229911804199, "memory(GiB)": 77.56, "step": 54565, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.438222 }, { "epoch": 2.337946103423161, "grad_norm": 6.284454822540283, "learning_rate": 5.5082280824216046e-05, "loss": 2.5420969009399412, "memory(GiB)": 77.56, "step": 54570, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.438217 }, { "epoch": 2.33816031875241, "grad_norm": 5.767074108123779, "learning_rate": 5.5075585860859206e-05, "loss": 2.346650505065918, "memory(GiB)": 77.56, "step": 54575, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.438231 }, { "epoch": 2.338374534081659, "grad_norm": 5.003377437591553, "learning_rate": 5.5068890805553374e-05, "loss": 2.387823486328125, "memory(GiB)": 77.56, "step": 54580, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.438271 }, { "epoch": 2.3385887494109077, "grad_norm": 6.539522171020508, "learning_rate": 5.506219565841983e-05, "loss": 2.4285747528076174, "memory(GiB)": 77.56, "step": 54585, "token_acc": 0.46184738955823296, "train_speed(iter/s)": 1.438274 }, { "epoch": 2.338802964740157, "grad_norm": 4.871034622192383, "learning_rate": 5.5055500419579866e-05, "loss": 2.1912952423095704, "memory(GiB)": 77.56, "step": 54590, "token_acc": 0.5412186379928315, "train_speed(iter/s)": 1.438295 }, { "epoch": 2.3390171800694057, "grad_norm": 4.646417617797852, "learning_rate": 5.5048805089154765e-05, "loss": 2.1723554611206053, "memory(GiB)": 77.56, "step": 54595, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.438293 }, { "epoch": 2.3392313953986545, "grad_norm": 5.2171783447265625, "learning_rate": 5.504210966726582e-05, "loss": 2.253154182434082, "memory(GiB)": 77.56, "step": 54600, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.438277 }, { "epoch": 2.339445610727904, "grad_norm": 5.858293056488037, "learning_rate": 5.503541415403433e-05, "loss": 2.4629806518554687, "memory(GiB)": 77.56, "step": 54605, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.438261 }, { "epoch": 2.3396598260571526, "grad_norm": 5.127858638763428, "learning_rate": 5.502871854958159e-05, "loss": 2.4169069290161134, "memory(GiB)": 77.56, "step": 54610, "token_acc": 0.4896755162241888, "train_speed(iter/s)": 1.438255 }, { "epoch": 2.3398740413864014, "grad_norm": 5.172540187835693, "learning_rate": 5.502202285402889e-05, "loss": 2.552465057373047, "memory(GiB)": 77.56, "step": 54615, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.438273 }, { "epoch": 2.3400882567156507, "grad_norm": 5.092914581298828, "learning_rate": 5.5015327067497544e-05, "loss": 2.1506404876708984, "memory(GiB)": 77.56, "step": 54620, "token_acc": 0.5308641975308642, "train_speed(iter/s)": 1.438274 }, { "epoch": 2.3403024720448995, "grad_norm": 5.406525135040283, "learning_rate": 5.500863119010885e-05, "loss": 2.4598031997680665, "memory(GiB)": 77.56, "step": 54625, "token_acc": 0.4608150470219436, "train_speed(iter/s)": 1.438277 }, { "epoch": 2.3405166873741483, "grad_norm": 5.842250347137451, "learning_rate": 5.500193522198409e-05, "loss": 2.4525663375854494, "memory(GiB)": 77.56, "step": 54630, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438292 }, { "epoch": 2.3407309027033976, "grad_norm": 8.250039100646973, "learning_rate": 5.499523916324459e-05, "loss": 2.4933700561523438, "memory(GiB)": 77.56, "step": 54635, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.438278 }, { "epoch": 2.3409451180326464, "grad_norm": 5.769161224365234, "learning_rate": 5.498854301401164e-05, "loss": 2.7864315032958986, "memory(GiB)": 77.56, "step": 54640, "token_acc": 0.4157303370786517, "train_speed(iter/s)": 1.438304 }, { "epoch": 2.341159333361895, "grad_norm": 5.727543354034424, "learning_rate": 5.4981846774406555e-05, "loss": 2.553101348876953, "memory(GiB)": 77.56, "step": 54645, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438319 }, { "epoch": 2.3413735486911444, "grad_norm": 6.9831767082214355, "learning_rate": 5.497515044455065e-05, "loss": 2.28961296081543, "memory(GiB)": 77.56, "step": 54650, "token_acc": 0.5202492211838006, "train_speed(iter/s)": 1.438345 }, { "epoch": 2.3415877640203933, "grad_norm": 4.3163161277771, "learning_rate": 5.496845402456522e-05, "loss": 2.5322153091430666, "memory(GiB)": 77.56, "step": 54655, "token_acc": 0.42950819672131146, "train_speed(iter/s)": 1.438369 }, { "epoch": 2.341801979349642, "grad_norm": 8.255607604980469, "learning_rate": 5.496175751457159e-05, "loss": 2.5266948699951173, "memory(GiB)": 77.56, "step": 54660, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.438341 }, { "epoch": 2.3420161946788913, "grad_norm": 5.314919471740723, "learning_rate": 5.495506091469106e-05, "loss": 2.6604766845703125, "memory(GiB)": 77.56, "step": 54665, "token_acc": 0.4690909090909091, "train_speed(iter/s)": 1.438367 }, { "epoch": 2.34223041000814, "grad_norm": 4.820956707000732, "learning_rate": 5.494836422504497e-05, "loss": 2.361622619628906, "memory(GiB)": 77.56, "step": 54670, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.438382 }, { "epoch": 2.342444625337389, "grad_norm": 6.008187294006348, "learning_rate": 5.49416674457546e-05, "loss": 2.5905376434326173, "memory(GiB)": 77.56, "step": 54675, "token_acc": 0.4479768786127168, "train_speed(iter/s)": 1.438377 }, { "epoch": 2.342658840666638, "grad_norm": 5.974514484405518, "learning_rate": 5.493497057694129e-05, "loss": 2.441915512084961, "memory(GiB)": 77.56, "step": 54680, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.438399 }, { "epoch": 2.342873055995887, "grad_norm": 5.400164604187012, "learning_rate": 5.492827361872637e-05, "loss": 2.4609085083007813, "memory(GiB)": 77.56, "step": 54685, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.438426 }, { "epoch": 2.343087271325136, "grad_norm": 7.050365924835205, "learning_rate": 5.492157657123114e-05, "loss": 2.0577001571655273, "memory(GiB)": 77.56, "step": 54690, "token_acc": 0.5298013245033113, "train_speed(iter/s)": 1.43839 }, { "epoch": 2.343301486654385, "grad_norm": 5.239218235015869, "learning_rate": 5.4914879434576936e-05, "loss": 2.534061813354492, "memory(GiB)": 77.56, "step": 54695, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.438404 }, { "epoch": 2.343515701983634, "grad_norm": 6.294853210449219, "learning_rate": 5.490818220888508e-05, "loss": 2.6373470306396483, "memory(GiB)": 77.56, "step": 54700, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.438413 }, { "epoch": 2.3437299173128827, "grad_norm": 7.120141983032227, "learning_rate": 5.4901484894276914e-05, "loss": 2.2757015228271484, "memory(GiB)": 77.56, "step": 54705, "token_acc": 0.5148148148148148, "train_speed(iter/s)": 1.43841 }, { "epoch": 2.343944132642132, "grad_norm": 7.0399088859558105, "learning_rate": 5.4894787490873746e-05, "loss": 2.6073095321655275, "memory(GiB)": 77.56, "step": 54710, "token_acc": 0.43703703703703706, "train_speed(iter/s)": 1.438411 }, { "epoch": 2.344158347971381, "grad_norm": 5.254524230957031, "learning_rate": 5.48880899987969e-05, "loss": 2.7771066665649413, "memory(GiB)": 77.56, "step": 54715, "token_acc": 0.43462897526501765, "train_speed(iter/s)": 1.438452 }, { "epoch": 2.34437256330063, "grad_norm": 5.308459758758545, "learning_rate": 5.488139241816771e-05, "loss": 2.1082611083984375, "memory(GiB)": 77.56, "step": 54720, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.438438 }, { "epoch": 2.344586778629879, "grad_norm": 4.598628044128418, "learning_rate": 5.487469474910754e-05, "loss": 2.3201196670532225, "memory(GiB)": 77.56, "step": 54725, "token_acc": 0.4779116465863454, "train_speed(iter/s)": 1.438463 }, { "epoch": 2.3448009939591277, "grad_norm": 4.571203708648682, "learning_rate": 5.486799699173768e-05, "loss": 2.4418289184570314, "memory(GiB)": 77.56, "step": 54730, "token_acc": 0.5101449275362319, "train_speed(iter/s)": 1.438482 }, { "epoch": 2.345015209288377, "grad_norm": 4.999051094055176, "learning_rate": 5.486129914617951e-05, "loss": 2.2475542068481444, "memory(GiB)": 77.56, "step": 54735, "token_acc": 0.5345911949685535, "train_speed(iter/s)": 1.438503 }, { "epoch": 2.3452294246176257, "grad_norm": 6.978096008300781, "learning_rate": 5.485460121255434e-05, "loss": 2.308015823364258, "memory(GiB)": 77.56, "step": 54740, "token_acc": 0.5214285714285715, "train_speed(iter/s)": 1.438522 }, { "epoch": 2.3454436399468745, "grad_norm": 5.965931415557861, "learning_rate": 5.484790319098351e-05, "loss": 2.616992378234863, "memory(GiB)": 77.56, "step": 54745, "token_acc": 0.4734848484848485, "train_speed(iter/s)": 1.438508 }, { "epoch": 2.345657855276124, "grad_norm": 4.755771160125732, "learning_rate": 5.4841205081588354e-05, "loss": 2.4398662567138674, "memory(GiB)": 77.56, "step": 54750, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.438535 }, { "epoch": 2.3458720706053726, "grad_norm": 4.0919928550720215, "learning_rate": 5.483450688449024e-05, "loss": 2.4105281829833984, "memory(GiB)": 77.56, "step": 54755, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.438543 }, { "epoch": 2.3460862859346214, "grad_norm": 5.815858840942383, "learning_rate": 5.482780859981049e-05, "loss": 2.9770795822143556, "memory(GiB)": 77.56, "step": 54760, "token_acc": 0.41002949852507375, "train_speed(iter/s)": 1.438565 }, { "epoch": 2.3463005012638707, "grad_norm": 4.6366190910339355, "learning_rate": 5.482111022767048e-05, "loss": 2.343940544128418, "memory(GiB)": 77.56, "step": 54765, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.438557 }, { "epoch": 2.3465147165931195, "grad_norm": 5.3660759925842285, "learning_rate": 5.481441176819152e-05, "loss": 2.5082046508789064, "memory(GiB)": 77.56, "step": 54770, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.438553 }, { "epoch": 2.3467289319223683, "grad_norm": 5.451185703277588, "learning_rate": 5.480771322149497e-05, "loss": 2.2944231033325195, "memory(GiB)": 77.56, "step": 54775, "token_acc": 0.501628664495114, "train_speed(iter/s)": 1.438558 }, { "epoch": 2.3469431472516176, "grad_norm": 6.541532516479492, "learning_rate": 5.480101458770221e-05, "loss": 2.389505386352539, "memory(GiB)": 77.56, "step": 54780, "token_acc": 0.4481707317073171, "train_speed(iter/s)": 1.438567 }, { "epoch": 2.3471573625808664, "grad_norm": 6.510070323944092, "learning_rate": 5.479431586693454e-05, "loss": 2.7133731842041016, "memory(GiB)": 77.56, "step": 54785, "token_acc": 0.4397163120567376, "train_speed(iter/s)": 1.438582 }, { "epoch": 2.347371577910115, "grad_norm": 4.8781914710998535, "learning_rate": 5.478761705931334e-05, "loss": 2.652548599243164, "memory(GiB)": 77.56, "step": 54790, "token_acc": 0.4309210526315789, "train_speed(iter/s)": 1.438604 }, { "epoch": 2.3475857932393644, "grad_norm": 5.47180700302124, "learning_rate": 5.478091816495997e-05, "loss": 2.564466094970703, "memory(GiB)": 77.56, "step": 54795, "token_acc": 0.453125, "train_speed(iter/s)": 1.43863 }, { "epoch": 2.3478000085686133, "grad_norm": 5.245362281799316, "learning_rate": 5.4774219183995804e-05, "loss": 2.5847213745117186, "memory(GiB)": 77.56, "step": 54800, "token_acc": 0.4656084656084656, "train_speed(iter/s)": 1.438611 }, { "epoch": 2.348014223897862, "grad_norm": 5.536609649658203, "learning_rate": 5.476752011654215e-05, "loss": 2.6254127502441404, "memory(GiB)": 77.56, "step": 54805, "token_acc": 0.4153225806451613, "train_speed(iter/s)": 1.438651 }, { "epoch": 2.3482284392271113, "grad_norm": 5.326805114746094, "learning_rate": 5.4760820962720416e-05, "loss": 2.354401206970215, "memory(GiB)": 77.56, "step": 54810, "token_acc": 0.4491525423728814, "train_speed(iter/s)": 1.438637 }, { "epoch": 2.34844265455636, "grad_norm": 5.349968910217285, "learning_rate": 5.475412172265193e-05, "loss": 2.3771642684936523, "memory(GiB)": 77.56, "step": 54815, "token_acc": 0.45791245791245794, "train_speed(iter/s)": 1.438645 }, { "epoch": 2.348656869885609, "grad_norm": 6.083472728729248, "learning_rate": 5.4747422396458085e-05, "loss": 2.7385623931884764, "memory(GiB)": 77.56, "step": 54820, "token_acc": 0.4509090909090909, "train_speed(iter/s)": 1.438638 }, { "epoch": 2.348871085214858, "grad_norm": 4.6936774253845215, "learning_rate": 5.4740722984260216e-05, "loss": 2.223306655883789, "memory(GiB)": 77.56, "step": 54825, "token_acc": 0.5431654676258992, "train_speed(iter/s)": 1.438631 }, { "epoch": 2.349085300544107, "grad_norm": 5.431412696838379, "learning_rate": 5.473402348617971e-05, "loss": 2.5443208694458006, "memory(GiB)": 77.56, "step": 54830, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.43864 }, { "epoch": 2.349299515873356, "grad_norm": 6.527918338775635, "learning_rate": 5.472732390233792e-05, "loss": 2.254547119140625, "memory(GiB)": 77.56, "step": 54835, "token_acc": 0.4778481012658228, "train_speed(iter/s)": 1.438663 }, { "epoch": 2.349513731202605, "grad_norm": 5.981904983520508, "learning_rate": 5.472062423285623e-05, "loss": 2.742825508117676, "memory(GiB)": 77.56, "step": 54840, "token_acc": 0.4669260700389105, "train_speed(iter/s)": 1.438626 }, { "epoch": 2.349727946531854, "grad_norm": 5.265769004821777, "learning_rate": 5.4713924477856006e-05, "loss": 2.4999710083007813, "memory(GiB)": 77.56, "step": 54845, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.438651 }, { "epoch": 2.3499421618611027, "grad_norm": 4.826220512390137, "learning_rate": 5.470722463745862e-05, "loss": 2.300009536743164, "memory(GiB)": 77.56, "step": 54850, "token_acc": 0.575091575091575, "train_speed(iter/s)": 1.438683 }, { "epoch": 2.350156377190352, "grad_norm": 4.447829723358154, "learning_rate": 5.4700524711785436e-05, "loss": 2.40640869140625, "memory(GiB)": 77.56, "step": 54855, "token_acc": 0.48036253776435045, "train_speed(iter/s)": 1.438659 }, { "epoch": 2.350370592519601, "grad_norm": 6.051206588745117, "learning_rate": 5.469382470095784e-05, "loss": 2.362230682373047, "memory(GiB)": 77.56, "step": 54860, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.438662 }, { "epoch": 2.3505848078488496, "grad_norm": 5.714086055755615, "learning_rate": 5.4687124605097196e-05, "loss": 2.525142478942871, "memory(GiB)": 77.56, "step": 54865, "token_acc": 0.4604519774011299, "train_speed(iter/s)": 1.438648 }, { "epoch": 2.350799023178099, "grad_norm": 6.7206010818481445, "learning_rate": 5.4680424424324914e-05, "loss": 2.4359636306762695, "memory(GiB)": 77.56, "step": 54870, "token_acc": 0.46496815286624205, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.3510132385073477, "grad_norm": 7.983884334564209, "learning_rate": 5.467372415876233e-05, "loss": 2.737460708618164, "memory(GiB)": 77.56, "step": 54875, "token_acc": 0.44086021505376344, "train_speed(iter/s)": 1.438676 }, { "epoch": 2.3512274538365965, "grad_norm": 5.137965202331543, "learning_rate": 5.466702380853087e-05, "loss": 2.4100505828857424, "memory(GiB)": 77.56, "step": 54880, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.438685 }, { "epoch": 2.3514416691658457, "grad_norm": 7.0209808349609375, "learning_rate": 5.4660323373751884e-05, "loss": 2.6817886352539064, "memory(GiB)": 77.56, "step": 54885, "token_acc": 0.4472573839662447, "train_speed(iter/s)": 1.438693 }, { "epoch": 2.3516558844950946, "grad_norm": 6.822183132171631, "learning_rate": 5.465362285454677e-05, "loss": 2.4985605239868165, "memory(GiB)": 77.56, "step": 54890, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.438725 }, { "epoch": 2.3518700998243434, "grad_norm": 7.360143184661865, "learning_rate": 5.4646922251036924e-05, "loss": 2.506234359741211, "memory(GiB)": 77.56, "step": 54895, "token_acc": 0.4767932489451477, "train_speed(iter/s)": 1.438694 }, { "epoch": 2.3520843151535926, "grad_norm": 5.007196426391602, "learning_rate": 5.464022156334371e-05, "loss": 2.3190681457519533, "memory(GiB)": 77.56, "step": 54900, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.4387 }, { "epoch": 2.3522985304828414, "grad_norm": 3.920491933822632, "learning_rate": 5.463352079158852e-05, "loss": 2.384654426574707, "memory(GiB)": 77.56, "step": 54905, "token_acc": 0.5132450331125827, "train_speed(iter/s)": 1.438695 }, { "epoch": 2.3525127458120902, "grad_norm": 5.728605270385742, "learning_rate": 5.462681993589277e-05, "loss": 2.4840839385986326, "memory(GiB)": 77.56, "step": 54910, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.438671 }, { "epoch": 2.3527269611413395, "grad_norm": 5.6435136795043945, "learning_rate": 5.462011899637783e-05, "loss": 2.40546875, "memory(GiB)": 77.56, "step": 54915, "token_acc": 0.47896440129449835, "train_speed(iter/s)": 1.438691 }, { "epoch": 2.3529411764705883, "grad_norm": 6.231410980224609, "learning_rate": 5.4613417973165106e-05, "loss": 2.3908317565917967, "memory(GiB)": 77.56, "step": 54920, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.438676 }, { "epoch": 2.353155391799837, "grad_norm": 5.761892795562744, "learning_rate": 5.4606716866375985e-05, "loss": 2.661233329772949, "memory(GiB)": 77.56, "step": 54925, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.438696 }, { "epoch": 2.3533696071290864, "grad_norm": 9.214360237121582, "learning_rate": 5.460001567613188e-05, "loss": 2.558022880554199, "memory(GiB)": 77.56, "step": 54930, "token_acc": 0.49836065573770494, "train_speed(iter/s)": 1.438717 }, { "epoch": 2.353583822458335, "grad_norm": 4.837754249572754, "learning_rate": 5.459331440255414e-05, "loss": 2.489152526855469, "memory(GiB)": 77.56, "step": 54935, "token_acc": 0.4554140127388535, "train_speed(iter/s)": 1.438706 }, { "epoch": 2.353798037787584, "grad_norm": 6.567314624786377, "learning_rate": 5.458661304576422e-05, "loss": 2.268810272216797, "memory(GiB)": 77.56, "step": 54940, "token_acc": 0.5064516129032258, "train_speed(iter/s)": 1.438703 }, { "epoch": 2.3540122531168333, "grad_norm": 6.083878040313721, "learning_rate": 5.457991160588351e-05, "loss": 2.4191219329833986, "memory(GiB)": 77.56, "step": 54945, "token_acc": 0.5060728744939271, "train_speed(iter/s)": 1.438708 }, { "epoch": 2.354226468446082, "grad_norm": 7.360652923583984, "learning_rate": 5.45732100830334e-05, "loss": 2.5673065185546875, "memory(GiB)": 77.56, "step": 54950, "token_acc": 0.45396825396825397, "train_speed(iter/s)": 1.438694 }, { "epoch": 2.354440683775331, "grad_norm": 5.104067325592041, "learning_rate": 5.4566508477335296e-05, "loss": 2.730603790283203, "memory(GiB)": 77.56, "step": 54955, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.438721 }, { "epoch": 2.35465489910458, "grad_norm": 4.436124801635742, "learning_rate": 5.4559806788910606e-05, "loss": 2.2991706848144533, "memory(GiB)": 77.56, "step": 54960, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.438722 }, { "epoch": 2.354869114433829, "grad_norm": 5.377289295196533, "learning_rate": 5.455310501788075e-05, "loss": 2.3624456405639647, "memory(GiB)": 77.56, "step": 54965, "token_acc": 0.5259259259259259, "train_speed(iter/s)": 1.438744 }, { "epoch": 2.3550833297630778, "grad_norm": 4.002988815307617, "learning_rate": 5.454640316436712e-05, "loss": 2.536465644836426, "memory(GiB)": 77.56, "step": 54970, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.438719 }, { "epoch": 2.355297545092327, "grad_norm": 5.537210464477539, "learning_rate": 5.453970122849113e-05, "loss": 2.722956657409668, "memory(GiB)": 77.56, "step": 54975, "token_acc": 0.453416149068323, "train_speed(iter/s)": 1.438715 }, { "epoch": 2.355511760421576, "grad_norm": 5.738343238830566, "learning_rate": 5.453299921037418e-05, "loss": 2.351471710205078, "memory(GiB)": 77.56, "step": 54980, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.438726 }, { "epoch": 2.3557259757508247, "grad_norm": 5.757815837860107, "learning_rate": 5.452629711013773e-05, "loss": 2.2772943496704103, "memory(GiB)": 77.56, "step": 54985, "token_acc": 0.504950495049505, "train_speed(iter/s)": 1.438709 }, { "epoch": 2.355940191080074, "grad_norm": 5.990734100341797, "learning_rate": 5.451959492790313e-05, "loss": 2.5603607177734373, "memory(GiB)": 77.56, "step": 54990, "token_acc": 0.4342857142857143, "train_speed(iter/s)": 1.438716 }, { "epoch": 2.3561544064093227, "grad_norm": 5.451921463012695, "learning_rate": 5.451289266379184e-05, "loss": 2.5363142013549806, "memory(GiB)": 77.56, "step": 54995, "token_acc": 0.4295774647887324, "train_speed(iter/s)": 1.438737 }, { "epoch": 2.3563686217385715, "grad_norm": 4.374578952789307, "learning_rate": 5.4506190317925275e-05, "loss": 2.2245977401733397, "memory(GiB)": 77.56, "step": 55000, "token_acc": 0.5340501792114696, "train_speed(iter/s)": 1.438748 }, { "epoch": 2.3563686217385715, "eval_loss": 2.2667043209075928, "eval_runtime": 13.8579, "eval_samples_per_second": 7.216, "eval_steps_per_second": 7.216, "eval_token_acc": 0.4754335260115607, "step": 55000 }, { "epoch": 2.356582837067821, "grad_norm": 6.08438777923584, "learning_rate": 5.4499487890424826e-05, "loss": 2.456856918334961, "memory(GiB)": 77.56, "step": 55005, "token_acc": 0.47685683530678147, "train_speed(iter/s)": 1.438183 }, { "epoch": 2.3567970523970696, "grad_norm": 4.270294666290283, "learning_rate": 5.4492785381411936e-05, "loss": 2.2999061584472655, "memory(GiB)": 77.56, "step": 55010, "token_acc": 0.5387205387205387, "train_speed(iter/s)": 1.438205 }, { "epoch": 2.3570112677263184, "grad_norm": 4.787764549255371, "learning_rate": 5.4486082791008044e-05, "loss": 2.6687618255615235, "memory(GiB)": 77.56, "step": 55015, "token_acc": 0.47854785478547857, "train_speed(iter/s)": 1.438226 }, { "epoch": 2.3572254830555677, "grad_norm": 7.941731929779053, "learning_rate": 5.447938011933453e-05, "loss": 2.556752014160156, "memory(GiB)": 77.56, "step": 55020, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.438239 }, { "epoch": 2.3574396983848165, "grad_norm": 4.8594512939453125, "learning_rate": 5.447267736651286e-05, "loss": 2.569650650024414, "memory(GiB)": 77.56, "step": 55025, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.438226 }, { "epoch": 2.3576539137140653, "grad_norm": 4.816797733306885, "learning_rate": 5.446597453266444e-05, "loss": 2.287433052062988, "memory(GiB)": 77.56, "step": 55030, "token_acc": 0.4954682779456193, "train_speed(iter/s)": 1.438228 }, { "epoch": 2.3578681290433146, "grad_norm": 6.369913578033447, "learning_rate": 5.44592716179107e-05, "loss": 2.4946224212646486, "memory(GiB)": 77.56, "step": 55035, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.438241 }, { "epoch": 2.3580823443725634, "grad_norm": 5.064686298370361, "learning_rate": 5.445256862237308e-05, "loss": 2.3456184387207033, "memory(GiB)": 77.56, "step": 55040, "token_acc": 0.5278810408921933, "train_speed(iter/s)": 1.438264 }, { "epoch": 2.358296559701812, "grad_norm": 5.750959396362305, "learning_rate": 5.4445865546172995e-05, "loss": 2.4515045166015623, "memory(GiB)": 77.56, "step": 55045, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.438268 }, { "epoch": 2.3585107750310614, "grad_norm": 6.908394813537598, "learning_rate": 5.4439162389431876e-05, "loss": 2.460306167602539, "memory(GiB)": 77.56, "step": 55050, "token_acc": 0.49127906976744184, "train_speed(iter/s)": 1.438269 }, { "epoch": 2.3587249903603102, "grad_norm": 6.866858959197998, "learning_rate": 5.443245915227117e-05, "loss": 2.4044116973876952, "memory(GiB)": 77.56, "step": 55055, "token_acc": 0.5171339563862928, "train_speed(iter/s)": 1.438286 }, { "epoch": 2.358939205689559, "grad_norm": 5.466826915740967, "learning_rate": 5.442575583481232e-05, "loss": 2.015051078796387, "memory(GiB)": 77.56, "step": 55060, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.438327 }, { "epoch": 2.3591534210188083, "grad_norm": 5.861602306365967, "learning_rate": 5.441905243717674e-05, "loss": 2.225111961364746, "memory(GiB)": 77.56, "step": 55065, "token_acc": 0.49514563106796117, "train_speed(iter/s)": 1.438302 }, { "epoch": 2.359367636348057, "grad_norm": 5.2651824951171875, "learning_rate": 5.4412348959485894e-05, "loss": 2.3751714706420897, "memory(GiB)": 77.56, "step": 55070, "token_acc": 0.4554794520547945, "train_speed(iter/s)": 1.438261 }, { "epoch": 2.359581851677306, "grad_norm": 7.595268726348877, "learning_rate": 5.4405645401861205e-05, "loss": 2.487693977355957, "memory(GiB)": 77.56, "step": 55075, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438275 }, { "epoch": 2.359796067006555, "grad_norm": 4.716050148010254, "learning_rate": 5.439894176442409e-05, "loss": 2.337527275085449, "memory(GiB)": 77.56, "step": 55080, "token_acc": 0.4942857142857143, "train_speed(iter/s)": 1.438258 }, { "epoch": 2.360010282335804, "grad_norm": 6.211947917938232, "learning_rate": 5.439223804729604e-05, "loss": 2.431791877746582, "memory(GiB)": 77.56, "step": 55085, "token_acc": 0.5, "train_speed(iter/s)": 1.438287 }, { "epoch": 2.360224497665053, "grad_norm": 5.643962383270264, "learning_rate": 5.438553425059848e-05, "loss": 2.328583526611328, "memory(GiB)": 77.56, "step": 55090, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.438296 }, { "epoch": 2.360438712994302, "grad_norm": 4.780154228210449, "learning_rate": 5.437883037445284e-05, "loss": 2.3667251586914064, "memory(GiB)": 77.56, "step": 55095, "token_acc": 0.5, "train_speed(iter/s)": 1.438292 }, { "epoch": 2.360652928323551, "grad_norm": 5.0685954093933105, "learning_rate": 5.43721264189806e-05, "loss": 2.3226537704467773, "memory(GiB)": 77.56, "step": 55100, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.438266 }, { "epoch": 2.3608671436527997, "grad_norm": 4.993478298187256, "learning_rate": 5.4365422384303164e-05, "loss": 2.565767288208008, "memory(GiB)": 77.56, "step": 55105, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.438273 }, { "epoch": 2.361081358982049, "grad_norm": 4.918037414550781, "learning_rate": 5.435871827054203e-05, "loss": 2.5779842376708983, "memory(GiB)": 77.56, "step": 55110, "token_acc": 0.4937888198757764, "train_speed(iter/s)": 1.438292 }, { "epoch": 2.3612955743112978, "grad_norm": 5.096170425415039, "learning_rate": 5.435201407781863e-05, "loss": 2.371634292602539, "memory(GiB)": 77.56, "step": 55115, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.438309 }, { "epoch": 2.3615097896405466, "grad_norm": 8.549903869628906, "learning_rate": 5.434530980625438e-05, "loss": 2.2073726654052734, "memory(GiB)": 77.56, "step": 55120, "token_acc": 0.5314685314685315, "train_speed(iter/s)": 1.438341 }, { "epoch": 2.361724004969796, "grad_norm": 5.753121852874756, "learning_rate": 5.433860545597078e-05, "loss": 2.4982673645019533, "memory(GiB)": 77.56, "step": 55125, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.438352 }, { "epoch": 2.3619382202990447, "grad_norm": 4.9193291664123535, "learning_rate": 5.433190102708928e-05, "loss": 2.4230201721191404, "memory(GiB)": 77.56, "step": 55130, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.438359 }, { "epoch": 2.3621524356282935, "grad_norm": 6.801519870758057, "learning_rate": 5.432519651973133e-05, "loss": 2.428292083740234, "memory(GiB)": 77.56, "step": 55135, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.43836 }, { "epoch": 2.3623666509575427, "grad_norm": 5.328106880187988, "learning_rate": 5.431849193401838e-05, "loss": 2.5010623931884766, "memory(GiB)": 77.56, "step": 55140, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.438357 }, { "epoch": 2.3625808662867915, "grad_norm": 7.334033489227295, "learning_rate": 5.431178727007191e-05, "loss": 2.5254135131835938, "memory(GiB)": 77.56, "step": 55145, "token_acc": 0.496, "train_speed(iter/s)": 1.438365 }, { "epoch": 2.3627950816160403, "grad_norm": 5.021880149841309, "learning_rate": 5.430508252801335e-05, "loss": 2.393972396850586, "memory(GiB)": 77.56, "step": 55150, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.438377 }, { "epoch": 2.3630092969452896, "grad_norm": 5.0176239013671875, "learning_rate": 5.42983777079642e-05, "loss": 2.3331554412841795, "memory(GiB)": 77.56, "step": 55155, "token_acc": 0.4534412955465587, "train_speed(iter/s)": 1.438348 }, { "epoch": 2.3632235122745384, "grad_norm": 6.08544397354126, "learning_rate": 5.429167281004589e-05, "loss": 2.344183921813965, "memory(GiB)": 77.56, "step": 55160, "token_acc": 0.5, "train_speed(iter/s)": 1.438356 }, { "epoch": 2.3634377276037872, "grad_norm": 5.9323296546936035, "learning_rate": 5.42849678343799e-05, "loss": 2.5026529312133787, "memory(GiB)": 77.56, "step": 55165, "token_acc": 0.4746376811594203, "train_speed(iter/s)": 1.438356 }, { "epoch": 2.3636519429330365, "grad_norm": 4.6031060218811035, "learning_rate": 5.4278262781087706e-05, "loss": 2.4346088409423827, "memory(GiB)": 77.56, "step": 55170, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.438369 }, { "epoch": 2.3638661582622853, "grad_norm": 4.812386989593506, "learning_rate": 5.427155765029078e-05, "loss": 2.3574138641357423, "memory(GiB)": 77.56, "step": 55175, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.438357 }, { "epoch": 2.364080373591534, "grad_norm": 5.376172065734863, "learning_rate": 5.426485244211056e-05, "loss": 2.628916931152344, "memory(GiB)": 77.56, "step": 55180, "token_acc": 0.4375, "train_speed(iter/s)": 1.438367 }, { "epoch": 2.3642945889207834, "grad_norm": 5.451033115386963, "learning_rate": 5.4258147156668545e-05, "loss": 2.5946929931640623, "memory(GiB)": 77.56, "step": 55185, "token_acc": 0.4735202492211838, "train_speed(iter/s)": 1.43835 }, { "epoch": 2.364508804250032, "grad_norm": 5.982585430145264, "learning_rate": 5.425144179408621e-05, "loss": 2.4256092071533204, "memory(GiB)": 77.56, "step": 55190, "token_acc": 0.46905537459283386, "train_speed(iter/s)": 1.438354 }, { "epoch": 2.364723019579281, "grad_norm": 4.28377103805542, "learning_rate": 5.4244736354484994e-05, "loss": 2.2783056259155274, "memory(GiB)": 77.56, "step": 55195, "token_acc": 0.5299401197604791, "train_speed(iter/s)": 1.438349 }, { "epoch": 2.3649372349085303, "grad_norm": 5.632835388183594, "learning_rate": 5.423803083798642e-05, "loss": 2.5441762924194338, "memory(GiB)": 77.56, "step": 55200, "token_acc": 0.48338368580060426, "train_speed(iter/s)": 1.438358 }, { "epoch": 2.365151450237779, "grad_norm": 6.502086162567139, "learning_rate": 5.423132524471194e-05, "loss": 2.4038652420043944, "memory(GiB)": 77.56, "step": 55205, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.438379 }, { "epoch": 2.365365665567028, "grad_norm": 4.80830717086792, "learning_rate": 5.422461957478302e-05, "loss": 2.2743370056152346, "memory(GiB)": 77.56, "step": 55210, "token_acc": 0.504885993485342, "train_speed(iter/s)": 1.438402 }, { "epoch": 2.365579880896277, "grad_norm": 4.352596759796143, "learning_rate": 5.4217913828321166e-05, "loss": 2.4236536026000977, "memory(GiB)": 77.56, "step": 55215, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.438393 }, { "epoch": 2.365794096225526, "grad_norm": 5.5595502853393555, "learning_rate": 5.421120800544785e-05, "loss": 2.4924606323242187, "memory(GiB)": 77.56, "step": 55220, "token_acc": 0.4727272727272727, "train_speed(iter/s)": 1.438414 }, { "epoch": 2.3660083115547748, "grad_norm": 4.732698917388916, "learning_rate": 5.420450210628454e-05, "loss": 2.037059783935547, "memory(GiB)": 77.56, "step": 55225, "token_acc": 0.572463768115942, "train_speed(iter/s)": 1.438438 }, { "epoch": 2.366222526884024, "grad_norm": 6.975305557250977, "learning_rate": 5.4197796130952724e-05, "loss": 2.4264499664306642, "memory(GiB)": 77.56, "step": 55230, "token_acc": 0.45555555555555555, "train_speed(iter/s)": 1.438466 }, { "epoch": 2.366436742213273, "grad_norm": 4.999119281768799, "learning_rate": 5.41910900795739e-05, "loss": 2.443581771850586, "memory(GiB)": 77.56, "step": 55235, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.438462 }, { "epoch": 2.3666509575425216, "grad_norm": 4.922730922698975, "learning_rate": 5.418438395226954e-05, "loss": 2.6420093536376954, "memory(GiB)": 77.56, "step": 55240, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.438466 }, { "epoch": 2.366865172871771, "grad_norm": 7.011460304260254, "learning_rate": 5.417767774916116e-05, "loss": 2.2443168640136717, "memory(GiB)": 77.56, "step": 55245, "token_acc": 0.5202952029520295, "train_speed(iter/s)": 1.438482 }, { "epoch": 2.3670793882010197, "grad_norm": 6.412450790405273, "learning_rate": 5.417097147037021e-05, "loss": 2.1097509384155275, "memory(GiB)": 77.56, "step": 55250, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.438488 }, { "epoch": 2.3672936035302685, "grad_norm": 7.007317543029785, "learning_rate": 5.416426511601822e-05, "loss": 2.6930639266967775, "memory(GiB)": 77.56, "step": 55255, "token_acc": 0.43666666666666665, "train_speed(iter/s)": 1.43851 }, { "epoch": 2.3675078188595178, "grad_norm": 5.313680171966553, "learning_rate": 5.415755868622665e-05, "loss": 2.5467241287231444, "memory(GiB)": 77.56, "step": 55260, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.438526 }, { "epoch": 2.3677220341887666, "grad_norm": 4.119755268096924, "learning_rate": 5.4150852181116995e-05, "loss": 2.4497085571289063, "memory(GiB)": 77.56, "step": 55265, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.438537 }, { "epoch": 2.3679362495180154, "grad_norm": 5.886566638946533, "learning_rate": 5.414414560081076e-05, "loss": 2.458090972900391, "memory(GiB)": 77.56, "step": 55270, "token_acc": 0.5326797385620915, "train_speed(iter/s)": 1.438564 }, { "epoch": 2.3681504648472647, "grad_norm": 4.560958385467529, "learning_rate": 5.413743894542945e-05, "loss": 2.3180269241333007, "memory(GiB)": 77.56, "step": 55275, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.438563 }, { "epoch": 2.3683646801765135, "grad_norm": 5.445955753326416, "learning_rate": 5.413073221509454e-05, "loss": 2.568151092529297, "memory(GiB)": 77.56, "step": 55280, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.438595 }, { "epoch": 2.3685788955057623, "grad_norm": 5.808968544006348, "learning_rate": 5.412402540992756e-05, "loss": 2.205615997314453, "memory(GiB)": 77.56, "step": 55285, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.438633 }, { "epoch": 2.3687931108350115, "grad_norm": 14.384493827819824, "learning_rate": 5.4117318530049976e-05, "loss": 2.3757465362548826, "memory(GiB)": 77.56, "step": 55290, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.438663 }, { "epoch": 2.3690073261642604, "grad_norm": 4.8841023445129395, "learning_rate": 5.41106115755833e-05, "loss": 2.3218557357788088, "memory(GiB)": 77.56, "step": 55295, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.438666 }, { "epoch": 2.369221541493509, "grad_norm": 7.7853288650512695, "learning_rate": 5.4103904546649063e-05, "loss": 2.515058898925781, "memory(GiB)": 77.56, "step": 55300, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.438663 }, { "epoch": 2.3694357568227584, "grad_norm": 4.586111545562744, "learning_rate": 5.4097197443368716e-05, "loss": 2.370487594604492, "memory(GiB)": 77.56, "step": 55305, "token_acc": 0.5071942446043165, "train_speed(iter/s)": 1.438657 }, { "epoch": 2.3696499721520072, "grad_norm": 4.423598766326904, "learning_rate": 5.4090490265863814e-05, "loss": 2.1970808029174806, "memory(GiB)": 77.56, "step": 55310, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.438685 }, { "epoch": 2.369864187481256, "grad_norm": 5.399542331695557, "learning_rate": 5.408378301425584e-05, "loss": 2.234607696533203, "memory(GiB)": 77.56, "step": 55315, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438681 }, { "epoch": 2.3700784028105053, "grad_norm": 4.355242729187012, "learning_rate": 5.40770756886663e-05, "loss": 2.564031982421875, "memory(GiB)": 77.56, "step": 55320, "token_acc": 0.4688427299703264, "train_speed(iter/s)": 1.438679 }, { "epoch": 2.370292618139754, "grad_norm": 6.180300235748291, "learning_rate": 5.407036828921671e-05, "loss": 2.691029739379883, "memory(GiB)": 77.56, "step": 55325, "token_acc": 0.47896440129449835, "train_speed(iter/s)": 1.438693 }, { "epoch": 2.370506833469003, "grad_norm": 5.43287467956543, "learning_rate": 5.406366081602859e-05, "loss": 2.409763526916504, "memory(GiB)": 77.56, "step": 55330, "token_acc": 0.4726775956284153, "train_speed(iter/s)": 1.438712 }, { "epoch": 2.370721048798252, "grad_norm": 5.2215986251831055, "learning_rate": 5.4056953269223445e-05, "loss": 2.6180233001708983, "memory(GiB)": 77.56, "step": 55335, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.438718 }, { "epoch": 2.370935264127501, "grad_norm": 5.596814155578613, "learning_rate": 5.405024564892277e-05, "loss": 2.3289669036865233, "memory(GiB)": 77.56, "step": 55340, "token_acc": 0.4859437751004016, "train_speed(iter/s)": 1.438732 }, { "epoch": 2.37114947945675, "grad_norm": 6.0897932052612305, "learning_rate": 5.4043537955248094e-05, "loss": 2.46077823638916, "memory(GiB)": 77.56, "step": 55345, "token_acc": 0.5071942446043165, "train_speed(iter/s)": 1.438754 }, { "epoch": 2.371363694785999, "grad_norm": 5.113719463348389, "learning_rate": 5.403683018832094e-05, "loss": 2.41677303314209, "memory(GiB)": 77.56, "step": 55350, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.438783 }, { "epoch": 2.371577910115248, "grad_norm": 4.7541093826293945, "learning_rate": 5.4030122348262824e-05, "loss": 2.305705451965332, "memory(GiB)": 77.56, "step": 55355, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.438799 }, { "epoch": 2.3717921254444967, "grad_norm": 5.01695442199707, "learning_rate": 5.402341443519526e-05, "loss": 2.7446502685546874, "memory(GiB)": 77.56, "step": 55360, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 1.438797 }, { "epoch": 2.372006340773746, "grad_norm": 5.386935710906982, "learning_rate": 5.4016706449239764e-05, "loss": 2.620396614074707, "memory(GiB)": 77.56, "step": 55365, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.438796 }, { "epoch": 2.3722205561029948, "grad_norm": 4.951467037200928, "learning_rate": 5.400999839051788e-05, "loss": 2.551708221435547, "memory(GiB)": 77.56, "step": 55370, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.438798 }, { "epoch": 2.3724347714322436, "grad_norm": 4.838369846343994, "learning_rate": 5.40032902591511e-05, "loss": 2.6849252700805666, "memory(GiB)": 77.56, "step": 55375, "token_acc": 0.4360655737704918, "train_speed(iter/s)": 1.438819 }, { "epoch": 2.372648986761493, "grad_norm": 5.829607963562012, "learning_rate": 5.399658205526096e-05, "loss": 2.4711143493652346, "memory(GiB)": 77.56, "step": 55380, "token_acc": 0.5, "train_speed(iter/s)": 1.438829 }, { "epoch": 2.3728632020907416, "grad_norm": 5.011562824249268, "learning_rate": 5.398987377896898e-05, "loss": 2.1905250549316406, "memory(GiB)": 77.56, "step": 55385, "token_acc": 0.5247148288973384, "train_speed(iter/s)": 1.438835 }, { "epoch": 2.3730774174199905, "grad_norm": 7.044679164886475, "learning_rate": 5.398316543039671e-05, "loss": 2.323820877075195, "memory(GiB)": 77.56, "step": 55390, "token_acc": 0.4805194805194805, "train_speed(iter/s)": 1.438845 }, { "epoch": 2.3732916327492397, "grad_norm": 6.043456554412842, "learning_rate": 5.397645700966565e-05, "loss": 2.5732213973999025, "memory(GiB)": 77.56, "step": 55395, "token_acc": 0.47491638795986624, "train_speed(iter/s)": 1.438835 }, { "epoch": 2.3735058480784885, "grad_norm": 6.244682788848877, "learning_rate": 5.3969748516897354e-05, "loss": 2.6469261169433596, "memory(GiB)": 77.56, "step": 55400, "token_acc": 0.44696969696969696, "train_speed(iter/s)": 1.438841 }, { "epoch": 2.3737200634077373, "grad_norm": 6.698449611663818, "learning_rate": 5.3963039952213336e-05, "loss": 2.2927406311035154, "memory(GiB)": 77.56, "step": 55405, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.438866 }, { "epoch": 2.3739342787369866, "grad_norm": 6.650759220123291, "learning_rate": 5.395633131573512e-05, "loss": 2.5409788131713866, "memory(GiB)": 77.56, "step": 55410, "token_acc": 0.48091603053435117, "train_speed(iter/s)": 1.438901 }, { "epoch": 2.3741484940662354, "grad_norm": 4.832799911499023, "learning_rate": 5.394962260758425e-05, "loss": 2.307061767578125, "memory(GiB)": 77.56, "step": 55415, "token_acc": 0.5523465703971119, "train_speed(iter/s)": 1.43889 }, { "epoch": 2.374362709395484, "grad_norm": 5.03985071182251, "learning_rate": 5.394291382788228e-05, "loss": 2.4070295333862304, "memory(GiB)": 77.56, "step": 55420, "token_acc": 0.4966442953020134, "train_speed(iter/s)": 1.438913 }, { "epoch": 2.3745769247247335, "grad_norm": 5.742072105407715, "learning_rate": 5.39362049767507e-05, "loss": 1.8717477798461915, "memory(GiB)": 77.56, "step": 55425, "token_acc": 0.5341880341880342, "train_speed(iter/s)": 1.438935 }, { "epoch": 2.3747911400539823, "grad_norm": 5.457836627960205, "learning_rate": 5.3929496054311104e-05, "loss": 2.23592586517334, "memory(GiB)": 77.56, "step": 55430, "token_acc": 0.48534201954397393, "train_speed(iter/s)": 1.438912 }, { "epoch": 2.375005355383231, "grad_norm": 7.373669147491455, "learning_rate": 5.3922787060685e-05, "loss": 2.3980010986328124, "memory(GiB)": 77.56, "step": 55435, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.43892 }, { "epoch": 2.3752195707124804, "grad_norm": 6.243663787841797, "learning_rate": 5.391607799599391e-05, "loss": 2.476382827758789, "memory(GiB)": 77.56, "step": 55440, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.438956 }, { "epoch": 2.375433786041729, "grad_norm": 4.847306251525879, "learning_rate": 5.39093688603594e-05, "loss": 2.197188377380371, "memory(GiB)": 77.56, "step": 55445, "token_acc": 0.5409252669039146, "train_speed(iter/s)": 1.438991 }, { "epoch": 2.375648001370978, "grad_norm": 6.375215530395508, "learning_rate": 5.390265965390301e-05, "loss": 2.798021697998047, "memory(GiB)": 77.56, "step": 55450, "token_acc": 0.44785276073619634, "train_speed(iter/s)": 1.439021 }, { "epoch": 2.3758622167002272, "grad_norm": 5.868282318115234, "learning_rate": 5.3895950376746266e-05, "loss": 2.2429452896118165, "memory(GiB)": 77.56, "step": 55455, "token_acc": 0.516320474777448, "train_speed(iter/s)": 1.438994 }, { "epoch": 2.376076432029476, "grad_norm": 5.980130672454834, "learning_rate": 5.388924102901074e-05, "loss": 2.48947811126709, "memory(GiB)": 77.56, "step": 55460, "token_acc": 0.48104956268221577, "train_speed(iter/s)": 1.438999 }, { "epoch": 2.376290647358725, "grad_norm": 4.3123931884765625, "learning_rate": 5.388253161081795e-05, "loss": 2.232387924194336, "memory(GiB)": 77.56, "step": 55465, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.439035 }, { "epoch": 2.376504862687974, "grad_norm": 4.940186977386475, "learning_rate": 5.387582212228948e-05, "loss": 2.635325241088867, "memory(GiB)": 77.56, "step": 55470, "token_acc": 0.4561933534743202, "train_speed(iter/s)": 1.439051 }, { "epoch": 2.376719078017223, "grad_norm": 5.646570682525635, "learning_rate": 5.3869112563546844e-05, "loss": 2.4602468490600584, "memory(GiB)": 77.56, "step": 55475, "token_acc": 0.4041095890410959, "train_speed(iter/s)": 1.439038 }, { "epoch": 2.3769332933464717, "grad_norm": 4.777437686920166, "learning_rate": 5.386240293471161e-05, "loss": 2.4900028228759767, "memory(GiB)": 77.56, "step": 55480, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.439032 }, { "epoch": 2.377147508675721, "grad_norm": 5.969933032989502, "learning_rate": 5.385569323590531e-05, "loss": 2.389388656616211, "memory(GiB)": 77.56, "step": 55485, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.439057 }, { "epoch": 2.37736172400497, "grad_norm": 4.918552398681641, "learning_rate": 5.3848983467249516e-05, "loss": 2.6371192932128906, "memory(GiB)": 77.56, "step": 55490, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.439046 }, { "epoch": 2.3775759393342186, "grad_norm": 4.467573642730713, "learning_rate": 5.384227362886578e-05, "loss": 2.3733142852783202, "memory(GiB)": 77.56, "step": 55495, "token_acc": 0.4921135646687697, "train_speed(iter/s)": 1.439046 }, { "epoch": 2.377790154663468, "grad_norm": 6.7481489181518555, "learning_rate": 5.3835563720875645e-05, "loss": 2.632244110107422, "memory(GiB)": 77.56, "step": 55500, "token_acc": 0.45394736842105265, "train_speed(iter/s)": 1.439057 }, { "epoch": 2.377790154663468, "eval_loss": 2.0676262378692627, "eval_runtime": 14.3769, "eval_samples_per_second": 6.956, "eval_steps_per_second": 6.956, "eval_token_acc": 0.494550408719346, "step": 55500 }, { "epoch": 2.3780043699927167, "grad_norm": 7.466429710388184, "learning_rate": 5.382885374340069e-05, "loss": 2.413881301879883, "memory(GiB)": 77.56, "step": 55505, "token_acc": 0.49607843137254903, "train_speed(iter/s)": 1.438486 }, { "epoch": 2.3782185853219655, "grad_norm": 5.201564788818359, "learning_rate": 5.382214369656244e-05, "loss": 2.23253173828125, "memory(GiB)": 77.56, "step": 55510, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 1.438483 }, { "epoch": 2.3784328006512148, "grad_norm": 5.7796244621276855, "learning_rate": 5.381543358048249e-05, "loss": 2.617478370666504, "memory(GiB)": 77.56, "step": 55515, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.438457 }, { "epoch": 2.3786470159804636, "grad_norm": 9.110394477844238, "learning_rate": 5.380872339528237e-05, "loss": 2.659603500366211, "memory(GiB)": 77.56, "step": 55520, "token_acc": 0.4059040590405904, "train_speed(iter/s)": 1.438438 }, { "epoch": 2.3788612313097124, "grad_norm": 6.04946231842041, "learning_rate": 5.380201314108365e-05, "loss": 2.3853595733642576, "memory(GiB)": 77.56, "step": 55525, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.438452 }, { "epoch": 2.3790754466389616, "grad_norm": 5.455751895904541, "learning_rate": 5.3795302818007895e-05, "loss": 2.3142032623291016, "memory(GiB)": 77.56, "step": 55530, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.438438 }, { "epoch": 2.3792896619682105, "grad_norm": 5.232691287994385, "learning_rate": 5.378859242617668e-05, "loss": 2.18917293548584, "memory(GiB)": 77.56, "step": 55535, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.438436 }, { "epoch": 2.3795038772974593, "grad_norm": 5.232368469238281, "learning_rate": 5.378188196571154e-05, "loss": 2.3399673461914063, "memory(GiB)": 77.56, "step": 55540, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.43839 }, { "epoch": 2.3797180926267085, "grad_norm": 6.277401447296143, "learning_rate": 5.3775171436734084e-05, "loss": 2.7239477157592775, "memory(GiB)": 77.56, "step": 55545, "token_acc": 0.4522058823529412, "train_speed(iter/s)": 1.438418 }, { "epoch": 2.3799323079559573, "grad_norm": 6.038576602935791, "learning_rate": 5.376846083936585e-05, "loss": 2.38717098236084, "memory(GiB)": 77.56, "step": 55550, "token_acc": 0.46381578947368424, "train_speed(iter/s)": 1.43845 }, { "epoch": 2.380146523285206, "grad_norm": 7.51558780670166, "learning_rate": 5.376175017372841e-05, "loss": 2.617052459716797, "memory(GiB)": 77.56, "step": 55555, "token_acc": 0.50390625, "train_speed(iter/s)": 1.438469 }, { "epoch": 2.3803607386144554, "grad_norm": 4.218362808227539, "learning_rate": 5.375503943994333e-05, "loss": 2.5389705657958985, "memory(GiB)": 77.56, "step": 55560, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.438487 }, { "epoch": 2.380574953943704, "grad_norm": 5.294258117675781, "learning_rate": 5.37483286381322e-05, "loss": 2.4298730850219727, "memory(GiB)": 77.56, "step": 55565, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.438507 }, { "epoch": 2.380789169272953, "grad_norm": 6.186878681182861, "learning_rate": 5.374161776841656e-05, "loss": 2.632823944091797, "memory(GiB)": 77.56, "step": 55570, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.438528 }, { "epoch": 2.3810033846022023, "grad_norm": 4.771598815917969, "learning_rate": 5.373490683091802e-05, "loss": 2.436178970336914, "memory(GiB)": 77.56, "step": 55575, "token_acc": 0.5047021943573667, "train_speed(iter/s)": 1.438491 }, { "epoch": 2.381217599931451, "grad_norm": 6.250410079956055, "learning_rate": 5.372819582575814e-05, "loss": 2.494367790222168, "memory(GiB)": 77.56, "step": 55580, "token_acc": 0.4801587301587302, "train_speed(iter/s)": 1.438494 }, { "epoch": 2.3814318152607, "grad_norm": 5.710798263549805, "learning_rate": 5.372148475305849e-05, "loss": 2.6136795043945313, "memory(GiB)": 77.56, "step": 55585, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.438477 }, { "epoch": 2.381646030589949, "grad_norm": 3.6559946537017822, "learning_rate": 5.371477361294066e-05, "loss": 2.548026466369629, "memory(GiB)": 77.56, "step": 55590, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.438491 }, { "epoch": 2.381860245919198, "grad_norm": 6.05740213394165, "learning_rate": 5.370806240552623e-05, "loss": 2.6580345153808596, "memory(GiB)": 77.56, "step": 55595, "token_acc": 0.47717842323651455, "train_speed(iter/s)": 1.438514 }, { "epoch": 2.382074461248447, "grad_norm": 5.075231552124023, "learning_rate": 5.370135113093674e-05, "loss": 2.2091745376586913, "memory(GiB)": 77.56, "step": 55600, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.438522 }, { "epoch": 2.382288676577696, "grad_norm": 5.125804901123047, "learning_rate": 5.369463978929382e-05, "loss": 2.198167610168457, "memory(GiB)": 77.56, "step": 55605, "token_acc": 0.5204081632653061, "train_speed(iter/s)": 1.438528 }, { "epoch": 2.382502891906945, "grad_norm": 5.294960021972656, "learning_rate": 5.3687928380719044e-05, "loss": 2.4617355346679686, "memory(GiB)": 77.56, "step": 55610, "token_acc": 0.5060728744939271, "train_speed(iter/s)": 1.438543 }, { "epoch": 2.3827171072361937, "grad_norm": 6.396987438201904, "learning_rate": 5.368121690533396e-05, "loss": 2.5283481597900392, "memory(GiB)": 77.56, "step": 55615, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.438547 }, { "epoch": 2.382931322565443, "grad_norm": 6.050921440124512, "learning_rate": 5.3674505363260206e-05, "loss": 2.4809059143066405, "memory(GiB)": 77.56, "step": 55620, "token_acc": 0.5030864197530864, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.3831455378946917, "grad_norm": 5.0938849449157715, "learning_rate": 5.366779375461933e-05, "loss": 2.4185562133789062, "memory(GiB)": 77.56, "step": 55625, "token_acc": 0.4697986577181208, "train_speed(iter/s)": 1.438562 }, { "epoch": 2.3833597532239406, "grad_norm": 5.465989589691162, "learning_rate": 5.366108207953293e-05, "loss": 2.583892059326172, "memory(GiB)": 77.56, "step": 55630, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.438608 }, { "epoch": 2.38357396855319, "grad_norm": 5.359743118286133, "learning_rate": 5.365437033812259e-05, "loss": 2.3713050842285157, "memory(GiB)": 77.56, "step": 55635, "token_acc": 0.5204918032786885, "train_speed(iter/s)": 1.438613 }, { "epoch": 2.3837881838824386, "grad_norm": 6.497318744659424, "learning_rate": 5.3647658530509904e-05, "loss": 2.702520179748535, "memory(GiB)": 77.56, "step": 55640, "token_acc": 0.44761904761904764, "train_speed(iter/s)": 1.438628 }, { "epoch": 2.3840023992116874, "grad_norm": 4.790289402008057, "learning_rate": 5.364094665681646e-05, "loss": 2.127778244018555, "memory(GiB)": 77.56, "step": 55645, "token_acc": 0.5445205479452054, "train_speed(iter/s)": 1.438655 }, { "epoch": 2.3842166145409367, "grad_norm": 4.052952766418457, "learning_rate": 5.363423471716386e-05, "loss": 2.3164926528930665, "memory(GiB)": 77.56, "step": 55650, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 1.438653 }, { "epoch": 2.3844308298701855, "grad_norm": 4.31178092956543, "learning_rate": 5.3627522711673675e-05, "loss": 2.3609615325927735, "memory(GiB)": 77.56, "step": 55655, "token_acc": 0.5062893081761006, "train_speed(iter/s)": 1.438659 }, { "epoch": 2.3846450451994343, "grad_norm": 3.9674179553985596, "learning_rate": 5.362081064046753e-05, "loss": 1.916794776916504, "memory(GiB)": 77.56, "step": 55660, "token_acc": 0.5886524822695035, "train_speed(iter/s)": 1.438643 }, { "epoch": 2.3848592605286836, "grad_norm": 7.026403903961182, "learning_rate": 5.3614098503667e-05, "loss": 2.5660417556762694, "memory(GiB)": 77.56, "step": 55665, "token_acc": 0.4375, "train_speed(iter/s)": 1.438662 }, { "epoch": 2.3850734758579324, "grad_norm": 6.199244976043701, "learning_rate": 5.360738630139368e-05, "loss": 2.320438194274902, "memory(GiB)": 77.56, "step": 55670, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.438667 }, { "epoch": 2.385287691187181, "grad_norm": 5.9237165451049805, "learning_rate": 5.360067403376916e-05, "loss": 2.6910205841064454, "memory(GiB)": 77.56, "step": 55675, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.438664 }, { "epoch": 2.3855019065164305, "grad_norm": 4.771265983581543, "learning_rate": 5.359396170091508e-05, "loss": 2.4336774826049803, "memory(GiB)": 77.56, "step": 55680, "token_acc": 0.4619718309859155, "train_speed(iter/s)": 1.438654 }, { "epoch": 2.3857161218456793, "grad_norm": 7.338444232940674, "learning_rate": 5.358724930295299e-05, "loss": 2.59556884765625, "memory(GiB)": 77.56, "step": 55685, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.438656 }, { "epoch": 2.385930337174928, "grad_norm": 4.878612995147705, "learning_rate": 5.3580536840004524e-05, "loss": 2.4306787490844726, "memory(GiB)": 77.56, "step": 55690, "token_acc": 0.4967532467532468, "train_speed(iter/s)": 1.438688 }, { "epoch": 2.3861445525041773, "grad_norm": 5.07237434387207, "learning_rate": 5.3573824312191276e-05, "loss": 2.467372512817383, "memory(GiB)": 77.56, "step": 55695, "token_acc": 0.5132450331125827, "train_speed(iter/s)": 1.438706 }, { "epoch": 2.386358767833426, "grad_norm": 6.616938591003418, "learning_rate": 5.356711171963484e-05, "loss": 2.234323501586914, "memory(GiB)": 77.56, "step": 55700, "token_acc": 0.510548523206751, "train_speed(iter/s)": 1.438734 }, { "epoch": 2.386572983162675, "grad_norm": 5.262768268585205, "learning_rate": 5.356039906245684e-05, "loss": 2.4600982666015625, "memory(GiB)": 77.56, "step": 55705, "token_acc": 0.4895833333333333, "train_speed(iter/s)": 1.438742 }, { "epoch": 2.3867871984919242, "grad_norm": 4.559207439422607, "learning_rate": 5.3553686340778875e-05, "loss": 2.655509185791016, "memory(GiB)": 77.56, "step": 55710, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.438703 }, { "epoch": 2.387001413821173, "grad_norm": 6.9952921867370605, "learning_rate": 5.354697355472252e-05, "loss": 2.460745620727539, "memory(GiB)": 77.56, "step": 55715, "token_acc": 0.49049429657794674, "train_speed(iter/s)": 1.43873 }, { "epoch": 2.387215629150422, "grad_norm": 7.227347373962402, "learning_rate": 5.354026070440944e-05, "loss": 2.2894195556640624, "memory(GiB)": 77.56, "step": 55720, "token_acc": 0.49050632911392406, "train_speed(iter/s)": 1.438724 }, { "epoch": 2.387429844479671, "grad_norm": 5.833535671234131, "learning_rate": 5.35335477899612e-05, "loss": 2.538958740234375, "memory(GiB)": 77.56, "step": 55725, "token_acc": 0.45936395759717313, "train_speed(iter/s)": 1.438756 }, { "epoch": 2.38764405980892, "grad_norm": 5.033865451812744, "learning_rate": 5.352683481149944e-05, "loss": 2.5982614517211915, "memory(GiB)": 77.56, "step": 55730, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.438768 }, { "epoch": 2.3878582751381687, "grad_norm": 6.980336666107178, "learning_rate": 5.352012176914575e-05, "loss": 2.2374475479125975, "memory(GiB)": 77.56, "step": 55735, "token_acc": 0.5095785440613027, "train_speed(iter/s)": 1.438764 }, { "epoch": 2.388072490467418, "grad_norm": 6.426371097564697, "learning_rate": 5.351340866302176e-05, "loss": 2.369184684753418, "memory(GiB)": 77.56, "step": 55740, "token_acc": 0.5177865612648221, "train_speed(iter/s)": 1.438772 }, { "epoch": 2.388286705796667, "grad_norm": 6.667300701141357, "learning_rate": 5.350669549324907e-05, "loss": 2.8681800842285154, "memory(GiB)": 77.56, "step": 55745, "token_acc": 0.4470198675496689, "train_speed(iter/s)": 1.438773 }, { "epoch": 2.3885009211259156, "grad_norm": 5.618614196777344, "learning_rate": 5.3499982259949285e-05, "loss": 2.390421676635742, "memory(GiB)": 77.56, "step": 55750, "token_acc": 0.49224806201550386, "train_speed(iter/s)": 1.438778 }, { "epoch": 2.388715136455165, "grad_norm": 6.456734657287598, "learning_rate": 5.349326896324407e-05, "loss": 2.3189796447753905, "memory(GiB)": 77.56, "step": 55755, "token_acc": 0.4979253112033195, "train_speed(iter/s)": 1.43878 }, { "epoch": 2.3889293517844137, "grad_norm": 4.946966171264648, "learning_rate": 5.348655560325498e-05, "loss": 2.4424358367919923, "memory(GiB)": 77.56, "step": 55760, "token_acc": 0.504, "train_speed(iter/s)": 1.438792 }, { "epoch": 2.3891435671136625, "grad_norm": 4.634521007537842, "learning_rate": 5.347984218010369e-05, "loss": 2.392115020751953, "memory(GiB)": 77.56, "step": 55765, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.438803 }, { "epoch": 2.3893577824429117, "grad_norm": 6.2234907150268555, "learning_rate": 5.347312869391179e-05, "loss": 2.5383182525634767, "memory(GiB)": 77.56, "step": 55770, "token_acc": 0.46283783783783783, "train_speed(iter/s)": 1.438817 }, { "epoch": 2.3895719977721606, "grad_norm": 5.292355537414551, "learning_rate": 5.3466415144800893e-05, "loss": 2.629548263549805, "memory(GiB)": 77.56, "step": 55775, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.438817 }, { "epoch": 2.3897862131014094, "grad_norm": 5.502911567687988, "learning_rate": 5.345970153289266e-05, "loss": 2.5506542205810545, "memory(GiB)": 77.56, "step": 55780, "token_acc": 0.4623287671232877, "train_speed(iter/s)": 1.438837 }, { "epoch": 2.3900004284306586, "grad_norm": 5.6977949142456055, "learning_rate": 5.345298785830866e-05, "loss": 2.372870445251465, "memory(GiB)": 77.56, "step": 55785, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.438847 }, { "epoch": 2.3902146437599074, "grad_norm": 5.290041446685791, "learning_rate": 5.344627412117055e-05, "loss": 2.2906618118286133, "memory(GiB)": 77.56, "step": 55790, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.43887 }, { "epoch": 2.3904288590891563, "grad_norm": 5.064677715301514, "learning_rate": 5.343956032159996e-05, "loss": 2.596945381164551, "memory(GiB)": 77.56, "step": 55795, "token_acc": 0.486404833836858, "train_speed(iter/s)": 1.438899 }, { "epoch": 2.3906430744184055, "grad_norm": 6.323420524597168, "learning_rate": 5.34328464597185e-05, "loss": 2.092578887939453, "memory(GiB)": 77.56, "step": 55800, "token_acc": 0.5461254612546126, "train_speed(iter/s)": 1.438909 }, { "epoch": 2.3908572897476543, "grad_norm": 7.094272136688232, "learning_rate": 5.342613253564782e-05, "loss": 2.310749816894531, "memory(GiB)": 77.56, "step": 55805, "token_acc": 0.4831804281345566, "train_speed(iter/s)": 1.43892 }, { "epoch": 2.391071505076903, "grad_norm": 5.3603129386901855, "learning_rate": 5.341941854950952e-05, "loss": 2.3845823287963865, "memory(GiB)": 77.56, "step": 55810, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.438937 }, { "epoch": 2.3912857204061524, "grad_norm": 4.8593363761901855, "learning_rate": 5.341270450142526e-05, "loss": 2.562577819824219, "memory(GiB)": 77.56, "step": 55815, "token_acc": 0.51171875, "train_speed(iter/s)": 1.438956 }, { "epoch": 2.391499935735401, "grad_norm": 7.277611255645752, "learning_rate": 5.340599039151665e-05, "loss": 2.3729629516601562, "memory(GiB)": 77.56, "step": 55820, "token_acc": 0.5043103448275862, "train_speed(iter/s)": 1.438967 }, { "epoch": 2.39171415106465, "grad_norm": 5.727371692657471, "learning_rate": 5.339927621990533e-05, "loss": 2.5254974365234375, "memory(GiB)": 77.56, "step": 55825, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 1.43898 }, { "epoch": 2.3919283663938993, "grad_norm": 6.238166809082031, "learning_rate": 5.3392561986712916e-05, "loss": 2.342428970336914, "memory(GiB)": 77.56, "step": 55830, "token_acc": 0.4835820895522388, "train_speed(iter/s)": 1.439015 }, { "epoch": 2.392142581723148, "grad_norm": 6.191690444946289, "learning_rate": 5.3385847692061075e-05, "loss": 2.622299385070801, "memory(GiB)": 77.56, "step": 55835, "token_acc": 0.41924398625429554, "train_speed(iter/s)": 1.439024 }, { "epoch": 2.392356797052397, "grad_norm": 7.068532943725586, "learning_rate": 5.337913333607143e-05, "loss": 2.555002784729004, "memory(GiB)": 77.56, "step": 55840, "token_acc": 0.44765342960288806, "train_speed(iter/s)": 1.439054 }, { "epoch": 2.392571012381646, "grad_norm": 5.620985984802246, "learning_rate": 5.337241891886561e-05, "loss": 2.413443946838379, "memory(GiB)": 77.56, "step": 55845, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.439062 }, { "epoch": 2.392785227710895, "grad_norm": 6.190371036529541, "learning_rate": 5.3365704440565255e-05, "loss": 2.46199893951416, "memory(GiB)": 77.56, "step": 55850, "token_acc": 0.45806451612903226, "train_speed(iter/s)": 1.439094 }, { "epoch": 2.392999443040144, "grad_norm": 5.548079490661621, "learning_rate": 5.335898990129202e-05, "loss": 2.48559684753418, "memory(GiB)": 77.56, "step": 55855, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.439113 }, { "epoch": 2.393213658369393, "grad_norm": 5.645583152770996, "learning_rate": 5.335227530116751e-05, "loss": 2.3503229141235353, "memory(GiB)": 77.56, "step": 55860, "token_acc": 0.5142045454545454, "train_speed(iter/s)": 1.439107 }, { "epoch": 2.393427873698642, "grad_norm": 5.426797389984131, "learning_rate": 5.3345560640313395e-05, "loss": 2.3297786712646484, "memory(GiB)": 77.56, "step": 55865, "token_acc": 0.4858490566037736, "train_speed(iter/s)": 1.439136 }, { "epoch": 2.3936420890278907, "grad_norm": 6.183347225189209, "learning_rate": 5.333884591885132e-05, "loss": 2.2710235595703123, "memory(GiB)": 77.56, "step": 55870, "token_acc": 0.5207667731629393, "train_speed(iter/s)": 1.439166 }, { "epoch": 2.39385630435714, "grad_norm": 5.147744655609131, "learning_rate": 5.333213113690291e-05, "loss": 2.2079469680786135, "memory(GiB)": 77.56, "step": 55875, "token_acc": 0.551094890510949, "train_speed(iter/s)": 1.439158 }, { "epoch": 2.3940705196863887, "grad_norm": 4.292358875274658, "learning_rate": 5.3325416294589826e-05, "loss": 2.3044912338256838, "memory(GiB)": 77.56, "step": 55880, "token_acc": 0.48427672955974843, "train_speed(iter/s)": 1.439178 }, { "epoch": 2.3942847350156375, "grad_norm": 6.845787048339844, "learning_rate": 5.331870139203371e-05, "loss": 2.157851791381836, "memory(GiB)": 77.56, "step": 55885, "token_acc": 0.5114754098360655, "train_speed(iter/s)": 1.439184 }, { "epoch": 2.394498950344887, "grad_norm": 5.979043006896973, "learning_rate": 5.33119864293562e-05, "loss": 2.2227333068847654, "memory(GiB)": 77.56, "step": 55890, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.439191 }, { "epoch": 2.3947131656741356, "grad_norm": 5.1875762939453125, "learning_rate": 5.3305271406678936e-05, "loss": 2.6128095626831054, "memory(GiB)": 77.56, "step": 55895, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.439211 }, { "epoch": 2.3949273810033844, "grad_norm": 4.127660751342773, "learning_rate": 5.329855632412359e-05, "loss": 2.5641387939453124, "memory(GiB)": 77.56, "step": 55900, "token_acc": 0.46973365617433416, "train_speed(iter/s)": 1.439206 }, { "epoch": 2.3951415963326337, "grad_norm": 11.877144813537598, "learning_rate": 5.329184118181181e-05, "loss": 2.5081594467163084, "memory(GiB)": 77.56, "step": 55905, "token_acc": 0.48328267477203646, "train_speed(iter/s)": 1.439213 }, { "epoch": 2.3953558116618825, "grad_norm": 5.12606954574585, "learning_rate": 5.3285125979865234e-05, "loss": 2.336347961425781, "memory(GiB)": 77.56, "step": 55910, "token_acc": 0.49377593360995853, "train_speed(iter/s)": 1.439221 }, { "epoch": 2.3955700269911313, "grad_norm": 5.319666385650635, "learning_rate": 5.3278410718405514e-05, "loss": 2.23387451171875, "memory(GiB)": 77.56, "step": 55915, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.439219 }, { "epoch": 2.3957842423203806, "grad_norm": 5.634885787963867, "learning_rate": 5.327169539755431e-05, "loss": 2.4125213623046875, "memory(GiB)": 77.56, "step": 55920, "token_acc": 0.5040322580645161, "train_speed(iter/s)": 1.439244 }, { "epoch": 2.3959984576496294, "grad_norm": 6.618666648864746, "learning_rate": 5.3264980017433284e-05, "loss": 2.662358856201172, "memory(GiB)": 77.56, "step": 55925, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.439276 }, { "epoch": 2.396212672978878, "grad_norm": 5.333796501159668, "learning_rate": 5.325826457816406e-05, "loss": 2.3391931533813475, "memory(GiB)": 77.56, "step": 55930, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.439303 }, { "epoch": 2.3964268883081274, "grad_norm": 4.858936309814453, "learning_rate": 5.325154907986833e-05, "loss": 2.212980270385742, "memory(GiB)": 77.56, "step": 55935, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.43932 }, { "epoch": 2.3966411036373763, "grad_norm": 4.452764511108398, "learning_rate": 5.324483352266775e-05, "loss": 2.5510013580322264, "memory(GiB)": 77.56, "step": 55940, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.439325 }, { "epoch": 2.396855318966625, "grad_norm": 5.3482584953308105, "learning_rate": 5.323811790668395e-05, "loss": 2.4425991058349608, "memory(GiB)": 77.56, "step": 55945, "token_acc": 0.458041958041958, "train_speed(iter/s)": 1.439319 }, { "epoch": 2.3970695342958743, "grad_norm": 4.145442962646484, "learning_rate": 5.323140223203862e-05, "loss": 2.3750329971313477, "memory(GiB)": 77.56, "step": 55950, "token_acc": 0.4868804664723032, "train_speed(iter/s)": 1.439342 }, { "epoch": 2.397283749625123, "grad_norm": 8.035231590270996, "learning_rate": 5.322468649885341e-05, "loss": 2.761701965332031, "memory(GiB)": 77.56, "step": 55955, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.439361 }, { "epoch": 2.397497964954372, "grad_norm": 7.0439910888671875, "learning_rate": 5.3217970707249955e-05, "loss": 2.406748580932617, "memory(GiB)": 77.56, "step": 55960, "token_acc": 0.43670886075949367, "train_speed(iter/s)": 1.439367 }, { "epoch": 2.397712180283621, "grad_norm": 5.987401485443115, "learning_rate": 5.3211254857349955e-05, "loss": 2.289975166320801, "memory(GiB)": 77.56, "step": 55965, "token_acc": 0.5, "train_speed(iter/s)": 1.439374 }, { "epoch": 2.39792639561287, "grad_norm": 5.4337849617004395, "learning_rate": 5.320453894927506e-05, "loss": 2.20496826171875, "memory(GiB)": 77.56, "step": 55970, "token_acc": 0.5464684014869888, "train_speed(iter/s)": 1.439356 }, { "epoch": 2.398140610942119, "grad_norm": 7.567423343658447, "learning_rate": 5.319782298314694e-05, "loss": 2.1844202041625977, "memory(GiB)": 77.56, "step": 55975, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.439339 }, { "epoch": 2.398354826271368, "grad_norm": 5.335397243499756, "learning_rate": 5.319110695908726e-05, "loss": 2.394648551940918, "memory(GiB)": 77.56, "step": 55980, "token_acc": 0.48830409356725146, "train_speed(iter/s)": 1.439353 }, { "epoch": 2.398569041600617, "grad_norm": 4.841787815093994, "learning_rate": 5.318439087721768e-05, "loss": 2.3667648315429686, "memory(GiB)": 77.56, "step": 55985, "token_acc": 0.49700598802395207, "train_speed(iter/s)": 1.439369 }, { "epoch": 2.3987832569298657, "grad_norm": 6.294712066650391, "learning_rate": 5.3177674737659865e-05, "loss": 2.507879638671875, "memory(GiB)": 77.56, "step": 55990, "token_acc": 0.46258503401360546, "train_speed(iter/s)": 1.439357 }, { "epoch": 2.398997472259115, "grad_norm": 6.565338611602783, "learning_rate": 5.31709585405355e-05, "loss": 2.4787265777587892, "memory(GiB)": 77.56, "step": 55995, "token_acc": 0.5129310344827587, "train_speed(iter/s)": 1.439374 }, { "epoch": 2.399211687588364, "grad_norm": 6.02125358581543, "learning_rate": 5.316424228596625e-05, "loss": 2.3187360763549805, "memory(GiB)": 77.56, "step": 56000, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.439384 }, { "epoch": 2.399211687588364, "eval_loss": 2.0392301082611084, "eval_runtime": 14.4516, "eval_samples_per_second": 6.92, "eval_steps_per_second": 6.92, "eval_token_acc": 0.4645858343337335, "step": 56000 }, { "epoch": 2.3994259029176126, "grad_norm": 8.405716896057129, "learning_rate": 5.315752597407376e-05, "loss": 2.3897193908691405, "memory(GiB)": 77.56, "step": 56005, "token_acc": 0.46860782529572337, "train_speed(iter/s)": 1.43884 }, { "epoch": 2.399640118246862, "grad_norm": 8.381136894226074, "learning_rate": 5.315080960497975e-05, "loss": 2.7350940704345703, "memory(GiB)": 77.56, "step": 56010, "token_acc": 0.4344569288389513, "train_speed(iter/s)": 1.438849 }, { "epoch": 2.3998543335761107, "grad_norm": 7.539923667907715, "learning_rate": 5.3144093178805856e-05, "loss": 2.585416793823242, "memory(GiB)": 77.56, "step": 56015, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.438876 }, { "epoch": 2.4000685489053595, "grad_norm": 4.847936630249023, "learning_rate": 5.313737669567377e-05, "loss": 2.556245231628418, "memory(GiB)": 77.56, "step": 56020, "token_acc": 0.5059523809523809, "train_speed(iter/s)": 1.438876 }, { "epoch": 2.4002827642346087, "grad_norm": 6.339115142822266, "learning_rate": 5.313066015570516e-05, "loss": 2.512187957763672, "memory(GiB)": 77.56, "step": 56025, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.438861 }, { "epoch": 2.4004969795638575, "grad_norm": 7.122486114501953, "learning_rate": 5.312394355902171e-05, "loss": 2.4464868545532226, "memory(GiB)": 77.56, "step": 56030, "token_acc": 0.5245901639344263, "train_speed(iter/s)": 1.438869 }, { "epoch": 2.4007111948931064, "grad_norm": 4.212314128875732, "learning_rate": 5.3117226905745075e-05, "loss": 2.5223152160644533, "memory(GiB)": 77.56, "step": 56035, "token_acc": 0.48606811145510836, "train_speed(iter/s)": 1.438876 }, { "epoch": 2.4009254102223556, "grad_norm": 5.982733249664307, "learning_rate": 5.311051019599698e-05, "loss": 2.341153144836426, "memory(GiB)": 77.56, "step": 56040, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.438876 }, { "epoch": 2.4011396255516044, "grad_norm": 4.982411861419678, "learning_rate": 5.310379342989904e-05, "loss": 2.5095245361328127, "memory(GiB)": 77.56, "step": 56045, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.438879 }, { "epoch": 2.4013538408808532, "grad_norm": 6.8948140144348145, "learning_rate": 5.3097076607572984e-05, "loss": 2.175385665893555, "memory(GiB)": 77.56, "step": 56050, "token_acc": 0.5269709543568465, "train_speed(iter/s)": 1.43888 }, { "epoch": 2.4015680562101025, "grad_norm": 4.494455337524414, "learning_rate": 5.309035972914049e-05, "loss": 2.364501953125, "memory(GiB)": 77.56, "step": 56055, "token_acc": 0.5132743362831859, "train_speed(iter/s)": 1.438902 }, { "epoch": 2.4017822715393513, "grad_norm": 5.145925998687744, "learning_rate": 5.308364279472322e-05, "loss": 2.5412187576293945, "memory(GiB)": 77.56, "step": 56060, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.438884 }, { "epoch": 2.4019964868686, "grad_norm": 5.406068325042725, "learning_rate": 5.3076925804442865e-05, "loss": 2.3500289916992188, "memory(GiB)": 77.56, "step": 56065, "token_acc": 0.5, "train_speed(iter/s)": 1.438883 }, { "epoch": 2.4022107021978494, "grad_norm": 8.811362266540527, "learning_rate": 5.307020875842114e-05, "loss": 2.3378273010253907, "memory(GiB)": 77.56, "step": 56070, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438892 }, { "epoch": 2.402424917527098, "grad_norm": 5.983698844909668, "learning_rate": 5.306349165677967e-05, "loss": 2.7025020599365233, "memory(GiB)": 77.56, "step": 56075, "token_acc": 0.43666666666666665, "train_speed(iter/s)": 1.438872 }, { "epoch": 2.402639132856347, "grad_norm": 5.231452465057373, "learning_rate": 5.3056774499640184e-05, "loss": 2.346792221069336, "memory(GiB)": 77.56, "step": 56080, "token_acc": 0.5119453924914675, "train_speed(iter/s)": 1.438878 }, { "epoch": 2.4028533481855963, "grad_norm": 4.943996906280518, "learning_rate": 5.305005728712437e-05, "loss": 2.5767940521240233, "memory(GiB)": 77.56, "step": 56085, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.438881 }, { "epoch": 2.403067563514845, "grad_norm": 6.586244106292725, "learning_rate": 5.304334001935389e-05, "loss": 2.310846519470215, "memory(GiB)": 77.56, "step": 56090, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.438901 }, { "epoch": 2.403281778844094, "grad_norm": 6.0121989250183105, "learning_rate": 5.3036622696450466e-05, "loss": 2.0009632110595703, "memory(GiB)": 77.56, "step": 56095, "token_acc": 0.550561797752809, "train_speed(iter/s)": 1.438919 }, { "epoch": 2.403495994173343, "grad_norm": 5.395315170288086, "learning_rate": 5.302990531853578e-05, "loss": 2.4876937866210938, "memory(GiB)": 77.56, "step": 56100, "token_acc": 0.4911660777385159, "train_speed(iter/s)": 1.438901 }, { "epoch": 2.403710209502592, "grad_norm": 4.957891464233398, "learning_rate": 5.302318788573149e-05, "loss": 2.266989517211914, "memory(GiB)": 77.56, "step": 56105, "token_acc": 0.5, "train_speed(iter/s)": 1.438879 }, { "epoch": 2.4039244248318408, "grad_norm": 6.9275054931640625, "learning_rate": 5.3016470398159344e-05, "loss": 2.6653812408447264, "memory(GiB)": 77.56, "step": 56110, "token_acc": 0.5019305019305019, "train_speed(iter/s)": 1.438864 }, { "epoch": 2.40413864016109, "grad_norm": 5.8965582847595215, "learning_rate": 5.3009752855941e-05, "loss": 2.4196725845336915, "memory(GiB)": 77.56, "step": 56115, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.438861 }, { "epoch": 2.404352855490339, "grad_norm": 4.167325019836426, "learning_rate": 5.300303525919813e-05, "loss": 2.3755931854248047, "memory(GiB)": 77.56, "step": 56120, "token_acc": 0.49407114624505927, "train_speed(iter/s)": 1.438885 }, { "epoch": 2.4045670708195876, "grad_norm": 5.9489970207214355, "learning_rate": 5.2996317608052494e-05, "loss": 2.276514434814453, "memory(GiB)": 77.56, "step": 56125, "token_acc": 0.5302491103202847, "train_speed(iter/s)": 1.438875 }, { "epoch": 2.404781286148837, "grad_norm": 6.91579008102417, "learning_rate": 5.298959990262574e-05, "loss": 2.4200796127319335, "memory(GiB)": 77.56, "step": 56130, "token_acc": 0.4843205574912892, "train_speed(iter/s)": 1.438868 }, { "epoch": 2.4049955014780857, "grad_norm": 5.1532511711120605, "learning_rate": 5.298288214303958e-05, "loss": 2.5728235244750977, "memory(GiB)": 77.56, "step": 56135, "token_acc": 0.501466275659824, "train_speed(iter/s)": 1.438897 }, { "epoch": 2.4052097168073345, "grad_norm": 5.5746307373046875, "learning_rate": 5.2976164329415725e-05, "loss": 2.5000816345214845, "memory(GiB)": 77.56, "step": 56140, "token_acc": 0.4670846394984326, "train_speed(iter/s)": 1.438887 }, { "epoch": 2.405423932136584, "grad_norm": 5.644297122955322, "learning_rate": 5.296944646187585e-05, "loss": 2.333147430419922, "memory(GiB)": 77.56, "step": 56145, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438906 }, { "epoch": 2.4056381474658326, "grad_norm": 5.093411922454834, "learning_rate": 5.296272854054166e-05, "loss": 2.655217742919922, "memory(GiB)": 77.56, "step": 56150, "token_acc": 0.48338368580060426, "train_speed(iter/s)": 1.438915 }, { "epoch": 2.4058523627950814, "grad_norm": 5.415943622589111, "learning_rate": 5.295601056553486e-05, "loss": 2.397196960449219, "memory(GiB)": 77.56, "step": 56155, "token_acc": 0.478134110787172, "train_speed(iter/s)": 1.438934 }, { "epoch": 2.4060665781243307, "grad_norm": 4.815046310424805, "learning_rate": 5.294929253697718e-05, "loss": 2.4271930694580077, "memory(GiB)": 77.56, "step": 56160, "token_acc": 0.5119760479041916, "train_speed(iter/s)": 1.438949 }, { "epoch": 2.4062807934535795, "grad_norm": 6.341163635253906, "learning_rate": 5.294257445499028e-05, "loss": 2.531138229370117, "memory(GiB)": 77.56, "step": 56165, "token_acc": 0.468, "train_speed(iter/s)": 1.43898 }, { "epoch": 2.4064950087828283, "grad_norm": 4.485575199127197, "learning_rate": 5.29358563196959e-05, "loss": 2.299831771850586, "memory(GiB)": 77.56, "step": 56170, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.438958 }, { "epoch": 2.4067092241120775, "grad_norm": 5.203620433807373, "learning_rate": 5.2929138131215715e-05, "loss": 2.5045700073242188, "memory(GiB)": 77.56, "step": 56175, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.438995 }, { "epoch": 2.4069234394413264, "grad_norm": 4.303263187408447, "learning_rate": 5.292241988967144e-05, "loss": 2.6499711990356447, "memory(GiB)": 77.56, "step": 56180, "token_acc": 0.4383116883116883, "train_speed(iter/s)": 1.438977 }, { "epoch": 2.407137654770575, "grad_norm": 5.529666423797607, "learning_rate": 5.29157015951848e-05, "loss": 2.4012521743774413, "memory(GiB)": 77.56, "step": 56185, "token_acc": 0.5050167224080268, "train_speed(iter/s)": 1.438967 }, { "epoch": 2.4073518700998244, "grad_norm": 5.961589336395264, "learning_rate": 5.2908983247877475e-05, "loss": 2.374089813232422, "memory(GiB)": 77.56, "step": 56190, "token_acc": 0.492, "train_speed(iter/s)": 1.438979 }, { "epoch": 2.4075660854290732, "grad_norm": 5.217300891876221, "learning_rate": 5.2902264847871196e-05, "loss": 2.674641227722168, "memory(GiB)": 77.56, "step": 56195, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.438983 }, { "epoch": 2.407780300758322, "grad_norm": 5.262786865234375, "learning_rate": 5.289554639528768e-05, "loss": 2.3925403594970702, "memory(GiB)": 77.56, "step": 56200, "token_acc": 0.5043731778425656, "train_speed(iter/s)": 1.438967 }, { "epoch": 2.4079945160875713, "grad_norm": 5.554116725921631, "learning_rate": 5.2888827890248604e-05, "loss": 2.6348649978637697, "memory(GiB)": 77.56, "step": 56205, "token_acc": 0.4440677966101695, "train_speed(iter/s)": 1.438979 }, { "epoch": 2.40820873141682, "grad_norm": 6.3835906982421875, "learning_rate": 5.288210933287572e-05, "loss": 2.7073829650878904, "memory(GiB)": 77.56, "step": 56210, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.438979 }, { "epoch": 2.408422946746069, "grad_norm": 4.819346904754639, "learning_rate": 5.287539072329072e-05, "loss": 2.3233657836914063, "memory(GiB)": 77.56, "step": 56215, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.438986 }, { "epoch": 2.408637162075318, "grad_norm": 4.799155235290527, "learning_rate": 5.286867206161531e-05, "loss": 2.3258378982543944, "memory(GiB)": 77.56, "step": 56220, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.439 }, { "epoch": 2.408851377404567, "grad_norm": 4.620659828186035, "learning_rate": 5.286195334797122e-05, "loss": 2.4757110595703127, "memory(GiB)": 77.56, "step": 56225, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.439002 }, { "epoch": 2.409065592733816, "grad_norm": 5.389440059661865, "learning_rate": 5.285523458248015e-05, "loss": 2.359396553039551, "memory(GiB)": 77.56, "step": 56230, "token_acc": 0.4513888888888889, "train_speed(iter/s)": 1.439012 }, { "epoch": 2.409279808063065, "grad_norm": 5.467198848724365, "learning_rate": 5.284851576526383e-05, "loss": 2.6087230682373046, "memory(GiB)": 77.56, "step": 56235, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.439019 }, { "epoch": 2.409494023392314, "grad_norm": 5.533074378967285, "learning_rate": 5.2841796896443986e-05, "loss": 2.4206157684326173, "memory(GiB)": 77.56, "step": 56240, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 1.43902 }, { "epoch": 2.4097082387215627, "grad_norm": 5.181727886199951, "learning_rate": 5.283507797614232e-05, "loss": 2.207934761047363, "memory(GiB)": 77.56, "step": 56245, "token_acc": 0.5427350427350427, "train_speed(iter/s)": 1.439021 }, { "epoch": 2.409922454050812, "grad_norm": 5.358429431915283, "learning_rate": 5.282835900448055e-05, "loss": 2.2529346466064455, "memory(GiB)": 77.56, "step": 56250, "token_acc": 0.5032467532467533, "train_speed(iter/s)": 1.439049 }, { "epoch": 2.4101366693800608, "grad_norm": 5.2248382568359375, "learning_rate": 5.282163998158042e-05, "loss": 2.5750003814697267, "memory(GiB)": 77.56, "step": 56255, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.439061 }, { "epoch": 2.4103508847093096, "grad_norm": 6.446933746337891, "learning_rate": 5.281492090756364e-05, "loss": 2.411317825317383, "memory(GiB)": 77.56, "step": 56260, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.439091 }, { "epoch": 2.410565100038559, "grad_norm": 4.414382457733154, "learning_rate": 5.280820178255188e-05, "loss": 2.3740716934204102, "memory(GiB)": 77.56, "step": 56265, "token_acc": 0.5331010452961672, "train_speed(iter/s)": 1.439098 }, { "epoch": 2.4107793153678077, "grad_norm": 6.324094295501709, "learning_rate": 5.2801482606666955e-05, "loss": 2.545774459838867, "memory(GiB)": 77.56, "step": 56270, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.43911 }, { "epoch": 2.4109935306970565, "grad_norm": 4.340853214263916, "learning_rate": 5.279476338003053e-05, "loss": 2.3693159103393553, "memory(GiB)": 77.56, "step": 56275, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.439108 }, { "epoch": 2.4112077460263057, "grad_norm": 6.0674262046813965, "learning_rate": 5.2788044102764345e-05, "loss": 2.3557533264160155, "memory(GiB)": 77.56, "step": 56280, "token_acc": 0.5239852398523985, "train_speed(iter/s)": 1.439105 }, { "epoch": 2.4114219613555545, "grad_norm": 5.321403980255127, "learning_rate": 5.278132477499014e-05, "loss": 2.6936790466308596, "memory(GiB)": 77.56, "step": 56285, "token_acc": 0.43137254901960786, "train_speed(iter/s)": 1.439147 }, { "epoch": 2.4116361766848033, "grad_norm": 6.133849143981934, "learning_rate": 5.277460539682961e-05, "loss": 2.4603178024291994, "memory(GiB)": 77.56, "step": 56290, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.439152 }, { "epoch": 2.4118503920140526, "grad_norm": 4.4758148193359375, "learning_rate": 5.276788596840451e-05, "loss": 2.671260643005371, "memory(GiB)": 77.56, "step": 56295, "token_acc": 0.4658753709198813, "train_speed(iter/s)": 1.439165 }, { "epoch": 2.4120646073433014, "grad_norm": 5.828741550445557, "learning_rate": 5.276116648983656e-05, "loss": 2.5580806732177734, "memory(GiB)": 77.56, "step": 56300, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.439175 }, { "epoch": 2.4122788226725502, "grad_norm": 5.495771884918213, "learning_rate": 5.275444696124747e-05, "loss": 2.572454833984375, "memory(GiB)": 77.56, "step": 56305, "token_acc": 0.5095541401273885, "train_speed(iter/s)": 1.439165 }, { "epoch": 2.4124930380017995, "grad_norm": 3.5633745193481445, "learning_rate": 5.2747727382758993e-05, "loss": 2.3117183685302733, "memory(GiB)": 77.56, "step": 56310, "token_acc": 0.5197368421052632, "train_speed(iter/s)": 1.43915 }, { "epoch": 2.4127072533310483, "grad_norm": 5.507272720336914, "learning_rate": 5.274100775449288e-05, "loss": 2.4593910217285155, "memory(GiB)": 77.56, "step": 56315, "token_acc": 0.47653429602888087, "train_speed(iter/s)": 1.439153 }, { "epoch": 2.412921468660297, "grad_norm": 4.739686012268066, "learning_rate": 5.2734288076570824e-05, "loss": 2.3920015335083007, "memory(GiB)": 77.56, "step": 56320, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.439153 }, { "epoch": 2.4131356839895464, "grad_norm": 5.539514541625977, "learning_rate": 5.2727568349114576e-05, "loss": 2.201377105712891, "memory(GiB)": 77.56, "step": 56325, "token_acc": 0.5288461538461539, "train_speed(iter/s)": 1.439135 }, { "epoch": 2.413349899318795, "grad_norm": 5.625189781188965, "learning_rate": 5.2720848572245874e-05, "loss": 2.710480880737305, "memory(GiB)": 77.56, "step": 56330, "token_acc": 0.4377224199288256, "train_speed(iter/s)": 1.439147 }, { "epoch": 2.413564114648044, "grad_norm": 7.14446496963501, "learning_rate": 5.271412874608644e-05, "loss": 2.3742630004882814, "memory(GiB)": 77.56, "step": 56335, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.439137 }, { "epoch": 2.4137783299772932, "grad_norm": 4.664318084716797, "learning_rate": 5.270740887075801e-05, "loss": 2.212387466430664, "memory(GiB)": 77.56, "step": 56340, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.439101 }, { "epoch": 2.413992545306542, "grad_norm": 6.09464168548584, "learning_rate": 5.2700688946382345e-05, "loss": 2.399026298522949, "memory(GiB)": 77.56, "step": 56345, "token_acc": 0.532, "train_speed(iter/s)": 1.439106 }, { "epoch": 2.414206760635791, "grad_norm": 4.864884376525879, "learning_rate": 5.269396897308117e-05, "loss": 2.4771289825439453, "memory(GiB)": 77.56, "step": 56350, "token_acc": 0.4641638225255973, "train_speed(iter/s)": 1.439107 }, { "epoch": 2.41442097596504, "grad_norm": 4.9332499504089355, "learning_rate": 5.2687248950976206e-05, "loss": 2.2081476211547852, "memory(GiB)": 77.56, "step": 56355, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.439111 }, { "epoch": 2.414635191294289, "grad_norm": 5.544666767120361, "learning_rate": 5.268052888018922e-05, "loss": 2.853831481933594, "memory(GiB)": 77.56, "step": 56360, "token_acc": 0.4186046511627907, "train_speed(iter/s)": 1.439141 }, { "epoch": 2.4148494066235378, "grad_norm": 4.919857025146484, "learning_rate": 5.267380876084192e-05, "loss": 2.4408655166625977, "memory(GiB)": 77.56, "step": 56365, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.439159 }, { "epoch": 2.415063621952787, "grad_norm": 5.91994047164917, "learning_rate": 5.266708859305608e-05, "loss": 2.340895080566406, "memory(GiB)": 77.56, "step": 56370, "token_acc": 0.5691823899371069, "train_speed(iter/s)": 1.439135 }, { "epoch": 2.415277837282036, "grad_norm": 3.824577808380127, "learning_rate": 5.266036837695344e-05, "loss": 2.38179931640625, "memory(GiB)": 77.56, "step": 56375, "token_acc": 0.5109717868338558, "train_speed(iter/s)": 1.439141 }, { "epoch": 2.4154920526112846, "grad_norm": 5.2734832763671875, "learning_rate": 5.26536481126557e-05, "loss": 2.5354665756225585, "memory(GiB)": 77.56, "step": 56380, "token_acc": 0.49731182795698925, "train_speed(iter/s)": 1.43916 }, { "epoch": 2.415706267940534, "grad_norm": 5.425694942474365, "learning_rate": 5.264692780028465e-05, "loss": 2.5977821350097656, "memory(GiB)": 77.56, "step": 56385, "token_acc": 0.45302013422818793, "train_speed(iter/s)": 1.43918 }, { "epoch": 2.4159204832697827, "grad_norm": 4.564004421234131, "learning_rate": 5.264020743996203e-05, "loss": 2.2885391235351564, "memory(GiB)": 77.56, "step": 56390, "token_acc": 0.5337423312883436, "train_speed(iter/s)": 1.439182 }, { "epoch": 2.4161346985990315, "grad_norm": 7.108612537384033, "learning_rate": 5.263348703180956e-05, "loss": 2.2892339706420897, "memory(GiB)": 77.56, "step": 56395, "token_acc": 0.516245487364621, "train_speed(iter/s)": 1.43917 }, { "epoch": 2.4163489139282808, "grad_norm": 4.639871120452881, "learning_rate": 5.262676657594901e-05, "loss": 2.316487503051758, "memory(GiB)": 77.56, "step": 56400, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.439195 }, { "epoch": 2.4165631292575296, "grad_norm": 6.653697967529297, "learning_rate": 5.2620046072502124e-05, "loss": 2.4122287750244142, "memory(GiB)": 77.56, "step": 56405, "token_acc": 0.5, "train_speed(iter/s)": 1.439211 }, { "epoch": 2.4167773445867784, "grad_norm": 5.839682579040527, "learning_rate": 5.2613325521590626e-05, "loss": 2.5080310821533205, "memory(GiB)": 77.56, "step": 56410, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.439188 }, { "epoch": 2.4169915599160277, "grad_norm": 4.128732681274414, "learning_rate": 5.26066049233363e-05, "loss": 2.1906267166137696, "memory(GiB)": 77.56, "step": 56415, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.439202 }, { "epoch": 2.4172057752452765, "grad_norm": 7.49148416519165, "learning_rate": 5.259988427786088e-05, "loss": 2.7538579940795898, "memory(GiB)": 77.56, "step": 56420, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.439206 }, { "epoch": 2.4174199905745253, "grad_norm": 6.461582183837891, "learning_rate": 5.259316358528611e-05, "loss": 2.402140426635742, "memory(GiB)": 77.56, "step": 56425, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.43923 }, { "epoch": 2.4176342059037745, "grad_norm": 5.6993207931518555, "learning_rate": 5.2586442845733765e-05, "loss": 2.5821022033691405, "memory(GiB)": 77.56, "step": 56430, "token_acc": 0.4672131147540984, "train_speed(iter/s)": 1.439221 }, { "epoch": 2.4178484212330233, "grad_norm": 5.566256999969482, "learning_rate": 5.257972205932558e-05, "loss": 2.52720947265625, "memory(GiB)": 77.56, "step": 56435, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.439227 }, { "epoch": 2.418062636562272, "grad_norm": 5.295451641082764, "learning_rate": 5.257300122618329e-05, "loss": 2.3013282775878907, "memory(GiB)": 77.56, "step": 56440, "token_acc": 0.4837758112094395, "train_speed(iter/s)": 1.439242 }, { "epoch": 2.4182768518915214, "grad_norm": 6.468355178833008, "learning_rate": 5.2566280346428687e-05, "loss": 2.83559513092041, "memory(GiB)": 77.56, "step": 56445, "token_acc": 0.4246987951807229, "train_speed(iter/s)": 1.439258 }, { "epoch": 2.4184910672207702, "grad_norm": 5.487728118896484, "learning_rate": 5.25595594201835e-05, "loss": 2.2627710342407226, "memory(GiB)": 77.56, "step": 56450, "token_acc": 0.5420875420875421, "train_speed(iter/s)": 1.439229 }, { "epoch": 2.418705282550019, "grad_norm": 4.941653251647949, "learning_rate": 5.2552838447569485e-05, "loss": 2.6452669143676757, "memory(GiB)": 77.56, "step": 56455, "token_acc": 0.4658753709198813, "train_speed(iter/s)": 1.439222 }, { "epoch": 2.4189194978792683, "grad_norm": 5.474538803100586, "learning_rate": 5.254611742870842e-05, "loss": 2.5214195251464844, "memory(GiB)": 77.56, "step": 56460, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.439258 }, { "epoch": 2.419133713208517, "grad_norm": 5.894259929656982, "learning_rate": 5.253939636372206e-05, "loss": 2.303129959106445, "memory(GiB)": 77.56, "step": 56465, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.439264 }, { "epoch": 2.419347928537766, "grad_norm": 5.728356838226318, "learning_rate": 5.253267525273213e-05, "loss": 2.2912128448486326, "memory(GiB)": 77.56, "step": 56470, "token_acc": 0.48417721518987344, "train_speed(iter/s)": 1.439263 }, { "epoch": 2.419562143867015, "grad_norm": 4.11015510559082, "learning_rate": 5.252595409586042e-05, "loss": 2.379174995422363, "memory(GiB)": 77.56, "step": 56475, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.439264 }, { "epoch": 2.419776359196264, "grad_norm": 6.598214626312256, "learning_rate": 5.251923289322868e-05, "loss": 2.0697322845458985, "memory(GiB)": 77.56, "step": 56480, "token_acc": 0.5134099616858238, "train_speed(iter/s)": 1.439272 }, { "epoch": 2.419990574525513, "grad_norm": 5.734899997711182, "learning_rate": 5.251251164495868e-05, "loss": 2.4696388244628906, "memory(GiB)": 77.56, "step": 56485, "token_acc": 0.4887459807073955, "train_speed(iter/s)": 1.439277 }, { "epoch": 2.420204789854762, "grad_norm": 9.856157302856445, "learning_rate": 5.250579035117217e-05, "loss": 2.546694755554199, "memory(GiB)": 77.56, "step": 56490, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.439247 }, { "epoch": 2.420419005184011, "grad_norm": 4.791378021240234, "learning_rate": 5.249906901199091e-05, "loss": 2.635769844055176, "memory(GiB)": 77.56, "step": 56495, "token_acc": 0.4261744966442953, "train_speed(iter/s)": 1.439223 }, { "epoch": 2.4206332205132597, "grad_norm": 6.565951824188232, "learning_rate": 5.2492347627536686e-05, "loss": 2.485715866088867, "memory(GiB)": 77.56, "step": 56500, "token_acc": 0.5232558139534884, "train_speed(iter/s)": 1.439203 }, { "epoch": 2.4206332205132597, "eval_loss": 2.3672709465026855, "eval_runtime": 14.3838, "eval_samples_per_second": 6.952, "eval_steps_per_second": 6.952, "eval_token_acc": 0.4813218390804598, "step": 56500 }, { "epoch": 2.420847435842509, "grad_norm": 5.80452823638916, "learning_rate": 5.248562619793124e-05, "loss": 2.574502182006836, "memory(GiB)": 77.56, "step": 56505, "token_acc": 0.48422090729783035, "train_speed(iter/s)": 1.438658 }, { "epoch": 2.4210616511717578, "grad_norm": 4.908410549163818, "learning_rate": 5.247890472329634e-05, "loss": 2.420794677734375, "memory(GiB)": 77.56, "step": 56510, "token_acc": 0.4954954954954955, "train_speed(iter/s)": 1.438666 }, { "epoch": 2.4212758665010066, "grad_norm": 5.8564043045043945, "learning_rate": 5.247218320375376e-05, "loss": 2.382805824279785, "memory(GiB)": 77.56, "step": 56515, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.438673 }, { "epoch": 2.421490081830256, "grad_norm": 5.025363922119141, "learning_rate": 5.246546163942526e-05, "loss": 2.334534454345703, "memory(GiB)": 77.56, "step": 56520, "token_acc": 0.4770992366412214, "train_speed(iter/s)": 1.43862 }, { "epoch": 2.4217042971595046, "grad_norm": 7.295405387878418, "learning_rate": 5.2458740030432595e-05, "loss": 2.5426416397094727, "memory(GiB)": 77.56, "step": 56525, "token_acc": 0.43006993006993005, "train_speed(iter/s)": 1.438625 }, { "epoch": 2.4219185124887534, "grad_norm": 3.918203115463257, "learning_rate": 5.245201837689757e-05, "loss": 2.103429985046387, "memory(GiB)": 77.56, "step": 56530, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 1.438638 }, { "epoch": 2.4221327278180027, "grad_norm": 5.002352714538574, "learning_rate": 5.2445296678941935e-05, "loss": 2.5431236267089843, "memory(GiB)": 77.56, "step": 56535, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.438645 }, { "epoch": 2.4223469431472515, "grad_norm": 6.707949161529541, "learning_rate": 5.243857493668743e-05, "loss": 2.640879821777344, "memory(GiB)": 77.56, "step": 56540, "token_acc": 0.450920245398773, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.4225611584765003, "grad_norm": 4.90098237991333, "learning_rate": 5.2431853150255886e-05, "loss": 2.680977439880371, "memory(GiB)": 77.56, "step": 56545, "token_acc": 0.4554140127388535, "train_speed(iter/s)": 1.438666 }, { "epoch": 2.4227753738057496, "grad_norm": 6.017824172973633, "learning_rate": 5.2425131319769024e-05, "loss": 2.978159523010254, "memory(GiB)": 77.56, "step": 56550, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.438687 }, { "epoch": 2.4229895891349984, "grad_norm": 7.294832229614258, "learning_rate": 5.241840944534862e-05, "loss": 1.836798095703125, "memory(GiB)": 77.56, "step": 56555, "token_acc": 0.5097276264591439, "train_speed(iter/s)": 1.43869 }, { "epoch": 2.423203804464247, "grad_norm": 5.722614765167236, "learning_rate": 5.241168752711648e-05, "loss": 2.312526321411133, "memory(GiB)": 77.56, "step": 56560, "token_acc": 0.5019607843137255, "train_speed(iter/s)": 1.438706 }, { "epoch": 2.4234180197934965, "grad_norm": 6.166382312774658, "learning_rate": 5.240496556519435e-05, "loss": 2.3768888473510743, "memory(GiB)": 77.56, "step": 56565, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.438743 }, { "epoch": 2.4236322351227453, "grad_norm": 5.268062114715576, "learning_rate": 5.239824355970401e-05, "loss": 2.379104804992676, "memory(GiB)": 77.56, "step": 56570, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.438767 }, { "epoch": 2.4238464504519945, "grad_norm": 5.939847946166992, "learning_rate": 5.239152151076725e-05, "loss": 2.3494287490844727, "memory(GiB)": 77.56, "step": 56575, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 1.43875 }, { "epoch": 2.4240606657812434, "grad_norm": 5.592379570007324, "learning_rate": 5.2384799418505824e-05, "loss": 2.7172195434570314, "memory(GiB)": 77.56, "step": 56580, "token_acc": 0.4541984732824427, "train_speed(iter/s)": 1.438787 }, { "epoch": 2.424274881110492, "grad_norm": 5.186420917510986, "learning_rate": 5.237807728304152e-05, "loss": 2.4347774505615236, "memory(GiB)": 77.56, "step": 56585, "token_acc": 0.5150214592274678, "train_speed(iter/s)": 1.438816 }, { "epoch": 2.4244890964397414, "grad_norm": 4.3312554359436035, "learning_rate": 5.2371355104496125e-05, "loss": 2.3911298751831054, "memory(GiB)": 77.56, "step": 56590, "token_acc": 0.4745222929936306, "train_speed(iter/s)": 1.438838 }, { "epoch": 2.4247033117689902, "grad_norm": 4.943553924560547, "learning_rate": 5.236463288299139e-05, "loss": 2.390595054626465, "memory(GiB)": 77.56, "step": 56595, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.438839 }, { "epoch": 2.424917527098239, "grad_norm": 4.774374485015869, "learning_rate": 5.235791061864912e-05, "loss": 2.2636978149414064, "memory(GiB)": 77.56, "step": 56600, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.438826 }, { "epoch": 2.4251317424274883, "grad_norm": 6.53861141204834, "learning_rate": 5.2351188311591093e-05, "loss": 2.4385959625244142, "memory(GiB)": 77.56, "step": 56605, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.438834 }, { "epoch": 2.425345957756737, "grad_norm": 4.330164909362793, "learning_rate": 5.234446596193907e-05, "loss": 2.604307174682617, "memory(GiB)": 77.56, "step": 56610, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.438849 }, { "epoch": 2.425560173085986, "grad_norm": 5.388030052185059, "learning_rate": 5.233774356981487e-05, "loss": 2.4409034729003904, "memory(GiB)": 77.56, "step": 56615, "token_acc": 0.506578947368421, "train_speed(iter/s)": 1.43885 }, { "epoch": 2.425774388415235, "grad_norm": 4.841846942901611, "learning_rate": 5.233102113534024e-05, "loss": 2.645443916320801, "memory(GiB)": 77.56, "step": 56620, "token_acc": 0.4406779661016949, "train_speed(iter/s)": 1.438847 }, { "epoch": 2.425988603744484, "grad_norm": 5.512052059173584, "learning_rate": 5.2324298658636974e-05, "loss": 2.2702354431152343, "memory(GiB)": 77.56, "step": 56625, "token_acc": 0.50814332247557, "train_speed(iter/s)": 1.438854 }, { "epoch": 2.426202819073733, "grad_norm": 6.381377220153809, "learning_rate": 5.231757613982686e-05, "loss": 2.223843574523926, "memory(GiB)": 77.56, "step": 56630, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438861 }, { "epoch": 2.426417034402982, "grad_norm": 4.889789581298828, "learning_rate": 5.2310853579031674e-05, "loss": 2.463627815246582, "memory(GiB)": 77.56, "step": 56635, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.438827 }, { "epoch": 2.426631249732231, "grad_norm": 6.635883808135986, "learning_rate": 5.23041309763732e-05, "loss": 2.3669376373291016, "memory(GiB)": 77.56, "step": 56640, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.43884 }, { "epoch": 2.4268454650614797, "grad_norm": 5.636005401611328, "learning_rate": 5.229740833197325e-05, "loss": 2.5137474060058596, "memory(GiB)": 77.56, "step": 56645, "token_acc": 0.44904458598726116, "train_speed(iter/s)": 1.438832 }, { "epoch": 2.427059680390729, "grad_norm": 5.689830303192139, "learning_rate": 5.2290685645953597e-05, "loss": 2.5631351470947266, "memory(GiB)": 77.56, "step": 56650, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.438847 }, { "epoch": 2.4272738957199778, "grad_norm": 4.363366603851318, "learning_rate": 5.2283962918436014e-05, "loss": 2.565142059326172, "memory(GiB)": 77.56, "step": 56655, "token_acc": 0.4908424908424908, "train_speed(iter/s)": 1.438872 }, { "epoch": 2.4274881110492266, "grad_norm": 7.218557357788086, "learning_rate": 5.227724014954231e-05, "loss": 1.9128873825073243, "memory(GiB)": 77.56, "step": 56660, "token_acc": 0.5481171548117155, "train_speed(iter/s)": 1.438892 }, { "epoch": 2.427702326378476, "grad_norm": 6.1528544425964355, "learning_rate": 5.227051733939425e-05, "loss": 2.5260448455810547, "memory(GiB)": 77.56, "step": 56665, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.438884 }, { "epoch": 2.4279165417077246, "grad_norm": 4.401196002960205, "learning_rate": 5.2263794488113635e-05, "loss": 2.6199535369873046, "memory(GiB)": 77.56, "step": 56670, "token_acc": 0.48265895953757226, "train_speed(iter/s)": 1.438881 }, { "epoch": 2.4281307570369735, "grad_norm": 6.6801838874816895, "learning_rate": 5.225707159582227e-05, "loss": 2.549765396118164, "memory(GiB)": 77.56, "step": 56675, "token_acc": 0.5138461538461538, "train_speed(iter/s)": 1.438884 }, { "epoch": 2.4283449723662227, "grad_norm": 5.841605186462402, "learning_rate": 5.225034866264193e-05, "loss": 2.4959110260009765, "memory(GiB)": 77.56, "step": 56680, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.438894 }, { "epoch": 2.4285591876954715, "grad_norm": 4.549417018890381, "learning_rate": 5.224362568869442e-05, "loss": 2.639225387573242, "memory(GiB)": 77.56, "step": 56685, "token_acc": 0.4806201550387597, "train_speed(iter/s)": 1.438891 }, { "epoch": 2.4287734030247203, "grad_norm": 4.565102577209473, "learning_rate": 5.223690267410153e-05, "loss": 2.490378189086914, "memory(GiB)": 77.56, "step": 56690, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438884 }, { "epoch": 2.4289876183539696, "grad_norm": 5.574451923370361, "learning_rate": 5.223017961898504e-05, "loss": 2.2715522766113283, "memory(GiB)": 77.56, "step": 56695, "token_acc": 0.48627450980392156, "train_speed(iter/s)": 1.438835 }, { "epoch": 2.4292018336832184, "grad_norm": 5.546135425567627, "learning_rate": 5.222345652346675e-05, "loss": 2.2152938842773438, "memory(GiB)": 77.56, "step": 56700, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438871 }, { "epoch": 2.429416049012467, "grad_norm": 4.191052436828613, "learning_rate": 5.221673338766847e-05, "loss": 2.5945505142211913, "memory(GiB)": 77.56, "step": 56705, "token_acc": 0.4899713467048711, "train_speed(iter/s)": 1.438887 }, { "epoch": 2.4296302643417165, "grad_norm": 10.171424865722656, "learning_rate": 5.221001021171198e-05, "loss": 2.2691066741943358, "memory(GiB)": 77.56, "step": 56710, "token_acc": 0.549800796812749, "train_speed(iter/s)": 1.43891 }, { "epoch": 2.4298444796709653, "grad_norm": 4.609151840209961, "learning_rate": 5.220328699571908e-05, "loss": 2.553591728210449, "memory(GiB)": 77.56, "step": 56715, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.438921 }, { "epoch": 2.430058695000214, "grad_norm": 6.131407737731934, "learning_rate": 5.219656373981158e-05, "loss": 2.3142824172973633, "memory(GiB)": 77.56, "step": 56720, "token_acc": 0.5101351351351351, "train_speed(iter/s)": 1.438902 }, { "epoch": 2.4302729103294634, "grad_norm": 5.306214332580566, "learning_rate": 5.218984044411126e-05, "loss": 2.3295936584472656, "memory(GiB)": 77.56, "step": 56725, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.438918 }, { "epoch": 2.430487125658712, "grad_norm": 4.491667747497559, "learning_rate": 5.218311710873992e-05, "loss": 2.4540481567382812, "memory(GiB)": 77.56, "step": 56730, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.438947 }, { "epoch": 2.430701340987961, "grad_norm": 6.126719951629639, "learning_rate": 5.2176393733819376e-05, "loss": 2.548937225341797, "memory(GiB)": 77.56, "step": 56735, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.438955 }, { "epoch": 2.4309155563172102, "grad_norm": 5.413206100463867, "learning_rate": 5.2169670319471406e-05, "loss": 2.453196144104004, "memory(GiB)": 77.56, "step": 56740, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.438978 }, { "epoch": 2.431129771646459, "grad_norm": 4.76425838470459, "learning_rate": 5.216294686581783e-05, "loss": 2.3705583572387696, "memory(GiB)": 77.56, "step": 56745, "token_acc": 0.49514563106796117, "train_speed(iter/s)": 1.438976 }, { "epoch": 2.431343986975708, "grad_norm": 6.396388053894043, "learning_rate": 5.215622337298044e-05, "loss": 2.5731929779052733, "memory(GiB)": 77.56, "step": 56750, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.438967 }, { "epoch": 2.431558202304957, "grad_norm": 6.104854106903076, "learning_rate": 5.214949984108104e-05, "loss": 2.5788434982299804, "memory(GiB)": 77.56, "step": 56755, "token_acc": 0.44591029023746703, "train_speed(iter/s)": 1.438978 }, { "epoch": 2.431772417634206, "grad_norm": 4.484294414520264, "learning_rate": 5.214277627024144e-05, "loss": 2.128470230102539, "memory(GiB)": 77.56, "step": 56760, "token_acc": 0.5089605734767025, "train_speed(iter/s)": 1.439011 }, { "epoch": 2.4319866329634547, "grad_norm": 7.305736064910889, "learning_rate": 5.2136052660583444e-05, "loss": 2.4432231903076174, "memory(GiB)": 77.56, "step": 56765, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.43901 }, { "epoch": 2.432200848292704, "grad_norm": 6.269150733947754, "learning_rate": 5.212932901222883e-05, "loss": 2.531831169128418, "memory(GiB)": 77.56, "step": 56770, "token_acc": 0.4969512195121951, "train_speed(iter/s)": 1.439 }, { "epoch": 2.432415063621953, "grad_norm": 4.121421813964844, "learning_rate": 5.212260532529945e-05, "loss": 2.3903039932250976, "memory(GiB)": 77.56, "step": 56775, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.43901 }, { "epoch": 2.4326292789512016, "grad_norm": 4.335596561431885, "learning_rate": 5.211588159991707e-05, "loss": 2.445465087890625, "memory(GiB)": 77.56, "step": 56780, "token_acc": 0.4599406528189911, "train_speed(iter/s)": 1.438985 }, { "epoch": 2.432843494280451, "grad_norm": 4.750160217285156, "learning_rate": 5.210915783620349e-05, "loss": 2.7006240844726563, "memory(GiB)": 77.56, "step": 56785, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.439003 }, { "epoch": 2.4330577096096997, "grad_norm": 4.748763084411621, "learning_rate": 5.2102434034280566e-05, "loss": 2.5845464706420898, "memory(GiB)": 77.56, "step": 56790, "token_acc": 0.4774011299435028, "train_speed(iter/s)": 1.438998 }, { "epoch": 2.4332719249389485, "grad_norm": 4.500877380371094, "learning_rate": 5.2095710194270067e-05, "loss": 2.089221954345703, "memory(GiB)": 77.56, "step": 56795, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.439039 }, { "epoch": 2.4334861402681978, "grad_norm": 5.059268951416016, "learning_rate": 5.208898631629381e-05, "loss": 2.2982500076293944, "memory(GiB)": 77.56, "step": 56800, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 1.439033 }, { "epoch": 2.4337003555974466, "grad_norm": 6.589064598083496, "learning_rate": 5.208226240047362e-05, "loss": 2.342428970336914, "memory(GiB)": 77.56, "step": 56805, "token_acc": 0.5433070866141733, "train_speed(iter/s)": 1.439044 }, { "epoch": 2.4339145709266954, "grad_norm": 7.438954830169678, "learning_rate": 5.207553844693128e-05, "loss": 2.438463592529297, "memory(GiB)": 77.56, "step": 56810, "token_acc": 0.44155844155844154, "train_speed(iter/s)": 1.439056 }, { "epoch": 2.4341287862559446, "grad_norm": 5.0491108894348145, "learning_rate": 5.206881445578861e-05, "loss": 2.3306510925292967, "memory(GiB)": 77.56, "step": 56815, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.439014 }, { "epoch": 2.4343430015851935, "grad_norm": 4.765693664550781, "learning_rate": 5.206209042716742e-05, "loss": 2.502008628845215, "memory(GiB)": 77.56, "step": 56820, "token_acc": 0.4983277591973244, "train_speed(iter/s)": 1.439021 }, { "epoch": 2.4345572169144423, "grad_norm": 5.881290912628174, "learning_rate": 5.205536636118955e-05, "loss": 2.302350616455078, "memory(GiB)": 77.56, "step": 56825, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.43904 }, { "epoch": 2.4347714322436915, "grad_norm": 5.645346164703369, "learning_rate": 5.204864225797676e-05, "loss": 2.7165205001831056, "memory(GiB)": 77.56, "step": 56830, "token_acc": 0.44785276073619634, "train_speed(iter/s)": 1.439037 }, { "epoch": 2.4349856475729403, "grad_norm": 5.8589653968811035, "learning_rate": 5.204191811765092e-05, "loss": 2.419597625732422, "memory(GiB)": 77.56, "step": 56835, "token_acc": 0.49074074074074076, "train_speed(iter/s)": 1.439038 }, { "epoch": 2.435199862902189, "grad_norm": 6.1219024658203125, "learning_rate": 5.203519394033382e-05, "loss": 2.6928516387939454, "memory(GiB)": 77.56, "step": 56840, "token_acc": 0.43795620437956206, "train_speed(iter/s)": 1.439057 }, { "epoch": 2.4354140782314384, "grad_norm": 8.250514030456543, "learning_rate": 5.202846972614726e-05, "loss": 2.279958724975586, "memory(GiB)": 77.56, "step": 56845, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.439059 }, { "epoch": 2.435628293560687, "grad_norm": 4.9896721839904785, "learning_rate": 5.2021745475213076e-05, "loss": 2.6677433013916017, "memory(GiB)": 77.56, "step": 56850, "token_acc": 0.46540880503144655, "train_speed(iter/s)": 1.439051 }, { "epoch": 2.435842508889936, "grad_norm": 5.202160835266113, "learning_rate": 5.201502118765307e-05, "loss": 2.9514438629150392, "memory(GiB)": 77.56, "step": 56855, "token_acc": 0.436950146627566, "train_speed(iter/s)": 1.439084 }, { "epoch": 2.4360567242191853, "grad_norm": 5.538449764251709, "learning_rate": 5.200829686358906e-05, "loss": 2.3525764465332033, "memory(GiB)": 77.56, "step": 56860, "token_acc": 0.44912280701754387, "train_speed(iter/s)": 1.439094 }, { "epoch": 2.436270939548434, "grad_norm": 4.442500114440918, "learning_rate": 5.2001572503142873e-05, "loss": 2.54138069152832, "memory(GiB)": 77.56, "step": 56865, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.439052 }, { "epoch": 2.436485154877683, "grad_norm": 6.37665319442749, "learning_rate": 5.1994848106436334e-05, "loss": 2.4878376007080076, "memory(GiB)": 77.56, "step": 56870, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.439055 }, { "epoch": 2.436699370206932, "grad_norm": 5.775146961212158, "learning_rate": 5.198812367359123e-05, "loss": 2.33713436126709, "memory(GiB)": 77.56, "step": 56875, "token_acc": 0.5469255663430421, "train_speed(iter/s)": 1.43904 }, { "epoch": 2.436913585536181, "grad_norm": 6.4485344886779785, "learning_rate": 5.198139920472942e-05, "loss": 2.4708410263061524, "memory(GiB)": 77.56, "step": 56880, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.439056 }, { "epoch": 2.43712780086543, "grad_norm": 5.370219707489014, "learning_rate": 5.1974674699972684e-05, "loss": 2.2273605346679686, "memory(GiB)": 77.56, "step": 56885, "token_acc": 0.5338645418326693, "train_speed(iter/s)": 1.439077 }, { "epoch": 2.437342016194679, "grad_norm": 5.101024627685547, "learning_rate": 5.196795015944288e-05, "loss": 2.219680404663086, "memory(GiB)": 77.56, "step": 56890, "token_acc": 0.501432664756447, "train_speed(iter/s)": 1.439104 }, { "epoch": 2.437556231523928, "grad_norm": 6.431314468383789, "learning_rate": 5.19612255832618e-05, "loss": 2.4655418395996094, "memory(GiB)": 77.56, "step": 56895, "token_acc": 0.5190839694656488, "train_speed(iter/s)": 1.439112 }, { "epoch": 2.4377704468531767, "grad_norm": 5.0001373291015625, "learning_rate": 5.195450097155128e-05, "loss": 2.5330574035644533, "memory(GiB)": 77.56, "step": 56900, "token_acc": 0.47126436781609193, "train_speed(iter/s)": 1.4391 }, { "epoch": 2.437984662182426, "grad_norm": 5.0970940589904785, "learning_rate": 5.194777632443315e-05, "loss": 2.1877840042114256, "memory(GiB)": 77.56, "step": 56905, "token_acc": 0.49673202614379086, "train_speed(iter/s)": 1.439057 }, { "epoch": 2.4381988775116747, "grad_norm": 5.015025615692139, "learning_rate": 5.194105164202924e-05, "loss": 2.518899917602539, "memory(GiB)": 77.56, "step": 56910, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.43904 }, { "epoch": 2.4384130928409236, "grad_norm": 5.5324883460998535, "learning_rate": 5.1934326924461326e-05, "loss": 2.56801700592041, "memory(GiB)": 77.56, "step": 56915, "token_acc": 0.47181008902077154, "train_speed(iter/s)": 1.43906 }, { "epoch": 2.438627308170173, "grad_norm": 6.703329086303711, "learning_rate": 5.192760217185129e-05, "loss": 2.689248275756836, "memory(GiB)": 77.56, "step": 56920, "token_acc": 0.4862068965517241, "train_speed(iter/s)": 1.43907 }, { "epoch": 2.4388415234994216, "grad_norm": 5.418503284454346, "learning_rate": 5.192087738432092e-05, "loss": 2.197000503540039, "memory(GiB)": 77.56, "step": 56925, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.439047 }, { "epoch": 2.4390557388286704, "grad_norm": 8.2334623336792, "learning_rate": 5.191415256199205e-05, "loss": 2.6109909057617187, "memory(GiB)": 77.56, "step": 56930, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.43908 }, { "epoch": 2.4392699541579197, "grad_norm": 5.991984844207764, "learning_rate": 5.190742770498652e-05, "loss": 2.408322334289551, "memory(GiB)": 77.56, "step": 56935, "token_acc": 0.47678018575851394, "train_speed(iter/s)": 1.439097 }, { "epoch": 2.4394841694871685, "grad_norm": 8.74606704711914, "learning_rate": 5.190070281342615e-05, "loss": 2.6613529205322264, "memory(GiB)": 77.56, "step": 56940, "token_acc": 0.4576719576719577, "train_speed(iter/s)": 1.4391 }, { "epoch": 2.4396983848164173, "grad_norm": 4.8893022537231445, "learning_rate": 5.189397788743275e-05, "loss": 2.3952220916748046, "memory(GiB)": 77.56, "step": 56945, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.43911 }, { "epoch": 2.4399126001456666, "grad_norm": 5.606250286102295, "learning_rate": 5.188725292712818e-05, "loss": 2.571638488769531, "memory(GiB)": 77.56, "step": 56950, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.439127 }, { "epoch": 2.4401268154749154, "grad_norm": 6.874665260314941, "learning_rate": 5.188052793263426e-05, "loss": 2.331051254272461, "memory(GiB)": 77.56, "step": 56955, "token_acc": 0.5325670498084292, "train_speed(iter/s)": 1.43912 }, { "epoch": 2.440341030804164, "grad_norm": 5.383266448974609, "learning_rate": 5.1873802904072786e-05, "loss": 2.647352600097656, "memory(GiB)": 77.56, "step": 56960, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.439116 }, { "epoch": 2.4405552461334135, "grad_norm": 4.835541725158691, "learning_rate": 5.1867077841565635e-05, "loss": 2.5778778076171873, "memory(GiB)": 77.56, "step": 56965, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 1.43916 }, { "epoch": 2.4407694614626623, "grad_norm": 5.5413079261779785, "learning_rate": 5.186035274523461e-05, "loss": 2.4015180587768556, "memory(GiB)": 77.56, "step": 56970, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.439156 }, { "epoch": 2.440983676791911, "grad_norm": 4.444792747497559, "learning_rate": 5.1853627615201536e-05, "loss": 2.411921501159668, "memory(GiB)": 77.56, "step": 56975, "token_acc": 0.4733893557422969, "train_speed(iter/s)": 1.439144 }, { "epoch": 2.4411978921211603, "grad_norm": 7.380198001861572, "learning_rate": 5.184690245158829e-05, "loss": 2.441391372680664, "memory(GiB)": 77.56, "step": 56980, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.439137 }, { "epoch": 2.441412107450409, "grad_norm": 4.234613418579102, "learning_rate": 5.1840177254516666e-05, "loss": 2.4302005767822266, "memory(GiB)": 77.56, "step": 56985, "token_acc": 0.45671641791044776, "train_speed(iter/s)": 1.439167 }, { "epoch": 2.441626322779658, "grad_norm": 5.636434078216553, "learning_rate": 5.183345202410849e-05, "loss": 2.2529296875, "memory(GiB)": 77.56, "step": 56990, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.439184 }, { "epoch": 2.441840538108907, "grad_norm": 4.551118850708008, "learning_rate": 5.182672676048561e-05, "loss": 2.4376110076904296, "memory(GiB)": 77.56, "step": 56995, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.43918 }, { "epoch": 2.442054753438156, "grad_norm": 5.243435382843018, "learning_rate": 5.182000146376986e-05, "loss": 2.241717529296875, "memory(GiB)": 77.56, "step": 57000, "token_acc": 0.5340909090909091, "train_speed(iter/s)": 1.439178 }, { "epoch": 2.442054753438156, "eval_loss": 2.3264853954315186, "eval_runtime": 14.1731, "eval_samples_per_second": 7.056, "eval_steps_per_second": 7.056, "eval_token_acc": 0.4582763337893297, "step": 57000 }, { "epoch": 2.442268968767405, "grad_norm": 5.387640476226807, "learning_rate": 5.181327613408309e-05, "loss": 2.4150400161743164, "memory(GiB)": 77.56, "step": 57005, "token_acc": 0.4698318496538081, "train_speed(iter/s)": 1.438633 }, { "epoch": 2.442483184096654, "grad_norm": 5.350733757019043, "learning_rate": 5.1806550771547115e-05, "loss": 2.360303497314453, "memory(GiB)": 77.56, "step": 57010, "token_acc": 0.5196078431372549, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.442697399425903, "grad_norm": 4.9536943435668945, "learning_rate": 5.179982537628378e-05, "loss": 2.185806655883789, "memory(GiB)": 77.56, "step": 57015, "token_acc": 0.5490909090909091, "train_speed(iter/s)": 1.43866 }, { "epoch": 2.4429116147551517, "grad_norm": 4.514573097229004, "learning_rate": 5.1793099948414925e-05, "loss": 2.4682483673095703, "memory(GiB)": 77.56, "step": 57020, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.438666 }, { "epoch": 2.443125830084401, "grad_norm": 5.264124870300293, "learning_rate": 5.1786374488062375e-05, "loss": 2.4311029434204103, "memory(GiB)": 77.56, "step": 57025, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.438658 }, { "epoch": 2.44334004541365, "grad_norm": 9.807960510253906, "learning_rate": 5.1779648995347975e-05, "loss": 2.5561866760253906, "memory(GiB)": 77.56, "step": 57030, "token_acc": 0.4440993788819876, "train_speed(iter/s)": 1.438642 }, { "epoch": 2.4435542607428986, "grad_norm": 5.882599353790283, "learning_rate": 5.177292347039358e-05, "loss": 2.78738956451416, "memory(GiB)": 77.56, "step": 57035, "token_acc": 0.43137254901960786, "train_speed(iter/s)": 1.43862 }, { "epoch": 2.443768476072148, "grad_norm": 5.670618534088135, "learning_rate": 5.176619791332099e-05, "loss": 2.327910232543945, "memory(GiB)": 77.56, "step": 57040, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.438604 }, { "epoch": 2.4439826914013967, "grad_norm": 4.727508068084717, "learning_rate": 5.175947232425207e-05, "loss": 2.5686370849609377, "memory(GiB)": 77.56, "step": 57045, "token_acc": 0.5050505050505051, "train_speed(iter/s)": 1.43864 }, { "epoch": 2.4441969067306455, "grad_norm": 7.624876976013184, "learning_rate": 5.1752746703308664e-05, "loss": 2.6940095901489256, "memory(GiB)": 77.56, "step": 57050, "token_acc": 0.45244956772334294, "train_speed(iter/s)": 1.4386 }, { "epoch": 2.4444111220598947, "grad_norm": 6.475161075592041, "learning_rate": 5.174602105061262e-05, "loss": 2.572902297973633, "memory(GiB)": 77.56, "step": 57055, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.438609 }, { "epoch": 2.4446253373891436, "grad_norm": 4.965032577514648, "learning_rate": 5.1739295366285745e-05, "loss": 2.3567081451416017, "memory(GiB)": 77.56, "step": 57060, "token_acc": 0.49640287769784175, "train_speed(iter/s)": 1.438601 }, { "epoch": 2.4448395527183924, "grad_norm": 7.131523609161377, "learning_rate": 5.173256965044991e-05, "loss": 2.5855518341064454, "memory(GiB)": 77.56, "step": 57065, "token_acc": 0.4559748427672956, "train_speed(iter/s)": 1.438606 }, { "epoch": 2.4450537680476416, "grad_norm": 5.767808437347412, "learning_rate": 5.1725843903226966e-05, "loss": 2.420552062988281, "memory(GiB)": 77.56, "step": 57070, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.438609 }, { "epoch": 2.4452679833768904, "grad_norm": 6.030223846435547, "learning_rate": 5.171911812473872e-05, "loss": 2.4342273712158202, "memory(GiB)": 77.56, "step": 57075, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.438636 }, { "epoch": 2.4454821987061393, "grad_norm": 9.87091064453125, "learning_rate": 5.171239231510704e-05, "loss": 2.0547002792358398, "memory(GiB)": 77.56, "step": 57080, "token_acc": 0.5568181818181818, "train_speed(iter/s)": 1.438622 }, { "epoch": 2.4456964140353885, "grad_norm": 4.279675483703613, "learning_rate": 5.1705666474453785e-05, "loss": 2.2895599365234376, "memory(GiB)": 77.56, "step": 57085, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.438598 }, { "epoch": 2.4459106293646373, "grad_norm": 7.536383628845215, "learning_rate": 5.1698940602900756e-05, "loss": 2.5207191467285157, "memory(GiB)": 77.56, "step": 57090, "token_acc": 0.452, "train_speed(iter/s)": 1.438621 }, { "epoch": 2.446124844693886, "grad_norm": 4.840969085693359, "learning_rate": 5.169221470056984e-05, "loss": 2.4955591201782226, "memory(GiB)": 77.56, "step": 57095, "token_acc": 0.46319018404907975, "train_speed(iter/s)": 1.438623 }, { "epoch": 2.4463390600231354, "grad_norm": 6.287506580352783, "learning_rate": 5.168548876758288e-05, "loss": 2.747912788391113, "memory(GiB)": 77.56, "step": 57100, "token_acc": 0.43288590604026844, "train_speed(iter/s)": 1.438627 }, { "epoch": 2.446553275352384, "grad_norm": 5.823217391967773, "learning_rate": 5.1678762804061685e-05, "loss": 2.7979211807250977, "memory(GiB)": 77.56, "step": 57105, "token_acc": 0.44285714285714284, "train_speed(iter/s)": 1.438647 }, { "epoch": 2.446767490681633, "grad_norm": 5.685962677001953, "learning_rate": 5.167203681012813e-05, "loss": 2.4895488739013674, "memory(GiB)": 77.56, "step": 57110, "token_acc": 0.46875, "train_speed(iter/s)": 1.438635 }, { "epoch": 2.4469817060108823, "grad_norm": 6.21130895614624, "learning_rate": 5.1665310785904066e-05, "loss": 2.4648815155029298, "memory(GiB)": 77.56, "step": 57115, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.438612 }, { "epoch": 2.447195921340131, "grad_norm": 6.889520645141602, "learning_rate": 5.165858473151133e-05, "loss": 2.6206830978393554, "memory(GiB)": 77.56, "step": 57120, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438596 }, { "epoch": 2.44741013666938, "grad_norm": 5.61447811126709, "learning_rate": 5.165185864707178e-05, "loss": 2.7210128784179686, "memory(GiB)": 77.56, "step": 57125, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.438527 }, { "epoch": 2.447624351998629, "grad_norm": 5.406766891479492, "learning_rate": 5.164513253270727e-05, "loss": 2.4325815200805665, "memory(GiB)": 77.56, "step": 57130, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.438558 }, { "epoch": 2.447838567327878, "grad_norm": 15.234235763549805, "learning_rate": 5.163840638853963e-05, "loss": 2.159321403503418, "memory(GiB)": 77.56, "step": 57135, "token_acc": 0.5442622950819672, "train_speed(iter/s)": 1.438562 }, { "epoch": 2.448052782657127, "grad_norm": 5.185170650482178, "learning_rate": 5.163168021469073e-05, "loss": 2.4400049209594727, "memory(GiB)": 77.56, "step": 57140, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.438554 }, { "epoch": 2.448266997986376, "grad_norm": 4.750624179840088, "learning_rate": 5.16249540112824e-05, "loss": 2.6198116302490235, "memory(GiB)": 77.56, "step": 57145, "token_acc": 0.45484949832775917, "train_speed(iter/s)": 1.438551 }, { "epoch": 2.448481213315625, "grad_norm": 5.566971778869629, "learning_rate": 5.161822777843651e-05, "loss": 2.4444602966308593, "memory(GiB)": 77.56, "step": 57150, "token_acc": 0.48253968253968255, "train_speed(iter/s)": 1.438559 }, { "epoch": 2.4486954286448737, "grad_norm": 5.943056583404541, "learning_rate": 5.1611501516274904e-05, "loss": 2.2202121734619142, "memory(GiB)": 77.56, "step": 57155, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.43857 }, { "epoch": 2.448909643974123, "grad_norm": 6.960949420928955, "learning_rate": 5.160477522491943e-05, "loss": 2.309294509887695, "memory(GiB)": 77.56, "step": 57160, "token_acc": 0.5078740157480315, "train_speed(iter/s)": 1.438576 }, { "epoch": 2.4491238593033717, "grad_norm": 6.0811381340026855, "learning_rate": 5.159804890449196e-05, "loss": 2.507560920715332, "memory(GiB)": 77.56, "step": 57165, "token_acc": 0.4659090909090909, "train_speed(iter/s)": 1.438583 }, { "epoch": 2.4493380746326205, "grad_norm": 5.529671669006348, "learning_rate": 5.159132255511434e-05, "loss": 2.1746746063232423, "memory(GiB)": 77.56, "step": 57170, "token_acc": 0.5, "train_speed(iter/s)": 1.438576 }, { "epoch": 2.44955228996187, "grad_norm": 3.9784586429595947, "learning_rate": 5.15845961769084e-05, "loss": 2.395619583129883, "memory(GiB)": 77.56, "step": 57175, "token_acc": 0.4622356495468278, "train_speed(iter/s)": 1.438594 }, { "epoch": 2.4497665052911186, "grad_norm": 5.22333288192749, "learning_rate": 5.157786976999602e-05, "loss": 2.3220149993896486, "memory(GiB)": 77.56, "step": 57180, "token_acc": 0.5282258064516129, "train_speed(iter/s)": 1.438581 }, { "epoch": 2.4499807206203674, "grad_norm": 6.703789234161377, "learning_rate": 5.157114333449906e-05, "loss": 2.1752740859985353, "memory(GiB)": 77.56, "step": 57185, "token_acc": 0.5376712328767124, "train_speed(iter/s)": 1.43858 }, { "epoch": 2.4501949359496167, "grad_norm": 7.221778869628906, "learning_rate": 5.1564416870539346e-05, "loss": 2.0189912796020506, "memory(GiB)": 77.56, "step": 57190, "token_acc": 0.5091463414634146, "train_speed(iter/s)": 1.438593 }, { "epoch": 2.4504091512788655, "grad_norm": 3.823230266571045, "learning_rate": 5.155769037823876e-05, "loss": 2.3497365951538085, "memory(GiB)": 77.56, "step": 57195, "token_acc": 0.47186147186147187, "train_speed(iter/s)": 1.438614 }, { "epoch": 2.4506233666081143, "grad_norm": 6.960144996643066, "learning_rate": 5.155096385771917e-05, "loss": 2.5806880950927735, "memory(GiB)": 77.56, "step": 57200, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.43865 }, { "epoch": 2.4508375819373636, "grad_norm": 5.065337657928467, "learning_rate": 5.1544237309102395e-05, "loss": 2.4673130035400392, "memory(GiB)": 77.56, "step": 57205, "token_acc": 0.503125, "train_speed(iter/s)": 1.438626 }, { "epoch": 2.4510517972666124, "grad_norm": 4.6179118156433105, "learning_rate": 5.153751073251032e-05, "loss": 2.3746482849121096, "memory(GiB)": 77.56, "step": 57210, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.438648 }, { "epoch": 2.451266012595861, "grad_norm": 4.876397609710693, "learning_rate": 5.15307841280648e-05, "loss": 2.256892967224121, "memory(GiB)": 77.56, "step": 57215, "token_acc": 0.5077399380804953, "train_speed(iter/s)": 1.438686 }, { "epoch": 2.4514802279251104, "grad_norm": 5.341527938842773, "learning_rate": 5.152405749588768e-05, "loss": 2.0741554260253907, "memory(GiB)": 77.56, "step": 57220, "token_acc": 0.5080645161290323, "train_speed(iter/s)": 1.43866 }, { "epoch": 2.4516944432543593, "grad_norm": 4.602362155914307, "learning_rate": 5.151733083610083e-05, "loss": 2.2884637832641603, "memory(GiB)": 77.56, "step": 57225, "token_acc": 0.5335820895522388, "train_speed(iter/s)": 1.438687 }, { "epoch": 2.451908658583608, "grad_norm": 4.572879791259766, "learning_rate": 5.15106041488261e-05, "loss": 2.3112060546875, "memory(GiB)": 77.56, "step": 57230, "token_acc": 0.5379939209726444, "train_speed(iter/s)": 1.438686 }, { "epoch": 2.4521228739128573, "grad_norm": 5.744421482086182, "learning_rate": 5.1503877434185366e-05, "loss": 2.58641357421875, "memory(GiB)": 77.56, "step": 57235, "token_acc": 0.4503105590062112, "train_speed(iter/s)": 1.438704 }, { "epoch": 2.452337089242106, "grad_norm": 4.397494792938232, "learning_rate": 5.149715069230049e-05, "loss": 2.3761240005493165, "memory(GiB)": 77.56, "step": 57240, "token_acc": 0.4783950617283951, "train_speed(iter/s)": 1.438708 }, { "epoch": 2.452551304571355, "grad_norm": 5.352940082550049, "learning_rate": 5.149042392329333e-05, "loss": 2.3313854217529295, "memory(GiB)": 77.56, "step": 57245, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 1.438723 }, { "epoch": 2.452765519900604, "grad_norm": 6.408934593200684, "learning_rate": 5.148369712728572e-05, "loss": 2.553639221191406, "memory(GiB)": 77.56, "step": 57250, "token_acc": 0.45126353790613716, "train_speed(iter/s)": 1.438731 }, { "epoch": 2.452979735229853, "grad_norm": 4.5913238525390625, "learning_rate": 5.1476970304399565e-05, "loss": 2.281085968017578, "memory(GiB)": 77.56, "step": 57255, "token_acc": 0.5, "train_speed(iter/s)": 1.438725 }, { "epoch": 2.453193950559102, "grad_norm": 5.944769859313965, "learning_rate": 5.1470243454756694e-05, "loss": 2.485824966430664, "memory(GiB)": 77.56, "step": 57260, "token_acc": 0.493006993006993, "train_speed(iter/s)": 1.438738 }, { "epoch": 2.453408165888351, "grad_norm": 5.902571201324463, "learning_rate": 5.146351657847898e-05, "loss": 2.5209325790405273, "memory(GiB)": 77.56, "step": 57265, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.438736 }, { "epoch": 2.4536223812176, "grad_norm": 5.643834590911865, "learning_rate": 5.14567896756883e-05, "loss": 2.4684677124023438, "memory(GiB)": 77.56, "step": 57270, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.438734 }, { "epoch": 2.4538365965468487, "grad_norm": 5.017353534698486, "learning_rate": 5.145006274650652e-05, "loss": 2.446546936035156, "memory(GiB)": 77.56, "step": 57275, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.438725 }, { "epoch": 2.454050811876098, "grad_norm": 8.103410720825195, "learning_rate": 5.144333579105547e-05, "loss": 2.547798919677734, "memory(GiB)": 77.56, "step": 57280, "token_acc": 0.4393939393939394, "train_speed(iter/s)": 1.43872 }, { "epoch": 2.454265027205347, "grad_norm": 5.171634674072266, "learning_rate": 5.143660880945705e-05, "loss": 2.5604633331298827, "memory(GiB)": 77.56, "step": 57285, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.438728 }, { "epoch": 2.4544792425345956, "grad_norm": 8.173238754272461, "learning_rate": 5.1429881801833116e-05, "loss": 2.5359878540039062, "memory(GiB)": 77.56, "step": 57290, "token_acc": 0.4810126582278481, "train_speed(iter/s)": 1.438712 }, { "epoch": 2.454693457863845, "grad_norm": 5.89133882522583, "learning_rate": 5.1423154768305524e-05, "loss": 2.49134521484375, "memory(GiB)": 77.56, "step": 57295, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.438722 }, { "epoch": 2.4549076731930937, "grad_norm": 4.935211181640625, "learning_rate": 5.1416427708996154e-05, "loss": 2.411406135559082, "memory(GiB)": 77.56, "step": 57300, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.43874 }, { "epoch": 2.4551218885223425, "grad_norm": 6.120079040527344, "learning_rate": 5.1409700624026855e-05, "loss": 2.2102788925170898, "memory(GiB)": 77.56, "step": 57305, "token_acc": 0.5442477876106194, "train_speed(iter/s)": 1.438738 }, { "epoch": 2.4553361038515917, "grad_norm": 5.650693416595459, "learning_rate": 5.1402973513519527e-05, "loss": 2.5443172454833984, "memory(GiB)": 77.56, "step": 57310, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.438761 }, { "epoch": 2.4555503191808405, "grad_norm": 5.473184585571289, "learning_rate": 5.139624637759601e-05, "loss": 2.262398529052734, "memory(GiB)": 77.56, "step": 57315, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.43876 }, { "epoch": 2.4557645345100894, "grad_norm": 5.1242194175720215, "learning_rate": 5.138951921637817e-05, "loss": 2.3731449127197264, "memory(GiB)": 77.56, "step": 57320, "token_acc": 0.5127388535031847, "train_speed(iter/s)": 1.438765 }, { "epoch": 2.4559787498393386, "grad_norm": 8.556614875793457, "learning_rate": 5.1382792029987904e-05, "loss": 2.6999895095825197, "memory(GiB)": 77.56, "step": 57325, "token_acc": 0.44673539518900346, "train_speed(iter/s)": 1.438797 }, { "epoch": 2.4561929651685874, "grad_norm": 5.725161075592041, "learning_rate": 5.137606481854705e-05, "loss": 2.1397125244140627, "memory(GiB)": 77.56, "step": 57330, "token_acc": 0.555045871559633, "train_speed(iter/s)": 1.4388 }, { "epoch": 2.4564071804978362, "grad_norm": 4.597769737243652, "learning_rate": 5.136933758217749e-05, "loss": 2.4590587615966797, "memory(GiB)": 77.56, "step": 57335, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 1.438797 }, { "epoch": 2.4566213958270855, "grad_norm": 5.495504379272461, "learning_rate": 5.13626103210011e-05, "loss": 2.698875617980957, "memory(GiB)": 77.56, "step": 57340, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.438833 }, { "epoch": 2.4568356111563343, "grad_norm": 5.055111408233643, "learning_rate": 5.135588303513975e-05, "loss": 2.6068859100341797, "memory(GiB)": 77.56, "step": 57345, "token_acc": 0.5251572327044025, "train_speed(iter/s)": 1.438833 }, { "epoch": 2.457049826485583, "grad_norm": 6.525391101837158, "learning_rate": 5.1349155724715294e-05, "loss": 2.4812475204467774, "memory(GiB)": 77.56, "step": 57350, "token_acc": 0.49823321554770317, "train_speed(iter/s)": 1.438851 }, { "epoch": 2.4572640418148324, "grad_norm": 5.498703956604004, "learning_rate": 5.1342428389849626e-05, "loss": 2.586172866821289, "memory(GiB)": 77.56, "step": 57355, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.438869 }, { "epoch": 2.457478257144081, "grad_norm": 5.428562641143799, "learning_rate": 5.133570103066462e-05, "loss": 2.24481201171875, "memory(GiB)": 77.56, "step": 57360, "token_acc": 0.486404833836858, "train_speed(iter/s)": 1.438866 }, { "epoch": 2.45769247247333, "grad_norm": 5.332465648651123, "learning_rate": 5.1328973647282116e-05, "loss": 2.378965377807617, "memory(GiB)": 77.56, "step": 57365, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.438882 }, { "epoch": 2.4579066878025793, "grad_norm": 6.097894668579102, "learning_rate": 5.1322246239824024e-05, "loss": 2.6744949340820314, "memory(GiB)": 77.56, "step": 57370, "token_acc": 0.46774193548387094, "train_speed(iter/s)": 1.438893 }, { "epoch": 2.458120903131828, "grad_norm": 5.567917823791504, "learning_rate": 5.131551880841219e-05, "loss": 2.3654218673706056, "memory(GiB)": 77.56, "step": 57375, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.438905 }, { "epoch": 2.458335118461077, "grad_norm": 4.319338321685791, "learning_rate": 5.1308791353168484e-05, "loss": 2.2937015533447265, "memory(GiB)": 77.56, "step": 57380, "token_acc": 0.5, "train_speed(iter/s)": 1.438933 }, { "epoch": 2.458549333790326, "grad_norm": 4.3087897300720215, "learning_rate": 5.130206387421482e-05, "loss": 2.769311714172363, "memory(GiB)": 77.56, "step": 57385, "token_acc": 0.44518272425249167, "train_speed(iter/s)": 1.438954 }, { "epoch": 2.458763549119575, "grad_norm": 5.06441068649292, "learning_rate": 5.1295336371673045e-05, "loss": 2.5599733352661134, "memory(GiB)": 77.56, "step": 57390, "token_acc": 0.44518272425249167, "train_speed(iter/s)": 1.438952 }, { "epoch": 2.4589777644488238, "grad_norm": 5.019257068634033, "learning_rate": 5.1288608845665034e-05, "loss": 2.318602752685547, "memory(GiB)": 77.56, "step": 57395, "token_acc": 0.5437956204379562, "train_speed(iter/s)": 1.438967 }, { "epoch": 2.459191979778073, "grad_norm": 6.315002918243408, "learning_rate": 5.128188129631266e-05, "loss": 2.4263404846191405, "memory(GiB)": 77.56, "step": 57400, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.438995 }, { "epoch": 2.459406195107322, "grad_norm": 5.129228591918945, "learning_rate": 5.12751537237378e-05, "loss": 2.459724235534668, "memory(GiB)": 77.56, "step": 57405, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.439039 }, { "epoch": 2.4596204104365706, "grad_norm": 7.693178653717041, "learning_rate": 5.126842612806234e-05, "loss": 2.2444610595703125, "memory(GiB)": 77.56, "step": 57410, "token_acc": 0.5399239543726235, "train_speed(iter/s)": 1.439052 }, { "epoch": 2.45983462576582, "grad_norm": 7.329493999481201, "learning_rate": 5.1261698509408154e-05, "loss": 2.370320510864258, "memory(GiB)": 77.56, "step": 57415, "token_acc": 0.45526315789473687, "train_speed(iter/s)": 1.439072 }, { "epoch": 2.4600488410950687, "grad_norm": 5.697206497192383, "learning_rate": 5.125497086789711e-05, "loss": 2.334388542175293, "memory(GiB)": 77.56, "step": 57420, "token_acc": 0.4979919678714859, "train_speed(iter/s)": 1.439084 }, { "epoch": 2.460263056424318, "grad_norm": 4.582203388214111, "learning_rate": 5.1248243203651094e-05, "loss": 2.381041145324707, "memory(GiB)": 77.56, "step": 57425, "token_acc": 0.5387096774193548, "train_speed(iter/s)": 1.439076 }, { "epoch": 2.460477271753567, "grad_norm": 5.91025447845459, "learning_rate": 5.124151551679198e-05, "loss": 2.4503894805908204, "memory(GiB)": 77.56, "step": 57430, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.439093 }, { "epoch": 2.4606914870828156, "grad_norm": 5.3099894523620605, "learning_rate": 5.1234787807441655e-05, "loss": 2.023750114440918, "memory(GiB)": 77.56, "step": 57435, "token_acc": 0.55859375, "train_speed(iter/s)": 1.439109 }, { "epoch": 2.460905702412065, "grad_norm": 5.285356044769287, "learning_rate": 5.122806007572198e-05, "loss": 2.383597564697266, "memory(GiB)": 77.56, "step": 57440, "token_acc": 0.476038338658147, "train_speed(iter/s)": 1.43909 }, { "epoch": 2.4611199177413137, "grad_norm": 5.4618120193481445, "learning_rate": 5.1221332321754855e-05, "loss": 2.4647523880004885, "memory(GiB)": 77.56, "step": 57445, "token_acc": 0.45874587458745875, "train_speed(iter/s)": 1.439097 }, { "epoch": 2.4613341330705625, "grad_norm": 4.592566013336182, "learning_rate": 5.1214604545662135e-05, "loss": 2.218570327758789, "memory(GiB)": 77.56, "step": 57450, "token_acc": 0.5038461538461538, "train_speed(iter/s)": 1.439115 }, { "epoch": 2.4615483483998117, "grad_norm": 5.485901355743408, "learning_rate": 5.120787674756573e-05, "loss": 2.2112655639648438, "memory(GiB)": 77.56, "step": 57455, "token_acc": 0.5682656826568265, "train_speed(iter/s)": 1.439121 }, { "epoch": 2.4617625637290605, "grad_norm": 4.783279895782471, "learning_rate": 5.120114892758749e-05, "loss": 2.4660833358764647, "memory(GiB)": 77.56, "step": 57460, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.439138 }, { "epoch": 2.4619767790583094, "grad_norm": 4.32682466506958, "learning_rate": 5.119442108584932e-05, "loss": 2.0927169799804686, "memory(GiB)": 77.56, "step": 57465, "token_acc": 0.5773584905660377, "train_speed(iter/s)": 1.439118 }, { "epoch": 2.4621909943875586, "grad_norm": 5.225218772888184, "learning_rate": 5.11876932224731e-05, "loss": 2.220818519592285, "memory(GiB)": 77.56, "step": 57470, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.439116 }, { "epoch": 2.4624052097168074, "grad_norm": 5.393348693847656, "learning_rate": 5.11809653375807e-05, "loss": 2.4311344146728517, "memory(GiB)": 77.56, "step": 57475, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.439126 }, { "epoch": 2.4626194250460562, "grad_norm": 4.696230888366699, "learning_rate": 5.1174237431293994e-05, "loss": 2.447711181640625, "memory(GiB)": 77.56, "step": 57480, "token_acc": 0.4969512195121951, "train_speed(iter/s)": 1.439154 }, { "epoch": 2.4628336403753055, "grad_norm": 6.5843353271484375, "learning_rate": 5.116750950373487e-05, "loss": 2.7485700607299806, "memory(GiB)": 77.56, "step": 57485, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.439167 }, { "epoch": 2.4630478557045543, "grad_norm": 4.670713424682617, "learning_rate": 5.1160781555025225e-05, "loss": 2.2367443084716796, "memory(GiB)": 77.56, "step": 57490, "token_acc": 0.51875, "train_speed(iter/s)": 1.43917 }, { "epoch": 2.463262071033803, "grad_norm": 5.058983325958252, "learning_rate": 5.115405358528693e-05, "loss": 2.2882545471191404, "memory(GiB)": 77.56, "step": 57495, "token_acc": 0.5409252669039146, "train_speed(iter/s)": 1.43917 }, { "epoch": 2.4634762863630524, "grad_norm": 6.193905353546143, "learning_rate": 5.114732559464188e-05, "loss": 2.2321853637695312, "memory(GiB)": 77.56, "step": 57500, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.439199 }, { "epoch": 2.4634762863630524, "eval_loss": 2.181269884109497, "eval_runtime": 13.2963, "eval_samples_per_second": 7.521, "eval_steps_per_second": 7.521, "eval_token_acc": 0.48021828103683495, "step": 57500 }, { "epoch": 2.463690501692301, "grad_norm": 4.969756603240967, "learning_rate": 5.114059758321196e-05, "loss": 2.528905487060547, "memory(GiB)": 77.56, "step": 57505, "token_acc": 0.4846449136276392, "train_speed(iter/s)": 1.438707 }, { "epoch": 2.46390471702155, "grad_norm": 6.251031875610352, "learning_rate": 5.1133869551119016e-05, "loss": 2.0313507080078126, "memory(GiB)": 77.56, "step": 57510, "token_acc": 0.5367647058823529, "train_speed(iter/s)": 1.438716 }, { "epoch": 2.4641189323507993, "grad_norm": 6.339770793914795, "learning_rate": 5.112714149848499e-05, "loss": 2.6249263763427733, "memory(GiB)": 77.56, "step": 57515, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.438712 }, { "epoch": 2.464333147680048, "grad_norm": 4.177573204040527, "learning_rate": 5.112041342543171e-05, "loss": 2.6119308471679688, "memory(GiB)": 77.56, "step": 57520, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.438713 }, { "epoch": 2.464547363009297, "grad_norm": 6.631214618682861, "learning_rate": 5.1113685332081094e-05, "loss": 2.4411493301391602, "memory(GiB)": 77.56, "step": 57525, "token_acc": 0.4983277591973244, "train_speed(iter/s)": 1.438738 }, { "epoch": 2.464761578338546, "grad_norm": 4.378689765930176, "learning_rate": 5.110695721855505e-05, "loss": 2.7039278030395506, "memory(GiB)": 77.56, "step": 57530, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.438764 }, { "epoch": 2.464975793667795, "grad_norm": 8.13716983795166, "learning_rate": 5.1100229084975424e-05, "loss": 2.189742851257324, "memory(GiB)": 77.56, "step": 57535, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438778 }, { "epoch": 2.4651900089970438, "grad_norm": 4.951664924621582, "learning_rate": 5.109350093146411e-05, "loss": 2.170023536682129, "memory(GiB)": 77.56, "step": 57540, "token_acc": 0.5394736842105263, "train_speed(iter/s)": 1.438818 }, { "epoch": 2.465404224326293, "grad_norm": 6.1526923179626465, "learning_rate": 5.108677275814301e-05, "loss": 2.6115438461303713, "memory(GiB)": 77.56, "step": 57545, "token_acc": 0.43137254901960786, "train_speed(iter/s)": 1.438806 }, { "epoch": 2.465618439655542, "grad_norm": 8.08043384552002, "learning_rate": 5.108004456513399e-05, "loss": 2.3119247436523436, "memory(GiB)": 77.56, "step": 57550, "token_acc": 0.4978723404255319, "train_speed(iter/s)": 1.438816 }, { "epoch": 2.4658326549847907, "grad_norm": 5.128757953643799, "learning_rate": 5.107331635255895e-05, "loss": 2.3346920013427734, "memory(GiB)": 77.56, "step": 57555, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.438814 }, { "epoch": 2.46604687031404, "grad_norm": 8.985196113586426, "learning_rate": 5.1066588120539785e-05, "loss": 2.39608211517334, "memory(GiB)": 77.56, "step": 57560, "token_acc": 0.4978540772532189, "train_speed(iter/s)": 1.438858 }, { "epoch": 2.4662610856432887, "grad_norm": 5.544095516204834, "learning_rate": 5.105985986919838e-05, "loss": 2.2582118988037108, "memory(GiB)": 77.56, "step": 57565, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.438869 }, { "epoch": 2.4664753009725375, "grad_norm": 5.5937066078186035, "learning_rate": 5.1053131598656614e-05, "loss": 2.600021171569824, "memory(GiB)": 77.56, "step": 57570, "token_acc": 0.46006389776357826, "train_speed(iter/s)": 1.438824 }, { "epoch": 2.466689516301787, "grad_norm": 5.682216644287109, "learning_rate": 5.104640330903638e-05, "loss": 2.501003646850586, "memory(GiB)": 77.56, "step": 57575, "token_acc": 0.4750733137829912, "train_speed(iter/s)": 1.438818 }, { "epoch": 2.4669037316310356, "grad_norm": 5.798671722412109, "learning_rate": 5.103967500045956e-05, "loss": 2.7489118576049805, "memory(GiB)": 77.56, "step": 57580, "token_acc": 0.466403162055336, "train_speed(iter/s)": 1.4388 }, { "epoch": 2.4671179469602844, "grad_norm": 5.5647382736206055, "learning_rate": 5.1032946673048067e-05, "loss": 2.5334585189819334, "memory(GiB)": 77.56, "step": 57585, "token_acc": 0.5, "train_speed(iter/s)": 1.43882 }, { "epoch": 2.4673321622895337, "grad_norm": 5.787646293640137, "learning_rate": 5.102621832692378e-05, "loss": 2.200487518310547, "memory(GiB)": 77.56, "step": 57590, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.438787 }, { "epoch": 2.4675463776187825, "grad_norm": 5.420490264892578, "learning_rate": 5.1019489962208555e-05, "loss": 2.5062013626098634, "memory(GiB)": 77.56, "step": 57595, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.438791 }, { "epoch": 2.4677605929480313, "grad_norm": 6.231259346008301, "learning_rate": 5.101276157902434e-05, "loss": 2.5847116470336915, "memory(GiB)": 77.56, "step": 57600, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.43882 }, { "epoch": 2.4679748082772806, "grad_norm": 5.21881103515625, "learning_rate": 5.100603317749299e-05, "loss": 2.63265380859375, "memory(GiB)": 77.56, "step": 57605, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.438853 }, { "epoch": 2.4681890236065294, "grad_norm": 5.555099010467529, "learning_rate": 5.099930475773641e-05, "loss": 2.415752410888672, "memory(GiB)": 77.56, "step": 57610, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.438851 }, { "epoch": 2.468403238935778, "grad_norm": 4.607346534729004, "learning_rate": 5.099257631987648e-05, "loss": 2.609634017944336, "memory(GiB)": 77.56, "step": 57615, "token_acc": 0.44363636363636366, "train_speed(iter/s)": 1.438851 }, { "epoch": 2.4686174542650274, "grad_norm": 4.508793354034424, "learning_rate": 5.098584786403512e-05, "loss": 2.289145851135254, "memory(GiB)": 77.56, "step": 57620, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.438852 }, { "epoch": 2.4688316695942762, "grad_norm": 6.425941467285156, "learning_rate": 5.0979119390334175e-05, "loss": 2.796823501586914, "memory(GiB)": 77.56, "step": 57625, "token_acc": 0.4491017964071856, "train_speed(iter/s)": 1.438838 }, { "epoch": 2.469045884923525, "grad_norm": 5.5024518966674805, "learning_rate": 5.097239089889558e-05, "loss": 2.4876276016235352, "memory(GiB)": 77.56, "step": 57630, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.438831 }, { "epoch": 2.4692601002527743, "grad_norm": 5.610123157501221, "learning_rate": 5.0965662389841196e-05, "loss": 2.4639196395874023, "memory(GiB)": 77.56, "step": 57635, "token_acc": 0.49603174603174605, "train_speed(iter/s)": 1.43885 }, { "epoch": 2.469474315582023, "grad_norm": 6.002385139465332, "learning_rate": 5.095893386329293e-05, "loss": 2.4816234588623045, "memory(GiB)": 77.56, "step": 57640, "token_acc": 0.5080645161290323, "train_speed(iter/s)": 1.438881 }, { "epoch": 2.469688530911272, "grad_norm": 3.819467306137085, "learning_rate": 5.0952205319372706e-05, "loss": 2.254865264892578, "memory(GiB)": 77.56, "step": 57645, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.438904 }, { "epoch": 2.469902746240521, "grad_norm": 6.105497360229492, "learning_rate": 5.094547675820237e-05, "loss": 2.43424072265625, "memory(GiB)": 77.56, "step": 57650, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.438925 }, { "epoch": 2.47011696156977, "grad_norm": 4.623422622680664, "learning_rate": 5.093874817990383e-05, "loss": 2.5657581329345702, "memory(GiB)": 77.56, "step": 57655, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.438949 }, { "epoch": 2.470331176899019, "grad_norm": 5.294508457183838, "learning_rate": 5.0932019584599e-05, "loss": 2.4200065612792967, "memory(GiB)": 77.56, "step": 57660, "token_acc": 0.49107142857142855, "train_speed(iter/s)": 1.438907 }, { "epoch": 2.470545392228268, "grad_norm": 4.910347938537598, "learning_rate": 5.092529097240976e-05, "loss": 2.3683277130126954, "memory(GiB)": 77.56, "step": 57665, "token_acc": 0.525974025974026, "train_speed(iter/s)": 1.438925 }, { "epoch": 2.470759607557517, "grad_norm": 5.081131458282471, "learning_rate": 5.091856234345799e-05, "loss": 2.343279266357422, "memory(GiB)": 77.56, "step": 57670, "token_acc": 0.5, "train_speed(iter/s)": 1.438911 }, { "epoch": 2.4709738228867657, "grad_norm": 4.529219627380371, "learning_rate": 5.0911833697865607e-05, "loss": 2.6960355758666994, "memory(GiB)": 77.56, "step": 57675, "token_acc": 0.4306784660766962, "train_speed(iter/s)": 1.438917 }, { "epoch": 2.471188038216015, "grad_norm": 6.560049057006836, "learning_rate": 5.0905105035754516e-05, "loss": 2.308742332458496, "memory(GiB)": 77.56, "step": 57680, "token_acc": 0.524390243902439, "train_speed(iter/s)": 1.438932 }, { "epoch": 2.4714022535452638, "grad_norm": 6.020293712615967, "learning_rate": 5.089837635724658e-05, "loss": 2.517816925048828, "memory(GiB)": 77.56, "step": 57685, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.438961 }, { "epoch": 2.4716164688745126, "grad_norm": 5.475345611572266, "learning_rate": 5.0891647662463724e-05, "loss": 2.476761817932129, "memory(GiB)": 77.56, "step": 57690, "token_acc": 0.528052805280528, "train_speed(iter/s)": 1.438983 }, { "epoch": 2.471830684203762, "grad_norm": 6.823609828948975, "learning_rate": 5.088491895152784e-05, "loss": 2.4553321838378905, "memory(GiB)": 77.56, "step": 57695, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.438987 }, { "epoch": 2.4720448995330107, "grad_norm": 5.711009979248047, "learning_rate": 5.0878190224560804e-05, "loss": 2.364780807495117, "memory(GiB)": 77.56, "step": 57700, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.438972 }, { "epoch": 2.4722591148622595, "grad_norm": 4.534488201141357, "learning_rate": 5.0871461481684546e-05, "loss": 2.149310302734375, "memory(GiB)": 77.56, "step": 57705, "token_acc": 0.5392857142857143, "train_speed(iter/s)": 1.438975 }, { "epoch": 2.4724733301915087, "grad_norm": 5.547414779663086, "learning_rate": 5.0864732723020915e-05, "loss": 2.160359191894531, "memory(GiB)": 77.56, "step": 57710, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.438949 }, { "epoch": 2.4726875455207575, "grad_norm": 6.99166202545166, "learning_rate": 5.085800394869187e-05, "loss": 2.383811187744141, "memory(GiB)": 77.56, "step": 57715, "token_acc": 0.5257731958762887, "train_speed(iter/s)": 1.438924 }, { "epoch": 2.4729017608500063, "grad_norm": 5.011475086212158, "learning_rate": 5.0851275158819264e-05, "loss": 2.288974380493164, "memory(GiB)": 77.56, "step": 57720, "token_acc": 0.535483870967742, "train_speed(iter/s)": 1.438909 }, { "epoch": 2.4731159761792556, "grad_norm": 6.043013095855713, "learning_rate": 5.084454635352501e-05, "loss": 2.4874574661254885, "memory(GiB)": 77.56, "step": 57725, "token_acc": 0.4984894259818731, "train_speed(iter/s)": 1.438906 }, { "epoch": 2.4733301915085044, "grad_norm": 6.45123291015625, "learning_rate": 5.083781753293102e-05, "loss": 2.1487701416015623, "memory(GiB)": 77.56, "step": 57730, "token_acc": 0.5543071161048689, "train_speed(iter/s)": 1.438903 }, { "epoch": 2.4735444068377532, "grad_norm": 5.536190509796143, "learning_rate": 5.083108869715918e-05, "loss": 2.208243179321289, "memory(GiB)": 77.56, "step": 57735, "token_acc": 0.5518518518518518, "train_speed(iter/s)": 1.438911 }, { "epoch": 2.4737586221670025, "grad_norm": 4.991488456726074, "learning_rate": 5.082435984633137e-05, "loss": 2.4614936828613283, "memory(GiB)": 77.56, "step": 57740, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.43894 }, { "epoch": 2.4739728374962513, "grad_norm": 6.942436695098877, "learning_rate": 5.081763098056951e-05, "loss": 2.446458625793457, "memory(GiB)": 77.56, "step": 57745, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.438936 }, { "epoch": 2.4741870528255, "grad_norm": 4.558429718017578, "learning_rate": 5.08109020999955e-05, "loss": 2.6614004135131837, "memory(GiB)": 77.56, "step": 57750, "token_acc": 0.4670846394984326, "train_speed(iter/s)": 1.438959 }, { "epoch": 2.4744012681547494, "grad_norm": 4.911935806274414, "learning_rate": 5.080417320473124e-05, "loss": 2.335498809814453, "memory(GiB)": 77.56, "step": 57755, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.43895 }, { "epoch": 2.474615483483998, "grad_norm": 4.802637100219727, "learning_rate": 5.0797444294898644e-05, "loss": 2.1629831314086916, "memory(GiB)": 77.56, "step": 57760, "token_acc": 0.5162241887905604, "train_speed(iter/s)": 1.438906 }, { "epoch": 2.474829698813247, "grad_norm": 4.477532863616943, "learning_rate": 5.079071537061959e-05, "loss": 2.095111274719238, "memory(GiB)": 77.56, "step": 57765, "token_acc": 0.5464684014869888, "train_speed(iter/s)": 1.438918 }, { "epoch": 2.4750439141424962, "grad_norm": 5.740736484527588, "learning_rate": 5.078398643201597e-05, "loss": 2.5655784606933594, "memory(GiB)": 77.56, "step": 57770, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.43891 }, { "epoch": 2.475258129471745, "grad_norm": 4.7918901443481445, "learning_rate": 5.077725747920972e-05, "loss": 2.198044013977051, "memory(GiB)": 77.56, "step": 57775, "token_acc": 0.5404411764705882, "train_speed(iter/s)": 1.438938 }, { "epoch": 2.475472344800994, "grad_norm": 4.255255222320557, "learning_rate": 5.07705285123227e-05, "loss": 2.214427185058594, "memory(GiB)": 77.56, "step": 57780, "token_acc": 0.5271565495207667, "train_speed(iter/s)": 1.438929 }, { "epoch": 2.475686560130243, "grad_norm": 5.639316558837891, "learning_rate": 5.076379953147684e-05, "loss": 2.607181167602539, "memory(GiB)": 77.56, "step": 57785, "token_acc": 0.4984709480122324, "train_speed(iter/s)": 1.438902 }, { "epoch": 2.475900775459492, "grad_norm": 4.517269611358643, "learning_rate": 5.075707053679404e-05, "loss": 2.305590057373047, "memory(GiB)": 77.56, "step": 57790, "token_acc": 0.52, "train_speed(iter/s)": 1.438886 }, { "epoch": 2.4761149907887408, "grad_norm": 5.247602462768555, "learning_rate": 5.075034152839621e-05, "loss": 2.4419702529907226, "memory(GiB)": 77.56, "step": 57795, "token_acc": 0.49842271293375395, "train_speed(iter/s)": 1.438863 }, { "epoch": 2.47632920611799, "grad_norm": 3.173375368118286, "learning_rate": 5.074361250640521e-05, "loss": 2.157380294799805, "memory(GiB)": 77.56, "step": 57800, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.438886 }, { "epoch": 2.476543421447239, "grad_norm": 5.00565242767334, "learning_rate": 5.0736883470942986e-05, "loss": 2.3380767822265627, "memory(GiB)": 77.56, "step": 57805, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.438891 }, { "epoch": 2.4767576367764876, "grad_norm": 6.09058141708374, "learning_rate": 5.0730154422131424e-05, "loss": 2.4289512634277344, "memory(GiB)": 77.56, "step": 57810, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.438896 }, { "epoch": 2.476971852105737, "grad_norm": 4.935365676879883, "learning_rate": 5.072342536009245e-05, "loss": 2.480707550048828, "memory(GiB)": 77.56, "step": 57815, "token_acc": 0.5, "train_speed(iter/s)": 1.43889 }, { "epoch": 2.4771860674349857, "grad_norm": 6.091104507446289, "learning_rate": 5.071669628494792e-05, "loss": 2.5238800048828125, "memory(GiB)": 77.56, "step": 57820, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438905 }, { "epoch": 2.4774002827642345, "grad_norm": 8.003154754638672, "learning_rate": 5.070996719681977e-05, "loss": 2.52032527923584, "memory(GiB)": 77.56, "step": 57825, "token_acc": 0.5070921985815603, "train_speed(iter/s)": 1.438923 }, { "epoch": 2.4776144980934838, "grad_norm": 5.366401672363281, "learning_rate": 5.070323809582991e-05, "loss": 2.2390411376953123, "memory(GiB)": 77.56, "step": 57830, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.438885 }, { "epoch": 2.4778287134227326, "grad_norm": 5.645700931549072, "learning_rate": 5.0696508982100225e-05, "loss": 2.2358333587646486, "memory(GiB)": 77.56, "step": 57835, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.438897 }, { "epoch": 2.4780429287519814, "grad_norm": 5.364466667175293, "learning_rate": 5.0689779855752616e-05, "loss": 2.5054306030273437, "memory(GiB)": 77.56, "step": 57840, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.438897 }, { "epoch": 2.4782571440812307, "grad_norm": 6.162137508392334, "learning_rate": 5.068305071690902e-05, "loss": 2.4064870834350587, "memory(GiB)": 77.56, "step": 57845, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.438912 }, { "epoch": 2.4784713594104795, "grad_norm": 4.874151229858398, "learning_rate": 5.067632156569131e-05, "loss": 2.454629898071289, "memory(GiB)": 77.56, "step": 57850, "token_acc": 0.5057915057915058, "train_speed(iter/s)": 1.438915 }, { "epoch": 2.4786855747397283, "grad_norm": 5.7840800285339355, "learning_rate": 5.066959240222138e-05, "loss": 2.785805892944336, "memory(GiB)": 77.56, "step": 57855, "token_acc": 0.4630225080385852, "train_speed(iter/s)": 1.438928 }, { "epoch": 2.4788997900689775, "grad_norm": 7.42020845413208, "learning_rate": 5.066286322662118e-05, "loss": 2.225681495666504, "memory(GiB)": 77.56, "step": 57860, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.438926 }, { "epoch": 2.4791140053982264, "grad_norm": 5.325389862060547, "learning_rate": 5.0656134039012593e-05, "loss": 2.7070350646972656, "memory(GiB)": 77.56, "step": 57865, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.438955 }, { "epoch": 2.479328220727475, "grad_norm": 6.985469818115234, "learning_rate": 5.06494048395175e-05, "loss": 2.329922676086426, "memory(GiB)": 77.56, "step": 57870, "token_acc": 0.47580645161290325, "train_speed(iter/s)": 1.438964 }, { "epoch": 2.4795424360567244, "grad_norm": 5.885095119476318, "learning_rate": 5.064267562825785e-05, "loss": 2.2059085845947264, "memory(GiB)": 77.56, "step": 57875, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.438932 }, { "epoch": 2.4797566513859732, "grad_norm": 5.26485538482666, "learning_rate": 5.0635946405355525e-05, "loss": 2.2520315170288088, "memory(GiB)": 77.56, "step": 57880, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.438942 }, { "epoch": 2.479970866715222, "grad_norm": 5.791314125061035, "learning_rate": 5.062921717093243e-05, "loss": 2.424143600463867, "memory(GiB)": 77.56, "step": 57885, "token_acc": 0.45182724252491696, "train_speed(iter/s)": 1.438966 }, { "epoch": 2.4801850820444713, "grad_norm": 6.202103614807129, "learning_rate": 5.062248792511048e-05, "loss": 2.5702686309814453, "memory(GiB)": 77.56, "step": 57890, "token_acc": 0.44692737430167595, "train_speed(iter/s)": 1.438951 }, { "epoch": 2.48039929737372, "grad_norm": 4.706606388092041, "learning_rate": 5.061575866801157e-05, "loss": 2.291838836669922, "memory(GiB)": 77.56, "step": 57895, "token_acc": 0.5374449339207048, "train_speed(iter/s)": 1.438964 }, { "epoch": 2.480613512702969, "grad_norm": 6.094140529632568, "learning_rate": 5.0609029399757615e-05, "loss": 2.5490455627441406, "memory(GiB)": 77.56, "step": 57900, "token_acc": 0.42662116040955633, "train_speed(iter/s)": 1.438975 }, { "epoch": 2.480827728032218, "grad_norm": 5.127205848693848, "learning_rate": 5.060230012047052e-05, "loss": 2.4769092559814454, "memory(GiB)": 77.56, "step": 57905, "token_acc": 0.4533333333333333, "train_speed(iter/s)": 1.438986 }, { "epoch": 2.481041943361467, "grad_norm": 6.49498987197876, "learning_rate": 5.059557083027221e-05, "loss": 2.4379310607910156, "memory(GiB)": 77.56, "step": 57910, "token_acc": 0.46745562130177515, "train_speed(iter/s)": 1.438978 }, { "epoch": 2.481256158690716, "grad_norm": 5.46910285949707, "learning_rate": 5.058884152928455e-05, "loss": 2.576978302001953, "memory(GiB)": 77.56, "step": 57915, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.439019 }, { "epoch": 2.481470374019965, "grad_norm": 6.400634765625, "learning_rate": 5.0582112217629494e-05, "loss": 2.4977977752685545, "memory(GiB)": 77.56, "step": 57920, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.439024 }, { "epoch": 2.481684589349214, "grad_norm": 4.879184722900391, "learning_rate": 5.0575382895428914e-05, "loss": 2.3481157302856444, "memory(GiB)": 77.56, "step": 57925, "token_acc": 0.4830188679245283, "train_speed(iter/s)": 1.439019 }, { "epoch": 2.4818988046784627, "grad_norm": 5.49393892288208, "learning_rate": 5.0568653562804734e-05, "loss": 2.4221954345703125, "memory(GiB)": 77.56, "step": 57930, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 1.438996 }, { "epoch": 2.482113020007712, "grad_norm": 5.23515510559082, "learning_rate": 5.056192421987888e-05, "loss": 2.602817916870117, "memory(GiB)": 77.56, "step": 57935, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.439021 }, { "epoch": 2.4823272353369608, "grad_norm": 4.713934421539307, "learning_rate": 5.055519486677322e-05, "loss": 2.351328468322754, "memory(GiB)": 77.56, "step": 57940, "token_acc": 0.5346938775510204, "train_speed(iter/s)": 1.43903 }, { "epoch": 2.4825414506662096, "grad_norm": 4.903232097625732, "learning_rate": 5.0548465503609697e-05, "loss": 2.4050979614257812, "memory(GiB)": 77.56, "step": 57945, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.438975 }, { "epoch": 2.482755665995459, "grad_norm": 6.948683738708496, "learning_rate": 5.054173613051021e-05, "loss": 2.4426015853881835, "memory(GiB)": 77.56, "step": 57950, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.439008 }, { "epoch": 2.4829698813247076, "grad_norm": 4.461533069610596, "learning_rate": 5.0535006747596646e-05, "loss": 2.7285572052001954, "memory(GiB)": 77.56, "step": 57955, "token_acc": 0.43922651933701656, "train_speed(iter/s)": 1.439014 }, { "epoch": 2.4831840966539565, "grad_norm": 4.471543788909912, "learning_rate": 5.052827735499095e-05, "loss": 2.9877140045166017, "memory(GiB)": 77.56, "step": 57960, "token_acc": 0.4010152284263959, "train_speed(iter/s)": 1.439014 }, { "epoch": 2.4833983119832057, "grad_norm": 6.1984543800354, "learning_rate": 5.0521547952815005e-05, "loss": 2.3776714324951174, "memory(GiB)": 77.56, "step": 57965, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.43905 }, { "epoch": 2.4836125273124545, "grad_norm": 4.951435565948486, "learning_rate": 5.0514818541190745e-05, "loss": 2.098245620727539, "memory(GiB)": 77.56, "step": 57970, "token_acc": 0.5335463258785943, "train_speed(iter/s)": 1.439056 }, { "epoch": 2.4838267426417033, "grad_norm": 6.0008625984191895, "learning_rate": 5.050808912024004e-05, "loss": 2.408772659301758, "memory(GiB)": 77.56, "step": 57975, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.439033 }, { "epoch": 2.4840409579709526, "grad_norm": 6.2969770431518555, "learning_rate": 5.050135969008485e-05, "loss": 2.5903215408325195, "memory(GiB)": 77.56, "step": 57980, "token_acc": 0.4625, "train_speed(iter/s)": 1.439057 }, { "epoch": 2.4842551733002014, "grad_norm": 4.6040263175964355, "learning_rate": 5.049463025084703e-05, "loss": 2.398129463195801, "memory(GiB)": 77.56, "step": 57985, "token_acc": 0.49691358024691357, "train_speed(iter/s)": 1.439081 }, { "epoch": 2.48446938862945, "grad_norm": 5.366873741149902, "learning_rate": 5.0487900802648544e-05, "loss": 2.522391128540039, "memory(GiB)": 77.56, "step": 57990, "token_acc": 0.5340909090909091, "train_speed(iter/s)": 1.439076 }, { "epoch": 2.4846836039586995, "grad_norm": 4.175821304321289, "learning_rate": 5.048117134561128e-05, "loss": 2.3147741317749024, "memory(GiB)": 77.56, "step": 57995, "token_acc": 0.476, "train_speed(iter/s)": 1.439074 }, { "epoch": 2.4848978192879483, "grad_norm": 5.270333766937256, "learning_rate": 5.0474441879857125e-05, "loss": 2.7343297958374024, "memory(GiB)": 77.56, "step": 58000, "token_acc": 0.471875, "train_speed(iter/s)": 1.439077 }, { "epoch": 2.4848978192879483, "eval_loss": 2.177473306655884, "eval_runtime": 13.7372, "eval_samples_per_second": 7.279, "eval_steps_per_second": 7.279, "eval_token_acc": 0.47198879551820727, "step": 58000 }, { "epoch": 2.485112034617197, "grad_norm": 7.586592674255371, "learning_rate": 5.046771240550801e-05, "loss": 2.7471054077148436, "memory(GiB)": 77.56, "step": 58005, "token_acc": 0.46255924170616114, "train_speed(iter/s)": 1.438559 }, { "epoch": 2.4853262499464464, "grad_norm": 4.5493645668029785, "learning_rate": 5.0460982922685865e-05, "loss": 2.4920017242431642, "memory(GiB)": 77.56, "step": 58010, "token_acc": 0.4803921568627451, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.485540465275695, "grad_norm": 5.337832450866699, "learning_rate": 5.045425343151255e-05, "loss": 2.594310760498047, "memory(GiB)": 77.56, "step": 58015, "token_acc": 0.4509090909090909, "train_speed(iter/s)": 1.43856 }, { "epoch": 2.485754680604944, "grad_norm": 9.150239944458008, "learning_rate": 5.044752393211004e-05, "loss": 2.4332067489624025, "memory(GiB)": 77.56, "step": 58020, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.438579 }, { "epoch": 2.4859688959341932, "grad_norm": 6.049983501434326, "learning_rate": 5.044079442460021e-05, "loss": 2.490147018432617, "memory(GiB)": 77.56, "step": 58025, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.438566 }, { "epoch": 2.486183111263442, "grad_norm": 5.0744452476501465, "learning_rate": 5.043406490910497e-05, "loss": 2.259611129760742, "memory(GiB)": 77.56, "step": 58030, "token_acc": 0.484375, "train_speed(iter/s)": 1.438555 }, { "epoch": 2.486397326592691, "grad_norm": 7.369129657745361, "learning_rate": 5.0427335385746244e-05, "loss": 2.3613285064697265, "memory(GiB)": 77.56, "step": 58035, "token_acc": 0.5105740181268882, "train_speed(iter/s)": 1.438576 }, { "epoch": 2.48661154192194, "grad_norm": 6.428347587585449, "learning_rate": 5.042060585464592e-05, "loss": 2.57977352142334, "memory(GiB)": 77.56, "step": 58040, "token_acc": 0.475, "train_speed(iter/s)": 1.438604 }, { "epoch": 2.486825757251189, "grad_norm": 4.430943965911865, "learning_rate": 5.0413876315925925e-05, "loss": 2.448247718811035, "memory(GiB)": 77.56, "step": 58045, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.438599 }, { "epoch": 2.4870399725804377, "grad_norm": 5.167415142059326, "learning_rate": 5.040714676970819e-05, "loss": 2.2805667877197267, "memory(GiB)": 77.56, "step": 58050, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 1.438604 }, { "epoch": 2.487254187909687, "grad_norm": 5.407455921173096, "learning_rate": 5.04004172161146e-05, "loss": 2.453420639038086, "memory(GiB)": 77.56, "step": 58055, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.43861 }, { "epoch": 2.487468403238936, "grad_norm": 4.259662628173828, "learning_rate": 5.039368765526708e-05, "loss": 1.986313247680664, "memory(GiB)": 77.56, "step": 58060, "token_acc": 0.5276752767527675, "train_speed(iter/s)": 1.438629 }, { "epoch": 2.4876826185681846, "grad_norm": 5.050387382507324, "learning_rate": 5.038695808728754e-05, "loss": 2.3218318939208986, "memory(GiB)": 77.56, "step": 58065, "token_acc": 0.5307692307692308, "train_speed(iter/s)": 1.438643 }, { "epoch": 2.487896833897434, "grad_norm": 7.325018882751465, "learning_rate": 5.038022851229789e-05, "loss": 2.463140678405762, "memory(GiB)": 77.56, "step": 58070, "token_acc": 0.5, "train_speed(iter/s)": 1.438634 }, { "epoch": 2.4881110492266827, "grad_norm": 6.819101810455322, "learning_rate": 5.037349893042005e-05, "loss": 2.5707046508789064, "memory(GiB)": 77.56, "step": 58075, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.43866 }, { "epoch": 2.4883252645559315, "grad_norm": 5.634852409362793, "learning_rate": 5.036676934177591e-05, "loss": 2.4254791259765627, "memory(GiB)": 77.56, "step": 58080, "token_acc": 0.45857988165680474, "train_speed(iter/s)": 1.438679 }, { "epoch": 2.4885394798851808, "grad_norm": 5.327877521514893, "learning_rate": 5.036003974648741e-05, "loss": 2.4051029205322267, "memory(GiB)": 77.56, "step": 58085, "token_acc": 0.47674418604651164, "train_speed(iter/s)": 1.438726 }, { "epoch": 2.4887536952144296, "grad_norm": 7.217409133911133, "learning_rate": 5.0353310144676445e-05, "loss": 2.490331268310547, "memory(GiB)": 77.56, "step": 58090, "token_acc": 0.5193798449612403, "train_speed(iter/s)": 1.43869 }, { "epoch": 2.4889679105436784, "grad_norm": 5.765705585479736, "learning_rate": 5.034658053646495e-05, "loss": 2.113507843017578, "memory(GiB)": 77.56, "step": 58095, "token_acc": 0.5283018867924528, "train_speed(iter/s)": 1.438682 }, { "epoch": 2.4891821258729276, "grad_norm": 6.187605381011963, "learning_rate": 5.033985092197481e-05, "loss": 2.472217559814453, "memory(GiB)": 77.56, "step": 58100, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.438693 }, { "epoch": 2.4893963412021765, "grad_norm": 4.89593505859375, "learning_rate": 5.033312130132796e-05, "loss": 2.4233102798461914, "memory(GiB)": 77.56, "step": 58105, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.438707 }, { "epoch": 2.4896105565314253, "grad_norm": 5.415012836456299, "learning_rate": 5.03263916746463e-05, "loss": 2.6598560333251955, "memory(GiB)": 77.56, "step": 58110, "token_acc": 0.4523076923076923, "train_speed(iter/s)": 1.438724 }, { "epoch": 2.4898247718606745, "grad_norm": 4.77933931350708, "learning_rate": 5.031966204205175e-05, "loss": 2.501139450073242, "memory(GiB)": 77.56, "step": 58115, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.438742 }, { "epoch": 2.4900389871899233, "grad_norm": 6.370707035064697, "learning_rate": 5.0312932403666214e-05, "loss": 2.31909294128418, "memory(GiB)": 77.56, "step": 58120, "token_acc": 0.5321888412017167, "train_speed(iter/s)": 1.438773 }, { "epoch": 2.490253202519172, "grad_norm": 4.952884197235107, "learning_rate": 5.0306202759611634e-05, "loss": 2.438155746459961, "memory(GiB)": 77.56, "step": 58125, "token_acc": 0.49171270718232046, "train_speed(iter/s)": 1.438796 }, { "epoch": 2.4904674178484214, "grad_norm": 4.658510684967041, "learning_rate": 5.0299473110009876e-05, "loss": 2.148599624633789, "memory(GiB)": 77.56, "step": 58130, "token_acc": 0.524, "train_speed(iter/s)": 1.438824 }, { "epoch": 2.49068163317767, "grad_norm": 6.677910804748535, "learning_rate": 5.0292743454982905e-05, "loss": 2.5059249877929686, "memory(GiB)": 77.56, "step": 58135, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.438836 }, { "epoch": 2.490895848506919, "grad_norm": 6.639410972595215, "learning_rate": 5.028601379465261e-05, "loss": 2.759622573852539, "memory(GiB)": 77.56, "step": 58140, "token_acc": 0.45723684210526316, "train_speed(iter/s)": 1.438838 }, { "epoch": 2.4911100638361683, "grad_norm": 5.123559951782227, "learning_rate": 5.0279284129140894e-05, "loss": 2.5407032012939452, "memory(GiB)": 77.56, "step": 58145, "token_acc": 0.46113989637305697, "train_speed(iter/s)": 1.438817 }, { "epoch": 2.491324279165417, "grad_norm": 4.856555461883545, "learning_rate": 5.027255445856969e-05, "loss": 2.4736461639404297, "memory(GiB)": 77.56, "step": 58150, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.438835 }, { "epoch": 2.491538494494666, "grad_norm": 5.092394828796387, "learning_rate": 5.0265824783060914e-05, "loss": 2.3582536697387697, "memory(GiB)": 77.56, "step": 58155, "token_acc": 0.5138339920948617, "train_speed(iter/s)": 1.438825 }, { "epoch": 2.491752709823915, "grad_norm": 5.648548126220703, "learning_rate": 5.025909510273645e-05, "loss": 2.356985092163086, "memory(GiB)": 77.56, "step": 58160, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.438816 }, { "epoch": 2.491966925153164, "grad_norm": 6.463406085968018, "learning_rate": 5.025236541771826e-05, "loss": 2.3251243591308595, "memory(GiB)": 77.56, "step": 58165, "token_acc": 0.5204918032786885, "train_speed(iter/s)": 1.438768 }, { "epoch": 2.492181140482413, "grad_norm": 6.251805782318115, "learning_rate": 5.0245635728128225e-05, "loss": 2.4452255249023436, "memory(GiB)": 77.56, "step": 58170, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.438792 }, { "epoch": 2.492395355811662, "grad_norm": 4.523138046264648, "learning_rate": 5.023890603408827e-05, "loss": 2.1374338150024412, "memory(GiB)": 77.56, "step": 58175, "token_acc": 0.5181518151815182, "train_speed(iter/s)": 1.438803 }, { "epoch": 2.492609571140911, "grad_norm": 4.479569435119629, "learning_rate": 5.0232176335720314e-05, "loss": 2.4535398483276367, "memory(GiB)": 77.56, "step": 58180, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.438813 }, { "epoch": 2.4928237864701597, "grad_norm": 5.842418670654297, "learning_rate": 5.0225446633146247e-05, "loss": 2.688613510131836, "memory(GiB)": 77.56, "step": 58185, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.438798 }, { "epoch": 2.493038001799409, "grad_norm": 5.621906280517578, "learning_rate": 5.021871692648801e-05, "loss": 2.476247215270996, "memory(GiB)": 77.56, "step": 58190, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.438827 }, { "epoch": 2.4932522171286577, "grad_norm": 6.361431121826172, "learning_rate": 5.0211987215867516e-05, "loss": 2.403704833984375, "memory(GiB)": 77.56, "step": 58195, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.438834 }, { "epoch": 2.4934664324579066, "grad_norm": 4.427335262298584, "learning_rate": 5.020525750140668e-05, "loss": 2.3656896591186523, "memory(GiB)": 77.56, "step": 58200, "token_acc": 0.45689655172413796, "train_speed(iter/s)": 1.438856 }, { "epoch": 2.493680647787156, "grad_norm": 4.050919532775879, "learning_rate": 5.01985277832274e-05, "loss": 2.3997989654541017, "memory(GiB)": 77.56, "step": 58205, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.438855 }, { "epoch": 2.4938948631164046, "grad_norm": 6.055706977844238, "learning_rate": 5.0191798061451614e-05, "loss": 2.4805131912231446, "memory(GiB)": 77.56, "step": 58210, "token_acc": 0.483739837398374, "train_speed(iter/s)": 1.438839 }, { "epoch": 2.4941090784456534, "grad_norm": 5.819148063659668, "learning_rate": 5.0185068336201215e-05, "loss": 2.405121612548828, "memory(GiB)": 77.56, "step": 58215, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.438802 }, { "epoch": 2.4943232937749027, "grad_norm": 4.424496650695801, "learning_rate": 5.0178338607598144e-05, "loss": 2.3622785568237306, "memory(GiB)": 77.56, "step": 58220, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.438797 }, { "epoch": 2.4945375091041515, "grad_norm": 6.676568031311035, "learning_rate": 5.0171608875764286e-05, "loss": 2.3665557861328126, "memory(GiB)": 77.56, "step": 58225, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.438766 }, { "epoch": 2.4947517244334003, "grad_norm": 5.545111179351807, "learning_rate": 5.0164879140821586e-05, "loss": 2.52616024017334, "memory(GiB)": 77.56, "step": 58230, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.438747 }, { "epoch": 2.4949659397626496, "grad_norm": 5.15902853012085, "learning_rate": 5.015814940289193e-05, "loss": 2.2114841461181642, "memory(GiB)": 77.56, "step": 58235, "token_acc": 0.5377643504531722, "train_speed(iter/s)": 1.438779 }, { "epoch": 2.4951801550918984, "grad_norm": 3.7115330696105957, "learning_rate": 5.015141966209726e-05, "loss": 2.3777944564819338, "memory(GiB)": 77.56, "step": 58240, "token_acc": 0.519774011299435, "train_speed(iter/s)": 1.438804 }, { "epoch": 2.495394370421147, "grad_norm": 5.075016021728516, "learning_rate": 5.014468991855947e-05, "loss": 2.677511978149414, "memory(GiB)": 77.56, "step": 58245, "token_acc": 0.4676258992805755, "train_speed(iter/s)": 1.438808 }, { "epoch": 2.4956085857503965, "grad_norm": 4.5342278480529785, "learning_rate": 5.01379601724005e-05, "loss": 2.2680641174316407, "memory(GiB)": 77.56, "step": 58250, "token_acc": 0.5115606936416185, "train_speed(iter/s)": 1.438831 }, { "epoch": 2.4958228010796453, "grad_norm": 4.905433177947998, "learning_rate": 5.013123042374226e-05, "loss": 2.5868595123291014, "memory(GiB)": 77.56, "step": 58255, "token_acc": 0.4689265536723164, "train_speed(iter/s)": 1.438847 }, { "epoch": 2.496037016408894, "grad_norm": 7.08261251449585, "learning_rate": 5.012450067270663e-05, "loss": 2.3131925582885744, "memory(GiB)": 77.56, "step": 58260, "token_acc": 0.47634069400630913, "train_speed(iter/s)": 1.438859 }, { "epoch": 2.4962512317381433, "grad_norm": 5.00227689743042, "learning_rate": 5.0117770919415566e-05, "loss": 2.2290515899658203, "memory(GiB)": 77.56, "step": 58265, "token_acc": 0.5625, "train_speed(iter/s)": 1.438847 }, { "epoch": 2.496465447067392, "grad_norm": 4.830747604370117, "learning_rate": 5.011104116399098e-05, "loss": 2.41119441986084, "memory(GiB)": 77.56, "step": 58270, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.43882 }, { "epoch": 2.496679662396641, "grad_norm": 5.60421895980835, "learning_rate": 5.010431140655477e-05, "loss": 2.232265281677246, "memory(GiB)": 77.56, "step": 58275, "token_acc": 0.5193798449612403, "train_speed(iter/s)": 1.438806 }, { "epoch": 2.49689387772589, "grad_norm": 7.05345344543457, "learning_rate": 5.009758164722887e-05, "loss": 2.3105337142944338, "memory(GiB)": 77.56, "step": 58280, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.438793 }, { "epoch": 2.497108093055139, "grad_norm": 6.987645149230957, "learning_rate": 5.0090851886135184e-05, "loss": 2.398975372314453, "memory(GiB)": 77.56, "step": 58285, "token_acc": 0.4495798319327731, "train_speed(iter/s)": 1.438755 }, { "epoch": 2.497322308384388, "grad_norm": 4.815450191497803, "learning_rate": 5.0084122123395615e-05, "loss": 2.3575220108032227, "memory(GiB)": 77.56, "step": 58290, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 1.438779 }, { "epoch": 2.497536523713637, "grad_norm": 5.71390962600708, "learning_rate": 5.007739235913211e-05, "loss": 2.4685995101928713, "memory(GiB)": 77.56, "step": 58295, "token_acc": 0.4592833876221498, "train_speed(iter/s)": 1.438766 }, { "epoch": 2.497750739042886, "grad_norm": 5.189290523529053, "learning_rate": 5.0070662593466564e-05, "loss": 2.3445167541503906, "memory(GiB)": 77.56, "step": 58300, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.438776 }, { "epoch": 2.4979649543721347, "grad_norm": 8.077359199523926, "learning_rate": 5.00639328265209e-05, "loss": 2.5039722442626955, "memory(GiB)": 77.56, "step": 58305, "token_acc": 0.497907949790795, "train_speed(iter/s)": 1.438741 }, { "epoch": 2.498179169701384, "grad_norm": 7.4290242195129395, "learning_rate": 5.0057203058417044e-05, "loss": 2.2886486053466797, "memory(GiB)": 77.56, "step": 58310, "token_acc": 0.48091603053435117, "train_speed(iter/s)": 1.438759 }, { "epoch": 2.498393385030633, "grad_norm": 6.249179840087891, "learning_rate": 5.00504732892769e-05, "loss": 2.6114885330200197, "memory(GiB)": 77.56, "step": 58315, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.438774 }, { "epoch": 2.4986076003598816, "grad_norm": 3.9913127422332764, "learning_rate": 5.004374351922236e-05, "loss": 2.4197288513183595, "memory(GiB)": 77.56, "step": 58320, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.438807 }, { "epoch": 2.498821815689131, "grad_norm": 5.795289039611816, "learning_rate": 5.003701374837539e-05, "loss": 2.378787612915039, "memory(GiB)": 77.56, "step": 58325, "token_acc": 0.4921875, "train_speed(iter/s)": 1.438796 }, { "epoch": 2.4990360310183797, "grad_norm": 11.66122817993164, "learning_rate": 5.003028397685787e-05, "loss": 2.6223936080932617, "memory(GiB)": 77.56, "step": 58330, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.438783 }, { "epoch": 2.4992502463476285, "grad_norm": 5.806344985961914, "learning_rate": 5.0023554204791724e-05, "loss": 2.4009864807128904, "memory(GiB)": 77.56, "step": 58335, "token_acc": 0.4690909090909091, "train_speed(iter/s)": 1.438818 }, { "epoch": 2.4994644616768777, "grad_norm": 5.129970550537109, "learning_rate": 5.001682443229888e-05, "loss": 2.718728446960449, "memory(GiB)": 77.56, "step": 58340, "token_acc": 0.45819397993311034, "train_speed(iter/s)": 1.438846 }, { "epoch": 2.4996786770061266, "grad_norm": 5.388007640838623, "learning_rate": 5.001009465950125e-05, "loss": 2.529027557373047, "memory(GiB)": 77.56, "step": 58345, "token_acc": 0.4740061162079511, "train_speed(iter/s)": 1.438881 }, { "epoch": 2.4998928923353754, "grad_norm": 5.799760341644287, "learning_rate": 5.000336488652074e-05, "loss": 2.488451385498047, "memory(GiB)": 77.56, "step": 58350, "token_acc": 0.4489051094890511, "train_speed(iter/s)": 1.438867 }, { "epoch": 2.5001071076646246, "grad_norm": 5.171080589294434, "learning_rate": 4.9996635113479265e-05, "loss": 2.423663330078125, "memory(GiB)": 77.56, "step": 58355, "token_acc": 0.5107296137339056, "train_speed(iter/s)": 1.438878 }, { "epoch": 2.5003213229938734, "grad_norm": 5.024204254150391, "learning_rate": 4.9989905340498755e-05, "loss": 2.700656700134277, "memory(GiB)": 77.56, "step": 58360, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.438893 }, { "epoch": 2.5005355383231223, "grad_norm": 5.59718656539917, "learning_rate": 4.998317556770111e-05, "loss": 2.100053596496582, "memory(GiB)": 77.56, "step": 58365, "token_acc": 0.5300353356890459, "train_speed(iter/s)": 1.438878 }, { "epoch": 2.5007497536523715, "grad_norm": 5.861362457275391, "learning_rate": 4.9976445795208274e-05, "loss": 2.3406686782836914, "memory(GiB)": 77.56, "step": 58370, "token_acc": 0.5064102564102564, "train_speed(iter/s)": 1.438885 }, { "epoch": 2.5009639689816203, "grad_norm": 6.280579090118408, "learning_rate": 4.996971602314214e-05, "loss": 2.453501892089844, "memory(GiB)": 77.56, "step": 58375, "token_acc": 0.47093023255813954, "train_speed(iter/s)": 1.438899 }, { "epoch": 2.501178184310869, "grad_norm": 4.300484657287598, "learning_rate": 4.996298625162462e-05, "loss": 2.253561592102051, "memory(GiB)": 77.56, "step": 58380, "token_acc": 0.5032467532467533, "train_speed(iter/s)": 1.438865 }, { "epoch": 2.5013923996401184, "grad_norm": 6.742525100708008, "learning_rate": 4.9956256480777643e-05, "loss": 2.060250473022461, "memory(GiB)": 77.56, "step": 58385, "token_acc": 0.5444444444444444, "train_speed(iter/s)": 1.438854 }, { "epoch": 2.501606614969367, "grad_norm": 6.936786651611328, "learning_rate": 4.994952671072312e-05, "loss": 2.442148780822754, "memory(GiB)": 77.56, "step": 58390, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.438881 }, { "epoch": 2.501820830298616, "grad_norm": 6.741944313049316, "learning_rate": 4.994279694158297e-05, "loss": 2.4010562896728516, "memory(GiB)": 77.56, "step": 58395, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.438877 }, { "epoch": 2.5020350456278653, "grad_norm": 6.655197620391846, "learning_rate": 4.993606717347911e-05, "loss": 2.3350038528442383, "memory(GiB)": 77.56, "step": 58400, "token_acc": 0.515625, "train_speed(iter/s)": 1.438872 }, { "epoch": 2.502249260957114, "grad_norm": 5.165590286254883, "learning_rate": 4.992933740653345e-05, "loss": 2.0496925354003905, "memory(GiB)": 77.56, "step": 58405, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.438891 }, { "epoch": 2.502463476286363, "grad_norm": 6.217377185821533, "learning_rate": 4.99226076408679e-05, "loss": 2.4300582885742186, "memory(GiB)": 77.56, "step": 58410, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.438891 }, { "epoch": 2.502677691615612, "grad_norm": 5.3018293380737305, "learning_rate": 4.99158778766044e-05, "loss": 2.6122314453125, "memory(GiB)": 77.56, "step": 58415, "token_acc": 0.4522058823529412, "train_speed(iter/s)": 1.438888 }, { "epoch": 2.502891906944861, "grad_norm": 4.7727460861206055, "learning_rate": 4.990914811386484e-05, "loss": 2.798413848876953, "memory(GiB)": 77.56, "step": 58420, "token_acc": 0.45054945054945056, "train_speed(iter/s)": 1.43891 }, { "epoch": 2.50310612227411, "grad_norm": 6.41102409362793, "learning_rate": 4.990241835277116e-05, "loss": 2.8175573348999023, "memory(GiB)": 77.56, "step": 58425, "token_acc": 0.4487534626038781, "train_speed(iter/s)": 1.438908 }, { "epoch": 2.503320337603359, "grad_norm": 4.640605926513672, "learning_rate": 4.989568859344523e-05, "loss": 2.5508893966674804, "memory(GiB)": 77.56, "step": 58430, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.438906 }, { "epoch": 2.503534552932608, "grad_norm": 5.336583137512207, "learning_rate": 4.988895883600902e-05, "loss": 2.527036666870117, "memory(GiB)": 77.56, "step": 58435, "token_acc": 0.48398576512455516, "train_speed(iter/s)": 1.438922 }, { "epoch": 2.5037487682618567, "grad_norm": 5.696075439453125, "learning_rate": 4.988222908058443e-05, "loss": 2.3428720474243163, "memory(GiB)": 77.56, "step": 58440, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.438934 }, { "epoch": 2.503962983591106, "grad_norm": 4.8912248611450195, "learning_rate": 4.987549932729337e-05, "loss": 2.1608455657958983, "memory(GiB)": 77.56, "step": 58445, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 1.438891 }, { "epoch": 2.5041771989203547, "grad_norm": 5.038296699523926, "learning_rate": 4.986876957625776e-05, "loss": 2.2645145416259767, "memory(GiB)": 77.56, "step": 58450, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.438916 }, { "epoch": 2.5043914142496035, "grad_norm": 5.156894207000732, "learning_rate": 4.98620398275995e-05, "loss": 2.403422164916992, "memory(GiB)": 77.56, "step": 58455, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.43891 }, { "epoch": 2.504605629578853, "grad_norm": 5.4597859382629395, "learning_rate": 4.9855310081440535e-05, "loss": 2.6115291595458983, "memory(GiB)": 77.56, "step": 58460, "token_acc": 0.46417445482866043, "train_speed(iter/s)": 1.438935 }, { "epoch": 2.5048198449081016, "grad_norm": 6.157292366027832, "learning_rate": 4.984858033790275e-05, "loss": 2.4855825424194338, "memory(GiB)": 77.56, "step": 58465, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.438922 }, { "epoch": 2.5050340602373504, "grad_norm": 6.037020206451416, "learning_rate": 4.984185059710808e-05, "loss": 2.2975700378417967, "memory(GiB)": 77.56, "step": 58470, "token_acc": 0.5, "train_speed(iter/s)": 1.438932 }, { "epoch": 2.5052482755665997, "grad_norm": 5.616464138031006, "learning_rate": 4.983512085917843e-05, "loss": 2.3390806198120115, "memory(GiB)": 77.56, "step": 58475, "token_acc": 0.4607142857142857, "train_speed(iter/s)": 1.4389 }, { "epoch": 2.5054624908958485, "grad_norm": 5.004324913024902, "learning_rate": 4.9828391124235726e-05, "loss": 2.354347801208496, "memory(GiB)": 77.56, "step": 58480, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.438869 }, { "epoch": 2.5056767062250973, "grad_norm": 6.0675950050354, "learning_rate": 4.982166139240188e-05, "loss": 2.6349502563476563, "memory(GiB)": 77.56, "step": 58485, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.438883 }, { "epoch": 2.5058909215543466, "grad_norm": 5.169259548187256, "learning_rate": 4.98149316637988e-05, "loss": 2.3559337615966798, "memory(GiB)": 77.56, "step": 58490, "token_acc": 0.47, "train_speed(iter/s)": 1.438876 }, { "epoch": 2.5061051368835954, "grad_norm": 5.194831371307373, "learning_rate": 4.9808201938548404e-05, "loss": 2.4086750030517576, "memory(GiB)": 77.56, "step": 58495, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.438893 }, { "epoch": 2.506319352212844, "grad_norm": 8.391009330749512, "learning_rate": 4.980147221677262e-05, "loss": 2.6264013290405273, "memory(GiB)": 77.56, "step": 58500, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.438906 }, { "epoch": 2.506319352212844, "eval_loss": 2.080378770828247, "eval_runtime": 15.1383, "eval_samples_per_second": 6.606, "eval_steps_per_second": 6.606, "eval_token_acc": 0.4725433526011561, "step": 58500 }, { "epoch": 2.5065335675420934, "grad_norm": 4.968241214752197, "learning_rate": 4.979474249859333e-05, "loss": 2.5486387252807616, "memory(GiB)": 77.56, "step": 58505, "token_acc": 0.4665314401622718, "train_speed(iter/s)": 1.438341 }, { "epoch": 2.5067477828713423, "grad_norm": 4.82818078994751, "learning_rate": 4.978801278413248e-05, "loss": 2.262587547302246, "memory(GiB)": 77.56, "step": 58510, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.438373 }, { "epoch": 2.506961998200591, "grad_norm": 6.097710609436035, "learning_rate": 4.9781283073511995e-05, "loss": 2.1054759979248048, "memory(GiB)": 77.56, "step": 58515, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.438401 }, { "epoch": 2.5071762135298403, "grad_norm": 6.493237018585205, "learning_rate": 4.977455336685376e-05, "loss": 2.6580795288085937, "memory(GiB)": 77.56, "step": 58520, "token_acc": 0.43882978723404253, "train_speed(iter/s)": 1.438392 }, { "epoch": 2.507390428859089, "grad_norm": 5.894067764282227, "learning_rate": 4.97678236642797e-05, "loss": 2.515596389770508, "memory(GiB)": 77.56, "step": 58525, "token_acc": 0.4986376021798365, "train_speed(iter/s)": 1.438414 }, { "epoch": 2.507604644188338, "grad_norm": 6.168473243713379, "learning_rate": 4.9761093965911746e-05, "loss": 2.4529163360595705, "memory(GiB)": 77.56, "step": 58530, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.438418 }, { "epoch": 2.507818859517587, "grad_norm": 5.022265434265137, "learning_rate": 4.975436427187178e-05, "loss": 2.247749137878418, "memory(GiB)": 77.56, "step": 58535, "token_acc": 0.512987012987013, "train_speed(iter/s)": 1.438442 }, { "epoch": 2.508033074846836, "grad_norm": 4.225148677825928, "learning_rate": 4.974763458228175e-05, "loss": 2.4482269287109375, "memory(GiB)": 77.56, "step": 58540, "token_acc": 0.4831804281345566, "train_speed(iter/s)": 1.438423 }, { "epoch": 2.508247290176085, "grad_norm": 6.861063480377197, "learning_rate": 4.9740904897263554e-05, "loss": 2.2667058944702148, "memory(GiB)": 77.56, "step": 58545, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.438421 }, { "epoch": 2.508461505505334, "grad_norm": 5.056663990020752, "learning_rate": 4.97341752169391e-05, "loss": 2.3969274520874024, "memory(GiB)": 77.56, "step": 58550, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.438431 }, { "epoch": 2.508675720834583, "grad_norm": 5.804452896118164, "learning_rate": 4.972744554143032e-05, "loss": 2.471655082702637, "memory(GiB)": 77.56, "step": 58555, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.438441 }, { "epoch": 2.5088899361638317, "grad_norm": 5.051337718963623, "learning_rate": 4.9720715870859124e-05, "loss": 2.5114559173583983, "memory(GiB)": 77.56, "step": 58560, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.438455 }, { "epoch": 2.509104151493081, "grad_norm": 5.521915435791016, "learning_rate": 4.971398620534741e-05, "loss": 2.2452049255371094, "memory(GiB)": 77.56, "step": 58565, "token_acc": 0.5461847389558233, "train_speed(iter/s)": 1.438445 }, { "epoch": 2.50931836682233, "grad_norm": 6.709741592407227, "learning_rate": 4.970725654501712e-05, "loss": 2.4361576080322265, "memory(GiB)": 77.56, "step": 58570, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.438415 }, { "epoch": 2.5095325821515786, "grad_norm": 9.513143539428711, "learning_rate": 4.970052688999012e-05, "loss": 2.603710746765137, "memory(GiB)": 77.56, "step": 58575, "token_acc": 0.4778761061946903, "train_speed(iter/s)": 1.438409 }, { "epoch": 2.509746797480828, "grad_norm": 6.703118801116943, "learning_rate": 4.969379724038837e-05, "loss": 2.8177309036254883, "memory(GiB)": 77.56, "step": 58580, "token_acc": 0.4725274725274725, "train_speed(iter/s)": 1.438419 }, { "epoch": 2.5099610128100767, "grad_norm": 6.20990514755249, "learning_rate": 4.9687067596333784e-05, "loss": 2.4423900604248048, "memory(GiB)": 77.56, "step": 58585, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 1.438425 }, { "epoch": 2.5101752281393255, "grad_norm": 6.054320335388184, "learning_rate": 4.968033795794827e-05, "loss": 2.502968406677246, "memory(GiB)": 77.56, "step": 58590, "token_acc": 0.4367816091954023, "train_speed(iter/s)": 1.438433 }, { "epoch": 2.5103894434685747, "grad_norm": 6.248667240142822, "learning_rate": 4.967360832535371e-05, "loss": 2.3279033660888673, "memory(GiB)": 77.56, "step": 58595, "token_acc": 0.5095541401273885, "train_speed(iter/s)": 1.438413 }, { "epoch": 2.5106036587978235, "grad_norm": 7.26310396194458, "learning_rate": 4.9666878698672054e-05, "loss": 2.630337142944336, "memory(GiB)": 77.56, "step": 58600, "token_acc": 0.44696969696969696, "train_speed(iter/s)": 1.438407 }, { "epoch": 2.5108178741270724, "grad_norm": 6.325886249542236, "learning_rate": 4.96601490780252e-05, "loss": 2.371360015869141, "memory(GiB)": 77.56, "step": 58605, "token_acc": 0.4672897196261682, "train_speed(iter/s)": 1.438407 }, { "epoch": 2.5110320894563216, "grad_norm": 4.498351573944092, "learning_rate": 4.965341946353506e-05, "loss": 2.154536247253418, "memory(GiB)": 77.56, "step": 58610, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.438412 }, { "epoch": 2.5112463047855704, "grad_norm": 6.498818874359131, "learning_rate": 4.964668985532357e-05, "loss": 2.3352787017822267, "memory(GiB)": 77.56, "step": 58615, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 1.438424 }, { "epoch": 2.5114605201148192, "grad_norm": 6.107295513153076, "learning_rate": 4.96399602535126e-05, "loss": 2.575478363037109, "memory(GiB)": 77.56, "step": 58620, "token_acc": 0.45741324921135645, "train_speed(iter/s)": 1.438429 }, { "epoch": 2.5116747354440685, "grad_norm": 4.7984771728515625, "learning_rate": 4.9633230658224106e-05, "loss": 2.307478904724121, "memory(GiB)": 77.56, "step": 58625, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.438417 }, { "epoch": 2.5118889507733173, "grad_norm": 5.4650068283081055, "learning_rate": 4.9626501069579975e-05, "loss": 2.367713737487793, "memory(GiB)": 77.56, "step": 58630, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.438419 }, { "epoch": 2.512103166102566, "grad_norm": 5.911473274230957, "learning_rate": 4.961977148770212e-05, "loss": 2.188696098327637, "memory(GiB)": 77.56, "step": 58635, "token_acc": 0.5346938775510204, "train_speed(iter/s)": 1.438446 }, { "epoch": 2.5123173814318154, "grad_norm": 4.255577087402344, "learning_rate": 4.961304191271248e-05, "loss": 2.444392776489258, "memory(GiB)": 77.56, "step": 58640, "token_acc": 0.5316901408450704, "train_speed(iter/s)": 1.438466 }, { "epoch": 2.512531596761064, "grad_norm": 7.421138286590576, "learning_rate": 4.960631234473294e-05, "loss": 2.5387012481689455, "memory(GiB)": 77.56, "step": 58645, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.438485 }, { "epoch": 2.512745812090313, "grad_norm": 6.129209041595459, "learning_rate": 4.9599582783885404e-05, "loss": 2.654239273071289, "memory(GiB)": 77.56, "step": 58650, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.438497 }, { "epoch": 2.5129600274195623, "grad_norm": 5.452489376068115, "learning_rate": 4.959285323029181e-05, "loss": 2.670383071899414, "memory(GiB)": 77.56, "step": 58655, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.438533 }, { "epoch": 2.513174242748811, "grad_norm": 5.070569038391113, "learning_rate": 4.958612368407407e-05, "loss": 2.357744598388672, "memory(GiB)": 77.56, "step": 58660, "token_acc": 0.4597315436241611, "train_speed(iter/s)": 1.438482 }, { "epoch": 2.51338845807806, "grad_norm": 5.5211334228515625, "learning_rate": 4.957939414535409e-05, "loss": 2.39716796875, "memory(GiB)": 77.56, "step": 58665, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.438466 }, { "epoch": 2.513602673407309, "grad_norm": 5.426631927490234, "learning_rate": 4.9572664614253774e-05, "loss": 2.3903064727783203, "memory(GiB)": 77.56, "step": 58670, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.438488 }, { "epoch": 2.513816888736558, "grad_norm": 4.342236042022705, "learning_rate": 4.9565935090895046e-05, "loss": 2.535909080505371, "memory(GiB)": 77.56, "step": 58675, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.438498 }, { "epoch": 2.5140311040658068, "grad_norm": 5.142080307006836, "learning_rate": 4.95592055753998e-05, "loss": 2.4534364700317384, "memory(GiB)": 77.56, "step": 58680, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.438468 }, { "epoch": 2.514245319395056, "grad_norm": 6.049755573272705, "learning_rate": 4.9552476067889967e-05, "loss": 2.5266448974609377, "memory(GiB)": 77.56, "step": 58685, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.438448 }, { "epoch": 2.514459534724305, "grad_norm": 5.624327182769775, "learning_rate": 4.954574656848745e-05, "loss": 2.4393081665039062, "memory(GiB)": 77.56, "step": 58690, "token_acc": 0.50814332247557, "train_speed(iter/s)": 1.438432 }, { "epoch": 2.5146737500535536, "grad_norm": 5.702749252319336, "learning_rate": 4.953901707731415e-05, "loss": 2.182010269165039, "memory(GiB)": 77.56, "step": 58695, "token_acc": 0.5212464589235127, "train_speed(iter/s)": 1.438419 }, { "epoch": 2.514887965382803, "grad_norm": 4.576151371002197, "learning_rate": 4.9532287594491996e-05, "loss": 2.0450227737426756, "memory(GiB)": 77.56, "step": 58700, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.438422 }, { "epoch": 2.5151021807120517, "grad_norm": 8.027562141418457, "learning_rate": 4.95255581201429e-05, "loss": 2.2008514404296875, "memory(GiB)": 77.56, "step": 58705, "token_acc": 0.5241379310344828, "train_speed(iter/s)": 1.438431 }, { "epoch": 2.5153163960413005, "grad_norm": 5.362934112548828, "learning_rate": 4.951882865438875e-05, "loss": 2.3463413238525392, "memory(GiB)": 77.56, "step": 58710, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.438459 }, { "epoch": 2.51553061137055, "grad_norm": 7.179898262023926, "learning_rate": 4.951209919735148e-05, "loss": 2.3349565505981444, "memory(GiB)": 77.56, "step": 58715, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.438472 }, { "epoch": 2.5157448266997986, "grad_norm": 6.435906887054443, "learning_rate": 4.9505369749152975e-05, "loss": 2.452790451049805, "memory(GiB)": 77.56, "step": 58720, "token_acc": 0.4555984555984556, "train_speed(iter/s)": 1.438487 }, { "epoch": 2.5159590420290474, "grad_norm": 16.25479507446289, "learning_rate": 4.9498640309915156e-05, "loss": 2.348832702636719, "memory(GiB)": 77.56, "step": 58725, "token_acc": 0.5160142348754448, "train_speed(iter/s)": 1.438497 }, { "epoch": 2.5161732573582967, "grad_norm": 5.01768159866333, "learning_rate": 4.9491910879759956e-05, "loss": 2.618852424621582, "memory(GiB)": 77.56, "step": 58730, "token_acc": 0.49240121580547114, "train_speed(iter/s)": 1.438502 }, { "epoch": 2.5163874726875455, "grad_norm": 7.3222832679748535, "learning_rate": 4.9485181458809273e-05, "loss": 2.6504459381103516, "memory(GiB)": 77.56, "step": 58735, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.438521 }, { "epoch": 2.5166016880167943, "grad_norm": 4.788215160369873, "learning_rate": 4.947845204718499e-05, "loss": 2.475088882446289, "memory(GiB)": 77.56, "step": 58740, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.438514 }, { "epoch": 2.5168159033460435, "grad_norm": 4.636257171630859, "learning_rate": 4.947172264500905e-05, "loss": 2.0276056289672852, "memory(GiB)": 77.56, "step": 58745, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.438505 }, { "epoch": 2.5170301186752924, "grad_norm": 5.67156457901001, "learning_rate": 4.9464993252403366e-05, "loss": 2.4555709838867186, "memory(GiB)": 77.56, "step": 58750, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.438501 }, { "epoch": 2.517244334004541, "grad_norm": 4.645724296569824, "learning_rate": 4.945826386948981e-05, "loss": 2.2215145111083983, "memory(GiB)": 77.56, "step": 58755, "token_acc": 0.5655172413793104, "train_speed(iter/s)": 1.438513 }, { "epoch": 2.5174585493337904, "grad_norm": 5.968151092529297, "learning_rate": 4.9451534496390315e-05, "loss": 2.5536806106567385, "memory(GiB)": 77.56, "step": 58760, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.438517 }, { "epoch": 2.5176727646630392, "grad_norm": 5.72506046295166, "learning_rate": 4.944480513322678e-05, "loss": 2.3667137145996096, "memory(GiB)": 77.56, "step": 58765, "token_acc": 0.4831804281345566, "train_speed(iter/s)": 1.438527 }, { "epoch": 2.517886979992288, "grad_norm": 6.006704807281494, "learning_rate": 4.943807578012114e-05, "loss": 2.3640342712402345, "memory(GiB)": 77.56, "step": 58770, "token_acc": 0.5176991150442478, "train_speed(iter/s)": 1.43852 }, { "epoch": 2.5181011953215373, "grad_norm": 5.237598419189453, "learning_rate": 4.943134643719528e-05, "loss": 2.5975671768188477, "memory(GiB)": 77.56, "step": 58775, "token_acc": 0.46048109965635736, "train_speed(iter/s)": 1.438477 }, { "epoch": 2.518315410650786, "grad_norm": 4.4248809814453125, "learning_rate": 4.94246171045711e-05, "loss": 1.9326534271240234, "memory(GiB)": 77.56, "step": 58780, "token_acc": 0.5758754863813229, "train_speed(iter/s)": 1.43848 }, { "epoch": 2.518529625980035, "grad_norm": 5.686248779296875, "learning_rate": 4.941788778237053e-05, "loss": 2.3820608139038084, "memory(GiB)": 77.56, "step": 58785, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.438497 }, { "epoch": 2.518743841309284, "grad_norm": 8.44089412689209, "learning_rate": 4.941115847071547e-05, "loss": 2.6586257934570314, "memory(GiB)": 77.56, "step": 58790, "token_acc": 0.43283582089552236, "train_speed(iter/s)": 1.438534 }, { "epoch": 2.518958056638533, "grad_norm": 6.452279567718506, "learning_rate": 4.9404429169727815e-05, "loss": 2.2759790420532227, "memory(GiB)": 77.56, "step": 58795, "token_acc": 0.5275590551181102, "train_speed(iter/s)": 1.438536 }, { "epoch": 2.519172271967782, "grad_norm": 6.500096321105957, "learning_rate": 4.9397699879529475e-05, "loss": 2.3621746063232423, "memory(GiB)": 77.56, "step": 58800, "token_acc": 0.5126582278481012, "train_speed(iter/s)": 1.438552 }, { "epoch": 2.519386487297031, "grad_norm": 4.290776252746582, "learning_rate": 4.939097060024239e-05, "loss": 2.379546546936035, "memory(GiB)": 77.56, "step": 58805, "token_acc": 0.4900662251655629, "train_speed(iter/s)": 1.438545 }, { "epoch": 2.51960070262628, "grad_norm": 4.893869400024414, "learning_rate": 4.9384241331988444e-05, "loss": 2.309023857116699, "memory(GiB)": 77.56, "step": 58810, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.438538 }, { "epoch": 2.5198149179555287, "grad_norm": 3.77557373046875, "learning_rate": 4.937751207488953e-05, "loss": 2.4117080688476564, "memory(GiB)": 77.56, "step": 58815, "token_acc": 0.5015197568389058, "train_speed(iter/s)": 1.438538 }, { "epoch": 2.520029133284778, "grad_norm": 6.054540634155273, "learning_rate": 4.937078282906758e-05, "loss": 2.5684844970703127, "memory(GiB)": 77.56, "step": 58820, "token_acc": 0.4652567975830816, "train_speed(iter/s)": 1.438527 }, { "epoch": 2.5202433486140268, "grad_norm": 5.331477165222168, "learning_rate": 4.936405359464448e-05, "loss": 2.0903430938720704, "memory(GiB)": 77.56, "step": 58825, "token_acc": 0.5514705882352942, "train_speed(iter/s)": 1.438546 }, { "epoch": 2.5204575639432756, "grad_norm": 4.416264057159424, "learning_rate": 4.9357324371742156e-05, "loss": 2.2944374084472656, "memory(GiB)": 77.56, "step": 58830, "token_acc": 0.4940119760479042, "train_speed(iter/s)": 1.438541 }, { "epoch": 2.520671779272525, "grad_norm": 6.2284417152404785, "learning_rate": 4.93505951604825e-05, "loss": 2.346688461303711, "memory(GiB)": 77.56, "step": 58835, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.438552 }, { "epoch": 2.5208859946017736, "grad_norm": 5.001190662384033, "learning_rate": 4.9343865960987425e-05, "loss": 2.892074775695801, "memory(GiB)": 77.56, "step": 58840, "token_acc": 0.428169014084507, "train_speed(iter/s)": 1.438568 }, { "epoch": 2.5211002099310225, "grad_norm": 5.005980014801025, "learning_rate": 4.9337136773378836e-05, "loss": 2.4520015716552734, "memory(GiB)": 77.56, "step": 58845, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 1.438573 }, { "epoch": 2.5213144252602717, "grad_norm": 7.85048246383667, "learning_rate": 4.933040759777863e-05, "loss": 2.4986574172973635, "memory(GiB)": 77.56, "step": 58850, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.438593 }, { "epoch": 2.5215286405895205, "grad_norm": 5.1033034324646, "learning_rate": 4.9323678434308713e-05, "loss": 2.619414520263672, "memory(GiB)": 77.56, "step": 58855, "token_acc": 0.47, "train_speed(iter/s)": 1.438595 }, { "epoch": 2.5217428559187693, "grad_norm": 6.098699569702148, "learning_rate": 4.9316949283091004e-05, "loss": 1.9633358001708985, "memory(GiB)": 77.56, "step": 58860, "token_acc": 0.5625, "train_speed(iter/s)": 1.438613 }, { "epoch": 2.5219570712480186, "grad_norm": 7.399806022644043, "learning_rate": 4.9310220144247396e-05, "loss": 2.4731571197509767, "memory(GiB)": 77.56, "step": 58865, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.438576 }, { "epoch": 2.5221712865772674, "grad_norm": 4.403747081756592, "learning_rate": 4.930349101789978e-05, "loss": 2.3901674270629885, "memory(GiB)": 77.56, "step": 58870, "token_acc": 0.4948186528497409, "train_speed(iter/s)": 1.438562 }, { "epoch": 2.5223855019065162, "grad_norm": 5.965559959411621, "learning_rate": 4.9296761904170095e-05, "loss": 2.225765037536621, "memory(GiB)": 77.56, "step": 58875, "token_acc": 0.536, "train_speed(iter/s)": 1.438533 }, { "epoch": 2.5225997172357655, "grad_norm": 5.793490886688232, "learning_rate": 4.9290032803180234e-05, "loss": 2.4884992599487306, "memory(GiB)": 77.56, "step": 58880, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438551 }, { "epoch": 2.5228139325650143, "grad_norm": 5.5262908935546875, "learning_rate": 4.928330371505208e-05, "loss": 2.624837112426758, "memory(GiB)": 77.56, "step": 58885, "token_acc": 0.425531914893617, "train_speed(iter/s)": 1.438544 }, { "epoch": 2.523028147894263, "grad_norm": 4.926614761352539, "learning_rate": 4.927657463990757e-05, "loss": 2.2829132080078125, "memory(GiB)": 77.56, "step": 58890, "token_acc": 0.5207667731629393, "train_speed(iter/s)": 1.438508 }, { "epoch": 2.5232423632235124, "grad_norm": 8.304628372192383, "learning_rate": 4.926984557786859e-05, "loss": 2.3865482330322267, "memory(GiB)": 77.56, "step": 58895, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.438513 }, { "epoch": 2.523456578552761, "grad_norm": 6.951370716094971, "learning_rate": 4.926311652905702e-05, "loss": 2.285795211791992, "memory(GiB)": 77.56, "step": 58900, "token_acc": 0.5, "train_speed(iter/s)": 1.438512 }, { "epoch": 2.52367079388201, "grad_norm": 4.3239569664001465, "learning_rate": 4.9256387493594805e-05, "loss": 2.4147138595581055, "memory(GiB)": 77.56, "step": 58905, "token_acc": 0.49226006191950467, "train_speed(iter/s)": 1.438478 }, { "epoch": 2.5238850092112592, "grad_norm": 7.33284854888916, "learning_rate": 4.924965847160381e-05, "loss": 2.5570903778076173, "memory(GiB)": 77.56, "step": 58910, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.438502 }, { "epoch": 2.524099224540508, "grad_norm": 6.5535888671875, "learning_rate": 4.924292946320597e-05, "loss": 2.5323068618774416, "memory(GiB)": 77.56, "step": 58915, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.438507 }, { "epoch": 2.524313439869757, "grad_norm": 6.4172186851501465, "learning_rate": 4.923620046852318e-05, "loss": 2.78980712890625, "memory(GiB)": 77.56, "step": 58920, "token_acc": 0.45695364238410596, "train_speed(iter/s)": 1.438526 }, { "epoch": 2.524527655199006, "grad_norm": 6.0758514404296875, "learning_rate": 4.9229471487677316e-05, "loss": 2.3035104751586912, "memory(GiB)": 77.56, "step": 58925, "token_acc": 0.49609375, "train_speed(iter/s)": 1.43853 }, { "epoch": 2.524741870528255, "grad_norm": 5.2978034019470215, "learning_rate": 4.9222742520790314e-05, "loss": 2.121658515930176, "memory(GiB)": 77.56, "step": 58930, "token_acc": 0.534965034965035, "train_speed(iter/s)": 1.438561 }, { "epoch": 2.5249560858575038, "grad_norm": 5.900257110595703, "learning_rate": 4.9216013567984055e-05, "loss": 2.3860401153564452, "memory(GiB)": 77.56, "step": 58935, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438577 }, { "epoch": 2.525170301186753, "grad_norm": 5.098506450653076, "learning_rate": 4.920928462938044e-05, "loss": 2.3442935943603516, "memory(GiB)": 77.56, "step": 58940, "token_acc": 0.5250836120401338, "train_speed(iter/s)": 1.438593 }, { "epoch": 2.525384516516002, "grad_norm": 5.645188808441162, "learning_rate": 4.9202555705101355e-05, "loss": 2.7175121307373047, "memory(GiB)": 77.56, "step": 58945, "token_acc": 0.44193548387096776, "train_speed(iter/s)": 1.438612 }, { "epoch": 2.5255987318452506, "grad_norm": 3.989086151123047, "learning_rate": 4.919582679526875e-05, "loss": 2.1617315292358397, "memory(GiB)": 77.56, "step": 58950, "token_acc": 0.5372168284789643, "train_speed(iter/s)": 1.438601 }, { "epoch": 2.5258129471745, "grad_norm": 4.649693012237549, "learning_rate": 4.918909790000449e-05, "loss": 2.3297706604003907, "memory(GiB)": 77.56, "step": 58955, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.43861 }, { "epoch": 2.5260271625037487, "grad_norm": 4.8881964683532715, "learning_rate": 4.918236901943049e-05, "loss": 2.4629371643066404, "memory(GiB)": 77.56, "step": 58960, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.438599 }, { "epoch": 2.5262413778329975, "grad_norm": 4.827664375305176, "learning_rate": 4.917564015366864e-05, "loss": 2.3383283615112305, "memory(GiB)": 77.56, "step": 58965, "token_acc": 0.5504201680672269, "train_speed(iter/s)": 1.438611 }, { "epoch": 2.5264555931622468, "grad_norm": 14.114139556884766, "learning_rate": 4.9168911302840834e-05, "loss": 2.650992774963379, "memory(GiB)": 77.56, "step": 58970, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.438603 }, { "epoch": 2.5266698084914956, "grad_norm": 5.02547025680542, "learning_rate": 4.916218246706899e-05, "loss": 2.34653377532959, "memory(GiB)": 77.56, "step": 58975, "token_acc": 0.49777777777777776, "train_speed(iter/s)": 1.43863 }, { "epoch": 2.5268840238207444, "grad_norm": 6.5236735343933105, "learning_rate": 4.9155453646475e-05, "loss": 2.6372867584228517, "memory(GiB)": 77.56, "step": 58980, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.438663 }, { "epoch": 2.5270982391499937, "grad_norm": 6.692519187927246, "learning_rate": 4.914872484118074e-05, "loss": 2.363043212890625, "memory(GiB)": 77.56, "step": 58985, "token_acc": 0.49609375, "train_speed(iter/s)": 1.438651 }, { "epoch": 2.5273124544792425, "grad_norm": 7.264876365661621, "learning_rate": 4.914199605130814e-05, "loss": 2.435413360595703, "memory(GiB)": 77.56, "step": 58990, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.438655 }, { "epoch": 2.5275266698084913, "grad_norm": 5.1589274406433105, "learning_rate": 4.913526727697909e-05, "loss": 2.659071159362793, "memory(GiB)": 77.56, "step": 58995, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.438655 }, { "epoch": 2.5277408851377405, "grad_norm": 5.5924859046936035, "learning_rate": 4.912853851831547e-05, "loss": 2.331832504272461, "memory(GiB)": 77.56, "step": 59000, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.5277408851377405, "eval_loss": 2.3112387657165527, "eval_runtime": 14.4884, "eval_samples_per_second": 6.902, "eval_steps_per_second": 6.902, "eval_token_acc": 0.4672657252888318, "step": 59000 }, { "epoch": 2.5279551004669893, "grad_norm": 5.729962348937988, "learning_rate": 4.9121809775439214e-05, "loss": 2.224778938293457, "memory(GiB)": 77.56, "step": 59005, "token_acc": 0.46866485013623976, "train_speed(iter/s)": 1.438039 }, { "epoch": 2.528169315796238, "grad_norm": 6.025215148925781, "learning_rate": 4.911508104847218e-05, "loss": 2.5950698852539062, "memory(GiB)": 77.56, "step": 59010, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.438044 }, { "epoch": 2.5283835311254874, "grad_norm": 6.581197261810303, "learning_rate": 4.910835233753629e-05, "loss": 2.428325653076172, "memory(GiB)": 77.56, "step": 59015, "token_acc": 0.4797507788161994, "train_speed(iter/s)": 1.438058 }, { "epoch": 2.5285977464547362, "grad_norm": 4.956717491149902, "learning_rate": 4.9101623642753416e-05, "loss": 2.385212707519531, "memory(GiB)": 77.56, "step": 59020, "token_acc": 0.5139318885448917, "train_speed(iter/s)": 1.438095 }, { "epoch": 2.528811961783985, "grad_norm": 6.194469451904297, "learning_rate": 4.909489496424549e-05, "loss": 2.761299896240234, "memory(GiB)": 77.56, "step": 59025, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.438105 }, { "epoch": 2.5290261771132343, "grad_norm": 6.688918113708496, "learning_rate": 4.9088166302134385e-05, "loss": 2.44656925201416, "memory(GiB)": 77.56, "step": 59030, "token_acc": 0.423728813559322, "train_speed(iter/s)": 1.438138 }, { "epoch": 2.529240392442483, "grad_norm": 9.883824348449707, "learning_rate": 4.908143765654201e-05, "loss": 2.012375259399414, "memory(GiB)": 77.56, "step": 59035, "token_acc": 0.5720524017467249, "train_speed(iter/s)": 1.438143 }, { "epoch": 2.529454607771732, "grad_norm": 5.347978591918945, "learning_rate": 4.907470902759026e-05, "loss": 2.5401554107666016, "memory(GiB)": 77.56, "step": 59040, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.438103 }, { "epoch": 2.529668823100981, "grad_norm": 5.202302932739258, "learning_rate": 4.906798041540101e-05, "loss": 2.3805755615234374, "memory(GiB)": 77.56, "step": 59045, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.438108 }, { "epoch": 2.52988303843023, "grad_norm": 4.343328952789307, "learning_rate": 4.906125182009618e-05, "loss": 2.103287696838379, "memory(GiB)": 77.56, "step": 59050, "token_acc": 0.5631399317406144, "train_speed(iter/s)": 1.438085 }, { "epoch": 2.530097253759479, "grad_norm": 4.7397613525390625, "learning_rate": 4.905452324179764e-05, "loss": 2.289626693725586, "memory(GiB)": 77.56, "step": 59055, "token_acc": 0.5034965034965035, "train_speed(iter/s)": 1.438104 }, { "epoch": 2.530311469088728, "grad_norm": 5.467718601226807, "learning_rate": 4.904779468062731e-05, "loss": 2.3340198516845705, "memory(GiB)": 77.56, "step": 59060, "token_acc": 0.47232472324723246, "train_speed(iter/s)": 1.438087 }, { "epoch": 2.530525684417977, "grad_norm": 5.405391693115234, "learning_rate": 4.904106613670707e-05, "loss": 2.423992156982422, "memory(GiB)": 77.56, "step": 59065, "token_acc": 0.47468354430379744, "train_speed(iter/s)": 1.43808 }, { "epoch": 2.5307398997472257, "grad_norm": 6.097036838531494, "learning_rate": 4.9034337610158815e-05, "loss": 2.3963991165161134, "memory(GiB)": 77.56, "step": 59070, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.438093 }, { "epoch": 2.530954115076475, "grad_norm": 4.077427387237549, "learning_rate": 4.9027609101104444e-05, "loss": 2.5260807037353517, "memory(GiB)": 77.56, "step": 59075, "token_acc": 0.45564516129032256, "train_speed(iter/s)": 1.43813 }, { "epoch": 2.5311683304057238, "grad_norm": 5.249317169189453, "learning_rate": 4.902088060966585e-05, "loss": 2.912622833251953, "memory(GiB)": 77.56, "step": 59080, "token_acc": 0.45348837209302323, "train_speed(iter/s)": 1.438163 }, { "epoch": 2.5313825457349726, "grad_norm": 4.795521259307861, "learning_rate": 4.90141521359649e-05, "loss": 2.4073673248291017, "memory(GiB)": 77.56, "step": 59085, "token_acc": 0.49166666666666664, "train_speed(iter/s)": 1.438162 }, { "epoch": 2.531596761064222, "grad_norm": 4.793128490447998, "learning_rate": 4.900742368012353e-05, "loss": 2.4595111846923827, "memory(GiB)": 77.56, "step": 59090, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.43817 }, { "epoch": 2.5318109763934706, "grad_norm": 4.526932239532471, "learning_rate": 4.9000695242263586e-05, "loss": 2.0980030059814454, "memory(GiB)": 77.56, "step": 59095, "token_acc": 0.5302013422818792, "train_speed(iter/s)": 1.438188 }, { "epoch": 2.5320251917227194, "grad_norm": 7.1503987312316895, "learning_rate": 4.8993966822507006e-05, "loss": 2.7005409240722655, "memory(GiB)": 77.56, "step": 59100, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.438204 }, { "epoch": 2.5322394070519687, "grad_norm": 4.8963470458984375, "learning_rate": 4.898723842097566e-05, "loss": 2.420587921142578, "memory(GiB)": 77.56, "step": 59105, "token_acc": 0.4954954954954955, "train_speed(iter/s)": 1.438216 }, { "epoch": 2.5324536223812175, "grad_norm": 7.978202819824219, "learning_rate": 4.898051003779144e-05, "loss": 2.306772994995117, "memory(GiB)": 77.56, "step": 59110, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.438249 }, { "epoch": 2.5326678377104663, "grad_norm": 8.484978675842285, "learning_rate": 4.8973781673076236e-05, "loss": 2.2961393356323243, "memory(GiB)": 77.56, "step": 59115, "token_acc": 0.44569288389513106, "train_speed(iter/s)": 1.438239 }, { "epoch": 2.5328820530397156, "grad_norm": 5.341443061828613, "learning_rate": 4.896705332695194e-05, "loss": 2.437510681152344, "memory(GiB)": 77.56, "step": 59120, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.438242 }, { "epoch": 2.5330962683689644, "grad_norm": 5.369494915008545, "learning_rate": 4.896032499954045e-05, "loss": 2.5108789443969726, "memory(GiB)": 77.56, "step": 59125, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.438252 }, { "epoch": 2.533310483698213, "grad_norm": 5.534130096435547, "learning_rate": 4.895359669096363e-05, "loss": 2.6540061950683596, "memory(GiB)": 77.56, "step": 59130, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.438274 }, { "epoch": 2.5335246990274625, "grad_norm": 6.556541442871094, "learning_rate": 4.8946868401343404e-05, "loss": 2.3918256759643555, "memory(GiB)": 77.56, "step": 59135, "token_acc": 0.49814126394052044, "train_speed(iter/s)": 1.438284 }, { "epoch": 2.5337389143567113, "grad_norm": 5.085676670074463, "learning_rate": 4.894014013080163e-05, "loss": 2.6551151275634766, "memory(GiB)": 77.56, "step": 59140, "token_acc": 0.44871794871794873, "train_speed(iter/s)": 1.438302 }, { "epoch": 2.53395312968596, "grad_norm": 6.968482971191406, "learning_rate": 4.893341187946022e-05, "loss": 2.5774843215942385, "memory(GiB)": 77.56, "step": 59145, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.438303 }, { "epoch": 2.5341673450152093, "grad_norm": 7.608434200286865, "learning_rate": 4.8926683647441065e-05, "loss": 2.232361602783203, "memory(GiB)": 77.56, "step": 59150, "token_acc": 0.4758364312267658, "train_speed(iter/s)": 1.438313 }, { "epoch": 2.534381560344458, "grad_norm": 11.061235427856445, "learning_rate": 4.891995543486602e-05, "loss": 2.3544471740722654, "memory(GiB)": 77.56, "step": 59155, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.438346 }, { "epoch": 2.534595775673707, "grad_norm": 5.840290546417236, "learning_rate": 4.8913227241857016e-05, "loss": 2.585592269897461, "memory(GiB)": 77.56, "step": 59160, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.438366 }, { "epoch": 2.5348099910029562, "grad_norm": 5.152430534362793, "learning_rate": 4.8906499068535917e-05, "loss": 2.5015968322753905, "memory(GiB)": 77.56, "step": 59165, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.438348 }, { "epoch": 2.535024206332205, "grad_norm": 5.306188583374023, "learning_rate": 4.889977091502459e-05, "loss": 2.322525978088379, "memory(GiB)": 77.56, "step": 59170, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.438367 }, { "epoch": 2.535238421661454, "grad_norm": 5.255798816680908, "learning_rate": 4.889304278144495e-05, "loss": 2.4219472885131834, "memory(GiB)": 77.56, "step": 59175, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.438362 }, { "epoch": 2.535452636990703, "grad_norm": 5.514500141143799, "learning_rate": 4.88863146679189e-05, "loss": 2.479305648803711, "memory(GiB)": 77.56, "step": 59180, "token_acc": 0.49538461538461537, "train_speed(iter/s)": 1.438359 }, { "epoch": 2.535666852319952, "grad_norm": 5.697058200836182, "learning_rate": 4.88795865745683e-05, "loss": 2.5944629669189454, "memory(GiB)": 77.56, "step": 59185, "token_acc": 0.4579710144927536, "train_speed(iter/s)": 1.438359 }, { "epoch": 2.5358810676492007, "grad_norm": 5.072600841522217, "learning_rate": 4.887285850151503e-05, "loss": 2.4342037200927735, "memory(GiB)": 77.56, "step": 59190, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.438371 }, { "epoch": 2.53609528297845, "grad_norm": 5.58681583404541, "learning_rate": 4.8866130448880995e-05, "loss": 2.531167411804199, "memory(GiB)": 77.56, "step": 59195, "token_acc": 0.44625407166123776, "train_speed(iter/s)": 1.438391 }, { "epoch": 2.536309498307699, "grad_norm": 5.864629745483398, "learning_rate": 4.885940241678806e-05, "loss": 2.3252824783325194, "memory(GiB)": 77.56, "step": 59200, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.438387 }, { "epoch": 2.5365237136369476, "grad_norm": 4.608503341674805, "learning_rate": 4.885267440535813e-05, "loss": 2.4484243392944336, "memory(GiB)": 77.56, "step": 59205, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.438406 }, { "epoch": 2.536737928966197, "grad_norm": 6.551125526428223, "learning_rate": 4.8845946414713076e-05, "loss": 2.5869518280029298, "memory(GiB)": 77.56, "step": 59210, "token_acc": 0.4370860927152318, "train_speed(iter/s)": 1.438374 }, { "epoch": 2.5369521442954457, "grad_norm": 7.950067043304443, "learning_rate": 4.883921844497478e-05, "loss": 2.903058624267578, "memory(GiB)": 77.56, "step": 59215, "token_acc": 0.4393939393939394, "train_speed(iter/s)": 1.438382 }, { "epoch": 2.5371663596246945, "grad_norm": 4.788400173187256, "learning_rate": 4.883249049626514e-05, "loss": 2.566099166870117, "memory(GiB)": 77.56, "step": 59220, "token_acc": 0.43508771929824563, "train_speed(iter/s)": 1.438411 }, { "epoch": 2.5373805749539438, "grad_norm": 5.352656841278076, "learning_rate": 4.882576256870604e-05, "loss": 2.491599464416504, "memory(GiB)": 77.56, "step": 59225, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.438414 }, { "epoch": 2.5375947902831926, "grad_norm": 4.482700824737549, "learning_rate": 4.8819034662419326e-05, "loss": 2.4838624954223634, "memory(GiB)": 77.56, "step": 59230, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.438432 }, { "epoch": 2.5378090056124414, "grad_norm": 5.484973907470703, "learning_rate": 4.881230677752693e-05, "loss": 2.3808292388916015, "memory(GiB)": 77.56, "step": 59235, "token_acc": 0.50390625, "train_speed(iter/s)": 1.438449 }, { "epoch": 2.5380232209416906, "grad_norm": 4.7214274406433105, "learning_rate": 4.880557891415068e-05, "loss": 2.483547019958496, "memory(GiB)": 77.56, "step": 59240, "token_acc": 0.47305389221556887, "train_speed(iter/s)": 1.438464 }, { "epoch": 2.5382374362709395, "grad_norm": 5.173149585723877, "learning_rate": 4.87988510724125e-05, "loss": 2.4541904449462892, "memory(GiB)": 77.56, "step": 59245, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.438492 }, { "epoch": 2.5384516516001883, "grad_norm": 4.171067237854004, "learning_rate": 4.8792123252434276e-05, "loss": 2.2743024826049805, "memory(GiB)": 77.56, "step": 59250, "token_acc": 0.4983922829581994, "train_speed(iter/s)": 1.438462 }, { "epoch": 2.5386658669294375, "grad_norm": 5.42086124420166, "learning_rate": 4.878539545433787e-05, "loss": 2.404249572753906, "memory(GiB)": 77.56, "step": 59255, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.438469 }, { "epoch": 2.5388800822586863, "grad_norm": 4.652543544769287, "learning_rate": 4.877866767824515e-05, "loss": 2.578704833984375, "memory(GiB)": 77.56, "step": 59260, "token_acc": 0.46875, "train_speed(iter/s)": 1.438484 }, { "epoch": 2.539094297587935, "grad_norm": 5.320056915283203, "learning_rate": 4.877193992427803e-05, "loss": 2.273621940612793, "memory(GiB)": 77.56, "step": 59265, "token_acc": 0.5441176470588235, "train_speed(iter/s)": 1.438477 }, { "epoch": 2.5393085129171844, "grad_norm": 6.450484275817871, "learning_rate": 4.876521219255836e-05, "loss": 2.215576171875, "memory(GiB)": 77.56, "step": 59270, "token_acc": 0.5265017667844523, "train_speed(iter/s)": 1.438473 }, { "epoch": 2.539522728246433, "grad_norm": 6.637783527374268, "learning_rate": 4.8758484483208024e-05, "loss": 2.595779609680176, "memory(GiB)": 77.56, "step": 59275, "token_acc": 0.4448979591836735, "train_speed(iter/s)": 1.438445 }, { "epoch": 2.539736943575682, "grad_norm": 6.064037799835205, "learning_rate": 4.875175679634892e-05, "loss": 2.6027334213256834, "memory(GiB)": 77.56, "step": 59280, "token_acc": 0.5095057034220533, "train_speed(iter/s)": 1.438457 }, { "epoch": 2.5399511589049313, "grad_norm": 5.9925665855407715, "learning_rate": 4.87450291321029e-05, "loss": 2.3716007232666017, "memory(GiB)": 77.56, "step": 59285, "token_acc": 0.5313807531380753, "train_speed(iter/s)": 1.438492 }, { "epoch": 2.54016537423418, "grad_norm": 4.940703868865967, "learning_rate": 4.873830149059186e-05, "loss": 2.00594539642334, "memory(GiB)": 77.56, "step": 59290, "token_acc": 0.5764705882352941, "train_speed(iter/s)": 1.438499 }, { "epoch": 2.540379589563429, "grad_norm": 5.072874546051025, "learning_rate": 4.873157387193767e-05, "loss": 2.4332725524902346, "memory(GiB)": 77.56, "step": 59295, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.438513 }, { "epoch": 2.540593804892678, "grad_norm": 4.554154872894287, "learning_rate": 4.872484627626221e-05, "loss": 2.604878234863281, "memory(GiB)": 77.56, "step": 59300, "token_acc": 0.4797507788161994, "train_speed(iter/s)": 1.438504 }, { "epoch": 2.540808020221927, "grad_norm": 4.922318935394287, "learning_rate": 4.871811870368736e-05, "loss": 2.366122817993164, "memory(GiB)": 77.56, "step": 59305, "token_acc": 0.5231788079470199, "train_speed(iter/s)": 1.438522 }, { "epoch": 2.541022235551176, "grad_norm": 5.363475322723389, "learning_rate": 4.8711391154335e-05, "loss": 2.1453617095947264, "memory(GiB)": 77.56, "step": 59310, "token_acc": 0.5170940170940171, "train_speed(iter/s)": 1.438537 }, { "epoch": 2.541236450880425, "grad_norm": 4.4472222328186035, "learning_rate": 4.870466362832696e-05, "loss": 2.1647071838378906, "memory(GiB)": 77.56, "step": 59315, "token_acc": 0.5348837209302325, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.541450666209674, "grad_norm": 4.510188102722168, "learning_rate": 4.869793612578518e-05, "loss": 2.268606948852539, "memory(GiB)": 77.56, "step": 59320, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.43856 }, { "epoch": 2.5416648815389227, "grad_norm": 5.476666450500488, "learning_rate": 4.869120864683151e-05, "loss": 2.5754816055297853, "memory(GiB)": 77.56, "step": 59325, "token_acc": 0.5103448275862069, "train_speed(iter/s)": 1.438536 }, { "epoch": 2.541879096868172, "grad_norm": 5.3883442878723145, "learning_rate": 4.868448119158783e-05, "loss": 2.504313087463379, "memory(GiB)": 77.56, "step": 59330, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.438537 }, { "epoch": 2.5420933121974207, "grad_norm": 5.293647289276123, "learning_rate": 4.8677753760175995e-05, "loss": 2.352847862243652, "memory(GiB)": 77.56, "step": 59335, "token_acc": 0.5, "train_speed(iter/s)": 1.438554 }, { "epoch": 2.5423075275266696, "grad_norm": 7.011762619018555, "learning_rate": 4.8671026352717895e-05, "loss": 2.2629510879516603, "memory(GiB)": 77.56, "step": 59340, "token_acc": 0.45, "train_speed(iter/s)": 1.438562 }, { "epoch": 2.542521742855919, "grad_norm": 5.031600475311279, "learning_rate": 4.866429896933539e-05, "loss": 2.171701431274414, "memory(GiB)": 77.56, "step": 59345, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.43855 }, { "epoch": 2.5427359581851676, "grad_norm": 6.941433429718018, "learning_rate": 4.865757161015038e-05, "loss": 2.555876541137695, "memory(GiB)": 77.56, "step": 59350, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.438547 }, { "epoch": 2.5429501735144164, "grad_norm": 5.078488349914551, "learning_rate": 4.865084427528471e-05, "loss": 2.546540451049805, "memory(GiB)": 77.56, "step": 59355, "token_acc": 0.4609756097560976, "train_speed(iter/s)": 1.438548 }, { "epoch": 2.5431643888436657, "grad_norm": 9.364160537719727, "learning_rate": 4.864411696486026e-05, "loss": 2.586537742614746, "memory(GiB)": 77.56, "step": 59360, "token_acc": 0.5104602510460251, "train_speed(iter/s)": 1.438535 }, { "epoch": 2.5433786041729145, "grad_norm": 5.819259166717529, "learning_rate": 4.863738967899891e-05, "loss": 2.5130224227905273, "memory(GiB)": 77.56, "step": 59365, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.438569 }, { "epoch": 2.5435928195021633, "grad_norm": 5.531962871551514, "learning_rate": 4.8630662417822526e-05, "loss": 2.374894905090332, "memory(GiB)": 77.56, "step": 59370, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.43857 }, { "epoch": 2.5438070348314126, "grad_norm": 5.3213911056518555, "learning_rate": 4.8623935181452966e-05, "loss": 2.6349231719970705, "memory(GiB)": 77.56, "step": 59375, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.438568 }, { "epoch": 2.5440212501606614, "grad_norm": 5.56639289855957, "learning_rate": 4.861720797001212e-05, "loss": 2.4288457870483398, "memory(GiB)": 77.56, "step": 59380, "token_acc": 0.4471299093655589, "train_speed(iter/s)": 1.438565 }, { "epoch": 2.54423546548991, "grad_norm": 4.515190124511719, "learning_rate": 4.8610480783621835e-05, "loss": 2.5948448181152344, "memory(GiB)": 77.56, "step": 59385, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.43855 }, { "epoch": 2.5444496808191595, "grad_norm": 6.551082611083984, "learning_rate": 4.860375362240399e-05, "loss": 2.078593063354492, "memory(GiB)": 77.56, "step": 59390, "token_acc": 0.5461254612546126, "train_speed(iter/s)": 1.43857 }, { "epoch": 2.5446638961484083, "grad_norm": 5.5749897956848145, "learning_rate": 4.859702648648047e-05, "loss": 2.096351432800293, "memory(GiB)": 77.56, "step": 59395, "token_acc": 0.5291828793774319, "train_speed(iter/s)": 1.438577 }, { "epoch": 2.544878111477657, "grad_norm": 6.020618915557861, "learning_rate": 4.859029937597314e-05, "loss": 2.20284423828125, "memory(GiB)": 77.56, "step": 59400, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.438566 }, { "epoch": 2.5450923268069063, "grad_norm": 5.776437282562256, "learning_rate": 4.858357229100385e-05, "loss": 2.394856643676758, "memory(GiB)": 77.56, "step": 59405, "token_acc": 0.46629213483146065, "train_speed(iter/s)": 1.438596 }, { "epoch": 2.545306542136155, "grad_norm": 6.168442726135254, "learning_rate": 4.857684523169449e-05, "loss": 2.3220544815063477, "memory(GiB)": 77.56, "step": 59410, "token_acc": 0.47876447876447875, "train_speed(iter/s)": 1.438611 }, { "epoch": 2.545520757465404, "grad_norm": 4.487974643707275, "learning_rate": 4.8570118198166896e-05, "loss": 2.2447771072387694, "memory(GiB)": 77.56, "step": 59415, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.438602 }, { "epoch": 2.545734972794653, "grad_norm": 5.289567470550537, "learning_rate": 4.8563391190542954e-05, "loss": 2.6859954833984374, "memory(GiB)": 77.56, "step": 59420, "token_acc": 0.49538461538461537, "train_speed(iter/s)": 1.43861 }, { "epoch": 2.545949188123902, "grad_norm": 5.069399356842041, "learning_rate": 4.855666420894454e-05, "loss": 2.771007537841797, "memory(GiB)": 77.56, "step": 59425, "token_acc": 0.4295774647887324, "train_speed(iter/s)": 1.438642 }, { "epoch": 2.546163403453151, "grad_norm": 6.1787428855896, "learning_rate": 4.85499372534935e-05, "loss": 2.628916549682617, "memory(GiB)": 77.56, "step": 59430, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.438649 }, { "epoch": 2.5463776187824, "grad_norm": 5.285228252410889, "learning_rate": 4.8543210324311704e-05, "loss": 2.485353469848633, "memory(GiB)": 77.56, "step": 59435, "token_acc": 0.45664739884393063, "train_speed(iter/s)": 1.438637 }, { "epoch": 2.546591834111649, "grad_norm": 7.288173198699951, "learning_rate": 4.8536483421521025e-05, "loss": 2.538277435302734, "memory(GiB)": 77.56, "step": 59440, "token_acc": 0.46449704142011833, "train_speed(iter/s)": 1.438629 }, { "epoch": 2.5468060494408977, "grad_norm": 5.360102653503418, "learning_rate": 4.852975654524332e-05, "loss": 2.5608760833740236, "memory(GiB)": 77.56, "step": 59445, "token_acc": 0.4507042253521127, "train_speed(iter/s)": 1.438636 }, { "epoch": 2.547020264770147, "grad_norm": 4.4294233322143555, "learning_rate": 4.852302969560046e-05, "loss": 2.1616743087768553, "memory(GiB)": 77.56, "step": 59450, "token_acc": 0.54296875, "train_speed(iter/s)": 1.438663 }, { "epoch": 2.547234480099396, "grad_norm": 5.301448345184326, "learning_rate": 4.8516302872714295e-05, "loss": 2.5592079162597656, "memory(GiB)": 77.56, "step": 59455, "token_acc": 0.49185667752442996, "train_speed(iter/s)": 1.438679 }, { "epoch": 2.5474486954286446, "grad_norm": 6.969396114349365, "learning_rate": 4.8509576076706695e-05, "loss": 2.2596336364746095, "memory(GiB)": 77.56, "step": 59460, "token_acc": 0.509090909090909, "train_speed(iter/s)": 1.43867 }, { "epoch": 2.547662910757894, "grad_norm": 5.3324666023254395, "learning_rate": 4.8502849307699504e-05, "loss": 2.3390174865722657, "memory(GiB)": 77.56, "step": 59465, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.43868 }, { "epoch": 2.5478771260871427, "grad_norm": 4.63497257232666, "learning_rate": 4.849612256581463e-05, "loss": 2.6836269378662108, "memory(GiB)": 77.56, "step": 59470, "token_acc": 0.4503311258278146, "train_speed(iter/s)": 1.43868 }, { "epoch": 2.5480913414163915, "grad_norm": 5.863549709320068, "learning_rate": 4.8489395851173905e-05, "loss": 2.3295476913452147, "memory(GiB)": 77.56, "step": 59475, "token_acc": 0.45724907063197023, "train_speed(iter/s)": 1.438666 }, { "epoch": 2.5483055567456407, "grad_norm": 9.32987117767334, "learning_rate": 4.848266916389918e-05, "loss": 2.429655075073242, "memory(GiB)": 77.56, "step": 59480, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.438659 }, { "epoch": 2.5485197720748896, "grad_norm": 4.570631504058838, "learning_rate": 4.847594250411234e-05, "loss": 2.41324405670166, "memory(GiB)": 77.56, "step": 59485, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.4387 }, { "epoch": 2.5487339874041384, "grad_norm": 5.507359981536865, "learning_rate": 4.8469215871935216e-05, "loss": 2.5094242095947266, "memory(GiB)": 77.56, "step": 59490, "token_acc": 0.4819672131147541, "train_speed(iter/s)": 1.438703 }, { "epoch": 2.5489482027333876, "grad_norm": 4.875783443450928, "learning_rate": 4.846248926748969e-05, "loss": 2.235984039306641, "memory(GiB)": 77.56, "step": 59495, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.438641 }, { "epoch": 2.5491624180626364, "grad_norm": 5.424518585205078, "learning_rate": 4.845576269089762e-05, "loss": 2.3595706939697267, "memory(GiB)": 77.56, "step": 59500, "token_acc": 0.5100864553314121, "train_speed(iter/s)": 1.438651 }, { "epoch": 2.5491624180626364, "eval_loss": 2.287466049194336, "eval_runtime": 14.7182, "eval_samples_per_second": 6.794, "eval_steps_per_second": 6.794, "eval_token_acc": 0.44106463878326996, "step": 59500 }, { "epoch": 2.5493766333918852, "grad_norm": 5.486857891082764, "learning_rate": 4.844903614228084e-05, "loss": 2.2575931549072266, "memory(GiB)": 77.56, "step": 59505, "token_acc": 0.4548022598870056, "train_speed(iter/s)": 1.43809 }, { "epoch": 2.5495908487211345, "grad_norm": 4.657067775726318, "learning_rate": 4.844230962176124e-05, "loss": 2.5744064331054686, "memory(GiB)": 77.56, "step": 59510, "token_acc": 0.4438040345821326, "train_speed(iter/s)": 1.438102 }, { "epoch": 2.5498050640503833, "grad_norm": 5.722229957580566, "learning_rate": 4.8435583129460666e-05, "loss": 2.1946325302124023, "memory(GiB)": 77.56, "step": 59515, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.438123 }, { "epoch": 2.550019279379632, "grad_norm": 6.230782985687256, "learning_rate": 4.842885666550095e-05, "loss": 2.3860937118530274, "memory(GiB)": 77.56, "step": 59520, "token_acc": 0.4921875, "train_speed(iter/s)": 1.43813 }, { "epoch": 2.5502334947088814, "grad_norm": 6.758782863616943, "learning_rate": 4.842213023000399e-05, "loss": 2.551443862915039, "memory(GiB)": 77.56, "step": 59525, "token_acc": 0.5, "train_speed(iter/s)": 1.438143 }, { "epoch": 2.55044771003813, "grad_norm": 5.023636341094971, "learning_rate": 4.841540382309161e-05, "loss": 2.397336959838867, "memory(GiB)": 77.56, "step": 59530, "token_acc": 0.48120300751879697, "train_speed(iter/s)": 1.438151 }, { "epoch": 2.550661925367379, "grad_norm": 7.535745143890381, "learning_rate": 4.8408677444885685e-05, "loss": 2.4807538986206055, "memory(GiB)": 77.56, "step": 59535, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.43817 }, { "epoch": 2.5508761406966283, "grad_norm": 6.641665458679199, "learning_rate": 4.840195109550804e-05, "loss": 2.4462207794189452, "memory(GiB)": 77.56, "step": 59540, "token_acc": 0.46037735849056605, "train_speed(iter/s)": 1.438168 }, { "epoch": 2.551090356025877, "grad_norm": 5.368220329284668, "learning_rate": 4.8395224775080574e-05, "loss": 2.412137031555176, "memory(GiB)": 77.56, "step": 59545, "token_acc": 0.4708029197080292, "train_speed(iter/s)": 1.438162 }, { "epoch": 2.5513045713551263, "grad_norm": 5.6271281242370605, "learning_rate": 4.83884984837251e-05, "loss": 2.4368988037109376, "memory(GiB)": 77.56, "step": 59550, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.438156 }, { "epoch": 2.551518786684375, "grad_norm": 5.452853202819824, "learning_rate": 4.83817722215635e-05, "loss": 2.67132511138916, "memory(GiB)": 77.56, "step": 59555, "token_acc": 0.47093023255813954, "train_speed(iter/s)": 1.438173 }, { "epoch": 2.551733002013624, "grad_norm": 5.387850761413574, "learning_rate": 4.837504598871762e-05, "loss": 2.4648250579833983, "memory(GiB)": 77.56, "step": 59560, "token_acc": 0.4482758620689655, "train_speed(iter/s)": 1.438154 }, { "epoch": 2.551947217342873, "grad_norm": 5.006322860717773, "learning_rate": 4.8368319785309285e-05, "loss": 2.3904516220092775, "memory(GiB)": 77.56, "step": 59565, "token_acc": 0.5, "train_speed(iter/s)": 1.438188 }, { "epoch": 2.552161432672122, "grad_norm": 6.988188743591309, "learning_rate": 4.836159361146038e-05, "loss": 2.312815475463867, "memory(GiB)": 77.56, "step": 59570, "token_acc": 0.48, "train_speed(iter/s)": 1.438205 }, { "epoch": 2.552375648001371, "grad_norm": 5.220654010772705, "learning_rate": 4.835486746729274e-05, "loss": 2.6267271041870117, "memory(GiB)": 77.56, "step": 59575, "token_acc": 0.4221556886227545, "train_speed(iter/s)": 1.438235 }, { "epoch": 2.55258986333062, "grad_norm": 4.633787631988525, "learning_rate": 4.834814135292822e-05, "loss": 2.1964237213134767, "memory(GiB)": 77.56, "step": 59580, "token_acc": 0.5419354838709678, "train_speed(iter/s)": 1.438249 }, { "epoch": 2.552804078659869, "grad_norm": 6.835556507110596, "learning_rate": 4.834141526848868e-05, "loss": 2.3077939987182616, "memory(GiB)": 77.56, "step": 59585, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.438274 }, { "epoch": 2.5530182939891177, "grad_norm": 6.0026373863220215, "learning_rate": 4.833468921409594e-05, "loss": 2.52010555267334, "memory(GiB)": 77.56, "step": 59590, "token_acc": 0.5058823529411764, "train_speed(iter/s)": 1.438293 }, { "epoch": 2.553232509318367, "grad_norm": 8.448450088500977, "learning_rate": 4.832796318987188e-05, "loss": 2.5625141143798826, "memory(GiB)": 77.56, "step": 59595, "token_acc": 0.4517374517374517, "train_speed(iter/s)": 1.438253 }, { "epoch": 2.553446724647616, "grad_norm": 6.984602451324463, "learning_rate": 4.832123719593834e-05, "loss": 2.6494930267333983, "memory(GiB)": 77.56, "step": 59600, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.438278 }, { "epoch": 2.5536609399768646, "grad_norm": 6.8267741203308105, "learning_rate": 4.831451123241715e-05, "loss": 2.034275436401367, "memory(GiB)": 77.56, "step": 59605, "token_acc": 0.5678391959798995, "train_speed(iter/s)": 1.438309 }, { "epoch": 2.553875155306114, "grad_norm": 6.508211135864258, "learning_rate": 4.8307785299430156e-05, "loss": 2.752101516723633, "memory(GiB)": 77.56, "step": 59610, "token_acc": 0.4375, "train_speed(iter/s)": 1.438319 }, { "epoch": 2.5540893706353627, "grad_norm": 4.957455158233643, "learning_rate": 4.830105939709924e-05, "loss": 2.447592544555664, "memory(GiB)": 77.56, "step": 59615, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.438317 }, { "epoch": 2.5543035859646115, "grad_norm": 6.223200798034668, "learning_rate": 4.8294333525546234e-05, "loss": 2.5762271881103516, "memory(GiB)": 77.56, "step": 59620, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.438277 }, { "epoch": 2.5545178012938607, "grad_norm": 6.365777015686035, "learning_rate": 4.828760768489295e-05, "loss": 2.6135543823242187, "memory(GiB)": 77.56, "step": 59625, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.4383 }, { "epoch": 2.5547320166231096, "grad_norm": 5.369986057281494, "learning_rate": 4.8280881875261284e-05, "loss": 2.4641145706176757, "memory(GiB)": 77.56, "step": 59630, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.438298 }, { "epoch": 2.5549462319523584, "grad_norm": 6.556378364562988, "learning_rate": 4.8274156096773046e-05, "loss": 2.4437116622924804, "memory(GiB)": 77.56, "step": 59635, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.438305 }, { "epoch": 2.5551604472816076, "grad_norm": 5.670872211456299, "learning_rate": 4.8267430349550094e-05, "loss": 2.2682395935058595, "memory(GiB)": 77.56, "step": 59640, "token_acc": 0.5280898876404494, "train_speed(iter/s)": 1.438307 }, { "epoch": 2.5553746626108564, "grad_norm": 5.879903316497803, "learning_rate": 4.826070463371427e-05, "loss": 2.502425193786621, "memory(GiB)": 77.56, "step": 59645, "token_acc": 0.4613003095975232, "train_speed(iter/s)": 1.438301 }, { "epoch": 2.5555888779401053, "grad_norm": 6.359103679656982, "learning_rate": 4.8253978949387394e-05, "loss": 2.4154285430908202, "memory(GiB)": 77.56, "step": 59650, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.438327 }, { "epoch": 2.5558030932693545, "grad_norm": 4.669216156005859, "learning_rate": 4.824725329669135e-05, "loss": 2.3376068115234374, "memory(GiB)": 77.56, "step": 59655, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.438346 }, { "epoch": 2.5560173085986033, "grad_norm": 5.696174621582031, "learning_rate": 4.824052767574795e-05, "loss": 2.300503158569336, "memory(GiB)": 77.56, "step": 59660, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.438346 }, { "epoch": 2.556231523927852, "grad_norm": 4.010333061218262, "learning_rate": 4.823380208667903e-05, "loss": 2.602652168273926, "memory(GiB)": 77.56, "step": 59665, "token_acc": 0.45609065155807366, "train_speed(iter/s)": 1.43832 }, { "epoch": 2.5564457392571014, "grad_norm": 6.473670959472656, "learning_rate": 4.8227076529606455e-05, "loss": 2.2469112396240236, "memory(GiB)": 77.56, "step": 59670, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.438321 }, { "epoch": 2.55665995458635, "grad_norm": 4.375396251678467, "learning_rate": 4.8220351004652036e-05, "loss": 2.400933265686035, "memory(GiB)": 77.56, "step": 59675, "token_acc": 0.48286604361370716, "train_speed(iter/s)": 1.438346 }, { "epoch": 2.556874169915599, "grad_norm": 5.402980804443359, "learning_rate": 4.8213625511937644e-05, "loss": 2.8342254638671873, "memory(GiB)": 77.56, "step": 59680, "token_acc": 0.436426116838488, "train_speed(iter/s)": 1.438372 }, { "epoch": 2.5570883852448483, "grad_norm": 6.275370121002197, "learning_rate": 4.820690005158508e-05, "loss": 2.266942596435547, "memory(GiB)": 77.56, "step": 59685, "token_acc": 0.5309446254071661, "train_speed(iter/s)": 1.438365 }, { "epoch": 2.557302600574097, "grad_norm": 4.4692888259887695, "learning_rate": 4.820017462371622e-05, "loss": 2.4160953521728517, "memory(GiB)": 77.56, "step": 59690, "token_acc": 0.45353159851301117, "train_speed(iter/s)": 1.438365 }, { "epoch": 2.557516815903346, "grad_norm": 8.16158390045166, "learning_rate": 4.819344922845288e-05, "loss": 2.6106136322021483, "memory(GiB)": 77.56, "step": 59695, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.438352 }, { "epoch": 2.557731031232595, "grad_norm": 7.145231246948242, "learning_rate": 4.818672386591691e-05, "loss": 2.588006019592285, "memory(GiB)": 77.56, "step": 59700, "token_acc": 0.4440894568690096, "train_speed(iter/s)": 1.438373 }, { "epoch": 2.557945246561844, "grad_norm": 5.022751331329346, "learning_rate": 4.817999853623014e-05, "loss": 2.312425231933594, "memory(GiB)": 77.56, "step": 59705, "token_acc": 0.461038961038961, "train_speed(iter/s)": 1.438373 }, { "epoch": 2.558159461891093, "grad_norm": 7.166477203369141, "learning_rate": 4.8173273239514396e-05, "loss": 2.4700672149658205, "memory(GiB)": 77.56, "step": 59710, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.438366 }, { "epoch": 2.558373677220342, "grad_norm": 5.713319778442383, "learning_rate": 4.816654797589153e-05, "loss": 2.461689758300781, "memory(GiB)": 77.56, "step": 59715, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.438363 }, { "epoch": 2.558587892549591, "grad_norm": 5.759060859680176, "learning_rate": 4.815982274548335e-05, "loss": 2.366672134399414, "memory(GiB)": 77.56, "step": 59720, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.438377 }, { "epoch": 2.5588021078788397, "grad_norm": 7.476055145263672, "learning_rate": 4.815309754841172e-05, "loss": 2.295948600769043, "memory(GiB)": 77.56, "step": 59725, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.438382 }, { "epoch": 2.559016323208089, "grad_norm": 6.503139495849609, "learning_rate": 4.814637238479847e-05, "loss": 2.4775360107421873, "memory(GiB)": 77.56, "step": 59730, "token_acc": 0.509375, "train_speed(iter/s)": 1.438402 }, { "epoch": 2.5592305385373377, "grad_norm": 4.851894855499268, "learning_rate": 4.8139647254765404e-05, "loss": 2.321548843383789, "memory(GiB)": 77.56, "step": 59735, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.43843 }, { "epoch": 2.5594447538665865, "grad_norm": 5.576775074005127, "learning_rate": 4.8132922158434384e-05, "loss": 2.3857646942138673, "memory(GiB)": 77.56, "step": 59740, "token_acc": 0.4956140350877193, "train_speed(iter/s)": 1.438444 }, { "epoch": 2.559658969195836, "grad_norm": 4.1467132568359375, "learning_rate": 4.812619709592723e-05, "loss": 2.2559581756591798, "memory(GiB)": 77.56, "step": 59745, "token_acc": 0.508, "train_speed(iter/s)": 1.438444 }, { "epoch": 2.5598731845250846, "grad_norm": 7.181698799133301, "learning_rate": 4.8119472067365766e-05, "loss": 2.2758745193481444, "memory(GiB)": 77.56, "step": 59750, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.438452 }, { "epoch": 2.5600873998543334, "grad_norm": 4.9263129234313965, "learning_rate": 4.8112747072871836e-05, "loss": 2.543910789489746, "memory(GiB)": 77.56, "step": 59755, "token_acc": 0.45051194539249145, "train_speed(iter/s)": 1.438467 }, { "epoch": 2.5603016151835827, "grad_norm": 5.002185821533203, "learning_rate": 4.8106022112567247e-05, "loss": 2.6644874572753907, "memory(GiB)": 77.56, "step": 59760, "token_acc": 0.43354430379746833, "train_speed(iter/s)": 1.438482 }, { "epoch": 2.5605158305128315, "grad_norm": 4.545617580413818, "learning_rate": 4.809929718657386e-05, "loss": 2.468119430541992, "memory(GiB)": 77.56, "step": 59765, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.438496 }, { "epoch": 2.5607300458420803, "grad_norm": 7.644782066345215, "learning_rate": 4.809257229501348e-05, "loss": 2.3550495147705077, "memory(GiB)": 77.56, "step": 59770, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.438471 }, { "epoch": 2.5609442611713296, "grad_norm": 7.732504844665527, "learning_rate": 4.8085847438007955e-05, "loss": 2.4434947967529297, "memory(GiB)": 77.56, "step": 59775, "token_acc": 0.4528985507246377, "train_speed(iter/s)": 1.438483 }, { "epoch": 2.5611584765005784, "grad_norm": 4.603659152984619, "learning_rate": 4.807912261567908e-05, "loss": 2.3427825927734376, "memory(GiB)": 77.56, "step": 59780, "token_acc": 0.5070921985815603, "train_speed(iter/s)": 1.438471 }, { "epoch": 2.561372691829827, "grad_norm": 6.309761047363281, "learning_rate": 4.807239782814872e-05, "loss": 2.404462432861328, "memory(GiB)": 77.56, "step": 59785, "token_acc": 0.5234375, "train_speed(iter/s)": 1.438483 }, { "epoch": 2.5615869071590764, "grad_norm": 5.627111434936523, "learning_rate": 4.8065673075538685e-05, "loss": 2.213104248046875, "memory(GiB)": 77.56, "step": 59790, "token_acc": 0.5191082802547771, "train_speed(iter/s)": 1.438469 }, { "epoch": 2.5618011224883253, "grad_norm": 4.7075276374816895, "learning_rate": 4.805894835797078e-05, "loss": 2.0538761138916017, "memory(GiB)": 77.56, "step": 59795, "token_acc": 0.540650406504065, "train_speed(iter/s)": 1.438466 }, { "epoch": 2.562015337817574, "grad_norm": 5.505980491638184, "learning_rate": 4.805222367556685e-05, "loss": 2.2942991256713867, "memory(GiB)": 77.56, "step": 59800, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.438501 }, { "epoch": 2.5622295531468233, "grad_norm": 5.822419166564941, "learning_rate": 4.804549902844873e-05, "loss": 2.2477752685546877, "memory(GiB)": 77.56, "step": 59805, "token_acc": 0.48672566371681414, "train_speed(iter/s)": 1.438498 }, { "epoch": 2.562443768476072, "grad_norm": 5.29340124130249, "learning_rate": 4.8038774416738205e-05, "loss": 2.4582977294921875, "memory(GiB)": 77.56, "step": 59810, "token_acc": 0.47307692307692306, "train_speed(iter/s)": 1.43848 }, { "epoch": 2.562657983805321, "grad_norm": 5.51577091217041, "learning_rate": 4.803204984055714e-05, "loss": 2.402615547180176, "memory(GiB)": 77.56, "step": 59815, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.438483 }, { "epoch": 2.56287219913457, "grad_norm": 5.877311706542969, "learning_rate": 4.802532530002733e-05, "loss": 2.339146041870117, "memory(GiB)": 77.56, "step": 59820, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.438484 }, { "epoch": 2.563086414463819, "grad_norm": 5.680993556976318, "learning_rate": 4.801860079527061e-05, "loss": 2.393800735473633, "memory(GiB)": 77.56, "step": 59825, "token_acc": 0.4881656804733728, "train_speed(iter/s)": 1.43849 }, { "epoch": 2.563300629793068, "grad_norm": 5.882232666015625, "learning_rate": 4.8011876326408796e-05, "loss": 2.3937700271606444, "memory(GiB)": 77.56, "step": 59830, "token_acc": 0.4785992217898833, "train_speed(iter/s)": 1.438508 }, { "epoch": 2.563514845122317, "grad_norm": 5.29835319519043, "learning_rate": 4.8005151893563684e-05, "loss": 2.4504329681396486, "memory(GiB)": 77.56, "step": 59835, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438518 }, { "epoch": 2.563729060451566, "grad_norm": 5.1285400390625, "learning_rate": 4.799842749685713e-05, "loss": 2.3803762435913085, "memory(GiB)": 77.56, "step": 59840, "token_acc": 0.4314868804664723, "train_speed(iter/s)": 1.438549 }, { "epoch": 2.5639432757808147, "grad_norm": 4.771817207336426, "learning_rate": 4.799170313641095e-05, "loss": 2.4962512969970705, "memory(GiB)": 77.56, "step": 59845, "token_acc": 0.4871060171919771, "train_speed(iter/s)": 1.438557 }, { "epoch": 2.564157491110064, "grad_norm": 5.648887634277344, "learning_rate": 4.798497881234695e-05, "loss": 2.5638856887817383, "memory(GiB)": 77.56, "step": 59850, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.43857 }, { "epoch": 2.564371706439313, "grad_norm": 4.494454860687256, "learning_rate": 4.7978254524786935e-05, "loss": 2.489668273925781, "memory(GiB)": 77.56, "step": 59855, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.438552 }, { "epoch": 2.5645859217685616, "grad_norm": 7.119904518127441, "learning_rate": 4.7971530273852754e-05, "loss": 2.4390853881835937, "memory(GiB)": 77.56, "step": 59860, "token_acc": 0.48, "train_speed(iter/s)": 1.438522 }, { "epoch": 2.564800137097811, "grad_norm": 7.292238712310791, "learning_rate": 4.796480605966619e-05, "loss": 2.300970458984375, "memory(GiB)": 77.56, "step": 59865, "token_acc": 0.5321888412017167, "train_speed(iter/s)": 1.438528 }, { "epoch": 2.5650143524270597, "grad_norm": 7.616230487823486, "learning_rate": 4.795808188234909e-05, "loss": 2.079450988769531, "memory(GiB)": 77.56, "step": 59870, "token_acc": 0.52, "train_speed(iter/s)": 1.438536 }, { "epoch": 2.5652285677563085, "grad_norm": 4.937198638916016, "learning_rate": 4.795135774202324e-05, "loss": 2.3414304733276365, "memory(GiB)": 77.56, "step": 59875, "token_acc": 0.512, "train_speed(iter/s)": 1.438518 }, { "epoch": 2.5654427830855577, "grad_norm": 5.350150108337402, "learning_rate": 4.794463363881047e-05, "loss": 2.444717025756836, "memory(GiB)": 77.56, "step": 59880, "token_acc": 0.5322033898305085, "train_speed(iter/s)": 1.438497 }, { "epoch": 2.5656569984148065, "grad_norm": 4.827812671661377, "learning_rate": 4.793790957283259e-05, "loss": 2.355134201049805, "memory(GiB)": 77.56, "step": 59885, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.438499 }, { "epoch": 2.565871213744056, "grad_norm": 6.479953765869141, "learning_rate": 4.7931185544211416e-05, "loss": 2.437975311279297, "memory(GiB)": 77.56, "step": 59890, "token_acc": 0.46494464944649444, "train_speed(iter/s)": 1.438505 }, { "epoch": 2.5660854290733046, "grad_norm": 5.186941146850586, "learning_rate": 4.7924461553068745e-05, "loss": 2.648612213134766, "memory(GiB)": 77.56, "step": 59895, "token_acc": 0.43630573248407645, "train_speed(iter/s)": 1.438468 }, { "epoch": 2.5662996444025534, "grad_norm": 4.517250061035156, "learning_rate": 4.7917737599526415e-05, "loss": 2.4635990142822264, "memory(GiB)": 77.56, "step": 59900, "token_acc": 0.4574468085106383, "train_speed(iter/s)": 1.438478 }, { "epoch": 2.5665138597318027, "grad_norm": 9.733177185058594, "learning_rate": 4.791101368370619e-05, "loss": 2.624422073364258, "memory(GiB)": 77.56, "step": 59905, "token_acc": 0.5056179775280899, "train_speed(iter/s)": 1.438486 }, { "epoch": 2.5667280750610515, "grad_norm": 4.182031631469727, "learning_rate": 4.790428980572994e-05, "loss": 2.271896743774414, "memory(GiB)": 77.56, "step": 59910, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.438428 }, { "epoch": 2.5669422903903003, "grad_norm": 6.960338115692139, "learning_rate": 4.789756596571944e-05, "loss": 2.3902454376220703, "memory(GiB)": 77.56, "step": 59915, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.438447 }, { "epoch": 2.5671565057195496, "grad_norm": 5.9755659103393555, "learning_rate": 4.789084216379651e-05, "loss": 2.0333044052124025, "memory(GiB)": 77.56, "step": 59920, "token_acc": 0.5815602836879432, "train_speed(iter/s)": 1.438457 }, { "epoch": 2.5673707210487984, "grad_norm": 6.6166510581970215, "learning_rate": 4.788411840008294e-05, "loss": 2.3599395751953125, "memory(GiB)": 77.56, "step": 59925, "token_acc": 0.5124555160142349, "train_speed(iter/s)": 1.43846 }, { "epoch": 2.567584936378047, "grad_norm": 5.649675369262695, "learning_rate": 4.7877394674700564e-05, "loss": 2.254487228393555, "memory(GiB)": 77.56, "step": 59930, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.438468 }, { "epoch": 2.5677991517072964, "grad_norm": 5.2218403816223145, "learning_rate": 4.787067098777117e-05, "loss": 2.483497428894043, "memory(GiB)": 77.56, "step": 59935, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.438477 }, { "epoch": 2.5680133670365453, "grad_norm": 4.606977462768555, "learning_rate": 4.786394733941657e-05, "loss": 2.644822883605957, "memory(GiB)": 77.56, "step": 59940, "token_acc": 0.47, "train_speed(iter/s)": 1.438479 }, { "epoch": 2.568227582365794, "grad_norm": 5.572807788848877, "learning_rate": 4.785722372975857e-05, "loss": 2.6316604614257812, "memory(GiB)": 77.56, "step": 59945, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.438486 }, { "epoch": 2.5684417976950433, "grad_norm": 5.559742450714111, "learning_rate": 4.785050015891897e-05, "loss": 2.5154964447021486, "memory(GiB)": 77.56, "step": 59950, "token_acc": 0.5, "train_speed(iter/s)": 1.438492 }, { "epoch": 2.568656013024292, "grad_norm": 6.341352462768555, "learning_rate": 4.7843776627019567e-05, "loss": 2.443520164489746, "memory(GiB)": 77.56, "step": 59955, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.438499 }, { "epoch": 2.568870228353541, "grad_norm": 4.311911106109619, "learning_rate": 4.7837053134182183e-05, "loss": 2.665169525146484, "memory(GiB)": 77.56, "step": 59960, "token_acc": 0.47987616099071206, "train_speed(iter/s)": 1.43847 }, { "epoch": 2.56908444368279, "grad_norm": 10.258307456970215, "learning_rate": 4.78303296805286e-05, "loss": 2.4120904922485353, "memory(GiB)": 77.56, "step": 59965, "token_acc": 0.4967532467532468, "train_speed(iter/s)": 1.438451 }, { "epoch": 2.569298659012039, "grad_norm": 5.7733283042907715, "learning_rate": 4.782360626618064e-05, "loss": 2.4882480621337892, "memory(GiB)": 77.56, "step": 59970, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.438452 }, { "epoch": 2.569512874341288, "grad_norm": 5.327549457550049, "learning_rate": 4.78168828912601e-05, "loss": 2.0749256134033205, "memory(GiB)": 77.56, "step": 59975, "token_acc": 0.5335968379446641, "train_speed(iter/s)": 1.438476 }, { "epoch": 2.569727089670537, "grad_norm": 6.815774917602539, "learning_rate": 4.781015955588875e-05, "loss": 2.417924690246582, "memory(GiB)": 77.56, "step": 59980, "token_acc": 0.5108225108225108, "train_speed(iter/s)": 1.438496 }, { "epoch": 2.569941304999786, "grad_norm": 7.461811065673828, "learning_rate": 4.7803436260188425e-05, "loss": 2.2282024383544923, "memory(GiB)": 77.56, "step": 59985, "token_acc": 0.556420233463035, "train_speed(iter/s)": 1.438505 }, { "epoch": 2.5701555203290347, "grad_norm": 5.774947166442871, "learning_rate": 4.779671300428092e-05, "loss": 2.406715202331543, "memory(GiB)": 77.56, "step": 59990, "token_acc": 0.541095890410959, "train_speed(iter/s)": 1.438521 }, { "epoch": 2.570369735658284, "grad_norm": 5.347567081451416, "learning_rate": 4.778998978828803e-05, "loss": 2.5186132431030273, "memory(GiB)": 77.56, "step": 59995, "token_acc": 0.45663265306122447, "train_speed(iter/s)": 1.438496 }, { "epoch": 2.570583950987533, "grad_norm": 6.227933883666992, "learning_rate": 4.7783266612331536e-05, "loss": 2.5292888641357423, "memory(GiB)": 77.56, "step": 60000, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.438522 }, { "epoch": 2.570583950987533, "eval_loss": 2.0648553371429443, "eval_runtime": 14.8514, "eval_samples_per_second": 6.733, "eval_steps_per_second": 6.733, "eval_token_acc": 0.4709066305818674, "step": 60000 }, { "epoch": 2.5707981663167816, "grad_norm": 4.6119384765625, "learning_rate": 4.777654347653326e-05, "loss": 2.4082363128662108, "memory(GiB)": 77.56, "step": 60005, "token_acc": 0.47517039922103216, "train_speed(iter/s)": 1.43799 }, { "epoch": 2.571012381646031, "grad_norm": 5.563006401062012, "learning_rate": 4.776982038101497e-05, "loss": 2.075536918640137, "memory(GiB)": 77.56, "step": 60010, "token_acc": 0.5698529411764706, "train_speed(iter/s)": 1.438023 }, { "epoch": 2.5712265969752797, "grad_norm": 5.174965858459473, "learning_rate": 4.776309732589849e-05, "loss": 2.38769474029541, "memory(GiB)": 77.56, "step": 60015, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.438007 }, { "epoch": 2.5714408123045285, "grad_norm": 4.748098373413086, "learning_rate": 4.775637431130559e-05, "loss": 2.555836486816406, "memory(GiB)": 77.56, "step": 60020, "token_acc": 0.4892966360856269, "train_speed(iter/s)": 1.438008 }, { "epoch": 2.5716550276337777, "grad_norm": 6.356767654418945, "learning_rate": 4.774965133735808e-05, "loss": 2.361519622802734, "memory(GiB)": 77.56, "step": 60025, "token_acc": 0.4908424908424908, "train_speed(iter/s)": 1.43798 }, { "epoch": 2.5718692429630265, "grad_norm": 5.02504825592041, "learning_rate": 4.7742928404177746e-05, "loss": 2.428948974609375, "memory(GiB)": 77.56, "step": 60030, "token_acc": 0.4492307692307692, "train_speed(iter/s)": 1.437969 }, { "epoch": 2.5720834582922754, "grad_norm": 6.378479480743408, "learning_rate": 4.773620551188638e-05, "loss": 2.5078826904296876, "memory(GiB)": 77.56, "step": 60035, "token_acc": 0.4657534246575342, "train_speed(iter/s)": 1.437953 }, { "epoch": 2.5722976736215246, "grad_norm": 4.581421375274658, "learning_rate": 4.772948266060577e-05, "loss": 2.4601423263549806, "memory(GiB)": 77.56, "step": 60040, "token_acc": 0.46779661016949153, "train_speed(iter/s)": 1.43796 }, { "epoch": 2.5725118889507734, "grad_norm": 7.045839309692383, "learning_rate": 4.772275985045772e-05, "loss": 2.8986183166503907, "memory(GiB)": 77.56, "step": 60045, "token_acc": 0.42662116040955633, "train_speed(iter/s)": 1.437985 }, { "epoch": 2.5727261042800222, "grad_norm": 6.260333061218262, "learning_rate": 4.7716037081564004e-05, "loss": 2.5616737365722657, "memory(GiB)": 77.56, "step": 60050, "token_acc": 0.4429530201342282, "train_speed(iter/s)": 1.43799 }, { "epoch": 2.5729403196092715, "grad_norm": 4.881079196929932, "learning_rate": 4.7709314354046415e-05, "loss": 2.420989990234375, "memory(GiB)": 77.56, "step": 60055, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.438002 }, { "epoch": 2.5731545349385203, "grad_norm": 4.75499963760376, "learning_rate": 4.7702591668026745e-05, "loss": 2.6270772933959963, "memory(GiB)": 77.56, "step": 60060, "token_acc": 0.453125, "train_speed(iter/s)": 1.438018 }, { "epoch": 2.573368750267769, "grad_norm": 5.190093994140625, "learning_rate": 4.769586902362679e-05, "loss": 2.395723342895508, "memory(GiB)": 77.56, "step": 60065, "token_acc": 0.47257383966244726, "train_speed(iter/s)": 1.438056 }, { "epoch": 2.5735829655970184, "grad_norm": 4.898736953735352, "learning_rate": 4.768914642096833e-05, "loss": 2.443906784057617, "memory(GiB)": 77.56, "step": 60070, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.438079 }, { "epoch": 2.573797180926267, "grad_norm": 7.562080383300781, "learning_rate": 4.768242386017315e-05, "loss": 2.615232467651367, "memory(GiB)": 77.56, "step": 60075, "token_acc": 0.5038759689922481, "train_speed(iter/s)": 1.4381 }, { "epoch": 2.574011396255516, "grad_norm": 4.500131607055664, "learning_rate": 4.767570134136304e-05, "loss": 2.378614807128906, "memory(GiB)": 77.56, "step": 60080, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.438115 }, { "epoch": 2.5742256115847653, "grad_norm": 5.240380764007568, "learning_rate": 4.766897886465977e-05, "loss": 2.3287225723266602, "memory(GiB)": 77.56, "step": 60085, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.438101 }, { "epoch": 2.574439826914014, "grad_norm": 4.375026226043701, "learning_rate": 4.766225643018514e-05, "loss": 2.0763872146606444, "memory(GiB)": 77.56, "step": 60090, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438114 }, { "epoch": 2.574654042243263, "grad_norm": 5.3339667320251465, "learning_rate": 4.765553403806094e-05, "loss": 2.3074771881103517, "memory(GiB)": 77.56, "step": 60095, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.438112 }, { "epoch": 2.574868257572512, "grad_norm": 6.190136909484863, "learning_rate": 4.764881168840892e-05, "loss": 2.3066503524780275, "memory(GiB)": 77.56, "step": 60100, "token_acc": 0.4942084942084942, "train_speed(iter/s)": 1.438097 }, { "epoch": 2.575082472901761, "grad_norm": 4.96804141998291, "learning_rate": 4.7642089381350895e-05, "loss": 2.587931442260742, "memory(GiB)": 77.56, "step": 60105, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.438109 }, { "epoch": 2.5752966882310098, "grad_norm": 4.135900020599365, "learning_rate": 4.763536711700862e-05, "loss": 2.215572738647461, "memory(GiB)": 77.56, "step": 60110, "token_acc": 0.5047021943573667, "train_speed(iter/s)": 1.438078 }, { "epoch": 2.575510903560259, "grad_norm": 8.227154731750488, "learning_rate": 4.76286448955039e-05, "loss": 2.502964210510254, "memory(GiB)": 77.56, "step": 60115, "token_acc": 0.4796747967479675, "train_speed(iter/s)": 1.438093 }, { "epoch": 2.575725118889508, "grad_norm": 5.9069976806640625, "learning_rate": 4.76219227169585e-05, "loss": 2.428532028198242, "memory(GiB)": 77.56, "step": 60120, "token_acc": 0.5234375, "train_speed(iter/s)": 1.438082 }, { "epoch": 2.5759393342187566, "grad_norm": 17.34946060180664, "learning_rate": 4.7615200581494194e-05, "loss": 2.37042121887207, "memory(GiB)": 77.56, "step": 60125, "token_acc": 0.464, "train_speed(iter/s)": 1.438088 }, { "epoch": 2.576153549548006, "grad_norm": 6.491145610809326, "learning_rate": 4.7608478489232756e-05, "loss": 2.754975128173828, "memory(GiB)": 77.56, "step": 60130, "token_acc": 0.45323741007194246, "train_speed(iter/s)": 1.43811 }, { "epoch": 2.5763677648772547, "grad_norm": 5.394059181213379, "learning_rate": 4.760175644029599e-05, "loss": 2.623165512084961, "memory(GiB)": 77.56, "step": 60135, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438133 }, { "epoch": 2.5765819802065035, "grad_norm": 5.3159379959106445, "learning_rate": 4.759503443480566e-05, "loss": 2.5075891494750975, "memory(GiB)": 77.56, "step": 60140, "token_acc": 0.449685534591195, "train_speed(iter/s)": 1.438139 }, { "epoch": 2.576796195535753, "grad_norm": 5.586711406707764, "learning_rate": 4.758831247288353e-05, "loss": 2.483822250366211, "memory(GiB)": 77.56, "step": 60145, "token_acc": 0.4758364312267658, "train_speed(iter/s)": 1.438153 }, { "epoch": 2.5770104108650016, "grad_norm": 5.024467468261719, "learning_rate": 4.758159055465138e-05, "loss": 2.680333709716797, "memory(GiB)": 77.56, "step": 60150, "token_acc": 0.4501347708894879, "train_speed(iter/s)": 1.438174 }, { "epoch": 2.5772246261942504, "grad_norm": 6.300375461578369, "learning_rate": 4.757486868023099e-05, "loss": 2.555211639404297, "memory(GiB)": 77.56, "step": 60155, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.438191 }, { "epoch": 2.5774388415234997, "grad_norm": 4.312414169311523, "learning_rate": 4.756814684974413e-05, "loss": 2.3841041564941405, "memory(GiB)": 77.56, "step": 60160, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.438207 }, { "epoch": 2.5776530568527485, "grad_norm": 9.535069465637207, "learning_rate": 4.756142506331258e-05, "loss": 2.7303104400634766, "memory(GiB)": 77.56, "step": 60165, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.438196 }, { "epoch": 2.5778672721819973, "grad_norm": 5.826118469238281, "learning_rate": 4.755470332105808e-05, "loss": 2.127614974975586, "memory(GiB)": 77.56, "step": 60170, "token_acc": 0.550561797752809, "train_speed(iter/s)": 1.438206 }, { "epoch": 2.5780814875112466, "grad_norm": 5.456142425537109, "learning_rate": 4.754798162310244e-05, "loss": 2.158995246887207, "memory(GiB)": 77.56, "step": 60175, "token_acc": 0.5503355704697986, "train_speed(iter/s)": 1.438178 }, { "epoch": 2.5782957028404954, "grad_norm": 4.853738784790039, "learning_rate": 4.7541259969567416e-05, "loss": 2.2282285690307617, "memory(GiB)": 77.56, "step": 60180, "token_acc": 0.47540983606557374, "train_speed(iter/s)": 1.43816 }, { "epoch": 2.578509918169744, "grad_norm": 5.5688395500183105, "learning_rate": 4.753453836057476e-05, "loss": 2.418585014343262, "memory(GiB)": 77.56, "step": 60185, "token_acc": 0.48535564853556484, "train_speed(iter/s)": 1.43815 }, { "epoch": 2.5787241334989934, "grad_norm": 5.165946960449219, "learning_rate": 4.752781679624626e-05, "loss": 2.448345947265625, "memory(GiB)": 77.56, "step": 60190, "token_acc": 0.4966442953020134, "train_speed(iter/s)": 1.438182 }, { "epoch": 2.5789383488282422, "grad_norm": 4.71059513092041, "learning_rate": 4.7521095276703676e-05, "loss": 2.491571807861328, "memory(GiB)": 77.56, "step": 60195, "token_acc": 0.5078864353312302, "train_speed(iter/s)": 1.438156 }, { "epoch": 2.579152564157491, "grad_norm": 5.885578155517578, "learning_rate": 4.7514373802068786e-05, "loss": 2.4665386199951174, "memory(GiB)": 77.56, "step": 60200, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.438173 }, { "epoch": 2.5793667794867403, "grad_norm": 4.911962985992432, "learning_rate": 4.750765237246332e-05, "loss": 2.236262893676758, "memory(GiB)": 77.56, "step": 60205, "token_acc": 0.5146579804560261, "train_speed(iter/s)": 1.438166 }, { "epoch": 2.579580994815989, "grad_norm": 7.053640365600586, "learning_rate": 4.750093098800909e-05, "loss": 2.3783876419067385, "memory(GiB)": 77.56, "step": 60210, "token_acc": 0.5, "train_speed(iter/s)": 1.43821 }, { "epoch": 2.579795210145238, "grad_norm": 5.45408296585083, "learning_rate": 4.749420964882783e-05, "loss": 2.621144104003906, "memory(GiB)": 77.56, "step": 60215, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.438213 }, { "epoch": 2.580009425474487, "grad_norm": 5.28698205947876, "learning_rate": 4.748748835504133e-05, "loss": 2.3261960983276366, "memory(GiB)": 77.56, "step": 60220, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.438247 }, { "epoch": 2.580223640803736, "grad_norm": 6.509244918823242, "learning_rate": 4.7480767106771326e-05, "loss": 2.5394302368164063, "memory(GiB)": 77.56, "step": 60225, "token_acc": 0.5092936802973977, "train_speed(iter/s)": 1.438232 }, { "epoch": 2.580437856132985, "grad_norm": 5.842759132385254, "learning_rate": 4.7474045904139586e-05, "loss": 2.6135311126708984, "memory(GiB)": 77.56, "step": 60230, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.438224 }, { "epoch": 2.580652071462234, "grad_norm": 5.242397308349609, "learning_rate": 4.746732474726788e-05, "loss": 2.456760025024414, "memory(GiB)": 77.56, "step": 60235, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.438212 }, { "epoch": 2.580866286791483, "grad_norm": 4.919493198394775, "learning_rate": 4.7460603636277956e-05, "loss": 2.716153144836426, "memory(GiB)": 77.56, "step": 60240, "token_acc": 0.4919093851132686, "train_speed(iter/s)": 1.43822 }, { "epoch": 2.5810805021207317, "grad_norm": 4.709676265716553, "learning_rate": 4.7453882571291584e-05, "loss": 2.7681114196777346, "memory(GiB)": 77.56, "step": 60245, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.438224 }, { "epoch": 2.581294717449981, "grad_norm": 5.668246746063232, "learning_rate": 4.7447161552430526e-05, "loss": 2.4259979248046877, "memory(GiB)": 77.56, "step": 60250, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.438246 }, { "epoch": 2.5815089327792298, "grad_norm": 4.389259338378906, "learning_rate": 4.744044057981651e-05, "loss": 2.4198646545410156, "memory(GiB)": 77.56, "step": 60255, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.438237 }, { "epoch": 2.5817231481084786, "grad_norm": 5.31933069229126, "learning_rate": 4.743371965357133e-05, "loss": 2.5894342422485352, "memory(GiB)": 77.56, "step": 60260, "token_acc": 0.45980707395498394, "train_speed(iter/s)": 1.43824 }, { "epoch": 2.581937363437728, "grad_norm": 5.5895490646362305, "learning_rate": 4.742699877381673e-05, "loss": 2.382550811767578, "memory(GiB)": 77.56, "step": 60265, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.438252 }, { "epoch": 2.5821515787669767, "grad_norm": 6.026521682739258, "learning_rate": 4.7420277940674446e-05, "loss": 2.5565954208374024, "memory(GiB)": 77.56, "step": 60270, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.43826 }, { "epoch": 2.5823657940962255, "grad_norm": 4.510000228881836, "learning_rate": 4.741355715426623e-05, "loss": 2.3520702362060546, "memory(GiB)": 77.56, "step": 60275, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.438262 }, { "epoch": 2.5825800094254747, "grad_norm": 7.214466094970703, "learning_rate": 4.7406836414713884e-05, "loss": 2.511739730834961, "memory(GiB)": 77.56, "step": 60280, "token_acc": 0.4759036144578313, "train_speed(iter/s)": 1.438287 }, { "epoch": 2.5827942247547235, "grad_norm": 6.1277971267700195, "learning_rate": 4.7400115722139126e-05, "loss": 2.3304912567138674, "memory(GiB)": 77.56, "step": 60285, "token_acc": 0.49523809523809526, "train_speed(iter/s)": 1.438274 }, { "epoch": 2.5830084400839723, "grad_norm": 6.150754928588867, "learning_rate": 4.73933950766637e-05, "loss": 2.687809181213379, "memory(GiB)": 77.56, "step": 60290, "token_acc": 0.4859154929577465, "train_speed(iter/s)": 1.438296 }, { "epoch": 2.5832226554132216, "grad_norm": 14.555551528930664, "learning_rate": 4.738667447840938e-05, "loss": 2.4184549331665037, "memory(GiB)": 77.56, "step": 60295, "token_acc": 0.49612403100775193, "train_speed(iter/s)": 1.438306 }, { "epoch": 2.5834368707424704, "grad_norm": 9.330744743347168, "learning_rate": 4.737995392749789e-05, "loss": 2.428777313232422, "memory(GiB)": 77.56, "step": 60300, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.43833 }, { "epoch": 2.5836510860717192, "grad_norm": 9.24018383026123, "learning_rate": 4.7373233424051e-05, "loss": 2.525027847290039, "memory(GiB)": 77.56, "step": 60305, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.438332 }, { "epoch": 2.5838653014009685, "grad_norm": 6.798473358154297, "learning_rate": 4.7366512968190454e-05, "loss": 2.5032781600952148, "memory(GiB)": 77.56, "step": 60310, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 1.438304 }, { "epoch": 2.5840795167302173, "grad_norm": 6.8903727531433105, "learning_rate": 4.735979256003798e-05, "loss": 2.4880092620849608, "memory(GiB)": 77.56, "step": 60315, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.438306 }, { "epoch": 2.584293732059466, "grad_norm": 5.280789852142334, "learning_rate": 4.735307219971536e-05, "loss": 2.2187263488769533, "memory(GiB)": 77.56, "step": 60320, "token_acc": 0.5, "train_speed(iter/s)": 1.438301 }, { "epoch": 2.5845079473887154, "grad_norm": 4.505428791046143, "learning_rate": 4.734635188734432e-05, "loss": 2.210480880737305, "memory(GiB)": 77.56, "step": 60325, "token_acc": 0.5508196721311476, "train_speed(iter/s)": 1.43831 }, { "epoch": 2.584722162717964, "grad_norm": 8.387556076049805, "learning_rate": 4.7339631623046585e-05, "loss": 2.3935256958007813, "memory(GiB)": 77.56, "step": 60330, "token_acc": 0.5, "train_speed(iter/s)": 1.438333 }, { "epoch": 2.584936378047213, "grad_norm": 5.168647289276123, "learning_rate": 4.7332911406943934e-05, "loss": 2.561665725708008, "memory(GiB)": 77.56, "step": 60335, "token_acc": 0.50625, "train_speed(iter/s)": 1.438365 }, { "epoch": 2.5851505933764622, "grad_norm": 6.293690204620361, "learning_rate": 4.732619123915809e-05, "loss": 2.3551496505737304, "memory(GiB)": 77.56, "step": 60340, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.438358 }, { "epoch": 2.585364808705711, "grad_norm": 5.55072546005249, "learning_rate": 4.7319471119810805e-05, "loss": 2.4079263687133787, "memory(GiB)": 77.56, "step": 60345, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.438393 }, { "epoch": 2.58557902403496, "grad_norm": 5.686210632324219, "learning_rate": 4.731275104902379e-05, "loss": 2.4750595092773438, "memory(GiB)": 77.56, "step": 60350, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.438432 }, { "epoch": 2.585793239364209, "grad_norm": 5.239920139312744, "learning_rate": 4.730603102691884e-05, "loss": 2.549484634399414, "memory(GiB)": 77.56, "step": 60355, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.438454 }, { "epoch": 2.586007454693458, "grad_norm": 5.484700679779053, "learning_rate": 4.729931105361765e-05, "loss": 2.3309253692626952, "memory(GiB)": 77.56, "step": 60360, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.438472 }, { "epoch": 2.5862216700227068, "grad_norm": 6.028470993041992, "learning_rate": 4.7292591129241985e-05, "loss": 2.398227500915527, "memory(GiB)": 77.56, "step": 60365, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.438474 }, { "epoch": 2.586435885351956, "grad_norm": 15.866666793823242, "learning_rate": 4.728587125391357e-05, "loss": 2.642198944091797, "memory(GiB)": 77.56, "step": 60370, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.438502 }, { "epoch": 2.586650100681205, "grad_norm": 5.291264533996582, "learning_rate": 4.727915142775414e-05, "loss": 2.6901432037353517, "memory(GiB)": 77.56, "step": 60375, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.438483 }, { "epoch": 2.5868643160104536, "grad_norm": 5.216635704040527, "learning_rate": 4.7272431650885436e-05, "loss": 2.340800094604492, "memory(GiB)": 77.56, "step": 60380, "token_acc": 0.4697406340057637, "train_speed(iter/s)": 1.438503 }, { "epoch": 2.587078531339703, "grad_norm": 6.237765312194824, "learning_rate": 4.726571192342919e-05, "loss": 2.367164421081543, "memory(GiB)": 77.56, "step": 60385, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.438507 }, { "epoch": 2.5872927466689517, "grad_norm": 5.036973476409912, "learning_rate": 4.7258992245507134e-05, "loss": 2.3952657699584963, "memory(GiB)": 77.56, "step": 60390, "token_acc": 0.5276872964169381, "train_speed(iter/s)": 1.438541 }, { "epoch": 2.5875069619982005, "grad_norm": 4.255127429962158, "learning_rate": 4.725227261724101e-05, "loss": 2.1614418029785156, "memory(GiB)": 77.56, "step": 60395, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.438546 }, { "epoch": 2.5877211773274498, "grad_norm": 7.340925693511963, "learning_rate": 4.7245553038752535e-05, "loss": 2.442327880859375, "memory(GiB)": 77.56, "step": 60400, "token_acc": 0.5, "train_speed(iter/s)": 1.438533 }, { "epoch": 2.5879353926566986, "grad_norm": 5.364330768585205, "learning_rate": 4.7238833510163475e-05, "loss": 2.4522726058959963, "memory(GiB)": 77.56, "step": 60405, "token_acc": 0.47530864197530864, "train_speed(iter/s)": 1.438518 }, { "epoch": 2.5881496079859474, "grad_norm": 6.723532199859619, "learning_rate": 4.723211403159552e-05, "loss": 2.831590461730957, "memory(GiB)": 77.56, "step": 60410, "token_acc": 0.4307692307692308, "train_speed(iter/s)": 1.43851 }, { "epoch": 2.5883638233151967, "grad_norm": 4.7872843742370605, "learning_rate": 4.722539460317041e-05, "loss": 2.6963634490966797, "memory(GiB)": 77.56, "step": 60415, "token_acc": 0.4967532467532468, "train_speed(iter/s)": 1.43849 }, { "epoch": 2.5885780386444455, "grad_norm": 4.921646595001221, "learning_rate": 4.721867522500989e-05, "loss": 2.4428171157836913, "memory(GiB)": 77.56, "step": 60420, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.438497 }, { "epoch": 2.5887922539736943, "grad_norm": 8.170104026794434, "learning_rate": 4.721195589723565e-05, "loss": 2.6428401947021483, "memory(GiB)": 77.56, "step": 60425, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.438493 }, { "epoch": 2.5890064693029435, "grad_norm": 7.618386268615723, "learning_rate": 4.7205236619969474e-05, "loss": 2.7102468490600584, "memory(GiB)": 77.56, "step": 60430, "token_acc": 0.4307116104868914, "train_speed(iter/s)": 1.438479 }, { "epoch": 2.5892206846321923, "grad_norm": 4.510335922241211, "learning_rate": 4.719851739333305e-05, "loss": 2.538511848449707, "memory(GiB)": 77.56, "step": 60435, "token_acc": 0.4980392156862745, "train_speed(iter/s)": 1.438484 }, { "epoch": 2.589434899961441, "grad_norm": 4.9730377197265625, "learning_rate": 4.7191798217448115e-05, "loss": 2.6503627777099608, "memory(GiB)": 77.56, "step": 60440, "token_acc": 0.44807121661721067, "train_speed(iter/s)": 1.438453 }, { "epoch": 2.5896491152906904, "grad_norm": 5.63424015045166, "learning_rate": 4.718507909243638e-05, "loss": 2.681383514404297, "memory(GiB)": 77.56, "step": 60445, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.438464 }, { "epoch": 2.5898633306199392, "grad_norm": 5.386106014251709, "learning_rate": 4.7178360018419585e-05, "loss": 2.538774871826172, "memory(GiB)": 77.56, "step": 60450, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.43846 }, { "epoch": 2.590077545949188, "grad_norm": 8.20904541015625, "learning_rate": 4.717164099551945e-05, "loss": 2.440165710449219, "memory(GiB)": 77.56, "step": 60455, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.43847 }, { "epoch": 2.5902917612784373, "grad_norm": 6.0574727058410645, "learning_rate": 4.7164922023857686e-05, "loss": 2.4994409561157225, "memory(GiB)": 77.56, "step": 60460, "token_acc": 0.4938650306748466, "train_speed(iter/s)": 1.438502 }, { "epoch": 2.590505976607686, "grad_norm": 4.768818378448486, "learning_rate": 4.7158203103556026e-05, "loss": 2.383126640319824, "memory(GiB)": 77.56, "step": 60465, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.438526 }, { "epoch": 2.590720191936935, "grad_norm": 4.0426506996154785, "learning_rate": 4.715148423473618e-05, "loss": 2.438172149658203, "memory(GiB)": 77.56, "step": 60470, "token_acc": 0.5229357798165137, "train_speed(iter/s)": 1.438541 }, { "epoch": 2.590934407266184, "grad_norm": 5.394260406494141, "learning_rate": 4.714476541751986e-05, "loss": 2.3842021942138674, "memory(GiB)": 77.56, "step": 60475, "token_acc": 0.4312977099236641, "train_speed(iter/s)": 1.438557 }, { "epoch": 2.591148622595433, "grad_norm": 4.433017253875732, "learning_rate": 4.71380466520288e-05, "loss": 2.3090087890625, "memory(GiB)": 77.56, "step": 60480, "token_acc": 0.5215686274509804, "train_speed(iter/s)": 1.438586 }, { "epoch": 2.591362837924682, "grad_norm": 6.5087409019470215, "learning_rate": 4.7131327938384706e-05, "loss": 2.400865364074707, "memory(GiB)": 77.56, "step": 60485, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.438578 }, { "epoch": 2.591577053253931, "grad_norm": 6.493954181671143, "learning_rate": 4.71246092767093e-05, "loss": 2.2848079681396483, "memory(GiB)": 77.56, "step": 60490, "token_acc": 0.4959677419354839, "train_speed(iter/s)": 1.438588 }, { "epoch": 2.59179126858318, "grad_norm": 4.80222225189209, "learning_rate": 4.7117890667124306e-05, "loss": 2.416494941711426, "memory(GiB)": 77.56, "step": 60495, "token_acc": 0.46417445482866043, "train_speed(iter/s)": 1.438617 }, { "epoch": 2.5920054839124287, "grad_norm": 7.05763053894043, "learning_rate": 4.7111172109751394e-05, "loss": 2.438814163208008, "memory(GiB)": 77.56, "step": 60500, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.438613 }, { "epoch": 2.5920054839124287, "eval_loss": 2.1299960613250732, "eval_runtime": 14.3991, "eval_samples_per_second": 6.945, "eval_steps_per_second": 6.945, "eval_token_acc": 0.4797297297297297, "step": 60500 }, { "epoch": 2.592219699241678, "grad_norm": 7.089644432067871, "learning_rate": 4.7104453604712326e-05, "loss": 2.1959226608276365, "memory(GiB)": 77.56, "step": 60505, "token_acc": 0.4916911045943304, "train_speed(iter/s)": 1.438085 }, { "epoch": 2.5924339145709268, "grad_norm": 4.846078395843506, "learning_rate": 4.70977351521288e-05, "loss": 2.626007080078125, "memory(GiB)": 77.56, "step": 60510, "token_acc": 0.5, "train_speed(iter/s)": 1.438094 }, { "epoch": 2.5926481299001756, "grad_norm": 5.694186687469482, "learning_rate": 4.709101675212253e-05, "loss": 2.5185497283935545, "memory(GiB)": 77.56, "step": 60515, "token_acc": 0.41114982578397213, "train_speed(iter/s)": 1.43811 }, { "epoch": 2.592862345229425, "grad_norm": 5.390349864959717, "learning_rate": 4.7084298404815206e-05, "loss": 3.103775405883789, "memory(GiB)": 77.56, "step": 60520, "token_acc": 0.40069686411149824, "train_speed(iter/s)": 1.438115 }, { "epoch": 2.5930765605586736, "grad_norm": 6.129212856292725, "learning_rate": 4.7077580110328566e-05, "loss": 2.274595260620117, "memory(GiB)": 77.56, "step": 60525, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.438131 }, { "epoch": 2.5932907758879225, "grad_norm": 6.412266254425049, "learning_rate": 4.7070861868784296e-05, "loss": 2.5591609954833983, "memory(GiB)": 77.56, "step": 60530, "token_acc": 0.4503311258278146, "train_speed(iter/s)": 1.438152 }, { "epoch": 2.5935049912171717, "grad_norm": 5.946508407592773, "learning_rate": 4.706414368030412e-05, "loss": 2.6155929565429688, "memory(GiB)": 77.56, "step": 60535, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.438135 }, { "epoch": 2.5937192065464205, "grad_norm": 5.697239398956299, "learning_rate": 4.705742554500973e-05, "loss": 2.376565933227539, "memory(GiB)": 77.56, "step": 60540, "token_acc": 0.48641304347826086, "train_speed(iter/s)": 1.438134 }, { "epoch": 2.5939334218756693, "grad_norm": 5.533055782318115, "learning_rate": 4.705070746302283e-05, "loss": 2.2665977478027344, "memory(GiB)": 77.56, "step": 60545, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.438112 }, { "epoch": 2.5941476372049186, "grad_norm": 7.01714563369751, "learning_rate": 4.704398943446514e-05, "loss": 2.5848180770874025, "memory(GiB)": 77.56, "step": 60550, "token_acc": 0.4340836012861736, "train_speed(iter/s)": 1.438101 }, { "epoch": 2.5943618525341674, "grad_norm": 6.462080478668213, "learning_rate": 4.703727145945836e-05, "loss": 2.3493200302124024, "memory(GiB)": 77.56, "step": 60555, "token_acc": 0.519650655021834, "train_speed(iter/s)": 1.438062 }, { "epoch": 2.594576067863416, "grad_norm": 4.913819313049316, "learning_rate": 4.7030553538124166e-05, "loss": 2.3986820220947265, "memory(GiB)": 77.56, "step": 60560, "token_acc": 0.4896142433234421, "train_speed(iter/s)": 1.438045 }, { "epoch": 2.5947902831926655, "grad_norm": 5.035233974456787, "learning_rate": 4.70238356705843e-05, "loss": 2.5434823989868165, "memory(GiB)": 77.56, "step": 60565, "token_acc": 0.45555555555555555, "train_speed(iter/s)": 1.438009 }, { "epoch": 2.5950044985219143, "grad_norm": 6.447807788848877, "learning_rate": 4.701711785696042e-05, "loss": 2.892926025390625, "memory(GiB)": 77.56, "step": 60570, "token_acc": 0.42201834862385323, "train_speed(iter/s)": 1.438027 }, { "epoch": 2.595218713851163, "grad_norm": 3.8857436180114746, "learning_rate": 4.7010400097374264e-05, "loss": 2.411061668395996, "memory(GiB)": 77.56, "step": 60575, "token_acc": 0.5045045045045045, "train_speed(iter/s)": 1.438026 }, { "epoch": 2.5954329291804124, "grad_norm": 5.358659744262695, "learning_rate": 4.7003682391947504e-05, "loss": 2.3542449951171873, "memory(GiB)": 77.56, "step": 60580, "token_acc": 0.4847457627118644, "train_speed(iter/s)": 1.43804 }, { "epoch": 2.595647144509661, "grad_norm": 8.36040210723877, "learning_rate": 4.699696474080186e-05, "loss": 2.6073287963867187, "memory(GiB)": 77.56, "step": 60585, "token_acc": 0.4503311258278146, "train_speed(iter/s)": 1.438053 }, { "epoch": 2.59586135983891, "grad_norm": 4.727888107299805, "learning_rate": 4.699024714405901e-05, "loss": 2.234549331665039, "memory(GiB)": 77.56, "step": 60590, "token_acc": 0.5177304964539007, "train_speed(iter/s)": 1.43802 }, { "epoch": 2.5960755751681592, "grad_norm": 4.970827102661133, "learning_rate": 4.698352960184067e-05, "loss": 2.5970645904541017, "memory(GiB)": 77.56, "step": 60595, "token_acc": 0.4429530201342282, "train_speed(iter/s)": 1.438033 }, { "epoch": 2.596289790497408, "grad_norm": 5.5648722648620605, "learning_rate": 4.697681211426851e-05, "loss": 2.184271240234375, "memory(GiB)": 77.56, "step": 60600, "token_acc": 0.518796992481203, "train_speed(iter/s)": 1.438039 }, { "epoch": 2.596504005826657, "grad_norm": 6.2273054122924805, "learning_rate": 4.697009468146423e-05, "loss": 2.3865848541259767, "memory(GiB)": 77.56, "step": 60605, "token_acc": 0.5036764705882353, "train_speed(iter/s)": 1.43805 }, { "epoch": 2.596718221155906, "grad_norm": 5.652369976043701, "learning_rate": 4.6963377303549546e-05, "loss": 2.4741628646850584, "memory(GiB)": 77.56, "step": 60610, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.438048 }, { "epoch": 2.596932436485155, "grad_norm": 4.808661937713623, "learning_rate": 4.6956659980646125e-05, "loss": 2.6206716537475585, "memory(GiB)": 77.56, "step": 60615, "token_acc": 0.4536741214057508, "train_speed(iter/s)": 1.438048 }, { "epoch": 2.5971466518144037, "grad_norm": 5.790530681610107, "learning_rate": 4.6949942712875645e-05, "loss": 2.71263427734375, "memory(GiB)": 77.56, "step": 60620, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.438052 }, { "epoch": 2.597360867143653, "grad_norm": 8.878464698791504, "learning_rate": 4.6943225500359834e-05, "loss": 2.418523597717285, "memory(GiB)": 77.56, "step": 60625, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.438033 }, { "epoch": 2.597575082472902, "grad_norm": 6.090363025665283, "learning_rate": 4.693650834322034e-05, "loss": 2.2794145584106444, "memory(GiB)": 77.56, "step": 60630, "token_acc": 0.4790874524714829, "train_speed(iter/s)": 1.438025 }, { "epoch": 2.5977892978021506, "grad_norm": 4.402954578399658, "learning_rate": 4.6929791241578894e-05, "loss": 2.333298110961914, "memory(GiB)": 77.56, "step": 60635, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.43802 }, { "epoch": 2.5980035131314, "grad_norm": 5.152929782867432, "learning_rate": 4.6923074195557146e-05, "loss": 2.3224788665771485, "memory(GiB)": 77.56, "step": 60640, "token_acc": 0.52, "train_speed(iter/s)": 1.438028 }, { "epoch": 2.5982177284606487, "grad_norm": 5.916562557220459, "learning_rate": 4.691635720527679e-05, "loss": 2.704825592041016, "memory(GiB)": 77.56, "step": 60645, "token_acc": 0.445993031358885, "train_speed(iter/s)": 1.438009 }, { "epoch": 2.5984319437898975, "grad_norm": 6.161414623260498, "learning_rate": 4.690964027085951e-05, "loss": 2.561298942565918, "memory(GiB)": 77.56, "step": 60650, "token_acc": 0.4707792207792208, "train_speed(iter/s)": 1.438013 }, { "epoch": 2.5986461591191468, "grad_norm": 8.037162780761719, "learning_rate": 4.6902923392427014e-05, "loss": 2.4153738021850586, "memory(GiB)": 77.56, "step": 60655, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.43804 }, { "epoch": 2.5988603744483956, "grad_norm": 6.481609344482422, "learning_rate": 4.689620657010097e-05, "loss": 2.7959033966064455, "memory(GiB)": 77.56, "step": 60660, "token_acc": 0.43977591036414565, "train_speed(iter/s)": 1.438033 }, { "epoch": 2.5990745897776444, "grad_norm": 6.540483474731445, "learning_rate": 4.688948980400304e-05, "loss": 2.541012191772461, "memory(GiB)": 77.56, "step": 60665, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.438017 }, { "epoch": 2.5992888051068936, "grad_norm": 5.331500053405762, "learning_rate": 4.688277309425494e-05, "loss": 2.5811416625976564, "memory(GiB)": 77.56, "step": 60670, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.438034 }, { "epoch": 2.5995030204361425, "grad_norm": 4.370889663696289, "learning_rate": 4.68760564409783e-05, "loss": 2.7431217193603517, "memory(GiB)": 77.56, "step": 60675, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.438058 }, { "epoch": 2.5997172357653913, "grad_norm": 5.083352565765381, "learning_rate": 4.686933984429485e-05, "loss": 2.5576873779296876, "memory(GiB)": 77.56, "step": 60680, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.438056 }, { "epoch": 2.5999314510946405, "grad_norm": 6.1396379470825195, "learning_rate": 4.686262330432625e-05, "loss": 2.4654163360595702, "memory(GiB)": 77.56, "step": 60685, "token_acc": 0.45185185185185184, "train_speed(iter/s)": 1.438041 }, { "epoch": 2.6001456664238893, "grad_norm": 5.844450950622559, "learning_rate": 4.685590682119415e-05, "loss": 2.307367134094238, "memory(GiB)": 77.56, "step": 60690, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.43806 }, { "epoch": 2.600359881753138, "grad_norm": 8.420538902282715, "learning_rate": 4.684919039502027e-05, "loss": 2.718204689025879, "memory(GiB)": 77.56, "step": 60695, "token_acc": 0.4573170731707317, "train_speed(iter/s)": 1.438068 }, { "epoch": 2.6005740970823874, "grad_norm": 4.86892557144165, "learning_rate": 4.6842474025926255e-05, "loss": 2.6100723266601564, "memory(GiB)": 77.56, "step": 60700, "token_acc": 0.44223107569721115, "train_speed(iter/s)": 1.438069 }, { "epoch": 2.600788312411636, "grad_norm": 4.864750385284424, "learning_rate": 4.683575771403377e-05, "loss": 2.171669578552246, "memory(GiB)": 77.56, "step": 60705, "token_acc": 0.52, "train_speed(iter/s)": 1.438051 }, { "epoch": 2.601002527740885, "grad_norm": 7.040539264678955, "learning_rate": 4.6829041459464525e-05, "loss": 2.4126054763793947, "memory(GiB)": 77.56, "step": 60710, "token_acc": 0.4632768361581921, "train_speed(iter/s)": 1.438036 }, { "epoch": 2.6012167430701343, "grad_norm": 7.751967906951904, "learning_rate": 4.6822325262340147e-05, "loss": 2.6023113250732424, "memory(GiB)": 77.56, "step": 60715, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.438028 }, { "epoch": 2.601430958399383, "grad_norm": 6.601855754852295, "learning_rate": 4.6815609122782336e-05, "loss": 2.6646862030029297, "memory(GiB)": 77.56, "step": 60720, "token_acc": 0.47653429602888087, "train_speed(iter/s)": 1.438003 }, { "epoch": 2.601645173728632, "grad_norm": 9.612900733947754, "learning_rate": 4.680889304091275e-05, "loss": 2.2821449279785155, "memory(GiB)": 77.56, "step": 60725, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.438044 }, { "epoch": 2.601859389057881, "grad_norm": 4.928402900695801, "learning_rate": 4.680217701685306e-05, "loss": 2.264250373840332, "memory(GiB)": 77.56, "step": 60730, "token_acc": 0.5079872204472844, "train_speed(iter/s)": 1.438028 }, { "epoch": 2.60207360438713, "grad_norm": 5.766315460205078, "learning_rate": 4.679546105072494e-05, "loss": 2.488662338256836, "memory(GiB)": 77.56, "step": 60735, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.438006 }, { "epoch": 2.602287819716379, "grad_norm": 5.843890190124512, "learning_rate": 4.678874514265005e-05, "loss": 2.6733932495117188, "memory(GiB)": 77.56, "step": 60740, "token_acc": 0.46647230320699706, "train_speed(iter/s)": 1.437998 }, { "epoch": 2.602502035045628, "grad_norm": 4.9181342124938965, "learning_rate": 4.6782029292750056e-05, "loss": 2.436018180847168, "memory(GiB)": 77.56, "step": 60745, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.43799 }, { "epoch": 2.602716250374877, "grad_norm": 7.540224075317383, "learning_rate": 4.677531350114661e-05, "loss": 2.3421604156494142, "memory(GiB)": 77.56, "step": 60750, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.438011 }, { "epoch": 2.6029304657041257, "grad_norm": 5.905830383300781, "learning_rate": 4.6768597767961395e-05, "loss": 2.4692893981933595, "memory(GiB)": 77.56, "step": 60755, "token_acc": 0.48, "train_speed(iter/s)": 1.438022 }, { "epoch": 2.603144681033375, "grad_norm": 5.923783302307129, "learning_rate": 4.676188209331606e-05, "loss": 2.3923463821411133, "memory(GiB)": 77.56, "step": 60760, "token_acc": 0.5030674846625767, "train_speed(iter/s)": 1.438009 }, { "epoch": 2.6033588963626237, "grad_norm": 5.391059398651123, "learning_rate": 4.675516647733226e-05, "loss": 2.248657989501953, "memory(GiB)": 77.56, "step": 60765, "token_acc": 0.501628664495114, "train_speed(iter/s)": 1.437986 }, { "epoch": 2.6035731116918726, "grad_norm": 5.573025226593018, "learning_rate": 4.674845092013168e-05, "loss": 2.987510871887207, "memory(GiB)": 77.56, "step": 60770, "token_acc": 0.3877551020408163, "train_speed(iter/s)": 1.437982 }, { "epoch": 2.603787327021122, "grad_norm": 7.309691905975342, "learning_rate": 4.674173542183594e-05, "loss": 2.3911584854125976, "memory(GiB)": 77.56, "step": 60775, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.437977 }, { "epoch": 2.6040015423503706, "grad_norm": 4.779590606689453, "learning_rate": 4.673501998256674e-05, "loss": 2.313121795654297, "memory(GiB)": 77.56, "step": 60780, "token_acc": 0.501577287066246, "train_speed(iter/s)": 1.43798 }, { "epoch": 2.6042157576796194, "grad_norm": 5.138645648956299, "learning_rate": 4.672830460244571e-05, "loss": 2.380880355834961, "memory(GiB)": 77.56, "step": 60785, "token_acc": 0.46963562753036436, "train_speed(iter/s)": 1.437979 }, { "epoch": 2.6044299730088687, "grad_norm": 5.897324562072754, "learning_rate": 4.6721589281594504e-05, "loss": 2.8266084671020506, "memory(GiB)": 77.56, "step": 60790, "token_acc": 0.4201954397394137, "train_speed(iter/s)": 1.437967 }, { "epoch": 2.6046441883381175, "grad_norm": 5.31672477722168, "learning_rate": 4.671487402013477e-05, "loss": 2.5437637329101563, "memory(GiB)": 77.56, "step": 60795, "token_acc": 0.49201277955271566, "train_speed(iter/s)": 1.437974 }, { "epoch": 2.6048584036673663, "grad_norm": 4.853329658508301, "learning_rate": 4.6708158818188195e-05, "loss": 2.4282676696777346, "memory(GiB)": 77.56, "step": 60800, "token_acc": 0.431438127090301, "train_speed(iter/s)": 1.437976 }, { "epoch": 2.6050726189966156, "grad_norm": 6.074598789215088, "learning_rate": 4.670144367587642e-05, "loss": 2.583574104309082, "memory(GiB)": 77.56, "step": 60805, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437984 }, { "epoch": 2.6052868343258644, "grad_norm": 7.673786640167236, "learning_rate": 4.669472859332106e-05, "loss": 2.4890289306640625, "memory(GiB)": 77.56, "step": 60810, "token_acc": 0.45374449339207046, "train_speed(iter/s)": 1.438009 }, { "epoch": 2.605501049655113, "grad_norm": 5.606259822845459, "learning_rate": 4.668801357064382e-05, "loss": 2.273537826538086, "memory(GiB)": 77.56, "step": 60815, "token_acc": 0.5149501661129569, "train_speed(iter/s)": 1.438007 }, { "epoch": 2.6057152649843625, "grad_norm": 7.482728481292725, "learning_rate": 4.66812986079663e-05, "loss": 2.5690435409545898, "memory(GiB)": 77.56, "step": 60820, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.437999 }, { "epoch": 2.6059294803136113, "grad_norm": 5.658421039581299, "learning_rate": 4.667458370541018e-05, "loss": 2.479115104675293, "memory(GiB)": 77.56, "step": 60825, "token_acc": 0.43243243243243246, "train_speed(iter/s)": 1.438008 }, { "epoch": 2.60614369564286, "grad_norm": 6.401666164398193, "learning_rate": 4.6667868863097096e-05, "loss": 2.646041488647461, "memory(GiB)": 77.56, "step": 60830, "token_acc": 0.456973293768546, "train_speed(iter/s)": 1.438006 }, { "epoch": 2.6063579109721093, "grad_norm": 6.190518379211426, "learning_rate": 4.666115408114869e-05, "loss": 2.1837493896484377, "memory(GiB)": 77.56, "step": 60835, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.437976 }, { "epoch": 2.606572126301358, "grad_norm": 5.382422924041748, "learning_rate": 4.665443935968662e-05, "loss": 2.594824028015137, "memory(GiB)": 77.56, "step": 60840, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.437964 }, { "epoch": 2.606786341630607, "grad_norm": 5.528724193572998, "learning_rate": 4.664772469883251e-05, "loss": 2.204884147644043, "memory(GiB)": 77.56, "step": 60845, "token_acc": 0.5283018867924528, "train_speed(iter/s)": 1.437989 }, { "epoch": 2.607000556959856, "grad_norm": 5.810295581817627, "learning_rate": 4.6641010098708006e-05, "loss": 2.2973960876464843, "memory(GiB)": 77.56, "step": 60850, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 1.437984 }, { "epoch": 2.607214772289105, "grad_norm": 6.444004535675049, "learning_rate": 4.6634295559434756e-05, "loss": 2.540684127807617, "memory(GiB)": 77.56, "step": 60855, "token_acc": 0.44405594405594406, "train_speed(iter/s)": 1.438003 }, { "epoch": 2.607428987618354, "grad_norm": 5.819235324859619, "learning_rate": 4.66275810811344e-05, "loss": 2.5616165161132813, "memory(GiB)": 77.56, "step": 60860, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 1.438011 }, { "epoch": 2.607643202947603, "grad_norm": 8.05027961730957, "learning_rate": 4.662086666392859e-05, "loss": 2.5462581634521486, "memory(GiB)": 77.56, "step": 60865, "token_acc": 0.4843205574912892, "train_speed(iter/s)": 1.437996 }, { "epoch": 2.607857418276852, "grad_norm": 4.803243637084961, "learning_rate": 4.661415230793892e-05, "loss": 2.3517572402954103, "memory(GiB)": 77.56, "step": 60870, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.438023 }, { "epoch": 2.6080716336061007, "grad_norm": 5.180236339569092, "learning_rate": 4.660743801328709e-05, "loss": 2.2150434494018554, "memory(GiB)": 77.56, "step": 60875, "token_acc": 0.5358649789029536, "train_speed(iter/s)": 1.438026 }, { "epoch": 2.60828584893535, "grad_norm": 5.502531051635742, "learning_rate": 4.6600723780094677e-05, "loss": 2.5581836700439453, "memory(GiB)": 77.56, "step": 60880, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.438015 }, { "epoch": 2.608500064264599, "grad_norm": 4.951906204223633, "learning_rate": 4.659400960848336e-05, "loss": 2.3677539825439453, "memory(GiB)": 77.56, "step": 60885, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.438011 }, { "epoch": 2.6087142795938476, "grad_norm": 6.366088390350342, "learning_rate": 4.658729549857476e-05, "loss": 2.3809818267822265, "memory(GiB)": 77.56, "step": 60890, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.438013 }, { "epoch": 2.608928494923097, "grad_norm": 6.7157979011535645, "learning_rate": 4.658058145049048e-05, "loss": 2.658820152282715, "memory(GiB)": 77.56, "step": 60895, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.438011 }, { "epoch": 2.6091427102523457, "grad_norm": 4.269170761108398, "learning_rate": 4.65738674643522e-05, "loss": 2.461532211303711, "memory(GiB)": 77.56, "step": 60900, "token_acc": 0.48881789137380194, "train_speed(iter/s)": 1.437983 }, { "epoch": 2.6093569255815945, "grad_norm": 6.531954288482666, "learning_rate": 4.656715354028151e-05, "loss": 2.31856689453125, "memory(GiB)": 77.56, "step": 60905, "token_acc": 0.5338345864661654, "train_speed(iter/s)": 1.438007 }, { "epoch": 2.6095711409108437, "grad_norm": 7.257207870483398, "learning_rate": 4.656043967840005e-05, "loss": 2.3338733673095704, "memory(GiB)": 77.56, "step": 60910, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.438006 }, { "epoch": 2.6097853562400926, "grad_norm": 5.517190933227539, "learning_rate": 4.655372587882946e-05, "loss": 2.529966354370117, "memory(GiB)": 77.56, "step": 60915, "token_acc": 0.4830508474576271, "train_speed(iter/s)": 1.437996 }, { "epoch": 2.6099995715693414, "grad_norm": 5.898299694061279, "learning_rate": 4.654701214169135e-05, "loss": 2.1048030853271484, "memory(GiB)": 77.56, "step": 60920, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.437978 }, { "epoch": 2.6102137868985906, "grad_norm": 7.8402276039123535, "learning_rate": 4.654029846710737e-05, "loss": 2.2793750762939453, "memory(GiB)": 77.56, "step": 60925, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438014 }, { "epoch": 2.6104280022278394, "grad_norm": 5.904280185699463, "learning_rate": 4.653358485519912e-05, "loss": 2.3322324752807617, "memory(GiB)": 77.56, "step": 60930, "token_acc": 0.5445205479452054, "train_speed(iter/s)": 1.438046 }, { "epoch": 2.6106422175570883, "grad_norm": 4.554354667663574, "learning_rate": 4.652687130608823e-05, "loss": 2.506477928161621, "memory(GiB)": 77.56, "step": 60935, "token_acc": 0.46005509641873277, "train_speed(iter/s)": 1.43807 }, { "epoch": 2.6108564328863375, "grad_norm": 6.028464317321777, "learning_rate": 4.652015781989631e-05, "loss": 2.6493663787841797, "memory(GiB)": 77.56, "step": 60940, "token_acc": 0.42124542124542125, "train_speed(iter/s)": 1.438077 }, { "epoch": 2.6110706482155863, "grad_norm": 5.519216537475586, "learning_rate": 4.651344439674501e-05, "loss": 2.3362178802490234, "memory(GiB)": 77.56, "step": 60945, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438079 }, { "epoch": 2.611284863544835, "grad_norm": 6.624106407165527, "learning_rate": 4.650673103675594e-05, "loss": 2.3710700988769533, "memory(GiB)": 77.56, "step": 60950, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.43808 }, { "epoch": 2.6114990788740844, "grad_norm": 4.884666919708252, "learning_rate": 4.6500017740050706e-05, "loss": 2.240732192993164, "memory(GiB)": 77.56, "step": 60955, "token_acc": 0.5255972696245734, "train_speed(iter/s)": 1.438087 }, { "epoch": 2.611713294203333, "grad_norm": 4.90848970413208, "learning_rate": 4.649330450675095e-05, "loss": 2.2751197814941406, "memory(GiB)": 77.56, "step": 60960, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.438121 }, { "epoch": 2.611927509532582, "grad_norm": 5.464402675628662, "learning_rate": 4.6486591336978246e-05, "loss": 2.6248870849609376, "memory(GiB)": 77.56, "step": 60965, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.438134 }, { "epoch": 2.6121417248618313, "grad_norm": 5.042157173156738, "learning_rate": 4.647987823085426e-05, "loss": 2.4178579330444334, "memory(GiB)": 77.56, "step": 60970, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 1.438131 }, { "epoch": 2.61235594019108, "grad_norm": 5.9759039878845215, "learning_rate": 4.647316518850058e-05, "loss": 2.470847320556641, "memory(GiB)": 77.56, "step": 60975, "token_acc": 0.4731182795698925, "train_speed(iter/s)": 1.438122 }, { "epoch": 2.612570155520329, "grad_norm": 5.451035499572754, "learning_rate": 4.64664522100388e-05, "loss": 2.3520626068115233, "memory(GiB)": 77.56, "step": 60980, "token_acc": 0.511400651465798, "train_speed(iter/s)": 1.438139 }, { "epoch": 2.612784370849578, "grad_norm": 5.501289367675781, "learning_rate": 4.645973929559058e-05, "loss": 2.528302001953125, "memory(GiB)": 77.56, "step": 60985, "token_acc": 0.45733788395904434, "train_speed(iter/s)": 1.438157 }, { "epoch": 2.612998586178827, "grad_norm": 5.421691417694092, "learning_rate": 4.645302644527749e-05, "loss": 2.1132137298583986, "memory(GiB)": 77.56, "step": 60990, "token_acc": 0.541958041958042, "train_speed(iter/s)": 1.438178 }, { "epoch": 2.6132128015080758, "grad_norm": 5.147592544555664, "learning_rate": 4.644631365922114e-05, "loss": 2.375047492980957, "memory(GiB)": 77.56, "step": 60995, "token_acc": 0.47337278106508873, "train_speed(iter/s)": 1.438181 }, { "epoch": 2.613427016837325, "grad_norm": 5.489688873291016, "learning_rate": 4.643960093754318e-05, "loss": 2.533391571044922, "memory(GiB)": 77.56, "step": 61000, "token_acc": 0.5, "train_speed(iter/s)": 1.43819 }, { "epoch": 2.613427016837325, "eval_loss": 2.2321438789367676, "eval_runtime": 14.7761, "eval_samples_per_second": 6.768, "eval_steps_per_second": 6.768, "eval_token_acc": 0.47619047619047616, "step": 61000 }, { "epoch": 2.613641232166574, "grad_norm": 6.466928005218506, "learning_rate": 4.643288828036517e-05, "loss": 2.4165807723999024, "memory(GiB)": 77.56, "step": 61005, "token_acc": 0.48466257668711654, "train_speed(iter/s)": 1.437649 }, { "epoch": 2.6138554474958227, "grad_norm": 6.679676532745361, "learning_rate": 4.642617568780874e-05, "loss": 2.5052881240844727, "memory(GiB)": 77.56, "step": 61010, "token_acc": 0.5064377682403434, "train_speed(iter/s)": 1.437643 }, { "epoch": 2.614069662825072, "grad_norm": 5.5163655281066895, "learning_rate": 4.6419463159995474e-05, "loss": 2.6603378295898437, "memory(GiB)": 77.56, "step": 61015, "token_acc": 0.47, "train_speed(iter/s)": 1.437643 }, { "epoch": 2.6142838781543207, "grad_norm": 5.466887950897217, "learning_rate": 4.6412750697047014e-05, "loss": 2.2969329833984373, "memory(GiB)": 77.56, "step": 61020, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.43764 }, { "epoch": 2.6144980934835695, "grad_norm": 5.1457014083862305, "learning_rate": 4.640603829908493e-05, "loss": 2.3360012054443358, "memory(GiB)": 77.56, "step": 61025, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.437622 }, { "epoch": 2.614712308812819, "grad_norm": 4.956437110900879, "learning_rate": 4.6399325966230835e-05, "loss": 2.8091278076171875, "memory(GiB)": 77.56, "step": 61030, "token_acc": 0.4077669902912621, "train_speed(iter/s)": 1.43761 }, { "epoch": 2.6149265241420676, "grad_norm": 4.8004279136657715, "learning_rate": 4.6392613698606336e-05, "loss": 2.5653026580810545, "memory(GiB)": 77.56, "step": 61035, "token_acc": 0.45864661654135336, "train_speed(iter/s)": 1.437584 }, { "epoch": 2.6151407394713164, "grad_norm": 6.565741062164307, "learning_rate": 4.638590149633301e-05, "loss": 2.6528976440429686, "memory(GiB)": 77.56, "step": 61040, "token_acc": 0.4456140350877193, "train_speed(iter/s)": 1.437572 }, { "epoch": 2.6153549548005657, "grad_norm": 4.783234119415283, "learning_rate": 4.637918935953248e-05, "loss": 2.322017860412598, "memory(GiB)": 77.56, "step": 61045, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.437588 }, { "epoch": 2.6155691701298145, "grad_norm": 3.8038084506988525, "learning_rate": 4.637247728832633e-05, "loss": 2.1964120864868164, "memory(GiB)": 77.56, "step": 61050, "token_acc": 0.5487364620938628, "train_speed(iter/s)": 1.437616 }, { "epoch": 2.6157833854590633, "grad_norm": 6.103489875793457, "learning_rate": 4.636576528283615e-05, "loss": 2.6924789428710936, "memory(GiB)": 77.56, "step": 61055, "token_acc": 0.44, "train_speed(iter/s)": 1.43761 }, { "epoch": 2.6159976007883126, "grad_norm": 7.4280595779418945, "learning_rate": 4.635905334318355e-05, "loss": 2.347287178039551, "memory(GiB)": 77.56, "step": 61060, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.437644 }, { "epoch": 2.6162118161175614, "grad_norm": 4.4463396072387695, "learning_rate": 4.635234146949011e-05, "loss": 2.0726484298706054, "memory(GiB)": 77.56, "step": 61065, "token_acc": 0.5149253731343284, "train_speed(iter/s)": 1.437618 }, { "epoch": 2.61642603144681, "grad_norm": 7.577883243560791, "learning_rate": 4.6345629661877425e-05, "loss": 2.6312582015991213, "memory(GiB)": 77.56, "step": 61070, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.437615 }, { "epoch": 2.6166402467760594, "grad_norm": 5.872193813323975, "learning_rate": 4.6338917920467096e-05, "loss": 2.2583131790161133, "memory(GiB)": 77.56, "step": 61075, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.437601 }, { "epoch": 2.6168544621053083, "grad_norm": 5.659163951873779, "learning_rate": 4.633220624538069e-05, "loss": 2.408833122253418, "memory(GiB)": 77.56, "step": 61080, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.437618 }, { "epoch": 2.617068677434557, "grad_norm": 6.097976207733154, "learning_rate": 4.632549463673981e-05, "loss": 2.4263397216796876, "memory(GiB)": 77.56, "step": 61085, "token_acc": 0.531496062992126, "train_speed(iter/s)": 1.437641 }, { "epoch": 2.6172828927638063, "grad_norm": 5.783149242401123, "learning_rate": 4.631878309466603e-05, "loss": 2.5575687408447267, "memory(GiB)": 77.56, "step": 61090, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.437671 }, { "epoch": 2.617497108093055, "grad_norm": 5.717090606689453, "learning_rate": 4.6312071619280974e-05, "loss": 2.3968442916870116, "memory(GiB)": 77.56, "step": 61095, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.437668 }, { "epoch": 2.617711323422304, "grad_norm": 4.587832927703857, "learning_rate": 4.630536021070618e-05, "loss": 2.7261463165283204, "memory(GiB)": 77.56, "step": 61100, "token_acc": 0.471875, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.617925538751553, "grad_norm": 5.032992839813232, "learning_rate": 4.629864886906326e-05, "loss": 2.287972640991211, "memory(GiB)": 77.56, "step": 61105, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.437696 }, { "epoch": 2.618139754080802, "grad_norm": 5.303487300872803, "learning_rate": 4.629193759447378e-05, "loss": 2.5813934326171877, "memory(GiB)": 77.56, "step": 61110, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.437712 }, { "epoch": 2.618353969410051, "grad_norm": 3.700289249420166, "learning_rate": 4.628522638705934e-05, "loss": 2.2632728576660157, "memory(GiB)": 77.56, "step": 61115, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.437677 }, { "epoch": 2.6185681847393, "grad_norm": 6.38515567779541, "learning_rate": 4.6278515246941514e-05, "loss": 2.355577850341797, "memory(GiB)": 77.56, "step": 61120, "token_acc": 0.5176848874598071, "train_speed(iter/s)": 1.43767 }, { "epoch": 2.618782400068549, "grad_norm": 4.75387716293335, "learning_rate": 4.6271804174241864e-05, "loss": 2.401355743408203, "memory(GiB)": 77.56, "step": 61125, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.437676 }, { "epoch": 2.6189966153977977, "grad_norm": 4.666396617889404, "learning_rate": 4.6265093169081986e-05, "loss": 2.46710205078125, "memory(GiB)": 77.56, "step": 61130, "token_acc": 0.45962732919254656, "train_speed(iter/s)": 1.437667 }, { "epoch": 2.619210830727047, "grad_norm": 5.302369594573975, "learning_rate": 4.625838223158345e-05, "loss": 2.417184829711914, "memory(GiB)": 77.56, "step": 61135, "token_acc": 0.4597315436241611, "train_speed(iter/s)": 1.437665 }, { "epoch": 2.619425046056296, "grad_norm": 8.818601608276367, "learning_rate": 4.6251671361867813e-05, "loss": 2.4843074798583986, "memory(GiB)": 77.56, "step": 61140, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.437685 }, { "epoch": 2.6196392613855446, "grad_norm": 5.414271354675293, "learning_rate": 4.6244960560056686e-05, "loss": 2.4678627014160157, "memory(GiB)": 77.56, "step": 61145, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.437687 }, { "epoch": 2.619853476714794, "grad_norm": 5.477501392364502, "learning_rate": 4.623824982627161e-05, "loss": 2.617709159851074, "memory(GiB)": 77.56, "step": 61150, "token_acc": 0.4742268041237113, "train_speed(iter/s)": 1.437682 }, { "epoch": 2.6200676920440427, "grad_norm": 4.945050239562988, "learning_rate": 4.623153916063417e-05, "loss": 2.4587818145751954, "memory(GiB)": 77.56, "step": 61155, "token_acc": 0.5, "train_speed(iter/s)": 1.437671 }, { "epoch": 2.6202819073732915, "grad_norm": 5.390034198760986, "learning_rate": 4.6224828563265934e-05, "loss": 2.411081886291504, "memory(GiB)": 77.56, "step": 61160, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.437668 }, { "epoch": 2.6204961227025407, "grad_norm": 4.585055351257324, "learning_rate": 4.6218118034288456e-05, "loss": 2.637013053894043, "memory(GiB)": 77.56, "step": 61165, "token_acc": 0.49557522123893805, "train_speed(iter/s)": 1.437695 }, { "epoch": 2.6207103380317895, "grad_norm": 5.090357780456543, "learning_rate": 4.6211407573823326e-05, "loss": 2.5173492431640625, "memory(GiB)": 77.56, "step": 61170, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.437702 }, { "epoch": 2.6209245533610384, "grad_norm": 9.149224281311035, "learning_rate": 4.620469718199211e-05, "loss": 2.3824291229248047, "memory(GiB)": 77.56, "step": 61175, "token_acc": 0.5, "train_speed(iter/s)": 1.437726 }, { "epoch": 2.6211387686902876, "grad_norm": 6.559986591339111, "learning_rate": 4.619798685891637e-05, "loss": 2.543759346008301, "memory(GiB)": 77.56, "step": 61180, "token_acc": 0.4699248120300752, "train_speed(iter/s)": 1.43775 }, { "epoch": 2.6213529840195364, "grad_norm": 5.382274150848389, "learning_rate": 4.6191276604717645e-05, "loss": 2.148107719421387, "memory(GiB)": 77.56, "step": 61185, "token_acc": 0.5447761194029851, "train_speed(iter/s)": 1.437737 }, { "epoch": 2.6215671993487852, "grad_norm": 5.556448459625244, "learning_rate": 4.618456641951753e-05, "loss": 2.2009441375732424, "memory(GiB)": 77.56, "step": 61190, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.437759 }, { "epoch": 2.6217814146780345, "grad_norm": 6.283977031707764, "learning_rate": 4.617785630343757e-05, "loss": 2.6974502563476563, "memory(GiB)": 77.56, "step": 61195, "token_acc": 0.48615384615384616, "train_speed(iter/s)": 1.437769 }, { "epoch": 2.6219956300072833, "grad_norm": 4.915924072265625, "learning_rate": 4.617114625659932e-05, "loss": 2.227311134338379, "memory(GiB)": 77.56, "step": 61200, "token_acc": 0.5142045454545454, "train_speed(iter/s)": 1.437712 }, { "epoch": 2.622209845336532, "grad_norm": 5.53706693649292, "learning_rate": 4.6164436279124366e-05, "loss": 2.6401288986206053, "memory(GiB)": 77.56, "step": 61205, "token_acc": 0.4828897338403042, "train_speed(iter/s)": 1.437732 }, { "epoch": 2.6224240606657814, "grad_norm": 5.931461334228516, "learning_rate": 4.615772637113423e-05, "loss": 2.558811378479004, "memory(GiB)": 77.56, "step": 61210, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.437747 }, { "epoch": 2.62263827599503, "grad_norm": 5.994172096252441, "learning_rate": 4.61510165327505e-05, "loss": 2.4454769134521483, "memory(GiB)": 77.56, "step": 61215, "token_acc": 0.4911660777385159, "train_speed(iter/s)": 1.437757 }, { "epoch": 2.622852491324279, "grad_norm": 6.3840155601501465, "learning_rate": 4.6144306764094714e-05, "loss": 2.450403594970703, "memory(GiB)": 77.56, "step": 61220, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.437773 }, { "epoch": 2.6230667066535283, "grad_norm": 4.7752885818481445, "learning_rate": 4.613759706528842e-05, "loss": 2.4518911361694338, "memory(GiB)": 77.56, "step": 61225, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437772 }, { "epoch": 2.623280921982777, "grad_norm": 4.849348068237305, "learning_rate": 4.613088743645318e-05, "loss": 2.8188608169555662, "memory(GiB)": 77.56, "step": 61230, "token_acc": 0.4558011049723757, "train_speed(iter/s)": 1.437759 }, { "epoch": 2.623495137312026, "grad_norm": 5.336507797241211, "learning_rate": 4.612417787771055e-05, "loss": 2.0962846755981444, "memory(GiB)": 77.56, "step": 61235, "token_acc": 0.5393700787401575, "train_speed(iter/s)": 1.43779 }, { "epoch": 2.623709352641275, "grad_norm": 5.035750865936279, "learning_rate": 4.6117468389182053e-05, "loss": 2.576386260986328, "memory(GiB)": 77.56, "step": 61240, "token_acc": 0.45907473309608543, "train_speed(iter/s)": 1.437812 }, { "epoch": 2.623923567970524, "grad_norm": 6.402904510498047, "learning_rate": 4.6110758970989264e-05, "loss": 2.5084495544433594, "memory(GiB)": 77.56, "step": 61245, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.437814 }, { "epoch": 2.6241377832997728, "grad_norm": 5.828314304351807, "learning_rate": 4.610404962325374e-05, "loss": 2.374681282043457, "memory(GiB)": 77.56, "step": 61250, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 1.437797 }, { "epoch": 2.624351998629022, "grad_norm": 4.399845123291016, "learning_rate": 4.6097340346097004e-05, "loss": 2.3787761688232423, "memory(GiB)": 77.56, "step": 61255, "token_acc": 0.47075208913649025, "train_speed(iter/s)": 1.4378 }, { "epoch": 2.624566213958271, "grad_norm": 5.238890647888184, "learning_rate": 4.609063113964061e-05, "loss": 2.248393249511719, "memory(GiB)": 77.56, "step": 61260, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.437793 }, { "epoch": 2.6247804292875196, "grad_norm": 6.462149620056152, "learning_rate": 4.608392200400611e-05, "loss": 2.5526473999023436, "memory(GiB)": 77.56, "step": 61265, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.437803 }, { "epoch": 2.624994644616769, "grad_norm": 6.404357433319092, "learning_rate": 4.607721293931502e-05, "loss": 2.203322982788086, "memory(GiB)": 77.56, "step": 61270, "token_acc": 0.54, "train_speed(iter/s)": 1.437803 }, { "epoch": 2.6252088599460177, "grad_norm": 8.24066162109375, "learning_rate": 4.607050394568891e-05, "loss": 2.4650421142578125, "memory(GiB)": 77.56, "step": 61275, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.437806 }, { "epoch": 2.6254230752752665, "grad_norm": 7.238137245178223, "learning_rate": 4.6063795023249303e-05, "loss": 2.476858901977539, "memory(GiB)": 77.56, "step": 61280, "token_acc": 0.4908424908424908, "train_speed(iter/s)": 1.437812 }, { "epoch": 2.625637290604516, "grad_norm": 5.264199733734131, "learning_rate": 4.605708617211774e-05, "loss": 2.3938180923461916, "memory(GiB)": 77.56, "step": 61285, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 1.437815 }, { "epoch": 2.6258515059337646, "grad_norm": 5.499978542327881, "learning_rate": 4.605037739241576e-05, "loss": 2.7696823120117187, "memory(GiB)": 77.56, "step": 61290, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.437825 }, { "epoch": 2.6260657212630134, "grad_norm": 5.290322780609131, "learning_rate": 4.604366868426489e-05, "loss": 2.4920318603515623, "memory(GiB)": 77.56, "step": 61295, "token_acc": 0.4851190476190476, "train_speed(iter/s)": 1.437819 }, { "epoch": 2.6262799365922627, "grad_norm": 6.1877522468566895, "learning_rate": 4.603696004778669e-05, "loss": 2.492643928527832, "memory(GiB)": 77.56, "step": 61300, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.437837 }, { "epoch": 2.6264941519215115, "grad_norm": 5.980542182922363, "learning_rate": 4.603025148310267e-05, "loss": 2.43652286529541, "memory(GiB)": 77.56, "step": 61305, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.437842 }, { "epoch": 2.6267083672507603, "grad_norm": 6.2820563316345215, "learning_rate": 4.602354299033435e-05, "loss": 2.477032470703125, "memory(GiB)": 77.56, "step": 61310, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437848 }, { "epoch": 2.6269225825800095, "grad_norm": 5.314938545227051, "learning_rate": 4.601683456960329e-05, "loss": 2.8025466918945314, "memory(GiB)": 77.56, "step": 61315, "token_acc": 0.43820224719101125, "train_speed(iter/s)": 1.437877 }, { "epoch": 2.6271367979092584, "grad_norm": 5.161302089691162, "learning_rate": 4.601012622103102e-05, "loss": 2.4552543640136717, "memory(GiB)": 77.56, "step": 61320, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.437887 }, { "epoch": 2.627351013238507, "grad_norm": 5.536771297454834, "learning_rate": 4.600341794473905e-05, "loss": 2.3944297790527345, "memory(GiB)": 77.56, "step": 61325, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.437904 }, { "epoch": 2.6275652285677564, "grad_norm": 4.604583263397217, "learning_rate": 4.599670974084891e-05, "loss": 2.487743377685547, "memory(GiB)": 77.56, "step": 61330, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.43791 }, { "epoch": 2.6277794438970052, "grad_norm": 5.908392429351807, "learning_rate": 4.599000160948214e-05, "loss": 2.3267688751220703, "memory(GiB)": 77.56, "step": 61335, "token_acc": 0.5533333333333333, "train_speed(iter/s)": 1.437918 }, { "epoch": 2.627993659226254, "grad_norm": 5.070243835449219, "learning_rate": 4.598329355076024e-05, "loss": 2.486347961425781, "memory(GiB)": 77.56, "step": 61340, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.437925 }, { "epoch": 2.6282078745555033, "grad_norm": 6.921378135681152, "learning_rate": 4.597658556480475e-05, "loss": 2.50708065032959, "memory(GiB)": 77.56, "step": 61345, "token_acc": 0.460431654676259, "train_speed(iter/s)": 1.437938 }, { "epoch": 2.628422089884752, "grad_norm": 7.598796844482422, "learning_rate": 4.5969877651737194e-05, "loss": 2.3890256881713867, "memory(GiB)": 77.56, "step": 61350, "token_acc": 0.5117370892018779, "train_speed(iter/s)": 1.43794 }, { "epoch": 2.628636305214001, "grad_norm": 5.354578971862793, "learning_rate": 4.596316981167906e-05, "loss": 2.3807161331176756, "memory(GiB)": 77.56, "step": 61355, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.437958 }, { "epoch": 2.62885052054325, "grad_norm": 7.167765140533447, "learning_rate": 4.595646204475192e-05, "loss": 2.4626785278320313, "memory(GiB)": 77.56, "step": 61360, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.437958 }, { "epoch": 2.629064735872499, "grad_norm": 5.551894187927246, "learning_rate": 4.594975435107726e-05, "loss": 2.3933115005493164, "memory(GiB)": 77.56, "step": 61365, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.437942 }, { "epoch": 2.629278951201748, "grad_norm": 6.5892205238342285, "learning_rate": 4.594304673077658e-05, "loss": 2.3925952911376953, "memory(GiB)": 77.56, "step": 61370, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.437949 }, { "epoch": 2.629493166530997, "grad_norm": 5.637641429901123, "learning_rate": 4.593633918397143e-05, "loss": 2.6355018615722656, "memory(GiB)": 77.56, "step": 61375, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.437979 }, { "epoch": 2.629707381860246, "grad_norm": 5.442171096801758, "learning_rate": 4.592963171078331e-05, "loss": 2.4948026657104494, "memory(GiB)": 77.56, "step": 61380, "token_acc": 0.4707792207792208, "train_speed(iter/s)": 1.437973 }, { "epoch": 2.6299215971894947, "grad_norm": 4.8702263832092285, "learning_rate": 4.59229243113337e-05, "loss": 2.388945960998535, "memory(GiB)": 77.56, "step": 61385, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.437987 }, { "epoch": 2.630135812518744, "grad_norm": 5.89458703994751, "learning_rate": 4.5916216985744164e-05, "loss": 2.391023063659668, "memory(GiB)": 77.56, "step": 61390, "token_acc": 0.4810126582278481, "train_speed(iter/s)": 1.437995 }, { "epoch": 2.6303500278479928, "grad_norm": 7.389822006225586, "learning_rate": 4.590950973413619e-05, "loss": 2.1155399322509765, "memory(GiB)": 77.56, "step": 61395, "token_acc": 0.5446428571428571, "train_speed(iter/s)": 1.437999 }, { "epoch": 2.6305642431772416, "grad_norm": 5.502264976501465, "learning_rate": 4.5902802556631275e-05, "loss": 2.4201284408569337, "memory(GiB)": 77.56, "step": 61400, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 1.438023 }, { "epoch": 2.630778458506491, "grad_norm": 7.2376532554626465, "learning_rate": 4.589609545335095e-05, "loss": 2.499235153198242, "memory(GiB)": 77.56, "step": 61405, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.438035 }, { "epoch": 2.6309926738357396, "grad_norm": 5.329863548278809, "learning_rate": 4.58893884244167e-05, "loss": 2.4541032791137694, "memory(GiB)": 77.56, "step": 61410, "token_acc": 0.4831804281345566, "train_speed(iter/s)": 1.438027 }, { "epoch": 2.6312068891649885, "grad_norm": 5.4700236320495605, "learning_rate": 4.588268146995003e-05, "loss": 2.7074026107788085, "memory(GiB)": 77.56, "step": 61415, "token_acc": 0.4375, "train_speed(iter/s)": 1.438021 }, { "epoch": 2.6314211044942377, "grad_norm": 4.2026495933532715, "learning_rate": 4.587597459007246e-05, "loss": 2.3165355682373048, "memory(GiB)": 77.56, "step": 61420, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 1.437999 }, { "epoch": 2.6316353198234865, "grad_norm": 6.1931047439575195, "learning_rate": 4.586926778490547e-05, "loss": 2.3176557540893556, "memory(GiB)": 77.56, "step": 61425, "token_acc": 0.5148148148148148, "train_speed(iter/s)": 1.437991 }, { "epoch": 2.6318495351527353, "grad_norm": 7.530332565307617, "learning_rate": 4.586256105457056e-05, "loss": 2.2246816635131834, "memory(GiB)": 77.56, "step": 61430, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 1.43798 }, { "epoch": 2.6320637504819846, "grad_norm": 5.485375881195068, "learning_rate": 4.585585439918925e-05, "loss": 2.5426197052001953, "memory(GiB)": 77.56, "step": 61435, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.437999 }, { "epoch": 2.6322779658112334, "grad_norm": 6.360501289367676, "learning_rate": 4.5849147818883017e-05, "loss": 2.617303657531738, "memory(GiB)": 77.56, "step": 61440, "token_acc": 0.48297213622291024, "train_speed(iter/s)": 1.437991 }, { "epoch": 2.6324921811404822, "grad_norm": 8.146564483642578, "learning_rate": 4.584244131377338e-05, "loss": 2.1984540939331056, "memory(GiB)": 77.56, "step": 61445, "token_acc": 0.5258964143426295, "train_speed(iter/s)": 1.437992 }, { "epoch": 2.6327063964697315, "grad_norm": 5.190566062927246, "learning_rate": 4.5835734883981804e-05, "loss": 2.370341491699219, "memory(GiB)": 77.56, "step": 61450, "token_acc": 0.4820846905537459, "train_speed(iter/s)": 1.437982 }, { "epoch": 2.6329206117989803, "grad_norm": 5.975463390350342, "learning_rate": 4.5829028529629794e-05, "loss": 2.45679931640625, "memory(GiB)": 77.56, "step": 61455, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.437979 }, { "epoch": 2.633134827128229, "grad_norm": 5.87574577331543, "learning_rate": 4.5822322250838836e-05, "loss": 2.689578628540039, "memory(GiB)": 77.56, "step": 61460, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.437956 }, { "epoch": 2.6333490424574784, "grad_norm": 6.860673904418945, "learning_rate": 4.5815616047730455e-05, "loss": 2.4837966918945313, "memory(GiB)": 77.56, "step": 61465, "token_acc": 0.5046153846153846, "train_speed(iter/s)": 1.437977 }, { "epoch": 2.633563257786727, "grad_norm": 5.0913190841674805, "learning_rate": 4.580890992042611e-05, "loss": 2.35849666595459, "memory(GiB)": 77.56, "step": 61470, "token_acc": 0.48220064724919093, "train_speed(iter/s)": 1.437991 }, { "epoch": 2.633777473115976, "grad_norm": 7.2061967849731445, "learning_rate": 4.580220386904728e-05, "loss": 2.5140594482421874, "memory(GiB)": 77.56, "step": 61475, "token_acc": 0.48, "train_speed(iter/s)": 1.437997 }, { "epoch": 2.6339916884452252, "grad_norm": 5.278740882873535, "learning_rate": 4.579549789371548e-05, "loss": 2.097747802734375, "memory(GiB)": 77.56, "step": 61480, "token_acc": 0.5269709543568465, "train_speed(iter/s)": 1.438029 }, { "epoch": 2.634205903774474, "grad_norm": 6.1236395835876465, "learning_rate": 4.578879199455216e-05, "loss": 2.319572639465332, "memory(GiB)": 77.56, "step": 61485, "token_acc": 0.5120967741935484, "train_speed(iter/s)": 1.438034 }, { "epoch": 2.634420119103723, "grad_norm": 5.959428787231445, "learning_rate": 4.5782086171678845e-05, "loss": 2.5790782928466798, "memory(GiB)": 77.56, "step": 61490, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.438029 }, { "epoch": 2.634634334432972, "grad_norm": 6.366353511810303, "learning_rate": 4.577538042521699e-05, "loss": 2.2309345245361327, "memory(GiB)": 77.56, "step": 61495, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 1.43803 }, { "epoch": 2.634848549762221, "grad_norm": 5.365630626678467, "learning_rate": 4.5768674755288074e-05, "loss": 2.4895315170288086, "memory(GiB)": 77.56, "step": 61500, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 1.438036 }, { "epoch": 2.634848549762221, "eval_loss": 2.0978097915649414, "eval_runtime": 14.3868, "eval_samples_per_second": 6.951, "eval_steps_per_second": 6.951, "eval_token_acc": 0.4894179894179894, "step": 61500 }, { "epoch": 2.6350627650914697, "grad_norm": 6.450132846832275, "learning_rate": 4.576196916201359e-05, "loss": 2.5573665618896486, "memory(GiB)": 77.56, "step": 61505, "token_acc": 0.4896810506566604, "train_speed(iter/s)": 1.437526 }, { "epoch": 2.635276980420719, "grad_norm": 4.668859958648682, "learning_rate": 4.575526364551501e-05, "loss": 2.3783327102661134, "memory(GiB)": 77.56, "step": 61510, "token_acc": 0.46229508196721314, "train_speed(iter/s)": 1.437525 }, { "epoch": 2.635491195749968, "grad_norm": 4.791419982910156, "learning_rate": 4.5748558205913814e-05, "loss": 2.3759929656982424, "memory(GiB)": 77.56, "step": 61515, "token_acc": 0.49085365853658536, "train_speed(iter/s)": 1.437546 }, { "epoch": 2.6357054110792166, "grad_norm": 4.853916168212891, "learning_rate": 4.5741852843331466e-05, "loss": 2.403456687927246, "memory(GiB)": 77.56, "step": 61520, "token_acc": 0.5, "train_speed(iter/s)": 1.437553 }, { "epoch": 2.635919626408466, "grad_norm": 7.256995677947998, "learning_rate": 4.5735147557889445e-05, "loss": 2.435834503173828, "memory(GiB)": 77.56, "step": 61525, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 1.437576 }, { "epoch": 2.6361338417377147, "grad_norm": 5.986945152282715, "learning_rate": 4.5728442349709244e-05, "loss": 2.2936466217041014, "memory(GiB)": 77.56, "step": 61530, "token_acc": 0.50390625, "train_speed(iter/s)": 1.437595 }, { "epoch": 2.6363480570669635, "grad_norm": 5.918588161468506, "learning_rate": 4.5721737218912285e-05, "loss": 2.402658462524414, "memory(GiB)": 77.56, "step": 61535, "token_acc": 0.46325878594249204, "train_speed(iter/s)": 1.437607 }, { "epoch": 2.6365622723962128, "grad_norm": 5.93505859375, "learning_rate": 4.57150321656201e-05, "loss": 2.6621875762939453, "memory(GiB)": 77.56, "step": 61540, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.437608 }, { "epoch": 2.6367764877254616, "grad_norm": 5.0371222496032715, "learning_rate": 4.5708327189954116e-05, "loss": 2.270490837097168, "memory(GiB)": 77.56, "step": 61545, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437624 }, { "epoch": 2.6369907030547104, "grad_norm": 6.274292945861816, "learning_rate": 4.5701622292035815e-05, "loss": 2.655232620239258, "memory(GiB)": 77.56, "step": 61550, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437609 }, { "epoch": 2.6372049183839597, "grad_norm": 6.238415718078613, "learning_rate": 4.569491747198666e-05, "loss": 2.353529930114746, "memory(GiB)": 77.56, "step": 61555, "token_acc": 0.5346534653465347, "train_speed(iter/s)": 1.437613 }, { "epoch": 2.6374191337132085, "grad_norm": 4.50010347366333, "learning_rate": 4.56882127299281e-05, "loss": 2.352578353881836, "memory(GiB)": 77.56, "step": 61560, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.437634 }, { "epoch": 2.6376333490424573, "grad_norm": 6.645480155944824, "learning_rate": 4.568150806598163e-05, "loss": 2.5335971832275392, "memory(GiB)": 77.56, "step": 61565, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.437635 }, { "epoch": 2.6378475643717065, "grad_norm": 6.403446197509766, "learning_rate": 4.567480348026869e-05, "loss": 2.4612537384033204, "memory(GiB)": 77.56, "step": 61570, "token_acc": 0.41935483870967744, "train_speed(iter/s)": 1.437663 }, { "epoch": 2.6380617797009553, "grad_norm": 6.063022136688232, "learning_rate": 4.566809897291073e-05, "loss": 2.363648796081543, "memory(GiB)": 77.56, "step": 61575, "token_acc": 0.545816733067729, "train_speed(iter/s)": 1.437678 }, { "epoch": 2.638275995030204, "grad_norm": 6.214544773101807, "learning_rate": 4.566139454402923e-05, "loss": 2.4042648315429687, "memory(GiB)": 77.56, "step": 61580, "token_acc": 0.4599406528189911, "train_speed(iter/s)": 1.437677 }, { "epoch": 2.6384902103594534, "grad_norm": 6.261946201324463, "learning_rate": 4.565469019374563e-05, "loss": 2.2694778442382812, "memory(GiB)": 77.56, "step": 61585, "token_acc": 0.5330578512396694, "train_speed(iter/s)": 1.43767 }, { "epoch": 2.6387044256887022, "grad_norm": 6.364415168762207, "learning_rate": 4.5647985922181405e-05, "loss": 2.2851587295532227, "memory(GiB)": 77.56, "step": 61590, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.43767 }, { "epoch": 2.638918641017951, "grad_norm": 5.5399322509765625, "learning_rate": 4.564128172945799e-05, "loss": 2.3585536956787108, "memory(GiB)": 77.56, "step": 61595, "token_acc": 0.5244444444444445, "train_speed(iter/s)": 1.437641 }, { "epoch": 2.6391328563472003, "grad_norm": 7.091579914093018, "learning_rate": 4.563457761569685e-05, "loss": 2.488679122924805, "memory(GiB)": 77.56, "step": 61600, "token_acc": 0.5163636363636364, "train_speed(iter/s)": 1.437635 }, { "epoch": 2.639347071676449, "grad_norm": 5.665006160736084, "learning_rate": 4.56278735810194e-05, "loss": 2.534504508972168, "memory(GiB)": 77.56, "step": 61605, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.437657 }, { "epoch": 2.639561287005698, "grad_norm": 5.5218353271484375, "learning_rate": 4.562116962554716e-05, "loss": 2.477473258972168, "memory(GiB)": 77.56, "step": 61610, "token_acc": 0.48616600790513836, "train_speed(iter/s)": 1.437657 }, { "epoch": 2.639775502334947, "grad_norm": 4.8111653327941895, "learning_rate": 4.561446574940153e-05, "loss": 2.4436132431030275, "memory(GiB)": 77.56, "step": 61615, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.437667 }, { "epoch": 2.639989717664196, "grad_norm": 5.70447301864624, "learning_rate": 4.560776195270396e-05, "loss": 2.5345947265625, "memory(GiB)": 77.56, "step": 61620, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.437661 }, { "epoch": 2.640203932993445, "grad_norm": 4.418641090393066, "learning_rate": 4.5601058235575914e-05, "loss": 2.4715129852294924, "memory(GiB)": 77.56, "step": 61625, "token_acc": 0.4629080118694362, "train_speed(iter/s)": 1.437668 }, { "epoch": 2.640418148322694, "grad_norm": 4.811949253082275, "learning_rate": 4.559435459813881e-05, "loss": 2.4463916778564454, "memory(GiB)": 77.56, "step": 61630, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.640632363651943, "grad_norm": 7.582095146179199, "learning_rate": 4.558765104051412e-05, "loss": 2.20712890625, "memory(GiB)": 77.56, "step": 61635, "token_acc": 0.5, "train_speed(iter/s)": 1.437705 }, { "epoch": 2.6408465789811917, "grad_norm": 6.083032608032227, "learning_rate": 4.558094756282327e-05, "loss": 2.3288381576538084, "memory(GiB)": 77.56, "step": 61640, "token_acc": 0.521875, "train_speed(iter/s)": 1.437736 }, { "epoch": 2.641060794310441, "grad_norm": 6.091094017028809, "learning_rate": 4.557424416518768e-05, "loss": 2.507367706298828, "memory(GiB)": 77.56, "step": 61645, "token_acc": 0.5399239543726235, "train_speed(iter/s)": 1.437712 }, { "epoch": 2.6412750096396898, "grad_norm": 7.4312591552734375, "learning_rate": 4.556754084772883e-05, "loss": 2.6219932556152346, "memory(GiB)": 77.56, "step": 61650, "token_acc": 0.5061349693251533, "train_speed(iter/s)": 1.437718 }, { "epoch": 2.6414892249689386, "grad_norm": 5.042299270629883, "learning_rate": 4.556083761056814e-05, "loss": 2.2487071990966796, "memory(GiB)": 77.56, "step": 61655, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437731 }, { "epoch": 2.641703440298188, "grad_norm": 5.684948921203613, "learning_rate": 4.5554134453827024e-05, "loss": 2.4063066482543944, "memory(GiB)": 77.56, "step": 61660, "token_acc": 0.4763779527559055, "train_speed(iter/s)": 1.437739 }, { "epoch": 2.6419176556274366, "grad_norm": 6.47034215927124, "learning_rate": 4.554743137762694e-05, "loss": 2.3203659057617188, "memory(GiB)": 77.56, "step": 61665, "token_acc": 0.47896440129449835, "train_speed(iter/s)": 1.437751 }, { "epoch": 2.6421318709566854, "grad_norm": 5.349266529083252, "learning_rate": 4.554072838208931e-05, "loss": 2.536144828796387, "memory(GiB)": 77.56, "step": 61670, "token_acc": 0.46357615894039733, "train_speed(iter/s)": 1.437767 }, { "epoch": 2.6423460862859347, "grad_norm": 5.05371618270874, "learning_rate": 4.553402546733557e-05, "loss": 2.061013412475586, "memory(GiB)": 77.56, "step": 61675, "token_acc": 0.5400696864111498, "train_speed(iter/s)": 1.437758 }, { "epoch": 2.6425603016151835, "grad_norm": 6.922592639923096, "learning_rate": 4.552732263348713e-05, "loss": 2.3230724334716797, "memory(GiB)": 77.56, "step": 61680, "token_acc": 0.532258064516129, "train_speed(iter/s)": 1.437765 }, { "epoch": 2.6427745169444323, "grad_norm": 6.067020416259766, "learning_rate": 4.5520619880665466e-05, "loss": 2.3619842529296875, "memory(GiB)": 77.56, "step": 61685, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.437758 }, { "epoch": 2.6429887322736816, "grad_norm": 5.453634738922119, "learning_rate": 4.551391720899196e-05, "loss": 2.785741424560547, "memory(GiB)": 77.56, "step": 61690, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.437777 }, { "epoch": 2.6432029476029304, "grad_norm": 5.473423957824707, "learning_rate": 4.5507214618588055e-05, "loss": 2.5167659759521483, "memory(GiB)": 77.56, "step": 61695, "token_acc": 0.48825065274151436, "train_speed(iter/s)": 1.437804 }, { "epoch": 2.643417162932179, "grad_norm": 4.922832489013672, "learning_rate": 4.5500512109575186e-05, "loss": 2.6155630111694337, "memory(GiB)": 77.56, "step": 61700, "token_acc": 0.4773413897280967, "train_speed(iter/s)": 1.437795 }, { "epoch": 2.6436313782614285, "grad_norm": 5.397421836853027, "learning_rate": 4.549380968207474e-05, "loss": 2.2809940338134767, "memory(GiB)": 77.56, "step": 61705, "token_acc": 0.484251968503937, "train_speed(iter/s)": 1.437794 }, { "epoch": 2.6438455935906773, "grad_norm": 6.049432754516602, "learning_rate": 4.5487107336208166e-05, "loss": 2.534933853149414, "memory(GiB)": 77.56, "step": 61710, "token_acc": 0.44482758620689655, "train_speed(iter/s)": 1.437834 }, { "epoch": 2.644059808919926, "grad_norm": 5.0446553230285645, "learning_rate": 4.548040507209689e-05, "loss": 2.156915855407715, "memory(GiB)": 77.56, "step": 61715, "token_acc": 0.532258064516129, "train_speed(iter/s)": 1.437863 }, { "epoch": 2.6442740242491753, "grad_norm": 4.818816184997559, "learning_rate": 4.547370288986229e-05, "loss": 2.4247295379638674, "memory(GiB)": 77.56, "step": 61720, "token_acc": 0.5016501650165016, "train_speed(iter/s)": 1.437867 }, { "epoch": 2.644488239578424, "grad_norm": 7.382740020751953, "learning_rate": 4.546700078962582e-05, "loss": 2.592110252380371, "memory(GiB)": 77.56, "step": 61725, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.437877 }, { "epoch": 2.644702454907673, "grad_norm": 7.030486106872559, "learning_rate": 4.5460298771508885e-05, "loss": 2.628032684326172, "memory(GiB)": 77.56, "step": 61730, "token_acc": 0.4219858156028369, "train_speed(iter/s)": 1.437886 }, { "epoch": 2.6449166702369222, "grad_norm": 5.9276556968688965, "learning_rate": 4.54535968356329e-05, "loss": 2.621383857727051, "memory(GiB)": 77.56, "step": 61735, "token_acc": 0.4336569579288026, "train_speed(iter/s)": 1.43788 }, { "epoch": 2.645130885566171, "grad_norm": 5.162145614624023, "learning_rate": 4.544689498211927e-05, "loss": 2.523931694030762, "memory(GiB)": 77.56, "step": 61740, "token_acc": 0.48, "train_speed(iter/s)": 1.437911 }, { "epoch": 2.64534510089542, "grad_norm": 4.634915828704834, "learning_rate": 4.54401932110894e-05, "loss": 2.373402404785156, "memory(GiB)": 77.56, "step": 61745, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.437936 }, { "epoch": 2.645559316224669, "grad_norm": 4.609366416931152, "learning_rate": 4.543349152266472e-05, "loss": 2.430101776123047, "memory(GiB)": 77.56, "step": 61750, "token_acc": 0.4483695652173913, "train_speed(iter/s)": 1.43794 }, { "epoch": 2.645773531553918, "grad_norm": 4.220828533172607, "learning_rate": 4.54267899169666e-05, "loss": 2.3142263412475588, "memory(GiB)": 77.56, "step": 61755, "token_acc": 0.4858156028368794, "train_speed(iter/s)": 1.43795 }, { "epoch": 2.6459877468831667, "grad_norm": 9.216361045837402, "learning_rate": 4.54200883941165e-05, "loss": 2.341102409362793, "memory(GiB)": 77.56, "step": 61760, "token_acc": 0.492, "train_speed(iter/s)": 1.437921 }, { "epoch": 2.646201962212416, "grad_norm": 4.543700218200684, "learning_rate": 4.541338695423578e-05, "loss": 2.056321907043457, "memory(GiB)": 77.56, "step": 61765, "token_acc": 0.5645756457564576, "train_speed(iter/s)": 1.437933 }, { "epoch": 2.646416177541665, "grad_norm": 7.84173059463501, "learning_rate": 4.5406685597445866e-05, "loss": 2.462577056884766, "memory(GiB)": 77.56, "step": 61770, "token_acc": 0.496, "train_speed(iter/s)": 1.437937 }, { "epoch": 2.6466303928709136, "grad_norm": 6.8672261238098145, "learning_rate": 4.539998432386814e-05, "loss": 2.4416786193847657, "memory(GiB)": 77.56, "step": 61775, "token_acc": 0.4506578947368421, "train_speed(iter/s)": 1.437945 }, { "epoch": 2.646844608200163, "grad_norm": 4.976284027099609, "learning_rate": 4.5393283133624026e-05, "loss": 2.443865203857422, "memory(GiB)": 77.56, "step": 61780, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.437973 }, { "epoch": 2.6470588235294117, "grad_norm": 5.521240711212158, "learning_rate": 4.5386582026834906e-05, "loss": 2.5446725845336915, "memory(GiB)": 77.56, "step": 61785, "token_acc": 0.4697986577181208, "train_speed(iter/s)": 1.43798 }, { "epoch": 2.6472730388586605, "grad_norm": 5.984615802764893, "learning_rate": 4.5379881003622174e-05, "loss": 2.3382713317871096, "memory(GiB)": 77.56, "step": 61790, "token_acc": 0.5310344827586206, "train_speed(iter/s)": 1.437998 }, { "epoch": 2.6474872541879098, "grad_norm": 6.743816375732422, "learning_rate": 4.537318006410724e-05, "loss": 2.3853715896606444, "memory(GiB)": 77.56, "step": 61795, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.437997 }, { "epoch": 2.6477014695171586, "grad_norm": 8.244396209716797, "learning_rate": 4.53664792084115e-05, "loss": 2.377880668640137, "memory(GiB)": 77.56, "step": 61800, "token_acc": 0.519163763066202, "train_speed(iter/s)": 1.438002 }, { "epoch": 2.6479156848464074, "grad_norm": 5.245119094848633, "learning_rate": 4.535977843665631e-05, "loss": 2.7398296356201173, "memory(GiB)": 77.56, "step": 61805, "token_acc": 0.45938375350140054, "train_speed(iter/s)": 1.438009 }, { "epoch": 2.6481299001756566, "grad_norm": 9.776571273803711, "learning_rate": 4.53530777489631e-05, "loss": 2.6974952697753904, "memory(GiB)": 77.56, "step": 61810, "token_acc": 0.4816326530612245, "train_speed(iter/s)": 1.438023 }, { "epoch": 2.6483441155049054, "grad_norm": 7.274689674377441, "learning_rate": 4.534637714545324e-05, "loss": 2.5024799346923827, "memory(GiB)": 77.56, "step": 61815, "token_acc": 0.4645669291338583, "train_speed(iter/s)": 1.438009 }, { "epoch": 2.6485583308341543, "grad_norm": 4.626080513000488, "learning_rate": 4.533967662624813e-05, "loss": 2.526080322265625, "memory(GiB)": 77.56, "step": 61820, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.438005 }, { "epoch": 2.6487725461634035, "grad_norm": 4.9425787925720215, "learning_rate": 4.533297619146915e-05, "loss": 2.480988311767578, "memory(GiB)": 77.56, "step": 61825, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.438017 }, { "epoch": 2.6489867614926523, "grad_norm": 4.877227783203125, "learning_rate": 4.532627584123766e-05, "loss": 2.669286346435547, "memory(GiB)": 77.56, "step": 61830, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.438058 }, { "epoch": 2.649200976821901, "grad_norm": 7.359621524810791, "learning_rate": 4.531957557567509e-05, "loss": 2.4072750091552733, "memory(GiB)": 77.56, "step": 61835, "token_acc": 0.48563218390804597, "train_speed(iter/s)": 1.438054 }, { "epoch": 2.6494151921511504, "grad_norm": 6.575071811676025, "learning_rate": 4.5312875394902795e-05, "loss": 2.6007226943969726, "memory(GiB)": 77.56, "step": 61840, "token_acc": 0.4440677966101695, "train_speed(iter/s)": 1.438045 }, { "epoch": 2.649629407480399, "grad_norm": 4.844421863555908, "learning_rate": 4.530617529904218e-05, "loss": 2.315010833740234, "memory(GiB)": 77.56, "step": 61845, "token_acc": 0.4954682779456193, "train_speed(iter/s)": 1.43805 }, { "epoch": 2.649843622809648, "grad_norm": 4.905552864074707, "learning_rate": 4.529947528821457e-05, "loss": 2.1814138412475588, "memory(GiB)": 77.56, "step": 61850, "token_acc": 0.45724907063197023, "train_speed(iter/s)": 1.43803 }, { "epoch": 2.6500578381388973, "grad_norm": 6.575028896331787, "learning_rate": 4.52927753625414e-05, "loss": 2.420262908935547, "memory(GiB)": 77.56, "step": 61855, "token_acc": 0.5017064846416383, "train_speed(iter/s)": 1.438046 }, { "epoch": 2.650272053468146, "grad_norm": 5.843136310577393, "learning_rate": 4.528607552214401e-05, "loss": 2.747515106201172, "memory(GiB)": 77.56, "step": 61860, "token_acc": 0.44281524926686217, "train_speed(iter/s)": 1.438063 }, { "epoch": 2.650486268797395, "grad_norm": 4.591243743896484, "learning_rate": 4.527937576714378e-05, "loss": 2.6525651931762697, "memory(GiB)": 77.56, "step": 61865, "token_acc": 0.4560810810810811, "train_speed(iter/s)": 1.438028 }, { "epoch": 2.650700484126644, "grad_norm": 5.79725980758667, "learning_rate": 4.527267609766209e-05, "loss": 2.37323112487793, "memory(GiB)": 77.56, "step": 61870, "token_acc": 0.5104602510460251, "train_speed(iter/s)": 1.438024 }, { "epoch": 2.650914699455893, "grad_norm": 4.817253112792969, "learning_rate": 4.52659765138203e-05, "loss": 1.9315811157226563, "memory(GiB)": 77.56, "step": 61875, "token_acc": 0.555984555984556, "train_speed(iter/s)": 1.438018 }, { "epoch": 2.651128914785142, "grad_norm": 5.264191627502441, "learning_rate": 4.5259277015739796e-05, "loss": 2.183505630493164, "memory(GiB)": 77.56, "step": 61880, "token_acc": 0.5418060200668896, "train_speed(iter/s)": 1.438021 }, { "epoch": 2.651343130114391, "grad_norm": 6.933333396911621, "learning_rate": 4.525257760354194e-05, "loss": 2.2891656875610353, "memory(GiB)": 77.56, "step": 61885, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.438034 }, { "epoch": 2.65155734544364, "grad_norm": 4.850844383239746, "learning_rate": 4.5245878277348084e-05, "loss": 2.208728790283203, "memory(GiB)": 77.56, "step": 61890, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 1.438041 }, { "epoch": 2.6517715607728887, "grad_norm": 6.755412578582764, "learning_rate": 4.523917903727961e-05, "loss": 2.4329196929931642, "memory(GiB)": 77.56, "step": 61895, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.438079 }, { "epoch": 2.651985776102138, "grad_norm": 5.404649257659912, "learning_rate": 4.5232479883457865e-05, "loss": 2.2061956405639647, "memory(GiB)": 77.56, "step": 61900, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 1.438057 }, { "epoch": 2.6521999914313867, "grad_norm": 4.8161845207214355, "learning_rate": 4.522578081600421e-05, "loss": 2.187418556213379, "memory(GiB)": 77.56, "step": 61905, "token_acc": 0.5364238410596026, "train_speed(iter/s)": 1.43807 }, { "epoch": 2.6524142067606356, "grad_norm": 5.173804759979248, "learning_rate": 4.521908183504002e-05, "loss": 2.567978858947754, "memory(GiB)": 77.56, "step": 61910, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.438068 }, { "epoch": 2.652628422089885, "grad_norm": 5.253247261047363, "learning_rate": 4.521238294068667e-05, "loss": 2.547380828857422, "memory(GiB)": 77.56, "step": 61915, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.438071 }, { "epoch": 2.6528426374191336, "grad_norm": 5.067152500152588, "learning_rate": 4.520568413306547e-05, "loss": 2.4333213806152343, "memory(GiB)": 77.56, "step": 61920, "token_acc": 0.5060606060606061, "train_speed(iter/s)": 1.438048 }, { "epoch": 2.6530568527483824, "grad_norm": 4.9168925285339355, "learning_rate": 4.519898541229781e-05, "loss": 2.2088905334472657, "memory(GiB)": 77.56, "step": 61925, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.438042 }, { "epoch": 2.6532710680776317, "grad_norm": 5.341547966003418, "learning_rate": 4.519228677850504e-05, "loss": 2.274029350280762, "memory(GiB)": 77.56, "step": 61930, "token_acc": 0.5183823529411765, "train_speed(iter/s)": 1.438045 }, { "epoch": 2.6534852834068805, "grad_norm": 5.764420509338379, "learning_rate": 4.5185588231808486e-05, "loss": 2.488014984130859, "memory(GiB)": 77.56, "step": 61935, "token_acc": 0.47076023391812866, "train_speed(iter/s)": 1.438079 }, { "epoch": 2.6536994987361293, "grad_norm": 5.769437313079834, "learning_rate": 4.517888977232953e-05, "loss": 2.4321130752563476, "memory(GiB)": 77.56, "step": 61940, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.438086 }, { "epoch": 2.6539137140653786, "grad_norm": 5.0661516189575195, "learning_rate": 4.5172191400189515e-05, "loss": 2.268540382385254, "memory(GiB)": 77.56, "step": 61945, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.438091 }, { "epoch": 2.6541279293946274, "grad_norm": 5.428701400756836, "learning_rate": 4.516549311550977e-05, "loss": 2.3401361465454102, "memory(GiB)": 77.56, "step": 61950, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.438115 }, { "epoch": 2.654342144723876, "grad_norm": 6.1041789054870605, "learning_rate": 4.515879491841166e-05, "loss": 2.6630428314208983, "memory(GiB)": 77.56, "step": 61955, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.438141 }, { "epoch": 2.6545563600531255, "grad_norm": 5.632446765899658, "learning_rate": 4.515209680901651e-05, "loss": 2.4645896911621095, "memory(GiB)": 77.56, "step": 61960, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.438165 }, { "epoch": 2.6547705753823743, "grad_norm": 5.658358097076416, "learning_rate": 4.514539878744568e-05, "loss": 2.551421356201172, "memory(GiB)": 77.56, "step": 61965, "token_acc": 0.45321637426900585, "train_speed(iter/s)": 1.438161 }, { "epoch": 2.654984790711623, "grad_norm": 5.636961460113525, "learning_rate": 4.5138700853820516e-05, "loss": 2.4709659576416017, "memory(GiB)": 77.56, "step": 61970, "token_acc": 0.4527687296416938, "train_speed(iter/s)": 1.43818 }, { "epoch": 2.6551990060408723, "grad_norm": 5.3820600509643555, "learning_rate": 4.513200300826232e-05, "loss": 2.4653305053710937, "memory(GiB)": 77.56, "step": 61975, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.438189 }, { "epoch": 2.655413221370121, "grad_norm": 7.468258380889893, "learning_rate": 4.512530525089246e-05, "loss": 2.3487545013427735, "memory(GiB)": 77.56, "step": 61980, "token_acc": 0.5020080321285141, "train_speed(iter/s)": 1.438223 }, { "epoch": 2.65562743669937, "grad_norm": 4.721065044403076, "learning_rate": 4.511860758183229e-05, "loss": 2.338658905029297, "memory(GiB)": 77.56, "step": 61985, "token_acc": 0.48360655737704916, "train_speed(iter/s)": 1.438223 }, { "epoch": 2.655841652028619, "grad_norm": 4.905104637145996, "learning_rate": 4.511191000120312e-05, "loss": 2.575343704223633, "memory(GiB)": 77.56, "step": 61990, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.438233 }, { "epoch": 2.656055867357868, "grad_norm": 5.322173595428467, "learning_rate": 4.510521250912627e-05, "loss": 2.432086944580078, "memory(GiB)": 77.56, "step": 61995, "token_acc": 0.4703703703703704, "train_speed(iter/s)": 1.438216 }, { "epoch": 2.656270082687117, "grad_norm": 8.139036178588867, "learning_rate": 4.50985151057231e-05, "loss": 2.178491973876953, "memory(GiB)": 77.56, "step": 62000, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.43821 }, { "epoch": 2.656270082687117, "eval_loss": 2.1532678604125977, "eval_runtime": 14.7965, "eval_samples_per_second": 6.758, "eval_steps_per_second": 6.758, "eval_token_acc": 0.4946808510638298, "step": 62000 }, { "epoch": 2.656484298016366, "grad_norm": 5.055727005004883, "learning_rate": 4.509181779111493e-05, "loss": 2.136933135986328, "memory(GiB)": 77.56, "step": 62005, "token_acc": 0.507537688442211, "train_speed(iter/s)": 1.437696 }, { "epoch": 2.656698513345615, "grad_norm": 7.198328971862793, "learning_rate": 4.508512056542307e-05, "loss": 2.575959014892578, "memory(GiB)": 77.56, "step": 62010, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.437694 }, { "epoch": 2.6569127286748637, "grad_norm": 4.462030410766602, "learning_rate": 4.507842342876887e-05, "loss": 2.244471549987793, "memory(GiB)": 77.56, "step": 62015, "token_acc": 0.53, "train_speed(iter/s)": 1.437709 }, { "epoch": 2.657126944004113, "grad_norm": 7.001450538635254, "learning_rate": 4.507172638127364e-05, "loss": 2.293403434753418, "memory(GiB)": 77.56, "step": 62020, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.437712 }, { "epoch": 2.657341159333362, "grad_norm": 6.439095973968506, "learning_rate": 4.5065029423058726e-05, "loss": 2.484415054321289, "memory(GiB)": 77.56, "step": 62025, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.437707 }, { "epoch": 2.6575553746626106, "grad_norm": 5.766871929168701, "learning_rate": 4.505833255424543e-05, "loss": 2.247924041748047, "memory(GiB)": 77.56, "step": 62030, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.437709 }, { "epoch": 2.65776958999186, "grad_norm": 4.9292426109313965, "learning_rate": 4.505163577495506e-05, "loss": 2.292584037780762, "memory(GiB)": 77.56, "step": 62035, "token_acc": 0.5183946488294314, "train_speed(iter/s)": 1.437713 }, { "epoch": 2.6579838053211087, "grad_norm": 6.00905179977417, "learning_rate": 4.504493908530896e-05, "loss": 2.2320985794067383, "memory(GiB)": 77.56, "step": 62040, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.437718 }, { "epoch": 2.6581980206503575, "grad_norm": 4.733302593231201, "learning_rate": 4.5038242485428436e-05, "loss": 2.4438005447387696, "memory(GiB)": 77.56, "step": 62045, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.437721 }, { "epoch": 2.6584122359796067, "grad_norm": 5.3019118309021, "learning_rate": 4.503154597543479e-05, "loss": 2.6854190826416016, "memory(GiB)": 77.56, "step": 62050, "token_acc": 0.4075342465753425, "train_speed(iter/s)": 1.437727 }, { "epoch": 2.6586264513088556, "grad_norm": 6.637152671813965, "learning_rate": 4.5024849555449353e-05, "loss": 2.1898777008056642, "memory(GiB)": 77.56, "step": 62055, "token_acc": 0.5203252032520326, "train_speed(iter/s)": 1.437739 }, { "epoch": 2.6588406666381044, "grad_norm": 6.405393123626709, "learning_rate": 4.501815322559345e-05, "loss": 2.4268672943115233, "memory(GiB)": 77.56, "step": 62060, "token_acc": 0.48441926345609065, "train_speed(iter/s)": 1.437734 }, { "epoch": 2.6590548819673536, "grad_norm": 5.337629318237305, "learning_rate": 4.501145698598836e-05, "loss": 2.5698728561401367, "memory(GiB)": 77.56, "step": 62065, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437772 }, { "epoch": 2.6592690972966024, "grad_norm": 6.5373005867004395, "learning_rate": 4.500476083675542e-05, "loss": 2.2987171173095704, "memory(GiB)": 77.56, "step": 62070, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.6594833126258512, "grad_norm": 5.635282516479492, "learning_rate": 4.499806477801592e-05, "loss": 2.2451248168945312, "memory(GiB)": 77.56, "step": 62075, "token_acc": 0.5191082802547771, "train_speed(iter/s)": 1.437741 }, { "epoch": 2.6596975279551005, "grad_norm": 7.797072887420654, "learning_rate": 4.499136880989116e-05, "loss": 2.1330049514770506, "memory(GiB)": 77.56, "step": 62080, "token_acc": 0.5390946502057613, "train_speed(iter/s)": 1.437734 }, { "epoch": 2.6599117432843493, "grad_norm": 5.674398422241211, "learning_rate": 4.498467293250246e-05, "loss": 2.315528678894043, "memory(GiB)": 77.56, "step": 62085, "token_acc": 0.5261044176706827, "train_speed(iter/s)": 1.437739 }, { "epoch": 2.660125958613598, "grad_norm": 5.611018180847168, "learning_rate": 4.497797714597112e-05, "loss": 2.7278715133666993, "memory(GiB)": 77.56, "step": 62090, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.6603401739428474, "grad_norm": 8.207868576049805, "learning_rate": 4.4971281450418425e-05, "loss": 2.547300338745117, "memory(GiB)": 77.56, "step": 62095, "token_acc": 0.4675324675324675, "train_speed(iter/s)": 1.437754 }, { "epoch": 2.660554389272096, "grad_norm": 4.00620698928833, "learning_rate": 4.496458584596569e-05, "loss": 2.5608427047729494, "memory(GiB)": 77.56, "step": 62100, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.437787 }, { "epoch": 2.660768604601345, "grad_norm": 5.188326835632324, "learning_rate": 4.495789033273419e-05, "loss": 2.6191925048828124, "memory(GiB)": 77.56, "step": 62105, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.437798 }, { "epoch": 2.6609828199305943, "grad_norm": 5.956536769866943, "learning_rate": 4.495119491084526e-05, "loss": 2.505153846740723, "memory(GiB)": 77.56, "step": 62110, "token_acc": 0.4521452145214521, "train_speed(iter/s)": 1.43779 }, { "epoch": 2.661197035259843, "grad_norm": 5.652467250823975, "learning_rate": 4.4944499580420166e-05, "loss": 2.4883447647094727, "memory(GiB)": 77.56, "step": 62115, "token_acc": 0.48732394366197185, "train_speed(iter/s)": 1.437792 }, { "epoch": 2.661411250589092, "grad_norm": 5.945432662963867, "learning_rate": 4.4937804341580184e-05, "loss": 2.289636421203613, "memory(GiB)": 77.56, "step": 62120, "token_acc": 0.5102639296187683, "train_speed(iter/s)": 1.437768 }, { "epoch": 2.661625465918341, "grad_norm": 6.544475078582764, "learning_rate": 4.4931109194446624e-05, "loss": 2.480554962158203, "memory(GiB)": 77.56, "step": 62125, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.437778 }, { "epoch": 2.66183968124759, "grad_norm": 5.793374538421631, "learning_rate": 4.49244141391408e-05, "loss": 2.2614669799804688, "memory(GiB)": 77.56, "step": 62130, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437769 }, { "epoch": 2.6620538965768388, "grad_norm": 5.474088191986084, "learning_rate": 4.4917719175783965e-05, "loss": 2.6035057067871095, "memory(GiB)": 77.56, "step": 62135, "token_acc": 0.43952802359882004, "train_speed(iter/s)": 1.437788 }, { "epoch": 2.662268111906088, "grad_norm": 5.417370319366455, "learning_rate": 4.491102430449741e-05, "loss": 2.304451751708984, "memory(GiB)": 77.56, "step": 62140, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.437782 }, { "epoch": 2.662482327235337, "grad_norm": 5.341798782348633, "learning_rate": 4.490432952540243e-05, "loss": 2.488572692871094, "memory(GiB)": 77.56, "step": 62145, "token_acc": 0.4506578947368421, "train_speed(iter/s)": 1.437811 }, { "epoch": 2.6626965425645857, "grad_norm": 5.721759796142578, "learning_rate": 4.489763483862031e-05, "loss": 2.3949277877807615, "memory(GiB)": 77.56, "step": 62150, "token_acc": 0.4757834757834758, "train_speed(iter/s)": 1.43782 }, { "epoch": 2.662910757893835, "grad_norm": 5.802772045135498, "learning_rate": 4.4890940244272305e-05, "loss": 2.3115299224853514, "memory(GiB)": 77.56, "step": 62155, "token_acc": 0.5159235668789809, "train_speed(iter/s)": 1.437831 }, { "epoch": 2.6631249732230837, "grad_norm": 4.604318141937256, "learning_rate": 4.488424574247972e-05, "loss": 2.3293811798095705, "memory(GiB)": 77.56, "step": 62160, "token_acc": 0.486013986013986, "train_speed(iter/s)": 1.437799 }, { "epoch": 2.6633391885523325, "grad_norm": 5.474299907684326, "learning_rate": 4.4877551333363814e-05, "loss": 2.415744400024414, "memory(GiB)": 77.56, "step": 62165, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.43782 }, { "epoch": 2.663553403881582, "grad_norm": 4.148676872253418, "learning_rate": 4.487085701704588e-05, "loss": 2.5797075271606444, "memory(GiB)": 77.56, "step": 62170, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 1.437832 }, { "epoch": 2.6637676192108306, "grad_norm": 5.646249771118164, "learning_rate": 4.4864162793647184e-05, "loss": 2.4013315200805665, "memory(GiB)": 77.56, "step": 62175, "token_acc": 0.5077881619937694, "train_speed(iter/s)": 1.437831 }, { "epoch": 2.6639818345400794, "grad_norm": 5.800014495849609, "learning_rate": 4.4857468663288985e-05, "loss": 2.3350486755371094, "memory(GiB)": 77.56, "step": 62180, "token_acc": 0.4674329501915709, "train_speed(iter/s)": 1.437859 }, { "epoch": 2.6641960498693287, "grad_norm": 6.78893518447876, "learning_rate": 4.485077462609258e-05, "loss": 2.4084039688110352, "memory(GiB)": 77.56, "step": 62185, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.437869 }, { "epoch": 2.6644102651985775, "grad_norm": 5.667250156402588, "learning_rate": 4.484408068217922e-05, "loss": 2.7562252044677735, "memory(GiB)": 77.56, "step": 62190, "token_acc": 0.47904191616766467, "train_speed(iter/s)": 1.437848 }, { "epoch": 2.6646244805278263, "grad_norm": 5.6670026779174805, "learning_rate": 4.4837386831670155e-05, "loss": 2.5700937271118165, "memory(GiB)": 77.56, "step": 62195, "token_acc": 0.4554794520547945, "train_speed(iter/s)": 1.437869 }, { "epoch": 2.6648386958570756, "grad_norm": 5.202577590942383, "learning_rate": 4.4830693074686675e-05, "loss": 2.094849395751953, "memory(GiB)": 77.56, "step": 62200, "token_acc": 0.5546875, "train_speed(iter/s)": 1.437885 }, { "epoch": 2.6650529111863244, "grad_norm": 4.83437442779541, "learning_rate": 4.482399941135005e-05, "loss": 2.7122657775878904, "memory(GiB)": 77.56, "step": 62205, "token_acc": 0.4273972602739726, "train_speed(iter/s)": 1.437887 }, { "epoch": 2.665267126515573, "grad_norm": 5.236632347106934, "learning_rate": 4.481730584178153e-05, "loss": 2.6442386627197267, "memory(GiB)": 77.56, "step": 62210, "token_acc": 0.4452054794520548, "train_speed(iter/s)": 1.4379 }, { "epoch": 2.6654813418448224, "grad_norm": 5.442342758178711, "learning_rate": 4.481061236610238e-05, "loss": 2.339546966552734, "memory(GiB)": 77.56, "step": 62215, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.437904 }, { "epoch": 2.6656955571740713, "grad_norm": 5.601112365722656, "learning_rate": 4.480391898443386e-05, "loss": 2.450676918029785, "memory(GiB)": 77.56, "step": 62220, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 1.437891 }, { "epoch": 2.66590977250332, "grad_norm": 5.787229537963867, "learning_rate": 4.4797225696897205e-05, "loss": 2.6089290618896483, "memory(GiB)": 77.56, "step": 62225, "token_acc": 0.46863468634686345, "train_speed(iter/s)": 1.437902 }, { "epoch": 2.6661239878325693, "grad_norm": 4.809157371520996, "learning_rate": 4.4790532503613696e-05, "loss": 2.4545944213867186, "memory(GiB)": 77.56, "step": 62230, "token_acc": 0.5207667731629393, "train_speed(iter/s)": 1.437873 }, { "epoch": 2.666338203161818, "grad_norm": 4.896027088165283, "learning_rate": 4.4783839404704587e-05, "loss": 2.4091516494750977, "memory(GiB)": 77.56, "step": 62235, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.437849 }, { "epoch": 2.666552418491067, "grad_norm": 4.648581504821777, "learning_rate": 4.47771464002911e-05, "loss": 2.604655075073242, "memory(GiB)": 77.56, "step": 62240, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.437865 }, { "epoch": 2.666766633820316, "grad_norm": 4.681258678436279, "learning_rate": 4.4770453490494526e-05, "loss": 2.5066070556640625, "memory(GiB)": 77.56, "step": 62245, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.437903 }, { "epoch": 2.666980849149565, "grad_norm": 7.053329944610596, "learning_rate": 4.476376067543607e-05, "loss": 2.432002639770508, "memory(GiB)": 77.56, "step": 62250, "token_acc": 0.4539877300613497, "train_speed(iter/s)": 1.437886 }, { "epoch": 2.6671950644788143, "grad_norm": 6.050266265869141, "learning_rate": 4.475706795523702e-05, "loss": 2.459771728515625, "memory(GiB)": 77.56, "step": 62255, "token_acc": 0.509009009009009, "train_speed(iter/s)": 1.437914 }, { "epoch": 2.667409279808063, "grad_norm": 4.887913703918457, "learning_rate": 4.47503753300186e-05, "loss": 2.7142181396484375, "memory(GiB)": 77.56, "step": 62260, "token_acc": 0.4417808219178082, "train_speed(iter/s)": 1.437915 }, { "epoch": 2.667623495137312, "grad_norm": 6.596017837524414, "learning_rate": 4.474368279990205e-05, "loss": 2.752261734008789, "memory(GiB)": 77.56, "step": 62265, "token_acc": 0.44664031620553357, "train_speed(iter/s)": 1.437916 }, { "epoch": 2.667837710466561, "grad_norm": 4.929872989654541, "learning_rate": 4.47369903650086e-05, "loss": 2.210143280029297, "memory(GiB)": 77.56, "step": 62270, "token_acc": 0.5409252669039146, "train_speed(iter/s)": 1.437936 }, { "epoch": 2.66805192579581, "grad_norm": 6.227293968200684, "learning_rate": 4.4730298025459536e-05, "loss": 2.450564956665039, "memory(GiB)": 77.56, "step": 62275, "token_acc": 0.49466192170818507, "train_speed(iter/s)": 1.437951 }, { "epoch": 2.6682661411250588, "grad_norm": 5.108763694763184, "learning_rate": 4.4723605781376056e-05, "loss": 2.659554290771484, "memory(GiB)": 77.56, "step": 62280, "token_acc": 0.4208754208754209, "train_speed(iter/s)": 1.437951 }, { "epoch": 2.668480356454308, "grad_norm": 5.252298831939697, "learning_rate": 4.47169136328794e-05, "loss": 2.1697967529296873, "memory(GiB)": 77.56, "step": 62285, "token_acc": 0.5477178423236515, "train_speed(iter/s)": 1.437981 }, { "epoch": 2.668694571783557, "grad_norm": 5.540384769439697, "learning_rate": 4.471022158009082e-05, "loss": 2.4978921890258787, "memory(GiB)": 77.56, "step": 62290, "token_acc": 0.4637223974763407, "train_speed(iter/s)": 1.437977 }, { "epoch": 2.6689087871128057, "grad_norm": 5.051617622375488, "learning_rate": 4.470352962313154e-05, "loss": 2.589203643798828, "memory(GiB)": 77.56, "step": 62295, "token_acc": 0.4403183023872679, "train_speed(iter/s)": 1.438006 }, { "epoch": 2.669123002442055, "grad_norm": 5.171085357666016, "learning_rate": 4.4696837762122777e-05, "loss": 2.4283390045166016, "memory(GiB)": 77.56, "step": 62300, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.438013 }, { "epoch": 2.6693372177713037, "grad_norm": 4.92645788192749, "learning_rate": 4.469014599718579e-05, "loss": 2.110988998413086, "memory(GiB)": 77.56, "step": 62305, "token_acc": 0.5288135593220339, "train_speed(iter/s)": 1.437996 }, { "epoch": 2.6695514331005525, "grad_norm": 6.298330783843994, "learning_rate": 4.468345432844177e-05, "loss": 2.4649415969848634, "memory(GiB)": 77.56, "step": 62310, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 1.438003 }, { "epoch": 2.669765648429802, "grad_norm": 5.4456467628479, "learning_rate": 4.4676762756011974e-05, "loss": 2.4625049591064454, "memory(GiB)": 77.56, "step": 62315, "token_acc": 0.49146757679180886, "train_speed(iter/s)": 1.437979 }, { "epoch": 2.6699798637590506, "grad_norm": 4.191608905792236, "learning_rate": 4.467007128001762e-05, "loss": 2.3561885833740233, "memory(GiB)": 77.56, "step": 62320, "token_acc": 0.5235294117647059, "train_speed(iter/s)": 1.438016 }, { "epoch": 2.6701940790882994, "grad_norm": 4.823205471038818, "learning_rate": 4.466337990057991e-05, "loss": 2.571339416503906, "memory(GiB)": 77.56, "step": 62325, "token_acc": 0.44281524926686217, "train_speed(iter/s)": 1.438028 }, { "epoch": 2.6704082944175487, "grad_norm": 7.227558135986328, "learning_rate": 4.4656688617820095e-05, "loss": 2.380377006530762, "memory(GiB)": 77.56, "step": 62330, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.43798 }, { "epoch": 2.6706225097467975, "grad_norm": 4.9831156730651855, "learning_rate": 4.464999743185937e-05, "loss": 2.476052665710449, "memory(GiB)": 77.56, "step": 62335, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 1.437971 }, { "epoch": 2.6708367250760463, "grad_norm": 5.775318622589111, "learning_rate": 4.464330634281895e-05, "loss": 2.7013622283935548, "memory(GiB)": 77.56, "step": 62340, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.437985 }, { "epoch": 2.6710509404052956, "grad_norm": 6.507359504699707, "learning_rate": 4.4636615350820054e-05, "loss": 2.245701217651367, "memory(GiB)": 77.56, "step": 62345, "token_acc": 0.5232558139534884, "train_speed(iter/s)": 1.437959 }, { "epoch": 2.6712651557345444, "grad_norm": 5.102236747741699, "learning_rate": 4.462992445598392e-05, "loss": 2.418789863586426, "memory(GiB)": 77.56, "step": 62350, "token_acc": 0.49642857142857144, "train_speed(iter/s)": 1.437969 }, { "epoch": 2.671479371063793, "grad_norm": 5.49685525894165, "learning_rate": 4.462323365843174e-05, "loss": 2.376914405822754, "memory(GiB)": 77.56, "step": 62355, "token_acc": 0.5175097276264592, "train_speed(iter/s)": 1.43797 }, { "epoch": 2.6716935863930424, "grad_norm": 5.1924848556518555, "learning_rate": 4.4616542958284725e-05, "loss": 2.1029449462890626, "memory(GiB)": 77.56, "step": 62360, "token_acc": 0.5611510791366906, "train_speed(iter/s)": 1.437972 }, { "epoch": 2.6719078017222913, "grad_norm": 5.0350751876831055, "learning_rate": 4.460985235566409e-05, "loss": 2.2838520050048827, "memory(GiB)": 77.56, "step": 62365, "token_acc": 0.5190839694656488, "train_speed(iter/s)": 1.437994 }, { "epoch": 2.67212201705154, "grad_norm": 4.944079875946045, "learning_rate": 4.4603161850691025e-05, "loss": 2.1518455505371095, "memory(GiB)": 77.56, "step": 62370, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.43797 }, { "epoch": 2.6723362323807893, "grad_norm": 4.947826862335205, "learning_rate": 4.459647144348675e-05, "loss": 2.3024925231933593, "memory(GiB)": 77.56, "step": 62375, "token_acc": 0.5, "train_speed(iter/s)": 1.437987 }, { "epoch": 2.672550447710038, "grad_norm": 5.7744526863098145, "learning_rate": 4.458978113417248e-05, "loss": 2.4686527252197266, "memory(GiB)": 77.56, "step": 62380, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.437985 }, { "epoch": 2.672764663039287, "grad_norm": 6.173766613006592, "learning_rate": 4.4583090922869375e-05, "loss": 2.5134342193603514, "memory(GiB)": 77.56, "step": 62385, "token_acc": 0.44481605351170567, "train_speed(iter/s)": 1.437995 }, { "epoch": 2.672978878368536, "grad_norm": 4.811943531036377, "learning_rate": 4.457640080969868e-05, "loss": 2.3879085540771485, "memory(GiB)": 77.56, "step": 62390, "token_acc": 0.46703296703296704, "train_speed(iter/s)": 1.43799 }, { "epoch": 2.673193093697785, "grad_norm": 5.159696578979492, "learning_rate": 4.456971079478155e-05, "loss": 2.2175457000732424, "memory(GiB)": 77.56, "step": 62395, "token_acc": 0.5542635658914729, "train_speed(iter/s)": 1.437957 }, { "epoch": 2.673407309027034, "grad_norm": 4.60494327545166, "learning_rate": 4.456302087823922e-05, "loss": 2.1543155670166017, "memory(GiB)": 77.56, "step": 62400, "token_acc": 0.5233644859813084, "train_speed(iter/s)": 1.437961 }, { "epoch": 2.673621524356283, "grad_norm": 8.032392501831055, "learning_rate": 4.455633106019287e-05, "loss": 2.5719888687133787, "memory(GiB)": 77.56, "step": 62405, "token_acc": 0.43609022556390975, "train_speed(iter/s)": 1.437974 }, { "epoch": 2.673835739685532, "grad_norm": 6.6925153732299805, "learning_rate": 4.4549641340763676e-05, "loss": 2.2387340545654295, "memory(GiB)": 77.56, "step": 62410, "token_acc": 0.5083333333333333, "train_speed(iter/s)": 1.437982 }, { "epoch": 2.6740499550147807, "grad_norm": 5.219361782073975, "learning_rate": 4.454295172007285e-05, "loss": 2.3546146392822265, "memory(GiB)": 77.56, "step": 62415, "token_acc": 0.4980694980694981, "train_speed(iter/s)": 1.437983 }, { "epoch": 2.67426417034403, "grad_norm": 4.447869300842285, "learning_rate": 4.4536262198241555e-05, "loss": 2.368247222900391, "memory(GiB)": 77.56, "step": 62420, "token_acc": 0.5, "train_speed(iter/s)": 1.437975 }, { "epoch": 2.674478385673279, "grad_norm": 5.033359527587891, "learning_rate": 4.4529572775391014e-05, "loss": 2.441751480102539, "memory(GiB)": 77.56, "step": 62425, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.437977 }, { "epoch": 2.6746926010025276, "grad_norm": 5.163084506988525, "learning_rate": 4.4522883451642386e-05, "loss": 2.264231491088867, "memory(GiB)": 77.56, "step": 62430, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437949 }, { "epoch": 2.674906816331777, "grad_norm": 5.881134510040283, "learning_rate": 4.451619422711687e-05, "loss": 2.434559631347656, "memory(GiB)": 77.56, "step": 62435, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.437942 }, { "epoch": 2.6751210316610257, "grad_norm": 5.882541179656982, "learning_rate": 4.4509505101935636e-05, "loss": 2.1831966400146485, "memory(GiB)": 77.56, "step": 62440, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437958 }, { "epoch": 2.6753352469902745, "grad_norm": 6.888217926025391, "learning_rate": 4.450281607621987e-05, "loss": 2.733083724975586, "memory(GiB)": 77.56, "step": 62445, "token_acc": 0.4311594202898551, "train_speed(iter/s)": 1.437972 }, { "epoch": 2.6755494623195237, "grad_norm": 4.711943626403809, "learning_rate": 4.449612715009075e-05, "loss": 2.7587135314941404, "memory(GiB)": 77.56, "step": 62450, "token_acc": 0.4517241379310345, "train_speed(iter/s)": 1.437988 }, { "epoch": 2.6757636776487725, "grad_norm": 5.918360233306885, "learning_rate": 4.4489438323669435e-05, "loss": 2.2518150329589846, "memory(GiB)": 77.56, "step": 62455, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.438017 }, { "epoch": 2.6759778929780214, "grad_norm": 5.371405601501465, "learning_rate": 4.448274959707713e-05, "loss": 2.458642768859863, "memory(GiB)": 77.56, "step": 62460, "token_acc": 0.4539249146757679, "train_speed(iter/s)": 1.438022 }, { "epoch": 2.6761921083072706, "grad_norm": 4.860164642333984, "learning_rate": 4.447606097043499e-05, "loss": 2.6802932739257814, "memory(GiB)": 77.56, "step": 62465, "token_acc": 0.48427672955974843, "train_speed(iter/s)": 1.438013 }, { "epoch": 2.6764063236365194, "grad_norm": 7.362837791442871, "learning_rate": 4.4469372443864185e-05, "loss": 2.1227832794189454, "memory(GiB)": 77.56, "step": 62470, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.438027 }, { "epoch": 2.6766205389657682, "grad_norm": 5.705780506134033, "learning_rate": 4.4462684017485884e-05, "loss": 2.1604549407958986, "memory(GiB)": 77.56, "step": 62475, "token_acc": 0.5298245614035088, "train_speed(iter/s)": 1.438049 }, { "epoch": 2.6768347542950175, "grad_norm": 6.227565288543701, "learning_rate": 4.445599569142127e-05, "loss": 2.438285255432129, "memory(GiB)": 77.56, "step": 62480, "token_acc": 0.4894894894894895, "train_speed(iter/s)": 1.438043 }, { "epoch": 2.6770489696242663, "grad_norm": 6.081276893615723, "learning_rate": 4.444930746579147e-05, "loss": 2.418336868286133, "memory(GiB)": 77.56, "step": 62485, "token_acc": 0.5173745173745173, "train_speed(iter/s)": 1.438046 }, { "epoch": 2.677263184953515, "grad_norm": 6.587379455566406, "learning_rate": 4.444261934071769e-05, "loss": 2.5576358795166017, "memory(GiB)": 77.56, "step": 62490, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.438074 }, { "epoch": 2.6774774002827644, "grad_norm": 4.905750274658203, "learning_rate": 4.443593131632105e-05, "loss": 2.571126937866211, "memory(GiB)": 77.56, "step": 62495, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.438087 }, { "epoch": 2.677691615612013, "grad_norm": 7.74746561050415, "learning_rate": 4.442924339272275e-05, "loss": 2.303331184387207, "memory(GiB)": 77.56, "step": 62500, "token_acc": 0.5340501792114696, "train_speed(iter/s)": 1.438088 }, { "epoch": 2.677691615612013, "eval_loss": 2.4737586975097656, "eval_runtime": 14.4274, "eval_samples_per_second": 6.931, "eval_steps_per_second": 6.931, "eval_token_acc": 0.45298013245033114, "step": 62500 }, { "epoch": 2.677905830941262, "grad_norm": 5.322314739227295, "learning_rate": 4.442255557004393e-05, "loss": 2.4849563598632813, "memory(GiB)": 77.56, "step": 62505, "token_acc": 0.4603024574669187, "train_speed(iter/s)": 1.437579 }, { "epoch": 2.6781200462705113, "grad_norm": 8.372267723083496, "learning_rate": 4.441586784840576e-05, "loss": 2.3722812652587892, "memory(GiB)": 77.56, "step": 62510, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.43759 }, { "epoch": 2.67833426159976, "grad_norm": 6.033510208129883, "learning_rate": 4.440918022792937e-05, "loss": 2.172313690185547, "memory(GiB)": 77.56, "step": 62515, "token_acc": 0.5551601423487544, "train_speed(iter/s)": 1.437601 }, { "epoch": 2.678548476929009, "grad_norm": 6.373674392700195, "learning_rate": 4.440249270873593e-05, "loss": 2.399858856201172, "memory(GiB)": 77.56, "step": 62520, "token_acc": 0.48757763975155277, "train_speed(iter/s)": 1.437611 }, { "epoch": 2.678762692258258, "grad_norm": 6.440563201904297, "learning_rate": 4.439580529094659e-05, "loss": 2.6191905975341796, "memory(GiB)": 77.56, "step": 62525, "token_acc": 0.46366782006920415, "train_speed(iter/s)": 1.437595 }, { "epoch": 2.678976907587507, "grad_norm": 7.462244987487793, "learning_rate": 4.4389117974682484e-05, "loss": 2.4368354797363283, "memory(GiB)": 77.56, "step": 62530, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.437586 }, { "epoch": 2.6791911229167558, "grad_norm": 4.401798725128174, "learning_rate": 4.4382430760064774e-05, "loss": 2.2819873809814455, "memory(GiB)": 77.56, "step": 62535, "token_acc": 0.5191082802547771, "train_speed(iter/s)": 1.437604 }, { "epoch": 2.679405338246005, "grad_norm": 6.273928165435791, "learning_rate": 4.4375743647214596e-05, "loss": 2.8158090591430662, "memory(GiB)": 77.56, "step": 62540, "token_acc": 0.4397590361445783, "train_speed(iter/s)": 1.437607 }, { "epoch": 2.679619553575254, "grad_norm": 9.880330085754395, "learning_rate": 4.436905663625311e-05, "loss": 2.5445882797241213, "memory(GiB)": 77.56, "step": 62545, "token_acc": 0.45112781954887216, "train_speed(iter/s)": 1.437641 }, { "epoch": 2.6798337689045026, "grad_norm": 6.399016857147217, "learning_rate": 4.436236972730144e-05, "loss": 2.120593452453613, "memory(GiB)": 77.56, "step": 62550, "token_acc": 0.540084388185654, "train_speed(iter/s)": 1.437617 }, { "epoch": 2.680047984233752, "grad_norm": 4.407811164855957, "learning_rate": 4.435568292048072e-05, "loss": 2.6315799713134767, "memory(GiB)": 77.56, "step": 62555, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.437637 }, { "epoch": 2.6802621995630007, "grad_norm": 5.339437961578369, "learning_rate": 4.4348996215912114e-05, "loss": 2.5039394378662108, "memory(GiB)": 77.56, "step": 62560, "token_acc": 0.5, "train_speed(iter/s)": 1.437634 }, { "epoch": 2.6804764148922495, "grad_norm": 5.569267749786377, "learning_rate": 4.434230961371674e-05, "loss": 2.558920478820801, "memory(GiB)": 77.56, "step": 62565, "token_acc": 0.48546511627906974, "train_speed(iter/s)": 1.437614 }, { "epoch": 2.680690630221499, "grad_norm": 7.515854835510254, "learning_rate": 4.433562311401571e-05, "loss": 2.613993453979492, "memory(GiB)": 77.56, "step": 62570, "token_acc": 0.476, "train_speed(iter/s)": 1.437632 }, { "epoch": 2.6809048455507476, "grad_norm": 5.23411750793457, "learning_rate": 4.43289367169302e-05, "loss": 2.749032211303711, "memory(GiB)": 77.56, "step": 62575, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.437645 }, { "epoch": 2.6811190608799964, "grad_norm": 4.750558376312256, "learning_rate": 4.4322250422581326e-05, "loss": 2.3014715194702147, "memory(GiB)": 77.56, "step": 62580, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.437647 }, { "epoch": 2.6813332762092457, "grad_norm": 4.996963024139404, "learning_rate": 4.431556423109021e-05, "loss": 2.4371225357055666, "memory(GiB)": 77.56, "step": 62585, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.437661 }, { "epoch": 2.6815474915384945, "grad_norm": 4.832967758178711, "learning_rate": 4.430887814257798e-05, "loss": 2.337788391113281, "memory(GiB)": 77.56, "step": 62590, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.437659 }, { "epoch": 2.6817617068677437, "grad_norm": 7.261024475097656, "learning_rate": 4.430219215716576e-05, "loss": 2.333908462524414, "memory(GiB)": 77.56, "step": 62595, "token_acc": 0.5220883534136547, "train_speed(iter/s)": 1.437648 }, { "epoch": 2.6819759221969925, "grad_norm": 5.736026763916016, "learning_rate": 4.429550627497467e-05, "loss": 2.5036090850830077, "memory(GiB)": 77.56, "step": 62600, "token_acc": 0.46863468634686345, "train_speed(iter/s)": 1.437644 }, { "epoch": 2.6821901375262414, "grad_norm": 5.654018878936768, "learning_rate": 4.428882049612584e-05, "loss": 2.2484457015991213, "memory(GiB)": 77.56, "step": 62605, "token_acc": 0.5345454545454545, "train_speed(iter/s)": 1.437623 }, { "epoch": 2.6824043528554906, "grad_norm": 5.278985500335693, "learning_rate": 4.428213482074039e-05, "loss": 2.1761598587036133, "memory(GiB)": 77.56, "step": 62610, "token_acc": 0.5373134328358209, "train_speed(iter/s)": 1.437603 }, { "epoch": 2.6826185681847394, "grad_norm": 6.985358715057373, "learning_rate": 4.427544924893941e-05, "loss": 2.569799041748047, "memory(GiB)": 77.56, "step": 62615, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.437617 }, { "epoch": 2.6828327835139882, "grad_norm": 5.450619697570801, "learning_rate": 4.426876378084406e-05, "loss": 2.458124542236328, "memory(GiB)": 77.56, "step": 62620, "token_acc": 0.4745222929936306, "train_speed(iter/s)": 1.437634 }, { "epoch": 2.6830469988432375, "grad_norm": 6.4383931159973145, "learning_rate": 4.426207841657543e-05, "loss": 1.9995841979980469, "memory(GiB)": 77.56, "step": 62625, "token_acc": 0.5748031496062992, "train_speed(iter/s)": 1.437633 }, { "epoch": 2.6832612141724863, "grad_norm": 5.315460681915283, "learning_rate": 4.425539315625462e-05, "loss": 2.5346023559570314, "memory(GiB)": 77.56, "step": 62630, "token_acc": 0.45864661654135336, "train_speed(iter/s)": 1.437668 }, { "epoch": 2.683475429501735, "grad_norm": 5.402333736419678, "learning_rate": 4.424870800000276e-05, "loss": 2.337423324584961, "memory(GiB)": 77.56, "step": 62635, "token_acc": 0.5230125523012552, "train_speed(iter/s)": 1.437678 }, { "epoch": 2.6836896448309844, "grad_norm": 4.58209228515625, "learning_rate": 4.424202294794093e-05, "loss": 2.636511039733887, "memory(GiB)": 77.56, "step": 62640, "token_acc": 0.46794871794871795, "train_speed(iter/s)": 1.437677 }, { "epoch": 2.683903860160233, "grad_norm": 5.8528151512146, "learning_rate": 4.423533800019026e-05, "loss": 2.838317108154297, "memory(GiB)": 77.56, "step": 62645, "token_acc": 0.430635838150289, "train_speed(iter/s)": 1.437664 }, { "epoch": 2.684118075489482, "grad_norm": 7.27495813369751, "learning_rate": 4.422865315687187e-05, "loss": 2.3046009063720705, "memory(GiB)": 77.56, "step": 62650, "token_acc": 0.5239852398523985, "train_speed(iter/s)": 1.437682 }, { "epoch": 2.6843322908187313, "grad_norm": 5.070964336395264, "learning_rate": 4.4221968418106844e-05, "loss": 2.4182600021362304, "memory(GiB)": 77.56, "step": 62655, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.437696 }, { "epoch": 2.68454650614798, "grad_norm": 6.140015125274658, "learning_rate": 4.421528378401626e-05, "loss": 2.3498199462890623, "memory(GiB)": 77.56, "step": 62660, "token_acc": 0.4375, "train_speed(iter/s)": 1.437695 }, { "epoch": 2.684760721477229, "grad_norm": 4.644944667816162, "learning_rate": 4.420859925472125e-05, "loss": 2.1466753005981447, "memory(GiB)": 77.56, "step": 62665, "token_acc": 0.5344827586206896, "train_speed(iter/s)": 1.437726 }, { "epoch": 2.684974936806478, "grad_norm": 5.87173318862915, "learning_rate": 4.42019148303429e-05, "loss": 2.2788885116577147, "memory(GiB)": 77.56, "step": 62670, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.43773 }, { "epoch": 2.685189152135727, "grad_norm": 4.484790325164795, "learning_rate": 4.419523051100229e-05, "loss": 2.597868537902832, "memory(GiB)": 77.56, "step": 62675, "token_acc": 0.46397694524495675, "train_speed(iter/s)": 1.437754 }, { "epoch": 2.6854033674649758, "grad_norm": 5.3702712059021, "learning_rate": 4.418854629682053e-05, "loss": 2.550777816772461, "memory(GiB)": 77.56, "step": 62680, "token_acc": 0.44654088050314467, "train_speed(iter/s)": 1.437759 }, { "epoch": 2.685617582794225, "grad_norm": 6.263823986053467, "learning_rate": 4.41818621879187e-05, "loss": 2.556705665588379, "memory(GiB)": 77.56, "step": 62685, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.437774 }, { "epoch": 2.685831798123474, "grad_norm": 6.265450954437256, "learning_rate": 4.41751781844179e-05, "loss": 2.4886608123779297, "memory(GiB)": 77.56, "step": 62690, "token_acc": 0.5015015015015015, "train_speed(iter/s)": 1.437758 }, { "epoch": 2.6860460134527226, "grad_norm": 4.281060218811035, "learning_rate": 4.416849428643922e-05, "loss": 2.6439865112304686, "memory(GiB)": 77.56, "step": 62695, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437777 }, { "epoch": 2.686260228781972, "grad_norm": 5.494462013244629, "learning_rate": 4.416181049410372e-05, "loss": 2.4629398345947267, "memory(GiB)": 77.56, "step": 62700, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 1.437801 }, { "epoch": 2.6864744441112207, "grad_norm": 5.727767467498779, "learning_rate": 4.415512680753251e-05, "loss": 2.2903635025024416, "memory(GiB)": 77.56, "step": 62705, "token_acc": 0.5096525096525096, "train_speed(iter/s)": 1.437807 }, { "epoch": 2.6866886594404695, "grad_norm": 5.721604824066162, "learning_rate": 4.414844322684667e-05, "loss": 2.3975643157958983, "memory(GiB)": 77.56, "step": 62710, "token_acc": 0.4646840148698885, "train_speed(iter/s)": 1.437816 }, { "epoch": 2.686902874769719, "grad_norm": 5.7855305671691895, "learning_rate": 4.414175975216724e-05, "loss": 2.6450050354003904, "memory(GiB)": 77.56, "step": 62715, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.437826 }, { "epoch": 2.6871170900989676, "grad_norm": 6.601729393005371, "learning_rate": 4.413507638361534e-05, "loss": 2.285299873352051, "memory(GiB)": 77.56, "step": 62720, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 1.437821 }, { "epoch": 2.6873313054282164, "grad_norm": 5.127752304077148, "learning_rate": 4.412839312131204e-05, "loss": 2.51846866607666, "memory(GiB)": 77.56, "step": 62725, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.437827 }, { "epoch": 2.6875455207574657, "grad_norm": 5.674526214599609, "learning_rate": 4.41217099653784e-05, "loss": 2.241515350341797, "memory(GiB)": 77.56, "step": 62730, "token_acc": 0.5180327868852459, "train_speed(iter/s)": 1.437848 }, { "epoch": 2.6877597360867145, "grad_norm": 5.848512172698975, "learning_rate": 4.411502691593551e-05, "loss": 2.693959426879883, "memory(GiB)": 77.56, "step": 62735, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.437843 }, { "epoch": 2.6879739514159633, "grad_norm": 4.746672630310059, "learning_rate": 4.410834397310443e-05, "loss": 2.333148193359375, "memory(GiB)": 77.56, "step": 62740, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.437864 }, { "epoch": 2.6881881667452125, "grad_norm": 5.210576057434082, "learning_rate": 4.410166113700621e-05, "loss": 2.4908187866210936, "memory(GiB)": 77.56, "step": 62745, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.437813 }, { "epoch": 2.6884023820744614, "grad_norm": 7.094858169555664, "learning_rate": 4.4094978407761936e-05, "loss": 2.3643184661865235, "memory(GiB)": 77.56, "step": 62750, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.437833 }, { "epoch": 2.68861659740371, "grad_norm": 4.565588474273682, "learning_rate": 4.408829578549268e-05, "loss": 2.207187461853027, "memory(GiB)": 77.56, "step": 62755, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.437835 }, { "epoch": 2.6888308127329594, "grad_norm": 7.889364719390869, "learning_rate": 4.4081613270319476e-05, "loss": 2.643632698059082, "memory(GiB)": 77.56, "step": 62760, "token_acc": 0.46564885496183206, "train_speed(iter/s)": 1.43785 }, { "epoch": 2.6890450280622082, "grad_norm": 5.547224521636963, "learning_rate": 4.407493086236341e-05, "loss": 2.2921451568603515, "memory(GiB)": 77.56, "step": 62765, "token_acc": 0.464, "train_speed(iter/s)": 1.437825 }, { "epoch": 2.689259243391457, "grad_norm": 3.637024402618408, "learning_rate": 4.406824856174552e-05, "loss": 2.4423154830932616, "memory(GiB)": 77.56, "step": 62770, "token_acc": 0.49193548387096775, "train_speed(iter/s)": 1.437846 }, { "epoch": 2.6894734587207063, "grad_norm": 6.107388019561768, "learning_rate": 4.406156636858688e-05, "loss": 2.5040639877319335, "memory(GiB)": 77.56, "step": 62775, "token_acc": 0.447098976109215, "train_speed(iter/s)": 1.437846 }, { "epoch": 2.689687674049955, "grad_norm": 5.72420597076416, "learning_rate": 4.4054884283008534e-05, "loss": 2.4143774032592775, "memory(GiB)": 77.56, "step": 62780, "token_acc": 0.4813664596273292, "train_speed(iter/s)": 1.437827 }, { "epoch": 2.689901889379204, "grad_norm": 4.765916347503662, "learning_rate": 4.404820230513153e-05, "loss": 2.19256591796875, "memory(GiB)": 77.56, "step": 62785, "token_acc": 0.5038461538461538, "train_speed(iter/s)": 1.437845 }, { "epoch": 2.690116104708453, "grad_norm": 5.38768196105957, "learning_rate": 4.404152043507692e-05, "loss": 2.513363075256348, "memory(GiB)": 77.56, "step": 62790, "token_acc": 0.4673202614379085, "train_speed(iter/s)": 1.437861 }, { "epoch": 2.690330320037702, "grad_norm": 4.33722448348999, "learning_rate": 4.4034838672965764e-05, "loss": 2.576100730895996, "memory(GiB)": 77.56, "step": 62795, "token_acc": 0.48125, "train_speed(iter/s)": 1.437878 }, { "epoch": 2.690544535366951, "grad_norm": 5.493600845336914, "learning_rate": 4.4028157018919106e-05, "loss": 2.598653793334961, "memory(GiB)": 77.56, "step": 62800, "token_acc": 0.4637223974763407, "train_speed(iter/s)": 1.437866 }, { "epoch": 2.6907587506962, "grad_norm": 5.1063690185546875, "learning_rate": 4.4021475473057984e-05, "loss": 2.5322696685791017, "memory(GiB)": 77.56, "step": 62805, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.437873 }, { "epoch": 2.690972966025449, "grad_norm": 4.514760494232178, "learning_rate": 4.401479403550344e-05, "loss": 2.22237606048584, "memory(GiB)": 77.56, "step": 62810, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.437885 }, { "epoch": 2.6911871813546977, "grad_norm": 4.8120856285095215, "learning_rate": 4.400811270637652e-05, "loss": 2.5820423126220704, "memory(GiB)": 77.56, "step": 62815, "token_acc": 0.45, "train_speed(iter/s)": 1.437889 }, { "epoch": 2.691401396683947, "grad_norm": 5.443692207336426, "learning_rate": 4.400143148579826e-05, "loss": 2.8748239517211913, "memory(GiB)": 77.56, "step": 62820, "token_acc": 0.43973941368078173, "train_speed(iter/s)": 1.437895 }, { "epoch": 2.6916156120131958, "grad_norm": 5.166743755340576, "learning_rate": 4.399475037388969e-05, "loss": 2.3774322509765624, "memory(GiB)": 77.56, "step": 62825, "token_acc": 0.4778481012658228, "train_speed(iter/s)": 1.437895 }, { "epoch": 2.6918298273424446, "grad_norm": 5.580989837646484, "learning_rate": 4.398806937077185e-05, "loss": 2.3642290115356444, "memory(GiB)": 77.56, "step": 62830, "token_acc": 0.49050632911392406, "train_speed(iter/s)": 1.437918 }, { "epoch": 2.692044042671694, "grad_norm": 5.6126322746276855, "learning_rate": 4.398138847656578e-05, "loss": 2.585008239746094, "memory(GiB)": 77.56, "step": 62835, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.437934 }, { "epoch": 2.6922582580009427, "grad_norm": 6.069372177124023, "learning_rate": 4.3974707691392503e-05, "loss": 2.3440319061279298, "memory(GiB)": 77.56, "step": 62840, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.437941 }, { "epoch": 2.6924724733301915, "grad_norm": 4.513391494750977, "learning_rate": 4.396802701537304e-05, "loss": 2.3978302001953127, "memory(GiB)": 77.56, "step": 62845, "token_acc": 0.4784172661870504, "train_speed(iter/s)": 1.437948 }, { "epoch": 2.6926866886594407, "grad_norm": 6.225036144256592, "learning_rate": 4.396134644862844e-05, "loss": 2.280904769897461, "memory(GiB)": 77.56, "step": 62850, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.437944 }, { "epoch": 2.6929009039886895, "grad_norm": 4.990926742553711, "learning_rate": 4.395466599127971e-05, "loss": 2.6861850738525392, "memory(GiB)": 77.56, "step": 62855, "token_acc": 0.4805194805194805, "train_speed(iter/s)": 1.43798 }, { "epoch": 2.6931151193179383, "grad_norm": 5.945162296295166, "learning_rate": 4.3947985643447866e-05, "loss": 2.314472770690918, "memory(GiB)": 77.56, "step": 62860, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.43798 }, { "epoch": 2.6933293346471876, "grad_norm": 4.823639869689941, "learning_rate": 4.394130540525392e-05, "loss": 2.3924217224121094, "memory(GiB)": 77.56, "step": 62865, "token_acc": 0.4984423676012461, "train_speed(iter/s)": 1.43799 }, { "epoch": 2.6935435499764364, "grad_norm": 4.460719108581543, "learning_rate": 4.393462527681894e-05, "loss": 2.4931190490722654, "memory(GiB)": 77.56, "step": 62870, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.437978 }, { "epoch": 2.6937577653056852, "grad_norm": 5.305558681488037, "learning_rate": 4.39279452582639e-05, "loss": 2.4298730850219727, "memory(GiB)": 77.56, "step": 62875, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.437968 }, { "epoch": 2.6939719806349345, "grad_norm": 4.590268611907959, "learning_rate": 4.3921265349709844e-05, "loss": 2.3436609268188477, "memory(GiB)": 77.56, "step": 62880, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.437967 }, { "epoch": 2.6941861959641833, "grad_norm": 5.058292865753174, "learning_rate": 4.391458555127777e-05, "loss": 2.3872493743896483, "memory(GiB)": 77.56, "step": 62885, "token_acc": 0.4855072463768116, "train_speed(iter/s)": 1.437951 }, { "epoch": 2.694400411293432, "grad_norm": 9.457488059997559, "learning_rate": 4.390790586308867e-05, "loss": 2.352023696899414, "memory(GiB)": 77.56, "step": 62890, "token_acc": 0.5131578947368421, "train_speed(iter/s)": 1.437975 }, { "epoch": 2.6946146266226814, "grad_norm": 6.278934001922607, "learning_rate": 4.390122628526358e-05, "loss": 2.2947227478027346, "memory(GiB)": 77.56, "step": 62895, "token_acc": 0.48757763975155277, "train_speed(iter/s)": 1.437988 }, { "epoch": 2.69482884195193, "grad_norm": 5.026858806610107, "learning_rate": 4.38945468179235e-05, "loss": 2.3993967056274412, "memory(GiB)": 77.56, "step": 62900, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.437997 }, { "epoch": 2.695043057281179, "grad_norm": 7.493849754333496, "learning_rate": 4.3887867461189416e-05, "loss": 2.624139976501465, "memory(GiB)": 77.56, "step": 62905, "token_acc": 0.47876447876447875, "train_speed(iter/s)": 1.438004 }, { "epoch": 2.6952572726104282, "grad_norm": 7.232560634613037, "learning_rate": 4.388118821518236e-05, "loss": 2.421573829650879, "memory(GiB)": 77.56, "step": 62910, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.438039 }, { "epoch": 2.695471487939677, "grad_norm": 5.298771381378174, "learning_rate": 4.3874509080023315e-05, "loss": 2.3434898376464846, "memory(GiB)": 77.56, "step": 62915, "token_acc": 0.5159010600706714, "train_speed(iter/s)": 1.438033 }, { "epoch": 2.695685703268926, "grad_norm": 5.103635787963867, "learning_rate": 4.3867830055833284e-05, "loss": 2.5847805023193358, "memory(GiB)": 77.56, "step": 62920, "token_acc": 0.48286604361370716, "train_speed(iter/s)": 1.43805 }, { "epoch": 2.695899918598175, "grad_norm": 6.296849250793457, "learning_rate": 4.386115114273328e-05, "loss": 2.135004234313965, "memory(GiB)": 77.56, "step": 62925, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438024 }, { "epoch": 2.696114133927424, "grad_norm": 4.00954008102417, "learning_rate": 4.385447234084426e-05, "loss": 2.176651191711426, "memory(GiB)": 77.56, "step": 62930, "token_acc": 0.5444444444444444, "train_speed(iter/s)": 1.438044 }, { "epoch": 2.6963283492566728, "grad_norm": 5.4145121574401855, "learning_rate": 4.384779365028722e-05, "loss": 2.4199140548706053, "memory(GiB)": 77.56, "step": 62935, "token_acc": 0.49642857142857144, "train_speed(iter/s)": 1.438062 }, { "epoch": 2.696542564585922, "grad_norm": 6.146902561187744, "learning_rate": 4.38411150711832e-05, "loss": 2.628597640991211, "memory(GiB)": 77.56, "step": 62940, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.438066 }, { "epoch": 2.696756779915171, "grad_norm": 4.526706695556641, "learning_rate": 4.383443660365316e-05, "loss": 2.359404754638672, "memory(GiB)": 77.56, "step": 62945, "token_acc": 0.4707692307692308, "train_speed(iter/s)": 1.438043 }, { "epoch": 2.6969709952444196, "grad_norm": 5.5696821212768555, "learning_rate": 4.3827758247818075e-05, "loss": 2.440283012390137, "memory(GiB)": 77.56, "step": 62950, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.438023 }, { "epoch": 2.697185210573669, "grad_norm": 5.297787666320801, "learning_rate": 4.382108000379894e-05, "loss": 2.4412647247314454, "memory(GiB)": 77.56, "step": 62955, "token_acc": 0.5095057034220533, "train_speed(iter/s)": 1.437998 }, { "epoch": 2.6973994259029177, "grad_norm": 5.234545707702637, "learning_rate": 4.381440187171675e-05, "loss": 2.5680448532104494, "memory(GiB)": 77.56, "step": 62960, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.438022 }, { "epoch": 2.6976136412321665, "grad_norm": 5.247034549713135, "learning_rate": 4.380772385169245e-05, "loss": 2.402149963378906, "memory(GiB)": 77.56, "step": 62965, "token_acc": 0.4703703703703704, "train_speed(iter/s)": 1.438046 }, { "epoch": 2.6978278565614158, "grad_norm": 6.02849817276001, "learning_rate": 4.3801045943847064e-05, "loss": 2.156708526611328, "memory(GiB)": 77.56, "step": 62970, "token_acc": 0.5354609929078015, "train_speed(iter/s)": 1.438067 }, { "epoch": 2.6980420718906646, "grad_norm": 5.664326190948486, "learning_rate": 4.3794368148301525e-05, "loss": 2.359878730773926, "memory(GiB)": 77.56, "step": 62975, "token_acc": 0.48036253776435045, "train_speed(iter/s)": 1.438051 }, { "epoch": 2.6982562872199134, "grad_norm": 5.418706893920898, "learning_rate": 4.378769046517685e-05, "loss": 2.3388364791870115, "memory(GiB)": 77.56, "step": 62980, "token_acc": 0.5, "train_speed(iter/s)": 1.438044 }, { "epoch": 2.6984705025491627, "grad_norm": 5.228435039520264, "learning_rate": 4.3781012894593975e-05, "loss": 2.275046157836914, "memory(GiB)": 77.56, "step": 62985, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.438074 }, { "epoch": 2.6986847178784115, "grad_norm": 6.260488510131836, "learning_rate": 4.377433543667388e-05, "loss": 2.421467590332031, "memory(GiB)": 77.56, "step": 62990, "token_acc": 0.5175718849840255, "train_speed(iter/s)": 1.438073 }, { "epoch": 2.6988989332076603, "grad_norm": 5.846724987030029, "learning_rate": 4.376765809153755e-05, "loss": 2.5110095977783202, "memory(GiB)": 77.56, "step": 62995, "token_acc": 0.4748427672955975, "train_speed(iter/s)": 1.438091 }, { "epoch": 2.6991131485369095, "grad_norm": 5.498667240142822, "learning_rate": 4.376098085930594e-05, "loss": 2.3920913696289063, "memory(GiB)": 77.56, "step": 63000, "token_acc": 0.4968553459119497, "train_speed(iter/s)": 1.438087 }, { "epoch": 2.6991131485369095, "eval_loss": 2.1162877082824707, "eval_runtime": 13.5205, "eval_samples_per_second": 7.396, "eval_steps_per_second": 7.396, "eval_token_acc": 0.47831632653061223, "step": 63000 }, { "epoch": 2.6993273638661583, "grad_norm": 8.477178573608398, "learning_rate": 4.375430374010001e-05, "loss": 2.34906005859375, "memory(GiB)": 77.56, "step": 63005, "token_acc": 0.491362763915547, "train_speed(iter/s)": 1.437637 }, { "epoch": 2.699541579195407, "grad_norm": 5.616421222686768, "learning_rate": 4.3747626734040716e-05, "loss": 2.306650924682617, "memory(GiB)": 77.56, "step": 63010, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.437622 }, { "epoch": 2.6997557945246564, "grad_norm": 3.6965794563293457, "learning_rate": 4.374094984124904e-05, "loss": 2.4355173110961914, "memory(GiB)": 77.56, "step": 63015, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.437606 }, { "epoch": 2.6999700098539052, "grad_norm": 5.530742168426514, "learning_rate": 4.3734273061845926e-05, "loss": 2.4598361968994142, "memory(GiB)": 77.56, "step": 63020, "token_acc": 0.5095541401273885, "train_speed(iter/s)": 1.437609 }, { "epoch": 2.700184225183154, "grad_norm": 6.5386834144592285, "learning_rate": 4.372759639595234e-05, "loss": 2.47520751953125, "memory(GiB)": 77.56, "step": 63025, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.437607 }, { "epoch": 2.7003984405124033, "grad_norm": 4.548661231994629, "learning_rate": 4.3720919843689236e-05, "loss": 2.329376220703125, "memory(GiB)": 77.56, "step": 63030, "token_acc": 0.5214285714285715, "train_speed(iter/s)": 1.437605 }, { "epoch": 2.700612655841652, "grad_norm": 4.839725494384766, "learning_rate": 4.371424340517754e-05, "loss": 2.5615768432617188, "memory(GiB)": 77.56, "step": 63035, "token_acc": 0.4638888888888889, "train_speed(iter/s)": 1.437604 }, { "epoch": 2.700826871170901, "grad_norm": 5.1556267738342285, "learning_rate": 4.3707567080538235e-05, "loss": 2.4451314926147463, "memory(GiB)": 77.56, "step": 63040, "token_acc": 0.5, "train_speed(iter/s)": 1.437633 }, { "epoch": 2.70104108650015, "grad_norm": 6.217810153961182, "learning_rate": 4.370089086989225e-05, "loss": 2.330446243286133, "memory(GiB)": 77.56, "step": 63045, "token_acc": 0.512280701754386, "train_speed(iter/s)": 1.437637 }, { "epoch": 2.701255301829399, "grad_norm": 4.29569149017334, "learning_rate": 4.369421477336054e-05, "loss": 2.2924964904785154, "memory(GiB)": 77.56, "step": 63050, "token_acc": 0.5145631067961165, "train_speed(iter/s)": 1.437647 }, { "epoch": 2.701469517158648, "grad_norm": 4.126863956451416, "learning_rate": 4.368753879106404e-05, "loss": 2.3577011108398436, "memory(GiB)": 77.56, "step": 63055, "token_acc": 0.4790996784565916, "train_speed(iter/s)": 1.437621 }, { "epoch": 2.701683732487897, "grad_norm": 5.330383777618408, "learning_rate": 4.368086292312369e-05, "loss": 2.2391632080078123, "memory(GiB)": 77.56, "step": 63060, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.43764 }, { "epoch": 2.701897947817146, "grad_norm": 5.941152095794678, "learning_rate": 4.367418716966045e-05, "loss": 2.3233346939086914, "memory(GiB)": 77.56, "step": 63065, "token_acc": 0.5346153846153846, "train_speed(iter/s)": 1.43761 }, { "epoch": 2.7021121631463947, "grad_norm": 4.539067268371582, "learning_rate": 4.366751153079525e-05, "loss": 2.4829776763916014, "memory(GiB)": 77.56, "step": 63070, "token_acc": 0.4592274678111588, "train_speed(iter/s)": 1.437618 }, { "epoch": 2.702326378475644, "grad_norm": 5.581842422485352, "learning_rate": 4.3660836006649e-05, "loss": 2.586964797973633, "memory(GiB)": 77.56, "step": 63075, "token_acc": 0.453125, "train_speed(iter/s)": 1.437636 }, { "epoch": 2.7025405938048928, "grad_norm": 5.822049140930176, "learning_rate": 4.365416059734266e-05, "loss": 2.448215675354004, "memory(GiB)": 77.56, "step": 63080, "token_acc": 0.48, "train_speed(iter/s)": 1.437648 }, { "epoch": 2.7027548091341416, "grad_norm": 5.618698596954346, "learning_rate": 4.364748530299714e-05, "loss": 2.464839744567871, "memory(GiB)": 77.56, "step": 63085, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437648 }, { "epoch": 2.702969024463391, "grad_norm": 6.2707390785217285, "learning_rate": 4.364081012373339e-05, "loss": 2.436688041687012, "memory(GiB)": 77.56, "step": 63090, "token_acc": 0.4828571428571429, "train_speed(iter/s)": 1.437654 }, { "epoch": 2.7031832397926396, "grad_norm": 5.044200420379639, "learning_rate": 4.363413505967233e-05, "loss": 2.5676612854003906, "memory(GiB)": 77.56, "step": 63095, "token_acc": 0.4557377049180328, "train_speed(iter/s)": 1.437667 }, { "epoch": 2.7033974551218884, "grad_norm": 5.146721839904785, "learning_rate": 4.36274601109349e-05, "loss": 2.6601850509643556, "memory(GiB)": 77.56, "step": 63100, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.437633 }, { "epoch": 2.7036116704511377, "grad_norm": 6.288225173950195, "learning_rate": 4.3620785277642004e-05, "loss": 2.7035140991210938, "memory(GiB)": 77.56, "step": 63105, "token_acc": 0.44155844155844154, "train_speed(iter/s)": 1.43764 }, { "epoch": 2.7038258857803865, "grad_norm": 5.237624168395996, "learning_rate": 4.3614110559914555e-05, "loss": 2.430165481567383, "memory(GiB)": 77.56, "step": 63110, "token_acc": 0.5209003215434084, "train_speed(iter/s)": 1.437623 }, { "epoch": 2.7040401011096353, "grad_norm": 6.375870704650879, "learning_rate": 4.360743595787349e-05, "loss": 2.487293243408203, "memory(GiB)": 77.56, "step": 63115, "token_acc": 0.48632218844984804, "train_speed(iter/s)": 1.437615 }, { "epoch": 2.7042543164388846, "grad_norm": 6.199286460876465, "learning_rate": 4.360076147163972e-05, "loss": 2.5808658599853516, "memory(GiB)": 77.56, "step": 63120, "token_acc": 0.4533762057877814, "train_speed(iter/s)": 1.43763 }, { "epoch": 2.7044685317681334, "grad_norm": 5.016519069671631, "learning_rate": 4.3594087101334164e-05, "loss": 2.2785139083862305, "memory(GiB)": 77.56, "step": 63125, "token_acc": 0.517799352750809, "train_speed(iter/s)": 1.437596 }, { "epoch": 2.704682747097382, "grad_norm": 5.4700117111206055, "learning_rate": 4.3587412847077726e-05, "loss": 2.3348194122314454, "memory(GiB)": 77.56, "step": 63130, "token_acc": 0.5053763440860215, "train_speed(iter/s)": 1.437598 }, { "epoch": 2.7048969624266315, "grad_norm": 5.129421710968018, "learning_rate": 4.358073870899131e-05, "loss": 2.6222465515136717, "memory(GiB)": 77.56, "step": 63135, "token_acc": 0.47904191616766467, "train_speed(iter/s)": 1.437593 }, { "epoch": 2.7051111777558803, "grad_norm": 7.872922420501709, "learning_rate": 4.3574064687195846e-05, "loss": 2.609187698364258, "memory(GiB)": 77.56, "step": 63140, "token_acc": 0.47653429602888087, "train_speed(iter/s)": 1.437609 }, { "epoch": 2.705325393085129, "grad_norm": 5.309908390045166, "learning_rate": 4.356739078181223e-05, "loss": 2.4413629531860352, "memory(GiB)": 77.56, "step": 63145, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.437568 }, { "epoch": 2.7055396084143783, "grad_norm": 5.252475738525391, "learning_rate": 4.356071699296135e-05, "loss": 2.2228572845458983, "memory(GiB)": 77.56, "step": 63150, "token_acc": 0.5584905660377358, "train_speed(iter/s)": 1.437575 }, { "epoch": 2.705753823743627, "grad_norm": 6.663409233093262, "learning_rate": 4.3554043320764134e-05, "loss": 2.2703807830810545, "memory(GiB)": 77.56, "step": 63155, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.437585 }, { "epoch": 2.705968039072876, "grad_norm": 5.493904113769531, "learning_rate": 4.354736976534145e-05, "loss": 2.488634490966797, "memory(GiB)": 77.56, "step": 63160, "token_acc": 0.47794117647058826, "train_speed(iter/s)": 1.437586 }, { "epoch": 2.7061822544021252, "grad_norm": 6.335967063903809, "learning_rate": 4.354069632681423e-05, "loss": 2.3626171112060548, "memory(GiB)": 77.56, "step": 63165, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.43762 }, { "epoch": 2.706396469731374, "grad_norm": 7.072081089019775, "learning_rate": 4.353402300530336e-05, "loss": 2.2816446304321287, "memory(GiB)": 77.56, "step": 63170, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.437627 }, { "epoch": 2.706610685060623, "grad_norm": 4.870462894439697, "learning_rate": 4.352734980092973e-05, "loss": 2.2714717864990233, "memory(GiB)": 77.56, "step": 63175, "token_acc": 0.48, "train_speed(iter/s)": 1.437607 }, { "epoch": 2.706824900389872, "grad_norm": 5.844668388366699, "learning_rate": 4.352067671381422e-05, "loss": 2.396867561340332, "memory(GiB)": 77.56, "step": 63180, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.437596 }, { "epoch": 2.707039115719121, "grad_norm": 4.8983283042907715, "learning_rate": 4.3514003744077745e-05, "loss": 2.678304672241211, "memory(GiB)": 77.56, "step": 63185, "token_acc": 0.45224719101123595, "train_speed(iter/s)": 1.437568 }, { "epoch": 2.7072533310483697, "grad_norm": 7.347422122955322, "learning_rate": 4.350733089184117e-05, "loss": 2.2833236694335937, "memory(GiB)": 77.56, "step": 63190, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.437559 }, { "epoch": 2.707467546377619, "grad_norm": 7.245707988739014, "learning_rate": 4.3500658157225375e-05, "loss": 2.4425127029418947, "memory(GiB)": 77.56, "step": 63195, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.437576 }, { "epoch": 2.707681761706868, "grad_norm": 5.923473358154297, "learning_rate": 4.3493985540351265e-05, "loss": 2.3449058532714844, "memory(GiB)": 77.56, "step": 63200, "token_acc": 0.49712643678160917, "train_speed(iter/s)": 1.437592 }, { "epoch": 2.7078959770361166, "grad_norm": 6.155705451965332, "learning_rate": 4.34873130413397e-05, "loss": 2.5253669738769533, "memory(GiB)": 77.56, "step": 63205, "token_acc": 0.4519230769230769, "train_speed(iter/s)": 1.437598 }, { "epoch": 2.708110192365366, "grad_norm": 6.043651580810547, "learning_rate": 4.348064066031159e-05, "loss": 2.5645116806030273, "memory(GiB)": 77.56, "step": 63210, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.437607 }, { "epoch": 2.7083244076946147, "grad_norm": 5.626470565795898, "learning_rate": 4.3473968397387774e-05, "loss": 2.606903839111328, "memory(GiB)": 77.56, "step": 63215, "token_acc": 0.44755244755244755, "train_speed(iter/s)": 1.437589 }, { "epoch": 2.7085386230238635, "grad_norm": 4.792459487915039, "learning_rate": 4.3467296252689144e-05, "loss": 2.524639892578125, "memory(GiB)": 77.56, "step": 63220, "token_acc": 0.4231974921630094, "train_speed(iter/s)": 1.437581 }, { "epoch": 2.7087528383531128, "grad_norm": 5.940973281860352, "learning_rate": 4.3460624226336576e-05, "loss": 2.463256072998047, "memory(GiB)": 77.56, "step": 63225, "token_acc": 0.4849624060150376, "train_speed(iter/s)": 1.437566 }, { "epoch": 2.7089670536823616, "grad_norm": 6.710156440734863, "learning_rate": 4.345395231845094e-05, "loss": 2.6621768951416014, "memory(GiB)": 77.56, "step": 63230, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.437593 }, { "epoch": 2.7091812690116104, "grad_norm": 7.585979461669922, "learning_rate": 4.344728052915307e-05, "loss": 2.262676239013672, "memory(GiB)": 77.56, "step": 63235, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.43758 }, { "epoch": 2.7093954843408596, "grad_norm": 5.6849236488342285, "learning_rate": 4.344060885856387e-05, "loss": 2.1216243743896483, "memory(GiB)": 77.56, "step": 63240, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 1.437587 }, { "epoch": 2.7096096996701085, "grad_norm": 4.63074254989624, "learning_rate": 4.343393730680421e-05, "loss": 2.4104248046875, "memory(GiB)": 77.56, "step": 63245, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.437582 }, { "epoch": 2.7098239149993573, "grad_norm": 6.267821788787842, "learning_rate": 4.3427265873994935e-05, "loss": 2.1667816162109377, "memory(GiB)": 77.56, "step": 63250, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.43757 }, { "epoch": 2.7100381303286065, "grad_norm": 5.555212497711182, "learning_rate": 4.342059456025689e-05, "loss": 2.2661861419677733, "memory(GiB)": 77.56, "step": 63255, "token_acc": 0.5, "train_speed(iter/s)": 1.437572 }, { "epoch": 2.7102523456578553, "grad_norm": 5.554281234741211, "learning_rate": 4.341392336571096e-05, "loss": 2.500975799560547, "memory(GiB)": 77.56, "step": 63260, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.4376 }, { "epoch": 2.710466560987104, "grad_norm": 5.827896595001221, "learning_rate": 4.340725229047797e-05, "loss": 2.834640693664551, "memory(GiB)": 77.56, "step": 63265, "token_acc": 0.431438127090301, "train_speed(iter/s)": 1.437635 }, { "epoch": 2.7106807763163534, "grad_norm": 4.751279830932617, "learning_rate": 4.3400581334678805e-05, "loss": 2.234446334838867, "memory(GiB)": 77.56, "step": 63270, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.437607 }, { "epoch": 2.710894991645602, "grad_norm": 4.170665740966797, "learning_rate": 4.33939104984343e-05, "loss": 2.3763450622558593, "memory(GiB)": 77.56, "step": 63275, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.437625 }, { "epoch": 2.711109206974851, "grad_norm": 7.812934875488281, "learning_rate": 4.338723978186529e-05, "loss": 2.5575469970703124, "memory(GiB)": 77.56, "step": 63280, "token_acc": 0.4558303886925795, "train_speed(iter/s)": 1.437636 }, { "epoch": 2.7113234223041003, "grad_norm": 5.3547492027282715, "learning_rate": 4.338056918509265e-05, "loss": 2.553681182861328, "memory(GiB)": 77.56, "step": 63285, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.437619 }, { "epoch": 2.711537637633349, "grad_norm": 4.910192966461182, "learning_rate": 4.33738987082372e-05, "loss": 2.466324234008789, "memory(GiB)": 77.56, "step": 63290, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.437619 }, { "epoch": 2.711751852962598, "grad_norm": 4.390270709991455, "learning_rate": 4.336722835141979e-05, "loss": 2.21740779876709, "memory(GiB)": 77.56, "step": 63295, "token_acc": 0.5444444444444444, "train_speed(iter/s)": 1.43762 }, { "epoch": 2.711966068291847, "grad_norm": 4.178674697875977, "learning_rate": 4.3360558114761266e-05, "loss": 2.0102195739746094, "memory(GiB)": 77.56, "step": 63300, "token_acc": 0.5701754385964912, "train_speed(iter/s)": 1.437638 }, { "epoch": 2.712180283621096, "grad_norm": 5.572151184082031, "learning_rate": 4.3353887998382444e-05, "loss": 2.4363922119140624, "memory(GiB)": 77.56, "step": 63305, "token_acc": 0.4861878453038674, "train_speed(iter/s)": 1.437632 }, { "epoch": 2.712394498950345, "grad_norm": 7.255897521972656, "learning_rate": 4.334721800240418e-05, "loss": 2.480462837219238, "memory(GiB)": 77.56, "step": 63310, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.437642 }, { "epoch": 2.712608714279594, "grad_norm": 6.7720112800598145, "learning_rate": 4.3340548126947316e-05, "loss": 2.4891117095947264, "memory(GiB)": 77.56, "step": 63315, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.437643 }, { "epoch": 2.712822929608843, "grad_norm": 5.638824462890625, "learning_rate": 4.333387837213267e-05, "loss": 2.2209213256835936, "memory(GiB)": 77.56, "step": 63320, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 1.43763 }, { "epoch": 2.7130371449380917, "grad_norm": 5.645864009857178, "learning_rate": 4.332720873808106e-05, "loss": 2.323262023925781, "memory(GiB)": 77.56, "step": 63325, "token_acc": 0.4921259842519685, "train_speed(iter/s)": 1.437621 }, { "epoch": 2.713251360267341, "grad_norm": 5.4597063064575195, "learning_rate": 4.332053922491333e-05, "loss": 2.7017515182495115, "memory(GiB)": 77.56, "step": 63330, "token_acc": 0.43272727272727274, "train_speed(iter/s)": 1.437647 }, { "epoch": 2.7134655755965897, "grad_norm": 6.1688313484191895, "learning_rate": 4.33138698327503e-05, "loss": 2.5099727630615236, "memory(GiB)": 77.56, "step": 63335, "token_acc": 0.48026315789473684, "train_speed(iter/s)": 1.437634 }, { "epoch": 2.7136797909258386, "grad_norm": 5.5754313468933105, "learning_rate": 4.3307200561712777e-05, "loss": 2.4527713775634767, "memory(GiB)": 77.56, "step": 63340, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.437646 }, { "epoch": 2.713894006255088, "grad_norm": 6.005540370941162, "learning_rate": 4.330053141192161e-05, "loss": 2.424937438964844, "memory(GiB)": 77.56, "step": 63345, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 1.437629 }, { "epoch": 2.7141082215843366, "grad_norm": 4.311441421508789, "learning_rate": 4.3293862383497594e-05, "loss": 2.2430381774902344, "memory(GiB)": 77.56, "step": 63350, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 1.437631 }, { "epoch": 2.7143224369135854, "grad_norm": 5.606351852416992, "learning_rate": 4.3287193476561555e-05, "loss": 2.2092987060546876, "memory(GiB)": 77.56, "step": 63355, "token_acc": 0.4980392156862745, "train_speed(iter/s)": 1.437642 }, { "epoch": 2.7145366522428347, "grad_norm": 7.537847995758057, "learning_rate": 4.32805246912343e-05, "loss": 2.292195129394531, "memory(GiB)": 77.56, "step": 63360, "token_acc": 0.47720364741641336, "train_speed(iter/s)": 1.43765 }, { "epoch": 2.7147508675720835, "grad_norm": 6.258525848388672, "learning_rate": 4.327385602763664e-05, "loss": 2.2040334701538087, "memory(GiB)": 77.56, "step": 63365, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 1.437618 }, { "epoch": 2.7149650829013323, "grad_norm": 5.4620232582092285, "learning_rate": 4.32671874858894e-05, "loss": 2.3868431091308593, "memory(GiB)": 77.56, "step": 63370, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.437604 }, { "epoch": 2.7151792982305816, "grad_norm": 4.934838771820068, "learning_rate": 4.326051906611337e-05, "loss": 2.5465763092041014, "memory(GiB)": 77.56, "step": 63375, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.437633 }, { "epoch": 2.7153935135598304, "grad_norm": 5.326876163482666, "learning_rate": 4.325385076842934e-05, "loss": 2.3296792984008787, "memory(GiB)": 77.56, "step": 63380, "token_acc": 0.5215686274509804, "train_speed(iter/s)": 1.437648 }, { "epoch": 2.715607728889079, "grad_norm": 5.654313564300537, "learning_rate": 4.3247182592958136e-05, "loss": 2.5130685806274413, "memory(GiB)": 77.56, "step": 63385, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.437672 }, { "epoch": 2.7158219442183285, "grad_norm": 5.089864253997803, "learning_rate": 4.3240514539820574e-05, "loss": 2.4864452362060545, "memory(GiB)": 77.56, "step": 63390, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.437657 }, { "epoch": 2.7160361595475773, "grad_norm": 5.473681926727295, "learning_rate": 4.323384660913743e-05, "loss": 2.273955154418945, "memory(GiB)": 77.56, "step": 63395, "token_acc": 0.4962962962962963, "train_speed(iter/s)": 1.43765 }, { "epoch": 2.716250374876826, "grad_norm": 6.059061050415039, "learning_rate": 4.322717880102949e-05, "loss": 2.6015626907348635, "memory(GiB)": 77.56, "step": 63400, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.437651 }, { "epoch": 2.7164645902060753, "grad_norm": 4.939112663269043, "learning_rate": 4.3220511115617565e-05, "loss": 2.588299369812012, "memory(GiB)": 77.56, "step": 63405, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.437652 }, { "epoch": 2.716678805535324, "grad_norm": 6.9835662841796875, "learning_rate": 4.321384355302244e-05, "loss": 2.371767044067383, "memory(GiB)": 77.56, "step": 63410, "token_acc": 0.4779116465863454, "train_speed(iter/s)": 1.43764 }, { "epoch": 2.716893020864573, "grad_norm": 4.72863245010376, "learning_rate": 4.320717611336491e-05, "loss": 2.433430290222168, "memory(GiB)": 77.56, "step": 63415, "token_acc": 0.4670846394984326, "train_speed(iter/s)": 1.437638 }, { "epoch": 2.717107236193822, "grad_norm": 5.178319931030273, "learning_rate": 4.320050879676575e-05, "loss": 2.5596729278564454, "memory(GiB)": 77.56, "step": 63420, "token_acc": 0.48286604361370716, "train_speed(iter/s)": 1.437651 }, { "epoch": 2.717321451523071, "grad_norm": 6.3593339920043945, "learning_rate": 4.3193841603345755e-05, "loss": 2.3459300994873047, "memory(GiB)": 77.56, "step": 63425, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.71753566685232, "grad_norm": 5.314087867736816, "learning_rate": 4.318717453322571e-05, "loss": 2.1478271484375, "memory(GiB)": 77.56, "step": 63430, "token_acc": 0.5403225806451613, "train_speed(iter/s)": 1.437715 }, { "epoch": 2.717749882181569, "grad_norm": 5.62764835357666, "learning_rate": 4.318050758652638e-05, "loss": 2.523588180541992, "memory(GiB)": 77.56, "step": 63435, "token_acc": 0.47896440129449835, "train_speed(iter/s)": 1.437721 }, { "epoch": 2.717964097510818, "grad_norm": 6.044641971588135, "learning_rate": 4.317384076336855e-05, "loss": 2.4947792053222657, "memory(GiB)": 77.56, "step": 63440, "token_acc": 0.49240121580547114, "train_speed(iter/s)": 1.437709 }, { "epoch": 2.7181783128400667, "grad_norm": 5.518644332885742, "learning_rate": 4.3167174063873004e-05, "loss": 2.3688568115234374, "memory(GiB)": 77.56, "step": 63445, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.43771 }, { "epoch": 2.718392528169316, "grad_norm": 5.159210205078125, "learning_rate": 4.3160507488160504e-05, "loss": 2.248431396484375, "memory(GiB)": 77.56, "step": 63450, "token_acc": 0.5032894736842105, "train_speed(iter/s)": 1.437703 }, { "epoch": 2.718606743498565, "grad_norm": 5.272059440612793, "learning_rate": 4.3153841036351814e-05, "loss": 2.185207176208496, "memory(GiB)": 77.56, "step": 63455, "token_acc": 0.5084033613445378, "train_speed(iter/s)": 1.437699 }, { "epoch": 2.7188209588278136, "grad_norm": 4.720504283905029, "learning_rate": 4.314717470856774e-05, "loss": 2.5009529113769533, "memory(GiB)": 77.56, "step": 63460, "token_acc": 0.5057915057915058, "train_speed(iter/s)": 1.437717 }, { "epoch": 2.719035174157063, "grad_norm": 4.2666707038879395, "learning_rate": 4.314050850492902e-05, "loss": 2.318477249145508, "memory(GiB)": 77.56, "step": 63465, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.437745 }, { "epoch": 2.7192493894863117, "grad_norm": 9.12672233581543, "learning_rate": 4.313384242555641e-05, "loss": 2.3794715881347654, "memory(GiB)": 77.56, "step": 63470, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.437725 }, { "epoch": 2.7194636048155605, "grad_norm": 5.946536064147949, "learning_rate": 4.31271764705707e-05, "loss": 2.5504430770874023, "memory(GiB)": 77.56, "step": 63475, "token_acc": 0.45384615384615384, "train_speed(iter/s)": 1.437724 }, { "epoch": 2.7196778201448097, "grad_norm": 4.6696391105651855, "learning_rate": 4.3120510640092636e-05, "loss": 2.590966987609863, "memory(GiB)": 77.56, "step": 63480, "token_acc": 0.44135802469135804, "train_speed(iter/s)": 1.437733 }, { "epoch": 2.7198920354740586, "grad_norm": 5.773374557495117, "learning_rate": 4.311384493424297e-05, "loss": 2.4556367874145506, "memory(GiB)": 77.56, "step": 63485, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.437748 }, { "epoch": 2.7201062508033074, "grad_norm": 6.427194118499756, "learning_rate": 4.310717935314247e-05, "loss": 2.672300910949707, "memory(GiB)": 77.56, "step": 63490, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.437748 }, { "epoch": 2.7203204661325566, "grad_norm": 8.000069618225098, "learning_rate": 4.3100513896911865e-05, "loss": 2.444048309326172, "memory(GiB)": 77.56, "step": 63495, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.437735 }, { "epoch": 2.7205346814618054, "grad_norm": 4.372310161590576, "learning_rate": 4.309384856567194e-05, "loss": 2.7593429565429686, "memory(GiB)": 77.56, "step": 63500, "token_acc": 0.4400871459694989, "train_speed(iter/s)": 1.437706 }, { "epoch": 2.7205346814618054, "eval_loss": 2.2863733768463135, "eval_runtime": 14.3387, "eval_samples_per_second": 6.974, "eval_steps_per_second": 6.974, "eval_token_acc": 0.47322540473225405, "step": 63500 }, { "epoch": 2.7207488967910542, "grad_norm": 5.951817512512207, "learning_rate": 4.3087183359543426e-05, "loss": 2.231358528137207, "memory(GiB)": 77.56, "step": 63505, "token_acc": 0.487758945386064, "train_speed(iter/s)": 1.43721 }, { "epoch": 2.7209631121203035, "grad_norm": 4.930519104003906, "learning_rate": 4.308051827864705e-05, "loss": 2.52340087890625, "memory(GiB)": 77.56, "step": 63510, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.437217 }, { "epoch": 2.7211773274495523, "grad_norm": 4.144283294677734, "learning_rate": 4.3073853323103604e-05, "loss": 2.7167423248291014, "memory(GiB)": 77.56, "step": 63515, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.437192 }, { "epoch": 2.721391542778801, "grad_norm": 4.923089504241943, "learning_rate": 4.3067188493033796e-05, "loss": 2.500599670410156, "memory(GiB)": 77.56, "step": 63520, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.437224 }, { "epoch": 2.7216057581080504, "grad_norm": 5.032780170440674, "learning_rate": 4.3060523788558355e-05, "loss": 2.7366724014282227, "memory(GiB)": 77.56, "step": 63525, "token_acc": 0.46325878594249204, "train_speed(iter/s)": 1.437226 }, { "epoch": 2.721819973437299, "grad_norm": 5.736341953277588, "learning_rate": 4.3053859209798025e-05, "loss": 2.6113428115844726, "memory(GiB)": 77.56, "step": 63530, "token_acc": 0.44876325088339225, "train_speed(iter/s)": 1.437223 }, { "epoch": 2.722034188766548, "grad_norm": 5.133331775665283, "learning_rate": 4.3047194756873575e-05, "loss": 2.786575508117676, "memory(GiB)": 77.56, "step": 63535, "token_acc": 0.43018867924528303, "train_speed(iter/s)": 1.437218 }, { "epoch": 2.7222484040957973, "grad_norm": 4.850512504577637, "learning_rate": 4.3040530429905715e-05, "loss": 2.3773895263671876, "memory(GiB)": 77.56, "step": 63540, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.437241 }, { "epoch": 2.722462619425046, "grad_norm": 4.3101348876953125, "learning_rate": 4.3033866229015166e-05, "loss": 2.4509647369384764, "memory(GiB)": 77.56, "step": 63545, "token_acc": 0.47645429362880887, "train_speed(iter/s)": 1.437226 }, { "epoch": 2.722676834754295, "grad_norm": 4.423937797546387, "learning_rate": 4.302720215432268e-05, "loss": 2.192299461364746, "memory(GiB)": 77.56, "step": 63550, "token_acc": 0.5490196078431373, "train_speed(iter/s)": 1.437213 }, { "epoch": 2.722891050083544, "grad_norm": 5.230189323425293, "learning_rate": 4.302053820594895e-05, "loss": 2.5490650177001952, "memory(GiB)": 77.56, "step": 63555, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 1.437241 }, { "epoch": 2.723105265412793, "grad_norm": 6.743096828460693, "learning_rate": 4.301387438401473e-05, "loss": 2.626717376708984, "memory(GiB)": 77.56, "step": 63560, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.437181 }, { "epoch": 2.7233194807420418, "grad_norm": 4.990501880645752, "learning_rate": 4.300721068864073e-05, "loss": 2.580089569091797, "memory(GiB)": 77.56, "step": 63565, "token_acc": 0.5062893081761006, "train_speed(iter/s)": 1.437205 }, { "epoch": 2.723533696071291, "grad_norm": 4.702508449554443, "learning_rate": 4.3000547119947656e-05, "loss": 2.6212514877319335, "memory(GiB)": 77.56, "step": 63570, "token_acc": 0.4434250764525994, "train_speed(iter/s)": 1.437221 }, { "epoch": 2.72374791140054, "grad_norm": 5.278894901275635, "learning_rate": 4.2993883678056246e-05, "loss": 2.4061717987060547, "memory(GiB)": 77.56, "step": 63575, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.437194 }, { "epoch": 2.7239621267297887, "grad_norm": 6.516336441040039, "learning_rate": 4.298722036308721e-05, "loss": 2.6168821334838865, "memory(GiB)": 77.56, "step": 63580, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.43723 }, { "epoch": 2.724176342059038, "grad_norm": 5.345509052276611, "learning_rate": 4.298055717516124e-05, "loss": 2.461959457397461, "memory(GiB)": 77.56, "step": 63585, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.437232 }, { "epoch": 2.7243905573882867, "grad_norm": 4.316685676574707, "learning_rate": 4.297389411439908e-05, "loss": 1.944647979736328, "memory(GiB)": 77.56, "step": 63590, "token_acc": 0.5584905660377358, "train_speed(iter/s)": 1.437216 }, { "epoch": 2.7246047727175355, "grad_norm": 5.405136585235596, "learning_rate": 4.2967231180921395e-05, "loss": 2.5211957931518554, "memory(GiB)": 77.56, "step": 63595, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.437242 }, { "epoch": 2.724818988046785, "grad_norm": 4.766284465789795, "learning_rate": 4.296056837484894e-05, "loss": 2.6702812194824217, "memory(GiB)": 77.56, "step": 63600, "token_acc": 0.43909348441926344, "train_speed(iter/s)": 1.437219 }, { "epoch": 2.7250332033760336, "grad_norm": 6.10042142868042, "learning_rate": 4.295390569630236e-05, "loss": 2.390985107421875, "memory(GiB)": 77.56, "step": 63605, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.437209 }, { "epoch": 2.7252474187052824, "grad_norm": 6.727659702301025, "learning_rate": 4.294724314540241e-05, "loss": 2.349884605407715, "memory(GiB)": 77.56, "step": 63610, "token_acc": 0.5302491103202847, "train_speed(iter/s)": 1.437209 }, { "epoch": 2.7254616340345317, "grad_norm": 6.050721168518066, "learning_rate": 4.294058072226976e-05, "loss": 2.5393613815307616, "memory(GiB)": 77.56, "step": 63615, "token_acc": 0.46131805157593125, "train_speed(iter/s)": 1.437223 }, { "epoch": 2.7256758493637805, "grad_norm": 7.031256675720215, "learning_rate": 4.293391842702513e-05, "loss": 2.5156442642211916, "memory(GiB)": 77.56, "step": 63620, "token_acc": 0.4148148148148148, "train_speed(iter/s)": 1.437222 }, { "epoch": 2.7258900646930293, "grad_norm": 6.383103370666504, "learning_rate": 4.2927256259789184e-05, "loss": 2.409423828125, "memory(GiB)": 77.56, "step": 63625, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.437234 }, { "epoch": 2.7261042800222786, "grad_norm": 6.660313606262207, "learning_rate": 4.292059422068262e-05, "loss": 2.379970359802246, "memory(GiB)": 77.56, "step": 63630, "token_acc": 0.48221343873517786, "train_speed(iter/s)": 1.437207 }, { "epoch": 2.7263184953515274, "grad_norm": 4.641581058502197, "learning_rate": 4.291393230982614e-05, "loss": 2.2225364685058593, "memory(GiB)": 77.56, "step": 63635, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.437208 }, { "epoch": 2.726532710680776, "grad_norm": 5.122830390930176, "learning_rate": 4.290727052734042e-05, "loss": 2.3288331985473634, "memory(GiB)": 77.56, "step": 63640, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.437214 }, { "epoch": 2.7267469260100254, "grad_norm": 4.4000067710876465, "learning_rate": 4.290060887334616e-05, "loss": 2.3637311935424803, "memory(GiB)": 77.56, "step": 63645, "token_acc": 0.4820359281437126, "train_speed(iter/s)": 1.437241 }, { "epoch": 2.7269611413392743, "grad_norm": 5.200026512145996, "learning_rate": 4.289394734796402e-05, "loss": 2.624894714355469, "memory(GiB)": 77.56, "step": 63650, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.437275 }, { "epoch": 2.727175356668523, "grad_norm": 5.261202812194824, "learning_rate": 4.288728595131469e-05, "loss": 2.2805709838867188, "memory(GiB)": 77.56, "step": 63655, "token_acc": 0.5222672064777328, "train_speed(iter/s)": 1.437291 }, { "epoch": 2.7273895719977723, "grad_norm": 4.581358909606934, "learning_rate": 4.2880624683518844e-05, "loss": 2.2480937957763674, "memory(GiB)": 77.56, "step": 63660, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.437323 }, { "epoch": 2.727603787327021, "grad_norm": 5.376256942749023, "learning_rate": 4.287396354469717e-05, "loss": 2.4637903213500976, "memory(GiB)": 77.56, "step": 63665, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.437346 }, { "epoch": 2.72781800265627, "grad_norm": 4.416100978851318, "learning_rate": 4.286730253497032e-05, "loss": 2.713575744628906, "memory(GiB)": 77.56, "step": 63670, "token_acc": 0.4495677233429395, "train_speed(iter/s)": 1.437357 }, { "epoch": 2.728032217985519, "grad_norm": 4.978917598724365, "learning_rate": 4.286064165445896e-05, "loss": 2.523843002319336, "memory(GiB)": 77.56, "step": 63675, "token_acc": 0.49382716049382713, "train_speed(iter/s)": 1.437365 }, { "epoch": 2.728246433314768, "grad_norm": 5.200705528259277, "learning_rate": 4.28539809032838e-05, "loss": 2.511026382446289, "memory(GiB)": 77.56, "step": 63680, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.437392 }, { "epoch": 2.728460648644017, "grad_norm": 5.517684459686279, "learning_rate": 4.284732028156548e-05, "loss": 2.3788078308105467, "memory(GiB)": 77.56, "step": 63685, "token_acc": 0.5042016806722689, "train_speed(iter/s)": 1.437421 }, { "epoch": 2.728674863973266, "grad_norm": 5.72015380859375, "learning_rate": 4.284065978942465e-05, "loss": 2.31542911529541, "memory(GiB)": 77.56, "step": 63690, "token_acc": 0.5066079295154186, "train_speed(iter/s)": 1.437442 }, { "epoch": 2.728889079302515, "grad_norm": 7.0121612548828125, "learning_rate": 4.2833999426982e-05, "loss": 2.4734703063964845, "memory(GiB)": 77.56, "step": 63695, "token_acc": 0.4735099337748344, "train_speed(iter/s)": 1.437462 }, { "epoch": 2.7291032946317637, "grad_norm": 5.633831977844238, "learning_rate": 4.282733919435815e-05, "loss": 2.3416704177856444, "memory(GiB)": 77.56, "step": 63700, "token_acc": 0.5046439628482973, "train_speed(iter/s)": 1.437484 }, { "epoch": 2.729317509961013, "grad_norm": 5.1880388259887695, "learning_rate": 4.28206790916738e-05, "loss": 2.6888330459594725, "memory(GiB)": 77.56, "step": 63705, "token_acc": 0.4628975265017668, "train_speed(iter/s)": 1.437462 }, { "epoch": 2.729531725290262, "grad_norm": 5.1609392166137695, "learning_rate": 4.281401911904958e-05, "loss": 2.3305335998535157, "memory(GiB)": 77.56, "step": 63710, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.437442 }, { "epoch": 2.7297459406195106, "grad_norm": 6.756219387054443, "learning_rate": 4.280735927660613e-05, "loss": 2.470112609863281, "memory(GiB)": 77.56, "step": 63715, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.437476 }, { "epoch": 2.72996015594876, "grad_norm": 5.60759973526001, "learning_rate": 4.2800699564464134e-05, "loss": 2.5540462493896485, "memory(GiB)": 77.56, "step": 63720, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.437495 }, { "epoch": 2.7301743712780087, "grad_norm": 4.942457675933838, "learning_rate": 4.279403998274421e-05, "loss": 2.2493419647216797, "memory(GiB)": 77.56, "step": 63725, "token_acc": 0.5082508250825083, "train_speed(iter/s)": 1.437519 }, { "epoch": 2.7303885866072575, "grad_norm": 6.366645812988281, "learning_rate": 4.2787380531567e-05, "loss": 2.4286148071289064, "memory(GiB)": 77.56, "step": 63730, "token_acc": 0.44891640866873067, "train_speed(iter/s)": 1.437498 }, { "epoch": 2.7306028019365067, "grad_norm": 5.511943817138672, "learning_rate": 4.278072121105318e-05, "loss": 2.598671722412109, "memory(GiB)": 77.56, "step": 63735, "token_acc": 0.4265232974910394, "train_speed(iter/s)": 1.437513 }, { "epoch": 2.7308170172657555, "grad_norm": 4.579773902893066, "learning_rate": 4.277406202132335e-05, "loss": 2.3932857513427734, "memory(GiB)": 77.56, "step": 63740, "token_acc": 0.5340909090909091, "train_speed(iter/s)": 1.437492 }, { "epoch": 2.7310312325950044, "grad_norm": 5.3524346351623535, "learning_rate": 4.276740296249817e-05, "loss": 2.3440357208251954, "memory(GiB)": 77.56, "step": 63745, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.437521 }, { "epoch": 2.7312454479242536, "grad_norm": 5.517590045928955, "learning_rate": 4.276074403469825e-05, "loss": 2.7323217391967773, "memory(GiB)": 77.56, "step": 63750, "token_acc": 0.44680851063829785, "train_speed(iter/s)": 1.43754 }, { "epoch": 2.7314596632535024, "grad_norm": 5.0154314041137695, "learning_rate": 4.275408523804427e-05, "loss": 2.7700546264648436, "memory(GiB)": 77.56, "step": 63755, "token_acc": 0.44410876132930516, "train_speed(iter/s)": 1.437554 }, { "epoch": 2.7316738785827512, "grad_norm": 5.464852333068848, "learning_rate": 4.274742657265682e-05, "loss": 2.411619186401367, "memory(GiB)": 77.56, "step": 63760, "token_acc": 0.45075757575757575, "train_speed(iter/s)": 1.437566 }, { "epoch": 2.7318880939120005, "grad_norm": 5.425018310546875, "learning_rate": 4.2740768038656546e-05, "loss": 2.21066837310791, "memory(GiB)": 77.56, "step": 63765, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.437571 }, { "epoch": 2.7321023092412493, "grad_norm": 7.191006183624268, "learning_rate": 4.2734109636164074e-05, "loss": 2.612629699707031, "memory(GiB)": 77.56, "step": 63770, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.437609 }, { "epoch": 2.732316524570498, "grad_norm": 5.584221363067627, "learning_rate": 4.2727451365300014e-05, "loss": 2.433461380004883, "memory(GiB)": 77.56, "step": 63775, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 1.437619 }, { "epoch": 2.7325307398997474, "grad_norm": 4.80769681930542, "learning_rate": 4.272079322618501e-05, "loss": 2.5371467590332033, "memory(GiB)": 77.56, "step": 63780, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.437627 }, { "epoch": 2.732744955228996, "grad_norm": 4.261141300201416, "learning_rate": 4.271413521893965e-05, "loss": 2.3539379119873045, "memory(GiB)": 77.56, "step": 63785, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.43765 }, { "epoch": 2.732959170558245, "grad_norm": 5.126184940338135, "learning_rate": 4.270747734368457e-05, "loss": 2.2294429779052733, "memory(GiB)": 77.56, "step": 63790, "token_acc": 0.5376712328767124, "train_speed(iter/s)": 1.437677 }, { "epoch": 2.7331733858874943, "grad_norm": 5.520096778869629, "learning_rate": 4.270081960054038e-05, "loss": 2.525926971435547, "memory(GiB)": 77.56, "step": 63795, "token_acc": 0.4847328244274809, "train_speed(iter/s)": 1.437681 }, { "epoch": 2.733387601216743, "grad_norm": 4.697901248931885, "learning_rate": 4.269416198962768e-05, "loss": 2.1814714431762696, "memory(GiB)": 77.56, "step": 63800, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.437649 }, { "epoch": 2.733601816545992, "grad_norm": 4.715577602386475, "learning_rate": 4.26875045110671e-05, "loss": 2.3868234634399412, "memory(GiB)": 77.56, "step": 63805, "token_acc": 0.5186246418338109, "train_speed(iter/s)": 1.437658 }, { "epoch": 2.733816031875241, "grad_norm": 5.891027927398682, "learning_rate": 4.268084716497924e-05, "loss": 2.3933685302734373, "memory(GiB)": 77.56, "step": 63810, "token_acc": 0.5022222222222222, "train_speed(iter/s)": 1.437692 }, { "epoch": 2.73403024720449, "grad_norm": 5.315133571624756, "learning_rate": 4.267418995148468e-05, "loss": 2.706546401977539, "memory(GiB)": 77.56, "step": 63815, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.437715 }, { "epoch": 2.7342444625337388, "grad_norm": 5.708217620849609, "learning_rate": 4.266753287070406e-05, "loss": 2.252176284790039, "memory(GiB)": 77.56, "step": 63820, "token_acc": 0.5351170568561873, "train_speed(iter/s)": 1.437707 }, { "epoch": 2.734458677862988, "grad_norm": 5.75045919418335, "learning_rate": 4.266087592275794e-05, "loss": 2.4530160903930662, "memory(GiB)": 77.56, "step": 63825, "token_acc": 0.49025069637883006, "train_speed(iter/s)": 1.437719 }, { "epoch": 2.734672893192237, "grad_norm": 5.411227226257324, "learning_rate": 4.265421910776694e-05, "loss": 2.471903991699219, "memory(GiB)": 77.56, "step": 63830, "token_acc": 0.519163763066202, "train_speed(iter/s)": 1.437709 }, { "epoch": 2.7348871085214856, "grad_norm": 5.818785190582275, "learning_rate": 4.2647562425851666e-05, "loss": 2.489049530029297, "memory(GiB)": 77.56, "step": 63835, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.43771 }, { "epoch": 2.735101323850735, "grad_norm": 5.858545303344727, "learning_rate": 4.26409058771327e-05, "loss": 2.4503189086914063, "memory(GiB)": 77.56, "step": 63840, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.437717 }, { "epoch": 2.7353155391799837, "grad_norm": 4.953150749206543, "learning_rate": 4.2634249461730616e-05, "loss": 2.2386812210083007, "memory(GiB)": 77.56, "step": 63845, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437736 }, { "epoch": 2.7355297545092325, "grad_norm": 6.272424221038818, "learning_rate": 4.262759317976602e-05, "loss": 2.668625259399414, "memory(GiB)": 77.56, "step": 63850, "token_acc": 0.4732824427480916, "train_speed(iter/s)": 1.437735 }, { "epoch": 2.735743969838482, "grad_norm": 6.13362455368042, "learning_rate": 4.262093703135949e-05, "loss": 2.479934310913086, "memory(GiB)": 77.56, "step": 63855, "token_acc": 0.5, "train_speed(iter/s)": 1.437741 }, { "epoch": 2.7359581851677306, "grad_norm": 7.060302734375, "learning_rate": 4.26142810166316e-05, "loss": 2.2734760284423827, "memory(GiB)": 77.56, "step": 63860, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.437725 }, { "epoch": 2.7361724004969794, "grad_norm": 4.511002540588379, "learning_rate": 4.260762513570294e-05, "loss": 2.16192569732666, "memory(GiB)": 77.56, "step": 63865, "token_acc": 0.5209125475285171, "train_speed(iter/s)": 1.43772 }, { "epoch": 2.7363866158262287, "grad_norm": 5.379841327667236, "learning_rate": 4.260096938869409e-05, "loss": 2.4815832138061524, "memory(GiB)": 77.56, "step": 63870, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.437727 }, { "epoch": 2.7366008311554775, "grad_norm": 4.512026786804199, "learning_rate": 4.2594313775725616e-05, "loss": 2.228351593017578, "memory(GiB)": 77.56, "step": 63875, "token_acc": 0.5348101265822784, "train_speed(iter/s)": 1.437727 }, { "epoch": 2.7368150464847263, "grad_norm": 4.49551248550415, "learning_rate": 4.25876582969181e-05, "loss": 2.3812911987304686, "memory(GiB)": 77.56, "step": 63880, "token_acc": 0.49538461538461537, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.7370292618139755, "grad_norm": 6.15917444229126, "learning_rate": 4.258100295239209e-05, "loss": 2.1320409774780273, "memory(GiB)": 77.56, "step": 63885, "token_acc": 0.5114503816793893, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.7372434771432244, "grad_norm": 6.878716468811035, "learning_rate": 4.25743477422682e-05, "loss": 2.1629804611206054, "memory(GiB)": 77.56, "step": 63890, "token_acc": 0.509493670886076, "train_speed(iter/s)": 1.437765 }, { "epoch": 2.737457692472473, "grad_norm": 5.495079040527344, "learning_rate": 4.2567692666666945e-05, "loss": 2.4723489761352537, "memory(GiB)": 77.56, "step": 63895, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.7376719078017224, "grad_norm": 4.677679538726807, "learning_rate": 4.2561037725708904e-05, "loss": 2.304025650024414, "memory(GiB)": 77.56, "step": 63900, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.437736 }, { "epoch": 2.7378861231309712, "grad_norm": 8.411466598510742, "learning_rate": 4.2554382919514645e-05, "loss": 2.4469631195068358, "memory(GiB)": 77.56, "step": 63905, "token_acc": 0.4793388429752066, "train_speed(iter/s)": 1.437714 }, { "epoch": 2.73810033846022, "grad_norm": 6.134751796722412, "learning_rate": 4.254772824820474e-05, "loss": 2.371603012084961, "memory(GiB)": 77.56, "step": 63910, "token_acc": 0.5652173913043478, "train_speed(iter/s)": 1.437714 }, { "epoch": 2.7383145537894693, "grad_norm": 6.018167495727539, "learning_rate": 4.254107371189973e-05, "loss": 2.400075340270996, "memory(GiB)": 77.56, "step": 63915, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 1.437724 }, { "epoch": 2.738528769118718, "grad_norm": 6.0654706954956055, "learning_rate": 4.253441931072015e-05, "loss": 2.488100051879883, "memory(GiB)": 77.56, "step": 63920, "token_acc": 0.48616600790513836, "train_speed(iter/s)": 1.437739 }, { "epoch": 2.738742984447967, "grad_norm": 5.200046539306641, "learning_rate": 4.2527765044786576e-05, "loss": 2.4781816482543944, "memory(GiB)": 77.56, "step": 63925, "token_acc": 0.4789156626506024, "train_speed(iter/s)": 1.437775 }, { "epoch": 2.738957199777216, "grad_norm": 6.048184871673584, "learning_rate": 4.252111091421954e-05, "loss": 2.5228322982788085, "memory(GiB)": 77.56, "step": 63930, "token_acc": 0.4555984555984556, "train_speed(iter/s)": 1.4378 }, { "epoch": 2.739171415106465, "grad_norm": 13.278603553771973, "learning_rate": 4.251445691913961e-05, "loss": 2.2331220626831056, "memory(GiB)": 77.56, "step": 63935, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.437803 }, { "epoch": 2.739385630435714, "grad_norm": 5.795896530151367, "learning_rate": 4.250780305966731e-05, "loss": 2.5971269607543945, "memory(GiB)": 77.56, "step": 63940, "token_acc": 0.4651898734177215, "train_speed(iter/s)": 1.4378 }, { "epoch": 2.739599845764963, "grad_norm": 6.14264440536499, "learning_rate": 4.2501149335923176e-05, "loss": 2.2010746002197266, "memory(GiB)": 77.56, "step": 63945, "token_acc": 0.5149253731343284, "train_speed(iter/s)": 1.437822 }, { "epoch": 2.739814061094212, "grad_norm": 7.9233927726745605, "learning_rate": 4.2494495748027776e-05, "loss": 2.227177619934082, "memory(GiB)": 77.56, "step": 63950, "token_acc": 0.5349794238683128, "train_speed(iter/s)": 1.437845 }, { "epoch": 2.7400282764234607, "grad_norm": 4.575939655303955, "learning_rate": 4.2487842296101615e-05, "loss": 2.452601432800293, "memory(GiB)": 77.56, "step": 63955, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.437853 }, { "epoch": 2.74024249175271, "grad_norm": 8.174505233764648, "learning_rate": 4.248118898026523e-05, "loss": 2.5592151641845704, "memory(GiB)": 77.56, "step": 63960, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.437849 }, { "epoch": 2.7404567070819588, "grad_norm": 5.447796821594238, "learning_rate": 4.247453580063917e-05, "loss": 2.167592239379883, "memory(GiB)": 77.56, "step": 63965, "token_acc": 0.5014005602240896, "train_speed(iter/s)": 1.437823 }, { "epoch": 2.7406709224112076, "grad_norm": 5.853092193603516, "learning_rate": 4.246788275734393e-05, "loss": 2.3901538848876953, "memory(GiB)": 77.56, "step": 63970, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.437846 }, { "epoch": 2.740885137740457, "grad_norm": 6.94603157043457, "learning_rate": 4.2461229850500075e-05, "loss": 2.5193914413452148, "memory(GiB)": 77.56, "step": 63975, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437865 }, { "epoch": 2.7410993530697056, "grad_norm": 4.713394641876221, "learning_rate": 4.245457708022813e-05, "loss": 2.1621885299682617, "memory(GiB)": 77.56, "step": 63980, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.437848 }, { "epoch": 2.7413135683989545, "grad_norm": 5.517740249633789, "learning_rate": 4.244792444664859e-05, "loss": 2.488557243347168, "memory(GiB)": 77.56, "step": 63985, "token_acc": 0.47126436781609193, "train_speed(iter/s)": 1.43786 }, { "epoch": 2.7415277837282037, "grad_norm": 5.591786861419678, "learning_rate": 4.2441271949881975e-05, "loss": 2.33306999206543, "memory(GiB)": 77.56, "step": 63990, "token_acc": 0.49429657794676807, "train_speed(iter/s)": 1.437858 }, { "epoch": 2.7417419990574525, "grad_norm": 4.84340763092041, "learning_rate": 4.2434619590048817e-05, "loss": 2.554062843322754, "memory(GiB)": 77.56, "step": 63995, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437878 }, { "epoch": 2.7419562143867013, "grad_norm": 5.150165557861328, "learning_rate": 4.242796736726963e-05, "loss": 2.116430473327637, "memory(GiB)": 77.56, "step": 64000, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.437901 }, { "epoch": 2.7419562143867013, "eval_loss": 2.249814033508301, "eval_runtime": 13.9151, "eval_samples_per_second": 7.186, "eval_steps_per_second": 7.186, "eval_token_acc": 0.4489528795811518, "step": 64000 }, { "epoch": 2.7421704297159506, "grad_norm": 7.5258378982543945, "learning_rate": 4.24213152816649e-05, "loss": 2.838523292541504, "memory(GiB)": 77.56, "step": 64005, "token_acc": 0.4566420664206642, "train_speed(iter/s)": 1.437436 }, { "epoch": 2.7423846450451994, "grad_norm": 6.42828369140625, "learning_rate": 4.241466333335517e-05, "loss": 2.504629135131836, "memory(GiB)": 77.56, "step": 64010, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.437439 }, { "epoch": 2.742598860374448, "grad_norm": 5.749969005584717, "learning_rate": 4.240801152246091e-05, "loss": 2.305723190307617, "memory(GiB)": 77.56, "step": 64015, "token_acc": 0.5173745173745173, "train_speed(iter/s)": 1.437394 }, { "epoch": 2.7428130757036975, "grad_norm": 5.637388706207275, "learning_rate": 4.2401359849102653e-05, "loss": 2.6274797439575197, "memory(GiB)": 77.56, "step": 64020, "token_acc": 0.5122699386503068, "train_speed(iter/s)": 1.437395 }, { "epoch": 2.7430272910329463, "grad_norm": 4.276328086853027, "learning_rate": 4.2394708313400894e-05, "loss": 2.275105857849121, "memory(GiB)": 77.56, "step": 64025, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.437416 }, { "epoch": 2.743241506362195, "grad_norm": 5.042388916015625, "learning_rate": 4.238805691547612e-05, "loss": 2.4059112548828123, "memory(GiB)": 77.56, "step": 64030, "token_acc": 0.5274725274725275, "train_speed(iter/s)": 1.437442 }, { "epoch": 2.7434557216914444, "grad_norm": 5.682839870452881, "learning_rate": 4.238140565544885e-05, "loss": 2.565266418457031, "memory(GiB)": 77.56, "step": 64035, "token_acc": 0.4322033898305085, "train_speed(iter/s)": 1.43744 }, { "epoch": 2.743669937020693, "grad_norm": 5.284014701843262, "learning_rate": 4.237475453343955e-05, "loss": 2.4839141845703123, "memory(GiB)": 77.56, "step": 64040, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.437432 }, { "epoch": 2.743884152349942, "grad_norm": 8.432207107543945, "learning_rate": 4.236810354956872e-05, "loss": 2.3649349212646484, "memory(GiB)": 77.56, "step": 64045, "token_acc": 0.525096525096525, "train_speed(iter/s)": 1.437432 }, { "epoch": 2.7440983676791912, "grad_norm": 5.532547950744629, "learning_rate": 4.236145270395685e-05, "loss": 2.35306396484375, "memory(GiB)": 77.56, "step": 64050, "token_acc": 0.5189873417721519, "train_speed(iter/s)": 1.437452 }, { "epoch": 2.74431258300844, "grad_norm": 5.355996131896973, "learning_rate": 4.2354801996724444e-05, "loss": 2.484995651245117, "memory(GiB)": 77.56, "step": 64055, "token_acc": 0.5232974910394266, "train_speed(iter/s)": 1.437448 }, { "epoch": 2.744526798337689, "grad_norm": 6.381491184234619, "learning_rate": 4.2348151427991974e-05, "loss": 2.394114875793457, "memory(GiB)": 77.56, "step": 64060, "token_acc": 0.5, "train_speed(iter/s)": 1.437478 }, { "epoch": 2.744741013666938, "grad_norm": 4.961148738861084, "learning_rate": 4.234150099787991e-05, "loss": 2.376239013671875, "memory(GiB)": 77.56, "step": 64065, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.437486 }, { "epoch": 2.744955228996187, "grad_norm": 5.538020133972168, "learning_rate": 4.233485070650874e-05, "loss": 1.9315692901611328, "memory(GiB)": 77.56, "step": 64070, "token_acc": 0.5396825396825397, "train_speed(iter/s)": 1.437457 }, { "epoch": 2.7451694443254357, "grad_norm": 5.472782611846924, "learning_rate": 4.2328200553998944e-05, "loss": 2.3290735244750977, "memory(GiB)": 77.56, "step": 64075, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.437454 }, { "epoch": 2.745383659654685, "grad_norm": 6.293210506439209, "learning_rate": 4.2321550540470996e-05, "loss": 2.475215530395508, "memory(GiB)": 77.56, "step": 64080, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.437477 }, { "epoch": 2.745597874983934, "grad_norm": 5.869363784790039, "learning_rate": 4.231490066604536e-05, "loss": 2.6260141372680663, "memory(GiB)": 77.56, "step": 64085, "token_acc": 0.4565916398713826, "train_speed(iter/s)": 1.437478 }, { "epoch": 2.7458120903131826, "grad_norm": 5.941280841827393, "learning_rate": 4.23082509308425e-05, "loss": 2.4243059158325195, "memory(GiB)": 77.56, "step": 64090, "token_acc": 0.4365079365079365, "train_speed(iter/s)": 1.437496 }, { "epoch": 2.746026305642432, "grad_norm": 5.775561809539795, "learning_rate": 4.23016013349829e-05, "loss": 2.7795574188232424, "memory(GiB)": 77.56, "step": 64095, "token_acc": 0.46200607902735563, "train_speed(iter/s)": 1.437518 }, { "epoch": 2.7462405209716807, "grad_norm": 5.151354789733887, "learning_rate": 4.229495187858701e-05, "loss": 2.2424667358398436, "memory(GiB)": 77.56, "step": 64100, "token_acc": 0.5360501567398119, "train_speed(iter/s)": 1.437498 }, { "epoch": 2.7464547363009295, "grad_norm": 6.758477210998535, "learning_rate": 4.2288302561775295e-05, "loss": 2.7601896286010743, "memory(GiB)": 77.56, "step": 64105, "token_acc": 0.4041095890410959, "train_speed(iter/s)": 1.437518 }, { "epoch": 2.7466689516301788, "grad_norm": 5.916769504547119, "learning_rate": 4.228165338466821e-05, "loss": 2.557953643798828, "memory(GiB)": 77.56, "step": 64110, "token_acc": 0.4610169491525424, "train_speed(iter/s)": 1.437483 }, { "epoch": 2.7468831669594276, "grad_norm": 7.3369550704956055, "learning_rate": 4.227500434738622e-05, "loss": 2.4919652938842773, "memory(GiB)": 77.56, "step": 64115, "token_acc": 0.4781144781144781, "train_speed(iter/s)": 1.437472 }, { "epoch": 2.7470973822886764, "grad_norm": 6.129856109619141, "learning_rate": 4.226835545004975e-05, "loss": 2.297022247314453, "memory(GiB)": 77.56, "step": 64120, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.437452 }, { "epoch": 2.7473115976179256, "grad_norm": 4.017956256866455, "learning_rate": 4.226170669277929e-05, "loss": 2.086667060852051, "memory(GiB)": 77.56, "step": 64125, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.437434 }, { "epoch": 2.7475258129471745, "grad_norm": 4.9504475593566895, "learning_rate": 4.225505807569529e-05, "loss": 2.4380519866943358, "memory(GiB)": 77.56, "step": 64130, "token_acc": 0.48184818481848185, "train_speed(iter/s)": 1.437447 }, { "epoch": 2.7477400282764233, "grad_norm": 6.795463562011719, "learning_rate": 4.224840959891815e-05, "loss": 2.3949138641357424, "memory(GiB)": 77.56, "step": 64135, "token_acc": 0.5241379310344828, "train_speed(iter/s)": 1.437439 }, { "epoch": 2.7479542436056725, "grad_norm": 5.56437873840332, "learning_rate": 4.224176126256836e-05, "loss": 2.289603424072266, "memory(GiB)": 77.56, "step": 64140, "token_acc": 0.5097276264591439, "train_speed(iter/s)": 1.437443 }, { "epoch": 2.7481684589349213, "grad_norm": 6.308046340942383, "learning_rate": 4.223511306676634e-05, "loss": 2.2859668731689453, "memory(GiB)": 77.56, "step": 64145, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437428 }, { "epoch": 2.74838267426417, "grad_norm": 5.176784992218018, "learning_rate": 4.222846501163253e-05, "loss": 2.1252552032470704, "memory(GiB)": 77.56, "step": 64150, "token_acc": 0.5122699386503068, "train_speed(iter/s)": 1.437434 }, { "epoch": 2.7485968895934194, "grad_norm": 6.133218765258789, "learning_rate": 4.222181709728736e-05, "loss": 1.989995574951172, "memory(GiB)": 77.56, "step": 64155, "token_acc": 0.5465587044534413, "train_speed(iter/s)": 1.437467 }, { "epoch": 2.7488111049226682, "grad_norm": 5.641963958740234, "learning_rate": 4.221516932385128e-05, "loss": 2.3942235946655273, "memory(GiB)": 77.56, "step": 64160, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 1.437471 }, { "epoch": 2.749025320251917, "grad_norm": 6.726781368255615, "learning_rate": 4.220852169144471e-05, "loss": 2.5887020111083983, "memory(GiB)": 77.56, "step": 64165, "token_acc": 0.46037735849056605, "train_speed(iter/s)": 1.437464 }, { "epoch": 2.7492395355811663, "grad_norm": 6.003223419189453, "learning_rate": 4.2201874200188074e-05, "loss": 2.3801013946533205, "memory(GiB)": 77.56, "step": 64170, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.437451 }, { "epoch": 2.749453750910415, "grad_norm": 5.951838970184326, "learning_rate": 4.2195226850201796e-05, "loss": 2.3997867584228514, "memory(GiB)": 77.56, "step": 64175, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.437449 }, { "epoch": 2.749667966239664, "grad_norm": 5.521102428436279, "learning_rate": 4.2188579641606316e-05, "loss": 2.394172477722168, "memory(GiB)": 77.56, "step": 64180, "token_acc": 0.4790996784565916, "train_speed(iter/s)": 1.437447 }, { "epoch": 2.749882181568913, "grad_norm": 6.232408046722412, "learning_rate": 4.218193257452204e-05, "loss": 2.4417022705078124, "memory(GiB)": 77.56, "step": 64185, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.437476 }, { "epoch": 2.750096396898162, "grad_norm": 4.895714282989502, "learning_rate": 4.217528564906938e-05, "loss": 2.3579416275024414, "memory(GiB)": 77.56, "step": 64190, "token_acc": 0.5202702702702703, "train_speed(iter/s)": 1.437504 }, { "epoch": 2.750310612227411, "grad_norm": 5.193429946899414, "learning_rate": 4.2168638865368756e-05, "loss": 2.474519157409668, "memory(GiB)": 77.56, "step": 64195, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.437522 }, { "epoch": 2.75052482755666, "grad_norm": 6.1212687492370605, "learning_rate": 4.2161992223540606e-05, "loss": 2.344174385070801, "memory(GiB)": 77.56, "step": 64200, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.437557 }, { "epoch": 2.750739042885909, "grad_norm": 6.556796073913574, "learning_rate": 4.215534572370531e-05, "loss": 2.169703483581543, "memory(GiB)": 77.56, "step": 64205, "token_acc": 0.509375, "train_speed(iter/s)": 1.437571 }, { "epoch": 2.7509532582151577, "grad_norm": 5.754509925842285, "learning_rate": 4.214869936598329e-05, "loss": 2.6595726013183594, "memory(GiB)": 77.56, "step": 64210, "token_acc": 0.48328267477203646, "train_speed(iter/s)": 1.437568 }, { "epoch": 2.751167473544407, "grad_norm": 4.662178993225098, "learning_rate": 4.214205315049494e-05, "loss": 2.3331153869628904, "memory(GiB)": 77.56, "step": 64215, "token_acc": 0.5215686274509804, "train_speed(iter/s)": 1.437548 }, { "epoch": 2.7513816888736558, "grad_norm": 4.8971266746521, "learning_rate": 4.2135407077360667e-05, "loss": 2.5479814529418947, "memory(GiB)": 77.56, "step": 64220, "token_acc": 0.4702194357366771, "train_speed(iter/s)": 1.437539 }, { "epoch": 2.7515959042029046, "grad_norm": 5.7266621589660645, "learning_rate": 4.2128761146700884e-05, "loss": 2.296100616455078, "memory(GiB)": 77.56, "step": 64225, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.437538 }, { "epoch": 2.751810119532154, "grad_norm": 4.668185234069824, "learning_rate": 4.2122115358635975e-05, "loss": 2.357310485839844, "memory(GiB)": 77.56, "step": 64230, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.437556 }, { "epoch": 2.7520243348614026, "grad_norm": 4.7295145988464355, "learning_rate": 4.2115469713286325e-05, "loss": 2.4320003509521486, "memory(GiB)": 77.56, "step": 64235, "token_acc": 0.48872180451127817, "train_speed(iter/s)": 1.437583 }, { "epoch": 2.7522385501906514, "grad_norm": 5.936611175537109, "learning_rate": 4.210882421077235e-05, "loss": 2.478118133544922, "memory(GiB)": 77.56, "step": 64240, "token_acc": 0.46494464944649444, "train_speed(iter/s)": 1.437566 }, { "epoch": 2.7524527655199007, "grad_norm": 4.063772201538086, "learning_rate": 4.210217885121442e-05, "loss": 2.463674545288086, "memory(GiB)": 77.56, "step": 64245, "token_acc": 0.46407185628742514, "train_speed(iter/s)": 1.437555 }, { "epoch": 2.7526669808491495, "grad_norm": 6.379580974578857, "learning_rate": 4.209553363473293e-05, "loss": 2.376224327087402, "memory(GiB)": 77.56, "step": 64250, "token_acc": 0.5145631067961165, "train_speed(iter/s)": 1.437557 }, { "epoch": 2.7528811961783983, "grad_norm": 9.669970512390137, "learning_rate": 4.208888856144826e-05, "loss": 2.440271186828613, "memory(GiB)": 77.56, "step": 64255, "token_acc": 0.4818941504178273, "train_speed(iter/s)": 1.437574 }, { "epoch": 2.7530954115076476, "grad_norm": 6.004137992858887, "learning_rate": 4.2082243631480795e-05, "loss": 2.288914108276367, "memory(GiB)": 77.56, "step": 64260, "token_acc": 0.47794117647058826, "train_speed(iter/s)": 1.437573 }, { "epoch": 2.7533096268368964, "grad_norm": 4.116734981536865, "learning_rate": 4.207559884495092e-05, "loss": 2.394124412536621, "memory(GiB)": 77.56, "step": 64265, "token_acc": 0.49836065573770494, "train_speed(iter/s)": 1.437574 }, { "epoch": 2.753523842166145, "grad_norm": 5.865310192108154, "learning_rate": 4.2068954201978985e-05, "loss": 2.4204483032226562, "memory(GiB)": 77.56, "step": 64270, "token_acc": 0.4967532467532468, "train_speed(iter/s)": 1.437599 }, { "epoch": 2.7537380574953945, "grad_norm": 6.250423431396484, "learning_rate": 4.2062309702685404e-05, "loss": 2.1397994995117187, "memory(GiB)": 77.56, "step": 64275, "token_acc": 0.5429553264604811, "train_speed(iter/s)": 1.437615 }, { "epoch": 2.7539522728246433, "grad_norm": 6.483864784240723, "learning_rate": 4.205566534719052e-05, "loss": 2.4461437225341798, "memory(GiB)": 77.56, "step": 64280, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.437582 }, { "epoch": 2.754166488153892, "grad_norm": 6.371665000915527, "learning_rate": 4.204902113561472e-05, "loss": 2.339692306518555, "memory(GiB)": 77.56, "step": 64285, "token_acc": 0.5243055555555556, "train_speed(iter/s)": 1.437595 }, { "epoch": 2.7543807034831413, "grad_norm": 4.810792922973633, "learning_rate": 4.204237706807836e-05, "loss": 2.317988967895508, "memory(GiB)": 77.56, "step": 64290, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.437566 }, { "epoch": 2.75459491881239, "grad_norm": 6.8076982498168945, "learning_rate": 4.20357331447018e-05, "loss": 2.5038827896118163, "memory(GiB)": 77.56, "step": 64295, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.437551 }, { "epoch": 2.754809134141639, "grad_norm": 4.727570533752441, "learning_rate": 4.202908936560541e-05, "loss": 2.75933723449707, "memory(GiB)": 77.56, "step": 64300, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.437552 }, { "epoch": 2.7550233494708882, "grad_norm": 5.943518161773682, "learning_rate": 4.202244573090954e-05, "loss": 2.4613155364990233, "memory(GiB)": 77.56, "step": 64305, "token_acc": 0.4734848484848485, "train_speed(iter/s)": 1.437532 }, { "epoch": 2.755237564800137, "grad_norm": 4.886484622955322, "learning_rate": 4.2015802240734554e-05, "loss": 2.343560791015625, "memory(GiB)": 77.56, "step": 64310, "token_acc": 0.5015015015015015, "train_speed(iter/s)": 1.437532 }, { "epoch": 2.755451780129386, "grad_norm": 5.043753147125244, "learning_rate": 4.200915889520079e-05, "loss": 2.4613529205322267, "memory(GiB)": 77.56, "step": 64315, "token_acc": 0.475, "train_speed(iter/s)": 1.437564 }, { "epoch": 2.755665995458635, "grad_norm": 6.139881134033203, "learning_rate": 4.2002515694428616e-05, "loss": 2.395285224914551, "memory(GiB)": 77.56, "step": 64320, "token_acc": 0.4847328244274809, "train_speed(iter/s)": 1.437592 }, { "epoch": 2.755880210787884, "grad_norm": 6.566813945770264, "learning_rate": 4.1995872638538375e-05, "loss": 2.3253076553344725, "memory(GiB)": 77.56, "step": 64325, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.43761 }, { "epoch": 2.7560944261171327, "grad_norm": 5.818182468414307, "learning_rate": 4.1989229727650416e-05, "loss": 2.640835189819336, "memory(GiB)": 77.56, "step": 64330, "token_acc": 0.4535809018567639, "train_speed(iter/s)": 1.437602 }, { "epoch": 2.756308641446382, "grad_norm": 5.255789756774902, "learning_rate": 4.1982586961885054e-05, "loss": 2.444224739074707, "memory(GiB)": 77.56, "step": 64335, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.437584 }, { "epoch": 2.756522856775631, "grad_norm": 4.549758434295654, "learning_rate": 4.1975944341362646e-05, "loss": 2.3875503540039062, "memory(GiB)": 77.56, "step": 64340, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.437601 }, { "epoch": 2.7567370721048796, "grad_norm": 5.681735038757324, "learning_rate": 4.196930186620355e-05, "loss": 2.4830114364624025, "memory(GiB)": 77.56, "step": 64345, "token_acc": 0.43854748603351956, "train_speed(iter/s)": 1.437591 }, { "epoch": 2.756951287434129, "grad_norm": 6.80520486831665, "learning_rate": 4.196265953652809e-05, "loss": 2.4649124145507812, "memory(GiB)": 77.56, "step": 64350, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.437615 }, { "epoch": 2.7571655027633777, "grad_norm": 4.921061992645264, "learning_rate": 4.195601735245658e-05, "loss": 2.4463294982910155, "memory(GiB)": 77.56, "step": 64355, "token_acc": 0.5050167224080268, "train_speed(iter/s)": 1.437606 }, { "epoch": 2.7573797180926265, "grad_norm": 6.679776668548584, "learning_rate": 4.194937531410937e-05, "loss": 2.5255287170410154, "memory(GiB)": 77.56, "step": 64360, "token_acc": 0.45317220543806647, "train_speed(iter/s)": 1.437631 }, { "epoch": 2.7575939334218758, "grad_norm": 4.265938758850098, "learning_rate": 4.1942733421606764e-05, "loss": 2.481113052368164, "memory(GiB)": 77.56, "step": 64365, "token_acc": 0.4670846394984326, "train_speed(iter/s)": 1.437636 }, { "epoch": 2.7578081487511246, "grad_norm": 6.029409885406494, "learning_rate": 4.193609167506912e-05, "loss": 2.729705047607422, "memory(GiB)": 77.56, "step": 64370, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.43764 }, { "epoch": 2.7580223640803734, "grad_norm": 5.590997219085693, "learning_rate": 4.1929450074616734e-05, "loss": 2.3650768280029295, "memory(GiB)": 77.56, "step": 64375, "token_acc": 0.4662576687116564, "train_speed(iter/s)": 1.437665 }, { "epoch": 2.7582365794096226, "grad_norm": 4.810853004455566, "learning_rate": 4.192280862036992e-05, "loss": 2.2588579177856447, "memory(GiB)": 77.56, "step": 64380, "token_acc": 0.5152542372881356, "train_speed(iter/s)": 1.437671 }, { "epoch": 2.7584507947388714, "grad_norm": 5.563048839569092, "learning_rate": 4.1916167312449014e-05, "loss": 2.512114715576172, "memory(GiB)": 77.56, "step": 64385, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.437686 }, { "epoch": 2.7586650100681203, "grad_norm": 6.968373775482178, "learning_rate": 4.1909526150974326e-05, "loss": 2.564522552490234, "memory(GiB)": 77.56, "step": 64390, "token_acc": 0.46875, "train_speed(iter/s)": 1.437674 }, { "epoch": 2.7588792253973695, "grad_norm": 5.360578536987305, "learning_rate": 4.190288513606615e-05, "loss": 2.151266670227051, "memory(GiB)": 77.56, "step": 64395, "token_acc": 0.5527156549520766, "train_speed(iter/s)": 1.437687 }, { "epoch": 2.7590934407266183, "grad_norm": 4.924160480499268, "learning_rate": 4.189624426784481e-05, "loss": 2.451461410522461, "memory(GiB)": 77.56, "step": 64400, "token_acc": 0.5196078431372549, "train_speed(iter/s)": 1.437665 }, { "epoch": 2.759307656055867, "grad_norm": 6.2384724617004395, "learning_rate": 4.18896035464306e-05, "loss": 2.4778888702392576, "memory(GiB)": 77.56, "step": 64405, "token_acc": 0.4892966360856269, "train_speed(iter/s)": 1.43767 }, { "epoch": 2.7595218713851164, "grad_norm": 5.335391998291016, "learning_rate": 4.188296297194384e-05, "loss": 2.6137153625488283, "memory(GiB)": 77.56, "step": 64410, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 1.437697 }, { "epoch": 2.759736086714365, "grad_norm": 5.631296157836914, "learning_rate": 4.18763225445048e-05, "loss": 2.6373186111450195, "memory(GiB)": 77.56, "step": 64415, "token_acc": 0.44297082228116713, "train_speed(iter/s)": 1.437703 }, { "epoch": 2.759950302043614, "grad_norm": 5.207281589508057, "learning_rate": 4.1869682264233826e-05, "loss": 2.363142204284668, "memory(GiB)": 77.56, "step": 64420, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.437705 }, { "epoch": 2.7601645173728633, "grad_norm": 5.0512871742248535, "learning_rate": 4.186304213125116e-05, "loss": 2.500602149963379, "memory(GiB)": 77.56, "step": 64425, "token_acc": 0.5098814229249012, "train_speed(iter/s)": 1.43772 }, { "epoch": 2.760378732702112, "grad_norm": 7.788637638092041, "learning_rate": 4.185640214567714e-05, "loss": 2.4444881439208985, "memory(GiB)": 77.56, "step": 64430, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.437719 }, { "epoch": 2.760592948031361, "grad_norm": 7.587400913238525, "learning_rate": 4.184976230763203e-05, "loss": 2.3888885498046877, "memory(GiB)": 77.56, "step": 64435, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.437683 }, { "epoch": 2.76080716336061, "grad_norm": 5.943777561187744, "learning_rate": 4.1843122617236106e-05, "loss": 2.4553855895996093, "memory(GiB)": 77.56, "step": 64440, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.437693 }, { "epoch": 2.761021378689859, "grad_norm": 7.957814693450928, "learning_rate": 4.183648307460969e-05, "loss": 2.4153724670410157, "memory(GiB)": 77.56, "step": 64445, "token_acc": 0.4567901234567901, "train_speed(iter/s)": 1.437721 }, { "epoch": 2.761235594019108, "grad_norm": 5.246782302856445, "learning_rate": 4.182984367987302e-05, "loss": 2.5522727966308594, "memory(GiB)": 77.56, "step": 64450, "token_acc": 0.43312101910828027, "train_speed(iter/s)": 1.437763 }, { "epoch": 2.761449809348357, "grad_norm": 5.906622409820557, "learning_rate": 4.182320443314641e-05, "loss": 2.1670705795288088, "memory(GiB)": 77.56, "step": 64455, "token_acc": 0.5395189003436426, "train_speed(iter/s)": 1.437782 }, { "epoch": 2.761664024677606, "grad_norm": 5.36324405670166, "learning_rate": 4.181656533455013e-05, "loss": 2.5967132568359377, "memory(GiB)": 77.56, "step": 64460, "token_acc": 0.47474747474747475, "train_speed(iter/s)": 1.437743 }, { "epoch": 2.7618782400068547, "grad_norm": 7.413851261138916, "learning_rate": 4.1809926384204426e-05, "loss": 2.175606536865234, "memory(GiB)": 77.56, "step": 64465, "token_acc": 0.5568181818181818, "train_speed(iter/s)": 1.437757 }, { "epoch": 2.762092455336104, "grad_norm": 4.746966361999512, "learning_rate": 4.18032875822296e-05, "loss": 2.280744743347168, "memory(GiB)": 77.56, "step": 64470, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.437783 }, { "epoch": 2.7623066706653527, "grad_norm": 8.898507118225098, "learning_rate": 4.179664892874591e-05, "loss": 2.388660430908203, "memory(GiB)": 77.56, "step": 64475, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.437767 }, { "epoch": 2.7625208859946015, "grad_norm": 6.157951354980469, "learning_rate": 4.17900104238736e-05, "loss": 2.45493106842041, "memory(GiB)": 77.56, "step": 64480, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.43776 }, { "epoch": 2.762735101323851, "grad_norm": 4.3204665184021, "learning_rate": 4.1783372067732977e-05, "loss": 2.4240743637084963, "memory(GiB)": 77.56, "step": 64485, "token_acc": 0.493006993006993, "train_speed(iter/s)": 1.437783 }, { "epoch": 2.7629493166530996, "grad_norm": 5.623483180999756, "learning_rate": 4.177673386044425e-05, "loss": 2.080682945251465, "memory(GiB)": 77.56, "step": 64490, "token_acc": 0.5186721991701245, "train_speed(iter/s)": 1.43779 }, { "epoch": 2.7631635319823484, "grad_norm": 5.1229963302612305, "learning_rate": 4.177009580212773e-05, "loss": 2.451407051086426, "memory(GiB)": 77.56, "step": 64495, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.437796 }, { "epoch": 2.7633777473115977, "grad_norm": 6.0577497482299805, "learning_rate": 4.176345789290363e-05, "loss": 2.3344985961914064, "memory(GiB)": 77.56, "step": 64500, "token_acc": 0.5431309904153354, "train_speed(iter/s)": 1.437821 }, { "epoch": 2.7633777473115977, "eval_loss": 2.1342391967773438, "eval_runtime": 14.1941, "eval_samples_per_second": 7.045, "eval_steps_per_second": 7.045, "eval_token_acc": 0.48824343015214383, "step": 64500 }, { "epoch": 2.7635919626408465, "grad_norm": 5.9025959968566895, "learning_rate": 4.175682013289223e-05, "loss": 2.546734619140625, "memory(GiB)": 77.56, "step": 64505, "token_acc": 0.4811800610376399, "train_speed(iter/s)": 1.437345 }, { "epoch": 2.7638061779700953, "grad_norm": 5.714148044586182, "learning_rate": 4.1750182522213745e-05, "loss": 2.6803285598754885, "memory(GiB)": 77.56, "step": 64510, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.43735 }, { "epoch": 2.7640203932993446, "grad_norm": 5.070879936218262, "learning_rate": 4.174354506098847e-05, "loss": 2.26236572265625, "memory(GiB)": 77.56, "step": 64515, "token_acc": 0.5422077922077922, "train_speed(iter/s)": 1.437353 }, { "epoch": 2.7642346086285934, "grad_norm": 5.840444564819336, "learning_rate": 4.1736907749336603e-05, "loss": 2.308195686340332, "memory(GiB)": 77.56, "step": 64520, "token_acc": 0.4900662251655629, "train_speed(iter/s)": 1.437349 }, { "epoch": 2.764448823957842, "grad_norm": 6.4930596351623535, "learning_rate": 4.17302705873784e-05, "loss": 2.465194320678711, "memory(GiB)": 77.56, "step": 64525, "token_acc": 0.45671641791044776, "train_speed(iter/s)": 1.437328 }, { "epoch": 2.7646630392870915, "grad_norm": 5.947774887084961, "learning_rate": 4.172363357523412e-05, "loss": 2.484974479675293, "memory(GiB)": 77.56, "step": 64530, "token_acc": 0.5016501650165016, "train_speed(iter/s)": 1.43732 }, { "epoch": 2.7648772546163403, "grad_norm": 5.339717388153076, "learning_rate": 4.1716996713023976e-05, "loss": 2.369556427001953, "memory(GiB)": 77.56, "step": 64535, "token_acc": 0.5018181818181818, "train_speed(iter/s)": 1.437314 }, { "epoch": 2.765091469945589, "grad_norm": 7.05649471282959, "learning_rate": 4.17103600008682e-05, "loss": 2.4731819152832033, "memory(GiB)": 77.56, "step": 64540, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.437331 }, { "epoch": 2.7653056852748383, "grad_norm": 5.375307559967041, "learning_rate": 4.170372343888703e-05, "loss": 2.378491020202637, "memory(GiB)": 77.56, "step": 64545, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.437353 }, { "epoch": 2.765519900604087, "grad_norm": 6.3528594970703125, "learning_rate": 4.169708702720069e-05, "loss": 2.4149532318115234, "memory(GiB)": 77.56, "step": 64550, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.437374 }, { "epoch": 2.765734115933336, "grad_norm": 6.189089298248291, "learning_rate": 4.169045076592942e-05, "loss": 2.469185447692871, "memory(GiB)": 77.56, "step": 64555, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.437391 }, { "epoch": 2.765948331262585, "grad_norm": 5.242286205291748, "learning_rate": 4.168381465519342e-05, "loss": 2.5613691329956056, "memory(GiB)": 77.56, "step": 64560, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.437395 }, { "epoch": 2.766162546591834, "grad_norm": 4.933730125427246, "learning_rate": 4.167717869511291e-05, "loss": 2.4313411712646484, "memory(GiB)": 77.56, "step": 64565, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.437407 }, { "epoch": 2.766376761921083, "grad_norm": 6.2235798835754395, "learning_rate": 4.167054288580812e-05, "loss": 2.370328903198242, "memory(GiB)": 77.56, "step": 64570, "token_acc": 0.5160142348754448, "train_speed(iter/s)": 1.437424 }, { "epoch": 2.766590977250332, "grad_norm": 6.177143573760986, "learning_rate": 4.166390722739926e-05, "loss": 2.471444320678711, "memory(GiB)": 77.56, "step": 64575, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 1.437406 }, { "epoch": 2.766805192579581, "grad_norm": 4.598169326782227, "learning_rate": 4.165727172000655e-05, "loss": 2.2622312545776366, "memory(GiB)": 77.56, "step": 64580, "token_acc": 0.5218978102189781, "train_speed(iter/s)": 1.437416 }, { "epoch": 2.7670194079088297, "grad_norm": 6.439113616943359, "learning_rate": 4.165063636375018e-05, "loss": 2.3918380737304688, "memory(GiB)": 77.56, "step": 64585, "token_acc": 0.4954682779456193, "train_speed(iter/s)": 1.437414 }, { "epoch": 2.767233623238079, "grad_norm": 5.608087062835693, "learning_rate": 4.164400115875037e-05, "loss": 2.027151107788086, "memory(GiB)": 77.56, "step": 64590, "token_acc": 0.5418502202643172, "train_speed(iter/s)": 1.437364 }, { "epoch": 2.767447838567328, "grad_norm": 5.899224281311035, "learning_rate": 4.1637366105127315e-05, "loss": 2.553853988647461, "memory(GiB)": 77.56, "step": 64595, "token_acc": 0.4728682170542636, "train_speed(iter/s)": 1.437376 }, { "epoch": 2.7676620538965766, "grad_norm": 6.398929119110107, "learning_rate": 4.163073120300122e-05, "loss": 2.4647218704223635, "memory(GiB)": 77.56, "step": 64600, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.437362 }, { "epoch": 2.767876269225826, "grad_norm": 4.618252277374268, "learning_rate": 4.162409645249228e-05, "loss": 2.3526901245117187, "memory(GiB)": 77.56, "step": 64605, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.437365 }, { "epoch": 2.7680904845550747, "grad_norm": 4.611629009246826, "learning_rate": 4.1617461853720685e-05, "loss": 2.2983558654785154, "memory(GiB)": 77.56, "step": 64610, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437391 }, { "epoch": 2.7683046998843235, "grad_norm": 7.5736541748046875, "learning_rate": 4.161082740680664e-05, "loss": 2.33463134765625, "memory(GiB)": 77.56, "step": 64615, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.437413 }, { "epoch": 2.7685189152135727, "grad_norm": 6.545660018920898, "learning_rate": 4.160419311187033e-05, "loss": 2.5211572647094727, "memory(GiB)": 77.56, "step": 64620, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.43741 }, { "epoch": 2.7687331305428216, "grad_norm": 7.074057102203369, "learning_rate": 4.1597558969031924e-05, "loss": 2.0580442428588865, "memory(GiB)": 77.56, "step": 64625, "token_acc": 0.5336134453781513, "train_speed(iter/s)": 1.437366 }, { "epoch": 2.7689473458720704, "grad_norm": 7.0091328620910645, "learning_rate": 4.159092497841163e-05, "loss": 2.3606201171875, "memory(GiB)": 77.56, "step": 64630, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.437359 }, { "epoch": 2.7691615612013196, "grad_norm": 5.173372268676758, "learning_rate": 4.15842911401296e-05, "loss": 2.7328081130981445, "memory(GiB)": 77.56, "step": 64635, "token_acc": 0.47249190938511326, "train_speed(iter/s)": 1.43738 }, { "epoch": 2.7693757765305684, "grad_norm": 4.831366062164307, "learning_rate": 4.157765745430605e-05, "loss": 2.5863250732421874, "memory(GiB)": 77.56, "step": 64640, "token_acc": 0.43352601156069365, "train_speed(iter/s)": 1.437405 }, { "epoch": 2.7695899918598172, "grad_norm": 7.955633163452148, "learning_rate": 4.157102392106112e-05, "loss": 2.4766910552978514, "memory(GiB)": 77.56, "step": 64645, "token_acc": 0.4662576687116564, "train_speed(iter/s)": 1.437425 }, { "epoch": 2.7698042071890665, "grad_norm": 5.130234718322754, "learning_rate": 4.156439054051501e-05, "loss": 2.302436637878418, "memory(GiB)": 77.56, "step": 64650, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.437424 }, { "epoch": 2.7700184225183153, "grad_norm": 5.311408996582031, "learning_rate": 4.1557757312787874e-05, "loss": 2.4721141815185548, "memory(GiB)": 77.56, "step": 64655, "token_acc": 0.4697508896797153, "train_speed(iter/s)": 1.437418 }, { "epoch": 2.770232637847564, "grad_norm": 6.5296711921691895, "learning_rate": 4.1551124237999895e-05, "loss": 2.4906278610229493, "memory(GiB)": 77.56, "step": 64660, "token_acc": 0.5117056856187291, "train_speed(iter/s)": 1.437444 }, { "epoch": 2.7704468531768134, "grad_norm": 5.803679466247559, "learning_rate": 4.1544491316271224e-05, "loss": 2.27764835357666, "memory(GiB)": 77.56, "step": 64665, "token_acc": 0.4937106918238994, "train_speed(iter/s)": 1.437437 }, { "epoch": 2.770661068506062, "grad_norm": 4.957920074462891, "learning_rate": 4.153785854772201e-05, "loss": 2.3398094177246094, "memory(GiB)": 77.56, "step": 64670, "token_acc": 0.5150501672240803, "train_speed(iter/s)": 1.437428 }, { "epoch": 2.770875283835311, "grad_norm": 6.413315773010254, "learning_rate": 4.153122593247244e-05, "loss": 2.6030054092407227, "memory(GiB)": 77.56, "step": 64675, "token_acc": 0.4375, "train_speed(iter/s)": 1.437441 }, { "epoch": 2.7710894991645603, "grad_norm": 6.683399677276611, "learning_rate": 4.1524593470642656e-05, "loss": 2.6472394943237303, "memory(GiB)": 77.56, "step": 64680, "token_acc": 0.4492307692307692, "train_speed(iter/s)": 1.437458 }, { "epoch": 2.771303714493809, "grad_norm": 5.128006458282471, "learning_rate": 4.15179611623528e-05, "loss": 2.2427282333374023, "memory(GiB)": 77.56, "step": 64685, "token_acc": 0.47560975609756095, "train_speed(iter/s)": 1.437467 }, { "epoch": 2.771517929823058, "grad_norm": 5.724479675292969, "learning_rate": 4.1511329007723046e-05, "loss": 2.698308563232422, "memory(GiB)": 77.56, "step": 64690, "token_acc": 0.4513888888888889, "train_speed(iter/s)": 1.437456 }, { "epoch": 2.771732145152307, "grad_norm": 6.818203449249268, "learning_rate": 4.1504697006873524e-05, "loss": 2.5303741455078126, "memory(GiB)": 77.56, "step": 64695, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.437472 }, { "epoch": 2.771946360481556, "grad_norm": 4.720088958740234, "learning_rate": 4.1498065159924394e-05, "loss": 2.213025665283203, "memory(GiB)": 77.56, "step": 64700, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.437469 }, { "epoch": 2.7721605758108048, "grad_norm": 3.9661316871643066, "learning_rate": 4.149143346699579e-05, "loss": 2.248960494995117, "memory(GiB)": 77.56, "step": 64705, "token_acc": 0.5687022900763359, "train_speed(iter/s)": 1.437486 }, { "epoch": 2.772374791140054, "grad_norm": 8.90654468536377, "learning_rate": 4.1484801928207824e-05, "loss": 2.401824188232422, "memory(GiB)": 77.56, "step": 64710, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.43746 }, { "epoch": 2.772589006469303, "grad_norm": 11.235576629638672, "learning_rate": 4.1478170543680664e-05, "loss": 2.220451545715332, "memory(GiB)": 77.56, "step": 64715, "token_acc": 0.5503875968992248, "train_speed(iter/s)": 1.437453 }, { "epoch": 2.7728032217985517, "grad_norm": 5.220694541931152, "learning_rate": 4.147153931353446e-05, "loss": 2.2256542205810548, "memory(GiB)": 77.56, "step": 64720, "token_acc": 0.4911660777385159, "train_speed(iter/s)": 1.437454 }, { "epoch": 2.773017437127801, "grad_norm": 6.951962471008301, "learning_rate": 4.1464908237889324e-05, "loss": 2.3189970016479493, "memory(GiB)": 77.56, "step": 64725, "token_acc": 0.5, "train_speed(iter/s)": 1.43748 }, { "epoch": 2.7732316524570497, "grad_norm": 5.977963924407959, "learning_rate": 4.145827731686536e-05, "loss": 2.4968643188476562, "memory(GiB)": 77.56, "step": 64730, "token_acc": 0.4520547945205479, "train_speed(iter/s)": 1.437491 }, { "epoch": 2.7734458677862985, "grad_norm": 8.230284690856934, "learning_rate": 4.145164655058273e-05, "loss": 2.6562217712402343, "memory(GiB)": 77.56, "step": 64735, "token_acc": 0.44654088050314467, "train_speed(iter/s)": 1.437518 }, { "epoch": 2.773660083115548, "grad_norm": 5.399183750152588, "learning_rate": 4.144501593916154e-05, "loss": 2.4795648574829103, "memory(GiB)": 77.56, "step": 64740, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.437532 }, { "epoch": 2.7738742984447966, "grad_norm": 4.590023040771484, "learning_rate": 4.1438385482721913e-05, "loss": 2.0696184158325197, "memory(GiB)": 77.56, "step": 64745, "token_acc": 0.5963636363636363, "train_speed(iter/s)": 1.437562 }, { "epoch": 2.7740885137740454, "grad_norm": 4.3344502449035645, "learning_rate": 4.143175518138397e-05, "loss": 2.6216949462890624, "memory(GiB)": 77.56, "step": 64750, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.437559 }, { "epoch": 2.7743027291032947, "grad_norm": 6.110363483428955, "learning_rate": 4.14251250352678e-05, "loss": 2.2581365585327147, "memory(GiB)": 77.56, "step": 64755, "token_acc": 0.4789272030651341, "train_speed(iter/s)": 1.437572 }, { "epoch": 2.7745169444325435, "grad_norm": 4.645518779754639, "learning_rate": 4.141849504449355e-05, "loss": 2.5854907989501954, "memory(GiB)": 77.56, "step": 64760, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.437591 }, { "epoch": 2.7747311597617923, "grad_norm": 4.550884246826172, "learning_rate": 4.141186520918132e-05, "loss": 2.709996223449707, "memory(GiB)": 77.56, "step": 64765, "token_acc": 0.445859872611465, "train_speed(iter/s)": 1.437575 }, { "epoch": 2.7749453750910416, "grad_norm": 5.34348201751709, "learning_rate": 4.140523552945118e-05, "loss": 2.696388816833496, "memory(GiB)": 77.56, "step": 64770, "token_acc": 0.45664739884393063, "train_speed(iter/s)": 1.43758 }, { "epoch": 2.7751595904202904, "grad_norm": 5.4660964012146, "learning_rate": 4.1398606005423284e-05, "loss": 2.664356994628906, "memory(GiB)": 77.56, "step": 64775, "token_acc": 0.4218289085545723, "train_speed(iter/s)": 1.437593 }, { "epoch": 2.775373805749539, "grad_norm": 7.453643798828125, "learning_rate": 4.13919766372177e-05, "loss": 2.270530891418457, "memory(GiB)": 77.56, "step": 64780, "token_acc": 0.5043478260869565, "train_speed(iter/s)": 1.437605 }, { "epoch": 2.7755880210787884, "grad_norm": 8.337597846984863, "learning_rate": 4.1385347424954526e-05, "loss": 2.6300540924072267, "memory(GiB)": 77.56, "step": 64785, "token_acc": 0.4405797101449275, "train_speed(iter/s)": 1.437622 }, { "epoch": 2.7758022364080372, "grad_norm": 5.461379528045654, "learning_rate": 4.137871836875387e-05, "loss": 2.4108423233032226, "memory(GiB)": 77.56, "step": 64790, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.437645 }, { "epoch": 2.776016451737286, "grad_norm": 5.3231024742126465, "learning_rate": 4.137208946873582e-05, "loss": 2.436798095703125, "memory(GiB)": 77.56, "step": 64795, "token_acc": 0.46863468634686345, "train_speed(iter/s)": 1.437645 }, { "epoch": 2.7762306670665353, "grad_norm": 7.851663112640381, "learning_rate": 4.1365460725020466e-05, "loss": 2.346063995361328, "memory(GiB)": 77.56, "step": 64800, "token_acc": 0.4495798319327731, "train_speed(iter/s)": 1.43764 }, { "epoch": 2.776444882395784, "grad_norm": 5.869938850402832, "learning_rate": 4.135883213772789e-05, "loss": 2.8061729431152345, "memory(GiB)": 77.56, "step": 64805, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.437648 }, { "epoch": 2.776659097725033, "grad_norm": 4.127633571624756, "learning_rate": 4.1352203706978186e-05, "loss": 2.3608001708984374, "memory(GiB)": 77.56, "step": 64810, "token_acc": 0.5, "train_speed(iter/s)": 1.437653 }, { "epoch": 2.776873313054282, "grad_norm": 5.486953258514404, "learning_rate": 4.134557543289141e-05, "loss": 2.616672897338867, "memory(GiB)": 77.56, "step": 64815, "token_acc": 0.48632218844984804, "train_speed(iter/s)": 1.437655 }, { "epoch": 2.777087528383531, "grad_norm": 6.279545307159424, "learning_rate": 4.1338947315587664e-05, "loss": 2.276936721801758, "memory(GiB)": 77.56, "step": 64820, "token_acc": 0.5202952029520295, "train_speed(iter/s)": 1.437659 }, { "epoch": 2.77730174371278, "grad_norm": 5.216151714324951, "learning_rate": 4.133231935518701e-05, "loss": 2.5696372985839844, "memory(GiB)": 77.56, "step": 64825, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.437638 }, { "epoch": 2.777515959042029, "grad_norm": 4.620060920715332, "learning_rate": 4.132569155180951e-05, "loss": 2.619928550720215, "memory(GiB)": 77.56, "step": 64830, "token_acc": 0.44358974358974357, "train_speed(iter/s)": 1.437645 }, { "epoch": 2.777730174371278, "grad_norm": 5.995078086853027, "learning_rate": 4.131906390557526e-05, "loss": 2.3185092926025392, "memory(GiB)": 77.56, "step": 64835, "token_acc": 0.4584837545126354, "train_speed(iter/s)": 1.43763 }, { "epoch": 2.7779443897005267, "grad_norm": 4.878668308258057, "learning_rate": 4.131243641660429e-05, "loss": 2.600627326965332, "memory(GiB)": 77.56, "step": 64840, "token_acc": 0.4740061162079511, "train_speed(iter/s)": 1.437633 }, { "epoch": 2.778158605029776, "grad_norm": 4.560103416442871, "learning_rate": 4.130580908501671e-05, "loss": 2.4108642578125, "memory(GiB)": 77.56, "step": 64845, "token_acc": 0.5111940298507462, "train_speed(iter/s)": 1.437647 }, { "epoch": 2.7783728203590248, "grad_norm": 6.4313578605651855, "learning_rate": 4.129918191093254e-05, "loss": 2.3842390060424803, "memory(GiB)": 77.56, "step": 64850, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.437667 }, { "epoch": 2.7785870356882736, "grad_norm": 5.557571887969971, "learning_rate": 4.1292554894471847e-05, "loss": 2.424895095825195, "memory(GiB)": 77.56, "step": 64855, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.437694 }, { "epoch": 2.778801251017523, "grad_norm": 5.516140937805176, "learning_rate": 4.1285928035754684e-05, "loss": 2.4183223724365233, "memory(GiB)": 77.56, "step": 64860, "token_acc": 0.4521072796934866, "train_speed(iter/s)": 1.437724 }, { "epoch": 2.7790154663467717, "grad_norm": 7.888944149017334, "learning_rate": 4.127930133490112e-05, "loss": 2.245655822753906, "memory(GiB)": 77.56, "step": 64865, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.437746 }, { "epoch": 2.7792296816760205, "grad_norm": 4.838222026824951, "learning_rate": 4.12726747920312e-05, "loss": 2.3545743942260744, "memory(GiB)": 77.56, "step": 64870, "token_acc": 0.47572815533980584, "train_speed(iter/s)": 1.437745 }, { "epoch": 2.7794438970052697, "grad_norm": 5.020430088043213, "learning_rate": 4.126604840726496e-05, "loss": 2.4572126388549806, "memory(GiB)": 77.56, "step": 64875, "token_acc": 0.512, "train_speed(iter/s)": 1.43776 }, { "epoch": 2.7796581123345185, "grad_norm": 5.911706924438477, "learning_rate": 4.125942218072244e-05, "loss": 2.3762020111083983, "memory(GiB)": 77.56, "step": 64880, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.437762 }, { "epoch": 2.7798723276637674, "grad_norm": 4.535211563110352, "learning_rate": 4.125279611252369e-05, "loss": 2.378687286376953, "memory(GiB)": 77.56, "step": 64885, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.437767 }, { "epoch": 2.7800865429930166, "grad_norm": 5.900091171264648, "learning_rate": 4.124617020278875e-05, "loss": 2.5336162567138674, "memory(GiB)": 77.56, "step": 64890, "token_acc": 0.4732824427480916, "train_speed(iter/s)": 1.43777 }, { "epoch": 2.7803007583222654, "grad_norm": 4.068755149841309, "learning_rate": 4.1239544451637646e-05, "loss": 2.3729990005493162, "memory(GiB)": 77.56, "step": 64895, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.43776 }, { "epoch": 2.7805149736515142, "grad_norm": 6.493914604187012, "learning_rate": 4.12329188591904e-05, "loss": 2.2886421203613283, "memory(GiB)": 77.56, "step": 64900, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 1.437772 }, { "epoch": 2.7807291889807635, "grad_norm": 8.492538452148438, "learning_rate": 4.122629342556706e-05, "loss": 2.275677490234375, "memory(GiB)": 77.56, "step": 64905, "token_acc": 0.498371335504886, "train_speed(iter/s)": 1.437771 }, { "epoch": 2.7809434043100123, "grad_norm": 5.051513195037842, "learning_rate": 4.121966815088766e-05, "loss": 2.4003904342651365, "memory(GiB)": 77.56, "step": 64910, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.437758 }, { "epoch": 2.781157619639261, "grad_norm": 5.041954040527344, "learning_rate": 4.1213043035272184e-05, "loss": 2.3659652709960937, "memory(GiB)": 77.56, "step": 64915, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.437754 }, { "epoch": 2.7813718349685104, "grad_norm": 6.168399810791016, "learning_rate": 4.12064180788407e-05, "loss": 2.443279838562012, "memory(GiB)": 77.56, "step": 64920, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.437744 }, { "epoch": 2.781586050297759, "grad_norm": 5.1472673416137695, "learning_rate": 4.1199793281713176e-05, "loss": 2.4265264511108398, "memory(GiB)": 77.56, "step": 64925, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.437736 }, { "epoch": 2.781800265627008, "grad_norm": 6.6545586585998535, "learning_rate": 4.119316864400967e-05, "loss": 2.37560977935791, "memory(GiB)": 77.56, "step": 64930, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.437722 }, { "epoch": 2.7820144809562573, "grad_norm": 4.588661193847656, "learning_rate": 4.118654416585015e-05, "loss": 2.3643335342407226, "memory(GiB)": 77.56, "step": 64935, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.43774 }, { "epoch": 2.782228696285506, "grad_norm": 6.41018533706665, "learning_rate": 4.117991984735468e-05, "loss": 2.5099355697631838, "memory(GiB)": 77.56, "step": 64940, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.437713 }, { "epoch": 2.782442911614755, "grad_norm": 4.999509811401367, "learning_rate": 4.117329568864322e-05, "loss": 2.3964168548583986, "memory(GiB)": 77.56, "step": 64945, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.437724 }, { "epoch": 2.782657126944004, "grad_norm": 5.6082868576049805, "learning_rate": 4.11666716898358e-05, "loss": 2.533946418762207, "memory(GiB)": 77.56, "step": 64950, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.437719 }, { "epoch": 2.782871342273253, "grad_norm": 6.538122653961182, "learning_rate": 4.116004785105241e-05, "loss": 2.4954124450683595, "memory(GiB)": 77.56, "step": 64955, "token_acc": 0.49615384615384617, "train_speed(iter/s)": 1.43772 }, { "epoch": 2.7830855576025018, "grad_norm": 4.925257682800293, "learning_rate": 4.115342417241304e-05, "loss": 2.2166458129882813, "memory(GiB)": 77.56, "step": 64960, "token_acc": 0.5363984674329502, "train_speed(iter/s)": 1.437751 }, { "epoch": 2.783299772931751, "grad_norm": 6.454108238220215, "learning_rate": 4.114680065403769e-05, "loss": 2.402741050720215, "memory(GiB)": 77.56, "step": 64965, "token_acc": 0.46996466431095407, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.783513988261, "grad_norm": 5.182466506958008, "learning_rate": 4.114017729604635e-05, "loss": 2.4790927886962892, "memory(GiB)": 77.56, "step": 64970, "token_acc": 0.4628975265017668, "train_speed(iter/s)": 1.437753 }, { "epoch": 2.783728203590249, "grad_norm": 6.383825302124023, "learning_rate": 4.113355409855901e-05, "loss": 2.2500200271606445, "memory(GiB)": 77.56, "step": 64975, "token_acc": 0.5, "train_speed(iter/s)": 1.437739 }, { "epoch": 2.783942418919498, "grad_norm": 4.710538864135742, "learning_rate": 4.1126931061695656e-05, "loss": 2.3343473434448243, "memory(GiB)": 77.56, "step": 64980, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 1.437734 }, { "epoch": 2.7841566342487467, "grad_norm": 6.233351230621338, "learning_rate": 4.112030818557626e-05, "loss": 2.7330272674560545, "memory(GiB)": 77.56, "step": 64985, "token_acc": 0.40418118466898956, "train_speed(iter/s)": 1.437732 }, { "epoch": 2.784370849577996, "grad_norm": 6.382244110107422, "learning_rate": 4.111368547032083e-05, "loss": 2.7764728546142576, "memory(GiB)": 77.56, "step": 64990, "token_acc": 0.4460431654676259, "train_speed(iter/s)": 1.437729 }, { "epoch": 2.784585064907245, "grad_norm": 6.608416557312012, "learning_rate": 4.110706291604931e-05, "loss": 2.515018653869629, "memory(GiB)": 77.56, "step": 64995, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.437751 }, { "epoch": 2.7847992802364936, "grad_norm": 6.346921443939209, "learning_rate": 4.110044052288169e-05, "loss": 2.5534263610839845, "memory(GiB)": 77.56, "step": 65000, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.437758 }, { "epoch": 2.7847992802364936, "eval_loss": 2.2311465740203857, "eval_runtime": 14.4638, "eval_samples_per_second": 6.914, "eval_steps_per_second": 6.914, "eval_token_acc": 0.4702774108322325, "step": 65000 }, { "epoch": 2.785013495565743, "grad_norm": 5.451372146606445, "learning_rate": 4.109381829093792e-05, "loss": 2.2569353103637697, "memory(GiB)": 77.56, "step": 65005, "token_acc": 0.48096192384769537, "train_speed(iter/s)": 1.437266 }, { "epoch": 2.7852277108949917, "grad_norm": 5.394570827484131, "learning_rate": 4.108719622033801e-05, "loss": 2.6775018692016603, "memory(GiB)": 77.56, "step": 65010, "token_acc": 0.4454828660436137, "train_speed(iter/s)": 1.437295 }, { "epoch": 2.7854419262242405, "grad_norm": 5.483050346374512, "learning_rate": 4.10805743112019e-05, "loss": 2.6770463943481446, "memory(GiB)": 77.56, "step": 65015, "token_acc": 0.45075757575757575, "train_speed(iter/s)": 1.437307 }, { "epoch": 2.7856561415534897, "grad_norm": 6.613809585571289, "learning_rate": 4.1073952563649546e-05, "loss": 2.2119678497314452, "memory(GiB)": 77.56, "step": 65020, "token_acc": 0.5387596899224806, "train_speed(iter/s)": 1.437274 }, { "epoch": 2.7858703568827385, "grad_norm": 4.505990028381348, "learning_rate": 4.1067330977800924e-05, "loss": 2.5043289184570314, "memory(GiB)": 77.56, "step": 65025, "token_acc": 0.4456140350877193, "train_speed(iter/s)": 1.437284 }, { "epoch": 2.7860845722119874, "grad_norm": 6.0927324295043945, "learning_rate": 4.106070955377597e-05, "loss": 2.2570865631103514, "memory(GiB)": 77.56, "step": 65030, "token_acc": 0.5149253731343284, "train_speed(iter/s)": 1.437256 }, { "epoch": 2.7862987875412366, "grad_norm": 5.915740489959717, "learning_rate": 4.105408829169466e-05, "loss": 2.4408477783203124, "memory(GiB)": 77.56, "step": 65035, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.437251 }, { "epoch": 2.7865130028704854, "grad_norm": 4.612392902374268, "learning_rate": 4.104746719167693e-05, "loss": 2.6966320037841798, "memory(GiB)": 77.56, "step": 65040, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.437253 }, { "epoch": 2.7867272181997342, "grad_norm": 5.180809497833252, "learning_rate": 4.104084625384272e-05, "loss": 2.607496452331543, "memory(GiB)": 77.56, "step": 65045, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.437274 }, { "epoch": 2.7869414335289835, "grad_norm": 4.020688056945801, "learning_rate": 4.103422547831199e-05, "loss": 2.6246448516845704, "memory(GiB)": 77.56, "step": 65050, "token_acc": 0.44039735099337746, "train_speed(iter/s)": 1.43729 }, { "epoch": 2.7871556488582323, "grad_norm": 4.693151473999023, "learning_rate": 4.102760486520468e-05, "loss": 2.650675582885742, "memory(GiB)": 77.56, "step": 65055, "token_acc": 0.4641638225255973, "train_speed(iter/s)": 1.437312 }, { "epoch": 2.787369864187481, "grad_norm": 6.454774856567383, "learning_rate": 4.1020984414640716e-05, "loss": 2.5119747161865233, "memory(GiB)": 77.56, "step": 65060, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.43732 }, { "epoch": 2.7875840795167304, "grad_norm": 5.322110176086426, "learning_rate": 4.1014364126740056e-05, "loss": 2.2648176193237304, "memory(GiB)": 77.56, "step": 65065, "token_acc": 0.5177304964539007, "train_speed(iter/s)": 1.437304 }, { "epoch": 2.787798294845979, "grad_norm": 9.234334945678711, "learning_rate": 4.100774400162261e-05, "loss": 2.3098682403564452, "memory(GiB)": 77.56, "step": 65070, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 1.437321 }, { "epoch": 2.788012510175228, "grad_norm": 6.000372409820557, "learning_rate": 4.100112403940832e-05, "loss": 2.0317359924316407, "memory(GiB)": 77.56, "step": 65075, "token_acc": 0.5755395683453237, "train_speed(iter/s)": 1.437326 }, { "epoch": 2.7882267255044773, "grad_norm": 6.502902030944824, "learning_rate": 4.099450424021709e-05, "loss": 2.38585205078125, "memory(GiB)": 77.56, "step": 65080, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.437344 }, { "epoch": 2.788440940833726, "grad_norm": 7.568596839904785, "learning_rate": 4.0987884604168886e-05, "loss": 2.4225059509277345, "memory(GiB)": 77.56, "step": 65085, "token_acc": 0.5015105740181269, "train_speed(iter/s)": 1.437366 }, { "epoch": 2.788655156162975, "grad_norm": 4.692899703979492, "learning_rate": 4.09812651313836e-05, "loss": 2.3116535186767577, "memory(GiB)": 77.56, "step": 65090, "token_acc": 0.4657534246575342, "train_speed(iter/s)": 1.437339 }, { "epoch": 2.788869371492224, "grad_norm": 4.729672908782959, "learning_rate": 4.097464582198116e-05, "loss": 2.1429588317871096, "memory(GiB)": 77.56, "step": 65095, "token_acc": 0.5666666666666667, "train_speed(iter/s)": 1.43737 }, { "epoch": 2.789083586821473, "grad_norm": 4.973909854888916, "learning_rate": 4.0968026676081474e-05, "loss": 2.6030345916748048, "memory(GiB)": 77.56, "step": 65100, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.43736 }, { "epoch": 2.7892978021507218, "grad_norm": 6.733554840087891, "learning_rate": 4.096140769380445e-05, "loss": 2.4143299102783202, "memory(GiB)": 77.56, "step": 65105, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.437348 }, { "epoch": 2.789512017479971, "grad_norm": 4.743870735168457, "learning_rate": 4.095478887527002e-05, "loss": 2.7167331695556642, "memory(GiB)": 77.56, "step": 65110, "token_acc": 0.40869565217391307, "train_speed(iter/s)": 1.437346 }, { "epoch": 2.78972623280922, "grad_norm": 5.542399883270264, "learning_rate": 4.094817022059806e-05, "loss": 2.4073665618896483, "memory(GiB)": 77.56, "step": 65115, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.437346 }, { "epoch": 2.7899404481384686, "grad_norm": 5.528306484222412, "learning_rate": 4.09415517299085e-05, "loss": 2.686728858947754, "memory(GiB)": 77.56, "step": 65120, "token_acc": 0.44816053511705684, "train_speed(iter/s)": 1.437352 }, { "epoch": 2.790154663467718, "grad_norm": 4.568327903747559, "learning_rate": 4.0934933403321226e-05, "loss": 2.0923236846923827, "memory(GiB)": 77.56, "step": 65125, "token_acc": 0.5505226480836237, "train_speed(iter/s)": 1.437362 }, { "epoch": 2.7903688787969667, "grad_norm": 6.15056848526001, "learning_rate": 4.0928315240956134e-05, "loss": 2.405792236328125, "memory(GiB)": 77.56, "step": 65130, "token_acc": 0.5190476190476191, "train_speed(iter/s)": 1.437342 }, { "epoch": 2.7905830941262155, "grad_norm": 5.395101070404053, "learning_rate": 4.0921697242933125e-05, "loss": 2.421052932739258, "memory(GiB)": 77.56, "step": 65135, "token_acc": 0.49240121580547114, "train_speed(iter/s)": 1.437368 }, { "epoch": 2.790797309455465, "grad_norm": 5.541062355041504, "learning_rate": 4.0915079409372094e-05, "loss": 2.3892696380615233, "memory(GiB)": 77.56, "step": 65140, "token_acc": 0.46006389776357826, "train_speed(iter/s)": 1.437367 }, { "epoch": 2.7910115247847136, "grad_norm": 5.859163284301758, "learning_rate": 4.09084617403929e-05, "loss": 2.734787368774414, "memory(GiB)": 77.56, "step": 65145, "token_acc": 0.45317220543806647, "train_speed(iter/s)": 1.437355 }, { "epoch": 2.7912257401139624, "grad_norm": 5.762321949005127, "learning_rate": 4.0901844236115464e-05, "loss": 2.5448539733886717, "memory(GiB)": 77.56, "step": 65150, "token_acc": 0.4662576687116564, "train_speed(iter/s)": 1.437345 }, { "epoch": 2.7914399554432117, "grad_norm": 6.067520618438721, "learning_rate": 4.089522689665964e-05, "loss": 2.402146911621094, "memory(GiB)": 77.56, "step": 65155, "token_acc": 0.48297213622291024, "train_speed(iter/s)": 1.437336 }, { "epoch": 2.7916541707724605, "grad_norm": 4.517601490020752, "learning_rate": 4.088860972214534e-05, "loss": 2.391473388671875, "memory(GiB)": 77.56, "step": 65160, "token_acc": 0.45251396648044695, "train_speed(iter/s)": 1.43735 }, { "epoch": 2.7918683861017093, "grad_norm": 5.006802558898926, "learning_rate": 4.088199271269241e-05, "loss": 2.1704885482788088, "memory(GiB)": 77.56, "step": 65165, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.437341 }, { "epoch": 2.7920826014309585, "grad_norm": 4.754733085632324, "learning_rate": 4.087537586842074e-05, "loss": 2.5851518630981447, "memory(GiB)": 77.56, "step": 65170, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.43734 }, { "epoch": 2.7922968167602074, "grad_norm": 4.961844444274902, "learning_rate": 4.086875918945019e-05, "loss": 2.3648906707763673, "memory(GiB)": 77.56, "step": 65175, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.437365 }, { "epoch": 2.792511032089456, "grad_norm": 5.78552770614624, "learning_rate": 4.0862142675900645e-05, "loss": 2.576050567626953, "memory(GiB)": 77.56, "step": 65180, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.43739 }, { "epoch": 2.7927252474187054, "grad_norm": 5.701693534851074, "learning_rate": 4.0855526327891956e-05, "loss": 2.504083442687988, "memory(GiB)": 77.56, "step": 65185, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.437397 }, { "epoch": 2.7929394627479542, "grad_norm": 6.094283580780029, "learning_rate": 4.084891014554398e-05, "loss": 2.5823450088500977, "memory(GiB)": 77.56, "step": 65190, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.437411 }, { "epoch": 2.793153678077203, "grad_norm": 4.283876419067383, "learning_rate": 4.0842294128976586e-05, "loss": 2.4167831420898436, "memory(GiB)": 77.56, "step": 65195, "token_acc": 0.5046439628482973, "train_speed(iter/s)": 1.437396 }, { "epoch": 2.7933678934064523, "grad_norm": 6.271384239196777, "learning_rate": 4.083567827830962e-05, "loss": 2.320393371582031, "memory(GiB)": 77.56, "step": 65200, "token_acc": 0.5143769968051118, "train_speed(iter/s)": 1.437432 }, { "epoch": 2.793582108735701, "grad_norm": 7.535311698913574, "learning_rate": 4.0829062593662944e-05, "loss": 2.4804813385009767, "memory(GiB)": 77.56, "step": 65205, "token_acc": 0.43388429752066116, "train_speed(iter/s)": 1.437431 }, { "epoch": 2.79379632406495, "grad_norm": 7.255420207977295, "learning_rate": 4.08224470751564e-05, "loss": 2.4209873199462892, "memory(GiB)": 77.56, "step": 65210, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.437446 }, { "epoch": 2.794010539394199, "grad_norm": 5.833639621734619, "learning_rate": 4.081583172290983e-05, "loss": 2.108981895446777, "memory(GiB)": 77.56, "step": 65215, "token_acc": 0.49034749034749037, "train_speed(iter/s)": 1.437426 }, { "epoch": 2.794224754723448, "grad_norm": 5.483102798461914, "learning_rate": 4.080921653704309e-05, "loss": 2.21221809387207, "memory(GiB)": 77.56, "step": 65220, "token_acc": 0.49794238683127573, "train_speed(iter/s)": 1.437421 }, { "epoch": 2.794438970052697, "grad_norm": 5.654956817626953, "learning_rate": 4.080260151767602e-05, "loss": 2.4338405609130858, "memory(GiB)": 77.56, "step": 65225, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.43744 }, { "epoch": 2.794653185381946, "grad_norm": 5.657629489898682, "learning_rate": 4.079598666492843e-05, "loss": 2.381987380981445, "memory(GiB)": 77.56, "step": 65230, "token_acc": 0.5036231884057971, "train_speed(iter/s)": 1.437441 }, { "epoch": 2.794867400711195, "grad_norm": 4.7143096923828125, "learning_rate": 4.0789371978920185e-05, "loss": 2.6112834930419924, "memory(GiB)": 77.56, "step": 65235, "token_acc": 0.49363057324840764, "train_speed(iter/s)": 1.437424 }, { "epoch": 2.7950816160404437, "grad_norm": 5.765244007110596, "learning_rate": 4.078275745977112e-05, "loss": 2.4692554473876953, "memory(GiB)": 77.56, "step": 65240, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 1.437394 }, { "epoch": 2.795295831369693, "grad_norm": 6.623595714569092, "learning_rate": 4.0776143107601037e-05, "loss": 2.2529712677001954, "memory(GiB)": 77.56, "step": 65245, "token_acc": 0.5490909090909091, "train_speed(iter/s)": 1.437409 }, { "epoch": 2.7955100466989418, "grad_norm": 4.823298931121826, "learning_rate": 4.076952892252977e-05, "loss": 2.239645004272461, "memory(GiB)": 77.56, "step": 65250, "token_acc": 0.5176056338028169, "train_speed(iter/s)": 1.437428 }, { "epoch": 2.7957242620281906, "grad_norm": 5.661194801330566, "learning_rate": 4.0762914904677165e-05, "loss": 2.4171648025512695, "memory(GiB)": 77.56, "step": 65255, "token_acc": 0.4425287356321839, "train_speed(iter/s)": 1.437446 }, { "epoch": 2.79593847735744, "grad_norm": 5.11198616027832, "learning_rate": 4.0756301054163004e-05, "loss": 2.558170700073242, "memory(GiB)": 77.56, "step": 65260, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.43742 }, { "epoch": 2.7961526926866886, "grad_norm": 4.3902153968811035, "learning_rate": 4.074968737110713e-05, "loss": 2.3321956634521483, "memory(GiB)": 77.56, "step": 65265, "token_acc": 0.5015479876160991, "train_speed(iter/s)": 1.437399 }, { "epoch": 2.7963669080159375, "grad_norm": 5.167755126953125, "learning_rate": 4.0743073855629355e-05, "loss": 2.502057647705078, "memory(GiB)": 77.56, "step": 65270, "token_acc": 0.45645645645645644, "train_speed(iter/s)": 1.437402 }, { "epoch": 2.7965811233451867, "grad_norm": 5.482974529266357, "learning_rate": 4.073646050784946e-05, "loss": 2.311526298522949, "memory(GiB)": 77.56, "step": 65275, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.437403 }, { "epoch": 2.7967953386744355, "grad_norm": 5.12534761428833, "learning_rate": 4.072984732788729e-05, "loss": 2.6396320343017576, "memory(GiB)": 77.56, "step": 65280, "token_acc": 0.45645645645645644, "train_speed(iter/s)": 1.437412 }, { "epoch": 2.7970095540036843, "grad_norm": 7.030798435211182, "learning_rate": 4.072323431586263e-05, "loss": 2.56494026184082, "memory(GiB)": 77.56, "step": 65285, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.437414 }, { "epoch": 2.7972237693329336, "grad_norm": 5.001982688903809, "learning_rate": 4.0716621471895275e-05, "loss": 2.4871023178100584, "memory(GiB)": 77.56, "step": 65290, "token_acc": 0.5073529411764706, "train_speed(iter/s)": 1.437415 }, { "epoch": 2.7974379846621824, "grad_norm": 4.885438919067383, "learning_rate": 4.0710008796105034e-05, "loss": 2.155305099487305, "memory(GiB)": 77.56, "step": 65295, "token_acc": 0.5690376569037657, "train_speed(iter/s)": 1.437424 }, { "epoch": 2.797652199991431, "grad_norm": 6.164182662963867, "learning_rate": 4.0703396288611694e-05, "loss": 2.4674415588378906, "memory(GiB)": 77.56, "step": 65300, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.437421 }, { "epoch": 2.7978664153206805, "grad_norm": 5.3449249267578125, "learning_rate": 4.069678394953505e-05, "loss": 2.2658706665039063, "memory(GiB)": 77.56, "step": 65305, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.43743 }, { "epoch": 2.7980806306499293, "grad_norm": 7.248142719268799, "learning_rate": 4.069017177899489e-05, "loss": 2.6356481552124023, "memory(GiB)": 77.56, "step": 65310, "token_acc": 0.43859649122807015, "train_speed(iter/s)": 1.437432 }, { "epoch": 2.7982948459791785, "grad_norm": 6.9197869300842285, "learning_rate": 4.0683559777111014e-05, "loss": 2.498550224304199, "memory(GiB)": 77.56, "step": 65315, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.437447 }, { "epoch": 2.7985090613084274, "grad_norm": 4.77265739440918, "learning_rate": 4.0676947944003175e-05, "loss": 2.079168701171875, "memory(GiB)": 77.56, "step": 65320, "token_acc": 0.5611814345991561, "train_speed(iter/s)": 1.43747 }, { "epoch": 2.798723276637676, "grad_norm": 6.321204662322998, "learning_rate": 4.0670336279791186e-05, "loss": 2.6338876724243163, "memory(GiB)": 77.56, "step": 65325, "token_acc": 0.4461538461538462, "train_speed(iter/s)": 1.437468 }, { "epoch": 2.7989374919669254, "grad_norm": 6.061026096343994, "learning_rate": 4.066372478459481e-05, "loss": 2.3017366409301756, "memory(GiB)": 77.56, "step": 65330, "token_acc": 0.5247148288973384, "train_speed(iter/s)": 1.437462 }, { "epoch": 2.7991517072961742, "grad_norm": 4.916214466094971, "learning_rate": 4.06571134585338e-05, "loss": 2.3075334548950197, "memory(GiB)": 77.56, "step": 65335, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.437431 }, { "epoch": 2.799365922625423, "grad_norm": 5.356903076171875, "learning_rate": 4.065050230172796e-05, "loss": 2.2832189559936524, "memory(GiB)": 77.56, "step": 65340, "token_acc": 0.5, "train_speed(iter/s)": 1.437436 }, { "epoch": 2.7995801379546723, "grad_norm": 5.1805315017700195, "learning_rate": 4.064389131429704e-05, "loss": 2.3636260986328126, "memory(GiB)": 77.56, "step": 65345, "token_acc": 0.5134328358208955, "train_speed(iter/s)": 1.437442 }, { "epoch": 2.799794353283921, "grad_norm": 5.781987190246582, "learning_rate": 4.0637280496360795e-05, "loss": 2.7125370025634767, "memory(GiB)": 77.56, "step": 65350, "token_acc": 0.4319526627218935, "train_speed(iter/s)": 1.437446 }, { "epoch": 2.80000856861317, "grad_norm": 6.091648578643799, "learning_rate": 4.0630669848039005e-05, "loss": 2.214900016784668, "memory(GiB)": 77.56, "step": 65355, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.437444 }, { "epoch": 2.800222783942419, "grad_norm": 4.333321571350098, "learning_rate": 4.0624059369451415e-05, "loss": 2.4089054107666015, "memory(GiB)": 77.56, "step": 65360, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.437449 }, { "epoch": 2.800436999271668, "grad_norm": 6.672257423400879, "learning_rate": 4.061744906071779e-05, "loss": 2.5361865997314452, "memory(GiB)": 77.56, "step": 65365, "token_acc": 0.4682080924855491, "train_speed(iter/s)": 1.437462 }, { "epoch": 2.800651214600917, "grad_norm": 5.3495025634765625, "learning_rate": 4.061083892195788e-05, "loss": 2.330461883544922, "memory(GiB)": 77.56, "step": 65370, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.437455 }, { "epoch": 2.800865429930166, "grad_norm": 8.395328521728516, "learning_rate": 4.0604228953291404e-05, "loss": 2.4021318435668944, "memory(GiB)": 77.56, "step": 65375, "token_acc": 0.4901185770750988, "train_speed(iter/s)": 1.437436 }, { "epoch": 2.801079645259415, "grad_norm": 12.81687068939209, "learning_rate": 4.059761915483815e-05, "loss": 2.1439666748046875, "memory(GiB)": 77.56, "step": 65380, "token_acc": 0.5141700404858299, "train_speed(iter/s)": 1.43744 }, { "epoch": 2.8012938605886637, "grad_norm": 4.305462837219238, "learning_rate": 4.059100952671786e-05, "loss": 2.4263351440429686, "memory(GiB)": 77.56, "step": 65385, "token_acc": 0.5045592705167173, "train_speed(iter/s)": 1.437462 }, { "epoch": 2.801508075917913, "grad_norm": 8.798970222473145, "learning_rate": 4.058440006905025e-05, "loss": 2.534514236450195, "memory(GiB)": 77.56, "step": 65390, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.437478 }, { "epoch": 2.8017222912471618, "grad_norm": 6.869740962982178, "learning_rate": 4.057779078195506e-05, "loss": 2.173324966430664, "memory(GiB)": 77.56, "step": 65395, "token_acc": 0.5037037037037037, "train_speed(iter/s)": 1.437484 }, { "epoch": 2.8019365065764106, "grad_norm": 4.36246395111084, "learning_rate": 4.0571181665552035e-05, "loss": 2.2432683944702148, "memory(GiB)": 77.56, "step": 65400, "token_acc": 0.5131086142322098, "train_speed(iter/s)": 1.437474 }, { "epoch": 2.80215072190566, "grad_norm": 4.669100284576416, "learning_rate": 4.05645727199609e-05, "loss": 2.628772163391113, "memory(GiB)": 77.56, "step": 65405, "token_acc": 0.45925925925925926, "train_speed(iter/s)": 1.437476 }, { "epoch": 2.8023649372349086, "grad_norm": 5.992696762084961, "learning_rate": 4.055796394530138e-05, "loss": 2.3717302322387694, "memory(GiB)": 77.56, "step": 65410, "token_acc": 0.49673202614379086, "train_speed(iter/s)": 1.437499 }, { "epoch": 2.8025791525641575, "grad_norm": 6.627432346343994, "learning_rate": 4.05513553416932e-05, "loss": 2.2462282180786133, "memory(GiB)": 77.56, "step": 65415, "token_acc": 0.5175438596491229, "train_speed(iter/s)": 1.437509 }, { "epoch": 2.8027933678934067, "grad_norm": 5.615395545959473, "learning_rate": 4.054474690925607e-05, "loss": 2.4067811965942383, "memory(GiB)": 77.56, "step": 65420, "token_acc": 0.4794007490636704, "train_speed(iter/s)": 1.437509 }, { "epoch": 2.8030075832226555, "grad_norm": 3.8676412105560303, "learning_rate": 4.053813864810974e-05, "loss": 2.3095130920410156, "memory(GiB)": 77.56, "step": 65425, "token_acc": 0.4811594202898551, "train_speed(iter/s)": 1.437499 }, { "epoch": 2.8032217985519043, "grad_norm": 4.624801158905029, "learning_rate": 4.05315305583739e-05, "loss": 2.5628631591796873, "memory(GiB)": 77.56, "step": 65430, "token_acc": 0.4584615384615385, "train_speed(iter/s)": 1.437508 }, { "epoch": 2.8034360138811536, "grad_norm": 7.322268009185791, "learning_rate": 4.052492264016825e-05, "loss": 2.681002616882324, "memory(GiB)": 77.56, "step": 65435, "token_acc": 0.44376899696048633, "train_speed(iter/s)": 1.437509 }, { "epoch": 2.8036502292104024, "grad_norm": 5.505902290344238, "learning_rate": 4.0518314893612535e-05, "loss": 2.4549976348876954, "memory(GiB)": 77.56, "step": 65440, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.437527 }, { "epoch": 2.8038644445396512, "grad_norm": 6.477055072784424, "learning_rate": 4.0511707318826426e-05, "loss": 2.3828807830810548, "memory(GiB)": 77.56, "step": 65445, "token_acc": 0.5034722222222222, "train_speed(iter/s)": 1.437539 }, { "epoch": 2.8040786598689005, "grad_norm": 5.5579752922058105, "learning_rate": 4.050509991592964e-05, "loss": 2.4007511138916016, "memory(GiB)": 77.56, "step": 65450, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.437539 }, { "epoch": 2.8042928751981493, "grad_norm": 6.419618129730225, "learning_rate": 4.049849268504187e-05, "loss": 2.0940181732177736, "memory(GiB)": 77.56, "step": 65455, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.437531 }, { "epoch": 2.804507090527398, "grad_norm": 6.002536773681641, "learning_rate": 4.0491885626282836e-05, "loss": 2.3335418701171875, "memory(GiB)": 77.56, "step": 65460, "token_acc": 0.5101351351351351, "train_speed(iter/s)": 1.437534 }, { "epoch": 2.8047213058566474, "grad_norm": 6.772785186767578, "learning_rate": 4.04852787397722e-05, "loss": 2.3547508239746096, "memory(GiB)": 77.56, "step": 65465, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437537 }, { "epoch": 2.804935521185896, "grad_norm": 5.774203300476074, "learning_rate": 4.047867202562967e-05, "loss": 2.6899650573730467, "memory(GiB)": 77.56, "step": 65470, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.437568 }, { "epoch": 2.805149736515145, "grad_norm": 6.2780561447143555, "learning_rate": 4.0472065483974933e-05, "loss": 2.2603769302368164, "memory(GiB)": 77.56, "step": 65475, "token_acc": 0.5111940298507462, "train_speed(iter/s)": 1.437578 }, { "epoch": 2.8053639518443942, "grad_norm": 5.402710437774658, "learning_rate": 4.046545911492766e-05, "loss": 2.4195978164672853, "memory(GiB)": 77.56, "step": 65480, "token_acc": 0.49142857142857144, "train_speed(iter/s)": 1.437545 }, { "epoch": 2.805578167173643, "grad_norm": 5.430166244506836, "learning_rate": 4.0458852918607545e-05, "loss": 2.3975383758544924, "memory(GiB)": 77.56, "step": 65485, "token_acc": 0.5, "train_speed(iter/s)": 1.437558 }, { "epoch": 2.805792382502892, "grad_norm": 6.7078633308410645, "learning_rate": 4.0452246895134266e-05, "loss": 2.6554012298583984, "memory(GiB)": 77.56, "step": 65490, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.437502 }, { "epoch": 2.806006597832141, "grad_norm": 4.563215255737305, "learning_rate": 4.044564104462747e-05, "loss": 2.5749319076538084, "memory(GiB)": 77.56, "step": 65495, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437526 }, { "epoch": 2.80622081316139, "grad_norm": 5.335965156555176, "learning_rate": 4.0439035367206875e-05, "loss": 2.300836372375488, "memory(GiB)": 77.56, "step": 65500, "token_acc": 0.50390625, "train_speed(iter/s)": 1.43752 }, { "epoch": 2.80622081316139, "eval_loss": 2.1007239818573, "eval_runtime": 14.5777, "eval_samples_per_second": 6.86, "eval_steps_per_second": 6.86, "eval_token_acc": 0.4951321279554937, "step": 65500 }, { "epoch": 2.8064350284906388, "grad_norm": 6.0830817222595215, "learning_rate": 4.04324298629921e-05, "loss": 2.215840530395508, "memory(GiB)": 77.56, "step": 65505, "token_acc": 0.49278152069297404, "train_speed(iter/s)": 1.437018 }, { "epoch": 2.806649243819888, "grad_norm": 4.430671691894531, "learning_rate": 4.042582453210285e-05, "loss": 2.4220972061157227, "memory(GiB)": 77.56, "step": 65510, "token_acc": 0.5220125786163522, "train_speed(iter/s)": 1.437021 }, { "epoch": 2.806863459149137, "grad_norm": 7.196639060974121, "learning_rate": 4.0419219374658766e-05, "loss": 2.4943925857543947, "memory(GiB)": 77.56, "step": 65515, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.437042 }, { "epoch": 2.8070776744783856, "grad_norm": 6.069049835205078, "learning_rate": 4.04126143907795e-05, "loss": 2.7378908157348634, "memory(GiB)": 77.56, "step": 65520, "token_acc": 0.44525547445255476, "train_speed(iter/s)": 1.437009 }, { "epoch": 2.807291889807635, "grad_norm": 6.978514671325684, "learning_rate": 4.040600958058471e-05, "loss": 2.1614658355712892, "memory(GiB)": 77.56, "step": 65525, "token_acc": 0.5316901408450704, "train_speed(iter/s)": 1.437019 }, { "epoch": 2.8075061051368837, "grad_norm": 7.327071666717529, "learning_rate": 4.039940494419407e-05, "loss": 2.451673889160156, "memory(GiB)": 77.56, "step": 65530, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.437018 }, { "epoch": 2.8077203204661325, "grad_norm": 7.619434833526611, "learning_rate": 4.0392800481727224e-05, "loss": 2.153657913208008, "memory(GiB)": 77.56, "step": 65535, "token_acc": 0.51171875, "train_speed(iter/s)": 1.437017 }, { "epoch": 2.8079345357953818, "grad_norm": 6.128969192504883, "learning_rate": 4.03861961933038e-05, "loss": 2.597476577758789, "memory(GiB)": 77.56, "step": 65540, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.437035 }, { "epoch": 2.8081487511246306, "grad_norm": 5.445517539978027, "learning_rate": 4.037959207904346e-05, "loss": 2.3313583374023437, "memory(GiB)": 77.56, "step": 65545, "token_acc": 0.4727272727272727, "train_speed(iter/s)": 1.437057 }, { "epoch": 2.8083629664538794, "grad_norm": 6.074429988861084, "learning_rate": 4.0372988139065824e-05, "loss": 2.5054134368896483, "memory(GiB)": 77.56, "step": 65550, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 1.437056 }, { "epoch": 2.8085771817831287, "grad_norm": 4.698348522186279, "learning_rate": 4.036638437349054e-05, "loss": 2.0462865829467773, "memory(GiB)": 77.56, "step": 65555, "token_acc": 0.5846153846153846, "train_speed(iter/s)": 1.437055 }, { "epoch": 2.8087913971123775, "grad_norm": 5.37380838394165, "learning_rate": 4.035978078243725e-05, "loss": 2.6219940185546875, "memory(GiB)": 77.56, "step": 65560, "token_acc": 0.45535714285714285, "train_speed(iter/s)": 1.437077 }, { "epoch": 2.8090056124416263, "grad_norm": 5.415415287017822, "learning_rate": 4.0353177366025565e-05, "loss": 2.3958534240722655, "memory(GiB)": 77.56, "step": 65565, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437099 }, { "epoch": 2.8092198277708755, "grad_norm": 4.243507385253906, "learning_rate": 4.0346574124375126e-05, "loss": 2.2483856201171877, "memory(GiB)": 77.56, "step": 65570, "token_acc": 0.5480427046263345, "train_speed(iter/s)": 1.437122 }, { "epoch": 2.8094340431001243, "grad_norm": 5.420349597930908, "learning_rate": 4.033997105760555e-05, "loss": 2.6823713302612306, "memory(GiB)": 77.56, "step": 65575, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.437118 }, { "epoch": 2.809648258429373, "grad_norm": 6.035757064819336, "learning_rate": 4.0333368165836456e-05, "loss": 2.3820026397705076, "memory(GiB)": 77.56, "step": 65580, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.437103 }, { "epoch": 2.8098624737586224, "grad_norm": 6.843135356903076, "learning_rate": 4.032676544918747e-05, "loss": 2.3901126861572264, "memory(GiB)": 77.56, "step": 65585, "token_acc": 0.4602076124567474, "train_speed(iter/s)": 1.43714 }, { "epoch": 2.8100766890878712, "grad_norm": 4.060319900512695, "learning_rate": 4.0320162907778196e-05, "loss": 2.5534442901611327, "memory(GiB)": 77.56, "step": 65590, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.437172 }, { "epoch": 2.81029090441712, "grad_norm": 4.8427252769470215, "learning_rate": 4.031356054172826e-05, "loss": 2.5128955841064453, "memory(GiB)": 77.56, "step": 65595, "token_acc": 0.48026315789473684, "train_speed(iter/s)": 1.437204 }, { "epoch": 2.8105051197463693, "grad_norm": 5.334048748016357, "learning_rate": 4.0306958351157245e-05, "loss": 2.6606060028076173, "memory(GiB)": 77.56, "step": 65600, "token_acc": 0.4524590163934426, "train_speed(iter/s)": 1.43723 }, { "epoch": 2.810719335075618, "grad_norm": 5.572875499725342, "learning_rate": 4.0300356336184786e-05, "loss": 2.3118499755859374, "memory(GiB)": 77.56, "step": 65605, "token_acc": 0.49865951742627346, "train_speed(iter/s)": 1.437253 }, { "epoch": 2.810933550404867, "grad_norm": 5.52056884765625, "learning_rate": 4.029375449693047e-05, "loss": 2.3903099060058595, "memory(GiB)": 77.56, "step": 65610, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.437256 }, { "epoch": 2.811147765734116, "grad_norm": 5.154854774475098, "learning_rate": 4.02871528335139e-05, "loss": 2.4831390380859375, "memory(GiB)": 77.56, "step": 65615, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.437279 }, { "epoch": 2.811361981063365, "grad_norm": 5.662505626678467, "learning_rate": 4.028055134605467e-05, "loss": 2.4398040771484375, "memory(GiB)": 77.56, "step": 65620, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.437247 }, { "epoch": 2.811576196392614, "grad_norm": 6.19303035736084, "learning_rate": 4.0273950034672356e-05, "loss": 2.620880126953125, "memory(GiB)": 77.56, "step": 65625, "token_acc": 0.44660194174757284, "train_speed(iter/s)": 1.437253 }, { "epoch": 2.811790411721863, "grad_norm": 5.007617473602295, "learning_rate": 4.026734889948657e-05, "loss": 2.4864192962646485, "memory(GiB)": 77.56, "step": 65630, "token_acc": 0.4872611464968153, "train_speed(iter/s)": 1.437254 }, { "epoch": 2.812004627051112, "grad_norm": 5.41358757019043, "learning_rate": 4.02607479406169e-05, "loss": 2.369548034667969, "memory(GiB)": 77.56, "step": 65635, "token_acc": 0.5283582089552239, "train_speed(iter/s)": 1.437231 }, { "epoch": 2.8122188423803607, "grad_norm": 6.951308250427246, "learning_rate": 4.0254147158182895e-05, "loss": 2.2678949356079103, "memory(GiB)": 77.56, "step": 65640, "token_acc": 0.48444444444444446, "train_speed(iter/s)": 1.437239 }, { "epoch": 2.81243305770961, "grad_norm": 5.569390773773193, "learning_rate": 4.024754655230417e-05, "loss": 2.464281463623047, "memory(GiB)": 77.56, "step": 65645, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.437243 }, { "epoch": 2.8126472730388588, "grad_norm": 5.123871326446533, "learning_rate": 4.024094612310028e-05, "loss": 2.1402820587158202, "memory(GiB)": 77.56, "step": 65650, "token_acc": 0.5315985130111525, "train_speed(iter/s)": 1.437263 }, { "epoch": 2.8128614883681076, "grad_norm": 5.817910671234131, "learning_rate": 4.023434587069081e-05, "loss": 2.4309709548950194, "memory(GiB)": 77.56, "step": 65655, "token_acc": 0.4584615384615385, "train_speed(iter/s)": 1.437261 }, { "epoch": 2.813075703697357, "grad_norm": 8.321303367614746, "learning_rate": 4.0227745795195335e-05, "loss": 2.6245563507080076, "memory(GiB)": 77.56, "step": 65660, "token_acc": 0.4584527220630373, "train_speed(iter/s)": 1.437287 }, { "epoch": 2.8132899190266056, "grad_norm": 8.671968460083008, "learning_rate": 4.02211458967334e-05, "loss": 2.7484920501708983, "memory(GiB)": 77.56, "step": 65665, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.437301 }, { "epoch": 2.8135041343558544, "grad_norm": 13.474469184875488, "learning_rate": 4.021454617542457e-05, "loss": 2.464711570739746, "memory(GiB)": 77.56, "step": 65670, "token_acc": 0.5474452554744526, "train_speed(iter/s)": 1.437334 }, { "epoch": 2.8137183496851037, "grad_norm": 4.670197010040283, "learning_rate": 4.0207946631388426e-05, "loss": 2.3289325714111326, "memory(GiB)": 77.56, "step": 65675, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 1.437353 }, { "epoch": 2.8139325650143525, "grad_norm": 5.20142936706543, "learning_rate": 4.0201347264744524e-05, "loss": 2.1882888793945314, "memory(GiB)": 77.56, "step": 65680, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.437352 }, { "epoch": 2.8141467803436013, "grad_norm": 8.296268463134766, "learning_rate": 4.0194748075612396e-05, "loss": 2.2669937133789064, "memory(GiB)": 77.56, "step": 65685, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.437385 }, { "epoch": 2.8143609956728506, "grad_norm": 4.953884124755859, "learning_rate": 4.0188149064111615e-05, "loss": 2.7729183197021485, "memory(GiB)": 77.56, "step": 65690, "token_acc": 0.46616541353383456, "train_speed(iter/s)": 1.437414 }, { "epoch": 2.8145752110020994, "grad_norm": 6.948336124420166, "learning_rate": 4.018155023036171e-05, "loss": 2.230704116821289, "memory(GiB)": 77.56, "step": 65695, "token_acc": 0.5407725321888412, "train_speed(iter/s)": 1.437411 }, { "epoch": 2.814789426331348, "grad_norm": 6.491889476776123, "learning_rate": 4.017495157448224e-05, "loss": 2.6713748931884767, "memory(GiB)": 77.56, "step": 65700, "token_acc": 0.4541832669322709, "train_speed(iter/s)": 1.437444 }, { "epoch": 2.8150036416605975, "grad_norm": 5.530081748962402, "learning_rate": 4.0168353096592735e-05, "loss": 2.574834442138672, "memory(GiB)": 77.56, "step": 65705, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.437461 }, { "epoch": 2.8152178569898463, "grad_norm": 4.453324317932129, "learning_rate": 4.0161754796812736e-05, "loss": 2.532196807861328, "memory(GiB)": 77.56, "step": 65710, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.437453 }, { "epoch": 2.815432072319095, "grad_norm": 11.306683540344238, "learning_rate": 4.0155156675261785e-05, "loss": 2.567127990722656, "memory(GiB)": 77.56, "step": 65715, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.437452 }, { "epoch": 2.8156462876483443, "grad_norm": 4.425283908843994, "learning_rate": 4.01485587320594e-05, "loss": 2.338490104675293, "memory(GiB)": 77.56, "step": 65720, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 1.437451 }, { "epoch": 2.815860502977593, "grad_norm": 4.620877265930176, "learning_rate": 4.014196096732511e-05, "loss": 2.109731674194336, "memory(GiB)": 77.56, "step": 65725, "token_acc": 0.5291828793774319, "train_speed(iter/s)": 1.437472 }, { "epoch": 2.816074718306842, "grad_norm": 4.456051349639893, "learning_rate": 4.0135363381178454e-05, "loss": 2.3745370864868165, "memory(GiB)": 77.56, "step": 65730, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.437487 }, { "epoch": 2.8162889336360912, "grad_norm": 6.78685188293457, "learning_rate": 4.012876597373893e-05, "loss": 2.2988656997680663, "memory(GiB)": 77.56, "step": 65735, "token_acc": 0.4980694980694981, "train_speed(iter/s)": 1.437486 }, { "epoch": 2.81650314896534, "grad_norm": 4.214529037475586, "learning_rate": 4.012216874512609e-05, "loss": 2.0138893127441406, "memory(GiB)": 77.56, "step": 65740, "token_acc": 0.5268456375838926, "train_speed(iter/s)": 1.437506 }, { "epoch": 2.816717364294589, "grad_norm": 6.482390403747559, "learning_rate": 4.0115571695459396e-05, "loss": 2.7807937622070313, "memory(GiB)": 77.56, "step": 65745, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.437513 }, { "epoch": 2.816931579623838, "grad_norm": 5.318929195404053, "learning_rate": 4.0108974824858425e-05, "loss": 2.42476806640625, "memory(GiB)": 77.56, "step": 65750, "token_acc": 0.4689922480620155, "train_speed(iter/s)": 1.437519 }, { "epoch": 2.817145794953087, "grad_norm": 5.66624641418457, "learning_rate": 4.010237813344264e-05, "loss": 2.466521453857422, "memory(GiB)": 77.56, "step": 65755, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.43754 }, { "epoch": 2.8173600102823357, "grad_norm": 6.177026271820068, "learning_rate": 4.0095781621331563e-05, "loss": 2.2636268615722654, "memory(GiB)": 77.56, "step": 65760, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.437565 }, { "epoch": 2.817574225611585, "grad_norm": 5.124622344970703, "learning_rate": 4.0089185288644706e-05, "loss": 1.9913835525512695, "memory(GiB)": 77.56, "step": 65765, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.43758 }, { "epoch": 2.817788440940834, "grad_norm": 5.099691390991211, "learning_rate": 4.008258913550153e-05, "loss": 2.2811561584472657, "memory(GiB)": 77.56, "step": 65770, "token_acc": 0.486013986013986, "train_speed(iter/s)": 1.437545 }, { "epoch": 2.8180026562700826, "grad_norm": 4.510668754577637, "learning_rate": 4.0075993162021575e-05, "loss": 2.6661176681518555, "memory(GiB)": 77.56, "step": 65775, "token_acc": 0.4437869822485207, "train_speed(iter/s)": 1.437569 }, { "epoch": 2.818216871599332, "grad_norm": 7.073869228363037, "learning_rate": 4.006939736832431e-05, "loss": 2.379596710205078, "memory(GiB)": 77.56, "step": 65780, "token_acc": 0.5016835016835017, "train_speed(iter/s)": 1.437564 }, { "epoch": 2.8184310869285807, "grad_norm": 5.8068108558654785, "learning_rate": 4.006280175452922e-05, "loss": 2.5193090438842773, "memory(GiB)": 77.56, "step": 65785, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437561 }, { "epoch": 2.8186453022578295, "grad_norm": 7.517391204833984, "learning_rate": 4.0056206320755806e-05, "loss": 2.311432647705078, "memory(GiB)": 77.56, "step": 65790, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.437561 }, { "epoch": 2.8188595175870788, "grad_norm": 6.91351842880249, "learning_rate": 4.0049611067123526e-05, "loss": 2.487359619140625, "memory(GiB)": 77.56, "step": 65795, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.437575 }, { "epoch": 2.8190737329163276, "grad_norm": 4.8192362785339355, "learning_rate": 4.00430159937519e-05, "loss": 2.3111480712890624, "memory(GiB)": 77.56, "step": 65800, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.437594 }, { "epoch": 2.8192879482455764, "grad_norm": 6.4928789138793945, "learning_rate": 4.003642110076037e-05, "loss": 2.295087432861328, "memory(GiB)": 77.56, "step": 65805, "token_acc": 0.48132780082987553, "train_speed(iter/s)": 1.437623 }, { "epoch": 2.8195021635748256, "grad_norm": 6.54436731338501, "learning_rate": 4.002982638826841e-05, "loss": 2.430122947692871, "memory(GiB)": 77.56, "step": 65810, "token_acc": 0.48742138364779874, "train_speed(iter/s)": 1.43761 }, { "epoch": 2.8197163789040744, "grad_norm": 6.262338161468506, "learning_rate": 4.0023231856395505e-05, "loss": 2.304804229736328, "memory(GiB)": 77.56, "step": 65815, "token_acc": 0.5127118644067796, "train_speed(iter/s)": 1.43762 }, { "epoch": 2.8199305942333233, "grad_norm": 5.558399677276611, "learning_rate": 4.00166375052611e-05, "loss": 2.3864105224609373, "memory(GiB)": 77.56, "step": 65820, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.437643 }, { "epoch": 2.8201448095625725, "grad_norm": 5.8830037117004395, "learning_rate": 4.001004333498469e-05, "loss": 2.6569293975830077, "memory(GiB)": 77.56, "step": 65825, "token_acc": 0.44696969696969696, "train_speed(iter/s)": 1.437651 }, { "epoch": 2.8203590248918213, "grad_norm": 4.427763938903809, "learning_rate": 4.0003449345685704e-05, "loss": 2.5883785247802735, "memory(GiB)": 77.56, "step": 65830, "token_acc": 0.4515235457063712, "train_speed(iter/s)": 1.437647 }, { "epoch": 2.82057324022107, "grad_norm": 4.4692277908325195, "learning_rate": 3.999685553748362e-05, "loss": 2.401324653625488, "memory(GiB)": 77.56, "step": 65835, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.437675 }, { "epoch": 2.8207874555503194, "grad_norm": 5.024598121643066, "learning_rate": 3.9990261910497876e-05, "loss": 2.576115036010742, "memory(GiB)": 77.56, "step": 65840, "token_acc": 0.48299319727891155, "train_speed(iter/s)": 1.43769 }, { "epoch": 2.821001670879568, "grad_norm": 5.368990421295166, "learning_rate": 3.9983668464847935e-05, "loss": 2.5567007064819336, "memory(GiB)": 77.56, "step": 65845, "token_acc": 0.5177865612648221, "train_speed(iter/s)": 1.437713 }, { "epoch": 2.821215886208817, "grad_norm": 5.691743850708008, "learning_rate": 3.9977075200653234e-05, "loss": 2.290756607055664, "memory(GiB)": 77.56, "step": 65850, "token_acc": 0.45484949832775917, "train_speed(iter/s)": 1.437733 }, { "epoch": 2.8214301015380663, "grad_norm": 5.7045698165893555, "learning_rate": 3.997048211803321e-05, "loss": 2.4875391006469725, "memory(GiB)": 77.56, "step": 65855, "token_acc": 0.49846153846153846, "train_speed(iter/s)": 1.437735 }, { "epoch": 2.821644316867315, "grad_norm": 4.722874641418457, "learning_rate": 3.996388921710732e-05, "loss": 2.12872200012207, "memory(GiB)": 77.56, "step": 65860, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.437719 }, { "epoch": 2.821858532196564, "grad_norm": 5.208227157592773, "learning_rate": 3.995729649799499e-05, "loss": 2.4433000564575194, "memory(GiB)": 77.56, "step": 65865, "token_acc": 0.4573170731707317, "train_speed(iter/s)": 1.4377 }, { "epoch": 2.822072747525813, "grad_norm": 5.113282203674316, "learning_rate": 3.995070396081565e-05, "loss": 2.0730361938476562, "memory(GiB)": 77.56, "step": 65870, "token_acc": 0.55078125, "train_speed(iter/s)": 1.437728 }, { "epoch": 2.822286962855062, "grad_norm": 5.095364570617676, "learning_rate": 3.994411160568874e-05, "loss": 2.5483612060546874, "memory(GiB)": 77.56, "step": 65875, "token_acc": 0.4668769716088328, "train_speed(iter/s)": 1.437755 }, { "epoch": 2.822501178184311, "grad_norm": 5.002376079559326, "learning_rate": 3.993751943273367e-05, "loss": 2.248806953430176, "memory(GiB)": 77.56, "step": 65880, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.437749 }, { "epoch": 2.82271539351356, "grad_norm": 4.768962860107422, "learning_rate": 3.9930927442069885e-05, "loss": 2.433235740661621, "memory(GiB)": 77.56, "step": 65885, "token_acc": 0.4968553459119497, "train_speed(iter/s)": 1.437747 }, { "epoch": 2.822929608842809, "grad_norm": 5.688992977142334, "learning_rate": 3.99243356338168e-05, "loss": 2.554449462890625, "memory(GiB)": 77.56, "step": 65890, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.437743 }, { "epoch": 2.8231438241720577, "grad_norm": 7.391767501831055, "learning_rate": 3.9917744008093806e-05, "loss": 2.4067935943603516, "memory(GiB)": 77.56, "step": 65895, "token_acc": 0.47674418604651164, "train_speed(iter/s)": 1.437746 }, { "epoch": 2.823358039501307, "grad_norm": 6.47195291519165, "learning_rate": 3.991115256502034e-05, "loss": 2.3028024673461913, "memory(GiB)": 77.56, "step": 65900, "token_acc": 0.5328467153284672, "train_speed(iter/s)": 1.437748 }, { "epoch": 2.8235722548305557, "grad_norm": 5.738492012023926, "learning_rate": 3.9904561304715824e-05, "loss": 2.4111894607543944, "memory(GiB)": 77.56, "step": 65905, "token_acc": 0.4689655172413793, "train_speed(iter/s)": 1.437742 }, { "epoch": 2.8237864701598046, "grad_norm": 5.683606147766113, "learning_rate": 3.989797022729966e-05, "loss": 2.4715497970581053, "memory(GiB)": 77.56, "step": 65910, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.437763 }, { "epoch": 2.824000685489054, "grad_norm": 5.9431304931640625, "learning_rate": 3.9891379332891224e-05, "loss": 2.3763616561889647, "memory(GiB)": 77.56, "step": 65915, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.437789 }, { "epoch": 2.8242149008183026, "grad_norm": 5.847568035125732, "learning_rate": 3.9884788621609936e-05, "loss": 2.5116508483886717, "memory(GiB)": 77.56, "step": 65920, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437783 }, { "epoch": 2.8244291161475514, "grad_norm": 5.4715256690979, "learning_rate": 3.98781980935752e-05, "loss": 2.3955821990966797, "memory(GiB)": 77.56, "step": 65925, "token_acc": 0.5209003215434084, "train_speed(iter/s)": 1.437813 }, { "epoch": 2.8246433314768007, "grad_norm": 6.0657501220703125, "learning_rate": 3.9871607748906395e-05, "loss": 2.707859992980957, "memory(GiB)": 77.56, "step": 65930, "token_acc": 0.42896174863387976, "train_speed(iter/s)": 1.437808 }, { "epoch": 2.8248575468060495, "grad_norm": 4.782413959503174, "learning_rate": 3.9865017587722916e-05, "loss": 2.365995407104492, "memory(GiB)": 77.56, "step": 65935, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.437801 }, { "epoch": 2.8250717621352983, "grad_norm": 5.763640880584717, "learning_rate": 3.985842761014414e-05, "loss": 2.3232954025268553, "memory(GiB)": 77.56, "step": 65940, "token_acc": 0.5292096219931272, "train_speed(iter/s)": 1.437779 }, { "epoch": 2.8252859774645476, "grad_norm": 5.335360050201416, "learning_rate": 3.9851837816289485e-05, "loss": 2.410564422607422, "memory(GiB)": 77.56, "step": 65945, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.437788 }, { "epoch": 2.8255001927937964, "grad_norm": 5.844241142272949, "learning_rate": 3.984524820627829e-05, "loss": 2.3132436752319334, "memory(GiB)": 77.56, "step": 65950, "token_acc": 0.5, "train_speed(iter/s)": 1.437799 }, { "epoch": 2.825714408123045, "grad_norm": 6.3975510597229, "learning_rate": 3.983865878022995e-05, "loss": 2.2847755432128904, "memory(GiB)": 77.56, "step": 65955, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.437839 }, { "epoch": 2.8259286234522945, "grad_norm": 4.587973117828369, "learning_rate": 3.983206953826385e-05, "loss": 2.316034507751465, "memory(GiB)": 77.56, "step": 65960, "token_acc": 0.5153374233128835, "train_speed(iter/s)": 1.43782 }, { "epoch": 2.8261428387815433, "grad_norm": 5.519717693328857, "learning_rate": 3.982548048049935e-05, "loss": 2.412979316711426, "memory(GiB)": 77.56, "step": 65965, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.437847 }, { "epoch": 2.826357054110792, "grad_norm": 6.584933757781982, "learning_rate": 3.981889160705579e-05, "loss": 2.2350576400756834, "memory(GiB)": 77.56, "step": 65970, "token_acc": 0.5315985130111525, "train_speed(iter/s)": 1.437841 }, { "epoch": 2.8265712694400413, "grad_norm": 9.50361156463623, "learning_rate": 3.981230291805257e-05, "loss": 2.6061330795288087, "memory(GiB)": 77.56, "step": 65975, "token_acc": 0.4619883040935672, "train_speed(iter/s)": 1.437867 }, { "epoch": 2.82678548476929, "grad_norm": 6.705776691436768, "learning_rate": 3.980571441360904e-05, "loss": 2.3313167572021483, "memory(GiB)": 77.56, "step": 65980, "token_acc": 0.5099601593625498, "train_speed(iter/s)": 1.437889 }, { "epoch": 2.826999700098539, "grad_norm": 6.451496601104736, "learning_rate": 3.979912609384456e-05, "loss": 2.455087089538574, "memory(GiB)": 77.56, "step": 65985, "token_acc": 0.4700854700854701, "train_speed(iter/s)": 1.437858 }, { "epoch": 2.827213915427788, "grad_norm": 4.885879039764404, "learning_rate": 3.979253795887849e-05, "loss": 2.2485420227050783, "memory(GiB)": 77.56, "step": 65990, "token_acc": 0.5292207792207793, "train_speed(iter/s)": 1.437872 }, { "epoch": 2.827428130757037, "grad_norm": 5.241494178771973, "learning_rate": 3.978595000883017e-05, "loss": 2.3914384841918945, "memory(GiB)": 77.56, "step": 65995, "token_acc": 0.5126582278481012, "train_speed(iter/s)": 1.43788 }, { "epoch": 2.827642346086286, "grad_norm": 7.1190032958984375, "learning_rate": 3.977936224381893e-05, "loss": 2.4144733428955076, "memory(GiB)": 77.56, "step": 66000, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.437883 }, { "epoch": 2.827642346086286, "eval_loss": 2.2393407821655273, "eval_runtime": 13.5598, "eval_samples_per_second": 7.375, "eval_steps_per_second": 7.375, "eval_token_acc": 0.4613259668508287, "step": 66000 }, { "epoch": 2.827856561415535, "grad_norm": 5.540735244750977, "learning_rate": 3.9772774663964145e-05, "loss": 2.4929662704467774, "memory(GiB)": 77.56, "step": 66005, "token_acc": 0.47952047952047955, "train_speed(iter/s)": 1.43744 }, { "epoch": 2.828070776744784, "grad_norm": 4.505105495452881, "learning_rate": 3.9766187269385144e-05, "loss": 2.415993309020996, "memory(GiB)": 77.56, "step": 66010, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.437463 }, { "epoch": 2.8282849920740327, "grad_norm": 5.809741497039795, "learning_rate": 3.9759600060201245e-05, "loss": 2.7046775817871094, "memory(GiB)": 77.56, "step": 66015, "token_acc": 0.44696969696969696, "train_speed(iter/s)": 1.437485 }, { "epoch": 2.828499207403282, "grad_norm": 6.980930328369141, "learning_rate": 3.975301303653181e-05, "loss": 2.5878875732421873, "memory(GiB)": 77.56, "step": 66020, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.437498 }, { "epoch": 2.828713422732531, "grad_norm": 4.306469917297363, "learning_rate": 3.974642619849615e-05, "loss": 2.4797847747802733, "memory(GiB)": 77.56, "step": 66025, "token_acc": 0.48138297872340424, "train_speed(iter/s)": 1.437475 }, { "epoch": 2.8289276380617796, "grad_norm": 4.987720489501953, "learning_rate": 3.9739839546213596e-05, "loss": 2.5715408325195312, "memory(GiB)": 77.56, "step": 66030, "token_acc": 0.46774193548387094, "train_speed(iter/s)": 1.437507 }, { "epoch": 2.829141853391029, "grad_norm": 5.3655195236206055, "learning_rate": 3.9733253079803486e-05, "loss": 2.1925025939941407, "memory(GiB)": 77.56, "step": 66035, "token_acc": 0.5354609929078015, "train_speed(iter/s)": 1.437504 }, { "epoch": 2.8293560687202777, "grad_norm": 4.701647758483887, "learning_rate": 3.9726666799385095e-05, "loss": 2.303322982788086, "memory(GiB)": 77.56, "step": 66040, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.437485 }, { "epoch": 2.8295702840495265, "grad_norm": 4.967202186584473, "learning_rate": 3.972008070507779e-05, "loss": 2.211300086975098, "memory(GiB)": 77.56, "step": 66045, "token_acc": 0.54, "train_speed(iter/s)": 1.437461 }, { "epoch": 2.8297844993787757, "grad_norm": 4.112099647521973, "learning_rate": 3.971349479700088e-05, "loss": 2.4259616851806642, "memory(GiB)": 77.56, "step": 66050, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.437451 }, { "epoch": 2.8299987147080246, "grad_norm": 4.565462589263916, "learning_rate": 3.970690907527366e-05, "loss": 2.73983211517334, "memory(GiB)": 77.56, "step": 66055, "token_acc": 0.45819397993311034, "train_speed(iter/s)": 1.437469 }, { "epoch": 2.8302129300372734, "grad_norm": 6.556137561798096, "learning_rate": 3.970032354001542e-05, "loss": 2.639791488647461, "memory(GiB)": 77.56, "step": 66060, "token_acc": 0.4316546762589928, "train_speed(iter/s)": 1.437449 }, { "epoch": 2.8304271453665226, "grad_norm": 5.55762243270874, "learning_rate": 3.9693738191345495e-05, "loss": 2.566722869873047, "memory(GiB)": 77.56, "step": 66065, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.437423 }, { "epoch": 2.8306413606957714, "grad_norm": 5.159157752990723, "learning_rate": 3.968715302938317e-05, "loss": 2.320499229431152, "memory(GiB)": 77.56, "step": 66070, "token_acc": 0.4743935309973046, "train_speed(iter/s)": 1.437447 }, { "epoch": 2.8308555760250202, "grad_norm": 6.623139381408691, "learning_rate": 3.9680568054247744e-05, "loss": 2.432745361328125, "memory(GiB)": 77.56, "step": 66075, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.437466 }, { "epoch": 2.8310697913542695, "grad_norm": 3.9942383766174316, "learning_rate": 3.9673983266058504e-05, "loss": 2.4070995330810545, "memory(GiB)": 77.56, "step": 66080, "token_acc": 0.5015974440894568, "train_speed(iter/s)": 1.437464 }, { "epoch": 2.8312840066835183, "grad_norm": 5.720208644866943, "learning_rate": 3.9667398664934735e-05, "loss": 2.1331079483032225, "memory(GiB)": 77.56, "step": 66085, "token_acc": 0.5077881619937694, "train_speed(iter/s)": 1.437483 }, { "epoch": 2.831498222012767, "grad_norm": 3.524899959564209, "learning_rate": 3.966081425099575e-05, "loss": 1.8924365997314454, "memory(GiB)": 77.56, "step": 66090, "token_acc": 0.570281124497992, "train_speed(iter/s)": 1.437502 }, { "epoch": 2.8317124373420164, "grad_norm": 5.937251567840576, "learning_rate": 3.96542300243608e-05, "loss": 2.2823408126831053, "memory(GiB)": 77.56, "step": 66095, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.437511 }, { "epoch": 2.831926652671265, "grad_norm": 5.689079761505127, "learning_rate": 3.9647645985149184e-05, "loss": 2.394300842285156, "memory(GiB)": 77.56, "step": 66100, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.437524 }, { "epoch": 2.832140868000514, "grad_norm": 5.338691711425781, "learning_rate": 3.964106213348017e-05, "loss": 2.508581352233887, "memory(GiB)": 77.56, "step": 66105, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.437492 }, { "epoch": 2.8323550833297633, "grad_norm": 5.338334083557129, "learning_rate": 3.963447846947304e-05, "loss": 2.659164047241211, "memory(GiB)": 77.56, "step": 66110, "token_acc": 0.47547169811320755, "train_speed(iter/s)": 1.437514 }, { "epoch": 2.832569298659012, "grad_norm": 3.856503486633301, "learning_rate": 3.962789499324703e-05, "loss": 2.4176567077636717, "memory(GiB)": 77.56, "step": 66115, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.437517 }, { "epoch": 2.832783513988261, "grad_norm": 4.626069068908691, "learning_rate": 3.962131170492145e-05, "loss": 2.5557384490966797, "memory(GiB)": 77.56, "step": 66120, "token_acc": 0.4662576687116564, "train_speed(iter/s)": 1.437537 }, { "epoch": 2.83299772931751, "grad_norm": 6.309337615966797, "learning_rate": 3.961472860461555e-05, "loss": 2.7593154907226562, "memory(GiB)": 77.56, "step": 66125, "token_acc": 0.4263565891472868, "train_speed(iter/s)": 1.437559 }, { "epoch": 2.833211944646759, "grad_norm": 5.2441582679748535, "learning_rate": 3.9608145692448575e-05, "loss": 2.5551788330078127, "memory(GiB)": 77.56, "step": 66130, "token_acc": 0.5015974440894568, "train_speed(iter/s)": 1.437518 }, { "epoch": 2.8334261599760078, "grad_norm": 5.7819318771362305, "learning_rate": 3.9601562968539796e-05, "loss": 2.4453847885131834, "memory(GiB)": 77.56, "step": 66135, "token_acc": 0.4969512195121951, "train_speed(iter/s)": 1.437498 }, { "epoch": 2.833640375305257, "grad_norm": 6.63046932220459, "learning_rate": 3.959498043300846e-05, "loss": 2.020421600341797, "memory(GiB)": 77.56, "step": 66140, "token_acc": 0.5795918367346938, "train_speed(iter/s)": 1.437483 }, { "epoch": 2.833854590634506, "grad_norm": 5.306735515594482, "learning_rate": 3.958839808597381e-05, "loss": 2.56314640045166, "memory(GiB)": 77.56, "step": 66145, "token_acc": 0.46788990825688076, "train_speed(iter/s)": 1.437479 }, { "epoch": 2.8340688059637547, "grad_norm": 5.529629230499268, "learning_rate": 3.95818159275551e-05, "loss": 2.550008773803711, "memory(GiB)": 77.56, "step": 66150, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.43749 }, { "epoch": 2.834283021293004, "grad_norm": 4.78882360458374, "learning_rate": 3.957523395787156e-05, "loss": 2.6957208633422853, "memory(GiB)": 77.56, "step": 66155, "token_acc": 0.46440677966101696, "train_speed(iter/s)": 1.437475 }, { "epoch": 2.8344972366222527, "grad_norm": 6.711662769317627, "learning_rate": 3.956865217704244e-05, "loss": 2.79765625, "memory(GiB)": 77.56, "step": 66160, "token_acc": 0.44061302681992337, "train_speed(iter/s)": 1.437496 }, { "epoch": 2.8347114519515015, "grad_norm": 11.294705390930176, "learning_rate": 3.956207058518697e-05, "loss": 2.3666833877563476, "memory(GiB)": 77.56, "step": 66165, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.437508 }, { "epoch": 2.834925667280751, "grad_norm": 5.45749044418335, "learning_rate": 3.955548918242438e-05, "loss": 2.3976718902587892, "memory(GiB)": 77.56, "step": 66170, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 1.437529 }, { "epoch": 2.8351398826099996, "grad_norm": 5.315276622772217, "learning_rate": 3.954890796887391e-05, "loss": 2.3737110137939452, "memory(GiB)": 77.56, "step": 66175, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.437529 }, { "epoch": 2.8353540979392484, "grad_norm": 5.749845504760742, "learning_rate": 3.9542326944654775e-05, "loss": 2.370409393310547, "memory(GiB)": 77.56, "step": 66180, "token_acc": 0.4609053497942387, "train_speed(iter/s)": 1.437524 }, { "epoch": 2.8355683132684977, "grad_norm": 4.790119171142578, "learning_rate": 3.953574610988619e-05, "loss": 2.2135387420654298, "memory(GiB)": 77.56, "step": 66185, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.437529 }, { "epoch": 2.8357825285977465, "grad_norm": 5.460172176361084, "learning_rate": 3.952916546468737e-05, "loss": 2.429324722290039, "memory(GiB)": 77.56, "step": 66190, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.437532 }, { "epoch": 2.8359967439269953, "grad_norm": 3.906254529953003, "learning_rate": 3.9522585009177554e-05, "loss": 2.353445053100586, "memory(GiB)": 77.56, "step": 66195, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 1.437542 }, { "epoch": 2.8362109592562446, "grad_norm": 5.391211032867432, "learning_rate": 3.951600474347594e-05, "loss": 2.3955163955688477, "memory(GiB)": 77.56, "step": 66200, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.437552 }, { "epoch": 2.8364251745854934, "grad_norm": 4.388646125793457, "learning_rate": 3.950942466770173e-05, "loss": 2.410428047180176, "memory(GiB)": 77.56, "step": 66205, "token_acc": 0.46835443037974683, "train_speed(iter/s)": 1.437552 }, { "epoch": 2.836639389914742, "grad_norm": 5.6095380783081055, "learning_rate": 3.950284478197414e-05, "loss": 2.4293533325195313, "memory(GiB)": 77.56, "step": 66210, "token_acc": 0.4740740740740741, "train_speed(iter/s)": 1.43756 }, { "epoch": 2.8368536052439914, "grad_norm": 5.41325569152832, "learning_rate": 3.9496265086412364e-05, "loss": 2.242751884460449, "memory(GiB)": 77.56, "step": 66215, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.437568 }, { "epoch": 2.8370678205732403, "grad_norm": 4.969966411590576, "learning_rate": 3.948968558113559e-05, "loss": 2.41613826751709, "memory(GiB)": 77.56, "step": 66220, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.437587 }, { "epoch": 2.837282035902489, "grad_norm": 12.915497779846191, "learning_rate": 3.9483106266263036e-05, "loss": 2.6022666931152343, "memory(GiB)": 77.56, "step": 66225, "token_acc": 0.4584450402144772, "train_speed(iter/s)": 1.437575 }, { "epoch": 2.8374962512317383, "grad_norm": 5.276846885681152, "learning_rate": 3.9476527141913866e-05, "loss": 2.3101612091064454, "memory(GiB)": 77.56, "step": 66230, "token_acc": 0.5136186770428015, "train_speed(iter/s)": 1.437552 }, { "epoch": 2.837710466560987, "grad_norm": 4.95725154876709, "learning_rate": 3.946994820820728e-05, "loss": 2.733891487121582, "memory(GiB)": 77.56, "step": 66235, "token_acc": 0.468503937007874, "train_speed(iter/s)": 1.437566 }, { "epoch": 2.837924681890236, "grad_norm": 4.425442695617676, "learning_rate": 3.9463369465262466e-05, "loss": 2.391596221923828, "memory(GiB)": 77.56, "step": 66240, "token_acc": 0.509375, "train_speed(iter/s)": 1.437581 }, { "epoch": 2.838138897219485, "grad_norm": 5.859916687011719, "learning_rate": 3.945679091319859e-05, "loss": 2.340145492553711, "memory(GiB)": 77.56, "step": 66245, "token_acc": 0.5, "train_speed(iter/s)": 1.437561 }, { "epoch": 2.838353112548734, "grad_norm": 5.283370018005371, "learning_rate": 3.9450212552134845e-05, "loss": 2.3300167083740235, "memory(GiB)": 77.56, "step": 66250, "token_acc": 0.4981684981684982, "train_speed(iter/s)": 1.437557 }, { "epoch": 2.838567327877983, "grad_norm": 5.853363990783691, "learning_rate": 3.9443634382190396e-05, "loss": 2.3598812103271483, "memory(GiB)": 77.56, "step": 66255, "token_acc": 0.463768115942029, "train_speed(iter/s)": 1.437571 }, { "epoch": 2.838781543207232, "grad_norm": 6.4891037940979, "learning_rate": 3.9437056403484404e-05, "loss": 2.3071685791015626, "memory(GiB)": 77.56, "step": 66260, "token_acc": 0.5064516129032258, "train_speed(iter/s)": 1.437595 }, { "epoch": 2.838995758536481, "grad_norm": 4.6603007316589355, "learning_rate": 3.9430478616136036e-05, "loss": 2.272079277038574, "memory(GiB)": 77.56, "step": 66265, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.437591 }, { "epoch": 2.8392099738657297, "grad_norm": 7.033201694488525, "learning_rate": 3.9423901020264474e-05, "loss": 2.3999202728271483, "memory(GiB)": 77.56, "step": 66270, "token_acc": 0.4628975265017668, "train_speed(iter/s)": 1.437574 }, { "epoch": 2.839424189194979, "grad_norm": 6.5980048179626465, "learning_rate": 3.9417323615988864e-05, "loss": 2.3603357315063476, "memory(GiB)": 77.56, "step": 66275, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 1.437581 }, { "epoch": 2.8396384045242278, "grad_norm": 5.733865737915039, "learning_rate": 3.941074640342838e-05, "loss": 2.4446090698242187, "memory(GiB)": 77.56, "step": 66280, "token_acc": 0.48417721518987344, "train_speed(iter/s)": 1.437585 }, { "epoch": 2.8398526198534766, "grad_norm": 5.516995429992676, "learning_rate": 3.940416938270215e-05, "loss": 2.331661605834961, "memory(GiB)": 77.56, "step": 66285, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 1.437614 }, { "epoch": 2.840066835182726, "grad_norm": 6.06817626953125, "learning_rate": 3.939759255392932e-05, "loss": 2.46845703125, "memory(GiB)": 77.56, "step": 66290, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.437623 }, { "epoch": 2.8402810505119747, "grad_norm": 6.026614665985107, "learning_rate": 3.939101591722906e-05, "loss": 2.1548831939697264, "memory(GiB)": 77.56, "step": 66295, "token_acc": 0.5020242914979757, "train_speed(iter/s)": 1.437642 }, { "epoch": 2.8404952658412235, "grad_norm": 5.751220703125, "learning_rate": 3.93844394727205e-05, "loss": 2.576325225830078, "memory(GiB)": 77.56, "step": 66300, "token_acc": 0.4597014925373134, "train_speed(iter/s)": 1.437662 }, { "epoch": 2.8407094811704727, "grad_norm": 6.084564685821533, "learning_rate": 3.937786322052276e-05, "loss": 2.3021015167236327, "memory(GiB)": 77.56, "step": 66305, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.43769 }, { "epoch": 2.8409236964997215, "grad_norm": 4.717773914337158, "learning_rate": 3.937128716075501e-05, "loss": 2.096889877319336, "memory(GiB)": 77.56, "step": 66310, "token_acc": 0.563573883161512, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.8411379118289704, "grad_norm": 5.652832984924316, "learning_rate": 3.936471129353635e-05, "loss": 2.3126296997070312, "memory(GiB)": 77.56, "step": 66315, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.437691 }, { "epoch": 2.8413521271582196, "grad_norm": 5.348511695861816, "learning_rate": 3.935813561898593e-05, "loss": 2.539601707458496, "memory(GiB)": 77.56, "step": 66320, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437701 }, { "epoch": 2.8415663424874684, "grad_norm": 4.41096305847168, "learning_rate": 3.935156013722287e-05, "loss": 2.1715204238891603, "memory(GiB)": 77.56, "step": 66325, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.437719 }, { "epoch": 2.8417805578167172, "grad_norm": 4.9178595542907715, "learning_rate": 3.934498484836627e-05, "loss": 2.5662769317626952, "memory(GiB)": 77.56, "step": 66330, "token_acc": 0.46387832699619774, "train_speed(iter/s)": 1.437729 }, { "epoch": 2.8419947731459665, "grad_norm": 5.927366733551025, "learning_rate": 3.933840975253527e-05, "loss": 2.3385799407958983, "memory(GiB)": 77.56, "step": 66335, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.437738 }, { "epoch": 2.8422089884752153, "grad_norm": 5.280552864074707, "learning_rate": 3.933183484984898e-05, "loss": 2.250534439086914, "memory(GiB)": 77.56, "step": 66340, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.842423203804464, "grad_norm": 4.954792499542236, "learning_rate": 3.932526014042652e-05, "loss": 2.6814754486083983, "memory(GiB)": 77.56, "step": 66345, "token_acc": 0.4773413897280967, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.8426374191337134, "grad_norm": 4.5819902420043945, "learning_rate": 3.9318685624386975e-05, "loss": 2.4675159454345703, "memory(GiB)": 77.56, "step": 66350, "token_acc": 0.45348837209302323, "train_speed(iter/s)": 1.437766 }, { "epoch": 2.842851634462962, "grad_norm": 4.561880111694336, "learning_rate": 3.931211130184947e-05, "loss": 2.1820587158203124, "memory(GiB)": 77.56, "step": 66355, "token_acc": 0.5171339563862928, "train_speed(iter/s)": 1.437804 }, { "epoch": 2.843065849792211, "grad_norm": 4.890132427215576, "learning_rate": 3.9305537172933085e-05, "loss": 2.551707077026367, "memory(GiB)": 77.56, "step": 66360, "token_acc": 0.4554140127388535, "train_speed(iter/s)": 1.437798 }, { "epoch": 2.8432800651214603, "grad_norm": 7.599239349365234, "learning_rate": 3.9298963237756934e-05, "loss": 2.608646202087402, "memory(GiB)": 77.56, "step": 66365, "token_acc": 0.46613545816733065, "train_speed(iter/s)": 1.437814 }, { "epoch": 2.843494280450709, "grad_norm": 5.214544773101807, "learning_rate": 3.92923894964401e-05, "loss": 2.417283630371094, "memory(GiB)": 77.56, "step": 66370, "token_acc": 0.55, "train_speed(iter/s)": 1.437819 }, { "epoch": 2.843708495779958, "grad_norm": 5.644783020019531, "learning_rate": 3.9285815949101675e-05, "loss": 2.54773063659668, "memory(GiB)": 77.56, "step": 66375, "token_acc": 0.47774480712166173, "train_speed(iter/s)": 1.437844 }, { "epoch": 2.843922711109207, "grad_norm": 4.35864782333374, "learning_rate": 3.9279242595860746e-05, "loss": 2.6800617218017577, "memory(GiB)": 77.56, "step": 66380, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.437866 }, { "epoch": 2.844136926438456, "grad_norm": 5.798492431640625, "learning_rate": 3.9272669436836395e-05, "loss": 2.5936920166015627, "memory(GiB)": 77.56, "step": 66385, "token_acc": 0.4790996784565916, "train_speed(iter/s)": 1.437853 }, { "epoch": 2.8443511417677048, "grad_norm": 6.025155544281006, "learning_rate": 3.9266096472147694e-05, "loss": 2.4738348007202147, "memory(GiB)": 77.56, "step": 66390, "token_acc": 0.5219941348973607, "train_speed(iter/s)": 1.43787 }, { "epoch": 2.844565357096954, "grad_norm": 4.408134937286377, "learning_rate": 3.925952370191373e-05, "loss": 2.3777000427246096, "memory(GiB)": 77.56, "step": 66395, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.437849 }, { "epoch": 2.844779572426203, "grad_norm": 6.43510103225708, "learning_rate": 3.9252951126253565e-05, "loss": 2.463430404663086, "memory(GiB)": 77.56, "step": 66400, "token_acc": 0.4907749077490775, "train_speed(iter/s)": 1.43785 }, { "epoch": 2.8449937877554516, "grad_norm": 4.5750732421875, "learning_rate": 3.9246378745286276e-05, "loss": 2.2757320404052734, "memory(GiB)": 77.56, "step": 66405, "token_acc": 0.5211726384364821, "train_speed(iter/s)": 1.437883 }, { "epoch": 2.845208003084701, "grad_norm": 5.788753509521484, "learning_rate": 3.923980655913091e-05, "loss": 2.260826301574707, "memory(GiB)": 77.56, "step": 66410, "token_acc": 0.4948805460750853, "train_speed(iter/s)": 1.43787 }, { "epoch": 2.8454222184139497, "grad_norm": 5.856846332550049, "learning_rate": 3.923323456790656e-05, "loss": 2.002977752685547, "memory(GiB)": 77.56, "step": 66415, "token_acc": 0.5163636363636364, "train_speed(iter/s)": 1.437895 }, { "epoch": 2.8456364337431985, "grad_norm": 5.099117279052734, "learning_rate": 3.9226662771732244e-05, "loss": 2.43624324798584, "memory(GiB)": 77.56, "step": 66420, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.437877 }, { "epoch": 2.845850649072448, "grad_norm": 4.795126438140869, "learning_rate": 3.922009117072706e-05, "loss": 2.4289674758911133, "memory(GiB)": 77.56, "step": 66425, "token_acc": 0.4752475247524752, "train_speed(iter/s)": 1.437866 }, { "epoch": 2.8460648644016966, "grad_norm": 5.6901631355285645, "learning_rate": 3.921351976501004e-05, "loss": 2.2836795806884767, "memory(GiB)": 77.56, "step": 66430, "token_acc": 0.5283018867924528, "train_speed(iter/s)": 1.437859 }, { "epoch": 2.8462790797309454, "grad_norm": 4.898319244384766, "learning_rate": 3.920694855470021e-05, "loss": 2.6279300689697265, "memory(GiB)": 77.56, "step": 66435, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.437861 }, { "epoch": 2.8464932950601947, "grad_norm": 6.0314836502075195, "learning_rate": 3.920037753991665e-05, "loss": 2.5557987213134767, "memory(GiB)": 77.56, "step": 66440, "token_acc": 0.45787545787545786, "train_speed(iter/s)": 1.437894 }, { "epoch": 2.8467075103894435, "grad_norm": 4.333067417144775, "learning_rate": 3.919380672077838e-05, "loss": 2.7301849365234374, "memory(GiB)": 77.56, "step": 66445, "token_acc": 0.4369230769230769, "train_speed(iter/s)": 1.437892 }, { "epoch": 2.8469217257186923, "grad_norm": 6.202364444732666, "learning_rate": 3.918723609740442e-05, "loss": 2.3933507919311525, "memory(GiB)": 77.56, "step": 66450, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.437871 }, { "epoch": 2.8471359410479415, "grad_norm": 5.349114418029785, "learning_rate": 3.918066566991384e-05, "loss": 2.7145721435546877, "memory(GiB)": 77.56, "step": 66455, "token_acc": 0.44816053511705684, "train_speed(iter/s)": 1.437878 }, { "epoch": 2.8473501563771904, "grad_norm": 4.3693718910217285, "learning_rate": 3.917409543842564e-05, "loss": 2.294993591308594, "memory(GiB)": 77.56, "step": 66460, "token_acc": 0.48427672955974843, "train_speed(iter/s)": 1.437892 }, { "epoch": 2.847564371706439, "grad_norm": 4.617372512817383, "learning_rate": 3.9167525403058855e-05, "loss": 2.529158020019531, "memory(GiB)": 77.56, "step": 66465, "token_acc": 0.44904458598726116, "train_speed(iter/s)": 1.437877 }, { "epoch": 2.8477785870356884, "grad_norm": 5.825178623199463, "learning_rate": 3.9160955563932524e-05, "loss": 2.4784067153930662, "memory(GiB)": 77.56, "step": 66470, "token_acc": 0.4523809523809524, "train_speed(iter/s)": 1.437882 }, { "epoch": 2.8479928023649372, "grad_norm": 5.506708145141602, "learning_rate": 3.9154385921165635e-05, "loss": 2.5322898864746093, "memory(GiB)": 77.56, "step": 66475, "token_acc": 0.44983818770226536, "train_speed(iter/s)": 1.437898 }, { "epoch": 2.848207017694186, "grad_norm": 4.318180084228516, "learning_rate": 3.9147816474877226e-05, "loss": 2.268649673461914, "memory(GiB)": 77.56, "step": 66480, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.437909 }, { "epoch": 2.8484212330234353, "grad_norm": 5.781953811645508, "learning_rate": 3.914124722518628e-05, "loss": 2.2700672149658203, "memory(GiB)": 77.56, "step": 66485, "token_acc": 0.515625, "train_speed(iter/s)": 1.437912 }, { "epoch": 2.848635448352684, "grad_norm": 6.998129367828369, "learning_rate": 3.913467817221185e-05, "loss": 2.3082517623901366, "memory(GiB)": 77.56, "step": 66490, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437925 }, { "epoch": 2.848849663681933, "grad_norm": 6.134349822998047, "learning_rate": 3.912810931607291e-05, "loss": 2.4138042449951174, "memory(GiB)": 77.56, "step": 66495, "token_acc": 0.46360153256704983, "train_speed(iter/s)": 1.437932 }, { "epoch": 2.849063879011182, "grad_norm": 6.053704738616943, "learning_rate": 3.9121540656888475e-05, "loss": 2.2307310104370117, "memory(GiB)": 77.56, "step": 66500, "token_acc": 0.5486381322957199, "train_speed(iter/s)": 1.43793 }, { "epoch": 2.849063879011182, "eval_loss": 2.191592216491699, "eval_runtime": 14.3462, "eval_samples_per_second": 6.97, "eval_steps_per_second": 6.97, "eval_token_acc": 0.4857142857142857, "step": 66500 }, { "epoch": 2.849278094340431, "grad_norm": 4.378180503845215, "learning_rate": 3.9114972194777534e-05, "loss": 2.838070106506348, "memory(GiB)": 77.56, "step": 66505, "token_acc": 0.47110675808031344, "train_speed(iter/s)": 1.437446 }, { "epoch": 2.84949230966968, "grad_norm": 5.601437568664551, "learning_rate": 3.910840392985908e-05, "loss": 2.3315408706665037, "memory(GiB)": 77.56, "step": 66510, "token_acc": 0.5371621621621622, "train_speed(iter/s)": 1.437448 }, { "epoch": 2.849706524998929, "grad_norm": 7.7358269691467285, "learning_rate": 3.910183586225212e-05, "loss": 2.4238819122314452, "memory(GiB)": 77.56, "step": 66515, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.437461 }, { "epoch": 2.849920740328178, "grad_norm": 4.937651634216309, "learning_rate": 3.9095267992075604e-05, "loss": 2.492909049987793, "memory(GiB)": 77.56, "step": 66520, "token_acc": 0.5056603773584906, "train_speed(iter/s)": 1.437491 }, { "epoch": 2.8501349556574267, "grad_norm": 7.730500221252441, "learning_rate": 3.908870031944856e-05, "loss": 2.6884361267089845, "memory(GiB)": 77.56, "step": 66525, "token_acc": 0.4397163120567376, "train_speed(iter/s)": 1.437493 }, { "epoch": 2.850349170986676, "grad_norm": 6.503012180328369, "learning_rate": 3.9082132844489935e-05, "loss": 2.5664562225341796, "memory(GiB)": 77.56, "step": 66530, "token_acc": 0.4934640522875817, "train_speed(iter/s)": 1.437489 }, { "epoch": 2.8505633863159248, "grad_norm": 7.859287738800049, "learning_rate": 3.907556556731871e-05, "loss": 2.737318992614746, "memory(GiB)": 77.56, "step": 66535, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.437503 }, { "epoch": 2.8507776016451736, "grad_norm": 5.697571754455566, "learning_rate": 3.906899848805387e-05, "loss": 2.72196102142334, "memory(GiB)": 77.56, "step": 66540, "token_acc": 0.444794952681388, "train_speed(iter/s)": 1.437479 }, { "epoch": 2.850991816974423, "grad_norm": 5.560765743255615, "learning_rate": 3.906243160681436e-05, "loss": 2.4972421646118166, "memory(GiB)": 77.56, "step": 66545, "token_acc": 0.4984984984984985, "train_speed(iter/s)": 1.437432 }, { "epoch": 2.8512060323036716, "grad_norm": 8.9849271774292, "learning_rate": 3.9055864923719185e-05, "loss": 2.608599853515625, "memory(GiB)": 77.56, "step": 66550, "token_acc": 0.43462897526501765, "train_speed(iter/s)": 1.437435 }, { "epoch": 2.8514202476329205, "grad_norm": 7.900022983551025, "learning_rate": 3.9049298438887276e-05, "loss": 2.554580307006836, "memory(GiB)": 77.56, "step": 66555, "token_acc": 0.5043731778425656, "train_speed(iter/s)": 1.437449 }, { "epoch": 2.8516344629621697, "grad_norm": 5.279074668884277, "learning_rate": 3.904273215243758e-05, "loss": 2.5848312377929688, "memory(GiB)": 77.56, "step": 66560, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.437472 }, { "epoch": 2.8518486782914185, "grad_norm": 5.624842643737793, "learning_rate": 3.903616606448908e-05, "loss": 2.5240001678466797, "memory(GiB)": 77.56, "step": 66565, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437496 }, { "epoch": 2.8520628936206673, "grad_norm": 5.991982460021973, "learning_rate": 3.902960017516073e-05, "loss": 2.513314628601074, "memory(GiB)": 77.56, "step": 66570, "token_acc": 0.4673202614379085, "train_speed(iter/s)": 1.437492 }, { "epoch": 2.8522771089499166, "grad_norm": 7.103693008422852, "learning_rate": 3.902303448457146e-05, "loss": 2.1770584106445314, "memory(GiB)": 77.56, "step": 66575, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.437469 }, { "epoch": 2.8524913242791654, "grad_norm": 5.116634368896484, "learning_rate": 3.9016468992840203e-05, "loss": 2.6007291793823244, "memory(GiB)": 77.56, "step": 66580, "token_acc": 0.46130952380952384, "train_speed(iter/s)": 1.437473 }, { "epoch": 2.852705539608414, "grad_norm": 4.407067775726318, "learning_rate": 3.900990370008593e-05, "loss": 2.4688270568847654, "memory(GiB)": 77.56, "step": 66585, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.437481 }, { "epoch": 2.8529197549376635, "grad_norm": 4.946870803833008, "learning_rate": 3.9003338606427564e-05, "loss": 2.391340637207031, "memory(GiB)": 77.56, "step": 66590, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.437494 }, { "epoch": 2.8531339702669123, "grad_norm": 4.987757682800293, "learning_rate": 3.899677371198402e-05, "loss": 2.258224868774414, "memory(GiB)": 77.56, "step": 66595, "token_acc": 0.541958041958042, "train_speed(iter/s)": 1.437503 }, { "epoch": 2.853348185596161, "grad_norm": 5.315586566925049, "learning_rate": 3.8990209016874254e-05, "loss": 2.4487504959106445, "memory(GiB)": 77.56, "step": 66600, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.43749 }, { "epoch": 2.8535624009254104, "grad_norm": 6.663456916809082, "learning_rate": 3.898364452121717e-05, "loss": 2.3310081481933596, "memory(GiB)": 77.56, "step": 66605, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 1.43751 }, { "epoch": 2.853776616254659, "grad_norm": 6.007467269897461, "learning_rate": 3.89770802251317e-05, "loss": 2.4017011642456056, "memory(GiB)": 77.56, "step": 66610, "token_acc": 0.4980544747081712, "train_speed(iter/s)": 1.437521 }, { "epoch": 2.853990831583908, "grad_norm": 5.462390899658203, "learning_rate": 3.897051612873677e-05, "loss": 2.4776123046875, "memory(GiB)": 77.56, "step": 66615, "token_acc": 0.47038327526132406, "train_speed(iter/s)": 1.437523 }, { "epoch": 2.8542050469131572, "grad_norm": 4.1070637702941895, "learning_rate": 3.896395223215128e-05, "loss": 2.6370866775512694, "memory(GiB)": 77.56, "step": 66620, "token_acc": 0.4161073825503356, "train_speed(iter/s)": 1.437479 }, { "epoch": 2.854419262242406, "grad_norm": 6.080334663391113, "learning_rate": 3.8957388535494156e-05, "loss": 2.1869640350341797, "memory(GiB)": 77.56, "step": 66625, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 1.437481 }, { "epoch": 2.854633477571655, "grad_norm": 4.292595386505127, "learning_rate": 3.8950825038884296e-05, "loss": 2.261898231506348, "memory(GiB)": 77.56, "step": 66630, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.437494 }, { "epoch": 2.854847692900904, "grad_norm": 5.835457801818848, "learning_rate": 3.894426174244058e-05, "loss": 2.2565582275390623, "memory(GiB)": 77.56, "step": 66635, "token_acc": 0.5518672199170125, "train_speed(iter/s)": 1.437508 }, { "epoch": 2.855061908230153, "grad_norm": 5.705355644226074, "learning_rate": 3.893769864628195e-05, "loss": 2.4744895935058593, "memory(GiB)": 77.56, "step": 66640, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437528 }, { "epoch": 2.8552761235594017, "grad_norm": 5.244635581970215, "learning_rate": 3.89311357505273e-05, "loss": 2.405169868469238, "memory(GiB)": 77.56, "step": 66645, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.437563 }, { "epoch": 2.855490338888651, "grad_norm": 5.433347702026367, "learning_rate": 3.892457305529549e-05, "loss": 2.1917551040649412, "memory(GiB)": 77.56, "step": 66650, "token_acc": 0.5693430656934306, "train_speed(iter/s)": 1.437557 }, { "epoch": 2.8557045542179, "grad_norm": 4.850085258483887, "learning_rate": 3.891801056070545e-05, "loss": 2.1592260360717774, "memory(GiB)": 77.56, "step": 66655, "token_acc": 0.5382165605095541, "train_speed(iter/s)": 1.437559 }, { "epoch": 2.8559187695471486, "grad_norm": 6.711618423461914, "learning_rate": 3.891144826687603e-05, "loss": 2.4707393646240234, "memory(GiB)": 77.56, "step": 66660, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437586 }, { "epoch": 2.856132984876398, "grad_norm": 7.5741286277771, "learning_rate": 3.890488617392613e-05, "loss": 2.37469482421875, "memory(GiB)": 77.56, "step": 66665, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.437599 }, { "epoch": 2.8563472002056467, "grad_norm": 6.927433013916016, "learning_rate": 3.889832428197463e-05, "loss": 2.4204429626464843, "memory(GiB)": 77.56, "step": 66670, "token_acc": 0.5066225165562914, "train_speed(iter/s)": 1.437614 }, { "epoch": 2.8565614155348955, "grad_norm": 5.925686836242676, "learning_rate": 3.88917625911404e-05, "loss": 2.3027008056640623, "memory(GiB)": 77.56, "step": 66675, "token_acc": 0.5186567164179104, "train_speed(iter/s)": 1.437631 }, { "epoch": 2.8567756308641448, "grad_norm": 6.618916034698486, "learning_rate": 3.8885201101542304e-05, "loss": 2.3661354064941404, "memory(GiB)": 77.56, "step": 66680, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.437649 }, { "epoch": 2.8569898461933936, "grad_norm": 4.6011552810668945, "learning_rate": 3.8878639813299214e-05, "loss": 2.453974723815918, "memory(GiB)": 77.56, "step": 66685, "token_acc": 0.4980392156862745, "train_speed(iter/s)": 1.437653 }, { "epoch": 2.8572040615226424, "grad_norm": 4.523308277130127, "learning_rate": 3.887207872653e-05, "loss": 2.26114501953125, "memory(GiB)": 77.56, "step": 66690, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.437645 }, { "epoch": 2.8574182768518916, "grad_norm": 7.806247234344482, "learning_rate": 3.886551784135352e-05, "loss": 2.554484176635742, "memory(GiB)": 77.56, "step": 66695, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.437669 }, { "epoch": 2.8576324921811405, "grad_norm": 4.85680627822876, "learning_rate": 3.885895715788864e-05, "loss": 2.5340404510498047, "memory(GiB)": 77.56, "step": 66700, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.437675 }, { "epoch": 2.8578467075103893, "grad_norm": 5.873867034912109, "learning_rate": 3.8852396676254175e-05, "loss": 2.2370100021362305, "memory(GiB)": 77.56, "step": 66705, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.437697 }, { "epoch": 2.8580609228396385, "grad_norm": 5.588469982147217, "learning_rate": 3.8845836396569007e-05, "loss": 2.354682731628418, "memory(GiB)": 77.56, "step": 66710, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.437655 }, { "epoch": 2.8582751381688873, "grad_norm": 4.8850274085998535, "learning_rate": 3.8839276318952e-05, "loss": 2.4714897155761717, "memory(GiB)": 77.56, "step": 66715, "token_acc": 0.4484848484848485, "train_speed(iter/s)": 1.437625 }, { "epoch": 2.858489353498136, "grad_norm": 6.62422513961792, "learning_rate": 3.883271644352196e-05, "loss": 2.377308464050293, "memory(GiB)": 77.56, "step": 66720, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.43761 }, { "epoch": 2.8587035688273854, "grad_norm": 5.528073787689209, "learning_rate": 3.882615677039773e-05, "loss": 2.336688995361328, "memory(GiB)": 77.56, "step": 66725, "token_acc": 0.5177514792899408, "train_speed(iter/s)": 1.437615 }, { "epoch": 2.8589177841566342, "grad_norm": 8.789603233337402, "learning_rate": 3.881959729969816e-05, "loss": 2.3863702774047852, "memory(GiB)": 77.56, "step": 66730, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.437617 }, { "epoch": 2.859131999485883, "grad_norm": 5.422511100769043, "learning_rate": 3.881303803154208e-05, "loss": 2.707002639770508, "memory(GiB)": 77.56, "step": 66735, "token_acc": 0.4551282051282051, "train_speed(iter/s)": 1.437619 }, { "epoch": 2.8593462148151323, "grad_norm": 4.357863426208496, "learning_rate": 3.8806478966048295e-05, "loss": 2.2606287002563477, "memory(GiB)": 77.56, "step": 66740, "token_acc": 0.5464684014869888, "train_speed(iter/s)": 1.437613 }, { "epoch": 2.859560430144381, "grad_norm": 5.374544143676758, "learning_rate": 3.879992010333565e-05, "loss": 2.470383071899414, "memory(GiB)": 77.56, "step": 66745, "token_acc": 0.476878612716763, "train_speed(iter/s)": 1.437592 }, { "epoch": 2.85977464547363, "grad_norm": 7.355118274688721, "learning_rate": 3.879336144352296e-05, "loss": 2.633053016662598, "memory(GiB)": 77.56, "step": 66750, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.43758 }, { "epoch": 2.859988860802879, "grad_norm": 5.335455417633057, "learning_rate": 3.878680298672904e-05, "loss": 2.155171203613281, "memory(GiB)": 77.56, "step": 66755, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.437545 }, { "epoch": 2.860203076132128, "grad_norm": 5.805242538452148, "learning_rate": 3.8780244733072706e-05, "loss": 2.1418052673339845, "memory(GiB)": 77.56, "step": 66760, "token_acc": 0.4981684981684982, "train_speed(iter/s)": 1.43754 }, { "epoch": 2.860417291461377, "grad_norm": 7.094827651977539, "learning_rate": 3.8773686682672756e-05, "loss": 2.9565858840942383, "memory(GiB)": 77.56, "step": 66765, "token_acc": 0.43023255813953487, "train_speed(iter/s)": 1.437528 }, { "epoch": 2.860631506790626, "grad_norm": 5.819697380065918, "learning_rate": 3.876712883564801e-05, "loss": 2.5334810256958007, "memory(GiB)": 77.56, "step": 66770, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 1.437548 }, { "epoch": 2.860845722119875, "grad_norm": 7.383658409118652, "learning_rate": 3.876057119211727e-05, "loss": 2.4060733795166014, "memory(GiB)": 77.56, "step": 66775, "token_acc": 0.46283783783783783, "train_speed(iter/s)": 1.437571 }, { "epoch": 2.8610599374491237, "grad_norm": 5.543306350708008, "learning_rate": 3.875401375219929e-05, "loss": 2.1814397811889648, "memory(GiB)": 77.56, "step": 66780, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 1.437553 }, { "epoch": 2.861274152778373, "grad_norm": 5.551831245422363, "learning_rate": 3.8747456516012926e-05, "loss": 2.328445053100586, "memory(GiB)": 77.56, "step": 66785, "token_acc": 0.47513812154696133, "train_speed(iter/s)": 1.43759 }, { "epoch": 2.8614883681076217, "grad_norm": 4.948244571685791, "learning_rate": 3.8740899483676946e-05, "loss": 2.114749717712402, "memory(GiB)": 77.56, "step": 66790, "token_acc": 0.532608695652174, "train_speed(iter/s)": 1.437542 }, { "epoch": 2.8617025834368706, "grad_norm": 4.717979907989502, "learning_rate": 3.8734342655310124e-05, "loss": 2.2499460220336913, "memory(GiB)": 77.56, "step": 66795, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.437539 }, { "epoch": 2.86191679876612, "grad_norm": 5.059933185577393, "learning_rate": 3.872778603103126e-05, "loss": 2.027072715759277, "memory(GiB)": 77.56, "step": 66800, "token_acc": 0.5748031496062992, "train_speed(iter/s)": 1.437557 }, { "epoch": 2.8621310140953686, "grad_norm": 6.091126441955566, "learning_rate": 3.8721229610959136e-05, "loss": 2.4446929931640624, "memory(GiB)": 77.56, "step": 66805, "token_acc": 0.455026455026455, "train_speed(iter/s)": 1.437566 }, { "epoch": 2.8623452294246174, "grad_norm": 9.122893333435059, "learning_rate": 3.87146733952125e-05, "loss": 2.501746940612793, "memory(GiB)": 77.56, "step": 66810, "token_acc": 0.4522058823529412, "train_speed(iter/s)": 1.437577 }, { "epoch": 2.8625594447538667, "grad_norm": 5.040594577789307, "learning_rate": 3.870811738391015e-05, "loss": 2.5487319946289064, "memory(GiB)": 77.56, "step": 66815, "token_acc": 0.4479495268138801, "train_speed(iter/s)": 1.437577 }, { "epoch": 2.8627736600831155, "grad_norm": 6.156131744384766, "learning_rate": 3.8701561577170855e-05, "loss": 2.3668354034423826, "memory(GiB)": 77.56, "step": 66820, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.437587 }, { "epoch": 2.8629878754123643, "grad_norm": 4.535671710968018, "learning_rate": 3.8695005975113357e-05, "loss": 2.6816427230834963, "memory(GiB)": 77.56, "step": 66825, "token_acc": 0.43558282208588955, "train_speed(iter/s)": 1.437606 }, { "epoch": 2.8632020907416136, "grad_norm": 5.114601135253906, "learning_rate": 3.868845057785644e-05, "loss": 2.303553009033203, "memory(GiB)": 77.56, "step": 66830, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 1.43758 }, { "epoch": 2.8634163060708624, "grad_norm": 5.828295707702637, "learning_rate": 3.868189538551884e-05, "loss": 2.3304513931274413, "memory(GiB)": 77.56, "step": 66835, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.437591 }, { "epoch": 2.863630521400111, "grad_norm": 6.106845378875732, "learning_rate": 3.8675340398219335e-05, "loss": 2.491108703613281, "memory(GiB)": 77.56, "step": 66840, "token_acc": 0.5224358974358975, "train_speed(iter/s)": 1.437595 }, { "epoch": 2.8638447367293605, "grad_norm": 5.234866142272949, "learning_rate": 3.8668785616076665e-05, "loss": 2.6317298889160154, "memory(GiB)": 77.56, "step": 66845, "token_acc": 0.4591549295774648, "train_speed(iter/s)": 1.437571 }, { "epoch": 2.8640589520586093, "grad_norm": 5.1936211585998535, "learning_rate": 3.866223103920956e-05, "loss": 2.3543069839477537, "memory(GiB)": 77.56, "step": 66850, "token_acc": 0.5280898876404494, "train_speed(iter/s)": 1.437589 }, { "epoch": 2.864273167387858, "grad_norm": 5.63826322555542, "learning_rate": 3.865567666773676e-05, "loss": 2.451177215576172, "memory(GiB)": 77.56, "step": 66855, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 1.437615 }, { "epoch": 2.8644873827171073, "grad_norm": 5.010950565338135, "learning_rate": 3.864912250177704e-05, "loss": 2.4862348556518556, "memory(GiB)": 77.56, "step": 66860, "token_acc": 0.4872611464968153, "train_speed(iter/s)": 1.437618 }, { "epoch": 2.864701598046356, "grad_norm": 3.9060218334198, "learning_rate": 3.864256854144912e-05, "loss": 2.618460464477539, "memory(GiB)": 77.56, "step": 66865, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.437629 }, { "epoch": 2.864915813375605, "grad_norm": 4.667769432067871, "learning_rate": 3.863601478687171e-05, "loss": 2.0842124938964846, "memory(GiB)": 77.56, "step": 66870, "token_acc": 0.5397350993377483, "train_speed(iter/s)": 1.437627 }, { "epoch": 2.8651300287048542, "grad_norm": 5.057943344116211, "learning_rate": 3.862946123816357e-05, "loss": 2.2692262649536135, "memory(GiB)": 77.56, "step": 66875, "token_acc": 0.4970414201183432, "train_speed(iter/s)": 1.43761 }, { "epoch": 2.865344244034103, "grad_norm": 7.909023284912109, "learning_rate": 3.86229078954434e-05, "loss": 2.307342529296875, "memory(GiB)": 77.56, "step": 66880, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 1.437615 }, { "epoch": 2.865558459363352, "grad_norm": 4.595394134521484, "learning_rate": 3.8616354758829915e-05, "loss": 2.4317653656005858, "memory(GiB)": 77.56, "step": 66885, "token_acc": 0.4306569343065693, "train_speed(iter/s)": 1.437607 }, { "epoch": 2.865772674692601, "grad_norm": 5.636876583099365, "learning_rate": 3.8609801828441855e-05, "loss": 2.557910919189453, "memory(GiB)": 77.56, "step": 66890, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.437609 }, { "epoch": 2.86598689002185, "grad_norm": 5.9952521324157715, "learning_rate": 3.860324910439791e-05, "loss": 2.1720706939697267, "memory(GiB)": 77.56, "step": 66895, "token_acc": 0.484149855907781, "train_speed(iter/s)": 1.437626 }, { "epoch": 2.8662011053510987, "grad_norm": 4.391422748565674, "learning_rate": 3.859669658681681e-05, "loss": 2.628008270263672, "memory(GiB)": 77.56, "step": 66900, "token_acc": 0.4713375796178344, "train_speed(iter/s)": 1.437642 }, { "epoch": 2.866415320680348, "grad_norm": 6.580483436584473, "learning_rate": 3.859014427581724e-05, "loss": 2.326628303527832, "memory(GiB)": 77.56, "step": 66905, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.437641 }, { "epoch": 2.866629536009597, "grad_norm": 5.895464897155762, "learning_rate": 3.858359217151791e-05, "loss": 2.2959293365478515, "memory(GiB)": 77.56, "step": 66910, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 1.437648 }, { "epoch": 2.8668437513388456, "grad_norm": 3.5169382095336914, "learning_rate": 3.857704027403752e-05, "loss": 2.210647201538086, "memory(GiB)": 77.56, "step": 66915, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437678 }, { "epoch": 2.867057966668095, "grad_norm": 6.579715728759766, "learning_rate": 3.8570488583494766e-05, "loss": 2.456513214111328, "memory(GiB)": 77.56, "step": 66920, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.437679 }, { "epoch": 2.8672721819973437, "grad_norm": 6.019247531890869, "learning_rate": 3.856393710000832e-05, "loss": 2.227117919921875, "memory(GiB)": 77.56, "step": 66925, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.437701 }, { "epoch": 2.8674863973265925, "grad_norm": 9.02134895324707, "learning_rate": 3.8557385823696875e-05, "loss": 2.2834991455078124, "memory(GiB)": 77.56, "step": 66930, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 1.43771 }, { "epoch": 2.8677006126558418, "grad_norm": 6.506422996520996, "learning_rate": 3.855083475467913e-05, "loss": 2.6903707504272463, "memory(GiB)": 77.56, "step": 66935, "token_acc": 0.4370860927152318, "train_speed(iter/s)": 1.437716 }, { "epoch": 2.8679148279850906, "grad_norm": 5.70438814163208, "learning_rate": 3.854428389307376e-05, "loss": 2.3973888397216796, "memory(GiB)": 77.56, "step": 66940, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.437719 }, { "epoch": 2.8681290433143394, "grad_norm": 6.45711612701416, "learning_rate": 3.853773323899943e-05, "loss": 2.4502588272094727, "memory(GiB)": 77.56, "step": 66945, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.437714 }, { "epoch": 2.8683432586435886, "grad_norm": 6.896414756774902, "learning_rate": 3.853118279257482e-05, "loss": 2.211759567260742, "memory(GiB)": 77.56, "step": 66950, "token_acc": 0.5062761506276151, "train_speed(iter/s)": 1.437734 }, { "epoch": 2.8685574739728374, "grad_norm": 4.425848484039307, "learning_rate": 3.852463255391858e-05, "loss": 2.283206558227539, "memory(GiB)": 77.56, "step": 66955, "token_acc": 0.5088757396449705, "train_speed(iter/s)": 1.437766 }, { "epoch": 2.8687716893020863, "grad_norm": 5.747751235961914, "learning_rate": 3.85180825231494e-05, "loss": 2.165032958984375, "memory(GiB)": 77.56, "step": 66960, "token_acc": 0.5570934256055363, "train_speed(iter/s)": 1.437783 }, { "epoch": 2.8689859046313355, "grad_norm": 6.781982898712158, "learning_rate": 3.851153270038592e-05, "loss": 2.479697036743164, "memory(GiB)": 77.56, "step": 66965, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.437802 }, { "epoch": 2.8692001199605843, "grad_norm": 7.020091533660889, "learning_rate": 3.85049830857468e-05, "loss": 2.521246910095215, "memory(GiB)": 77.56, "step": 66970, "token_acc": 0.532608695652174, "train_speed(iter/s)": 1.437832 }, { "epoch": 2.869414335289833, "grad_norm": 5.100048542022705, "learning_rate": 3.84984336793507e-05, "loss": 2.2837223052978515, "memory(GiB)": 77.56, "step": 66975, "token_acc": 0.5289855072463768, "train_speed(iter/s)": 1.437855 }, { "epoch": 2.8696285506190824, "grad_norm": 6.363317966461182, "learning_rate": 3.849188448131625e-05, "loss": 2.108646011352539, "memory(GiB)": 77.56, "step": 66980, "token_acc": 0.5613382899628253, "train_speed(iter/s)": 1.437829 }, { "epoch": 2.869842765948331, "grad_norm": 5.7891645431518555, "learning_rate": 3.8485335491762126e-05, "loss": 2.3036727905273438, "memory(GiB)": 77.56, "step": 66985, "token_acc": 0.5379061371841155, "train_speed(iter/s)": 1.437849 }, { "epoch": 2.87005698127758, "grad_norm": 5.255712509155273, "learning_rate": 3.8478786710806946e-05, "loss": 2.4827831268310545, "memory(GiB)": 77.56, "step": 66990, "token_acc": 0.46886446886446886, "train_speed(iter/s)": 1.437855 }, { "epoch": 2.8702711966068293, "grad_norm": 5.912222862243652, "learning_rate": 3.847223813856934e-05, "loss": 2.238930892944336, "memory(GiB)": 77.56, "step": 66995, "token_acc": 0.5432098765432098, "train_speed(iter/s)": 1.437814 }, { "epoch": 2.870485411936078, "grad_norm": 6.585913181304932, "learning_rate": 3.846568977516797e-05, "loss": 2.569080352783203, "memory(GiB)": 77.56, "step": 67000, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437806 }, { "epoch": 2.870485411936078, "eval_loss": 2.285208225250244, "eval_runtime": 14.177, "eval_samples_per_second": 7.054, "eval_steps_per_second": 7.054, "eval_token_acc": 0.4603616133518776, "step": 67000 }, { "epoch": 2.870699627265327, "grad_norm": 5.607452392578125, "learning_rate": 3.845914162072143e-05, "loss": 2.2495880126953125, "memory(GiB)": 77.56, "step": 67005, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.437342 }, { "epoch": 2.870913842594576, "grad_norm": 5.4821319580078125, "learning_rate": 3.845259367534838e-05, "loss": 2.571170425415039, "memory(GiB)": 77.56, "step": 67010, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.43737 }, { "epoch": 2.871128057923825, "grad_norm": 4.354070663452148, "learning_rate": 3.8446045939167425e-05, "loss": 2.3739681243896484, "memory(GiB)": 77.56, "step": 67015, "token_acc": 0.52046783625731, "train_speed(iter/s)": 1.437403 }, { "epoch": 2.871342273253074, "grad_norm": 5.79425048828125, "learning_rate": 3.8439498412297195e-05, "loss": 2.485355758666992, "memory(GiB)": 77.56, "step": 67020, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.43739 }, { "epoch": 2.871556488582323, "grad_norm": 5.1980743408203125, "learning_rate": 3.843295109485629e-05, "loss": 2.444027328491211, "memory(GiB)": 77.56, "step": 67025, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.437394 }, { "epoch": 2.871770703911572, "grad_norm": 7.363042831420898, "learning_rate": 3.842640398696332e-05, "loss": 2.584497833251953, "memory(GiB)": 77.56, "step": 67030, "token_acc": 0.4621212121212121, "train_speed(iter/s)": 1.437383 }, { "epoch": 2.8719849192408207, "grad_norm": 5.0718536376953125, "learning_rate": 3.841985708873691e-05, "loss": 2.4830461502075196, "memory(GiB)": 77.56, "step": 67035, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.437387 }, { "epoch": 2.87219913457007, "grad_norm": 5.529065132141113, "learning_rate": 3.8413310400295636e-05, "loss": 2.332920455932617, "memory(GiB)": 77.56, "step": 67040, "token_acc": 0.4783950617283951, "train_speed(iter/s)": 1.437412 }, { "epoch": 2.8724133498993187, "grad_norm": 8.685287475585938, "learning_rate": 3.8406763921758124e-05, "loss": 2.4722246170043944, "memory(GiB)": 77.56, "step": 67045, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.437402 }, { "epoch": 2.8726275652285675, "grad_norm": 6.294495582580566, "learning_rate": 3.840021765324297e-05, "loss": 2.3772228240966795, "memory(GiB)": 77.56, "step": 67050, "token_acc": 0.5205992509363296, "train_speed(iter/s)": 1.437411 }, { "epoch": 2.872841780557817, "grad_norm": 5.3892388343811035, "learning_rate": 3.839367159486874e-05, "loss": 2.1171764373779296, "memory(GiB)": 77.56, "step": 67055, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.437421 }, { "epoch": 2.8730559958870656, "grad_norm": 5.201147556304932, "learning_rate": 3.838712574675404e-05, "loss": 2.3917259216308593, "memory(GiB)": 77.56, "step": 67060, "token_acc": 0.5236593059936908, "train_speed(iter/s)": 1.437456 }, { "epoch": 2.8732702112163144, "grad_norm": 7.4396562576293945, "learning_rate": 3.838058010901746e-05, "loss": 2.3074174880981446, "memory(GiB)": 77.56, "step": 67065, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437475 }, { "epoch": 2.8734844265455637, "grad_norm": 6.192290306091309, "learning_rate": 3.8374034681777546e-05, "loss": 2.577297019958496, "memory(GiB)": 77.56, "step": 67070, "token_acc": 0.4744744744744745, "train_speed(iter/s)": 1.437502 }, { "epoch": 2.8736986418748125, "grad_norm": 4.847055435180664, "learning_rate": 3.8367489465152905e-05, "loss": 2.4470558166503906, "memory(GiB)": 77.56, "step": 67075, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.437531 }, { "epoch": 2.8739128572040613, "grad_norm": 4.06255578994751, "learning_rate": 3.836094445926212e-05, "loss": 2.683848762512207, "memory(GiB)": 77.56, "step": 67080, "token_acc": 0.4604904632152589, "train_speed(iter/s)": 1.437554 }, { "epoch": 2.8741270725333106, "grad_norm": 5.534252643585205, "learning_rate": 3.835439966422373e-05, "loss": 2.389699172973633, "memory(GiB)": 77.56, "step": 67085, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.437554 }, { "epoch": 2.8743412878625594, "grad_norm": 4.587975978851318, "learning_rate": 3.834785508015633e-05, "loss": 2.599444580078125, "memory(GiB)": 77.56, "step": 67090, "token_acc": 0.4358108108108108, "train_speed(iter/s)": 1.437568 }, { "epoch": 2.874555503191808, "grad_norm": 6.065348148345947, "learning_rate": 3.834131070717847e-05, "loss": 2.6929557800292967, "memory(GiB)": 77.56, "step": 67095, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.437598 }, { "epoch": 2.8747697185210574, "grad_norm": 5.553237438201904, "learning_rate": 3.833476654540869e-05, "loss": 2.803675079345703, "memory(GiB)": 77.56, "step": 67100, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.4376 }, { "epoch": 2.8749839338503063, "grad_norm": 5.479255199432373, "learning_rate": 3.8328222594965555e-05, "loss": 2.485291862487793, "memory(GiB)": 77.56, "step": 67105, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.437585 }, { "epoch": 2.875198149179555, "grad_norm": 5.464146137237549, "learning_rate": 3.832167885596763e-05, "loss": 2.490620422363281, "memory(GiB)": 77.56, "step": 67110, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.437571 }, { "epoch": 2.8754123645088043, "grad_norm": 5.560009479522705, "learning_rate": 3.8315135328533435e-05, "loss": 2.253011131286621, "memory(GiB)": 77.56, "step": 67115, "token_acc": 0.5252100840336135, "train_speed(iter/s)": 1.437548 }, { "epoch": 2.875626579838053, "grad_norm": 4.747530460357666, "learning_rate": 3.830859201278153e-05, "loss": 2.211527633666992, "memory(GiB)": 77.56, "step": 67120, "token_acc": 0.528169014084507, "train_speed(iter/s)": 1.437558 }, { "epoch": 2.875840795167302, "grad_norm": 4.919352054595947, "learning_rate": 3.830204890883044e-05, "loss": 2.3209238052368164, "memory(GiB)": 77.56, "step": 67125, "token_acc": 0.4820717131474104, "train_speed(iter/s)": 1.437588 }, { "epoch": 2.876055010496551, "grad_norm": 4.848925590515137, "learning_rate": 3.829550601679872e-05, "loss": 2.360421371459961, "memory(GiB)": 77.56, "step": 67130, "token_acc": 0.5360824742268041, "train_speed(iter/s)": 1.437613 }, { "epoch": 2.8762692258258, "grad_norm": 4.4091644287109375, "learning_rate": 3.828896333680489e-05, "loss": 2.502337837219238, "memory(GiB)": 77.56, "step": 67135, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.437604 }, { "epoch": 2.876483441155049, "grad_norm": 6.011263847351074, "learning_rate": 3.828242086896746e-05, "loss": 2.267398452758789, "memory(GiB)": 77.56, "step": 67140, "token_acc": 0.521311475409836, "train_speed(iter/s)": 1.437606 }, { "epoch": 2.876697656484298, "grad_norm": 6.308224201202393, "learning_rate": 3.827587861340498e-05, "loss": 2.236736297607422, "memory(GiB)": 77.56, "step": 67145, "token_acc": 0.5346534653465347, "train_speed(iter/s)": 1.437618 }, { "epoch": 2.876911871813547, "grad_norm": 5.017220497131348, "learning_rate": 3.826933657023594e-05, "loss": 2.130982780456543, "memory(GiB)": 77.56, "step": 67150, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.437599 }, { "epoch": 2.8771260871427957, "grad_norm": 9.085668563842773, "learning_rate": 3.826279473957889e-05, "loss": 2.5644020080566405, "memory(GiB)": 77.56, "step": 67155, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.437605 }, { "epoch": 2.877340302472045, "grad_norm": 8.26830768585205, "learning_rate": 3.825625312155231e-05, "loss": 2.3862737655639648, "memory(GiB)": 77.56, "step": 67160, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.437631 }, { "epoch": 2.877554517801294, "grad_norm": 5.186570644378662, "learning_rate": 3.824971171627474e-05, "loss": 2.638486862182617, "memory(GiB)": 77.56, "step": 67165, "token_acc": 0.45110410094637227, "train_speed(iter/s)": 1.437641 }, { "epoch": 2.8777687331305426, "grad_norm": 6.128634929656982, "learning_rate": 3.8243170523864655e-05, "loss": 2.4484525680541993, "memory(GiB)": 77.56, "step": 67170, "token_acc": 0.5175097276264592, "train_speed(iter/s)": 1.437654 }, { "epoch": 2.877982948459792, "grad_norm": 4.915328025817871, "learning_rate": 3.823662954444056e-05, "loss": 2.40826416015625, "memory(GiB)": 77.56, "step": 67175, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.437664 }, { "epoch": 2.8781971637890407, "grad_norm": 4.873974800109863, "learning_rate": 3.823008877812097e-05, "loss": 2.310495948791504, "memory(GiB)": 77.56, "step": 67180, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.8784113791182895, "grad_norm": 5.154747009277344, "learning_rate": 3.822354822502434e-05, "loss": 2.3504953384399414, "memory(GiB)": 77.56, "step": 67185, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.437706 }, { "epoch": 2.8786255944475387, "grad_norm": 5.984213352203369, "learning_rate": 3.821700788526921e-05, "loss": 2.4143396377563477, "memory(GiB)": 77.56, "step": 67190, "token_acc": 0.44984802431610943, "train_speed(iter/s)": 1.437687 }, { "epoch": 2.8788398097767876, "grad_norm": 5.142282962799072, "learning_rate": 3.8210467758974026e-05, "loss": 2.1933261871337892, "memory(GiB)": 77.56, "step": 67195, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 1.437662 }, { "epoch": 2.8790540251060364, "grad_norm": 4.387335777282715, "learning_rate": 3.820392784625726e-05, "loss": 2.368913459777832, "memory(GiB)": 77.56, "step": 67200, "token_acc": 0.5092936802973977, "train_speed(iter/s)": 1.437664 }, { "epoch": 2.8792682404352856, "grad_norm": 6.414888381958008, "learning_rate": 3.819738814723742e-05, "loss": 2.5073263168334963, "memory(GiB)": 77.56, "step": 67205, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.437657 }, { "epoch": 2.8794824557645344, "grad_norm": 6.047554969787598, "learning_rate": 3.8190848662032974e-05, "loss": 2.3778564453125, "memory(GiB)": 77.56, "step": 67210, "token_acc": 0.4786885245901639, "train_speed(iter/s)": 1.437668 }, { "epoch": 2.8796966710937832, "grad_norm": 7.756847381591797, "learning_rate": 3.8184309390762364e-05, "loss": 2.697977828979492, "memory(GiB)": 77.56, "step": 67215, "token_acc": 0.4280821917808219, "train_speed(iter/s)": 1.437693 }, { "epoch": 2.8799108864230325, "grad_norm": 7.670078754425049, "learning_rate": 3.817777033354409e-05, "loss": 2.2733713150024415, "memory(GiB)": 77.56, "step": 67220, "token_acc": 0.4860557768924303, "train_speed(iter/s)": 1.437706 }, { "epoch": 2.8801251017522813, "grad_norm": 8.820830345153809, "learning_rate": 3.817123149049657e-05, "loss": 2.411340522766113, "memory(GiB)": 77.56, "step": 67225, "token_acc": 0.5068027210884354, "train_speed(iter/s)": 1.437729 }, { "epoch": 2.88033931708153, "grad_norm": 4.12614107131958, "learning_rate": 3.81646928617383e-05, "loss": 2.4285394668579103, "memory(GiB)": 77.56, "step": 67230, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.437751 }, { "epoch": 2.8805535324107794, "grad_norm": 6.352338790893555, "learning_rate": 3.8158154447387737e-05, "loss": 2.4290489196777343, "memory(GiB)": 77.56, "step": 67235, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.437727 }, { "epoch": 2.880767747740028, "grad_norm": 4.837244987487793, "learning_rate": 3.81516162475633e-05, "loss": 2.3741870880126954, "memory(GiB)": 77.56, "step": 67240, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.437724 }, { "epoch": 2.880981963069277, "grad_norm": 6.899613857269287, "learning_rate": 3.814507826238345e-05, "loss": 2.3662992477416993, "memory(GiB)": 77.56, "step": 67245, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.437713 }, { "epoch": 2.8811961783985263, "grad_norm": 4.979419708251953, "learning_rate": 3.813854049196664e-05, "loss": 2.2906234741210936, "memory(GiB)": 77.56, "step": 67250, "token_acc": 0.46785714285714286, "train_speed(iter/s)": 1.437695 }, { "epoch": 2.881410393727775, "grad_norm": 4.446743011474609, "learning_rate": 3.813200293643129e-05, "loss": 2.6041790008544923, "memory(GiB)": 77.56, "step": 67255, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.881624609057024, "grad_norm": 6.072567939758301, "learning_rate": 3.8125465595895834e-05, "loss": 2.1654294967651366, "memory(GiB)": 77.56, "step": 67260, "token_acc": 0.5492957746478874, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.881838824386273, "grad_norm": 11.573997497558594, "learning_rate": 3.811892847047872e-05, "loss": 2.752538299560547, "memory(GiB)": 77.56, "step": 67265, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.437722 }, { "epoch": 2.882053039715522, "grad_norm": 4.243500709533691, "learning_rate": 3.811239156029835e-05, "loss": 2.3777835845947264, "memory(GiB)": 77.56, "step": 67270, "token_acc": 0.5058823529411764, "train_speed(iter/s)": 1.437737 }, { "epoch": 2.8822672550447708, "grad_norm": 6.113096714019775, "learning_rate": 3.8105854865473164e-05, "loss": 2.6341041564941405, "memory(GiB)": 77.56, "step": 67275, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.437747 }, { "epoch": 2.88248147037402, "grad_norm": 5.571936130523682, "learning_rate": 3.809931838612159e-05, "loss": 2.5526802062988283, "memory(GiB)": 77.56, "step": 67280, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437787 }, { "epoch": 2.882695685703269, "grad_norm": 4.928206443786621, "learning_rate": 3.8092782122362e-05, "loss": 1.855202865600586, "memory(GiB)": 77.56, "step": 67285, "token_acc": 0.5729537366548043, "train_speed(iter/s)": 1.437784 }, { "epoch": 2.8829099010325177, "grad_norm": 7.276062965393066, "learning_rate": 3.808624607431285e-05, "loss": 2.2683895111083983, "memory(GiB)": 77.56, "step": 67290, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.437797 }, { "epoch": 2.883124116361767, "grad_norm": 4.760739803314209, "learning_rate": 3.807971024209252e-05, "loss": 2.5077787399291993, "memory(GiB)": 77.56, "step": 67295, "token_acc": 0.5259259259259259, "train_speed(iter/s)": 1.437801 }, { "epoch": 2.8833383316910157, "grad_norm": 5.484633922576904, "learning_rate": 3.8073174625819406e-05, "loss": 2.411721420288086, "memory(GiB)": 77.56, "step": 67300, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.437819 }, { "epoch": 2.8835525470202645, "grad_norm": 5.4311418533325195, "learning_rate": 3.8066639225611934e-05, "loss": 2.6528240203857423, "memory(GiB)": 77.56, "step": 67305, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437813 }, { "epoch": 2.883766762349514, "grad_norm": 4.820439338684082, "learning_rate": 3.8060104041588494e-05, "loss": 2.3418437957763674, "memory(GiB)": 77.56, "step": 67310, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 1.437829 }, { "epoch": 2.8839809776787626, "grad_norm": 5.935538291931152, "learning_rate": 3.805356907386747e-05, "loss": 2.3616310119628907, "memory(GiB)": 77.56, "step": 67315, "token_acc": 0.46864686468646866, "train_speed(iter/s)": 1.437836 }, { "epoch": 2.8841951930080114, "grad_norm": 5.296969413757324, "learning_rate": 3.8047034322567246e-05, "loss": 2.4964599609375, "memory(GiB)": 77.56, "step": 67320, "token_acc": 0.5, "train_speed(iter/s)": 1.437828 }, { "epoch": 2.8844094083372607, "grad_norm": 5.347250461578369, "learning_rate": 3.804049978780621e-05, "loss": 2.402444267272949, "memory(GiB)": 77.56, "step": 67325, "token_acc": 0.4751552795031056, "train_speed(iter/s)": 1.437819 }, { "epoch": 2.8846236236665095, "grad_norm": 5.347058296203613, "learning_rate": 3.803396546970273e-05, "loss": 2.3301054000854493, "memory(GiB)": 77.56, "step": 67330, "token_acc": 0.4952978056426332, "train_speed(iter/s)": 1.43785 }, { "epoch": 2.8848378389957583, "grad_norm": 6.328917503356934, "learning_rate": 3.80274313683752e-05, "loss": 2.4490352630615235, "memory(GiB)": 77.56, "step": 67335, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.437879 }, { "epoch": 2.8850520543250076, "grad_norm": 5.516933917999268, "learning_rate": 3.802089748394199e-05, "loss": 2.2419479370117186, "memory(GiB)": 77.56, "step": 67340, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 1.437885 }, { "epoch": 2.8852662696542564, "grad_norm": 4.902294158935547, "learning_rate": 3.801436381652144e-05, "loss": 2.5699737548828123, "memory(GiB)": 77.56, "step": 67345, "token_acc": 0.486404833836858, "train_speed(iter/s)": 1.437875 }, { "epoch": 2.885480484983505, "grad_norm": 4.538571357727051, "learning_rate": 3.800783036623194e-05, "loss": 2.6129196166992186, "memory(GiB)": 77.56, "step": 67350, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.437867 }, { "epoch": 2.8856947003127544, "grad_norm": 5.637793064117432, "learning_rate": 3.8001297133191855e-05, "loss": 2.636853790283203, "memory(GiB)": 77.56, "step": 67355, "token_acc": 0.5162337662337663, "train_speed(iter/s)": 1.437886 }, { "epoch": 2.8859089156420032, "grad_norm": 5.997646331787109, "learning_rate": 3.799476411751951e-05, "loss": 2.452348327636719, "memory(GiB)": 77.56, "step": 67360, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.437903 }, { "epoch": 2.886123130971252, "grad_norm": 5.581997871398926, "learning_rate": 3.798823131933329e-05, "loss": 2.5143415451049806, "memory(GiB)": 77.56, "step": 67365, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.437915 }, { "epoch": 2.8863373463005013, "grad_norm": 5.182910442352295, "learning_rate": 3.79816987387515e-05, "loss": 2.539763069152832, "memory(GiB)": 77.56, "step": 67370, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.437924 }, { "epoch": 2.88655156162975, "grad_norm": 4.865305423736572, "learning_rate": 3.797516637589252e-05, "loss": 2.399571990966797, "memory(GiB)": 77.56, "step": 67375, "token_acc": 0.48398576512455516, "train_speed(iter/s)": 1.43795 }, { "epoch": 2.886765776958999, "grad_norm": 6.325377941131592, "learning_rate": 3.79686342308747e-05, "loss": 2.6419435501098634, "memory(GiB)": 77.56, "step": 67380, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.437945 }, { "epoch": 2.886979992288248, "grad_norm": 4.160820484161377, "learning_rate": 3.796210230381635e-05, "loss": 2.205731010437012, "memory(GiB)": 77.56, "step": 67385, "token_acc": 0.5325077399380805, "train_speed(iter/s)": 1.437932 }, { "epoch": 2.887194207617497, "grad_norm": 6.333376884460449, "learning_rate": 3.795557059483581e-05, "loss": 2.538529396057129, "memory(GiB)": 77.56, "step": 67390, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.437958 }, { "epoch": 2.887408422946746, "grad_norm": 5.403401851654053, "learning_rate": 3.794903910405141e-05, "loss": 2.198395538330078, "memory(GiB)": 77.56, "step": 67395, "token_acc": 0.5397489539748954, "train_speed(iter/s)": 1.437965 }, { "epoch": 2.887622638275995, "grad_norm": 6.6402130126953125, "learning_rate": 3.794250783158148e-05, "loss": 2.176066780090332, "memory(GiB)": 77.56, "step": 67400, "token_acc": 0.4962962962962963, "train_speed(iter/s)": 1.437951 }, { "epoch": 2.887836853605244, "grad_norm": 6.510558128356934, "learning_rate": 3.793597677754431e-05, "loss": 2.2711370468139647, "memory(GiB)": 77.56, "step": 67405, "token_acc": 0.5044776119402985, "train_speed(iter/s)": 1.437964 }, { "epoch": 2.8880510689344927, "grad_norm": 5.322099685668945, "learning_rate": 3.792944594205825e-05, "loss": 2.542129707336426, "memory(GiB)": 77.56, "step": 67410, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.43795 }, { "epoch": 2.888265284263742, "grad_norm": 5.281303405761719, "learning_rate": 3.792291532524159e-05, "loss": 2.512421417236328, "memory(GiB)": 77.56, "step": 67415, "token_acc": 0.49107142857142855, "train_speed(iter/s)": 1.437952 }, { "epoch": 2.8884794995929908, "grad_norm": 6.071463108062744, "learning_rate": 3.791638492721266e-05, "loss": 2.574600028991699, "memory(GiB)": 77.56, "step": 67420, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.437958 }, { "epoch": 2.8886937149222396, "grad_norm": 5.547637939453125, "learning_rate": 3.790985474808975e-05, "loss": 2.6919523239135743, "memory(GiB)": 77.56, "step": 67425, "token_acc": 0.47183098591549294, "train_speed(iter/s)": 1.437976 }, { "epoch": 2.888907930251489, "grad_norm": 5.751314640045166, "learning_rate": 3.790332478799115e-05, "loss": 2.456285095214844, "memory(GiB)": 77.56, "step": 67430, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.437991 }, { "epoch": 2.8891221455807377, "grad_norm": 5.877937316894531, "learning_rate": 3.789679504703519e-05, "loss": 2.383719253540039, "memory(GiB)": 77.56, "step": 67435, "token_acc": 0.5034965034965035, "train_speed(iter/s)": 1.438006 }, { "epoch": 2.8893363609099865, "grad_norm": 5.98226261138916, "learning_rate": 3.789026552534013e-05, "loss": 2.4926692962646486, "memory(GiB)": 77.56, "step": 67440, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.438019 }, { "epoch": 2.8895505762392357, "grad_norm": 4.849386215209961, "learning_rate": 3.788373622302426e-05, "loss": 2.1820966720581056, "memory(GiB)": 77.56, "step": 67445, "token_acc": 0.4935897435897436, "train_speed(iter/s)": 1.438034 }, { "epoch": 2.8897647915684845, "grad_norm": 4.841516971588135, "learning_rate": 3.787720714020587e-05, "loss": 2.5233510971069335, "memory(GiB)": 77.56, "step": 67450, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.438032 }, { "epoch": 2.8899790068977333, "grad_norm": 7.413880348205566, "learning_rate": 3.787067827700327e-05, "loss": 2.2927804946899415, "memory(GiB)": 77.56, "step": 67455, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.438029 }, { "epoch": 2.8901932222269826, "grad_norm": 5.3944411277771, "learning_rate": 3.7864149633534704e-05, "loss": 2.2840316772460936, "memory(GiB)": 77.56, "step": 67460, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.438031 }, { "epoch": 2.8904074375562314, "grad_norm": 6.688461780548096, "learning_rate": 3.785762120991844e-05, "loss": 2.2583633422851563, "memory(GiB)": 77.56, "step": 67465, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.438028 }, { "epoch": 2.8906216528854802, "grad_norm": 4.892764091491699, "learning_rate": 3.7851093006272775e-05, "loss": 2.6320043563842774, "memory(GiB)": 77.56, "step": 67470, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.438043 }, { "epoch": 2.8908358682147295, "grad_norm": 6.65464448928833, "learning_rate": 3.784456502271594e-05, "loss": 2.7293590545654296, "memory(GiB)": 77.56, "step": 67475, "token_acc": 0.45483870967741935, "train_speed(iter/s)": 1.438061 }, { "epoch": 2.8910500835439783, "grad_norm": 5.529016017913818, "learning_rate": 3.783803725936622e-05, "loss": 2.436013031005859, "memory(GiB)": 77.56, "step": 67480, "token_acc": 0.4560669456066946, "train_speed(iter/s)": 1.438064 }, { "epoch": 2.891264298873227, "grad_norm": 6.098552703857422, "learning_rate": 3.783150971634187e-05, "loss": 2.463293266296387, "memory(GiB)": 77.56, "step": 67485, "token_acc": 0.4831804281345566, "train_speed(iter/s)": 1.438069 }, { "epoch": 2.8914785142024764, "grad_norm": 5.520616054534912, "learning_rate": 3.782498239376113e-05, "loss": 2.5195693969726562, "memory(GiB)": 77.56, "step": 67490, "token_acc": 0.4448818897637795, "train_speed(iter/s)": 1.438075 }, { "epoch": 2.891692729531725, "grad_norm": 6.759217262268066, "learning_rate": 3.781845529174226e-05, "loss": 2.475371742248535, "memory(GiB)": 77.56, "step": 67495, "token_acc": 0.4470588235294118, "train_speed(iter/s)": 1.438082 }, { "epoch": 2.891906944860974, "grad_norm": 5.455395221710205, "learning_rate": 3.7811928410403495e-05, "loss": 2.7228761672973634, "memory(GiB)": 77.56, "step": 67500, "token_acc": 0.46547314578005117, "train_speed(iter/s)": 1.438105 }, { "epoch": 2.891906944860974, "eval_loss": 2.264267683029175, "eval_runtime": 14.619, "eval_samples_per_second": 6.84, "eval_steps_per_second": 6.84, "eval_token_acc": 0.458656330749354, "step": 67500 }, { "epoch": 2.8921211601902233, "grad_norm": 6.082050800323486, "learning_rate": 3.780540174986308e-05, "loss": 2.2605606079101563, "memory(GiB)": 77.56, "step": 67505, "token_acc": 0.4785992217898833, "train_speed(iter/s)": 1.43765 }, { "epoch": 2.892335375519472, "grad_norm": 5.861954689025879, "learning_rate": 3.7798875310239254e-05, "loss": 2.2912506103515624, "memory(GiB)": 77.56, "step": 67510, "token_acc": 0.5019455252918288, "train_speed(iter/s)": 1.437649 }, { "epoch": 2.892549590848721, "grad_norm": 5.445698261260986, "learning_rate": 3.779234909165024e-05, "loss": 2.5807964324951174, "memory(GiB)": 77.56, "step": 67515, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.437647 }, { "epoch": 2.89276380617797, "grad_norm": 6.062221050262451, "learning_rate": 3.7785823094214265e-05, "loss": 2.5411703109741213, "memory(GiB)": 77.56, "step": 67520, "token_acc": 0.4674329501915709, "train_speed(iter/s)": 1.437661 }, { "epoch": 2.892978021507219, "grad_norm": 4.514892101287842, "learning_rate": 3.777929731804958e-05, "loss": 2.2641124725341797, "memory(GiB)": 77.56, "step": 67525, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.437659 }, { "epoch": 2.8931922368364678, "grad_norm": 5.516725063323975, "learning_rate": 3.777277176327439e-05, "loss": 2.4197708129882813, "memory(GiB)": 77.56, "step": 67530, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437656 }, { "epoch": 2.893406452165717, "grad_norm": 4.598053455352783, "learning_rate": 3.7766246430006904e-05, "loss": 2.4598995208740235, "memory(GiB)": 77.56, "step": 67535, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.437665 }, { "epoch": 2.893620667494966, "grad_norm": 4.395597457885742, "learning_rate": 3.7759721318365344e-05, "loss": 2.346821403503418, "memory(GiB)": 77.56, "step": 67540, "token_acc": 0.5094850948509485, "train_speed(iter/s)": 1.437669 }, { "epoch": 2.8938348828242146, "grad_norm": 6.470637798309326, "learning_rate": 3.7753196428467915e-05, "loss": 2.3654699325561523, "memory(GiB)": 77.56, "step": 67545, "token_acc": 0.5099337748344371, "train_speed(iter/s)": 1.43767 }, { "epoch": 2.894049098153464, "grad_norm": 4.622028827667236, "learning_rate": 3.774667176043281e-05, "loss": 2.2633066177368164, "memory(GiB)": 77.56, "step": 67550, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.437633 }, { "epoch": 2.8942633134827127, "grad_norm": 4.285539150238037, "learning_rate": 3.7740147314378254e-05, "loss": 2.409061241149902, "memory(GiB)": 77.56, "step": 67555, "token_acc": 0.5313653136531366, "train_speed(iter/s)": 1.437646 }, { "epoch": 2.8944775288119615, "grad_norm": 5.370772838592529, "learning_rate": 3.7733623090422424e-05, "loss": 2.349081802368164, "memory(GiB)": 77.56, "step": 67560, "token_acc": 0.44666666666666666, "train_speed(iter/s)": 1.437642 }, { "epoch": 2.8946917441412108, "grad_norm": 6.547722816467285, "learning_rate": 3.772709908868351e-05, "loss": 2.361754608154297, "memory(GiB)": 77.56, "step": 67565, "token_acc": 0.4859437751004016, "train_speed(iter/s)": 1.437616 }, { "epoch": 2.8949059594704596, "grad_norm": 5.333826541900635, "learning_rate": 3.772057530927973e-05, "loss": 2.525577163696289, "memory(GiB)": 77.56, "step": 67570, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.43764 }, { "epoch": 2.8951201747997084, "grad_norm": 6.340412139892578, "learning_rate": 3.771405175232923e-05, "loss": 2.3988569259643553, "memory(GiB)": 77.56, "step": 67575, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 1.43765 }, { "epoch": 2.8953343901289577, "grad_norm": 7.052786827087402, "learning_rate": 3.7707528417950215e-05, "loss": 2.3821653366088866, "memory(GiB)": 77.56, "step": 67580, "token_acc": 0.522633744855967, "train_speed(iter/s)": 1.437678 }, { "epoch": 2.8955486054582065, "grad_norm": 4.3135986328125, "learning_rate": 3.770100530626086e-05, "loss": 2.5312820434570313, "memory(GiB)": 77.56, "step": 67585, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.43768 }, { "epoch": 2.8957628207874553, "grad_norm": 10.569611549377441, "learning_rate": 3.769448241737932e-05, "loss": 2.485857391357422, "memory(GiB)": 77.56, "step": 67590, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.8959770361167045, "grad_norm": 5.541058540344238, "learning_rate": 3.7687959751423755e-05, "loss": 2.4066404342651366, "memory(GiB)": 77.56, "step": 67595, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 1.437666 }, { "epoch": 2.8961912514459534, "grad_norm": 5.763273239135742, "learning_rate": 3.768143730851238e-05, "loss": 2.345656394958496, "memory(GiB)": 77.56, "step": 67600, "token_acc": 0.49372384937238495, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.896405466775202, "grad_norm": 9.806373596191406, "learning_rate": 3.767491508876331e-05, "loss": 2.5688247680664062, "memory(GiB)": 77.56, "step": 67605, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.437709 }, { "epoch": 2.8966196821044514, "grad_norm": 7.553590297698975, "learning_rate": 3.7668393092294723e-05, "loss": 2.611219024658203, "memory(GiB)": 77.56, "step": 67610, "token_acc": 0.4470198675496689, "train_speed(iter/s)": 1.437687 }, { "epoch": 2.8968338974337002, "grad_norm": 4.764229774475098, "learning_rate": 3.766187131922476e-05, "loss": 2.3265857696533203, "memory(GiB)": 77.56, "step": 67615, "token_acc": 0.49693251533742333, "train_speed(iter/s)": 1.437692 }, { "epoch": 2.897048112762949, "grad_norm": 5.240024089813232, "learning_rate": 3.765534976967157e-05, "loss": 2.3967214584350587, "memory(GiB)": 77.56, "step": 67620, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.437699 }, { "epoch": 2.8972623280921983, "grad_norm": 6.137687683105469, "learning_rate": 3.7648828443753305e-05, "loss": 2.387478256225586, "memory(GiB)": 77.56, "step": 67625, "token_acc": 0.46311475409836067, "train_speed(iter/s)": 1.43773 }, { "epoch": 2.897476543421447, "grad_norm": 5.4151105880737305, "learning_rate": 3.76423073415881e-05, "loss": 2.5631446838378906, "memory(GiB)": 77.56, "step": 67630, "token_acc": 0.46332046332046334, "train_speed(iter/s)": 1.437734 }, { "epoch": 2.897690758750696, "grad_norm": 7.524933815002441, "learning_rate": 3.763578646329408e-05, "loss": 2.2344633102416993, "memory(GiB)": 77.56, "step": 67635, "token_acc": 0.5145985401459854, "train_speed(iter/s)": 1.437747 }, { "epoch": 2.897904974079945, "grad_norm": 5.050754070281982, "learning_rate": 3.76292658089894e-05, "loss": 2.4366222381591798, "memory(GiB)": 77.56, "step": 67640, "token_acc": 0.43253968253968256, "train_speed(iter/s)": 1.437753 }, { "epoch": 2.898119189409194, "grad_norm": 4.685484886169434, "learning_rate": 3.7622745378792156e-05, "loss": 2.6455270767211916, "memory(GiB)": 77.56, "step": 67645, "token_acc": 0.4894894894894895, "train_speed(iter/s)": 1.437753 }, { "epoch": 2.898333404738443, "grad_norm": 5.1919403076171875, "learning_rate": 3.761622517282051e-05, "loss": 2.156467819213867, "memory(GiB)": 77.56, "step": 67650, "token_acc": 0.5670498084291188, "train_speed(iter/s)": 1.437732 }, { "epoch": 2.898547620067692, "grad_norm": 4.4663777351379395, "learning_rate": 3.7609705191192555e-05, "loss": 2.167580223083496, "memory(GiB)": 77.56, "step": 67655, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437744 }, { "epoch": 2.898761835396941, "grad_norm": 6.099833011627197, "learning_rate": 3.760318543402641e-05, "loss": 2.646506500244141, "memory(GiB)": 77.56, "step": 67660, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.8989760507261897, "grad_norm": 6.132604598999023, "learning_rate": 3.75966659014402e-05, "loss": 2.4978260040283202, "memory(GiB)": 77.56, "step": 67665, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.437762 }, { "epoch": 2.899190266055439, "grad_norm": 5.117307662963867, "learning_rate": 3.759014659355199e-05, "loss": 2.4953018188476563, "memory(GiB)": 77.56, "step": 67670, "token_acc": 0.5, "train_speed(iter/s)": 1.437724 }, { "epoch": 2.8994044813846878, "grad_norm": 5.843194484710693, "learning_rate": 3.758362751047995e-05, "loss": 2.368437576293945, "memory(GiB)": 77.56, "step": 67675, "token_acc": 0.5, "train_speed(iter/s)": 1.437729 }, { "epoch": 2.8996186967139366, "grad_norm": 4.256525993347168, "learning_rate": 3.7577108652342124e-05, "loss": 2.5725986480712892, "memory(GiB)": 77.56, "step": 67680, "token_acc": 0.5, "train_speed(iter/s)": 1.437728 }, { "epoch": 2.899832912043186, "grad_norm": 5.475727558135986, "learning_rate": 3.7570590019256645e-05, "loss": 2.2663818359375, "memory(GiB)": 77.56, "step": 67685, "token_acc": 0.5189873417721519, "train_speed(iter/s)": 1.437729 }, { "epoch": 2.9000471273724346, "grad_norm": 5.523690700531006, "learning_rate": 3.756407161134159e-05, "loss": 2.3974020004272463, "memory(GiB)": 77.56, "step": 67690, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.437746 }, { "epoch": 2.900261342701684, "grad_norm": 4.970705509185791, "learning_rate": 3.7557553428715026e-05, "loss": 2.6790163040161135, "memory(GiB)": 77.56, "step": 67695, "token_acc": 0.475, "train_speed(iter/s)": 1.437721 }, { "epoch": 2.9004755580309327, "grad_norm": 4.712221145629883, "learning_rate": 3.7551035471495065e-05, "loss": 2.28826961517334, "memory(GiB)": 77.56, "step": 67700, "token_acc": 0.49640287769784175, "train_speed(iter/s)": 1.437695 }, { "epoch": 2.9006897733601815, "grad_norm": 6.098865509033203, "learning_rate": 3.7544517739799775e-05, "loss": 2.6099933624267577, "memory(GiB)": 77.56, "step": 67705, "token_acc": 0.4591439688715953, "train_speed(iter/s)": 1.43768 }, { "epoch": 2.900903988689431, "grad_norm": 6.39176082611084, "learning_rate": 3.753800023374723e-05, "loss": 2.5080766677856445, "memory(GiB)": 77.56, "step": 67710, "token_acc": 0.4797687861271676, "train_speed(iter/s)": 1.437667 }, { "epoch": 2.9011182040186796, "grad_norm": 4.354471683502197, "learning_rate": 3.7531482953455504e-05, "loss": 2.0768817901611327, "memory(GiB)": 77.56, "step": 67715, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.437654 }, { "epoch": 2.9013324193479284, "grad_norm": 4.666151523590088, "learning_rate": 3.752496589904265e-05, "loss": 2.4172800064086912, "memory(GiB)": 77.56, "step": 67720, "token_acc": 0.4774774774774775, "train_speed(iter/s)": 1.437677 }, { "epoch": 2.9015466346771777, "grad_norm": 4.78005313873291, "learning_rate": 3.751844907062675e-05, "loss": 2.4841840744018553, "memory(GiB)": 77.56, "step": 67725, "token_acc": 0.44107744107744107, "train_speed(iter/s)": 1.437683 }, { "epoch": 2.9017608500064265, "grad_norm": 4.88601541519165, "learning_rate": 3.7511932468325855e-05, "loss": 2.3090999603271483, "memory(GiB)": 77.56, "step": 67730, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.437674 }, { "epoch": 2.9019750653356753, "grad_norm": 6.03407621383667, "learning_rate": 3.750541609225801e-05, "loss": 2.605454444885254, "memory(GiB)": 77.56, "step": 67735, "token_acc": 0.46325878594249204, "train_speed(iter/s)": 1.437683 }, { "epoch": 2.9021892806649245, "grad_norm": 4.889484405517578, "learning_rate": 3.749889994254126e-05, "loss": 2.414371681213379, "memory(GiB)": 77.56, "step": 67740, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.437673 }, { "epoch": 2.9024034959941734, "grad_norm": 7.421288967132568, "learning_rate": 3.7492384019293686e-05, "loss": 2.4009025573730467, "memory(GiB)": 77.56, "step": 67745, "token_acc": 0.5204081632653061, "train_speed(iter/s)": 1.437642 }, { "epoch": 2.902617711323422, "grad_norm": 6.974843502044678, "learning_rate": 3.74858683226333e-05, "loss": 2.2267650604248046, "memory(GiB)": 77.56, "step": 67750, "token_acc": 0.5058365758754864, "train_speed(iter/s)": 1.437635 }, { "epoch": 2.9028319266526714, "grad_norm": 4.531940937042236, "learning_rate": 3.747935285267816e-05, "loss": 2.5790096282958985, "memory(GiB)": 77.56, "step": 67755, "token_acc": 0.45686900958466453, "train_speed(iter/s)": 1.437636 }, { "epoch": 2.9030461419819202, "grad_norm": 5.8044753074646, "learning_rate": 3.747283760954629e-05, "loss": 2.391276741027832, "memory(GiB)": 77.56, "step": 67760, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.437651 }, { "epoch": 2.903260357311169, "grad_norm": 6.199559211730957, "learning_rate": 3.74663225933557e-05, "loss": 2.37835578918457, "memory(GiB)": 77.56, "step": 67765, "token_acc": 0.4825174825174825, "train_speed(iter/s)": 1.437656 }, { "epoch": 2.9034745726404183, "grad_norm": 5.674834728240967, "learning_rate": 3.745980780422445e-05, "loss": 2.1458560943603517, "memory(GiB)": 77.56, "step": 67770, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.437676 }, { "epoch": 2.903688787969667, "grad_norm": 5.815678119659424, "learning_rate": 3.7453293242270536e-05, "loss": 2.400010871887207, "memory(GiB)": 77.56, "step": 67775, "token_acc": 0.47876447876447875, "train_speed(iter/s)": 1.43769 }, { "epoch": 2.903903003298916, "grad_norm": 4.094466209411621, "learning_rate": 3.744677890761198e-05, "loss": 2.449346160888672, "memory(GiB)": 77.56, "step": 67780, "token_acc": 0.47278911564625853, "train_speed(iter/s)": 1.437683 }, { "epoch": 2.904117218628165, "grad_norm": 8.096376419067383, "learning_rate": 3.7440264800366815e-05, "loss": 2.468935966491699, "memory(GiB)": 77.56, "step": 67785, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.904331433957414, "grad_norm": 6.131056785583496, "learning_rate": 3.743375092065302e-05, "loss": 2.5726037979125977, "memory(GiB)": 77.56, "step": 67790, "token_acc": 0.5145985401459854, "train_speed(iter/s)": 1.437702 }, { "epoch": 2.904545649286663, "grad_norm": 4.5523881912231445, "learning_rate": 3.7427237268588626e-05, "loss": 2.3564420700073243, "memory(GiB)": 77.56, "step": 67795, "token_acc": 0.5335570469798657, "train_speed(iter/s)": 1.437713 }, { "epoch": 2.904759864615912, "grad_norm": 4.512152194976807, "learning_rate": 3.7420723844291625e-05, "loss": 2.4989864349365236, "memory(GiB)": 77.56, "step": 67800, "token_acc": 0.45187165775401067, "train_speed(iter/s)": 1.437722 }, { "epoch": 2.904974079945161, "grad_norm": 5.770841121673584, "learning_rate": 3.7414210647879996e-05, "loss": 3.049637794494629, "memory(GiB)": 77.56, "step": 67805, "token_acc": 0.4057507987220447, "train_speed(iter/s)": 1.437717 }, { "epoch": 2.9051882952744097, "grad_norm": 6.343110084533691, "learning_rate": 3.740769767947177e-05, "loss": 2.3117307662963866, "memory(GiB)": 77.56, "step": 67810, "token_acc": 0.4894366197183099, "train_speed(iter/s)": 1.43772 }, { "epoch": 2.905402510603659, "grad_norm": 6.3243207931518555, "learning_rate": 3.740118493918489e-05, "loss": 2.2327476501464845, "memory(GiB)": 77.56, "step": 67815, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.437741 }, { "epoch": 2.9056167259329078, "grad_norm": 4.467010021209717, "learning_rate": 3.739467242713737e-05, "loss": 2.192717742919922, "memory(GiB)": 77.56, "step": 67820, "token_acc": 0.5415162454873647, "train_speed(iter/s)": 1.437771 }, { "epoch": 2.9058309412621566, "grad_norm": 9.37385082244873, "learning_rate": 3.7388160143447184e-05, "loss": 2.5006927490234374, "memory(GiB)": 77.56, "step": 67825, "token_acc": 0.46887966804979253, "train_speed(iter/s)": 1.437785 }, { "epoch": 2.906045156591406, "grad_norm": 8.409736633300781, "learning_rate": 3.738164808823231e-05, "loss": 2.2262020111083984, "memory(GiB)": 77.56, "step": 67830, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.43782 }, { "epoch": 2.9062593719206546, "grad_norm": 6.2491841316223145, "learning_rate": 3.737513626161073e-05, "loss": 2.3818056106567385, "memory(GiB)": 77.56, "step": 67835, "token_acc": 0.5, "train_speed(iter/s)": 1.437796 }, { "epoch": 2.9064735872499035, "grad_norm": 5.542945861816406, "learning_rate": 3.736862466370038e-05, "loss": 2.5873130798339843, "memory(GiB)": 77.56, "step": 67840, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.437803 }, { "epoch": 2.9066878025791527, "grad_norm": 5.3912458419799805, "learning_rate": 3.7362113294619264e-05, "loss": 2.364415740966797, "memory(GiB)": 77.56, "step": 67845, "token_acc": 0.503030303030303, "train_speed(iter/s)": 1.437801 }, { "epoch": 2.9069020179084015, "grad_norm": 4.695046424865723, "learning_rate": 3.73556021544853e-05, "loss": 2.3008392333984373, "memory(GiB)": 77.56, "step": 67850, "token_acc": 0.5150501672240803, "train_speed(iter/s)": 1.437787 }, { "epoch": 2.9071162332376503, "grad_norm": 6.187320232391357, "learning_rate": 3.734909124341648e-05, "loss": 2.3105842590332033, "memory(GiB)": 77.56, "step": 67855, "token_acc": 0.508695652173913, "train_speed(iter/s)": 1.437771 }, { "epoch": 2.9073304485668996, "grad_norm": 6.184545993804932, "learning_rate": 3.734258056153074e-05, "loss": 2.7187423706054688, "memory(GiB)": 77.56, "step": 67860, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.437773 }, { "epoch": 2.9075446638961484, "grad_norm": 5.685884475708008, "learning_rate": 3.7336070108946016e-05, "loss": 2.7080142974853514, "memory(GiB)": 77.56, "step": 67865, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.43778 }, { "epoch": 2.907758879225397, "grad_norm": 4.762025356292725, "learning_rate": 3.732955988578027e-05, "loss": 2.336121368408203, "memory(GiB)": 77.56, "step": 67870, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.43778 }, { "epoch": 2.9079730945546465, "grad_norm": 3.8940138816833496, "learning_rate": 3.732304989215143e-05, "loss": 2.3883819580078125, "memory(GiB)": 77.56, "step": 67875, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.437786 }, { "epoch": 2.9081873098838953, "grad_norm": 6.916037082672119, "learning_rate": 3.731654012817743e-05, "loss": 2.383189582824707, "memory(GiB)": 77.56, "step": 67880, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.437798 }, { "epoch": 2.908401525213144, "grad_norm": 7.057248115539551, "learning_rate": 3.7310030593976205e-05, "loss": 2.4842044830322267, "memory(GiB)": 77.56, "step": 67885, "token_acc": 0.4379310344827586, "train_speed(iter/s)": 1.437791 }, { "epoch": 2.9086157405423934, "grad_norm": 5.1528143882751465, "learning_rate": 3.730352128966567e-05, "loss": 2.4627311706542967, "memory(GiB)": 77.56, "step": 67890, "token_acc": 0.5043478260869565, "train_speed(iter/s)": 1.437779 }, { "epoch": 2.908829955871642, "grad_norm": 5.144994258880615, "learning_rate": 3.7297012215363756e-05, "loss": 2.4363882064819338, "memory(GiB)": 77.56, "step": 67895, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437774 }, { "epoch": 2.909044171200891, "grad_norm": 4.467019557952881, "learning_rate": 3.7290503371188404e-05, "loss": 2.1492786407470703, "memory(GiB)": 77.56, "step": 67900, "token_acc": 0.5348837209302325, "train_speed(iter/s)": 1.437796 }, { "epoch": 2.9092583865301402, "grad_norm": 6.21030330657959, "learning_rate": 3.72839947572575e-05, "loss": 2.627834701538086, "memory(GiB)": 77.56, "step": 67905, "token_acc": 0.46439628482972134, "train_speed(iter/s)": 1.437809 }, { "epoch": 2.909472601859389, "grad_norm": 4.537671089172363, "learning_rate": 3.727748637368895e-05, "loss": 2.5181861877441407, "memory(GiB)": 77.56, "step": 67910, "token_acc": 0.45609065155807366, "train_speed(iter/s)": 1.437817 }, { "epoch": 2.909686817188638, "grad_norm": 4.239520072937012, "learning_rate": 3.727097822060068e-05, "loss": 2.6376447677612305, "memory(GiB)": 77.56, "step": 67915, "token_acc": 0.47678018575851394, "train_speed(iter/s)": 1.437789 }, { "epoch": 2.909901032517887, "grad_norm": 4.422943592071533, "learning_rate": 3.7264470298110576e-05, "loss": 2.459255599975586, "memory(GiB)": 77.56, "step": 67920, "token_acc": 0.5015673981191222, "train_speed(iter/s)": 1.437793 }, { "epoch": 2.910115247847136, "grad_norm": 6.4508957862854, "learning_rate": 3.725796260633653e-05, "loss": 2.1374263763427734, "memory(GiB)": 77.56, "step": 67925, "token_acc": 0.541095890410959, "train_speed(iter/s)": 1.43781 }, { "epoch": 2.9103294631763847, "grad_norm": 7.026559352874756, "learning_rate": 3.725145514539646e-05, "loss": 2.7312219619750975, "memory(GiB)": 77.56, "step": 67930, "token_acc": 0.45733788395904434, "train_speed(iter/s)": 1.437832 }, { "epoch": 2.910543678505634, "grad_norm": 7.377779960632324, "learning_rate": 3.724494791540823e-05, "loss": 2.2984180450439453, "memory(GiB)": 77.56, "step": 67935, "token_acc": 0.54, "train_speed(iter/s)": 1.437839 }, { "epoch": 2.910757893834883, "grad_norm": 5.123602390289307, "learning_rate": 3.723844091648974e-05, "loss": 2.4122140884399412, "memory(GiB)": 77.56, "step": 67940, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.437805 }, { "epoch": 2.9109721091641316, "grad_norm": 5.724231719970703, "learning_rate": 3.7231934148758865e-05, "loss": 2.4591293334960938, "memory(GiB)": 77.56, "step": 67945, "token_acc": 0.49387755102040815, "train_speed(iter/s)": 1.4378 }, { "epoch": 2.911186324493381, "grad_norm": 4.906028747558594, "learning_rate": 3.7225427612333465e-05, "loss": 2.4353408813476562, "memory(GiB)": 77.56, "step": 67950, "token_acc": 0.45390070921985815, "train_speed(iter/s)": 1.437804 }, { "epoch": 2.9114005398226297, "grad_norm": 5.741007328033447, "learning_rate": 3.7218921307331446e-05, "loss": 2.701954650878906, "memory(GiB)": 77.56, "step": 67955, "token_acc": 0.46835443037974683, "train_speed(iter/s)": 1.437821 }, { "epoch": 2.9116147551518785, "grad_norm": 6.2724432945251465, "learning_rate": 3.7212415233870646e-05, "loss": 2.256239318847656, "memory(GiB)": 77.56, "step": 67960, "token_acc": 0.4708333333333333, "train_speed(iter/s)": 1.437821 }, { "epoch": 2.9118289704811278, "grad_norm": 4.073914527893066, "learning_rate": 3.7205909392068926e-05, "loss": 2.565846061706543, "memory(GiB)": 77.56, "step": 67965, "token_acc": 0.478125, "train_speed(iter/s)": 1.437815 }, { "epoch": 2.9120431858103766, "grad_norm": 7.0176262855529785, "learning_rate": 3.719940378204417e-05, "loss": 2.505396270751953, "memory(GiB)": 77.56, "step": 67970, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.43784 }, { "epoch": 2.9122574011396254, "grad_norm": 4.375279903411865, "learning_rate": 3.7192898403914236e-05, "loss": 2.441196250915527, "memory(GiB)": 77.56, "step": 67975, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 1.43785 }, { "epoch": 2.9124716164688746, "grad_norm": 4.9972333908081055, "learning_rate": 3.718639325779696e-05, "loss": 2.30728702545166, "memory(GiB)": 77.56, "step": 67980, "token_acc": 0.5390070921985816, "train_speed(iter/s)": 1.437855 }, { "epoch": 2.9126858317981235, "grad_norm": 5.396003723144531, "learning_rate": 3.717988834381017e-05, "loss": 2.4788684844970703, "memory(GiB)": 77.56, "step": 67985, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.437844 }, { "epoch": 2.9129000471273723, "grad_norm": 6.3774189949035645, "learning_rate": 3.717338366207175e-05, "loss": 2.435546112060547, "memory(GiB)": 77.56, "step": 67990, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.437856 }, { "epoch": 2.9131142624566215, "grad_norm": 4.868289470672607, "learning_rate": 3.71668792126995e-05, "loss": 2.606495666503906, "memory(GiB)": 77.56, "step": 67995, "token_acc": 0.44904458598726116, "train_speed(iter/s)": 1.437854 }, { "epoch": 2.9133284777858703, "grad_norm": 4.707134246826172, "learning_rate": 3.716037499581129e-05, "loss": 2.5440460205078126, "memory(GiB)": 77.56, "step": 68000, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.437863 }, { "epoch": 2.9133284777858703, "eval_loss": 2.0957367420196533, "eval_runtime": 14.1304, "eval_samples_per_second": 7.077, "eval_steps_per_second": 7.077, "eval_token_acc": 0.49319213313161875, "step": 68000 }, { "epoch": 2.913542693115119, "grad_norm": 5.822622776031494, "learning_rate": 3.7153871011524924e-05, "loss": 2.9383464813232423, "memory(GiB)": 77.56, "step": 68005, "token_acc": 0.4668737060041408, "train_speed(iter/s)": 1.437398 }, { "epoch": 2.9137569084443684, "grad_norm": 4.303014755249023, "learning_rate": 3.714736725995823e-05, "loss": 2.4112377166748047, "memory(GiB)": 77.56, "step": 68010, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.437415 }, { "epoch": 2.9139711237736172, "grad_norm": 6.689478397369385, "learning_rate": 3.714086374122904e-05, "loss": 2.498320770263672, "memory(GiB)": 77.56, "step": 68015, "token_acc": 0.4775510204081633, "train_speed(iter/s)": 1.437439 }, { "epoch": 2.914185339102866, "grad_norm": 5.524967670440674, "learning_rate": 3.713436045545518e-05, "loss": 2.1342063903808595, "memory(GiB)": 77.56, "step": 68020, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.437447 }, { "epoch": 2.9143995544321153, "grad_norm": 5.780593395233154, "learning_rate": 3.712785740275442e-05, "loss": 2.1418651580810546, "memory(GiB)": 77.56, "step": 68025, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.43744 }, { "epoch": 2.914613769761364, "grad_norm": 4.24763298034668, "learning_rate": 3.7121354583244615e-05, "loss": 2.6473567962646483, "memory(GiB)": 77.56, "step": 68030, "token_acc": 0.49038461538461536, "train_speed(iter/s)": 1.437443 }, { "epoch": 2.9148279850906134, "grad_norm": 6.561647415161133, "learning_rate": 3.711485199704353e-05, "loss": 2.3959327697753907, "memory(GiB)": 77.56, "step": 68035, "token_acc": 0.472, "train_speed(iter/s)": 1.43745 }, { "epoch": 2.915042200419862, "grad_norm": 5.502318382263184, "learning_rate": 3.710834964426901e-05, "loss": 2.190201187133789, "memory(GiB)": 77.56, "step": 68040, "token_acc": 0.5432525951557093, "train_speed(iter/s)": 1.437458 }, { "epoch": 2.915256415749111, "grad_norm": 5.741339206695557, "learning_rate": 3.710184752503883e-05, "loss": 2.3484983444213867, "memory(GiB)": 77.56, "step": 68045, "token_acc": 0.5104602510460251, "train_speed(iter/s)": 1.437478 }, { "epoch": 2.9154706310783602, "grad_norm": 5.461989402770996, "learning_rate": 3.709534563947078e-05, "loss": 2.5413816452026365, "memory(GiB)": 77.56, "step": 68050, "token_acc": 0.4559748427672956, "train_speed(iter/s)": 1.437497 }, { "epoch": 2.915684846407609, "grad_norm": 4.681112766265869, "learning_rate": 3.708884398768265e-05, "loss": 2.6381193161010743, "memory(GiB)": 77.56, "step": 68055, "token_acc": 0.4560810810810811, "train_speed(iter/s)": 1.437504 }, { "epoch": 2.915899061736858, "grad_norm": 4.501175403594971, "learning_rate": 3.708234256979223e-05, "loss": 2.4485021591186524, "memory(GiB)": 77.56, "step": 68060, "token_acc": 0.4968944099378882, "train_speed(iter/s)": 1.437513 }, { "epoch": 2.916113277066107, "grad_norm": 6.423325061798096, "learning_rate": 3.7075841385917285e-05, "loss": 2.4590570449829103, "memory(GiB)": 77.56, "step": 68065, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.43754 }, { "epoch": 2.916327492395356, "grad_norm": 5.460845470428467, "learning_rate": 3.706934043617559e-05, "loss": 2.239404487609863, "memory(GiB)": 77.56, "step": 68070, "token_acc": 0.511400651465798, "train_speed(iter/s)": 1.43755 }, { "epoch": 2.9165417077246047, "grad_norm": 4.988187789916992, "learning_rate": 3.7062839720684926e-05, "loss": 2.6336437225341798, "memory(GiB)": 77.56, "step": 68075, "token_acc": 0.47147147147147145, "train_speed(iter/s)": 1.437523 }, { "epoch": 2.916755923053854, "grad_norm": 5.610249996185303, "learning_rate": 3.705633923956306e-05, "loss": 2.63223876953125, "memory(GiB)": 77.56, "step": 68080, "token_acc": 0.4463667820069204, "train_speed(iter/s)": 1.437539 }, { "epoch": 2.916970138383103, "grad_norm": 5.455048561096191, "learning_rate": 3.7049838992927755e-05, "loss": 2.0385772705078127, "memory(GiB)": 77.56, "step": 68085, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.437518 }, { "epoch": 2.9171843537123516, "grad_norm": 4.9343061447143555, "learning_rate": 3.704333898089676e-05, "loss": 2.596497917175293, "memory(GiB)": 77.56, "step": 68090, "token_acc": 0.4679245283018868, "train_speed(iter/s)": 1.437547 }, { "epoch": 2.917398569041601, "grad_norm": 6.102274417877197, "learning_rate": 3.703683920358783e-05, "loss": 2.6200613021850585, "memory(GiB)": 77.56, "step": 68095, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.437527 }, { "epoch": 2.9176127843708497, "grad_norm": 8.687932968139648, "learning_rate": 3.7030339661118726e-05, "loss": 2.26722412109375, "memory(GiB)": 77.56, "step": 68100, "token_acc": 0.5291828793774319, "train_speed(iter/s)": 1.437513 }, { "epoch": 2.9178269997000985, "grad_norm": 5.289733409881592, "learning_rate": 3.7023840353607184e-05, "loss": 2.4978969573974608, "memory(GiB)": 77.56, "step": 68105, "token_acc": 0.43630573248407645, "train_speed(iter/s)": 1.437495 }, { "epoch": 2.9180412150293478, "grad_norm": 4.299315929412842, "learning_rate": 3.701734128117093e-05, "loss": 2.5657699584960936, "memory(GiB)": 77.56, "step": 68110, "token_acc": 0.45674740484429066, "train_speed(iter/s)": 1.437506 }, { "epoch": 2.9182554303585966, "grad_norm": 4.942446231842041, "learning_rate": 3.701084244392773e-05, "loss": 2.413589286804199, "memory(GiB)": 77.56, "step": 68115, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.437521 }, { "epoch": 2.9184696456878454, "grad_norm": 5.2760419845581055, "learning_rate": 3.70043438419953e-05, "loss": 2.2743240356445313, "memory(GiB)": 77.56, "step": 68120, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.437519 }, { "epoch": 2.9186838610170946, "grad_norm": 6.663489818572998, "learning_rate": 3.699784547549139e-05, "loss": 2.316331672668457, "memory(GiB)": 77.56, "step": 68125, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.437535 }, { "epoch": 2.9188980763463435, "grad_norm": 5.129056453704834, "learning_rate": 3.699134734453369e-05, "loss": 2.6504919052124025, "memory(GiB)": 77.56, "step": 68130, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.437516 }, { "epoch": 2.9191122916755923, "grad_norm": 4.9390740394592285, "learning_rate": 3.698484944923994e-05, "loss": 2.1255060195922852, "memory(GiB)": 77.56, "step": 68135, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.437522 }, { "epoch": 2.9193265070048415, "grad_norm": 6.614055156707764, "learning_rate": 3.697835178972785e-05, "loss": 2.524114990234375, "memory(GiB)": 77.56, "step": 68140, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.437514 }, { "epoch": 2.9195407223340903, "grad_norm": 4.805367946624756, "learning_rate": 3.6971854366115136e-05, "loss": 2.5792205810546873, "memory(GiB)": 77.56, "step": 68145, "token_acc": 0.4520123839009288, "train_speed(iter/s)": 1.437532 }, { "epoch": 2.919754937663339, "grad_norm": 6.10288143157959, "learning_rate": 3.69653571785195e-05, "loss": 2.163959503173828, "memory(GiB)": 77.56, "step": 68150, "token_acc": 0.5178571428571429, "train_speed(iter/s)": 1.437509 }, { "epoch": 2.9199691529925884, "grad_norm": 6.391965866088867, "learning_rate": 3.695886022705865e-05, "loss": 2.1923961639404297, "memory(GiB)": 77.56, "step": 68155, "token_acc": 0.5447154471544715, "train_speed(iter/s)": 1.43752 }, { "epoch": 2.9201833683218372, "grad_norm": 5.049295425415039, "learning_rate": 3.6952363511850274e-05, "loss": 2.498997688293457, "memory(GiB)": 77.56, "step": 68160, "token_acc": 0.44664031620553357, "train_speed(iter/s)": 1.437548 }, { "epoch": 2.920397583651086, "grad_norm": 5.195949077606201, "learning_rate": 3.6945867033012084e-05, "loss": 2.214436149597168, "memory(GiB)": 77.56, "step": 68165, "token_acc": 0.5353846153846153, "train_speed(iter/s)": 1.437548 }, { "epoch": 2.9206117989803353, "grad_norm": 7.036581993103027, "learning_rate": 3.6939370790661745e-05, "loss": 2.429710578918457, "memory(GiB)": 77.56, "step": 68170, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.437547 }, { "epoch": 2.920826014309584, "grad_norm": 6.2165446281433105, "learning_rate": 3.693287478491696e-05, "loss": 2.4742910385131838, "memory(GiB)": 77.56, "step": 68175, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.437565 }, { "epoch": 2.921040229638833, "grad_norm": 5.181307792663574, "learning_rate": 3.6926379015895395e-05, "loss": 2.344084548950195, "memory(GiB)": 77.56, "step": 68180, "token_acc": 0.5029585798816568, "train_speed(iter/s)": 1.437584 }, { "epoch": 2.921254444968082, "grad_norm": 6.763940811157227, "learning_rate": 3.691988348371473e-05, "loss": 2.3910369873046875, "memory(GiB)": 77.56, "step": 68185, "token_acc": 0.46122448979591835, "train_speed(iter/s)": 1.437561 }, { "epoch": 2.921468660297331, "grad_norm": 4.198749542236328, "learning_rate": 3.691338818849268e-05, "loss": 2.419862174987793, "memory(GiB)": 77.56, "step": 68190, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.437558 }, { "epoch": 2.92168287562658, "grad_norm": 8.38559341430664, "learning_rate": 3.690689313034686e-05, "loss": 2.664231872558594, "memory(GiB)": 77.56, "step": 68195, "token_acc": 0.4774193548387097, "train_speed(iter/s)": 1.43758 }, { "epoch": 2.921897090955829, "grad_norm": 5.795619487762451, "learning_rate": 3.690039830939496e-05, "loss": 2.1378347396850588, "memory(GiB)": 77.56, "step": 68200, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.437609 }, { "epoch": 2.922111306285078, "grad_norm": 4.589444160461426, "learning_rate": 3.689390372575463e-05, "loss": 2.359437370300293, "memory(GiB)": 77.56, "step": 68205, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.437596 }, { "epoch": 2.9223255216143267, "grad_norm": 7.643991470336914, "learning_rate": 3.688740937954354e-05, "loss": 2.4506202697753907, "memory(GiB)": 77.56, "step": 68210, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.437585 }, { "epoch": 2.922539736943576, "grad_norm": 5.98811149597168, "learning_rate": 3.6880915270879316e-05, "loss": 2.3916858673095702, "memory(GiB)": 77.56, "step": 68215, "token_acc": 0.49514563106796117, "train_speed(iter/s)": 1.437595 }, { "epoch": 2.9227539522728248, "grad_norm": 4.503275394439697, "learning_rate": 3.6874421399879624e-05, "loss": 2.5401371002197264, "memory(GiB)": 77.56, "step": 68220, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437607 }, { "epoch": 2.9229681676020736, "grad_norm": 6.436376571655273, "learning_rate": 3.686792776666209e-05, "loss": 2.7383283615112304, "memory(GiB)": 77.56, "step": 68225, "token_acc": 0.4463768115942029, "train_speed(iter/s)": 1.437628 }, { "epoch": 2.923182382931323, "grad_norm": 4.667707920074463, "learning_rate": 3.686143437134438e-05, "loss": 2.2121482849121095, "memory(GiB)": 77.56, "step": 68230, "token_acc": 0.504885993485342, "train_speed(iter/s)": 1.437627 }, { "epoch": 2.9233965982605716, "grad_norm": 4.4454665184021, "learning_rate": 3.6854941214044105e-05, "loss": 2.272251510620117, "memory(GiB)": 77.56, "step": 68235, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.437642 }, { "epoch": 2.9236108135898204, "grad_norm": 6.854433536529541, "learning_rate": 3.68484482948789e-05, "loss": 2.2384340286254885, "memory(GiB)": 77.56, "step": 68240, "token_acc": 0.5551470588235294, "train_speed(iter/s)": 1.437628 }, { "epoch": 2.9238250289190697, "grad_norm": 5.541005611419678, "learning_rate": 3.684195561396639e-05, "loss": 2.3282079696655273, "memory(GiB)": 77.56, "step": 68245, "token_acc": 0.5190311418685121, "train_speed(iter/s)": 1.437627 }, { "epoch": 2.9240392442483185, "grad_norm": 6.127822399139404, "learning_rate": 3.6835463171424204e-05, "loss": 2.191803550720215, "memory(GiB)": 77.56, "step": 68250, "token_acc": 0.5301204819277109, "train_speed(iter/s)": 1.437643 }, { "epoch": 2.9242534595775673, "grad_norm": 6.204081058502197, "learning_rate": 3.682897096736994e-05, "loss": 2.765501022338867, "memory(GiB)": 77.56, "step": 68255, "token_acc": 0.4470899470899471, "train_speed(iter/s)": 1.437634 }, { "epoch": 2.9244676749068166, "grad_norm": 7.104529857635498, "learning_rate": 3.682247900192121e-05, "loss": 2.4396389007568358, "memory(GiB)": 77.56, "step": 68260, "token_acc": 0.475, "train_speed(iter/s)": 1.437617 }, { "epoch": 2.9246818902360654, "grad_norm": 5.439451217651367, "learning_rate": 3.6815987275195664e-05, "loss": 2.4252492904663088, "memory(GiB)": 77.56, "step": 68265, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.437618 }, { "epoch": 2.924896105565314, "grad_norm": 6.014585018157959, "learning_rate": 3.680949578731086e-05, "loss": 2.6215152740478516, "memory(GiB)": 77.56, "step": 68270, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.437616 }, { "epoch": 2.9251103208945635, "grad_norm": 4.763463497161865, "learning_rate": 3.6803004538384425e-05, "loss": 2.1167179107666017, "memory(GiB)": 77.56, "step": 68275, "token_acc": 0.555956678700361, "train_speed(iter/s)": 1.437624 }, { "epoch": 2.9253245362238123, "grad_norm": 5.733205318450928, "learning_rate": 3.679651352853394e-05, "loss": 2.578509521484375, "memory(GiB)": 77.56, "step": 68280, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.437634 }, { "epoch": 2.925538751553061, "grad_norm": 5.081930160522461, "learning_rate": 3.679002275787698e-05, "loss": 2.3902057647705077, "memory(GiB)": 77.56, "step": 68285, "token_acc": 0.48055555555555557, "train_speed(iter/s)": 1.437637 }, { "epoch": 2.9257529668823103, "grad_norm": 6.3652191162109375, "learning_rate": 3.678353222653117e-05, "loss": 2.328163719177246, "memory(GiB)": 77.56, "step": 68290, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.437642 }, { "epoch": 2.925967182211559, "grad_norm": 5.1677751541137695, "learning_rate": 3.6777041934614076e-05, "loss": 2.733831596374512, "memory(GiB)": 77.56, "step": 68295, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437647 }, { "epoch": 2.926181397540808, "grad_norm": 4.527955055236816, "learning_rate": 3.6770551882243256e-05, "loss": 2.3957286834716798, "memory(GiB)": 77.56, "step": 68300, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.437627 }, { "epoch": 2.9263956128700572, "grad_norm": 8.41672420501709, "learning_rate": 3.676406206953631e-05, "loss": 2.4217927932739256, "memory(GiB)": 77.56, "step": 68305, "token_acc": 0.48125, "train_speed(iter/s)": 1.437626 }, { "epoch": 2.926609828199306, "grad_norm": 5.73418664932251, "learning_rate": 3.6757572496610794e-05, "loss": 2.4105131149291994, "memory(GiB)": 77.56, "step": 68310, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.437626 }, { "epoch": 2.926824043528555, "grad_norm": 5.170902729034424, "learning_rate": 3.675108316358426e-05, "loss": 2.554925537109375, "memory(GiB)": 77.56, "step": 68315, "token_acc": 0.43944636678200694, "train_speed(iter/s)": 1.437631 }, { "epoch": 2.927038258857804, "grad_norm": 6.31304931640625, "learning_rate": 3.67445940705743e-05, "loss": 2.6427539825439452, "memory(GiB)": 77.56, "step": 68320, "token_acc": 0.46564885496183206, "train_speed(iter/s)": 1.437653 }, { "epoch": 2.927252474187053, "grad_norm": 6.32892370223999, "learning_rate": 3.673810521769844e-05, "loss": 2.6022083282470705, "memory(GiB)": 77.56, "step": 68325, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.437644 }, { "epoch": 2.9274666895163017, "grad_norm": 6.0275492668151855, "learning_rate": 3.673161660507426e-05, "loss": 2.533201217651367, "memory(GiB)": 77.56, "step": 68330, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.437658 }, { "epoch": 2.927680904845551, "grad_norm": 5.365231513977051, "learning_rate": 3.672512823281926e-05, "loss": 2.4699285507202147, "memory(GiB)": 77.56, "step": 68335, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.43768 }, { "epoch": 2.9278951201748, "grad_norm": 6.795527935028076, "learning_rate": 3.671864010105105e-05, "loss": 2.606638717651367, "memory(GiB)": 77.56, "step": 68340, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.43769 }, { "epoch": 2.9281093355040486, "grad_norm": 11.144953727722168, "learning_rate": 3.671215220988711e-05, "loss": 2.7022300720214845, "memory(GiB)": 77.56, "step": 68345, "token_acc": 0.4610169491525424, "train_speed(iter/s)": 1.437703 }, { "epoch": 2.928323550833298, "grad_norm": 5.704421520233154, "learning_rate": 3.670566455944502e-05, "loss": 2.378597640991211, "memory(GiB)": 77.56, "step": 68350, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.4377 }, { "epoch": 2.9285377661625467, "grad_norm": 5.122502326965332, "learning_rate": 3.6699177149842276e-05, "loss": 2.5273069381713866, "memory(GiB)": 77.56, "step": 68355, "token_acc": 0.475, "train_speed(iter/s)": 1.437698 }, { "epoch": 2.9287519814917955, "grad_norm": 5.294159412384033, "learning_rate": 3.669268998119642e-05, "loss": 2.3849491119384765, "memory(GiB)": 77.56, "step": 68360, "token_acc": 0.4830188679245283, "train_speed(iter/s)": 1.437716 }, { "epoch": 2.9289661968210448, "grad_norm": 7.5218634605407715, "learning_rate": 3.668620305362496e-05, "loss": 2.4301084518432616, "memory(GiB)": 77.56, "step": 68365, "token_acc": 0.4981684981684982, "train_speed(iter/s)": 1.437704 }, { "epoch": 2.9291804121502936, "grad_norm": 5.494652271270752, "learning_rate": 3.667971636724542e-05, "loss": 2.234860420227051, "memory(GiB)": 77.56, "step": 68370, "token_acc": 0.5347985347985348, "train_speed(iter/s)": 1.437725 }, { "epoch": 2.9293946274795424, "grad_norm": 5.734588623046875, "learning_rate": 3.667322992217532e-05, "loss": 2.5520034790039063, "memory(GiB)": 77.56, "step": 68375, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.437732 }, { "epoch": 2.9296088428087916, "grad_norm": 5.948920249938965, "learning_rate": 3.666674371853217e-05, "loss": 2.188946533203125, "memory(GiB)": 77.56, "step": 68380, "token_acc": 0.5098814229249012, "train_speed(iter/s)": 1.437741 }, { "epoch": 2.9298230581380404, "grad_norm": 6.205685615539551, "learning_rate": 3.6660257756433444e-05, "loss": 2.5464221954345705, "memory(GiB)": 77.56, "step": 68385, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.437728 }, { "epoch": 2.9300372734672893, "grad_norm": 4.757958889007568, "learning_rate": 3.665377203599668e-05, "loss": 2.375376892089844, "memory(GiB)": 77.56, "step": 68390, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.437727 }, { "epoch": 2.9302514887965385, "grad_norm": 5.090051174163818, "learning_rate": 3.664728655733936e-05, "loss": 2.354806900024414, "memory(GiB)": 77.56, "step": 68395, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.437743 }, { "epoch": 2.9304657041257873, "grad_norm": 5.366011619567871, "learning_rate": 3.664080132057896e-05, "loss": 2.7003589630126954, "memory(GiB)": 77.56, "step": 68400, "token_acc": 0.40942028985507245, "train_speed(iter/s)": 1.437729 }, { "epoch": 2.930679919455036, "grad_norm": 5.187288761138916, "learning_rate": 3.6634316325832954e-05, "loss": 2.4137845993041993, "memory(GiB)": 77.56, "step": 68405, "token_acc": 0.48265895953757226, "train_speed(iter/s)": 1.437688 }, { "epoch": 2.9308941347842854, "grad_norm": 4.5555739402771, "learning_rate": 3.662783157321887e-05, "loss": 2.3343963623046875, "memory(GiB)": 77.56, "step": 68410, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.4377 }, { "epoch": 2.931108350113534, "grad_norm": 4.508800506591797, "learning_rate": 3.6621347062854164e-05, "loss": 2.3369203567504884, "memory(GiB)": 77.56, "step": 68415, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.437671 }, { "epoch": 2.931322565442783, "grad_norm": 5.752854824066162, "learning_rate": 3.661486279485629e-05, "loss": 2.4629842758178713, "memory(GiB)": 77.56, "step": 68420, "token_acc": 0.45555555555555555, "train_speed(iter/s)": 1.437704 }, { "epoch": 2.9315367807720323, "grad_norm": 4.406851291656494, "learning_rate": 3.6608378769342746e-05, "loss": 2.525054168701172, "memory(GiB)": 77.56, "step": 68425, "token_acc": 0.4732142857142857, "train_speed(iter/s)": 1.437679 }, { "epoch": 2.931750996101281, "grad_norm": 5.786430835723877, "learning_rate": 3.660189498643097e-05, "loss": 2.5697994232177734, "memory(GiB)": 77.56, "step": 68430, "token_acc": 0.5028901734104047, "train_speed(iter/s)": 1.437683 }, { "epoch": 2.93196521143053, "grad_norm": 4.407654285430908, "learning_rate": 3.659541144623846e-05, "loss": 2.5435739517211915, "memory(GiB)": 77.56, "step": 68435, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 1.437682 }, { "epoch": 2.932179426759779, "grad_norm": 6.907027721405029, "learning_rate": 3.658892814888264e-05, "loss": 2.5674554824829103, "memory(GiB)": 77.56, "step": 68440, "token_acc": 0.423841059602649, "train_speed(iter/s)": 1.437666 }, { "epoch": 2.932393642089028, "grad_norm": 5.145559787750244, "learning_rate": 3.658244509448094e-05, "loss": 2.2846824645996096, "memory(GiB)": 77.56, "step": 68445, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.437678 }, { "epoch": 2.932607857418277, "grad_norm": 4.6332926750183105, "learning_rate": 3.657596228315086e-05, "loss": 2.3373672485351564, "memory(GiB)": 77.56, "step": 68450, "token_acc": 0.5096774193548387, "train_speed(iter/s)": 1.437669 }, { "epoch": 2.932822072747526, "grad_norm": 4.5164361000061035, "learning_rate": 3.656947971500982e-05, "loss": 2.3477684020996095, "memory(GiB)": 77.56, "step": 68455, "token_acc": 0.49429657794676807, "train_speed(iter/s)": 1.43767 }, { "epoch": 2.933036288076775, "grad_norm": 6.285327434539795, "learning_rate": 3.656299739017523e-05, "loss": 2.2973634719848635, "memory(GiB)": 77.56, "step": 68460, "token_acc": 0.5175718849840255, "train_speed(iter/s)": 1.43767 }, { "epoch": 2.9332505034060237, "grad_norm": 5.199221134185791, "learning_rate": 3.655651530876456e-05, "loss": 2.5503076553344726, "memory(GiB)": 77.56, "step": 68465, "token_acc": 0.44483985765124556, "train_speed(iter/s)": 1.437687 }, { "epoch": 2.933464718735273, "grad_norm": 8.812651634216309, "learning_rate": 3.6550033470895225e-05, "loss": 2.6162412643432615, "memory(GiB)": 77.56, "step": 68470, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.437715 }, { "epoch": 2.9336789340645217, "grad_norm": 4.474532127380371, "learning_rate": 3.6543551876684656e-05, "loss": 2.5346126556396484, "memory(GiB)": 77.56, "step": 68475, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.437724 }, { "epoch": 2.9338931493937705, "grad_norm": 5.679563999176025, "learning_rate": 3.6537070526250244e-05, "loss": 2.268147850036621, "memory(GiB)": 77.56, "step": 68480, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.437737 }, { "epoch": 2.93410736472302, "grad_norm": 6.245189189910889, "learning_rate": 3.653058941970945e-05, "loss": 2.347981262207031, "memory(GiB)": 77.56, "step": 68485, "token_acc": 0.49224806201550386, "train_speed(iter/s)": 1.437738 }, { "epoch": 2.9343215800522686, "grad_norm": 5.599194526672363, "learning_rate": 3.652410855717966e-05, "loss": 2.276208686828613, "memory(GiB)": 77.56, "step": 68490, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.437743 }, { "epoch": 2.9345357953815174, "grad_norm": 4.174640655517578, "learning_rate": 3.651762793877829e-05, "loss": 2.4413013458251953, "memory(GiB)": 77.56, "step": 68495, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.437743 }, { "epoch": 2.9347500107107667, "grad_norm": 7.646904468536377, "learning_rate": 3.6511147564622736e-05, "loss": 2.4015220642089843, "memory(GiB)": 77.56, "step": 68500, "token_acc": 0.5062111801242236, "train_speed(iter/s)": 1.437762 }, { "epoch": 2.9347500107107667, "eval_loss": 2.31423282623291, "eval_runtime": 13.8976, "eval_samples_per_second": 7.196, "eval_steps_per_second": 7.196, "eval_token_acc": 0.43351800554016623, "step": 68500 }, { "epoch": 2.9349642260400155, "grad_norm": 4.358438968658447, "learning_rate": 3.6504667434830394e-05, "loss": 2.3032392501831054, "memory(GiB)": 77.56, "step": 68505, "token_acc": 0.44831013916500995, "train_speed(iter/s)": 1.437311 }, { "epoch": 2.9351784413692643, "grad_norm": 8.29546070098877, "learning_rate": 3.649818754951866e-05, "loss": 2.151364326477051, "memory(GiB)": 77.56, "step": 68510, "token_acc": 0.5331325301204819, "train_speed(iter/s)": 1.437294 }, { "epoch": 2.9353926566985136, "grad_norm": 5.173444747924805, "learning_rate": 3.6491707908804926e-05, "loss": 2.5007471084594726, "memory(GiB)": 77.56, "step": 68515, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.437312 }, { "epoch": 2.9356068720277624, "grad_norm": 6.332651615142822, "learning_rate": 3.648522851280658e-05, "loss": 2.943612289428711, "memory(GiB)": 77.56, "step": 68520, "token_acc": 0.3857677902621723, "train_speed(iter/s)": 1.437325 }, { "epoch": 2.935821087357011, "grad_norm": 5.154989719390869, "learning_rate": 3.6478749361640996e-05, "loss": 2.495052719116211, "memory(GiB)": 77.56, "step": 68525, "token_acc": 0.4766355140186916, "train_speed(iter/s)": 1.437322 }, { "epoch": 2.9360353026862605, "grad_norm": 4.607423305511475, "learning_rate": 3.647227045542554e-05, "loss": 2.5632247924804688, "memory(GiB)": 77.56, "step": 68530, "token_acc": 0.44947735191637633, "train_speed(iter/s)": 1.437314 }, { "epoch": 2.9362495180155093, "grad_norm": 5.022364139556885, "learning_rate": 3.646579179427761e-05, "loss": 1.8762619018554687, "memory(GiB)": 77.56, "step": 68535, "token_acc": 0.5580357142857143, "train_speed(iter/s)": 1.437327 }, { "epoch": 2.936463733344758, "grad_norm": 6.142657279968262, "learning_rate": 3.6459313378314554e-05, "loss": 2.215517044067383, "memory(GiB)": 77.56, "step": 68540, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.437319 }, { "epoch": 2.9366779486740073, "grad_norm": 4.5446577072143555, "learning_rate": 3.645283520765373e-05, "loss": 2.1593366622924806, "memory(GiB)": 77.56, "step": 68545, "token_acc": 0.5222929936305732, "train_speed(iter/s)": 1.437321 }, { "epoch": 2.936892164003256, "grad_norm": 4.72842264175415, "learning_rate": 3.6446357282412514e-05, "loss": 2.3668373107910154, "memory(GiB)": 77.56, "step": 68550, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.437326 }, { "epoch": 2.937106379332505, "grad_norm": 6.1560468673706055, "learning_rate": 3.6439879602708224e-05, "loss": 2.593875503540039, "memory(GiB)": 77.56, "step": 68555, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.437323 }, { "epoch": 2.937320594661754, "grad_norm": 4.385132789611816, "learning_rate": 3.643340216865827e-05, "loss": 2.6797506332397463, "memory(GiB)": 77.56, "step": 68560, "token_acc": 0.45217391304347826, "train_speed(iter/s)": 1.437295 }, { "epoch": 2.937534809991003, "grad_norm": 5.3138532638549805, "learning_rate": 3.642692498037994e-05, "loss": 2.421952819824219, "memory(GiB)": 77.56, "step": 68565, "token_acc": 0.4765625, "train_speed(iter/s)": 1.437323 }, { "epoch": 2.937749025320252, "grad_norm": 4.772027492523193, "learning_rate": 3.642044803799061e-05, "loss": 1.9963918685913087, "memory(GiB)": 77.56, "step": 68570, "token_acc": 0.5772357723577236, "train_speed(iter/s)": 1.437291 }, { "epoch": 2.937963240649501, "grad_norm": 6.275188446044922, "learning_rate": 3.6413971341607596e-05, "loss": 2.5237409591674806, "memory(GiB)": 77.56, "step": 68575, "token_acc": 0.4620253164556962, "train_speed(iter/s)": 1.437274 }, { "epoch": 2.93817745597875, "grad_norm": 5.677504062652588, "learning_rate": 3.6407494891348244e-05, "loss": 2.542437934875488, "memory(GiB)": 77.56, "step": 68580, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.437288 }, { "epoch": 2.9383916713079987, "grad_norm": 11.629992485046387, "learning_rate": 3.640101868732987e-05, "loss": 2.2602775573730467, "memory(GiB)": 77.56, "step": 68585, "token_acc": 0.48338368580060426, "train_speed(iter/s)": 1.437279 }, { "epoch": 2.938605886637248, "grad_norm": 5.792952060699463, "learning_rate": 3.639454272966979e-05, "loss": 2.2910930633544924, "memory(GiB)": 77.56, "step": 68590, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.437276 }, { "epoch": 2.938820101966497, "grad_norm": 7.0362229347229, "learning_rate": 3.6388067018485337e-05, "loss": 2.2122020721435547, "memory(GiB)": 77.56, "step": 68595, "token_acc": 0.5445205479452054, "train_speed(iter/s)": 1.437301 }, { "epoch": 2.9390343172957456, "grad_norm": 4.994625091552734, "learning_rate": 3.638159155389383e-05, "loss": 2.540998077392578, "memory(GiB)": 77.56, "step": 68600, "token_acc": 0.449468085106383, "train_speed(iter/s)": 1.437284 }, { "epoch": 2.939248532624995, "grad_norm": 6.120875835418701, "learning_rate": 3.637511633601255e-05, "loss": 2.559706687927246, "memory(GiB)": 77.56, "step": 68605, "token_acc": 0.4485294117647059, "train_speed(iter/s)": 1.437288 }, { "epoch": 2.9394627479542437, "grad_norm": 4.160299301147461, "learning_rate": 3.636864136495883e-05, "loss": 2.440188980102539, "memory(GiB)": 77.56, "step": 68610, "token_acc": 0.5, "train_speed(iter/s)": 1.437307 }, { "epoch": 2.9396769632834925, "grad_norm": 5.6210479736328125, "learning_rate": 3.636216664084995e-05, "loss": 1.8378814697265624, "memory(GiB)": 77.56, "step": 68615, "token_acc": 0.5829787234042553, "train_speed(iter/s)": 1.437287 }, { "epoch": 2.9398911786127417, "grad_norm": 7.1883955001831055, "learning_rate": 3.635569216380322e-05, "loss": 2.498285675048828, "memory(GiB)": 77.56, "step": 68620, "token_acc": 0.45733788395904434, "train_speed(iter/s)": 1.437311 }, { "epoch": 2.9401053939419906, "grad_norm": 8.345105171203613, "learning_rate": 3.634921793393593e-05, "loss": 2.493545913696289, "memory(GiB)": 77.56, "step": 68625, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.437331 }, { "epoch": 2.9403196092712394, "grad_norm": 5.7748122215271, "learning_rate": 3.634274395136534e-05, "loss": 2.359173583984375, "memory(GiB)": 77.56, "step": 68630, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.437338 }, { "epoch": 2.9405338246004886, "grad_norm": 6.156806945800781, "learning_rate": 3.633627021620877e-05, "loss": 2.3151174545288087, "memory(GiB)": 77.56, "step": 68635, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.437343 }, { "epoch": 2.9407480399297374, "grad_norm": 5.841317176818848, "learning_rate": 3.632979672858349e-05, "loss": 2.6829769134521486, "memory(GiB)": 77.56, "step": 68640, "token_acc": 0.42066420664206644, "train_speed(iter/s)": 1.437343 }, { "epoch": 2.9409622552589862, "grad_norm": 4.903331279754639, "learning_rate": 3.632332348860676e-05, "loss": 2.2645505905151366, "memory(GiB)": 77.56, "step": 68645, "token_acc": 0.5377358490566038, "train_speed(iter/s)": 1.437348 }, { "epoch": 2.9411764705882355, "grad_norm": 5.157782077789307, "learning_rate": 3.631685049639586e-05, "loss": 2.596482849121094, "memory(GiB)": 77.56, "step": 68650, "token_acc": 0.4559386973180077, "train_speed(iter/s)": 1.437354 }, { "epoch": 2.9413906859174843, "grad_norm": 6.980854511260986, "learning_rate": 3.6310377752068046e-05, "loss": 2.601549530029297, "memory(GiB)": 77.56, "step": 68655, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 1.437344 }, { "epoch": 2.941604901246733, "grad_norm": 4.489633083343506, "learning_rate": 3.6303905255740575e-05, "loss": 2.5341041564941404, "memory(GiB)": 77.56, "step": 68660, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.437351 }, { "epoch": 2.9418191165759824, "grad_norm": 6.645514011383057, "learning_rate": 3.629743300753072e-05, "loss": 2.7830312728881834, "memory(GiB)": 77.56, "step": 68665, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.437318 }, { "epoch": 2.942033331905231, "grad_norm": 6.095151424407959, "learning_rate": 3.629096100755571e-05, "loss": 2.603617858886719, "memory(GiB)": 77.56, "step": 68670, "token_acc": 0.4490861618798956, "train_speed(iter/s)": 1.437301 }, { "epoch": 2.94224754723448, "grad_norm": 5.025662422180176, "learning_rate": 3.6284489255932805e-05, "loss": 2.5306350708007814, "memory(GiB)": 77.56, "step": 68675, "token_acc": 0.48, "train_speed(iter/s)": 1.437272 }, { "epoch": 2.9424617625637293, "grad_norm": 7.614643573760986, "learning_rate": 3.627801775277925e-05, "loss": 2.505751037597656, "memory(GiB)": 77.56, "step": 68680, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.437283 }, { "epoch": 2.942675977892978, "grad_norm": 6.706848621368408, "learning_rate": 3.627154649821227e-05, "loss": 2.1313541412353514, "memory(GiB)": 77.56, "step": 68685, "token_acc": 0.5437262357414449, "train_speed(iter/s)": 1.43727 }, { "epoch": 2.942890193222227, "grad_norm": 6.144601821899414, "learning_rate": 3.62650754923491e-05, "loss": 2.2737705230712892, "memory(GiB)": 77.56, "step": 68690, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.437282 }, { "epoch": 2.943104408551476, "grad_norm": 6.145516395568848, "learning_rate": 3.625860473530698e-05, "loss": 2.3433048248291017, "memory(GiB)": 77.56, "step": 68695, "token_acc": 0.5111940298507462, "train_speed(iter/s)": 1.437312 }, { "epoch": 2.943318623880725, "grad_norm": 6.162186622619629, "learning_rate": 3.62521342272031e-05, "loss": 2.4569093704223635, "memory(GiB)": 77.56, "step": 68700, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.437326 }, { "epoch": 2.9435328392099738, "grad_norm": 7.319283962249756, "learning_rate": 3.624566396815473e-05, "loss": 2.6335189819335936, "memory(GiB)": 77.56, "step": 68705, "token_acc": 0.4266666666666667, "train_speed(iter/s)": 1.437338 }, { "epoch": 2.943747054539223, "grad_norm": 4.756680488586426, "learning_rate": 3.623919395827905e-05, "loss": 2.4158279418945314, "memory(GiB)": 77.56, "step": 68710, "token_acc": 0.4981132075471698, "train_speed(iter/s)": 1.437301 }, { "epoch": 2.943961269868472, "grad_norm": 6.41449499130249, "learning_rate": 3.623272419769329e-05, "loss": 2.5319637298583983, "memory(GiB)": 77.56, "step": 68715, "token_acc": 0.47509578544061304, "train_speed(iter/s)": 1.437305 }, { "epoch": 2.9441754851977207, "grad_norm": 5.517734050750732, "learning_rate": 3.622625468651463e-05, "loss": 2.489777755737305, "memory(GiB)": 77.56, "step": 68720, "token_acc": 0.46956521739130436, "train_speed(iter/s)": 1.43732 }, { "epoch": 2.94438970052697, "grad_norm": 6.091075420379639, "learning_rate": 3.62197854248603e-05, "loss": 2.700113868713379, "memory(GiB)": 77.56, "step": 68725, "token_acc": 0.468503937007874, "train_speed(iter/s)": 1.437318 }, { "epoch": 2.9446039158562187, "grad_norm": 5.145681381225586, "learning_rate": 3.621331641284749e-05, "loss": 2.181728172302246, "memory(GiB)": 77.56, "step": 68730, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.437306 }, { "epoch": 2.9448181311854675, "grad_norm": 4.653256893157959, "learning_rate": 3.620684765059337e-05, "loss": 2.333662223815918, "memory(GiB)": 77.56, "step": 68735, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.437334 }, { "epoch": 2.945032346514717, "grad_norm": 4.413647174835205, "learning_rate": 3.620037913821516e-05, "loss": 2.6249488830566405, "memory(GiB)": 77.56, "step": 68740, "token_acc": 0.43086816720257237, "train_speed(iter/s)": 1.437344 }, { "epoch": 2.9452465618439656, "grad_norm": 6.1817522048950195, "learning_rate": 3.619391087583002e-05, "loss": 2.759774398803711, "memory(GiB)": 77.56, "step": 68745, "token_acc": 0.4582043343653251, "train_speed(iter/s)": 1.437354 }, { "epoch": 2.9454607771732144, "grad_norm": 5.204505443572998, "learning_rate": 3.618744286355513e-05, "loss": 2.450588607788086, "memory(GiB)": 77.56, "step": 68750, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.437361 }, { "epoch": 2.9456749925024637, "grad_norm": 5.7504191398620605, "learning_rate": 3.618097510150768e-05, "loss": 2.284286880493164, "memory(GiB)": 77.56, "step": 68755, "token_acc": 0.5269709543568465, "train_speed(iter/s)": 1.437371 }, { "epoch": 2.9458892078317125, "grad_norm": 5.748534202575684, "learning_rate": 3.617450758980482e-05, "loss": 2.293759346008301, "memory(GiB)": 77.56, "step": 68760, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.437339 }, { "epoch": 2.9461034231609613, "grad_norm": 4.761528968811035, "learning_rate": 3.616804032856373e-05, "loss": 2.5627391815185545, "memory(GiB)": 77.56, "step": 68765, "token_acc": 0.4555160142348754, "train_speed(iter/s)": 1.437332 }, { "epoch": 2.9463176384902106, "grad_norm": 5.485678672790527, "learning_rate": 3.6161573317901564e-05, "loss": 2.6419851303100588, "memory(GiB)": 77.56, "step": 68770, "token_acc": 0.4701492537313433, "train_speed(iter/s)": 1.437322 }, { "epoch": 2.9465318538194594, "grad_norm": 5.006595134735107, "learning_rate": 3.6155106557935464e-05, "loss": 2.4364442825317383, "memory(GiB)": 77.56, "step": 68775, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.43733 }, { "epoch": 2.946746069148708, "grad_norm": 4.9011921882629395, "learning_rate": 3.6148640048782604e-05, "loss": 2.335931587219238, "memory(GiB)": 77.56, "step": 68780, "token_acc": 0.5511111111111111, "train_speed(iter/s)": 1.437357 }, { "epoch": 2.9469602844779574, "grad_norm": 6.965666770935059, "learning_rate": 3.614217379056013e-05, "loss": 2.2329822540283204, "memory(GiB)": 77.56, "step": 68785, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437369 }, { "epoch": 2.9471744998072062, "grad_norm": 4.601630210876465, "learning_rate": 3.6135707783385183e-05, "loss": 2.626714324951172, "memory(GiB)": 77.56, "step": 68790, "token_acc": 0.4502923976608187, "train_speed(iter/s)": 1.43738 }, { "epoch": 2.947388715136455, "grad_norm": 4.9763264656066895, "learning_rate": 3.6129242027374886e-05, "loss": 2.3513513565063477, "memory(GiB)": 77.56, "step": 68795, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.437387 }, { "epoch": 2.9476029304657043, "grad_norm": 4.851491451263428, "learning_rate": 3.6122776522646386e-05, "loss": 2.5669939041137697, "memory(GiB)": 77.56, "step": 68800, "token_acc": 0.4562334217506631, "train_speed(iter/s)": 1.437387 }, { "epoch": 2.947817145794953, "grad_norm": 4.193472385406494, "learning_rate": 3.6116311269316804e-05, "loss": 2.3566879272460937, "memory(GiB)": 77.56, "step": 68805, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437362 }, { "epoch": 2.948031361124202, "grad_norm": 5.2125678062438965, "learning_rate": 3.610984626750327e-05, "loss": 2.473443031311035, "memory(GiB)": 77.56, "step": 68810, "token_acc": 0.4684385382059801, "train_speed(iter/s)": 1.437374 }, { "epoch": 2.948245576453451, "grad_norm": 4.731603145599365, "learning_rate": 3.6103381517322905e-05, "loss": 2.3626129150390627, "memory(GiB)": 77.56, "step": 68815, "token_acc": 0.516245487364621, "train_speed(iter/s)": 1.437373 }, { "epoch": 2.9484597917827, "grad_norm": 5.903083801269531, "learning_rate": 3.609691701889281e-05, "loss": 2.4182340621948244, "memory(GiB)": 77.56, "step": 68820, "token_acc": 0.5033333333333333, "train_speed(iter/s)": 1.437374 }, { "epoch": 2.948674007111949, "grad_norm": 6.340105056762695, "learning_rate": 3.6090452772330115e-05, "loss": 2.312301254272461, "memory(GiB)": 77.56, "step": 68825, "token_acc": 0.4740061162079511, "train_speed(iter/s)": 1.437383 }, { "epoch": 2.948888222441198, "grad_norm": 7.80027961730957, "learning_rate": 3.6083988777751916e-05, "loss": 2.6433765411376955, "memory(GiB)": 77.56, "step": 68830, "token_acc": 0.4303030303030303, "train_speed(iter/s)": 1.437408 }, { "epoch": 2.949102437770447, "grad_norm": 5.485929012298584, "learning_rate": 3.60775250352753e-05, "loss": 2.530671310424805, "memory(GiB)": 77.56, "step": 68835, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437411 }, { "epoch": 2.9493166530996957, "grad_norm": 4.73348331451416, "learning_rate": 3.6071061545017395e-05, "loss": 2.529583549499512, "memory(GiB)": 77.56, "step": 68840, "token_acc": 0.4797507788161994, "train_speed(iter/s)": 1.437418 }, { "epoch": 2.949530868428945, "grad_norm": 4.363133907318115, "learning_rate": 3.6064598307095266e-05, "loss": 2.4248477935791017, "memory(GiB)": 77.56, "step": 68845, "token_acc": 0.4740740740740741, "train_speed(iter/s)": 1.437426 }, { "epoch": 2.9497450837581938, "grad_norm": 5.332622528076172, "learning_rate": 3.6058135321625995e-05, "loss": 2.3236104965209963, "memory(GiB)": 77.56, "step": 68850, "token_acc": 0.535483870967742, "train_speed(iter/s)": 1.437437 }, { "epoch": 2.9499592990874426, "grad_norm": 6.431833267211914, "learning_rate": 3.60516725887267e-05, "loss": 2.568436622619629, "memory(GiB)": 77.56, "step": 68855, "token_acc": 0.48132780082987553, "train_speed(iter/s)": 1.437467 }, { "epoch": 2.950173514416692, "grad_norm": 4.678027153015137, "learning_rate": 3.604521010851445e-05, "loss": 2.5360055923461915, "memory(GiB)": 77.56, "step": 68860, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.437464 }, { "epoch": 2.9503877297459407, "grad_norm": 5.996098518371582, "learning_rate": 3.60387478811063e-05, "loss": 2.6224271774291994, "memory(GiB)": 77.56, "step": 68865, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.437471 }, { "epoch": 2.9506019450751895, "grad_norm": 4.644439220428467, "learning_rate": 3.603228590661933e-05, "loss": 2.233340835571289, "memory(GiB)": 77.56, "step": 68870, "token_acc": 0.4781021897810219, "train_speed(iter/s)": 1.437488 }, { "epoch": 2.9508161604044387, "grad_norm": 6.249563694000244, "learning_rate": 3.602582418517061e-05, "loss": 2.5900638580322264, "memory(GiB)": 77.56, "step": 68875, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 1.437449 }, { "epoch": 2.9510303757336875, "grad_norm": 4.71757173538208, "learning_rate": 3.601936271687718e-05, "loss": 2.4595561981201173, "memory(GiB)": 77.56, "step": 68880, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 1.437468 }, { "epoch": 2.9512445910629364, "grad_norm": 6.177444934844971, "learning_rate": 3.601290150185612e-05, "loss": 2.3477642059326174, "memory(GiB)": 77.56, "step": 68885, "token_acc": 0.5405405405405406, "train_speed(iter/s)": 1.437483 }, { "epoch": 2.9514588063921856, "grad_norm": 4.402703285217285, "learning_rate": 3.6006440540224465e-05, "loss": 2.4873001098632814, "memory(GiB)": 77.56, "step": 68890, "token_acc": 0.46174142480211083, "train_speed(iter/s)": 1.437488 }, { "epoch": 2.9516730217214344, "grad_norm": 5.3722429275512695, "learning_rate": 3.599997983209927e-05, "loss": 2.5535905838012694, "memory(GiB)": 77.56, "step": 68895, "token_acc": 0.44970414201183434, "train_speed(iter/s)": 1.437492 }, { "epoch": 2.9518872370506832, "grad_norm": 5.824199676513672, "learning_rate": 3.5993519377597576e-05, "loss": 2.6637346267700197, "memory(GiB)": 77.56, "step": 68900, "token_acc": 0.46440677966101696, "train_speed(iter/s)": 1.437501 }, { "epoch": 2.9521014523799325, "grad_norm": 6.875514984130859, "learning_rate": 3.5987059176836404e-05, "loss": 2.3171222686767576, "memory(GiB)": 77.56, "step": 68905, "token_acc": 0.45075757575757575, "train_speed(iter/s)": 1.437527 }, { "epoch": 2.9523156677091813, "grad_norm": 4.838869571685791, "learning_rate": 3.598059922993282e-05, "loss": 2.1232067108154298, "memory(GiB)": 77.56, "step": 68910, "token_acc": 0.5259515570934256, "train_speed(iter/s)": 1.437535 }, { "epoch": 2.95252988303843, "grad_norm": 5.742547988891602, "learning_rate": 3.597413953700382e-05, "loss": 2.31732234954834, "memory(GiB)": 77.56, "step": 68915, "token_acc": 0.5160142348754448, "train_speed(iter/s)": 1.437535 }, { "epoch": 2.9527440983676794, "grad_norm": 6.010509967803955, "learning_rate": 3.596768009816644e-05, "loss": 2.3423095703125, "memory(GiB)": 77.56, "step": 68920, "token_acc": 0.5036231884057971, "train_speed(iter/s)": 1.437553 }, { "epoch": 2.952958313696928, "grad_norm": 5.6316728591918945, "learning_rate": 3.5961220913537683e-05, "loss": 2.213157844543457, "memory(GiB)": 77.56, "step": 68925, "token_acc": 0.5387596899224806, "train_speed(iter/s)": 1.437545 }, { "epoch": 2.953172529026177, "grad_norm": 4.8132734298706055, "learning_rate": 3.5954761983234595e-05, "loss": 2.708344268798828, "memory(GiB)": 77.56, "step": 68930, "token_acc": 0.48623853211009177, "train_speed(iter/s)": 1.437547 }, { "epoch": 2.9533867443554263, "grad_norm": 5.64206600189209, "learning_rate": 3.594830330737417e-05, "loss": 2.413673210144043, "memory(GiB)": 77.56, "step": 68935, "token_acc": 0.4651898734177215, "train_speed(iter/s)": 1.437566 }, { "epoch": 2.953600959684675, "grad_norm": 4.685112476348877, "learning_rate": 3.5941844886073416e-05, "loss": 2.428424263000488, "memory(GiB)": 77.56, "step": 68940, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.43758 }, { "epoch": 2.953815175013924, "grad_norm": 4.130372047424316, "learning_rate": 3.5935386719449324e-05, "loss": 2.4846450805664064, "memory(GiB)": 77.56, "step": 68945, "token_acc": 0.5058365758754864, "train_speed(iter/s)": 1.4376 }, { "epoch": 2.954029390343173, "grad_norm": 4.273837089538574, "learning_rate": 3.5928928807618896e-05, "loss": 2.602483367919922, "memory(GiB)": 77.56, "step": 68950, "token_acc": 0.472636815920398, "train_speed(iter/s)": 1.437626 }, { "epoch": 2.954243605672422, "grad_norm": 4.532648086547852, "learning_rate": 3.592247115069913e-05, "loss": 2.5536243438720705, "memory(GiB)": 77.56, "step": 68955, "token_acc": 0.46503496503496505, "train_speed(iter/s)": 1.437645 }, { "epoch": 2.9544578210016708, "grad_norm": 5.601273536682129, "learning_rate": 3.5916013748807e-05, "loss": 2.231674575805664, "memory(GiB)": 77.56, "step": 68960, "token_acc": 0.49523809523809526, "train_speed(iter/s)": 1.437674 }, { "epoch": 2.95467203633092, "grad_norm": 7.24081563949585, "learning_rate": 3.590955660205948e-05, "loss": 2.558842086791992, "memory(GiB)": 77.56, "step": 68965, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 1.437678 }, { "epoch": 2.954886251660169, "grad_norm": 4.85072660446167, "learning_rate": 3.590309971057358e-05, "loss": 2.4630973815917967, "memory(GiB)": 77.56, "step": 68970, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.437641 }, { "epoch": 2.9551004669894176, "grad_norm": 6.246769905090332, "learning_rate": 3.5896643074466246e-05, "loss": 2.334185028076172, "memory(GiB)": 77.56, "step": 68975, "token_acc": 0.5, "train_speed(iter/s)": 1.437679 }, { "epoch": 2.955314682318667, "grad_norm": 6.290075302124023, "learning_rate": 3.5890186693854444e-05, "loss": 2.1810699462890626, "memory(GiB)": 77.56, "step": 68980, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437682 }, { "epoch": 2.9555288976479157, "grad_norm": 4.407094955444336, "learning_rate": 3.5883730568855156e-05, "loss": 2.5275590896606444, "memory(GiB)": 77.56, "step": 68985, "token_acc": 0.44011976047904194, "train_speed(iter/s)": 1.437706 }, { "epoch": 2.9557431129771645, "grad_norm": 6.0976433753967285, "learning_rate": 3.587727469958532e-05, "loss": 2.6480321884155273, "memory(GiB)": 77.56, "step": 68990, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437734 }, { "epoch": 2.955957328306414, "grad_norm": 4.6575517654418945, "learning_rate": 3.5870819086161913e-05, "loss": 2.3110029220581056, "memory(GiB)": 77.56, "step": 68995, "token_acc": 0.5490196078431373, "train_speed(iter/s)": 1.43774 }, { "epoch": 2.9561715436356626, "grad_norm": 6.478888988494873, "learning_rate": 3.5864363728701855e-05, "loss": 1.9367637634277344, "memory(GiB)": 77.56, "step": 69000, "token_acc": 0.5495867768595041, "train_speed(iter/s)": 1.437763 }, { "epoch": 2.9561715436356626, "eval_loss": 2.1820437908172607, "eval_runtime": 14.9369, "eval_samples_per_second": 6.695, "eval_steps_per_second": 6.695, "eval_token_acc": 0.4807436918990704, "step": 69000 }, { "epoch": 2.9563857589649114, "grad_norm": 4.883810520172119, "learning_rate": 3.5857908627322124e-05, "loss": 2.6500858306884765, "memory(GiB)": 77.56, "step": 69005, "token_acc": 0.47880539499036606, "train_speed(iter/s)": 1.437289 }, { "epoch": 2.9565999742941607, "grad_norm": 4.120144367218018, "learning_rate": 3.585145378213963e-05, "loss": 2.456507110595703, "memory(GiB)": 77.56, "step": 69010, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.43728 }, { "epoch": 2.9568141896234095, "grad_norm": 4.422980308532715, "learning_rate": 3.584499919327135e-05, "loss": 2.5950057983398436, "memory(GiB)": 77.56, "step": 69015, "token_acc": 0.49480968858131485, "train_speed(iter/s)": 1.437275 }, { "epoch": 2.9570284049526583, "grad_norm": 5.577856063842773, "learning_rate": 3.583854486083417e-05, "loss": 2.1856239318847654, "memory(GiB)": 77.56, "step": 69020, "token_acc": 0.4858156028368794, "train_speed(iter/s)": 1.437311 }, { "epoch": 2.9572426202819075, "grad_norm": 4.3809943199157715, "learning_rate": 3.583209078494503e-05, "loss": 2.690225028991699, "memory(GiB)": 77.56, "step": 69025, "token_acc": 0.42028985507246375, "train_speed(iter/s)": 1.437325 }, { "epoch": 2.9574568356111564, "grad_norm": 5.803936958312988, "learning_rate": 3.582563696572087e-05, "loss": 2.4410112380981444, "memory(GiB)": 77.56, "step": 69030, "token_acc": 0.5354330708661418, "train_speed(iter/s)": 1.437307 }, { "epoch": 2.957671050940405, "grad_norm": 5.099001884460449, "learning_rate": 3.581918340327858e-05, "loss": 2.3389801025390624, "memory(GiB)": 77.56, "step": 69035, "token_acc": 0.4620938628158845, "train_speed(iter/s)": 1.437309 }, { "epoch": 2.9578852662696544, "grad_norm": 6.368360996246338, "learning_rate": 3.581273009773509e-05, "loss": 2.463871955871582, "memory(GiB)": 77.56, "step": 69040, "token_acc": 0.47808764940239046, "train_speed(iter/s)": 1.437298 }, { "epoch": 2.9580994815989032, "grad_norm": 6.084715843200684, "learning_rate": 3.5806277049207315e-05, "loss": 2.5276683807373046, "memory(GiB)": 77.56, "step": 69045, "token_acc": 0.4539249146757679, "train_speed(iter/s)": 1.437294 }, { "epoch": 2.958313696928152, "grad_norm": 6.471798419952393, "learning_rate": 3.579982425781213e-05, "loss": 2.6048458099365233, "memory(GiB)": 77.56, "step": 69050, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.437294 }, { "epoch": 2.9585279122574013, "grad_norm": 6.593715667724609, "learning_rate": 3.579337172366646e-05, "loss": 2.7753677368164062, "memory(GiB)": 77.56, "step": 69055, "token_acc": 0.4204946996466431, "train_speed(iter/s)": 1.437309 }, { "epoch": 2.95874212758665, "grad_norm": 5.319879531860352, "learning_rate": 3.578691944688719e-05, "loss": 2.4280622482299803, "memory(GiB)": 77.56, "step": 69060, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.437316 }, { "epoch": 2.958956342915899, "grad_norm": 4.729599952697754, "learning_rate": 3.5780467427591194e-05, "loss": 2.5661991119384764, "memory(GiB)": 77.56, "step": 69065, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.437324 }, { "epoch": 2.959170558245148, "grad_norm": 4.614162445068359, "learning_rate": 3.577401566589535e-05, "loss": 2.346187782287598, "memory(GiB)": 77.56, "step": 69070, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.437337 }, { "epoch": 2.959384773574397, "grad_norm": 6.547674179077148, "learning_rate": 3.576756416191659e-05, "loss": 2.5412033081054686, "memory(GiB)": 77.56, "step": 69075, "token_acc": 0.46111111111111114, "train_speed(iter/s)": 1.437372 }, { "epoch": 2.959598988903646, "grad_norm": 5.097473621368408, "learning_rate": 3.5761112915771756e-05, "loss": 2.3003753662109374, "memory(GiB)": 77.56, "step": 69080, "token_acc": 0.5251798561151079, "train_speed(iter/s)": 1.437393 }, { "epoch": 2.959813204232895, "grad_norm": 6.277078151702881, "learning_rate": 3.57546619275777e-05, "loss": 2.357630157470703, "memory(GiB)": 77.56, "step": 69085, "token_acc": 0.5335820895522388, "train_speed(iter/s)": 1.43741 }, { "epoch": 2.960027419562144, "grad_norm": 5.296885967254639, "learning_rate": 3.574821119745133e-05, "loss": 2.576180076599121, "memory(GiB)": 77.56, "step": 69090, "token_acc": 0.4463087248322148, "train_speed(iter/s)": 1.437416 }, { "epoch": 2.9602416348913927, "grad_norm": 5.158799648284912, "learning_rate": 3.5741760725509464e-05, "loss": 2.739981842041016, "memory(GiB)": 77.56, "step": 69095, "token_acc": 0.4329608938547486, "train_speed(iter/s)": 1.437428 }, { "epoch": 2.960455850220642, "grad_norm": 4.835994720458984, "learning_rate": 3.5735310511868994e-05, "loss": 2.711231994628906, "memory(GiB)": 77.56, "step": 69100, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.437445 }, { "epoch": 2.9606700655498908, "grad_norm": 4.914797306060791, "learning_rate": 3.572886055664675e-05, "loss": 2.5512765884399413, "memory(GiB)": 77.56, "step": 69105, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.437464 }, { "epoch": 2.9608842808791396, "grad_norm": 5.366225242614746, "learning_rate": 3.5722410859959574e-05, "loss": 2.6116432189941405, "memory(GiB)": 77.56, "step": 69110, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.437489 }, { "epoch": 2.961098496208389, "grad_norm": 4.803067684173584, "learning_rate": 3.571596142192433e-05, "loss": 2.1961696624755858, "memory(GiB)": 77.56, "step": 69115, "token_acc": 0.5475409836065573, "train_speed(iter/s)": 1.437489 }, { "epoch": 2.9613127115376376, "grad_norm": 6.650675296783447, "learning_rate": 3.570951224265785e-05, "loss": 2.662397766113281, "memory(GiB)": 77.56, "step": 69120, "token_acc": 0.45514950166112955, "train_speed(iter/s)": 1.437495 }, { "epoch": 2.9615269268668865, "grad_norm": 5.322486877441406, "learning_rate": 3.570306332227694e-05, "loss": 2.2299646377563476, "memory(GiB)": 77.56, "step": 69125, "token_acc": 0.4921875, "train_speed(iter/s)": 1.437505 }, { "epoch": 2.9617411421961357, "grad_norm": 7.098770618438721, "learning_rate": 3.5696614660898465e-05, "loss": 2.410774993896484, "memory(GiB)": 77.56, "step": 69130, "token_acc": 0.5225225225225225, "train_speed(iter/s)": 1.437519 }, { "epoch": 2.9619553575253845, "grad_norm": 4.78179407119751, "learning_rate": 3.5690166258639226e-05, "loss": 2.2891971588134767, "memory(GiB)": 77.56, "step": 69135, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.437533 }, { "epoch": 2.9621695728546333, "grad_norm": 5.066426753997803, "learning_rate": 3.568371811561606e-05, "loss": 2.3036334991455076, "memory(GiB)": 77.56, "step": 69140, "token_acc": 0.5, "train_speed(iter/s)": 1.437534 }, { "epoch": 2.9623837881838826, "grad_norm": 5.383299350738525, "learning_rate": 3.5677270231945745e-05, "loss": 2.3096208572387695, "memory(GiB)": 77.56, "step": 69145, "token_acc": 0.5058823529411764, "train_speed(iter/s)": 1.437543 }, { "epoch": 2.9625980035131314, "grad_norm": 5.121975421905518, "learning_rate": 3.5670822607745134e-05, "loss": 2.5336372375488283, "memory(GiB)": 77.56, "step": 69150, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.437512 }, { "epoch": 2.96281221884238, "grad_norm": 4.660567760467529, "learning_rate": 3.566437524313101e-05, "loss": 2.37908935546875, "memory(GiB)": 77.56, "step": 69155, "token_acc": 0.49809885931558934, "train_speed(iter/s)": 1.437529 }, { "epoch": 2.9630264341716295, "grad_norm": 5.889187335968018, "learning_rate": 3.5657928138220184e-05, "loss": 2.4170663833618162, "memory(GiB)": 77.56, "step": 69160, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.437546 }, { "epoch": 2.9632406495008783, "grad_norm": 8.472227096557617, "learning_rate": 3.565148129312944e-05, "loss": 2.180571746826172, "memory(GiB)": 77.56, "step": 69165, "token_acc": 0.5283842794759825, "train_speed(iter/s)": 1.437566 }, { "epoch": 2.963454864830127, "grad_norm": 5.193587779998779, "learning_rate": 3.564503470797556e-05, "loss": 2.653279113769531, "memory(GiB)": 77.56, "step": 69170, "token_acc": 0.45634920634920634, "train_speed(iter/s)": 1.437564 }, { "epoch": 2.9636690801593764, "grad_norm": 4.036701202392578, "learning_rate": 3.563858838287536e-05, "loss": 2.2770002365112303, "memory(GiB)": 77.56, "step": 69175, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.437579 }, { "epoch": 2.963883295488625, "grad_norm": 5.415582656860352, "learning_rate": 3.5632142317945596e-05, "loss": 2.3588560104370115, "memory(GiB)": 77.56, "step": 69180, "token_acc": 0.5321428571428571, "train_speed(iter/s)": 1.437586 }, { "epoch": 2.964097510817874, "grad_norm": 6.111235618591309, "learning_rate": 3.562569651330305e-05, "loss": 2.3530622482299806, "memory(GiB)": 77.56, "step": 69185, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 1.437585 }, { "epoch": 2.9643117261471232, "grad_norm": 6.13635778427124, "learning_rate": 3.561925096906451e-05, "loss": 2.367791938781738, "memory(GiB)": 77.56, "step": 69190, "token_acc": 0.466403162055336, "train_speed(iter/s)": 1.437605 }, { "epoch": 2.964525941476372, "grad_norm": 5.716488361358643, "learning_rate": 3.56128056853467e-05, "loss": 2.65100212097168, "memory(GiB)": 77.56, "step": 69195, "token_acc": 0.3944636678200692, "train_speed(iter/s)": 1.437635 }, { "epoch": 2.964740156805621, "grad_norm": 7.359189987182617, "learning_rate": 3.560636066226644e-05, "loss": 2.4877765655517576, "memory(GiB)": 77.56, "step": 69200, "token_acc": 0.45493562231759654, "train_speed(iter/s)": 1.43766 }, { "epoch": 2.96495437213487, "grad_norm": 6.101253509521484, "learning_rate": 3.5599915899940454e-05, "loss": 2.5385768890380858, "memory(GiB)": 77.56, "step": 69205, "token_acc": 0.461038961038961, "train_speed(iter/s)": 1.437658 }, { "epoch": 2.965168587464119, "grad_norm": 4.345081806182861, "learning_rate": 3.559347139848549e-05, "loss": 2.2688720703125, "memory(GiB)": 77.56, "step": 69210, "token_acc": 0.5080906148867314, "train_speed(iter/s)": 1.437669 }, { "epoch": 2.9653828027933677, "grad_norm": 4.58785343170166, "learning_rate": 3.558702715801832e-05, "loss": 2.595186233520508, "memory(GiB)": 77.56, "step": 69215, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.437676 }, { "epoch": 2.965597018122617, "grad_norm": 5.516713619232178, "learning_rate": 3.5580583178655637e-05, "loss": 2.162289047241211, "memory(GiB)": 77.56, "step": 69220, "token_acc": 0.528169014084507, "train_speed(iter/s)": 1.43767 }, { "epoch": 2.965811233451866, "grad_norm": 5.593578338623047, "learning_rate": 3.557413946051425e-05, "loss": 2.386058044433594, "memory(GiB)": 77.56, "step": 69225, "token_acc": 0.4979253112033195, "train_speed(iter/s)": 1.437693 }, { "epoch": 2.9660254487811146, "grad_norm": 6.148902416229248, "learning_rate": 3.556769600371084e-05, "loss": 2.6028745651245115, "memory(GiB)": 77.56, "step": 69230, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.437703 }, { "epoch": 2.966239664110364, "grad_norm": 8.898268699645996, "learning_rate": 3.5561252808362176e-05, "loss": 2.7592483520507813, "memory(GiB)": 77.56, "step": 69235, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437696 }, { "epoch": 2.9664538794396127, "grad_norm": 6.478043079376221, "learning_rate": 3.555480987458495e-05, "loss": 2.4878005981445312, "memory(GiB)": 77.56, "step": 69240, "token_acc": 0.468, "train_speed(iter/s)": 1.437723 }, { "epoch": 2.9666680947688615, "grad_norm": 5.717689514160156, "learning_rate": 3.5548367202495894e-05, "loss": 2.4582868576049806, "memory(GiB)": 77.56, "step": 69245, "token_acc": 0.4795539033457249, "train_speed(iter/s)": 1.437719 }, { "epoch": 2.9668823100981108, "grad_norm": 4.743619918823242, "learning_rate": 3.554192479221173e-05, "loss": 2.4761852264404296, "memory(GiB)": 77.56, "step": 69250, "token_acc": 0.47043010752688175, "train_speed(iter/s)": 1.437724 }, { "epoch": 2.9670965254273596, "grad_norm": 5.507030963897705, "learning_rate": 3.5535482643849153e-05, "loss": 2.4245632171630858, "memory(GiB)": 77.56, "step": 69255, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.437716 }, { "epoch": 2.9673107407566084, "grad_norm": 6.4472455978393555, "learning_rate": 3.552904075752489e-05, "loss": 2.163592529296875, "memory(GiB)": 77.56, "step": 69260, "token_acc": 0.519650655021834, "train_speed(iter/s)": 1.437719 }, { "epoch": 2.9675249560858576, "grad_norm": 4.059402942657471, "learning_rate": 3.552259913335562e-05, "loss": 2.487284469604492, "memory(GiB)": 77.56, "step": 69265, "token_acc": 0.45, "train_speed(iter/s)": 1.437725 }, { "epoch": 2.9677391714151065, "grad_norm": 4.72160005569458, "learning_rate": 3.5516157771458045e-05, "loss": 2.558095169067383, "memory(GiB)": 77.56, "step": 69270, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 1.437732 }, { "epoch": 2.9679533867443553, "grad_norm": 4.550678730010986, "learning_rate": 3.550971667194886e-05, "loss": 2.3446065902709963, "memory(GiB)": 77.56, "step": 69275, "token_acc": 0.5036231884057971, "train_speed(iter/s)": 1.437745 }, { "epoch": 2.9681676020736045, "grad_norm": 5.5712480545043945, "learning_rate": 3.550327583494475e-05, "loss": 2.5355155944824217, "memory(GiB)": 77.56, "step": 69280, "token_acc": 0.5, "train_speed(iter/s)": 1.437745 }, { "epoch": 2.9683818174028533, "grad_norm": 5.873742580413818, "learning_rate": 3.54968352605624e-05, "loss": 2.7183086395263674, "memory(GiB)": 77.56, "step": 69285, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 1.437741 }, { "epoch": 2.968596032732102, "grad_norm": 6.629122734069824, "learning_rate": 3.549039494891849e-05, "loss": 2.4706657409667967, "memory(GiB)": 77.56, "step": 69290, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.437753 }, { "epoch": 2.9688102480613514, "grad_norm": 4.271261692047119, "learning_rate": 3.548395490012966e-05, "loss": 2.3545095443725588, "memory(GiB)": 77.56, "step": 69295, "token_acc": 0.5202952029520295, "train_speed(iter/s)": 1.437772 }, { "epoch": 2.9690244633906, "grad_norm": 6.33363151550293, "learning_rate": 3.547751511431262e-05, "loss": 2.7936716079711914, "memory(GiB)": 77.56, "step": 69300, "token_acc": 0.4275618374558304, "train_speed(iter/s)": 1.437751 }, { "epoch": 2.969238678719849, "grad_norm": 5.481980323791504, "learning_rate": 3.5471075591584024e-05, "loss": 2.181114387512207, "memory(GiB)": 77.56, "step": 69305, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.437737 }, { "epoch": 2.9694528940490983, "grad_norm": 5.386048316955566, "learning_rate": 3.546463633206052e-05, "loss": 2.0404840469360352, "memory(GiB)": 77.56, "step": 69310, "token_acc": 0.5625, "train_speed(iter/s)": 1.437743 }, { "epoch": 2.969667109378347, "grad_norm": 5.444461822509766, "learning_rate": 3.545819733585876e-05, "loss": 2.879924201965332, "memory(GiB)": 77.56, "step": 69315, "token_acc": 0.4359673024523161, "train_speed(iter/s)": 1.437754 }, { "epoch": 2.969881324707596, "grad_norm": 6.090682506561279, "learning_rate": 3.5451758603095404e-05, "loss": 2.5262264251708983, "memory(GiB)": 77.56, "step": 69320, "token_acc": 0.458955223880597, "train_speed(iter/s)": 1.437742 }, { "epoch": 2.970095540036845, "grad_norm": 4.950562000274658, "learning_rate": 3.5445320133887075e-05, "loss": 2.6107624053955076, "memory(GiB)": 77.56, "step": 69325, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.437754 }, { "epoch": 2.970309755366094, "grad_norm": 4.2823686599731445, "learning_rate": 3.543888192835044e-05, "loss": 2.3906349182128905, "memory(GiB)": 77.56, "step": 69330, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.43777 }, { "epoch": 2.970523970695343, "grad_norm": 5.902844429016113, "learning_rate": 3.543244398660212e-05, "loss": 2.576519584655762, "memory(GiB)": 77.56, "step": 69335, "token_acc": 0.46105919003115264, "train_speed(iter/s)": 1.43778 }, { "epoch": 2.970738186024592, "grad_norm": 6.1937642097473145, "learning_rate": 3.542600630875873e-05, "loss": 2.6070356369018555, "memory(GiB)": 77.56, "step": 69340, "token_acc": 0.4714828897338403, "train_speed(iter/s)": 1.437802 }, { "epoch": 2.970952401353841, "grad_norm": 6.378774642944336, "learning_rate": 3.541956889493692e-05, "loss": 2.4436540603637695, "memory(GiB)": 77.56, "step": 69345, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.437807 }, { "epoch": 2.9711666166830897, "grad_norm": 6.986944675445557, "learning_rate": 3.541313174525329e-05, "loss": 2.5839462280273438, "memory(GiB)": 77.56, "step": 69350, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.437827 }, { "epoch": 2.971380832012339, "grad_norm": 6.754354476928711, "learning_rate": 3.540669485982445e-05, "loss": 2.0642425537109377, "memory(GiB)": 77.56, "step": 69355, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.437843 }, { "epoch": 2.9715950473415877, "grad_norm": 6.530547142028809, "learning_rate": 3.540025823876704e-05, "loss": 2.606150436401367, "memory(GiB)": 77.56, "step": 69360, "token_acc": 0.45112781954887216, "train_speed(iter/s)": 1.437846 }, { "epoch": 2.9718092626708366, "grad_norm": 4.96040153503418, "learning_rate": 3.539382188219764e-05, "loss": 2.6040573120117188, "memory(GiB)": 77.56, "step": 69365, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.437857 }, { "epoch": 2.972023478000086, "grad_norm": 6.430371284484863, "learning_rate": 3.5387385790232854e-05, "loss": 2.4298574447631838, "memory(GiB)": 77.56, "step": 69370, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437865 }, { "epoch": 2.9722376933293346, "grad_norm": 7.155005931854248, "learning_rate": 3.538094996298928e-05, "loss": 2.4761009216308594, "memory(GiB)": 77.56, "step": 69375, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.437865 }, { "epoch": 2.9724519086585834, "grad_norm": 5.11592960357666, "learning_rate": 3.537451440058353e-05, "loss": 2.630797576904297, "memory(GiB)": 77.56, "step": 69380, "token_acc": 0.44594594594594594, "train_speed(iter/s)": 1.437843 }, { "epoch": 2.9726661239878327, "grad_norm": 7.512523651123047, "learning_rate": 3.5368079103132143e-05, "loss": 2.3299076080322267, "memory(GiB)": 77.56, "step": 69385, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.437838 }, { "epoch": 2.9728803393170815, "grad_norm": 5.576261043548584, "learning_rate": 3.536164407075175e-05, "loss": 2.2350683212280273, "memory(GiB)": 77.56, "step": 69390, "token_acc": 0.5487364620938628, "train_speed(iter/s)": 1.437832 }, { "epoch": 2.9730945546463303, "grad_norm": 5.0224127769470215, "learning_rate": 3.535520930355891e-05, "loss": 2.3432601928710937, "memory(GiB)": 77.56, "step": 69395, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.437846 }, { "epoch": 2.9733087699755796, "grad_norm": 6.845538139343262, "learning_rate": 3.5348774801670184e-05, "loss": 2.373804473876953, "memory(GiB)": 77.56, "step": 69400, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.437851 }, { "epoch": 2.9735229853048284, "grad_norm": 5.384983539581299, "learning_rate": 3.5342340565202146e-05, "loss": 2.216988372802734, "memory(GiB)": 77.56, "step": 69405, "token_acc": 0.532608695652174, "train_speed(iter/s)": 1.437866 }, { "epoch": 2.973737200634077, "grad_norm": 6.58991003036499, "learning_rate": 3.533590659427137e-05, "loss": 2.7947261810302733, "memory(GiB)": 77.56, "step": 69410, "token_acc": 0.4205298013245033, "train_speed(iter/s)": 1.437899 }, { "epoch": 2.9739514159633265, "grad_norm": 5.856507301330566, "learning_rate": 3.532947288899439e-05, "loss": 2.368599700927734, "memory(GiB)": 77.56, "step": 69415, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.437921 }, { "epoch": 2.9741656312925753, "grad_norm": 5.09470796585083, "learning_rate": 3.532303944948777e-05, "loss": 2.3902076721191405, "memory(GiB)": 77.56, "step": 69420, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.437925 }, { "epoch": 2.974379846621824, "grad_norm": 8.592560768127441, "learning_rate": 3.5316606275868056e-05, "loss": 2.418587875366211, "memory(GiB)": 77.56, "step": 69425, "token_acc": 0.46742209631728043, "train_speed(iter/s)": 1.437922 }, { "epoch": 2.9745940619510733, "grad_norm": 5.40716552734375, "learning_rate": 3.5310173368251794e-05, "loss": 2.3445207595825197, "memory(GiB)": 77.56, "step": 69430, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437902 }, { "epoch": 2.974808277280322, "grad_norm": 4.981843948364258, "learning_rate": 3.5303740726755544e-05, "loss": 2.318739891052246, "memory(GiB)": 77.56, "step": 69435, "token_acc": 0.488135593220339, "train_speed(iter/s)": 1.437873 }, { "epoch": 2.975022492609571, "grad_norm": 7.428905963897705, "learning_rate": 3.529730835149577e-05, "loss": 2.4202529907226564, "memory(GiB)": 77.56, "step": 69440, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.437873 }, { "epoch": 2.9752367079388202, "grad_norm": 6.593773365020752, "learning_rate": 3.5290876242589076e-05, "loss": 2.227604866027832, "memory(GiB)": 77.56, "step": 69445, "token_acc": 0.5376712328767124, "train_speed(iter/s)": 1.437865 }, { "epoch": 2.975450923268069, "grad_norm": 4.56346321105957, "learning_rate": 3.528444440015196e-05, "loss": 2.352486801147461, "memory(GiB)": 77.56, "step": 69450, "token_acc": 0.46229508196721314, "train_speed(iter/s)": 1.437879 }, { "epoch": 2.975665138597318, "grad_norm": 7.1637420654296875, "learning_rate": 3.5278012824300944e-05, "loss": 2.3603553771972656, "memory(GiB)": 77.56, "step": 69455, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.437879 }, { "epoch": 2.975879353926567, "grad_norm": 5.515501022338867, "learning_rate": 3.527158151515252e-05, "loss": 2.4818950653076173, "memory(GiB)": 77.56, "step": 69460, "token_acc": 0.5097402597402597, "train_speed(iter/s)": 1.43788 }, { "epoch": 2.976093569255816, "grad_norm": 6.24483060836792, "learning_rate": 3.526515047282323e-05, "loss": 2.440196228027344, "memory(GiB)": 77.56, "step": 69465, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.437894 }, { "epoch": 2.9763077845850647, "grad_norm": 4.1129350662231445, "learning_rate": 3.525871969742954e-05, "loss": 2.4850826263427734, "memory(GiB)": 77.56, "step": 69470, "token_acc": 0.5129682997118156, "train_speed(iter/s)": 1.437898 }, { "epoch": 2.976521999914314, "grad_norm": 5.955257892608643, "learning_rate": 3.525228918908799e-05, "loss": 2.193038558959961, "memory(GiB)": 77.56, "step": 69475, "token_acc": 0.5232558139534884, "train_speed(iter/s)": 1.43787 }, { "epoch": 2.976736215243563, "grad_norm": 5.333489418029785, "learning_rate": 3.524585894791506e-05, "loss": 2.424423408508301, "memory(GiB)": 77.56, "step": 69480, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.437874 }, { "epoch": 2.9769504305728116, "grad_norm": 5.985816955566406, "learning_rate": 3.523942897402721e-05, "loss": 2.375263214111328, "memory(GiB)": 77.56, "step": 69485, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.437899 }, { "epoch": 2.977164645902061, "grad_norm": 6.023412227630615, "learning_rate": 3.5232999267540964e-05, "loss": 2.254396438598633, "memory(GiB)": 77.56, "step": 69490, "token_acc": 0.4979757085020243, "train_speed(iter/s)": 1.437904 }, { "epoch": 2.9773788612313097, "grad_norm": 5.949342250823975, "learning_rate": 3.52265698285728e-05, "loss": 2.3097095489501953, "memory(GiB)": 77.56, "step": 69495, "token_acc": 0.5226480836236934, "train_speed(iter/s)": 1.437907 }, { "epoch": 2.9775930765605585, "grad_norm": 6.291986465454102, "learning_rate": 3.5220140657239164e-05, "loss": 2.600223922729492, "memory(GiB)": 77.56, "step": 69500, "token_acc": 0.43521594684385384, "train_speed(iter/s)": 1.437918 }, { "epoch": 2.9775930765605585, "eval_loss": 2.2632994651794434, "eval_runtime": 13.8832, "eval_samples_per_second": 7.203, "eval_steps_per_second": 7.203, "eval_token_acc": 0.48267008985879334, "step": 69500 }, { "epoch": 2.9778072918898078, "grad_norm": 4.687198638916016, "learning_rate": 3.5213711753656565e-05, "loss": 2.4443904876708986, "memory(GiB)": 77.56, "step": 69505, "token_acc": 0.4800389483933788, "train_speed(iter/s)": 1.437482 }, { "epoch": 2.9780215072190566, "grad_norm": 6.376140117645264, "learning_rate": 3.520728311794143e-05, "loss": 2.7297183990478517, "memory(GiB)": 77.56, "step": 69510, "token_acc": 0.4629080118694362, "train_speed(iter/s)": 1.43746 }, { "epoch": 2.9782357225483054, "grad_norm": 5.8469319343566895, "learning_rate": 3.5200854750210235e-05, "loss": 2.2844945907592775, "memory(GiB)": 77.56, "step": 69515, "token_acc": 0.4901315789473684, "train_speed(iter/s)": 1.43747 }, { "epoch": 2.9784499378775546, "grad_norm": 6.49983024597168, "learning_rate": 3.5194426650579445e-05, "loss": 2.2972564697265625, "memory(GiB)": 77.56, "step": 69520, "token_acc": 0.4731182795698925, "train_speed(iter/s)": 1.437484 }, { "epoch": 2.9786641532068034, "grad_norm": 5.930006980895996, "learning_rate": 3.518799881916551e-05, "loss": 2.3452810287475585, "memory(GiB)": 77.56, "step": 69525, "token_acc": 0.46396396396396394, "train_speed(iter/s)": 1.437517 }, { "epoch": 2.9788783685360523, "grad_norm": 6.153254985809326, "learning_rate": 3.518157125608487e-05, "loss": 2.435942268371582, "memory(GiB)": 77.56, "step": 69530, "token_acc": 0.47147147147147145, "train_speed(iter/s)": 1.437525 }, { "epoch": 2.9790925838653015, "grad_norm": 5.583297252655029, "learning_rate": 3.5175143961453965e-05, "loss": 2.1783119201660157, "memory(GiB)": 77.56, "step": 69535, "token_acc": 0.5244755244755245, "train_speed(iter/s)": 1.437512 }, { "epoch": 2.9793067991945503, "grad_norm": 7.191718578338623, "learning_rate": 3.516871693538924e-05, "loss": 2.603336715698242, "memory(GiB)": 77.56, "step": 69540, "token_acc": 0.44947735191637633, "train_speed(iter/s)": 1.437502 }, { "epoch": 2.979521014523799, "grad_norm": 5.396605968475342, "learning_rate": 3.516229017800711e-05, "loss": 2.48966064453125, "memory(GiB)": 77.56, "step": 69545, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.437513 }, { "epoch": 2.9797352298530484, "grad_norm": 5.894193649291992, "learning_rate": 3.515586368942402e-05, "loss": 2.366547393798828, "memory(GiB)": 77.56, "step": 69550, "token_acc": 0.5318352059925093, "train_speed(iter/s)": 1.43753 }, { "epoch": 2.979949445182297, "grad_norm": 5.607132911682129, "learning_rate": 3.514943746975639e-05, "loss": 2.6504955291748047, "memory(GiB)": 77.56, "step": 69555, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437543 }, { "epoch": 2.980163660511546, "grad_norm": 5.978559494018555, "learning_rate": 3.514301151912062e-05, "loss": 2.3608266830444338, "memory(GiB)": 77.56, "step": 69560, "token_acc": 0.48659003831417624, "train_speed(iter/s)": 1.43756 }, { "epoch": 2.9803778758407953, "grad_norm": 6.648810863494873, "learning_rate": 3.513658583763314e-05, "loss": 2.18863468170166, "memory(GiB)": 77.56, "step": 69565, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.437576 }, { "epoch": 2.980592091170044, "grad_norm": 5.128035068511963, "learning_rate": 3.513016042541034e-05, "loss": 2.5890134811401366, "memory(GiB)": 77.56, "step": 69570, "token_acc": 0.43154761904761907, "train_speed(iter/s)": 1.437587 }, { "epoch": 2.980806306499293, "grad_norm": 8.38038158416748, "learning_rate": 3.5123735282568646e-05, "loss": 2.654056930541992, "memory(GiB)": 77.56, "step": 69575, "token_acc": 0.42990654205607476, "train_speed(iter/s)": 1.437602 }, { "epoch": 2.981020521828542, "grad_norm": 5.498917102813721, "learning_rate": 3.5117310409224443e-05, "loss": 2.470449447631836, "memory(GiB)": 77.56, "step": 69580, "token_acc": 0.4647058823529412, "train_speed(iter/s)": 1.437623 }, { "epoch": 2.981234737157791, "grad_norm": 8.832070350646973, "learning_rate": 3.5110885805494115e-05, "loss": 2.570925712585449, "memory(GiB)": 77.56, "step": 69585, "token_acc": 0.476, "train_speed(iter/s)": 1.437628 }, { "epoch": 2.98144895248704, "grad_norm": 6.388439178466797, "learning_rate": 3.510446147149404e-05, "loss": 2.279448318481445, "memory(GiB)": 77.56, "step": 69590, "token_acc": 0.5352112676056338, "train_speed(iter/s)": 1.437641 }, { "epoch": 2.981663167816289, "grad_norm": 4.355556488037109, "learning_rate": 3.509803740734065e-05, "loss": 2.0512977600097657, "memory(GiB)": 77.56, "step": 69595, "token_acc": 0.5178571428571429, "train_speed(iter/s)": 1.437632 }, { "epoch": 2.981877383145538, "grad_norm": 5.755629062652588, "learning_rate": 3.509161361315028e-05, "loss": 2.5466222763061523, "memory(GiB)": 77.56, "step": 69600, "token_acc": 0.484375, "train_speed(iter/s)": 1.437625 }, { "epoch": 2.9820915984747867, "grad_norm": 7.499678611755371, "learning_rate": 3.508519008903931e-05, "loss": 2.554818534851074, "memory(GiB)": 77.56, "step": 69605, "token_acc": 0.46757679180887374, "train_speed(iter/s)": 1.437616 }, { "epoch": 2.982305813804036, "grad_norm": 4.489687919616699, "learning_rate": 3.507876683512412e-05, "loss": 2.1566585540771483, "memory(GiB)": 77.56, "step": 69610, "token_acc": 0.5144694533762058, "train_speed(iter/s)": 1.437625 }, { "epoch": 2.9825200291332847, "grad_norm": 4.953376293182373, "learning_rate": 3.507234385152106e-05, "loss": 2.4046310424804687, "memory(GiB)": 77.56, "step": 69615, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.437604 }, { "epoch": 2.9827342444625335, "grad_norm": 5.839040279388428, "learning_rate": 3.5065921138346504e-05, "loss": 2.527578926086426, "memory(GiB)": 77.56, "step": 69620, "token_acc": 0.5036231884057971, "train_speed(iter/s)": 1.437617 }, { "epoch": 2.982948459791783, "grad_norm": 9.756697654724121, "learning_rate": 3.505949869571679e-05, "loss": 2.232336235046387, "memory(GiB)": 77.56, "step": 69625, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437612 }, { "epoch": 2.9831626751210316, "grad_norm": 5.325991153717041, "learning_rate": 3.505307652374827e-05, "loss": 2.2279457092285155, "memory(GiB)": 77.56, "step": 69630, "token_acc": 0.5628930817610063, "train_speed(iter/s)": 1.437608 }, { "epoch": 2.9833768904502804, "grad_norm": 4.278589248657227, "learning_rate": 3.5046654622557295e-05, "loss": 2.436077690124512, "memory(GiB)": 77.56, "step": 69635, "token_acc": 0.5097402597402597, "train_speed(iter/s)": 1.437621 }, { "epoch": 2.9835911057795297, "grad_norm": 3.8859148025512695, "learning_rate": 3.50402329922602e-05, "loss": 2.3893901824951174, "memory(GiB)": 77.56, "step": 69640, "token_acc": 0.4891640866873065, "train_speed(iter/s)": 1.437621 }, { "epoch": 2.9838053211087785, "grad_norm": 8.65683364868164, "learning_rate": 3.5033811632973315e-05, "loss": 2.6062952041625977, "memory(GiB)": 77.56, "step": 69645, "token_acc": 0.436046511627907, "train_speed(iter/s)": 1.437611 }, { "epoch": 2.9840195364380273, "grad_norm": 6.522454738616943, "learning_rate": 3.502739054481297e-05, "loss": 2.6266656875610352, "memory(GiB)": 77.56, "step": 69650, "token_acc": 0.4329268292682927, "train_speed(iter/s)": 1.437615 }, { "epoch": 2.9842337517672766, "grad_norm": 5.323433876037598, "learning_rate": 3.5020969727895484e-05, "loss": 2.34755859375, "memory(GiB)": 77.56, "step": 69655, "token_acc": 0.48615384615384616, "train_speed(iter/s)": 1.437639 }, { "epoch": 2.9844479670965254, "grad_norm": 7.181215763092041, "learning_rate": 3.50145491823372e-05, "loss": 2.919140625, "memory(GiB)": 77.56, "step": 69660, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.437652 }, { "epoch": 2.984662182425774, "grad_norm": 6.406488418579102, "learning_rate": 3.500812890825439e-05, "loss": 2.5161109924316407, "memory(GiB)": 77.56, "step": 69665, "token_acc": 0.4936061381074169, "train_speed(iter/s)": 1.437669 }, { "epoch": 2.9848763977550234, "grad_norm": 5.650805950164795, "learning_rate": 3.50017089057634e-05, "loss": 2.3362913131713867, "memory(GiB)": 77.56, "step": 69670, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 1.437687 }, { "epoch": 2.9850906130842723, "grad_norm": 6.8401713371276855, "learning_rate": 3.499528917498053e-05, "loss": 2.4290498733520507, "memory(GiB)": 77.56, "step": 69675, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.437671 }, { "epoch": 2.985304828413521, "grad_norm": 6.590735912322998, "learning_rate": 3.4988869716022065e-05, "loss": 2.361783027648926, "memory(GiB)": 77.56, "step": 69680, "token_acc": 0.5316455696202531, "train_speed(iter/s)": 1.437708 }, { "epoch": 2.9855190437427703, "grad_norm": 7.112832069396973, "learning_rate": 3.498245052900432e-05, "loss": 2.619435691833496, "memory(GiB)": 77.56, "step": 69685, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.437726 }, { "epoch": 2.985733259072019, "grad_norm": 5.147935390472412, "learning_rate": 3.4976031614043555e-05, "loss": 2.3561704635620115, "memory(GiB)": 77.56, "step": 69690, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437749 }, { "epoch": 2.985947474401268, "grad_norm": 3.965451240539551, "learning_rate": 3.496961297125608e-05, "loss": 2.8114501953125, "memory(GiB)": 77.56, "step": 69695, "token_acc": 0.46407185628742514, "train_speed(iter/s)": 1.437727 }, { "epoch": 2.986161689730517, "grad_norm": 5.191863536834717, "learning_rate": 3.4963194600758166e-05, "loss": 2.213589668273926, "memory(GiB)": 77.56, "step": 69700, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.437692 }, { "epoch": 2.986375905059766, "grad_norm": 10.08134937286377, "learning_rate": 3.4956776502666076e-05, "loss": 2.4462020874023436, "memory(GiB)": 77.56, "step": 69705, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.437691 }, { "epoch": 2.986590120389015, "grad_norm": 5.610540390014648, "learning_rate": 3.4950358677096103e-05, "loss": 2.6670482635498045, "memory(GiB)": 77.56, "step": 69710, "token_acc": 0.43, "train_speed(iter/s)": 1.437709 }, { "epoch": 2.986804335718264, "grad_norm": 5.5300445556640625, "learning_rate": 3.4943941124164494e-05, "loss": 2.5622278213500977, "memory(GiB)": 77.56, "step": 69715, "token_acc": 0.47648902821316613, "train_speed(iter/s)": 1.437721 }, { "epoch": 2.987018551047513, "grad_norm": 6.6703572273254395, "learning_rate": 3.493752384398753e-05, "loss": 2.4519445419311525, "memory(GiB)": 77.56, "step": 69720, "token_acc": 0.45564516129032256, "train_speed(iter/s)": 1.437731 }, { "epoch": 2.9872327663767617, "grad_norm": 7.126587390899658, "learning_rate": 3.493110683668144e-05, "loss": 2.4102910995483398, "memory(GiB)": 77.56, "step": 69725, "token_acc": 0.47491638795986624, "train_speed(iter/s)": 1.437714 }, { "epoch": 2.987446981706011, "grad_norm": 5.214530944824219, "learning_rate": 3.4924690102362475e-05, "loss": 2.3161338806152343, "memory(GiB)": 77.56, "step": 69730, "token_acc": 0.5, "train_speed(iter/s)": 1.437734 }, { "epoch": 2.98766119703526, "grad_norm": 6.560204029083252, "learning_rate": 3.491827364114689e-05, "loss": 2.3912942886352537, "memory(GiB)": 77.56, "step": 69735, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.437747 }, { "epoch": 2.9878754123645086, "grad_norm": 4.339321136474609, "learning_rate": 3.491185745315094e-05, "loss": 2.397929382324219, "memory(GiB)": 77.56, "step": 69740, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.437735 }, { "epoch": 2.988089627693758, "grad_norm": 8.264789581298828, "learning_rate": 3.490544153849085e-05, "loss": 2.492336463928223, "memory(GiB)": 77.56, "step": 69745, "token_acc": 0.4778761061946903, "train_speed(iter/s)": 1.437734 }, { "epoch": 2.9883038430230067, "grad_norm": 7.304971218109131, "learning_rate": 3.489902589728283e-05, "loss": 2.107792854309082, "memory(GiB)": 77.56, "step": 69750, "token_acc": 0.5473684210526316, "train_speed(iter/s)": 1.43773 }, { "epoch": 2.9885180583522555, "grad_norm": 5.320586681365967, "learning_rate": 3.4892610529643135e-05, "loss": 2.702079010009766, "memory(GiB)": 77.56, "step": 69755, "token_acc": 0.4426229508196721, "train_speed(iter/s)": 1.437741 }, { "epoch": 2.9887322736815047, "grad_norm": 5.018779754638672, "learning_rate": 3.488619543568796e-05, "loss": 2.6156057357788085, "memory(GiB)": 77.56, "step": 69760, "token_acc": 0.4836795252225519, "train_speed(iter/s)": 1.437752 }, { "epoch": 2.9889464890107535, "grad_norm": 5.850031852722168, "learning_rate": 3.487978061553355e-05, "loss": 2.377509880065918, "memory(GiB)": 77.56, "step": 69765, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 1.437772 }, { "epoch": 2.9891607043400024, "grad_norm": 5.953535556793213, "learning_rate": 3.4873366069296095e-05, "loss": 2.351734924316406, "memory(GiB)": 77.56, "step": 69770, "token_acc": 0.48046875, "train_speed(iter/s)": 1.437764 }, { "epoch": 2.9893749196692516, "grad_norm": 5.522590637207031, "learning_rate": 3.4866951797091786e-05, "loss": 2.4259239196777345, "memory(GiB)": 77.56, "step": 69775, "token_acc": 0.49337748344370863, "train_speed(iter/s)": 1.437757 }, { "epoch": 2.9895891349985004, "grad_norm": 4.952304840087891, "learning_rate": 3.486053779903686e-05, "loss": 2.4415794372558595, "memory(GiB)": 77.56, "step": 69780, "token_acc": 0.47330960854092524, "train_speed(iter/s)": 1.43774 }, { "epoch": 2.9898033503277492, "grad_norm": 5.014679908752441, "learning_rate": 3.485412407524749e-05, "loss": 2.6219457626342773, "memory(GiB)": 77.56, "step": 69785, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.437725 }, { "epoch": 2.9900175656569985, "grad_norm": 4.547702312469482, "learning_rate": 3.484771062583986e-05, "loss": 2.3821346282958986, "memory(GiB)": 77.56, "step": 69790, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.437758 }, { "epoch": 2.9902317809862473, "grad_norm": 5.8471455574035645, "learning_rate": 3.484129745093018e-05, "loss": 2.6142168045043945, "memory(GiB)": 77.56, "step": 69795, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.437761 }, { "epoch": 2.990445996315496, "grad_norm": 5.91321325302124, "learning_rate": 3.48348845506346e-05, "loss": 2.4356136322021484, "memory(GiB)": 77.56, "step": 69800, "token_acc": 0.4492307692307692, "train_speed(iter/s)": 1.437749 }, { "epoch": 2.9906602116447454, "grad_norm": 4.776285648345947, "learning_rate": 3.482847192506933e-05, "loss": 2.455434799194336, "memory(GiB)": 77.56, "step": 69805, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.437746 }, { "epoch": 2.990874426973994, "grad_norm": 4.8118085861206055, "learning_rate": 3.48220595743505e-05, "loss": 2.3861270904541017, "memory(GiB)": 77.56, "step": 69810, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.437743 }, { "epoch": 2.991088642303243, "grad_norm": 6.598339557647705, "learning_rate": 3.481564749859431e-05, "loss": 2.598998260498047, "memory(GiB)": 77.56, "step": 69815, "token_acc": 0.4766355140186916, "train_speed(iter/s)": 1.43774 }, { "epoch": 2.9913028576324923, "grad_norm": 5.918899059295654, "learning_rate": 3.480923569791691e-05, "loss": 2.6179405212402345, "memory(GiB)": 77.56, "step": 69820, "token_acc": 0.4152046783625731, "train_speed(iter/s)": 1.437752 }, { "epoch": 2.991517072961741, "grad_norm": 6.022837162017822, "learning_rate": 3.480282417243446e-05, "loss": 2.21358585357666, "memory(GiB)": 77.56, "step": 69825, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.437774 }, { "epoch": 2.99173128829099, "grad_norm": 5.195446014404297, "learning_rate": 3.479641292226311e-05, "loss": 2.460297393798828, "memory(GiB)": 77.56, "step": 69830, "token_acc": 0.4649122807017544, "train_speed(iter/s)": 1.437764 }, { "epoch": 2.991945503620239, "grad_norm": 5.995636940002441, "learning_rate": 3.479000194751899e-05, "loss": 2.3766172409057615, "memory(GiB)": 77.56, "step": 69835, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.437752 }, { "epoch": 2.992159718949488, "grad_norm": 4.970757484436035, "learning_rate": 3.478359124831827e-05, "loss": 2.352031135559082, "memory(GiB)": 77.56, "step": 69840, "token_acc": 0.5418060200668896, "train_speed(iter/s)": 1.437754 }, { "epoch": 2.9923739342787368, "grad_norm": 4.864602565765381, "learning_rate": 3.4777180824777057e-05, "loss": 2.4529241561889648, "memory(GiB)": 77.56, "step": 69845, "token_acc": 0.46449704142011833, "train_speed(iter/s)": 1.437758 }, { "epoch": 2.992588149607986, "grad_norm": 6.022401809692383, "learning_rate": 3.477077067701149e-05, "loss": 2.3812414169311524, "memory(GiB)": 77.56, "step": 69850, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.437757 }, { "epoch": 2.992802364937235, "grad_norm": 4.516592502593994, "learning_rate": 3.476436080513771e-05, "loss": 2.2282077789306642, "memory(GiB)": 77.56, "step": 69855, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 1.437785 }, { "epoch": 2.9930165802664837, "grad_norm": 7.423250675201416, "learning_rate": 3.475795120927181e-05, "loss": 2.2731250762939452, "memory(GiB)": 77.56, "step": 69860, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.43781 }, { "epoch": 2.993230795595733, "grad_norm": 5.776782989501953, "learning_rate": 3.475154188952994e-05, "loss": 2.3110647201538086, "memory(GiB)": 77.56, "step": 69865, "token_acc": 0.501628664495114, "train_speed(iter/s)": 1.437806 }, { "epoch": 2.9934450109249817, "grad_norm": 7.36826229095459, "learning_rate": 3.4745132846028185e-05, "loss": 2.6141178131103517, "memory(GiB)": 77.56, "step": 69870, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.43779 }, { "epoch": 2.9936592262542305, "grad_norm": 4.841373443603516, "learning_rate": 3.473872407888266e-05, "loss": 2.1498823165893555, "memory(GiB)": 77.56, "step": 69875, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 1.437783 }, { "epoch": 2.99387344158348, "grad_norm": 6.328704833984375, "learning_rate": 3.473231558820946e-05, "loss": 2.7730695724487306, "memory(GiB)": 77.56, "step": 69880, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437785 }, { "epoch": 2.9940876569127286, "grad_norm": 4.848683834075928, "learning_rate": 3.472590737412467e-05, "loss": 2.423569107055664, "memory(GiB)": 77.56, "step": 69885, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 1.437802 }, { "epoch": 2.9943018722419774, "grad_norm": 5.444921016693115, "learning_rate": 3.471949943674442e-05, "loss": 2.6714550018310548, "memory(GiB)": 77.56, "step": 69890, "token_acc": 0.47477744807121663, "train_speed(iter/s)": 1.43781 }, { "epoch": 2.9945160875712267, "grad_norm": 5.013216018676758, "learning_rate": 3.471309177618476e-05, "loss": 2.217668151855469, "memory(GiB)": 77.56, "step": 69895, "token_acc": 0.4980694980694981, "train_speed(iter/s)": 1.437793 }, { "epoch": 2.9947303029004755, "grad_norm": 5.5079665184021, "learning_rate": 3.47066843925618e-05, "loss": 2.409965705871582, "memory(GiB)": 77.56, "step": 69900, "token_acc": 0.549407114624506, "train_speed(iter/s)": 1.437764 }, { "epoch": 2.9949445182297243, "grad_norm": 6.3039703369140625, "learning_rate": 3.4700277285991575e-05, "loss": 2.1393192291259764, "memory(GiB)": 77.56, "step": 69905, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.437759 }, { "epoch": 2.9951587335589736, "grad_norm": 4.885385036468506, "learning_rate": 3.469387045659019e-05, "loss": 2.515911865234375, "memory(GiB)": 77.56, "step": 69910, "token_acc": 0.5050505050505051, "train_speed(iter/s)": 1.437747 }, { "epoch": 2.9953729488882224, "grad_norm": 4.702334880828857, "learning_rate": 3.4687463904473716e-05, "loss": 2.3052322387695314, "memory(GiB)": 77.56, "step": 69915, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.43777 }, { "epoch": 2.995587164217471, "grad_norm": 5.00528621673584, "learning_rate": 3.468105762975817e-05, "loss": 2.4896568298339843, "memory(GiB)": 77.56, "step": 69920, "token_acc": 0.4748427672955975, "train_speed(iter/s)": 1.437772 }, { "epoch": 2.9958013795467204, "grad_norm": 6.449862480163574, "learning_rate": 3.467465163255966e-05, "loss": 2.3143749237060547, "memory(GiB)": 77.56, "step": 69925, "token_acc": 0.5207547169811321, "train_speed(iter/s)": 1.437755 }, { "epoch": 2.9960155948759692, "grad_norm": 4.4548492431640625, "learning_rate": 3.46682459129942e-05, "loss": 2.471028518676758, "memory(GiB)": 77.56, "step": 69930, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437788 }, { "epoch": 2.996229810205218, "grad_norm": 5.446771621704102, "learning_rate": 3.466184047117784e-05, "loss": 2.3469188690185545, "memory(GiB)": 77.56, "step": 69935, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.437796 }, { "epoch": 2.9964440255344673, "grad_norm": 5.143713474273682, "learning_rate": 3.4655435307226645e-05, "loss": 2.495412826538086, "memory(GiB)": 77.56, "step": 69940, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.437785 }, { "epoch": 2.996658240863716, "grad_norm": 8.154741287231445, "learning_rate": 3.4649030421256625e-05, "loss": 2.307531547546387, "memory(GiB)": 77.56, "step": 69945, "token_acc": 0.5027932960893855, "train_speed(iter/s)": 1.437782 }, { "epoch": 2.996872456192965, "grad_norm": 4.66604471206665, "learning_rate": 3.464262581338382e-05, "loss": 2.464568519592285, "memory(GiB)": 77.56, "step": 69950, "token_acc": 0.48297213622291024, "train_speed(iter/s)": 1.43778 }, { "epoch": 2.997086671522214, "grad_norm": 5.159629821777344, "learning_rate": 3.463622148372426e-05, "loss": 2.203643798828125, "memory(GiB)": 77.56, "step": 69955, "token_acc": 0.5, "train_speed(iter/s)": 1.43779 }, { "epoch": 2.997300886851463, "grad_norm": 5.51721715927124, "learning_rate": 3.462981743239394e-05, "loss": 2.3900142669677735, "memory(GiB)": 77.56, "step": 69960, "token_acc": 0.47333333333333333, "train_speed(iter/s)": 1.43779 }, { "epoch": 2.997515102180712, "grad_norm": 8.074501991271973, "learning_rate": 3.46234136595089e-05, "loss": 2.333910369873047, "memory(GiB)": 77.56, "step": 69965, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.437786 }, { "epoch": 2.997729317509961, "grad_norm": 7.173497676849365, "learning_rate": 3.461701016518516e-05, "loss": 2.3989498138427736, "memory(GiB)": 77.56, "step": 69970, "token_acc": 0.4730290456431535, "train_speed(iter/s)": 1.43774 }, { "epoch": 2.99794353283921, "grad_norm": 7.386610984802246, "learning_rate": 3.461060694953871e-05, "loss": 2.425118255615234, "memory(GiB)": 77.56, "step": 69975, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.437763 }, { "epoch": 2.9981577481684587, "grad_norm": 5.4531426429748535, "learning_rate": 3.4604204012685546e-05, "loss": 2.3320119857788084, "memory(GiB)": 77.56, "step": 69980, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.437765 }, { "epoch": 2.998371963497708, "grad_norm": 5.652688980102539, "learning_rate": 3.459780135474168e-05, "loss": 2.392618179321289, "memory(GiB)": 77.56, "step": 69985, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.43776 }, { "epoch": 2.9985861788269568, "grad_norm": 5.391223430633545, "learning_rate": 3.4591398975823084e-05, "loss": 2.3509243011474608, "memory(GiB)": 77.56, "step": 69990, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.43778 }, { "epoch": 2.9988003941562056, "grad_norm": 7.125851631164551, "learning_rate": 3.458499687604575e-05, "loss": 2.5453536987304686, "memory(GiB)": 77.56, "step": 69995, "token_acc": 0.4733893557422969, "train_speed(iter/s)": 1.437775 }, { "epoch": 2.999014609485455, "grad_norm": 6.092185974121094, "learning_rate": 3.457859505552565e-05, "loss": 2.4291168212890626, "memory(GiB)": 77.56, "step": 70000, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437756 }, { "epoch": 2.999014609485455, "eval_loss": 2.0748212337493896, "eval_runtime": 14.4906, "eval_samples_per_second": 6.901, "eval_steps_per_second": 6.901, "eval_token_acc": 0.481029810298103, "step": 70000 }, { "epoch": 2.9992288248147037, "grad_norm": 4.501657485961914, "learning_rate": 3.457219351437877e-05, "loss": 2.2379199981689455, "memory(GiB)": 77.56, "step": 70005, "token_acc": 0.47448015122873344, "train_speed(iter/s)": 1.437305 }, { "epoch": 2.9994430401439525, "grad_norm": 5.824315071105957, "learning_rate": 3.456579225272109e-05, "loss": 2.425770378112793, "memory(GiB)": 77.56, "step": 70010, "token_acc": 0.5220588235294118, "train_speed(iter/s)": 1.437324 }, { "epoch": 2.9996572554732017, "grad_norm": 4.7570414543151855, "learning_rate": 3.4559391270668554e-05, "loss": 2.3508453369140625, "memory(GiB)": 77.56, "step": 70015, "token_acc": 0.4876325088339223, "train_speed(iter/s)": 1.437341 }, { "epoch": 2.9998714708024505, "grad_norm": 5.920152187347412, "learning_rate": 3.4552990568337124e-05, "loss": 2.4949527740478517, "memory(GiB)": 77.56, "step": 70020, "token_acc": 0.5, "train_speed(iter/s)": 1.437345 }, { "epoch": 3.0000856861317, "grad_norm": 5.641233921051025, "learning_rate": 3.454659014584277e-05, "loss": 2.28286075592041, "memory(GiB)": 77.56, "step": 70025, "token_acc": 0.5175097276264592, "train_speed(iter/s)": 1.437358 }, { "epoch": 3.0002999014609486, "grad_norm": 4.419128894805908, "learning_rate": 3.4540190003301434e-05, "loss": 2.1704872131347654, "memory(GiB)": 77.56, "step": 70030, "token_acc": 0.541958041958042, "train_speed(iter/s)": 1.437362 }, { "epoch": 3.0005141167901974, "grad_norm": 6.164981365203857, "learning_rate": 3.453379014082902e-05, "loss": 2.470066452026367, "memory(GiB)": 77.56, "step": 70035, "token_acc": 0.51953125, "train_speed(iter/s)": 1.437354 }, { "epoch": 3.0007283321194467, "grad_norm": 4.922651767730713, "learning_rate": 3.452739055854154e-05, "loss": 2.340447998046875, "memory(GiB)": 77.56, "step": 70040, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.437387 }, { "epoch": 3.0009425474486955, "grad_norm": 5.426289081573486, "learning_rate": 3.4520991256554895e-05, "loss": 1.9927356719970704, "memory(GiB)": 77.56, "step": 70045, "token_acc": 0.588, "train_speed(iter/s)": 1.437397 }, { "epoch": 3.0011567627779443, "grad_norm": 7.255398750305176, "learning_rate": 3.4514592234985006e-05, "loss": 2.410924530029297, "memory(GiB)": 77.56, "step": 70050, "token_acc": 0.5, "train_speed(iter/s)": 1.437395 }, { "epoch": 3.0013709781071936, "grad_norm": 5.883856296539307, "learning_rate": 3.4508193493947816e-05, "loss": 2.414633369445801, "memory(GiB)": 77.56, "step": 70055, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.437379 }, { "epoch": 3.0015851934364424, "grad_norm": 5.694874286651611, "learning_rate": 3.4501795033559224e-05, "loss": 2.5532037734985353, "memory(GiB)": 77.56, "step": 70060, "token_acc": 0.4708029197080292, "train_speed(iter/s)": 1.437395 }, { "epoch": 3.001799408765691, "grad_norm": 4.599838733673096, "learning_rate": 3.449539685393516e-05, "loss": 2.2517772674560548, "memory(GiB)": 77.56, "step": 70065, "token_acc": 0.5348837209302325, "train_speed(iter/s)": 1.437393 }, { "epoch": 3.0020136240949404, "grad_norm": 4.233546733856201, "learning_rate": 3.448899895519152e-05, "loss": 2.3383678436279296, "memory(GiB)": 77.56, "step": 70070, "token_acc": 0.49866666666666665, "train_speed(iter/s)": 1.437388 }, { "epoch": 3.0022278394241892, "grad_norm": 5.9302659034729, "learning_rate": 3.448260133744422e-05, "loss": 2.395679473876953, "memory(GiB)": 77.56, "step": 70075, "token_acc": 0.4588607594936709, "train_speed(iter/s)": 1.437397 }, { "epoch": 3.002442054753438, "grad_norm": 5.467451572418213, "learning_rate": 3.4476204000809145e-05, "loss": 2.3761524200439452, "memory(GiB)": 77.56, "step": 70080, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.437381 }, { "epoch": 3.0026562700826873, "grad_norm": 4.680141925811768, "learning_rate": 3.446980694540221e-05, "loss": 2.249062156677246, "memory(GiB)": 77.56, "step": 70085, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.437393 }, { "epoch": 3.002870485411936, "grad_norm": 6.755695819854736, "learning_rate": 3.4463410171339275e-05, "loss": 2.548707389831543, "memory(GiB)": 77.56, "step": 70090, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.437402 }, { "epoch": 3.003084700741185, "grad_norm": 5.0909600257873535, "learning_rate": 3.445701367873625e-05, "loss": 2.396769332885742, "memory(GiB)": 77.56, "step": 70095, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.437385 }, { "epoch": 3.003298916070434, "grad_norm": 6.153361797332764, "learning_rate": 3.445061746770901e-05, "loss": 1.972971534729004, "memory(GiB)": 77.56, "step": 70100, "token_acc": 0.5528169014084507, "train_speed(iter/s)": 1.437398 }, { "epoch": 3.003513131399683, "grad_norm": 6.290983200073242, "learning_rate": 3.444422153837339e-05, "loss": 2.1196720123291017, "memory(GiB)": 77.56, "step": 70105, "token_acc": 0.5328947368421053, "train_speed(iter/s)": 1.437405 }, { "epoch": 3.003727346728932, "grad_norm": 6.0566301345825195, "learning_rate": 3.443782589084531e-05, "loss": 2.595070648193359, "memory(GiB)": 77.56, "step": 70110, "token_acc": 0.45396825396825397, "train_speed(iter/s)": 1.437404 }, { "epoch": 3.003941562058181, "grad_norm": 4.441219329833984, "learning_rate": 3.443143052524062e-05, "loss": 2.3609437942504883, "memory(GiB)": 77.56, "step": 70115, "token_acc": 0.47413793103448276, "train_speed(iter/s)": 1.437402 }, { "epoch": 3.00415577738743, "grad_norm": 4.946651458740234, "learning_rate": 3.4425035441675165e-05, "loss": 2.354739952087402, "memory(GiB)": 77.56, "step": 70120, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.43737 }, { "epoch": 3.0043699927166787, "grad_norm": 5.974501609802246, "learning_rate": 3.441864064026479e-05, "loss": 2.1505544662475584, "memory(GiB)": 77.56, "step": 70125, "token_acc": 0.512, "train_speed(iter/s)": 1.437372 }, { "epoch": 3.004584208045928, "grad_norm": 7.834686279296875, "learning_rate": 3.441224612112538e-05, "loss": 2.222279930114746, "memory(GiB)": 77.56, "step": 70130, "token_acc": 0.4952681388012618, "train_speed(iter/s)": 1.437379 }, { "epoch": 3.0047984233751768, "grad_norm": 5.017378807067871, "learning_rate": 3.440585188437273e-05, "loss": 2.4043447494506838, "memory(GiB)": 77.56, "step": 70135, "token_acc": 0.5034722222222222, "train_speed(iter/s)": 1.437394 }, { "epoch": 3.0050126387044256, "grad_norm": 5.082510948181152, "learning_rate": 3.439945793012272e-05, "loss": 2.595010757446289, "memory(GiB)": 77.56, "step": 70140, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437385 }, { "epoch": 3.005226854033675, "grad_norm": 5.418440341949463, "learning_rate": 3.439306425849116e-05, "loss": 2.23193244934082, "memory(GiB)": 77.56, "step": 70145, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.437377 }, { "epoch": 3.0054410693629237, "grad_norm": 6.121163368225098, "learning_rate": 3.438667086959388e-05, "loss": 2.2608058929443358, "memory(GiB)": 77.56, "step": 70150, "token_acc": 0.5261194029850746, "train_speed(iter/s)": 1.437352 }, { "epoch": 3.0056552846921725, "grad_norm": 5.88928747177124, "learning_rate": 3.438027776354671e-05, "loss": 3.0113218307495115, "memory(GiB)": 77.56, "step": 70155, "token_acc": 0.4492307692307692, "train_speed(iter/s)": 1.437336 }, { "epoch": 3.0058695000214217, "grad_norm": 7.035855293273926, "learning_rate": 3.437388494046545e-05, "loss": 2.4730562210083007, "memory(GiB)": 77.56, "step": 70160, "token_acc": 0.45985401459854014, "train_speed(iter/s)": 1.437342 }, { "epoch": 3.0060837153506705, "grad_norm": 4.201283931732178, "learning_rate": 3.4367492400465926e-05, "loss": 2.0827741622924805, "memory(GiB)": 77.56, "step": 70165, "token_acc": 0.5680272108843537, "train_speed(iter/s)": 1.43735 }, { "epoch": 3.0062979306799194, "grad_norm": 5.9304633140563965, "learning_rate": 3.436110014366395e-05, "loss": 2.1160346984863283, "memory(GiB)": 77.56, "step": 70170, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.437367 }, { "epoch": 3.0065121460091686, "grad_norm": 3.9505488872528076, "learning_rate": 3.4354708170175296e-05, "loss": 2.3902923583984377, "memory(GiB)": 77.56, "step": 70175, "token_acc": 0.5222222222222223, "train_speed(iter/s)": 1.437393 }, { "epoch": 3.0067263613384174, "grad_norm": 6.202552318572998, "learning_rate": 3.4348316480115786e-05, "loss": 2.3049552917480467, "memory(GiB)": 77.56, "step": 70180, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.437374 }, { "epoch": 3.0069405766676662, "grad_norm": 5.326395511627197, "learning_rate": 3.4341925073601206e-05, "loss": 2.3128583908081053, "memory(GiB)": 77.56, "step": 70185, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.437366 }, { "epoch": 3.0071547919969155, "grad_norm": 6.259341716766357, "learning_rate": 3.433553395074735e-05, "loss": 2.373114585876465, "memory(GiB)": 77.56, "step": 70190, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 1.437388 }, { "epoch": 3.0073690073261643, "grad_norm": 7.973504066467285, "learning_rate": 3.432914311166998e-05, "loss": 2.333552360534668, "memory(GiB)": 77.56, "step": 70195, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.437408 }, { "epoch": 3.007583222655413, "grad_norm": 6.080653667449951, "learning_rate": 3.4322752556484896e-05, "loss": 2.4298679351806642, "memory(GiB)": 77.56, "step": 70200, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.437421 }, { "epoch": 3.0077974379846624, "grad_norm": 7.564701080322266, "learning_rate": 3.4316362285307854e-05, "loss": 2.389156150817871, "memory(GiB)": 77.56, "step": 70205, "token_acc": 0.47572815533980584, "train_speed(iter/s)": 1.437427 }, { "epoch": 3.008011653313911, "grad_norm": 5.251187801361084, "learning_rate": 3.430997229825462e-05, "loss": 2.404718017578125, "memory(GiB)": 77.56, "step": 70210, "token_acc": 0.4786885245901639, "train_speed(iter/s)": 1.437402 }, { "epoch": 3.00822586864316, "grad_norm": 5.972806453704834, "learning_rate": 3.430358259544095e-05, "loss": 2.184909439086914, "memory(GiB)": 77.56, "step": 70215, "token_acc": 0.531986531986532, "train_speed(iter/s)": 1.43741 }, { "epoch": 3.0084400839724093, "grad_norm": 5.10725736618042, "learning_rate": 3.429719317698262e-05, "loss": 2.4998268127441405, "memory(GiB)": 77.56, "step": 70220, "token_acc": 0.48578199052132703, "train_speed(iter/s)": 1.437406 }, { "epoch": 3.008654299301658, "grad_norm": 4.5519514083862305, "learning_rate": 3.429080404299535e-05, "loss": 2.054467964172363, "memory(GiB)": 77.56, "step": 70225, "token_acc": 0.48466257668711654, "train_speed(iter/s)": 1.437403 }, { "epoch": 3.008868514630907, "grad_norm": 5.849546909332275, "learning_rate": 3.428441519359491e-05, "loss": 2.0395259857177734, "memory(GiB)": 77.56, "step": 70230, "token_acc": 0.5506072874493927, "train_speed(iter/s)": 1.43741 }, { "epoch": 3.009082729960156, "grad_norm": 6.683048248291016, "learning_rate": 3.4278026628897024e-05, "loss": 2.5393098831176757, "memory(GiB)": 77.56, "step": 70235, "token_acc": 0.5014925373134328, "train_speed(iter/s)": 1.437416 }, { "epoch": 3.009296945289405, "grad_norm": 6.5449018478393555, "learning_rate": 3.427163834901744e-05, "loss": 2.19445915222168, "memory(GiB)": 77.56, "step": 70240, "token_acc": 0.5517241379310345, "train_speed(iter/s)": 1.43744 }, { "epoch": 3.0095111606186538, "grad_norm": 6.474233150482178, "learning_rate": 3.426525035407189e-05, "loss": 2.316154670715332, "memory(GiB)": 77.56, "step": 70245, "token_acc": 0.5054151624548736, "train_speed(iter/s)": 1.437451 }, { "epoch": 3.009725375947903, "grad_norm": 8.122316360473633, "learning_rate": 3.425886264417607e-05, "loss": 2.3829437255859376, "memory(GiB)": 77.56, "step": 70250, "token_acc": 0.4591194968553459, "train_speed(iter/s)": 1.437456 }, { "epoch": 3.009939591277152, "grad_norm": 7.372654438018799, "learning_rate": 3.425247521944572e-05, "loss": 2.374039649963379, "memory(GiB)": 77.56, "step": 70255, "token_acc": 0.5, "train_speed(iter/s)": 1.437489 }, { "epoch": 3.0101538066064006, "grad_norm": 13.866686820983887, "learning_rate": 3.424608807999656e-05, "loss": 2.520269203186035, "memory(GiB)": 77.56, "step": 70260, "token_acc": 0.5034722222222222, "train_speed(iter/s)": 1.437508 }, { "epoch": 3.01036802193565, "grad_norm": 8.366415977478027, "learning_rate": 3.423970122594431e-05, "loss": 2.223109245300293, "memory(GiB)": 77.56, "step": 70265, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.437485 }, { "epoch": 3.0105822372648987, "grad_norm": 4.910088062286377, "learning_rate": 3.423331465740463e-05, "loss": 2.357571029663086, "memory(GiB)": 77.56, "step": 70270, "token_acc": 0.5090361445783133, "train_speed(iter/s)": 1.437482 }, { "epoch": 3.0107964525941475, "grad_norm": 4.795945167541504, "learning_rate": 3.4226928374493275e-05, "loss": 2.013970947265625, "memory(GiB)": 77.56, "step": 70275, "token_acc": 0.5360501567398119, "train_speed(iter/s)": 1.437463 }, { "epoch": 3.0110106679233968, "grad_norm": 6.291277885437012, "learning_rate": 3.422054237732588e-05, "loss": 2.191616249084473, "memory(GiB)": 77.56, "step": 70280, "token_acc": 0.5188284518828452, "train_speed(iter/s)": 1.437484 }, { "epoch": 3.0112248832526456, "grad_norm": 7.073383808135986, "learning_rate": 3.421415666601818e-05, "loss": 2.198455810546875, "memory(GiB)": 77.56, "step": 70285, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.4375 }, { "epoch": 3.0114390985818944, "grad_norm": 5.46431827545166, "learning_rate": 3.420777124068584e-05, "loss": 2.4840248107910154, "memory(GiB)": 77.56, "step": 70290, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.437524 }, { "epoch": 3.0116533139111437, "grad_norm": 6.905582904815674, "learning_rate": 3.4201386101444524e-05, "loss": 2.1254100799560547, "memory(GiB)": 77.56, "step": 70295, "token_acc": 0.5377049180327869, "train_speed(iter/s)": 1.437524 }, { "epoch": 3.0118675292403925, "grad_norm": 6.294972896575928, "learning_rate": 3.419500124840994e-05, "loss": 2.311610221862793, "memory(GiB)": 77.56, "step": 70300, "token_acc": 0.5183098591549296, "train_speed(iter/s)": 1.437521 }, { "epoch": 3.0120817445696413, "grad_norm": 6.419466018676758, "learning_rate": 3.418861668169773e-05, "loss": 2.239666748046875, "memory(GiB)": 77.56, "step": 70305, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.437513 }, { "epoch": 3.0122959598988905, "grad_norm": 5.778848171234131, "learning_rate": 3.4182232401423544e-05, "loss": 2.308125686645508, "memory(GiB)": 77.56, "step": 70310, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.437482 }, { "epoch": 3.0125101752281394, "grad_norm": 8.681206703186035, "learning_rate": 3.417584840770307e-05, "loss": 2.50396842956543, "memory(GiB)": 77.56, "step": 70315, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.437499 }, { "epoch": 3.012724390557388, "grad_norm": 6.515965461730957, "learning_rate": 3.416946470065193e-05, "loss": 2.5161258697509767, "memory(GiB)": 77.56, "step": 70320, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437507 }, { "epoch": 3.0129386058866374, "grad_norm": 5.736940860748291, "learning_rate": 3.416308128038581e-05, "loss": 2.4026493072509765, "memory(GiB)": 77.56, "step": 70325, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437519 }, { "epoch": 3.0131528212158862, "grad_norm": 5.693610191345215, "learning_rate": 3.415669814702031e-05, "loss": 2.4020872116088867, "memory(GiB)": 77.56, "step": 70330, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.437525 }, { "epoch": 3.013367036545135, "grad_norm": 6.333526134490967, "learning_rate": 3.415031530067109e-05, "loss": 2.2822597503662108, "memory(GiB)": 77.56, "step": 70335, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437535 }, { "epoch": 3.0135812518743843, "grad_norm": 7.3658857345581055, "learning_rate": 3.414393274145378e-05, "loss": 2.291658401489258, "memory(GiB)": 77.56, "step": 70340, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.437544 }, { "epoch": 3.013795467203633, "grad_norm": 5.9387102127075195, "learning_rate": 3.4137550469484e-05, "loss": 2.5541927337646486, "memory(GiB)": 77.56, "step": 70345, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.4375 }, { "epoch": 3.014009682532882, "grad_norm": 6.346806049346924, "learning_rate": 3.4131168484877385e-05, "loss": 2.3148178100585937, "memory(GiB)": 77.56, "step": 70350, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437517 }, { "epoch": 3.014223897862131, "grad_norm": 5.464722156524658, "learning_rate": 3.412478678774952e-05, "loss": 2.59960823059082, "memory(GiB)": 77.56, "step": 70355, "token_acc": 0.42727272727272725, "train_speed(iter/s)": 1.437539 }, { "epoch": 3.01443811319138, "grad_norm": 5.355817794799805, "learning_rate": 3.4118405378216056e-05, "loss": 2.1365318298339844, "memory(GiB)": 77.56, "step": 70360, "token_acc": 0.5390070921985816, "train_speed(iter/s)": 1.437528 }, { "epoch": 3.014652328520629, "grad_norm": 6.916172504425049, "learning_rate": 3.411202425639257e-05, "loss": 1.8687414169311523, "memory(GiB)": 77.56, "step": 70365, "token_acc": 0.55078125, "train_speed(iter/s)": 1.437526 }, { "epoch": 3.014866543849878, "grad_norm": 5.728539943695068, "learning_rate": 3.410564342239466e-05, "loss": 2.389829635620117, "memory(GiB)": 77.56, "step": 70370, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.437542 }, { "epoch": 3.015080759179127, "grad_norm": 9.179864883422852, "learning_rate": 3.409926287633793e-05, "loss": 2.2522197723388673, "memory(GiB)": 77.56, "step": 70375, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.437567 }, { "epoch": 3.0152949745083757, "grad_norm": 5.689191818237305, "learning_rate": 3.409288261833797e-05, "loss": 2.503635597229004, "memory(GiB)": 77.56, "step": 70380, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.437582 }, { "epoch": 3.015509189837625, "grad_norm": 6.755014896392822, "learning_rate": 3.408650264851038e-05, "loss": 2.354074478149414, "memory(GiB)": 77.56, "step": 70385, "token_acc": 0.5396825396825397, "train_speed(iter/s)": 1.437597 }, { "epoch": 3.0157234051668738, "grad_norm": 5.1933698654174805, "learning_rate": 3.4080122966970705e-05, "loss": 2.2398874282836916, "memory(GiB)": 77.56, "step": 70390, "token_acc": 0.5103448275862069, "train_speed(iter/s)": 1.437562 }, { "epoch": 3.0159376204961226, "grad_norm": 4.797146797180176, "learning_rate": 3.4073743573834535e-05, "loss": 2.3041778564453126, "memory(GiB)": 77.56, "step": 70395, "token_acc": 0.4651898734177215, "train_speed(iter/s)": 1.437579 }, { "epoch": 3.016151835825372, "grad_norm": 6.0858588218688965, "learning_rate": 3.406736446921743e-05, "loss": 2.4038543701171875, "memory(GiB)": 77.56, "step": 70400, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.437605 }, { "epoch": 3.0163660511546206, "grad_norm": 4.796118259429932, "learning_rate": 3.406098565323499e-05, "loss": 2.3136032104492186, "memory(GiB)": 77.56, "step": 70405, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.437604 }, { "epoch": 3.0165802664838695, "grad_norm": 5.765377521514893, "learning_rate": 3.4054607126002734e-05, "loss": 2.2307804107666014, "memory(GiB)": 77.56, "step": 70410, "token_acc": 0.5103857566765578, "train_speed(iter/s)": 1.437615 }, { "epoch": 3.0167944818131187, "grad_norm": 6.051815509796143, "learning_rate": 3.404822888763623e-05, "loss": 2.281178855895996, "memory(GiB)": 77.56, "step": 70415, "token_acc": 0.5376712328767124, "train_speed(iter/s)": 1.437634 }, { "epoch": 3.0170086971423675, "grad_norm": 7.341418266296387, "learning_rate": 3.404185093825103e-05, "loss": 2.264250373840332, "memory(GiB)": 77.56, "step": 70420, "token_acc": 0.4676258992805755, "train_speed(iter/s)": 1.437648 }, { "epoch": 3.0172229124716163, "grad_norm": 7.620987415313721, "learning_rate": 3.403547327796266e-05, "loss": 2.4151514053344725, "memory(GiB)": 77.56, "step": 70425, "token_acc": 0.5174825174825175, "train_speed(iter/s)": 1.437627 }, { "epoch": 3.0174371278008656, "grad_norm": 6.364876747131348, "learning_rate": 3.4029095906886676e-05, "loss": 2.0931800842285155, "memory(GiB)": 77.56, "step": 70430, "token_acc": 0.583916083916084, "train_speed(iter/s)": 1.43763 }, { "epoch": 3.0176513431301144, "grad_norm": 5.987843036651611, "learning_rate": 3.40227188251386e-05, "loss": 2.36295108795166, "memory(GiB)": 77.56, "step": 70435, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.437652 }, { "epoch": 3.017865558459363, "grad_norm": 5.084464073181152, "learning_rate": 3.401634203283395e-05, "loss": 2.233847999572754, "memory(GiB)": 77.56, "step": 70440, "token_acc": 0.5326797385620915, "train_speed(iter/s)": 1.437659 }, { "epoch": 3.0180797737886125, "grad_norm": 4.420731544494629, "learning_rate": 3.4009965530088274e-05, "loss": 2.244358253479004, "memory(GiB)": 77.56, "step": 70445, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.437687 }, { "epoch": 3.0182939891178613, "grad_norm": 6.721184730529785, "learning_rate": 3.4003589317017074e-05, "loss": 2.3993999481201174, "memory(GiB)": 77.56, "step": 70450, "token_acc": 0.43703703703703706, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.01850820444711, "grad_norm": 5.2902069091796875, "learning_rate": 3.399721339373584e-05, "loss": 1.9199289321899413, "memory(GiB)": 77.56, "step": 70455, "token_acc": 0.5253164556962026, "train_speed(iter/s)": 1.437667 }, { "epoch": 3.0187224197763594, "grad_norm": 5.390171051025391, "learning_rate": 3.3990837760360106e-05, "loss": 2.3308927536010744, "memory(GiB)": 77.56, "step": 70460, "token_acc": 0.5303867403314917, "train_speed(iter/s)": 1.437681 }, { "epoch": 3.018936635105608, "grad_norm": 5.8123698234558105, "learning_rate": 3.398446241700536e-05, "loss": 2.1574642181396486, "memory(GiB)": 77.56, "step": 70465, "token_acc": 0.5340136054421769, "train_speed(iter/s)": 1.437674 }, { "epoch": 3.019150850434857, "grad_norm": 5.640305042266846, "learning_rate": 3.397808736378711e-05, "loss": 2.6134563446044923, "memory(GiB)": 77.56, "step": 70470, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.437685 }, { "epoch": 3.0193650657641062, "grad_norm": 6.796550750732422, "learning_rate": 3.397171260082082e-05, "loss": 2.414898681640625, "memory(GiB)": 77.56, "step": 70475, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.437662 }, { "epoch": 3.019579281093355, "grad_norm": 5.607170581817627, "learning_rate": 3.3965338128222e-05, "loss": 2.606097412109375, "memory(GiB)": 77.56, "step": 70480, "token_acc": 0.4851190476190476, "train_speed(iter/s)": 1.437678 }, { "epoch": 3.019793496422604, "grad_norm": 6.351754665374756, "learning_rate": 3.3958963946106126e-05, "loss": 2.2404232025146484, "memory(GiB)": 77.56, "step": 70485, "token_acc": 0.4780701754385965, "train_speed(iter/s)": 1.437673 }, { "epoch": 3.020007711751853, "grad_norm": 5.325263500213623, "learning_rate": 3.3952590054588675e-05, "loss": 2.297855567932129, "memory(GiB)": 77.56, "step": 70490, "token_acc": 0.5342465753424658, "train_speed(iter/s)": 1.437675 }, { "epoch": 3.020221927081102, "grad_norm": 6.447782039642334, "learning_rate": 3.394621645378511e-05, "loss": 2.354689598083496, "memory(GiB)": 77.56, "step": 70495, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.437672 }, { "epoch": 3.0204361424103507, "grad_norm": 4.932530879974365, "learning_rate": 3.3939843143810874e-05, "loss": 2.6362049102783205, "memory(GiB)": 77.56, "step": 70500, "token_acc": 0.528052805280528, "train_speed(iter/s)": 1.437681 }, { "epoch": 3.0204361424103507, "eval_loss": 2.2287631034851074, "eval_runtime": 15.0191, "eval_samples_per_second": 6.658, "eval_steps_per_second": 6.658, "eval_token_acc": 0.4685408299866131, "step": 70500 }, { "epoch": 3.0206503577396, "grad_norm": 5.360347270965576, "learning_rate": 3.3933470124781466e-05, "loss": 2.4924240112304688, "memory(GiB)": 77.56, "step": 70505, "token_acc": 0.47696737044145876, "train_speed(iter/s)": 1.43721 }, { "epoch": 3.020864573068849, "grad_norm": 5.935469627380371, "learning_rate": 3.3927097396812304e-05, "loss": 1.7939041137695313, "memory(GiB)": 77.56, "step": 70510, "token_acc": 0.576271186440678, "train_speed(iter/s)": 1.437218 }, { "epoch": 3.0210787883980976, "grad_norm": 6.828772068023682, "learning_rate": 3.3920724960018856e-05, "loss": 2.6675512313842775, "memory(GiB)": 77.56, "step": 70515, "token_acc": 0.48828125, "train_speed(iter/s)": 1.437235 }, { "epoch": 3.021293003727347, "grad_norm": 5.44118070602417, "learning_rate": 3.3914352814516556e-05, "loss": 2.379140090942383, "memory(GiB)": 77.56, "step": 70520, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.437224 }, { "epoch": 3.0215072190565957, "grad_norm": 5.2825751304626465, "learning_rate": 3.390798096042085e-05, "loss": 2.429238510131836, "memory(GiB)": 77.56, "step": 70525, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 1.437237 }, { "epoch": 3.0217214343858445, "grad_norm": 6.980903148651123, "learning_rate": 3.390160939784716e-05, "loss": 2.152998352050781, "memory(GiB)": 77.56, "step": 70530, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.437261 }, { "epoch": 3.0219356497150938, "grad_norm": 5.881457328796387, "learning_rate": 3.3895238126910924e-05, "loss": 2.1978784561157227, "memory(GiB)": 77.56, "step": 70535, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.437259 }, { "epoch": 3.0221498650443426, "grad_norm": 4.331933975219727, "learning_rate": 3.388886714772754e-05, "loss": 2.080428123474121, "memory(GiB)": 77.56, "step": 70540, "token_acc": 0.5124223602484472, "train_speed(iter/s)": 1.437276 }, { "epoch": 3.0223640803735914, "grad_norm": 4.232517242431641, "learning_rate": 3.388249646041246e-05, "loss": 2.3634016036987306, "memory(GiB)": 77.56, "step": 70545, "token_acc": 0.5292096219931272, "train_speed(iter/s)": 1.437273 }, { "epoch": 3.0225782957028406, "grad_norm": 6.706959247589111, "learning_rate": 3.387612606508105e-05, "loss": 2.637325668334961, "memory(GiB)": 77.56, "step": 70550, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.437265 }, { "epoch": 3.0227925110320895, "grad_norm": 6.284237861633301, "learning_rate": 3.386975596184877e-05, "loss": 2.4386037826538085, "memory(GiB)": 77.56, "step": 70555, "token_acc": 0.49185667752442996, "train_speed(iter/s)": 1.43726 }, { "epoch": 3.0230067263613383, "grad_norm": 5.004030227661133, "learning_rate": 3.3863386150830974e-05, "loss": 2.47908992767334, "memory(GiB)": 77.56, "step": 70560, "token_acc": 0.5145228215767634, "train_speed(iter/s)": 1.437269 }, { "epoch": 3.0232209416905875, "grad_norm": 5.973959445953369, "learning_rate": 3.385701663214309e-05, "loss": 2.30418701171875, "memory(GiB)": 77.56, "step": 70565, "token_acc": 0.5257731958762887, "train_speed(iter/s)": 1.437271 }, { "epoch": 3.0234351570198363, "grad_norm": 10.449799537658691, "learning_rate": 3.385064740590048e-05, "loss": 2.459584617614746, "memory(GiB)": 77.56, "step": 70570, "token_acc": 0.4819672131147541, "train_speed(iter/s)": 1.437288 }, { "epoch": 3.023649372349085, "grad_norm": 5.968154430389404, "learning_rate": 3.384427847221855e-05, "loss": 2.3906455993652345, "memory(GiB)": 77.56, "step": 70575, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.437293 }, { "epoch": 3.0238635876783344, "grad_norm": 5.551918029785156, "learning_rate": 3.383790983121267e-05, "loss": 2.3442138671875, "memory(GiB)": 77.56, "step": 70580, "token_acc": 0.4956521739130435, "train_speed(iter/s)": 1.437322 }, { "epoch": 3.024077803007583, "grad_norm": 6.067318439483643, "learning_rate": 3.3831541482998205e-05, "loss": 1.964788055419922, "memory(GiB)": 77.56, "step": 70585, "token_acc": 0.558303886925795, "train_speed(iter/s)": 1.437339 }, { "epoch": 3.024292018336832, "grad_norm": 5.7133283615112305, "learning_rate": 3.3825173427690546e-05, "loss": 2.325041580200195, "memory(GiB)": 77.56, "step": 70590, "token_acc": 0.540625, "train_speed(iter/s)": 1.437366 }, { "epoch": 3.0245062336660813, "grad_norm": 5.407682418823242, "learning_rate": 3.381880566540505e-05, "loss": 2.341222381591797, "memory(GiB)": 77.56, "step": 70595, "token_acc": 0.5248447204968945, "train_speed(iter/s)": 1.437363 }, { "epoch": 3.02472044899533, "grad_norm": 7.491780757904053, "learning_rate": 3.381243819625705e-05, "loss": 2.2960657119750976, "memory(GiB)": 77.56, "step": 70600, "token_acc": 0.5314685314685315, "train_speed(iter/s)": 1.437363 }, { "epoch": 3.024934664324579, "grad_norm": 4.596776485443115, "learning_rate": 3.380607102036193e-05, "loss": 2.2798250198364256, "memory(GiB)": 77.56, "step": 70605, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.437356 }, { "epoch": 3.025148879653828, "grad_norm": 6.106468200683594, "learning_rate": 3.3799704137835014e-05, "loss": 2.1350807189941405, "memory(GiB)": 77.56, "step": 70610, "token_acc": 0.5119453924914675, "train_speed(iter/s)": 1.43732 }, { "epoch": 3.025363094983077, "grad_norm": 5.095737457275391, "learning_rate": 3.3793337548791666e-05, "loss": 2.242348289489746, "memory(GiB)": 77.56, "step": 70615, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.437313 }, { "epoch": 3.025577310312326, "grad_norm": 5.405594348907471, "learning_rate": 3.378697125334721e-05, "loss": 2.3909561157226564, "memory(GiB)": 77.56, "step": 70620, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 1.437328 }, { "epoch": 3.025791525641575, "grad_norm": 6.876720428466797, "learning_rate": 3.378060525161696e-05, "loss": 2.1319761276245117, "memory(GiB)": 77.56, "step": 70625, "token_acc": 0.5096525096525096, "train_speed(iter/s)": 1.437351 }, { "epoch": 3.026005740970824, "grad_norm": 5.188251495361328, "learning_rate": 3.377423954371627e-05, "loss": 2.487404632568359, "memory(GiB)": 77.56, "step": 70630, "token_acc": 0.47491638795986624, "train_speed(iter/s)": 1.437344 }, { "epoch": 3.0262199563000727, "grad_norm": 4.916665077209473, "learning_rate": 3.3767874129760455e-05, "loss": 2.3435245513916017, "memory(GiB)": 77.56, "step": 70635, "token_acc": 0.5377049180327869, "train_speed(iter/s)": 1.437344 }, { "epoch": 3.026434171629322, "grad_norm": 5.324584484100342, "learning_rate": 3.3761509009864834e-05, "loss": 2.0037839889526365, "memory(GiB)": 77.56, "step": 70640, "token_acc": 0.5693215339233039, "train_speed(iter/s)": 1.43736 }, { "epoch": 3.0266483869585707, "grad_norm": 5.0959930419921875, "learning_rate": 3.3755144184144704e-05, "loss": 2.244406890869141, "memory(GiB)": 77.56, "step": 70645, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.437368 }, { "epoch": 3.0268626022878196, "grad_norm": 5.386826515197754, "learning_rate": 3.374877965271538e-05, "loss": 2.1589298248291016, "memory(GiB)": 77.56, "step": 70650, "token_acc": 0.5177304964539007, "train_speed(iter/s)": 1.437367 }, { "epoch": 3.027076817617069, "grad_norm": 5.784743309020996, "learning_rate": 3.374241541569216e-05, "loss": 2.434076690673828, "memory(GiB)": 77.56, "step": 70655, "token_acc": 0.48753462603878117, "train_speed(iter/s)": 1.43736 }, { "epoch": 3.0272910329463176, "grad_norm": 7.675990104675293, "learning_rate": 3.3736051473190324e-05, "loss": 2.4441043853759767, "memory(GiB)": 77.56, "step": 70660, "token_acc": 0.48828125, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.0275052482755664, "grad_norm": 6.093672275543213, "learning_rate": 3.372968782532519e-05, "loss": 2.062820816040039, "memory(GiB)": 77.56, "step": 70665, "token_acc": 0.5138461538461538, "train_speed(iter/s)": 1.437381 }, { "epoch": 3.0277194636048157, "grad_norm": 7.627948760986328, "learning_rate": 3.3723324472212e-05, "loss": 2.190084457397461, "memory(GiB)": 77.56, "step": 70670, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.437359 }, { "epoch": 3.0279336789340645, "grad_norm": 5.10483980178833, "learning_rate": 3.371696141396608e-05, "loss": 2.5277629852294923, "memory(GiB)": 77.56, "step": 70675, "token_acc": 0.4560260586319218, "train_speed(iter/s)": 1.43737 }, { "epoch": 3.0281478942633133, "grad_norm": 5.51690673828125, "learning_rate": 3.371059865070266e-05, "loss": 2.3362171173095705, "memory(GiB)": 77.56, "step": 70680, "token_acc": 0.5059288537549407, "train_speed(iter/s)": 1.437388 }, { "epoch": 3.0283621095925626, "grad_norm": 6.007562160491943, "learning_rate": 3.370423618253703e-05, "loss": 2.424604606628418, "memory(GiB)": 77.56, "step": 70685, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.437393 }, { "epoch": 3.0285763249218114, "grad_norm": 11.194559097290039, "learning_rate": 3.369787400958446e-05, "loss": 2.4155141830444338, "memory(GiB)": 77.56, "step": 70690, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.437376 }, { "epoch": 3.02879054025106, "grad_norm": 7.8410468101501465, "learning_rate": 3.369151213196019e-05, "loss": 2.267900085449219, "memory(GiB)": 77.56, "step": 70695, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.437389 }, { "epoch": 3.0290047555803095, "grad_norm": 5.812904357910156, "learning_rate": 3.368515054977945e-05, "loss": 2.752921295166016, "memory(GiB)": 77.56, "step": 70700, "token_acc": 0.4622356495468278, "train_speed(iter/s)": 1.43738 }, { "epoch": 3.0292189709095583, "grad_norm": 6.987752437591553, "learning_rate": 3.3678789263157534e-05, "loss": 2.320968246459961, "memory(GiB)": 77.56, "step": 70705, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.437354 }, { "epoch": 3.029433186238807, "grad_norm": 7.077080249786377, "learning_rate": 3.367242827220967e-05, "loss": 2.3458202362060545, "memory(GiB)": 77.56, "step": 70710, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.437356 }, { "epoch": 3.0296474015680563, "grad_norm": 7.908858776092529, "learning_rate": 3.3666067577051064e-05, "loss": 2.483523368835449, "memory(GiB)": 77.56, "step": 70715, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.437372 }, { "epoch": 3.029861616897305, "grad_norm": 6.426355361938477, "learning_rate": 3.3659707177796984e-05, "loss": 2.22983341217041, "memory(GiB)": 77.56, "step": 70720, "token_acc": 0.5234657039711191, "train_speed(iter/s)": 1.437395 }, { "epoch": 3.030075832226554, "grad_norm": 5.172299861907959, "learning_rate": 3.365334707456264e-05, "loss": 2.438453483581543, "memory(GiB)": 77.56, "step": 70725, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.437372 }, { "epoch": 3.0302900475558032, "grad_norm": 5.077765464782715, "learning_rate": 3.364698726746323e-05, "loss": 2.3051753997802735, "memory(GiB)": 77.56, "step": 70730, "token_acc": 0.5298245614035088, "train_speed(iter/s)": 1.437368 }, { "epoch": 3.030504262885052, "grad_norm": 5.132670879364014, "learning_rate": 3.364062775661401e-05, "loss": 2.370410346984863, "memory(GiB)": 77.56, "step": 70735, "token_acc": 0.5311475409836065, "train_speed(iter/s)": 1.437348 }, { "epoch": 3.030718478214301, "grad_norm": 6.580378532409668, "learning_rate": 3.3634268542130146e-05, "loss": 2.4990028381347655, "memory(GiB)": 77.56, "step": 70740, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.437329 }, { "epoch": 3.03093269354355, "grad_norm": 8.148248672485352, "learning_rate": 3.3627909624126864e-05, "loss": 2.4106117248535157, "memory(GiB)": 77.56, "step": 70745, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.437338 }, { "epoch": 3.031146908872799, "grad_norm": 6.455301761627197, "learning_rate": 3.362155100271936e-05, "loss": 2.394194793701172, "memory(GiB)": 77.56, "step": 70750, "token_acc": 0.47041420118343197, "train_speed(iter/s)": 1.437336 }, { "epoch": 3.0313611242020477, "grad_norm": 5.465808391571045, "learning_rate": 3.361519267802281e-05, "loss": 2.0059978485107424, "memory(GiB)": 77.56, "step": 70755, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.437345 }, { "epoch": 3.031575339531297, "grad_norm": 6.298335552215576, "learning_rate": 3.360883465015243e-05, "loss": 2.4253538131713865, "memory(GiB)": 77.56, "step": 70760, "token_acc": 0.522633744855967, "train_speed(iter/s)": 1.437346 }, { "epoch": 3.031789554860546, "grad_norm": 5.976151943206787, "learning_rate": 3.3602476919223393e-05, "loss": 2.414750671386719, "memory(GiB)": 77.56, "step": 70765, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.0320037701897946, "grad_norm": 6.39660120010376, "learning_rate": 3.359611948535084e-05, "loss": 2.361659049987793, "memory(GiB)": 77.56, "step": 70770, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.437389 }, { "epoch": 3.032217985519044, "grad_norm": 5.713761806488037, "learning_rate": 3.358976234864999e-05, "loss": 2.5065639495849608, "memory(GiB)": 77.56, "step": 70775, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437372 }, { "epoch": 3.0324322008482927, "grad_norm": 6.726590156555176, "learning_rate": 3.3583405509235986e-05, "loss": 2.1831663131713865, "memory(GiB)": 77.56, "step": 70780, "token_acc": 0.5177865612648221, "train_speed(iter/s)": 1.437341 }, { "epoch": 3.0326464161775415, "grad_norm": 4.485472202301025, "learning_rate": 3.3577048967224e-05, "loss": 2.03262939453125, "memory(GiB)": 77.56, "step": 70785, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.437346 }, { "epoch": 3.0328606315067907, "grad_norm": 5.325892925262451, "learning_rate": 3.3570692722729167e-05, "loss": 2.202468490600586, "memory(GiB)": 77.56, "step": 70790, "token_acc": 0.5265017667844523, "train_speed(iter/s)": 1.437347 }, { "epoch": 3.0330748468360396, "grad_norm": 7.766354084014893, "learning_rate": 3.3564336775866654e-05, "loss": 2.377099800109863, "memory(GiB)": 77.56, "step": 70795, "token_acc": 0.47161572052401746, "train_speed(iter/s)": 1.437356 }, { "epoch": 3.0332890621652884, "grad_norm": 4.616634845733643, "learning_rate": 3.35579811267516e-05, "loss": 2.4821651458740233, "memory(GiB)": 77.56, "step": 70800, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.437366 }, { "epoch": 3.0335032774945376, "grad_norm": 4.715622901916504, "learning_rate": 3.355162577549913e-05, "loss": 1.9905763626098634, "memory(GiB)": 77.56, "step": 70805, "token_acc": 0.5328185328185329, "train_speed(iter/s)": 1.43738 }, { "epoch": 3.0337174928237864, "grad_norm": 4.912971496582031, "learning_rate": 3.35452707222244e-05, "loss": 2.2994430541992186, "memory(GiB)": 77.56, "step": 70810, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.437395 }, { "epoch": 3.0339317081530353, "grad_norm": 4.222915172576904, "learning_rate": 3.353891596704252e-05, "loss": 2.3573863983154295, "memory(GiB)": 77.56, "step": 70815, "token_acc": 0.5258064516129032, "train_speed(iter/s)": 1.437395 }, { "epoch": 3.0341459234822845, "grad_norm": 4.757033824920654, "learning_rate": 3.353256151006863e-05, "loss": 2.629930877685547, "memory(GiB)": 77.56, "step": 70820, "token_acc": 0.44518272425249167, "train_speed(iter/s)": 1.437393 }, { "epoch": 3.0343601388115333, "grad_norm": 5.285696506500244, "learning_rate": 3.3526207351417825e-05, "loss": 2.3617713928222654, "memory(GiB)": 77.56, "step": 70825, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.437388 }, { "epoch": 3.034574354140782, "grad_norm": 6.275881767272949, "learning_rate": 3.351985349120523e-05, "loss": 2.3016841888427733, "memory(GiB)": 77.56, "step": 70830, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.437381 }, { "epoch": 3.0347885694700314, "grad_norm": 6.186609268188477, "learning_rate": 3.351349992954595e-05, "loss": 2.4427631378173826, "memory(GiB)": 77.56, "step": 70835, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.437345 }, { "epoch": 3.03500278479928, "grad_norm": 4.848016738891602, "learning_rate": 3.350714666655509e-05, "loss": 2.1069488525390625, "memory(GiB)": 77.56, "step": 70840, "token_acc": 0.5821917808219178, "train_speed(iter/s)": 1.437366 }, { "epoch": 3.035217000128529, "grad_norm": 6.821707248687744, "learning_rate": 3.3500793702347716e-05, "loss": 2.554111862182617, "memory(GiB)": 77.56, "step": 70845, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.437352 }, { "epoch": 3.0354312154577783, "grad_norm": 5.950802326202393, "learning_rate": 3.349444103703895e-05, "loss": 2.418069076538086, "memory(GiB)": 77.56, "step": 70850, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 1.437353 }, { "epoch": 3.035645430787027, "grad_norm": 5.721039295196533, "learning_rate": 3.3488088670743886e-05, "loss": 2.1435009002685548, "memory(GiB)": 77.56, "step": 70855, "token_acc": 0.5095541401273885, "train_speed(iter/s)": 1.43737 }, { "epoch": 3.035859646116276, "grad_norm": 5.344554424285889, "learning_rate": 3.3481736603577565e-05, "loss": 2.1546920776367187, "memory(GiB)": 77.56, "step": 70860, "token_acc": 0.555984555984556, "train_speed(iter/s)": 1.437373 }, { "epoch": 3.036073861445525, "grad_norm": 6.1356353759765625, "learning_rate": 3.34753848356551e-05, "loss": 2.4321304321289063, "memory(GiB)": 77.56, "step": 70865, "token_acc": 0.4784172661870504, "train_speed(iter/s)": 1.437384 }, { "epoch": 3.036288076774774, "grad_norm": 5.542874336242676, "learning_rate": 3.3469033367091534e-05, "loss": 2.246772575378418, "memory(GiB)": 77.56, "step": 70870, "token_acc": 0.5303643724696356, "train_speed(iter/s)": 1.437405 }, { "epoch": 3.036502292104023, "grad_norm": 5.28950309753418, "learning_rate": 3.3462682198001925e-05, "loss": 2.7895410537719725, "memory(GiB)": 77.56, "step": 70875, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.437412 }, { "epoch": 3.036716507433272, "grad_norm": 8.909818649291992, "learning_rate": 3.345633132850136e-05, "loss": 2.134360122680664, "memory(GiB)": 77.56, "step": 70880, "token_acc": 0.524822695035461, "train_speed(iter/s)": 1.437411 }, { "epoch": 3.036930722762521, "grad_norm": 6.036808490753174, "learning_rate": 3.344998075870487e-05, "loss": 2.3594997406005858, "memory(GiB)": 77.56, "step": 70885, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.437417 }, { "epoch": 3.0371449380917697, "grad_norm": 7.076190948486328, "learning_rate": 3.344363048872748e-05, "loss": 2.3516359329223633, "memory(GiB)": 77.56, "step": 70890, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.437441 }, { "epoch": 3.037359153421019, "grad_norm": 9.171173095703125, "learning_rate": 3.3437280518684275e-05, "loss": 2.3217939376831054, "memory(GiB)": 77.56, "step": 70895, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.437441 }, { "epoch": 3.0375733687502677, "grad_norm": 4.8570685386657715, "learning_rate": 3.3430930848690266e-05, "loss": 2.5207239151000977, "memory(GiB)": 77.56, "step": 70900, "token_acc": 0.47493403693931396, "train_speed(iter/s)": 1.437459 }, { "epoch": 3.0377875840795165, "grad_norm": 6.4781317710876465, "learning_rate": 3.3424581478860495e-05, "loss": 2.1019229888916016, "memory(GiB)": 77.56, "step": 70905, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.038001799408766, "grad_norm": 5.527780055999756, "learning_rate": 3.3418232409309975e-05, "loss": 2.089214897155762, "memory(GiB)": 77.56, "step": 70910, "token_acc": 0.5444839857651246, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.0382160147380146, "grad_norm": 6.387155055999756, "learning_rate": 3.341188364015372e-05, "loss": 2.4615354537963867, "memory(GiB)": 77.56, "step": 70915, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.437498 }, { "epoch": 3.0384302300672634, "grad_norm": 6.141441345214844, "learning_rate": 3.340553517150674e-05, "loss": 2.3892189025878907, "memory(GiB)": 77.56, "step": 70920, "token_acc": 0.4533898305084746, "train_speed(iter/s)": 1.43751 }, { "epoch": 3.0386444453965127, "grad_norm": 6.831070899963379, "learning_rate": 3.339918700348409e-05, "loss": 2.385594367980957, "memory(GiB)": 77.56, "step": 70925, "token_acc": 0.478125, "train_speed(iter/s)": 1.437534 }, { "epoch": 3.0388586607257615, "grad_norm": 5.203241348266602, "learning_rate": 3.339283913620073e-05, "loss": 2.3997777938842773, "memory(GiB)": 77.56, "step": 70930, "token_acc": 0.5078369905956113, "train_speed(iter/s)": 1.437515 }, { "epoch": 3.0390728760550103, "grad_norm": 6.753215312957764, "learning_rate": 3.338649156977166e-05, "loss": 2.4418935775756836, "memory(GiB)": 77.56, "step": 70935, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.437507 }, { "epoch": 3.0392870913842596, "grad_norm": 6.383613109588623, "learning_rate": 3.338014430431189e-05, "loss": 2.1237998962402345, "memory(GiB)": 77.56, "step": 70940, "token_acc": 0.5335968379446641, "train_speed(iter/s)": 1.43749 }, { "epoch": 3.0395013067135084, "grad_norm": 6.2430500984191895, "learning_rate": 3.33737973399364e-05, "loss": 2.531260108947754, "memory(GiB)": 77.56, "step": 70945, "token_acc": 0.48554913294797686, "train_speed(iter/s)": 1.437493 }, { "epoch": 3.039715522042757, "grad_norm": 6.0232625007629395, "learning_rate": 3.336745067676015e-05, "loss": 2.4142019271850588, "memory(GiB)": 77.56, "step": 70950, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.437496 }, { "epoch": 3.0399297373720064, "grad_norm": 5.738032817840576, "learning_rate": 3.336110431489815e-05, "loss": 2.1952428817749023, "memory(GiB)": 77.56, "step": 70955, "token_acc": 0.4899598393574297, "train_speed(iter/s)": 1.437506 }, { "epoch": 3.0401439527012553, "grad_norm": 5.492711544036865, "learning_rate": 3.3354758254465346e-05, "loss": 2.259307861328125, "memory(GiB)": 77.56, "step": 70960, "token_acc": 0.548951048951049, "train_speed(iter/s)": 1.437525 }, { "epoch": 3.040358168030504, "grad_norm": 5.665395736694336, "learning_rate": 3.334841249557672e-05, "loss": 2.469676208496094, "memory(GiB)": 77.56, "step": 70965, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.437549 }, { "epoch": 3.0405723833597533, "grad_norm": 5.712028980255127, "learning_rate": 3.334206703834721e-05, "loss": 2.2087820053100584, "memory(GiB)": 77.56, "step": 70970, "token_acc": 0.5034722222222222, "train_speed(iter/s)": 1.437571 }, { "epoch": 3.040786598689002, "grad_norm": 5.499817371368408, "learning_rate": 3.333572188289179e-05, "loss": 1.9762527465820312, "memory(GiB)": 77.56, "step": 70975, "token_acc": 0.5608856088560885, "train_speed(iter/s)": 1.437582 }, { "epoch": 3.041000814018251, "grad_norm": 4.6744818687438965, "learning_rate": 3.3329377029325396e-05, "loss": 2.1812816619873048, "memory(GiB)": 77.56, "step": 70980, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.437603 }, { "epoch": 3.0412150293475, "grad_norm": 6.415157318115234, "learning_rate": 3.3323032477762984e-05, "loss": 2.1204456329345702, "memory(GiB)": 77.56, "step": 70985, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.437611 }, { "epoch": 3.041429244676749, "grad_norm": 5.196455955505371, "learning_rate": 3.331668822831947e-05, "loss": 2.221500587463379, "memory(GiB)": 77.56, "step": 70990, "token_acc": 0.5056603773584906, "train_speed(iter/s)": 1.437633 }, { "epoch": 3.041643460005998, "grad_norm": 4.392649173736572, "learning_rate": 3.331034428110979e-05, "loss": 2.1638776779174806, "memory(GiB)": 77.56, "step": 70995, "token_acc": 0.5234375, "train_speed(iter/s)": 1.437644 }, { "epoch": 3.041857675335247, "grad_norm": 9.233529090881348, "learning_rate": 3.33040006362489e-05, "loss": 2.483344078063965, "memory(GiB)": 77.56, "step": 71000, "token_acc": 0.49624060150375937, "train_speed(iter/s)": 1.437667 }, { "epoch": 3.041857675335247, "eval_loss": 2.188765525817871, "eval_runtime": 14.8012, "eval_samples_per_second": 6.756, "eval_steps_per_second": 6.756, "eval_token_acc": 0.49670619235836627, "step": 71000 }, { "epoch": 3.042071890664496, "grad_norm": 6.377941131591797, "learning_rate": 3.329765729385169e-05, "loss": 2.4590631484985352, "memory(GiB)": 77.56, "step": 71005, "token_acc": 0.491412213740458, "train_speed(iter/s)": 1.437235 }, { "epoch": 3.0422861059937447, "grad_norm": 5.215327739715576, "learning_rate": 3.329131425403309e-05, "loss": 2.3740257263183593, "memory(GiB)": 77.56, "step": 71010, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.437246 }, { "epoch": 3.042500321322994, "grad_norm": 5.504948616027832, "learning_rate": 3.3284971516908014e-05, "loss": 2.3258388519287108, "memory(GiB)": 77.56, "step": 71015, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 1.437224 }, { "epoch": 3.042714536652243, "grad_norm": 5.201958179473877, "learning_rate": 3.327862908259135e-05, "loss": 2.282832145690918, "memory(GiB)": 77.56, "step": 71020, "token_acc": 0.5343511450381679, "train_speed(iter/s)": 1.4372 }, { "epoch": 3.0429287519814916, "grad_norm": 4.988426208496094, "learning_rate": 3.327228695119801e-05, "loss": 2.3858795166015625, "memory(GiB)": 77.56, "step": 71025, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.43719 }, { "epoch": 3.043142967310741, "grad_norm": 6.245116710662842, "learning_rate": 3.326594512284289e-05, "loss": 2.2175689697265626, "memory(GiB)": 77.56, "step": 71030, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.437201 }, { "epoch": 3.0433571826399897, "grad_norm": 6.072200775146484, "learning_rate": 3.325960359764085e-05, "loss": 2.4529499053955077, "memory(GiB)": 77.56, "step": 71035, "token_acc": 0.5, "train_speed(iter/s)": 1.437188 }, { "epoch": 3.0435713979692385, "grad_norm": 6.2255964279174805, "learning_rate": 3.3253262375706815e-05, "loss": 2.3310726165771483, "memory(GiB)": 77.56, "step": 71040, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.437182 }, { "epoch": 3.0437856132984877, "grad_norm": 4.321385383605957, "learning_rate": 3.3246921457155634e-05, "loss": 2.379546356201172, "memory(GiB)": 77.56, "step": 71045, "token_acc": 0.4759036144578313, "train_speed(iter/s)": 1.437176 }, { "epoch": 3.0439998286277365, "grad_norm": 6.137400150299072, "learning_rate": 3.3240580842102186e-05, "loss": 2.3142412185668944, "memory(GiB)": 77.56, "step": 71050, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.437181 }, { "epoch": 3.0442140439569854, "grad_norm": 5.424540042877197, "learning_rate": 3.323424053066135e-05, "loss": 2.321827697753906, "memory(GiB)": 77.56, "step": 71055, "token_acc": 0.5061349693251533, "train_speed(iter/s)": 1.437196 }, { "epoch": 3.0444282592862346, "grad_norm": 5.927058219909668, "learning_rate": 3.322790052294796e-05, "loss": 2.2751163482666015, "memory(GiB)": 77.56, "step": 71060, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 1.437217 }, { "epoch": 3.0446424746154834, "grad_norm": 5.897186756134033, "learning_rate": 3.3221560819076894e-05, "loss": 2.4224424362182617, "memory(GiB)": 77.56, "step": 71065, "token_acc": 0.5214285714285715, "train_speed(iter/s)": 1.437222 }, { "epoch": 3.0448566899447322, "grad_norm": 4.345804691314697, "learning_rate": 3.3215221419162966e-05, "loss": 2.2936595916748046, "memory(GiB)": 77.56, "step": 71070, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.437234 }, { "epoch": 3.0450709052739815, "grad_norm": 5.58962345123291, "learning_rate": 3.320888232332108e-05, "loss": 2.4791725158691404, "memory(GiB)": 77.56, "step": 71075, "token_acc": 0.5016835016835017, "train_speed(iter/s)": 1.437223 }, { "epoch": 3.0452851206032303, "grad_norm": 7.8404717445373535, "learning_rate": 3.320254353166602e-05, "loss": 2.495720100402832, "memory(GiB)": 77.56, "step": 71080, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437235 }, { "epoch": 3.045499335932479, "grad_norm": 5.145702362060547, "learning_rate": 3.3196205044312664e-05, "loss": 2.3742069244384765, "memory(GiB)": 77.56, "step": 71085, "token_acc": 0.5045317220543807, "train_speed(iter/s)": 1.437247 }, { "epoch": 3.0457135512617284, "grad_norm": 8.874959945678711, "learning_rate": 3.318986686137581e-05, "loss": 2.489121437072754, "memory(GiB)": 77.56, "step": 71090, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437264 }, { "epoch": 3.045927766590977, "grad_norm": 11.734989166259766, "learning_rate": 3.318352898297027e-05, "loss": 2.620874214172363, "memory(GiB)": 77.56, "step": 71095, "token_acc": 0.4557522123893805, "train_speed(iter/s)": 1.437257 }, { "epoch": 3.046141981920226, "grad_norm": 6.317271709442139, "learning_rate": 3.31771914092109e-05, "loss": 2.4212602615356444, "memory(GiB)": 77.56, "step": 71100, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.437278 }, { "epoch": 3.0463561972494753, "grad_norm": 5.987992763519287, "learning_rate": 3.317085414021247e-05, "loss": 2.331417274475098, "memory(GiB)": 77.56, "step": 71105, "token_acc": 0.5244299674267101, "train_speed(iter/s)": 1.437277 }, { "epoch": 3.046570412578724, "grad_norm": 4.15399694442749, "learning_rate": 3.316451717608982e-05, "loss": 2.127524566650391, "memory(GiB)": 77.56, "step": 71110, "token_acc": 0.558303886925795, "train_speed(iter/s)": 1.437307 }, { "epoch": 3.046784627907973, "grad_norm": 5.694718360900879, "learning_rate": 3.3158180516957736e-05, "loss": 2.223400115966797, "memory(GiB)": 77.56, "step": 71115, "token_acc": 0.4984520123839009, "train_speed(iter/s)": 1.437294 }, { "epoch": 3.046998843237222, "grad_norm": 7.442710876464844, "learning_rate": 3.315184416293099e-05, "loss": 2.4708282470703127, "memory(GiB)": 77.56, "step": 71120, "token_acc": 0.45318352059925093, "train_speed(iter/s)": 1.437296 }, { "epoch": 3.047213058566471, "grad_norm": 4.6826558113098145, "learning_rate": 3.314550811412441e-05, "loss": 2.183447265625, "memory(GiB)": 77.56, "step": 71125, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.437313 }, { "epoch": 3.0474272738957198, "grad_norm": 7.514987945556641, "learning_rate": 3.313917237065277e-05, "loss": 2.3281620025634764, "memory(GiB)": 77.56, "step": 71130, "token_acc": 0.5201342281879194, "train_speed(iter/s)": 1.437318 }, { "epoch": 3.047641489224969, "grad_norm": 6.463863849639893, "learning_rate": 3.313283693263082e-05, "loss": 2.2465572357177734, "memory(GiB)": 77.56, "step": 71135, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.437328 }, { "epoch": 3.047855704554218, "grad_norm": 6.358994007110596, "learning_rate": 3.3126501800173334e-05, "loss": 2.250709342956543, "memory(GiB)": 77.56, "step": 71140, "token_acc": 0.5029239766081871, "train_speed(iter/s)": 1.437353 }, { "epoch": 3.0480699198834666, "grad_norm": 6.5687336921691895, "learning_rate": 3.312016697339513e-05, "loss": 2.2080333709716795, "memory(GiB)": 77.56, "step": 71145, "token_acc": 0.5435540069686411, "train_speed(iter/s)": 1.437377 }, { "epoch": 3.048284135212716, "grad_norm": 6.290611267089844, "learning_rate": 3.311383245241092e-05, "loss": 2.416961097717285, "memory(GiB)": 77.56, "step": 71150, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.437392 }, { "epoch": 3.0484983505419647, "grad_norm": 5.255943775177002, "learning_rate": 3.3107498237335476e-05, "loss": 2.1069141387939454, "memory(GiB)": 77.56, "step": 71155, "token_acc": 0.5212121212121212, "train_speed(iter/s)": 1.437377 }, { "epoch": 3.0487125658712135, "grad_norm": 5.697054862976074, "learning_rate": 3.3101164328283566e-05, "loss": 2.2210378646850586, "memory(GiB)": 77.56, "step": 71160, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.048926781200463, "grad_norm": 5.799210071563721, "learning_rate": 3.3094830725369894e-05, "loss": 2.4000701904296875, "memory(GiB)": 77.56, "step": 71165, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.437381 }, { "epoch": 3.0491409965297116, "grad_norm": 6.0237836837768555, "learning_rate": 3.308849742870923e-05, "loss": 2.0678529739379883, "memory(GiB)": 77.56, "step": 71170, "token_acc": 0.5566666666666666, "train_speed(iter/s)": 1.437354 }, { "epoch": 3.0493552118589604, "grad_norm": 6.069119930267334, "learning_rate": 3.3082164438416306e-05, "loss": 2.3057491302490236, "memory(GiB)": 77.56, "step": 71175, "token_acc": 0.5184049079754601, "train_speed(iter/s)": 1.437367 }, { "epoch": 3.0495694271882097, "grad_norm": 7.07837438583374, "learning_rate": 3.3075831754605834e-05, "loss": 2.2990318298339845, "memory(GiB)": 77.56, "step": 71180, "token_acc": 0.5418060200668896, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.0497836425174585, "grad_norm": 7.270720481872559, "learning_rate": 3.306949937739255e-05, "loss": 2.0276330947875976, "memory(GiB)": 77.56, "step": 71185, "token_acc": 0.5350877192982456, "train_speed(iter/s)": 1.437378 }, { "epoch": 3.0499978578467073, "grad_norm": 5.985006809234619, "learning_rate": 3.3063167306891165e-05, "loss": 2.456471252441406, "memory(GiB)": 77.56, "step": 71190, "token_acc": 0.4768211920529801, "train_speed(iter/s)": 1.437391 }, { "epoch": 3.0502120731759566, "grad_norm": 7.572167873382568, "learning_rate": 3.30568355432164e-05, "loss": 2.4999338150024415, "memory(GiB)": 77.56, "step": 71195, "token_acc": 0.5390334572490706, "train_speed(iter/s)": 1.437416 }, { "epoch": 3.0504262885052054, "grad_norm": 6.423421859741211, "learning_rate": 3.305050408648295e-05, "loss": 2.3983505249023436, "memory(GiB)": 77.56, "step": 71200, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.437418 }, { "epoch": 3.050640503834454, "grad_norm": 5.355645179748535, "learning_rate": 3.3044172936805505e-05, "loss": 2.467995834350586, "memory(GiB)": 77.56, "step": 71205, "token_acc": 0.4703703703703704, "train_speed(iter/s)": 1.437441 }, { "epoch": 3.0508547191637034, "grad_norm": 5.814355373382568, "learning_rate": 3.3037842094298795e-05, "loss": 2.3652368545532227, "memory(GiB)": 77.56, "step": 71210, "token_acc": 0.5646551724137931, "train_speed(iter/s)": 1.437455 }, { "epoch": 3.0510689344929522, "grad_norm": 5.717180252075195, "learning_rate": 3.303151155907746e-05, "loss": 2.660415840148926, "memory(GiB)": 77.56, "step": 71215, "token_acc": 0.43312101910828027, "train_speed(iter/s)": 1.437443 }, { "epoch": 3.0512831498222015, "grad_norm": 5.1082024574279785, "learning_rate": 3.302518133125623e-05, "loss": 2.3493471145629883, "memory(GiB)": 77.56, "step": 71220, "token_acc": 0.5241157556270096, "train_speed(iter/s)": 1.437462 }, { "epoch": 3.0514973651514503, "grad_norm": 5.413752555847168, "learning_rate": 3.301885141094975e-05, "loss": 2.399314308166504, "memory(GiB)": 77.56, "step": 71225, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.437466 }, { "epoch": 3.051711580480699, "grad_norm": 5.756601333618164, "learning_rate": 3.3012521798272723e-05, "loss": 2.715800476074219, "memory(GiB)": 77.56, "step": 71230, "token_acc": 0.4437299035369775, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.0519257958099484, "grad_norm": 7.276020526885986, "learning_rate": 3.3006192493339785e-05, "loss": 1.7715030670166017, "memory(GiB)": 77.56, "step": 71235, "token_acc": 0.5675675675675675, "train_speed(iter/s)": 1.437504 }, { "epoch": 3.052140011139197, "grad_norm": 5.446733474731445, "learning_rate": 3.299986349626563e-05, "loss": 2.161296844482422, "memory(GiB)": 77.56, "step": 71240, "token_acc": 0.5048231511254019, "train_speed(iter/s)": 1.437524 }, { "epoch": 3.052354226468446, "grad_norm": 5.676682472229004, "learning_rate": 3.299353480716488e-05, "loss": 2.6087387084960936, "memory(GiB)": 77.56, "step": 71245, "token_acc": 0.4586206896551724, "train_speed(iter/s)": 1.437518 }, { "epoch": 3.0525684417976953, "grad_norm": 4.904717922210693, "learning_rate": 3.298720642615221e-05, "loss": 1.9600635528564454, "memory(GiB)": 77.56, "step": 71250, "token_acc": 0.5377777777777778, "train_speed(iter/s)": 1.437525 }, { "epoch": 3.052782657126944, "grad_norm": 6.122511863708496, "learning_rate": 3.298087835334225e-05, "loss": 2.3314563751220705, "memory(GiB)": 77.56, "step": 71255, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 1.437549 }, { "epoch": 3.052996872456193, "grad_norm": 5.856444835662842, "learning_rate": 3.2974550588849645e-05, "loss": 2.398816680908203, "memory(GiB)": 77.56, "step": 71260, "token_acc": 0.4753521126760563, "train_speed(iter/s)": 1.437558 }, { "epoch": 3.053211087785442, "grad_norm": 5.47257137298584, "learning_rate": 3.296822313278902e-05, "loss": 2.5130504608154296, "memory(GiB)": 77.56, "step": 71265, "token_acc": 0.47093023255813954, "train_speed(iter/s)": 1.437571 }, { "epoch": 3.053425303114691, "grad_norm": 6.759922027587891, "learning_rate": 3.296189598527502e-05, "loss": 2.0901737213134766, "memory(GiB)": 77.56, "step": 71270, "token_acc": 0.5407407407407407, "train_speed(iter/s)": 1.437544 }, { "epoch": 3.0536395184439398, "grad_norm": 4.901780605316162, "learning_rate": 3.295556914642226e-05, "loss": 2.340744209289551, "memory(GiB)": 77.56, "step": 71275, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.437576 }, { "epoch": 3.053853733773189, "grad_norm": 6.798420429229736, "learning_rate": 3.294924261634534e-05, "loss": 2.2466558456420898, "memory(GiB)": 77.56, "step": 71280, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.437595 }, { "epoch": 3.054067949102438, "grad_norm": 5.200614929199219, "learning_rate": 3.29429163951589e-05, "loss": 2.4750911712646486, "memory(GiB)": 77.56, "step": 71285, "token_acc": 0.4980544747081712, "train_speed(iter/s)": 1.437614 }, { "epoch": 3.0542821644316867, "grad_norm": 9.178266525268555, "learning_rate": 3.29365904829775e-05, "loss": 2.514382553100586, "memory(GiB)": 77.56, "step": 71290, "token_acc": 0.4720496894409938, "train_speed(iter/s)": 1.43763 }, { "epoch": 3.054496379760936, "grad_norm": 4.766347885131836, "learning_rate": 3.293026487991579e-05, "loss": 2.4030633926391602, "memory(GiB)": 77.56, "step": 71295, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.437649 }, { "epoch": 3.0547105950901847, "grad_norm": 5.82194185256958, "learning_rate": 3.2923939586088346e-05, "loss": 2.2792917251586915, "memory(GiB)": 77.56, "step": 71300, "token_acc": 0.5111821086261981, "train_speed(iter/s)": 1.437645 }, { "epoch": 3.0549248104194335, "grad_norm": 6.4278974533081055, "learning_rate": 3.291761460160976e-05, "loss": 2.531342124938965, "memory(GiB)": 77.56, "step": 71305, "token_acc": 0.44404332129963897, "train_speed(iter/s)": 1.43764 }, { "epoch": 3.055139025748683, "grad_norm": 7.563528060913086, "learning_rate": 3.2911289926594604e-05, "loss": 2.0771963119506838, "memory(GiB)": 77.56, "step": 71310, "token_acc": 0.5430711610486891, "train_speed(iter/s)": 1.437646 }, { "epoch": 3.0553532410779316, "grad_norm": 4.607213020324707, "learning_rate": 3.2904965561157466e-05, "loss": 2.1626506805419923, "memory(GiB)": 77.56, "step": 71315, "token_acc": 0.49523809523809526, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.0555674564071804, "grad_norm": 7.028500080108643, "learning_rate": 3.289864150541292e-05, "loss": 2.5822845458984376, "memory(GiB)": 77.56, "step": 71320, "token_acc": 0.5, "train_speed(iter/s)": 1.437698 }, { "epoch": 3.0557816717364297, "grad_norm": 6.545118808746338, "learning_rate": 3.289231775947551e-05, "loss": 2.5390146255493162, "memory(GiB)": 77.56, "step": 71325, "token_acc": 0.46446700507614214, "train_speed(iter/s)": 1.437717 }, { "epoch": 3.0559958870656785, "grad_norm": 5.889909267425537, "learning_rate": 3.2885994323459826e-05, "loss": 2.555874824523926, "memory(GiB)": 77.56, "step": 71330, "token_acc": 0.4633431085043988, "train_speed(iter/s)": 1.437714 }, { "epoch": 3.0562101023949273, "grad_norm": 5.655017375946045, "learning_rate": 3.287967119748039e-05, "loss": 2.270869827270508, "memory(GiB)": 77.56, "step": 71335, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 1.437703 }, { "epoch": 3.0564243177241766, "grad_norm": 6.0177106857299805, "learning_rate": 3.287334838165179e-05, "loss": 2.3454551696777344, "memory(GiB)": 77.56, "step": 71340, "token_acc": 0.5033783783783784, "train_speed(iter/s)": 1.437732 }, { "epoch": 3.0566385330534254, "grad_norm": 6.671818256378174, "learning_rate": 3.286702587608854e-05, "loss": 2.1206298828125, "memory(GiB)": 77.56, "step": 71345, "token_acc": 0.4866920152091255, "train_speed(iter/s)": 1.437738 }, { "epoch": 3.056852748382674, "grad_norm": 5.391358852386475, "learning_rate": 3.286070368090519e-05, "loss": 1.9761165618896483, "memory(GiB)": 77.56, "step": 71350, "token_acc": 0.5505226480836237, "train_speed(iter/s)": 1.437742 }, { "epoch": 3.0570669637119234, "grad_norm": 5.055149555206299, "learning_rate": 3.285438179621627e-05, "loss": 2.3021852493286135, "memory(GiB)": 77.56, "step": 71355, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.43775 }, { "epoch": 3.0572811790411722, "grad_norm": 7.399731159210205, "learning_rate": 3.284806022213632e-05, "loss": 2.6837385177612303, "memory(GiB)": 77.56, "step": 71360, "token_acc": 0.44554455445544555, "train_speed(iter/s)": 1.437736 }, { "epoch": 3.057495394370421, "grad_norm": 5.64953088760376, "learning_rate": 3.284173895877982e-05, "loss": 2.4469409942626954, "memory(GiB)": 77.56, "step": 71365, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.437734 }, { "epoch": 3.0577096096996703, "grad_norm": 4.701729774475098, "learning_rate": 3.283541800626132e-05, "loss": 2.144628143310547, "memory(GiB)": 77.56, "step": 71370, "token_acc": 0.525691699604743, "train_speed(iter/s)": 1.437728 }, { "epoch": 3.057923825028919, "grad_norm": 6.855034351348877, "learning_rate": 3.282909736469535e-05, "loss": 2.5289798736572267, "memory(GiB)": 77.56, "step": 71375, "token_acc": 0.4766355140186916, "train_speed(iter/s)": 1.437699 }, { "epoch": 3.058138040358168, "grad_norm": 8.932657241821289, "learning_rate": 3.2822777034196364e-05, "loss": 2.2341377258300783, "memory(GiB)": 77.56, "step": 71380, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.437714 }, { "epoch": 3.058352255687417, "grad_norm": 6.1834635734558105, "learning_rate": 3.28164570148789e-05, "loss": 2.301803779602051, "memory(GiB)": 77.56, "step": 71385, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.437716 }, { "epoch": 3.058566471016666, "grad_norm": 6.1221394538879395, "learning_rate": 3.281013730685744e-05, "loss": 2.253335189819336, "memory(GiB)": 77.56, "step": 71390, "token_acc": 0.5, "train_speed(iter/s)": 1.437707 }, { "epoch": 3.058780686345915, "grad_norm": 6.424152851104736, "learning_rate": 3.280381791024646e-05, "loss": 2.4969079971313475, "memory(GiB)": 77.56, "step": 71395, "token_acc": 0.4657534246575342, "train_speed(iter/s)": 1.437701 }, { "epoch": 3.058994901675164, "grad_norm": 5.603373050689697, "learning_rate": 3.2797498825160456e-05, "loss": 2.539145088195801, "memory(GiB)": 77.56, "step": 71400, "token_acc": 0.4753246753246753, "train_speed(iter/s)": 1.437715 }, { "epoch": 3.059209117004413, "grad_norm": 6.993089199066162, "learning_rate": 3.279118005171389e-05, "loss": 2.6204206466674806, "memory(GiB)": 77.56, "step": 71405, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.4377 }, { "epoch": 3.0594233323336617, "grad_norm": 4.646661758422852, "learning_rate": 3.278486159002124e-05, "loss": 2.394352912902832, "memory(GiB)": 77.56, "step": 71410, "token_acc": 0.5110294117647058, "train_speed(iter/s)": 1.437686 }, { "epoch": 3.059637547662911, "grad_norm": 6.484498023986816, "learning_rate": 3.277854344019699e-05, "loss": 2.381487274169922, "memory(GiB)": 77.56, "step": 71415, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.437679 }, { "epoch": 3.0598517629921598, "grad_norm": 5.069632530212402, "learning_rate": 3.2772225602355554e-05, "loss": 2.4179157257080077, "memory(GiB)": 77.56, "step": 71420, "token_acc": 0.5048231511254019, "train_speed(iter/s)": 1.437689 }, { "epoch": 3.0600659783214086, "grad_norm": 6.545305252075195, "learning_rate": 3.2765908076611426e-05, "loss": 2.190912628173828, "memory(GiB)": 77.56, "step": 71425, "token_acc": 0.4641350210970464, "train_speed(iter/s)": 1.437677 }, { "epoch": 3.060280193650658, "grad_norm": 6.894156455993652, "learning_rate": 3.275959086307905e-05, "loss": 2.5124589920043947, "memory(GiB)": 77.56, "step": 71430, "token_acc": 0.4553846153846154, "train_speed(iter/s)": 1.437692 }, { "epoch": 3.0604944089799067, "grad_norm": 8.655752182006836, "learning_rate": 3.2753273961872824e-05, "loss": 2.5310384750366213, "memory(GiB)": 77.56, "step": 71435, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.437707 }, { "epoch": 3.0607086243091555, "grad_norm": 5.756744861602783, "learning_rate": 3.274695737310723e-05, "loss": 2.5682937622070314, "memory(GiB)": 77.56, "step": 71440, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.437708 }, { "epoch": 3.0609228396384047, "grad_norm": 6.639272689819336, "learning_rate": 3.2740641096896715e-05, "loss": 2.3876296997070314, "memory(GiB)": 77.56, "step": 71445, "token_acc": 0.5389408099688473, "train_speed(iter/s)": 1.437712 }, { "epoch": 3.0611370549676535, "grad_norm": 5.637764930725098, "learning_rate": 3.273432513335566e-05, "loss": 2.158272933959961, "memory(GiB)": 77.56, "step": 71450, "token_acc": 0.5692307692307692, "train_speed(iter/s)": 1.437744 }, { "epoch": 3.0613512702969023, "grad_norm": 5.608621597290039, "learning_rate": 3.27280094825985e-05, "loss": 2.142215347290039, "memory(GiB)": 77.56, "step": 71455, "token_acc": 0.53515625, "train_speed(iter/s)": 1.437728 }, { "epoch": 3.0615654856261516, "grad_norm": 5.965383529663086, "learning_rate": 3.2721694144739654e-05, "loss": 2.2114877700805664, "memory(GiB)": 77.56, "step": 71460, "token_acc": 0.5206896551724138, "train_speed(iter/s)": 1.437719 }, { "epoch": 3.0617797009554004, "grad_norm": 8.144937515258789, "learning_rate": 3.2715379119893534e-05, "loss": 2.2537193298339844, "memory(GiB)": 77.56, "step": 71465, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.437735 }, { "epoch": 3.0619939162846492, "grad_norm": 6.5725016593933105, "learning_rate": 3.270906440817453e-05, "loss": 2.2744071960449217, "memory(GiB)": 77.56, "step": 71470, "token_acc": 0.5648854961832062, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.0622081316138985, "grad_norm": 5.586136817932129, "learning_rate": 3.270275000969704e-05, "loss": 2.4055850982666014, "memory(GiB)": 77.56, "step": 71475, "token_acc": 0.4553191489361702, "train_speed(iter/s)": 1.437778 }, { "epoch": 3.0624223469431473, "grad_norm": 6.206053256988525, "learning_rate": 3.2696435924575456e-05, "loss": 2.4393325805664063, "memory(GiB)": 77.56, "step": 71480, "token_acc": 0.5288135593220339, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.062636562272396, "grad_norm": 5.872185230255127, "learning_rate": 3.269012215292418e-05, "loss": 2.1251575469970705, "memory(GiB)": 77.56, "step": 71485, "token_acc": 0.5465587044534413, "train_speed(iter/s)": 1.437817 }, { "epoch": 3.0628507776016454, "grad_norm": 5.686715126037598, "learning_rate": 3.268380869485758e-05, "loss": 2.4605125427246093, "memory(GiB)": 77.56, "step": 71490, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 1.437821 }, { "epoch": 3.063064992930894, "grad_norm": 10.681577682495117, "learning_rate": 3.267749555049002e-05, "loss": 2.450027656555176, "memory(GiB)": 77.56, "step": 71495, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.437843 }, { "epoch": 3.063279208260143, "grad_norm": 5.863169193267822, "learning_rate": 3.2671182719935874e-05, "loss": 2.264522933959961, "memory(GiB)": 77.56, "step": 71500, "token_acc": 0.5485714285714286, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.063279208260143, "eval_loss": 2.1684317588806152, "eval_runtime": 14.4922, "eval_samples_per_second": 6.9, "eval_steps_per_second": 6.9, "eval_token_acc": 0.479328165374677, "step": 71500 }, { "epoch": 3.0634934235893923, "grad_norm": 6.721035957336426, "learning_rate": 3.266487020330953e-05, "loss": 2.5304203033447266, "memory(GiB)": 77.56, "step": 71505, "token_acc": 0.48123195380173245, "train_speed(iter/s)": 1.437387 }, { "epoch": 3.063707638918641, "grad_norm": 4.567585468292236, "learning_rate": 3.2658558000725295e-05, "loss": 2.3212718963623047, "memory(GiB)": 77.56, "step": 71510, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.437392 }, { "epoch": 3.06392185424789, "grad_norm": 6.7867865562438965, "learning_rate": 3.265224611229755e-05, "loss": 2.414967346191406, "memory(GiB)": 77.56, "step": 71515, "token_acc": 0.47701149425287354, "train_speed(iter/s)": 1.437377 }, { "epoch": 3.064136069577139, "grad_norm": 3.9770381450653076, "learning_rate": 3.264593453814066e-05, "loss": 2.435228729248047, "memory(GiB)": 77.56, "step": 71520, "token_acc": 0.4818181818181818, "train_speed(iter/s)": 1.437386 }, { "epoch": 3.064350284906388, "grad_norm": 6.722285747528076, "learning_rate": 3.2639623278368936e-05, "loss": 2.3312519073486326, "memory(GiB)": 77.56, "step": 71525, "token_acc": 0.54, "train_speed(iter/s)": 1.437408 }, { "epoch": 3.0645645002356368, "grad_norm": 5.002237796783447, "learning_rate": 3.263331233309673e-05, "loss": 2.897745132446289, "memory(GiB)": 77.56, "step": 71530, "token_acc": 0.44146341463414634, "train_speed(iter/s)": 1.437412 }, { "epoch": 3.064778715564886, "grad_norm": 5.541676998138428, "learning_rate": 3.262700170243836e-05, "loss": 2.232503890991211, "memory(GiB)": 77.56, "step": 71535, "token_acc": 0.5178571428571429, "train_speed(iter/s)": 1.437439 }, { "epoch": 3.064992930894135, "grad_norm": 5.969266891479492, "learning_rate": 3.262069138650814e-05, "loss": 2.3966352462768556, "memory(GiB)": 77.56, "step": 71540, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.437445 }, { "epoch": 3.0652071462233836, "grad_norm": 5.5641069412231445, "learning_rate": 3.261438138542041e-05, "loss": 2.4115156173706054, "memory(GiB)": 77.56, "step": 71545, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.437456 }, { "epoch": 3.065421361552633, "grad_norm": 6.443191051483154, "learning_rate": 3.260807169928948e-05, "loss": 2.166291618347168, "memory(GiB)": 77.56, "step": 71550, "token_acc": 0.532258064516129, "train_speed(iter/s)": 1.437456 }, { "epoch": 3.0656355768818817, "grad_norm": 5.40637731552124, "learning_rate": 3.2601762328229624e-05, "loss": 2.4704456329345703, "memory(GiB)": 77.56, "step": 71555, "token_acc": 0.4746268656716418, "train_speed(iter/s)": 1.437439 }, { "epoch": 3.0658497922111305, "grad_norm": 6.787854194641113, "learning_rate": 3.259545327235518e-05, "loss": 2.2822240829467773, "memory(GiB)": 77.56, "step": 71560, "token_acc": 0.5045871559633027, "train_speed(iter/s)": 1.437453 }, { "epoch": 3.0660640075403798, "grad_norm": 4.851698875427246, "learning_rate": 3.258914453178041e-05, "loss": 2.2787031173706054, "memory(GiB)": 77.56, "step": 71565, "token_acc": 0.5317725752508361, "train_speed(iter/s)": 1.437437 }, { "epoch": 3.0662782228696286, "grad_norm": 7.534502029418945, "learning_rate": 3.2582836106619636e-05, "loss": 2.356695365905762, "memory(GiB)": 77.56, "step": 71570, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.437449 }, { "epoch": 3.0664924381988774, "grad_norm": 6.021844387054443, "learning_rate": 3.257652799698712e-05, "loss": 2.5388763427734373, "memory(GiB)": 77.56, "step": 71575, "token_acc": 0.4676470588235294, "train_speed(iter/s)": 1.437479 }, { "epoch": 3.0667066535281267, "grad_norm": 7.734993934631348, "learning_rate": 3.2570220202997134e-05, "loss": 2.0558521270751955, "memory(GiB)": 77.56, "step": 71580, "token_acc": 0.5325670498084292, "train_speed(iter/s)": 1.437478 }, { "epoch": 3.0669208688573755, "grad_norm": 5.977901935577393, "learning_rate": 3.256391272476396e-05, "loss": 2.504092788696289, "memory(GiB)": 77.56, "step": 71585, "token_acc": 0.5, "train_speed(iter/s)": 1.437509 }, { "epoch": 3.0671350841866243, "grad_norm": 5.679624557495117, "learning_rate": 3.255760556240187e-05, "loss": 2.3099443435668947, "memory(GiB)": 77.56, "step": 71590, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.437531 }, { "epoch": 3.0673492995158735, "grad_norm": 5.730276584625244, "learning_rate": 3.255129871602512e-05, "loss": 2.5747289657592773, "memory(GiB)": 77.56, "step": 71595, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.437558 }, { "epoch": 3.0675635148451224, "grad_norm": 4.948228359222412, "learning_rate": 3.2544992185747955e-05, "loss": 2.3366249084472654, "memory(GiB)": 77.56, "step": 71600, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.437568 }, { "epoch": 3.067777730174371, "grad_norm": 4.9751482009887695, "learning_rate": 3.2538685971684646e-05, "loss": 2.1585954666137694, "memory(GiB)": 77.56, "step": 71605, "token_acc": 0.5398550724637681, "train_speed(iter/s)": 1.437563 }, { "epoch": 3.0679919455036204, "grad_norm": 5.517768383026123, "learning_rate": 3.253238007394942e-05, "loss": 2.2570945739746096, "memory(GiB)": 77.56, "step": 71610, "token_acc": 0.48, "train_speed(iter/s)": 1.437548 }, { "epoch": 3.0682061608328692, "grad_norm": 5.870797634124756, "learning_rate": 3.2526074492656496e-05, "loss": 2.14916877746582, "memory(GiB)": 77.56, "step": 71615, "token_acc": 0.5341880341880342, "train_speed(iter/s)": 1.437562 }, { "epoch": 3.068420376162118, "grad_norm": 5.81953763961792, "learning_rate": 3.2519769227920135e-05, "loss": 2.2913625717163084, "memory(GiB)": 77.56, "step": 71620, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 1.43757 }, { "epoch": 3.0686345914913673, "grad_norm": 6.246188640594482, "learning_rate": 3.251346427985455e-05, "loss": 2.4065980911254883, "memory(GiB)": 77.56, "step": 71625, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.437576 }, { "epoch": 3.068848806820616, "grad_norm": 5.368649482727051, "learning_rate": 3.250715964857397e-05, "loss": 2.484805679321289, "memory(GiB)": 77.56, "step": 71630, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.437574 }, { "epoch": 3.069063022149865, "grad_norm": 6.015465259552002, "learning_rate": 3.25008553341926e-05, "loss": 2.111690330505371, "memory(GiB)": 77.56, "step": 71635, "token_acc": 0.511864406779661, "train_speed(iter/s)": 1.437557 }, { "epoch": 3.069277237479114, "grad_norm": 6.980801105499268, "learning_rate": 3.249455133682464e-05, "loss": 2.120357894897461, "memory(GiB)": 77.56, "step": 71640, "token_acc": 0.55625, "train_speed(iter/s)": 1.437576 }, { "epoch": 3.069491452808363, "grad_norm": 5.813309669494629, "learning_rate": 3.248824765658432e-05, "loss": 2.312276840209961, "memory(GiB)": 77.56, "step": 71645, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.437579 }, { "epoch": 3.069705668137612, "grad_norm": 4.540748596191406, "learning_rate": 3.248194429358582e-05, "loss": 2.337319564819336, "memory(GiB)": 77.56, "step": 71650, "token_acc": 0.5084175084175084, "train_speed(iter/s)": 1.437602 }, { "epoch": 3.069919883466861, "grad_norm": 5.714628219604492, "learning_rate": 3.247564124794332e-05, "loss": 2.5630964279174804, "memory(GiB)": 77.56, "step": 71655, "token_acc": 0.5101214574898786, "train_speed(iter/s)": 1.437581 }, { "epoch": 3.07013409879611, "grad_norm": 4.395263671875, "learning_rate": 3.246933851977101e-05, "loss": 1.9874814987182616, "memory(GiB)": 77.56, "step": 71660, "token_acc": 0.5474683544303798, "train_speed(iter/s)": 1.437575 }, { "epoch": 3.0703483141253587, "grad_norm": 6.776604175567627, "learning_rate": 3.246303610918309e-05, "loss": 2.7311637878417967, "memory(GiB)": 77.56, "step": 71665, "token_acc": 0.4412811387900356, "train_speed(iter/s)": 1.437584 }, { "epoch": 3.070562529454608, "grad_norm": 5.680537700653076, "learning_rate": 3.245673401629372e-05, "loss": 2.378589057922363, "memory(GiB)": 77.56, "step": 71670, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.437576 }, { "epoch": 3.0707767447838568, "grad_norm": 5.210697174072266, "learning_rate": 3.245043224121708e-05, "loss": 2.2265769958496096, "memory(GiB)": 77.56, "step": 71675, "token_acc": 0.5236363636363637, "train_speed(iter/s)": 1.437578 }, { "epoch": 3.0709909601131056, "grad_norm": 5.430446147918701, "learning_rate": 3.2444130784067315e-05, "loss": 2.5373294830322264, "memory(GiB)": 77.56, "step": 71680, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.437555 }, { "epoch": 3.071205175442355, "grad_norm": 5.818295955657959, "learning_rate": 3.243782964495859e-05, "loss": 2.374929428100586, "memory(GiB)": 77.56, "step": 71685, "token_acc": 0.5610687022900763, "train_speed(iter/s)": 1.437565 }, { "epoch": 3.0714193907716036, "grad_norm": 5.918603897094727, "learning_rate": 3.243152882400506e-05, "loss": 2.3068742752075195, "memory(GiB)": 77.56, "step": 71690, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.437555 }, { "epoch": 3.0716336061008525, "grad_norm": 6.759967803955078, "learning_rate": 3.2425228321320863e-05, "loss": 2.3748859405517577, "memory(GiB)": 77.56, "step": 71695, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.437555 }, { "epoch": 3.0718478214301017, "grad_norm": 4.13250207901001, "learning_rate": 3.241892813702014e-05, "loss": 2.2627542495727537, "memory(GiB)": 77.56, "step": 71700, "token_acc": 0.5175718849840255, "train_speed(iter/s)": 1.437581 }, { "epoch": 3.0720620367593505, "grad_norm": 5.313356399536133, "learning_rate": 3.2412628271217034e-05, "loss": 2.1665061950683593, "memory(GiB)": 77.56, "step": 71705, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437597 }, { "epoch": 3.0722762520885993, "grad_norm": 5.155050754547119, "learning_rate": 3.240632872402565e-05, "loss": 2.4742965698242188, "memory(GiB)": 77.56, "step": 71710, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.437592 }, { "epoch": 3.0724904674178486, "grad_norm": 5.6786932945251465, "learning_rate": 3.240002949556014e-05, "loss": 2.2357444763183594, "memory(GiB)": 77.56, "step": 71715, "token_acc": 0.515527950310559, "train_speed(iter/s)": 1.437561 }, { "epoch": 3.0727046827470974, "grad_norm": 4.857753753662109, "learning_rate": 3.23937305859346e-05, "loss": 2.176723861694336, "memory(GiB)": 77.56, "step": 71720, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.437531 }, { "epoch": 3.072918898076346, "grad_norm": 8.926349639892578, "learning_rate": 3.238743199526314e-05, "loss": 2.383189010620117, "memory(GiB)": 77.56, "step": 71725, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.437538 }, { "epoch": 3.0731331134055955, "grad_norm": 5.758974552154541, "learning_rate": 3.2381133723659886e-05, "loss": 2.0396209716796876, "memory(GiB)": 77.56, "step": 71730, "token_acc": 0.5607142857142857, "train_speed(iter/s)": 1.437565 }, { "epoch": 3.0733473287348443, "grad_norm": 5.930120944976807, "learning_rate": 3.23748357712389e-05, "loss": 2.563071441650391, "memory(GiB)": 77.56, "step": 71735, "token_acc": 0.4646153846153846, "train_speed(iter/s)": 1.43752 }, { "epoch": 3.073561544064093, "grad_norm": 5.101224422454834, "learning_rate": 3.236853813811431e-05, "loss": 2.329065132141113, "memory(GiB)": 77.56, "step": 71740, "token_acc": 0.5191082802547771, "train_speed(iter/s)": 1.437556 }, { "epoch": 3.0737757593933424, "grad_norm": 6.716571807861328, "learning_rate": 3.236224082440019e-05, "loss": 2.2843505859375, "memory(GiB)": 77.56, "step": 71745, "token_acc": 0.47540983606557374, "train_speed(iter/s)": 1.437561 }, { "epoch": 3.073989974722591, "grad_norm": 5.448788642883301, "learning_rate": 3.235594383021063e-05, "loss": 2.566900634765625, "memory(GiB)": 77.56, "step": 71750, "token_acc": 0.46706586826347307, "train_speed(iter/s)": 1.437554 }, { "epoch": 3.07420419005184, "grad_norm": 5.533188819885254, "learning_rate": 3.23496471556597e-05, "loss": 2.3816951751708983, "memory(GiB)": 77.56, "step": 71755, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.437569 }, { "epoch": 3.0744184053810892, "grad_norm": 5.50795316696167, "learning_rate": 3.234335080086146e-05, "loss": 2.3022790908813477, "memory(GiB)": 77.56, "step": 71760, "token_acc": 0.5328185328185329, "train_speed(iter/s)": 1.437582 }, { "epoch": 3.074632620710338, "grad_norm": 7.947305679321289, "learning_rate": 3.233705476592999e-05, "loss": 2.4905216217041017, "memory(GiB)": 77.56, "step": 71765, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.43758 }, { "epoch": 3.074846836039587, "grad_norm": 5.364322662353516, "learning_rate": 3.233075905097933e-05, "loss": 1.9276460647583007, "memory(GiB)": 77.56, "step": 71770, "token_acc": 0.5478260869565217, "train_speed(iter/s)": 1.437616 }, { "epoch": 3.075061051368836, "grad_norm": 6.840217113494873, "learning_rate": 3.232446365612356e-05, "loss": 2.3881908416748048, "memory(GiB)": 77.56, "step": 71775, "token_acc": 0.45121951219512196, "train_speed(iter/s)": 1.437612 }, { "epoch": 3.075275266698085, "grad_norm": 5.127231597900391, "learning_rate": 3.231816858147672e-05, "loss": 2.365000534057617, "memory(GiB)": 77.56, "step": 71780, "token_acc": 0.4902597402597403, "train_speed(iter/s)": 1.437598 }, { "epoch": 3.0754894820273337, "grad_norm": 5.032304286956787, "learning_rate": 3.231187382715282e-05, "loss": 2.220559501647949, "memory(GiB)": 77.56, "step": 71785, "token_acc": 0.49812734082397003, "train_speed(iter/s)": 1.437606 }, { "epoch": 3.075703697356583, "grad_norm": 5.8689703941345215, "learning_rate": 3.2305579393265926e-05, "loss": 2.5555652618408202, "memory(GiB)": 77.56, "step": 71790, "token_acc": 0.48466257668711654, "train_speed(iter/s)": 1.437619 }, { "epoch": 3.075917912685832, "grad_norm": 5.924358367919922, "learning_rate": 3.229928527993007e-05, "loss": 2.620853042602539, "memory(GiB)": 77.56, "step": 71795, "token_acc": 0.47126436781609193, "train_speed(iter/s)": 1.437615 }, { "epoch": 3.0761321280150806, "grad_norm": 7.015709400177002, "learning_rate": 3.229299148725925e-05, "loss": 2.032087516784668, "memory(GiB)": 77.56, "step": 71800, "token_acc": 0.51953125, "train_speed(iter/s)": 1.437631 }, { "epoch": 3.07634634334433, "grad_norm": 6.854808330535889, "learning_rate": 3.228669801536749e-05, "loss": 2.7323497772216796, "memory(GiB)": 77.56, "step": 71805, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.437631 }, { "epoch": 3.0765605586735787, "grad_norm": 6.99651575088501, "learning_rate": 3.228040486436884e-05, "loss": 2.3545578002929686, "memory(GiB)": 77.56, "step": 71810, "token_acc": 0.5015479876160991, "train_speed(iter/s)": 1.437645 }, { "epoch": 3.0767747740028275, "grad_norm": 5.134286403656006, "learning_rate": 3.2274112034377255e-05, "loss": 2.4191478729248046, "memory(GiB)": 77.56, "step": 71815, "token_acc": 0.5130718954248366, "train_speed(iter/s)": 1.43765 }, { "epoch": 3.0769889893320768, "grad_norm": 5.35711145401001, "learning_rate": 3.2267819525506784e-05, "loss": 2.1645038604736326, "memory(GiB)": 77.56, "step": 71820, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.0772032046613256, "grad_norm": 5.819248199462891, "learning_rate": 3.226152733787138e-05, "loss": 2.5150949478149416, "memory(GiB)": 77.56, "step": 71825, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.437657 }, { "epoch": 3.0774174199905744, "grad_norm": 7.229367256164551, "learning_rate": 3.225523547158505e-05, "loss": 2.5439426422119142, "memory(GiB)": 77.56, "step": 71830, "token_acc": 0.5054151624548736, "train_speed(iter/s)": 1.437675 }, { "epoch": 3.0776316353198236, "grad_norm": 6.0406084060668945, "learning_rate": 3.224894392676178e-05, "loss": 2.1014326095581053, "memory(GiB)": 77.56, "step": 71835, "token_acc": 0.5352112676056338, "train_speed(iter/s)": 1.437663 }, { "epoch": 3.0778458506490725, "grad_norm": 6.239911079406738, "learning_rate": 3.2242652703515545e-05, "loss": 2.1096033096313476, "memory(GiB)": 77.56, "step": 71840, "token_acc": 0.54, "train_speed(iter/s)": 1.437672 }, { "epoch": 3.0780600659783213, "grad_norm": 4.778571128845215, "learning_rate": 3.22363618019603e-05, "loss": 2.174831771850586, "memory(GiB)": 77.56, "step": 71845, "token_acc": 0.5415282392026578, "train_speed(iter/s)": 1.437681 }, { "epoch": 3.0782742813075705, "grad_norm": 4.574695110321045, "learning_rate": 3.223007122221004e-05, "loss": 2.6403177261352537, "memory(GiB)": 77.56, "step": 71850, "token_acc": 0.4664429530201342, "train_speed(iter/s)": 1.437684 }, { "epoch": 3.0784884966368193, "grad_norm": 4.291148662567139, "learning_rate": 3.22237809643787e-05, "loss": 2.3742092132568358, "memory(GiB)": 77.56, "step": 71855, "token_acc": 0.5015384615384615, "train_speed(iter/s)": 1.43766 }, { "epoch": 3.078702711966068, "grad_norm": 6.408908367156982, "learning_rate": 3.221749102858025e-05, "loss": 2.471113586425781, "memory(GiB)": 77.56, "step": 71860, "token_acc": 0.46546546546546547, "train_speed(iter/s)": 1.437658 }, { "epoch": 3.0789169272953174, "grad_norm": 6.602572917938232, "learning_rate": 3.2211201414928634e-05, "loss": 2.210652542114258, "memory(GiB)": 77.56, "step": 71865, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.437669 }, { "epoch": 3.079131142624566, "grad_norm": 4.954071998596191, "learning_rate": 3.220491212353778e-05, "loss": 2.374320411682129, "memory(GiB)": 77.56, "step": 71870, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437684 }, { "epoch": 3.079345357953815, "grad_norm": 5.113256931304932, "learning_rate": 3.219862315452164e-05, "loss": 2.551711082458496, "memory(GiB)": 77.56, "step": 71875, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437701 }, { "epoch": 3.0795595732830643, "grad_norm": 5.2521467208862305, "learning_rate": 3.2192334507994125e-05, "loss": 2.2922691345214843, "memory(GiB)": 77.56, "step": 71880, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.437723 }, { "epoch": 3.079773788612313, "grad_norm": 5.62010383605957, "learning_rate": 3.2186046184069196e-05, "loss": 2.6016489028930665, "memory(GiB)": 77.56, "step": 71885, "token_acc": 0.45565749235474007, "train_speed(iter/s)": 1.43775 }, { "epoch": 3.079988003941562, "grad_norm": 5.3352370262146, "learning_rate": 3.2179758182860744e-05, "loss": 2.4521623611450196, "memory(GiB)": 77.56, "step": 71890, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.437754 }, { "epoch": 3.080202219270811, "grad_norm": 6.253001689910889, "learning_rate": 3.21734705044827e-05, "loss": 2.474909210205078, "memory(GiB)": 77.56, "step": 71895, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.437777 }, { "epoch": 3.08041643460006, "grad_norm": 4.9991230964660645, "learning_rate": 3.216718314904895e-05, "loss": 2.061448097229004, "memory(GiB)": 77.56, "step": 71900, "token_acc": 0.5533333333333333, "train_speed(iter/s)": 1.437806 }, { "epoch": 3.080630649929309, "grad_norm": 4.501735210418701, "learning_rate": 3.216089611667341e-05, "loss": 2.2024246215820313, "memory(GiB)": 77.56, "step": 71905, "token_acc": 0.5275080906148867, "train_speed(iter/s)": 1.437821 }, { "epoch": 3.080844865258558, "grad_norm": 7.868646621704102, "learning_rate": 3.215460940746997e-05, "loss": 2.4164424896240235, "memory(GiB)": 77.56, "step": 71910, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437837 }, { "epoch": 3.081059080587807, "grad_norm": 5.91912317276001, "learning_rate": 3.214832302155252e-05, "loss": 2.18265438079834, "memory(GiB)": 77.56, "step": 71915, "token_acc": 0.5211726384364821, "train_speed(iter/s)": 1.437827 }, { "epoch": 3.0812732959170557, "grad_norm": 6.025934219360352, "learning_rate": 3.214203695903495e-05, "loss": 1.8655845642089843, "memory(GiB)": 77.56, "step": 71920, "token_acc": 0.5863453815261044, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.081487511246305, "grad_norm": 7.082553863525391, "learning_rate": 3.213575122003113e-05, "loss": 2.5260461807250976, "memory(GiB)": 77.56, "step": 71925, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.437827 }, { "epoch": 3.0817017265755537, "grad_norm": 6.019506931304932, "learning_rate": 3.212946580465493e-05, "loss": 2.3202917098999025, "memory(GiB)": 77.56, "step": 71930, "token_acc": 0.5052264808362369, "train_speed(iter/s)": 1.437819 }, { "epoch": 3.0819159419048026, "grad_norm": 8.1309232711792, "learning_rate": 3.212318071302023e-05, "loss": 2.1723093032836913, "memory(GiB)": 77.56, "step": 71935, "token_acc": 0.5107296137339056, "train_speed(iter/s)": 1.437823 }, { "epoch": 3.082130157234052, "grad_norm": 5.068955421447754, "learning_rate": 3.211689594524088e-05, "loss": 2.223263740539551, "memory(GiB)": 77.56, "step": 71940, "token_acc": 0.5359477124183006, "train_speed(iter/s)": 1.437824 }, { "epoch": 3.0823443725633006, "grad_norm": 5.923271179199219, "learning_rate": 3.211061150143072e-05, "loss": 2.525506019592285, "memory(GiB)": 77.56, "step": 71945, "token_acc": 0.4862068965517241, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.0825585878925494, "grad_norm": 6.4857964515686035, "learning_rate": 3.210432738170363e-05, "loss": 2.2871070861816407, "memory(GiB)": 77.56, "step": 71950, "token_acc": 0.5, "train_speed(iter/s)": 1.437806 }, { "epoch": 3.0827728032217987, "grad_norm": 4.484994411468506, "learning_rate": 3.209804358617342e-05, "loss": 2.260784912109375, "memory(GiB)": 77.56, "step": 71955, "token_acc": 0.542016806722689, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.0829870185510475, "grad_norm": 5.6009521484375, "learning_rate": 3.2091760114953945e-05, "loss": 2.441606140136719, "memory(GiB)": 77.56, "step": 71960, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.437807 }, { "epoch": 3.0832012338802963, "grad_norm": 6.694757461547852, "learning_rate": 3.208547696815905e-05, "loss": 2.440965270996094, "memory(GiB)": 77.56, "step": 71965, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437817 }, { "epoch": 3.0834154492095456, "grad_norm": 6.703338623046875, "learning_rate": 3.207919414590255e-05, "loss": 2.2883527755737303, "memory(GiB)": 77.56, "step": 71970, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.437759 }, { "epoch": 3.0836296645387944, "grad_norm": 4.850164413452148, "learning_rate": 3.207291164829824e-05, "loss": 2.2760610580444336, "memory(GiB)": 77.56, "step": 71975, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.437766 }, { "epoch": 3.083843879868043, "grad_norm": 5.694988250732422, "learning_rate": 3.2066629475459964e-05, "loss": 2.1303171157836913, "memory(GiB)": 77.56, "step": 71980, "token_acc": 0.5246478873239436, "train_speed(iter/s)": 1.43775 }, { "epoch": 3.0840580951972925, "grad_norm": 7.048049449920654, "learning_rate": 3.206034762750152e-05, "loss": 2.197726821899414, "memory(GiB)": 77.56, "step": 71985, "token_acc": 0.545774647887324, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.0842723105265413, "grad_norm": 6.287624835968018, "learning_rate": 3.20540661045367e-05, "loss": 2.2292152404785157, "memory(GiB)": 77.56, "step": 71990, "token_acc": 0.518796992481203, "train_speed(iter/s)": 1.437762 }, { "epoch": 3.08448652585579, "grad_norm": 5.67429256439209, "learning_rate": 3.204778490667931e-05, "loss": 2.0574792861938476, "memory(GiB)": 77.56, "step": 71995, "token_acc": 0.5630498533724341, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.0847007411850393, "grad_norm": 5.883406639099121, "learning_rate": 3.2041504034043134e-05, "loss": 2.647998046875, "memory(GiB)": 77.56, "step": 72000, "token_acc": 0.5, "train_speed(iter/s)": 1.437802 }, { "epoch": 3.0847007411850393, "eval_loss": 2.357974052429199, "eval_runtime": 14.3486, "eval_samples_per_second": 6.969, "eval_steps_per_second": 6.969, "eval_token_acc": 0.48531139835487663, "step": 72000 }, { "epoch": 3.084914956514288, "grad_norm": 5.997870922088623, "learning_rate": 3.203522348674197e-05, "loss": 2.483399200439453, "memory(GiB)": 77.56, "step": 72005, "token_acc": 0.4853700516351119, "train_speed(iter/s)": 1.437363 }, { "epoch": 3.085129171843537, "grad_norm": 6.540722370147705, "learning_rate": 3.202894326488958e-05, "loss": 2.4166095733642576, "memory(GiB)": 77.56, "step": 72010, "token_acc": 0.5124555160142349, "train_speed(iter/s)": 1.437364 }, { "epoch": 3.0853433871727862, "grad_norm": 5.439075946807861, "learning_rate": 3.2022663368599736e-05, "loss": 2.591163444519043, "memory(GiB)": 77.56, "step": 72015, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.437352 }, { "epoch": 3.085557602502035, "grad_norm": 6.601009845733643, "learning_rate": 3.201638379798622e-05, "loss": 2.4263397216796876, "memory(GiB)": 77.56, "step": 72020, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.437342 }, { "epoch": 3.085771817831284, "grad_norm": 5.886809349060059, "learning_rate": 3.2010104553162765e-05, "loss": 2.284076118469238, "memory(GiB)": 77.56, "step": 72025, "token_acc": 0.48695652173913045, "train_speed(iter/s)": 1.437318 }, { "epoch": 3.085986033160533, "grad_norm": 6.877198696136475, "learning_rate": 3.200382563424314e-05, "loss": 2.06893253326416, "memory(GiB)": 77.56, "step": 72030, "token_acc": 0.5189873417721519, "train_speed(iter/s)": 1.437339 }, { "epoch": 3.086200248489782, "grad_norm": 4.7113356590271, "learning_rate": 3.1997547041341105e-05, "loss": 2.377621078491211, "memory(GiB)": 77.56, "step": 72035, "token_acc": 0.5098684210526315, "train_speed(iter/s)": 1.437345 }, { "epoch": 3.0864144638190307, "grad_norm": 6.559420108795166, "learning_rate": 3.1991268774570396e-05, "loss": 2.1753616333007812, "memory(GiB)": 77.56, "step": 72040, "token_acc": 0.5338345864661654, "train_speed(iter/s)": 1.437361 }, { "epoch": 3.08662867914828, "grad_norm": 5.55491828918457, "learning_rate": 3.1984990834044746e-05, "loss": 2.225050354003906, "memory(GiB)": 77.56, "step": 72045, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 1.437361 }, { "epoch": 3.086842894477529, "grad_norm": 4.979081153869629, "learning_rate": 3.1978713219877876e-05, "loss": 2.218865966796875, "memory(GiB)": 77.56, "step": 72050, "token_acc": 0.5252225519287834, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.0870571098067776, "grad_norm": 5.670287609100342, "learning_rate": 3.197243593218353e-05, "loss": 2.466337776184082, "memory(GiB)": 77.56, "step": 72055, "token_acc": 0.5330882352941176, "train_speed(iter/s)": 1.437348 }, { "epoch": 3.087271325136027, "grad_norm": 5.962737560272217, "learning_rate": 3.196615897107542e-05, "loss": 2.1909963607788088, "memory(GiB)": 77.56, "step": 72060, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.437361 }, { "epoch": 3.0874855404652757, "grad_norm": 7.104704856872559, "learning_rate": 3.1959882336667254e-05, "loss": 2.455039405822754, "memory(GiB)": 77.56, "step": 72065, "token_acc": 0.478134110787172, "train_speed(iter/s)": 1.437363 }, { "epoch": 3.0876997557945245, "grad_norm": 5.077696323394775, "learning_rate": 3.1953606029072754e-05, "loss": 2.294321632385254, "memory(GiB)": 77.56, "step": 72070, "token_acc": 0.48024316109422494, "train_speed(iter/s)": 1.437377 }, { "epoch": 3.0879139711237737, "grad_norm": 6.672941207885742, "learning_rate": 3.194733004840559e-05, "loss": 2.2407079696655274, "memory(GiB)": 77.56, "step": 72075, "token_acc": 0.5618729096989966, "train_speed(iter/s)": 1.437405 }, { "epoch": 3.0881281864530226, "grad_norm": 5.932276248931885, "learning_rate": 3.194105439477949e-05, "loss": 2.081951904296875, "memory(GiB)": 77.56, "step": 72080, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.437392 }, { "epoch": 3.0883424017822714, "grad_norm": 6.801368713378906, "learning_rate": 3.193477906830813e-05, "loss": 2.516109275817871, "memory(GiB)": 77.56, "step": 72085, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.437413 }, { "epoch": 3.0885566171115206, "grad_norm": 6.3427019119262695, "learning_rate": 3.192850406910519e-05, "loss": 2.098362922668457, "memory(GiB)": 77.56, "step": 72090, "token_acc": 0.5807692307692308, "train_speed(iter/s)": 1.437407 }, { "epoch": 3.0887708324407694, "grad_norm": 4.7134575843811035, "learning_rate": 3.1922229397284355e-05, "loss": 2.218771743774414, "memory(GiB)": 77.56, "step": 72095, "token_acc": 0.5423076923076923, "train_speed(iter/s)": 1.437404 }, { "epoch": 3.0889850477700183, "grad_norm": 5.925879001617432, "learning_rate": 3.191595505295927e-05, "loss": 2.6201122283935545, "memory(GiB)": 77.56, "step": 72100, "token_acc": 0.461038961038961, "train_speed(iter/s)": 1.4374 }, { "epoch": 3.0891992630992675, "grad_norm": 5.819891929626465, "learning_rate": 3.1909681036243644e-05, "loss": 2.661220741271973, "memory(GiB)": 77.56, "step": 72105, "token_acc": 0.4638888888888889, "train_speed(iter/s)": 1.437411 }, { "epoch": 3.0894134784285163, "grad_norm": 7.23334264755249, "learning_rate": 3.1903407347251114e-05, "loss": 2.608384895324707, "memory(GiB)": 77.56, "step": 72110, "token_acc": 0.45149253731343286, "train_speed(iter/s)": 1.437425 }, { "epoch": 3.089627693757765, "grad_norm": 4.465384006500244, "learning_rate": 3.189713398609534e-05, "loss": 2.405615234375, "memory(GiB)": 77.56, "step": 72115, "token_acc": 0.5049833887043189, "train_speed(iter/s)": 1.437421 }, { "epoch": 3.0898419090870144, "grad_norm": 5.09502649307251, "learning_rate": 3.1890860952889955e-05, "loss": 2.4041469573974608, "memory(GiB)": 77.56, "step": 72120, "token_acc": 0.4769736842105263, "train_speed(iter/s)": 1.437406 }, { "epoch": 3.090056124416263, "grad_norm": 5.880337715148926, "learning_rate": 3.188458824774862e-05, "loss": 2.31864013671875, "memory(GiB)": 77.56, "step": 72125, "token_acc": 0.5246376811594203, "train_speed(iter/s)": 1.437423 }, { "epoch": 3.090270339745512, "grad_norm": 7.496294021606445, "learning_rate": 3.1878315870784975e-05, "loss": 2.663106918334961, "memory(GiB)": 77.56, "step": 72130, "token_acc": 0.47249190938511326, "train_speed(iter/s)": 1.437416 }, { "epoch": 3.0904845550747613, "grad_norm": 6.645288467407227, "learning_rate": 3.187204382211262e-05, "loss": 2.5387157440185546, "memory(GiB)": 77.56, "step": 72135, "token_acc": 0.4584615384615385, "train_speed(iter/s)": 1.437427 }, { "epoch": 3.09069877040401, "grad_norm": 7.943534851074219, "learning_rate": 3.18657721018452e-05, "loss": 2.1895246505737305, "memory(GiB)": 77.56, "step": 72140, "token_acc": 0.49480968858131485, "train_speed(iter/s)": 1.43744 }, { "epoch": 3.090912985733259, "grad_norm": 5.230518817901611, "learning_rate": 3.185950071009633e-05, "loss": 2.2710304260253906, "memory(GiB)": 77.56, "step": 72145, "token_acc": 0.5492957746478874, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.091127201062508, "grad_norm": 6.006902694702148, "learning_rate": 3.185322964697963e-05, "loss": 2.5105504989624023, "memory(GiB)": 77.56, "step": 72150, "token_acc": 0.5, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.091341416391757, "grad_norm": 6.310667037963867, "learning_rate": 3.184695891260869e-05, "loss": 2.3670064926147463, "memory(GiB)": 77.56, "step": 72155, "token_acc": 0.48134328358208955, "train_speed(iter/s)": 1.437466 }, { "epoch": 3.091555631721006, "grad_norm": 6.4622344970703125, "learning_rate": 3.184068850709711e-05, "loss": 2.5994964599609376, "memory(GiB)": 77.56, "step": 72160, "token_acc": 0.4648318042813456, "train_speed(iter/s)": 1.437493 }, { "epoch": 3.091769847050255, "grad_norm": 5.36970329284668, "learning_rate": 3.183441843055851e-05, "loss": 2.4790550231933595, "memory(GiB)": 77.56, "step": 72165, "token_acc": 0.4985507246376812, "train_speed(iter/s)": 1.437533 }, { "epoch": 3.091984062379504, "grad_norm": 6.095882415771484, "learning_rate": 3.182814868310645e-05, "loss": 2.3345258712768553, "memory(GiB)": 77.56, "step": 72170, "token_acc": 0.5, "train_speed(iter/s)": 1.437552 }, { "epoch": 3.0921982777087527, "grad_norm": 7.8465657234191895, "learning_rate": 3.1821879264854514e-05, "loss": 2.7106595993041993, "memory(GiB)": 77.56, "step": 72175, "token_acc": 0.4707692307692308, "train_speed(iter/s)": 1.437541 }, { "epoch": 3.092412493038002, "grad_norm": 6.361175060272217, "learning_rate": 3.181561017591629e-05, "loss": 2.0068809509277346, "memory(GiB)": 77.56, "step": 72180, "token_acc": 0.5689655172413793, "train_speed(iter/s)": 1.437568 }, { "epoch": 3.0926267083672507, "grad_norm": 5.485069751739502, "learning_rate": 3.180934141640536e-05, "loss": 2.086277198791504, "memory(GiB)": 77.56, "step": 72185, "token_acc": 0.5551601423487544, "train_speed(iter/s)": 1.437593 }, { "epoch": 3.0928409236964995, "grad_norm": 5.531437873840332, "learning_rate": 3.1803072986435276e-05, "loss": 2.581049919128418, "memory(GiB)": 77.56, "step": 72190, "token_acc": 0.44668587896253603, "train_speed(iter/s)": 1.437598 }, { "epoch": 3.093055139025749, "grad_norm": 5.932785987854004, "learning_rate": 3.179680488611958e-05, "loss": 2.2052261352539064, "memory(GiB)": 77.56, "step": 72195, "token_acc": 0.5273311897106109, "train_speed(iter/s)": 1.437608 }, { "epoch": 3.0932693543549976, "grad_norm": 6.721252918243408, "learning_rate": 3.179053711557185e-05, "loss": 2.1347131729125977, "memory(GiB)": 77.56, "step": 72200, "token_acc": 0.5316901408450704, "train_speed(iter/s)": 1.437596 }, { "epoch": 3.0934835696842464, "grad_norm": 9.866572380065918, "learning_rate": 3.178426967490562e-05, "loss": 2.289981651306152, "memory(GiB)": 77.56, "step": 72205, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.437608 }, { "epoch": 3.0936977850134957, "grad_norm": 5.840310573577881, "learning_rate": 3.1778002564234435e-05, "loss": 2.4291318893432616, "memory(GiB)": 77.56, "step": 72210, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.437614 }, { "epoch": 3.0939120003427445, "grad_norm": 6.415073871612549, "learning_rate": 3.177173578367183e-05, "loss": 2.2293848037719726, "memory(GiB)": 77.56, "step": 72215, "token_acc": 0.5494505494505495, "train_speed(iter/s)": 1.437616 }, { "epoch": 3.0941262156719933, "grad_norm": 5.98455286026001, "learning_rate": 3.176546933333132e-05, "loss": 2.3406869888305666, "memory(GiB)": 77.56, "step": 72220, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.437571 }, { "epoch": 3.0943404310012426, "grad_norm": 6.014914512634277, "learning_rate": 3.175920321332644e-05, "loss": 2.2767578125, "memory(GiB)": 77.56, "step": 72225, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.437576 }, { "epoch": 3.0945546463304914, "grad_norm": 5.497958660125732, "learning_rate": 3.175293742377072e-05, "loss": 2.4478378295898438, "memory(GiB)": 77.56, "step": 72230, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.437583 }, { "epoch": 3.09476886165974, "grad_norm": 7.322943210601807, "learning_rate": 3.1746671964777635e-05, "loss": 2.2696950912475584, "memory(GiB)": 77.56, "step": 72235, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.437593 }, { "epoch": 3.0949830769889894, "grad_norm": 6.523723602294922, "learning_rate": 3.1740406836460724e-05, "loss": 2.157855415344238, "memory(GiB)": 77.56, "step": 72240, "token_acc": 0.5527426160337553, "train_speed(iter/s)": 1.437592 }, { "epoch": 3.0951972923182383, "grad_norm": 5.072347164154053, "learning_rate": 3.173414203893346e-05, "loss": 2.541507339477539, "memory(GiB)": 77.56, "step": 72245, "token_acc": 0.4897360703812317, "train_speed(iter/s)": 1.437588 }, { "epoch": 3.095411507647487, "grad_norm": 5.609076976776123, "learning_rate": 3.172787757230934e-05, "loss": 2.286670112609863, "memory(GiB)": 77.56, "step": 72250, "token_acc": 0.5284810126582279, "train_speed(iter/s)": 1.437599 }, { "epoch": 3.0956257229767363, "grad_norm": 4.251852989196777, "learning_rate": 3.172161343670188e-05, "loss": 2.4701719284057617, "memory(GiB)": 77.56, "step": 72255, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.437602 }, { "epoch": 3.095839938305985, "grad_norm": 7.1300153732299805, "learning_rate": 3.1715349632224536e-05, "loss": 2.5306621551513673, "memory(GiB)": 77.56, "step": 72260, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.437611 }, { "epoch": 3.096054153635234, "grad_norm": 4.470528602600098, "learning_rate": 3.1709086158990774e-05, "loss": 2.2525672912597656, "memory(GiB)": 77.56, "step": 72265, "token_acc": 0.5424836601307189, "train_speed(iter/s)": 1.43764 }, { "epoch": 3.096268368964483, "grad_norm": 5.3731231689453125, "learning_rate": 3.170282301711409e-05, "loss": 2.0743844985961912, "memory(GiB)": 77.56, "step": 72270, "token_acc": 0.5321428571428571, "train_speed(iter/s)": 1.437665 }, { "epoch": 3.096482584293732, "grad_norm": 6.206145286560059, "learning_rate": 3.169656020670793e-05, "loss": 2.354397201538086, "memory(GiB)": 77.56, "step": 72275, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.096696799622981, "grad_norm": 5.7696990966796875, "learning_rate": 3.1690297727885744e-05, "loss": 2.3854522705078125, "memory(GiB)": 77.56, "step": 72280, "token_acc": 0.4712230215827338, "train_speed(iter/s)": 1.43768 }, { "epoch": 3.09691101495223, "grad_norm": 5.036414623260498, "learning_rate": 3.1684035580761e-05, "loss": 2.2238574981689454, "memory(GiB)": 77.56, "step": 72285, "token_acc": 0.5213675213675214, "train_speed(iter/s)": 1.437683 }, { "epoch": 3.097125230281479, "grad_norm": 5.317901134490967, "learning_rate": 3.1677773765447116e-05, "loss": 2.3005258560180666, "memory(GiB)": 77.56, "step": 72290, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.437661 }, { "epoch": 3.0973394456107277, "grad_norm": 6.160380840301514, "learning_rate": 3.1671512282057556e-05, "loss": 2.5581039428710937, "memory(GiB)": 77.56, "step": 72295, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.437688 }, { "epoch": 3.097553660939977, "grad_norm": 6.025140762329102, "learning_rate": 3.166525113070575e-05, "loss": 2.6819684982299803, "memory(GiB)": 77.56, "step": 72300, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437699 }, { "epoch": 3.097767876269226, "grad_norm": 5.725876808166504, "learning_rate": 3.165899031150512e-05, "loss": 2.3693115234375, "memory(GiB)": 77.56, "step": 72305, "token_acc": 0.5275590551181102, "train_speed(iter/s)": 1.437716 }, { "epoch": 3.0979820915984746, "grad_norm": 9.523658752441406, "learning_rate": 3.165272982456908e-05, "loss": 2.3139448165893555, "memory(GiB)": 77.56, "step": 72310, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.437738 }, { "epoch": 3.098196306927724, "grad_norm": 6.289538860321045, "learning_rate": 3.164646967001106e-05, "loss": 2.5493627548217774, "memory(GiB)": 77.56, "step": 72315, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.437746 }, { "epoch": 3.0984105222569727, "grad_norm": 8.801753997802734, "learning_rate": 3.164020984794444e-05, "loss": 2.421823501586914, "memory(GiB)": 77.56, "step": 72320, "token_acc": 0.4707692307692308, "train_speed(iter/s)": 1.437754 }, { "epoch": 3.0986247375862215, "grad_norm": 5.483835220336914, "learning_rate": 3.163395035848263e-05, "loss": 2.1594799041748045, "memory(GiB)": 77.56, "step": 72325, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.0988389529154707, "grad_norm": 5.4879326820373535, "learning_rate": 3.1627691201739075e-05, "loss": 2.1753717422485352, "memory(GiB)": 77.56, "step": 72330, "token_acc": 0.546031746031746, "train_speed(iter/s)": 1.437772 }, { "epoch": 3.0990531682447195, "grad_norm": 5.669936180114746, "learning_rate": 3.162143237782711e-05, "loss": 2.225362205505371, "memory(GiB)": 77.56, "step": 72335, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 1.437785 }, { "epoch": 3.0992673835739684, "grad_norm": 7.326630115509033, "learning_rate": 3.161517388686014e-05, "loss": 2.4407577514648438, "memory(GiB)": 77.56, "step": 72340, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.437798 }, { "epoch": 3.0994815989032176, "grad_norm": 9.202301979064941, "learning_rate": 3.160891572895155e-05, "loss": 2.6768072128295897, "memory(GiB)": 77.56, "step": 72345, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.437784 }, { "epoch": 3.0996958142324664, "grad_norm": 5.857728481292725, "learning_rate": 3.160265790421469e-05, "loss": 2.355600929260254, "memory(GiB)": 77.56, "step": 72350, "token_acc": 0.5170068027210885, "train_speed(iter/s)": 1.437759 }, { "epoch": 3.0999100295617152, "grad_norm": 5.486622333526611, "learning_rate": 3.159640041276295e-05, "loss": 2.291430854797363, "memory(GiB)": 77.56, "step": 72355, "token_acc": 0.5191082802547771, "train_speed(iter/s)": 1.437758 }, { "epoch": 3.1001242448909645, "grad_norm": 5.8983049392700195, "learning_rate": 3.1590143254709684e-05, "loss": 2.4319149017333985, "memory(GiB)": 77.56, "step": 72360, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.437776 }, { "epoch": 3.1003384602202133, "grad_norm": 7.934996604919434, "learning_rate": 3.158388643016823e-05, "loss": 2.0736560821533203, "memory(GiB)": 77.56, "step": 72365, "token_acc": 0.5428571428571428, "train_speed(iter/s)": 1.437774 }, { "epoch": 3.100552675549462, "grad_norm": 4.799585342407227, "learning_rate": 3.157762993925196e-05, "loss": 2.261810302734375, "memory(GiB)": 77.56, "step": 72370, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.437769 }, { "epoch": 3.1007668908787114, "grad_norm": 5.184191703796387, "learning_rate": 3.15713737820742e-05, "loss": 2.720342254638672, "memory(GiB)": 77.56, "step": 72375, "token_acc": 0.43666666666666665, "train_speed(iter/s)": 1.437789 }, { "epoch": 3.10098110620796, "grad_norm": 6.596385478973389, "learning_rate": 3.156511795874829e-05, "loss": 2.0793190002441406, "memory(GiB)": 77.56, "step": 72380, "token_acc": 0.5517241379310345, "train_speed(iter/s)": 1.437773 }, { "epoch": 3.101195321537209, "grad_norm": 6.830258369445801, "learning_rate": 3.155886246938756e-05, "loss": 2.2441028594970702, "memory(GiB)": 77.56, "step": 72385, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.437779 }, { "epoch": 3.1014095368664583, "grad_norm": 4.584389686584473, "learning_rate": 3.155260731410534e-05, "loss": 2.112406539916992, "memory(GiB)": 77.56, "step": 72390, "token_acc": 0.5429553264604811, "train_speed(iter/s)": 1.437768 }, { "epoch": 3.101623752195707, "grad_norm": 5.585931301116943, "learning_rate": 3.1546352493014946e-05, "loss": 2.2400184631347657, "memory(GiB)": 77.56, "step": 72395, "token_acc": 0.4794007490636704, "train_speed(iter/s)": 1.437774 }, { "epoch": 3.101837967524956, "grad_norm": 6.651171684265137, "learning_rate": 3.154009800622967e-05, "loss": 2.450203704833984, "memory(GiB)": 77.56, "step": 72400, "token_acc": 0.5068027210884354, "train_speed(iter/s)": 1.437814 }, { "epoch": 3.102052182854205, "grad_norm": 6.673031330108643, "learning_rate": 3.1533843853862856e-05, "loss": 2.609384536743164, "memory(GiB)": 77.56, "step": 72405, "token_acc": 0.4380952380952381, "train_speed(iter/s)": 1.437843 }, { "epoch": 3.102266398183454, "grad_norm": 4.675049781799316, "learning_rate": 3.1527590036027766e-05, "loss": 2.400357246398926, "memory(GiB)": 77.56, "step": 72410, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.437847 }, { "epoch": 3.1024806135127028, "grad_norm": 5.692420482635498, "learning_rate": 3.152133655283773e-05, "loss": 1.7764366149902344, "memory(GiB)": 77.56, "step": 72415, "token_acc": 0.5944700460829493, "train_speed(iter/s)": 1.437832 }, { "epoch": 3.102694828841952, "grad_norm": 6.678676605224609, "learning_rate": 3.151508340440601e-05, "loss": 2.6026721954345704, "memory(GiB)": 77.56, "step": 72420, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.437827 }, { "epoch": 3.102909044171201, "grad_norm": 5.01537561416626, "learning_rate": 3.150883059084588e-05, "loss": 2.1439157485961915, "memory(GiB)": 77.56, "step": 72425, "token_acc": 0.5551470588235294, "train_speed(iter/s)": 1.43783 }, { "epoch": 3.1031232595004496, "grad_norm": 6.816878795623779, "learning_rate": 3.150257811227065e-05, "loss": 2.562957000732422, "memory(GiB)": 77.56, "step": 72430, "token_acc": 0.4716417910447761, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.103337474829699, "grad_norm": 5.693693161010742, "learning_rate": 3.149632596879356e-05, "loss": 2.0666225433349608, "memory(GiB)": 77.56, "step": 72435, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.437858 }, { "epoch": 3.1035516901589477, "grad_norm": 8.398011207580566, "learning_rate": 3.149007416052789e-05, "loss": 2.417411041259766, "memory(GiB)": 77.56, "step": 72440, "token_acc": 0.4612546125461255, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.1037659054881965, "grad_norm": 5.838447570800781, "learning_rate": 3.148382268758689e-05, "loss": 2.361836051940918, "memory(GiB)": 77.56, "step": 72445, "token_acc": 0.5050505050505051, "train_speed(iter/s)": 1.437816 }, { "epoch": 3.103980120817446, "grad_norm": 4.476272106170654, "learning_rate": 3.14775715500838e-05, "loss": 2.230912780761719, "memory(GiB)": 77.56, "step": 72450, "token_acc": 0.5582089552238806, "train_speed(iter/s)": 1.437803 }, { "epoch": 3.1041943361466946, "grad_norm": 5.37377405166626, "learning_rate": 3.147132074813189e-05, "loss": 2.399504280090332, "memory(GiB)": 77.56, "step": 72455, "token_acc": 0.5345345345345346, "train_speed(iter/s)": 1.437813 }, { "epoch": 3.1044085514759434, "grad_norm": 6.497664451599121, "learning_rate": 3.146507028184438e-05, "loss": 2.6256196975708006, "memory(GiB)": 77.56, "step": 72460, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.1046227668051927, "grad_norm": 6.548216819763184, "learning_rate": 3.1458820151334504e-05, "loss": 2.2182369232177734, "memory(GiB)": 77.56, "step": 72465, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.1048369821344415, "grad_norm": 5.119418621063232, "learning_rate": 3.145257035671548e-05, "loss": 2.126926040649414, "memory(GiB)": 77.56, "step": 72470, "token_acc": 0.5177304964539007, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.1050511974636903, "grad_norm": 6.199021339416504, "learning_rate": 3.144632089810057e-05, "loss": 2.2834930419921875, "memory(GiB)": 77.56, "step": 72475, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.437793 }, { "epoch": 3.1052654127929396, "grad_norm": 4.47617244720459, "learning_rate": 3.144007177560295e-05, "loss": 2.2830326080322267, "memory(GiB)": 77.56, "step": 72480, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.437822 }, { "epoch": 3.1054796281221884, "grad_norm": 5.179701328277588, "learning_rate": 3.143382298933585e-05, "loss": 2.3549930572509767, "memory(GiB)": 77.56, "step": 72485, "token_acc": 0.5174825174825175, "train_speed(iter/s)": 1.437812 }, { "epoch": 3.105693843451437, "grad_norm": 4.490260601043701, "learning_rate": 3.142757453941246e-05, "loss": 2.3018190383911135, "memory(GiB)": 77.56, "step": 72490, "token_acc": 0.5058823529411764, "train_speed(iter/s)": 1.437833 }, { "epoch": 3.1059080587806864, "grad_norm": 4.446118354797363, "learning_rate": 3.142132642594597e-05, "loss": 2.4987226486206056, "memory(GiB)": 77.56, "step": 72495, "token_acc": 0.5169230769230769, "train_speed(iter/s)": 1.437848 }, { "epoch": 3.1061222741099352, "grad_norm": 4.7859063148498535, "learning_rate": 3.141507864904959e-05, "loss": 2.839399719238281, "memory(GiB)": 77.56, "step": 72500, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437852 }, { "epoch": 3.1061222741099352, "eval_loss": 2.4001803398132324, "eval_runtime": 14.2707, "eval_samples_per_second": 7.007, "eval_steps_per_second": 7.007, "eval_token_acc": 0.4444444444444444, "step": 72500 }, { "epoch": 3.106336489439184, "grad_norm": 4.4844279289245605, "learning_rate": 3.1408831208836496e-05, "loss": 2.3198383331298826, "memory(GiB)": 77.56, "step": 72505, "token_acc": 0.4645030425963489, "train_speed(iter/s)": 1.437426 }, { "epoch": 3.1065507047684333, "grad_norm": 6.893157482147217, "learning_rate": 3.140258410541985e-05, "loss": 2.344624328613281, "memory(GiB)": 77.56, "step": 72510, "token_acc": 0.5080645161290323, "train_speed(iter/s)": 1.437419 }, { "epoch": 3.106764920097682, "grad_norm": 6.0269083976745605, "learning_rate": 3.139633733891285e-05, "loss": 2.666086196899414, "memory(GiB)": 77.56, "step": 72515, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.437405 }, { "epoch": 3.106979135426931, "grad_norm": 5.225742816925049, "learning_rate": 3.139009090942865e-05, "loss": 2.3564117431640623, "memory(GiB)": 77.56, "step": 72520, "token_acc": 0.47112462006079026, "train_speed(iter/s)": 1.437386 }, { "epoch": 3.10719335075618, "grad_norm": 6.708471298217773, "learning_rate": 3.138384481708041e-05, "loss": 2.374943161010742, "memory(GiB)": 77.56, "step": 72525, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.437378 }, { "epoch": 3.107407566085429, "grad_norm": 6.151369094848633, "learning_rate": 3.137759906198129e-05, "loss": 2.1034936904907227, "memory(GiB)": 77.56, "step": 72530, "token_acc": 0.5413793103448276, "train_speed(iter/s)": 1.437357 }, { "epoch": 3.107621781414678, "grad_norm": 7.223677635192871, "learning_rate": 3.137135364424442e-05, "loss": 2.3703739166259767, "memory(GiB)": 77.56, "step": 72535, "token_acc": 0.48255813953488375, "train_speed(iter/s)": 1.437362 }, { "epoch": 3.107835996743927, "grad_norm": 5.07366943359375, "learning_rate": 3.136510856398297e-05, "loss": 2.1987770080566404, "memory(GiB)": 77.56, "step": 72540, "token_acc": 0.5512367491166078, "train_speed(iter/s)": 1.437392 }, { "epoch": 3.108050212073176, "grad_norm": 5.923264503479004, "learning_rate": 3.135886382131003e-05, "loss": 2.3404064178466797, "memory(GiB)": 77.56, "step": 72545, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.437393 }, { "epoch": 3.1082644274024247, "grad_norm": 7.233532905578613, "learning_rate": 3.135261941633878e-05, "loss": 2.4130218505859373, "memory(GiB)": 77.56, "step": 72550, "token_acc": 0.49390243902439024, "train_speed(iter/s)": 1.437401 }, { "epoch": 3.108478642731674, "grad_norm": 4.881805896759033, "learning_rate": 3.134637534918231e-05, "loss": 1.9220348358154298, "memory(GiB)": 77.56, "step": 72555, "token_acc": 0.5313653136531366, "train_speed(iter/s)": 1.437387 }, { "epoch": 3.1086928580609228, "grad_norm": 6.463660717010498, "learning_rate": 3.134013161995377e-05, "loss": 2.2441144943237306, "memory(GiB)": 77.56, "step": 72560, "token_acc": 0.5016181229773463, "train_speed(iter/s)": 1.437401 }, { "epoch": 3.1089070733901716, "grad_norm": 5.045422077178955, "learning_rate": 3.133388822876624e-05, "loss": 2.158509063720703, "memory(GiB)": 77.56, "step": 72565, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.437406 }, { "epoch": 3.109121288719421, "grad_norm": 5.674724578857422, "learning_rate": 3.1327645175732826e-05, "loss": 2.216973876953125, "memory(GiB)": 77.56, "step": 72570, "token_acc": 0.5131578947368421, "train_speed(iter/s)": 1.437408 }, { "epoch": 3.1093355040486697, "grad_norm": 5.278008460998535, "learning_rate": 3.132140246096665e-05, "loss": 2.1971664428710938, "memory(GiB)": 77.56, "step": 72575, "token_acc": 0.5751633986928104, "train_speed(iter/s)": 1.437411 }, { "epoch": 3.1095497193779185, "grad_norm": 8.706609725952148, "learning_rate": 3.131516008458077e-05, "loss": 2.2271652221679688, "memory(GiB)": 77.56, "step": 72580, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 1.437427 }, { "epoch": 3.1097639347071677, "grad_norm": 6.113956928253174, "learning_rate": 3.130891804668832e-05, "loss": 2.478483963012695, "memory(GiB)": 77.56, "step": 72585, "token_acc": 0.4618320610687023, "train_speed(iter/s)": 1.437428 }, { "epoch": 3.1099781500364165, "grad_norm": 5.216123580932617, "learning_rate": 3.1302676347402346e-05, "loss": 2.4413789749145507, "memory(GiB)": 77.56, "step": 72590, "token_acc": 0.509090909090909, "train_speed(iter/s)": 1.437454 }, { "epoch": 3.1101923653656653, "grad_norm": 7.272138595581055, "learning_rate": 3.129643498683592e-05, "loss": 2.2538478851318358, "memory(GiB)": 77.56, "step": 72595, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.437459 }, { "epoch": 3.1104065806949146, "grad_norm": 5.713901042938232, "learning_rate": 3.129019396510212e-05, "loss": 2.3358409881591795, "memory(GiB)": 77.56, "step": 72600, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.437472 }, { "epoch": 3.1106207960241634, "grad_norm": 5.661256313323975, "learning_rate": 3.128395328231401e-05, "loss": 2.209878349304199, "memory(GiB)": 77.56, "step": 72605, "token_acc": 0.5504201680672269, "train_speed(iter/s)": 1.43751 }, { "epoch": 3.1108350113534122, "grad_norm": 5.26945686340332, "learning_rate": 3.1277712938584646e-05, "loss": 2.3268457412719727, "memory(GiB)": 77.56, "step": 72610, "token_acc": 0.5104602510460251, "train_speed(iter/s)": 1.437495 }, { "epoch": 3.1110492266826615, "grad_norm": 6.317492485046387, "learning_rate": 3.127147293402708e-05, "loss": 2.369083213806152, "memory(GiB)": 77.56, "step": 72615, "token_acc": 0.5290102389078498, "train_speed(iter/s)": 1.437492 }, { "epoch": 3.1112634420119103, "grad_norm": 5.80665922164917, "learning_rate": 3.126523326875432e-05, "loss": 2.2709383010864257, "memory(GiB)": 77.56, "step": 72620, "token_acc": 0.5103244837758112, "train_speed(iter/s)": 1.437476 }, { "epoch": 3.111477657341159, "grad_norm": 4.686379432678223, "learning_rate": 3.1258993942879456e-05, "loss": 2.4110038757324217, "memory(GiB)": 77.56, "step": 72625, "token_acc": 0.496875, "train_speed(iter/s)": 1.437484 }, { "epoch": 3.1116918726704084, "grad_norm": 5.836164951324463, "learning_rate": 3.125275495651551e-05, "loss": 2.7762956619262695, "memory(GiB)": 77.56, "step": 72630, "token_acc": 0.47648902821316613, "train_speed(iter/s)": 1.437499 }, { "epoch": 3.111906087999657, "grad_norm": 4.589163780212402, "learning_rate": 3.1246516309775484e-05, "loss": 2.175393486022949, "memory(GiB)": 77.56, "step": 72635, "token_acc": 0.5465587044534413, "train_speed(iter/s)": 1.437496 }, { "epoch": 3.112120303328906, "grad_norm": 4.759354114532471, "learning_rate": 3.12402780027724e-05, "loss": 2.202978324890137, "memory(GiB)": 77.56, "step": 72640, "token_acc": 0.5061349693251533, "train_speed(iter/s)": 1.437484 }, { "epoch": 3.1123345186581552, "grad_norm": 8.271427154541016, "learning_rate": 3.123404003561929e-05, "loss": 2.4508623123168944, "memory(GiB)": 77.56, "step": 72645, "token_acc": 0.5019305019305019, "train_speed(iter/s)": 1.437484 }, { "epoch": 3.112548733987404, "grad_norm": 7.143495559692383, "learning_rate": 3.122780240842915e-05, "loss": 2.3175933837890623, "memory(GiB)": 77.56, "step": 72650, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.437448 }, { "epoch": 3.112762949316653, "grad_norm": 7.061896324157715, "learning_rate": 3.122156512131497e-05, "loss": 2.5092161178588865, "memory(GiB)": 77.56, "step": 72655, "token_acc": 0.4919786096256685, "train_speed(iter/s)": 1.437447 }, { "epoch": 3.112977164645902, "grad_norm": 6.718698024749756, "learning_rate": 3.1215328174389754e-05, "loss": 2.2886322021484373, "memory(GiB)": 77.56, "step": 72660, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437454 }, { "epoch": 3.113191379975151, "grad_norm": 10.471782684326172, "learning_rate": 3.1209091567766484e-05, "loss": 1.9001348495483399, "memory(GiB)": 77.56, "step": 72665, "token_acc": 0.5791505791505791, "train_speed(iter/s)": 1.437464 }, { "epoch": 3.1134055953044, "grad_norm": 6.2345075607299805, "learning_rate": 3.120285530155816e-05, "loss": 2.3077360153198243, "memory(GiB)": 77.56, "step": 72670, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.437463 }, { "epoch": 3.113619810633649, "grad_norm": 7.0577392578125, "learning_rate": 3.1196619375877746e-05, "loss": 2.246171569824219, "memory(GiB)": 77.56, "step": 72675, "token_acc": 0.5352112676056338, "train_speed(iter/s)": 1.437471 }, { "epoch": 3.113834025962898, "grad_norm": 5.536130428314209, "learning_rate": 3.11903837908382e-05, "loss": 2.1795209884643554, "memory(GiB)": 77.56, "step": 72680, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.114048241292147, "grad_norm": 5.286458969116211, "learning_rate": 3.1184148546552505e-05, "loss": 2.627972221374512, "memory(GiB)": 77.56, "step": 72685, "token_acc": 0.4326923076923077, "train_speed(iter/s)": 1.43748 }, { "epoch": 3.114262456621396, "grad_norm": 5.607089042663574, "learning_rate": 3.117791364313361e-05, "loss": 2.264396095275879, "memory(GiB)": 77.56, "step": 72690, "token_acc": 0.5377049180327869, "train_speed(iter/s)": 1.437498 }, { "epoch": 3.1144766719506447, "grad_norm": 4.869533061981201, "learning_rate": 3.117167908069445e-05, "loss": 2.3718740463256838, "memory(GiB)": 77.56, "step": 72695, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.437484 }, { "epoch": 3.114690887279894, "grad_norm": 5.879380702972412, "learning_rate": 3.116544485934799e-05, "loss": 2.4864360809326174, "memory(GiB)": 77.56, "step": 72700, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.43748 }, { "epoch": 3.1149051026091428, "grad_norm": 9.330574989318848, "learning_rate": 3.115921097920718e-05, "loss": 2.2893978118896485, "memory(GiB)": 77.56, "step": 72705, "token_acc": 0.5418181818181819, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.1151193179383916, "grad_norm": 5.411746025085449, "learning_rate": 3.1152977440384927e-05, "loss": 2.3624856948852537, "memory(GiB)": 77.56, "step": 72710, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.115333533267641, "grad_norm": 6.132997989654541, "learning_rate": 3.114674424299416e-05, "loss": 2.4021377563476562, "memory(GiB)": 77.56, "step": 72715, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.1155477485968897, "grad_norm": 5.839630126953125, "learning_rate": 3.114051138714783e-05, "loss": 2.3691007614135744, "memory(GiB)": 77.56, "step": 72720, "token_acc": 0.5211726384364821, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.1157619639261385, "grad_norm": 5.101189136505127, "learning_rate": 3.1134278872958814e-05, "loss": 2.4583951950073244, "memory(GiB)": 77.56, "step": 72725, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.437435 }, { "epoch": 3.1159761792553877, "grad_norm": 7.1166768074035645, "learning_rate": 3.112804670054004e-05, "loss": 2.1673370361328126, "memory(GiB)": 77.56, "step": 72730, "token_acc": 0.5215686274509804, "train_speed(iter/s)": 1.43746 }, { "epoch": 3.1161903945846365, "grad_norm": 5.3363165855407715, "learning_rate": 3.1121814870004395e-05, "loss": 2.1296621322631837, "memory(GiB)": 77.56, "step": 72735, "token_acc": 0.5149501661129569, "train_speed(iter/s)": 1.437466 }, { "epoch": 3.1164046099138853, "grad_norm": 5.645517826080322, "learning_rate": 3.111558338146479e-05, "loss": 2.426052284240723, "memory(GiB)": 77.56, "step": 72740, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.437473 }, { "epoch": 3.1166188252431346, "grad_norm": 6.047229290008545, "learning_rate": 3.11093522350341e-05, "loss": 2.3883052825927735, "memory(GiB)": 77.56, "step": 72745, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.437485 }, { "epoch": 3.1168330405723834, "grad_norm": 5.909136772155762, "learning_rate": 3.1103121430825224e-05, "loss": 2.350218963623047, "memory(GiB)": 77.56, "step": 72750, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.437509 }, { "epoch": 3.1170472559016322, "grad_norm": 6.532894134521484, "learning_rate": 3.109689096895102e-05, "loss": 2.4577178955078125, "memory(GiB)": 77.56, "step": 72755, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.437487 }, { "epoch": 3.1172614712308815, "grad_norm": 7.467641353607178, "learning_rate": 3.109066084952438e-05, "loss": 2.582114028930664, "memory(GiB)": 77.56, "step": 72760, "token_acc": 0.46096654275092935, "train_speed(iter/s)": 1.437486 }, { "epoch": 3.1174756865601303, "grad_norm": 5.025373458862305, "learning_rate": 3.1084431072658135e-05, "loss": 2.70206356048584, "memory(GiB)": 77.56, "step": 72765, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.437471 }, { "epoch": 3.117689901889379, "grad_norm": 5.961118698120117, "learning_rate": 3.1078201638465164e-05, "loss": 2.2784103393554687, "memory(GiB)": 77.56, "step": 72770, "token_acc": 0.5473684210526316, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.1179041172186284, "grad_norm": 7.404531002044678, "learning_rate": 3.107197254705835e-05, "loss": 2.651513671875, "memory(GiB)": 77.56, "step": 72775, "token_acc": 0.47843137254901963, "train_speed(iter/s)": 1.437477 }, { "epoch": 3.118118332547877, "grad_norm": 5.981419563293457, "learning_rate": 3.1065743798550496e-05, "loss": 2.500725746154785, "memory(GiB)": 77.56, "step": 72780, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.118332547877126, "grad_norm": 6.0158772468566895, "learning_rate": 3.105951539305445e-05, "loss": 2.1025915145874023, "memory(GiB)": 77.56, "step": 72785, "token_acc": 0.5817490494296578, "train_speed(iter/s)": 1.437468 }, { "epoch": 3.1185467632063752, "grad_norm": 5.096309661865234, "learning_rate": 3.105328733068306e-05, "loss": 2.2628801345825194, "memory(GiB)": 77.56, "step": 72790, "token_acc": 0.5278969957081545, "train_speed(iter/s)": 1.437459 }, { "epoch": 3.118760978535624, "grad_norm": 5.582957744598389, "learning_rate": 3.104705961154914e-05, "loss": 2.148325729370117, "memory(GiB)": 77.56, "step": 72795, "token_acc": 0.5701754385964912, "train_speed(iter/s)": 1.437474 }, { "epoch": 3.118975193864873, "grad_norm": 4.9675445556640625, "learning_rate": 3.1040832235765506e-05, "loss": 2.074441909790039, "memory(GiB)": 77.56, "step": 72800, "token_acc": 0.558641975308642, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.119189409194122, "grad_norm": 4.737760066986084, "learning_rate": 3.103460520344499e-05, "loss": 2.133156585693359, "memory(GiB)": 77.56, "step": 72805, "token_acc": 0.5170068027210885, "train_speed(iter/s)": 1.437487 }, { "epoch": 3.119403624523371, "grad_norm": 8.512147903442383, "learning_rate": 3.1028378514700386e-05, "loss": 2.3238494873046873, "memory(GiB)": 77.56, "step": 72810, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.43752 }, { "epoch": 3.1196178398526198, "grad_norm": 7.502225875854492, "learning_rate": 3.1022152169644515e-05, "loss": 2.5177003860473635, "memory(GiB)": 77.56, "step": 72815, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.437525 }, { "epoch": 3.119832055181869, "grad_norm": 4.5646443367004395, "learning_rate": 3.101592616839015e-05, "loss": 2.361803436279297, "memory(GiB)": 77.56, "step": 72820, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.437504 }, { "epoch": 3.120046270511118, "grad_norm": 6.551790714263916, "learning_rate": 3.100970051105009e-05, "loss": 2.3915557861328125, "memory(GiB)": 77.56, "step": 72825, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.43748 }, { "epoch": 3.1202604858403666, "grad_norm": 7.009518623352051, "learning_rate": 3.1003475197737125e-05, "loss": 2.277404022216797, "memory(GiB)": 77.56, "step": 72830, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.437473 }, { "epoch": 3.120474701169616, "grad_norm": 5.483964920043945, "learning_rate": 3.0997250228564026e-05, "loss": 2.1274051666259766, "memory(GiB)": 77.56, "step": 72835, "token_acc": 0.5833333333333334, "train_speed(iter/s)": 1.43749 }, { "epoch": 3.1206889164988647, "grad_norm": 8.037076950073242, "learning_rate": 3.0991025603643556e-05, "loss": 2.237125015258789, "memory(GiB)": 77.56, "step": 72840, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 1.437492 }, { "epoch": 3.1209031318281135, "grad_norm": 5.772195339202881, "learning_rate": 3.0984801323088484e-05, "loss": 2.5056087493896486, "memory(GiB)": 77.56, "step": 72845, "token_acc": 0.47766323024054985, "train_speed(iter/s)": 1.437492 }, { "epoch": 3.1211173471573628, "grad_norm": 5.439091682434082, "learning_rate": 3.097857738701159e-05, "loss": 2.4180681228637697, "memory(GiB)": 77.56, "step": 72850, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437491 }, { "epoch": 3.1213315624866116, "grad_norm": 5.943234443664551, "learning_rate": 3.097235379552561e-05, "loss": 2.1390995025634765, "memory(GiB)": 77.56, "step": 72855, "token_acc": 0.5345454545454545, "train_speed(iter/s)": 1.437496 }, { "epoch": 3.1215457778158604, "grad_norm": 6.821101665496826, "learning_rate": 3.096613054874328e-05, "loss": 2.6187824249267577, "memory(GiB)": 77.56, "step": 72860, "token_acc": 0.4774774774774775, "train_speed(iter/s)": 1.437497 }, { "epoch": 3.1217599931451097, "grad_norm": 4.693805694580078, "learning_rate": 3.0959907646777364e-05, "loss": 2.488432502746582, "memory(GiB)": 77.56, "step": 72865, "token_acc": 0.5302491103202847, "train_speed(iter/s)": 1.437514 }, { "epoch": 3.1219742084743585, "grad_norm": 7.575918197631836, "learning_rate": 3.0953685089740566e-05, "loss": 2.244872283935547, "memory(GiB)": 77.56, "step": 72870, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.43752 }, { "epoch": 3.1221884238036073, "grad_norm": 9.932140350341797, "learning_rate": 3.094746287774564e-05, "loss": 2.512201499938965, "memory(GiB)": 77.56, "step": 72875, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.437541 }, { "epoch": 3.1224026391328565, "grad_norm": 6.540576934814453, "learning_rate": 3.09412410109053e-05, "loss": 2.538871002197266, "memory(GiB)": 77.56, "step": 72880, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.437505 }, { "epoch": 3.1226168544621054, "grad_norm": 5.2672929763793945, "learning_rate": 3.093501948933225e-05, "loss": 2.2062732696533205, "memory(GiB)": 77.56, "step": 72885, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.437508 }, { "epoch": 3.122831069791354, "grad_norm": 4.811594009399414, "learning_rate": 3.0928798313139206e-05, "loss": 2.2711551666259764, "memory(GiB)": 77.56, "step": 72890, "token_acc": 0.5252100840336135, "train_speed(iter/s)": 1.437529 }, { "epoch": 3.1230452851206034, "grad_norm": 6.194910049438477, "learning_rate": 3.092257748243888e-05, "loss": 2.4762619018554686, "memory(GiB)": 77.56, "step": 72895, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.437529 }, { "epoch": 3.1232595004498522, "grad_norm": 4.968112945556641, "learning_rate": 3.0916356997343945e-05, "loss": 2.158442497253418, "memory(GiB)": 77.56, "step": 72900, "token_acc": 0.5594855305466238, "train_speed(iter/s)": 1.437516 }, { "epoch": 3.123473715779101, "grad_norm": 6.127618789672852, "learning_rate": 3.091013685796712e-05, "loss": 2.200439453125, "memory(GiB)": 77.56, "step": 72905, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.437516 }, { "epoch": 3.1236879311083503, "grad_norm": 6.250693321228027, "learning_rate": 3.0903917064421064e-05, "loss": 2.0903013229370115, "memory(GiB)": 77.56, "step": 72910, "token_acc": 0.5682819383259912, "train_speed(iter/s)": 1.437529 }, { "epoch": 3.123902146437599, "grad_norm": 5.791238784790039, "learning_rate": 3.0897697616818444e-05, "loss": 2.281488037109375, "memory(GiB)": 77.56, "step": 72915, "token_acc": 0.4896142433234421, "train_speed(iter/s)": 1.437551 }, { "epoch": 3.124116361766848, "grad_norm": 7.567104339599609, "learning_rate": 3.0891478515271975e-05, "loss": 2.2617361068725588, "memory(GiB)": 77.56, "step": 72920, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.437564 }, { "epoch": 3.124330577096097, "grad_norm": 6.483534812927246, "learning_rate": 3.0885259759894306e-05, "loss": 2.3611730575561523, "memory(GiB)": 77.56, "step": 72925, "token_acc": 0.512280701754386, "train_speed(iter/s)": 1.43758 }, { "epoch": 3.124544792425346, "grad_norm": 5.5189948081970215, "learning_rate": 3.0879041350798064e-05, "loss": 2.5294239044189455, "memory(GiB)": 77.56, "step": 72930, "token_acc": 0.4751552795031056, "train_speed(iter/s)": 1.437596 }, { "epoch": 3.124759007754595, "grad_norm": 6.098874092102051, "learning_rate": 3.0872823288095946e-05, "loss": 2.204116439819336, "memory(GiB)": 77.56, "step": 72935, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.437591 }, { "epoch": 3.124973223083844, "grad_norm": 5.665869235992432, "learning_rate": 3.086660557190057e-05, "loss": 2.14300594329834, "memory(GiB)": 77.56, "step": 72940, "token_acc": 0.5244755244755245, "train_speed(iter/s)": 1.437601 }, { "epoch": 3.125187438413093, "grad_norm": 6.831770420074463, "learning_rate": 3.086038820232458e-05, "loss": 2.2865468978881838, "memory(GiB)": 77.56, "step": 72945, "token_acc": 0.5283018867924528, "train_speed(iter/s)": 1.437618 }, { "epoch": 3.1254016537423417, "grad_norm": 5.449192523956299, "learning_rate": 3.085417117948062e-05, "loss": 2.318807601928711, "memory(GiB)": 77.56, "step": 72950, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.437621 }, { "epoch": 3.125615869071591, "grad_norm": 5.110204696655273, "learning_rate": 3.08479545034813e-05, "loss": 2.0981908798217774, "memory(GiB)": 77.56, "step": 72955, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.437625 }, { "epoch": 3.1258300844008398, "grad_norm": 7.809922695159912, "learning_rate": 3.084173817443925e-05, "loss": 2.3389583587646485, "memory(GiB)": 77.56, "step": 72960, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.437635 }, { "epoch": 3.1260442997300886, "grad_norm": 6.2622833251953125, "learning_rate": 3.0835522192467095e-05, "loss": 1.8912019729614258, "memory(GiB)": 77.56, "step": 72965, "token_acc": 0.588, "train_speed(iter/s)": 1.437663 }, { "epoch": 3.126258515059338, "grad_norm": 5.646315574645996, "learning_rate": 3.082930655767742e-05, "loss": 2.479226493835449, "memory(GiB)": 77.56, "step": 72970, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.437689 }, { "epoch": 3.1264727303885866, "grad_norm": 5.672926902770996, "learning_rate": 3.082309127018285e-05, "loss": 2.086069107055664, "memory(GiB)": 77.56, "step": 72975, "token_acc": 0.5247524752475248, "train_speed(iter/s)": 1.437707 }, { "epoch": 3.1266869457178355, "grad_norm": 5.0447773933410645, "learning_rate": 3.081687633009598e-05, "loss": 2.194288444519043, "memory(GiB)": 77.56, "step": 72980, "token_acc": 0.48253968253968255, "train_speed(iter/s)": 1.437698 }, { "epoch": 3.1269011610470847, "grad_norm": 5.081331253051758, "learning_rate": 3.081066173752937e-05, "loss": 2.1215471267700194, "memory(GiB)": 77.56, "step": 72985, "token_acc": 0.547945205479452, "train_speed(iter/s)": 1.437713 }, { "epoch": 3.1271153763763335, "grad_norm": 5.446280002593994, "learning_rate": 3.080444749259561e-05, "loss": 2.436065673828125, "memory(GiB)": 77.56, "step": 72990, "token_acc": 0.48059701492537316, "train_speed(iter/s)": 1.437732 }, { "epoch": 3.1273295917055823, "grad_norm": 5.769184589385986, "learning_rate": 3.079823359540732e-05, "loss": 2.543348693847656, "memory(GiB)": 77.56, "step": 72995, "token_acc": 0.48, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.1275438070348316, "grad_norm": 5.120479106903076, "learning_rate": 3.079202004607704e-05, "loss": 2.1759628295898437, "memory(GiB)": 77.56, "step": 73000, "token_acc": 0.5576208178438662, "train_speed(iter/s)": 1.437735 }, { "epoch": 3.1275438070348316, "eval_loss": 2.2976233959198, "eval_runtime": 14.3954, "eval_samples_per_second": 6.947, "eval_steps_per_second": 6.947, "eval_token_acc": 0.44025157232704404, "step": 73000 }, { "epoch": 3.1277580223640804, "grad_norm": 4.880911827087402, "learning_rate": 3.0785806844717324e-05, "loss": 2.5289411544799805, "memory(GiB)": 77.56, "step": 73005, "token_acc": 0.45471349353049906, "train_speed(iter/s)": 1.437306 }, { "epoch": 3.127972237693329, "grad_norm": 7.991203784942627, "learning_rate": 3.077959399144075e-05, "loss": 2.3422672271728517, "memory(GiB)": 77.56, "step": 73010, "token_acc": 0.5078125, "train_speed(iter/s)": 1.437333 }, { "epoch": 3.1281864530225785, "grad_norm": 6.289660453796387, "learning_rate": 3.0773381486359854e-05, "loss": 2.2838260650634767, "memory(GiB)": 77.56, "step": 73015, "token_acc": 0.5243055555555556, "train_speed(iter/s)": 1.437308 }, { "epoch": 3.1284006683518273, "grad_norm": 7.014865875244141, "learning_rate": 3.0767169329587195e-05, "loss": 2.3660045623779298, "memory(GiB)": 77.56, "step": 73020, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.437309 }, { "epoch": 3.128614883681076, "grad_norm": 5.524535179138184, "learning_rate": 3.07609575212353e-05, "loss": 2.3686203002929687, "memory(GiB)": 77.56, "step": 73025, "token_acc": 0.46, "train_speed(iter/s)": 1.437309 }, { "epoch": 3.1288290990103254, "grad_norm": 6.617551326751709, "learning_rate": 3.0754746061416704e-05, "loss": 1.9826824188232421, "memory(GiB)": 77.56, "step": 73030, "token_acc": 0.5818815331010453, "train_speed(iter/s)": 1.437317 }, { "epoch": 3.129043314339574, "grad_norm": 6.423425674438477, "learning_rate": 3.074853495024395e-05, "loss": 2.0677026748657226, "memory(GiB)": 77.56, "step": 73035, "token_acc": 0.553921568627451, "train_speed(iter/s)": 1.437297 }, { "epoch": 3.129257529668823, "grad_norm": 4.993946552276611, "learning_rate": 3.074232418782954e-05, "loss": 2.353153419494629, "memory(GiB)": 77.56, "step": 73040, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.437312 }, { "epoch": 3.1294717449980722, "grad_norm": 5.287154197692871, "learning_rate": 3.0736113774285984e-05, "loss": 2.3713645935058594, "memory(GiB)": 77.56, "step": 73045, "token_acc": 0.4725609756097561, "train_speed(iter/s)": 1.437337 }, { "epoch": 3.129685960327321, "grad_norm": 5.195074558258057, "learning_rate": 3.07299037097258e-05, "loss": 2.340109443664551, "memory(GiB)": 77.56, "step": 73050, "token_acc": 0.512987012987013, "train_speed(iter/s)": 1.437341 }, { "epoch": 3.12990017565657, "grad_norm": 6.541009426116943, "learning_rate": 3.0723693994261496e-05, "loss": 2.3680057525634766, "memory(GiB)": 77.56, "step": 73055, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.130114390985819, "grad_norm": 5.066952705383301, "learning_rate": 3.0717484628005556e-05, "loss": 2.6939273834228517, "memory(GiB)": 77.56, "step": 73060, "token_acc": 0.43356643356643354, "train_speed(iter/s)": 1.437342 }, { "epoch": 3.130328606315068, "grad_norm": 6.5553297996521, "learning_rate": 3.071127561107044e-05, "loss": 2.595798873901367, "memory(GiB)": 77.56, "step": 73065, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.437333 }, { "epoch": 3.1305428216443167, "grad_norm": 6.919444561004639, "learning_rate": 3.07050669435687e-05, "loss": 2.5092817306518556, "memory(GiB)": 77.56, "step": 73070, "token_acc": 0.4743202416918429, "train_speed(iter/s)": 1.437317 }, { "epoch": 3.130757036973566, "grad_norm": 5.77463960647583, "learning_rate": 3.069885862561275e-05, "loss": 2.577484703063965, "memory(GiB)": 77.56, "step": 73075, "token_acc": 0.4768211920529801, "train_speed(iter/s)": 1.437327 }, { "epoch": 3.130971252302815, "grad_norm": 6.542135715484619, "learning_rate": 3.0692650657315106e-05, "loss": 2.0368463516235353, "memory(GiB)": 77.56, "step": 73080, "token_acc": 0.5256410256410257, "train_speed(iter/s)": 1.43733 }, { "epoch": 3.1311854676320636, "grad_norm": 5.323869705200195, "learning_rate": 3.0686443038788196e-05, "loss": 2.0392196655273436, "memory(GiB)": 77.56, "step": 73085, "token_acc": 0.5373134328358209, "train_speed(iter/s)": 1.43733 }, { "epoch": 3.131399682961313, "grad_norm": 8.747483253479004, "learning_rate": 3.0680235770144474e-05, "loss": 2.5047634124755858, "memory(GiB)": 77.56, "step": 73090, "token_acc": 0.46905537459283386, "train_speed(iter/s)": 1.437352 }, { "epoch": 3.1316138982905617, "grad_norm": 8.253219604492188, "learning_rate": 3.067402885149642e-05, "loss": 2.1983312606811523, "memory(GiB)": 77.56, "step": 73095, "token_acc": 0.528169014084507, "train_speed(iter/s)": 1.437373 }, { "epoch": 3.1318281136198105, "grad_norm": 5.186071872711182, "learning_rate": 3.066782228295645e-05, "loss": 2.43719596862793, "memory(GiB)": 77.56, "step": 73100, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.437397 }, { "epoch": 3.1320423289490598, "grad_norm": 6.122367858886719, "learning_rate": 3.0661616064637025e-05, "loss": 2.507410430908203, "memory(GiB)": 77.56, "step": 73105, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.437413 }, { "epoch": 3.1322565442783086, "grad_norm": 5.829648494720459, "learning_rate": 3.065541019665057e-05, "loss": 2.292179489135742, "memory(GiB)": 77.56, "step": 73110, "token_acc": 0.4935897435897436, "train_speed(iter/s)": 1.437428 }, { "epoch": 3.1324707596075574, "grad_norm": 5.905495643615723, "learning_rate": 3.0649204679109494e-05, "loss": 2.0784915924072265, "memory(GiB)": 77.56, "step": 73115, "token_acc": 0.5633333333333334, "train_speed(iter/s)": 1.437431 }, { "epoch": 3.1326849749368066, "grad_norm": 4.417581558227539, "learning_rate": 3.064299951212624e-05, "loss": 2.420480728149414, "memory(GiB)": 77.56, "step": 73120, "token_acc": 0.4813664596273292, "train_speed(iter/s)": 1.437417 }, { "epoch": 3.1328991902660555, "grad_norm": 7.090388298034668, "learning_rate": 3.06367946958132e-05, "loss": 2.0409313201904298, "memory(GiB)": 77.56, "step": 73125, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 1.437408 }, { "epoch": 3.1331134055953043, "grad_norm": 5.905135631561279, "learning_rate": 3.0630590230282783e-05, "loss": 2.4637382507324217, "memory(GiB)": 77.56, "step": 73130, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.437429 }, { "epoch": 3.1333276209245535, "grad_norm": 5.71701192855835, "learning_rate": 3.0624386115647386e-05, "loss": 2.332402801513672, "memory(GiB)": 77.56, "step": 73135, "token_acc": 0.5057803468208093, "train_speed(iter/s)": 1.437405 }, { "epoch": 3.1335418362538023, "grad_norm": 5.1474127769470215, "learning_rate": 3.0618182352019434e-05, "loss": 2.070578384399414, "memory(GiB)": 77.56, "step": 73140, "token_acc": 0.541095890410959, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.133756051583051, "grad_norm": 6.234938621520996, "learning_rate": 3.061197893951128e-05, "loss": 2.4143468856811525, "memory(GiB)": 77.56, "step": 73145, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.437348 }, { "epoch": 3.1339702669123004, "grad_norm": 8.508365631103516, "learning_rate": 3.060577587823531e-05, "loss": 2.4400894165039064, "memory(GiB)": 77.56, "step": 73150, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.43735 }, { "epoch": 3.134184482241549, "grad_norm": 6.250735282897949, "learning_rate": 3.059957316830391e-05, "loss": 2.329109954833984, "memory(GiB)": 77.56, "step": 73155, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.437355 }, { "epoch": 3.134398697570798, "grad_norm": 6.181001663208008, "learning_rate": 3.059337080982943e-05, "loss": 2.4302394866943358, "memory(GiB)": 77.56, "step": 73160, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.43733 }, { "epoch": 3.1346129129000473, "grad_norm": 6.349664211273193, "learning_rate": 3.0587168802924266e-05, "loss": 2.741741180419922, "memory(GiB)": 77.56, "step": 73165, "token_acc": 0.44745762711864406, "train_speed(iter/s)": 1.437327 }, { "epoch": 3.134827128229296, "grad_norm": 4.164112567901611, "learning_rate": 3.058096714770074e-05, "loss": 2.390481376647949, "memory(GiB)": 77.56, "step": 73170, "token_acc": 0.5347432024169184, "train_speed(iter/s)": 1.437338 }, { "epoch": 3.135041343558545, "grad_norm": 5.895599842071533, "learning_rate": 3.05747658442712e-05, "loss": 2.263382911682129, "memory(GiB)": 77.56, "step": 73175, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.437332 }, { "epoch": 3.135255558887794, "grad_norm": 7.887008190155029, "learning_rate": 3.0568564892748005e-05, "loss": 2.222603988647461, "memory(GiB)": 77.56, "step": 73180, "token_acc": 0.5344262295081967, "train_speed(iter/s)": 1.437341 }, { "epoch": 3.135469774217043, "grad_norm": 6.08693790435791, "learning_rate": 3.0562364293243496e-05, "loss": 2.2647979736328123, "memory(GiB)": 77.56, "step": 73185, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.437329 }, { "epoch": 3.135683989546292, "grad_norm": 5.291981220245361, "learning_rate": 3.055616404586998e-05, "loss": 2.1242427825927734, "memory(GiB)": 77.56, "step": 73190, "token_acc": 0.5610687022900763, "train_speed(iter/s)": 1.437334 }, { "epoch": 3.135898204875541, "grad_norm": 6.976221084594727, "learning_rate": 3.05499641507398e-05, "loss": 2.2210906982421874, "memory(GiB)": 77.56, "step": 73195, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.437309 }, { "epoch": 3.13611242020479, "grad_norm": 7.151931285858154, "learning_rate": 3.0543764607965256e-05, "loss": 2.284947967529297, "memory(GiB)": 77.56, "step": 73200, "token_acc": 0.5144694533762058, "train_speed(iter/s)": 1.437295 }, { "epoch": 3.1363266355340387, "grad_norm": 5.4472737312316895, "learning_rate": 3.053756541765867e-05, "loss": 2.198176383972168, "memory(GiB)": 77.56, "step": 73205, "token_acc": 0.5524691358024691, "train_speed(iter/s)": 1.437302 }, { "epoch": 3.136540850863288, "grad_norm": 4.815734386444092, "learning_rate": 3.053136657993233e-05, "loss": 2.405898666381836, "memory(GiB)": 77.56, "step": 73210, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.437311 }, { "epoch": 3.1367550661925367, "grad_norm": 5.370924472808838, "learning_rate": 3.0525168094898566e-05, "loss": 2.2557302474975587, "memory(GiB)": 77.56, "step": 73215, "token_acc": 0.5105105105105106, "train_speed(iter/s)": 1.437327 }, { "epoch": 3.1369692815217856, "grad_norm": 4.947432994842529, "learning_rate": 3.051896996266964e-05, "loss": 2.406699371337891, "memory(GiB)": 77.56, "step": 73220, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.437356 }, { "epoch": 3.137183496851035, "grad_norm": 5.838931083679199, "learning_rate": 3.0512772183357856e-05, "loss": 2.4366416931152344, "memory(GiB)": 77.56, "step": 73225, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.437358 }, { "epoch": 3.1373977121802836, "grad_norm": 6.927596569061279, "learning_rate": 3.050657475707549e-05, "loss": 2.6281930923461916, "memory(GiB)": 77.56, "step": 73230, "token_acc": 0.49538461538461537, "train_speed(iter/s)": 1.437373 }, { "epoch": 3.1376119275095324, "grad_norm": 4.943711757659912, "learning_rate": 3.0500377683934796e-05, "loss": 2.364436721801758, "memory(GiB)": 77.56, "step": 73235, "token_acc": 0.5149501661129569, "train_speed(iter/s)": 1.437394 }, { "epoch": 3.1378261428387817, "grad_norm": 5.119663238525391, "learning_rate": 3.0494180964048057e-05, "loss": 2.578693962097168, "memory(GiB)": 77.56, "step": 73240, "token_acc": 0.49, "train_speed(iter/s)": 1.437432 }, { "epoch": 3.1380403581680305, "grad_norm": 5.716797351837158, "learning_rate": 3.048798459752753e-05, "loss": 2.2608469009399412, "memory(GiB)": 77.56, "step": 73245, "token_acc": 0.5413533834586466, "train_speed(iter/s)": 1.437414 }, { "epoch": 3.1382545734972793, "grad_norm": 5.7616286277771, "learning_rate": 3.0481788584485467e-05, "loss": 2.371569061279297, "memory(GiB)": 77.56, "step": 73250, "token_acc": 0.5224358974358975, "train_speed(iter/s)": 1.437424 }, { "epoch": 3.1384687888265286, "grad_norm": 6.170180320739746, "learning_rate": 3.0475592925034112e-05, "loss": 2.1868541717529295, "memory(GiB)": 77.56, "step": 73255, "token_acc": 0.5432835820895522, "train_speed(iter/s)": 1.437418 }, { "epoch": 3.1386830041557774, "grad_norm": 5.0818772315979, "learning_rate": 3.04693976192857e-05, "loss": 2.362918663024902, "memory(GiB)": 77.56, "step": 73260, "token_acc": 0.46546546546546547, "train_speed(iter/s)": 1.437409 }, { "epoch": 3.138897219485026, "grad_norm": 5.2998576164245605, "learning_rate": 3.0463202667352487e-05, "loss": 2.3311336517333983, "memory(GiB)": 77.56, "step": 73265, "token_acc": 0.5540540540540541, "train_speed(iter/s)": 1.437433 }, { "epoch": 3.1391114348142755, "grad_norm": 5.516914367675781, "learning_rate": 3.0457008069346677e-05, "loss": 2.358369255065918, "memory(GiB)": 77.56, "step": 73270, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.437444 }, { "epoch": 3.1393256501435243, "grad_norm": 6.1014628410339355, "learning_rate": 3.0450813825380487e-05, "loss": 2.443255805969238, "memory(GiB)": 77.56, "step": 73275, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.437447 }, { "epoch": 3.139539865472773, "grad_norm": 5.917468547821045, "learning_rate": 3.044461993556616e-05, "loss": 2.254629135131836, "memory(GiB)": 77.56, "step": 73280, "token_acc": 0.49609375, "train_speed(iter/s)": 1.437458 }, { "epoch": 3.1397540808020223, "grad_norm": 4.6785888671875, "learning_rate": 3.0438426400015864e-05, "loss": 2.2239089965820313, "memory(GiB)": 77.56, "step": 73285, "token_acc": 0.5254777070063694, "train_speed(iter/s)": 1.437484 }, { "epoch": 3.139968296131271, "grad_norm": 8.927579879760742, "learning_rate": 3.0432233218841845e-05, "loss": 2.566057014465332, "memory(GiB)": 77.56, "step": 73290, "token_acc": 0.49375, "train_speed(iter/s)": 1.437517 }, { "epoch": 3.14018251146052, "grad_norm": 5.384395122528076, "learning_rate": 3.0426040392156264e-05, "loss": 2.16689453125, "memory(GiB)": 77.56, "step": 73295, "token_acc": 0.5610561056105611, "train_speed(iter/s)": 1.437538 }, { "epoch": 3.140396726789769, "grad_norm": 7.069342136383057, "learning_rate": 3.0419847920071333e-05, "loss": 2.1762584686279296, "memory(GiB)": 77.56, "step": 73300, "token_acc": 0.5657894736842105, "train_speed(iter/s)": 1.437548 }, { "epoch": 3.140610942119018, "grad_norm": 7.190010070800781, "learning_rate": 3.0413655802699216e-05, "loss": 2.361526107788086, "memory(GiB)": 77.56, "step": 73305, "token_acc": 0.5030674846625767, "train_speed(iter/s)": 1.437556 }, { "epoch": 3.140825157448267, "grad_norm": 5.127769470214844, "learning_rate": 3.040746404015211e-05, "loss": 2.405225372314453, "memory(GiB)": 77.56, "step": 73310, "token_acc": 0.5045592705167173, "train_speed(iter/s)": 1.437561 }, { "epoch": 3.141039372777516, "grad_norm": 6.798062324523926, "learning_rate": 3.0401272632542172e-05, "loss": 2.537810516357422, "memory(GiB)": 77.56, "step": 73315, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.437549 }, { "epoch": 3.141253588106765, "grad_norm": 5.879473686218262, "learning_rate": 3.0395081579981556e-05, "loss": 2.3737668991088867, "memory(GiB)": 77.56, "step": 73320, "token_acc": 0.4418604651162791, "train_speed(iter/s)": 1.43756 }, { "epoch": 3.1414678034360137, "grad_norm": 5.051718711853027, "learning_rate": 3.038889088258243e-05, "loss": 2.57739200592041, "memory(GiB)": 77.56, "step": 73325, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437587 }, { "epoch": 3.141682018765263, "grad_norm": 7.146501064300537, "learning_rate": 3.0382700540456954e-05, "loss": 2.003976058959961, "memory(GiB)": 77.56, "step": 73330, "token_acc": 0.5789473684210527, "train_speed(iter/s)": 1.437572 }, { "epoch": 3.141896234094512, "grad_norm": 7.560466766357422, "learning_rate": 3.0376510553717242e-05, "loss": 2.1586069107055663, "memory(GiB)": 77.56, "step": 73335, "token_acc": 0.5400696864111498, "train_speed(iter/s)": 1.437581 }, { "epoch": 3.1421104494237606, "grad_norm": 4.7953290939331055, "learning_rate": 3.0370320922475466e-05, "loss": 1.9031032562255858, "memory(GiB)": 77.56, "step": 73340, "token_acc": 0.5912162162162162, "train_speed(iter/s)": 1.437596 }, { "epoch": 3.14232466475301, "grad_norm": 7.270368576049805, "learning_rate": 3.0364131646843725e-05, "loss": 2.426093101501465, "memory(GiB)": 77.56, "step": 73345, "token_acc": 0.4714828897338403, "train_speed(iter/s)": 1.437625 }, { "epoch": 3.1425388800822587, "grad_norm": 5.501646995544434, "learning_rate": 3.0357942726934173e-05, "loss": 2.221392822265625, "memory(GiB)": 77.56, "step": 73350, "token_acc": 0.5462555066079295, "train_speed(iter/s)": 1.437625 }, { "epoch": 3.1427530954115075, "grad_norm": 6.587131023406982, "learning_rate": 3.0351754162858913e-05, "loss": 2.3624759674072267, "memory(GiB)": 77.56, "step": 73355, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.437633 }, { "epoch": 3.1429673107407567, "grad_norm": 5.908666133880615, "learning_rate": 3.0345565954730036e-05, "loss": 2.521999168395996, "memory(GiB)": 77.56, "step": 73360, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.437656 }, { "epoch": 3.1431815260700056, "grad_norm": 5.3235602378845215, "learning_rate": 3.0339378102659678e-05, "loss": 2.5466480255126953, "memory(GiB)": 77.56, "step": 73365, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.437652 }, { "epoch": 3.1433957413992544, "grad_norm": 4.571225643157959, "learning_rate": 3.033319060675994e-05, "loss": 2.1431861877441407, "memory(GiB)": 77.56, "step": 73370, "token_acc": 0.5582329317269076, "train_speed(iter/s)": 1.437652 }, { "epoch": 3.1436099567285036, "grad_norm": 5.0783538818359375, "learning_rate": 3.0327003467142907e-05, "loss": 2.2377185821533203, "memory(GiB)": 77.56, "step": 73375, "token_acc": 0.511049723756906, "train_speed(iter/s)": 1.437651 }, { "epoch": 3.1438241720577524, "grad_norm": 6.018552303314209, "learning_rate": 3.032081668392065e-05, "loss": 2.31693058013916, "memory(GiB)": 77.56, "step": 73380, "token_acc": 0.4983388704318937, "train_speed(iter/s)": 1.437627 }, { "epoch": 3.1440383873870013, "grad_norm": 7.480686187744141, "learning_rate": 3.0314630257205266e-05, "loss": 2.0678888320922852, "memory(GiB)": 77.56, "step": 73385, "token_acc": 0.5419847328244275, "train_speed(iter/s)": 1.437621 }, { "epoch": 3.1442526027162505, "grad_norm": 5.459778785705566, "learning_rate": 3.030844418710881e-05, "loss": 2.4423583984375, "memory(GiB)": 77.56, "step": 73390, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.437626 }, { "epoch": 3.1444668180454993, "grad_norm": 5.258876323699951, "learning_rate": 3.0302258473743374e-05, "loss": 2.2689821243286135, "memory(GiB)": 77.56, "step": 73395, "token_acc": 0.5302491103202847, "train_speed(iter/s)": 1.437639 }, { "epoch": 3.144681033374748, "grad_norm": 5.106011867523193, "learning_rate": 3.0296073117221004e-05, "loss": 2.3563161849975587, "memory(GiB)": 77.56, "step": 73400, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.437626 }, { "epoch": 3.1448952487039974, "grad_norm": 9.77120590209961, "learning_rate": 3.028988811765374e-05, "loss": 2.4008644104003904, "memory(GiB)": 77.56, "step": 73405, "token_acc": 0.5482625482625483, "train_speed(iter/s)": 1.437645 }, { "epoch": 3.145109464033246, "grad_norm": 5.13966703414917, "learning_rate": 3.0283703475153656e-05, "loss": 2.526101303100586, "memory(GiB)": 77.56, "step": 73410, "token_acc": 0.48476454293628807, "train_speed(iter/s)": 1.437627 }, { "epoch": 3.145323679362495, "grad_norm": 6.0968756675720215, "learning_rate": 3.0277519189832777e-05, "loss": 2.543075180053711, "memory(GiB)": 77.56, "step": 73415, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.437633 }, { "epoch": 3.1455378946917443, "grad_norm": 6.282031059265137, "learning_rate": 3.0271335261803124e-05, "loss": 2.39583797454834, "memory(GiB)": 77.56, "step": 73420, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 1.437639 }, { "epoch": 3.145752110020993, "grad_norm": 5.595254421234131, "learning_rate": 3.0265151691176756e-05, "loss": 2.2169618606567383, "memory(GiB)": 77.56, "step": 73425, "token_acc": 0.5284280936454849, "train_speed(iter/s)": 1.437647 }, { "epoch": 3.145966325350242, "grad_norm": 6.173315048217773, "learning_rate": 3.0258968478065665e-05, "loss": 2.4594953536987303, "memory(GiB)": 77.56, "step": 73430, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.437666 }, { "epoch": 3.146180540679491, "grad_norm": 5.420690059661865, "learning_rate": 3.025278562258188e-05, "loss": 2.7817768096923827, "memory(GiB)": 77.56, "step": 73435, "token_acc": 0.46875, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.14639475600874, "grad_norm": 4.7022199630737305, "learning_rate": 3.0246603124837414e-05, "loss": 2.3445222854614256, "memory(GiB)": 77.56, "step": 73440, "token_acc": 0.49079754601226994, "train_speed(iter/s)": 1.437621 }, { "epoch": 3.146608971337989, "grad_norm": 5.661627769470215, "learning_rate": 3.024042098494426e-05, "loss": 2.210631561279297, "memory(GiB)": 77.56, "step": 73445, "token_acc": 0.5126050420168067, "train_speed(iter/s)": 1.437629 }, { "epoch": 3.146823186667238, "grad_norm": 4.719162464141846, "learning_rate": 3.0234239203014413e-05, "loss": 2.2837533950805664, "memory(GiB)": 77.56, "step": 73450, "token_acc": 0.5320754716981132, "train_speed(iter/s)": 1.437655 }, { "epoch": 3.147037401996487, "grad_norm": 5.719695568084717, "learning_rate": 3.022805777915988e-05, "loss": 2.3076456069946287, "memory(GiB)": 77.56, "step": 73455, "token_acc": 0.5110410094637224, "train_speed(iter/s)": 1.437654 }, { "epoch": 3.1472516173257357, "grad_norm": 5.263019561767578, "learning_rate": 3.022187671349262e-05, "loss": 2.395330619812012, "memory(GiB)": 77.56, "step": 73460, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.437643 }, { "epoch": 3.147465832654985, "grad_norm": 5.4647040367126465, "learning_rate": 3.0215696006124606e-05, "loss": 2.2821044921875, "memory(GiB)": 77.56, "step": 73465, "token_acc": 0.5160349854227405, "train_speed(iter/s)": 1.43766 }, { "epoch": 3.1476800479842337, "grad_norm": 6.213204383850098, "learning_rate": 3.020951565716783e-05, "loss": 2.476725959777832, "memory(GiB)": 77.56, "step": 73470, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437666 }, { "epoch": 3.1478942633134825, "grad_norm": 5.980068206787109, "learning_rate": 3.0203335666734244e-05, "loss": 2.4718374252319335, "memory(GiB)": 77.56, "step": 73475, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.437684 }, { "epoch": 3.148108478642732, "grad_norm": 7.020590305328369, "learning_rate": 3.0197156034935792e-05, "loss": 2.5662263870239257, "memory(GiB)": 77.56, "step": 73480, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.437686 }, { "epoch": 3.1483226939719806, "grad_norm": 5.213085174560547, "learning_rate": 3.019097676188445e-05, "loss": 2.4975818634033202, "memory(GiB)": 77.56, "step": 73485, "token_acc": 0.44477611940298506, "train_speed(iter/s)": 1.437702 }, { "epoch": 3.1485369093012294, "grad_norm": 5.716728210449219, "learning_rate": 3.0184797847692126e-05, "loss": 2.3468257904052736, "memory(GiB)": 77.56, "step": 73490, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.437691 }, { "epoch": 3.1487511246304787, "grad_norm": 4.673936367034912, "learning_rate": 3.01786192924708e-05, "loss": 2.34005126953125, "memory(GiB)": 77.56, "step": 73495, "token_acc": 0.5247524752475248, "train_speed(iter/s)": 1.4377 }, { "epoch": 3.1489653399597275, "grad_norm": 4.815449237823486, "learning_rate": 3.017244109633237e-05, "loss": 2.2413490295410154, "memory(GiB)": 77.56, "step": 73500, "token_acc": 0.5307443365695793, "train_speed(iter/s)": 1.43771 }, { "epoch": 3.1489653399597275, "eval_loss": 2.373683214187622, "eval_runtime": 13.9378, "eval_samples_per_second": 7.175, "eval_steps_per_second": 7.175, "eval_token_acc": 0.4917808219178082, "step": 73500 }, { "epoch": 3.1491795552889763, "grad_norm": 7.076644420623779, "learning_rate": 3.016626325938875e-05, "loss": 2.581821060180664, "memory(GiB)": 77.56, "step": 73505, "token_acc": 0.49080348499515974, "train_speed(iter/s)": 1.437291 }, { "epoch": 3.1493937706182256, "grad_norm": 5.052487850189209, "learning_rate": 3.016008578175189e-05, "loss": 2.4996145248413084, "memory(GiB)": 77.56, "step": 73510, "token_acc": 0.4747191011235955, "train_speed(iter/s)": 1.437282 }, { "epoch": 3.1496079859474744, "grad_norm": 6.75240421295166, "learning_rate": 3.0153908663533693e-05, "loss": 1.9786026000976562, "memory(GiB)": 77.56, "step": 73515, "token_acc": 0.5919117647058824, "train_speed(iter/s)": 1.437285 }, { "epoch": 3.149822201276723, "grad_norm": 9.193188667297363, "learning_rate": 3.0147731904846067e-05, "loss": 2.526492881774902, "memory(GiB)": 77.56, "step": 73520, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.437287 }, { "epoch": 3.1500364166059724, "grad_norm": 7.227169036865234, "learning_rate": 3.014155550580088e-05, "loss": 2.2694686889648437, "memory(GiB)": 77.56, "step": 73525, "token_acc": 0.5346153846153846, "train_speed(iter/s)": 1.437286 }, { "epoch": 3.1502506319352213, "grad_norm": 4.513391017913818, "learning_rate": 3.0135379466510062e-05, "loss": 2.4309270858764647, "memory(GiB)": 77.56, "step": 73530, "token_acc": 0.47794117647058826, "train_speed(iter/s)": 1.437296 }, { "epoch": 3.15046484726447, "grad_norm": 7.7047200202941895, "learning_rate": 3.012920378708546e-05, "loss": 2.283696174621582, "memory(GiB)": 77.56, "step": 73535, "token_acc": 0.5169230769230769, "train_speed(iter/s)": 1.437304 }, { "epoch": 3.1506790625937193, "grad_norm": 5.666827201843262, "learning_rate": 3.0123028467638992e-05, "loss": 2.3855566024780273, "memory(GiB)": 77.56, "step": 73540, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437302 }, { "epoch": 3.150893277922968, "grad_norm": 6.268438339233398, "learning_rate": 3.01168535082825e-05, "loss": 2.542085647583008, "memory(GiB)": 77.56, "step": 73545, "token_acc": 0.4773413897280967, "train_speed(iter/s)": 1.437312 }, { "epoch": 3.151107493252217, "grad_norm": 5.260461807250977, "learning_rate": 3.0110678909127855e-05, "loss": 2.4235010147094727, "memory(GiB)": 77.56, "step": 73550, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.437287 }, { "epoch": 3.151321708581466, "grad_norm": 6.1962409019470215, "learning_rate": 3.0104504670286926e-05, "loss": 2.5472953796386717, "memory(GiB)": 77.56, "step": 73555, "token_acc": 0.450920245398773, "train_speed(iter/s)": 1.4373 }, { "epoch": 3.151535923910715, "grad_norm": 5.057364463806152, "learning_rate": 3.009833079187156e-05, "loss": 2.2611093521118164, "memory(GiB)": 77.56, "step": 73560, "token_acc": 0.5478547854785478, "train_speed(iter/s)": 1.437307 }, { "epoch": 3.151750139239964, "grad_norm": 6.639054775238037, "learning_rate": 3.009215727399359e-05, "loss": 2.2952062606811525, "memory(GiB)": 77.56, "step": 73565, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.437318 }, { "epoch": 3.151964354569213, "grad_norm": 6.600549221038818, "learning_rate": 3.0085984116764877e-05, "loss": 2.3072265625, "memory(GiB)": 77.56, "step": 73570, "token_acc": 0.5475285171102662, "train_speed(iter/s)": 1.437323 }, { "epoch": 3.152178569898462, "grad_norm": 4.773643493652344, "learning_rate": 3.0079811320297235e-05, "loss": 2.2260398864746094, "memory(GiB)": 77.56, "step": 73575, "token_acc": 0.5387323943661971, "train_speed(iter/s)": 1.437321 }, { "epoch": 3.1523927852277107, "grad_norm": 6.038623332977295, "learning_rate": 3.007363888470249e-05, "loss": 2.4065332412719727, "memory(GiB)": 77.56, "step": 73580, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437339 }, { "epoch": 3.15260700055696, "grad_norm": 8.307723045349121, "learning_rate": 3.0067466810092475e-05, "loss": 2.1981456756591795, "memory(GiB)": 77.56, "step": 73585, "token_acc": 0.5183673469387755, "train_speed(iter/s)": 1.437357 }, { "epoch": 3.152821215886209, "grad_norm": 7.262567520141602, "learning_rate": 3.0061295096579008e-05, "loss": 2.6510921478271485, "memory(GiB)": 77.56, "step": 73590, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.1530354312154576, "grad_norm": 5.44913387298584, "learning_rate": 3.005512374427387e-05, "loss": 2.5405113220214846, "memory(GiB)": 77.56, "step": 73595, "token_acc": 0.4528985507246377, "train_speed(iter/s)": 1.437366 }, { "epoch": 3.153249646544707, "grad_norm": 6.637559413909912, "learning_rate": 3.0048952753288882e-05, "loss": 2.5898208618164062, "memory(GiB)": 77.56, "step": 73600, "token_acc": 0.5059760956175299, "train_speed(iter/s)": 1.437355 }, { "epoch": 3.1534638618739557, "grad_norm": 5.489455699920654, "learning_rate": 3.004278212373584e-05, "loss": 2.2588163375854493, "memory(GiB)": 77.56, "step": 73605, "token_acc": 0.5392491467576792, "train_speed(iter/s)": 1.43736 }, { "epoch": 3.1536780772032045, "grad_norm": 6.767765045166016, "learning_rate": 3.003661185572651e-05, "loss": 2.50377197265625, "memory(GiB)": 77.56, "step": 73610, "token_acc": 0.4487534626038781, "train_speed(iter/s)": 1.437388 }, { "epoch": 3.1538922925324537, "grad_norm": 5.8337883949279785, "learning_rate": 3.003044194937269e-05, "loss": 2.4526750564575197, "memory(GiB)": 77.56, "step": 73615, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.1541065078617025, "grad_norm": 6.908928394317627, "learning_rate": 3.0024272404786146e-05, "loss": 2.196597862243652, "memory(GiB)": 77.56, "step": 73620, "token_acc": 0.5176470588235295, "train_speed(iter/s)": 1.437392 }, { "epoch": 3.1543207231909514, "grad_norm": 6.508188724517822, "learning_rate": 3.0018103222078643e-05, "loss": 2.594831848144531, "memory(GiB)": 77.56, "step": 73625, "token_acc": 0.4793388429752066, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.1545349385202006, "grad_norm": 5.630304336547852, "learning_rate": 3.001193440136194e-05, "loss": 2.3586090087890623, "memory(GiB)": 77.56, "step": 73630, "token_acc": 0.5272206303724928, "train_speed(iter/s)": 1.437349 }, { "epoch": 3.1547491538494494, "grad_norm": 6.659980297088623, "learning_rate": 3.00057659427478e-05, "loss": 2.2676794052124025, "memory(GiB)": 77.56, "step": 73635, "token_acc": 0.5325670498084292, "train_speed(iter/s)": 1.437357 }, { "epoch": 3.1549633691786982, "grad_norm": 7.744792938232422, "learning_rate": 2.999959784634797e-05, "loss": 2.2578895568847654, "memory(GiB)": 77.56, "step": 73640, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.437374 }, { "epoch": 3.1551775845079475, "grad_norm": 4.810945987701416, "learning_rate": 2.999343011227419e-05, "loss": 2.0620615005493166, "memory(GiB)": 77.56, "step": 73645, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437395 }, { "epoch": 3.1553917998371963, "grad_norm": 5.521345138549805, "learning_rate": 2.9987262740638178e-05, "loss": 2.5106502532958985, "memory(GiB)": 77.56, "step": 73650, "token_acc": 0.47564469914040114, "train_speed(iter/s)": 1.437399 }, { "epoch": 3.155606015166445, "grad_norm": 7.049400329589844, "learning_rate": 2.9981095731551666e-05, "loss": 2.377778434753418, "memory(GiB)": 77.56, "step": 73655, "token_acc": 0.5100401606425703, "train_speed(iter/s)": 1.437432 }, { "epoch": 3.1558202304956944, "grad_norm": 5.648694038391113, "learning_rate": 2.99749290851264e-05, "loss": 2.1959930419921876, "memory(GiB)": 77.56, "step": 73660, "token_acc": 0.5539033457249071, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.156034445824943, "grad_norm": 7.972008228302002, "learning_rate": 2.9968762801474087e-05, "loss": 2.2821102142333984, "memory(GiB)": 77.56, "step": 73665, "token_acc": 0.506578947368421, "train_speed(iter/s)": 1.437455 }, { "epoch": 3.156248661154192, "grad_norm": 5.772468090057373, "learning_rate": 2.9962596880706413e-05, "loss": 2.42156925201416, "memory(GiB)": 77.56, "step": 73670, "token_acc": 0.4684014869888476, "train_speed(iter/s)": 1.43748 }, { "epoch": 3.1564628764834413, "grad_norm": 11.613487243652344, "learning_rate": 2.9956431322935104e-05, "loss": 2.612662124633789, "memory(GiB)": 77.56, "step": 73675, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.437487 }, { "epoch": 3.15667709181269, "grad_norm": 5.495362758636475, "learning_rate": 2.995026612827183e-05, "loss": 2.4032793045043945, "memory(GiB)": 77.56, "step": 73680, "token_acc": 0.501628664495114, "train_speed(iter/s)": 1.437447 }, { "epoch": 3.156891307141939, "grad_norm": 5.41244649887085, "learning_rate": 2.994410129682831e-05, "loss": 2.6234617233276367, "memory(GiB)": 77.56, "step": 73685, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.157105522471188, "grad_norm": 5.586054801940918, "learning_rate": 2.99379368287162e-05, "loss": 2.5046566009521483, "memory(GiB)": 77.56, "step": 73690, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.437462 }, { "epoch": 3.157319737800437, "grad_norm": 6.418153762817383, "learning_rate": 2.993177272404718e-05, "loss": 2.412129020690918, "memory(GiB)": 77.56, "step": 73695, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.437468 }, { "epoch": 3.1575339531296858, "grad_norm": 6.793228626251221, "learning_rate": 2.9925608982932928e-05, "loss": 2.508030891418457, "memory(GiB)": 77.56, "step": 73700, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.437474 }, { "epoch": 3.157748168458935, "grad_norm": 6.082930564880371, "learning_rate": 2.9919445605485107e-05, "loss": 2.453681755065918, "memory(GiB)": 77.56, "step": 73705, "token_acc": 0.45288753799392095, "train_speed(iter/s)": 1.437499 }, { "epoch": 3.157962383788184, "grad_norm": 5.249345779418945, "learning_rate": 2.9913282591815352e-05, "loss": 2.444533920288086, "memory(GiB)": 77.56, "step": 73710, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.437506 }, { "epoch": 3.1581765991174326, "grad_norm": 5.690240859985352, "learning_rate": 2.9907119942035338e-05, "loss": 2.0600845336914064, "memory(GiB)": 77.56, "step": 73715, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 1.437508 }, { "epoch": 3.158390814446682, "grad_norm": 5.2061381340026855, "learning_rate": 2.9900957656256685e-05, "loss": 2.6037893295288086, "memory(GiB)": 77.56, "step": 73720, "token_acc": 0.46218487394957986, "train_speed(iter/s)": 1.437508 }, { "epoch": 3.1586050297759307, "grad_norm": 5.8762054443359375, "learning_rate": 2.989479573459104e-05, "loss": 2.401332473754883, "memory(GiB)": 77.56, "step": 73725, "token_acc": 0.4752851711026616, "train_speed(iter/s)": 1.43752 }, { "epoch": 3.1588192451051795, "grad_norm": 7.79713773727417, "learning_rate": 2.9888634177150022e-05, "loss": 2.3656606674194336, "memory(GiB)": 77.56, "step": 73730, "token_acc": 0.48638132295719844, "train_speed(iter/s)": 1.437551 }, { "epoch": 3.159033460434429, "grad_norm": 5.791051387786865, "learning_rate": 2.9882472984045277e-05, "loss": 2.2346126556396486, "memory(GiB)": 77.56, "step": 73735, "token_acc": 0.5340909090909091, "train_speed(iter/s)": 1.437568 }, { "epoch": 3.1592476757636776, "grad_norm": 5.915251731872559, "learning_rate": 2.9876312155388398e-05, "loss": 2.4448097229003904, "memory(GiB)": 77.56, "step": 73740, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.437553 }, { "epoch": 3.1594618910929264, "grad_norm": 5.226356506347656, "learning_rate": 2.987015169129101e-05, "loss": 2.174884033203125, "memory(GiB)": 77.56, "step": 73745, "token_acc": 0.5226586102719033, "train_speed(iter/s)": 1.437563 }, { "epoch": 3.1596761064221757, "grad_norm": 6.0764241218566895, "learning_rate": 2.9863991591864705e-05, "loss": 2.2783248901367186, "memory(GiB)": 77.56, "step": 73750, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.437567 }, { "epoch": 3.1598903217514245, "grad_norm": 6.55917501449585, "learning_rate": 2.9857831857221075e-05, "loss": 2.367101860046387, "memory(GiB)": 77.56, "step": 73755, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.437561 }, { "epoch": 3.1601045370806733, "grad_norm": 5.933302879333496, "learning_rate": 2.9851672487471728e-05, "loss": 2.1851858139038085, "memory(GiB)": 77.56, "step": 73760, "token_acc": 0.5032894736842105, "train_speed(iter/s)": 1.437581 }, { "epoch": 3.1603187524099225, "grad_norm": 4.586269378662109, "learning_rate": 2.9845513482728217e-05, "loss": 2.1989646911621095, "memory(GiB)": 77.56, "step": 73765, "token_acc": 0.5399361022364217, "train_speed(iter/s)": 1.437594 }, { "epoch": 3.1605329677391714, "grad_norm": 5.770150661468506, "learning_rate": 2.983935484310215e-05, "loss": 1.866630744934082, "memory(GiB)": 77.56, "step": 73770, "token_acc": 0.6, "train_speed(iter/s)": 1.437604 }, { "epoch": 3.16074718306842, "grad_norm": 6.130843639373779, "learning_rate": 2.983319656870508e-05, "loss": 2.2004411697387694, "memory(GiB)": 77.56, "step": 73775, "token_acc": 0.5251798561151079, "train_speed(iter/s)": 1.437605 }, { "epoch": 3.1609613983976694, "grad_norm": 5.802975177764893, "learning_rate": 2.9827038659648566e-05, "loss": 2.0468700408935545, "memory(GiB)": 77.56, "step": 73780, "token_acc": 0.552, "train_speed(iter/s)": 1.437619 }, { "epoch": 3.1611756137269182, "grad_norm": 4.6417670249938965, "learning_rate": 2.982088111604418e-05, "loss": 2.3604427337646485, "memory(GiB)": 77.56, "step": 73785, "token_acc": 0.5458015267175572, "train_speed(iter/s)": 1.437615 }, { "epoch": 3.161389829056167, "grad_norm": 6.024120807647705, "learning_rate": 2.9814723938003463e-05, "loss": 2.198708152770996, "memory(GiB)": 77.56, "step": 73790, "token_acc": 0.5150375939849624, "train_speed(iter/s)": 1.437627 }, { "epoch": 3.1616040443854163, "grad_norm": 5.336095809936523, "learning_rate": 2.980856712563794e-05, "loss": 2.5639265060424803, "memory(GiB)": 77.56, "step": 73795, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.437648 }, { "epoch": 3.161818259714665, "grad_norm": 4.918575763702393, "learning_rate": 2.9802410679059163e-05, "loss": 2.261923408508301, "memory(GiB)": 77.56, "step": 73800, "token_acc": 0.5091463414634146, "train_speed(iter/s)": 1.437639 }, { "epoch": 3.162032475043914, "grad_norm": 5.868687152862549, "learning_rate": 2.9796254598378683e-05, "loss": 2.3623683929443358, "memory(GiB)": 77.56, "step": 73805, "token_acc": 0.4860335195530726, "train_speed(iter/s)": 1.437647 }, { "epoch": 3.162246690373163, "grad_norm": 5.617335319519043, "learning_rate": 2.9790098883707996e-05, "loss": 2.1367895126342775, "memory(GiB)": 77.56, "step": 73810, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.43764 }, { "epoch": 3.162460905702412, "grad_norm": 6.260705947875977, "learning_rate": 2.978394353515862e-05, "loss": 2.2816699981689452, "memory(GiB)": 77.56, "step": 73815, "token_acc": 0.5, "train_speed(iter/s)": 1.437666 }, { "epoch": 3.162675121031661, "grad_norm": 4.895193576812744, "learning_rate": 2.977778855284208e-05, "loss": 2.186238098144531, "memory(GiB)": 77.56, "step": 73820, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.437685 }, { "epoch": 3.16288933636091, "grad_norm": 4.447147369384766, "learning_rate": 2.9771633936869863e-05, "loss": 2.0569997787475587, "memory(GiB)": 77.56, "step": 73825, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.437681 }, { "epoch": 3.163103551690159, "grad_norm": 6.592648506164551, "learning_rate": 2.976547968735348e-05, "loss": 2.391103744506836, "memory(GiB)": 77.56, "step": 73830, "token_acc": 0.5099337748344371, "train_speed(iter/s)": 1.437685 }, { "epoch": 3.163317767019408, "grad_norm": 5.578714847564697, "learning_rate": 2.9759325804404418e-05, "loss": 2.606415939331055, "memory(GiB)": 77.56, "step": 73835, "token_acc": 0.4797507788161994, "train_speed(iter/s)": 1.437675 }, { "epoch": 3.163531982348657, "grad_norm": 4.855007171630859, "learning_rate": 2.9753172288134146e-05, "loss": 2.2682567596435548, "memory(GiB)": 77.56, "step": 73840, "token_acc": 0.5544554455445545, "train_speed(iter/s)": 1.437685 }, { "epoch": 3.1637461976779058, "grad_norm": 6.128581523895264, "learning_rate": 2.9747019138654157e-05, "loss": 2.4982378005981447, "memory(GiB)": 77.56, "step": 73845, "token_acc": 0.47468354430379744, "train_speed(iter/s)": 1.437699 }, { "epoch": 3.163960413007155, "grad_norm": 5.198617458343506, "learning_rate": 2.974086635607592e-05, "loss": 2.2245336532592774, "memory(GiB)": 77.56, "step": 73850, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.437713 }, { "epoch": 3.164174628336404, "grad_norm": 6.806390285491943, "learning_rate": 2.9734713940510884e-05, "loss": 2.308121109008789, "memory(GiB)": 77.56, "step": 73855, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.437731 }, { "epoch": 3.1643888436656527, "grad_norm": 5.530201435089111, "learning_rate": 2.9728561892070518e-05, "loss": 2.382350730895996, "memory(GiB)": 77.56, "step": 73860, "token_acc": 0.5203761755485894, "train_speed(iter/s)": 1.437739 }, { "epoch": 3.164603058994902, "grad_norm": 7.2378740310668945, "learning_rate": 2.972241021086627e-05, "loss": 2.425258445739746, "memory(GiB)": 77.56, "step": 73865, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.1648172743241507, "grad_norm": 6.611432075500488, "learning_rate": 2.9716258897009586e-05, "loss": 2.351436424255371, "memory(GiB)": 77.56, "step": 73870, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.1650314896533995, "grad_norm": 5.041432857513428, "learning_rate": 2.9710107950611878e-05, "loss": 2.4247488021850585, "memory(GiB)": 77.56, "step": 73875, "token_acc": 0.521311475409836, "train_speed(iter/s)": 1.437755 }, { "epoch": 3.165245704982649, "grad_norm": 6.774808406829834, "learning_rate": 2.970395737178462e-05, "loss": 2.2830532073974608, "memory(GiB)": 77.56, "step": 73880, "token_acc": 0.5492957746478874, "train_speed(iter/s)": 1.437739 }, { "epoch": 3.1654599203118976, "grad_norm": 6.014718055725098, "learning_rate": 2.96978071606392e-05, "loss": 2.3167903900146483, "memory(GiB)": 77.56, "step": 73885, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.437752 }, { "epoch": 3.1656741356411464, "grad_norm": 5.995301246643066, "learning_rate": 2.9691657317287068e-05, "loss": 2.320265197753906, "memory(GiB)": 77.56, "step": 73890, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.1658883509703957, "grad_norm": 5.244626998901367, "learning_rate": 2.968550784183961e-05, "loss": 2.071191596984863, "memory(GiB)": 77.56, "step": 73895, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.437785 }, { "epoch": 3.1661025662996445, "grad_norm": 6.777777194976807, "learning_rate": 2.967935873440822e-05, "loss": 2.31115837097168, "memory(GiB)": 77.56, "step": 73900, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437785 }, { "epoch": 3.1663167816288933, "grad_norm": 5.80156946182251, "learning_rate": 2.9673209995104322e-05, "loss": 2.234435272216797, "memory(GiB)": 77.56, "step": 73905, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.437786 }, { "epoch": 3.1665309969581426, "grad_norm": 6.019159317016602, "learning_rate": 2.9667061624039282e-05, "loss": 2.342307281494141, "memory(GiB)": 77.56, "step": 73910, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 1.437805 }, { "epoch": 3.1667452122873914, "grad_norm": 5.571842670440674, "learning_rate": 2.9660913621324503e-05, "loss": 2.425028610229492, "memory(GiB)": 77.56, "step": 73915, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.16695942761664, "grad_norm": 5.433260917663574, "learning_rate": 2.965476598707136e-05, "loss": 2.191996383666992, "memory(GiB)": 77.56, "step": 73920, "token_acc": 0.5210355987055016, "train_speed(iter/s)": 1.437822 }, { "epoch": 3.1671736429458894, "grad_norm": 5.574416637420654, "learning_rate": 2.9648618721391197e-05, "loss": 2.1054168701171876, "memory(GiB)": 77.56, "step": 73925, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 1.437833 }, { "epoch": 3.1673878582751382, "grad_norm": 8.710840225219727, "learning_rate": 2.9642471824395413e-05, "loss": 2.3504207611083983, "memory(GiB)": 77.56, "step": 73930, "token_acc": 0.5137931034482759, "train_speed(iter/s)": 1.437837 }, { "epoch": 3.167602073604387, "grad_norm": 5.431893825531006, "learning_rate": 2.963632529619535e-05, "loss": 2.458882713317871, "memory(GiB)": 77.56, "step": 73935, "token_acc": 0.47904191616766467, "train_speed(iter/s)": 1.43782 }, { "epoch": 3.1678162889336363, "grad_norm": 5.714679718017578, "learning_rate": 2.9630179136902346e-05, "loss": 2.395102691650391, "memory(GiB)": 77.56, "step": 73940, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.437808 }, { "epoch": 3.168030504262885, "grad_norm": 5.728879451751709, "learning_rate": 2.9624033346627766e-05, "loss": 1.9084863662719727, "memory(GiB)": 77.56, "step": 73945, "token_acc": 0.5925925925925926, "train_speed(iter/s)": 1.437807 }, { "epoch": 3.168244719592134, "grad_norm": 5.877350330352783, "learning_rate": 2.9617887925482914e-05, "loss": 1.980107307434082, "memory(GiB)": 77.56, "step": 73950, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.437805 }, { "epoch": 3.168458934921383, "grad_norm": 5.895495891571045, "learning_rate": 2.961174287357916e-05, "loss": 2.2974740982055666, "memory(GiB)": 77.56, "step": 73955, "token_acc": 0.5183673469387755, "train_speed(iter/s)": 1.437799 }, { "epoch": 3.168673150250632, "grad_norm": 5.26740026473999, "learning_rate": 2.9605598191027805e-05, "loss": 2.205534553527832, "memory(GiB)": 77.56, "step": 73960, "token_acc": 0.5, "train_speed(iter/s)": 1.437807 }, { "epoch": 3.168887365579881, "grad_norm": 4.823087692260742, "learning_rate": 2.9599453877940175e-05, "loss": 2.4179988861083985, "memory(GiB)": 77.56, "step": 73965, "token_acc": 0.5131086142322098, "train_speed(iter/s)": 1.437782 }, { "epoch": 3.16910158090913, "grad_norm": 6.26754903793335, "learning_rate": 2.9593309934427565e-05, "loss": 2.0993356704711914, "memory(GiB)": 77.56, "step": 73970, "token_acc": 0.5543071161048689, "train_speed(iter/s)": 1.437776 }, { "epoch": 3.169315796238379, "grad_norm": 5.71920108795166, "learning_rate": 2.9587166360601303e-05, "loss": 2.552367401123047, "memory(GiB)": 77.56, "step": 73975, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.43779 }, { "epoch": 3.1695300115676277, "grad_norm": 5.638586521148682, "learning_rate": 2.9581023156572668e-05, "loss": 2.2518590927124023, "memory(GiB)": 77.56, "step": 73980, "token_acc": 0.4948805460750853, "train_speed(iter/s)": 1.437818 }, { "epoch": 3.169744226896877, "grad_norm": 5.522692680358887, "learning_rate": 2.9574880322452947e-05, "loss": 2.0866403579711914, "memory(GiB)": 77.56, "step": 73985, "token_acc": 0.5428571428571428, "train_speed(iter/s)": 1.437838 }, { "epoch": 3.1699584422261258, "grad_norm": 7.261645317077637, "learning_rate": 2.9568737858353436e-05, "loss": 2.2186960220336913, "memory(GiB)": 77.56, "step": 73990, "token_acc": 0.5409836065573771, "train_speed(iter/s)": 1.437828 }, { "epoch": 3.1701726575553746, "grad_norm": 6.001633167266846, "learning_rate": 2.9562595764385405e-05, "loss": 2.3324558258056642, "memory(GiB)": 77.56, "step": 73995, "token_acc": 0.5201612903225806, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.170386872884624, "grad_norm": 4.552384376525879, "learning_rate": 2.9556454040660114e-05, "loss": 2.2031044006347655, "memory(GiB)": 77.56, "step": 74000, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.170386872884624, "eval_loss": 2.236055374145508, "eval_runtime": 14.156, "eval_samples_per_second": 7.064, "eval_steps_per_second": 7.064, "eval_token_acc": 0.4821882951653944, "step": 74000 }, { "epoch": 3.1706010882138727, "grad_norm": 6.383446216583252, "learning_rate": 2.9550312687288846e-05, "loss": 2.299361801147461, "memory(GiB)": 77.56, "step": 74005, "token_acc": 0.489945155393053, "train_speed(iter/s)": 1.437414 }, { "epoch": 3.1708153035431215, "grad_norm": 5.767241954803467, "learning_rate": 2.954417170438284e-05, "loss": 2.245487594604492, "memory(GiB)": 77.56, "step": 74010, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.43742 }, { "epoch": 3.1710295188723707, "grad_norm": 7.623003959655762, "learning_rate": 2.9538031092053354e-05, "loss": 2.436071014404297, "memory(GiB)": 77.56, "step": 74015, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 1.437417 }, { "epoch": 3.1712437342016195, "grad_norm": 5.613229274749756, "learning_rate": 2.9531890850411637e-05, "loss": 2.3619190216064454, "memory(GiB)": 77.56, "step": 74020, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.437411 }, { "epoch": 3.1714579495308683, "grad_norm": 4.912458419799805, "learning_rate": 2.9525750979568895e-05, "loss": 2.6781641006469727, "memory(GiB)": 77.56, "step": 74025, "token_acc": 0.46742209631728043, "train_speed(iter/s)": 1.437408 }, { "epoch": 3.1716721648601176, "grad_norm": 6.433786869049072, "learning_rate": 2.951961147963639e-05, "loss": 2.1593416213989256, "memory(GiB)": 77.56, "step": 74030, "token_acc": 0.5426621160409556, "train_speed(iter/s)": 1.437401 }, { "epoch": 3.1718863801893664, "grad_norm": 5.007352828979492, "learning_rate": 2.9513472350725347e-05, "loss": 2.2726470947265627, "memory(GiB)": 77.56, "step": 74035, "token_acc": 0.5209003215434084, "train_speed(iter/s)": 1.437412 }, { "epoch": 3.1721005955186152, "grad_norm": 9.181182861328125, "learning_rate": 2.9507333592946975e-05, "loss": 2.4367889404296874, "memory(GiB)": 77.56, "step": 74040, "token_acc": 0.502092050209205, "train_speed(iter/s)": 1.437407 }, { "epoch": 3.1723148108478645, "grad_norm": 4.705042839050293, "learning_rate": 2.9501195206412457e-05, "loss": 2.103071403503418, "memory(GiB)": 77.56, "step": 74045, "token_acc": 0.5369774919614148, "train_speed(iter/s)": 1.437417 }, { "epoch": 3.1725290261771133, "grad_norm": 5.426323890686035, "learning_rate": 2.9495057191233033e-05, "loss": 2.3035358428955077, "memory(GiB)": 77.56, "step": 74050, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.437424 }, { "epoch": 3.172743241506362, "grad_norm": 4.460992336273193, "learning_rate": 2.9488919547519878e-05, "loss": 2.312650680541992, "memory(GiB)": 77.56, "step": 74055, "token_acc": 0.4803921568627451, "train_speed(iter/s)": 1.437421 }, { "epoch": 3.1729574568356114, "grad_norm": 6.285937786102295, "learning_rate": 2.948278227538419e-05, "loss": 2.244205856323242, "memory(GiB)": 77.56, "step": 74060, "token_acc": 0.5, "train_speed(iter/s)": 1.437422 }, { "epoch": 3.17317167216486, "grad_norm": 6.112287521362305, "learning_rate": 2.947664537493715e-05, "loss": 2.3975431442260744, "memory(GiB)": 77.56, "step": 74065, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.437428 }, { "epoch": 3.173385887494109, "grad_norm": 5.413914680480957, "learning_rate": 2.9470508846289924e-05, "loss": 2.288863754272461, "memory(GiB)": 77.56, "step": 74070, "token_acc": 0.5, "train_speed(iter/s)": 1.437446 }, { "epoch": 3.1736001028233582, "grad_norm": 4.8067827224731445, "learning_rate": 2.9464372689553692e-05, "loss": 2.1473098754882813, "memory(GiB)": 77.56, "step": 74075, "token_acc": 0.5364963503649635, "train_speed(iter/s)": 1.437458 }, { "epoch": 3.173814318152607, "grad_norm": 6.240786552429199, "learning_rate": 2.945823690483962e-05, "loss": 2.233226776123047, "memory(GiB)": 77.56, "step": 74080, "token_acc": 0.5, "train_speed(iter/s)": 1.437449 }, { "epoch": 3.174028533481856, "grad_norm": 5.830225467681885, "learning_rate": 2.9452101492258843e-05, "loss": 2.438872528076172, "memory(GiB)": 77.56, "step": 74085, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.437453 }, { "epoch": 3.174242748811105, "grad_norm": 7.278238296508789, "learning_rate": 2.9445966451922536e-05, "loss": 2.0950408935546876, "memory(GiB)": 77.56, "step": 74090, "token_acc": 0.5433070866141733, "train_speed(iter/s)": 1.437439 }, { "epoch": 3.174456964140354, "grad_norm": 4.190337181091309, "learning_rate": 2.9439831783941807e-05, "loss": 2.219211959838867, "memory(GiB)": 77.56, "step": 74095, "token_acc": 0.5083056478405316, "train_speed(iter/s)": 1.437455 }, { "epoch": 3.1746711794696028, "grad_norm": 12.855223655700684, "learning_rate": 2.9433697488427825e-05, "loss": 2.272871208190918, "memory(GiB)": 77.56, "step": 74100, "token_acc": 0.5058365758754864, "train_speed(iter/s)": 1.437468 }, { "epoch": 3.174885394798852, "grad_norm": 7.407022953033447, "learning_rate": 2.94275635654917e-05, "loss": 2.4018808364868165, "memory(GiB)": 77.56, "step": 74105, "token_acc": 0.4628975265017668, "train_speed(iter/s)": 1.437466 }, { "epoch": 3.175099610128101, "grad_norm": 4.247338771820068, "learning_rate": 2.9421430015244565e-05, "loss": 2.265575981140137, "memory(GiB)": 77.56, "step": 74110, "token_acc": 0.5365853658536586, "train_speed(iter/s)": 1.437477 }, { "epoch": 3.1753138254573496, "grad_norm": 5.550304412841797, "learning_rate": 2.941529683779753e-05, "loss": 1.9959667205810547, "memory(GiB)": 77.56, "step": 74115, "token_acc": 0.5573770491803278, "train_speed(iter/s)": 1.437476 }, { "epoch": 3.175528040786599, "grad_norm": 5.715430736541748, "learning_rate": 2.9409164033261704e-05, "loss": 2.450954627990723, "memory(GiB)": 77.56, "step": 74120, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.437475 }, { "epoch": 3.1757422561158477, "grad_norm": 6.707547664642334, "learning_rate": 2.9403031601748192e-05, "loss": 2.356776809692383, "memory(GiB)": 77.56, "step": 74125, "token_acc": 0.512280701754386, "train_speed(iter/s)": 1.437488 }, { "epoch": 3.1759564714450965, "grad_norm": 5.754292011260986, "learning_rate": 2.939689954336807e-05, "loss": 2.5592182159423826, "memory(GiB)": 77.56, "step": 74130, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.437471 }, { "epoch": 3.1761706867743458, "grad_norm": 6.746292591094971, "learning_rate": 2.939076785823246e-05, "loss": 2.5319929122924805, "memory(GiB)": 77.56, "step": 74135, "token_acc": 0.4716981132075472, "train_speed(iter/s)": 1.437471 }, { "epoch": 3.1763849021035946, "grad_norm": 5.235986232757568, "learning_rate": 2.9384636546452415e-05, "loss": 2.2475265502929687, "memory(GiB)": 77.56, "step": 74140, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.437453 }, { "epoch": 3.1765991174328434, "grad_norm": 6.648682594299316, "learning_rate": 2.9378505608139007e-05, "loss": 2.495352554321289, "memory(GiB)": 77.56, "step": 74145, "token_acc": 0.4664310954063604, "train_speed(iter/s)": 1.437474 }, { "epoch": 3.1768133327620927, "grad_norm": 4.644205570220947, "learning_rate": 2.937237504340333e-05, "loss": 2.2440237045288085, "memory(GiB)": 77.56, "step": 74150, "token_acc": 0.5186335403726708, "train_speed(iter/s)": 1.437461 }, { "epoch": 3.1770275480913415, "grad_norm": 6.298264503479004, "learning_rate": 2.9366244852356418e-05, "loss": 2.146969223022461, "memory(GiB)": 77.56, "step": 74155, "token_acc": 0.5398550724637681, "train_speed(iter/s)": 1.437442 }, { "epoch": 3.1772417634205903, "grad_norm": 6.155228137969971, "learning_rate": 2.9360115035109337e-05, "loss": 2.1114934921264648, "memory(GiB)": 77.56, "step": 74160, "token_acc": 0.5394736842105263, "train_speed(iter/s)": 1.437476 }, { "epoch": 3.1774559787498395, "grad_norm": 6.646638870239258, "learning_rate": 2.9353985591773148e-05, "loss": 2.24499397277832, "memory(GiB)": 77.56, "step": 74165, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437472 }, { "epoch": 3.1776701940790884, "grad_norm": 5.539613246917725, "learning_rate": 2.934785652245885e-05, "loss": 2.3395923614501952, "memory(GiB)": 77.56, "step": 74170, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 1.437455 }, { "epoch": 3.177884409408337, "grad_norm": 5.576258182525635, "learning_rate": 2.934172782727751e-05, "loss": 2.0838991165161134, "memory(GiB)": 77.56, "step": 74175, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 1.437471 }, { "epoch": 3.1780986247375864, "grad_norm": 5.956866264343262, "learning_rate": 2.9335599506340166e-05, "loss": 2.552425193786621, "memory(GiB)": 77.56, "step": 74180, "token_acc": 0.47246376811594204, "train_speed(iter/s)": 1.437462 }, { "epoch": 3.1783128400668352, "grad_norm": 5.738117218017578, "learning_rate": 2.9329471559757822e-05, "loss": 2.1716545104980467, "memory(GiB)": 77.56, "step": 74185, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.437465 }, { "epoch": 3.178527055396084, "grad_norm": 5.96266508102417, "learning_rate": 2.932334398764148e-05, "loss": 2.497850227355957, "memory(GiB)": 77.56, "step": 74190, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.437471 }, { "epoch": 3.1787412707253333, "grad_norm": 5.279277801513672, "learning_rate": 2.9317216790102164e-05, "loss": 2.1521081924438477, "memory(GiB)": 77.56, "step": 74195, "token_acc": 0.525691699604743, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.178955486054582, "grad_norm": 7.685451507568359, "learning_rate": 2.931108996725086e-05, "loss": 2.648274040222168, "memory(GiB)": 77.56, "step": 74200, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.437492 }, { "epoch": 3.179169701383831, "grad_norm": 5.598180770874023, "learning_rate": 2.930496351919858e-05, "loss": 2.4224830627441407, "memory(GiB)": 77.56, "step": 74205, "token_acc": 0.49096385542168675, "train_speed(iter/s)": 1.437489 }, { "epoch": 3.17938391671308, "grad_norm": 6.338024139404297, "learning_rate": 2.9298837446056304e-05, "loss": 2.467112350463867, "memory(GiB)": 77.56, "step": 74210, "token_acc": 0.48787878787878786, "train_speed(iter/s)": 1.437505 }, { "epoch": 3.179598132042329, "grad_norm": 8.39466667175293, "learning_rate": 2.9292711747934993e-05, "loss": 2.4090782165527345, "memory(GiB)": 77.56, "step": 74215, "token_acc": 0.49407114624505927, "train_speed(iter/s)": 1.437514 }, { "epoch": 3.179812347371578, "grad_norm": 4.34513521194458, "learning_rate": 2.928658642494564e-05, "loss": 2.5388145446777344, "memory(GiB)": 77.56, "step": 74220, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.437517 }, { "epoch": 3.180026562700827, "grad_norm": 6.119314193725586, "learning_rate": 2.9280461477199205e-05, "loss": 2.5132280349731446, "memory(GiB)": 77.56, "step": 74225, "token_acc": 0.5328947368421053, "train_speed(iter/s)": 1.437528 }, { "epoch": 3.180240778030076, "grad_norm": 5.85638952255249, "learning_rate": 2.9274336904806642e-05, "loss": 2.2614885330200196, "memory(GiB)": 77.56, "step": 74230, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.43755 }, { "epoch": 3.1804549933593247, "grad_norm": 4.860142230987549, "learning_rate": 2.9268212707878917e-05, "loss": 2.096175765991211, "memory(GiB)": 77.56, "step": 74235, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.437545 }, { "epoch": 3.180669208688574, "grad_norm": 5.776596546173096, "learning_rate": 2.9262088886526962e-05, "loss": 2.897212028503418, "memory(GiB)": 77.56, "step": 74240, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.437553 }, { "epoch": 3.1808834240178228, "grad_norm": 5.11076545715332, "learning_rate": 2.925596544086171e-05, "loss": 2.604241943359375, "memory(GiB)": 77.56, "step": 74245, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.437526 }, { "epoch": 3.1810976393470716, "grad_norm": 7.483839988708496, "learning_rate": 2.9249842370994106e-05, "loss": 2.344086456298828, "memory(GiB)": 77.56, "step": 74250, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.4375 }, { "epoch": 3.181311854676321, "grad_norm": 6.672262668609619, "learning_rate": 2.9243719677035086e-05, "loss": 2.4811267852783203, "memory(GiB)": 77.56, "step": 74255, "token_acc": 0.46236559139784944, "train_speed(iter/s)": 1.437503 }, { "epoch": 3.1815260700055696, "grad_norm": 5.62570333480835, "learning_rate": 2.9237597359095547e-05, "loss": 2.26552791595459, "memory(GiB)": 77.56, "step": 74260, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.437534 }, { "epoch": 3.1817402853348185, "grad_norm": 7.12554407119751, "learning_rate": 2.9231475417286412e-05, "loss": 2.08533935546875, "memory(GiB)": 77.56, "step": 74265, "token_acc": 0.5353535353535354, "train_speed(iter/s)": 1.437563 }, { "epoch": 3.1819545006640677, "grad_norm": 6.84037446975708, "learning_rate": 2.9225353851718586e-05, "loss": 2.1793569564819335, "memory(GiB)": 77.56, "step": 74270, "token_acc": 0.5261044176706827, "train_speed(iter/s)": 1.437553 }, { "epoch": 3.1821687159933165, "grad_norm": 5.394654273986816, "learning_rate": 2.9219232662502948e-05, "loss": 2.3360498428344725, "memory(GiB)": 77.56, "step": 74275, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.437542 }, { "epoch": 3.1823829313225653, "grad_norm": 5.653782844543457, "learning_rate": 2.9213111849750425e-05, "loss": 2.270713043212891, "memory(GiB)": 77.56, "step": 74280, "token_acc": 0.5137931034482759, "train_speed(iter/s)": 1.437553 }, { "epoch": 3.1825971466518146, "grad_norm": 5.598612308502197, "learning_rate": 2.9206991413571872e-05, "loss": 2.1324993133544923, "memory(GiB)": 77.56, "step": 74285, "token_acc": 0.5211726384364821, "train_speed(iter/s)": 1.437555 }, { "epoch": 3.1828113619810634, "grad_norm": 4.982089042663574, "learning_rate": 2.9200871354078167e-05, "loss": 2.3357444763183595, "memory(GiB)": 77.56, "step": 74290, "token_acc": 0.5101449275362319, "train_speed(iter/s)": 1.437562 }, { "epoch": 3.183025577310312, "grad_norm": 5.30708646774292, "learning_rate": 2.91947516713802e-05, "loss": 2.3095245361328125, "memory(GiB)": 77.56, "step": 74295, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.437574 }, { "epoch": 3.1832397926395615, "grad_norm": 5.575718402862549, "learning_rate": 2.918863236558882e-05, "loss": 2.2222293853759765, "memory(GiB)": 77.56, "step": 74300, "token_acc": 0.5015479876160991, "train_speed(iter/s)": 1.437583 }, { "epoch": 3.1834540079688103, "grad_norm": 5.541013717651367, "learning_rate": 2.9182513436814897e-05, "loss": 2.4817035675048826, "memory(GiB)": 77.56, "step": 74305, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.437587 }, { "epoch": 3.183668223298059, "grad_norm": 6.624006748199463, "learning_rate": 2.9176394885169238e-05, "loss": 2.6079805374145506, "memory(GiB)": 77.56, "step": 74310, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.4376 }, { "epoch": 3.1838824386273084, "grad_norm": 6.421791076660156, "learning_rate": 2.9170276710762746e-05, "loss": 2.229039192199707, "memory(GiB)": 77.56, "step": 74315, "token_acc": 0.5278688524590164, "train_speed(iter/s)": 1.437574 }, { "epoch": 3.184096653956557, "grad_norm": 5.171174049377441, "learning_rate": 2.9164158913706207e-05, "loss": 2.14434814453125, "memory(GiB)": 77.56, "step": 74320, "token_acc": 0.535031847133758, "train_speed(iter/s)": 1.437556 }, { "epoch": 3.184310869285806, "grad_norm": 7.4009599685668945, "learning_rate": 2.91580414941105e-05, "loss": 2.45861759185791, "memory(GiB)": 77.56, "step": 74325, "token_acc": 0.44964028776978415, "train_speed(iter/s)": 1.437557 }, { "epoch": 3.1845250846150552, "grad_norm": 5.991366386413574, "learning_rate": 2.9151924452086428e-05, "loss": 2.428775596618652, "memory(GiB)": 77.56, "step": 74330, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.437555 }, { "epoch": 3.184739299944304, "grad_norm": 6.654433727264404, "learning_rate": 2.9145807787744795e-05, "loss": 2.4450361251831056, "memory(GiB)": 77.56, "step": 74335, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.437552 }, { "epoch": 3.184953515273553, "grad_norm": 8.410322189331055, "learning_rate": 2.91396915011964e-05, "loss": 2.315107727050781, "memory(GiB)": 77.56, "step": 74340, "token_acc": 0.5346534653465347, "train_speed(iter/s)": 1.437554 }, { "epoch": 3.185167730602802, "grad_norm": 6.487464427947998, "learning_rate": 2.9133575592552076e-05, "loss": 2.2820533752441405, "memory(GiB)": 77.56, "step": 74345, "token_acc": 0.4954128440366973, "train_speed(iter/s)": 1.437563 }, { "epoch": 3.185381945932051, "grad_norm": 6.504099369049072, "learning_rate": 2.9127460061922607e-05, "loss": 2.475813293457031, "memory(GiB)": 77.56, "step": 74350, "token_acc": 0.4884488448844885, "train_speed(iter/s)": 1.437562 }, { "epoch": 3.1855961612612997, "grad_norm": 5.420844078063965, "learning_rate": 2.912134490941878e-05, "loss": 2.141633415222168, "memory(GiB)": 77.56, "step": 74355, "token_acc": 0.4828897338403042, "train_speed(iter/s)": 1.437569 }, { "epoch": 3.185810376590549, "grad_norm": 6.719005107879639, "learning_rate": 2.911523013515137e-05, "loss": 2.196058654785156, "memory(GiB)": 77.56, "step": 74360, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.437582 }, { "epoch": 3.186024591919798, "grad_norm": 5.522883892059326, "learning_rate": 2.9109115739231145e-05, "loss": 2.181784820556641, "memory(GiB)": 77.56, "step": 74365, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.437595 }, { "epoch": 3.1862388072490466, "grad_norm": 8.079535484313965, "learning_rate": 2.910300172176891e-05, "loss": 2.4879117965698243, "memory(GiB)": 77.56, "step": 74370, "token_acc": 0.5, "train_speed(iter/s)": 1.43762 }, { "epoch": 3.186453022578296, "grad_norm": 8.391444206237793, "learning_rate": 2.909688808287539e-05, "loss": 2.479786682128906, "memory(GiB)": 77.56, "step": 74375, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.437622 }, { "epoch": 3.1866672379075447, "grad_norm": 6.152960777282715, "learning_rate": 2.909077482266136e-05, "loss": 2.4624088287353514, "memory(GiB)": 77.56, "step": 74380, "token_acc": 0.5, "train_speed(iter/s)": 1.437608 }, { "epoch": 3.1868814532367935, "grad_norm": 6.37076997756958, "learning_rate": 2.9084661941237552e-05, "loss": 2.2588523864746093, "memory(GiB)": 77.56, "step": 74385, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.43762 }, { "epoch": 3.1870956685660428, "grad_norm": 5.731784820556641, "learning_rate": 2.9078549438714715e-05, "loss": 2.25836124420166, "memory(GiB)": 77.56, "step": 74390, "token_acc": 0.547945205479452, "train_speed(iter/s)": 1.437624 }, { "epoch": 3.1873098838952916, "grad_norm": 6.615798473358154, "learning_rate": 2.907243731520356e-05, "loss": 2.417607879638672, "memory(GiB)": 77.56, "step": 74395, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.437637 }, { "epoch": 3.1875240992245404, "grad_norm": 5.539975643157959, "learning_rate": 2.9066325570814834e-05, "loss": 2.1889867782592773, "memory(GiB)": 77.56, "step": 74400, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.437654 }, { "epoch": 3.1877383145537896, "grad_norm": 5.761571407318115, "learning_rate": 2.9060214205659286e-05, "loss": 2.2632556915283204, "memory(GiB)": 77.56, "step": 74405, "token_acc": 0.47962382445141066, "train_speed(iter/s)": 1.437666 }, { "epoch": 3.1879525298830385, "grad_norm": 5.465545177459717, "learning_rate": 2.9054103219847596e-05, "loss": 2.219378662109375, "memory(GiB)": 77.56, "step": 74410, "token_acc": 0.5306859205776173, "train_speed(iter/s)": 1.437659 }, { "epoch": 3.1881667452122873, "grad_norm": 6.672488212585449, "learning_rate": 2.904799261349047e-05, "loss": 2.3821691513061523, "memory(GiB)": 77.56, "step": 74415, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437667 }, { "epoch": 3.1883809605415365, "grad_norm": 4.450000762939453, "learning_rate": 2.9041882386698616e-05, "loss": 2.389927101135254, "memory(GiB)": 77.56, "step": 74420, "token_acc": 0.4602739726027397, "train_speed(iter/s)": 1.437662 }, { "epoch": 3.1885951758707853, "grad_norm": 5.67801570892334, "learning_rate": 2.9035772539582706e-05, "loss": 2.2785919189453123, "memory(GiB)": 77.56, "step": 74425, "token_acc": 0.5124555160142349, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.188809391200034, "grad_norm": 7.345423698425293, "learning_rate": 2.9029663072253455e-05, "loss": 2.541117477416992, "memory(GiB)": 77.56, "step": 74430, "token_acc": 0.450354609929078, "train_speed(iter/s)": 1.437686 }, { "epoch": 3.1890236065292834, "grad_norm": 8.45646858215332, "learning_rate": 2.9023553984821537e-05, "loss": 2.4620317459106444, "memory(GiB)": 77.56, "step": 74435, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.437684 }, { "epoch": 3.189237821858532, "grad_norm": 6.675907135009766, "learning_rate": 2.9017445277397616e-05, "loss": 2.440706253051758, "memory(GiB)": 77.56, "step": 74440, "token_acc": 0.49387755102040815, "train_speed(iter/s)": 1.437688 }, { "epoch": 3.189452037187781, "grad_norm": 6.023128032684326, "learning_rate": 2.901133695009235e-05, "loss": 2.2662500381469726, "memory(GiB)": 77.56, "step": 74445, "token_acc": 0.5080906148867314, "train_speed(iter/s)": 1.437674 }, { "epoch": 3.1896662525170303, "grad_norm": 7.9766082763671875, "learning_rate": 2.900522900301641e-05, "loss": 2.46661376953125, "memory(GiB)": 77.56, "step": 74450, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.437662 }, { "epoch": 3.189880467846279, "grad_norm": 6.445828914642334, "learning_rate": 2.899912143628042e-05, "loss": 2.1338356018066404, "memory(GiB)": 77.56, "step": 74455, "token_acc": 0.5261627906976745, "train_speed(iter/s)": 1.43765 }, { "epoch": 3.190094683175528, "grad_norm": 6.048820972442627, "learning_rate": 2.8993014249995066e-05, "loss": 2.4252927780151365, "memory(GiB)": 77.56, "step": 74460, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.437643 }, { "epoch": 3.190308898504777, "grad_norm": 6.362627029418945, "learning_rate": 2.898690744427096e-05, "loss": 2.4332820892333986, "memory(GiB)": 77.56, "step": 74465, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437639 }, { "epoch": 3.190523113834026, "grad_norm": 8.85700798034668, "learning_rate": 2.8980801019218718e-05, "loss": 2.404157257080078, "memory(GiB)": 77.56, "step": 74470, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.437641 }, { "epoch": 3.190737329163275, "grad_norm": 8.288870811462402, "learning_rate": 2.8974694974949013e-05, "loss": 2.268733024597168, "memory(GiB)": 77.56, "step": 74475, "token_acc": 0.49508196721311476, "train_speed(iter/s)": 1.437644 }, { "epoch": 3.190951544492524, "grad_norm": 5.976711750030518, "learning_rate": 2.8968589311572426e-05, "loss": 2.374024772644043, "memory(GiB)": 77.56, "step": 74480, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.437647 }, { "epoch": 3.191165759821773, "grad_norm": 9.467687606811523, "learning_rate": 2.896248402919955e-05, "loss": 2.2588104248046874, "memory(GiB)": 77.56, "step": 74485, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 1.43764 }, { "epoch": 3.1913799751510217, "grad_norm": 8.094931602478027, "learning_rate": 2.895637912794103e-05, "loss": 2.4281118392944334, "memory(GiB)": 77.56, "step": 74490, "token_acc": 0.5085910652920962, "train_speed(iter/s)": 1.437643 }, { "epoch": 3.191594190480271, "grad_norm": 5.958942413330078, "learning_rate": 2.8950274607907447e-05, "loss": 2.1914178848266603, "memory(GiB)": 77.56, "step": 74495, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.437631 }, { "epoch": 3.1918084058095197, "grad_norm": 6.1112871170043945, "learning_rate": 2.8944170469209386e-05, "loss": 2.1578176498413084, "memory(GiB)": 77.56, "step": 74500, "token_acc": 0.5242718446601942, "train_speed(iter/s)": 1.437623 }, { "epoch": 3.1918084058095197, "eval_loss": 2.2146124839782715, "eval_runtime": 14.7283, "eval_samples_per_second": 6.79, "eval_steps_per_second": 6.79, "eval_token_acc": 0.501432664756447, "step": 74500 }, { "epoch": 3.1920226211387686, "grad_norm": 6.2797136306762695, "learning_rate": 2.8938066711957423e-05, "loss": 2.2432228088378907, "memory(GiB)": 77.56, "step": 74505, "token_acc": 0.507085020242915, "train_speed(iter/s)": 1.437202 }, { "epoch": 3.192236836468018, "grad_norm": 5.658566951751709, "learning_rate": 2.8931963336262125e-05, "loss": 2.5845794677734375, "memory(GiB)": 77.56, "step": 74510, "token_acc": 0.4759036144578313, "train_speed(iter/s)": 1.437208 }, { "epoch": 3.1924510517972666, "grad_norm": 6.106662750244141, "learning_rate": 2.8925860342234092e-05, "loss": 2.4850736618041993, "memory(GiB)": 77.56, "step": 74515, "token_acc": 0.46394984326018807, "train_speed(iter/s)": 1.437211 }, { "epoch": 3.1926652671265154, "grad_norm": 5.278083801269531, "learning_rate": 2.8919757729983866e-05, "loss": 2.3318244934082033, "memory(GiB)": 77.56, "step": 74520, "token_acc": 0.5059171597633136, "train_speed(iter/s)": 1.437203 }, { "epoch": 3.1928794824557647, "grad_norm": 7.298023700714111, "learning_rate": 2.8913655499622012e-05, "loss": 2.3609592437744142, "memory(GiB)": 77.56, "step": 74525, "token_acc": 0.5150501672240803, "train_speed(iter/s)": 1.437179 }, { "epoch": 3.1930936977850135, "grad_norm": 6.219154357910156, "learning_rate": 2.890755365125906e-05, "loss": 2.2634323120117186, "memory(GiB)": 77.56, "step": 74530, "token_acc": 0.5291970802919708, "train_speed(iter/s)": 1.437175 }, { "epoch": 3.1933079131142623, "grad_norm": 8.251895904541016, "learning_rate": 2.890145218500556e-05, "loss": 2.311538314819336, "memory(GiB)": 77.56, "step": 74535, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.437185 }, { "epoch": 3.1935221284435116, "grad_norm": 5.973064422607422, "learning_rate": 2.8895351100972033e-05, "loss": 2.408467483520508, "memory(GiB)": 77.56, "step": 74540, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 1.437165 }, { "epoch": 3.1937363437727604, "grad_norm": 7.326908111572266, "learning_rate": 2.888925039926902e-05, "loss": 2.0957464218139648, "memory(GiB)": 77.56, "step": 74545, "token_acc": 0.5408560311284046, "train_speed(iter/s)": 1.437171 }, { "epoch": 3.193950559102009, "grad_norm": 6.643439769744873, "learning_rate": 2.8883150080007047e-05, "loss": 2.4536102294921873, "memory(GiB)": 77.56, "step": 74550, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.437171 }, { "epoch": 3.1941647744312585, "grad_norm": 4.713240623474121, "learning_rate": 2.887705014329663e-05, "loss": 2.3109451293945313, "memory(GiB)": 77.56, "step": 74555, "token_acc": 0.5114754098360655, "train_speed(iter/s)": 1.437158 }, { "epoch": 3.1943789897605073, "grad_norm": 5.468522548675537, "learning_rate": 2.8870950589248256e-05, "loss": 2.290922927856445, "memory(GiB)": 77.56, "step": 74560, "token_acc": 0.5015015015015015, "train_speed(iter/s)": 1.437141 }, { "epoch": 3.194593205089756, "grad_norm": 5.981069087982178, "learning_rate": 2.8864851417972444e-05, "loss": 2.2255929946899413, "memory(GiB)": 77.56, "step": 74565, "token_acc": 0.5170068027210885, "train_speed(iter/s)": 1.437151 }, { "epoch": 3.1948074204190053, "grad_norm": 6.307750701904297, "learning_rate": 2.8858752629579648e-05, "loss": 2.3497251510620116, "memory(GiB)": 77.56, "step": 74570, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.437166 }, { "epoch": 3.195021635748254, "grad_norm": 5.7458977699279785, "learning_rate": 2.88526542241804e-05, "loss": 2.2701597213745117, "memory(GiB)": 77.56, "step": 74575, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.437177 }, { "epoch": 3.195235851077503, "grad_norm": 5.074203968048096, "learning_rate": 2.8846556201885162e-05, "loss": 2.4747922897338865, "memory(GiB)": 77.56, "step": 74580, "token_acc": 0.46206896551724136, "train_speed(iter/s)": 1.437175 }, { "epoch": 3.195450066406752, "grad_norm": 5.95542049407959, "learning_rate": 2.8840458562804396e-05, "loss": 2.2604307174682616, "memory(GiB)": 77.56, "step": 74585, "token_acc": 0.5485232067510548, "train_speed(iter/s)": 1.437179 }, { "epoch": 3.195664281736001, "grad_norm": 5.192944526672363, "learning_rate": 2.883436130704858e-05, "loss": 2.258067321777344, "memory(GiB)": 77.56, "step": 74590, "token_acc": 0.5179153094462541, "train_speed(iter/s)": 1.43717 }, { "epoch": 3.19587849706525, "grad_norm": 4.612861633300781, "learning_rate": 2.882826443472816e-05, "loss": 2.4915313720703125, "memory(GiB)": 77.56, "step": 74595, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.437139 }, { "epoch": 3.196092712394499, "grad_norm": 5.6387619972229, "learning_rate": 2.882216794595357e-05, "loss": 2.505149269104004, "memory(GiB)": 77.56, "step": 74600, "token_acc": 0.5041551246537396, "train_speed(iter/s)": 1.43714 }, { "epoch": 3.196306927723748, "grad_norm": 5.267059803009033, "learning_rate": 2.881607184083529e-05, "loss": 2.180113983154297, "memory(GiB)": 77.56, "step": 74605, "token_acc": 0.50814332247557, "train_speed(iter/s)": 1.437128 }, { "epoch": 3.1965211430529967, "grad_norm": 4.514616966247559, "learning_rate": 2.880997611948375e-05, "loss": 2.219822120666504, "memory(GiB)": 77.56, "step": 74610, "token_acc": 0.5276752767527675, "train_speed(iter/s)": 1.437136 }, { "epoch": 3.196735358382246, "grad_norm": 5.553776741027832, "learning_rate": 2.8803880782009347e-05, "loss": 2.4151798248291017, "memory(GiB)": 77.56, "step": 74615, "token_acc": 0.4708171206225681, "train_speed(iter/s)": 1.437143 }, { "epoch": 3.196949573711495, "grad_norm": 4.378702640533447, "learning_rate": 2.8797785828522543e-05, "loss": 2.076962471008301, "memory(GiB)": 77.56, "step": 74620, "token_acc": 0.5513307984790875, "train_speed(iter/s)": 1.437146 }, { "epoch": 3.1971637890407436, "grad_norm": 4.514094352722168, "learning_rate": 2.8791691259133745e-05, "loss": 2.4175682067871094, "memory(GiB)": 77.56, "step": 74625, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.437141 }, { "epoch": 3.197378004369993, "grad_norm": 6.597666263580322, "learning_rate": 2.878559707395333e-05, "loss": 2.5885677337646484, "memory(GiB)": 77.56, "step": 74630, "token_acc": 0.44785276073619634, "train_speed(iter/s)": 1.437155 }, { "epoch": 3.1975922196992417, "grad_norm": 4.421172142028809, "learning_rate": 2.8779503273091746e-05, "loss": 2.02491455078125, "memory(GiB)": 77.56, "step": 74635, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.437159 }, { "epoch": 3.1978064350284905, "grad_norm": 5.090878963470459, "learning_rate": 2.877340985665936e-05, "loss": 2.332314300537109, "memory(GiB)": 77.56, "step": 74640, "token_acc": 0.4983277591973244, "train_speed(iter/s)": 1.437172 }, { "epoch": 3.1980206503577397, "grad_norm": 5.172229766845703, "learning_rate": 2.8767316824766577e-05, "loss": 2.2965587615966796, "memory(GiB)": 77.56, "step": 74645, "token_acc": 0.5114754098360655, "train_speed(iter/s)": 1.437157 }, { "epoch": 3.1982348656869886, "grad_norm": 5.844092845916748, "learning_rate": 2.876122417752376e-05, "loss": 2.5554794311523437, "memory(GiB)": 77.56, "step": 74650, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437172 }, { "epoch": 3.1984490810162374, "grad_norm": 6.179689407348633, "learning_rate": 2.8755131915041277e-05, "loss": 2.313243865966797, "memory(GiB)": 77.56, "step": 74655, "token_acc": 0.5213675213675214, "train_speed(iter/s)": 1.437165 }, { "epoch": 3.1986632963454866, "grad_norm": 5.895142555236816, "learning_rate": 2.8749040037429527e-05, "loss": 2.370087814331055, "memory(GiB)": 77.56, "step": 74660, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 1.437158 }, { "epoch": 3.1988775116747354, "grad_norm": 5.770688056945801, "learning_rate": 2.874294854479885e-05, "loss": 2.162943458557129, "memory(GiB)": 77.56, "step": 74665, "token_acc": 0.539622641509434, "train_speed(iter/s)": 1.437179 }, { "epoch": 3.1990917270039843, "grad_norm": 5.231703758239746, "learning_rate": 2.8736857437259602e-05, "loss": 2.577847480773926, "memory(GiB)": 77.56, "step": 74670, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.437162 }, { "epoch": 3.1993059423332335, "grad_norm": 7.476298809051514, "learning_rate": 2.8730766714922126e-05, "loss": 2.3307313919067383, "memory(GiB)": 77.56, "step": 74675, "token_acc": 0.4754601226993865, "train_speed(iter/s)": 1.437162 }, { "epoch": 3.1995201576624823, "grad_norm": 7.090126991271973, "learning_rate": 2.8724676377896764e-05, "loss": 2.261647415161133, "memory(GiB)": 77.56, "step": 74680, "token_acc": 0.5160142348754448, "train_speed(iter/s)": 1.437156 }, { "epoch": 3.199734372991731, "grad_norm": 6.847160339355469, "learning_rate": 2.8718586426293825e-05, "loss": 2.4295669555664063, "memory(GiB)": 77.56, "step": 74685, "token_acc": 0.5274725274725275, "train_speed(iter/s)": 1.437158 }, { "epoch": 3.1999485883209804, "grad_norm": 6.437007904052734, "learning_rate": 2.8712496860223658e-05, "loss": 2.369749069213867, "memory(GiB)": 77.56, "step": 74690, "token_acc": 0.4613003095975232, "train_speed(iter/s)": 1.43715 }, { "epoch": 3.200162803650229, "grad_norm": 8.268362045288086, "learning_rate": 2.8706407679796605e-05, "loss": 2.221266174316406, "memory(GiB)": 77.56, "step": 74695, "token_acc": 0.5562700964630225, "train_speed(iter/s)": 1.437149 }, { "epoch": 3.200377018979478, "grad_norm": 5.253814697265625, "learning_rate": 2.8700318885122944e-05, "loss": 2.4045394897460937, "memory(GiB)": 77.56, "step": 74700, "token_acc": 0.5040871934604905, "train_speed(iter/s)": 1.437168 }, { "epoch": 3.2005912343087273, "grad_norm": 7.181920051574707, "learning_rate": 2.869423047631299e-05, "loss": 2.4228822708129885, "memory(GiB)": 77.56, "step": 74705, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.437155 }, { "epoch": 3.200805449637976, "grad_norm": 4.873536109924316, "learning_rate": 2.868814245347704e-05, "loss": 2.4592336654663085, "memory(GiB)": 77.56, "step": 74710, "token_acc": 0.5015479876160991, "train_speed(iter/s)": 1.437164 }, { "epoch": 3.201019664967225, "grad_norm": 9.333537101745605, "learning_rate": 2.8682054816725358e-05, "loss": 2.4023914337158203, "memory(GiB)": 77.56, "step": 74715, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.437144 }, { "epoch": 3.201233880296474, "grad_norm": 6.377510070800781, "learning_rate": 2.867596756616827e-05, "loss": 2.5542667388916014, "memory(GiB)": 77.56, "step": 74720, "token_acc": 0.5171428571428571, "train_speed(iter/s)": 1.437134 }, { "epoch": 3.201448095625723, "grad_norm": 5.612009048461914, "learning_rate": 2.866988070191604e-05, "loss": 2.253769302368164, "memory(GiB)": 77.56, "step": 74725, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.437135 }, { "epoch": 3.201662310954972, "grad_norm": 5.75795841217041, "learning_rate": 2.8663794224078926e-05, "loss": 2.2889842987060547, "memory(GiB)": 77.56, "step": 74730, "token_acc": 0.5131086142322098, "train_speed(iter/s)": 1.437146 }, { "epoch": 3.201876526284221, "grad_norm": 4.786227226257324, "learning_rate": 2.865770813276719e-05, "loss": 2.4804405212402343, "memory(GiB)": 77.56, "step": 74735, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.437166 }, { "epoch": 3.20209074161347, "grad_norm": 7.432167053222656, "learning_rate": 2.86516224280911e-05, "loss": 2.599211311340332, "memory(GiB)": 77.56, "step": 74740, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.437181 }, { "epoch": 3.2023049569427187, "grad_norm": 6.44944953918457, "learning_rate": 2.8645537110160873e-05, "loss": 2.394886779785156, "memory(GiB)": 77.56, "step": 74745, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.437203 }, { "epoch": 3.202519172271968, "grad_norm": 5.755970001220703, "learning_rate": 2.8639452179086788e-05, "loss": 2.422623634338379, "memory(GiB)": 77.56, "step": 74750, "token_acc": 0.4738955823293173, "train_speed(iter/s)": 1.437193 }, { "epoch": 3.2027333876012167, "grad_norm": 8.042232513427734, "learning_rate": 2.8633367634979075e-05, "loss": 2.2414199829101564, "memory(GiB)": 77.56, "step": 74755, "token_acc": 0.5, "train_speed(iter/s)": 1.437226 }, { "epoch": 3.2029476029304655, "grad_norm": 7.1323628425598145, "learning_rate": 2.8627283477947942e-05, "loss": 2.407971000671387, "memory(GiB)": 77.56, "step": 74760, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.437237 }, { "epoch": 3.203161818259715, "grad_norm": 6.79801082611084, "learning_rate": 2.8621199708103606e-05, "loss": 2.571414756774902, "memory(GiB)": 77.56, "step": 74765, "token_acc": 0.43630573248407645, "train_speed(iter/s)": 1.437235 }, { "epoch": 3.2033760335889636, "grad_norm": 7.384902477264404, "learning_rate": 2.8615116325556307e-05, "loss": 2.5754440307617186, "memory(GiB)": 77.56, "step": 74770, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.437232 }, { "epoch": 3.2035902489182124, "grad_norm": 4.7515764236450195, "learning_rate": 2.8609033330416213e-05, "loss": 2.2517416000366213, "memory(GiB)": 77.56, "step": 74775, "token_acc": 0.5015673981191222, "train_speed(iter/s)": 1.437231 }, { "epoch": 3.2038044642474617, "grad_norm": 6.234803199768066, "learning_rate": 2.8602950722793577e-05, "loss": 2.364109420776367, "memory(GiB)": 77.56, "step": 74780, "token_acc": 0.5096153846153846, "train_speed(iter/s)": 1.437246 }, { "epoch": 3.2040186795767105, "grad_norm": 5.362788200378418, "learning_rate": 2.859686850279855e-05, "loss": 2.067121696472168, "memory(GiB)": 77.56, "step": 74785, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 1.437259 }, { "epoch": 3.2042328949059593, "grad_norm": 6.4607648849487305, "learning_rate": 2.8590786670541336e-05, "loss": 2.260964775085449, "memory(GiB)": 77.56, "step": 74790, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.437279 }, { "epoch": 3.2044471102352086, "grad_norm": 8.414200782775879, "learning_rate": 2.8584705226132096e-05, "loss": 2.3255826950073244, "memory(GiB)": 77.56, "step": 74795, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.437281 }, { "epoch": 3.2046613255644574, "grad_norm": 6.140413284301758, "learning_rate": 2.8578624169680994e-05, "loss": 2.3195894241333006, "memory(GiB)": 77.56, "step": 74800, "token_acc": 0.4707692307692308, "train_speed(iter/s)": 1.437286 }, { "epoch": 3.204875540893706, "grad_norm": 5.533069133758545, "learning_rate": 2.857254350129823e-05, "loss": 2.282148551940918, "memory(GiB)": 77.56, "step": 74805, "token_acc": 0.5373665480427047, "train_speed(iter/s)": 1.437279 }, { "epoch": 3.2050897562229554, "grad_norm": 4.75316047668457, "learning_rate": 2.8566463221093943e-05, "loss": 2.393589210510254, "memory(GiB)": 77.56, "step": 74810, "token_acc": 0.49201277955271566, "train_speed(iter/s)": 1.437285 }, { "epoch": 3.2053039715522043, "grad_norm": 7.1517333984375, "learning_rate": 2.8560383329178286e-05, "loss": 2.2573310852050783, "memory(GiB)": 77.56, "step": 74815, "token_acc": 0.5344827586206896, "train_speed(iter/s)": 1.437278 }, { "epoch": 3.205518186881453, "grad_norm": 8.25438404083252, "learning_rate": 2.8554303825661388e-05, "loss": 2.416891670227051, "memory(GiB)": 77.56, "step": 74820, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.437283 }, { "epoch": 3.2057324022107023, "grad_norm": 6.156642913818359, "learning_rate": 2.8548224710653393e-05, "loss": 2.2529838562011717, "memory(GiB)": 77.56, "step": 74825, "token_acc": 0.5443548387096774, "train_speed(iter/s)": 1.437263 }, { "epoch": 3.205946617539951, "grad_norm": 6.6809306144714355, "learning_rate": 2.8542145984264416e-05, "loss": 2.8260244369506835, "memory(GiB)": 77.56, "step": 74830, "token_acc": 0.43478260869565216, "train_speed(iter/s)": 1.437274 }, { "epoch": 3.2061608328692, "grad_norm": 8.493712425231934, "learning_rate": 2.8536067646604592e-05, "loss": 2.6808374404907225, "memory(GiB)": 77.56, "step": 74835, "token_acc": 0.4332129963898917, "train_speed(iter/s)": 1.43728 }, { "epoch": 3.206375048198449, "grad_norm": 6.103187084197998, "learning_rate": 2.852998969778406e-05, "loss": 2.135850524902344, "memory(GiB)": 77.56, "step": 74840, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.43727 }, { "epoch": 3.206589263527698, "grad_norm": 6.107016086578369, "learning_rate": 2.85239121379129e-05, "loss": 2.1684507369995116, "memory(GiB)": 77.56, "step": 74845, "token_acc": 0.5606060606060606, "train_speed(iter/s)": 1.437264 }, { "epoch": 3.206803478856947, "grad_norm": 5.628046989440918, "learning_rate": 2.851783496710122e-05, "loss": 2.1371402740478516, "memory(GiB)": 77.56, "step": 74850, "token_acc": 0.5485074626865671, "train_speed(iter/s)": 1.43727 }, { "epoch": 3.207017694186196, "grad_norm": 6.466119766235352, "learning_rate": 2.8511758185459113e-05, "loss": 2.454837989807129, "memory(GiB)": 77.56, "step": 74855, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.437278 }, { "epoch": 3.207231909515445, "grad_norm": 5.993646621704102, "learning_rate": 2.850568179309665e-05, "loss": 2.453556251525879, "memory(GiB)": 77.56, "step": 74860, "token_acc": 0.5, "train_speed(iter/s)": 1.437286 }, { "epoch": 3.2074461248446937, "grad_norm": 5.757293701171875, "learning_rate": 2.8499605790123935e-05, "loss": 2.4333534240722656, "memory(GiB)": 77.56, "step": 74865, "token_acc": 0.48589341692789967, "train_speed(iter/s)": 1.437294 }, { "epoch": 3.207660340173943, "grad_norm": 6.234353542327881, "learning_rate": 2.8493530176651028e-05, "loss": 2.1053205490112306, "memory(GiB)": 77.56, "step": 74870, "token_acc": 0.5177865612648221, "train_speed(iter/s)": 1.437316 }, { "epoch": 3.207874555503192, "grad_norm": 4.956779956817627, "learning_rate": 2.8487454952788008e-05, "loss": 2.4120140075683594, "memory(GiB)": 77.56, "step": 74875, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.437312 }, { "epoch": 3.2080887708324406, "grad_norm": 5.676421165466309, "learning_rate": 2.848138011864491e-05, "loss": 2.247556686401367, "memory(GiB)": 77.56, "step": 74880, "token_acc": 0.523972602739726, "train_speed(iter/s)": 1.437312 }, { "epoch": 3.20830298616169, "grad_norm": 7.058403968811035, "learning_rate": 2.8475305674331808e-05, "loss": 2.232187843322754, "memory(GiB)": 77.56, "step": 74885, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.437339 }, { "epoch": 3.2085172014909387, "grad_norm": 6.966588973999023, "learning_rate": 2.8469231619958708e-05, "loss": 2.1898117065429688, "memory(GiB)": 77.56, "step": 74890, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.437328 }, { "epoch": 3.2087314168201875, "grad_norm": 6.0842156410217285, "learning_rate": 2.84631579556357e-05, "loss": 2.247432518005371, "memory(GiB)": 77.56, "step": 74895, "token_acc": 0.5372670807453416, "train_speed(iter/s)": 1.437301 }, { "epoch": 3.2089456321494367, "grad_norm": 6.363500595092773, "learning_rate": 2.845708468147278e-05, "loss": 2.1914358139038086, "memory(GiB)": 77.56, "step": 74900, "token_acc": 0.5355648535564853, "train_speed(iter/s)": 1.437293 }, { "epoch": 3.2091598474786855, "grad_norm": 7.665328025817871, "learning_rate": 2.8451011797579985e-05, "loss": 2.3460342407226564, "memory(GiB)": 77.56, "step": 74905, "token_acc": 0.5058365758754864, "train_speed(iter/s)": 1.437277 }, { "epoch": 3.2093740628079344, "grad_norm": 6.302661895751953, "learning_rate": 2.8444939304067307e-05, "loss": 2.4604684829711916, "memory(GiB)": 77.56, "step": 74910, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.437296 }, { "epoch": 3.2095882781371836, "grad_norm": 9.885723114013672, "learning_rate": 2.8438867201044794e-05, "loss": 2.3899681091308596, "memory(GiB)": 77.56, "step": 74915, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.437319 }, { "epoch": 3.2098024934664324, "grad_norm": 6.052088737487793, "learning_rate": 2.843279548862241e-05, "loss": 2.3846935272216796, "memory(GiB)": 77.56, "step": 74920, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.437344 }, { "epoch": 3.2100167087956812, "grad_norm": 7.404747009277344, "learning_rate": 2.842672416691018e-05, "loss": 2.2464561462402344, "memory(GiB)": 77.56, "step": 74925, "token_acc": 0.5331010452961672, "train_speed(iter/s)": 1.43736 }, { "epoch": 3.2102309241249305, "grad_norm": 6.648402690887451, "learning_rate": 2.8420653236018086e-05, "loss": 2.375318717956543, "memory(GiB)": 77.56, "step": 74930, "token_acc": 0.5214521452145214, "train_speed(iter/s)": 1.43737 }, { "epoch": 3.2104451394541793, "grad_norm": 5.092421054840088, "learning_rate": 2.8414582696056102e-05, "loss": 2.628529167175293, "memory(GiB)": 77.56, "step": 74935, "token_acc": 0.4707792207792208, "train_speed(iter/s)": 1.437363 }, { "epoch": 3.210659354783428, "grad_norm": 5.738795757293701, "learning_rate": 2.840851254713419e-05, "loss": 2.414845275878906, "memory(GiB)": 77.56, "step": 74940, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.437377 }, { "epoch": 3.2108735701126774, "grad_norm": 4.775619983673096, "learning_rate": 2.8402442789362322e-05, "loss": 2.3733440399169923, "memory(GiB)": 77.56, "step": 74945, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.43739 }, { "epoch": 3.211087785441926, "grad_norm": 5.267678260803223, "learning_rate": 2.8396373422850476e-05, "loss": 2.30804386138916, "memory(GiB)": 77.56, "step": 74950, "token_acc": 0.5439739413680782, "train_speed(iter/s)": 1.437375 }, { "epoch": 3.211302000771175, "grad_norm": 6.17446756362915, "learning_rate": 2.8390304447708594e-05, "loss": 2.3438674926757814, "memory(GiB)": 77.56, "step": 74955, "token_acc": 0.46835443037974683, "train_speed(iter/s)": 1.437363 }, { "epoch": 3.2115162161004243, "grad_norm": 8.948515892028809, "learning_rate": 2.8384235864046614e-05, "loss": 2.4184532165527344, "memory(GiB)": 77.56, "step": 74960, "token_acc": 0.5122950819672131, "train_speed(iter/s)": 1.437391 }, { "epoch": 3.211730431429673, "grad_norm": 5.949466705322266, "learning_rate": 2.837816767197448e-05, "loss": 2.4688013076782225, "memory(GiB)": 77.56, "step": 74965, "token_acc": 0.43452380952380953, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.211944646758922, "grad_norm": 5.721344470977783, "learning_rate": 2.8372099871602127e-05, "loss": 2.183220863342285, "memory(GiB)": 77.56, "step": 74970, "token_acc": 0.5222929936305732, "train_speed(iter/s)": 1.437376 }, { "epoch": 3.212158862088171, "grad_norm": 5.342127799987793, "learning_rate": 2.8366032463039448e-05, "loss": 2.578084182739258, "memory(GiB)": 77.56, "step": 74975, "token_acc": 0.48942598187311176, "train_speed(iter/s)": 1.437363 }, { "epoch": 3.21237307741742, "grad_norm": 6.603104591369629, "learning_rate": 2.835996544639641e-05, "loss": 2.5666284561157227, "memory(GiB)": 77.56, "step": 74980, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.2125872927466688, "grad_norm": 5.945138454437256, "learning_rate": 2.835389882178287e-05, "loss": 2.424089050292969, "memory(GiB)": 77.56, "step": 74985, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.43739 }, { "epoch": 3.212801508075918, "grad_norm": 5.490812301635742, "learning_rate": 2.834783258930878e-05, "loss": 2.522750663757324, "memory(GiB)": 77.56, "step": 74990, "token_acc": 0.47527472527472525, "train_speed(iter/s)": 1.437373 }, { "epoch": 3.213015723405167, "grad_norm": 6.167250633239746, "learning_rate": 2.834176674908402e-05, "loss": 2.5450586318969726, "memory(GiB)": 77.56, "step": 74995, "token_acc": 0.5268456375838926, "train_speed(iter/s)": 1.437396 }, { "epoch": 3.2132299387344156, "grad_norm": 7.590094089508057, "learning_rate": 2.8335701301218472e-05, "loss": 2.250750732421875, "memory(GiB)": 77.56, "step": 75000, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.437394 }, { "epoch": 3.2132299387344156, "eval_loss": 2.2155067920684814, "eval_runtime": 14.8696, "eval_samples_per_second": 6.725, "eval_steps_per_second": 6.725, "eval_token_acc": 0.48567335243553006, "step": 75000 }, { "epoch": 3.213444154063665, "grad_norm": 5.956737995147705, "learning_rate": 2.8329636245821988e-05, "loss": 2.535017395019531, "memory(GiB)": 77.56, "step": 75005, "token_acc": 0.47834645669291337, "train_speed(iter/s)": 1.436969 }, { "epoch": 3.2136583693929137, "grad_norm": 5.7900238037109375, "learning_rate": 2.8323571583004505e-05, "loss": 2.36295223236084, "memory(GiB)": 77.56, "step": 75010, "token_acc": 0.4968553459119497, "train_speed(iter/s)": 1.436981 }, { "epoch": 3.2138725847221625, "grad_norm": 5.765345573425293, "learning_rate": 2.8317507312875845e-05, "loss": 2.2644847869873046, "memory(GiB)": 77.56, "step": 75015, "token_acc": 0.5413533834586466, "train_speed(iter/s)": 1.43696 }, { "epoch": 3.214086800051412, "grad_norm": 9.455818176269531, "learning_rate": 2.8311443435545892e-05, "loss": 2.1789180755615236, "memory(GiB)": 77.56, "step": 75020, "token_acc": 0.5306859205776173, "train_speed(iter/s)": 1.436954 }, { "epoch": 3.2143010153806606, "grad_norm": 5.057734966278076, "learning_rate": 2.8305379951124478e-05, "loss": 2.138809013366699, "memory(GiB)": 77.56, "step": 75025, "token_acc": 0.5304659498207885, "train_speed(iter/s)": 1.436954 }, { "epoch": 3.2145152307099094, "grad_norm": 11.719342231750488, "learning_rate": 2.8299316859721435e-05, "loss": 2.2895336151123047, "memory(GiB)": 77.56, "step": 75030, "token_acc": 0.503968253968254, "train_speed(iter/s)": 1.436963 }, { "epoch": 3.2147294460391587, "grad_norm": 4.955362796783447, "learning_rate": 2.829325416144665e-05, "loss": 2.384970283508301, "memory(GiB)": 77.56, "step": 75035, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.43699 }, { "epoch": 3.2149436613684075, "grad_norm": 5.784004211425781, "learning_rate": 2.8287191856409935e-05, "loss": 2.0993122100830077, "memory(GiB)": 77.56, "step": 75040, "token_acc": 0.539568345323741, "train_speed(iter/s)": 1.436986 }, { "epoch": 3.2151578766976563, "grad_norm": 6.29775333404541, "learning_rate": 2.8281129944721096e-05, "loss": 2.2767024993896485, "memory(GiB)": 77.56, "step": 75045, "token_acc": 0.5347222222222222, "train_speed(iter/s)": 1.436987 }, { "epoch": 3.2153720920269055, "grad_norm": 10.267982482910156, "learning_rate": 2.8275068426489965e-05, "loss": 2.212834358215332, "memory(GiB)": 77.56, "step": 75050, "token_acc": 0.5486381322957199, "train_speed(iter/s)": 1.43701 }, { "epoch": 3.2155863073561544, "grad_norm": 7.0684332847595215, "learning_rate": 2.826900730182635e-05, "loss": 2.7192367553710937, "memory(GiB)": 77.56, "step": 75055, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.437 }, { "epoch": 3.215800522685403, "grad_norm": 4.134343147277832, "learning_rate": 2.8262946570840032e-05, "loss": 2.333767127990723, "memory(GiB)": 77.56, "step": 75060, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.437018 }, { "epoch": 3.2160147380146524, "grad_norm": 5.532471179962158, "learning_rate": 2.8256886233640834e-05, "loss": 2.223998260498047, "memory(GiB)": 77.56, "step": 75065, "token_acc": 0.4970414201183432, "train_speed(iter/s)": 1.437054 }, { "epoch": 3.2162289533439012, "grad_norm": 4.45161771774292, "learning_rate": 2.825082629033855e-05, "loss": 2.2879053115844727, "memory(GiB)": 77.56, "step": 75070, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 1.437069 }, { "epoch": 3.21644316867315, "grad_norm": 5.835312843322754, "learning_rate": 2.824476674104295e-05, "loss": 2.675983428955078, "memory(GiB)": 77.56, "step": 75075, "token_acc": 0.4334470989761092, "train_speed(iter/s)": 1.437073 }, { "epoch": 3.2166573840023993, "grad_norm": 9.023283004760742, "learning_rate": 2.823870758586381e-05, "loss": 2.3446048736572265, "memory(GiB)": 77.56, "step": 75080, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437087 }, { "epoch": 3.216871599331648, "grad_norm": 5.886406898498535, "learning_rate": 2.8232648824910902e-05, "loss": 2.3803035736083986, "memory(GiB)": 77.56, "step": 75085, "token_acc": 0.5, "train_speed(iter/s)": 1.437043 }, { "epoch": 3.217085814660897, "grad_norm": 7.387778282165527, "learning_rate": 2.8226590458293957e-05, "loss": 2.4575361251831054, "memory(GiB)": 77.56, "step": 75090, "token_acc": 0.4541984732824427, "train_speed(iter/s)": 1.43706 }, { "epoch": 3.217300029990146, "grad_norm": 5.371993064880371, "learning_rate": 2.822053248612277e-05, "loss": 2.256745719909668, "memory(GiB)": 77.56, "step": 75095, "token_acc": 0.5444015444015444, "train_speed(iter/s)": 1.437059 }, { "epoch": 3.217514245319395, "grad_norm": 6.534891605377197, "learning_rate": 2.8214474908507066e-05, "loss": 2.4698951721191404, "memory(GiB)": 77.56, "step": 75100, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.437066 }, { "epoch": 3.217728460648644, "grad_norm": 6.713491916656494, "learning_rate": 2.820841772555659e-05, "loss": 2.290480041503906, "memory(GiB)": 77.56, "step": 75105, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.437085 }, { "epoch": 3.217942675977893, "grad_norm": 4.971664905548096, "learning_rate": 2.8202360937381066e-05, "loss": 2.0735591888427733, "memory(GiB)": 77.56, "step": 75110, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437073 }, { "epoch": 3.218156891307142, "grad_norm": 6.009228706359863, "learning_rate": 2.8196304544090225e-05, "loss": 2.1902050018310546, "memory(GiB)": 77.56, "step": 75115, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.437092 }, { "epoch": 3.2183711066363907, "grad_norm": 4.496587753295898, "learning_rate": 2.819024854579375e-05, "loss": 2.1751617431640624, "memory(GiB)": 77.56, "step": 75120, "token_acc": 0.5374149659863946, "train_speed(iter/s)": 1.437119 }, { "epoch": 3.21858532196564, "grad_norm": 5.55951452255249, "learning_rate": 2.8184192942601413e-05, "loss": 2.493456268310547, "memory(GiB)": 77.56, "step": 75125, "token_acc": 0.4892086330935252, "train_speed(iter/s)": 1.437141 }, { "epoch": 3.2187995372948888, "grad_norm": 6.78969144821167, "learning_rate": 2.8178137734622878e-05, "loss": 2.2150049209594727, "memory(GiB)": 77.56, "step": 75130, "token_acc": 0.5298013245033113, "train_speed(iter/s)": 1.437152 }, { "epoch": 3.2190137526241376, "grad_norm": 6.8576579093933105, "learning_rate": 2.8172082921967836e-05, "loss": 2.2060529708862306, "memory(GiB)": 77.56, "step": 75135, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.437164 }, { "epoch": 3.219227967953387, "grad_norm": 8.125765800476074, "learning_rate": 2.8166028504746e-05, "loss": 2.043033218383789, "memory(GiB)": 77.56, "step": 75140, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.437175 }, { "epoch": 3.2194421832826356, "grad_norm": 6.171628952026367, "learning_rate": 2.8159974483067047e-05, "loss": 2.111388397216797, "memory(GiB)": 77.56, "step": 75145, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.437188 }, { "epoch": 3.2196563986118845, "grad_norm": 6.2173380851745605, "learning_rate": 2.815392085704062e-05, "loss": 2.421305847167969, "memory(GiB)": 77.56, "step": 75150, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.437211 }, { "epoch": 3.2198706139411337, "grad_norm": 5.145447731018066, "learning_rate": 2.8147867626776437e-05, "loss": 2.236465072631836, "memory(GiB)": 77.56, "step": 75155, "token_acc": 0.5288753799392097, "train_speed(iter/s)": 1.437207 }, { "epoch": 3.2200848292703825, "grad_norm": 4.8295464515686035, "learning_rate": 2.8141814792384123e-05, "loss": 2.476268196105957, "memory(GiB)": 77.56, "step": 75160, "token_acc": 0.48546511627906974, "train_speed(iter/s)": 1.437178 }, { "epoch": 3.2202990445996313, "grad_norm": 5.567069053649902, "learning_rate": 2.813576235397334e-05, "loss": 2.286100387573242, "memory(GiB)": 77.56, "step": 75165, "token_acc": 0.5159010600706714, "train_speed(iter/s)": 1.43718 }, { "epoch": 3.2205132599288806, "grad_norm": 6.422302722930908, "learning_rate": 2.8129710311653735e-05, "loss": 2.7097951889038088, "memory(GiB)": 77.56, "step": 75170, "token_acc": 0.42724458204334365, "train_speed(iter/s)": 1.437193 }, { "epoch": 3.2207274752581294, "grad_norm": 4.312794208526611, "learning_rate": 2.8123658665534913e-05, "loss": 2.155950164794922, "memory(GiB)": 77.56, "step": 75175, "token_acc": 0.5409836065573771, "train_speed(iter/s)": 1.437198 }, { "epoch": 3.2209416905873782, "grad_norm": 5.756070613861084, "learning_rate": 2.8117607415726565e-05, "loss": 2.342049789428711, "memory(GiB)": 77.56, "step": 75180, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.437186 }, { "epoch": 3.2211559059166275, "grad_norm": 9.422969818115234, "learning_rate": 2.8111556562338292e-05, "loss": 2.0476539611816404, "memory(GiB)": 77.56, "step": 75185, "token_acc": 0.5247148288973384, "train_speed(iter/s)": 1.437206 }, { "epoch": 3.2213701212458763, "grad_norm": 6.327995777130127, "learning_rate": 2.8105506105479695e-05, "loss": 2.227604103088379, "memory(GiB)": 77.56, "step": 75190, "token_acc": 0.531055900621118, "train_speed(iter/s)": 1.437183 }, { "epoch": 3.221584336575125, "grad_norm": 6.0252556800842285, "learning_rate": 2.8099456045260396e-05, "loss": 2.4878944396972655, "memory(GiB)": 77.56, "step": 75195, "token_acc": 0.5313432835820896, "train_speed(iter/s)": 1.437192 }, { "epoch": 3.2217985519043744, "grad_norm": 4.432491302490234, "learning_rate": 2.809340638179e-05, "loss": 2.486052322387695, "memory(GiB)": 77.56, "step": 75200, "token_acc": 0.48703170028818443, "train_speed(iter/s)": 1.437192 }, { "epoch": 3.222012767233623, "grad_norm": 6.828469753265381, "learning_rate": 2.808735711517807e-05, "loss": 2.2316352844238283, "memory(GiB)": 77.56, "step": 75205, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.43719 }, { "epoch": 3.222226982562872, "grad_norm": 4.746007919311523, "learning_rate": 2.8081308245534228e-05, "loss": 2.373458480834961, "memory(GiB)": 77.56, "step": 75210, "token_acc": 0.5167173252279635, "train_speed(iter/s)": 1.437204 }, { "epoch": 3.2224411978921212, "grad_norm": 5.120621681213379, "learning_rate": 2.8075259772968064e-05, "loss": 2.481466865539551, "memory(GiB)": 77.56, "step": 75215, "token_acc": 0.47575757575757577, "train_speed(iter/s)": 1.437231 }, { "epoch": 3.22265541322137, "grad_norm": 6.66160249710083, "learning_rate": 2.8069211697589138e-05, "loss": 2.378527069091797, "memory(GiB)": 77.56, "step": 75220, "token_acc": 0.5050505050505051, "train_speed(iter/s)": 1.437251 }, { "epoch": 3.222869628550619, "grad_norm": 7.113389492034912, "learning_rate": 2.8063164019507017e-05, "loss": 2.4076568603515627, "memory(GiB)": 77.56, "step": 75225, "token_acc": 0.5032051282051282, "train_speed(iter/s)": 1.437256 }, { "epoch": 3.223083843879868, "grad_norm": 5.610108852386475, "learning_rate": 2.805711673883125e-05, "loss": 2.467230796813965, "memory(GiB)": 77.56, "step": 75230, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.437247 }, { "epoch": 3.223298059209117, "grad_norm": 4.766554355621338, "learning_rate": 2.8051069855671387e-05, "loss": 2.264450454711914, "memory(GiB)": 77.56, "step": 75235, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.437263 }, { "epoch": 3.2235122745383658, "grad_norm": 5.303033828735352, "learning_rate": 2.8045023370136996e-05, "loss": 2.5186840057373048, "memory(GiB)": 77.56, "step": 75240, "token_acc": 0.5190311418685121, "train_speed(iter/s)": 1.437268 }, { "epoch": 3.223726489867615, "grad_norm": 5.408699989318848, "learning_rate": 2.8038977282337607e-05, "loss": 2.4622920989990233, "memory(GiB)": 77.56, "step": 75245, "token_acc": 0.49363057324840764, "train_speed(iter/s)": 1.437283 }, { "epoch": 3.223940705196864, "grad_norm": 6.971130847930908, "learning_rate": 2.8032931592382743e-05, "loss": 2.4556713104248047, "memory(GiB)": 77.56, "step": 75250, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.43729 }, { "epoch": 3.2241549205261126, "grad_norm": 6.508571147918701, "learning_rate": 2.802688630038193e-05, "loss": 2.23946533203125, "memory(GiB)": 77.56, "step": 75255, "token_acc": 0.5377049180327869, "train_speed(iter/s)": 1.437303 }, { "epoch": 3.224369135855362, "grad_norm": 4.787049770355225, "learning_rate": 2.8020841406444687e-05, "loss": 2.3575567245483398, "memory(GiB)": 77.56, "step": 75260, "token_acc": 0.5, "train_speed(iter/s)": 1.437315 }, { "epoch": 3.2245833511846107, "grad_norm": 5.38545560836792, "learning_rate": 2.8014796910680498e-05, "loss": 2.28045597076416, "memory(GiB)": 77.56, "step": 75265, "token_acc": 0.5251798561151079, "train_speed(iter/s)": 1.43731 }, { "epoch": 3.2247975665138595, "grad_norm": 6.3859076499938965, "learning_rate": 2.800875281319891e-05, "loss": 2.3727489471435548, "memory(GiB)": 77.56, "step": 75270, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.437299 }, { "epoch": 3.2250117818431088, "grad_norm": 6.496909141540527, "learning_rate": 2.8002709114109392e-05, "loss": 2.2590238571166994, "memory(GiB)": 77.56, "step": 75275, "token_acc": 0.5354609929078015, "train_speed(iter/s)": 1.437334 }, { "epoch": 3.2252259971723576, "grad_norm": 8.583075523376465, "learning_rate": 2.799666581352141e-05, "loss": 2.1971153259277343, "memory(GiB)": 77.56, "step": 75280, "token_acc": 0.5498154981549815, "train_speed(iter/s)": 1.43733 }, { "epoch": 3.2254402125016064, "grad_norm": 5.539919853210449, "learning_rate": 2.7990622911544494e-05, "loss": 2.2781246185302733, "memory(GiB)": 77.56, "step": 75285, "token_acc": 0.4983922829581994, "train_speed(iter/s)": 1.43734 }, { "epoch": 3.2256544278308557, "grad_norm": 5.015482425689697, "learning_rate": 2.7984580408288087e-05, "loss": 2.0152210235595702, "memory(GiB)": 77.56, "step": 75290, "token_acc": 0.5306859205776173, "train_speed(iter/s)": 1.437348 }, { "epoch": 3.2258686431601045, "grad_norm": 5.499642848968506, "learning_rate": 2.7978538303861635e-05, "loss": 2.6526565551757812, "memory(GiB)": 77.56, "step": 75295, "token_acc": 0.45674740484429066, "train_speed(iter/s)": 1.437332 }, { "epoch": 3.2260828584893533, "grad_norm": 5.827627658843994, "learning_rate": 2.797249659837464e-05, "loss": 2.2477169036865234, "memory(GiB)": 77.56, "step": 75300, "token_acc": 0.5062111801242236, "train_speed(iter/s)": 1.437283 }, { "epoch": 3.2262970738186025, "grad_norm": 5.2502336502075195, "learning_rate": 2.796645529193654e-05, "loss": 2.3638214111328124, "memory(GiB)": 77.56, "step": 75305, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437307 }, { "epoch": 3.2265112891478513, "grad_norm": 6.081926345825195, "learning_rate": 2.7960414384656763e-05, "loss": 2.5635066986083985, "memory(GiB)": 77.56, "step": 75310, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.43732 }, { "epoch": 3.2267255044771, "grad_norm": 5.67254114151001, "learning_rate": 2.7954373876644756e-05, "loss": 2.7994102478027343, "memory(GiB)": 77.56, "step": 75315, "token_acc": 0.449685534591195, "train_speed(iter/s)": 1.437291 }, { "epoch": 3.2269397198063494, "grad_norm": 5.713555812835693, "learning_rate": 2.7948333768009932e-05, "loss": 2.2783538818359377, "memory(GiB)": 77.56, "step": 75320, "token_acc": 0.5171339563862928, "train_speed(iter/s)": 1.437309 }, { "epoch": 3.2271539351355982, "grad_norm": 4.630133628845215, "learning_rate": 2.7942294058861745e-05, "loss": 2.0302051544189452, "memory(GiB)": 77.56, "step": 75325, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.43729 }, { "epoch": 3.227368150464847, "grad_norm": 10.440735816955566, "learning_rate": 2.7936254749309594e-05, "loss": 2.3708826065063477, "memory(GiB)": 77.56, "step": 75330, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.437282 }, { "epoch": 3.2275823657940963, "grad_norm": 6.008800506591797, "learning_rate": 2.7930215839462882e-05, "loss": 2.4622371673583983, "memory(GiB)": 77.56, "step": 75335, "token_acc": 0.4983388704318937, "train_speed(iter/s)": 1.437262 }, { "epoch": 3.227796581123345, "grad_norm": 6.473403453826904, "learning_rate": 2.792417732943101e-05, "loss": 2.14124755859375, "memory(GiB)": 77.56, "step": 75340, "token_acc": 0.5330739299610895, "train_speed(iter/s)": 1.437262 }, { "epoch": 3.228010796452594, "grad_norm": 6.101529598236084, "learning_rate": 2.7918139219323386e-05, "loss": 2.278671455383301, "memory(GiB)": 77.56, "step": 75345, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.43728 }, { "epoch": 3.228225011781843, "grad_norm": 7.243205547332764, "learning_rate": 2.7912101509249357e-05, "loss": 2.33863525390625, "memory(GiB)": 77.56, "step": 75350, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.437288 }, { "epoch": 3.228439227111092, "grad_norm": 5.071996212005615, "learning_rate": 2.7906064199318327e-05, "loss": 2.5244226455688477, "memory(GiB)": 77.56, "step": 75355, "token_acc": 0.49216300940438873, "train_speed(iter/s)": 1.43729 }, { "epoch": 3.228653442440341, "grad_norm": 5.749786853790283, "learning_rate": 2.7900027289639695e-05, "loss": 2.1004411697387697, "memory(GiB)": 77.56, "step": 75360, "token_acc": 0.5283687943262412, "train_speed(iter/s)": 1.437306 }, { "epoch": 3.22886765776959, "grad_norm": 5.683716773986816, "learning_rate": 2.7893990780322798e-05, "loss": 2.262539100646973, "memory(GiB)": 77.56, "step": 75365, "token_acc": 0.5447154471544715, "train_speed(iter/s)": 1.437333 }, { "epoch": 3.229081873098839, "grad_norm": 4.799079895019531, "learning_rate": 2.7887954671477002e-05, "loss": 2.414706230163574, "memory(GiB)": 77.56, "step": 75370, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.437313 }, { "epoch": 3.2292960884280877, "grad_norm": 4.655165195465088, "learning_rate": 2.7881918963211638e-05, "loss": 2.20379638671875, "memory(GiB)": 77.56, "step": 75375, "token_acc": 0.5082508250825083, "train_speed(iter/s)": 1.437334 }, { "epoch": 3.229510303757337, "grad_norm": 6.18972635269165, "learning_rate": 2.787588365563605e-05, "loss": 2.529495620727539, "memory(GiB)": 77.56, "step": 75380, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.437345 }, { "epoch": 3.2297245190865858, "grad_norm": 6.779815196990967, "learning_rate": 2.7869848748859596e-05, "loss": 2.612514877319336, "memory(GiB)": 77.56, "step": 75385, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.437351 }, { "epoch": 3.2299387344158346, "grad_norm": 7.4448065757751465, "learning_rate": 2.786381424299159e-05, "loss": 2.260715103149414, "memory(GiB)": 77.56, "step": 75390, "token_acc": 0.5755102040816327, "train_speed(iter/s)": 1.43735 }, { "epoch": 3.230152949745084, "grad_norm": 6.15609884262085, "learning_rate": 2.7857780138141365e-05, "loss": 2.491611862182617, "memory(GiB)": 77.56, "step": 75395, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.437339 }, { "epoch": 3.2303671650743326, "grad_norm": 6.680952548980713, "learning_rate": 2.785174643441822e-05, "loss": 2.2012187957763674, "memory(GiB)": 77.56, "step": 75400, "token_acc": 0.5331125827814569, "train_speed(iter/s)": 1.43735 }, { "epoch": 3.2305813804035814, "grad_norm": 5.765521049499512, "learning_rate": 2.7845713131931462e-05, "loss": 2.6777210235595703, "memory(GiB)": 77.56, "step": 75405, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 1.43737 }, { "epoch": 3.2307955957328307, "grad_norm": 7.155279159545898, "learning_rate": 2.783968023079038e-05, "loss": 1.909425926208496, "memory(GiB)": 77.56, "step": 75410, "token_acc": 0.5464684014869888, "train_speed(iter/s)": 1.437389 }, { "epoch": 3.2310098110620795, "grad_norm": 6.343012809753418, "learning_rate": 2.7833647731104284e-05, "loss": 2.2746932983398436, "memory(GiB)": 77.56, "step": 75415, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.437408 }, { "epoch": 3.2312240263913283, "grad_norm": 6.070235252380371, "learning_rate": 2.7827615632982462e-05, "loss": 2.3570247650146485, "memory(GiB)": 77.56, "step": 75420, "token_acc": 0.5233333333333333, "train_speed(iter/s)": 1.437413 }, { "epoch": 3.2314382417205776, "grad_norm": 4.9791579246521, "learning_rate": 2.782158393653419e-05, "loss": 2.5933231353759765, "memory(GiB)": 77.56, "step": 75425, "token_acc": 0.48066298342541436, "train_speed(iter/s)": 1.437422 }, { "epoch": 3.2316524570498264, "grad_norm": 4.7238054275512695, "learning_rate": 2.7815552641868703e-05, "loss": 2.2195587158203125, "memory(GiB)": 77.56, "step": 75430, "token_acc": 0.5447470817120622, "train_speed(iter/s)": 1.437443 }, { "epoch": 3.231866672379075, "grad_norm": 5.422077655792236, "learning_rate": 2.78095217490953e-05, "loss": 2.188327598571777, "memory(GiB)": 77.56, "step": 75435, "token_acc": 0.5130111524163569, "train_speed(iter/s)": 1.437459 }, { "epoch": 3.2320808877083245, "grad_norm": 4.599216938018799, "learning_rate": 2.7803491258323224e-05, "loss": 2.462245559692383, "memory(GiB)": 77.56, "step": 75440, "token_acc": 0.5097493036211699, "train_speed(iter/s)": 1.437437 }, { "epoch": 3.2322951030375733, "grad_norm": 6.823783874511719, "learning_rate": 2.7797461169661735e-05, "loss": 2.2501415252685546, "memory(GiB)": 77.56, "step": 75445, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.437448 }, { "epoch": 3.2325093183668225, "grad_norm": 5.404219627380371, "learning_rate": 2.779143148322007e-05, "loss": 2.2669036865234373, "memory(GiB)": 77.56, "step": 75450, "token_acc": 0.5157593123209169, "train_speed(iter/s)": 1.437468 }, { "epoch": 3.2327235336960713, "grad_norm": 6.246007442474365, "learning_rate": 2.7785402199107456e-05, "loss": 2.183239555358887, "memory(GiB)": 77.56, "step": 75455, "token_acc": 0.5539033457249071, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.23293774902532, "grad_norm": 5.3288254737854, "learning_rate": 2.7779373317433122e-05, "loss": 2.436257743835449, "memory(GiB)": 77.56, "step": 75460, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.437439 }, { "epoch": 3.2331519643545694, "grad_norm": 5.885639190673828, "learning_rate": 2.777334483830626e-05, "loss": 2.19609375, "memory(GiB)": 77.56, "step": 75465, "token_acc": 0.49517684887459806, "train_speed(iter/s)": 1.437452 }, { "epoch": 3.2333661796838182, "grad_norm": 5.724212646484375, "learning_rate": 2.7767316761836125e-05, "loss": 2.309849166870117, "memory(GiB)": 77.56, "step": 75470, "token_acc": 0.49812734082397003, "train_speed(iter/s)": 1.437474 }, { "epoch": 3.233580395013067, "grad_norm": 5.691561222076416, "learning_rate": 2.7761289088131916e-05, "loss": 2.219216728210449, "memory(GiB)": 77.56, "step": 75475, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 1.437465 }, { "epoch": 3.2337946103423163, "grad_norm": 8.678397178649902, "learning_rate": 2.77552618173028e-05, "loss": 2.2646039962768554, "memory(GiB)": 77.56, "step": 75480, "token_acc": 0.4775510204081633, "train_speed(iter/s)": 1.437468 }, { "epoch": 3.234008825671565, "grad_norm": 8.968337059020996, "learning_rate": 2.7749234949457992e-05, "loss": 2.227193260192871, "memory(GiB)": 77.56, "step": 75485, "token_acc": 0.5368852459016393, "train_speed(iter/s)": 1.437486 }, { "epoch": 3.234223041000814, "grad_norm": 6.9968485832214355, "learning_rate": 2.7743208484706667e-05, "loss": 2.116176414489746, "memory(GiB)": 77.56, "step": 75490, "token_acc": 0.5372549019607843, "train_speed(iter/s)": 1.437479 }, { "epoch": 3.234437256330063, "grad_norm": 7.613739013671875, "learning_rate": 2.7737182423157982e-05, "loss": 2.484148406982422, "memory(GiB)": 77.56, "step": 75495, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.437493 }, { "epoch": 3.234651471659312, "grad_norm": 7.118156909942627, "learning_rate": 2.7731156764921112e-05, "loss": 2.338268280029297, "memory(GiB)": 77.56, "step": 75500, "token_acc": 0.5430711610486891, "train_speed(iter/s)": 1.437479 }, { "epoch": 3.234651471659312, "eval_loss": 2.196960210800171, "eval_runtime": 13.6844, "eval_samples_per_second": 7.308, "eval_steps_per_second": 7.308, "eval_token_acc": 0.4728476821192053, "step": 75500 }, { "epoch": 3.234865686988561, "grad_norm": 6.853719711303711, "learning_rate": 2.772513151010525e-05, "loss": 2.3080514907836913, "memory(GiB)": 77.56, "step": 75505, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437084 }, { "epoch": 3.23507990231781, "grad_norm": 6.332302570343018, "learning_rate": 2.7719106658819526e-05, "loss": 2.214857292175293, "memory(GiB)": 77.56, "step": 75510, "token_acc": 0.4879518072289157, "train_speed(iter/s)": 1.437072 }, { "epoch": 3.235294117647059, "grad_norm": 7.606568336486816, "learning_rate": 2.771308221117309e-05, "loss": 2.518262481689453, "memory(GiB)": 77.56, "step": 75515, "token_acc": 0.45364238410596025, "train_speed(iter/s)": 1.437048 }, { "epoch": 3.2355083329763077, "grad_norm": 6.424154281616211, "learning_rate": 2.7707058167275075e-05, "loss": 2.283746337890625, "memory(GiB)": 77.56, "step": 75520, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 1.437053 }, { "epoch": 3.235722548305557, "grad_norm": 5.926879405975342, "learning_rate": 2.770103452723459e-05, "loss": 2.4940719604492188, "memory(GiB)": 77.56, "step": 75525, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437038 }, { "epoch": 3.2359367636348058, "grad_norm": 5.421752452850342, "learning_rate": 2.769501129116081e-05, "loss": 2.3071590423583985, "memory(GiB)": 77.56, "step": 75530, "token_acc": 0.5288135593220339, "train_speed(iter/s)": 1.437029 }, { "epoch": 3.2361509789640546, "grad_norm": 6.382996082305908, "learning_rate": 2.7688988459162818e-05, "loss": 2.2297019958496094, "memory(GiB)": 77.56, "step": 75535, "token_acc": 0.4752851711026616, "train_speed(iter/s)": 1.437038 }, { "epoch": 3.236365194293304, "grad_norm": 5.532587051391602, "learning_rate": 2.7682966031349732e-05, "loss": 2.2956300735473634, "memory(GiB)": 77.56, "step": 75540, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.437035 }, { "epoch": 3.2365794096225526, "grad_norm": 10.746757507324219, "learning_rate": 2.7676944007830653e-05, "loss": 2.3829824447631838, "memory(GiB)": 77.56, "step": 75545, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.437035 }, { "epoch": 3.2367936249518015, "grad_norm": 4.990373134613037, "learning_rate": 2.767092238871467e-05, "loss": 2.4315380096435546, "memory(GiB)": 77.56, "step": 75550, "token_acc": 0.4766355140186916, "train_speed(iter/s)": 1.437021 }, { "epoch": 3.2370078402810507, "grad_norm": 5.94728422164917, "learning_rate": 2.7664901174110857e-05, "loss": 2.0546823501586915, "memory(GiB)": 77.56, "step": 75555, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.437036 }, { "epoch": 3.2372220556102995, "grad_norm": 6.0204362869262695, "learning_rate": 2.7658880364128326e-05, "loss": 2.3509057998657226, "memory(GiB)": 77.56, "step": 75560, "token_acc": 0.5015197568389058, "train_speed(iter/s)": 1.43703 }, { "epoch": 3.2374362709395483, "grad_norm": 5.720105171203613, "learning_rate": 2.7652859958876142e-05, "loss": 1.9346551895141602, "memory(GiB)": 77.56, "step": 75565, "token_acc": 0.5679442508710801, "train_speed(iter/s)": 1.437059 }, { "epoch": 3.2376504862687976, "grad_norm": 6.184651851654053, "learning_rate": 2.764683995846336e-05, "loss": 2.212659454345703, "memory(GiB)": 77.56, "step": 75570, "token_acc": 0.5273972602739726, "train_speed(iter/s)": 1.437073 }, { "epoch": 3.2378647015980464, "grad_norm": 5.279417037963867, "learning_rate": 2.764082036299902e-05, "loss": 2.3793272018432616, "memory(GiB)": 77.56, "step": 75575, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.437099 }, { "epoch": 3.238078916927295, "grad_norm": 6.2321648597717285, "learning_rate": 2.7634801172592217e-05, "loss": 2.485188293457031, "memory(GiB)": 77.56, "step": 75580, "token_acc": 0.4859154929577465, "train_speed(iter/s)": 1.437114 }, { "epoch": 3.2382931322565445, "grad_norm": 7.952552318572998, "learning_rate": 2.7628782387351948e-05, "loss": 2.165928268432617, "memory(GiB)": 77.56, "step": 75585, "token_acc": 0.5049833887043189, "train_speed(iter/s)": 1.437128 }, { "epoch": 3.2385073475857933, "grad_norm": 6.131654739379883, "learning_rate": 2.762276400738729e-05, "loss": 2.2802419662475586, "memory(GiB)": 77.56, "step": 75590, "token_acc": 0.49137931034482757, "train_speed(iter/s)": 1.437122 }, { "epoch": 3.238721562915042, "grad_norm": 6.049211025238037, "learning_rate": 2.7616746032807262e-05, "loss": 2.479566955566406, "memory(GiB)": 77.56, "step": 75595, "token_acc": 0.5175718849840255, "train_speed(iter/s)": 1.437131 }, { "epoch": 3.2389357782442914, "grad_norm": 7.114353656768799, "learning_rate": 2.761072846372087e-05, "loss": 2.1769914627075195, "memory(GiB)": 77.56, "step": 75600, "token_acc": 0.525691699604743, "train_speed(iter/s)": 1.437146 }, { "epoch": 3.23914999357354, "grad_norm": 4.610339641571045, "learning_rate": 2.760471130023714e-05, "loss": 2.1934295654296876, "memory(GiB)": 77.56, "step": 75605, "token_acc": 0.5233918128654971, "train_speed(iter/s)": 1.437147 }, { "epoch": 3.239364208902789, "grad_norm": 5.4529266357421875, "learning_rate": 2.7598694542465047e-05, "loss": 2.193354034423828, "memory(GiB)": 77.56, "step": 75610, "token_acc": 0.5176848874598071, "train_speed(iter/s)": 1.437152 }, { "epoch": 3.2395784242320382, "grad_norm": 4.35702657699585, "learning_rate": 2.7592678190513638e-05, "loss": 2.3082372665405275, "memory(GiB)": 77.56, "step": 75615, "token_acc": 0.49544072948328266, "train_speed(iter/s)": 1.437168 }, { "epoch": 3.239792639561287, "grad_norm": 7.161694526672363, "learning_rate": 2.7586662244491883e-05, "loss": 2.4381765365600585, "memory(GiB)": 77.56, "step": 75620, "token_acc": 0.48659003831417624, "train_speed(iter/s)": 1.437162 }, { "epoch": 3.240006854890536, "grad_norm": 5.5795087814331055, "learning_rate": 2.7580646704508772e-05, "loss": 2.253228759765625, "memory(GiB)": 77.56, "step": 75625, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.437175 }, { "epoch": 3.240221070219785, "grad_norm": 6.340299606323242, "learning_rate": 2.757463157067327e-05, "loss": 2.311410140991211, "memory(GiB)": 77.56, "step": 75630, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.437198 }, { "epoch": 3.240435285549034, "grad_norm": 7.961738109588623, "learning_rate": 2.756861684309436e-05, "loss": 2.2823360443115233, "memory(GiB)": 77.56, "step": 75635, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.437226 }, { "epoch": 3.2406495008782827, "grad_norm": 5.117207050323486, "learning_rate": 2.7562602521880975e-05, "loss": 2.46173038482666, "memory(GiB)": 77.56, "step": 75640, "token_acc": 0.5335570469798657, "train_speed(iter/s)": 1.437231 }, { "epoch": 3.240863716207532, "grad_norm": 5.581437587738037, "learning_rate": 2.7556588607142107e-05, "loss": 2.245204734802246, "memory(GiB)": 77.56, "step": 75645, "token_acc": 0.5399239543726235, "train_speed(iter/s)": 1.437254 }, { "epoch": 3.241077931536781, "grad_norm": 4.618085861206055, "learning_rate": 2.7550575098986688e-05, "loss": 2.319418716430664, "memory(GiB)": 77.56, "step": 75650, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.437223 }, { "epoch": 3.2412921468660296, "grad_norm": 5.7654924392700195, "learning_rate": 2.7544561997523665e-05, "loss": 2.308021354675293, "memory(GiB)": 77.56, "step": 75655, "token_acc": 0.5389830508474577, "train_speed(iter/s)": 1.437212 }, { "epoch": 3.241506362195279, "grad_norm": 6.728402137756348, "learning_rate": 2.753854930286197e-05, "loss": 2.2943843841552733, "memory(GiB)": 77.56, "step": 75660, "token_acc": 0.4885057471264368, "train_speed(iter/s)": 1.437206 }, { "epoch": 3.2417205775245277, "grad_norm": 4.96250057220459, "learning_rate": 2.7532537015110528e-05, "loss": 2.046406555175781, "memory(GiB)": 77.56, "step": 75665, "token_acc": 0.5176470588235295, "train_speed(iter/s)": 1.437225 }, { "epoch": 3.2419347928537765, "grad_norm": 6.787867069244385, "learning_rate": 2.752652513437823e-05, "loss": 2.4859390258789062, "memory(GiB)": 77.56, "step": 75670, "token_acc": 0.4734848484848485, "train_speed(iter/s)": 1.437244 }, { "epoch": 3.2421490081830258, "grad_norm": 6.609344959259033, "learning_rate": 2.7520513660774028e-05, "loss": 2.20709171295166, "memory(GiB)": 77.56, "step": 75675, "token_acc": 0.5398550724637681, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.2423632235122746, "grad_norm": 4.82760763168335, "learning_rate": 2.7514502594406812e-05, "loss": 2.2754920959472655, "memory(GiB)": 77.56, "step": 75680, "token_acc": 0.5175438596491229, "train_speed(iter/s)": 1.437196 }, { "epoch": 3.2425774388415234, "grad_norm": 4.4051079750061035, "learning_rate": 2.7508491935385478e-05, "loss": 2.355311965942383, "memory(GiB)": 77.56, "step": 75685, "token_acc": 0.5234375, "train_speed(iter/s)": 1.437217 }, { "epoch": 3.2427916541707726, "grad_norm": 5.311786651611328, "learning_rate": 2.750248168381891e-05, "loss": 2.087117385864258, "memory(GiB)": 77.56, "step": 75690, "token_acc": 0.5335820895522388, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.2430058695000215, "grad_norm": 5.390645503997803, "learning_rate": 2.7496471839815983e-05, "loss": 2.2624372482299804, "memory(GiB)": 77.56, "step": 75695, "token_acc": 0.5742574257425742, "train_speed(iter/s)": 1.437221 }, { "epoch": 3.2432200848292703, "grad_norm": 7.065344333648682, "learning_rate": 2.7490462403485574e-05, "loss": 2.1746971130371096, "memory(GiB)": 77.56, "step": 75700, "token_acc": 0.4979757085020243, "train_speed(iter/s)": 1.437229 }, { "epoch": 3.2434343001585195, "grad_norm": 4.980740547180176, "learning_rate": 2.7484453374936558e-05, "loss": 2.2688079833984376, "memory(GiB)": 77.56, "step": 75705, "token_acc": 0.5047923322683706, "train_speed(iter/s)": 1.437253 }, { "epoch": 3.2436485154877683, "grad_norm": 5.923549175262451, "learning_rate": 2.74784447542778e-05, "loss": 2.4600738525390624, "memory(GiB)": 77.56, "step": 75710, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.437273 }, { "epoch": 3.243862730817017, "grad_norm": 8.077080726623535, "learning_rate": 2.7472436541618142e-05, "loss": 2.2293310165405273, "memory(GiB)": 77.56, "step": 75715, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.437241 }, { "epoch": 3.2440769461462664, "grad_norm": 8.59260368347168, "learning_rate": 2.7466428737066426e-05, "loss": 2.2343297958374024, "memory(GiB)": 77.56, "step": 75720, "token_acc": 0.5598591549295775, "train_speed(iter/s)": 1.437252 }, { "epoch": 3.244291161475515, "grad_norm": 4.704460620880127, "learning_rate": 2.7460421340731468e-05, "loss": 2.1195789337158204, "memory(GiB)": 77.56, "step": 75725, "token_acc": 0.516728624535316, "train_speed(iter/s)": 1.437269 }, { "epoch": 3.244505376804764, "grad_norm": 5.434798717498779, "learning_rate": 2.7454414352722128e-05, "loss": 2.295984649658203, "memory(GiB)": 77.56, "step": 75730, "token_acc": 0.48877805486284287, "train_speed(iter/s)": 1.437281 }, { "epoch": 3.2447195921340133, "grad_norm": 5.225401401519775, "learning_rate": 2.7448407773147238e-05, "loss": 2.537242126464844, "memory(GiB)": 77.56, "step": 75735, "token_acc": 0.48773006134969327, "train_speed(iter/s)": 1.437301 }, { "epoch": 3.244933807463262, "grad_norm": 7.597643852233887, "learning_rate": 2.7442401602115596e-05, "loss": 2.012503242492676, "memory(GiB)": 77.56, "step": 75740, "token_acc": 0.5447761194029851, "train_speed(iter/s)": 1.43732 }, { "epoch": 3.245148022792511, "grad_norm": 5.200222969055176, "learning_rate": 2.7436395839736016e-05, "loss": 2.2515895843505858, "memory(GiB)": 77.56, "step": 75745, "token_acc": 0.5590062111801242, "train_speed(iter/s)": 1.437342 }, { "epoch": 3.24536223812176, "grad_norm": 5.690224647521973, "learning_rate": 2.7430390486117284e-05, "loss": 2.3349048614501955, "memory(GiB)": 77.56, "step": 75750, "token_acc": 0.5292207792207793, "train_speed(iter/s)": 1.437334 }, { "epoch": 3.245576453451009, "grad_norm": 5.858898162841797, "learning_rate": 2.742438554136818e-05, "loss": 2.2467977523803713, "memory(GiB)": 77.56, "step": 75755, "token_acc": 0.5196850393700787, "train_speed(iter/s)": 1.437348 }, { "epoch": 3.245790668780258, "grad_norm": 5.439920425415039, "learning_rate": 2.741838100559754e-05, "loss": 2.363109588623047, "memory(GiB)": 77.56, "step": 75760, "token_acc": 0.5547945205479452, "train_speed(iter/s)": 1.437362 }, { "epoch": 3.246004884109507, "grad_norm": 8.374643325805664, "learning_rate": 2.7412376878914105e-05, "loss": 2.812795639038086, "memory(GiB)": 77.56, "step": 75765, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.437364 }, { "epoch": 3.246219099438756, "grad_norm": 5.497799396514893, "learning_rate": 2.7406373161426647e-05, "loss": 2.4468584060668945, "memory(GiB)": 77.56, "step": 75770, "token_acc": 0.4769736842105263, "train_speed(iter/s)": 1.437373 }, { "epoch": 3.2464333147680047, "grad_norm": 5.403831481933594, "learning_rate": 2.7400369853243936e-05, "loss": 2.315408706665039, "memory(GiB)": 77.56, "step": 75775, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437374 }, { "epoch": 3.246647530097254, "grad_norm": 4.9260759353637695, "learning_rate": 2.7394366954474726e-05, "loss": 2.4794679641723634, "memory(GiB)": 77.56, "step": 75780, "token_acc": 0.50920245398773, "train_speed(iter/s)": 1.437379 }, { "epoch": 3.2468617454265027, "grad_norm": 6.249225616455078, "learning_rate": 2.7388364465227746e-05, "loss": 2.5583587646484376, "memory(GiB)": 77.56, "step": 75785, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.437382 }, { "epoch": 3.2470759607557516, "grad_norm": 6.810762405395508, "learning_rate": 2.738236238561177e-05, "loss": 2.592457580566406, "memory(GiB)": 77.56, "step": 75790, "token_acc": 0.49846153846153846, "train_speed(iter/s)": 1.437378 }, { "epoch": 3.247290176085001, "grad_norm": 4.638576030731201, "learning_rate": 2.7376360715735527e-05, "loss": 2.196521759033203, "memory(GiB)": 77.56, "step": 75795, "token_acc": 0.5226480836236934, "train_speed(iter/s)": 1.437401 }, { "epoch": 3.2475043914142496, "grad_norm": 4.7867350578308105, "learning_rate": 2.7370359455707706e-05, "loss": 2.3255542755126952, "memory(GiB)": 77.56, "step": 75800, "token_acc": 0.5258358662613982, "train_speed(iter/s)": 1.437395 }, { "epoch": 3.2477186067434984, "grad_norm": 7.031664848327637, "learning_rate": 2.736435860563707e-05, "loss": 2.326621437072754, "memory(GiB)": 77.56, "step": 75805, "token_acc": 0.5344827586206896, "train_speed(iter/s)": 1.437382 }, { "epoch": 3.2479328220727477, "grad_norm": 4.80388879776001, "learning_rate": 2.7358358165632313e-05, "loss": 2.1625078201293944, "memory(GiB)": 77.56, "step": 75810, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.437385 }, { "epoch": 3.2481470374019965, "grad_norm": 5.1422319412231445, "learning_rate": 2.7352358135802113e-05, "loss": 2.579312324523926, "memory(GiB)": 77.56, "step": 75815, "token_acc": 0.4379746835443038, "train_speed(iter/s)": 1.43739 }, { "epoch": 3.2483612527312453, "grad_norm": 4.782476902008057, "learning_rate": 2.7346358516255222e-05, "loss": 2.363435745239258, "memory(GiB)": 77.56, "step": 75820, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437399 }, { "epoch": 3.2485754680604946, "grad_norm": 5.166233539581299, "learning_rate": 2.7340359307100287e-05, "loss": 2.384761428833008, "memory(GiB)": 77.56, "step": 75825, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.43742 }, { "epoch": 3.2487896833897434, "grad_norm": 7.33074951171875, "learning_rate": 2.7334360508446005e-05, "loss": 2.3004081726074217, "memory(GiB)": 77.56, "step": 75830, "token_acc": 0.4881889763779528, "train_speed(iter/s)": 1.437441 }, { "epoch": 3.249003898718992, "grad_norm": 4.672936916351318, "learning_rate": 2.732836212040104e-05, "loss": 2.4752323150634767, "memory(GiB)": 77.56, "step": 75835, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.437456 }, { "epoch": 3.2492181140482415, "grad_norm": 5.176677703857422, "learning_rate": 2.732236414307407e-05, "loss": 2.136547088623047, "memory(GiB)": 77.56, "step": 75840, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.437446 }, { "epoch": 3.2494323293774903, "grad_norm": 7.57281494140625, "learning_rate": 2.731636657657372e-05, "loss": 2.323530578613281, "memory(GiB)": 77.56, "step": 75845, "token_acc": 0.5249266862170088, "train_speed(iter/s)": 1.437446 }, { "epoch": 3.249646544706739, "grad_norm": 4.811956405639648, "learning_rate": 2.731036942100869e-05, "loss": 2.4900440216064452, "memory(GiB)": 77.56, "step": 75850, "token_acc": 0.48502994011976047, "train_speed(iter/s)": 1.437456 }, { "epoch": 3.2498607600359883, "grad_norm": 4.795746803283691, "learning_rate": 2.7304372676487606e-05, "loss": 2.3269702911376955, "memory(GiB)": 77.56, "step": 75855, "token_acc": 0.5105105105105106, "train_speed(iter/s)": 1.437478 }, { "epoch": 3.250074975365237, "grad_norm": 6.632368564605713, "learning_rate": 2.72983763431191e-05, "loss": 2.3196826934814454, "memory(GiB)": 77.56, "step": 75860, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.437507 }, { "epoch": 3.250289190694486, "grad_norm": 6.483335018157959, "learning_rate": 2.72923804210118e-05, "loss": 2.1050172805786134, "memory(GiB)": 77.56, "step": 75865, "token_acc": 0.4979757085020243, "train_speed(iter/s)": 1.437518 }, { "epoch": 3.250503406023735, "grad_norm": 5.677504062652588, "learning_rate": 2.728638491027431e-05, "loss": 2.305510330200195, "memory(GiB)": 77.56, "step": 75870, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.437529 }, { "epoch": 3.250717621352984, "grad_norm": 5.297688961029053, "learning_rate": 2.7280389811015267e-05, "loss": 2.270792770385742, "memory(GiB)": 77.56, "step": 75875, "token_acc": 0.4613003095975232, "train_speed(iter/s)": 1.437535 }, { "epoch": 3.250931836682233, "grad_norm": 5.259054660797119, "learning_rate": 2.7274395123343295e-05, "loss": 2.373600196838379, "memory(GiB)": 77.56, "step": 75880, "token_acc": 0.4889502762430939, "train_speed(iter/s)": 1.437534 }, { "epoch": 3.251146052011482, "grad_norm": 6.0703606605529785, "learning_rate": 2.726840084736697e-05, "loss": 2.506459426879883, "memory(GiB)": 77.56, "step": 75885, "token_acc": 0.5221843003412969, "train_speed(iter/s)": 1.437543 }, { "epoch": 3.251360267340731, "grad_norm": 6.6016764640808105, "learning_rate": 2.7262406983194894e-05, "loss": 2.2500278472900392, "memory(GiB)": 77.56, "step": 75890, "token_acc": 0.493006993006993, "train_speed(iter/s)": 1.437529 }, { "epoch": 3.2515744826699797, "grad_norm": 4.968935966491699, "learning_rate": 2.725641353093564e-05, "loss": 2.325093460083008, "memory(GiB)": 77.56, "step": 75895, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.437541 }, { "epoch": 3.251788697999229, "grad_norm": 5.574542999267578, "learning_rate": 2.725042049069777e-05, "loss": 2.5173385620117186, "memory(GiB)": 77.56, "step": 75900, "token_acc": 0.46441947565543074, "train_speed(iter/s)": 1.437538 }, { "epoch": 3.252002913328478, "grad_norm": 4.985105991363525, "learning_rate": 2.7244427862589893e-05, "loss": 2.4265193939208984, "memory(GiB)": 77.56, "step": 75905, "token_acc": 0.5014005602240896, "train_speed(iter/s)": 1.437542 }, { "epoch": 3.2522171286577266, "grad_norm": 6.420775890350342, "learning_rate": 2.723843564672056e-05, "loss": 2.3762529373168944, "memory(GiB)": 77.56, "step": 75910, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.437541 }, { "epoch": 3.252431343986976, "grad_norm": 6.197560787200928, "learning_rate": 2.723244384319831e-05, "loss": 2.281586837768555, "memory(GiB)": 77.56, "step": 75915, "token_acc": 0.5266666666666666, "train_speed(iter/s)": 1.437548 }, { "epoch": 3.2526455593162247, "grad_norm": 6.315161228179932, "learning_rate": 2.7226452452131703e-05, "loss": 2.466244125366211, "memory(GiB)": 77.56, "step": 75920, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.437556 }, { "epoch": 3.2528597746454735, "grad_norm": 6.310101509094238, "learning_rate": 2.7220461473629265e-05, "loss": 2.1732805252075194, "memory(GiB)": 77.56, "step": 75925, "token_acc": 0.5303643724696356, "train_speed(iter/s)": 1.437574 }, { "epoch": 3.2530739899747227, "grad_norm": 5.555732727050781, "learning_rate": 2.7214470907799517e-05, "loss": 2.0638877868652346, "memory(GiB)": 77.56, "step": 75930, "token_acc": 0.5451127819548872, "train_speed(iter/s)": 1.437594 }, { "epoch": 3.2532882053039716, "grad_norm": 5.987128734588623, "learning_rate": 2.7208480754751033e-05, "loss": 2.262569808959961, "memory(GiB)": 77.56, "step": 75935, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.437573 }, { "epoch": 3.2535024206332204, "grad_norm": 5.220691680908203, "learning_rate": 2.7202491014592292e-05, "loss": 2.179315948486328, "memory(GiB)": 77.56, "step": 75940, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 1.437549 }, { "epoch": 3.2537166359624696, "grad_norm": 6.325334072113037, "learning_rate": 2.719650168743179e-05, "loss": 2.200699234008789, "memory(GiB)": 77.56, "step": 75945, "token_acc": 0.5220588235294118, "train_speed(iter/s)": 1.43757 }, { "epoch": 3.2539308512917184, "grad_norm": 6.337735176086426, "learning_rate": 2.7190512773378075e-05, "loss": 2.4285959243774413, "memory(GiB)": 77.56, "step": 75950, "token_acc": 0.47262247838616717, "train_speed(iter/s)": 1.437592 }, { "epoch": 3.2541450666209673, "grad_norm": 4.491027355194092, "learning_rate": 2.718452427253962e-05, "loss": 2.283153533935547, "memory(GiB)": 77.56, "step": 75955, "token_acc": 0.5381818181818182, "train_speed(iter/s)": 1.437594 }, { "epoch": 3.2543592819502165, "grad_norm": 8.201942443847656, "learning_rate": 2.717853618502489e-05, "loss": 2.0722557067871095, "memory(GiB)": 77.56, "step": 75960, "token_acc": 0.5352112676056338, "train_speed(iter/s)": 1.437611 }, { "epoch": 3.2545734972794653, "grad_norm": 8.467689514160156, "learning_rate": 2.7172548510942402e-05, "loss": 2.3165279388427735, "memory(GiB)": 77.56, "step": 75965, "token_acc": 0.5, "train_speed(iter/s)": 1.437625 }, { "epoch": 3.254787712608714, "grad_norm": 5.518149375915527, "learning_rate": 2.7166561250400613e-05, "loss": 2.3943042755126953, "memory(GiB)": 77.56, "step": 75970, "token_acc": 0.4942084942084942, "train_speed(iter/s)": 1.437632 }, { "epoch": 3.2550019279379634, "grad_norm": 10.01694393157959, "learning_rate": 2.7160574403507988e-05, "loss": 2.4342227935791017, "memory(GiB)": 77.56, "step": 75975, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.437638 }, { "epoch": 3.255216143267212, "grad_norm": 6.17288064956665, "learning_rate": 2.715458797037298e-05, "loss": 2.2419620513916017, "memory(GiB)": 77.56, "step": 75980, "token_acc": 0.49244712990936557, "train_speed(iter/s)": 1.437655 }, { "epoch": 3.255430358596461, "grad_norm": 5.122513294219971, "learning_rate": 2.7148601951104046e-05, "loss": 2.3411544799804687, "memory(GiB)": 77.56, "step": 75985, "token_acc": 0.5164473684210527, "train_speed(iter/s)": 1.437665 }, { "epoch": 3.2556445739257103, "grad_norm": 5.716641426086426, "learning_rate": 2.71426163458096e-05, "loss": 2.1421302795410155, "memory(GiB)": 77.56, "step": 75990, "token_acc": 0.5427509293680297, "train_speed(iter/s)": 1.437688 }, { "epoch": 3.255858789254959, "grad_norm": 8.57295036315918, "learning_rate": 2.7136631154598124e-05, "loss": 2.341967010498047, "memory(GiB)": 77.56, "step": 75995, "token_acc": 0.5538461538461539, "train_speed(iter/s)": 1.437673 }, { "epoch": 3.256073004584208, "grad_norm": 4.116888046264648, "learning_rate": 2.7130646377578018e-05, "loss": 2.391357421875, "memory(GiB)": 77.56, "step": 76000, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 1.437664 }, { "epoch": 3.256073004584208, "eval_loss": 2.453305244445801, "eval_runtime": 13.907, "eval_samples_per_second": 7.191, "eval_steps_per_second": 7.191, "eval_token_acc": 0.4508301404853129, "step": 76000 }, { "epoch": 3.256287219913457, "grad_norm": 5.90869140625, "learning_rate": 2.712466201485771e-05, "loss": 2.2543615341186523, "memory(GiB)": 77.56, "step": 76005, "token_acc": 0.46131805157593125, "train_speed(iter/s)": 1.437254 }, { "epoch": 3.256501435242706, "grad_norm": 6.839172840118408, "learning_rate": 2.7118678066545607e-05, "loss": 2.292075347900391, "memory(GiB)": 77.56, "step": 76010, "token_acc": 0.5460750853242321, "train_speed(iter/s)": 1.437234 }, { "epoch": 3.256715650571955, "grad_norm": 5.877674102783203, "learning_rate": 2.7112694532750092e-05, "loss": 2.1154727935791016, "memory(GiB)": 77.56, "step": 76015, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437236 }, { "epoch": 3.256929865901204, "grad_norm": 6.921167373657227, "learning_rate": 2.7106711413579587e-05, "loss": 2.164701461791992, "memory(GiB)": 77.56, "step": 76020, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.437246 }, { "epoch": 3.257144081230453, "grad_norm": 8.60682201385498, "learning_rate": 2.710072870914251e-05, "loss": 2.1594038009643555, "memory(GiB)": 77.56, "step": 76025, "token_acc": 0.49377593360995853, "train_speed(iter/s)": 1.437255 }, { "epoch": 3.2573582965597017, "grad_norm": 6.01790189743042, "learning_rate": 2.7094746419547202e-05, "loss": 2.4450910568237303, "memory(GiB)": 77.56, "step": 76030, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.437247 }, { "epoch": 3.257572511888951, "grad_norm": 5.704244613647461, "learning_rate": 2.7088764544902057e-05, "loss": 2.2321807861328127, "memory(GiB)": 77.56, "step": 76035, "token_acc": 0.5465116279069767, "train_speed(iter/s)": 1.43721 }, { "epoch": 3.2577867272181997, "grad_norm": 5.279576301574707, "learning_rate": 2.7082783085315426e-05, "loss": 2.1731287002563477, "memory(GiB)": 77.56, "step": 76040, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.43721 }, { "epoch": 3.2580009425474485, "grad_norm": 7.912080764770508, "learning_rate": 2.7076802040895666e-05, "loss": 2.3184326171875, "memory(GiB)": 77.56, "step": 76045, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.437213 }, { "epoch": 3.258215157876698, "grad_norm": 4.597708225250244, "learning_rate": 2.7070821411751158e-05, "loss": 2.3820796966552735, "memory(GiB)": 77.56, "step": 76050, "token_acc": 0.4578313253012048, "train_speed(iter/s)": 1.437211 }, { "epoch": 3.2584293732059466, "grad_norm": 6.737573623657227, "learning_rate": 2.7064841197990225e-05, "loss": 2.0597745895385744, "memory(GiB)": 77.56, "step": 76055, "token_acc": 0.5372549019607843, "train_speed(iter/s)": 1.437224 }, { "epoch": 3.2586435885351954, "grad_norm": 5.4444899559021, "learning_rate": 2.7058861399721213e-05, "loss": 2.1524993896484377, "memory(GiB)": 77.56, "step": 76060, "token_acc": 0.5222222222222223, "train_speed(iter/s)": 1.437213 }, { "epoch": 3.2588578038644447, "grad_norm": 5.415369033813477, "learning_rate": 2.705288201705245e-05, "loss": 2.2321901321411133, "memory(GiB)": 77.56, "step": 76065, "token_acc": 0.5136186770428015, "train_speed(iter/s)": 1.43722 }, { "epoch": 3.2590720191936935, "grad_norm": 4.683697700500488, "learning_rate": 2.704690305009225e-05, "loss": 2.1994834899902345, "memory(GiB)": 77.56, "step": 76070, "token_acc": 0.501466275659824, "train_speed(iter/s)": 1.437221 }, { "epoch": 3.2592862345229423, "grad_norm": 5.0263237953186035, "learning_rate": 2.7040924498948918e-05, "loss": 2.5463226318359373, "memory(GiB)": 77.56, "step": 76075, "token_acc": 0.48909657320872274, "train_speed(iter/s)": 1.437222 }, { "epoch": 3.2595004498521916, "grad_norm": 8.046988487243652, "learning_rate": 2.703494636373079e-05, "loss": 2.2907894134521483, "memory(GiB)": 77.56, "step": 76080, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.437237 }, { "epoch": 3.2597146651814404, "grad_norm": 6.496913909912109, "learning_rate": 2.7028968644546154e-05, "loss": 2.308920478820801, "memory(GiB)": 77.56, "step": 76085, "token_acc": 0.48134328358208955, "train_speed(iter/s)": 1.437257 }, { "epoch": 3.259928880510689, "grad_norm": 7.8755083084106445, "learning_rate": 2.7022991341503302e-05, "loss": 2.273288345336914, "memory(GiB)": 77.56, "step": 76090, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 1.43727 }, { "epoch": 3.2601430958399384, "grad_norm": 5.517001628875732, "learning_rate": 2.7017014454710492e-05, "loss": 2.3058496475219727, "memory(GiB)": 77.56, "step": 76095, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.437267 }, { "epoch": 3.2603573111691873, "grad_norm": 6.559329986572266, "learning_rate": 2.7011037984276043e-05, "loss": 2.04239559173584, "memory(GiB)": 77.56, "step": 76100, "token_acc": 0.5126050420168067, "train_speed(iter/s)": 1.437256 }, { "epoch": 3.260571526498436, "grad_norm": 6.379025936126709, "learning_rate": 2.7005061930308195e-05, "loss": 2.5620384216308594, "memory(GiB)": 77.56, "step": 76105, "token_acc": 0.4923469387755102, "train_speed(iter/s)": 1.437251 }, { "epoch": 3.2607857418276853, "grad_norm": 10.403284072875977, "learning_rate": 2.6999086292915234e-05, "loss": 2.448537254333496, "memory(GiB)": 77.56, "step": 76110, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.437265 }, { "epoch": 3.260999957156934, "grad_norm": 5.881539344787598, "learning_rate": 2.6993111072205406e-05, "loss": 2.5906185150146483, "memory(GiB)": 77.56, "step": 76115, "token_acc": 0.4664804469273743, "train_speed(iter/s)": 1.437236 }, { "epoch": 3.261214172486183, "grad_norm": 5.683322429656982, "learning_rate": 2.698713626828695e-05, "loss": 2.2533668518066405, "memory(GiB)": 77.56, "step": 76120, "token_acc": 0.5345454545454545, "train_speed(iter/s)": 1.43724 }, { "epoch": 3.261428387815432, "grad_norm": 5.9745965003967285, "learning_rate": 2.6981161881268113e-05, "loss": 2.428679847717285, "memory(GiB)": 77.56, "step": 76125, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.437228 }, { "epoch": 3.261642603144681, "grad_norm": 5.338161945343018, "learning_rate": 2.6975187911257116e-05, "loss": 2.313723564147949, "memory(GiB)": 77.56, "step": 76130, "token_acc": 0.511326860841424, "train_speed(iter/s)": 1.437243 }, { "epoch": 3.26185681847393, "grad_norm": 4.4195685386657715, "learning_rate": 2.696921435836217e-05, "loss": 2.33978385925293, "memory(GiB)": 77.56, "step": 76135, "token_acc": 0.514018691588785, "train_speed(iter/s)": 1.437222 }, { "epoch": 3.262071033803179, "grad_norm": 5.146104335784912, "learning_rate": 2.6963241222691533e-05, "loss": 2.4369155883789064, "memory(GiB)": 77.56, "step": 76140, "token_acc": 0.48307692307692307, "train_speed(iter/s)": 1.437216 }, { "epoch": 3.262285249132428, "grad_norm": 6.217367649078369, "learning_rate": 2.6957268504353394e-05, "loss": 2.2363914489746093, "memory(GiB)": 77.56, "step": 76145, "token_acc": 0.4957627118644068, "train_speed(iter/s)": 1.43722 }, { "epoch": 3.2624994644616767, "grad_norm": 4.863766193389893, "learning_rate": 2.6951296203455945e-05, "loss": 2.2368846893310548, "memory(GiB)": 77.56, "step": 76150, "token_acc": 0.5222929936305732, "train_speed(iter/s)": 1.437194 }, { "epoch": 3.262713679790926, "grad_norm": 6.6296610832214355, "learning_rate": 2.694532432010739e-05, "loss": 2.4694311141967775, "memory(GiB)": 77.56, "step": 76155, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.437185 }, { "epoch": 3.262927895120175, "grad_norm": 6.070828437805176, "learning_rate": 2.693935285441589e-05, "loss": 2.4765954971313477, "memory(GiB)": 77.56, "step": 76160, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.437183 }, { "epoch": 3.2631421104494236, "grad_norm": 10.059720993041992, "learning_rate": 2.693338180648965e-05, "loss": 2.475826644897461, "memory(GiB)": 77.56, "step": 76165, "token_acc": 0.4743083003952569, "train_speed(iter/s)": 1.43718 }, { "epoch": 3.263356325778673, "grad_norm": 4.910816669464111, "learning_rate": 2.6927411176436857e-05, "loss": 2.346239471435547, "memory(GiB)": 77.56, "step": 76170, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.437192 }, { "epoch": 3.2635705411079217, "grad_norm": 5.955539226531982, "learning_rate": 2.692144096436565e-05, "loss": 2.270440864562988, "memory(GiB)": 77.56, "step": 76175, "token_acc": 0.5146443514644351, "train_speed(iter/s)": 1.437211 }, { "epoch": 3.2637847564371705, "grad_norm": 5.154200553894043, "learning_rate": 2.6915471170384188e-05, "loss": 2.294374465942383, "memory(GiB)": 77.56, "step": 76180, "token_acc": 0.5130111524163569, "train_speed(iter/s)": 1.437219 }, { "epoch": 3.2639989717664197, "grad_norm": 6.375717639923096, "learning_rate": 2.6909501794600622e-05, "loss": 2.4333267211914062, "memory(GiB)": 77.56, "step": 76185, "token_acc": 0.5404255319148936, "train_speed(iter/s)": 1.43722 }, { "epoch": 3.2642131870956685, "grad_norm": 6.558661937713623, "learning_rate": 2.690353283712308e-05, "loss": 2.5455415725708006, "memory(GiB)": 77.56, "step": 76190, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.437214 }, { "epoch": 3.2644274024249174, "grad_norm": 5.703103065490723, "learning_rate": 2.689756429805972e-05, "loss": 2.2252275466918947, "memory(GiB)": 77.56, "step": 76195, "token_acc": 0.5080906148867314, "train_speed(iter/s)": 1.437222 }, { "epoch": 3.2646416177541666, "grad_norm": 5.283342361450195, "learning_rate": 2.6891596177518652e-05, "loss": 2.2977081298828126, "memory(GiB)": 77.56, "step": 76200, "token_acc": 0.5337837837837838, "train_speed(iter/s)": 1.437221 }, { "epoch": 3.2648558330834154, "grad_norm": 4.891332626342773, "learning_rate": 2.6885628475608006e-05, "loss": 2.4547088623046873, "memory(GiB)": 77.56, "step": 76205, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.437223 }, { "epoch": 3.2650700484126642, "grad_norm": 7.913602352142334, "learning_rate": 2.6879661192435866e-05, "loss": 2.3808162689208983, "memory(GiB)": 77.56, "step": 76210, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.437223 }, { "epoch": 3.2652842637419135, "grad_norm": 5.7912187576293945, "learning_rate": 2.6873694328110365e-05, "loss": 2.488439178466797, "memory(GiB)": 77.56, "step": 76215, "token_acc": 0.48, "train_speed(iter/s)": 1.437243 }, { "epoch": 3.2654984790711623, "grad_norm": 5.645628452301025, "learning_rate": 2.6867727882739568e-05, "loss": 2.3716983795166016, "memory(GiB)": 77.56, "step": 76220, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 1.437247 }, { "epoch": 3.265712694400411, "grad_norm": 5.816945552825928, "learning_rate": 2.6861761856431595e-05, "loss": 2.4469577789306642, "memory(GiB)": 77.56, "step": 76225, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.2659269097296604, "grad_norm": 6.040002822875977, "learning_rate": 2.6855796249294512e-05, "loss": 2.187143325805664, "memory(GiB)": 77.56, "step": 76230, "token_acc": 0.5313807531380753, "train_speed(iter/s)": 1.437226 }, { "epoch": 3.266141125058909, "grad_norm": 5.661389350891113, "learning_rate": 2.6849831061436394e-05, "loss": 2.1834054946899415, "memory(GiB)": 77.56, "step": 76235, "token_acc": 0.5432835820895522, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.266355340388158, "grad_norm": 4.8527116775512695, "learning_rate": 2.684386629296528e-05, "loss": 2.133860778808594, "memory(GiB)": 77.56, "step": 76240, "token_acc": 0.5390070921985816, "train_speed(iter/s)": 1.437244 }, { "epoch": 3.2665695557174073, "grad_norm": 5.037625312805176, "learning_rate": 2.683790194398927e-05, "loss": 2.1706300735473634, "memory(GiB)": 77.56, "step": 76245, "token_acc": 0.527972027972028, "train_speed(iter/s)": 1.437266 }, { "epoch": 3.266783771046656, "grad_norm": 6.797175884246826, "learning_rate": 2.6831938014616377e-05, "loss": 2.1708532333374024, "memory(GiB)": 77.56, "step": 76250, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.437273 }, { "epoch": 3.266997986375905, "grad_norm": 8.627185821533203, "learning_rate": 2.6825974504954676e-05, "loss": 2.309407424926758, "memory(GiB)": 77.56, "step": 76255, "token_acc": 0.5223214285714286, "train_speed(iter/s)": 1.437275 }, { "epoch": 3.267212201705154, "grad_norm": 5.070251941680908, "learning_rate": 2.6820011415112183e-05, "loss": 2.704513931274414, "memory(GiB)": 77.56, "step": 76260, "token_acc": 0.47249190938511326, "train_speed(iter/s)": 1.437292 }, { "epoch": 3.267426417034403, "grad_norm": 7.257891654968262, "learning_rate": 2.6814048745196933e-05, "loss": 2.2436540603637694, "memory(GiB)": 77.56, "step": 76265, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.437299 }, { "epoch": 3.2676406323636518, "grad_norm": 6.010847568511963, "learning_rate": 2.6808086495316943e-05, "loss": 2.2097457885742187, "memory(GiB)": 77.56, "step": 76270, "token_acc": 0.5020242914979757, "train_speed(iter/s)": 1.437297 }, { "epoch": 3.267854847692901, "grad_norm": 6.027689456939697, "learning_rate": 2.68021246655802e-05, "loss": 2.744255256652832, "memory(GiB)": 77.56, "step": 76275, "token_acc": 0.45977011494252873, "train_speed(iter/s)": 1.437286 }, { "epoch": 3.26806906302215, "grad_norm": 9.140928268432617, "learning_rate": 2.679616325609474e-05, "loss": 2.4225658416748046, "memory(GiB)": 77.56, "step": 76280, "token_acc": 0.5028735632183908, "train_speed(iter/s)": 1.437295 }, { "epoch": 3.2682832783513986, "grad_norm": 6.586650371551514, "learning_rate": 2.679020226696856e-05, "loss": 2.3298568725585938, "memory(GiB)": 77.56, "step": 76285, "token_acc": 0.5105740181268882, "train_speed(iter/s)": 1.437285 }, { "epoch": 3.268497493680648, "grad_norm": 6.096415042877197, "learning_rate": 2.678424169830964e-05, "loss": 2.3894872665405273, "memory(GiB)": 77.56, "step": 76290, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.437278 }, { "epoch": 3.2687117090098967, "grad_norm": 8.352396965026855, "learning_rate": 2.6778281550225957e-05, "loss": 2.2749988555908205, "memory(GiB)": 77.56, "step": 76295, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.437268 }, { "epoch": 3.2689259243391455, "grad_norm": 6.9940056800842285, "learning_rate": 2.6772321822825486e-05, "loss": 2.0341529846191406, "memory(GiB)": 77.56, "step": 76300, "token_acc": 0.5692883895131086, "train_speed(iter/s)": 1.43728 }, { "epoch": 3.269140139668395, "grad_norm": 6.248111724853516, "learning_rate": 2.6766362516216177e-05, "loss": 2.4225185394287108, "memory(GiB)": 77.56, "step": 76305, "token_acc": 0.4897360703812317, "train_speed(iter/s)": 1.437281 }, { "epoch": 3.2693543549976436, "grad_norm": 7.4152679443359375, "learning_rate": 2.6760403630506027e-05, "loss": 2.1001409530639648, "memory(GiB)": 77.56, "step": 76310, "token_acc": 0.510548523206751, "train_speed(iter/s)": 1.437279 }, { "epoch": 3.2695685703268924, "grad_norm": 6.702224254608154, "learning_rate": 2.6754445165802938e-05, "loss": 2.505830764770508, "memory(GiB)": 77.56, "step": 76315, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.437286 }, { "epoch": 3.2697827856561417, "grad_norm": 5.524763584136963, "learning_rate": 2.6748487122214905e-05, "loss": 2.4238554000854493, "memory(GiB)": 77.56, "step": 76320, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.43727 }, { "epoch": 3.2699970009853905, "grad_norm": 4.90808629989624, "learning_rate": 2.6742529499849846e-05, "loss": 2.3672775268554687, "memory(GiB)": 77.56, "step": 76325, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.437276 }, { "epoch": 3.2702112163146393, "grad_norm": 6.328675270080566, "learning_rate": 2.673657229881567e-05, "loss": 2.0318288803100586, "memory(GiB)": 77.56, "step": 76330, "token_acc": 0.516728624535316, "train_speed(iter/s)": 1.437292 }, { "epoch": 3.2704254316438885, "grad_norm": 4.75703763961792, "learning_rate": 2.67306155192203e-05, "loss": 2.1186452865600587, "memory(GiB)": 77.56, "step": 76335, "token_acc": 0.5594405594405595, "train_speed(iter/s)": 1.437306 }, { "epoch": 3.2706396469731374, "grad_norm": 5.916680812835693, "learning_rate": 2.6724659161171674e-05, "loss": 2.036697578430176, "memory(GiB)": 77.56, "step": 76340, "token_acc": 0.5353159851301115, "train_speed(iter/s)": 1.437321 }, { "epoch": 3.270853862302386, "grad_norm": 5.238081455230713, "learning_rate": 2.671870322477768e-05, "loss": 1.9471094131469726, "memory(GiB)": 77.56, "step": 76345, "token_acc": 0.5571428571428572, "train_speed(iter/s)": 1.437322 }, { "epoch": 3.2710680776316354, "grad_norm": 6.140990734100342, "learning_rate": 2.6712747710146224e-05, "loss": 2.4319995880126952, "memory(GiB)": 77.56, "step": 76350, "token_acc": 0.5304659498207885, "train_speed(iter/s)": 1.437318 }, { "epoch": 3.2712822929608842, "grad_norm": 4.777742385864258, "learning_rate": 2.670679261738518e-05, "loss": 2.3321855545043944, "memory(GiB)": 77.56, "step": 76355, "token_acc": 0.5275080906148867, "train_speed(iter/s)": 1.437318 }, { "epoch": 3.271496508290133, "grad_norm": 6.19354772567749, "learning_rate": 2.670083794660244e-05, "loss": 2.4010643005371093, "memory(GiB)": 77.56, "step": 76360, "token_acc": 0.49624060150375937, "train_speed(iter/s)": 1.43732 }, { "epoch": 3.2717107236193823, "grad_norm": 5.319146156311035, "learning_rate": 2.669488369790586e-05, "loss": 1.9833808898925782, "memory(GiB)": 77.56, "step": 76365, "token_acc": 0.5374592833876222, "train_speed(iter/s)": 1.437327 }, { "epoch": 3.271924938948631, "grad_norm": 5.4314985275268555, "learning_rate": 2.6688929871403346e-05, "loss": 2.597892189025879, "memory(GiB)": 77.56, "step": 76370, "token_acc": 0.4785100286532951, "train_speed(iter/s)": 1.437303 }, { "epoch": 3.27213915427788, "grad_norm": 5.978343486785889, "learning_rate": 2.6682976467202726e-05, "loss": 2.3395435333251955, "memory(GiB)": 77.56, "step": 76375, "token_acc": 0.5131086142322098, "train_speed(iter/s)": 1.437331 }, { "epoch": 3.272353369607129, "grad_norm": 5.326536655426025, "learning_rate": 2.6677023485411866e-05, "loss": 2.5036598205566407, "memory(GiB)": 77.56, "step": 76380, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.437308 }, { "epoch": 3.272567584936378, "grad_norm": 6.113457202911377, "learning_rate": 2.667107092613861e-05, "loss": 2.455510139465332, "memory(GiB)": 77.56, "step": 76385, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.437288 }, { "epoch": 3.272781800265627, "grad_norm": 6.942384243011475, "learning_rate": 2.6665118789490763e-05, "loss": 2.1142892837524414, "memory(GiB)": 77.56, "step": 76390, "token_acc": 0.5, "train_speed(iter/s)": 1.437277 }, { "epoch": 3.272996015594876, "grad_norm": 5.50494909286499, "learning_rate": 2.6659167075576176e-05, "loss": 2.049399185180664, "memory(GiB)": 77.56, "step": 76395, "token_acc": 0.5687022900763359, "train_speed(iter/s)": 1.437284 }, { "epoch": 3.273210230924125, "grad_norm": 9.861894607543945, "learning_rate": 2.6653215784502693e-05, "loss": 2.5155601501464844, "memory(GiB)": 77.56, "step": 76400, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.437289 }, { "epoch": 3.2734244462533737, "grad_norm": 6.839644908905029, "learning_rate": 2.664726491637811e-05, "loss": 2.615404510498047, "memory(GiB)": 77.56, "step": 76405, "token_acc": 0.4921259842519685, "train_speed(iter/s)": 1.437302 }, { "epoch": 3.273638661582623, "grad_norm": 9.163671493530273, "learning_rate": 2.664131447131023e-05, "loss": 2.20394287109375, "memory(GiB)": 77.56, "step": 76410, "token_acc": 0.5055762081784386, "train_speed(iter/s)": 1.437322 }, { "epoch": 3.2738528769118718, "grad_norm": 6.252237319946289, "learning_rate": 2.6635364449406853e-05, "loss": 2.2719280242919924, "memory(GiB)": 77.56, "step": 76415, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.437328 }, { "epoch": 3.274067092241121, "grad_norm": 5.88748836517334, "learning_rate": 2.662941485077574e-05, "loss": 2.2581886291503905, "memory(GiB)": 77.56, "step": 76420, "token_acc": 0.4829059829059829, "train_speed(iter/s)": 1.437309 }, { "epoch": 3.27428130757037, "grad_norm": 4.954962253570557, "learning_rate": 2.6623465675524728e-05, "loss": 2.209299087524414, "memory(GiB)": 77.56, "step": 76425, "token_acc": 0.5320754716981132, "train_speed(iter/s)": 1.437307 }, { "epoch": 3.2744955228996186, "grad_norm": 6.656917572021484, "learning_rate": 2.6617516923761553e-05, "loss": 2.1687402725219727, "memory(GiB)": 77.56, "step": 76430, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.437309 }, { "epoch": 3.274709738228868, "grad_norm": 9.126182556152344, "learning_rate": 2.6611568595594006e-05, "loss": 2.508999252319336, "memory(GiB)": 77.56, "step": 76435, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.437335 }, { "epoch": 3.2749239535581167, "grad_norm": 4.683877944946289, "learning_rate": 2.6605620691129828e-05, "loss": 2.7646997451782225, "memory(GiB)": 77.56, "step": 76440, "token_acc": 0.4645161290322581, "train_speed(iter/s)": 1.437334 }, { "epoch": 3.2751381688873655, "grad_norm": 6.770195007324219, "learning_rate": 2.659967321047678e-05, "loss": 2.205705261230469, "memory(GiB)": 77.56, "step": 76445, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.437339 }, { "epoch": 3.275352384216615, "grad_norm": 5.407696723937988, "learning_rate": 2.6593726153742575e-05, "loss": 2.4108112335205076, "memory(GiB)": 77.56, "step": 76450, "token_acc": 0.45182724252491696, "train_speed(iter/s)": 1.437315 }, { "epoch": 3.2755665995458636, "grad_norm": 4.91532039642334, "learning_rate": 2.6587779521035007e-05, "loss": 2.4610605239868164, "memory(GiB)": 77.56, "step": 76455, "token_acc": 0.5142045454545454, "train_speed(iter/s)": 1.437332 }, { "epoch": 3.2757808148751124, "grad_norm": 5.391237258911133, "learning_rate": 2.6581833312461768e-05, "loss": 2.3996503829956053, "memory(GiB)": 77.56, "step": 76460, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.437339 }, { "epoch": 3.2759950302043617, "grad_norm": 6.4376091957092285, "learning_rate": 2.657588752813057e-05, "loss": 2.4347793579101564, "memory(GiB)": 77.56, "step": 76465, "token_acc": 0.5358361774744027, "train_speed(iter/s)": 1.437355 }, { "epoch": 3.2762092455336105, "grad_norm": 4.6318793296813965, "learning_rate": 2.6569942168149165e-05, "loss": 2.0739295959472654, "memory(GiB)": 77.56, "step": 76470, "token_acc": 0.5307443365695793, "train_speed(iter/s)": 1.437364 }, { "epoch": 3.2764234608628593, "grad_norm": 6.209949970245361, "learning_rate": 2.6563997232625236e-05, "loss": 2.6977296829223634, "memory(GiB)": 77.56, "step": 76475, "token_acc": 0.45317220543806647, "train_speed(iter/s)": 1.437372 }, { "epoch": 3.2766376761921086, "grad_norm": 6.036227226257324, "learning_rate": 2.655805272166646e-05, "loss": 2.1383331298828123, "memory(GiB)": 77.56, "step": 76480, "token_acc": 0.5241935483870968, "train_speed(iter/s)": 1.437399 }, { "epoch": 3.2768518915213574, "grad_norm": 5.114137649536133, "learning_rate": 2.6552108635380567e-05, "loss": 2.2285793304443358, "memory(GiB)": 77.56, "step": 76485, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437413 }, { "epoch": 3.277066106850606, "grad_norm": 5.297479152679443, "learning_rate": 2.654616497387523e-05, "loss": 2.0856781005859375, "memory(GiB)": 77.56, "step": 76490, "token_acc": 0.5269230769230769, "train_speed(iter/s)": 1.437402 }, { "epoch": 3.2772803221798554, "grad_norm": 6.1026411056518555, "learning_rate": 2.654022173725811e-05, "loss": 2.148396110534668, "memory(GiB)": 77.56, "step": 76495, "token_acc": 0.524390243902439, "train_speed(iter/s)": 1.437409 }, { "epoch": 3.2774945375091042, "grad_norm": 5.110911846160889, "learning_rate": 2.653427892563688e-05, "loss": 2.410162925720215, "memory(GiB)": 77.56, "step": 76500, "token_acc": 0.4740061162079511, "train_speed(iter/s)": 1.43739 }, { "epoch": 3.2774945375091042, "eval_loss": 2.140929937362671, "eval_runtime": 14.9721, "eval_samples_per_second": 6.679, "eval_steps_per_second": 6.679, "eval_token_acc": 0.4733969986357435, "step": 76500 }, { "epoch": 3.277708752838353, "grad_norm": 7.50363302230835, "learning_rate": 2.65283365391192e-05, "loss": 2.338113784790039, "memory(GiB)": 77.56, "step": 76505, "token_acc": 0.48080614203454897, "train_speed(iter/s)": 1.436962 }, { "epoch": 3.2779229681676023, "grad_norm": 6.472196578979492, "learning_rate": 2.6522394577812702e-05, "loss": 2.3285160064697266, "memory(GiB)": 77.56, "step": 76510, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.436988 }, { "epoch": 3.278137183496851, "grad_norm": 5.848712921142578, "learning_rate": 2.6516453041825075e-05, "loss": 2.4467742919921873, "memory(GiB)": 77.56, "step": 76515, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 1.43698 }, { "epoch": 3.2783513988261, "grad_norm": 4.979660987854004, "learning_rate": 2.6510511931263926e-05, "loss": 2.132375144958496, "memory(GiB)": 77.56, "step": 76520, "token_acc": 0.5283582089552239, "train_speed(iter/s)": 1.436994 }, { "epoch": 3.278565614155349, "grad_norm": 5.430736541748047, "learning_rate": 2.6504571246236893e-05, "loss": 2.574811553955078, "memory(GiB)": 77.56, "step": 76525, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.437004 }, { "epoch": 3.278779829484598, "grad_norm": 5.978100776672363, "learning_rate": 2.6498630986851584e-05, "loss": 2.3917766571044923, "memory(GiB)": 77.56, "step": 76530, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.437012 }, { "epoch": 3.278994044813847, "grad_norm": 5.435888767242432, "learning_rate": 2.649269115321561e-05, "loss": 2.55991153717041, "memory(GiB)": 77.56, "step": 76535, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.437024 }, { "epoch": 3.279208260143096, "grad_norm": 5.174025535583496, "learning_rate": 2.6486751745436578e-05, "loss": 2.373432922363281, "memory(GiB)": 77.56, "step": 76540, "token_acc": 0.514018691588785, "train_speed(iter/s)": 1.437028 }, { "epoch": 3.279422475472345, "grad_norm": 5.387723922729492, "learning_rate": 2.648081276362212e-05, "loss": 2.4880332946777344, "memory(GiB)": 77.56, "step": 76545, "token_acc": 0.5016611295681063, "train_speed(iter/s)": 1.437027 }, { "epoch": 3.2796366908015937, "grad_norm": 5.608835220336914, "learning_rate": 2.6474874207879807e-05, "loss": 2.5733638763427735, "memory(GiB)": 77.56, "step": 76550, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.437016 }, { "epoch": 3.279850906130843, "grad_norm": 5.937811851501465, "learning_rate": 2.6468936078317207e-05, "loss": 2.4545013427734377, "memory(GiB)": 77.56, "step": 76555, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.437033 }, { "epoch": 3.2800651214600918, "grad_norm": 5.4121575355529785, "learning_rate": 2.6462998375041908e-05, "loss": 2.1308835983276366, "memory(GiB)": 77.56, "step": 76560, "token_acc": 0.54421768707483, "train_speed(iter/s)": 1.437027 }, { "epoch": 3.2802793367893406, "grad_norm": 6.442676067352295, "learning_rate": 2.645706109816145e-05, "loss": 2.648736763000488, "memory(GiB)": 77.56, "step": 76565, "token_acc": 0.4506578947368421, "train_speed(iter/s)": 1.437004 }, { "epoch": 3.28049355211859, "grad_norm": 8.892988204956055, "learning_rate": 2.6451124247783443e-05, "loss": 2.449600601196289, "memory(GiB)": 77.56, "step": 76570, "token_acc": 0.4779116465863454, "train_speed(iter/s)": 1.437007 }, { "epoch": 3.2807077674478387, "grad_norm": 5.588657379150391, "learning_rate": 2.6445187824015406e-05, "loss": 2.5541194915771483, "memory(GiB)": 77.56, "step": 76575, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.437004 }, { "epoch": 3.2809219827770875, "grad_norm": 5.2621169090271, "learning_rate": 2.6439251826964882e-05, "loss": 2.4545656204223634, "memory(GiB)": 77.56, "step": 76580, "token_acc": 0.4779874213836478, "train_speed(iter/s)": 1.436999 }, { "epoch": 3.2811361981063367, "grad_norm": 6.59516716003418, "learning_rate": 2.6433316256739417e-05, "loss": 2.2602039337158204, "memory(GiB)": 77.56, "step": 76585, "token_acc": 0.5061728395061729, "train_speed(iter/s)": 1.437025 }, { "epoch": 3.2813504134355855, "grad_norm": 5.80298376083374, "learning_rate": 2.6427381113446536e-05, "loss": 2.110616111755371, "memory(GiB)": 77.56, "step": 76590, "token_acc": 0.5197368421052632, "train_speed(iter/s)": 1.437052 }, { "epoch": 3.2815646287648343, "grad_norm": 6.352737903594971, "learning_rate": 2.642144639719374e-05, "loss": 2.4751840591430665, "memory(GiB)": 77.56, "step": 76595, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.437057 }, { "epoch": 3.2817788440940836, "grad_norm": 5.855788707733154, "learning_rate": 2.641551210808858e-05, "loss": 2.0016483306884765, "memory(GiB)": 77.56, "step": 76600, "token_acc": 0.54296875, "train_speed(iter/s)": 1.437079 }, { "epoch": 3.2819930594233324, "grad_norm": 7.938076496124268, "learning_rate": 2.640957824623854e-05, "loss": 2.1572898864746093, "memory(GiB)": 77.56, "step": 76605, "token_acc": 0.5321428571428571, "train_speed(iter/s)": 1.437091 }, { "epoch": 3.2822072747525812, "grad_norm": 5.586814880371094, "learning_rate": 2.6403644811751104e-05, "loss": 2.162569046020508, "memory(GiB)": 77.56, "step": 76610, "token_acc": 0.5358361774744027, "train_speed(iter/s)": 1.437083 }, { "epoch": 3.2824214900818305, "grad_norm": 4.677947998046875, "learning_rate": 2.63977118047338e-05, "loss": 2.461477279663086, "memory(GiB)": 77.56, "step": 76615, "token_acc": 0.4618055555555556, "train_speed(iter/s)": 1.437108 }, { "epoch": 3.2826357054110793, "grad_norm": 6.2018632888793945, "learning_rate": 2.6391779225294084e-05, "loss": 2.2991186141967774, "memory(GiB)": 77.56, "step": 76620, "token_acc": 0.5, "train_speed(iter/s)": 1.437126 }, { "epoch": 3.282849920740328, "grad_norm": 5.7149434089660645, "learning_rate": 2.638584707353941e-05, "loss": 2.1063396453857424, "memory(GiB)": 77.56, "step": 76625, "token_acc": 0.553030303030303, "train_speed(iter/s)": 1.437132 }, { "epoch": 3.2830641360695774, "grad_norm": 4.538356781005859, "learning_rate": 2.637991534957729e-05, "loss": 2.4295454025268555, "memory(GiB)": 77.56, "step": 76630, "token_acc": 0.49146757679180886, "train_speed(iter/s)": 1.437122 }, { "epoch": 3.283278351398826, "grad_norm": 5.894742965698242, "learning_rate": 2.6373984053515167e-05, "loss": 2.2914783477783205, "memory(GiB)": 77.56, "step": 76635, "token_acc": 0.5016949152542373, "train_speed(iter/s)": 1.437129 }, { "epoch": 3.283492566728075, "grad_norm": 8.037131309509277, "learning_rate": 2.636805318546048e-05, "loss": 2.257550811767578, "memory(GiB)": 77.56, "step": 76640, "token_acc": 0.5095057034220533, "train_speed(iter/s)": 1.437128 }, { "epoch": 3.2837067820573242, "grad_norm": 4.997852325439453, "learning_rate": 2.636212274552068e-05, "loss": 2.2170669555664064, "memory(GiB)": 77.56, "step": 76645, "token_acc": 0.5, "train_speed(iter/s)": 1.437133 }, { "epoch": 3.283920997386573, "grad_norm": 4.542362213134766, "learning_rate": 2.6356192733803197e-05, "loss": 2.2677452087402346, "memory(GiB)": 77.56, "step": 76650, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.437142 }, { "epoch": 3.284135212715822, "grad_norm": 5.554403305053711, "learning_rate": 2.6350263150415443e-05, "loss": 2.2887750625610352, "memory(GiB)": 77.56, "step": 76655, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.437116 }, { "epoch": 3.284349428045071, "grad_norm": 8.246650695800781, "learning_rate": 2.6344333995464875e-05, "loss": 2.35384521484375, "memory(GiB)": 77.56, "step": 76660, "token_acc": 0.46613545816733065, "train_speed(iter/s)": 1.437145 }, { "epoch": 3.28456364337432, "grad_norm": 5.06002140045166, "learning_rate": 2.6338405269058896e-05, "loss": 2.462234878540039, "memory(GiB)": 77.56, "step": 76665, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.437132 }, { "epoch": 3.2847778587035688, "grad_norm": 6.219139099121094, "learning_rate": 2.6332476971304898e-05, "loss": 2.5646623611450194, "memory(GiB)": 77.56, "step": 76670, "token_acc": 0.5119047619047619, "train_speed(iter/s)": 1.437123 }, { "epoch": 3.284992074032818, "grad_norm": 7.4939866065979, "learning_rate": 2.6326549102310284e-05, "loss": 2.5314207077026367, "memory(GiB)": 77.56, "step": 76675, "token_acc": 0.49612403100775193, "train_speed(iter/s)": 1.437129 }, { "epoch": 3.285206289362067, "grad_norm": 6.062115669250488, "learning_rate": 2.632062166218241e-05, "loss": 2.7366336822509765, "memory(GiB)": 77.56, "step": 76680, "token_acc": 0.44807121661721067, "train_speed(iter/s)": 1.437138 }, { "epoch": 3.2854205046913156, "grad_norm": 5.411129474639893, "learning_rate": 2.6314694651028697e-05, "loss": 2.3598533630371095, "memory(GiB)": 77.56, "step": 76685, "token_acc": 0.527027027027027, "train_speed(iter/s)": 1.43713 }, { "epoch": 3.285634720020565, "grad_norm": 6.083203315734863, "learning_rate": 2.630876806895653e-05, "loss": 2.06467342376709, "memory(GiB)": 77.56, "step": 76690, "token_acc": 0.5667870036101083, "train_speed(iter/s)": 1.437128 }, { "epoch": 3.2858489353498137, "grad_norm": 6.668188571929932, "learning_rate": 2.630284191607325e-05, "loss": 2.583022880554199, "memory(GiB)": 77.56, "step": 76695, "token_acc": 0.4645390070921986, "train_speed(iter/s)": 1.437149 }, { "epoch": 3.2860631506790625, "grad_norm": 5.002578258514404, "learning_rate": 2.629691619248622e-05, "loss": 2.5051143646240233, "memory(GiB)": 77.56, "step": 76700, "token_acc": 0.4674922600619195, "train_speed(iter/s)": 1.43717 }, { "epoch": 3.2862773660083118, "grad_norm": 7.534294128417969, "learning_rate": 2.6290990898302786e-05, "loss": 2.527523422241211, "memory(GiB)": 77.56, "step": 76705, "token_acc": 0.5036496350364964, "train_speed(iter/s)": 1.437165 }, { "epoch": 3.2864915813375606, "grad_norm": 4.781052589416504, "learning_rate": 2.6285066033630278e-05, "loss": 2.4452846527099608, "memory(GiB)": 77.56, "step": 76710, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.437174 }, { "epoch": 3.2867057966668094, "grad_norm": 6.081940174102783, "learning_rate": 2.6279141598576062e-05, "loss": 2.662849998474121, "memory(GiB)": 77.56, "step": 76715, "token_acc": 0.4779874213836478, "train_speed(iter/s)": 1.437179 }, { "epoch": 3.2869200119960587, "grad_norm": 5.028762340545654, "learning_rate": 2.6273217593247447e-05, "loss": 2.49237060546875, "memory(GiB)": 77.56, "step": 76720, "token_acc": 0.4970588235294118, "train_speed(iter/s)": 1.437202 }, { "epoch": 3.2871342273253075, "grad_norm": 6.06223726272583, "learning_rate": 2.6267294017751753e-05, "loss": 2.2386714935302736, "memory(GiB)": 77.56, "step": 76725, "token_acc": 0.5559701492537313, "train_speed(iter/s)": 1.437195 }, { "epoch": 3.2873484426545563, "grad_norm": 8.5480375289917, "learning_rate": 2.626137087219629e-05, "loss": 2.0952470779418944, "memory(GiB)": 77.56, "step": 76730, "token_acc": 0.5276752767527675, "train_speed(iter/s)": 1.437213 }, { "epoch": 3.2875626579838055, "grad_norm": 4.992301940917969, "learning_rate": 2.625544815668836e-05, "loss": 2.448184776306152, "memory(GiB)": 77.56, "step": 76735, "token_acc": 0.4783950617283951, "train_speed(iter/s)": 1.437226 }, { "epoch": 3.2877768733130543, "grad_norm": 5.484918594360352, "learning_rate": 2.6249525871335246e-05, "loss": 2.1390968322753907, "memory(GiB)": 77.56, "step": 76740, "token_acc": 0.5445544554455446, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.287991088642303, "grad_norm": 6.212332248687744, "learning_rate": 2.624360401624427e-05, "loss": 2.816640853881836, "memory(GiB)": 77.56, "step": 76745, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.437218 }, { "epoch": 3.2882053039715524, "grad_norm": 5.6339898109436035, "learning_rate": 2.6237682591522693e-05, "loss": 2.29492301940918, "memory(GiB)": 77.56, "step": 76750, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 1.437233 }, { "epoch": 3.2884195193008012, "grad_norm": 5.280892372131348, "learning_rate": 2.6231761597277783e-05, "loss": 2.4200132369995115, "memory(GiB)": 77.56, "step": 76755, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.437228 }, { "epoch": 3.28863373463005, "grad_norm": 5.667395114898682, "learning_rate": 2.6225841033616794e-05, "loss": 2.4845457077026367, "memory(GiB)": 77.56, "step": 76760, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.43722 }, { "epoch": 3.2888479499592993, "grad_norm": 5.819214820861816, "learning_rate": 2.621992090064701e-05, "loss": 2.399454879760742, "memory(GiB)": 77.56, "step": 76765, "token_acc": 0.49393939393939396, "train_speed(iter/s)": 1.437222 }, { "epoch": 3.289062165288548, "grad_norm": 5.851830959320068, "learning_rate": 2.6214001198475645e-05, "loss": 2.3173423767089845, "memory(GiB)": 77.56, "step": 76770, "token_acc": 0.5259515570934256, "train_speed(iter/s)": 1.437209 }, { "epoch": 3.289276380617797, "grad_norm": 5.584312915802002, "learning_rate": 2.6208081927209988e-05, "loss": 2.316762161254883, "memory(GiB)": 77.56, "step": 76775, "token_acc": 0.5266666666666666, "train_speed(iter/s)": 1.4372 }, { "epoch": 3.289490595947046, "grad_norm": 6.209677219390869, "learning_rate": 2.6202163086957243e-05, "loss": 2.1203369140625, "memory(GiB)": 77.56, "step": 76780, "token_acc": 0.51953125, "train_speed(iter/s)": 1.43721 }, { "epoch": 3.289704811276295, "grad_norm": 4.925808429718018, "learning_rate": 2.6196244677824637e-05, "loss": 2.4465789794921875, "memory(GiB)": 77.56, "step": 76785, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.43722 }, { "epoch": 3.289919026605544, "grad_norm": 4.3598713874816895, "learning_rate": 2.6190326699919388e-05, "loss": 2.226450729370117, "memory(GiB)": 77.56, "step": 76790, "token_acc": 0.5015673981191222, "train_speed(iter/s)": 1.437235 }, { "epoch": 3.290133241934793, "grad_norm": 4.940312385559082, "learning_rate": 2.618440915334871e-05, "loss": 2.213574028015137, "memory(GiB)": 77.56, "step": 76795, "token_acc": 0.5278688524590164, "train_speed(iter/s)": 1.437224 }, { "epoch": 3.290347457264042, "grad_norm": 6.657808780670166, "learning_rate": 2.617849203821978e-05, "loss": 2.331351470947266, "memory(GiB)": 77.56, "step": 76800, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.437254 }, { "epoch": 3.2905616725932907, "grad_norm": 4.6629638671875, "learning_rate": 2.617257535463983e-05, "loss": 2.223450469970703, "memory(GiB)": 77.56, "step": 76805, "token_acc": 0.5031645569620253, "train_speed(iter/s)": 1.437265 }, { "epoch": 3.29077588792254, "grad_norm": 5.935922622680664, "learning_rate": 2.616665910271603e-05, "loss": 2.170707130432129, "memory(GiB)": 77.56, "step": 76810, "token_acc": 0.5477031802120141, "train_speed(iter/s)": 1.437292 }, { "epoch": 3.2909901032517888, "grad_norm": 9.75797176361084, "learning_rate": 2.616074328255556e-05, "loss": 2.4682870864868165, "memory(GiB)": 77.56, "step": 76815, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.437308 }, { "epoch": 3.2912043185810376, "grad_norm": 5.185744762420654, "learning_rate": 2.6154827894265588e-05, "loss": 2.218869209289551, "memory(GiB)": 77.56, "step": 76820, "token_acc": 0.5029585798816568, "train_speed(iter/s)": 1.437316 }, { "epoch": 3.291418533910287, "grad_norm": 5.338229656219482, "learning_rate": 2.6148912937953256e-05, "loss": 2.3645278930664064, "memory(GiB)": 77.56, "step": 76825, "token_acc": 0.5392491467576792, "train_speed(iter/s)": 1.437317 }, { "epoch": 3.2916327492395356, "grad_norm": 6.619795799255371, "learning_rate": 2.614299841372576e-05, "loss": 2.0638406753540037, "memory(GiB)": 77.56, "step": 76830, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.437329 }, { "epoch": 3.2918469645687845, "grad_norm": 5.00105094909668, "learning_rate": 2.613708432169021e-05, "loss": 2.5913578033447267, "memory(GiB)": 77.56, "step": 76835, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.437327 }, { "epoch": 3.2920611798980337, "grad_norm": 5.717588424682617, "learning_rate": 2.613117066195378e-05, "loss": 2.1996074676513673, "memory(GiB)": 77.56, "step": 76840, "token_acc": 0.5300353356890459, "train_speed(iter/s)": 1.437329 }, { "epoch": 3.2922753952272825, "grad_norm": 4.773777961730957, "learning_rate": 2.6125257434623584e-05, "loss": 2.459240531921387, "memory(GiB)": 77.56, "step": 76845, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.437318 }, { "epoch": 3.2924896105565313, "grad_norm": 7.958888530731201, "learning_rate": 2.6119344639806753e-05, "loss": 2.505276107788086, "memory(GiB)": 77.56, "step": 76850, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.43729 }, { "epoch": 3.2927038258857806, "grad_norm": 4.454339027404785, "learning_rate": 2.6113432277610367e-05, "loss": 1.8986431121826173, "memory(GiB)": 77.56, "step": 76855, "token_acc": 0.5766666666666667, "train_speed(iter/s)": 1.437236 }, { "epoch": 3.2929180412150294, "grad_norm": 5.445559501647949, "learning_rate": 2.6107520348141585e-05, "loss": 2.406661605834961, "memory(GiB)": 77.56, "step": 76860, "token_acc": 0.515625, "train_speed(iter/s)": 1.437237 }, { "epoch": 3.293132256544278, "grad_norm": 5.683530807495117, "learning_rate": 2.6101608851507486e-05, "loss": 2.0999757766723635, "memory(GiB)": 77.56, "step": 76865, "token_acc": 0.5613382899628253, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.2933464718735275, "grad_norm": 5.385169506072998, "learning_rate": 2.609569778781516e-05, "loss": 2.2904966354370115, "memory(GiB)": 77.56, "step": 76870, "token_acc": 0.5050505050505051, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.2935606872027763, "grad_norm": 5.778371810913086, "learning_rate": 2.6089787157171697e-05, "loss": 2.4114334106445314, "memory(GiB)": 77.56, "step": 76875, "token_acc": 0.4757834757834758, "train_speed(iter/s)": 1.437234 }, { "epoch": 3.293774902532025, "grad_norm": 10.222197532653809, "learning_rate": 2.6083876959684162e-05, "loss": 2.0714389801025392, "memory(GiB)": 77.56, "step": 76880, "token_acc": 0.5693950177935944, "train_speed(iter/s)": 1.437265 }, { "epoch": 3.2939891178612744, "grad_norm": 7.302116870880127, "learning_rate": 2.607796719545962e-05, "loss": 2.097861480712891, "memory(GiB)": 77.56, "step": 76885, "token_acc": 0.5278810408921933, "train_speed(iter/s)": 1.437275 }, { "epoch": 3.294203333190523, "grad_norm": 7.242732524871826, "learning_rate": 2.6072057864605147e-05, "loss": 2.6891801834106444, "memory(GiB)": 77.56, "step": 76890, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.437255 }, { "epoch": 3.294417548519772, "grad_norm": 6.562575340270996, "learning_rate": 2.606614896722781e-05, "loss": 2.539505195617676, "memory(GiB)": 77.56, "step": 76895, "token_acc": 0.46130952380952384, "train_speed(iter/s)": 1.437234 }, { "epoch": 3.2946317638490212, "grad_norm": 7.772091865539551, "learning_rate": 2.6060240503434623e-05, "loss": 2.3576221466064453, "memory(GiB)": 77.56, "step": 76900, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.437245 }, { "epoch": 3.29484597917827, "grad_norm": 5.5433220863342285, "learning_rate": 2.6054332473332622e-05, "loss": 2.216729927062988, "memory(GiB)": 77.56, "step": 76905, "token_acc": 0.47491638795986624, "train_speed(iter/s)": 1.43725 }, { "epoch": 3.295060194507519, "grad_norm": 4.832674026489258, "learning_rate": 2.6048424877028876e-05, "loss": 2.3038759231567383, "memory(GiB)": 77.56, "step": 76910, "token_acc": 0.4843205574912892, "train_speed(iter/s)": 1.437237 }, { "epoch": 3.295274409836768, "grad_norm": 4.854836463928223, "learning_rate": 2.6042517714630354e-05, "loss": 2.394809341430664, "memory(GiB)": 77.56, "step": 76915, "token_acc": 0.48307692307692307, "train_speed(iter/s)": 1.437231 }, { "epoch": 3.295488625166017, "grad_norm": 8.294893264770508, "learning_rate": 2.6036610986244125e-05, "loss": 2.3702972412109373, "memory(GiB)": 77.56, "step": 76920, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.437252 }, { "epoch": 3.2957028404952657, "grad_norm": 4.817264556884766, "learning_rate": 2.6030704691977158e-05, "loss": 2.360240364074707, "memory(GiB)": 77.56, "step": 76925, "token_acc": 0.5109034267912772, "train_speed(iter/s)": 1.437276 }, { "epoch": 3.295917055824515, "grad_norm": 6.1844401359558105, "learning_rate": 2.602479883193647e-05, "loss": 2.492644500732422, "memory(GiB)": 77.56, "step": 76930, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.437247 }, { "epoch": 3.296131271153764, "grad_norm": 5.200422763824463, "learning_rate": 2.6018893406229033e-05, "loss": 1.9987030029296875, "memory(GiB)": 77.56, "step": 76935, "token_acc": 0.5754385964912281, "train_speed(iter/s)": 1.437253 }, { "epoch": 3.2963454864830126, "grad_norm": 5.031759738922119, "learning_rate": 2.6012988414961848e-05, "loss": 2.362395477294922, "memory(GiB)": 77.56, "step": 76940, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 1.437253 }, { "epoch": 3.296559701812262, "grad_norm": 6.7006635665893555, "learning_rate": 2.6007083858241853e-05, "loss": 2.38018856048584, "memory(GiB)": 77.56, "step": 76945, "token_acc": 0.5032467532467533, "train_speed(iter/s)": 1.437243 }, { "epoch": 3.2967739171415107, "grad_norm": 6.165157794952393, "learning_rate": 2.6001179736176064e-05, "loss": 2.5515838623046876, "memory(GiB)": 77.56, "step": 76950, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 1.437249 }, { "epoch": 3.2969881324707595, "grad_norm": 4.450499057769775, "learning_rate": 2.599527604887141e-05, "loss": 2.2928436279296873, "memory(GiB)": 77.56, "step": 76955, "token_acc": 0.5040322580645161, "train_speed(iter/s)": 1.437272 }, { "epoch": 3.2972023478000088, "grad_norm": 5.850487232208252, "learning_rate": 2.5989372796434854e-05, "loss": 2.1894216537475586, "memory(GiB)": 77.56, "step": 76960, "token_acc": 0.510548523206751, "train_speed(iter/s)": 1.437269 }, { "epoch": 3.2974165631292576, "grad_norm": 5.848648548126221, "learning_rate": 2.598346997897333e-05, "loss": 2.1983219146728517, "memory(GiB)": 77.56, "step": 76965, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.437258 }, { "epoch": 3.2976307784585064, "grad_norm": 9.699432373046875, "learning_rate": 2.597756759659376e-05, "loss": 2.3296913146972655, "memory(GiB)": 77.56, "step": 76970, "token_acc": 0.5291970802919708, "train_speed(iter/s)": 1.437263 }, { "epoch": 3.2978449937877556, "grad_norm": 5.465476036071777, "learning_rate": 2.597166564940311e-05, "loss": 2.33630485534668, "memory(GiB)": 77.56, "step": 76975, "token_acc": 0.5016181229773463, "train_speed(iter/s)": 1.437259 }, { "epoch": 3.2980592091170045, "grad_norm": 6.026414394378662, "learning_rate": 2.5965764137508253e-05, "loss": 2.3223102569580076, "memory(GiB)": 77.56, "step": 76980, "token_acc": 0.5236686390532544, "train_speed(iter/s)": 1.437272 }, { "epoch": 3.2982734244462533, "grad_norm": 5.62860107421875, "learning_rate": 2.5959863061016144e-05, "loss": 2.326262855529785, "memory(GiB)": 77.56, "step": 76985, "token_acc": 0.4609375, "train_speed(iter/s)": 1.43726 }, { "epoch": 3.2984876397755025, "grad_norm": 5.432168960571289, "learning_rate": 2.5953962420033673e-05, "loss": 2.564494514465332, "memory(GiB)": 77.56, "step": 76990, "token_acc": 0.5, "train_speed(iter/s)": 1.437255 }, { "epoch": 3.2987018551047513, "grad_norm": 6.185112476348877, "learning_rate": 2.5948062214667723e-05, "loss": 2.4724334716796874, "memory(GiB)": 77.56, "step": 76995, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.43725 }, { "epoch": 3.298916070434, "grad_norm": 5.683099746704102, "learning_rate": 2.5942162445025174e-05, "loss": 2.2289962768554688, "memory(GiB)": 77.56, "step": 77000, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.437263 }, { "epoch": 3.298916070434, "eval_loss": 2.3029232025146484, "eval_runtime": 14.7464, "eval_samples_per_second": 6.781, "eval_steps_per_second": 6.781, "eval_token_acc": 0.4774193548387097, "step": 77000 }, { "epoch": 3.2991302857632494, "grad_norm": 6.969137668609619, "learning_rate": 2.593626311121294e-05, "loss": 2.268109893798828, "memory(GiB)": 77.56, "step": 77005, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.436817 }, { "epoch": 3.299344501092498, "grad_norm": 5.646996974945068, "learning_rate": 2.5930364213337865e-05, "loss": 2.5246856689453123, "memory(GiB)": 77.56, "step": 77010, "token_acc": 0.45925925925925926, "train_speed(iter/s)": 1.436829 }, { "epoch": 3.299558716421747, "grad_norm": 6.769023418426514, "learning_rate": 2.592446575150683e-05, "loss": 2.1980318069458007, "memory(GiB)": 77.56, "step": 77015, "token_acc": 0.5271966527196653, "train_speed(iter/s)": 1.436815 }, { "epoch": 3.2997729317509963, "grad_norm": 6.5589823722839355, "learning_rate": 2.5918567725826682e-05, "loss": 2.33660888671875, "memory(GiB)": 77.56, "step": 77020, "token_acc": 0.5, "train_speed(iter/s)": 1.436819 }, { "epoch": 3.299987147080245, "grad_norm": 6.002827167510986, "learning_rate": 2.591267013640427e-05, "loss": 2.4459890365600585, "memory(GiB)": 77.56, "step": 77025, "token_acc": 0.4629080118694362, "train_speed(iter/s)": 1.436822 }, { "epoch": 3.300201362409494, "grad_norm": 5.471892833709717, "learning_rate": 2.590677298334641e-05, "loss": 2.5731468200683594, "memory(GiB)": 77.56, "step": 77030, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.436841 }, { "epoch": 3.300415577738743, "grad_norm": 5.689818382263184, "learning_rate": 2.590087626675998e-05, "loss": 2.279067039489746, "memory(GiB)": 77.56, "step": 77035, "token_acc": 0.5487364620938628, "train_speed(iter/s)": 1.436846 }, { "epoch": 3.300629793067992, "grad_norm": 6.6045660972595215, "learning_rate": 2.589497998675179e-05, "loss": 2.3414470672607424, "memory(GiB)": 77.56, "step": 77040, "token_acc": 0.525, "train_speed(iter/s)": 1.436827 }, { "epoch": 3.300844008397241, "grad_norm": 4.153785705566406, "learning_rate": 2.5889084143428643e-05, "loss": 2.5267446517944334, "memory(GiB)": 77.56, "step": 77045, "token_acc": 0.5109034267912772, "train_speed(iter/s)": 1.436834 }, { "epoch": 3.30105822372649, "grad_norm": 5.836066722869873, "learning_rate": 2.5883188736897356e-05, "loss": 2.373625946044922, "memory(GiB)": 77.56, "step": 77050, "token_acc": 0.5080906148867314, "train_speed(iter/s)": 1.43681 }, { "epoch": 3.301272439055739, "grad_norm": 4.907026290893555, "learning_rate": 2.587729376726471e-05, "loss": 2.3365612030029297, "memory(GiB)": 77.56, "step": 77055, "token_acc": 0.4984423676012461, "train_speed(iter/s)": 1.436809 }, { "epoch": 3.3014866543849877, "grad_norm": 6.26133918762207, "learning_rate": 2.587139923463751e-05, "loss": 2.819825553894043, "memory(GiB)": 77.56, "step": 77060, "token_acc": 0.42803030303030304, "train_speed(iter/s)": 1.436825 }, { "epoch": 3.301700869714237, "grad_norm": 4.15021276473999, "learning_rate": 2.586550513912257e-05, "loss": 2.058642578125, "memory(GiB)": 77.56, "step": 77065, "token_acc": 0.5355805243445693, "train_speed(iter/s)": 1.436842 }, { "epoch": 3.3019150850434857, "grad_norm": 9.351841926574707, "learning_rate": 2.585961148082665e-05, "loss": 2.455078125, "memory(GiB)": 77.56, "step": 77070, "token_acc": 0.49375, "train_speed(iter/s)": 1.436862 }, { "epoch": 3.3021293003727346, "grad_norm": 6.883096218109131, "learning_rate": 2.5853718259856507e-05, "loss": 2.4500152587890627, "memory(GiB)": 77.56, "step": 77075, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.43686 }, { "epoch": 3.302343515701984, "grad_norm": 5.932377338409424, "learning_rate": 2.584782547631891e-05, "loss": 2.5991334915161133, "memory(GiB)": 77.56, "step": 77080, "token_acc": 0.5101214574898786, "train_speed(iter/s)": 1.436858 }, { "epoch": 3.3025577310312326, "grad_norm": 4.293975353240967, "learning_rate": 2.5841933130320618e-05, "loss": 2.4596836090087892, "memory(GiB)": 77.56, "step": 77085, "token_acc": 0.4937106918238994, "train_speed(iter/s)": 1.436856 }, { "epoch": 3.3027719463604814, "grad_norm": 5.631494522094727, "learning_rate": 2.5836041221968345e-05, "loss": 2.636617660522461, "memory(GiB)": 77.56, "step": 77090, "token_acc": 0.458041958041958, "train_speed(iter/s)": 1.43687 }, { "epoch": 3.3029861616897307, "grad_norm": 5.196710109710693, "learning_rate": 2.583014975136887e-05, "loss": 2.5874141693115233, "memory(GiB)": 77.56, "step": 77095, "token_acc": 0.47674418604651164, "train_speed(iter/s)": 1.436898 }, { "epoch": 3.3032003770189795, "grad_norm": 5.8815178871154785, "learning_rate": 2.5824258718628906e-05, "loss": 2.448269844055176, "memory(GiB)": 77.56, "step": 77100, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.436913 }, { "epoch": 3.3034145923482283, "grad_norm": 6.350180149078369, "learning_rate": 2.5818368123855176e-05, "loss": 2.6670684814453125, "memory(GiB)": 77.56, "step": 77105, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.436919 }, { "epoch": 3.3036288076774776, "grad_norm": 7.065086841583252, "learning_rate": 2.581247796715439e-05, "loss": 2.268832206726074, "memory(GiB)": 77.56, "step": 77110, "token_acc": 0.5298804780876494, "train_speed(iter/s)": 1.436943 }, { "epoch": 3.3038430230067264, "grad_norm": 5.911028861999512, "learning_rate": 2.5806588248633235e-05, "loss": 2.5288286209106445, "memory(GiB)": 77.56, "step": 77115, "token_acc": 0.45544554455445546, "train_speed(iter/s)": 1.436916 }, { "epoch": 3.304057238335975, "grad_norm": 6.289820194244385, "learning_rate": 2.580069896839845e-05, "loss": 2.052786445617676, "memory(GiB)": 77.56, "step": 77120, "token_acc": 0.5426621160409556, "train_speed(iter/s)": 1.436925 }, { "epoch": 3.3042714536652245, "grad_norm": 5.846970081329346, "learning_rate": 2.5794810126556707e-05, "loss": 2.2656833648681642, "memory(GiB)": 77.56, "step": 77125, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.436946 }, { "epoch": 3.3044856689944733, "grad_norm": 6.609343528747559, "learning_rate": 2.5788921723214664e-05, "loss": 2.009074401855469, "memory(GiB)": 77.56, "step": 77130, "token_acc": 0.5833333333333334, "train_speed(iter/s)": 1.436959 }, { "epoch": 3.304699884323722, "grad_norm": 5.0762763023376465, "learning_rate": 2.5783033758479035e-05, "loss": 2.3161203384399416, "memory(GiB)": 77.56, "step": 77135, "token_acc": 0.5326797385620915, "train_speed(iter/s)": 1.436975 }, { "epoch": 3.3049140996529713, "grad_norm": 6.89382266998291, "learning_rate": 2.5777146232456463e-05, "loss": 2.6329727172851562, "memory(GiB)": 77.56, "step": 77140, "token_acc": 0.4432624113475177, "train_speed(iter/s)": 1.436993 }, { "epoch": 3.30512831498222, "grad_norm": 7.186066150665283, "learning_rate": 2.577125914525359e-05, "loss": 2.2719356536865236, "memory(GiB)": 77.56, "step": 77145, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.436999 }, { "epoch": 3.305342530311469, "grad_norm": 11.30805778503418, "learning_rate": 2.5765372496977113e-05, "loss": 2.458699417114258, "memory(GiB)": 77.56, "step": 77150, "token_acc": 0.4681818181818182, "train_speed(iter/s)": 1.436996 }, { "epoch": 3.305556745640718, "grad_norm": 6.7140021324157715, "learning_rate": 2.575948628773364e-05, "loss": 2.263802719116211, "memory(GiB)": 77.56, "step": 77155, "token_acc": 0.5460992907801419, "train_speed(iter/s)": 1.436986 }, { "epoch": 3.305770960969967, "grad_norm": 6.221335411071777, "learning_rate": 2.5753600517629817e-05, "loss": 2.507166290283203, "memory(GiB)": 77.56, "step": 77160, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.436963 }, { "epoch": 3.305985176299216, "grad_norm": 4.820887565612793, "learning_rate": 2.5747715186772264e-05, "loss": 2.0825527191162108, "memory(GiB)": 77.56, "step": 77165, "token_acc": 0.5746031746031746, "train_speed(iter/s)": 1.436942 }, { "epoch": 3.306199391628465, "grad_norm": 7.407349109649658, "learning_rate": 2.5741830295267598e-05, "loss": 2.262668991088867, "memory(GiB)": 77.56, "step": 77170, "token_acc": 0.5163636363636364, "train_speed(iter/s)": 1.436935 }, { "epoch": 3.306413606957714, "grad_norm": 4.921755790710449, "learning_rate": 2.573594584322242e-05, "loss": 2.761060905456543, "memory(GiB)": 77.56, "step": 77175, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.436935 }, { "epoch": 3.3066278222869627, "grad_norm": 5.5492072105407715, "learning_rate": 2.5730061830743358e-05, "loss": 2.0439180374145507, "memory(GiB)": 77.56, "step": 77180, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.436943 }, { "epoch": 3.306842037616212, "grad_norm": 6.636061668395996, "learning_rate": 2.5724178257936992e-05, "loss": 2.2904075622558593, "memory(GiB)": 77.56, "step": 77185, "token_acc": 0.4861878453038674, "train_speed(iter/s)": 1.43694 }, { "epoch": 3.307056252945461, "grad_norm": 5.415263652801514, "learning_rate": 2.5718295124909913e-05, "loss": 2.2275161743164062, "memory(GiB)": 77.56, "step": 77190, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.436946 }, { "epoch": 3.3072704682747096, "grad_norm": 4.004490852355957, "learning_rate": 2.5712412431768696e-05, "loss": 2.1486921310424805, "memory(GiB)": 77.56, "step": 77195, "token_acc": 0.5427631578947368, "train_speed(iter/s)": 1.436963 }, { "epoch": 3.307484683603959, "grad_norm": 5.417473316192627, "learning_rate": 2.5706530178619893e-05, "loss": 2.082661819458008, "memory(GiB)": 77.56, "step": 77200, "token_acc": 0.5487364620938628, "train_speed(iter/s)": 1.436977 }, { "epoch": 3.3076988989332077, "grad_norm": 7.476407527923584, "learning_rate": 2.570064836557008e-05, "loss": 2.4236865997314454, "memory(GiB)": 77.56, "step": 77205, "token_acc": 0.498371335504886, "train_speed(iter/s)": 1.436991 }, { "epoch": 3.3079131142624565, "grad_norm": 6.389898300170898, "learning_rate": 2.5694766992725837e-05, "loss": 2.0084529876708985, "memory(GiB)": 77.56, "step": 77210, "token_acc": 0.5, "train_speed(iter/s)": 1.436987 }, { "epoch": 3.3081273295917057, "grad_norm": 4.9215240478515625, "learning_rate": 2.5688886060193694e-05, "loss": 2.378492546081543, "memory(GiB)": 77.56, "step": 77215, "token_acc": 0.4774193548387097, "train_speed(iter/s)": 1.436993 }, { "epoch": 3.3083415449209546, "grad_norm": 4.335348129272461, "learning_rate": 2.5683005568080188e-05, "loss": 2.1754829406738283, "memory(GiB)": 77.56, "step": 77220, "token_acc": 0.5134099616858238, "train_speed(iter/s)": 1.437007 }, { "epoch": 3.3085557602502034, "grad_norm": 5.2346367835998535, "learning_rate": 2.567712551649184e-05, "loss": 2.379818916320801, "memory(GiB)": 77.56, "step": 77225, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.436985 }, { "epoch": 3.3087699755794526, "grad_norm": 4.546489238739014, "learning_rate": 2.567124590553518e-05, "loss": 2.0582487106323244, "memory(GiB)": 77.56, "step": 77230, "token_acc": 0.5415162454873647, "train_speed(iter/s)": 1.436993 }, { "epoch": 3.3089841909087014, "grad_norm": 5.561270236968994, "learning_rate": 2.5665366735316708e-05, "loss": 2.371104431152344, "memory(GiB)": 77.56, "step": 77235, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437008 }, { "epoch": 3.3091984062379503, "grad_norm": 6.500630855560303, "learning_rate": 2.565948800594296e-05, "loss": 2.3415708541870117, "memory(GiB)": 77.56, "step": 77240, "token_acc": 0.532319391634981, "train_speed(iter/s)": 1.437014 }, { "epoch": 3.3094126215671995, "grad_norm": 6.098557472229004, "learning_rate": 2.5653609717520423e-05, "loss": 2.2728961944580077, "memory(GiB)": 77.56, "step": 77245, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 1.437016 }, { "epoch": 3.3096268368964483, "grad_norm": 5.145260334014893, "learning_rate": 2.5647731870155585e-05, "loss": 1.9735578536987304, "memory(GiB)": 77.56, "step": 77250, "token_acc": 0.5867768595041323, "train_speed(iter/s)": 1.437008 }, { "epoch": 3.309841052225697, "grad_norm": 5.626273155212402, "learning_rate": 2.5641854463954928e-05, "loss": 2.3705728530883787, "memory(GiB)": 77.56, "step": 77255, "token_acc": 0.5084175084175084, "train_speed(iter/s)": 1.437015 }, { "epoch": 3.3100552675549464, "grad_norm": 6.133059978485107, "learning_rate": 2.563597749902491e-05, "loss": 2.1151107788085937, "memory(GiB)": 77.56, "step": 77260, "token_acc": 0.5526315789473685, "train_speed(iter/s)": 1.437034 }, { "epoch": 3.310269482884195, "grad_norm": 5.907609939575195, "learning_rate": 2.5630100975472026e-05, "loss": 2.417312240600586, "memory(GiB)": 77.56, "step": 77265, "token_acc": 0.5, "train_speed(iter/s)": 1.43702 }, { "epoch": 3.310483698213444, "grad_norm": 6.832525730133057, "learning_rate": 2.5624224893402733e-05, "loss": 2.3832204818725584, "memory(GiB)": 77.56, "step": 77270, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437025 }, { "epoch": 3.3106979135426933, "grad_norm": 5.4986090660095215, "learning_rate": 2.5618349252923448e-05, "loss": 2.671581840515137, "memory(GiB)": 77.56, "step": 77275, "token_acc": 0.4707692307692308, "train_speed(iter/s)": 1.437032 }, { "epoch": 3.310912128871942, "grad_norm": 6.092564105987549, "learning_rate": 2.5612474054140657e-05, "loss": 2.3200450897216798, "memory(GiB)": 77.56, "step": 77280, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.437005 }, { "epoch": 3.311126344201191, "grad_norm": 5.981649398803711, "learning_rate": 2.560659929716078e-05, "loss": 2.4917009353637694, "memory(GiB)": 77.56, "step": 77285, "token_acc": 0.5, "train_speed(iter/s)": 1.437011 }, { "epoch": 3.31134055953044, "grad_norm": 4.974832534790039, "learning_rate": 2.560072498209022e-05, "loss": 2.4469219207763673, "memory(GiB)": 77.56, "step": 77290, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.437044 }, { "epoch": 3.311554774859689, "grad_norm": 5.801551342010498, "learning_rate": 2.5594851109035435e-05, "loss": 2.112714958190918, "memory(GiB)": 77.56, "step": 77295, "token_acc": 0.5286624203821656, "train_speed(iter/s)": 1.437063 }, { "epoch": 3.3117689901889378, "grad_norm": 5.231300354003906, "learning_rate": 2.558897767810281e-05, "loss": 2.416782188415527, "memory(GiB)": 77.56, "step": 77300, "token_acc": 0.4854014598540146, "train_speed(iter/s)": 1.437078 }, { "epoch": 3.311983205518187, "grad_norm": 7.499194145202637, "learning_rate": 2.5583104689398757e-05, "loss": 1.9685455322265626, "memory(GiB)": 77.56, "step": 77305, "token_acc": 0.48917748917748916, "train_speed(iter/s)": 1.437078 }, { "epoch": 3.312197420847436, "grad_norm": 7.441330432891846, "learning_rate": 2.5577232143029672e-05, "loss": 2.247381019592285, "memory(GiB)": 77.56, "step": 77310, "token_acc": 0.47651006711409394, "train_speed(iter/s)": 1.437092 }, { "epoch": 3.3124116361766847, "grad_norm": 6.9077301025390625, "learning_rate": 2.5571360039101932e-05, "loss": 2.178914260864258, "memory(GiB)": 77.56, "step": 77315, "token_acc": 0.5572519083969466, "train_speed(iter/s)": 1.437113 }, { "epoch": 3.312625851505934, "grad_norm": 7.821229934692383, "learning_rate": 2.5565488377721903e-05, "loss": 2.3870431900024416, "memory(GiB)": 77.56, "step": 77320, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.437125 }, { "epoch": 3.3128400668351827, "grad_norm": 4.951988697052002, "learning_rate": 2.555961715899599e-05, "loss": 2.2436527252197265, "memory(GiB)": 77.56, "step": 77325, "token_acc": 0.5389830508474577, "train_speed(iter/s)": 1.437133 }, { "epoch": 3.3130542821644315, "grad_norm": 5.681581020355225, "learning_rate": 2.555374638303054e-05, "loss": 2.310961151123047, "memory(GiB)": 77.56, "step": 77330, "token_acc": 0.5448028673835126, "train_speed(iter/s)": 1.437127 }, { "epoch": 3.313268497493681, "grad_norm": 6.0256547927856445, "learning_rate": 2.554787604993191e-05, "loss": 2.1599552154541017, "memory(GiB)": 77.56, "step": 77335, "token_acc": 0.5436241610738255, "train_speed(iter/s)": 1.437137 }, { "epoch": 3.3134827128229296, "grad_norm": 6.890291690826416, "learning_rate": 2.5542006159806444e-05, "loss": 2.521312713623047, "memory(GiB)": 77.56, "step": 77340, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 1.437146 }, { "epoch": 3.3136969281521784, "grad_norm": 6.326132774353027, "learning_rate": 2.5536136712760457e-05, "loss": 2.4545194625854494, "memory(GiB)": 77.56, "step": 77345, "token_acc": 0.46557377049180326, "train_speed(iter/s)": 1.437116 }, { "epoch": 3.3139111434814277, "grad_norm": 8.4386625289917, "learning_rate": 2.5530267708900303e-05, "loss": 2.2343910217285154, "memory(GiB)": 77.56, "step": 77350, "token_acc": 0.5139318885448917, "train_speed(iter/s)": 1.437134 }, { "epoch": 3.3141253588106765, "grad_norm": 6.033736705780029, "learning_rate": 2.5524399148332325e-05, "loss": 2.4118396759033205, "memory(GiB)": 77.56, "step": 77355, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.437147 }, { "epoch": 3.3143395741399253, "grad_norm": 5.393014907836914, "learning_rate": 2.551853103116281e-05, "loss": 2.4604232788085936, "memory(GiB)": 77.56, "step": 77360, "token_acc": 0.48580441640378547, "train_speed(iter/s)": 1.437167 }, { "epoch": 3.3145537894691746, "grad_norm": 7.311291217803955, "learning_rate": 2.5512663357498078e-05, "loss": 2.2386837005615234, "memory(GiB)": 77.56, "step": 77365, "token_acc": 0.5298804780876494, "train_speed(iter/s)": 1.437162 }, { "epoch": 3.3147680047984234, "grad_norm": 6.575873374938965, "learning_rate": 2.550679612744442e-05, "loss": 2.5401411056518555, "memory(GiB)": 77.56, "step": 77370, "token_acc": 0.4270833333333333, "train_speed(iter/s)": 1.437193 }, { "epoch": 3.314982220127672, "grad_norm": 6.860659122467041, "learning_rate": 2.5500929341108104e-05, "loss": 2.5129186630249025, "memory(GiB)": 77.56, "step": 77375, "token_acc": 0.47808764940239046, "train_speed(iter/s)": 1.437208 }, { "epoch": 3.3151964354569214, "grad_norm": 6.716062545776367, "learning_rate": 2.5495062998595455e-05, "loss": 2.372308349609375, "memory(GiB)": 77.56, "step": 77380, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437222 }, { "epoch": 3.3154106507861703, "grad_norm": 5.149068355560303, "learning_rate": 2.548919710001273e-05, "loss": 2.5191085815429686, "memory(GiB)": 77.56, "step": 77385, "token_acc": 0.42524916943521596, "train_speed(iter/s)": 1.437207 }, { "epoch": 3.315624866115419, "grad_norm": 5.713041305541992, "learning_rate": 2.5483331645466192e-05, "loss": 2.181832504272461, "memory(GiB)": 77.56, "step": 77390, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.437197 }, { "epoch": 3.3158390814446683, "grad_norm": 5.911479473114014, "learning_rate": 2.5477466635062097e-05, "loss": 2.1378547668457033, "memory(GiB)": 77.56, "step": 77395, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.437203 }, { "epoch": 3.316053296773917, "grad_norm": 5.920670509338379, "learning_rate": 2.5471602068906697e-05, "loss": 2.1676309585571287, "memory(GiB)": 77.56, "step": 77400, "token_acc": 0.5368421052631579, "train_speed(iter/s)": 1.437241 }, { "epoch": 3.316267512103166, "grad_norm": 5.166963577270508, "learning_rate": 2.5465737947106218e-05, "loss": 2.1913219451904298, "memory(GiB)": 77.56, "step": 77405, "token_acc": 0.5324675324675324, "train_speed(iter/s)": 1.437257 }, { "epoch": 3.316481727432415, "grad_norm": 6.212830066680908, "learning_rate": 2.545987426976693e-05, "loss": 2.2177059173583986, "memory(GiB)": 77.56, "step": 77410, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.437263 }, { "epoch": 3.316695942761664, "grad_norm": 6.647331237792969, "learning_rate": 2.545401103699504e-05, "loss": 2.5991729736328124, "memory(GiB)": 77.56, "step": 77415, "token_acc": 0.48253968253968255, "train_speed(iter/s)": 1.437275 }, { "epoch": 3.316910158090913, "grad_norm": 4.814138412475586, "learning_rate": 2.5448148248896768e-05, "loss": 2.4958990097045897, "memory(GiB)": 77.56, "step": 77420, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.437267 }, { "epoch": 3.317124373420162, "grad_norm": 4.491227149963379, "learning_rate": 2.54422859055783e-05, "loss": 2.555317687988281, "memory(GiB)": 77.56, "step": 77425, "token_acc": 0.4735202492211838, "train_speed(iter/s)": 1.437264 }, { "epoch": 3.317338588749411, "grad_norm": 6.427128314971924, "learning_rate": 2.543642400714588e-05, "loss": 2.1997188568115233, "memory(GiB)": 77.56, "step": 77430, "token_acc": 0.5328947368421053, "train_speed(iter/s)": 1.43725 }, { "epoch": 3.3175528040786597, "grad_norm": 5.459718227386475, "learning_rate": 2.543056255370566e-05, "loss": 2.3429189682006837, "memory(GiB)": 77.56, "step": 77435, "token_acc": 0.534965034965035, "train_speed(iter/s)": 1.437232 }, { "epoch": 3.317767019407909, "grad_norm": 7.484043121337891, "learning_rate": 2.542470154536387e-05, "loss": 2.47094669342041, "memory(GiB)": 77.56, "step": 77440, "token_acc": 0.5079872204472844, "train_speed(iter/s)": 1.437239 }, { "epoch": 3.317981234737158, "grad_norm": 6.093674182891846, "learning_rate": 2.5418840982226667e-05, "loss": 2.3110321044921873, "memory(GiB)": 77.56, "step": 77445, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.437235 }, { "epoch": 3.3181954500664066, "grad_norm": 9.87459945678711, "learning_rate": 2.5412980864400217e-05, "loss": 2.4254985809326173, "memory(GiB)": 77.56, "step": 77450, "token_acc": 0.5037037037037037, "train_speed(iter/s)": 1.437243 }, { "epoch": 3.318409665395656, "grad_norm": 5.21718168258667, "learning_rate": 2.540712119199068e-05, "loss": 2.5110260009765626, "memory(GiB)": 77.56, "step": 77455, "token_acc": 0.5271565495207667, "train_speed(iter/s)": 1.437252 }, { "epoch": 3.3186238807249047, "grad_norm": 5.1384687423706055, "learning_rate": 2.5401261965104217e-05, "loss": 2.2521526336669924, "memory(GiB)": 77.56, "step": 77460, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.437263 }, { "epoch": 3.3188380960541535, "grad_norm": 6.13267183303833, "learning_rate": 2.5395403183846945e-05, "loss": 2.136837959289551, "memory(GiB)": 77.56, "step": 77465, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437278 }, { "epoch": 3.3190523113834027, "grad_norm": 5.374451160430908, "learning_rate": 2.5389544848325054e-05, "loss": 2.4706241607666017, "memory(GiB)": 77.56, "step": 77470, "token_acc": 0.4754601226993865, "train_speed(iter/s)": 1.437264 }, { "epoch": 3.3192665267126515, "grad_norm": 6.128334999084473, "learning_rate": 2.5383686958644636e-05, "loss": 2.2426692962646486, "memory(GiB)": 77.56, "step": 77475, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.437268 }, { "epoch": 3.3194807420419004, "grad_norm": 7.791270732879639, "learning_rate": 2.5377829514911822e-05, "loss": 2.4220884323120115, "memory(GiB)": 77.56, "step": 77480, "token_acc": 0.4542124542124542, "train_speed(iter/s)": 1.437277 }, { "epoch": 3.3196949573711496, "grad_norm": 4.998856067657471, "learning_rate": 2.537197251723272e-05, "loss": 2.290847587585449, "memory(GiB)": 77.56, "step": 77485, "token_acc": 0.5526315789473685, "train_speed(iter/s)": 1.437263 }, { "epoch": 3.3199091727003984, "grad_norm": 5.187781810760498, "learning_rate": 2.5366115965713422e-05, "loss": 1.7143903732299806, "memory(GiB)": 77.56, "step": 77490, "token_acc": 0.6065573770491803, "train_speed(iter/s)": 1.43728 }, { "epoch": 3.3201233880296472, "grad_norm": 5.368714809417725, "learning_rate": 2.5360259860460066e-05, "loss": 2.231265640258789, "memory(GiB)": 77.56, "step": 77495, "token_acc": 0.5017064846416383, "train_speed(iter/s)": 1.437288 }, { "epoch": 3.3203376033588965, "grad_norm": 5.128393650054932, "learning_rate": 2.5354404201578696e-05, "loss": 2.5056447982788086, "memory(GiB)": 77.56, "step": 77500, "token_acc": 0.4984894259818731, "train_speed(iter/s)": 1.437299 }, { "epoch": 3.3203376033588965, "eval_loss": 2.105395555496216, "eval_runtime": 14.2759, "eval_samples_per_second": 7.005, "eval_steps_per_second": 7.005, "eval_token_acc": 0.4832869080779944, "step": 77500 }, { "epoch": 3.3205518186881453, "grad_norm": 4.861273765563965, "learning_rate": 2.5348548989175424e-05, "loss": 2.174689292907715, "memory(GiB)": 77.56, "step": 77505, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.436912 }, { "epoch": 3.320766034017394, "grad_norm": 5.340626239776611, "learning_rate": 2.534269422335632e-05, "loss": 2.338021659851074, "memory(GiB)": 77.56, "step": 77510, "token_acc": 0.49038461538461536, "train_speed(iter/s)": 1.43693 }, { "epoch": 3.3209802493466434, "grad_norm": 5.060118675231934, "learning_rate": 2.533683990422744e-05, "loss": 2.5441051483154298, "memory(GiB)": 77.56, "step": 77515, "token_acc": 0.5, "train_speed(iter/s)": 1.436953 }, { "epoch": 3.321194464675892, "grad_norm": 5.681626796722412, "learning_rate": 2.5330986031894823e-05, "loss": 2.472270202636719, "memory(GiB)": 77.56, "step": 77520, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.436968 }, { "epoch": 3.321408680005141, "grad_norm": 5.349667549133301, "learning_rate": 2.5325132606464552e-05, "loss": 2.3636363983154296, "memory(GiB)": 77.56, "step": 77525, "token_acc": 0.48773006134969327, "train_speed(iter/s)": 1.436969 }, { "epoch": 3.3216228953343903, "grad_norm": 6.741828441619873, "learning_rate": 2.5319279628042647e-05, "loss": 2.293136978149414, "memory(GiB)": 77.56, "step": 77530, "token_acc": 0.4796511627906977, "train_speed(iter/s)": 1.436963 }, { "epoch": 3.321837110663639, "grad_norm": 5.243693828582764, "learning_rate": 2.5313427096735155e-05, "loss": 2.487224578857422, "memory(GiB)": 77.56, "step": 77535, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.436966 }, { "epoch": 3.322051325992888, "grad_norm": 6.401910781860352, "learning_rate": 2.530757501264808e-05, "loss": 2.330003356933594, "memory(GiB)": 77.56, "step": 77540, "token_acc": 0.4846153846153846, "train_speed(iter/s)": 1.43695 }, { "epoch": 3.322265541322137, "grad_norm": 5.999273300170898, "learning_rate": 2.5301723375887447e-05, "loss": 2.492059326171875, "memory(GiB)": 77.56, "step": 77545, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.436955 }, { "epoch": 3.322479756651386, "grad_norm": 6.247038841247559, "learning_rate": 2.5295872186559243e-05, "loss": 2.4057302474975586, "memory(GiB)": 77.56, "step": 77550, "token_acc": 0.4981549815498155, "train_speed(iter/s)": 1.43697 }, { "epoch": 3.3226939719806348, "grad_norm": 5.043736457824707, "learning_rate": 2.5290021444769517e-05, "loss": 1.9194206237792968, "memory(GiB)": 77.56, "step": 77555, "token_acc": 0.5296052631578947, "train_speed(iter/s)": 1.436985 }, { "epoch": 3.322908187309884, "grad_norm": 5.577779769897461, "learning_rate": 2.5284171150624225e-05, "loss": 2.409530448913574, "memory(GiB)": 77.56, "step": 77560, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.436998 }, { "epoch": 3.323122402639133, "grad_norm": 5.46181058883667, "learning_rate": 2.5278321304229358e-05, "loss": 2.1508968353271483, "memory(GiB)": 77.56, "step": 77565, "token_acc": 0.574468085106383, "train_speed(iter/s)": 1.436994 }, { "epoch": 3.3233366179683816, "grad_norm": 5.481894493103027, "learning_rate": 2.5272471905690876e-05, "loss": 2.2374034881591798, "memory(GiB)": 77.56, "step": 77570, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.436995 }, { "epoch": 3.323550833297631, "grad_norm": 4.195674419403076, "learning_rate": 2.526662295511478e-05, "loss": 2.3077556610107424, "memory(GiB)": 77.56, "step": 77575, "token_acc": 0.524822695035461, "train_speed(iter/s)": 1.437004 }, { "epoch": 3.3237650486268797, "grad_norm": 6.152205944061279, "learning_rate": 2.5260774452606993e-05, "loss": 2.2600879669189453, "memory(GiB)": 77.56, "step": 77580, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 1.436959 }, { "epoch": 3.3239792639561285, "grad_norm": 5.229650020599365, "learning_rate": 2.5254926398273498e-05, "loss": 2.219290924072266, "memory(GiB)": 77.56, "step": 77585, "token_acc": 0.532051282051282, "train_speed(iter/s)": 1.436951 }, { "epoch": 3.324193479285378, "grad_norm": 5.283496379852295, "learning_rate": 2.5249078792220227e-05, "loss": 2.4240015029907225, "memory(GiB)": 77.56, "step": 77590, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.436958 }, { "epoch": 3.3244076946146266, "grad_norm": 5.911551475524902, "learning_rate": 2.524323163455311e-05, "loss": 2.318562126159668, "memory(GiB)": 77.56, "step": 77595, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.436949 }, { "epoch": 3.3246219099438754, "grad_norm": 6.566239356994629, "learning_rate": 2.5237384925378084e-05, "loss": 2.473470115661621, "memory(GiB)": 77.56, "step": 77600, "token_acc": 0.46204620462046203, "train_speed(iter/s)": 1.436977 }, { "epoch": 3.3248361252731247, "grad_norm": 5.843847751617432, "learning_rate": 2.5231538664801047e-05, "loss": 2.5067211151123048, "memory(GiB)": 77.56, "step": 77605, "token_acc": 0.47962382445141066, "train_speed(iter/s)": 1.436984 }, { "epoch": 3.3250503406023735, "grad_norm": 6.355237007141113, "learning_rate": 2.5225692852927913e-05, "loss": 2.354112243652344, "memory(GiB)": 77.56, "step": 77610, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.436993 }, { "epoch": 3.3252645559316223, "grad_norm": 6.199345588684082, "learning_rate": 2.5219847489864606e-05, "loss": 2.480083465576172, "memory(GiB)": 77.56, "step": 77615, "token_acc": 0.501628664495114, "train_speed(iter/s)": 1.437016 }, { "epoch": 3.3254787712608715, "grad_norm": 6.4629926681518555, "learning_rate": 2.521400257571701e-05, "loss": 2.1230697631835938, "memory(GiB)": 77.56, "step": 77620, "token_acc": 0.5266903914590747, "train_speed(iter/s)": 1.437017 }, { "epoch": 3.3256929865901204, "grad_norm": 6.372697830200195, "learning_rate": 2.5208158110591006e-05, "loss": 2.2228771209716798, "memory(GiB)": 77.56, "step": 77625, "token_acc": 0.532319391634981, "train_speed(iter/s)": 1.437027 }, { "epoch": 3.325907201919369, "grad_norm": 6.506839752197266, "learning_rate": 2.5202314094592478e-05, "loss": 2.265080451965332, "memory(GiB)": 77.56, "step": 77630, "token_acc": 0.5114503816793893, "train_speed(iter/s)": 1.437016 }, { "epoch": 3.3261214172486184, "grad_norm": 5.440707683563232, "learning_rate": 2.519647052782727e-05, "loss": 2.350636291503906, "memory(GiB)": 77.56, "step": 77635, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.437017 }, { "epoch": 3.3263356325778672, "grad_norm": 7.022124767303467, "learning_rate": 2.519062741040129e-05, "loss": 2.264799880981445, "memory(GiB)": 77.56, "step": 77640, "token_acc": 0.5, "train_speed(iter/s)": 1.437026 }, { "epoch": 3.326549847907116, "grad_norm": 6.467678546905518, "learning_rate": 2.5184784742420342e-05, "loss": 2.4305543899536133, "memory(GiB)": 77.56, "step": 77645, "token_acc": 0.4896142433234421, "train_speed(iter/s)": 1.437042 }, { "epoch": 3.3267640632363653, "grad_norm": 6.843394756317139, "learning_rate": 2.5178942523990324e-05, "loss": 2.4796756744384765, "memory(GiB)": 77.56, "step": 77650, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.437054 }, { "epoch": 3.326978278565614, "grad_norm": 7.019276142120361, "learning_rate": 2.5173100755217037e-05, "loss": 2.522543716430664, "memory(GiB)": 77.56, "step": 77655, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.437071 }, { "epoch": 3.327192493894863, "grad_norm": 6.105474948883057, "learning_rate": 2.5167259436206325e-05, "loss": 2.2438526153564453, "memory(GiB)": 77.56, "step": 77660, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.437085 }, { "epoch": 3.327406709224112, "grad_norm": 7.286857604980469, "learning_rate": 2.5161418567063983e-05, "loss": 2.4137947082519533, "memory(GiB)": 77.56, "step": 77665, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.437077 }, { "epoch": 3.327620924553361, "grad_norm": 4.995393753051758, "learning_rate": 2.5155578147895862e-05, "loss": 2.5548038482666016, "memory(GiB)": 77.56, "step": 77670, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.437102 }, { "epoch": 3.32783513988261, "grad_norm": 5.823831081390381, "learning_rate": 2.5149738178807745e-05, "loss": 2.384229278564453, "memory(GiB)": 77.56, "step": 77675, "token_acc": 0.5115606936416185, "train_speed(iter/s)": 1.437127 }, { "epoch": 3.328049355211859, "grad_norm": 4.5682806968688965, "learning_rate": 2.5143898659905442e-05, "loss": 2.3219558715820314, "memory(GiB)": 77.56, "step": 77680, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.43712 }, { "epoch": 3.328263570541108, "grad_norm": 6.711953639984131, "learning_rate": 2.5138059591294727e-05, "loss": 2.207210922241211, "memory(GiB)": 77.56, "step": 77685, "token_acc": 0.492, "train_speed(iter/s)": 1.437116 }, { "epoch": 3.3284777858703567, "grad_norm": 4.569845676422119, "learning_rate": 2.513222097308138e-05, "loss": 2.1944902420043944, "memory(GiB)": 77.56, "step": 77690, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.437127 }, { "epoch": 3.328692001199606, "grad_norm": 4.993614673614502, "learning_rate": 2.512638280537117e-05, "loss": 2.3164234161376953, "memory(GiB)": 77.56, "step": 77695, "token_acc": 0.4764705882352941, "train_speed(iter/s)": 1.437117 }, { "epoch": 3.3289062165288548, "grad_norm": 5.47150182723999, "learning_rate": 2.5120545088269877e-05, "loss": 2.5082813262939454, "memory(GiB)": 77.56, "step": 77700, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.437135 }, { "epoch": 3.3291204318581036, "grad_norm": 5.957233428955078, "learning_rate": 2.5114707821883253e-05, "loss": 2.559680938720703, "memory(GiB)": 77.56, "step": 77705, "token_acc": 0.46875, "train_speed(iter/s)": 1.43714 }, { "epoch": 3.329334647187353, "grad_norm": 8.189870834350586, "learning_rate": 2.5108871006317046e-05, "loss": 2.5916494369506835, "memory(GiB)": 77.56, "step": 77710, "token_acc": 0.47570332480818417, "train_speed(iter/s)": 1.43713 }, { "epoch": 3.3295488625166016, "grad_norm": 6.239110469818115, "learning_rate": 2.510303464167698e-05, "loss": 2.6648834228515623, "memory(GiB)": 77.56, "step": 77715, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.437139 }, { "epoch": 3.3297630778458505, "grad_norm": 5.674903869628906, "learning_rate": 2.509719872806878e-05, "loss": 2.4365608215332033, "memory(GiB)": 77.56, "step": 77720, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.437147 }, { "epoch": 3.3299772931750997, "grad_norm": 6.108583927154541, "learning_rate": 2.5091363265598196e-05, "loss": 2.6104385375976564, "memory(GiB)": 77.56, "step": 77725, "token_acc": 0.46048109965635736, "train_speed(iter/s)": 1.437157 }, { "epoch": 3.3301915085043485, "grad_norm": 4.625551223754883, "learning_rate": 2.5085528254370942e-05, "loss": 2.2240320205688477, "memory(GiB)": 77.56, "step": 77730, "token_acc": 0.5015673981191222, "train_speed(iter/s)": 1.437143 }, { "epoch": 3.3304057238335973, "grad_norm": 5.3962554931640625, "learning_rate": 2.5079693694492722e-05, "loss": 2.283380317687988, "memory(GiB)": 77.56, "step": 77735, "token_acc": 0.5220338983050847, "train_speed(iter/s)": 1.437143 }, { "epoch": 3.3306199391628466, "grad_norm": 6.390612602233887, "learning_rate": 2.507385958606922e-05, "loss": 2.451319122314453, "memory(GiB)": 77.56, "step": 77740, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.437138 }, { "epoch": 3.3308341544920954, "grad_norm": 6.468560695648193, "learning_rate": 2.506802592920614e-05, "loss": 2.5463306427001955, "memory(GiB)": 77.56, "step": 77745, "token_acc": 0.46, "train_speed(iter/s)": 1.437146 }, { "epoch": 3.3310483698213442, "grad_norm": 5.827782154083252, "learning_rate": 2.506219272400916e-05, "loss": 2.2985111236572267, "memory(GiB)": 77.56, "step": 77750, "token_acc": 0.5450980392156862, "train_speed(iter/s)": 1.437164 }, { "epoch": 3.3312625851505935, "grad_norm": 6.416467189788818, "learning_rate": 2.5056359970583925e-05, "loss": 2.2989683151245117, "memory(GiB)": 77.56, "step": 77755, "token_acc": 0.519434628975265, "train_speed(iter/s)": 1.437168 }, { "epoch": 3.3314768004798423, "grad_norm": 6.548943996429443, "learning_rate": 2.505052766903615e-05, "loss": 2.1714780807495115, "memory(GiB)": 77.56, "step": 77760, "token_acc": 0.5131086142322098, "train_speed(iter/s)": 1.437163 }, { "epoch": 3.331691015809091, "grad_norm": 4.737626075744629, "learning_rate": 2.5044695819471475e-05, "loss": 2.2847221374511717, "memory(GiB)": 77.56, "step": 77765, "token_acc": 0.5266272189349113, "train_speed(iter/s)": 1.437167 }, { "epoch": 3.3319052311383404, "grad_norm": 5.4486284255981445, "learning_rate": 2.5038864421995545e-05, "loss": 1.9878698348999024, "memory(GiB)": 77.56, "step": 77770, "token_acc": 0.5418181818181819, "train_speed(iter/s)": 1.437172 }, { "epoch": 3.332119446467589, "grad_norm": 5.901936054229736, "learning_rate": 2.5033033476713998e-05, "loss": 2.3245212554931642, "memory(GiB)": 77.56, "step": 77775, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.437131 }, { "epoch": 3.332333661796838, "grad_norm": 5.089118003845215, "learning_rate": 2.5027202983732456e-05, "loss": 1.9117870330810547, "memory(GiB)": 77.56, "step": 77780, "token_acc": 0.5826446280991735, "train_speed(iter/s)": 1.437137 }, { "epoch": 3.3325478771260872, "grad_norm": 4.50844144821167, "learning_rate": 2.5021372943156575e-05, "loss": 2.2879833221435546, "memory(GiB)": 77.56, "step": 77785, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.437146 }, { "epoch": 3.332762092455336, "grad_norm": 5.373756408691406, "learning_rate": 2.501554335509195e-05, "loss": 2.2437599182128904, "memory(GiB)": 77.56, "step": 77790, "token_acc": 0.511400651465798, "train_speed(iter/s)": 1.437152 }, { "epoch": 3.332976307784585, "grad_norm": 5.752211570739746, "learning_rate": 2.5009714219644175e-05, "loss": 2.316151237487793, "memory(GiB)": 77.56, "step": 77795, "token_acc": 0.4898785425101215, "train_speed(iter/s)": 1.437155 }, { "epoch": 3.333190523113834, "grad_norm": 6.007317066192627, "learning_rate": 2.5003885536918897e-05, "loss": 2.331071662902832, "memory(GiB)": 77.56, "step": 77800, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.437149 }, { "epoch": 3.333404738443083, "grad_norm": 8.905684471130371, "learning_rate": 2.4998057307021677e-05, "loss": 2.60831298828125, "memory(GiB)": 77.56, "step": 77805, "token_acc": 0.4725609756097561, "train_speed(iter/s)": 1.437148 }, { "epoch": 3.3336189537723317, "grad_norm": 7.32859468460083, "learning_rate": 2.4992229530058087e-05, "loss": 2.297551727294922, "memory(GiB)": 77.56, "step": 77810, "token_acc": 0.5341365461847389, "train_speed(iter/s)": 1.437142 }, { "epoch": 3.333833169101581, "grad_norm": 5.975539684295654, "learning_rate": 2.498640220613373e-05, "loss": 2.313236427307129, "memory(GiB)": 77.56, "step": 77815, "token_acc": 0.5112994350282486, "train_speed(iter/s)": 1.437141 }, { "epoch": 3.33404738443083, "grad_norm": 5.545083045959473, "learning_rate": 2.498057533535417e-05, "loss": 2.289481544494629, "memory(GiB)": 77.56, "step": 77820, "token_acc": 0.5079872204472844, "train_speed(iter/s)": 1.437156 }, { "epoch": 3.3342615997600786, "grad_norm": 7.65740966796875, "learning_rate": 2.4974748917824954e-05, "loss": 2.2598876953125, "memory(GiB)": 77.56, "step": 77825, "token_acc": 0.5368852459016393, "train_speed(iter/s)": 1.437152 }, { "epoch": 3.334475815089328, "grad_norm": 5.546942234039307, "learning_rate": 2.4968922953651635e-05, "loss": 2.283573341369629, "memory(GiB)": 77.56, "step": 77830, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.43715 }, { "epoch": 3.3346900304185767, "grad_norm": 6.100461959838867, "learning_rate": 2.496309744293976e-05, "loss": 2.337263488769531, "memory(GiB)": 77.56, "step": 77835, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.437171 }, { "epoch": 3.3349042457478255, "grad_norm": 5.265915870666504, "learning_rate": 2.495727238579484e-05, "loss": 2.141219711303711, "memory(GiB)": 77.56, "step": 77840, "token_acc": 0.556390977443609, "train_speed(iter/s)": 1.437176 }, { "epoch": 3.3351184610770748, "grad_norm": 5.829535484313965, "learning_rate": 2.495144778232244e-05, "loss": 2.5924875259399416, "memory(GiB)": 77.56, "step": 77845, "token_acc": 0.4746268656716418, "train_speed(iter/s)": 1.437192 }, { "epoch": 3.3353326764063236, "grad_norm": 5.743008613586426, "learning_rate": 2.4945623632628068e-05, "loss": 2.428538131713867, "memory(GiB)": 77.56, "step": 77850, "token_acc": 0.49014084507042255, "train_speed(iter/s)": 1.437194 }, { "epoch": 3.3355468917355724, "grad_norm": 5.537662029266357, "learning_rate": 2.4939799936817222e-05, "loss": 2.6278318405151366, "memory(GiB)": 77.56, "step": 77855, "token_acc": 0.4631578947368421, "train_speed(iter/s)": 1.437185 }, { "epoch": 3.3357611070648217, "grad_norm": 6.905949592590332, "learning_rate": 2.493397669499541e-05, "loss": 2.5519298553466796, "memory(GiB)": 77.56, "step": 77860, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.437167 }, { "epoch": 3.3359753223940705, "grad_norm": 7.764893531799316, "learning_rate": 2.4928153907268104e-05, "loss": 2.425844192504883, "memory(GiB)": 77.56, "step": 77865, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 1.437178 }, { "epoch": 3.3361895377233193, "grad_norm": 6.55760383605957, "learning_rate": 2.4922331573740808e-05, "loss": 2.3337722778320313, "memory(GiB)": 77.56, "step": 77870, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.437154 }, { "epoch": 3.3364037530525685, "grad_norm": 7.696887493133545, "learning_rate": 2.491650969451902e-05, "loss": 2.0694936752319335, "memory(GiB)": 77.56, "step": 77875, "token_acc": 0.5202952029520295, "train_speed(iter/s)": 1.437177 }, { "epoch": 3.3366179683818173, "grad_norm": 6.950080394744873, "learning_rate": 2.4910688269708195e-05, "loss": 2.245351219177246, "memory(GiB)": 77.56, "step": 77880, "token_acc": 0.5017064846416383, "train_speed(iter/s)": 1.437179 }, { "epoch": 3.336832183711066, "grad_norm": 5.977203369140625, "learning_rate": 2.4904867299413783e-05, "loss": 2.2859540939331056, "memory(GiB)": 77.56, "step": 77885, "token_acc": 0.5176470588235295, "train_speed(iter/s)": 1.437181 }, { "epoch": 3.3370463990403154, "grad_norm": 6.971149444580078, "learning_rate": 2.4899046783741235e-05, "loss": 2.1541297912597654, "memory(GiB)": 77.56, "step": 77890, "token_acc": 0.5214285714285715, "train_speed(iter/s)": 1.437183 }, { "epoch": 3.3372606143695642, "grad_norm": 6.053440093994141, "learning_rate": 2.4893226722796008e-05, "loss": 2.4205205917358397, "memory(GiB)": 77.56, "step": 77895, "token_acc": 0.4790996784565916, "train_speed(iter/s)": 1.437206 }, { "epoch": 3.337474829698813, "grad_norm": 6.8531880378723145, "learning_rate": 2.4887407116683516e-05, "loss": 2.202609062194824, "memory(GiB)": 77.56, "step": 77900, "token_acc": 0.48535564853556484, "train_speed(iter/s)": 1.437195 }, { "epoch": 3.3376890450280623, "grad_norm": 5.678084850311279, "learning_rate": 2.488158796550921e-05, "loss": 2.300441360473633, "memory(GiB)": 77.56, "step": 77905, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.437164 }, { "epoch": 3.337903260357311, "grad_norm": 5.907774925231934, "learning_rate": 2.487576926937851e-05, "loss": 1.8783260345458985, "memory(GiB)": 77.56, "step": 77910, "token_acc": 0.5447470817120622, "train_speed(iter/s)": 1.437174 }, { "epoch": 3.33811747568656, "grad_norm": 6.085709095001221, "learning_rate": 2.4869951028396813e-05, "loss": 2.2033203125, "memory(GiB)": 77.56, "step": 77915, "token_acc": 0.5315985130111525, "train_speed(iter/s)": 1.43718 }, { "epoch": 3.338331691015809, "grad_norm": 7.500263690948486, "learning_rate": 2.486413324266953e-05, "loss": 2.3250667572021486, "memory(GiB)": 77.56, "step": 77920, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.437173 }, { "epoch": 3.338545906345058, "grad_norm": 5.4479241371154785, "learning_rate": 2.4858315912302033e-05, "loss": 2.252872848510742, "memory(GiB)": 77.56, "step": 77925, "token_acc": 0.5, "train_speed(iter/s)": 1.437164 }, { "epoch": 3.338760121674307, "grad_norm": 5.195981502532959, "learning_rate": 2.4852499037399747e-05, "loss": 2.331142044067383, "memory(GiB)": 77.56, "step": 77930, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.437185 }, { "epoch": 3.338974337003556, "grad_norm": 7.888880252838135, "learning_rate": 2.484668261806803e-05, "loss": 2.3773981094360352, "memory(GiB)": 77.56, "step": 77935, "token_acc": 0.4979757085020243, "train_speed(iter/s)": 1.437209 }, { "epoch": 3.339188552332805, "grad_norm": 7.641469955444336, "learning_rate": 2.4840866654412232e-05, "loss": 2.4169261932373045, "memory(GiB)": 77.56, "step": 77940, "token_acc": 0.44192634560906513, "train_speed(iter/s)": 1.437231 }, { "epoch": 3.3394027676620537, "grad_norm": 5.84895658493042, "learning_rate": 2.483505114653776e-05, "loss": 2.359182357788086, "memory(GiB)": 77.56, "step": 77945, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.437234 }, { "epoch": 3.339616982991303, "grad_norm": 6.297906398773193, "learning_rate": 2.482923609454994e-05, "loss": 2.5064010620117188, "memory(GiB)": 77.56, "step": 77950, "token_acc": 0.48493975903614456, "train_speed(iter/s)": 1.437223 }, { "epoch": 3.3398311983205518, "grad_norm": 5.065043926239014, "learning_rate": 2.48234214985541e-05, "loss": 2.597334098815918, "memory(GiB)": 77.56, "step": 77955, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.437235 }, { "epoch": 3.3400454136498006, "grad_norm": 7.707672595977783, "learning_rate": 2.4817607358655614e-05, "loss": 2.3225643157958986, "memory(GiB)": 77.56, "step": 77960, "token_acc": 0.4977973568281938, "train_speed(iter/s)": 1.437217 }, { "epoch": 3.34025962897905, "grad_norm": 7.994879722595215, "learning_rate": 2.48117936749598e-05, "loss": 2.471352958679199, "memory(GiB)": 77.56, "step": 77965, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.43724 }, { "epoch": 3.3404738443082986, "grad_norm": 6.065828323364258, "learning_rate": 2.480598044757197e-05, "loss": 2.297563934326172, "memory(GiB)": 77.56, "step": 77970, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.43724 }, { "epoch": 3.3406880596375474, "grad_norm": 6.3979010581970215, "learning_rate": 2.4800167676597436e-05, "loss": 2.1008651733398436, "memory(GiB)": 77.56, "step": 77975, "token_acc": 0.5643939393939394, "train_speed(iter/s)": 1.437231 }, { "epoch": 3.3409022749667967, "grad_norm": 6.603244781494141, "learning_rate": 2.4794355362141508e-05, "loss": 2.4242521286010743, "memory(GiB)": 77.56, "step": 77980, "token_acc": 0.5, "train_speed(iter/s)": 1.437237 }, { "epoch": 3.3411164902960455, "grad_norm": 5.314538955688477, "learning_rate": 2.4788543504309454e-05, "loss": 2.4956085205078127, "memory(GiB)": 77.56, "step": 77985, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.437265 }, { "epoch": 3.3413307056252943, "grad_norm": 6.049429416656494, "learning_rate": 2.4782732103206607e-05, "loss": 2.0845806121826174, "memory(GiB)": 77.56, "step": 77990, "token_acc": 0.5681818181818182, "train_speed(iter/s)": 1.437261 }, { "epoch": 3.3415449209545436, "grad_norm": 5.353680610656738, "learning_rate": 2.4776921158938222e-05, "loss": 2.4130149841308595, "memory(GiB)": 77.56, "step": 77995, "token_acc": 0.47278911564625853, "train_speed(iter/s)": 1.437271 }, { "epoch": 3.3417591362837924, "grad_norm": 7.042940139770508, "learning_rate": 2.4771110671609573e-05, "loss": 2.3023122787475585, "memory(GiB)": 77.56, "step": 78000, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.43729 }, { "epoch": 3.3417591362837924, "eval_loss": 2.0868334770202637, "eval_runtime": 14.2158, "eval_samples_per_second": 7.034, "eval_steps_per_second": 7.034, "eval_token_acc": 0.4759556103575832, "step": 78000 }, { "epoch": 3.341973351613041, "grad_norm": 5.571708679199219, "learning_rate": 2.4765300641325915e-05, "loss": 2.2235841751098633, "memory(GiB)": 77.56, "step": 78005, "token_acc": 0.49167397020157755, "train_speed(iter/s)": 1.436902 }, { "epoch": 3.3421875669422905, "grad_norm": 6.3985772132873535, "learning_rate": 2.4759491068192496e-05, "loss": 2.1793134689331053, "memory(GiB)": 77.56, "step": 78010, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.436896 }, { "epoch": 3.3424017822715393, "grad_norm": 6.703181266784668, "learning_rate": 2.4753681952314573e-05, "loss": 2.1313396453857423, "memory(GiB)": 77.56, "step": 78015, "token_acc": 0.5513307984790875, "train_speed(iter/s)": 1.436896 }, { "epoch": 3.342615997600788, "grad_norm": 7.6307220458984375, "learning_rate": 2.4747873293797396e-05, "loss": 2.4202816009521486, "memory(GiB)": 77.56, "step": 78020, "token_acc": 0.4657534246575342, "train_speed(iter/s)": 1.436907 }, { "epoch": 3.3428302129300373, "grad_norm": 4.953913688659668, "learning_rate": 2.474206509274619e-05, "loss": 2.2251657485961913, "memory(GiB)": 77.56, "step": 78025, "token_acc": 0.50997150997151, "train_speed(iter/s)": 1.436888 }, { "epoch": 3.343044428259286, "grad_norm": 4.519009590148926, "learning_rate": 2.4736257349266167e-05, "loss": 1.9159467697143555, "memory(GiB)": 77.56, "step": 78030, "token_acc": 0.5593869731800766, "train_speed(iter/s)": 1.436896 }, { "epoch": 3.343258643588535, "grad_norm": 5.637549877166748, "learning_rate": 2.473045006346254e-05, "loss": 2.184259033203125, "memory(GiB)": 77.56, "step": 78035, "token_acc": 0.4781021897810219, "train_speed(iter/s)": 1.436914 }, { "epoch": 3.3434728589177842, "grad_norm": 6.153614521026611, "learning_rate": 2.472464323544052e-05, "loss": 2.3234369277954103, "memory(GiB)": 77.56, "step": 78040, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.436913 }, { "epoch": 3.343687074247033, "grad_norm": 6.867588996887207, "learning_rate": 2.4718836865305274e-05, "loss": 2.067275047302246, "memory(GiB)": 77.56, "step": 78045, "token_acc": 0.5478927203065134, "train_speed(iter/s)": 1.436905 }, { "epoch": 3.343901289576282, "grad_norm": 5.022627830505371, "learning_rate": 2.4713030953162036e-05, "loss": 2.133124351501465, "memory(GiB)": 77.56, "step": 78050, "token_acc": 0.48046875, "train_speed(iter/s)": 1.436919 }, { "epoch": 3.344115504905531, "grad_norm": 6.870769500732422, "learning_rate": 2.470722549911596e-05, "loss": 2.483769989013672, "memory(GiB)": 77.56, "step": 78055, "token_acc": 0.4767932489451477, "train_speed(iter/s)": 1.436932 }, { "epoch": 3.34432972023478, "grad_norm": 4.614297389984131, "learning_rate": 2.470142050327222e-05, "loss": 1.9196325302124024, "memory(GiB)": 77.56, "step": 78060, "token_acc": 0.5791245791245792, "train_speed(iter/s)": 1.436951 }, { "epoch": 3.3445439355640287, "grad_norm": 5.502196311950684, "learning_rate": 2.4695615965735984e-05, "loss": 2.3422874450683593, "memory(GiB)": 77.56, "step": 78065, "token_acc": 0.5286624203821656, "train_speed(iter/s)": 1.436971 }, { "epoch": 3.344758150893278, "grad_norm": 7.608879566192627, "learning_rate": 2.468981188661238e-05, "loss": 2.381334686279297, "memory(GiB)": 77.56, "step": 78070, "token_acc": 0.48787878787878786, "train_speed(iter/s)": 1.436987 }, { "epoch": 3.344972366222527, "grad_norm": 5.68510103225708, "learning_rate": 2.46840082660066e-05, "loss": 2.297234535217285, "memory(GiB)": 77.56, "step": 78075, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 1.436998 }, { "epoch": 3.3451865815517756, "grad_norm": 5.46876859664917, "learning_rate": 2.467820510402375e-05, "loss": 2.5136638641357423, "memory(GiB)": 77.56, "step": 78080, "token_acc": 0.49008498583569404, "train_speed(iter/s)": 1.437005 }, { "epoch": 3.345400796881025, "grad_norm": 5.594293594360352, "learning_rate": 2.4672402400768973e-05, "loss": 2.059544563293457, "memory(GiB)": 77.56, "step": 78085, "token_acc": 0.5551020408163265, "train_speed(iter/s)": 1.437011 }, { "epoch": 3.3456150122102737, "grad_norm": 5.572644233703613, "learning_rate": 2.466660015634737e-05, "loss": 2.262191581726074, "memory(GiB)": 77.56, "step": 78090, "token_acc": 0.5033783783783784, "train_speed(iter/s)": 1.437005 }, { "epoch": 3.3458292275395225, "grad_norm": 5.715251922607422, "learning_rate": 2.4660798370864086e-05, "loss": 2.2446441650390625, "memory(GiB)": 77.56, "step": 78095, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.437019 }, { "epoch": 3.3460434428687718, "grad_norm": 5.06517219543457, "learning_rate": 2.465499704442419e-05, "loss": 2.350856399536133, "memory(GiB)": 77.56, "step": 78100, "token_acc": 0.48286604361370716, "train_speed(iter/s)": 1.437038 }, { "epoch": 3.3462576581980206, "grad_norm": 6.508033752441406, "learning_rate": 2.4649196177132818e-05, "loss": 2.49069766998291, "memory(GiB)": 77.56, "step": 78105, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.43703 }, { "epoch": 3.3464718735272694, "grad_norm": 5.585472583770752, "learning_rate": 2.4643395769095035e-05, "loss": 2.807998847961426, "memory(GiB)": 77.56, "step": 78110, "token_acc": 0.4024767801857585, "train_speed(iter/s)": 1.437029 }, { "epoch": 3.3466860888565186, "grad_norm": 6.244565486907959, "learning_rate": 2.4637595820415925e-05, "loss": 2.553742218017578, "memory(GiB)": 77.56, "step": 78115, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.437013 }, { "epoch": 3.3469003041857674, "grad_norm": 4.824309825897217, "learning_rate": 2.4631796331200564e-05, "loss": 2.383909225463867, "memory(GiB)": 77.56, "step": 78120, "token_acc": 0.44516129032258067, "train_speed(iter/s)": 1.437008 }, { "epoch": 3.3471145195150163, "grad_norm": 5.161762237548828, "learning_rate": 2.4625997301554005e-05, "loss": 2.1975996017456056, "memory(GiB)": 77.56, "step": 78125, "token_acc": 0.5119047619047619, "train_speed(iter/s)": 1.437006 }, { "epoch": 3.3473287348442655, "grad_norm": 5.7215800285339355, "learning_rate": 2.462019873158129e-05, "loss": 2.4382396697998048, "memory(GiB)": 77.56, "step": 78130, "token_acc": 0.5, "train_speed(iter/s)": 1.436999 }, { "epoch": 3.3475429501735143, "grad_norm": 5.0890727043151855, "learning_rate": 2.46144006213875e-05, "loss": 2.512197494506836, "memory(GiB)": 77.56, "step": 78135, "token_acc": 0.535031847133758, "train_speed(iter/s)": 1.437003 }, { "epoch": 3.347757165502763, "grad_norm": 8.426888465881348, "learning_rate": 2.460860297107766e-05, "loss": 2.3685791015625, "memory(GiB)": 77.56, "step": 78140, "token_acc": 0.5252918287937743, "train_speed(iter/s)": 1.436993 }, { "epoch": 3.3479713808320124, "grad_norm": 6.038755893707275, "learning_rate": 2.4602805780756795e-05, "loss": 2.1319467544555666, "memory(GiB)": 77.56, "step": 78145, "token_acc": 0.5353535353535354, "train_speed(iter/s)": 1.436972 }, { "epoch": 3.348185596161261, "grad_norm": 5.103981971740723, "learning_rate": 2.459700905052993e-05, "loss": 2.464715576171875, "memory(GiB)": 77.56, "step": 78150, "token_acc": 0.4608150470219436, "train_speed(iter/s)": 1.436983 }, { "epoch": 3.34839981149051, "grad_norm": 6.755388259887695, "learning_rate": 2.459121278050205e-05, "loss": 2.696822929382324, "memory(GiB)": 77.56, "step": 78155, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.436985 }, { "epoch": 3.3486140268197593, "grad_norm": 5.507720470428467, "learning_rate": 2.4585416970778207e-05, "loss": 2.312678337097168, "memory(GiB)": 77.56, "step": 78160, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.436984 }, { "epoch": 3.348828242149008, "grad_norm": 6.923756122589111, "learning_rate": 2.4579621621463362e-05, "loss": 2.3618371963500975, "memory(GiB)": 77.56, "step": 78165, "token_acc": 0.5077881619937694, "train_speed(iter/s)": 1.436966 }, { "epoch": 3.349042457478257, "grad_norm": 5.579107761383057, "learning_rate": 2.4573826732662537e-05, "loss": 2.170520210266113, "memory(GiB)": 77.56, "step": 78170, "token_acc": 0.5477941176470589, "train_speed(iter/s)": 1.436948 }, { "epoch": 3.349256672807506, "grad_norm": 6.676364421844482, "learning_rate": 2.4568032304480687e-05, "loss": 2.558653450012207, "memory(GiB)": 77.56, "step": 78175, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.436948 }, { "epoch": 3.349470888136755, "grad_norm": 4.242257118225098, "learning_rate": 2.4562238337022793e-05, "loss": 2.0917230606079102, "memory(GiB)": 77.56, "step": 78180, "token_acc": 0.5470383275261324, "train_speed(iter/s)": 1.436916 }, { "epoch": 3.349685103466004, "grad_norm": 6.848104953765869, "learning_rate": 2.455644483039381e-05, "loss": 2.4666683197021486, "memory(GiB)": 77.56, "step": 78185, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 1.436938 }, { "epoch": 3.349899318795253, "grad_norm": 7.107452869415283, "learning_rate": 2.455065178469868e-05, "loss": 2.505731964111328, "memory(GiB)": 77.56, "step": 78190, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.436943 }, { "epoch": 3.350113534124502, "grad_norm": 7.527360916137695, "learning_rate": 2.4544859200042386e-05, "loss": 2.4773880004882813, "memory(GiB)": 77.56, "step": 78195, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.436947 }, { "epoch": 3.3503277494537507, "grad_norm": 5.069779396057129, "learning_rate": 2.4539067076529847e-05, "loss": 2.4517032623291017, "memory(GiB)": 77.56, "step": 78200, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.436958 }, { "epoch": 3.350541964783, "grad_norm": 6.344890117645264, "learning_rate": 2.4533275414265992e-05, "loss": 2.4281055450439455, "memory(GiB)": 77.56, "step": 78205, "token_acc": 0.4819672131147541, "train_speed(iter/s)": 1.436947 }, { "epoch": 3.3507561801122487, "grad_norm": 6.175689220428467, "learning_rate": 2.452748421335574e-05, "loss": 2.127104568481445, "memory(GiB)": 77.56, "step": 78210, "token_acc": 0.5390625, "train_speed(iter/s)": 1.436958 }, { "epoch": 3.3509703954414976, "grad_norm": 5.342073440551758, "learning_rate": 2.452169347390399e-05, "loss": 2.346451759338379, "memory(GiB)": 77.56, "step": 78215, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.436937 }, { "epoch": 3.351184610770747, "grad_norm": 7.146237850189209, "learning_rate": 2.4515903196015684e-05, "loss": 2.3414581298828123, "memory(GiB)": 77.56, "step": 78220, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 1.436933 }, { "epoch": 3.3513988260999956, "grad_norm": 6.014937877655029, "learning_rate": 2.4510113379795696e-05, "loss": 2.262928009033203, "memory(GiB)": 77.56, "step": 78225, "token_acc": 0.5, "train_speed(iter/s)": 1.436936 }, { "epoch": 3.3516130414292444, "grad_norm": 5.9865570068359375, "learning_rate": 2.4504324025348912e-05, "loss": 2.5818096160888673, "memory(GiB)": 77.56, "step": 78230, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 1.436936 }, { "epoch": 3.3518272567584937, "grad_norm": 6.626688003540039, "learning_rate": 2.44985351327802e-05, "loss": 2.4460886001586912, "memory(GiB)": 77.56, "step": 78235, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.43694 }, { "epoch": 3.3520414720877425, "grad_norm": 5.517338275909424, "learning_rate": 2.4492746702194463e-05, "loss": 2.3007774353027344, "memory(GiB)": 77.56, "step": 78240, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.436961 }, { "epoch": 3.3522556874169913, "grad_norm": 5.2339186668396, "learning_rate": 2.448695873369653e-05, "loss": 2.563678169250488, "memory(GiB)": 77.56, "step": 78245, "token_acc": 0.4517133956386293, "train_speed(iter/s)": 1.436973 }, { "epoch": 3.3524699027462406, "grad_norm": 6.285702705383301, "learning_rate": 2.4481171227391293e-05, "loss": 2.2242761611938477, "memory(GiB)": 77.56, "step": 78250, "token_acc": 0.4752851711026616, "train_speed(iter/s)": 1.436974 }, { "epoch": 3.3526841180754894, "grad_norm": 6.033846855163574, "learning_rate": 2.4475384183383577e-05, "loss": 2.1046445846557615, "memory(GiB)": 77.56, "step": 78255, "token_acc": 0.5488721804511278, "train_speed(iter/s)": 1.436976 }, { "epoch": 3.352898333404738, "grad_norm": 6.011137962341309, "learning_rate": 2.4469597601778222e-05, "loss": 2.1895471572875977, "memory(GiB)": 77.56, "step": 78260, "token_acc": 0.4622356495468278, "train_speed(iter/s)": 1.436964 }, { "epoch": 3.3531125487339875, "grad_norm": 5.461022853851318, "learning_rate": 2.446381148268005e-05, "loss": 2.343461608886719, "memory(GiB)": 77.56, "step": 78265, "token_acc": 0.4405144694533762, "train_speed(iter/s)": 1.436953 }, { "epoch": 3.3533267640632363, "grad_norm": 8.106976509094238, "learning_rate": 2.445802582619389e-05, "loss": 2.473061752319336, "memory(GiB)": 77.56, "step": 78270, "token_acc": 0.43824701195219123, "train_speed(iter/s)": 1.436983 }, { "epoch": 3.353540979392485, "grad_norm": 5.615047454833984, "learning_rate": 2.4452240632424538e-05, "loss": 2.2097375869750975, "memory(GiB)": 77.56, "step": 78275, "token_acc": 0.5352941176470588, "train_speed(iter/s)": 1.436975 }, { "epoch": 3.3537551947217343, "grad_norm": 8.703311920166016, "learning_rate": 2.4446455901476828e-05, "loss": 2.5292709350585936, "memory(GiB)": 77.56, "step": 78280, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.436964 }, { "epoch": 3.353969410050983, "grad_norm": 5.164878845214844, "learning_rate": 2.4440671633455543e-05, "loss": 2.130105209350586, "memory(GiB)": 77.56, "step": 78285, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.436954 }, { "epoch": 3.3541836253802324, "grad_norm": 4.742149353027344, "learning_rate": 2.4434887828465463e-05, "loss": 2.4441783905029295, "memory(GiB)": 77.56, "step": 78290, "token_acc": 0.5072046109510087, "train_speed(iter/s)": 1.436975 }, { "epoch": 3.354397840709481, "grad_norm": 7.378182411193848, "learning_rate": 2.4429104486611376e-05, "loss": 2.41677188873291, "memory(GiB)": 77.56, "step": 78295, "token_acc": 0.47988505747126436, "train_speed(iter/s)": 1.436975 }, { "epoch": 3.35461205603873, "grad_norm": 5.843394756317139, "learning_rate": 2.4423321607998028e-05, "loss": 2.53665771484375, "memory(GiB)": 77.56, "step": 78300, "token_acc": 0.48044692737430167, "train_speed(iter/s)": 1.436968 }, { "epoch": 3.3548262713679793, "grad_norm": 6.190471649169922, "learning_rate": 2.4417539192730226e-05, "loss": 2.4405431747436523, "memory(GiB)": 77.56, "step": 78305, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.436981 }, { "epoch": 3.355040486697228, "grad_norm": 5.517795085906982, "learning_rate": 2.4411757240912675e-05, "loss": 2.4316272735595703, "memory(GiB)": 77.56, "step": 78310, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.436988 }, { "epoch": 3.355254702026477, "grad_norm": 5.0164995193481445, "learning_rate": 2.4405975752650168e-05, "loss": 2.506674575805664, "memory(GiB)": 77.56, "step": 78315, "token_acc": 0.48031496062992124, "train_speed(iter/s)": 1.436984 }, { "epoch": 3.355468917355726, "grad_norm": 5.901783466339111, "learning_rate": 2.4400194728047414e-05, "loss": 2.279845428466797, "memory(GiB)": 77.56, "step": 78320, "token_acc": 0.5140562248995983, "train_speed(iter/s)": 1.436978 }, { "epoch": 3.355683132684975, "grad_norm": 5.524072170257568, "learning_rate": 2.4394414167209152e-05, "loss": 2.553447151184082, "memory(GiB)": 77.56, "step": 78325, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.436979 }, { "epoch": 3.355897348014224, "grad_norm": 7.261928081512451, "learning_rate": 2.4388634070240097e-05, "loss": 2.4628835678100587, "memory(GiB)": 77.56, "step": 78330, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.436966 }, { "epoch": 3.356111563343473, "grad_norm": 5.276325702667236, "learning_rate": 2.438285443724494e-05, "loss": 2.3215408325195312, "memory(GiB)": 77.56, "step": 78335, "token_acc": 0.49544072948328266, "train_speed(iter/s)": 1.436985 }, { "epoch": 3.356325778672722, "grad_norm": 5.752867221832275, "learning_rate": 2.4377075268328426e-05, "loss": 2.396231842041016, "memory(GiB)": 77.56, "step": 78340, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.437 }, { "epoch": 3.3565399940019707, "grad_norm": 6.296432018280029, "learning_rate": 2.437129656359523e-05, "loss": 2.437132453918457, "memory(GiB)": 77.56, "step": 78345, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.437008 }, { "epoch": 3.35675420933122, "grad_norm": 6.259218215942383, "learning_rate": 2.4365518323150037e-05, "loss": 2.417165756225586, "memory(GiB)": 77.56, "step": 78350, "token_acc": 0.4980544747081712, "train_speed(iter/s)": 1.437023 }, { "epoch": 3.3569684246604687, "grad_norm": 6.226585865020752, "learning_rate": 2.4359740547097526e-05, "loss": 2.375764083862305, "memory(GiB)": 77.56, "step": 78355, "token_acc": 0.44140625, "train_speed(iter/s)": 1.43703 }, { "epoch": 3.3571826399897176, "grad_norm": 4.719928741455078, "learning_rate": 2.435396323554235e-05, "loss": 2.0290273666381835, "memory(GiB)": 77.56, "step": 78360, "token_acc": 0.5563380281690141, "train_speed(iter/s)": 1.437035 }, { "epoch": 3.357396855318967, "grad_norm": 6.692323684692383, "learning_rate": 2.4348186388589206e-05, "loss": 2.3463871002197267, "memory(GiB)": 77.56, "step": 78365, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.437019 }, { "epoch": 3.3576110706482156, "grad_norm": 7.227686882019043, "learning_rate": 2.4342410006342732e-05, "loss": 2.2718215942382813, "memory(GiB)": 77.56, "step": 78370, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.437023 }, { "epoch": 3.3578252859774644, "grad_norm": 5.753372669219971, "learning_rate": 2.4336634088907566e-05, "loss": 2.347666931152344, "memory(GiB)": 77.56, "step": 78375, "token_acc": 0.4919093851132686, "train_speed(iter/s)": 1.437016 }, { "epoch": 3.3580395013067137, "grad_norm": 6.541168689727783, "learning_rate": 2.4330858636388348e-05, "loss": 2.3220420837402345, "memory(GiB)": 77.56, "step": 78380, "token_acc": 0.4902597402597403, "train_speed(iter/s)": 1.437021 }, { "epoch": 3.3582537166359625, "grad_norm": 6.848973274230957, "learning_rate": 2.432508364888969e-05, "loss": 2.5263412475585936, "memory(GiB)": 77.56, "step": 78385, "token_acc": 0.45394736842105265, "train_speed(iter/s)": 1.437023 }, { "epoch": 3.3584679319652113, "grad_norm": 5.124996662139893, "learning_rate": 2.431930912651622e-05, "loss": 2.5684064865112304, "memory(GiB)": 77.56, "step": 78390, "token_acc": 0.5165165165165165, "train_speed(iter/s)": 1.437033 }, { "epoch": 3.3586821472944606, "grad_norm": 7.014379024505615, "learning_rate": 2.4313535069372584e-05, "loss": 2.1359622955322264, "memory(GiB)": 77.56, "step": 78395, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.437035 }, { "epoch": 3.3588963626237094, "grad_norm": 5.489606857299805, "learning_rate": 2.430776147756335e-05, "loss": 1.9601947784423828, "memory(GiB)": 77.56, "step": 78400, "token_acc": 0.540453074433657, "train_speed(iter/s)": 1.437036 }, { "epoch": 3.359110577952958, "grad_norm": 5.671398639678955, "learning_rate": 2.4301988351193117e-05, "loss": 2.3124685287475586, "memory(GiB)": 77.56, "step": 78405, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.437025 }, { "epoch": 3.3593247932822075, "grad_norm": 5.578364372253418, "learning_rate": 2.4296215690366476e-05, "loss": 2.585572052001953, "memory(GiB)": 77.56, "step": 78410, "token_acc": 0.4753521126760563, "train_speed(iter/s)": 1.437031 }, { "epoch": 3.3595390086114563, "grad_norm": 7.403778076171875, "learning_rate": 2.4290443495188e-05, "loss": 2.462246322631836, "memory(GiB)": 77.56, "step": 78415, "token_acc": 0.49612403100775193, "train_speed(iter/s)": 1.43705 }, { "epoch": 3.359753223940705, "grad_norm": 7.052006721496582, "learning_rate": 2.4284671765762235e-05, "loss": 2.1567081451416015, "memory(GiB)": 77.56, "step": 78420, "token_acc": 0.5536912751677853, "train_speed(iter/s)": 1.437065 }, { "epoch": 3.3599674392699543, "grad_norm": 5.3216729164123535, "learning_rate": 2.427890050219378e-05, "loss": 2.389144515991211, "memory(GiB)": 77.56, "step": 78425, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.437079 }, { "epoch": 3.360181654599203, "grad_norm": 5.682316303253174, "learning_rate": 2.427312970458718e-05, "loss": 2.1203407287597655, "memory(GiB)": 77.56, "step": 78430, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.437077 }, { "epoch": 3.360395869928452, "grad_norm": 5.355327129364014, "learning_rate": 2.426735937304696e-05, "loss": 2.4626581192016603, "memory(GiB)": 77.56, "step": 78435, "token_acc": 0.5014749262536873, "train_speed(iter/s)": 1.437081 }, { "epoch": 3.360610085257701, "grad_norm": 6.6465325355529785, "learning_rate": 2.426158950767767e-05, "loss": 2.3337871551513674, "memory(GiB)": 77.56, "step": 78440, "token_acc": 0.5211267605633803, "train_speed(iter/s)": 1.437074 }, { "epoch": 3.36082430058695, "grad_norm": 6.364837169647217, "learning_rate": 2.425582010858381e-05, "loss": 2.3513408660888673, "memory(GiB)": 77.56, "step": 78445, "token_acc": 0.46579804560260585, "train_speed(iter/s)": 1.437065 }, { "epoch": 3.361038515916199, "grad_norm": 5.640448093414307, "learning_rate": 2.4250051175869938e-05, "loss": 2.292061996459961, "memory(GiB)": 77.56, "step": 78450, "token_acc": 0.501577287066246, "train_speed(iter/s)": 1.43707 }, { "epoch": 3.361252731245448, "grad_norm": 7.223572254180908, "learning_rate": 2.4244282709640542e-05, "loss": 2.3309823989868166, "memory(GiB)": 77.56, "step": 78455, "token_acc": 0.5134328358208955, "train_speed(iter/s)": 1.437087 }, { "epoch": 3.361466946574697, "grad_norm": 8.981651306152344, "learning_rate": 2.4238514710000103e-05, "loss": 2.4027854919433596, "memory(GiB)": 77.56, "step": 78460, "token_acc": 0.5233333333333333, "train_speed(iter/s)": 1.43707 }, { "epoch": 3.3616811619039457, "grad_norm": 6.733354091644287, "learning_rate": 2.4232747177053167e-05, "loss": 2.4601390838623045, "memory(GiB)": 77.56, "step": 78465, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 1.437077 }, { "epoch": 3.361895377233195, "grad_norm": 5.849766254425049, "learning_rate": 2.422698011090418e-05, "loss": 2.5320676803588866, "memory(GiB)": 77.56, "step": 78470, "token_acc": 0.4745222929936306, "train_speed(iter/s)": 1.43708 }, { "epoch": 3.362109592562444, "grad_norm": 5.1082763671875, "learning_rate": 2.4221213511657624e-05, "loss": 2.312602424621582, "memory(GiB)": 77.56, "step": 78475, "token_acc": 0.5246478873239436, "train_speed(iter/s)": 1.437095 }, { "epoch": 3.3623238078916926, "grad_norm": 5.328272819519043, "learning_rate": 2.421544737941795e-05, "loss": 2.1653512954711913, "memory(GiB)": 77.56, "step": 78480, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.437079 }, { "epoch": 3.362538023220942, "grad_norm": 5.10175895690918, "learning_rate": 2.4209681714289655e-05, "loss": 2.266076850891113, "memory(GiB)": 77.56, "step": 78485, "token_acc": 0.478125, "train_speed(iter/s)": 1.4371 }, { "epoch": 3.3627522385501907, "grad_norm": 7.176144123077393, "learning_rate": 2.4203916516377167e-05, "loss": 2.213001823425293, "memory(GiB)": 77.56, "step": 78490, "token_acc": 0.5335276967930029, "train_speed(iter/s)": 1.437111 }, { "epoch": 3.3629664538794395, "grad_norm": 6.665398120880127, "learning_rate": 2.4198151785784934e-05, "loss": 1.9317384719848634, "memory(GiB)": 77.56, "step": 78495, "token_acc": 0.5387596899224806, "train_speed(iter/s)": 1.437122 }, { "epoch": 3.3631806692086887, "grad_norm": 7.4047112464904785, "learning_rate": 2.4192387522617384e-05, "loss": 2.1524192810058596, "memory(GiB)": 77.56, "step": 78500, "token_acc": 0.50390625, "train_speed(iter/s)": 1.437151 }, { "epoch": 3.3631806692086887, "eval_loss": 2.210336208343506, "eval_runtime": 14.0938, "eval_samples_per_second": 7.095, "eval_steps_per_second": 7.095, "eval_token_acc": 0.4819277108433735, "step": 78500 }, { "epoch": 3.3633948845379376, "grad_norm": 6.0035858154296875, "learning_rate": 2.4186623726978925e-05, "loss": 2.236882972717285, "memory(GiB)": 77.56, "step": 78505, "token_acc": 0.4975845410628019, "train_speed(iter/s)": 1.436735 }, { "epoch": 3.3636090998671864, "grad_norm": 9.440528869628906, "learning_rate": 2.418086039897401e-05, "loss": 2.7023162841796875, "memory(GiB)": 77.56, "step": 78510, "token_acc": 0.4437299035369775, "train_speed(iter/s)": 1.436719 }, { "epoch": 3.3638233151964356, "grad_norm": 6.444137096405029, "learning_rate": 2.4175097538707025e-05, "loss": 2.593526268005371, "memory(GiB)": 77.56, "step": 78515, "token_acc": 0.45695364238410596, "train_speed(iter/s)": 1.436714 }, { "epoch": 3.3640375305256844, "grad_norm": 5.1491289138793945, "learning_rate": 2.4169335146282378e-05, "loss": 2.54056396484375, "memory(GiB)": 77.56, "step": 78520, "token_acc": 0.46402877697841727, "train_speed(iter/s)": 1.436738 }, { "epoch": 3.3642517458549333, "grad_norm": 9.636716842651367, "learning_rate": 2.4163573221804457e-05, "loss": 2.5109472274780273, "memory(GiB)": 77.56, "step": 78525, "token_acc": 0.48046875, "train_speed(iter/s)": 1.436726 }, { "epoch": 3.3644659611841825, "grad_norm": 5.885531902313232, "learning_rate": 2.4157811765377624e-05, "loss": 2.549351119995117, "memory(GiB)": 77.56, "step": 78530, "token_acc": 0.5077399380804953, "train_speed(iter/s)": 1.436724 }, { "epoch": 3.3646801765134313, "grad_norm": 6.32645320892334, "learning_rate": 2.4152050777106273e-05, "loss": 2.239894485473633, "memory(GiB)": 77.56, "step": 78535, "token_acc": 0.523972602739726, "train_speed(iter/s)": 1.436717 }, { "epoch": 3.36489439184268, "grad_norm": 5.9216461181640625, "learning_rate": 2.414629025709479e-05, "loss": 2.386954498291016, "memory(GiB)": 77.56, "step": 78540, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.436734 }, { "epoch": 3.3651086071719294, "grad_norm": 8.108158111572266, "learning_rate": 2.414053020544751e-05, "loss": 2.459110641479492, "memory(GiB)": 77.56, "step": 78545, "token_acc": 0.4765625, "train_speed(iter/s)": 1.436707 }, { "epoch": 3.365322822501178, "grad_norm": 5.555756092071533, "learning_rate": 2.4134770622268783e-05, "loss": 2.4668888092041015, "memory(GiB)": 77.56, "step": 78550, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.436682 }, { "epoch": 3.365537037830427, "grad_norm": 6.063318252563477, "learning_rate": 2.4129011507662945e-05, "loss": 2.2724941253662108, "memory(GiB)": 77.56, "step": 78555, "token_acc": 0.51985559566787, "train_speed(iter/s)": 1.436696 }, { "epoch": 3.3657512531596763, "grad_norm": 5.631870269775391, "learning_rate": 2.4123252861734334e-05, "loss": 2.016470527648926, "memory(GiB)": 77.56, "step": 78560, "token_acc": 0.528125, "train_speed(iter/s)": 1.436689 }, { "epoch": 3.365965468488925, "grad_norm": 8.290855407714844, "learning_rate": 2.4117494684587262e-05, "loss": 2.462776756286621, "memory(GiB)": 77.56, "step": 78565, "token_acc": 0.48036253776435045, "train_speed(iter/s)": 1.436694 }, { "epoch": 3.366179683818174, "grad_norm": 8.175646781921387, "learning_rate": 2.4111736976326066e-05, "loss": 2.38049201965332, "memory(GiB)": 77.56, "step": 78570, "token_acc": 0.49823321554770317, "train_speed(iter/s)": 1.43669 }, { "epoch": 3.366393899147423, "grad_norm": 5.28679895401001, "learning_rate": 2.410597973705504e-05, "loss": 2.0457387924194337, "memory(GiB)": 77.56, "step": 78575, "token_acc": 0.5506072874493927, "train_speed(iter/s)": 1.43669 }, { "epoch": 3.366608114476672, "grad_norm": 4.147820472717285, "learning_rate": 2.4100222966878484e-05, "loss": 2.4577089309692384, "memory(GiB)": 77.56, "step": 78580, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.436695 }, { "epoch": 3.3668223298059208, "grad_norm": 6.195014953613281, "learning_rate": 2.409446666590068e-05, "loss": 2.477466011047363, "memory(GiB)": 77.56, "step": 78585, "token_acc": 0.48375451263537905, "train_speed(iter/s)": 1.436713 }, { "epoch": 3.36703654513517, "grad_norm": 8.295110702514648, "learning_rate": 2.4088710834225896e-05, "loss": 2.2955419540405275, "memory(GiB)": 77.56, "step": 78590, "token_acc": 0.5043859649122807, "train_speed(iter/s)": 1.436705 }, { "epoch": 3.367250760464419, "grad_norm": 5.086785316467285, "learning_rate": 2.408295547195844e-05, "loss": 2.157618522644043, "memory(GiB)": 77.56, "step": 78595, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.436709 }, { "epoch": 3.3674649757936677, "grad_norm": 6.766571998596191, "learning_rate": 2.4077200579202563e-05, "loss": 2.3635231018066407, "memory(GiB)": 77.56, "step": 78600, "token_acc": 0.4613003095975232, "train_speed(iter/s)": 1.436729 }, { "epoch": 3.367679191122917, "grad_norm": 5.771625518798828, "learning_rate": 2.4071446156062494e-05, "loss": 2.4489162445068358, "memory(GiB)": 77.56, "step": 78605, "token_acc": 0.512987012987013, "train_speed(iter/s)": 1.436725 }, { "epoch": 3.3678934064521657, "grad_norm": 6.445941925048828, "learning_rate": 2.406569220264252e-05, "loss": 2.2327081680297853, "memory(GiB)": 77.56, "step": 78610, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.436759 }, { "epoch": 3.3681076217814145, "grad_norm": 7.226221084594727, "learning_rate": 2.405993871904686e-05, "loss": 2.241598892211914, "memory(GiB)": 77.56, "step": 78615, "token_acc": 0.5201465201465202, "train_speed(iter/s)": 1.436768 }, { "epoch": 3.368321837110664, "grad_norm": 6.602431774139404, "learning_rate": 2.4054185705379724e-05, "loss": 2.6349117279052736, "memory(GiB)": 77.56, "step": 78620, "token_acc": 0.4644808743169399, "train_speed(iter/s)": 1.436767 }, { "epoch": 3.3685360524399126, "grad_norm": 8.15915298461914, "learning_rate": 2.404843316174537e-05, "loss": 1.9415018081665039, "memory(GiB)": 77.56, "step": 78625, "token_acc": 0.5535714285714286, "train_speed(iter/s)": 1.436766 }, { "epoch": 3.3687502677691614, "grad_norm": 6.5156779289245605, "learning_rate": 2.4042681088248e-05, "loss": 2.0290224075317385, "memory(GiB)": 77.56, "step": 78630, "token_acc": 0.5708661417322834, "train_speed(iter/s)": 1.43678 }, { "epoch": 3.3689644830984107, "grad_norm": 5.561295986175537, "learning_rate": 2.4036929484991804e-05, "loss": 2.150686836242676, "memory(GiB)": 77.56, "step": 78635, "token_acc": 0.5412541254125413, "train_speed(iter/s)": 1.436797 }, { "epoch": 3.3691786984276595, "grad_norm": 6.023956298828125, "learning_rate": 2.4031178352080992e-05, "loss": 2.2956975936889648, "memory(GiB)": 77.56, "step": 78640, "token_acc": 0.5, "train_speed(iter/s)": 1.436797 }, { "epoch": 3.3693929137569083, "grad_norm": 4.972743034362793, "learning_rate": 2.402542768961974e-05, "loss": 2.4439579010009767, "memory(GiB)": 77.56, "step": 78645, "token_acc": 0.5392156862745098, "train_speed(iter/s)": 1.436792 }, { "epoch": 3.3696071290861576, "grad_norm": 6.531024932861328, "learning_rate": 2.4019677497712216e-05, "loss": 2.3612098693847656, "memory(GiB)": 77.56, "step": 78650, "token_acc": 0.4794952681388013, "train_speed(iter/s)": 1.436808 }, { "epoch": 3.3698213444154064, "grad_norm": 4.327822208404541, "learning_rate": 2.4013927776462625e-05, "loss": 2.5188278198242187, "memory(GiB)": 77.56, "step": 78655, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.436808 }, { "epoch": 3.370035559744655, "grad_norm": 5.676511764526367, "learning_rate": 2.4008178525975105e-05, "loss": 2.563261795043945, "memory(GiB)": 77.56, "step": 78660, "token_acc": 0.45, "train_speed(iter/s)": 1.436799 }, { "epoch": 3.3702497750739044, "grad_norm": 5.0904459953308105, "learning_rate": 2.4002429746353817e-05, "loss": 2.187174606323242, "memory(GiB)": 77.56, "step": 78665, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.436803 }, { "epoch": 3.3704639904031533, "grad_norm": 6.471100807189941, "learning_rate": 2.39966814377029e-05, "loss": 2.420202445983887, "memory(GiB)": 77.56, "step": 78670, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.436821 }, { "epoch": 3.370678205732402, "grad_norm": 5.1567702293396, "learning_rate": 2.3990933600126476e-05, "loss": 2.4047405242919924, "memory(GiB)": 77.56, "step": 78675, "token_acc": 0.49693251533742333, "train_speed(iter/s)": 1.436819 }, { "epoch": 3.3708924210616513, "grad_norm": 6.0861334800720215, "learning_rate": 2.3985186233728686e-05, "loss": 1.9490814208984375, "memory(GiB)": 77.56, "step": 78680, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.436843 }, { "epoch": 3.3711066363909, "grad_norm": 5.486029148101807, "learning_rate": 2.3979439338613668e-05, "loss": 2.255574035644531, "memory(GiB)": 77.56, "step": 78685, "token_acc": 0.5401459854014599, "train_speed(iter/s)": 1.436862 }, { "epoch": 3.371320851720149, "grad_norm": 6.144575119018555, "learning_rate": 2.397369291488552e-05, "loss": 2.4148992538452148, "memory(GiB)": 77.56, "step": 78690, "token_acc": 0.4774436090225564, "train_speed(iter/s)": 1.436874 }, { "epoch": 3.371535067049398, "grad_norm": 6.161258697509766, "learning_rate": 2.3967946962648334e-05, "loss": 2.2431365966796877, "memory(GiB)": 77.56, "step": 78695, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.436877 }, { "epoch": 3.371749282378647, "grad_norm": 5.4289703369140625, "learning_rate": 2.3962201482006215e-05, "loss": 2.270370292663574, "memory(GiB)": 77.56, "step": 78700, "token_acc": 0.5, "train_speed(iter/s)": 1.436874 }, { "epoch": 3.371963497707896, "grad_norm": 5.3703436851501465, "learning_rate": 2.395645647306324e-05, "loss": 2.1858808517456056, "memory(GiB)": 77.56, "step": 78705, "token_acc": 0.5443425076452599, "train_speed(iter/s)": 1.436871 }, { "epoch": 3.372177713037145, "grad_norm": 7.366776466369629, "learning_rate": 2.3950711935923466e-05, "loss": 2.2356048583984376, "memory(GiB)": 77.56, "step": 78710, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.43687 }, { "epoch": 3.372391928366394, "grad_norm": 5.853217601776123, "learning_rate": 2.3944967870691003e-05, "loss": 2.356764221191406, "memory(GiB)": 77.56, "step": 78715, "token_acc": 0.52, "train_speed(iter/s)": 1.436894 }, { "epoch": 3.3726061436956427, "grad_norm": 7.239072799682617, "learning_rate": 2.3939224277469886e-05, "loss": 2.3769832611083985, "memory(GiB)": 77.56, "step": 78720, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.4369 }, { "epoch": 3.372820359024892, "grad_norm": 5.061244964599609, "learning_rate": 2.3933481156364168e-05, "loss": 2.1726757049560548, "memory(GiB)": 77.56, "step": 78725, "token_acc": 0.5395189003436426, "train_speed(iter/s)": 1.436909 }, { "epoch": 3.373034574354141, "grad_norm": 5.375277042388916, "learning_rate": 2.392773850747789e-05, "loss": 2.148880386352539, "memory(GiB)": 77.56, "step": 78730, "token_acc": 0.5095785440613027, "train_speed(iter/s)": 1.436901 }, { "epoch": 3.3732487896833896, "grad_norm": 5.191781520843506, "learning_rate": 2.3921996330915076e-05, "loss": 2.4782032012939452, "memory(GiB)": 77.56, "step": 78735, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.43688 }, { "epoch": 3.373463005012639, "grad_norm": 8.026511192321777, "learning_rate": 2.391625462677977e-05, "loss": 2.4119050979614256, "memory(GiB)": 77.56, "step": 78740, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.436898 }, { "epoch": 3.3736772203418877, "grad_norm": 6.124702453613281, "learning_rate": 2.3910513395175988e-05, "loss": 2.4365854263305664, "memory(GiB)": 77.56, "step": 78745, "token_acc": 0.43197278911564624, "train_speed(iter/s)": 1.436896 }, { "epoch": 3.3738914356711365, "grad_norm": 6.308697700500488, "learning_rate": 2.3904772636207723e-05, "loss": 2.259089469909668, "memory(GiB)": 77.56, "step": 78750, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.436903 }, { "epoch": 3.3741056510003857, "grad_norm": 5.427739143371582, "learning_rate": 2.3899032349978967e-05, "loss": 2.1422523498535155, "memory(GiB)": 77.56, "step": 78755, "token_acc": 0.5617529880478087, "train_speed(iter/s)": 1.436907 }, { "epoch": 3.3743198663296345, "grad_norm": 4.6939215660095215, "learning_rate": 2.389329253659374e-05, "loss": 2.2956966400146483, "memory(GiB)": 77.56, "step": 78760, "token_acc": 0.5350553505535055, "train_speed(iter/s)": 1.436901 }, { "epoch": 3.3745340816588834, "grad_norm": 6.2993316650390625, "learning_rate": 2.3887553196155995e-05, "loss": 2.2858009338378906, "memory(GiB)": 77.56, "step": 78765, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.43689 }, { "epoch": 3.3747482969881326, "grad_norm": 6.204442501068115, "learning_rate": 2.3881814328769737e-05, "loss": 2.245798873901367, "memory(GiB)": 77.56, "step": 78770, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.436892 }, { "epoch": 3.3749625123173814, "grad_norm": 5.690382957458496, "learning_rate": 2.387607593453891e-05, "loss": 2.323918914794922, "memory(GiB)": 77.56, "step": 78775, "token_acc": 0.4967532467532468, "train_speed(iter/s)": 1.436896 }, { "epoch": 3.3751767276466302, "grad_norm": 4.640789985656738, "learning_rate": 2.3870338013567474e-05, "loss": 2.3230365753173827, "memory(GiB)": 77.56, "step": 78780, "token_acc": 0.5273972602739726, "train_speed(iter/s)": 1.43691 }, { "epoch": 3.3753909429758795, "grad_norm": 5.124710559844971, "learning_rate": 2.3864600565959377e-05, "loss": 2.3819780349731445, "memory(GiB)": 77.56, "step": 78785, "token_acc": 0.5068870523415978, "train_speed(iter/s)": 1.436912 }, { "epoch": 3.3756051583051283, "grad_norm": 5.971935272216797, "learning_rate": 2.3858863591818558e-05, "loss": 2.1204200744628907, "memory(GiB)": 77.56, "step": 78790, "token_acc": 0.5396341463414634, "train_speed(iter/s)": 1.436887 }, { "epoch": 3.375819373634377, "grad_norm": 6.131096839904785, "learning_rate": 2.385312709124893e-05, "loss": 2.3459039688110352, "memory(GiB)": 77.56, "step": 78795, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.436906 }, { "epoch": 3.3760335889636264, "grad_norm": 6.3354058265686035, "learning_rate": 2.3847391064354453e-05, "loss": 2.379328727722168, "memory(GiB)": 77.56, "step": 78800, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.436895 }, { "epoch": 3.376247804292875, "grad_norm": 6.278886318206787, "learning_rate": 2.3841655511239013e-05, "loss": 2.1081916809082033, "memory(GiB)": 77.56, "step": 78805, "token_acc": 0.5785123966942148, "train_speed(iter/s)": 1.436888 }, { "epoch": 3.376462019622124, "grad_norm": 8.349748611450195, "learning_rate": 2.3835920432006527e-05, "loss": 2.3201984405517577, "memory(GiB)": 77.56, "step": 78810, "token_acc": 0.48627450980392156, "train_speed(iter/s)": 1.4369 }, { "epoch": 3.3766762349513733, "grad_norm": 6.222157955169678, "learning_rate": 2.3830185826760887e-05, "loss": 2.1716976165771484, "memory(GiB)": 77.56, "step": 78815, "token_acc": 0.5239852398523985, "train_speed(iter/s)": 1.436911 }, { "epoch": 3.376890450280622, "grad_norm": 5.05953311920166, "learning_rate": 2.3824451695605958e-05, "loss": 2.503479766845703, "memory(GiB)": 77.56, "step": 78820, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.43692 }, { "epoch": 3.377104665609871, "grad_norm": 6.064471244812012, "learning_rate": 2.381871803864566e-05, "loss": 2.3991886138916017, "memory(GiB)": 77.56, "step": 78825, "token_acc": 0.5250836120401338, "train_speed(iter/s)": 1.436921 }, { "epoch": 3.37731888093912, "grad_norm": 6.053960800170898, "learning_rate": 2.381298485598383e-05, "loss": 2.5158113479614257, "memory(GiB)": 77.56, "step": 78830, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.436906 }, { "epoch": 3.377533096268369, "grad_norm": 6.7615814208984375, "learning_rate": 2.3807252147724362e-05, "loss": 2.529646873474121, "memory(GiB)": 77.56, "step": 78835, "token_acc": 0.48727272727272725, "train_speed(iter/s)": 1.436912 }, { "epoch": 3.3777473115976178, "grad_norm": 6.478045463562012, "learning_rate": 2.380151991397109e-05, "loss": 2.299068832397461, "memory(GiB)": 77.56, "step": 78840, "token_acc": 0.5259515570934256, "train_speed(iter/s)": 1.436934 }, { "epoch": 3.377961526926867, "grad_norm": 6.816653728485107, "learning_rate": 2.3795788154827862e-05, "loss": 2.3651493072509764, "memory(GiB)": 77.56, "step": 78845, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.436938 }, { "epoch": 3.378175742256116, "grad_norm": 5.127247333526611, "learning_rate": 2.3790056870398515e-05, "loss": 2.6714509963989257, "memory(GiB)": 77.56, "step": 78850, "token_acc": 0.4161290322580645, "train_speed(iter/s)": 1.436958 }, { "epoch": 3.3783899575853646, "grad_norm": 6.059878826141357, "learning_rate": 2.3784326060786855e-05, "loss": 2.464638519287109, "memory(GiB)": 77.56, "step": 78855, "token_acc": 0.4948805460750853, "train_speed(iter/s)": 1.43697 }, { "epoch": 3.378604172914614, "grad_norm": 7.489520072937012, "learning_rate": 2.3778595726096737e-05, "loss": 2.5586109161376953, "memory(GiB)": 77.56, "step": 78860, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.436979 }, { "epoch": 3.3788183882438627, "grad_norm": 6.65469217300415, "learning_rate": 2.3772865866431955e-05, "loss": 1.8535985946655273, "memory(GiB)": 77.56, "step": 78865, "token_acc": 0.594488188976378, "train_speed(iter/s)": 1.436989 }, { "epoch": 3.3790326035731115, "grad_norm": 5.638848781585693, "learning_rate": 2.3767136481896312e-05, "loss": 2.201658821105957, "memory(GiB)": 77.56, "step": 78870, "token_acc": 0.5124223602484472, "train_speed(iter/s)": 1.437 }, { "epoch": 3.379246818902361, "grad_norm": 6.5392889976501465, "learning_rate": 2.3761407572593603e-05, "loss": 2.2582328796386717, "memory(GiB)": 77.56, "step": 78875, "token_acc": 0.5461254612546126, "train_speed(iter/s)": 1.437007 }, { "epoch": 3.3794610342316096, "grad_norm": 5.483560562133789, "learning_rate": 2.375567913862759e-05, "loss": 2.305634689331055, "memory(GiB)": 77.56, "step": 78880, "token_acc": 0.5224358974358975, "train_speed(iter/s)": 1.437 }, { "epoch": 3.3796752495608584, "grad_norm": 6.366110801696777, "learning_rate": 2.3749951180102082e-05, "loss": 2.267227554321289, "memory(GiB)": 77.56, "step": 78885, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.436993 }, { "epoch": 3.3798894648901077, "grad_norm": 6.813939094543457, "learning_rate": 2.3744223697120836e-05, "loss": 2.3075139999389647, "memory(GiB)": 77.56, "step": 78890, "token_acc": 0.515625, "train_speed(iter/s)": 1.436986 }, { "epoch": 3.3801036802193565, "grad_norm": 5.68144416809082, "learning_rate": 2.373849668978761e-05, "loss": 2.6016822814941407, "memory(GiB)": 77.56, "step": 78895, "token_acc": 0.4854771784232365, "train_speed(iter/s)": 1.436987 }, { "epoch": 3.3803178955486053, "grad_norm": 6.739671230316162, "learning_rate": 2.373277015820613e-05, "loss": 2.3951135635375977, "memory(GiB)": 77.56, "step": 78900, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.43699 }, { "epoch": 3.3805321108778545, "grad_norm": 6.346278190612793, "learning_rate": 2.3727044102480184e-05, "loss": 2.5380760192871095, "memory(GiB)": 77.56, "step": 78905, "token_acc": 0.4786885245901639, "train_speed(iter/s)": 1.436998 }, { "epoch": 3.3807463262071034, "grad_norm": 6.784225940704346, "learning_rate": 2.3721318522713453e-05, "loss": 2.1891901016235353, "memory(GiB)": 77.56, "step": 78910, "token_acc": 0.5527272727272727, "train_speed(iter/s)": 1.43701 }, { "epoch": 3.380960541536352, "grad_norm": 6.588382720947266, "learning_rate": 2.3715593419009714e-05, "loss": 2.473220443725586, "memory(GiB)": 77.56, "step": 78915, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.437022 }, { "epoch": 3.3811747568656014, "grad_norm": 5.927586555480957, "learning_rate": 2.3709868791472652e-05, "loss": 2.267233657836914, "memory(GiB)": 77.56, "step": 78920, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.437018 }, { "epoch": 3.3813889721948502, "grad_norm": 5.045629024505615, "learning_rate": 2.3704144640205983e-05, "loss": 2.1847902297973634, "memory(GiB)": 77.56, "step": 78925, "token_acc": 0.5515873015873016, "train_speed(iter/s)": 1.437031 }, { "epoch": 3.381603187524099, "grad_norm": 5.2909722328186035, "learning_rate": 2.3698420965313395e-05, "loss": 2.217837905883789, "memory(GiB)": 77.56, "step": 78930, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.437019 }, { "epoch": 3.3818174028533483, "grad_norm": 8.216493606567383, "learning_rate": 2.3692697766898592e-05, "loss": 2.493809127807617, "memory(GiB)": 77.56, "step": 78935, "token_acc": 0.5132743362831859, "train_speed(iter/s)": 1.437021 }, { "epoch": 3.382031618182597, "grad_norm": 4.826843738555908, "learning_rate": 2.3686975045065223e-05, "loss": 2.0823545455932617, "memory(GiB)": 77.56, "step": 78940, "token_acc": 0.5273224043715847, "train_speed(iter/s)": 1.437027 }, { "epoch": 3.382245833511846, "grad_norm": 4.023681163787842, "learning_rate": 2.3681252799917002e-05, "loss": 1.9837207794189453, "memory(GiB)": 77.56, "step": 78945, "token_acc": 0.5648854961832062, "train_speed(iter/s)": 1.437044 }, { "epoch": 3.382460048841095, "grad_norm": 4.9561991691589355, "learning_rate": 2.367553103155758e-05, "loss": 2.5363336563110352, "memory(GiB)": 77.56, "step": 78950, "token_acc": 0.49709302325581395, "train_speed(iter/s)": 1.437053 }, { "epoch": 3.382674264170344, "grad_norm": 5.4621381759643555, "learning_rate": 2.366980974009061e-05, "loss": 2.503839683532715, "memory(GiB)": 77.56, "step": 78955, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.437074 }, { "epoch": 3.382888479499593, "grad_norm": 5.743004322052002, "learning_rate": 2.3664088925619732e-05, "loss": 2.3493316650390623, "memory(GiB)": 77.56, "step": 78960, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 1.43708 }, { "epoch": 3.383102694828842, "grad_norm": 6.767465114593506, "learning_rate": 2.365836858824857e-05, "loss": 2.1151844024658204, "memory(GiB)": 77.56, "step": 78965, "token_acc": 0.525096525096525, "train_speed(iter/s)": 1.437071 }, { "epoch": 3.383316910158091, "grad_norm": 6.905044078826904, "learning_rate": 2.365264872808079e-05, "loss": 2.4366338729858397, "memory(GiB)": 77.56, "step": 78970, "token_acc": 0.48627450980392156, "train_speed(iter/s)": 1.437083 }, { "epoch": 3.3835311254873397, "grad_norm": 6.050512790679932, "learning_rate": 2.3646929345219975e-05, "loss": 2.303232955932617, "memory(GiB)": 77.56, "step": 78975, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.437084 }, { "epoch": 3.383745340816589, "grad_norm": 6.511990070343018, "learning_rate": 2.3641210439769773e-05, "loss": 2.374123764038086, "memory(GiB)": 77.56, "step": 78980, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.437084 }, { "epoch": 3.3839595561458378, "grad_norm": 6.069528579711914, "learning_rate": 2.3635492011833778e-05, "loss": 2.2651861190795897, "memory(GiB)": 77.56, "step": 78985, "token_acc": 0.44722222222222224, "train_speed(iter/s)": 1.437086 }, { "epoch": 3.3841737714750866, "grad_norm": 5.74617862701416, "learning_rate": 2.362977406151557e-05, "loss": 2.408353805541992, "memory(GiB)": 77.56, "step": 78990, "token_acc": 0.46774193548387094, "train_speed(iter/s)": 1.437097 }, { "epoch": 3.384387986804336, "grad_norm": 6.989368915557861, "learning_rate": 2.362405658891874e-05, "loss": 2.538044548034668, "memory(GiB)": 77.56, "step": 78995, "token_acc": 0.43902439024390244, "train_speed(iter/s)": 1.437092 }, { "epoch": 3.3846022021335846, "grad_norm": 4.436270236968994, "learning_rate": 2.3618339594146853e-05, "loss": 2.15976619720459, "memory(GiB)": 77.56, "step": 79000, "token_acc": 0.5468164794007491, "train_speed(iter/s)": 1.437093 }, { "epoch": 3.3846022021335846, "eval_loss": 2.242905378341675, "eval_runtime": 14.5098, "eval_samples_per_second": 6.892, "eval_steps_per_second": 6.892, "eval_token_acc": 0.469281045751634, "step": 79000 }, { "epoch": 3.3848164174628335, "grad_norm": 5.641612529754639, "learning_rate": 2.3612623077303514e-05, "loss": 2.408580207824707, "memory(GiB)": 77.56, "step": 79005, "token_acc": 0.4895833333333333, "train_speed(iter/s)": 1.436687 }, { "epoch": 3.3850306327920827, "grad_norm": 5.722011089324951, "learning_rate": 2.360690703849226e-05, "loss": 2.2639083862304688, "memory(GiB)": 77.56, "step": 79010, "token_acc": 0.47651006711409394, "train_speed(iter/s)": 1.43668 }, { "epoch": 3.3852448481213315, "grad_norm": 6.5511980056762695, "learning_rate": 2.360119147781664e-05, "loss": 2.4920969009399414, "memory(GiB)": 77.56, "step": 79015, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.436695 }, { "epoch": 3.3854590634505803, "grad_norm": 4.943918228149414, "learning_rate": 2.35954763953802e-05, "loss": 2.2495140075683593, "memory(GiB)": 77.56, "step": 79020, "token_acc": 0.5234657039711191, "train_speed(iter/s)": 1.436723 }, { "epoch": 3.3856732787798296, "grad_norm": 5.779406547546387, "learning_rate": 2.3589761791286462e-05, "loss": 2.338222694396973, "memory(GiB)": 77.56, "step": 79025, "token_acc": 0.4969512195121951, "train_speed(iter/s)": 1.436738 }, { "epoch": 3.3858874941090784, "grad_norm": 4.647282123565674, "learning_rate": 2.3584047665638977e-05, "loss": 2.040752983093262, "memory(GiB)": 77.56, "step": 79030, "token_acc": 0.5430711610486891, "train_speed(iter/s)": 1.436763 }, { "epoch": 3.3861017094383272, "grad_norm": 4.793888568878174, "learning_rate": 2.3578334018541254e-05, "loss": 2.2106000900268556, "memory(GiB)": 77.56, "step": 79035, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.436792 }, { "epoch": 3.3863159247675765, "grad_norm": 6.412319183349609, "learning_rate": 2.3572620850096787e-05, "loss": 2.2116941452026366, "memory(GiB)": 77.56, "step": 79040, "token_acc": 0.55078125, "train_speed(iter/s)": 1.436776 }, { "epoch": 3.3865301400968253, "grad_norm": 5.8349995613098145, "learning_rate": 2.356690816040909e-05, "loss": 2.4938520431518554, "memory(GiB)": 77.56, "step": 79045, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.436785 }, { "epoch": 3.386744355426074, "grad_norm": 6.164633274078369, "learning_rate": 2.3561195949581634e-05, "loss": 2.3005395889282227, "memory(GiB)": 77.56, "step": 79050, "token_acc": 0.4854014598540146, "train_speed(iter/s)": 1.436786 }, { "epoch": 3.3869585707553234, "grad_norm": 7.676568031311035, "learning_rate": 2.3555484217717904e-05, "loss": 2.0836063385009767, "memory(GiB)": 77.56, "step": 79055, "token_acc": 0.5686274509803921, "train_speed(iter/s)": 1.436789 }, { "epoch": 3.387172786084572, "grad_norm": 6.290713787078857, "learning_rate": 2.3549772964921413e-05, "loss": 2.061748504638672, "memory(GiB)": 77.56, "step": 79060, "token_acc": 0.5567765567765568, "train_speed(iter/s)": 1.436802 }, { "epoch": 3.387387001413821, "grad_norm": 5.1710333824157715, "learning_rate": 2.354406219129559e-05, "loss": 2.359303092956543, "memory(GiB)": 77.56, "step": 79065, "token_acc": 0.5340501792114696, "train_speed(iter/s)": 1.436802 }, { "epoch": 3.3876012167430702, "grad_norm": 5.794148921966553, "learning_rate": 2.35383518969439e-05, "loss": 2.0363449096679687, "memory(GiB)": 77.56, "step": 79070, "token_acc": 0.5734265734265734, "train_speed(iter/s)": 1.436807 }, { "epoch": 3.387815432072319, "grad_norm": 7.266098976135254, "learning_rate": 2.353264208196979e-05, "loss": 2.161484146118164, "memory(GiB)": 77.56, "step": 79075, "token_acc": 0.5329153605015674, "train_speed(iter/s)": 1.436788 }, { "epoch": 3.388029647401568, "grad_norm": 7.2324395179748535, "learning_rate": 2.35269327464767e-05, "loss": 2.4642690658569335, "memory(GiB)": 77.56, "step": 79080, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.436778 }, { "epoch": 3.388243862730817, "grad_norm": 5.676534652709961, "learning_rate": 2.3521223890568032e-05, "loss": 2.6387828826904296, "memory(GiB)": 77.56, "step": 79085, "token_acc": 0.441747572815534, "train_speed(iter/s)": 1.436793 }, { "epoch": 3.388458078060066, "grad_norm": 5.013236999511719, "learning_rate": 2.3515515514347252e-05, "loss": 2.474356460571289, "memory(GiB)": 77.56, "step": 79090, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.436779 }, { "epoch": 3.3886722933893147, "grad_norm": 6.498418807983398, "learning_rate": 2.3509807617917757e-05, "loss": 2.0500114440917967, "memory(GiB)": 77.56, "step": 79095, "token_acc": 0.5402298850574713, "train_speed(iter/s)": 1.436795 }, { "epoch": 3.388886508718564, "grad_norm": 5.321521282196045, "learning_rate": 2.3504100201382945e-05, "loss": 2.3788131713867187, "memory(GiB)": 77.56, "step": 79100, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.43678 }, { "epoch": 3.389100724047813, "grad_norm": 5.2803544998168945, "learning_rate": 2.3498393264846212e-05, "loss": 2.418195343017578, "memory(GiB)": 77.56, "step": 79105, "token_acc": 0.48562300319488816, "train_speed(iter/s)": 1.436791 }, { "epoch": 3.3893149393770616, "grad_norm": 6.558969974517822, "learning_rate": 2.349268680841093e-05, "loss": 2.2810951232910157, "memory(GiB)": 77.56, "step": 79110, "token_acc": 0.5, "train_speed(iter/s)": 1.436816 }, { "epoch": 3.389529154706311, "grad_norm": 7.498997688293457, "learning_rate": 2.3486980832180505e-05, "loss": 2.3022176742553713, "memory(GiB)": 77.56, "step": 79115, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.436839 }, { "epoch": 3.3897433700355597, "grad_norm": 5.172027111053467, "learning_rate": 2.34812753362583e-05, "loss": 2.319327735900879, "memory(GiB)": 77.56, "step": 79120, "token_acc": 0.5507246376811594, "train_speed(iter/s)": 1.436828 }, { "epoch": 3.3899575853648085, "grad_norm": 6.7108154296875, "learning_rate": 2.3475570320747647e-05, "loss": 2.295004653930664, "memory(GiB)": 77.56, "step": 79125, "token_acc": 0.5, "train_speed(iter/s)": 1.436824 }, { "epoch": 3.3901718006940578, "grad_norm": 6.020142555236816, "learning_rate": 2.3469865785751938e-05, "loss": 2.1553855895996095, "memory(GiB)": 77.56, "step": 79130, "token_acc": 0.4826254826254826, "train_speed(iter/s)": 1.436832 }, { "epoch": 3.3903860160233066, "grad_norm": 4.676198959350586, "learning_rate": 2.3464161731374496e-05, "loss": 2.272721862792969, "memory(GiB)": 77.56, "step": 79135, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.436829 }, { "epoch": 3.390600231352556, "grad_norm": 5.459369659423828, "learning_rate": 2.345845815771866e-05, "loss": 2.2400577545166014, "memory(GiB)": 77.56, "step": 79140, "token_acc": 0.49137931034482757, "train_speed(iter/s)": 1.436834 }, { "epoch": 3.3908144466818047, "grad_norm": 6.605140209197998, "learning_rate": 2.3452755064887732e-05, "loss": 2.378836441040039, "memory(GiB)": 77.56, "step": 79145, "token_acc": 0.48773006134969327, "train_speed(iter/s)": 1.436843 }, { "epoch": 3.3910286620110535, "grad_norm": 7.440648555755615, "learning_rate": 2.3447052452985068e-05, "loss": 2.395745849609375, "memory(GiB)": 77.56, "step": 79150, "token_acc": 0.49216300940438873, "train_speed(iter/s)": 1.436846 }, { "epoch": 3.3912428773403027, "grad_norm": 6.153088092803955, "learning_rate": 2.3441350322113957e-05, "loss": 2.0995552062988283, "memory(GiB)": 77.56, "step": 79155, "token_acc": 0.5402298850574713, "train_speed(iter/s)": 1.43685 }, { "epoch": 3.3914570926695515, "grad_norm": 5.468713760375977, "learning_rate": 2.3435648672377702e-05, "loss": 2.259908676147461, "memory(GiB)": 77.56, "step": 79160, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.436865 }, { "epoch": 3.3916713079988003, "grad_norm": 6.328258514404297, "learning_rate": 2.342994750387959e-05, "loss": 2.4397909164428713, "memory(GiB)": 77.56, "step": 79165, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.436888 }, { "epoch": 3.3918855233280496, "grad_norm": 6.460422515869141, "learning_rate": 2.3424246816722884e-05, "loss": 2.6259462356567385, "memory(GiB)": 77.56, "step": 79170, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.436911 }, { "epoch": 3.3920997386572984, "grad_norm": 5.835700988769531, "learning_rate": 2.3418546611010895e-05, "loss": 2.2085365295410155, "memory(GiB)": 77.56, "step": 79175, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.436923 }, { "epoch": 3.3923139539865472, "grad_norm": 5.2655181884765625, "learning_rate": 2.3412846886846867e-05, "loss": 2.112788200378418, "memory(GiB)": 77.56, "step": 79180, "token_acc": 0.556420233463035, "train_speed(iter/s)": 1.436935 }, { "epoch": 3.3925281693157965, "grad_norm": 5.313553333282471, "learning_rate": 2.3407147644334067e-05, "loss": 2.480325126647949, "memory(GiB)": 77.56, "step": 79185, "token_acc": 0.46545454545454545, "train_speed(iter/s)": 1.436939 }, { "epoch": 3.3927423846450453, "grad_norm": 5.882853984832764, "learning_rate": 2.340144888357572e-05, "loss": 2.2125625610351562, "memory(GiB)": 77.56, "step": 79190, "token_acc": 0.5292096219931272, "train_speed(iter/s)": 1.436937 }, { "epoch": 3.392956599974294, "grad_norm": 7.756390571594238, "learning_rate": 2.339575060467507e-05, "loss": 2.259334754943848, "memory(GiB)": 77.56, "step": 79195, "token_acc": 0.5813953488372093, "train_speed(iter/s)": 1.436899 }, { "epoch": 3.3931708153035434, "grad_norm": 7.951709270477295, "learning_rate": 2.3390052807735352e-05, "loss": 2.2962581634521486, "memory(GiB)": 77.56, "step": 79200, "token_acc": 0.5, "train_speed(iter/s)": 1.436896 }, { "epoch": 3.393385030632792, "grad_norm": 6.02262544631958, "learning_rate": 2.338435549285981e-05, "loss": 2.3636016845703125, "memory(GiB)": 77.56, "step": 79205, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.436891 }, { "epoch": 3.393599245962041, "grad_norm": 6.503762722015381, "learning_rate": 2.337865866015163e-05, "loss": 2.3004405975341795, "memory(GiB)": 77.56, "step": 79210, "token_acc": 0.5, "train_speed(iter/s)": 1.436926 }, { "epoch": 3.3938134612912902, "grad_norm": 4.864107608795166, "learning_rate": 2.3372962309714023e-05, "loss": 2.367596435546875, "memory(GiB)": 77.56, "step": 79215, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.436934 }, { "epoch": 3.394027676620539, "grad_norm": 7.189845561981201, "learning_rate": 2.3367266441650188e-05, "loss": 2.5482656478881838, "memory(GiB)": 77.56, "step": 79220, "token_acc": 0.5064516129032258, "train_speed(iter/s)": 1.436948 }, { "epoch": 3.394241891949788, "grad_norm": 8.789438247680664, "learning_rate": 2.3361571056063302e-05, "loss": 2.2059791564941404, "memory(GiB)": 77.56, "step": 79225, "token_acc": 0.5363321799307958, "train_speed(iter/s)": 1.436946 }, { "epoch": 3.394456107279037, "grad_norm": 5.332568645477295, "learning_rate": 2.335587615305652e-05, "loss": 2.3448369979858397, "memory(GiB)": 77.56, "step": 79230, "token_acc": 0.5020408163265306, "train_speed(iter/s)": 1.436941 }, { "epoch": 3.394670322608286, "grad_norm": 5.452345371246338, "learning_rate": 2.335018173273306e-05, "loss": 2.240138626098633, "memory(GiB)": 77.56, "step": 79235, "token_acc": 0.5096153846153846, "train_speed(iter/s)": 1.43694 }, { "epoch": 3.3948845379375348, "grad_norm": 6.187854290008545, "learning_rate": 2.3344487795196063e-05, "loss": 2.2077091217041014, "memory(GiB)": 77.56, "step": 79240, "token_acc": 0.5236486486486487, "train_speed(iter/s)": 1.43695 }, { "epoch": 3.395098753266784, "grad_norm": 6.3480305671691895, "learning_rate": 2.3338794340548666e-05, "loss": 2.5340810775756837, "memory(GiB)": 77.56, "step": 79245, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.436957 }, { "epoch": 3.395312968596033, "grad_norm": 5.4756059646606445, "learning_rate": 2.3333101368894024e-05, "loss": 2.3447132110595703, "memory(GiB)": 77.56, "step": 79250, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.436959 }, { "epoch": 3.3955271839252816, "grad_norm": 4.763704299926758, "learning_rate": 2.3327408880335245e-05, "loss": 2.172576141357422, "memory(GiB)": 77.56, "step": 79255, "token_acc": 0.5573248407643312, "train_speed(iter/s)": 1.436964 }, { "epoch": 3.395741399254531, "grad_norm": 5.209707736968994, "learning_rate": 2.3321716874975498e-05, "loss": 2.482649230957031, "memory(GiB)": 77.56, "step": 79260, "token_acc": 0.47076023391812866, "train_speed(iter/s)": 1.436948 }, { "epoch": 3.3959556145837797, "grad_norm": 6.771468162536621, "learning_rate": 2.331602535291787e-05, "loss": 2.4084396362304688, "memory(GiB)": 77.56, "step": 79265, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.436964 }, { "epoch": 3.3961698299130285, "grad_norm": 6.260661602020264, "learning_rate": 2.331033431426546e-05, "loss": 2.2911216735839846, "memory(GiB)": 77.56, "step": 79270, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.436974 }, { "epoch": 3.3963840452422778, "grad_norm": 4.987298488616943, "learning_rate": 2.3304643759121398e-05, "loss": 2.2478328704833985, "memory(GiB)": 77.56, "step": 79275, "token_acc": 0.5111731843575419, "train_speed(iter/s)": 1.436966 }, { "epoch": 3.3965982605715266, "grad_norm": 5.767981052398682, "learning_rate": 2.3298953687588753e-05, "loss": 2.572159767150879, "memory(GiB)": 77.56, "step": 79280, "token_acc": 0.5, "train_speed(iter/s)": 1.436941 }, { "epoch": 3.3968124759007754, "grad_norm": 5.535113334655762, "learning_rate": 2.3293264099770613e-05, "loss": 2.552655029296875, "memory(GiB)": 77.56, "step": 79285, "token_acc": 0.44907407407407407, "train_speed(iter/s)": 1.436965 }, { "epoch": 3.3970266912300247, "grad_norm": 4.84979248046875, "learning_rate": 2.3287574995770028e-05, "loss": 2.142882537841797, "memory(GiB)": 77.56, "step": 79290, "token_acc": 0.5377906976744186, "train_speed(iter/s)": 1.43697 }, { "epoch": 3.3972409065592735, "grad_norm": 5.1381049156188965, "learning_rate": 2.32818863756901e-05, "loss": 2.6485221862792967, "memory(GiB)": 77.56, "step": 79295, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.436986 }, { "epoch": 3.3974551218885223, "grad_norm": 6.498202323913574, "learning_rate": 2.327619823963386e-05, "loss": 2.624630355834961, "memory(GiB)": 77.56, "step": 79300, "token_acc": 0.47651006711409394, "train_speed(iter/s)": 1.436988 }, { "epoch": 3.3976693372177715, "grad_norm": 4.937193870544434, "learning_rate": 2.3270510587704365e-05, "loss": 2.212263488769531, "memory(GiB)": 77.56, "step": 79305, "token_acc": 0.5311475409836065, "train_speed(iter/s)": 1.43701 }, { "epoch": 3.3978835525470203, "grad_norm": 5.264317989349365, "learning_rate": 2.326482342000464e-05, "loss": 2.2468730926513674, "memory(GiB)": 77.56, "step": 79310, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.43701 }, { "epoch": 3.398097767876269, "grad_norm": 5.205953598022461, "learning_rate": 2.3259136736637697e-05, "loss": 2.163889503479004, "memory(GiB)": 77.56, "step": 79315, "token_acc": 0.515625, "train_speed(iter/s)": 1.437023 }, { "epoch": 3.3983119832055184, "grad_norm": 5.57481050491333, "learning_rate": 2.32534505377066e-05, "loss": 2.367462730407715, "memory(GiB)": 77.56, "step": 79320, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.437049 }, { "epoch": 3.3985261985347672, "grad_norm": 6.042647361755371, "learning_rate": 2.324776482331434e-05, "loss": 2.2394073486328123, "memory(GiB)": 77.56, "step": 79325, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.437065 }, { "epoch": 3.398740413864016, "grad_norm": 6.816196441650391, "learning_rate": 2.324207959356391e-05, "loss": 2.3014684677124024, "memory(GiB)": 77.56, "step": 79330, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.437069 }, { "epoch": 3.3989546291932653, "grad_norm": 7.3775434494018555, "learning_rate": 2.323639484855831e-05, "loss": 2.4087228775024414, "memory(GiB)": 77.56, "step": 79335, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.437078 }, { "epoch": 3.399168844522514, "grad_norm": 5.364716529846191, "learning_rate": 2.3230710588400505e-05, "loss": 2.52114315032959, "memory(GiB)": 77.56, "step": 79340, "token_acc": 0.4776536312849162, "train_speed(iter/s)": 1.437084 }, { "epoch": 3.399383059851763, "grad_norm": 5.3653564453125, "learning_rate": 2.322502681319349e-05, "loss": 2.2224849700927733, "memory(GiB)": 77.56, "step": 79345, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.437096 }, { "epoch": 3.399597275181012, "grad_norm": 5.870612621307373, "learning_rate": 2.321934352304025e-05, "loss": 2.585462951660156, "memory(GiB)": 77.56, "step": 79350, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.437091 }, { "epoch": 3.399811490510261, "grad_norm": 8.266950607299805, "learning_rate": 2.321366071804373e-05, "loss": 2.141002655029297, "memory(GiB)": 77.56, "step": 79355, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.437112 }, { "epoch": 3.40002570583951, "grad_norm": 5.365269660949707, "learning_rate": 2.320797839830686e-05, "loss": 2.263966751098633, "memory(GiB)": 77.56, "step": 79360, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.437118 }, { "epoch": 3.400239921168759, "grad_norm": 6.08232307434082, "learning_rate": 2.3202296563932607e-05, "loss": 2.504233169555664, "memory(GiB)": 77.56, "step": 79365, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.437114 }, { "epoch": 3.400454136498008, "grad_norm": 5.069942474365234, "learning_rate": 2.3196615215023886e-05, "loss": 2.228591728210449, "memory(GiB)": 77.56, "step": 79370, "token_acc": 0.5189873417721519, "train_speed(iter/s)": 1.437104 }, { "epoch": 3.4006683518272567, "grad_norm": 6.968050479888916, "learning_rate": 2.3190934351683602e-05, "loss": 2.438836860656738, "memory(GiB)": 77.56, "step": 79375, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.437123 }, { "epoch": 3.400882567156506, "grad_norm": 4.4127044677734375, "learning_rate": 2.3185253974014714e-05, "loss": 2.0383148193359375, "memory(GiB)": 77.56, "step": 79380, "token_acc": 0.5190311418685121, "train_speed(iter/s)": 1.437108 }, { "epoch": 3.4010967824857548, "grad_norm": 5.228052616119385, "learning_rate": 2.3179574082120105e-05, "loss": 2.2109039306640623, "memory(GiB)": 77.56, "step": 79385, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.437118 }, { "epoch": 3.4013109978150036, "grad_norm": 6.189214706420898, "learning_rate": 2.317389467610267e-05, "loss": 2.3826671600341798, "memory(GiB)": 77.56, "step": 79390, "token_acc": 0.5381818181818182, "train_speed(iter/s)": 1.437106 }, { "epoch": 3.401525213144253, "grad_norm": 6.784708499908447, "learning_rate": 2.3168215756065292e-05, "loss": 2.701226806640625, "memory(GiB)": 77.56, "step": 79395, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.437102 }, { "epoch": 3.4017394284735016, "grad_norm": 6.745068550109863, "learning_rate": 2.3162537322110843e-05, "loss": 2.35314884185791, "memory(GiB)": 77.56, "step": 79400, "token_acc": 0.47249190938511326, "train_speed(iter/s)": 1.437119 }, { "epoch": 3.4019536438027504, "grad_norm": 7.3597731590271, "learning_rate": 2.3156859374342226e-05, "loss": 2.168228530883789, "memory(GiB)": 77.56, "step": 79405, "token_acc": 0.5244755244755245, "train_speed(iter/s)": 1.437104 }, { "epoch": 3.4021678591319997, "grad_norm": 8.19350528717041, "learning_rate": 2.315118191286228e-05, "loss": 2.142755126953125, "memory(GiB)": 77.56, "step": 79410, "token_acc": 0.5347985347985348, "train_speed(iter/s)": 1.437112 }, { "epoch": 3.4023820744612485, "grad_norm": 5.765941143035889, "learning_rate": 2.314550493777386e-05, "loss": 2.490132713317871, "memory(GiB)": 77.56, "step": 79415, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.437096 }, { "epoch": 3.4025962897904973, "grad_norm": 4.746912479400635, "learning_rate": 2.313982844917979e-05, "loss": 2.188548278808594, "memory(GiB)": 77.56, "step": 79420, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.437104 }, { "epoch": 3.4028105051197466, "grad_norm": 4.999203205108643, "learning_rate": 2.3134152447182945e-05, "loss": 2.3510326385498046, "memory(GiB)": 77.56, "step": 79425, "token_acc": 0.5266666666666666, "train_speed(iter/s)": 1.437115 }, { "epoch": 3.4030247204489954, "grad_norm": 7.057901382446289, "learning_rate": 2.3128476931886128e-05, "loss": 2.490496826171875, "memory(GiB)": 77.56, "step": 79430, "token_acc": 0.518796992481203, "train_speed(iter/s)": 1.43713 }, { "epoch": 3.403238935778244, "grad_norm": 5.6993584632873535, "learning_rate": 2.3122801903392145e-05, "loss": 2.242514419555664, "memory(GiB)": 77.56, "step": 79435, "token_acc": 0.5207547169811321, "train_speed(iter/s)": 1.437107 }, { "epoch": 3.4034531511074935, "grad_norm": 6.187243938446045, "learning_rate": 2.311712736180383e-05, "loss": 2.408182907104492, "memory(GiB)": 77.56, "step": 79440, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.437102 }, { "epoch": 3.4036673664367423, "grad_norm": 4.937978744506836, "learning_rate": 2.3111453307223978e-05, "loss": 2.5719539642333986, "memory(GiB)": 77.56, "step": 79445, "token_acc": 0.46258503401360546, "train_speed(iter/s)": 1.437118 }, { "epoch": 3.403881581765991, "grad_norm": 4.627047538757324, "learning_rate": 2.3105779739755368e-05, "loss": 2.1482330322265626, "memory(GiB)": 77.56, "step": 79450, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.437121 }, { "epoch": 3.4040957970952403, "grad_norm": 5.4321794509887695, "learning_rate": 2.3100106659500794e-05, "loss": 2.273075485229492, "memory(GiB)": 77.56, "step": 79455, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.437112 }, { "epoch": 3.404310012424489, "grad_norm": 6.22972297668457, "learning_rate": 2.3094434066562993e-05, "loss": 2.2309507369995116, "memory(GiB)": 77.56, "step": 79460, "token_acc": 0.516728624535316, "train_speed(iter/s)": 1.437139 }, { "epoch": 3.404524227753738, "grad_norm": 7.1927103996276855, "learning_rate": 2.3088761961044786e-05, "loss": 2.387375259399414, "memory(GiB)": 77.56, "step": 79465, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.437133 }, { "epoch": 3.4047384430829872, "grad_norm": 8.879721641540527, "learning_rate": 2.3083090343048898e-05, "loss": 2.5375238418579102, "memory(GiB)": 77.56, "step": 79470, "token_acc": 0.4416058394160584, "train_speed(iter/s)": 1.437136 }, { "epoch": 3.404952658412236, "grad_norm": 6.653095245361328, "learning_rate": 2.3077419212678076e-05, "loss": 2.424617958068848, "memory(GiB)": 77.56, "step": 79475, "token_acc": 0.5133531157270029, "train_speed(iter/s)": 1.437155 }, { "epoch": 3.405166873741485, "grad_norm": 7.53515100479126, "learning_rate": 2.3071748570035063e-05, "loss": 2.191908264160156, "memory(GiB)": 77.56, "step": 79480, "token_acc": 0.5507246376811594, "train_speed(iter/s)": 1.437145 }, { "epoch": 3.405381089070734, "grad_norm": 6.5444159507751465, "learning_rate": 2.3066078415222563e-05, "loss": 2.1773199081420898, "memory(GiB)": 77.56, "step": 79485, "token_acc": 0.5096153846153846, "train_speed(iter/s)": 1.437137 }, { "epoch": 3.405595304399983, "grad_norm": 5.0450825691223145, "learning_rate": 2.306040874834334e-05, "loss": 2.3375301361083984, "memory(GiB)": 77.56, "step": 79490, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.43714 }, { "epoch": 3.4058095197292317, "grad_norm": 5.600021839141846, "learning_rate": 2.3054739569500066e-05, "loss": 2.074955368041992, "memory(GiB)": 77.56, "step": 79495, "token_acc": 0.5306859205776173, "train_speed(iter/s)": 1.437149 }, { "epoch": 3.406023735058481, "grad_norm": 8.0911283493042, "learning_rate": 2.3049070878795477e-05, "loss": 2.452461051940918, "memory(GiB)": 77.56, "step": 79500, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.437167 }, { "epoch": 3.406023735058481, "eval_loss": 2.179293155670166, "eval_runtime": 14.297, "eval_samples_per_second": 6.994, "eval_steps_per_second": 6.994, "eval_token_acc": 0.48917748917748916, "step": 79500 }, { "epoch": 3.40623795038773, "grad_norm": 6.782418727874756, "learning_rate": 2.3043402676332253e-05, "loss": 2.328179359436035, "memory(GiB)": 77.56, "step": 79505, "token_acc": 0.4969262295081967, "train_speed(iter/s)": 1.436782 }, { "epoch": 3.4064521657169786, "grad_norm": 5.8732075691223145, "learning_rate": 2.303773496221308e-05, "loss": 2.4902297973632814, "memory(GiB)": 77.56, "step": 79510, "token_acc": 0.45962732919254656, "train_speed(iter/s)": 1.436782 }, { "epoch": 3.406666381046228, "grad_norm": 5.070359706878662, "learning_rate": 2.3032067736540626e-05, "loss": 2.1516193389892577, "memory(GiB)": 77.56, "step": 79515, "token_acc": 0.5083056478405316, "train_speed(iter/s)": 1.436751 }, { "epoch": 3.4068805963754767, "grad_norm": 5.498624324798584, "learning_rate": 2.3026400999417546e-05, "loss": 2.2944265365600587, "memory(GiB)": 77.56, "step": 79520, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.436765 }, { "epoch": 3.4070948117047255, "grad_norm": 5.4773640632629395, "learning_rate": 2.3020734750946534e-05, "loss": 2.510694122314453, "memory(GiB)": 77.56, "step": 79525, "token_acc": 0.5030674846625767, "train_speed(iter/s)": 1.436785 }, { "epoch": 3.4073090270339748, "grad_norm": 6.929642677307129, "learning_rate": 2.3015068991230222e-05, "loss": 2.2454782485961915, "memory(GiB)": 77.56, "step": 79530, "token_acc": 0.5255972696245734, "train_speed(iter/s)": 1.436807 }, { "epoch": 3.4075232423632236, "grad_norm": 5.425712585449219, "learning_rate": 2.3009403720371247e-05, "loss": 2.5280765533447265, "memory(GiB)": 77.56, "step": 79535, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.436816 }, { "epoch": 3.4077374576924724, "grad_norm": 5.826126575469971, "learning_rate": 2.300373893847224e-05, "loss": 2.5497669219970702, "memory(GiB)": 77.56, "step": 79540, "token_acc": 0.4555160142348754, "train_speed(iter/s)": 1.436823 }, { "epoch": 3.4079516730217216, "grad_norm": 5.537844657897949, "learning_rate": 2.2998074645635815e-05, "loss": 2.3873971939086913, "memory(GiB)": 77.56, "step": 79545, "token_acc": 0.49702380952380953, "train_speed(iter/s)": 1.436826 }, { "epoch": 3.4081658883509705, "grad_norm": 7.128018379211426, "learning_rate": 2.299241084196461e-05, "loss": 2.3843105316162108, "memory(GiB)": 77.56, "step": 79550, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.436825 }, { "epoch": 3.4083801036802193, "grad_norm": 5.012312412261963, "learning_rate": 2.2986747527561227e-05, "loss": 2.2672367095947266, "memory(GiB)": 77.56, "step": 79555, "token_acc": 0.5396825396825397, "train_speed(iter/s)": 1.43683 }, { "epoch": 3.4085943190094685, "grad_norm": 7.558150768280029, "learning_rate": 2.2981084702528244e-05, "loss": 2.379343032836914, "memory(GiB)": 77.56, "step": 79560, "token_acc": 0.5182724252491694, "train_speed(iter/s)": 1.436853 }, { "epoch": 3.4088085343387173, "grad_norm": 5.734176158905029, "learning_rate": 2.297542236696824e-05, "loss": 2.2569149017333983, "memory(GiB)": 77.56, "step": 79565, "token_acc": 0.5698529411764706, "train_speed(iter/s)": 1.436871 }, { "epoch": 3.409022749667966, "grad_norm": 5.442965030670166, "learning_rate": 2.296976052098383e-05, "loss": 2.3304931640625, "memory(GiB)": 77.56, "step": 79570, "token_acc": 0.5276872964169381, "train_speed(iter/s)": 1.436894 }, { "epoch": 3.4092369649972154, "grad_norm": 5.321924686431885, "learning_rate": 2.2964099164677567e-05, "loss": 2.5034063339233397, "memory(GiB)": 77.56, "step": 79575, "token_acc": 0.49221183800623053, "train_speed(iter/s)": 1.436901 }, { "epoch": 3.409451180326464, "grad_norm": 8.341198921203613, "learning_rate": 2.295843829815199e-05, "loss": 2.216433525085449, "memory(GiB)": 77.56, "step": 79580, "token_acc": 0.463519313304721, "train_speed(iter/s)": 1.436907 }, { "epoch": 3.409665395655713, "grad_norm": 5.1558942794799805, "learning_rate": 2.295277792150969e-05, "loss": 2.4018110275268554, "memory(GiB)": 77.56, "step": 79585, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.436889 }, { "epoch": 3.4098796109849623, "grad_norm": 6.371580600738525, "learning_rate": 2.294711803485319e-05, "loss": 2.2421146392822267, "memory(GiB)": 77.56, "step": 79590, "token_acc": 0.5160256410256411, "train_speed(iter/s)": 1.436895 }, { "epoch": 3.410093826314211, "grad_norm": 5.7849602699279785, "learning_rate": 2.2941458638285017e-05, "loss": 2.2049732208251953, "memory(GiB)": 77.56, "step": 79595, "token_acc": 0.5140562248995983, "train_speed(iter/s)": 1.436914 }, { "epoch": 3.41030804164346, "grad_norm": 4.941309452056885, "learning_rate": 2.2935799731907707e-05, "loss": 2.064447593688965, "memory(GiB)": 77.56, "step": 79600, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.436922 }, { "epoch": 3.410522256972709, "grad_norm": 6.720141410827637, "learning_rate": 2.293014131582376e-05, "loss": 2.2994516372680662, "memory(GiB)": 77.56, "step": 79605, "token_acc": 0.5036231884057971, "train_speed(iter/s)": 1.436932 }, { "epoch": 3.410736472301958, "grad_norm": 5.530195236206055, "learning_rate": 2.2924483390135716e-05, "loss": 2.2740102767944337, "memory(GiB)": 77.56, "step": 79610, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.436961 }, { "epoch": 3.410950687631207, "grad_norm": 5.3113203048706055, "learning_rate": 2.291882595494605e-05, "loss": 2.197417449951172, "memory(GiB)": 77.56, "step": 79615, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.436943 }, { "epoch": 3.411164902960456, "grad_norm": 7.109493255615234, "learning_rate": 2.2913169010357256e-05, "loss": 2.252703857421875, "memory(GiB)": 77.56, "step": 79620, "token_acc": 0.5338345864661654, "train_speed(iter/s)": 1.436948 }, { "epoch": 3.411379118289705, "grad_norm": 8.050132751464844, "learning_rate": 2.2907512556471817e-05, "loss": 2.3119527816772463, "memory(GiB)": 77.56, "step": 79625, "token_acc": 0.4745762711864407, "train_speed(iter/s)": 1.436946 }, { "epoch": 3.4115933336189537, "grad_norm": 8.109003067016602, "learning_rate": 2.290185659339218e-05, "loss": 2.50976505279541, "memory(GiB)": 77.56, "step": 79630, "token_acc": 0.44805194805194803, "train_speed(iter/s)": 1.436938 }, { "epoch": 3.411807548948203, "grad_norm": 6.99859094619751, "learning_rate": 2.2896201121220856e-05, "loss": 2.250266456604004, "memory(GiB)": 77.56, "step": 79635, "token_acc": 0.5234657039711191, "train_speed(iter/s)": 1.436954 }, { "epoch": 3.4120217642774517, "grad_norm": 4.7064313888549805, "learning_rate": 2.289054614006025e-05, "loss": 2.3661777496337892, "memory(GiB)": 77.56, "step": 79640, "token_acc": 0.493006993006993, "train_speed(iter/s)": 1.436971 }, { "epoch": 3.4122359796067006, "grad_norm": 5.220667839050293, "learning_rate": 2.288489165001285e-05, "loss": 2.3214014053344725, "memory(GiB)": 77.56, "step": 79645, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.43699 }, { "epoch": 3.41245019493595, "grad_norm": 5.36407995223999, "learning_rate": 2.287923765118108e-05, "loss": 2.4953689575195312, "memory(GiB)": 77.56, "step": 79650, "token_acc": 0.5170068027210885, "train_speed(iter/s)": 1.437005 }, { "epoch": 3.4126644102651986, "grad_norm": 6.248805522918701, "learning_rate": 2.2873584143667352e-05, "loss": 2.323070526123047, "memory(GiB)": 77.56, "step": 79655, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.437015 }, { "epoch": 3.4128786255944474, "grad_norm": 5.491689682006836, "learning_rate": 2.28679311275741e-05, "loss": 2.6055953979492186, "memory(GiB)": 77.56, "step": 79660, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.436999 }, { "epoch": 3.4130928409236967, "grad_norm": 5.359642505645752, "learning_rate": 2.2862278603003707e-05, "loss": 2.322352409362793, "memory(GiB)": 77.56, "step": 79665, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.436995 }, { "epoch": 3.4133070562529455, "grad_norm": 6.347226619720459, "learning_rate": 2.2856626570058613e-05, "loss": 2.268888473510742, "memory(GiB)": 77.56, "step": 79670, "token_acc": 0.5245398773006135, "train_speed(iter/s)": 1.436996 }, { "epoch": 3.4135212715821943, "grad_norm": 5.04331111907959, "learning_rate": 2.2850975028841194e-05, "loss": 2.4179004669189452, "memory(GiB)": 77.56, "step": 79675, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.437002 }, { "epoch": 3.4137354869114436, "grad_norm": 5.191434860229492, "learning_rate": 2.2845323979453832e-05, "loss": 1.9253528594970704, "memory(GiB)": 77.56, "step": 79680, "token_acc": 0.5468164794007491, "train_speed(iter/s)": 1.436997 }, { "epoch": 3.4139497022406924, "grad_norm": 4.421807289123535, "learning_rate": 2.2839673421998892e-05, "loss": 2.186411476135254, "memory(GiB)": 77.56, "step": 79685, "token_acc": 0.5451127819548872, "train_speed(iter/s)": 1.436997 }, { "epoch": 3.414163917569941, "grad_norm": 5.415095329284668, "learning_rate": 2.283402335657873e-05, "loss": 2.6603214263916017, "memory(GiB)": 77.56, "step": 79690, "token_acc": 0.44785276073619634, "train_speed(iter/s)": 1.436996 }, { "epoch": 3.4143781328991905, "grad_norm": 4.832653999328613, "learning_rate": 2.2828373783295743e-05, "loss": 2.298392486572266, "memory(GiB)": 77.56, "step": 79695, "token_acc": 0.54, "train_speed(iter/s)": 1.436996 }, { "epoch": 3.4145923482284393, "grad_norm": 5.813657283782959, "learning_rate": 2.2822724702252245e-05, "loss": 2.100205993652344, "memory(GiB)": 77.56, "step": 79700, "token_acc": 0.5138461538461538, "train_speed(iter/s)": 1.436989 }, { "epoch": 3.414806563557688, "grad_norm": 6.642099380493164, "learning_rate": 2.281707611355059e-05, "loss": 2.4816299438476563, "memory(GiB)": 77.56, "step": 79705, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.437006 }, { "epoch": 3.4150207788869373, "grad_norm": 6.974586486816406, "learning_rate": 2.2811428017293096e-05, "loss": 2.395670509338379, "memory(GiB)": 77.56, "step": 79710, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.437018 }, { "epoch": 3.415234994216186, "grad_norm": 8.6386137008667, "learning_rate": 2.2805780413582075e-05, "loss": 2.3550479888916014, "memory(GiB)": 77.56, "step": 79715, "token_acc": 0.4744525547445255, "train_speed(iter/s)": 1.437016 }, { "epoch": 3.415449209545435, "grad_norm": 7.0342841148376465, "learning_rate": 2.2800133302519866e-05, "loss": 2.405625915527344, "memory(GiB)": 77.56, "step": 79720, "token_acc": 0.48466257668711654, "train_speed(iter/s)": 1.437036 }, { "epoch": 3.415663424874684, "grad_norm": 5.447117328643799, "learning_rate": 2.2794486684208744e-05, "loss": 2.42040901184082, "memory(GiB)": 77.56, "step": 79725, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 1.43703 }, { "epoch": 3.415877640203933, "grad_norm": 6.344385147094727, "learning_rate": 2.278884055875103e-05, "loss": 2.3380762100219727, "memory(GiB)": 77.56, "step": 79730, "token_acc": 0.47717842323651455, "train_speed(iter/s)": 1.437041 }, { "epoch": 3.416091855533182, "grad_norm": 6.731952667236328, "learning_rate": 2.2783194926248996e-05, "loss": 2.247830390930176, "memory(GiB)": 77.56, "step": 79735, "token_acc": 0.5350553505535055, "train_speed(iter/s)": 1.437059 }, { "epoch": 3.416306070862431, "grad_norm": 5.4700493812561035, "learning_rate": 2.2777549786804918e-05, "loss": 2.7882902145385744, "memory(GiB)": 77.56, "step": 79740, "token_acc": 0.43450479233226835, "train_speed(iter/s)": 1.437076 }, { "epoch": 3.41652028619168, "grad_norm": 6.6195197105407715, "learning_rate": 2.2771905140521067e-05, "loss": 2.3725833892822266, "memory(GiB)": 77.56, "step": 79745, "token_acc": 0.49146757679180886, "train_speed(iter/s)": 1.437039 }, { "epoch": 3.4167345015209287, "grad_norm": 5.307271957397461, "learning_rate": 2.2766260987499677e-05, "loss": 2.4947845458984377, "memory(GiB)": 77.56, "step": 79750, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.437064 }, { "epoch": 3.416948716850178, "grad_norm": 5.474647521972656, "learning_rate": 2.276061732784303e-05, "loss": 2.5116575241088865, "memory(GiB)": 77.56, "step": 79755, "token_acc": 0.4968152866242038, "train_speed(iter/s)": 1.437062 }, { "epoch": 3.417162932179427, "grad_norm": 5.101426124572754, "learning_rate": 2.275497416165335e-05, "loss": 1.9923717498779296, "memory(GiB)": 77.56, "step": 79760, "token_acc": 0.5537459283387622, "train_speed(iter/s)": 1.437074 }, { "epoch": 3.4173771475086756, "grad_norm": 5.685215950012207, "learning_rate": 2.2749331489032884e-05, "loss": 2.2149404525756835, "memory(GiB)": 77.56, "step": 79765, "token_acc": 0.5373665480427047, "train_speed(iter/s)": 1.437083 }, { "epoch": 3.417591362837925, "grad_norm": 6.465989589691162, "learning_rate": 2.274368931008383e-05, "loss": 2.424448776245117, "memory(GiB)": 77.56, "step": 79770, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437097 }, { "epoch": 3.4178055781671737, "grad_norm": 4.465653896331787, "learning_rate": 2.27380476249084e-05, "loss": 2.2471393585205077, "memory(GiB)": 77.56, "step": 79775, "token_acc": 0.495114006514658, "train_speed(iter/s)": 1.437094 }, { "epoch": 3.4180197934964225, "grad_norm": 5.666294574737549, "learning_rate": 2.2732406433608826e-05, "loss": 2.379106330871582, "memory(GiB)": 77.56, "step": 79780, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.437108 }, { "epoch": 3.4182340088256717, "grad_norm": 6.225849151611328, "learning_rate": 2.2726765736287287e-05, "loss": 2.2286605834960938, "memory(GiB)": 77.56, "step": 79785, "token_acc": 0.5211267605633803, "train_speed(iter/s)": 1.437111 }, { "epoch": 3.4184482241549206, "grad_norm": 4.352685928344727, "learning_rate": 2.2721125533045955e-05, "loss": 2.1221166610717774, "memory(GiB)": 77.56, "step": 79790, "token_acc": 0.5433962264150943, "train_speed(iter/s)": 1.437113 }, { "epoch": 3.4186624394841694, "grad_norm": 7.090567588806152, "learning_rate": 2.2715485823987043e-05, "loss": 2.541061210632324, "memory(GiB)": 77.56, "step": 79795, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.437096 }, { "epoch": 3.4188766548134186, "grad_norm": 5.3690338134765625, "learning_rate": 2.27098466092127e-05, "loss": 2.6047725677490234, "memory(GiB)": 77.56, "step": 79800, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.437104 }, { "epoch": 3.4190908701426674, "grad_norm": 6.505526542663574, "learning_rate": 2.2704207888825086e-05, "loss": 2.4545711517333983, "memory(GiB)": 77.56, "step": 79805, "token_acc": 0.48632218844984804, "train_speed(iter/s)": 1.437108 }, { "epoch": 3.4193050854719163, "grad_norm": 7.470578193664551, "learning_rate": 2.2698569662926333e-05, "loss": 2.25128231048584, "memory(GiB)": 77.56, "step": 79810, "token_acc": 0.5, "train_speed(iter/s)": 1.437119 }, { "epoch": 3.4195193008011655, "grad_norm": 6.0045976638793945, "learning_rate": 2.2692931931618622e-05, "loss": 2.173305320739746, "memory(GiB)": 77.56, "step": 79815, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.437133 }, { "epoch": 3.4197335161304143, "grad_norm": 5.753742694854736, "learning_rate": 2.2687294695004058e-05, "loss": 2.374880790710449, "memory(GiB)": 77.56, "step": 79820, "token_acc": 0.5052264808362369, "train_speed(iter/s)": 1.437153 }, { "epoch": 3.419947731459663, "grad_norm": 4.839774131774902, "learning_rate": 2.2681657953184775e-05, "loss": 2.262137031555176, "memory(GiB)": 77.56, "step": 79825, "token_acc": 0.555921052631579, "train_speed(iter/s)": 1.437157 }, { "epoch": 3.4201619467889124, "grad_norm": 5.287930011749268, "learning_rate": 2.267602170626289e-05, "loss": 2.496467399597168, "memory(GiB)": 77.56, "step": 79830, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.437156 }, { "epoch": 3.420376162118161, "grad_norm": 5.560831546783447, "learning_rate": 2.267038595434048e-05, "loss": 2.4401559829711914, "memory(GiB)": 77.56, "step": 79835, "token_acc": 0.5054151624548736, "train_speed(iter/s)": 1.437178 }, { "epoch": 3.42059037744741, "grad_norm": 6.871474742889404, "learning_rate": 2.266475069751969e-05, "loss": 2.202611541748047, "memory(GiB)": 77.56, "step": 79840, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.437173 }, { "epoch": 3.4208045927766593, "grad_norm": 5.315938472747803, "learning_rate": 2.2659115935902576e-05, "loss": 2.588887023925781, "memory(GiB)": 77.56, "step": 79845, "token_acc": 0.46953405017921146, "train_speed(iter/s)": 1.437177 }, { "epoch": 3.421018808105908, "grad_norm": 4.877957820892334, "learning_rate": 2.265348166959123e-05, "loss": 2.452315902709961, "memory(GiB)": 77.56, "step": 79850, "token_acc": 0.4707792207792208, "train_speed(iter/s)": 1.43718 }, { "epoch": 3.421233023435157, "grad_norm": 11.038566589355469, "learning_rate": 2.2647847898687712e-05, "loss": 2.247224044799805, "memory(GiB)": 77.56, "step": 79855, "token_acc": 0.5394190871369294, "train_speed(iter/s)": 1.437195 }, { "epoch": 3.421447238764406, "grad_norm": 7.7552876472473145, "learning_rate": 2.2642214623294073e-05, "loss": 2.363880157470703, "memory(GiB)": 77.56, "step": 79860, "token_acc": 0.5379061371841155, "train_speed(iter/s)": 1.437168 }, { "epoch": 3.421661454093655, "grad_norm": 5.875951766967773, "learning_rate": 2.2636581843512378e-05, "loss": 2.306252670288086, "memory(GiB)": 77.56, "step": 79865, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.43715 }, { "epoch": 3.4218756694229038, "grad_norm": 6.256127834320068, "learning_rate": 2.2630949559444693e-05, "loss": 2.12294864654541, "memory(GiB)": 77.56, "step": 79870, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.437175 }, { "epoch": 3.422089884752153, "grad_norm": 7.295079231262207, "learning_rate": 2.2625317771193027e-05, "loss": 2.479261779785156, "memory(GiB)": 77.56, "step": 79875, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.437183 }, { "epoch": 3.422304100081402, "grad_norm": 5.242221832275391, "learning_rate": 2.2619686478859416e-05, "loss": 2.3235212326049806, "memory(GiB)": 77.56, "step": 79880, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.437197 }, { "epoch": 3.4225183154106507, "grad_norm": 4.939266204833984, "learning_rate": 2.261405568254586e-05, "loss": 2.5372453689575196, "memory(GiB)": 77.56, "step": 79885, "token_acc": 0.48757763975155277, "train_speed(iter/s)": 1.437208 }, { "epoch": 3.4227325307399, "grad_norm": 5.698630332946777, "learning_rate": 2.2608425382354383e-05, "loss": 2.1325597763061523, "memory(GiB)": 77.56, "step": 79890, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.437199 }, { "epoch": 3.4229467460691487, "grad_norm": 4.5413360595703125, "learning_rate": 2.2602795578386954e-05, "loss": 2.3484495162963865, "memory(GiB)": 77.56, "step": 79895, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.437209 }, { "epoch": 3.4231609613983975, "grad_norm": 8.1338529586792, "learning_rate": 2.2597166270745603e-05, "loss": 2.586288642883301, "memory(GiB)": 77.56, "step": 79900, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437214 }, { "epoch": 3.423375176727647, "grad_norm": 7.2224507331848145, "learning_rate": 2.2591537459532287e-05, "loss": 2.2072099685668944, "memory(GiB)": 77.56, "step": 79905, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.437185 }, { "epoch": 3.4235893920568956, "grad_norm": 5.806991100311279, "learning_rate": 2.258590914484898e-05, "loss": 2.345249557495117, "memory(GiB)": 77.56, "step": 79910, "token_acc": 0.47041420118343197, "train_speed(iter/s)": 1.437195 }, { "epoch": 3.4238036073861444, "grad_norm": 6.364602088928223, "learning_rate": 2.2580281326797653e-05, "loss": 2.1221460342407226, "memory(GiB)": 77.56, "step": 79915, "token_acc": 0.5363321799307958, "train_speed(iter/s)": 1.437202 }, { "epoch": 3.4240178227153937, "grad_norm": 7.039035797119141, "learning_rate": 2.2574654005480232e-05, "loss": 2.511148452758789, "memory(GiB)": 77.56, "step": 79920, "token_acc": 0.4774011299435028, "train_speed(iter/s)": 1.437194 }, { "epoch": 3.4242320380446425, "grad_norm": 6.606437683105469, "learning_rate": 2.2569027180998697e-05, "loss": 2.268423652648926, "memory(GiB)": 77.56, "step": 79925, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.437214 }, { "epoch": 3.4244462533738913, "grad_norm": 5.700983047485352, "learning_rate": 2.2563400853454963e-05, "loss": 2.458114433288574, "memory(GiB)": 77.56, "step": 79930, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.4246604687031406, "grad_norm": 6.004363059997559, "learning_rate": 2.2557775022950948e-05, "loss": 2.214258575439453, "memory(GiB)": 77.56, "step": 79935, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.437238 }, { "epoch": 3.4248746840323894, "grad_norm": 5.716898441314697, "learning_rate": 2.25521496895886e-05, "loss": 2.24503173828125, "memory(GiB)": 77.56, "step": 79940, "token_acc": 0.5480769230769231, "train_speed(iter/s)": 1.437258 }, { "epoch": 3.425088899361638, "grad_norm": 5.839542865753174, "learning_rate": 2.2546524853469802e-05, "loss": 2.3188934326171875, "memory(GiB)": 77.56, "step": 79945, "token_acc": 0.4789156626506024, "train_speed(iter/s)": 1.437272 }, { "epoch": 3.4253031146908874, "grad_norm": 5.682053089141846, "learning_rate": 2.2540900514696466e-05, "loss": 2.1888347625732423, "memory(GiB)": 77.56, "step": 79950, "token_acc": 0.5428571428571428, "train_speed(iter/s)": 1.437294 }, { "epoch": 3.4255173300201363, "grad_norm": 6.361555576324463, "learning_rate": 2.2535276673370453e-05, "loss": 2.4863655090332033, "memory(GiB)": 77.56, "step": 79955, "token_acc": 0.5179153094462541, "train_speed(iter/s)": 1.437287 }, { "epoch": 3.425731545349385, "grad_norm": 6.168817043304443, "learning_rate": 2.2529653329593686e-05, "loss": 2.3084014892578124, "memory(GiB)": 77.56, "step": 79960, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.437265 }, { "epoch": 3.4259457606786343, "grad_norm": 6.668516635894775, "learning_rate": 2.2524030483468024e-05, "loss": 2.289501953125, "memory(GiB)": 77.56, "step": 79965, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.437282 }, { "epoch": 3.426159976007883, "grad_norm": 7.366813659667969, "learning_rate": 2.251840813509532e-05, "loss": 2.098384666442871, "memory(GiB)": 77.56, "step": 79970, "token_acc": 0.5466237942122186, "train_speed(iter/s)": 1.437301 }, { "epoch": 3.426374191337132, "grad_norm": 7.163089752197266, "learning_rate": 2.251278628457743e-05, "loss": 2.420930862426758, "memory(GiB)": 77.56, "step": 79975, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.437312 }, { "epoch": 3.426588406666381, "grad_norm": 8.034339904785156, "learning_rate": 2.2507164932016188e-05, "loss": 2.158621406555176, "memory(GiB)": 77.56, "step": 79980, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 1.437328 }, { "epoch": 3.42680262199563, "grad_norm": 6.220357418060303, "learning_rate": 2.250154407751346e-05, "loss": 2.057103157043457, "memory(GiB)": 77.56, "step": 79985, "token_acc": 0.5382059800664452, "train_speed(iter/s)": 1.43731 }, { "epoch": 3.427016837324879, "grad_norm": 5.75319766998291, "learning_rate": 2.2495923721171053e-05, "loss": 2.724925994873047, "memory(GiB)": 77.56, "step": 79990, "token_acc": 0.450920245398773, "train_speed(iter/s)": 1.437317 }, { "epoch": 3.427231052654128, "grad_norm": 5.656720161437988, "learning_rate": 2.2490303863090793e-05, "loss": 2.4372371673583983, "memory(GiB)": 77.56, "step": 79995, "token_acc": 0.49107142857142855, "train_speed(iter/s)": 1.437294 }, { "epoch": 3.427445267983377, "grad_norm": 6.461557865142822, "learning_rate": 2.2484684503374487e-05, "loss": 2.1339502334594727, "memory(GiB)": 77.56, "step": 80000, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.437292 }, { "epoch": 3.427445267983377, "eval_loss": 2.102041244506836, "eval_runtime": 14.3997, "eval_samples_per_second": 6.945, "eval_steps_per_second": 6.945, "eval_token_acc": 0.4992679355783309, "step": 80000 }, { "epoch": 3.4276594833126257, "grad_norm": 4.7783098220825195, "learning_rate": 2.2479065642123908e-05, "loss": 2.2108074188232423, "memory(GiB)": 77.56, "step": 80005, "token_acc": 0.4952978056426332, "train_speed(iter/s)": 1.436891 }, { "epoch": 3.427873698641875, "grad_norm": 7.851231098175049, "learning_rate": 2.2473447279440875e-05, "loss": 2.437786865234375, "memory(GiB)": 77.56, "step": 80010, "token_acc": 0.5163934426229508, "train_speed(iter/s)": 1.436896 }, { "epoch": 3.428087913971124, "grad_norm": 5.642972946166992, "learning_rate": 2.246782941542718e-05, "loss": 2.296259880065918, "memory(GiB)": 77.56, "step": 80015, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.436911 }, { "epoch": 3.4283021293003726, "grad_norm": 7.360104560852051, "learning_rate": 2.2462212050184594e-05, "loss": 2.5477380752563477, "memory(GiB)": 77.56, "step": 80020, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.436923 }, { "epoch": 3.428516344629622, "grad_norm": 5.5510711669921875, "learning_rate": 2.245659518381486e-05, "loss": 2.2331222534179687, "memory(GiB)": 77.56, "step": 80025, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.43693 }, { "epoch": 3.4287305599588707, "grad_norm": 5.894035816192627, "learning_rate": 2.2450978816419742e-05, "loss": 2.2158954620361326, "memory(GiB)": 77.56, "step": 80030, "token_acc": 0.5469798657718121, "train_speed(iter/s)": 1.436942 }, { "epoch": 3.4289447752881195, "grad_norm": 10.432945251464844, "learning_rate": 2.244536294810099e-05, "loss": 2.4966503143310548, "memory(GiB)": 77.56, "step": 80035, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.436935 }, { "epoch": 3.4291589906173687, "grad_norm": 7.354784965515137, "learning_rate": 2.2439747578960318e-05, "loss": 2.4185523986816406, "memory(GiB)": 77.56, "step": 80040, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.436946 }, { "epoch": 3.4293732059466175, "grad_norm": 10.520761489868164, "learning_rate": 2.2434132709099487e-05, "loss": 2.436611557006836, "memory(GiB)": 77.56, "step": 80045, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.436966 }, { "epoch": 3.4295874212758664, "grad_norm": 5.536417484283447, "learning_rate": 2.24285183386202e-05, "loss": 2.587092399597168, "memory(GiB)": 77.56, "step": 80050, "token_acc": 0.4657534246575342, "train_speed(iter/s)": 1.436962 }, { "epoch": 3.4298016366051156, "grad_norm": 5.768876552581787, "learning_rate": 2.2422904467624172e-05, "loss": 2.2307018280029296, "memory(GiB)": 77.56, "step": 80055, "token_acc": 0.5201342281879194, "train_speed(iter/s)": 1.436981 }, { "epoch": 3.4300158519343644, "grad_norm": 9.289017677307129, "learning_rate": 2.24172910962131e-05, "loss": 2.284924125671387, "memory(GiB)": 77.56, "step": 80060, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.436978 }, { "epoch": 3.4302300672636132, "grad_norm": 7.217469692230225, "learning_rate": 2.241167822448865e-05, "loss": 2.5033828735351564, "memory(GiB)": 77.56, "step": 80065, "token_acc": 0.515748031496063, "train_speed(iter/s)": 1.436983 }, { "epoch": 3.4304442825928625, "grad_norm": 5.486662864685059, "learning_rate": 2.2406065852552556e-05, "loss": 2.2092723846435547, "memory(GiB)": 77.56, "step": 80070, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 1.43698 }, { "epoch": 3.4306584979221113, "grad_norm": 5.81664514541626, "learning_rate": 2.240045398050646e-05, "loss": 2.5850425720214845, "memory(GiB)": 77.56, "step": 80075, "token_acc": 0.47720364741641336, "train_speed(iter/s)": 1.436981 }, { "epoch": 3.43087271325136, "grad_norm": 9.906838417053223, "learning_rate": 2.2394842608452026e-05, "loss": 2.4849132537841796, "memory(GiB)": 77.56, "step": 80080, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.436986 }, { "epoch": 3.4310869285806094, "grad_norm": 7.220390796661377, "learning_rate": 2.2389231736490896e-05, "loss": 2.329759216308594, "memory(GiB)": 77.56, "step": 80085, "token_acc": 0.5522388059701493, "train_speed(iter/s)": 1.436997 }, { "epoch": 3.431301143909858, "grad_norm": 8.10257625579834, "learning_rate": 2.2383621364724755e-05, "loss": 2.3808422088623047, "memory(GiB)": 77.56, "step": 80090, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.437012 }, { "epoch": 3.431515359239107, "grad_norm": 6.302786827087402, "learning_rate": 2.2378011493255225e-05, "loss": 2.5750173568725585, "memory(GiB)": 77.56, "step": 80095, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.437017 }, { "epoch": 3.4317295745683563, "grad_norm": 7.061655521392822, "learning_rate": 2.23724021221839e-05, "loss": 1.7369194030761719, "memory(GiB)": 77.56, "step": 80100, "token_acc": 0.6181818181818182, "train_speed(iter/s)": 1.437023 }, { "epoch": 3.431943789897605, "grad_norm": 4.812687873840332, "learning_rate": 2.236679325161245e-05, "loss": 2.0262184143066406, "memory(GiB)": 77.56, "step": 80105, "token_acc": 0.5303030303030303, "train_speed(iter/s)": 1.437031 }, { "epoch": 3.432158005226854, "grad_norm": 4.6677045822143555, "learning_rate": 2.2361184881642468e-05, "loss": 2.480453109741211, "memory(GiB)": 77.56, "step": 80110, "token_acc": 0.5033783783783784, "train_speed(iter/s)": 1.43703 }, { "epoch": 3.432372220556103, "grad_norm": 5.960814476013184, "learning_rate": 2.2355577012375538e-05, "loss": 2.2339080810546874, "memory(GiB)": 77.56, "step": 80115, "token_acc": 0.5304878048780488, "train_speed(iter/s)": 1.437044 }, { "epoch": 3.432586435885352, "grad_norm": 6.189528465270996, "learning_rate": 2.234996964391327e-05, "loss": 2.257598876953125, "memory(GiB)": 77.56, "step": 80120, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.437052 }, { "epoch": 3.4328006512146008, "grad_norm": 6.552147388458252, "learning_rate": 2.2344362776357218e-05, "loss": 2.324764633178711, "memory(GiB)": 77.56, "step": 80125, "token_acc": 0.5, "train_speed(iter/s)": 1.437069 }, { "epoch": 3.43301486654385, "grad_norm": 6.442381858825684, "learning_rate": 2.2338756409809004e-05, "loss": 2.1429094314575194, "memory(GiB)": 77.56, "step": 80130, "token_acc": 0.541095890410959, "train_speed(iter/s)": 1.437075 }, { "epoch": 3.433229081873099, "grad_norm": 6.968410491943359, "learning_rate": 2.2333150544370157e-05, "loss": 2.0460203170776365, "memory(GiB)": 77.56, "step": 80135, "token_acc": 0.5372549019607843, "train_speed(iter/s)": 1.437077 }, { "epoch": 3.4334432972023476, "grad_norm": 6.643887042999268, "learning_rate": 2.2327545180142247e-05, "loss": 2.3601781845092775, "memory(GiB)": 77.56, "step": 80140, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.43706 }, { "epoch": 3.433657512531597, "grad_norm": 6.591537952423096, "learning_rate": 2.232194031722681e-05, "loss": 2.4266279220581053, "memory(GiB)": 77.56, "step": 80145, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.437034 }, { "epoch": 3.4338717278608457, "grad_norm": 5.752573490142822, "learning_rate": 2.231633595572537e-05, "loss": 2.5533803939819335, "memory(GiB)": 77.56, "step": 80150, "token_acc": 0.5, "train_speed(iter/s)": 1.437053 }, { "epoch": 3.4340859431900945, "grad_norm": 5.829822063446045, "learning_rate": 2.2310732095739494e-05, "loss": 2.509723663330078, "memory(GiB)": 77.56, "step": 80155, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.437068 }, { "epoch": 3.434300158519344, "grad_norm": 5.743741035461426, "learning_rate": 2.2305128737370662e-05, "loss": 2.101865768432617, "memory(GiB)": 77.56, "step": 80160, "token_acc": 0.5474137931034483, "train_speed(iter/s)": 1.437026 }, { "epoch": 3.4345143738485926, "grad_norm": 6.74943208694458, "learning_rate": 2.2299525880720424e-05, "loss": 2.2783987045288088, "memory(GiB)": 77.56, "step": 80165, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 1.437047 }, { "epoch": 3.4347285891778414, "grad_norm": 5.304409027099609, "learning_rate": 2.2293923525890258e-05, "loss": 2.2710968017578126, "memory(GiB)": 77.56, "step": 80170, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.437051 }, { "epoch": 3.4349428045070907, "grad_norm": 5.887391567230225, "learning_rate": 2.2288321672981655e-05, "loss": 2.3437952041625976, "memory(GiB)": 77.56, "step": 80175, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 1.437029 }, { "epoch": 3.4351570198363395, "grad_norm": 7.42817497253418, "learning_rate": 2.22827203220961e-05, "loss": 2.3383115768432616, "memory(GiB)": 77.56, "step": 80180, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.437055 }, { "epoch": 3.4353712351655883, "grad_norm": 6.673346996307373, "learning_rate": 2.2277119473335058e-05, "loss": 2.538251495361328, "memory(GiB)": 77.56, "step": 80185, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.437071 }, { "epoch": 3.4355854504948375, "grad_norm": 5.708316326141357, "learning_rate": 2.2271519126800013e-05, "loss": 2.402574157714844, "memory(GiB)": 77.56, "step": 80190, "token_acc": 0.5234657039711191, "train_speed(iter/s)": 1.437039 }, { "epoch": 3.4357996658240864, "grad_norm": 4.448325157165527, "learning_rate": 2.2265919282592412e-05, "loss": 2.2469444274902344, "memory(GiB)": 77.56, "step": 80195, "token_acc": 0.5246913580246914, "train_speed(iter/s)": 1.437039 }, { "epoch": 3.436013881153335, "grad_norm": 6.499133586883545, "learning_rate": 2.22603199408137e-05, "loss": 2.289023590087891, "memory(GiB)": 77.56, "step": 80200, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 1.437061 }, { "epoch": 3.4362280964825844, "grad_norm": 7.010324001312256, "learning_rate": 2.225472110156531e-05, "loss": 2.2964408874511717, "memory(GiB)": 77.56, "step": 80205, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.437076 }, { "epoch": 3.4364423118118332, "grad_norm": 7.886505126953125, "learning_rate": 2.2249122764948665e-05, "loss": 2.2595388412475588, "memory(GiB)": 77.56, "step": 80210, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.437097 }, { "epoch": 3.436656527141082, "grad_norm": 6.966797351837158, "learning_rate": 2.2243524931065206e-05, "loss": 2.2966806411743166, "memory(GiB)": 77.56, "step": 80215, "token_acc": 0.5169230769230769, "train_speed(iter/s)": 1.437118 }, { "epoch": 3.4368707424703313, "grad_norm": 6.148560523986816, "learning_rate": 2.2237927600016334e-05, "loss": 2.2613367080688476, "memory(GiB)": 77.56, "step": 80220, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.437124 }, { "epoch": 3.43708495779958, "grad_norm": 5.912735462188721, "learning_rate": 2.223233077190344e-05, "loss": 2.331842041015625, "memory(GiB)": 77.56, "step": 80225, "token_acc": 0.47904191616766467, "train_speed(iter/s)": 1.437129 }, { "epoch": 3.437299173128829, "grad_norm": 7.128683567047119, "learning_rate": 2.222673444682793e-05, "loss": 2.6025352478027344, "memory(GiB)": 77.56, "step": 80230, "token_acc": 0.5039370078740157, "train_speed(iter/s)": 1.437135 }, { "epoch": 3.437513388458078, "grad_norm": 6.587675094604492, "learning_rate": 2.2221138624891158e-05, "loss": 2.3038351058959963, "memory(GiB)": 77.56, "step": 80235, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.437135 }, { "epoch": 3.437727603787327, "grad_norm": 7.692835330963135, "learning_rate": 2.2215543306194535e-05, "loss": 2.3964452743530273, "memory(GiB)": 77.56, "step": 80240, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.437139 }, { "epoch": 3.437941819116576, "grad_norm": 7.405257225036621, "learning_rate": 2.2209948490839388e-05, "loss": 2.551258087158203, "memory(GiB)": 77.56, "step": 80245, "token_acc": 0.5054151624548736, "train_speed(iter/s)": 1.437139 }, { "epoch": 3.438156034445825, "grad_norm": 12.880958557128906, "learning_rate": 2.2204354178927116e-05, "loss": 2.5628082275390627, "memory(GiB)": 77.56, "step": 80250, "token_acc": 0.47651006711409394, "train_speed(iter/s)": 1.437156 }, { "epoch": 3.438370249775074, "grad_norm": 4.969246864318848, "learning_rate": 2.2198760370559048e-05, "loss": 1.9327110290527343, "memory(GiB)": 77.56, "step": 80255, "token_acc": 0.5895196506550219, "train_speed(iter/s)": 1.437161 }, { "epoch": 3.4385844651043227, "grad_norm": 6.226701259613037, "learning_rate": 2.2193167065836506e-05, "loss": 2.1575859069824217, "memory(GiB)": 77.56, "step": 80260, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 1.437181 }, { "epoch": 3.438798680433572, "grad_norm": 5.530786514282227, "learning_rate": 2.218757426486083e-05, "loss": 2.1741479873657226, "memory(GiB)": 77.56, "step": 80265, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437188 }, { "epoch": 3.4390128957628208, "grad_norm": 6.205590724945068, "learning_rate": 2.2181981967733316e-05, "loss": 2.382988166809082, "memory(GiB)": 77.56, "step": 80270, "token_acc": 0.5, "train_speed(iter/s)": 1.437158 }, { "epoch": 3.4392271110920696, "grad_norm": 5.49648904800415, "learning_rate": 2.2176390174555316e-05, "loss": 2.3088052749633787, "memory(GiB)": 77.56, "step": 80275, "token_acc": 0.5141242937853108, "train_speed(iter/s)": 1.437171 }, { "epoch": 3.439441326421319, "grad_norm": 6.080382823944092, "learning_rate": 2.2170798885428113e-05, "loss": 2.34581298828125, "memory(GiB)": 77.56, "step": 80280, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.437187 }, { "epoch": 3.4396555417505676, "grad_norm": 5.663379192352295, "learning_rate": 2.2165208100452982e-05, "loss": 2.0927642822265624, "memory(GiB)": 77.56, "step": 80285, "token_acc": 0.5470383275261324, "train_speed(iter/s)": 1.437202 }, { "epoch": 3.4398697570798165, "grad_norm": 6.5682597160339355, "learning_rate": 2.2159617819731226e-05, "loss": 2.1643686294555664, "memory(GiB)": 77.56, "step": 80290, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 1.43722 }, { "epoch": 3.4400839724090657, "grad_norm": 5.731741428375244, "learning_rate": 2.215402804336409e-05, "loss": 2.268716239929199, "memory(GiB)": 77.56, "step": 80295, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.437241 }, { "epoch": 3.4402981877383145, "grad_norm": 6.32515811920166, "learning_rate": 2.214843877145288e-05, "loss": 2.220654296875, "memory(GiB)": 77.56, "step": 80300, "token_acc": 0.5042016806722689, "train_speed(iter/s)": 1.437243 }, { "epoch": 3.4405124030675633, "grad_norm": 7.128988742828369, "learning_rate": 2.2142850004098804e-05, "loss": 2.2158023834228517, "memory(GiB)": 77.56, "step": 80305, "token_acc": 0.5435684647302904, "train_speed(iter/s)": 1.437264 }, { "epoch": 3.4407266183968126, "grad_norm": 5.263505935668945, "learning_rate": 2.2137261741403154e-05, "loss": 2.6984487533569337, "memory(GiB)": 77.56, "step": 80310, "token_acc": 0.48324022346368717, "train_speed(iter/s)": 1.437282 }, { "epoch": 3.4409408337260614, "grad_norm": 5.217807769775391, "learning_rate": 2.213167398346714e-05, "loss": 2.5654275894165037, "memory(GiB)": 77.56, "step": 80315, "token_acc": 0.4356955380577428, "train_speed(iter/s)": 1.437275 }, { "epoch": 3.4411550490553102, "grad_norm": 5.2641706466674805, "learning_rate": 2.2126086730391997e-05, "loss": 2.4469430923461912, "memory(GiB)": 77.56, "step": 80320, "token_acc": 0.477124183006536, "train_speed(iter/s)": 1.437284 }, { "epoch": 3.4413692643845595, "grad_norm": 5.480915069580078, "learning_rate": 2.2120499982278942e-05, "loss": 2.5283790588378907, "memory(GiB)": 77.56, "step": 80325, "token_acc": 0.4373401534526854, "train_speed(iter/s)": 1.437297 }, { "epoch": 3.4415834797138083, "grad_norm": 6.123584270477295, "learning_rate": 2.2114913739229166e-05, "loss": 2.2348833084106445, "memory(GiB)": 77.56, "step": 80330, "token_acc": 0.5063694267515924, "train_speed(iter/s)": 1.437289 }, { "epoch": 3.441797695043057, "grad_norm": 6.341625690460205, "learning_rate": 2.2109328001343898e-05, "loss": 2.2139204025268553, "memory(GiB)": 77.56, "step": 80335, "token_acc": 0.5340909090909091, "train_speed(iter/s)": 1.437293 }, { "epoch": 3.4420119103723064, "grad_norm": 5.228277206420898, "learning_rate": 2.2103742768724322e-05, "loss": 2.1739824295043944, "memory(GiB)": 77.56, "step": 80340, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437295 }, { "epoch": 3.442226125701555, "grad_norm": 7.82539176940918, "learning_rate": 2.2098158041471617e-05, "loss": 2.5690967559814455, "memory(GiB)": 77.56, "step": 80345, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.437308 }, { "epoch": 3.442440341030804, "grad_norm": 8.47783374786377, "learning_rate": 2.2092573819686945e-05, "loss": 2.5492820739746094, "memory(GiB)": 77.56, "step": 80350, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.437309 }, { "epoch": 3.4426545563600532, "grad_norm": 6.251232147216797, "learning_rate": 2.2086990103471468e-05, "loss": 2.719070816040039, "memory(GiB)": 77.56, "step": 80355, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.43734 }, { "epoch": 3.442868771689302, "grad_norm": 6.333085536956787, "learning_rate": 2.2081406892926366e-05, "loss": 2.5249135971069334, "memory(GiB)": 77.56, "step": 80360, "token_acc": 0.4709897610921502, "train_speed(iter/s)": 1.437364 }, { "epoch": 3.443082987018551, "grad_norm": 4.731055736541748, "learning_rate": 2.2075824188152762e-05, "loss": 2.313634490966797, "memory(GiB)": 77.56, "step": 80365, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.437368 }, { "epoch": 3.4432972023478, "grad_norm": 6.09040641784668, "learning_rate": 2.2070241989251805e-05, "loss": 2.452932929992676, "memory(GiB)": 77.56, "step": 80370, "token_acc": 0.49221183800623053, "train_speed(iter/s)": 1.437355 }, { "epoch": 3.443511417677049, "grad_norm": 5.076159477233887, "learning_rate": 2.2064660296324607e-05, "loss": 2.124247169494629, "memory(GiB)": 77.56, "step": 80375, "token_acc": 0.5444444444444444, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.4437256330062977, "grad_norm": 7.787264823913574, "learning_rate": 2.205907910947228e-05, "loss": 2.140587043762207, "memory(GiB)": 77.56, "step": 80380, "token_acc": 0.5412186379928315, "train_speed(iter/s)": 1.437382 }, { "epoch": 3.443939848335547, "grad_norm": 4.084612846374512, "learning_rate": 2.2053498428795965e-05, "loss": 2.4007919311523436, "memory(GiB)": 77.56, "step": 80385, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.437394 }, { "epoch": 3.444154063664796, "grad_norm": 6.645567893981934, "learning_rate": 2.2047918254396722e-05, "loss": 2.06510066986084, "memory(GiB)": 77.56, "step": 80390, "token_acc": 0.5309090909090909, "train_speed(iter/s)": 1.437396 }, { "epoch": 3.4443682789940446, "grad_norm": 4.380014896392822, "learning_rate": 2.2042338586375676e-05, "loss": 2.4250310897827148, "memory(GiB)": 77.56, "step": 80395, "token_acc": 0.49173553719008267, "train_speed(iter/s)": 1.437402 }, { "epoch": 3.444582494323294, "grad_norm": 7.077320575714111, "learning_rate": 2.2036759424833898e-05, "loss": 2.5221746444702147, "memory(GiB)": 77.56, "step": 80400, "token_acc": 0.4277286135693215, "train_speed(iter/s)": 1.437417 }, { "epoch": 3.4447967096525427, "grad_norm": 4.8981194496154785, "learning_rate": 2.2031180769872456e-05, "loss": 2.5382272720336916, "memory(GiB)": 77.56, "step": 80405, "token_acc": 0.527027027027027, "train_speed(iter/s)": 1.437427 }, { "epoch": 3.4450109249817915, "grad_norm": 4.959420680999756, "learning_rate": 2.2025602621592404e-05, "loss": 2.1550403594970704, "memory(GiB)": 77.56, "step": 80410, "token_acc": 0.521978021978022, "train_speed(iter/s)": 1.437436 }, { "epoch": 3.4452251403110408, "grad_norm": 6.577432155609131, "learning_rate": 2.2020024980094788e-05, "loss": 2.2065486907958984, "memory(GiB)": 77.56, "step": 80415, "token_acc": 0.5034722222222222, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.4454393556402896, "grad_norm": 5.82411003112793, "learning_rate": 2.2014447845480684e-05, "loss": 2.485791015625, "memory(GiB)": 77.56, "step": 80420, "token_acc": 0.5, "train_speed(iter/s)": 1.437454 }, { "epoch": 3.4456535709695384, "grad_norm": 6.361300945281982, "learning_rate": 2.2008871217851114e-05, "loss": 2.391726493835449, "memory(GiB)": 77.56, "step": 80425, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.437434 }, { "epoch": 3.4458677862987876, "grad_norm": 5.705111503601074, "learning_rate": 2.2003295097307097e-05, "loss": 2.0951635360717775, "memory(GiB)": 77.56, "step": 80430, "token_acc": 0.5692883895131086, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.4460820016280365, "grad_norm": 6.383053779602051, "learning_rate": 2.1997719483949652e-05, "loss": 2.1492645263671877, "memory(GiB)": 77.56, "step": 80435, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 1.43743 }, { "epoch": 3.4462962169572853, "grad_norm": 9.024420738220215, "learning_rate": 2.199214437787977e-05, "loss": 2.33245735168457, "memory(GiB)": 77.56, "step": 80440, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 1.437444 }, { "epoch": 3.4465104322865345, "grad_norm": 6.605022430419922, "learning_rate": 2.1986569779198486e-05, "loss": 2.446097755432129, "memory(GiB)": 77.56, "step": 80445, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.437459 }, { "epoch": 3.4467246476157833, "grad_norm": 7.3540358543396, "learning_rate": 2.198099568800676e-05, "loss": 2.448853874206543, "memory(GiB)": 77.56, "step": 80450, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.437447 }, { "epoch": 3.446938862945032, "grad_norm": 6.087551116943359, "learning_rate": 2.1975422104405576e-05, "loss": 2.3472049713134764, "memory(GiB)": 77.56, "step": 80455, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.437424 }, { "epoch": 3.4471530782742814, "grad_norm": 5.073938846588135, "learning_rate": 2.1969849028495915e-05, "loss": 1.8209077835083007, "memory(GiB)": 77.56, "step": 80460, "token_acc": 0.5862068965517241, "train_speed(iter/s)": 1.437427 }, { "epoch": 3.4473672936035302, "grad_norm": 4.9377031326293945, "learning_rate": 2.196427646037874e-05, "loss": 2.3113643646240236, "memory(GiB)": 77.56, "step": 80465, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437429 }, { "epoch": 3.447581508932779, "grad_norm": 6.210414886474609, "learning_rate": 2.1958704400154995e-05, "loss": 2.5444915771484373, "memory(GiB)": 77.56, "step": 80470, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.4477957242620283, "grad_norm": 4.72358512878418, "learning_rate": 2.195313284792561e-05, "loss": 2.432740020751953, "memory(GiB)": 77.56, "step": 80475, "token_acc": 0.4744318181818182, "train_speed(iter/s)": 1.437446 }, { "epoch": 3.448009939591277, "grad_norm": 6.679464817047119, "learning_rate": 2.194756180379155e-05, "loss": 2.223580741882324, "memory(GiB)": 77.56, "step": 80480, "token_acc": 0.5143769968051118, "train_speed(iter/s)": 1.437472 }, { "epoch": 3.448224154920526, "grad_norm": 6.284995079040527, "learning_rate": 2.1941991267853722e-05, "loss": 2.3431787490844727, "memory(GiB)": 77.56, "step": 80485, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.437477 }, { "epoch": 3.448438370249775, "grad_norm": 5.769493103027344, "learning_rate": 2.193642124021304e-05, "loss": 2.043992042541504, "memory(GiB)": 77.56, "step": 80490, "token_acc": 0.5387596899224806, "train_speed(iter/s)": 1.43748 }, { "epoch": 3.448652585579024, "grad_norm": 8.8023042678833, "learning_rate": 2.193085172097042e-05, "loss": 2.373923683166504, "memory(GiB)": 77.56, "step": 80495, "token_acc": 0.45980707395498394, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.448866800908273, "grad_norm": 4.613881587982178, "learning_rate": 2.192528271022673e-05, "loss": 2.3887168884277346, "memory(GiB)": 77.56, "step": 80500, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.437491 }, { "epoch": 3.448866800908273, "eval_loss": 2.26839542388916, "eval_runtime": 14.1365, "eval_samples_per_second": 7.074, "eval_steps_per_second": 7.074, "eval_token_acc": 0.47979139504563234, "step": 80500 }, { "epoch": 3.449081016237522, "grad_norm": 5.859959602355957, "learning_rate": 2.1919714208082898e-05, "loss": 2.2975589752197267, "memory(GiB)": 77.56, "step": 80505, "token_acc": 0.4976303317535545, "train_speed(iter/s)": 1.437116 }, { "epoch": 3.449295231566771, "grad_norm": 5.504870414733887, "learning_rate": 2.1914146214639782e-05, "loss": 2.1874622344970702, "memory(GiB)": 77.56, "step": 80510, "token_acc": 0.540084388185654, "train_speed(iter/s)": 1.437122 }, { "epoch": 3.4495094468960197, "grad_norm": 4.164382457733154, "learning_rate": 2.1908578729998258e-05, "loss": 2.2466367721557616, "memory(GiB)": 77.56, "step": 80515, "token_acc": 0.5, "train_speed(iter/s)": 1.437137 }, { "epoch": 3.449723662225269, "grad_norm": 4.9224114418029785, "learning_rate": 2.1903011754259177e-05, "loss": 2.3191972732543946, "memory(GiB)": 77.56, "step": 80520, "token_acc": 0.47038327526132406, "train_speed(iter/s)": 1.437121 }, { "epoch": 3.4499378775545178, "grad_norm": 9.172769546508789, "learning_rate": 2.1897445287523384e-05, "loss": 2.249551773071289, "memory(GiB)": 77.56, "step": 80525, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.437142 }, { "epoch": 3.4501520928837666, "grad_norm": 6.565616607666016, "learning_rate": 2.1891879329891746e-05, "loss": 2.2075742721557616, "memory(GiB)": 77.56, "step": 80530, "token_acc": 0.5387453874538746, "train_speed(iter/s)": 1.437177 }, { "epoch": 3.450366308213016, "grad_norm": 6.820053577423096, "learning_rate": 2.1886313881465063e-05, "loss": 2.319544792175293, "memory(GiB)": 77.56, "step": 80535, "token_acc": 0.5274725274725275, "train_speed(iter/s)": 1.43718 }, { "epoch": 3.4505805235422646, "grad_norm": 7.024238109588623, "learning_rate": 2.1880748942344198e-05, "loss": 2.1867679595947265, "memory(GiB)": 77.56, "step": 80540, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.437191 }, { "epoch": 3.4507947388715134, "grad_norm": 6.06240177154541, "learning_rate": 2.1875184512629943e-05, "loss": 2.3003362655639648, "memory(GiB)": 77.56, "step": 80545, "token_acc": 0.5034965034965035, "train_speed(iter/s)": 1.437214 }, { "epoch": 3.4510089542007627, "grad_norm": 5.127089023590088, "learning_rate": 2.18696205924231e-05, "loss": 2.335889434814453, "memory(GiB)": 77.56, "step": 80550, "token_acc": 0.5154320987654321, "train_speed(iter/s)": 1.437234 }, { "epoch": 3.4512231695300115, "grad_norm": 5.436455249786377, "learning_rate": 2.186405718182447e-05, "loss": 2.5750221252441405, "memory(GiB)": 77.56, "step": 80555, "token_acc": 0.4600760456273764, "train_speed(iter/s)": 1.437237 }, { "epoch": 3.4514373848592603, "grad_norm": 5.467102527618408, "learning_rate": 2.1858494280934826e-05, "loss": 2.3708337783813476, "memory(GiB)": 77.56, "step": 80560, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.43724 }, { "epoch": 3.4516516001885096, "grad_norm": 6.265877723693848, "learning_rate": 2.1852931889854966e-05, "loss": 2.397037124633789, "memory(GiB)": 77.56, "step": 80565, "token_acc": 0.49809885931558934, "train_speed(iter/s)": 1.437243 }, { "epoch": 3.4518658155177584, "grad_norm": 6.674444198608398, "learning_rate": 2.1847370008685657e-05, "loss": 2.2196815490722654, "memory(GiB)": 77.56, "step": 80570, "token_acc": 0.519434628975265, "train_speed(iter/s)": 1.437253 }, { "epoch": 3.452080030847007, "grad_norm": 5.629004955291748, "learning_rate": 2.1841808637527645e-05, "loss": 2.157611083984375, "memory(GiB)": 77.56, "step": 80575, "token_acc": 0.5149501661129569, "train_speed(iter/s)": 1.437258 }, { "epoch": 3.4522942461762565, "grad_norm": 6.988944053649902, "learning_rate": 2.1836247776481684e-05, "loss": 2.4590389251708986, "memory(GiB)": 77.56, "step": 80580, "token_acc": 0.4573170731707317, "train_speed(iter/s)": 1.437261 }, { "epoch": 3.4525084615055053, "grad_norm": 5.524704456329346, "learning_rate": 2.1830687425648495e-05, "loss": 2.4149913787841797, "memory(GiB)": 77.56, "step": 80585, "token_acc": 0.5404411764705882, "train_speed(iter/s)": 1.437279 }, { "epoch": 3.452722676834754, "grad_norm": 4.795392036437988, "learning_rate": 2.182512758512885e-05, "loss": 2.159110450744629, "memory(GiB)": 77.56, "step": 80590, "token_acc": 0.5636363636363636, "train_speed(iter/s)": 1.437286 }, { "epoch": 3.4529368921640033, "grad_norm": 6.560310363769531, "learning_rate": 2.181956825502345e-05, "loss": 2.130175018310547, "memory(GiB)": 77.56, "step": 80595, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.437284 }, { "epoch": 3.453151107493252, "grad_norm": 5.127025127410889, "learning_rate": 2.181400943543298e-05, "loss": 2.2630002975463865, "memory(GiB)": 77.56, "step": 80600, "token_acc": 0.5249169435215947, "train_speed(iter/s)": 1.437304 }, { "epoch": 3.453365322822501, "grad_norm": 5.843238353729248, "learning_rate": 2.1808451126458196e-05, "loss": 2.145084571838379, "memory(GiB)": 77.56, "step": 80605, "token_acc": 0.5318352059925093, "train_speed(iter/s)": 1.437312 }, { "epoch": 3.4535795381517502, "grad_norm": 5.5879621505737305, "learning_rate": 2.1802893328199763e-05, "loss": 2.4597858428955077, "memory(GiB)": 77.56, "step": 80610, "token_acc": 0.45422535211267606, "train_speed(iter/s)": 1.437348 }, { "epoch": 3.453793753480999, "grad_norm": 8.249075889587402, "learning_rate": 2.1797336040758365e-05, "loss": 2.7451202392578127, "memory(GiB)": 77.56, "step": 80615, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.437363 }, { "epoch": 3.454007968810248, "grad_norm": 5.4733428955078125, "learning_rate": 2.179177926423467e-05, "loss": 2.4360809326171875, "memory(GiB)": 77.56, "step": 80620, "token_acc": 0.4672897196261682, "train_speed(iter/s)": 1.437342 }, { "epoch": 3.454222184139497, "grad_norm": 6.793597221374512, "learning_rate": 2.1786222998729367e-05, "loss": 2.3753881454467773, "memory(GiB)": 77.56, "step": 80625, "token_acc": 0.5123456790123457, "train_speed(iter/s)": 1.437348 }, { "epoch": 3.454436399468746, "grad_norm": 5.418949604034424, "learning_rate": 2.1780667244343106e-05, "loss": 2.4566730499267577, "memory(GiB)": 77.56, "step": 80630, "token_acc": 0.4981684981684982, "train_speed(iter/s)": 1.437353 }, { "epoch": 3.4546506147979947, "grad_norm": 4.494617462158203, "learning_rate": 2.1775112001176522e-05, "loss": 1.8612266540527345, "memory(GiB)": 77.56, "step": 80635, "token_acc": 0.5905797101449275, "train_speed(iter/s)": 1.437357 }, { "epoch": 3.454864830127244, "grad_norm": 6.100369453430176, "learning_rate": 2.176955726933027e-05, "loss": 2.3583000183105467, "memory(GiB)": 77.56, "step": 80640, "token_acc": 0.490625, "train_speed(iter/s)": 1.437366 }, { "epoch": 3.455079045456493, "grad_norm": 6.376121997833252, "learning_rate": 2.1764003048904947e-05, "loss": 2.413394546508789, "memory(GiB)": 77.56, "step": 80645, "token_acc": 0.47, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.4552932607857416, "grad_norm": 5.128441333770752, "learning_rate": 2.1758449340001213e-05, "loss": 2.035100746154785, "memory(GiB)": 77.56, "step": 80650, "token_acc": 0.5546875, "train_speed(iter/s)": 1.437355 }, { "epoch": 3.455507476114991, "grad_norm": 13.508808135986328, "learning_rate": 2.175289614271967e-05, "loss": 2.2530609130859376, "memory(GiB)": 77.56, "step": 80655, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.437349 }, { "epoch": 3.4557216914442397, "grad_norm": 5.964972019195557, "learning_rate": 2.1747343457160902e-05, "loss": 2.080478858947754, "memory(GiB)": 77.56, "step": 80660, "token_acc": 0.5409252669039146, "train_speed(iter/s)": 1.437373 }, { "epoch": 3.4559359067734885, "grad_norm": 4.962120056152344, "learning_rate": 2.1741791283425516e-05, "loss": 2.4243751525878907, "memory(GiB)": 77.56, "step": 80665, "token_acc": 0.48905109489051096, "train_speed(iter/s)": 1.437362 }, { "epoch": 3.4561501221027378, "grad_norm": 8.512435913085938, "learning_rate": 2.173623962161407e-05, "loss": 2.403050994873047, "memory(GiB)": 77.56, "step": 80670, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.437386 }, { "epoch": 3.4563643374319866, "grad_norm": 6.178690433502197, "learning_rate": 2.1730688471827176e-05, "loss": 2.334092903137207, "memory(GiB)": 77.56, "step": 80675, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.437367 }, { "epoch": 3.4565785527612354, "grad_norm": 7.984079360961914, "learning_rate": 2.1725137834165356e-05, "loss": 2.353314971923828, "memory(GiB)": 77.56, "step": 80680, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.437389 }, { "epoch": 3.4567927680904846, "grad_norm": 5.329522132873535, "learning_rate": 2.171958770872921e-05, "loss": 2.085029411315918, "memory(GiB)": 77.56, "step": 80685, "token_acc": 0.5220883534136547, "train_speed(iter/s)": 1.437394 }, { "epoch": 3.4570069834197334, "grad_norm": 6.020061016082764, "learning_rate": 2.171403809561926e-05, "loss": 2.447394371032715, "memory(GiB)": 77.56, "step": 80690, "token_acc": 0.4823529411764706, "train_speed(iter/s)": 1.437379 }, { "epoch": 3.4572211987489823, "grad_norm": 7.538059711456299, "learning_rate": 2.1708488994936048e-05, "loss": 2.4250808715820313, "memory(GiB)": 77.56, "step": 80695, "token_acc": 0.5269230769230769, "train_speed(iter/s)": 1.437392 }, { "epoch": 3.4574354140782315, "grad_norm": 5.586945533752441, "learning_rate": 2.1702940406780097e-05, "loss": 2.337316703796387, "memory(GiB)": 77.56, "step": 80700, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.437385 }, { "epoch": 3.4576496294074803, "grad_norm": 9.51451301574707, "learning_rate": 2.1697392331251904e-05, "loss": 2.32459716796875, "memory(GiB)": 77.56, "step": 80705, "token_acc": 0.5461847389558233, "train_speed(iter/s)": 1.437389 }, { "epoch": 3.457863844736729, "grad_norm": 6.780073165893555, "learning_rate": 2.169184476845202e-05, "loss": 2.3071727752685547, "memory(GiB)": 77.56, "step": 80710, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.437392 }, { "epoch": 3.4580780600659784, "grad_norm": 5.097660541534424, "learning_rate": 2.168629771848092e-05, "loss": 2.2894062042236327, "memory(GiB)": 77.56, "step": 80715, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.437406 }, { "epoch": 3.458292275395227, "grad_norm": 5.0699782371521, "learning_rate": 2.1680751181439096e-05, "loss": 2.4819961547851563, "memory(GiB)": 77.56, "step": 80720, "token_acc": 0.45867768595041325, "train_speed(iter/s)": 1.437416 }, { "epoch": 3.458506490724476, "grad_norm": 6.098877906799316, "learning_rate": 2.1675205157427035e-05, "loss": 2.170017433166504, "memory(GiB)": 77.56, "step": 80725, "token_acc": 0.534965034965035, "train_speed(iter/s)": 1.437401 }, { "epoch": 3.4587207060537253, "grad_norm": 5.183691024780273, "learning_rate": 2.1669659646545183e-05, "loss": 2.038774871826172, "memory(GiB)": 77.56, "step": 80730, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.437419 }, { "epoch": 3.458934921382974, "grad_norm": 6.527834415435791, "learning_rate": 2.166411464889404e-05, "loss": 2.4816041946411134, "memory(GiB)": 77.56, "step": 80735, "token_acc": 0.4753521126760563, "train_speed(iter/s)": 1.437424 }, { "epoch": 3.459149136712223, "grad_norm": 5.282503604888916, "learning_rate": 2.1658570164574044e-05, "loss": 2.385080337524414, "memory(GiB)": 77.56, "step": 80740, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.437423 }, { "epoch": 3.459363352041472, "grad_norm": 5.43533182144165, "learning_rate": 2.1653026193685632e-05, "loss": 2.1967086791992188, "memory(GiB)": 77.56, "step": 80745, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.437436 }, { "epoch": 3.459577567370721, "grad_norm": 5.7788286209106445, "learning_rate": 2.1647482736329227e-05, "loss": 2.2340927124023438, "memory(GiB)": 77.56, "step": 80750, "token_acc": 0.5378486055776892, "train_speed(iter/s)": 1.437453 }, { "epoch": 3.45979178269997, "grad_norm": 8.328045845031738, "learning_rate": 2.1641939792605287e-05, "loss": 2.5865297317504883, "memory(GiB)": 77.56, "step": 80755, "token_acc": 0.4343065693430657, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.460005998029219, "grad_norm": 6.668324947357178, "learning_rate": 2.163639736261421e-05, "loss": 2.241141128540039, "memory(GiB)": 77.56, "step": 80760, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.437423 }, { "epoch": 3.460220213358468, "grad_norm": 6.232999324798584, "learning_rate": 2.1630855446456382e-05, "loss": 2.140214538574219, "memory(GiB)": 77.56, "step": 80765, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.437428 }, { "epoch": 3.4604344286877167, "grad_norm": 8.097764015197754, "learning_rate": 2.1625314044232236e-05, "loss": 1.9963481903076172, "memory(GiB)": 77.56, "step": 80770, "token_acc": 0.5443548387096774, "train_speed(iter/s)": 1.437433 }, { "epoch": 3.460648644016966, "grad_norm": 4.857347011566162, "learning_rate": 2.161977315604215e-05, "loss": 2.3844308853149414, "memory(GiB)": 77.56, "step": 80775, "token_acc": 0.4944649446494465, "train_speed(iter/s)": 1.437443 }, { "epoch": 3.4608628593462147, "grad_norm": 6.190577983856201, "learning_rate": 2.1614232781986493e-05, "loss": 2.4740447998046875, "memory(GiB)": 77.56, "step": 80780, "token_acc": 0.47019867549668876, "train_speed(iter/s)": 1.437441 }, { "epoch": 3.4610770746754635, "grad_norm": 4.87861967086792, "learning_rate": 2.1608692922165634e-05, "loss": 2.3485822677612305, "memory(GiB)": 77.56, "step": 80785, "token_acc": 0.5110410094637224, "train_speed(iter/s)": 1.437419 }, { "epoch": 3.461291290004713, "grad_norm": 8.18832015991211, "learning_rate": 2.1603153576679917e-05, "loss": 2.623311996459961, "memory(GiB)": 77.56, "step": 80790, "token_acc": 0.48299319727891155, "train_speed(iter/s)": 1.43742 }, { "epoch": 3.4615055053339616, "grad_norm": 6.9606032371521, "learning_rate": 2.1597614745629734e-05, "loss": 2.4861278533935547, "memory(GiB)": 77.56, "step": 80795, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437429 }, { "epoch": 3.4617197206632104, "grad_norm": 5.269224166870117, "learning_rate": 2.1592076429115406e-05, "loss": 2.347925567626953, "memory(GiB)": 77.56, "step": 80800, "token_acc": 0.5027777777777778, "train_speed(iter/s)": 1.43745 }, { "epoch": 3.4619339359924597, "grad_norm": 4.54539680480957, "learning_rate": 2.1586538627237257e-05, "loss": 2.302774429321289, "memory(GiB)": 77.56, "step": 80805, "token_acc": 0.4773413897280967, "train_speed(iter/s)": 1.437437 }, { "epoch": 3.4621481513217085, "grad_norm": 5.531946182250977, "learning_rate": 2.158100134009562e-05, "loss": 2.526936340332031, "memory(GiB)": 77.56, "step": 80810, "token_acc": 0.4824561403508772, "train_speed(iter/s)": 1.437434 }, { "epoch": 3.4623623666509573, "grad_norm": 7.029908180236816, "learning_rate": 2.1575464567790778e-05, "loss": 2.8980453491210936, "memory(GiB)": 77.56, "step": 80815, "token_acc": 0.4562043795620438, "train_speed(iter/s)": 1.43746 }, { "epoch": 3.4625765819802066, "grad_norm": 6.715392589569092, "learning_rate": 2.1569928310423082e-05, "loss": 2.575844383239746, "memory(GiB)": 77.56, "step": 80820, "token_acc": 0.47416413373860183, "train_speed(iter/s)": 1.437477 }, { "epoch": 3.4627907973094554, "grad_norm": 5.619195461273193, "learning_rate": 2.1564392568092785e-05, "loss": 2.3543155670166014, "memory(GiB)": 77.56, "step": 80825, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 1.437466 }, { "epoch": 3.463005012638704, "grad_norm": 6.5175628662109375, "learning_rate": 2.1558857340900207e-05, "loss": 2.494320297241211, "memory(GiB)": 77.56, "step": 80830, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.4632192279679535, "grad_norm": 5.518880844116211, "learning_rate": 2.1553322628945615e-05, "loss": 2.514445686340332, "memory(GiB)": 77.56, "step": 80835, "token_acc": 0.4902597402597403, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.4634334432972023, "grad_norm": 5.718807220458984, "learning_rate": 2.1547788432329263e-05, "loss": 2.5359895706176756, "memory(GiB)": 77.56, "step": 80840, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.437494 }, { "epoch": 3.463647658626451, "grad_norm": 5.379241943359375, "learning_rate": 2.1542254751151412e-05, "loss": 2.7938880920410156, "memory(GiB)": 77.56, "step": 80845, "token_acc": 0.4681528662420382, "train_speed(iter/s)": 1.437499 }, { "epoch": 3.4638618739557003, "grad_norm": 5.155433177947998, "learning_rate": 2.1536721585512298e-05, "loss": 2.273894119262695, "memory(GiB)": 77.56, "step": 80850, "token_acc": 0.5241157556270096, "train_speed(iter/s)": 1.437509 }, { "epoch": 3.464076089284949, "grad_norm": 6.263373374938965, "learning_rate": 2.153118893551219e-05, "loss": 2.0479362487792967, "memory(GiB)": 77.56, "step": 80855, "token_acc": 0.583916083916084, "train_speed(iter/s)": 1.437508 }, { "epoch": 3.464290304614198, "grad_norm": 4.4990363121032715, "learning_rate": 2.1525656801251304e-05, "loss": 2.322727584838867, "memory(GiB)": 77.56, "step": 80860, "token_acc": 0.49034749034749037, "train_speed(iter/s)": 1.437504 }, { "epoch": 3.464504519943447, "grad_norm": 5.9686598777771, "learning_rate": 2.1520125182829852e-05, "loss": 2.562971496582031, "memory(GiB)": 77.56, "step": 80865, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.437499 }, { "epoch": 3.464718735272696, "grad_norm": 5.9565205574035645, "learning_rate": 2.151459408034805e-05, "loss": 2.3215253829956053, "memory(GiB)": 77.56, "step": 80870, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.437475 }, { "epoch": 3.464932950601945, "grad_norm": 5.3802947998046875, "learning_rate": 2.150906349390608e-05, "loss": 2.3780303955078126, "memory(GiB)": 77.56, "step": 80875, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.437467 }, { "epoch": 3.465147165931194, "grad_norm": 5.702744960784912, "learning_rate": 2.1503533423604167e-05, "loss": 2.1353302001953125, "memory(GiB)": 77.56, "step": 80880, "token_acc": 0.5698529411764706, "train_speed(iter/s)": 1.437471 }, { "epoch": 3.465361381260443, "grad_norm": 6.480691909790039, "learning_rate": 2.1498003869542483e-05, "loss": 2.376254844665527, "memory(GiB)": 77.56, "step": 80885, "token_acc": 0.46794871794871795, "train_speed(iter/s)": 1.437504 }, { "epoch": 3.4655755965896917, "grad_norm": 7.769824028015137, "learning_rate": 2.149247483182119e-05, "loss": 2.088960647583008, "memory(GiB)": 77.56, "step": 80890, "token_acc": 0.5125, "train_speed(iter/s)": 1.437509 }, { "epoch": 3.465789811918941, "grad_norm": 5.404630184173584, "learning_rate": 2.148694631054046e-05, "loss": 2.281485366821289, "memory(GiB)": 77.56, "step": 80895, "token_acc": 0.5180327868852459, "train_speed(iter/s)": 1.43751 }, { "epoch": 3.46600402724819, "grad_norm": 5.724725723266602, "learning_rate": 2.1481418305800425e-05, "loss": 2.1865924835205077, "memory(GiB)": 77.56, "step": 80900, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.437511 }, { "epoch": 3.4662182425774386, "grad_norm": 6.743160724639893, "learning_rate": 2.1475890817701272e-05, "loss": 2.6283044815063477, "memory(GiB)": 77.56, "step": 80905, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.437541 }, { "epoch": 3.466432457906688, "grad_norm": 7.072866916656494, "learning_rate": 2.1470363846343096e-05, "loss": 2.362382507324219, "memory(GiB)": 77.56, "step": 80910, "token_acc": 0.496875, "train_speed(iter/s)": 1.437551 }, { "epoch": 3.4666466732359367, "grad_norm": 6.42167854309082, "learning_rate": 2.146483739182606e-05, "loss": 2.4504037857055665, "memory(GiB)": 77.56, "step": 80915, "token_acc": 0.47733333333333333, "train_speed(iter/s)": 1.437545 }, { "epoch": 3.4668608885651855, "grad_norm": 6.257534980773926, "learning_rate": 2.1459311454250258e-05, "loss": 2.543269157409668, "memory(GiB)": 77.56, "step": 80920, "token_acc": 0.44868035190615835, "train_speed(iter/s)": 1.437543 }, { "epoch": 3.4670751038944347, "grad_norm": 5.240074634552002, "learning_rate": 2.1453786033715807e-05, "loss": 2.296847343444824, "memory(GiB)": 77.56, "step": 80925, "token_acc": 0.5, "train_speed(iter/s)": 1.437565 }, { "epoch": 3.4672893192236836, "grad_norm": 6.4981770515441895, "learning_rate": 2.14482611303228e-05, "loss": 2.4388652801513673, "memory(GiB)": 77.56, "step": 80930, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.437574 }, { "epoch": 3.4675035345529324, "grad_norm": 6.463721752166748, "learning_rate": 2.1442736744171305e-05, "loss": 2.4519615173339844, "memory(GiB)": 77.56, "step": 80935, "token_acc": 0.475, "train_speed(iter/s)": 1.437559 }, { "epoch": 3.4677177498821816, "grad_norm": 5.944515705108643, "learning_rate": 2.1437212875361444e-05, "loss": 2.3641876220703124, "memory(GiB)": 77.56, "step": 80940, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.437536 }, { "epoch": 3.4679319652114304, "grad_norm": 5.1539740562438965, "learning_rate": 2.1431689523993263e-05, "loss": 2.2079532623291014, "memory(GiB)": 77.56, "step": 80945, "token_acc": 0.5574912891986062, "train_speed(iter/s)": 1.437527 }, { "epoch": 3.4681461805406792, "grad_norm": 6.763242721557617, "learning_rate": 2.142616669016683e-05, "loss": 2.4610616683959963, "memory(GiB)": 77.56, "step": 80950, "token_acc": 0.4281437125748503, "train_speed(iter/s)": 1.437536 }, { "epoch": 3.4683603958699285, "grad_norm": 8.104201316833496, "learning_rate": 2.142064437398219e-05, "loss": 2.17742919921875, "memory(GiB)": 77.56, "step": 80955, "token_acc": 0.5119453924914675, "train_speed(iter/s)": 1.437524 }, { "epoch": 3.4685746111991773, "grad_norm": 5.250313758850098, "learning_rate": 2.141512257553937e-05, "loss": 2.2010494232177735, "memory(GiB)": 77.56, "step": 80960, "token_acc": 0.503125, "train_speed(iter/s)": 1.437526 }, { "epoch": 3.468788826528426, "grad_norm": 6.576564311981201, "learning_rate": 2.1409601294938436e-05, "loss": 2.3733232498168944, "memory(GiB)": 77.56, "step": 80965, "token_acc": 0.4968152866242038, "train_speed(iter/s)": 1.437507 }, { "epoch": 3.4690030418576754, "grad_norm": 6.809447765350342, "learning_rate": 2.1404080532279376e-05, "loss": 2.2804502487182616, "memory(GiB)": 77.56, "step": 80970, "token_acc": 0.5016501650165016, "train_speed(iter/s)": 1.437497 }, { "epoch": 3.469217257186924, "grad_norm": 6.49641752243042, "learning_rate": 2.139856028766224e-05, "loss": 2.490182113647461, "memory(GiB)": 77.56, "step": 80975, "token_acc": 0.5101214574898786, "train_speed(iter/s)": 1.437516 }, { "epoch": 3.469431472516173, "grad_norm": 7.083703994750977, "learning_rate": 2.139304056118701e-05, "loss": 2.632050323486328, "memory(GiB)": 77.56, "step": 80980, "token_acc": 0.4674922600619195, "train_speed(iter/s)": 1.437512 }, { "epoch": 3.4696456878454223, "grad_norm": 4.87672233581543, "learning_rate": 2.1387521352953695e-05, "loss": 2.320743751525879, "memory(GiB)": 77.56, "step": 80985, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.437509 }, { "epoch": 3.469859903174671, "grad_norm": 7.414366245269775, "learning_rate": 2.138200266306227e-05, "loss": 2.4225080490112303, "memory(GiB)": 77.56, "step": 80990, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.437521 }, { "epoch": 3.47007411850392, "grad_norm": 9.614435195922852, "learning_rate": 2.1376484491612685e-05, "loss": 2.3034950256347657, "memory(GiB)": 77.56, "step": 80995, "token_acc": 0.54, "train_speed(iter/s)": 1.437521 }, { "epoch": 3.470288333833169, "grad_norm": 5.982474327087402, "learning_rate": 2.1370966838704963e-05, "loss": 2.5725757598876955, "memory(GiB)": 77.56, "step": 81000, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.437498 }, { "epoch": 3.470288333833169, "eval_loss": 2.343449115753174, "eval_runtime": 14.2529, "eval_samples_per_second": 7.016, "eval_steps_per_second": 7.016, "eval_token_acc": 0.4537914691943128, "step": 81000 }, { "epoch": 3.470502549162418, "grad_norm": 6.501448154449463, "learning_rate": 2.1365449704439028e-05, "loss": 2.7911149978637697, "memory(GiB)": 77.56, "step": 81005, "token_acc": 0.44601769911504424, "train_speed(iter/s)": 1.437104 }, { "epoch": 3.470716764491667, "grad_norm": 6.5665483474731445, "learning_rate": 2.1359933088914836e-05, "loss": 2.267047119140625, "memory(GiB)": 77.56, "step": 81010, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.437125 }, { "epoch": 3.470930979820916, "grad_norm": 5.170248031616211, "learning_rate": 2.1354416992232318e-05, "loss": 2.2988662719726562, "memory(GiB)": 77.56, "step": 81015, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.437142 }, { "epoch": 3.471145195150165, "grad_norm": 4.95196533203125, "learning_rate": 2.1348901414491396e-05, "loss": 2.2833063125610353, "memory(GiB)": 77.56, "step": 81020, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.437166 }, { "epoch": 3.471359410479414, "grad_norm": 5.3953070640563965, "learning_rate": 2.1343386355792017e-05, "loss": 2.1579689025878905, "memory(GiB)": 77.56, "step": 81025, "token_acc": 0.5494880546075085, "train_speed(iter/s)": 1.437163 }, { "epoch": 3.471573625808663, "grad_norm": 4.396566390991211, "learning_rate": 2.1337871816234072e-05, "loss": 2.290094757080078, "memory(GiB)": 77.56, "step": 81030, "token_acc": 0.49337748344370863, "train_speed(iter/s)": 1.437161 }, { "epoch": 3.4717878411379117, "grad_norm": 6.068824768066406, "learning_rate": 2.1332357795917475e-05, "loss": 2.367625427246094, "memory(GiB)": 77.56, "step": 81035, "token_acc": 0.4921875, "train_speed(iter/s)": 1.437173 }, { "epoch": 3.472002056467161, "grad_norm": 5.833049774169922, "learning_rate": 2.13268442949421e-05, "loss": 2.22940616607666, "memory(GiB)": 77.56, "step": 81040, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.437193 }, { "epoch": 3.47221627179641, "grad_norm": 8.971484184265137, "learning_rate": 2.132133131340782e-05, "loss": 2.377848243713379, "memory(GiB)": 77.56, "step": 81045, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.437207 }, { "epoch": 3.4724304871256586, "grad_norm": 5.634929180145264, "learning_rate": 2.1315818851414553e-05, "loss": 2.1934200286865235, "memory(GiB)": 77.56, "step": 81050, "token_acc": 0.5266457680250783, "train_speed(iter/s)": 1.437196 }, { "epoch": 3.472644702454908, "grad_norm": 5.89577579498291, "learning_rate": 2.131030690906211e-05, "loss": 2.758171272277832, "memory(GiB)": 77.56, "step": 81055, "token_acc": 0.4456140350877193, "train_speed(iter/s)": 1.437208 }, { "epoch": 3.4728589177841567, "grad_norm": 6.55145788192749, "learning_rate": 2.1304795486450398e-05, "loss": 2.172046661376953, "memory(GiB)": 77.56, "step": 81060, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.437207 }, { "epoch": 3.4730731331134055, "grad_norm": 9.11303424835205, "learning_rate": 2.1299284583679224e-05, "loss": 2.420275115966797, "memory(GiB)": 77.56, "step": 81065, "token_acc": 0.47468354430379744, "train_speed(iter/s)": 1.43723 }, { "epoch": 3.4732873484426547, "grad_norm": 5.454561710357666, "learning_rate": 2.1293774200848437e-05, "loss": 2.279291343688965, "memory(GiB)": 77.56, "step": 81070, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.437238 }, { "epoch": 3.4735015637719036, "grad_norm": 5.810431480407715, "learning_rate": 2.1288264338057855e-05, "loss": 2.373587799072266, "memory(GiB)": 77.56, "step": 81075, "token_acc": 0.48184818481848185, "train_speed(iter/s)": 1.437246 }, { "epoch": 3.4737157791011524, "grad_norm": 5.706579685211182, "learning_rate": 2.1282754995407285e-05, "loss": 2.238748550415039, "memory(GiB)": 77.56, "step": 81080, "token_acc": 0.5390334572490706, "train_speed(iter/s)": 1.437261 }, { "epoch": 3.4739299944304016, "grad_norm": 5.767402172088623, "learning_rate": 2.1277246172996562e-05, "loss": 2.377247619628906, "memory(GiB)": 77.56, "step": 81085, "token_acc": 0.5013774104683195, "train_speed(iter/s)": 1.437258 }, { "epoch": 3.4741442097596504, "grad_norm": 7.721344470977783, "learning_rate": 2.1271737870925473e-05, "loss": 2.5996915817260744, "memory(GiB)": 77.56, "step": 81090, "token_acc": 0.47079037800687284, "train_speed(iter/s)": 1.437277 }, { "epoch": 3.4743584250888992, "grad_norm": 7.026059627532959, "learning_rate": 2.12662300892938e-05, "loss": 2.4789125442504885, "memory(GiB)": 77.56, "step": 81095, "token_acc": 0.449438202247191, "train_speed(iter/s)": 1.437296 }, { "epoch": 3.4745726404181485, "grad_norm": 5.499500751495361, "learning_rate": 2.1260722828201323e-05, "loss": 2.306113433837891, "memory(GiB)": 77.56, "step": 81100, "token_acc": 0.5252918287937743, "train_speed(iter/s)": 1.43729 }, { "epoch": 3.4747868557473973, "grad_norm": 6.0566020011901855, "learning_rate": 2.1255216087747792e-05, "loss": 2.3789741516113283, "memory(GiB)": 77.56, "step": 81105, "token_acc": 0.5205047318611987, "train_speed(iter/s)": 1.437284 }, { "epoch": 3.475001071076646, "grad_norm": 7.280029773712158, "learning_rate": 2.1249709868033007e-05, "loss": 2.2331844329833985, "memory(GiB)": 77.56, "step": 81110, "token_acc": 0.5614035087719298, "train_speed(iter/s)": 1.437308 }, { "epoch": 3.4752152864058954, "grad_norm": 6.601282119750977, "learning_rate": 2.1244204169156694e-05, "loss": 2.7776655197143554, "memory(GiB)": 77.56, "step": 81115, "token_acc": 0.4581818181818182, "train_speed(iter/s)": 1.437305 }, { "epoch": 3.475429501735144, "grad_norm": 5.758901119232178, "learning_rate": 2.123869899121858e-05, "loss": 2.33856143951416, "memory(GiB)": 77.56, "step": 81120, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.437315 }, { "epoch": 3.475643717064393, "grad_norm": 6.502024173736572, "learning_rate": 2.1233194334318435e-05, "loss": 2.5741403579711912, "memory(GiB)": 77.56, "step": 81125, "token_acc": 0.4867924528301887, "train_speed(iter/s)": 1.437318 }, { "epoch": 3.4758579323936423, "grad_norm": 6.306130409240723, "learning_rate": 2.122769019855596e-05, "loss": 2.2737781524658205, "memory(GiB)": 77.56, "step": 81130, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.437309 }, { "epoch": 3.476072147722891, "grad_norm": 5.00747013092041, "learning_rate": 2.1222186584030867e-05, "loss": 2.4421785354614256, "memory(GiB)": 77.56, "step": 81135, "token_acc": 0.48562300319488816, "train_speed(iter/s)": 1.437294 }, { "epoch": 3.47628636305214, "grad_norm": 7.00429630279541, "learning_rate": 2.1216683490842843e-05, "loss": 2.6984304428100585, "memory(GiB)": 77.56, "step": 81140, "token_acc": 0.4779874213836478, "train_speed(iter/s)": 1.437316 }, { "epoch": 3.476500578381389, "grad_norm": 6.758488178253174, "learning_rate": 2.1211180919091615e-05, "loss": 2.3303596496582033, "memory(GiB)": 77.56, "step": 81145, "token_acc": 0.5162241887905604, "train_speed(iter/s)": 1.437335 }, { "epoch": 3.476714793710638, "grad_norm": 6.930009365081787, "learning_rate": 2.120567886887685e-05, "loss": 2.501367378234863, "memory(GiB)": 77.56, "step": 81150, "token_acc": 0.5, "train_speed(iter/s)": 1.437334 }, { "epoch": 3.4769290090398868, "grad_norm": 5.041632175445557, "learning_rate": 2.1200177340298222e-05, "loss": 2.677979278564453, "memory(GiB)": 77.56, "step": 81155, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.437336 }, { "epoch": 3.477143224369136, "grad_norm": 5.47381591796875, "learning_rate": 2.1194676333455404e-05, "loss": 2.7373023986816407, "memory(GiB)": 77.56, "step": 81160, "token_acc": 0.4352517985611511, "train_speed(iter/s)": 1.437328 }, { "epoch": 3.477357439698385, "grad_norm": 4.7734856605529785, "learning_rate": 2.1189175848448023e-05, "loss": 2.2085458755493166, "memory(GiB)": 77.56, "step": 81165, "token_acc": 0.5132450331125827, "train_speed(iter/s)": 1.437323 }, { "epoch": 3.4775716550276337, "grad_norm": 6.531951904296875, "learning_rate": 2.1183675885375764e-05, "loss": 2.4423059463500976, "memory(GiB)": 77.56, "step": 81170, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.437333 }, { "epoch": 3.477785870356883, "grad_norm": 5.582119941711426, "learning_rate": 2.1178176444338255e-05, "loss": 2.783696174621582, "memory(GiB)": 77.56, "step": 81175, "token_acc": 0.4440894568690096, "train_speed(iter/s)": 1.437331 }, { "epoch": 3.4780000856861317, "grad_norm": 4.885003566741943, "learning_rate": 2.1172677525435115e-05, "loss": 2.2024856567382813, "memory(GiB)": 77.56, "step": 81180, "token_acc": 0.5159235668789809, "train_speed(iter/s)": 1.437332 }, { "epoch": 3.4782143010153805, "grad_norm": 8.581168174743652, "learning_rate": 2.1167179128765958e-05, "loss": 2.5739604949951174, "memory(GiB)": 77.56, "step": 81185, "token_acc": 0.45387453874538747, "train_speed(iter/s)": 1.43737 }, { "epoch": 3.47842851634463, "grad_norm": 6.265730381011963, "learning_rate": 2.1161681254430387e-05, "loss": 2.4695104598999023, "memory(GiB)": 77.56, "step": 81190, "token_acc": 0.4773413897280967, "train_speed(iter/s)": 1.437368 }, { "epoch": 3.4786427316738786, "grad_norm": 7.1325764656066895, "learning_rate": 2.1156183902528024e-05, "loss": 2.5873565673828125, "memory(GiB)": 77.56, "step": 81195, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.437386 }, { "epoch": 3.4788569470031274, "grad_norm": 6.447802543640137, "learning_rate": 2.1150687073158437e-05, "loss": 2.517266845703125, "memory(GiB)": 77.56, "step": 81200, "token_acc": 0.4697406340057637, "train_speed(iter/s)": 1.437418 }, { "epoch": 3.4790711623323767, "grad_norm": 4.610739707946777, "learning_rate": 2.114519076642123e-05, "loss": 2.425057601928711, "memory(GiB)": 77.56, "step": 81205, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.437412 }, { "epoch": 3.4792853776616255, "grad_norm": 4.679841995239258, "learning_rate": 2.1139694982415965e-05, "loss": 2.3926328659057616, "memory(GiB)": 77.56, "step": 81210, "token_acc": 0.5198675496688742, "train_speed(iter/s)": 1.437422 }, { "epoch": 3.4794995929908743, "grad_norm": 8.280360221862793, "learning_rate": 2.1134199721242194e-05, "loss": 2.250014877319336, "memory(GiB)": 77.56, "step": 81215, "token_acc": 0.5265017667844523, "train_speed(iter/s)": 1.437429 }, { "epoch": 3.4797138083201236, "grad_norm": 4.4545793533325195, "learning_rate": 2.112870498299947e-05, "loss": 2.3474857330322267, "memory(GiB)": 77.56, "step": 81220, "token_acc": 0.524390243902439, "train_speed(iter/s)": 1.437433 }, { "epoch": 3.4799280236493724, "grad_norm": 6.209092140197754, "learning_rate": 2.112321076778733e-05, "loss": 2.6005908966064455, "memory(GiB)": 77.56, "step": 81225, "token_acc": 0.4625, "train_speed(iter/s)": 1.437455 }, { "epoch": 3.480142238978621, "grad_norm": 7.500361919403076, "learning_rate": 2.1117717075705324e-05, "loss": 2.185984420776367, "memory(GiB)": 77.56, "step": 81230, "token_acc": 0.5203252032520326, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.4803564543078704, "grad_norm": 7.425896644592285, "learning_rate": 2.1112223906852973e-05, "loss": 2.440434455871582, "memory(GiB)": 77.56, "step": 81235, "token_acc": 0.4898785425101215, "train_speed(iter/s)": 1.437485 }, { "epoch": 3.4805706696371193, "grad_norm": 8.427542686462402, "learning_rate": 2.1106731261329783e-05, "loss": 2.3623069763183593, "memory(GiB)": 77.56, "step": 81240, "token_acc": 0.5017064846416383, "train_speed(iter/s)": 1.437509 }, { "epoch": 3.480784884966368, "grad_norm": 8.56961441040039, "learning_rate": 2.1101239139235263e-05, "loss": 2.473025894165039, "memory(GiB)": 77.56, "step": 81245, "token_acc": 0.4435146443514644, "train_speed(iter/s)": 1.437506 }, { "epoch": 3.4809991002956173, "grad_norm": 5.188290119171143, "learning_rate": 2.109574754066888e-05, "loss": 2.5860151290893554, "memory(GiB)": 77.56, "step": 81250, "token_acc": 0.5018181818181818, "train_speed(iter/s)": 1.437496 }, { "epoch": 3.481213315624866, "grad_norm": 7.030704975128174, "learning_rate": 2.109025646573017e-05, "loss": 2.461176109313965, "memory(GiB)": 77.56, "step": 81255, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.437508 }, { "epoch": 3.481427530954115, "grad_norm": 5.170688152313232, "learning_rate": 2.108476591451858e-05, "loss": 2.539849853515625, "memory(GiB)": 77.56, "step": 81260, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.437506 }, { "epoch": 3.481641746283364, "grad_norm": 5.83721923828125, "learning_rate": 2.1079275887133565e-05, "loss": 2.1372711181640627, "memory(GiB)": 77.56, "step": 81265, "token_acc": 0.541095890410959, "train_speed(iter/s)": 1.437495 }, { "epoch": 3.481855961612613, "grad_norm": 5.7097272872924805, "learning_rate": 2.107378638367461e-05, "loss": 2.3553020477294924, "memory(GiB)": 77.56, "step": 81270, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.437485 }, { "epoch": 3.482070176941862, "grad_norm": 5.059092998504639, "learning_rate": 2.1068297404241156e-05, "loss": 2.374205780029297, "memory(GiB)": 77.56, "step": 81275, "token_acc": 0.49117647058823527, "train_speed(iter/s)": 1.437476 }, { "epoch": 3.482284392271111, "grad_norm": 6.398963451385498, "learning_rate": 2.1062808948932633e-05, "loss": 2.45794792175293, "memory(GiB)": 77.56, "step": 81280, "token_acc": 0.434375, "train_speed(iter/s)": 1.437485 }, { "epoch": 3.48249860760036, "grad_norm": 6.465716361999512, "learning_rate": 2.105732101784845e-05, "loss": 2.0750759124755858, "memory(GiB)": 77.56, "step": 81285, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.437466 }, { "epoch": 3.4827128229296087, "grad_norm": 4.81996488571167, "learning_rate": 2.105183361108807e-05, "loss": 2.3773626327514648, "memory(GiB)": 77.56, "step": 81290, "token_acc": 0.5230263157894737, "train_speed(iter/s)": 1.437455 }, { "epoch": 3.482927038258858, "grad_norm": 4.905280590057373, "learning_rate": 2.1046346728750872e-05, "loss": 2.4053321838378907, "memory(GiB)": 77.56, "step": 81295, "token_acc": 0.4738675958188153, "train_speed(iter/s)": 1.437473 }, { "epoch": 3.483141253588107, "grad_norm": 5.443066120147705, "learning_rate": 2.104086037093627e-05, "loss": 2.1631118774414064, "memory(GiB)": 77.56, "step": 81300, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437486 }, { "epoch": 3.4833554689173556, "grad_norm": 9.772308349609375, "learning_rate": 2.1035374537743645e-05, "loss": 2.531840515136719, "memory(GiB)": 77.56, "step": 81305, "token_acc": 0.47717842323651455, "train_speed(iter/s)": 1.437495 }, { "epoch": 3.483569684246605, "grad_norm": 6.452080249786377, "learning_rate": 2.1029889229272364e-05, "loss": 2.456101417541504, "memory(GiB)": 77.56, "step": 81310, "token_acc": 0.49823321554770317, "train_speed(iter/s)": 1.437505 }, { "epoch": 3.4837838995758537, "grad_norm": 6.846014499664307, "learning_rate": 2.1024404445621837e-05, "loss": 2.318460464477539, "memory(GiB)": 77.56, "step": 81315, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.437514 }, { "epoch": 3.4839981149051025, "grad_norm": 5.934245586395264, "learning_rate": 2.1018920186891407e-05, "loss": 2.5138072967529297, "memory(GiB)": 77.56, "step": 81320, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.437522 }, { "epoch": 3.4842123302343517, "grad_norm": 4.512868881225586, "learning_rate": 2.1013436453180417e-05, "loss": 1.8392452239990233, "memory(GiB)": 77.56, "step": 81325, "token_acc": 0.5708955223880597, "train_speed(iter/s)": 1.43753 }, { "epoch": 3.4844265455636005, "grad_norm": 7.431070804595947, "learning_rate": 2.1007953244588218e-05, "loss": 2.495029830932617, "memory(GiB)": 77.56, "step": 81330, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.437523 }, { "epoch": 3.4846407608928494, "grad_norm": 5.911373615264893, "learning_rate": 2.1002470561214126e-05, "loss": 2.294062042236328, "memory(GiB)": 77.56, "step": 81335, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.437527 }, { "epoch": 3.4848549762220986, "grad_norm": 6.209281921386719, "learning_rate": 2.09969884031575e-05, "loss": 2.4408111572265625, "memory(GiB)": 77.56, "step": 81340, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.437528 }, { "epoch": 3.4850691915513474, "grad_norm": 4.966249465942383, "learning_rate": 2.0991506770517622e-05, "loss": 2.3700326919555663, "memory(GiB)": 77.56, "step": 81345, "token_acc": 0.5015197568389058, "train_speed(iter/s)": 1.43752 }, { "epoch": 3.4852834068805962, "grad_norm": 5.710653781890869, "learning_rate": 2.0986025663393823e-05, "loss": 2.409952926635742, "memory(GiB)": 77.56, "step": 81350, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.437543 }, { "epoch": 3.4854976222098455, "grad_norm": 7.252102851867676, "learning_rate": 2.098054508188539e-05, "loss": 2.2739690780639648, "memory(GiB)": 77.56, "step": 81355, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.437546 }, { "epoch": 3.4857118375390943, "grad_norm": 4.753846168518066, "learning_rate": 2.0975065026091606e-05, "loss": 2.398512268066406, "memory(GiB)": 77.56, "step": 81360, "token_acc": 0.4548611111111111, "train_speed(iter/s)": 1.437557 }, { "epoch": 3.485926052868343, "grad_norm": 6.591948986053467, "learning_rate": 2.0969585496111743e-05, "loss": 2.8616899490356444, "memory(GiB)": 77.56, "step": 81365, "token_acc": 0.4485049833887043, "train_speed(iter/s)": 1.437582 }, { "epoch": 3.4861402681975924, "grad_norm": 7.861251354217529, "learning_rate": 2.0964106492045055e-05, "loss": 2.430014419555664, "memory(GiB)": 77.56, "step": 81370, "token_acc": 0.540268456375839, "train_speed(iter/s)": 1.437595 }, { "epoch": 3.486354483526841, "grad_norm": 7.088751316070557, "learning_rate": 2.0958628013990832e-05, "loss": 2.309187126159668, "memory(GiB)": 77.56, "step": 81375, "token_acc": 0.5220338983050847, "train_speed(iter/s)": 1.437599 }, { "epoch": 3.48656869885609, "grad_norm": 4.486125946044922, "learning_rate": 2.095315006204831e-05, "loss": 2.1281736373901365, "memory(GiB)": 77.56, "step": 81380, "token_acc": 0.5681818181818182, "train_speed(iter/s)": 1.437631 }, { "epoch": 3.4867829141853393, "grad_norm": 5.6165642738342285, "learning_rate": 2.0947672636316723e-05, "loss": 2.5054168701171875, "memory(GiB)": 77.56, "step": 81385, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.437613 }, { "epoch": 3.486997129514588, "grad_norm": 7.254798412322998, "learning_rate": 2.0942195736895293e-05, "loss": 2.51782112121582, "memory(GiB)": 77.56, "step": 81390, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.437601 }, { "epoch": 3.487211344843837, "grad_norm": 5.275437831878662, "learning_rate": 2.0936719363883234e-05, "loss": 2.4329294204711913, "memory(GiB)": 77.56, "step": 81395, "token_acc": 0.4956772334293948, "train_speed(iter/s)": 1.437593 }, { "epoch": 3.487425560173086, "grad_norm": 6.750207901000977, "learning_rate": 2.0931243517379784e-05, "loss": 2.5842845916748045, "memory(GiB)": 77.56, "step": 81400, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.437585 }, { "epoch": 3.487639775502335, "grad_norm": 5.144187927246094, "learning_rate": 2.0925768197484124e-05, "loss": 2.38464241027832, "memory(GiB)": 77.56, "step": 81405, "token_acc": 0.5044510385756676, "train_speed(iter/s)": 1.437583 }, { "epoch": 3.4878539908315838, "grad_norm": 4.889918327331543, "learning_rate": 2.092029340429545e-05, "loss": 1.939674186706543, "memory(GiB)": 77.56, "step": 81410, "token_acc": 0.5879828326180258, "train_speed(iter/s)": 1.437589 }, { "epoch": 3.488068206160833, "grad_norm": 7.400070667266846, "learning_rate": 2.0914819137912916e-05, "loss": 2.2888742446899415, "memory(GiB)": 77.56, "step": 81415, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.437593 }, { "epoch": 3.488282421490082, "grad_norm": 6.307043552398682, "learning_rate": 2.0909345398435743e-05, "loss": 2.389171600341797, "memory(GiB)": 77.56, "step": 81420, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 1.437593 }, { "epoch": 3.4884966368193306, "grad_norm": 6.655287265777588, "learning_rate": 2.0903872185963063e-05, "loss": 2.1464033126831055, "memory(GiB)": 77.56, "step": 81425, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.437581 }, { "epoch": 3.48871085214858, "grad_norm": 4.3760151863098145, "learning_rate": 2.0898399500594012e-05, "loss": 2.5138574600219727, "memory(GiB)": 77.56, "step": 81430, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.437588 }, { "epoch": 3.4889250674778287, "grad_norm": 5.804774761199951, "learning_rate": 2.0892927342427775e-05, "loss": 2.4694240570068358, "memory(GiB)": 77.56, "step": 81435, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.437592 }, { "epoch": 3.4891392828070775, "grad_norm": 7.967717170715332, "learning_rate": 2.0887455711563468e-05, "loss": 2.6059778213500975, "memory(GiB)": 77.56, "step": 81440, "token_acc": 0.5, "train_speed(iter/s)": 1.437583 }, { "epoch": 3.489353498136327, "grad_norm": 5.517899036407471, "learning_rate": 2.0881984608100207e-05, "loss": 2.2054464340209963, "memory(GiB)": 77.56, "step": 81445, "token_acc": 0.5080906148867314, "train_speed(iter/s)": 1.437592 }, { "epoch": 3.4895677134655756, "grad_norm": 6.188095569610596, "learning_rate": 2.0876514032137105e-05, "loss": 2.292200469970703, "memory(GiB)": 77.56, "step": 81450, "token_acc": 0.4635036496350365, "train_speed(iter/s)": 1.437587 }, { "epoch": 3.4897819287948244, "grad_norm": 5.592284202575684, "learning_rate": 2.087104398377326e-05, "loss": 2.3133792877197266, "memory(GiB)": 77.56, "step": 81455, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.43762 }, { "epoch": 3.4899961441240737, "grad_norm": 8.104219436645508, "learning_rate": 2.0865574463107794e-05, "loss": 2.0908971786499024, "memory(GiB)": 77.56, "step": 81460, "token_acc": 0.5381679389312977, "train_speed(iter/s)": 1.437629 }, { "epoch": 3.4902103594533225, "grad_norm": 4.688964366912842, "learning_rate": 2.0860105470239777e-05, "loss": 2.451948356628418, "memory(GiB)": 77.56, "step": 81465, "token_acc": 0.44054054054054054, "train_speed(iter/s)": 1.437641 }, { "epoch": 3.4904245747825713, "grad_norm": 3.8236515522003174, "learning_rate": 2.0854637005268284e-05, "loss": 2.4827356338500977, "memory(GiB)": 77.56, "step": 81470, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.43765 }, { "epoch": 3.4906387901118205, "grad_norm": 6.5485615730285645, "learning_rate": 2.0849169068292386e-05, "loss": 2.260237121582031, "memory(GiB)": 77.56, "step": 81475, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.437658 }, { "epoch": 3.4908530054410694, "grad_norm": 5.4037346839904785, "learning_rate": 2.0843701659411112e-05, "loss": 2.235797882080078, "memory(GiB)": 77.56, "step": 81480, "token_acc": 0.5358490566037736, "train_speed(iter/s)": 1.437671 }, { "epoch": 3.491067220770318, "grad_norm": 5.696934700012207, "learning_rate": 2.0838234778723555e-05, "loss": 2.248728370666504, "memory(GiB)": 77.56, "step": 81485, "token_acc": 0.5472972972972973, "train_speed(iter/s)": 1.437675 }, { "epoch": 3.4912814360995674, "grad_norm": 8.324548721313477, "learning_rate": 2.0832768426328715e-05, "loss": 2.201025390625, "memory(GiB)": 77.56, "step": 81490, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.437665 }, { "epoch": 3.4914956514288162, "grad_norm": 5.908787250518799, "learning_rate": 2.082730260232565e-05, "loss": 2.462218475341797, "memory(GiB)": 77.56, "step": 81495, "token_acc": 0.4958217270194986, "train_speed(iter/s)": 1.437685 }, { "epoch": 3.491709866758065, "grad_norm": 5.736169815063477, "learning_rate": 2.0821837306813364e-05, "loss": 2.101084518432617, "memory(GiB)": 77.56, "step": 81500, "token_acc": 0.5313807531380753, "train_speed(iter/s)": 1.437705 }, { "epoch": 3.491709866758065, "eval_loss": 2.1931748390197754, "eval_runtime": 14.3072, "eval_samples_per_second": 6.99, "eval_steps_per_second": 6.99, "eval_token_acc": 0.46185286103542234, "step": 81500 }, { "epoch": 3.4919240820873143, "grad_norm": 5.316596984863281, "learning_rate": 2.0816372539890872e-05, "loss": 2.1783926010131838, "memory(GiB)": 77.56, "step": 81505, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437335 }, { "epoch": 3.492138297416563, "grad_norm": 8.70543384552002, "learning_rate": 2.081090830165716e-05, "loss": 2.3440284729003906, "memory(GiB)": 77.56, "step": 81510, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.437353 }, { "epoch": 3.492352512745812, "grad_norm": 4.562430381774902, "learning_rate": 2.0805444592211216e-05, "loss": 2.322625923156738, "memory(GiB)": 77.56, "step": 81515, "token_acc": 0.5539358600583091, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.492566728075061, "grad_norm": 6.326747894287109, "learning_rate": 2.079998141165204e-05, "loss": 2.4467071533203124, "memory(GiB)": 77.56, "step": 81520, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.43735 }, { "epoch": 3.49278094340431, "grad_norm": 9.520673751831055, "learning_rate": 2.0794518760078596e-05, "loss": 1.9941160202026367, "memory(GiB)": 77.56, "step": 81525, "token_acc": 0.5585585585585585, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.492995158733559, "grad_norm": 6.004496097564697, "learning_rate": 2.0789056637589845e-05, "loss": 2.365845489501953, "memory(GiB)": 77.56, "step": 81530, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.437361 }, { "epoch": 3.493209374062808, "grad_norm": 4.822202205657959, "learning_rate": 2.078359504428473e-05, "loss": 2.146456718444824, "memory(GiB)": 77.56, "step": 81535, "token_acc": 0.5518518518518518, "train_speed(iter/s)": 1.437384 }, { "epoch": 3.493423589392057, "grad_norm": 5.514254093170166, "learning_rate": 2.077813398026218e-05, "loss": 2.5107051849365236, "memory(GiB)": 77.56, "step": 81540, "token_acc": 0.4718498659517426, "train_speed(iter/s)": 1.437374 }, { "epoch": 3.4936378047213057, "grad_norm": 7.6399946212768555, "learning_rate": 2.0772673445621165e-05, "loss": 2.2827678680419923, "memory(GiB)": 77.56, "step": 81545, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.437375 }, { "epoch": 3.493852020050555, "grad_norm": 5.592060565948486, "learning_rate": 2.076721344046059e-05, "loss": 2.3949600219726563, "memory(GiB)": 77.56, "step": 81550, "token_acc": 0.5228070175438596, "train_speed(iter/s)": 1.437398 }, { "epoch": 3.4940662353798038, "grad_norm": 6.032351493835449, "learning_rate": 2.0761753964879367e-05, "loss": 2.2148674011230467, "memory(GiB)": 77.56, "step": 81555, "token_acc": 0.5289855072463768, "train_speed(iter/s)": 1.437411 }, { "epoch": 3.4942804507090526, "grad_norm": 6.091811656951904, "learning_rate": 2.0756295018976397e-05, "loss": 2.4260675430297853, "memory(GiB)": 77.56, "step": 81560, "token_acc": 0.5302013422818792, "train_speed(iter/s)": 1.437396 }, { "epoch": 3.494494666038302, "grad_norm": 4.107726097106934, "learning_rate": 2.0750836602850553e-05, "loss": 2.0095905303955077, "memory(GiB)": 77.56, "step": 81565, "token_acc": 0.5653846153846154, "train_speed(iter/s)": 1.437406 }, { "epoch": 3.4947088813675506, "grad_norm": 5.650120735168457, "learning_rate": 2.0745378716600766e-05, "loss": 2.265056610107422, "memory(GiB)": 77.56, "step": 81570, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.437411 }, { "epoch": 3.4949230966967995, "grad_norm": 5.963672161102295, "learning_rate": 2.0739921360325866e-05, "loss": 2.457226371765137, "memory(GiB)": 77.56, "step": 81575, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.437421 }, { "epoch": 3.4951373120260487, "grad_norm": 5.260590553283691, "learning_rate": 2.0734464534124754e-05, "loss": 2.2100851058959963, "memory(GiB)": 77.56, "step": 81580, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.437449 }, { "epoch": 3.4953515273552975, "grad_norm": 6.349801540374756, "learning_rate": 2.0729008238096275e-05, "loss": 2.6900619506835937, "memory(GiB)": 77.56, "step": 81585, "token_acc": 0.48773006134969327, "train_speed(iter/s)": 1.437447 }, { "epoch": 3.4955657426845463, "grad_norm": 5.960821151733398, "learning_rate": 2.0723552472339268e-05, "loss": 2.267192840576172, "memory(GiB)": 77.56, "step": 81590, "token_acc": 0.5265151515151515, "train_speed(iter/s)": 1.43745 }, { "epoch": 3.4957799580137956, "grad_norm": 5.048311710357666, "learning_rate": 2.071809723695257e-05, "loss": 2.6000579833984374, "memory(GiB)": 77.56, "step": 81595, "token_acc": 0.4380664652567976, "train_speed(iter/s)": 1.437451 }, { "epoch": 3.4959941733430444, "grad_norm": 5.510818958282471, "learning_rate": 2.071264253203499e-05, "loss": 2.4653921127319336, "memory(GiB)": 77.56, "step": 81600, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.43744 }, { "epoch": 3.496208388672293, "grad_norm": 6.702828884124756, "learning_rate": 2.0707188357685393e-05, "loss": 2.6925033569335937, "memory(GiB)": 77.56, "step": 81605, "token_acc": 0.4732142857142857, "train_speed(iter/s)": 1.437451 }, { "epoch": 3.4964226040015425, "grad_norm": 5.453231334686279, "learning_rate": 2.0701734714002545e-05, "loss": 2.1410213470458985, "memory(GiB)": 77.56, "step": 81610, "token_acc": 0.590443686006826, "train_speed(iter/s)": 1.437451 }, { "epoch": 3.4966368193307913, "grad_norm": 5.779356479644775, "learning_rate": 2.0696281601085264e-05, "loss": 2.041061019897461, "memory(GiB)": 77.56, "step": 81615, "token_acc": 0.5533596837944664, "train_speed(iter/s)": 1.437461 }, { "epoch": 3.49685103466004, "grad_norm": 5.889928340911865, "learning_rate": 2.0690829019032327e-05, "loss": 2.3318981170654296, "memory(GiB)": 77.56, "step": 81620, "token_acc": 0.55, "train_speed(iter/s)": 1.437481 }, { "epoch": 3.4970652499892894, "grad_norm": 6.09144401550293, "learning_rate": 2.06853769679425e-05, "loss": 2.47115421295166, "memory(GiB)": 77.56, "step": 81625, "token_acc": 0.4713375796178344, "train_speed(iter/s)": 1.437507 }, { "epoch": 3.497279465318538, "grad_norm": 4.814365863800049, "learning_rate": 2.067992544791459e-05, "loss": 2.357257270812988, "memory(GiB)": 77.56, "step": 81630, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.437504 }, { "epoch": 3.497493680647787, "grad_norm": 6.003396511077881, "learning_rate": 2.067447445904731e-05, "loss": 2.341724395751953, "memory(GiB)": 77.56, "step": 81635, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.437505 }, { "epoch": 3.4977078959770362, "grad_norm": 6.591564655303955, "learning_rate": 2.066902400143946e-05, "loss": 2.381968307495117, "memory(GiB)": 77.56, "step": 81640, "token_acc": 0.475, "train_speed(iter/s)": 1.437531 }, { "epoch": 3.497922111306285, "grad_norm": 7.968791484832764, "learning_rate": 2.066357407518975e-05, "loss": 2.4607547760009765, "memory(GiB)": 77.56, "step": 81645, "token_acc": 0.46621621621621623, "train_speed(iter/s)": 1.437529 }, { "epoch": 3.498136326635534, "grad_norm": 4.376980304718018, "learning_rate": 2.0658124680396918e-05, "loss": 2.211495208740234, "memory(GiB)": 77.56, "step": 81650, "token_acc": 0.5025906735751295, "train_speed(iter/s)": 1.437514 }, { "epoch": 3.498350541964783, "grad_norm": 5.4497528076171875, "learning_rate": 2.065267581715968e-05, "loss": 2.395167922973633, "memory(GiB)": 77.56, "step": 81655, "token_acc": 0.5243055555555556, "train_speed(iter/s)": 1.437517 }, { "epoch": 3.498564757294032, "grad_norm": 5.857762813568115, "learning_rate": 2.0647227485576733e-05, "loss": 2.256017303466797, "memory(GiB)": 77.56, "step": 81660, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 1.437503 }, { "epoch": 3.4987789726232807, "grad_norm": 5.593625545501709, "learning_rate": 2.064177968574681e-05, "loss": 2.181442070007324, "memory(GiB)": 77.56, "step": 81665, "token_acc": 0.5016393442622951, "train_speed(iter/s)": 1.437535 }, { "epoch": 3.49899318795253, "grad_norm": 6.4224982261657715, "learning_rate": 2.0636332417768595e-05, "loss": 2.054220962524414, "memory(GiB)": 77.56, "step": 81670, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.43754 }, { "epoch": 3.499207403281779, "grad_norm": 6.219550609588623, "learning_rate": 2.0630885681740758e-05, "loss": 2.1898889541625977, "memory(GiB)": 77.56, "step": 81675, "token_acc": 0.5033112582781457, "train_speed(iter/s)": 1.437479 }, { "epoch": 3.4994216186110276, "grad_norm": 7.5490922927856445, "learning_rate": 2.0625439477761975e-05, "loss": 2.2260009765625, "memory(GiB)": 77.56, "step": 81680, "token_acc": 0.5447470817120622, "train_speed(iter/s)": 1.437503 }, { "epoch": 3.499635833940277, "grad_norm": 4.393792152404785, "learning_rate": 2.0619993805930904e-05, "loss": 2.2037561416625975, "memory(GiB)": 77.56, "step": 81685, "token_acc": 0.4972067039106145, "train_speed(iter/s)": 1.437494 }, { "epoch": 3.4998500492695257, "grad_norm": 6.435640335083008, "learning_rate": 2.0614548666346212e-05, "loss": 2.566878890991211, "memory(GiB)": 77.56, "step": 81690, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.437489 }, { "epoch": 3.500064264598775, "grad_norm": 5.867866039276123, "learning_rate": 2.060910405910654e-05, "loss": 2.516805648803711, "memory(GiB)": 77.56, "step": 81695, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.437471 }, { "epoch": 3.5002784799280238, "grad_norm": 8.683708190917969, "learning_rate": 2.0603659984310525e-05, "loss": 2.34912223815918, "memory(GiB)": 77.56, "step": 81700, "token_acc": 0.4713656387665198, "train_speed(iter/s)": 1.437477 }, { "epoch": 3.5004926952572726, "grad_norm": 8.144967079162598, "learning_rate": 2.0598216442056784e-05, "loss": 2.360447120666504, "memory(GiB)": 77.56, "step": 81705, "token_acc": 0.5170068027210885, "train_speed(iter/s)": 1.437482 }, { "epoch": 3.500706910586522, "grad_norm": 4.874293327331543, "learning_rate": 2.0592773432443913e-05, "loss": 2.5154451370239257, "memory(GiB)": 77.56, "step": 81710, "token_acc": 0.4517133956386293, "train_speed(iter/s)": 1.437457 }, { "epoch": 3.5009211259157706, "grad_norm": 5.730638027191162, "learning_rate": 2.058733095557055e-05, "loss": 2.2578094482421873, "memory(GiB)": 77.56, "step": 81715, "token_acc": 0.5232974910394266, "train_speed(iter/s)": 1.43742 }, { "epoch": 3.5011353412450195, "grad_norm": 6.449235439300537, "learning_rate": 2.0581889011535264e-05, "loss": 2.304231071472168, "memory(GiB)": 77.56, "step": 81720, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.437429 }, { "epoch": 3.5013495565742687, "grad_norm": 5.849225044250488, "learning_rate": 2.0576447600436676e-05, "loss": 2.5435646057128904, "memory(GiB)": 77.56, "step": 81725, "token_acc": 0.5078740157480315, "train_speed(iter/s)": 1.437414 }, { "epoch": 3.5015637719035175, "grad_norm": 5.297560691833496, "learning_rate": 2.0571006722373343e-05, "loss": 2.2740482330322265, "memory(GiB)": 77.56, "step": 81730, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437396 }, { "epoch": 3.5017779872327663, "grad_norm": 6.248037338256836, "learning_rate": 2.0565566377443823e-05, "loss": 2.4441009521484376, "memory(GiB)": 77.56, "step": 81735, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.437391 }, { "epoch": 3.5019922025620156, "grad_norm": 5.896634578704834, "learning_rate": 2.0560126565746686e-05, "loss": 2.598617935180664, "memory(GiB)": 77.56, "step": 81740, "token_acc": 0.4724137931034483, "train_speed(iter/s)": 1.437403 }, { "epoch": 3.5022064178912644, "grad_norm": 4.903329849243164, "learning_rate": 2.0554687287380447e-05, "loss": 2.1694101333618163, "memory(GiB)": 77.56, "step": 81745, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437407 }, { "epoch": 3.5024206332205132, "grad_norm": 5.899200439453125, "learning_rate": 2.0549248542443695e-05, "loss": 2.40710391998291, "memory(GiB)": 77.56, "step": 81750, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.437433 }, { "epoch": 3.5026348485497625, "grad_norm": 7.038031101226807, "learning_rate": 2.0543810331034928e-05, "loss": 2.2765064239501953, "memory(GiB)": 77.56, "step": 81755, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.437414 }, { "epoch": 3.5028490638790113, "grad_norm": 5.879939556121826, "learning_rate": 2.0538372653252673e-05, "loss": 2.6087474822998047, "memory(GiB)": 77.56, "step": 81760, "token_acc": 0.46397694524495675, "train_speed(iter/s)": 1.437424 }, { "epoch": 3.50306327920826, "grad_norm": 5.754982948303223, "learning_rate": 2.0532935509195432e-05, "loss": 2.3488487243652343, "memory(GiB)": 77.56, "step": 81765, "token_acc": 0.49640287769784175, "train_speed(iter/s)": 1.437419 }, { "epoch": 3.5032774945375094, "grad_norm": 5.459589958190918, "learning_rate": 2.0527498898961695e-05, "loss": 2.2230276107788085, "memory(GiB)": 77.56, "step": 81770, "token_acc": 0.5539568345323741, "train_speed(iter/s)": 1.437421 }, { "epoch": 3.503491709866758, "grad_norm": 5.302542209625244, "learning_rate": 2.0522062822649973e-05, "loss": 2.6603164672851562, "memory(GiB)": 77.56, "step": 81775, "token_acc": 0.47928994082840237, "train_speed(iter/s)": 1.437428 }, { "epoch": 3.503705925196007, "grad_norm": 8.425333976745605, "learning_rate": 2.0516627280358737e-05, "loss": 2.5466064453125, "memory(GiB)": 77.56, "step": 81780, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.437424 }, { "epoch": 3.5039201405252562, "grad_norm": 6.835321426391602, "learning_rate": 2.051119227218644e-05, "loss": 2.2706045150756835, "memory(GiB)": 77.56, "step": 81785, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.437458 }, { "epoch": 3.504134355854505, "grad_norm": 5.7809319496154785, "learning_rate": 2.0505757798231573e-05, "loss": 2.8129369735717775, "memory(GiB)": 77.56, "step": 81790, "token_acc": 0.44744744744744747, "train_speed(iter/s)": 1.437455 }, { "epoch": 3.504348571183754, "grad_norm": 5.822942733764648, "learning_rate": 2.0500323858592572e-05, "loss": 2.3328447341918945, "memory(GiB)": 77.56, "step": 81795, "token_acc": 0.490625, "train_speed(iter/s)": 1.437468 }, { "epoch": 3.504562786513003, "grad_norm": 7.569825172424316, "learning_rate": 2.0494890453367875e-05, "loss": 2.5288734436035156, "memory(GiB)": 77.56, "step": 81800, "token_acc": 0.4577922077922078, "train_speed(iter/s)": 1.437479 }, { "epoch": 3.504777001842252, "grad_norm": 7.130863189697266, "learning_rate": 2.0489457582655902e-05, "loss": 2.171244812011719, "memory(GiB)": 77.56, "step": 81805, "token_acc": 0.5430711610486891, "train_speed(iter/s)": 1.437491 }, { "epoch": 3.5049912171715008, "grad_norm": 5.355434894561768, "learning_rate": 2.0484025246555107e-05, "loss": 2.0865997314453124, "memory(GiB)": 77.56, "step": 81810, "token_acc": 0.5320754716981132, "train_speed(iter/s)": 1.437488 }, { "epoch": 3.50520543250075, "grad_norm": 4.92542028427124, "learning_rate": 2.047859344516388e-05, "loss": 2.3790191650390624, "memory(GiB)": 77.56, "step": 81815, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437504 }, { "epoch": 3.505419647829999, "grad_norm": 5.839577674865723, "learning_rate": 2.0473162178580622e-05, "loss": 2.576974296569824, "memory(GiB)": 77.56, "step": 81820, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.437518 }, { "epoch": 3.5056338631592476, "grad_norm": 4.92479944229126, "learning_rate": 2.0467731446903736e-05, "loss": 2.3958988189697266, "memory(GiB)": 77.56, "step": 81825, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.437488 }, { "epoch": 3.505848078488497, "grad_norm": 6.27665376663208, "learning_rate": 2.046230125023158e-05, "loss": 2.234067916870117, "memory(GiB)": 77.56, "step": 81830, "token_acc": 0.5244299674267101, "train_speed(iter/s)": 1.437484 }, { "epoch": 3.5060622938177457, "grad_norm": 5.429213047027588, "learning_rate": 2.0456871588662563e-05, "loss": 2.456011962890625, "memory(GiB)": 77.56, "step": 81835, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.437489 }, { "epoch": 3.5062765091469945, "grad_norm": 5.514077663421631, "learning_rate": 2.0451442462295028e-05, "loss": 2.261183738708496, "memory(GiB)": 77.56, "step": 81840, "token_acc": 0.45481049562682213, "train_speed(iter/s)": 1.437494 }, { "epoch": 3.5064907244762438, "grad_norm": 5.656719207763672, "learning_rate": 2.044601387122733e-05, "loss": 2.2159067153930665, "memory(GiB)": 77.56, "step": 81845, "token_acc": 0.5442622950819672, "train_speed(iter/s)": 1.437503 }, { "epoch": 3.5067049398054926, "grad_norm": 7.250095844268799, "learning_rate": 2.044058581555782e-05, "loss": 2.1542308807373045, "memory(GiB)": 77.56, "step": 81850, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.437502 }, { "epoch": 3.5069191551347414, "grad_norm": 6.025059223175049, "learning_rate": 2.0435158295384826e-05, "loss": 2.0294025421142576, "memory(GiB)": 77.56, "step": 81855, "token_acc": 0.5756457564575646, "train_speed(iter/s)": 1.437483 }, { "epoch": 3.5071333704639907, "grad_norm": 4.839198112487793, "learning_rate": 2.0429731310806655e-05, "loss": 2.4145973205566404, "memory(GiB)": 77.56, "step": 81860, "token_acc": 0.49074074074074076, "train_speed(iter/s)": 1.437505 }, { "epoch": 3.5073475857932395, "grad_norm": 6.596020221710205, "learning_rate": 2.042430486192164e-05, "loss": 2.3078685760498048, "memory(GiB)": 77.56, "step": 81865, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.437516 }, { "epoch": 3.5075618011224883, "grad_norm": 6.517064094543457, "learning_rate": 2.04188789488281e-05, "loss": 2.580352783203125, "memory(GiB)": 77.56, "step": 81870, "token_acc": 0.44242424242424244, "train_speed(iter/s)": 1.437511 }, { "epoch": 3.5077760164517375, "grad_norm": 6.77636194229126, "learning_rate": 2.0413453571624323e-05, "loss": 2.6035734176635743, "memory(GiB)": 77.56, "step": 81875, "token_acc": 0.4831932773109244, "train_speed(iter/s)": 1.437524 }, { "epoch": 3.5079902317809863, "grad_norm": 7.556825160980225, "learning_rate": 2.0408028730408584e-05, "loss": 2.0738378524780274, "memory(GiB)": 77.56, "step": 81880, "token_acc": 0.5953307392996109, "train_speed(iter/s)": 1.437543 }, { "epoch": 3.508204447110235, "grad_norm": 7.605564594268799, "learning_rate": 2.040260442527917e-05, "loss": 2.4505847930908202, "memory(GiB)": 77.56, "step": 81885, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.43755 }, { "epoch": 3.5084186624394844, "grad_norm": 6.6986517906188965, "learning_rate": 2.0397180656334318e-05, "loss": 2.1217586517333986, "memory(GiB)": 77.56, "step": 81890, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.437552 }, { "epoch": 3.5086328777687332, "grad_norm": 6.056543827056885, "learning_rate": 2.039175742367233e-05, "loss": 2.1005084991455076, "memory(GiB)": 77.56, "step": 81895, "token_acc": 0.5298013245033113, "train_speed(iter/s)": 1.437545 }, { "epoch": 3.508847093097982, "grad_norm": 5.604673862457275, "learning_rate": 2.0386334727391432e-05, "loss": 2.2785987854003906, "memory(GiB)": 77.56, "step": 81900, "token_acc": 0.5295698924731183, "train_speed(iter/s)": 1.437568 }, { "epoch": 3.5090613084272313, "grad_norm": 4.90866756439209, "learning_rate": 2.038091256758986e-05, "loss": 2.545729064941406, "memory(GiB)": 77.56, "step": 81905, "token_acc": 0.4805194805194805, "train_speed(iter/s)": 1.437573 }, { "epoch": 3.50927552375648, "grad_norm": 6.3202595710754395, "learning_rate": 2.037549094436584e-05, "loss": 2.1428165435791016, "memory(GiB)": 77.56, "step": 81910, "token_acc": 0.5092936802973977, "train_speed(iter/s)": 1.437584 }, { "epoch": 3.509489739085729, "grad_norm": 6.21103048324585, "learning_rate": 2.0370069857817576e-05, "loss": 2.1100795745849608, "memory(GiB)": 77.56, "step": 81915, "token_acc": 0.5461538461538461, "train_speed(iter/s)": 1.43758 }, { "epoch": 3.509703954414978, "grad_norm": 5.559045314788818, "learning_rate": 2.0364649308043303e-05, "loss": 2.4545820236206053, "memory(GiB)": 77.56, "step": 81920, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.437577 }, { "epoch": 3.509918169744227, "grad_norm": 6.621930122375488, "learning_rate": 2.0359229295141213e-05, "loss": 2.326128387451172, "memory(GiB)": 77.56, "step": 81925, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.437587 }, { "epoch": 3.510132385073476, "grad_norm": 5.367328643798828, "learning_rate": 2.035380981920949e-05, "loss": 2.347463607788086, "memory(GiB)": 77.56, "step": 81930, "token_acc": 0.5119453924914675, "train_speed(iter/s)": 1.437574 }, { "epoch": 3.510346600402725, "grad_norm": 8.202676773071289, "learning_rate": 2.034839088034629e-05, "loss": 2.2407098770141602, "memory(GiB)": 77.56, "step": 81935, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.437585 }, { "epoch": 3.510560815731974, "grad_norm": 5.3595967292785645, "learning_rate": 2.034297247864983e-05, "loss": 2.0268245697021485, "memory(GiB)": 77.56, "step": 81940, "token_acc": 0.5631067961165048, "train_speed(iter/s)": 1.437559 }, { "epoch": 3.5107750310612227, "grad_norm": 6.485270977020264, "learning_rate": 2.033755461421824e-05, "loss": 2.417161560058594, "memory(GiB)": 77.56, "step": 81945, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.437573 }, { "epoch": 3.510989246390472, "grad_norm": 5.715497016906738, "learning_rate": 2.0332137287149654e-05, "loss": 2.437858772277832, "memory(GiB)": 77.56, "step": 81950, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.437598 }, { "epoch": 3.5112034617197208, "grad_norm": 7.498754978179932, "learning_rate": 2.0326720497542252e-05, "loss": 2.150317573547363, "memory(GiB)": 77.56, "step": 81955, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.4376 }, { "epoch": 3.5114176770489696, "grad_norm": 5.521462440490723, "learning_rate": 2.0321304245494145e-05, "loss": 2.2425622940063477, "memory(GiB)": 77.56, "step": 81960, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.437605 }, { "epoch": 3.511631892378219, "grad_norm": 7.329811096191406, "learning_rate": 2.031588853110345e-05, "loss": 2.3874088287353517, "memory(GiB)": 77.56, "step": 81965, "token_acc": 0.5176056338028169, "train_speed(iter/s)": 1.437621 }, { "epoch": 3.5118461077074676, "grad_norm": 7.661202907562256, "learning_rate": 2.0310473354468283e-05, "loss": 2.218642997741699, "memory(GiB)": 77.56, "step": 81970, "token_acc": 0.5084175084175084, "train_speed(iter/s)": 1.437618 }, { "epoch": 3.5120603230367164, "grad_norm": 6.89029598236084, "learning_rate": 2.030505871568672e-05, "loss": 2.4021123886108398, "memory(GiB)": 77.56, "step": 81975, "token_acc": 0.5210727969348659, "train_speed(iter/s)": 1.437621 }, { "epoch": 3.5122745383659657, "grad_norm": 5.101368427276611, "learning_rate": 2.0299644614856895e-05, "loss": 2.3878353118896483, "memory(GiB)": 77.56, "step": 81980, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.437632 }, { "epoch": 3.5124887536952145, "grad_norm": 5.6344990730285645, "learning_rate": 2.0294231052076874e-05, "loss": 2.1968948364257814, "memory(GiB)": 77.56, "step": 81985, "token_acc": 0.5186335403726708, "train_speed(iter/s)": 1.437636 }, { "epoch": 3.5127029690244633, "grad_norm": 6.068678855895996, "learning_rate": 2.028881802744472e-05, "loss": 2.401434326171875, "memory(GiB)": 77.56, "step": 81990, "token_acc": 0.5174418604651163, "train_speed(iter/s)": 1.437636 }, { "epoch": 3.5129171843537126, "grad_norm": 7.214240074157715, "learning_rate": 2.0283405541058494e-05, "loss": 2.4219038009643556, "memory(GiB)": 77.56, "step": 81995, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.437615 }, { "epoch": 3.5131313996829614, "grad_norm": 5.917394638061523, "learning_rate": 2.0277993593016252e-05, "loss": 2.2924312591552733, "memory(GiB)": 77.56, "step": 82000, "token_acc": 0.5179153094462541, "train_speed(iter/s)": 1.437618 }, { "epoch": 3.5131313996829614, "eval_loss": 2.2839174270629883, "eval_runtime": 14.06, "eval_samples_per_second": 7.112, "eval_steps_per_second": 7.112, "eval_token_acc": 0.4642857142857143, "step": 82000 }, { "epoch": 3.51334561501221, "grad_norm": 6.334027290344238, "learning_rate": 2.0272582183416027e-05, "loss": 2.2294769287109375, "memory(GiB)": 77.56, "step": 82005, "token_acc": 0.47673314339981004, "train_speed(iter/s)": 1.437242 }, { "epoch": 3.5135598303414595, "grad_norm": 6.302854061126709, "learning_rate": 2.026717131235586e-05, "loss": 2.472804641723633, "memory(GiB)": 77.56, "step": 82010, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.437249 }, { "epoch": 3.5137740456707083, "grad_norm": 7.024232387542725, "learning_rate": 2.0261760979933796e-05, "loss": 2.2838665008544923, "memory(GiB)": 77.56, "step": 82015, "token_acc": 0.4847560975609756, "train_speed(iter/s)": 1.437265 }, { "epoch": 3.513988260999957, "grad_norm": 4.253474235534668, "learning_rate": 2.0256351186247824e-05, "loss": 2.4336471557617188, "memory(GiB)": 77.56, "step": 82020, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.437267 }, { "epoch": 3.5142024763292063, "grad_norm": 6.034378528594971, "learning_rate": 2.0250941931395957e-05, "loss": 2.303436279296875, "memory(GiB)": 77.56, "step": 82025, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.43726 }, { "epoch": 3.514416691658455, "grad_norm": 5.84157657623291, "learning_rate": 2.024553321547618e-05, "loss": 2.523159980773926, "memory(GiB)": 77.56, "step": 82030, "token_acc": 0.4336283185840708, "train_speed(iter/s)": 1.437259 }, { "epoch": 3.514630906987704, "grad_norm": 5.684593200683594, "learning_rate": 2.0240125038586465e-05, "loss": 2.3350486755371094, "memory(GiB)": 77.56, "step": 82035, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.437264 }, { "epoch": 3.5148451223169532, "grad_norm": 5.939591407775879, "learning_rate": 2.0234717400824814e-05, "loss": 2.566223907470703, "memory(GiB)": 77.56, "step": 82040, "token_acc": 0.46905537459283386, "train_speed(iter/s)": 1.437263 }, { "epoch": 3.515059337646202, "grad_norm": 4.637340068817139, "learning_rate": 2.022931030228919e-05, "loss": 2.2540449142456054, "memory(GiB)": 77.56, "step": 82045, "token_acc": 0.5471698113207547, "train_speed(iter/s)": 1.437294 }, { "epoch": 3.515273552975451, "grad_norm": 5.105705261230469, "learning_rate": 2.0223903743077528e-05, "loss": 2.1845422744750977, "memory(GiB)": 77.56, "step": 82050, "token_acc": 0.5435540069686411, "train_speed(iter/s)": 1.437312 }, { "epoch": 3.5154877683047, "grad_norm": 5.498888969421387, "learning_rate": 2.0218497723287788e-05, "loss": 2.3621320724487305, "memory(GiB)": 77.56, "step": 82055, "token_acc": 0.5112994350282486, "train_speed(iter/s)": 1.437325 }, { "epoch": 3.515701983633949, "grad_norm": 5.912207126617432, "learning_rate": 2.021309224301788e-05, "loss": 1.9903440475463867, "memory(GiB)": 77.56, "step": 82060, "token_acc": 0.5597269624573379, "train_speed(iter/s)": 1.437336 }, { "epoch": 3.5159161989631977, "grad_norm": 6.581540584564209, "learning_rate": 2.0207687302365762e-05, "loss": 2.4122432708740233, "memory(GiB)": 77.56, "step": 82065, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.437323 }, { "epoch": 3.516130414292447, "grad_norm": 5.1985673904418945, "learning_rate": 2.0202282901429338e-05, "loss": 2.209666633605957, "memory(GiB)": 77.56, "step": 82070, "token_acc": 0.5278969957081545, "train_speed(iter/s)": 1.43734 }, { "epoch": 3.516344629621696, "grad_norm": 6.54959774017334, "learning_rate": 2.019687904030651e-05, "loss": 2.4396503448486326, "memory(GiB)": 77.56, "step": 82075, "token_acc": 0.4691358024691358, "train_speed(iter/s)": 1.437362 }, { "epoch": 3.5165588449509446, "grad_norm": 6.053768634796143, "learning_rate": 2.0191475719095166e-05, "loss": 2.1498748779296877, "memory(GiB)": 77.56, "step": 82080, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.437356 }, { "epoch": 3.516773060280194, "grad_norm": 5.249053001403809, "learning_rate": 2.0186072937893212e-05, "loss": 2.1845705032348635, "memory(GiB)": 77.56, "step": 82085, "token_acc": 0.5013698630136987, "train_speed(iter/s)": 1.437356 }, { "epoch": 3.5169872756094427, "grad_norm": 6.844965934753418, "learning_rate": 2.0180670696798514e-05, "loss": 2.3761863708496094, "memory(GiB)": 77.56, "step": 82090, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.437358 }, { "epoch": 3.5172014909386915, "grad_norm": 6.118383884429932, "learning_rate": 2.0175268995908924e-05, "loss": 2.374035453796387, "memory(GiB)": 77.56, "step": 82095, "token_acc": 0.5214521452145214, "train_speed(iter/s)": 1.437351 }, { "epoch": 3.5174157062679408, "grad_norm": 6.474025249481201, "learning_rate": 2.0169867835322332e-05, "loss": 2.2998636245727537, "memory(GiB)": 77.56, "step": 82100, "token_acc": 0.5310344827586206, "train_speed(iter/s)": 1.437361 }, { "epoch": 3.5176299215971896, "grad_norm": 5.027404308319092, "learning_rate": 2.0164467215136566e-05, "loss": 2.2003076553344725, "memory(GiB)": 77.56, "step": 82105, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.437367 }, { "epoch": 3.5178441369264384, "grad_norm": 4.83841609954834, "learning_rate": 2.0159067135449467e-05, "loss": 2.483617401123047, "memory(GiB)": 77.56, "step": 82110, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 1.43735 }, { "epoch": 3.5180583522556876, "grad_norm": 5.018257141113281, "learning_rate": 2.0153667596358862e-05, "loss": 2.6086502075195312, "memory(GiB)": 77.56, "step": 82115, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.437344 }, { "epoch": 3.5182725675849364, "grad_norm": 7.816195011138916, "learning_rate": 2.0148268597962544e-05, "loss": 2.1150815963745115, "memory(GiB)": 77.56, "step": 82120, "token_acc": 0.5488215488215489, "train_speed(iter/s)": 1.437343 }, { "epoch": 3.5184867829141853, "grad_norm": 7.717521667480469, "learning_rate": 2.014287014035836e-05, "loss": 2.6598011016845704, "memory(GiB)": 77.56, "step": 82125, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.43736 }, { "epoch": 3.5187009982434345, "grad_norm": 11.461997985839844, "learning_rate": 2.0137472223644093e-05, "loss": 2.650553894042969, "memory(GiB)": 77.56, "step": 82130, "token_acc": 0.46757679180887374, "train_speed(iter/s)": 1.437342 }, { "epoch": 3.5189152135726833, "grad_norm": 6.628066539764404, "learning_rate": 2.0132074847917533e-05, "loss": 2.4993259429931642, "memory(GiB)": 77.56, "step": 82135, "token_acc": 0.4854014598540146, "train_speed(iter/s)": 1.437328 }, { "epoch": 3.519129428901932, "grad_norm": 7.059751987457275, "learning_rate": 2.012667801327645e-05, "loss": 2.2334800720214845, "memory(GiB)": 77.56, "step": 82140, "token_acc": 0.518796992481203, "train_speed(iter/s)": 1.437317 }, { "epoch": 3.5193436442311814, "grad_norm": 6.7917304039001465, "learning_rate": 2.0121281719818625e-05, "loss": 2.2749711990356447, "memory(GiB)": 77.56, "step": 82145, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 1.437319 }, { "epoch": 3.51955785956043, "grad_norm": 5.486000061035156, "learning_rate": 2.011588596764179e-05, "loss": 2.346443939208984, "memory(GiB)": 77.56, "step": 82150, "token_acc": 0.5032467532467533, "train_speed(iter/s)": 1.437333 }, { "epoch": 3.519772074889679, "grad_norm": 5.41943359375, "learning_rate": 2.0110490756843714e-05, "loss": 2.428234672546387, "memory(GiB)": 77.56, "step": 82155, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 1.437359 }, { "epoch": 3.5199862902189283, "grad_norm": 4.3501877784729, "learning_rate": 2.0105096087522153e-05, "loss": 2.333928680419922, "memory(GiB)": 77.56, "step": 82160, "token_acc": 0.49363057324840764, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.520200505548177, "grad_norm": 6.327761650085449, "learning_rate": 2.009970195977482e-05, "loss": 2.576307487487793, "memory(GiB)": 77.56, "step": 82165, "token_acc": 0.4458204334365325, "train_speed(iter/s)": 1.437382 }, { "epoch": 3.520414720877426, "grad_norm": 7.110805988311768, "learning_rate": 2.0094308373699434e-05, "loss": 2.0621532440185546, "memory(GiB)": 77.56, "step": 82170, "token_acc": 0.5061349693251533, "train_speed(iter/s)": 1.437386 }, { "epoch": 3.520628936206675, "grad_norm": 7.577486038208008, "learning_rate": 2.008891532939371e-05, "loss": 2.73079833984375, "memory(GiB)": 77.56, "step": 82175, "token_acc": 0.4370629370629371, "train_speed(iter/s)": 1.437405 }, { "epoch": 3.520843151535924, "grad_norm": 7.6457719802856445, "learning_rate": 2.008352282695532e-05, "loss": 2.592006492614746, "memory(GiB)": 77.56, "step": 82180, "token_acc": 0.4672131147540984, "train_speed(iter/s)": 1.437418 }, { "epoch": 3.521057366865173, "grad_norm": 4.591124057769775, "learning_rate": 2.0078130866481998e-05, "loss": 2.239780044555664, "memory(GiB)": 77.56, "step": 82185, "token_acc": 0.4935897435897436, "train_speed(iter/s)": 1.437412 }, { "epoch": 3.521271582194422, "grad_norm": 6.016817569732666, "learning_rate": 2.0072739448071405e-05, "loss": 2.174859809875488, "memory(GiB)": 77.56, "step": 82190, "token_acc": 0.5176470588235295, "train_speed(iter/s)": 1.437417 }, { "epoch": 3.521485797523671, "grad_norm": 7.4195661544799805, "learning_rate": 2.0067348571821214e-05, "loss": 2.3572559356689453, "memory(GiB)": 77.56, "step": 82195, "token_acc": 0.46502057613168724, "train_speed(iter/s)": 1.437416 }, { "epoch": 3.5217000128529197, "grad_norm": 5.714663028717041, "learning_rate": 2.0061958237829075e-05, "loss": 2.1579971313476562, "memory(GiB)": 77.56, "step": 82200, "token_acc": 0.5387323943661971, "train_speed(iter/s)": 1.437422 }, { "epoch": 3.521914228182169, "grad_norm": 6.145798683166504, "learning_rate": 2.0056568446192636e-05, "loss": 2.239365005493164, "memory(GiB)": 77.56, "step": 82205, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.437388 }, { "epoch": 3.5221284435114177, "grad_norm": 5.390622615814209, "learning_rate": 2.0051179197009568e-05, "loss": 2.5779598236083983, "memory(GiB)": 77.56, "step": 82210, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.437402 }, { "epoch": 3.5223426588406666, "grad_norm": 5.716423988342285, "learning_rate": 2.0045790490377476e-05, "loss": 2.153798484802246, "memory(GiB)": 77.56, "step": 82215, "token_acc": 0.4983922829581994, "train_speed(iter/s)": 1.437385 }, { "epoch": 3.522556874169916, "grad_norm": 5.381307601928711, "learning_rate": 2.0040402326393993e-05, "loss": 2.4117923736572267, "memory(GiB)": 77.56, "step": 82220, "token_acc": 0.45054945054945056, "train_speed(iter/s)": 1.43736 }, { "epoch": 3.5227710894991646, "grad_norm": 8.643548965454102, "learning_rate": 2.0035014705156723e-05, "loss": 2.2229400634765626, "memory(GiB)": 77.56, "step": 82225, "token_acc": 0.5463917525773195, "train_speed(iter/s)": 1.437353 }, { "epoch": 3.5229853048284134, "grad_norm": 7.187229633331299, "learning_rate": 2.002962762676326e-05, "loss": 2.3501636505126955, "memory(GiB)": 77.56, "step": 82230, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.437376 }, { "epoch": 3.5231995201576627, "grad_norm": 5.810476303100586, "learning_rate": 2.0024241091311218e-05, "loss": 2.4225767135620115, "memory(GiB)": 77.56, "step": 82235, "token_acc": 0.5038461538461538, "train_speed(iter/s)": 1.437371 }, { "epoch": 3.5234137354869115, "grad_norm": 7.631504535675049, "learning_rate": 2.0018855098898153e-05, "loss": 2.152008056640625, "memory(GiB)": 77.56, "step": 82240, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.437355 }, { "epoch": 3.5236279508161603, "grad_norm": 6.231959342956543, "learning_rate": 2.001346964962167e-05, "loss": 2.5960718154907227, "memory(GiB)": 77.56, "step": 82245, "token_acc": 0.4652567975830816, "train_speed(iter/s)": 1.437366 }, { "epoch": 3.5238421661454096, "grad_norm": 5.141421794891357, "learning_rate": 2.0008084743579313e-05, "loss": 2.4458024978637694, "memory(GiB)": 77.56, "step": 82250, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.437376 }, { "epoch": 3.5240563814746584, "grad_norm": 9.120293617248535, "learning_rate": 2.0002700380868633e-05, "loss": 2.559148406982422, "memory(GiB)": 77.56, "step": 82255, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.437377 }, { "epoch": 3.524270596803907, "grad_norm": 5.958583354949951, "learning_rate": 1.999731656158718e-05, "loss": 2.4650705337524412, "memory(GiB)": 77.56, "step": 82260, "token_acc": 0.4588607594936709, "train_speed(iter/s)": 1.437369 }, { "epoch": 3.5244848121331565, "grad_norm": 8.771286964416504, "learning_rate": 1.9991933285832465e-05, "loss": 2.351560592651367, "memory(GiB)": 77.56, "step": 82265, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.437378 }, { "epoch": 3.5246990274624053, "grad_norm": 5.028775691986084, "learning_rate": 1.9986550553702045e-05, "loss": 2.2397974014282225, "memory(GiB)": 77.56, "step": 82270, "token_acc": 0.5105633802816901, "train_speed(iter/s)": 1.437401 }, { "epoch": 3.524913242791654, "grad_norm": 6.096969127655029, "learning_rate": 1.998116836529341e-05, "loss": 2.4847530364990233, "memory(GiB)": 77.56, "step": 82275, "token_acc": 0.5190311418685121, "train_speed(iter/s)": 1.437394 }, { "epoch": 3.5251274581209033, "grad_norm": 6.039845943450928, "learning_rate": 1.9975786720704083e-05, "loss": 2.4454181671142576, "memory(GiB)": 77.56, "step": 82280, "token_acc": 0.5261538461538462, "train_speed(iter/s)": 1.43741 }, { "epoch": 3.525341673450152, "grad_norm": 6.259216785430908, "learning_rate": 1.9970405620031534e-05, "loss": 2.182073211669922, "memory(GiB)": 77.56, "step": 82285, "token_acc": 0.5296442687747036, "train_speed(iter/s)": 1.437413 }, { "epoch": 3.525555888779401, "grad_norm": 6.64978551864624, "learning_rate": 1.996502506337326e-05, "loss": 2.5128606796264648, "memory(GiB)": 77.56, "step": 82290, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.437434 }, { "epoch": 3.52577010410865, "grad_norm": 6.265961170196533, "learning_rate": 1.9959645050826714e-05, "loss": 2.4377384185791016, "memory(GiB)": 77.56, "step": 82295, "token_acc": 0.4507936507936508, "train_speed(iter/s)": 1.437447 }, { "epoch": 3.525984319437899, "grad_norm": 5.836155414581299, "learning_rate": 1.995426558248938e-05, "loss": 2.300048828125, "memory(GiB)": 77.56, "step": 82300, "token_acc": 0.5127388535031847, "train_speed(iter/s)": 1.437446 }, { "epoch": 3.526198534767148, "grad_norm": 5.279250144958496, "learning_rate": 1.9948886658458727e-05, "loss": 2.5441802978515624, "memory(GiB)": 77.56, "step": 82305, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.437468 }, { "epoch": 3.526412750096397, "grad_norm": 4.949451446533203, "learning_rate": 1.9943508278832186e-05, "loss": 2.505910110473633, "memory(GiB)": 77.56, "step": 82310, "token_acc": 0.539622641509434, "train_speed(iter/s)": 1.437484 }, { "epoch": 3.526626965425646, "grad_norm": 5.963874340057373, "learning_rate": 1.993813044370718e-05, "loss": 2.241254425048828, "memory(GiB)": 77.56, "step": 82315, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.437478 }, { "epoch": 3.5268411807548947, "grad_norm": 5.687797546386719, "learning_rate": 1.993275315318115e-05, "loss": 2.353618049621582, "memory(GiB)": 77.56, "step": 82320, "token_acc": 0.528, "train_speed(iter/s)": 1.437477 }, { "epoch": 3.527055396084144, "grad_norm": 5.25728702545166, "learning_rate": 1.992737640735148e-05, "loss": 2.342526435852051, "memory(GiB)": 77.56, "step": 82325, "token_acc": 0.4823848238482385, "train_speed(iter/s)": 1.437463 }, { "epoch": 3.527269611413393, "grad_norm": 5.838143825531006, "learning_rate": 1.992200020631561e-05, "loss": 2.495411491394043, "memory(GiB)": 77.56, "step": 82330, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.437468 }, { "epoch": 3.5274838267426416, "grad_norm": 5.741201400756836, "learning_rate": 1.9916624550170926e-05, "loss": 2.5044696807861326, "memory(GiB)": 77.56, "step": 82335, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.437485 }, { "epoch": 3.527698042071891, "grad_norm": 5.947779655456543, "learning_rate": 1.9911249439014812e-05, "loss": 2.3775211334228517, "memory(GiB)": 77.56, "step": 82340, "token_acc": 0.4898785425101215, "train_speed(iter/s)": 1.437505 }, { "epoch": 3.5279122574011397, "grad_norm": 6.149830341339111, "learning_rate": 1.9905874872944635e-05, "loss": 2.2238487243652343, "memory(GiB)": 77.56, "step": 82345, "token_acc": 0.5222929936305732, "train_speed(iter/s)": 1.437503 }, { "epoch": 3.5281264727303885, "grad_norm": 5.631735801696777, "learning_rate": 1.9900500852057747e-05, "loss": 2.384516143798828, "memory(GiB)": 77.56, "step": 82350, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.437516 }, { "epoch": 3.5283406880596377, "grad_norm": 6.412612438201904, "learning_rate": 1.9895127376451544e-05, "loss": 2.508793258666992, "memory(GiB)": 77.56, "step": 82355, "token_acc": 0.4416058394160584, "train_speed(iter/s)": 1.437523 }, { "epoch": 3.5285549033888866, "grad_norm": 4.521615028381348, "learning_rate": 1.9889754446223348e-05, "loss": 2.44223575592041, "memory(GiB)": 77.56, "step": 82360, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 1.437532 }, { "epoch": 3.5287691187181354, "grad_norm": 6.081164360046387, "learning_rate": 1.9884382061470492e-05, "loss": 2.4649074554443358, "memory(GiB)": 77.56, "step": 82365, "token_acc": 0.5116959064327485, "train_speed(iter/s)": 1.437548 }, { "epoch": 3.5289833340473846, "grad_norm": 4.48116397857666, "learning_rate": 1.9879010222290314e-05, "loss": 2.298491096496582, "memory(GiB)": 77.56, "step": 82370, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.437551 }, { "epoch": 3.5291975493766334, "grad_norm": 7.052316665649414, "learning_rate": 1.9873638928780093e-05, "loss": 2.519794464111328, "memory(GiB)": 77.56, "step": 82375, "token_acc": 0.4463087248322148, "train_speed(iter/s)": 1.437547 }, { "epoch": 3.5294117647058822, "grad_norm": 6.021188735961914, "learning_rate": 1.9868268181037185e-05, "loss": 2.10813045501709, "memory(GiB)": 77.56, "step": 82380, "token_acc": 0.5544217687074829, "train_speed(iter/s)": 1.437558 }, { "epoch": 3.5296259800351315, "grad_norm": 6.387473106384277, "learning_rate": 1.986289797915885e-05, "loss": 2.3520223617553713, "memory(GiB)": 77.56, "step": 82385, "token_acc": 0.5210727969348659, "train_speed(iter/s)": 1.437581 }, { "epoch": 3.5298401953643803, "grad_norm": 5.794710159301758, "learning_rate": 1.9857528323242407e-05, "loss": 2.2839115142822264, "memory(GiB)": 77.56, "step": 82390, "token_acc": 0.4688427299703264, "train_speed(iter/s)": 1.437575 }, { "epoch": 3.530054410693629, "grad_norm": 6.127869129180908, "learning_rate": 1.9852159213385113e-05, "loss": 2.1248931884765625, "memory(GiB)": 77.56, "step": 82395, "token_acc": 0.5264705882352941, "train_speed(iter/s)": 1.437565 }, { "epoch": 3.5302686260228784, "grad_norm": 5.518728733062744, "learning_rate": 1.9846790649684232e-05, "loss": 2.242991638183594, "memory(GiB)": 77.56, "step": 82400, "token_acc": 0.47959183673469385, "train_speed(iter/s)": 1.437581 }, { "epoch": 3.530482841352127, "grad_norm": 5.253847122192383, "learning_rate": 1.9841422632237026e-05, "loss": 2.307224082946777, "memory(GiB)": 77.56, "step": 82405, "token_acc": 0.5, "train_speed(iter/s)": 1.437595 }, { "epoch": 3.530697056681376, "grad_norm": 6.456474304199219, "learning_rate": 1.9836055161140725e-05, "loss": 2.311740684509277, "memory(GiB)": 77.56, "step": 82410, "token_acc": 0.4828767123287671, "train_speed(iter/s)": 1.437615 }, { "epoch": 3.5309112720106253, "grad_norm": 4.971041679382324, "learning_rate": 1.9830688236492602e-05, "loss": 2.5282386779785155, "memory(GiB)": 77.56, "step": 82415, "token_acc": 0.4628975265017668, "train_speed(iter/s)": 1.437593 }, { "epoch": 3.531125487339874, "grad_norm": 4.824924945831299, "learning_rate": 1.9825321858389855e-05, "loss": 2.323630142211914, "memory(GiB)": 77.56, "step": 82420, "token_acc": 0.5288135593220339, "train_speed(iter/s)": 1.4376 }, { "epoch": 3.531339702669123, "grad_norm": 6.92716646194458, "learning_rate": 1.9819956026929715e-05, "loss": 2.3047618865966797, "memory(GiB)": 77.56, "step": 82425, "token_acc": 0.4721189591078067, "train_speed(iter/s)": 1.437582 }, { "epoch": 3.531553917998372, "grad_norm": 6.4586501121521, "learning_rate": 1.9814590742209382e-05, "loss": 2.255636215209961, "memory(GiB)": 77.56, "step": 82430, "token_acc": 0.5182926829268293, "train_speed(iter/s)": 1.437538 }, { "epoch": 3.531768133327621, "grad_norm": 6.5254225730896, "learning_rate": 1.9809226004326032e-05, "loss": 2.47855281829834, "memory(GiB)": 77.56, "step": 82435, "token_acc": 0.4708029197080292, "train_speed(iter/s)": 1.437563 }, { "epoch": 3.5319823486568698, "grad_norm": 5.294203758239746, "learning_rate": 1.9803861813376895e-05, "loss": 2.5271537780761717, "memory(GiB)": 77.56, "step": 82440, "token_acc": 0.517799352750809, "train_speed(iter/s)": 1.437575 }, { "epoch": 3.532196563986119, "grad_norm": 5.014577388763428, "learning_rate": 1.9798498169459124e-05, "loss": 2.2906585693359376, "memory(GiB)": 77.56, "step": 82445, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.437563 }, { "epoch": 3.532410779315368, "grad_norm": 5.592288494110107, "learning_rate": 1.9793135072669878e-05, "loss": 2.267938232421875, "memory(GiB)": 77.56, "step": 82450, "token_acc": 0.513595166163142, "train_speed(iter/s)": 1.437568 }, { "epoch": 3.5326249946446167, "grad_norm": 5.750865936279297, "learning_rate": 1.978777252310634e-05, "loss": 2.02744083404541, "memory(GiB)": 77.56, "step": 82455, "token_acc": 0.569023569023569, "train_speed(iter/s)": 1.437566 }, { "epoch": 3.532839209973866, "grad_norm": 4.834346771240234, "learning_rate": 1.9782410520865648e-05, "loss": 2.5700956344604493, "memory(GiB)": 77.56, "step": 82460, "token_acc": 0.47896440129449835, "train_speed(iter/s)": 1.437588 }, { "epoch": 3.5330534253031147, "grad_norm": 7.49498987197876, "learning_rate": 1.977704906604493e-05, "loss": 2.2034053802490234, "memory(GiB)": 77.56, "step": 82465, "token_acc": 0.5179282868525896, "train_speed(iter/s)": 1.437588 }, { "epoch": 3.5332676406323635, "grad_norm": 7.677392959594727, "learning_rate": 1.9771688158741302e-05, "loss": 2.3965681076049803, "memory(GiB)": 77.56, "step": 82470, "token_acc": 0.5035211267605634, "train_speed(iter/s)": 1.437593 }, { "epoch": 3.533481855961613, "grad_norm": 7.435286998748779, "learning_rate": 1.976632779905192e-05, "loss": 2.3592693328857424, "memory(GiB)": 77.56, "step": 82475, "token_acc": 0.50814332247557, "train_speed(iter/s)": 1.437606 }, { "epoch": 3.5336960712908616, "grad_norm": 5.908475875854492, "learning_rate": 1.9760967987073876e-05, "loss": 2.421270179748535, "memory(GiB)": 77.56, "step": 82480, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.437609 }, { "epoch": 3.5339102866201104, "grad_norm": 5.820932388305664, "learning_rate": 1.9755608722904256e-05, "loss": 2.4349483489990233, "memory(GiB)": 77.56, "step": 82485, "token_acc": 0.48220064724919093, "train_speed(iter/s)": 1.43763 }, { "epoch": 3.5341245019493597, "grad_norm": 6.203174114227295, "learning_rate": 1.975025000664016e-05, "loss": 2.3788665771484374, "memory(GiB)": 77.56, "step": 82490, "token_acc": 0.49390243902439024, "train_speed(iter/s)": 1.437646 }, { "epoch": 3.5343387172786085, "grad_norm": 6.152998924255371, "learning_rate": 1.974489183837864e-05, "loss": 2.567764472961426, "memory(GiB)": 77.56, "step": 82495, "token_acc": 0.46264367816091956, "train_speed(iter/s)": 1.437643 }, { "epoch": 3.5345529326078573, "grad_norm": 5.205676078796387, "learning_rate": 1.9739534218216805e-05, "loss": 2.3460786819458006, "memory(GiB)": 77.56, "step": 82500, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.437646 }, { "epoch": 3.5345529326078573, "eval_loss": 2.1523923873901367, "eval_runtime": 14.0725, "eval_samples_per_second": 7.106, "eval_steps_per_second": 7.106, "eval_token_acc": 0.4811320754716981, "step": 82500 }, { "epoch": 3.5347671479371066, "grad_norm": 5.5499348640441895, "learning_rate": 1.97341771462517e-05, "loss": 2.32886962890625, "memory(GiB)": 77.56, "step": 82505, "token_acc": 0.4863849765258216, "train_speed(iter/s)": 1.437254 }, { "epoch": 3.5349813632663554, "grad_norm": 4.896663188934326, "learning_rate": 1.9728820622580358e-05, "loss": 2.1635631561279296, "memory(GiB)": 77.56, "step": 82510, "token_acc": 0.5202702702702703, "train_speed(iter/s)": 1.437244 }, { "epoch": 3.535195578595604, "grad_norm": 4.505125045776367, "learning_rate": 1.9723464647299834e-05, "loss": 2.2201520919799806, "memory(GiB)": 77.56, "step": 82515, "token_acc": 0.5467128027681661, "train_speed(iter/s)": 1.437243 }, { "epoch": 3.5354097939248534, "grad_norm": 5.725396633148193, "learning_rate": 1.9718109220507147e-05, "loss": 2.3150224685668945, "memory(GiB)": 77.56, "step": 82520, "token_acc": 0.4618181818181818, "train_speed(iter/s)": 1.437214 }, { "epoch": 3.5356240092541023, "grad_norm": 6.492303371429443, "learning_rate": 1.97127543422993e-05, "loss": 2.139881896972656, "memory(GiB)": 77.56, "step": 82525, "token_acc": 0.5355805243445693, "train_speed(iter/s)": 1.437233 }, { "epoch": 3.535838224583351, "grad_norm": 5.474704265594482, "learning_rate": 1.9707400012773318e-05, "loss": 2.3984148025512697, "memory(GiB)": 77.56, "step": 82530, "token_acc": 0.45980707395498394, "train_speed(iter/s)": 1.437238 }, { "epoch": 3.5360524399126003, "grad_norm": 4.704589366912842, "learning_rate": 1.9702046232026222e-05, "loss": 2.0160974502563476, "memory(GiB)": 77.56, "step": 82535, "token_acc": 0.5800711743772242, "train_speed(iter/s)": 1.437252 }, { "epoch": 3.536266655241849, "grad_norm": 5.991926670074463, "learning_rate": 1.969669300015498e-05, "loss": 2.3534507751464844, "memory(GiB)": 77.56, "step": 82540, "token_acc": 0.5, "train_speed(iter/s)": 1.437267 }, { "epoch": 3.536480870571098, "grad_norm": 4.970750331878662, "learning_rate": 1.9691340317256575e-05, "loss": 2.43546142578125, "memory(GiB)": 77.56, "step": 82545, "token_acc": 0.4968553459119497, "train_speed(iter/s)": 1.437261 }, { "epoch": 3.536695085900347, "grad_norm": 5.680489540100098, "learning_rate": 1.9685988183427968e-05, "loss": 2.5124191284179687, "memory(GiB)": 77.56, "step": 82550, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.437278 }, { "epoch": 3.536909301229596, "grad_norm": 5.884768009185791, "learning_rate": 1.9680636598766104e-05, "loss": 2.324357604980469, "memory(GiB)": 77.56, "step": 82555, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.437277 }, { "epoch": 3.537123516558845, "grad_norm": 5.680827617645264, "learning_rate": 1.967528556336797e-05, "loss": 2.441228675842285, "memory(GiB)": 77.56, "step": 82560, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.437284 }, { "epoch": 3.537337731888094, "grad_norm": 5.874486446380615, "learning_rate": 1.966993507733048e-05, "loss": 2.417146682739258, "memory(GiB)": 77.56, "step": 82565, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.437294 }, { "epoch": 3.537551947217343, "grad_norm": 6.918670177459717, "learning_rate": 1.9664585140750574e-05, "loss": 2.1657217025756834, "memory(GiB)": 77.56, "step": 82570, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.4373 }, { "epoch": 3.5377661625465917, "grad_norm": 6.989270210266113, "learning_rate": 1.965923575372516e-05, "loss": 2.3373685836791993, "memory(GiB)": 77.56, "step": 82575, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.437319 }, { "epoch": 3.537980377875841, "grad_norm": 5.781280994415283, "learning_rate": 1.965388691635114e-05, "loss": 2.4048690795898438, "memory(GiB)": 77.56, "step": 82580, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.437311 }, { "epoch": 3.5381945932050898, "grad_norm": 5.102858066558838, "learning_rate": 1.964853862872544e-05, "loss": 2.4206228256225586, "memory(GiB)": 77.56, "step": 82585, "token_acc": 0.473972602739726, "train_speed(iter/s)": 1.437307 }, { "epoch": 3.5384088085343386, "grad_norm": 6.523634433746338, "learning_rate": 1.964319089094493e-05, "loss": 2.548038101196289, "memory(GiB)": 77.56, "step": 82590, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.437318 }, { "epoch": 3.538623023863588, "grad_norm": 5.074010848999023, "learning_rate": 1.9637843703106503e-05, "loss": 2.2360279083251955, "memory(GiB)": 77.56, "step": 82595, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.437333 }, { "epoch": 3.5388372391928367, "grad_norm": 5.488282680511475, "learning_rate": 1.9632497065306993e-05, "loss": 2.5630460739135743, "memory(GiB)": 77.56, "step": 82600, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 1.437345 }, { "epoch": 3.5390514545220855, "grad_norm": 6.235357761383057, "learning_rate": 1.962715097764331e-05, "loss": 2.199538230895996, "memory(GiB)": 77.56, "step": 82605, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 1.437353 }, { "epoch": 3.5392656698513347, "grad_norm": 4.771381855010986, "learning_rate": 1.9621805440212275e-05, "loss": 2.450325775146484, "memory(GiB)": 77.56, "step": 82610, "token_acc": 0.44072948328267475, "train_speed(iter/s)": 1.43734 }, { "epoch": 3.5394798851805835, "grad_norm": 5.8777384757995605, "learning_rate": 1.9616460453110712e-05, "loss": 2.311336898803711, "memory(GiB)": 77.56, "step": 82615, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 1.437367 }, { "epoch": 3.5396941005098324, "grad_norm": 6.512287616729736, "learning_rate": 1.9611116016435495e-05, "loss": 2.3543508529663084, "memory(GiB)": 77.56, "step": 82620, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 1.437383 }, { "epoch": 3.5399083158390816, "grad_norm": 7.980329990386963, "learning_rate": 1.9605772130283413e-05, "loss": 2.5231698989868163, "memory(GiB)": 77.56, "step": 82625, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.437402 }, { "epoch": 3.5401225311683304, "grad_norm": 6.829824924468994, "learning_rate": 1.9600428794751285e-05, "loss": 2.416186714172363, "memory(GiB)": 77.56, "step": 82630, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.437431 }, { "epoch": 3.5403367464975792, "grad_norm": 5.182572364807129, "learning_rate": 1.9595086009935903e-05, "loss": 2.279319953918457, "memory(GiB)": 77.56, "step": 82635, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 1.437443 }, { "epoch": 3.5405509618268285, "grad_norm": 5.381372928619385, "learning_rate": 1.9589743775934043e-05, "loss": 2.0162782669067383, "memory(GiB)": 77.56, "step": 82640, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437441 }, { "epoch": 3.5407651771560773, "grad_norm": 6.481118679046631, "learning_rate": 1.958440209284252e-05, "loss": 2.0292457580566405, "memory(GiB)": 77.56, "step": 82645, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.43742 }, { "epoch": 3.540979392485326, "grad_norm": 6.066073417663574, "learning_rate": 1.9579060960758082e-05, "loss": 2.274923324584961, "memory(GiB)": 77.56, "step": 82650, "token_acc": 0.5326460481099656, "train_speed(iter/s)": 1.437429 }, { "epoch": 3.5411936078145754, "grad_norm": 6.3374457359313965, "learning_rate": 1.9573720379777498e-05, "loss": 2.2671525955200194, "memory(GiB)": 77.56, "step": 82655, "token_acc": 0.504950495049505, "train_speed(iter/s)": 1.43744 }, { "epoch": 3.541407823143824, "grad_norm": 5.057811260223389, "learning_rate": 1.9568380349997506e-05, "loss": 2.4789525985717775, "memory(GiB)": 77.56, "step": 82660, "token_acc": 0.48404255319148937, "train_speed(iter/s)": 1.437427 }, { "epoch": 3.541622038473073, "grad_norm": 7.5626935958862305, "learning_rate": 1.9563040871514854e-05, "loss": 2.2776498794555664, "memory(GiB)": 77.56, "step": 82665, "token_acc": 0.5236363636363637, "train_speed(iter/s)": 1.437438 }, { "epoch": 3.5418362538023223, "grad_norm": 6.010182857513428, "learning_rate": 1.955770194442625e-05, "loss": 2.372435760498047, "memory(GiB)": 77.56, "step": 82670, "token_acc": 0.5117845117845118, "train_speed(iter/s)": 1.437422 }, { "epoch": 3.542050469131571, "grad_norm": 6.353419780731201, "learning_rate": 1.9552363568828437e-05, "loss": 2.2901140213012696, "memory(GiB)": 77.56, "step": 82675, "token_acc": 0.5410447761194029, "train_speed(iter/s)": 1.437439 }, { "epoch": 3.54226468446082, "grad_norm": 6.403958320617676, "learning_rate": 1.954702574481813e-05, "loss": 2.299019622802734, "memory(GiB)": 77.56, "step": 82680, "token_acc": 0.5401459854014599, "train_speed(iter/s)": 1.437462 }, { "epoch": 3.542478899790069, "grad_norm": 7.3599629402160645, "learning_rate": 1.954168847249202e-05, "loss": 2.282818603515625, "memory(GiB)": 77.56, "step": 82685, "token_acc": 0.5229681978798587, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.542693115119318, "grad_norm": 5.285392761230469, "learning_rate": 1.95363517519468e-05, "loss": 2.588500213623047, "memory(GiB)": 77.56, "step": 82690, "token_acc": 0.486404833836858, "train_speed(iter/s)": 1.437461 }, { "epoch": 3.5429073304485668, "grad_norm": 5.888866424560547, "learning_rate": 1.9531015583279143e-05, "loss": 2.322518730163574, "memory(GiB)": 77.56, "step": 82695, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 1.437478 }, { "epoch": 3.543121545777816, "grad_norm": 7.0269598960876465, "learning_rate": 1.9525679966585703e-05, "loss": 2.2573759078979494, "memory(GiB)": 77.56, "step": 82700, "token_acc": 0.49800796812749004, "train_speed(iter/s)": 1.437501 }, { "epoch": 3.543335761107065, "grad_norm": 6.0991597175598145, "learning_rate": 1.9520344901963177e-05, "loss": 2.4632055282592775, "memory(GiB)": 77.56, "step": 82705, "token_acc": 0.494949494949495, "train_speed(iter/s)": 1.437506 }, { "epoch": 3.5435499764363136, "grad_norm": 6.6377129554748535, "learning_rate": 1.951501038950819e-05, "loss": 1.9929412841796874, "memory(GiB)": 77.56, "step": 82710, "token_acc": 0.5725806451612904, "train_speed(iter/s)": 1.437512 }, { "epoch": 3.543764191765563, "grad_norm": 6.9814982414245605, "learning_rate": 1.9509676429317397e-05, "loss": 2.2883739471435547, "memory(GiB)": 77.56, "step": 82715, "token_acc": 0.4713114754098361, "train_speed(iter/s)": 1.437511 }, { "epoch": 3.5439784070948117, "grad_norm": 6.554887771606445, "learning_rate": 1.950434302148741e-05, "loss": 2.4074459075927734, "memory(GiB)": 77.56, "step": 82720, "token_acc": 0.4892966360856269, "train_speed(iter/s)": 1.43751 }, { "epoch": 3.5441926224240605, "grad_norm": 6.756819725036621, "learning_rate": 1.9499010166114838e-05, "loss": 2.5532487869262694, "memory(GiB)": 77.56, "step": 82725, "token_acc": 0.49375, "train_speed(iter/s)": 1.437516 }, { "epoch": 3.54440683775331, "grad_norm": 8.334879875183105, "learning_rate": 1.9493677863296332e-05, "loss": 2.4358564376831056, "memory(GiB)": 77.56, "step": 82730, "token_acc": 0.5056603773584906, "train_speed(iter/s)": 1.437532 }, { "epoch": 3.5446210530825586, "grad_norm": 7.20154333114624, "learning_rate": 1.948834611312847e-05, "loss": 2.0781314849853514, "memory(GiB)": 77.56, "step": 82735, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.437533 }, { "epoch": 3.5448352684118074, "grad_norm": 4.454391956329346, "learning_rate": 1.9483014915707832e-05, "loss": 2.2311704635620115, "memory(GiB)": 77.56, "step": 82740, "token_acc": 0.49393939393939396, "train_speed(iter/s)": 1.437523 }, { "epoch": 3.5450494837410567, "grad_norm": 6.827240467071533, "learning_rate": 1.9477684271130995e-05, "loss": 2.402065086364746, "memory(GiB)": 77.56, "step": 82745, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.437498 }, { "epoch": 3.5452636990703055, "grad_norm": 5.677723407745361, "learning_rate": 1.9472354179494556e-05, "loss": 2.230828857421875, "memory(GiB)": 77.56, "step": 82750, "token_acc": 0.4898785425101215, "train_speed(iter/s)": 1.437504 }, { "epoch": 3.5454779143995543, "grad_norm": 6.276457786560059, "learning_rate": 1.9467024640895053e-05, "loss": 2.5080982208251954, "memory(GiB)": 77.56, "step": 82755, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.437508 }, { "epoch": 3.5456921297288035, "grad_norm": 5.828501224517822, "learning_rate": 1.946169565542903e-05, "loss": 2.380128288269043, "memory(GiB)": 77.56, "step": 82760, "token_acc": 0.504885993485342, "train_speed(iter/s)": 1.43751 }, { "epoch": 3.5459063450580524, "grad_norm": 6.4717254638671875, "learning_rate": 1.9456367223193055e-05, "loss": 2.505108451843262, "memory(GiB)": 77.56, "step": 82765, "token_acc": 0.43234323432343236, "train_speed(iter/s)": 1.437509 }, { "epoch": 3.546120560387301, "grad_norm": 5.942768096923828, "learning_rate": 1.945103934428364e-05, "loss": 2.6547855377197265, "memory(GiB)": 77.56, "step": 82770, "token_acc": 0.44981412639405205, "train_speed(iter/s)": 1.437524 }, { "epoch": 3.5463347757165504, "grad_norm": 5.546896457672119, "learning_rate": 1.9445712018797296e-05, "loss": 2.2827091217041016, "memory(GiB)": 77.56, "step": 82775, "token_acc": 0.5204081632653061, "train_speed(iter/s)": 1.437497 }, { "epoch": 3.5465489910457992, "grad_norm": 6.897151947021484, "learning_rate": 1.944038524683055e-05, "loss": 2.5003940582275392, "memory(GiB)": 77.56, "step": 82780, "token_acc": 0.49709302325581395, "train_speed(iter/s)": 1.437497 }, { "epoch": 3.546763206375048, "grad_norm": 7.0494489669799805, "learning_rate": 1.9435059028479874e-05, "loss": 2.2949268341064455, "memory(GiB)": 77.56, "step": 82785, "token_acc": 0.5035714285714286, "train_speed(iter/s)": 1.437497 }, { "epoch": 3.5469774217042973, "grad_norm": 5.7212724685668945, "learning_rate": 1.942973336384179e-05, "loss": 2.2536026000976563, "memory(GiB)": 77.56, "step": 82790, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 1.437506 }, { "epoch": 3.547191637033546, "grad_norm": 5.892222881317139, "learning_rate": 1.9424408253012765e-05, "loss": 2.4641143798828127, "memory(GiB)": 77.56, "step": 82795, "token_acc": 0.4520123839009288, "train_speed(iter/s)": 1.437501 }, { "epoch": 3.547405852362795, "grad_norm": 5.773164749145508, "learning_rate": 1.941908369608927e-05, "loss": 2.5576282501220704, "memory(GiB)": 77.56, "step": 82800, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437506 }, { "epoch": 3.547620067692044, "grad_norm": 5.912968635559082, "learning_rate": 1.9413759693167754e-05, "loss": 2.131065559387207, "memory(GiB)": 77.56, "step": 82805, "token_acc": 0.5310344827586206, "train_speed(iter/s)": 1.437522 }, { "epoch": 3.547834283021293, "grad_norm": 5.046838283538818, "learning_rate": 1.940843624434468e-05, "loss": 2.562174606323242, "memory(GiB)": 77.56, "step": 82810, "token_acc": 0.504, "train_speed(iter/s)": 1.437521 }, { "epoch": 3.548048498350542, "grad_norm": 8.35932731628418, "learning_rate": 1.940311334971646e-05, "loss": 2.10133056640625, "memory(GiB)": 77.56, "step": 82815, "token_acc": 0.54296875, "train_speed(iter/s)": 1.437513 }, { "epoch": 3.548262713679791, "grad_norm": 7.073241233825684, "learning_rate": 1.9397791009379546e-05, "loss": 2.3052635192871094, "memory(GiB)": 77.56, "step": 82820, "token_acc": 0.5211267605633803, "train_speed(iter/s)": 1.437532 }, { "epoch": 3.54847692900904, "grad_norm": 5.813384056091309, "learning_rate": 1.939246922343037e-05, "loss": 2.3398006439208983, "memory(GiB)": 77.56, "step": 82825, "token_acc": 0.48, "train_speed(iter/s)": 1.437525 }, { "epoch": 3.5486911443382887, "grad_norm": 9.540542602539062, "learning_rate": 1.9387147991965327e-05, "loss": 2.3147607803344727, "memory(GiB)": 77.56, "step": 82830, "token_acc": 0.5136363636363637, "train_speed(iter/s)": 1.437548 }, { "epoch": 3.548905359667538, "grad_norm": 5.839265823364258, "learning_rate": 1.9381827315080815e-05, "loss": 2.501369857788086, "memory(GiB)": 77.56, "step": 82835, "token_acc": 0.44891640866873067, "train_speed(iter/s)": 1.437566 }, { "epoch": 3.5491195749967868, "grad_norm": 6.068883419036865, "learning_rate": 1.9376507192873223e-05, "loss": 2.134100341796875, "memory(GiB)": 77.56, "step": 82840, "token_acc": 0.5419354838709678, "train_speed(iter/s)": 1.437555 }, { "epoch": 3.5493337903260356, "grad_norm": 6.481598854064941, "learning_rate": 1.9371187625438913e-05, "loss": 2.6785915374755858, "memory(GiB)": 77.56, "step": 82845, "token_acc": 0.4845679012345679, "train_speed(iter/s)": 1.437573 }, { "epoch": 3.549548005655285, "grad_norm": 6.411945343017578, "learning_rate": 1.9365868612874287e-05, "loss": 2.175364112854004, "memory(GiB)": 77.56, "step": 82850, "token_acc": 0.5524475524475524, "train_speed(iter/s)": 1.437582 }, { "epoch": 3.5497622209845336, "grad_norm": 7.459131717681885, "learning_rate": 1.9360550155275693e-05, "loss": 2.4688262939453125, "memory(GiB)": 77.56, "step": 82855, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.437596 }, { "epoch": 3.5499764363137825, "grad_norm": 6.617015838623047, "learning_rate": 1.9355232252739474e-05, "loss": 2.2800247192382814, "memory(GiB)": 77.56, "step": 82860, "token_acc": 0.5188679245283019, "train_speed(iter/s)": 1.437617 }, { "epoch": 3.5501906516430317, "grad_norm": 6.170217514038086, "learning_rate": 1.9349914905361964e-05, "loss": 2.2226984024047853, "memory(GiB)": 77.56, "step": 82865, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 1.437612 }, { "epoch": 3.5504048669722805, "grad_norm": 6.286918640136719, "learning_rate": 1.9344598113239488e-05, "loss": 2.3269222259521483, "memory(GiB)": 77.56, "step": 82870, "token_acc": 0.4673913043478261, "train_speed(iter/s)": 1.437626 }, { "epoch": 3.5506190823015293, "grad_norm": 8.006568908691406, "learning_rate": 1.933928187646839e-05, "loss": 2.300686264038086, "memory(GiB)": 77.56, "step": 82875, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.437618 }, { "epoch": 3.5508332976307786, "grad_norm": 5.5448503494262695, "learning_rate": 1.9333966195144958e-05, "loss": 2.3359729766845705, "memory(GiB)": 77.56, "step": 82880, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 1.43761 }, { "epoch": 3.5510475129600274, "grad_norm": 6.8241448402404785, "learning_rate": 1.93286510693655e-05, "loss": 2.128226470947266, "memory(GiB)": 77.56, "step": 82885, "token_acc": 0.515850144092219, "train_speed(iter/s)": 1.437582 }, { "epoch": 3.551261728289276, "grad_norm": 4.49894905090332, "learning_rate": 1.93233364992263e-05, "loss": 2.339345169067383, "memory(GiB)": 77.56, "step": 82890, "token_acc": 0.4983388704318937, "train_speed(iter/s)": 1.437549 }, { "epoch": 3.5514759436185255, "grad_norm": 6.002151966094971, "learning_rate": 1.9318022484823618e-05, "loss": 2.3803098678588865, "memory(GiB)": 77.56, "step": 82895, "token_acc": 0.5225563909774437, "train_speed(iter/s)": 1.437557 }, { "epoch": 3.5516901589477743, "grad_norm": 7.201699733734131, "learning_rate": 1.9312709026253756e-05, "loss": 2.199318695068359, "memory(GiB)": 77.56, "step": 82900, "token_acc": 0.5444839857651246, "train_speed(iter/s)": 1.437548 }, { "epoch": 3.551904374277023, "grad_norm": 9.999852180480957, "learning_rate": 1.9307396123612942e-05, "loss": 2.2409217834472654, "memory(GiB)": 77.56, "step": 82905, "token_acc": 0.5773584905660377, "train_speed(iter/s)": 1.437553 }, { "epoch": 3.5521185896062724, "grad_norm": 7.5008673667907715, "learning_rate": 1.9302083776997454e-05, "loss": 2.323116111755371, "memory(GiB)": 77.56, "step": 82910, "token_acc": 0.5437262357414449, "train_speed(iter/s)": 1.437553 }, { "epoch": 3.552332804935521, "grad_norm": 6.727487564086914, "learning_rate": 1.929677198650352e-05, "loss": 2.3816705703735352, "memory(GiB)": 77.56, "step": 82915, "token_acc": 0.48757763975155277, "train_speed(iter/s)": 1.437565 }, { "epoch": 3.55254702026477, "grad_norm": 8.107377052307129, "learning_rate": 1.929146075222736e-05, "loss": 2.2722057342529296, "memory(GiB)": 77.56, "step": 82920, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.437581 }, { "epoch": 3.5527612355940192, "grad_norm": 5.84694242477417, "learning_rate": 1.9286150074265198e-05, "loss": 2.4540891647338867, "memory(GiB)": 77.56, "step": 82925, "token_acc": 0.4557823129251701, "train_speed(iter/s)": 1.437598 }, { "epoch": 3.552975450923268, "grad_norm": 6.92732048034668, "learning_rate": 1.928083995271322e-05, "loss": 2.3148471832275392, "memory(GiB)": 77.56, "step": 82930, "token_acc": 0.528125, "train_speed(iter/s)": 1.437596 }, { "epoch": 3.553189666252517, "grad_norm": 7.589896202087402, "learning_rate": 1.9275530387667655e-05, "loss": 2.4585365295410155, "memory(GiB)": 77.56, "step": 82935, "token_acc": 0.4582043343653251, "train_speed(iter/s)": 1.437615 }, { "epoch": 3.553403881581766, "grad_norm": 5.6032490730285645, "learning_rate": 1.927022137922469e-05, "loss": 2.193222427368164, "memory(GiB)": 77.56, "step": 82940, "token_acc": 0.5228758169934641, "train_speed(iter/s)": 1.43763 }, { "epoch": 3.553618096911015, "grad_norm": 6.023399353027344, "learning_rate": 1.9264912927480483e-05, "loss": 2.1307304382324217, "memory(GiB)": 77.56, "step": 82945, "token_acc": 0.5463917525773195, "train_speed(iter/s)": 1.43764 }, { "epoch": 3.5538323122402637, "grad_norm": 6.18001651763916, "learning_rate": 1.9259605032531213e-05, "loss": 2.3222259521484374, "memory(GiB)": 77.56, "step": 82950, "token_acc": 0.460431654676259, "train_speed(iter/s)": 1.43765 }, { "epoch": 3.554046527569513, "grad_norm": 7.535070896148682, "learning_rate": 1.9254297694473038e-05, "loss": 2.488578224182129, "memory(GiB)": 77.56, "step": 82955, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.437676 }, { "epoch": 3.554260742898762, "grad_norm": 5.162063121795654, "learning_rate": 1.9248990913402083e-05, "loss": 2.250819206237793, "memory(GiB)": 77.56, "step": 82960, "token_acc": 0.5239852398523985, "train_speed(iter/s)": 1.437688 }, { "epoch": 3.5544749582280106, "grad_norm": 7.015691757202148, "learning_rate": 1.9243684689414498e-05, "loss": 2.3662088394165037, "memory(GiB)": 77.56, "step": 82965, "token_acc": 0.5347222222222222, "train_speed(iter/s)": 1.437695 }, { "epoch": 3.55468917355726, "grad_norm": 6.708771705627441, "learning_rate": 1.923837902260644e-05, "loss": 2.3405385971069337, "memory(GiB)": 77.56, "step": 82970, "token_acc": 0.47592067988668557, "train_speed(iter/s)": 1.437698 }, { "epoch": 3.5549033888865087, "grad_norm": 5.543859004974365, "learning_rate": 1.9233073913074002e-05, "loss": 2.3417749404907227, "memory(GiB)": 77.56, "step": 82975, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.437708 }, { "epoch": 3.5551176042157575, "grad_norm": 6.479108810424805, "learning_rate": 1.9227769360913296e-05, "loss": 2.2098255157470703, "memory(GiB)": 77.56, "step": 82980, "token_acc": 0.5317725752508361, "train_speed(iter/s)": 1.437703 }, { "epoch": 3.5553318195450068, "grad_norm": 7.023055076599121, "learning_rate": 1.9222465366220406e-05, "loss": 2.3799482345581056, "memory(GiB)": 77.56, "step": 82985, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.437713 }, { "epoch": 3.5555460348742556, "grad_norm": 6.113099575042725, "learning_rate": 1.9217161929091415e-05, "loss": 2.3580018997192385, "memory(GiB)": 77.56, "step": 82990, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.437724 }, { "epoch": 3.5557602502035044, "grad_norm": 6.232082843780518, "learning_rate": 1.921185904962243e-05, "loss": 2.260558319091797, "memory(GiB)": 77.56, "step": 82995, "token_acc": 0.5028901734104047, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.5559744655327536, "grad_norm": 6.2291951179504395, "learning_rate": 1.9206556727909496e-05, "loss": 2.466316795349121, "memory(GiB)": 77.56, "step": 83000, "token_acc": 0.49795918367346936, "train_speed(iter/s)": 1.437724 }, { "epoch": 3.5559744655327536, "eval_loss": 2.24786376953125, "eval_runtime": 14.5619, "eval_samples_per_second": 6.867, "eval_steps_per_second": 6.867, "eval_token_acc": 0.48221343873517786, "step": 83000 }, { "epoch": 3.5561886808620025, "grad_norm": 5.826775074005127, "learning_rate": 1.9201254964048675e-05, "loss": 2.264176940917969, "memory(GiB)": 77.56, "step": 83005, "token_acc": 0.48930232558139536, "train_speed(iter/s)": 1.437341 }, { "epoch": 3.5564028961912513, "grad_norm": 5.459835052490234, "learning_rate": 1.9195953758136015e-05, "loss": 2.2328216552734377, "memory(GiB)": 77.56, "step": 83010, "token_acc": 0.48928571428571427, "train_speed(iter/s)": 1.437346 }, { "epoch": 3.5566171115205005, "grad_norm": 7.009839057922363, "learning_rate": 1.9190653110267532e-05, "loss": 2.614377021789551, "memory(GiB)": 77.56, "step": 83015, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.437352 }, { "epoch": 3.5568313268497493, "grad_norm": 5.47695779800415, "learning_rate": 1.9185353020539288e-05, "loss": 1.952031707763672, "memory(GiB)": 77.56, "step": 83020, "token_acc": 0.5724907063197026, "train_speed(iter/s)": 1.437365 }, { "epoch": 3.557045542178998, "grad_norm": 4.556182861328125, "learning_rate": 1.9180053489047273e-05, "loss": 2.1064924240112304, "memory(GiB)": 77.56, "step": 83025, "token_acc": 0.5421686746987951, "train_speed(iter/s)": 1.437383 }, { "epoch": 3.5572597575082474, "grad_norm": 5.167880535125732, "learning_rate": 1.9174754515887506e-05, "loss": 2.242349624633789, "memory(GiB)": 77.56, "step": 83030, "token_acc": 0.49700598802395207, "train_speed(iter/s)": 1.4374 }, { "epoch": 3.5574739728374962, "grad_norm": 5.98271369934082, "learning_rate": 1.9169456101155974e-05, "loss": 2.3210241317749025, "memory(GiB)": 77.56, "step": 83035, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 1.437396 }, { "epoch": 3.557688188166745, "grad_norm": 4.825085639953613, "learning_rate": 1.9164158244948654e-05, "loss": 2.364198684692383, "memory(GiB)": 77.56, "step": 83040, "token_acc": 0.5, "train_speed(iter/s)": 1.437402 }, { "epoch": 3.5579024034959943, "grad_norm": 5.873676776885986, "learning_rate": 1.9158860947361545e-05, "loss": 2.426919937133789, "memory(GiB)": 77.56, "step": 83045, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.43738 }, { "epoch": 3.558116618825243, "grad_norm": 5.452817440032959, "learning_rate": 1.9153564208490588e-05, "loss": 2.241208076477051, "memory(GiB)": 77.56, "step": 83050, "token_acc": 0.5365853658536586, "train_speed(iter/s)": 1.437389 }, { "epoch": 3.558330834154492, "grad_norm": 7.726084232330322, "learning_rate": 1.9148268028431772e-05, "loss": 2.145909881591797, "memory(GiB)": 77.56, "step": 83055, "token_acc": 0.5589519650655022, "train_speed(iter/s)": 1.437387 }, { "epoch": 3.558545049483741, "grad_norm": 6.4745869636535645, "learning_rate": 1.9142972407281017e-05, "loss": 2.3085563659667967, "memory(GiB)": 77.56, "step": 83060, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 1.437409 }, { "epoch": 3.55875926481299, "grad_norm": 5.735274314880371, "learning_rate": 1.9137677345134262e-05, "loss": 2.128959846496582, "memory(GiB)": 77.56, "step": 83065, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.437412 }, { "epoch": 3.558973480142239, "grad_norm": 6.817280292510986, "learning_rate": 1.9132382842087438e-05, "loss": 2.519239044189453, "memory(GiB)": 77.56, "step": 83070, "token_acc": 0.4491525423728814, "train_speed(iter/s)": 1.437408 }, { "epoch": 3.559187695471488, "grad_norm": 6.035449981689453, "learning_rate": 1.9127088898236434e-05, "loss": 2.375954818725586, "memory(GiB)": 77.56, "step": 83075, "token_acc": 0.49693251533742333, "train_speed(iter/s)": 1.437389 }, { "epoch": 3.559401910800737, "grad_norm": 5.832775592803955, "learning_rate": 1.912179551367719e-05, "loss": 2.079724884033203, "memory(GiB)": 77.56, "step": 83080, "token_acc": 0.48535564853556484, "train_speed(iter/s)": 1.437385 }, { "epoch": 3.5596161261299857, "grad_norm": 6.4994049072265625, "learning_rate": 1.9116502688505593e-05, "loss": 2.344584655761719, "memory(GiB)": 77.56, "step": 83085, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.437393 }, { "epoch": 3.559830341459235, "grad_norm": 8.411913871765137, "learning_rate": 1.911121042281752e-05, "loss": 2.6202192306518555, "memory(GiB)": 77.56, "step": 83090, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.437398 }, { "epoch": 3.5600445567884837, "grad_norm": 7.736565589904785, "learning_rate": 1.9105918716708843e-05, "loss": 2.2053459167480467, "memory(GiB)": 77.56, "step": 83095, "token_acc": 0.5505226480836237, "train_speed(iter/s)": 1.437434 }, { "epoch": 3.5602587721177326, "grad_norm": 7.399161338806152, "learning_rate": 1.910062757027543e-05, "loss": 2.0944755554199217, "memory(GiB)": 77.56, "step": 83100, "token_acc": 0.5578947368421052, "train_speed(iter/s)": 1.437452 }, { "epoch": 3.560472987446982, "grad_norm": 5.091325283050537, "learning_rate": 1.9095336983613122e-05, "loss": 2.383116912841797, "memory(GiB)": 77.56, "step": 83105, "token_acc": 0.5016611295681063, "train_speed(iter/s)": 1.437479 }, { "epoch": 3.5606872027762306, "grad_norm": 6.804299354553223, "learning_rate": 1.9090046956817786e-05, "loss": 2.755426216125488, "memory(GiB)": 77.56, "step": 83110, "token_acc": 0.4420731707317073, "train_speed(iter/s)": 1.437483 }, { "epoch": 3.5609014181054794, "grad_norm": 8.014354705810547, "learning_rate": 1.908475748998523e-05, "loss": 2.2549163818359377, "memory(GiB)": 77.56, "step": 83115, "token_acc": 0.5130434782608696, "train_speed(iter/s)": 1.437469 }, { "epoch": 3.5611156334347287, "grad_norm": 6.086384296417236, "learning_rate": 1.9079468583211314e-05, "loss": 2.5265478134155273, "memory(GiB)": 77.56, "step": 83120, "token_acc": 0.4745222929936306, "train_speed(iter/s)": 1.437464 }, { "epoch": 3.5613298487639775, "grad_norm": 5.863791465759277, "learning_rate": 1.9074180236591826e-05, "loss": 2.9002330780029295, "memory(GiB)": 77.56, "step": 83125, "token_acc": 0.42724458204334365, "train_speed(iter/s)": 1.437469 }, { "epoch": 3.5615440640932263, "grad_norm": 7.259562969207764, "learning_rate": 1.9068892450222577e-05, "loss": 2.3109451293945313, "memory(GiB)": 77.56, "step": 83130, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.437488 }, { "epoch": 3.5617582794224756, "grad_norm": 5.618175506591797, "learning_rate": 1.9063605224199333e-05, "loss": 2.2776432037353516, "memory(GiB)": 77.56, "step": 83135, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437501 }, { "epoch": 3.5619724947517244, "grad_norm": 5.945962905883789, "learning_rate": 1.9058318558617927e-05, "loss": 2.3846551895141603, "memory(GiB)": 77.56, "step": 83140, "token_acc": 0.4690265486725664, "train_speed(iter/s)": 1.437501 }, { "epoch": 3.562186710080973, "grad_norm": 7.259668350219727, "learning_rate": 1.9053032453574098e-05, "loss": 2.5611623764038085, "memory(GiB)": 77.56, "step": 83145, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.43751 }, { "epoch": 3.5624009254102225, "grad_norm": 6.636059761047363, "learning_rate": 1.9047746909163623e-05, "loss": 2.2019626617431642, "memory(GiB)": 77.56, "step": 83150, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.437513 }, { "epoch": 3.5626151407394713, "grad_norm": 7.4019975662231445, "learning_rate": 1.9042461925482248e-05, "loss": 2.182967948913574, "memory(GiB)": 77.56, "step": 83155, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.43752 }, { "epoch": 3.56282935606872, "grad_norm": 6.843671798706055, "learning_rate": 1.9037177502625697e-05, "loss": 2.595068359375, "memory(GiB)": 77.56, "step": 83160, "token_acc": 0.4605263157894737, "train_speed(iter/s)": 1.437529 }, { "epoch": 3.5630435713979693, "grad_norm": 6.6064453125, "learning_rate": 1.9031893640689735e-05, "loss": 2.3750823974609374, "memory(GiB)": 77.56, "step": 83165, "token_acc": 0.4983922829581994, "train_speed(iter/s)": 1.437535 }, { "epoch": 3.563257786727218, "grad_norm": 6.949801921844482, "learning_rate": 1.9026610339770074e-05, "loss": 2.5514530181884765, "memory(GiB)": 77.56, "step": 83170, "token_acc": 0.49226006191950467, "train_speed(iter/s)": 1.437525 }, { "epoch": 3.563472002056467, "grad_norm": 4.747146129608154, "learning_rate": 1.902132759996242e-05, "loss": 2.1096385955810546, "memory(GiB)": 77.56, "step": 83175, "token_acc": 0.5230263157894737, "train_speed(iter/s)": 1.437521 }, { "epoch": 3.5636862173857162, "grad_norm": 6.8741936683654785, "learning_rate": 1.9016045421362478e-05, "loss": 2.374500846862793, "memory(GiB)": 77.56, "step": 83180, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.437532 }, { "epoch": 3.563900432714965, "grad_norm": 5.954809188842773, "learning_rate": 1.901076380406594e-05, "loss": 2.0965930938720705, "memory(GiB)": 77.56, "step": 83185, "token_acc": 0.5575539568345323, "train_speed(iter/s)": 1.43755 }, { "epoch": 3.564114648044214, "grad_norm": 6.052725315093994, "learning_rate": 1.9005482748168468e-05, "loss": 2.3403459548950196, "memory(GiB)": 77.56, "step": 83190, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.437553 }, { "epoch": 3.564328863373463, "grad_norm": 4.475099563598633, "learning_rate": 1.900020225376575e-05, "loss": 2.392584228515625, "memory(GiB)": 77.56, "step": 83195, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.437541 }, { "epoch": 3.564543078702712, "grad_norm": 6.442307472229004, "learning_rate": 1.8994922320953462e-05, "loss": 2.0630943298339846, "memory(GiB)": 77.56, "step": 83200, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.43755 }, { "epoch": 3.5647572940319607, "grad_norm": 6.023510456085205, "learning_rate": 1.8989642949827246e-05, "loss": 2.303834915161133, "memory(GiB)": 77.56, "step": 83205, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.437532 }, { "epoch": 3.56497150936121, "grad_norm": 5.839447975158691, "learning_rate": 1.898436414048274e-05, "loss": 2.0025238037109374, "memory(GiB)": 77.56, "step": 83210, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.437541 }, { "epoch": 3.565185724690459, "grad_norm": 5.560582160949707, "learning_rate": 1.897908589301557e-05, "loss": 2.419338035583496, "memory(GiB)": 77.56, "step": 83215, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.437561 }, { "epoch": 3.5653999400197076, "grad_norm": 5.170771598815918, "learning_rate": 1.8973808207521344e-05, "loss": 2.455280876159668, "memory(GiB)": 77.56, "step": 83220, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.437574 }, { "epoch": 3.565614155348957, "grad_norm": 6.187577724456787, "learning_rate": 1.89685310840957e-05, "loss": 2.3973594665527345, "memory(GiB)": 77.56, "step": 83225, "token_acc": 0.4985835694050991, "train_speed(iter/s)": 1.437586 }, { "epoch": 3.5658283706782057, "grad_norm": 6.095307350158691, "learning_rate": 1.8963254522834227e-05, "loss": 1.9889350891113282, "memory(GiB)": 77.56, "step": 83230, "token_acc": 0.5871559633027523, "train_speed(iter/s)": 1.437604 }, { "epoch": 3.5660425860074545, "grad_norm": 6.330908298492432, "learning_rate": 1.8957978523832514e-05, "loss": 2.834647369384766, "memory(GiB)": 77.56, "step": 83235, "token_acc": 0.42765273311897106, "train_speed(iter/s)": 1.437598 }, { "epoch": 3.5662568013367038, "grad_norm": 5.0780181884765625, "learning_rate": 1.8952703087186142e-05, "loss": 2.5584503173828126, "memory(GiB)": 77.56, "step": 83240, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.437591 }, { "epoch": 3.5664710166659526, "grad_norm": 5.6616950035095215, "learning_rate": 1.894742821299067e-05, "loss": 2.273427200317383, "memory(GiB)": 77.56, "step": 83245, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 1.437615 }, { "epoch": 3.5666852319952014, "grad_norm": 6.130789279937744, "learning_rate": 1.894215390134166e-05, "loss": 2.090376281738281, "memory(GiB)": 77.56, "step": 83250, "token_acc": 0.5387453874538746, "train_speed(iter/s)": 1.437605 }, { "epoch": 3.5668994473244506, "grad_norm": 7.444840431213379, "learning_rate": 1.8936880152334674e-05, "loss": 2.194511604309082, "memory(GiB)": 77.56, "step": 83255, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.437607 }, { "epoch": 3.5671136626536994, "grad_norm": 5.143515110015869, "learning_rate": 1.8931606966065247e-05, "loss": 2.379649543762207, "memory(GiB)": 77.56, "step": 83260, "token_acc": 0.5, "train_speed(iter/s)": 1.437625 }, { "epoch": 3.5673278779829483, "grad_norm": 8.019730567932129, "learning_rate": 1.8926334342628888e-05, "loss": 2.353934478759766, "memory(GiB)": 77.56, "step": 83265, "token_acc": 0.55859375, "train_speed(iter/s)": 1.437633 }, { "epoch": 3.5675420933121975, "grad_norm": 5.699904918670654, "learning_rate": 1.892106228212115e-05, "loss": 2.3754390716552733, "memory(GiB)": 77.56, "step": 83270, "token_acc": 0.4618181818181818, "train_speed(iter/s)": 1.437626 }, { "epoch": 3.5677563086414463, "grad_norm": 4.760214805603027, "learning_rate": 1.891579078463752e-05, "loss": 2.3147865295410157, "memory(GiB)": 77.56, "step": 83275, "token_acc": 0.5291666666666667, "train_speed(iter/s)": 1.437642 }, { "epoch": 3.567970523970695, "grad_norm": 6.6610541343688965, "learning_rate": 1.8910519850273485e-05, "loss": 2.324036407470703, "memory(GiB)": 77.56, "step": 83280, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.437666 }, { "epoch": 3.5681847392999444, "grad_norm": 5.243537425994873, "learning_rate": 1.8905249479124563e-05, "loss": 2.440631866455078, "memory(GiB)": 77.56, "step": 83285, "token_acc": 0.4498567335243553, "train_speed(iter/s)": 1.437666 }, { "epoch": 3.568398954629193, "grad_norm": 6.568016052246094, "learning_rate": 1.8899979671286216e-05, "loss": 2.2847660064697264, "memory(GiB)": 77.56, "step": 83290, "token_acc": 0.5149501661129569, "train_speed(iter/s)": 1.437648 }, { "epoch": 3.568613169958442, "grad_norm": 6.597051620483398, "learning_rate": 1.8894710426853913e-05, "loss": 2.22582893371582, "memory(GiB)": 77.56, "step": 83295, "token_acc": 0.49814126394052044, "train_speed(iter/s)": 1.437651 }, { "epoch": 3.5688273852876913, "grad_norm": 6.890224933624268, "learning_rate": 1.888944174592311e-05, "loss": 2.4346603393554687, "memory(GiB)": 77.56, "step": 83300, "token_acc": 0.484593837535014, "train_speed(iter/s)": 1.437648 }, { "epoch": 3.56904160061694, "grad_norm": 5.900251865386963, "learning_rate": 1.8884173628589236e-05, "loss": 2.282183074951172, "memory(GiB)": 77.56, "step": 83305, "token_acc": 0.5055762081784386, "train_speed(iter/s)": 1.437659 }, { "epoch": 3.569255815946189, "grad_norm": 7.65071439743042, "learning_rate": 1.887890607494776e-05, "loss": 2.2001413345336913, "memory(GiB)": 77.56, "step": 83310, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 1.437652 }, { "epoch": 3.569470031275438, "grad_norm": 6.551050662994385, "learning_rate": 1.8873639085094097e-05, "loss": 2.1010459899902343, "memory(GiB)": 77.56, "step": 83315, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.437672 }, { "epoch": 3.569684246604687, "grad_norm": 7.86661958694458, "learning_rate": 1.8868372659123655e-05, "loss": 2.231205368041992, "memory(GiB)": 77.56, "step": 83320, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.437688 }, { "epoch": 3.569898461933936, "grad_norm": 6.6978535652160645, "learning_rate": 1.8863106797131857e-05, "loss": 2.0611236572265623, "memory(GiB)": 77.56, "step": 83325, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.437682 }, { "epoch": 3.570112677263185, "grad_norm": 7.841678619384766, "learning_rate": 1.885784149921408e-05, "loss": 2.247226333618164, "memory(GiB)": 77.56, "step": 83330, "token_acc": 0.5543071161048689, "train_speed(iter/s)": 1.437697 }, { "epoch": 3.570326892592434, "grad_norm": 8.14116096496582, "learning_rate": 1.88525767654657e-05, "loss": 2.122659111022949, "memory(GiB)": 77.56, "step": 83335, "token_acc": 0.5230263157894737, "train_speed(iter/s)": 1.437691 }, { "epoch": 3.5705411079216827, "grad_norm": 6.669203281402588, "learning_rate": 1.884731259598212e-05, "loss": 2.4635011672973635, "memory(GiB)": 77.56, "step": 83340, "token_acc": 0.4625, "train_speed(iter/s)": 1.4377 }, { "epoch": 3.570755323250932, "grad_norm": 6.690664768218994, "learning_rate": 1.8842048990858706e-05, "loss": 2.5491796493530274, "memory(GiB)": 77.56, "step": 83345, "token_acc": 0.45390070921985815, "train_speed(iter/s)": 1.437695 }, { "epoch": 3.5709695385801807, "grad_norm": 7.593526363372803, "learning_rate": 1.8836785950190804e-05, "loss": 2.4520469665527345, "memory(GiB)": 77.56, "step": 83350, "token_acc": 0.4723926380368098, "train_speed(iter/s)": 1.437707 }, { "epoch": 3.5711837539094295, "grad_norm": 5.425413131713867, "learning_rate": 1.883152347407376e-05, "loss": 2.4824970245361326, "memory(GiB)": 77.56, "step": 83355, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.43771 }, { "epoch": 3.571397969238679, "grad_norm": 7.038394927978516, "learning_rate": 1.8826261562602905e-05, "loss": 2.2883943557739257, "memory(GiB)": 77.56, "step": 83360, "token_acc": 0.48188405797101447, "train_speed(iter/s)": 1.437717 }, { "epoch": 3.5716121845679276, "grad_norm": 5.130452632904053, "learning_rate": 1.8821000215873548e-05, "loss": 2.492097282409668, "memory(GiB)": 77.56, "step": 83365, "token_acc": 0.4770408163265306, "train_speed(iter/s)": 1.43773 }, { "epoch": 3.5718263998971764, "grad_norm": 8.625472068786621, "learning_rate": 1.8815739433981034e-05, "loss": 2.171437454223633, "memory(GiB)": 77.56, "step": 83370, "token_acc": 0.5037878787878788, "train_speed(iter/s)": 1.437731 }, { "epoch": 3.5720406152264257, "grad_norm": 6.226128101348877, "learning_rate": 1.8810479217020654e-05, "loss": 2.4843910217285154, "memory(GiB)": 77.56, "step": 83375, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 1.437737 }, { "epoch": 3.5722548305556745, "grad_norm": 6.627004623413086, "learning_rate": 1.8805219565087707e-05, "loss": 2.474307060241699, "memory(GiB)": 77.56, "step": 83380, "token_acc": 0.5, "train_speed(iter/s)": 1.437744 }, { "epoch": 3.5724690458849233, "grad_norm": 5.86984920501709, "learning_rate": 1.879996047827747e-05, "loss": 1.9881847381591797, "memory(GiB)": 77.56, "step": 83385, "token_acc": 0.5623100303951368, "train_speed(iter/s)": 1.437748 }, { "epoch": 3.5726832612141726, "grad_norm": 4.480538845062256, "learning_rate": 1.8794701956685212e-05, "loss": 2.3707780838012695, "memory(GiB)": 77.56, "step": 83390, "token_acc": 0.5095541401273885, "train_speed(iter/s)": 1.437748 }, { "epoch": 3.5728974765434214, "grad_norm": 6.057560920715332, "learning_rate": 1.878944400040618e-05, "loss": 2.2293033599853516, "memory(GiB)": 77.56, "step": 83395, "token_acc": 0.528052805280528, "train_speed(iter/s)": 1.43776 }, { "epoch": 3.57311169187267, "grad_norm": 5.348850727081299, "learning_rate": 1.8784186609535677e-05, "loss": 2.4478580474853517, "memory(GiB)": 77.56, "step": 83400, "token_acc": 0.47126436781609193, "train_speed(iter/s)": 1.437773 }, { "epoch": 3.5733259072019194, "grad_norm": 5.241185188293457, "learning_rate": 1.877892978416891e-05, "loss": 2.0920101165771485, "memory(GiB)": 77.56, "step": 83405, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.5735401225311683, "grad_norm": 7.74973201751709, "learning_rate": 1.87736735244011e-05, "loss": 2.1073328018188477, "memory(GiB)": 77.56, "step": 83410, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 1.437738 }, { "epoch": 3.573754337860417, "grad_norm": 6.204540729522705, "learning_rate": 1.8768417830327507e-05, "loss": 2.2306278228759764, "memory(GiB)": 77.56, "step": 83415, "token_acc": 0.519163763066202, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.5739685531896663, "grad_norm": 6.120555400848389, "learning_rate": 1.876316270204332e-05, "loss": 2.218754196166992, "memory(GiB)": 77.56, "step": 83420, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.574182768518915, "grad_norm": 5.856813907623291, "learning_rate": 1.875790813964372e-05, "loss": 2.2100473403930665, "memory(GiB)": 77.56, "step": 83425, "token_acc": 0.4724137931034483, "train_speed(iter/s)": 1.437741 }, { "epoch": 3.574396983848164, "grad_norm": 5.689226150512695, "learning_rate": 1.8752654143223946e-05, "loss": 2.5029279708862306, "memory(GiB)": 77.56, "step": 83430, "token_acc": 0.4876325088339223, "train_speed(iter/s)": 1.437752 }, { "epoch": 3.574611199177413, "grad_norm": 5.500398635864258, "learning_rate": 1.8747400712879154e-05, "loss": 2.360789489746094, "memory(GiB)": 77.56, "step": 83435, "token_acc": 0.48615384615384616, "train_speed(iter/s)": 1.43776 }, { "epoch": 3.574825414506662, "grad_norm": 8.030378341674805, "learning_rate": 1.8742147848704516e-05, "loss": 2.179517364501953, "memory(GiB)": 77.56, "step": 83440, "token_acc": 0.5437956204379562, "train_speed(iter/s)": 1.437761 }, { "epoch": 3.575039629835911, "grad_norm": 5.850546836853027, "learning_rate": 1.873689555079519e-05, "loss": 2.3703914642333985, "memory(GiB)": 77.56, "step": 83445, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.57525384516516, "grad_norm": 5.844541072845459, "learning_rate": 1.8731643819246312e-05, "loss": 2.42591552734375, "memory(GiB)": 77.56, "step": 83450, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.437779 }, { "epoch": 3.575468060494409, "grad_norm": 5.354722023010254, "learning_rate": 1.8726392654153058e-05, "loss": 2.4405771255493165, "memory(GiB)": 77.56, "step": 83455, "token_acc": 0.49852507374631266, "train_speed(iter/s)": 1.437793 }, { "epoch": 3.5756822758236577, "grad_norm": 7.0297675132751465, "learning_rate": 1.8721142055610536e-05, "loss": 2.3134946823120117, "memory(GiB)": 77.56, "step": 83460, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.575896491152907, "grad_norm": 6.780994415283203, "learning_rate": 1.8715892023713866e-05, "loss": 2.5196014404296876, "memory(GiB)": 77.56, "step": 83465, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.576110706482156, "grad_norm": 6.342161178588867, "learning_rate": 1.8710642558558162e-05, "loss": 2.0995716094970702, "memory(GiB)": 77.56, "step": 83470, "token_acc": 0.5252100840336135, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.5763249218114046, "grad_norm": 5.033740043640137, "learning_rate": 1.870539366023852e-05, "loss": 2.407251739501953, "memory(GiB)": 77.56, "step": 83475, "token_acc": 0.5317725752508361, "train_speed(iter/s)": 1.437832 }, { "epoch": 3.576539137140654, "grad_norm": 5.4944658279418945, "learning_rate": 1.8700145328850005e-05, "loss": 2.302262878417969, "memory(GiB)": 77.56, "step": 83480, "token_acc": 0.511400651465798, "train_speed(iter/s)": 1.437811 }, { "epoch": 3.5767533524699027, "grad_norm": 6.006190299987793, "learning_rate": 1.8694897564487725e-05, "loss": 2.326752853393555, "memory(GiB)": 77.56, "step": 83485, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.437811 }, { "epoch": 3.5769675677991515, "grad_norm": 6.718530654907227, "learning_rate": 1.868965036724676e-05, "loss": 2.514005661010742, "memory(GiB)": 77.56, "step": 83490, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.5771817831284007, "grad_norm": 5.306966304779053, "learning_rate": 1.8684403737222144e-05, "loss": 2.169674301147461, "memory(GiB)": 77.56, "step": 83495, "token_acc": 0.5232974910394266, "train_speed(iter/s)": 1.43784 }, { "epoch": 3.5773959984576496, "grad_norm": 6.0591912269592285, "learning_rate": 1.8679157674508936e-05, "loss": 2.4086856842041016, "memory(GiB)": 77.56, "step": 83500, "token_acc": 0.4437869822485207, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.5773959984576496, "eval_loss": 2.27353572845459, "eval_runtime": 14.5028, "eval_samples_per_second": 6.895, "eval_steps_per_second": 6.895, "eval_token_acc": 0.465, "step": 83500 }, { "epoch": 3.5776102137868984, "grad_norm": 7.037579536437988, "learning_rate": 1.8673912179202163e-05, "loss": 2.461414337158203, "memory(GiB)": 77.56, "step": 83505, "token_acc": 0.46955719557195574, "train_speed(iter/s)": 1.437482 }, { "epoch": 3.5778244291161476, "grad_norm": 6.035539150238037, "learning_rate": 1.8668667251396847e-05, "loss": 2.124465751647949, "memory(GiB)": 77.56, "step": 83510, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437508 }, { "epoch": 3.5780386444453964, "grad_norm": 5.7824883460998535, "learning_rate": 1.8663422891188024e-05, "loss": 2.38531436920166, "memory(GiB)": 77.56, "step": 83515, "token_acc": 0.4519230769230769, "train_speed(iter/s)": 1.43747 }, { "epoch": 3.5782528597746452, "grad_norm": 7.397764205932617, "learning_rate": 1.8658179098670698e-05, "loss": 2.171657371520996, "memory(GiB)": 77.56, "step": 83520, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.437475 }, { "epoch": 3.5784670751038945, "grad_norm": 4.856852054595947, "learning_rate": 1.8652935873939858e-05, "loss": 2.4343599319458007, "memory(GiB)": 77.56, "step": 83525, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.43748 }, { "epoch": 3.5786812904331433, "grad_norm": 6.537598609924316, "learning_rate": 1.864769321709049e-05, "loss": 2.4519826889038088, "memory(GiB)": 77.56, "step": 83530, "token_acc": 0.48360655737704916, "train_speed(iter/s)": 1.437489 }, { "epoch": 3.578895505762392, "grad_norm": 6.61281156539917, "learning_rate": 1.864245112821757e-05, "loss": 2.3779029846191406, "memory(GiB)": 77.56, "step": 83535, "token_acc": 0.5114503816793893, "train_speed(iter/s)": 1.437487 }, { "epoch": 3.5791097210916414, "grad_norm": 4.975118637084961, "learning_rate": 1.8637209607416044e-05, "loss": 2.0590206146240235, "memory(GiB)": 77.56, "step": 83540, "token_acc": 0.5758754863813229, "train_speed(iter/s)": 1.437488 }, { "epoch": 3.57932393642089, "grad_norm": 4.7138285636901855, "learning_rate": 1.8631968654780906e-05, "loss": 2.371194267272949, "memory(GiB)": 77.56, "step": 83545, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 1.437486 }, { "epoch": 3.579538151750139, "grad_norm": 5.548679828643799, "learning_rate": 1.8626728270407078e-05, "loss": 2.3220003128051756, "memory(GiB)": 77.56, "step": 83550, "token_acc": 0.5150501672240803, "train_speed(iter/s)": 1.437483 }, { "epoch": 3.5797523670793883, "grad_norm": 4.663828372955322, "learning_rate": 1.86214884543895e-05, "loss": 2.4430870056152343, "memory(GiB)": 77.56, "step": 83555, "token_acc": 0.4984126984126984, "train_speed(iter/s)": 1.437474 }, { "epoch": 3.579966582408637, "grad_norm": 5.698978424072266, "learning_rate": 1.8616249206823085e-05, "loss": 2.2960018157958983, "memory(GiB)": 77.56, "step": 83560, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.437477 }, { "epoch": 3.580180797737886, "grad_norm": 6.00016450881958, "learning_rate": 1.8611010527802763e-05, "loss": 2.573992156982422, "memory(GiB)": 77.56, "step": 83565, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.437493 }, { "epoch": 3.580395013067135, "grad_norm": 4.582003593444824, "learning_rate": 1.8605772417423417e-05, "loss": 2.2473644256591796, "memory(GiB)": 77.56, "step": 83570, "token_acc": 0.5264900662251656, "train_speed(iter/s)": 1.43748 }, { "epoch": 3.580609228396384, "grad_norm": 5.979335308074951, "learning_rate": 1.8600534875779967e-05, "loss": 2.472529983520508, "memory(GiB)": 77.56, "step": 83575, "token_acc": 0.46827794561933533, "train_speed(iter/s)": 1.437496 }, { "epoch": 3.5808234437256328, "grad_norm": 6.774959564208984, "learning_rate": 1.859529790296729e-05, "loss": 2.517338180541992, "memory(GiB)": 77.56, "step": 83580, "token_acc": 0.475, "train_speed(iter/s)": 1.437499 }, { "epoch": 3.581037659054882, "grad_norm": 7.629566192626953, "learning_rate": 1.8590061499080246e-05, "loss": 2.458251190185547, "memory(GiB)": 77.56, "step": 83585, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437486 }, { "epoch": 3.581251874384131, "grad_norm": 7.092397212982178, "learning_rate": 1.8584825664213707e-05, "loss": 2.5144321441650392, "memory(GiB)": 77.56, "step": 83590, "token_acc": 0.4789156626506024, "train_speed(iter/s)": 1.4375 }, { "epoch": 3.5814660897133797, "grad_norm": 5.519599914550781, "learning_rate": 1.85795903984625e-05, "loss": 2.537326431274414, "memory(GiB)": 77.56, "step": 83595, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.437501 }, { "epoch": 3.581680305042629, "grad_norm": 5.344213485717773, "learning_rate": 1.8574355701921502e-05, "loss": 2.5091175079345702, "memory(GiB)": 77.56, "step": 83600, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.437501 }, { "epoch": 3.5818945203718777, "grad_norm": 4.627424716949463, "learning_rate": 1.8569121574685538e-05, "loss": 2.2671415328979494, "memory(GiB)": 77.56, "step": 83605, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437504 }, { "epoch": 3.5821087357011265, "grad_norm": 6.110616207122803, "learning_rate": 1.8563888016849417e-05, "loss": 2.309220886230469, "memory(GiB)": 77.56, "step": 83610, "token_acc": 0.5130718954248366, "train_speed(iter/s)": 1.437516 }, { "epoch": 3.582322951030376, "grad_norm": 5.834411144256592, "learning_rate": 1.8558655028507954e-05, "loss": 2.2596513748168947, "memory(GiB)": 77.56, "step": 83615, "token_acc": 0.48404255319148937, "train_speed(iter/s)": 1.437533 }, { "epoch": 3.5825371663596246, "grad_norm": 8.132269859313965, "learning_rate": 1.855342260975595e-05, "loss": 2.5855949401855467, "memory(GiB)": 77.56, "step": 83620, "token_acc": 0.4381625441696113, "train_speed(iter/s)": 1.437525 }, { "epoch": 3.5827513816888734, "grad_norm": 7.2655158042907715, "learning_rate": 1.8548190760688176e-05, "loss": 1.9536001205444335, "memory(GiB)": 77.56, "step": 83625, "token_acc": 0.5740740740740741, "train_speed(iter/s)": 1.437544 }, { "epoch": 3.5829655970181227, "grad_norm": 6.249345779418945, "learning_rate": 1.8542959481399435e-05, "loss": 2.417573165893555, "memory(GiB)": 77.56, "step": 83630, "token_acc": 0.4952681388012618, "train_speed(iter/s)": 1.437538 }, { "epoch": 3.5831798123473715, "grad_norm": 5.846346378326416, "learning_rate": 1.8537728771984504e-05, "loss": 2.258195495605469, "memory(GiB)": 77.56, "step": 83635, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.437554 }, { "epoch": 3.5833940276766203, "grad_norm": 4.9812912940979, "learning_rate": 1.853249863253813e-05, "loss": 2.118291664123535, "memory(GiB)": 77.56, "step": 83640, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.437559 }, { "epoch": 3.5836082430058696, "grad_norm": 6.051755428314209, "learning_rate": 1.8527269063155068e-05, "loss": 2.199260711669922, "memory(GiB)": 77.56, "step": 83645, "token_acc": 0.5382059800664452, "train_speed(iter/s)": 1.437569 }, { "epoch": 3.5838224583351184, "grad_norm": 6.251136302947998, "learning_rate": 1.8522040063930045e-05, "loss": 2.2736490249633787, "memory(GiB)": 77.56, "step": 83650, "token_acc": 0.5340501792114696, "train_speed(iter/s)": 1.437586 }, { "epoch": 3.584036673664367, "grad_norm": 6.631663799285889, "learning_rate": 1.851681163495778e-05, "loss": 2.3653751373291017, "memory(GiB)": 77.56, "step": 83655, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 1.437594 }, { "epoch": 3.5842508889936164, "grad_norm": 5.372085094451904, "learning_rate": 1.8511583776333026e-05, "loss": 2.081974411010742, "memory(GiB)": 77.56, "step": 83660, "token_acc": 0.5322033898305085, "train_speed(iter/s)": 1.437593 }, { "epoch": 3.5844651043228652, "grad_norm": 5.604437828063965, "learning_rate": 1.8506356488150467e-05, "loss": 2.400470161437988, "memory(GiB)": 77.56, "step": 83665, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.437623 }, { "epoch": 3.584679319652114, "grad_norm": 6.18123197555542, "learning_rate": 1.8501129770504805e-05, "loss": 2.4475830078125, "memory(GiB)": 77.56, "step": 83670, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.43762 }, { "epoch": 3.5848935349813633, "grad_norm": 5.402523994445801, "learning_rate": 1.8495903623490722e-05, "loss": 2.5152135848999024, "memory(GiB)": 77.56, "step": 83675, "token_acc": 0.5, "train_speed(iter/s)": 1.437632 }, { "epoch": 3.585107750310612, "grad_norm": 6.611544132232666, "learning_rate": 1.8490678047202885e-05, "loss": 2.4221534729003906, "memory(GiB)": 77.56, "step": 83680, "token_acc": 0.5275080906148867, "train_speed(iter/s)": 1.437651 }, { "epoch": 3.585321965639861, "grad_norm": 5.7812018394470215, "learning_rate": 1.848545304173599e-05, "loss": 2.1626869201660157, "memory(GiB)": 77.56, "step": 83685, "token_acc": 0.4880952380952381, "train_speed(iter/s)": 1.437659 }, { "epoch": 3.58553618096911, "grad_norm": 6.878974914550781, "learning_rate": 1.8480228607184675e-05, "loss": 2.4956584930419923, "memory(GiB)": 77.56, "step": 83690, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.585750396298359, "grad_norm": 5.689792633056641, "learning_rate": 1.8475004743643587e-05, "loss": 2.375166130065918, "memory(GiB)": 77.56, "step": 83695, "token_acc": 0.4869281045751634, "train_speed(iter/s)": 1.437683 }, { "epoch": 3.585964611627608, "grad_norm": 6.481431484222412, "learning_rate": 1.846978145120736e-05, "loss": 2.2234724044799803, "memory(GiB)": 77.56, "step": 83700, "token_acc": 0.5323741007194245, "train_speed(iter/s)": 1.43768 }, { "epoch": 3.586178826956857, "grad_norm": 5.415676116943359, "learning_rate": 1.8464558729970603e-05, "loss": 2.0002246856689454, "memory(GiB)": 77.56, "step": 83705, "token_acc": 0.5983935742971888, "train_speed(iter/s)": 1.437683 }, { "epoch": 3.586393042286106, "grad_norm": 6.878890514373779, "learning_rate": 1.845933658002797e-05, "loss": 2.1780679702758787, "memory(GiB)": 77.56, "step": 83710, "token_acc": 0.5364431486880467, "train_speed(iter/s)": 1.437697 }, { "epoch": 3.5866072576153547, "grad_norm": 5.245037078857422, "learning_rate": 1.845411500147402e-05, "loss": 2.156040000915527, "memory(GiB)": 77.56, "step": 83715, "token_acc": 0.5494880546075085, "train_speed(iter/s)": 1.437717 }, { "epoch": 3.586821472944604, "grad_norm": 5.945116996765137, "learning_rate": 1.844889399440338e-05, "loss": 2.3285757064819337, "memory(GiB)": 77.56, "step": 83720, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.437737 }, { "epoch": 3.5870356882738528, "grad_norm": 4.118406295776367, "learning_rate": 1.8443673558910635e-05, "loss": 2.4053455352783204, "memory(GiB)": 77.56, "step": 83725, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.437743 }, { "epoch": 3.5872499036031016, "grad_norm": 6.067787170410156, "learning_rate": 1.8438453695090342e-05, "loss": 2.390274238586426, "memory(GiB)": 77.56, "step": 83730, "token_acc": 0.502906976744186, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.587464118932351, "grad_norm": 6.500606060028076, "learning_rate": 1.8433234403037065e-05, "loss": 2.5961103439331055, "memory(GiB)": 77.56, "step": 83735, "token_acc": 0.46283783783783783, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.5876783342615997, "grad_norm": 5.9028544425964355, "learning_rate": 1.842801568284534e-05, "loss": 2.1781572341918944, "memory(GiB)": 77.56, "step": 83740, "token_acc": 0.5367647058823529, "train_speed(iter/s)": 1.437764 }, { "epoch": 3.5878925495908485, "grad_norm": 6.955174446105957, "learning_rate": 1.8422797534609748e-05, "loss": 2.2484804153442384, "memory(GiB)": 77.56, "step": 83745, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.437739 }, { "epoch": 3.5881067649200977, "grad_norm": 5.07317590713501, "learning_rate": 1.8417579958424797e-05, "loss": 2.1796667098999025, "memory(GiB)": 77.56, "step": 83750, "token_acc": 0.5360824742268041, "train_speed(iter/s)": 1.43772 }, { "epoch": 3.5883209802493465, "grad_norm": 5.653574466705322, "learning_rate": 1.8412362954385015e-05, "loss": 2.3984197616577148, "memory(GiB)": 77.56, "step": 83755, "token_acc": 0.48424068767908307, "train_speed(iter/s)": 1.437734 }, { "epoch": 3.5885351955785953, "grad_norm": 5.272899150848389, "learning_rate": 1.8407146522584906e-05, "loss": 2.333488655090332, "memory(GiB)": 77.56, "step": 83760, "token_acc": 0.4979253112033195, "train_speed(iter/s)": 1.437739 }, { "epoch": 3.5887494109078446, "grad_norm": 7.157073497772217, "learning_rate": 1.8401930663118983e-05, "loss": 2.637564277648926, "memory(GiB)": 77.56, "step": 83765, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.43776 }, { "epoch": 3.5889636262370934, "grad_norm": 7.094167232513428, "learning_rate": 1.83967153760817e-05, "loss": 2.211750030517578, "memory(GiB)": 77.56, "step": 83770, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.5891778415663422, "grad_norm": 6.066314697265625, "learning_rate": 1.8391500661567584e-05, "loss": 2.4501609802246094, "memory(GiB)": 77.56, "step": 83775, "token_acc": 0.5019305019305019, "train_speed(iter/s)": 1.437768 }, { "epoch": 3.5893920568955915, "grad_norm": 7.008761882781982, "learning_rate": 1.838628651967107e-05, "loss": 2.1569086074829102, "memory(GiB)": 77.56, "step": 83780, "token_acc": 0.549407114624506, "train_speed(iter/s)": 1.437785 }, { "epoch": 3.5896062722248403, "grad_norm": 5.256586074829102, "learning_rate": 1.838107295048665e-05, "loss": 2.029164695739746, "memory(GiB)": 77.56, "step": 83785, "token_acc": 0.5477941176470589, "train_speed(iter/s)": 1.437801 }, { "epoch": 3.589820487554089, "grad_norm": 4.974340915679932, "learning_rate": 1.8375859954108753e-05, "loss": 2.600382614135742, "memory(GiB)": 77.56, "step": 83790, "token_acc": 0.4709480122324159, "train_speed(iter/s)": 1.437833 }, { "epoch": 3.5900347028833384, "grad_norm": 5.226301670074463, "learning_rate": 1.8370647530631818e-05, "loss": 2.4387868881225585, "memory(GiB)": 77.56, "step": 83795, "token_acc": 0.49615384615384617, "train_speed(iter/s)": 1.437844 }, { "epoch": 3.590248918212587, "grad_norm": 8.13363265991211, "learning_rate": 1.8365435680150256e-05, "loss": 2.3786300659179687, "memory(GiB)": 77.56, "step": 83800, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.43786 }, { "epoch": 3.590463133541836, "grad_norm": 5.606438636779785, "learning_rate": 1.8360224402758525e-05, "loss": 2.511959266662598, "memory(GiB)": 77.56, "step": 83805, "token_acc": 0.5016835016835017, "train_speed(iter/s)": 1.437854 }, { "epoch": 3.5906773488710853, "grad_norm": 7.411236763000488, "learning_rate": 1.8355013698551004e-05, "loss": 2.3481796264648436, "memory(GiB)": 77.56, "step": 83810, "token_acc": 0.5, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.590891564200334, "grad_norm": 5.4996018409729, "learning_rate": 1.8349803567622094e-05, "loss": 2.2429292678833006, "memory(GiB)": 77.56, "step": 83815, "token_acc": 0.4875444839857651, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.591105779529583, "grad_norm": 5.252297401428223, "learning_rate": 1.8344594010066184e-05, "loss": 2.380876159667969, "memory(GiB)": 77.56, "step": 83820, "token_acc": 0.5062893081761006, "train_speed(iter/s)": 1.437862 }, { "epoch": 3.591319994858832, "grad_norm": 5.427470684051514, "learning_rate": 1.8339385025977635e-05, "loss": 2.228294563293457, "memory(GiB)": 77.56, "step": 83825, "token_acc": 0.5150375939849624, "train_speed(iter/s)": 1.437849 }, { "epoch": 3.591534210188081, "grad_norm": 5.288304805755615, "learning_rate": 1.833417661545084e-05, "loss": 2.226371955871582, "memory(GiB)": 77.56, "step": 83830, "token_acc": 0.5445205479452054, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.5917484255173298, "grad_norm": 5.43513298034668, "learning_rate": 1.832896877858014e-05, "loss": 2.363986778259277, "memory(GiB)": 77.56, "step": 83835, "token_acc": 0.5060975609756098, "train_speed(iter/s)": 1.437868 }, { "epoch": 3.591962640846579, "grad_norm": 6.374598979949951, "learning_rate": 1.832376151545988e-05, "loss": 2.337849235534668, "memory(GiB)": 77.56, "step": 83840, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.43787 }, { "epoch": 3.592176856175828, "grad_norm": 7.433907985687256, "learning_rate": 1.83185548261844e-05, "loss": 2.0900434494018554, "memory(GiB)": 77.56, "step": 83845, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 1.437868 }, { "epoch": 3.5923910715050766, "grad_norm": 5.9668049812316895, "learning_rate": 1.8313348710848018e-05, "loss": 2.6514781951904296, "memory(GiB)": 77.56, "step": 83850, "token_acc": 0.4567901234567901, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.592605286834326, "grad_norm": 5.6707258224487305, "learning_rate": 1.8308143169545028e-05, "loss": 2.1907678604125977, "memory(GiB)": 77.56, "step": 83855, "token_acc": 0.51953125, "train_speed(iter/s)": 1.437907 }, { "epoch": 3.5928195021635747, "grad_norm": 5.066697120666504, "learning_rate": 1.8302938202369752e-05, "loss": 2.246035575866699, "memory(GiB)": 77.56, "step": 83860, "token_acc": 0.53, "train_speed(iter/s)": 1.437915 }, { "epoch": 3.5930337174928235, "grad_norm": 5.942101001739502, "learning_rate": 1.8297733809416505e-05, "loss": 2.3774572372436524, "memory(GiB)": 77.56, "step": 83865, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.437931 }, { "epoch": 3.5932479328220728, "grad_norm": 4.433930397033691, "learning_rate": 1.829252999077955e-05, "loss": 2.1279762268066404, "memory(GiB)": 77.56, "step": 83870, "token_acc": 0.5049833887043189, "train_speed(iter/s)": 1.437945 }, { "epoch": 3.5934621481513216, "grad_norm": 4.171961784362793, "learning_rate": 1.8287326746553157e-05, "loss": 2.4300605773925783, "memory(GiB)": 77.56, "step": 83875, "token_acc": 0.5, "train_speed(iter/s)": 1.437968 }, { "epoch": 3.5936763634805704, "grad_norm": 6.884018898010254, "learning_rate": 1.8282124076831592e-05, "loss": 2.5158658981323243, "memory(GiB)": 77.56, "step": 83880, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437966 }, { "epoch": 3.5938905788098197, "grad_norm": 8.164193153381348, "learning_rate": 1.8276921981709077e-05, "loss": 2.404288101196289, "memory(GiB)": 77.56, "step": 83885, "token_acc": 0.5146579804560261, "train_speed(iter/s)": 1.437965 }, { "epoch": 3.5941047941390685, "grad_norm": 5.8763275146484375, "learning_rate": 1.8271720461279907e-05, "loss": 2.0672731399536133, "memory(GiB)": 77.56, "step": 83890, "token_acc": 0.575, "train_speed(iter/s)": 1.437976 }, { "epoch": 3.5943190094683173, "grad_norm": 7.617894649505615, "learning_rate": 1.8266519515638277e-05, "loss": 2.2012426376342775, "memory(GiB)": 77.56, "step": 83895, "token_acc": 0.5186567164179104, "train_speed(iter/s)": 1.437988 }, { "epoch": 3.5945332247975665, "grad_norm": 5.624991416931152, "learning_rate": 1.826131914487842e-05, "loss": 2.0258535385131835, "memory(GiB)": 77.56, "step": 83900, "token_acc": 0.5661764705882353, "train_speed(iter/s)": 1.437979 }, { "epoch": 3.5947474401268154, "grad_norm": 5.233500957489014, "learning_rate": 1.8256119349094538e-05, "loss": 2.4378084182739257, "memory(GiB)": 77.56, "step": 83905, "token_acc": 0.49375, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.594961655456064, "grad_norm": 6.583970069885254, "learning_rate": 1.8250920128380826e-05, "loss": 2.4524112701416017, "memory(GiB)": 77.56, "step": 83910, "token_acc": 0.46417445482866043, "train_speed(iter/s)": 1.437979 }, { "epoch": 3.5951758707853134, "grad_norm": 6.571187496185303, "learning_rate": 1.8245721482831468e-05, "loss": 2.493466377258301, "memory(GiB)": 77.56, "step": 83915, "token_acc": 0.43465045592705165, "train_speed(iter/s)": 1.437994 }, { "epoch": 3.5953900861145622, "grad_norm": 4.98223352432251, "learning_rate": 1.8240523412540667e-05, "loss": 2.184986877441406, "memory(GiB)": 77.56, "step": 83920, "token_acc": 0.46540880503144655, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.595604301443811, "grad_norm": 5.9935712814331055, "learning_rate": 1.823532591760258e-05, "loss": 2.578005790710449, "memory(GiB)": 77.56, "step": 83925, "token_acc": 0.48089171974522293, "train_speed(iter/s)": 1.437992 }, { "epoch": 3.5958185167730603, "grad_norm": 6.469836711883545, "learning_rate": 1.8230128998111346e-05, "loss": 2.2249731063842773, "memory(GiB)": 77.56, "step": 83930, "token_acc": 0.5458333333333333, "train_speed(iter/s)": 1.438 }, { "epoch": 3.596032732102309, "grad_norm": 5.506590843200684, "learning_rate": 1.8224932654161143e-05, "loss": 2.093449020385742, "memory(GiB)": 77.56, "step": 83935, "token_acc": 0.5358361774744027, "train_speed(iter/s)": 1.438026 }, { "epoch": 3.596246947431558, "grad_norm": 5.520415306091309, "learning_rate": 1.821973688584609e-05, "loss": 2.1561962127685548, "memory(GiB)": 77.56, "step": 83940, "token_acc": 0.5527272727272727, "train_speed(iter/s)": 1.438023 }, { "epoch": 3.596461162760807, "grad_norm": 6.590030670166016, "learning_rate": 1.82145416932603e-05, "loss": 2.2194372177124024, "memory(GiB)": 77.56, "step": 83945, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.438043 }, { "epoch": 3.596675378090056, "grad_norm": 5.831902980804443, "learning_rate": 1.8209347076497924e-05, "loss": 2.4743839263916017, "memory(GiB)": 77.56, "step": 83950, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.596889593419305, "grad_norm": 4.735555171966553, "learning_rate": 1.820415303565305e-05, "loss": 2.321951675415039, "memory(GiB)": 77.56, "step": 83955, "token_acc": 0.4968944099378882, "train_speed(iter/s)": 1.43801 }, { "epoch": 3.597103808748554, "grad_norm": 5.199782371520996, "learning_rate": 1.8198959570819763e-05, "loss": 2.5252227783203125, "memory(GiB)": 77.56, "step": 83960, "token_acc": 0.5027777777777778, "train_speed(iter/s)": 1.438015 }, { "epoch": 3.597318024077803, "grad_norm": 6.447930812835693, "learning_rate": 1.8193766682092155e-05, "loss": 2.4034652709960938, "memory(GiB)": 77.56, "step": 83965, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.438033 }, { "epoch": 3.597532239407052, "grad_norm": 5.510873794555664, "learning_rate": 1.8188574369564293e-05, "loss": 2.4141664505004883, "memory(GiB)": 77.56, "step": 83970, "token_acc": 0.484375, "train_speed(iter/s)": 1.438025 }, { "epoch": 3.597746454736301, "grad_norm": 8.423981666564941, "learning_rate": 1.8183382633330255e-05, "loss": 2.373681640625, "memory(GiB)": 77.56, "step": 83975, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.438047 }, { "epoch": 3.5979606700655498, "grad_norm": 5.218179702758789, "learning_rate": 1.817819147348409e-05, "loss": 2.500479888916016, "memory(GiB)": 77.56, "step": 83980, "token_acc": 0.4735202492211838, "train_speed(iter/s)": 1.438054 }, { "epoch": 3.598174885394799, "grad_norm": 10.89646053314209, "learning_rate": 1.817300089011984e-05, "loss": 2.2559423446655273, "memory(GiB)": 77.56, "step": 83985, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.438057 }, { "epoch": 3.598389100724048, "grad_norm": 8.182391166687012, "learning_rate": 1.816781088333153e-05, "loss": 2.2623111724853517, "memory(GiB)": 77.56, "step": 83990, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.438067 }, { "epoch": 3.5986033160532966, "grad_norm": 7.0720415115356445, "learning_rate": 1.8162621453213197e-05, "loss": 2.4828422546386717, "memory(GiB)": 77.56, "step": 83995, "token_acc": 0.50920245398773, "train_speed(iter/s)": 1.438069 }, { "epoch": 3.598817531382546, "grad_norm": 6.682283401489258, "learning_rate": 1.8157432599858814e-05, "loss": 2.1567333221435545, "memory(GiB)": 77.56, "step": 84000, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.438086 }, { "epoch": 3.598817531382546, "eval_loss": 2.127963066101074, "eval_runtime": 15.0222, "eval_samples_per_second": 6.657, "eval_steps_per_second": 6.657, "eval_token_acc": 0.4673495518565941, "step": 84000 }, { "epoch": 3.5990317467117947, "grad_norm": 7.187999248504639, "learning_rate": 1.8152244323362416e-05, "loss": 2.5996124267578127, "memory(GiB)": 77.56, "step": 84005, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.437692 }, { "epoch": 3.5992459620410435, "grad_norm": 6.486800193786621, "learning_rate": 1.8147056623818004e-05, "loss": 2.2904407501220705, "memory(GiB)": 77.56, "step": 84010, "token_acc": 0.5133079847908745, "train_speed(iter/s)": 1.437684 }, { "epoch": 3.599460177370293, "grad_norm": 5.945263862609863, "learning_rate": 1.814186950131954e-05, "loss": 2.2699140548706054, "memory(GiB)": 77.56, "step": 84015, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.437708 }, { "epoch": 3.5996743926995416, "grad_norm": 5.000082969665527, "learning_rate": 1.8136682955960997e-05, "loss": 2.390966796875, "memory(GiB)": 77.56, "step": 84020, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.43771 }, { "epoch": 3.5998886080287904, "grad_norm": 7.152932167053223, "learning_rate": 1.813149698783633e-05, "loss": 2.339006805419922, "memory(GiB)": 77.56, "step": 84025, "token_acc": 0.5213675213675214, "train_speed(iter/s)": 1.437727 }, { "epoch": 3.6001028233580397, "grad_norm": 6.6526923179626465, "learning_rate": 1.812631159703948e-05, "loss": 2.447410011291504, "memory(GiB)": 77.56, "step": 84030, "token_acc": 0.4729241877256318, "train_speed(iter/s)": 1.437715 }, { "epoch": 3.6003170386872885, "grad_norm": 7.088724613189697, "learning_rate": 1.8121126783664406e-05, "loss": 2.273637390136719, "memory(GiB)": 77.56, "step": 84035, "token_acc": 0.5, "train_speed(iter/s)": 1.437709 }, { "epoch": 3.6005312540165373, "grad_norm": 5.185393810272217, "learning_rate": 1.8115942547805025e-05, "loss": 2.0088356018066404, "memory(GiB)": 77.56, "step": 84040, "token_acc": 0.5608108108108109, "train_speed(iter/s)": 1.437726 }, { "epoch": 3.6007454693457865, "grad_norm": 6.586428165435791, "learning_rate": 1.8110758889555256e-05, "loss": 2.476908874511719, "memory(GiB)": 77.56, "step": 84045, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.437712 }, { "epoch": 3.6009596846750354, "grad_norm": 5.801070690155029, "learning_rate": 1.8105575809009008e-05, "loss": 2.1285720825195313, "memory(GiB)": 77.56, "step": 84050, "token_acc": 0.5111821086261981, "train_speed(iter/s)": 1.437725 }, { "epoch": 3.601173900004284, "grad_norm": 5.38275146484375, "learning_rate": 1.8100393306260166e-05, "loss": 2.160166931152344, "memory(GiB)": 77.56, "step": 84055, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.43773 }, { "epoch": 3.6013881153335334, "grad_norm": 6.874888896942139, "learning_rate": 1.809521138140261e-05, "loss": 2.0716358184814454, "memory(GiB)": 77.56, "step": 84060, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.437741 }, { "epoch": 3.6016023306627822, "grad_norm": 5.6881513595581055, "learning_rate": 1.8090030034530247e-05, "loss": 2.5533378601074217, "memory(GiB)": 77.56, "step": 84065, "token_acc": 0.45, "train_speed(iter/s)": 1.437751 }, { "epoch": 3.601816545992031, "grad_norm": 6.07249641418457, "learning_rate": 1.8084849265736927e-05, "loss": 2.2581628799438476, "memory(GiB)": 77.56, "step": 84070, "token_acc": 0.5, "train_speed(iter/s)": 1.437757 }, { "epoch": 3.6020307613212803, "grad_norm": 4.646111488342285, "learning_rate": 1.807966907511648e-05, "loss": 2.203865814208984, "memory(GiB)": 77.56, "step": 84075, "token_acc": 0.5290102389078498, "train_speed(iter/s)": 1.437774 }, { "epoch": 3.602244976650529, "grad_norm": 5.254541397094727, "learning_rate": 1.807448946276279e-05, "loss": 2.1519844055175783, "memory(GiB)": 77.56, "step": 84080, "token_acc": 0.5390334572490706, "train_speed(iter/s)": 1.437792 }, { "epoch": 3.602459191979778, "grad_norm": 6.682652473449707, "learning_rate": 1.806931042876967e-05, "loss": 2.4928821563720702, "memory(GiB)": 77.56, "step": 84085, "token_acc": 0.466403162055336, "train_speed(iter/s)": 1.437794 }, { "epoch": 3.602673407309027, "grad_norm": 7.392765522003174, "learning_rate": 1.806413197323093e-05, "loss": 2.5664039611816407, "memory(GiB)": 77.56, "step": 84090, "token_acc": 0.45, "train_speed(iter/s)": 1.437809 }, { "epoch": 3.602887622638276, "grad_norm": 7.688574314117432, "learning_rate": 1.805895409624041e-05, "loss": 2.4077522277832033, "memory(GiB)": 77.56, "step": 84095, "token_acc": 0.44368600682593856, "train_speed(iter/s)": 1.437818 }, { "epoch": 3.603101837967525, "grad_norm": 8.079208374023438, "learning_rate": 1.80537767978919e-05, "loss": 2.67993106842041, "memory(GiB)": 77.56, "step": 84100, "token_acc": 0.494949494949495, "train_speed(iter/s)": 1.437837 }, { "epoch": 3.603316053296774, "grad_norm": 8.550474166870117, "learning_rate": 1.80486000782792e-05, "loss": 2.0392208099365234, "memory(GiB)": 77.56, "step": 84105, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.437838 }, { "epoch": 3.603530268626023, "grad_norm": 4.775581359863281, "learning_rate": 1.804342393749608e-05, "loss": 2.142267036437988, "memory(GiB)": 77.56, "step": 84110, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437843 }, { "epoch": 3.6037444839552717, "grad_norm": 5.96275520324707, "learning_rate": 1.803824837563629e-05, "loss": 2.696409225463867, "memory(GiB)": 77.56, "step": 84115, "token_acc": 0.4723926380368098, "train_speed(iter/s)": 1.437844 }, { "epoch": 3.603958699284521, "grad_norm": 5.446780204772949, "learning_rate": 1.803307339279363e-05, "loss": 2.10909481048584, "memory(GiB)": 77.56, "step": 84120, "token_acc": 0.5595667870036101, "train_speed(iter/s)": 1.437844 }, { "epoch": 3.6041729146137698, "grad_norm": 5.746261119842529, "learning_rate": 1.8027898989061838e-05, "loss": 2.40936222076416, "memory(GiB)": 77.56, "step": 84125, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437857 }, { "epoch": 3.6043871299430186, "grad_norm": 5.935001373291016, "learning_rate": 1.8022725164534647e-05, "loss": 2.2035167694091795, "memory(GiB)": 77.56, "step": 84130, "token_acc": 0.5310077519379846, "train_speed(iter/s)": 1.43788 }, { "epoch": 3.604601345272268, "grad_norm": 7.3856916427612305, "learning_rate": 1.8017551919305787e-05, "loss": 2.4229381561279295, "memory(GiB)": 77.56, "step": 84135, "token_acc": 0.46923076923076923, "train_speed(iter/s)": 1.437898 }, { "epoch": 3.6048155606015166, "grad_norm": 5.225915431976318, "learning_rate": 1.8012379253468975e-05, "loss": 2.365320587158203, "memory(GiB)": 77.56, "step": 84140, "token_acc": 0.5254901960784314, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.6050297759307655, "grad_norm": 7.766951560974121, "learning_rate": 1.80072071671179e-05, "loss": 2.5186187744140627, "memory(GiB)": 77.56, "step": 84145, "token_acc": 0.4790874524714829, "train_speed(iter/s)": 1.437919 }, { "epoch": 3.6052439912600147, "grad_norm": 4.88001012802124, "learning_rate": 1.8002035660346283e-05, "loss": 2.2217777252197264, "memory(GiB)": 77.56, "step": 84150, "token_acc": 0.5598455598455598, "train_speed(iter/s)": 1.437927 }, { "epoch": 3.6054582065892635, "grad_norm": 5.7520952224731445, "learning_rate": 1.7996864733247822e-05, "loss": 2.175375556945801, "memory(GiB)": 77.56, "step": 84155, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.437929 }, { "epoch": 3.6056724219185123, "grad_norm": 5.575802803039551, "learning_rate": 1.7991694385916176e-05, "loss": 2.2137933731079102, "memory(GiB)": 77.56, "step": 84160, "token_acc": 0.5127388535031847, "train_speed(iter/s)": 1.437876 }, { "epoch": 3.6058866372477616, "grad_norm": 8.660526275634766, "learning_rate": 1.7986524618445017e-05, "loss": 2.3878547668457033, "memory(GiB)": 77.56, "step": 84165, "token_acc": 0.5246478873239436, "train_speed(iter/s)": 1.437852 }, { "epoch": 3.6061008525770104, "grad_norm": 4.151677131652832, "learning_rate": 1.7981355430927992e-05, "loss": 2.5714694976806642, "memory(GiB)": 77.56, "step": 84170, "token_acc": 0.5080906148867314, "train_speed(iter/s)": 1.437848 }, { "epoch": 3.606315067906259, "grad_norm": 5.799408912658691, "learning_rate": 1.797618682345873e-05, "loss": 2.5114513397216798, "memory(GiB)": 77.56, "step": 84175, "token_acc": 0.4664179104477612, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.6065292832355085, "grad_norm": 4.498894691467285, "learning_rate": 1.7971018796130906e-05, "loss": 2.5082462310791014, "memory(GiB)": 77.56, "step": 84180, "token_acc": 0.4585987261146497, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.6067434985647573, "grad_norm": 5.322307109832764, "learning_rate": 1.796585134903812e-05, "loss": 2.4112205505371094, "memory(GiB)": 77.56, "step": 84185, "token_acc": 0.4817073170731707, "train_speed(iter/s)": 1.437838 }, { "epoch": 3.606957713894006, "grad_norm": 7.19744873046875, "learning_rate": 1.7960684482273986e-05, "loss": 2.229107666015625, "memory(GiB)": 77.56, "step": 84190, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.43783 }, { "epoch": 3.6071719292232554, "grad_norm": 5.487410068511963, "learning_rate": 1.7955518195932114e-05, "loss": 2.1874242782592774, "memory(GiB)": 77.56, "step": 84195, "token_acc": 0.5377049180327869, "train_speed(iter/s)": 1.437806 }, { "epoch": 3.607386144552504, "grad_norm": 7.555301189422607, "learning_rate": 1.7950352490106083e-05, "loss": 2.361857032775879, "memory(GiB)": 77.56, "step": 84200, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.607600359881753, "grad_norm": 4.5566816329956055, "learning_rate": 1.794518736488947e-05, "loss": 2.4064708709716798, "memory(GiB)": 77.56, "step": 84205, "token_acc": 0.46981627296587924, "train_speed(iter/s)": 1.43781 }, { "epoch": 3.6078145752110022, "grad_norm": 5.034022808074951, "learning_rate": 1.794002282037587e-05, "loss": 2.3505836486816407, "memory(GiB)": 77.56, "step": 84210, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 1.43782 }, { "epoch": 3.608028790540251, "grad_norm": 5.144019603729248, "learning_rate": 1.793485885665883e-05, "loss": 2.4347957611083983, "memory(GiB)": 77.56, "step": 84215, "token_acc": 0.49079754601226994, "train_speed(iter/s)": 1.437834 }, { "epoch": 3.6082430058695, "grad_norm": 6.86022424697876, "learning_rate": 1.792969547383191e-05, "loss": 2.345616912841797, "memory(GiB)": 77.56, "step": 84220, "token_acc": 0.5052264808362369, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.608457221198749, "grad_norm": 6.0889716148376465, "learning_rate": 1.792453267198862e-05, "loss": 2.384096145629883, "memory(GiB)": 77.56, "step": 84225, "token_acc": 0.4786885245901639, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.608671436527998, "grad_norm": 7.356601715087891, "learning_rate": 1.7919370451222523e-05, "loss": 2.4440357208251955, "memory(GiB)": 77.56, "step": 84230, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 1.437837 }, { "epoch": 3.6088856518572467, "grad_norm": 6.352375030517578, "learning_rate": 1.791420881162711e-05, "loss": 2.1649765014648437, "memory(GiB)": 77.56, "step": 84235, "token_acc": 0.5224913494809689, "train_speed(iter/s)": 1.437848 }, { "epoch": 3.609099867186496, "grad_norm": 5.922575950622559, "learning_rate": 1.790904775329592e-05, "loss": 2.3507728576660156, "memory(GiB)": 77.56, "step": 84240, "token_acc": 0.473015873015873, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.609314082515745, "grad_norm": 6.288422107696533, "learning_rate": 1.7903887276322433e-05, "loss": 2.425602340698242, "memory(GiB)": 77.56, "step": 84245, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.437856 }, { "epoch": 3.6095282978449936, "grad_norm": 5.826930522918701, "learning_rate": 1.789872738080014e-05, "loss": 2.387664222717285, "memory(GiB)": 77.56, "step": 84250, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.437865 }, { "epoch": 3.609742513174243, "grad_norm": 4.966198921203613, "learning_rate": 1.7893568066822507e-05, "loss": 2.2773681640625, "memory(GiB)": 77.56, "step": 84255, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.437871 }, { "epoch": 3.6099567285034917, "grad_norm": 5.8261637687683105, "learning_rate": 1.7888409334483e-05, "loss": 2.4344600677490233, "memory(GiB)": 77.56, "step": 84260, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.43787 }, { "epoch": 3.6101709438327405, "grad_norm": 5.018042087554932, "learning_rate": 1.788325118387509e-05, "loss": 2.3421566009521486, "memory(GiB)": 77.56, "step": 84265, "token_acc": 0.4767025089605735, "train_speed(iter/s)": 1.437876 }, { "epoch": 3.6103851591619898, "grad_norm": 5.248198986053467, "learning_rate": 1.787809361509221e-05, "loss": 2.4936141967773438, "memory(GiB)": 77.56, "step": 84270, "token_acc": 0.478134110787172, "train_speed(iter/s)": 1.43789 }, { "epoch": 3.6105993744912386, "grad_norm": 5.8466715812683105, "learning_rate": 1.7872936628227804e-05, "loss": 2.4604724884033202, "memory(GiB)": 77.56, "step": 84275, "token_acc": 0.49085365853658536, "train_speed(iter/s)": 1.4379 }, { "epoch": 3.6108135898204874, "grad_norm": 5.019824504852295, "learning_rate": 1.7867780223375284e-05, "loss": 2.2418691635131838, "memory(GiB)": 77.56, "step": 84280, "token_acc": 0.5295857988165681, "train_speed(iter/s)": 1.437907 }, { "epoch": 3.6110278051497366, "grad_norm": 5.7772111892700195, "learning_rate": 1.7862624400628074e-05, "loss": 2.338362121582031, "memory(GiB)": 77.56, "step": 84285, "token_acc": 0.5095367847411444, "train_speed(iter/s)": 1.437925 }, { "epoch": 3.6112420204789855, "grad_norm": 5.309464454650879, "learning_rate": 1.785746916007955e-05, "loss": 2.358504867553711, "memory(GiB)": 77.56, "step": 84290, "token_acc": 0.55, "train_speed(iter/s)": 1.437923 }, { "epoch": 3.6114562358082343, "grad_norm": 5.367234706878662, "learning_rate": 1.785231450182314e-05, "loss": 2.503060531616211, "memory(GiB)": 77.56, "step": 84295, "token_acc": 0.4601226993865031, "train_speed(iter/s)": 1.437919 }, { "epoch": 3.6116704511374835, "grad_norm": 4.922787189483643, "learning_rate": 1.7847160425952193e-05, "loss": 2.4450763702392577, "memory(GiB)": 77.56, "step": 84300, "token_acc": 0.5, "train_speed(iter/s)": 1.437916 }, { "epoch": 3.6118846664667323, "grad_norm": 7.75718879699707, "learning_rate": 1.7842006932560113e-05, "loss": 2.114785003662109, "memory(GiB)": 77.56, "step": 84305, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.437919 }, { "epoch": 3.6120988817959816, "grad_norm": 5.380731105804443, "learning_rate": 1.7836854021740245e-05, "loss": 2.2468557357788086, "memory(GiB)": 77.56, "step": 84310, "token_acc": 0.5430711610486891, "train_speed(iter/s)": 1.437919 }, { "epoch": 3.6123130971252304, "grad_norm": 5.819838523864746, "learning_rate": 1.783170169358594e-05, "loss": 2.368028259277344, "memory(GiB)": 77.56, "step": 84315, "token_acc": 0.5033333333333333, "train_speed(iter/s)": 1.437936 }, { "epoch": 3.6125273124544792, "grad_norm": 6.225353240966797, "learning_rate": 1.7826549948190514e-05, "loss": 2.455536460876465, "memory(GiB)": 77.56, "step": 84320, "token_acc": 0.5, "train_speed(iter/s)": 1.437941 }, { "epoch": 3.6127415277837285, "grad_norm": 5.648646831512451, "learning_rate": 1.7821398785647338e-05, "loss": 2.1312772750854494, "memory(GiB)": 77.56, "step": 84325, "token_acc": 0.5364963503649635, "train_speed(iter/s)": 1.437937 }, { "epoch": 3.6129557431129773, "grad_norm": 7.226056098937988, "learning_rate": 1.7816248206049708e-05, "loss": 2.219803810119629, "memory(GiB)": 77.56, "step": 84330, "token_acc": 0.5265017667844523, "train_speed(iter/s)": 1.437907 }, { "epoch": 3.613169958442226, "grad_norm": 5.097196578979492, "learning_rate": 1.7811098209490927e-05, "loss": 2.351849365234375, "memory(GiB)": 77.56, "step": 84335, "token_acc": 0.46394984326018807, "train_speed(iter/s)": 1.437913 }, { "epoch": 3.6133841737714754, "grad_norm": 9.04416561126709, "learning_rate": 1.7805948796064305e-05, "loss": 2.352223777770996, "memory(GiB)": 77.56, "step": 84340, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 1.437891 }, { "epoch": 3.613598389100724, "grad_norm": 4.838322639465332, "learning_rate": 1.780079996586312e-05, "loss": 2.335775947570801, "memory(GiB)": 77.56, "step": 84345, "token_acc": 0.489247311827957, "train_speed(iter/s)": 1.437911 }, { "epoch": 3.613812604429973, "grad_norm": 5.915621757507324, "learning_rate": 1.7795651718980626e-05, "loss": 2.7252878189086913, "memory(GiB)": 77.56, "step": 84350, "token_acc": 0.43661971830985913, "train_speed(iter/s)": 1.437933 }, { "epoch": 3.6140268197592222, "grad_norm": 4.800912380218506, "learning_rate": 1.779050405551013e-05, "loss": 2.3222444534301756, "memory(GiB)": 77.56, "step": 84355, "token_acc": 0.501577287066246, "train_speed(iter/s)": 1.437951 }, { "epoch": 3.614241035088471, "grad_norm": 7.249481678009033, "learning_rate": 1.7785356975544875e-05, "loss": 2.730297660827637, "memory(GiB)": 77.56, "step": 84360, "token_acc": 0.47962382445141066, "train_speed(iter/s)": 1.437947 }, { "epoch": 3.61445525041772, "grad_norm": 5.133399963378906, "learning_rate": 1.7780210479178095e-05, "loss": 2.1141616821289064, "memory(GiB)": 77.56, "step": 84365, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 1.437938 }, { "epoch": 3.614669465746969, "grad_norm": 5.133951187133789, "learning_rate": 1.7775064566503008e-05, "loss": 2.4025285720825194, "memory(GiB)": 77.56, "step": 84370, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.437938 }, { "epoch": 3.614883681076218, "grad_norm": 5.603993892669678, "learning_rate": 1.776991923761287e-05, "loss": 2.3214759826660156, "memory(GiB)": 77.56, "step": 84375, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 1.437962 }, { "epoch": 3.6150978964054667, "grad_norm": 4.5850725173950195, "learning_rate": 1.7764774492600866e-05, "loss": 2.2654388427734373, "memory(GiB)": 77.56, "step": 84380, "token_acc": 0.49846153846153846, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.615312111734716, "grad_norm": 5.483675003051758, "learning_rate": 1.775963033156023e-05, "loss": 2.4457557678222654, "memory(GiB)": 77.56, "step": 84385, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.437977 }, { "epoch": 3.615526327063965, "grad_norm": 5.81123161315918, "learning_rate": 1.7754486754584132e-05, "loss": 2.453587532043457, "memory(GiB)": 77.56, "step": 84390, "token_acc": 0.4887459807073955, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.6157405423932136, "grad_norm": 7.053272724151611, "learning_rate": 1.7749343761765753e-05, "loss": 2.4135324478149416, "memory(GiB)": 77.56, "step": 84395, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.438006 }, { "epoch": 3.615954757722463, "grad_norm": 5.910089492797852, "learning_rate": 1.7744201353198263e-05, "loss": 2.403470993041992, "memory(GiB)": 77.56, "step": 84400, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.438024 }, { "epoch": 3.6161689730517117, "grad_norm": 6.935621738433838, "learning_rate": 1.7739059528974815e-05, "loss": 2.397553253173828, "memory(GiB)": 77.56, "step": 84405, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.438046 }, { "epoch": 3.6163831883809605, "grad_norm": 4.800797939300537, "learning_rate": 1.7733918289188577e-05, "loss": 2.0946538925170897, "memory(GiB)": 77.56, "step": 84410, "token_acc": 0.54421768707483, "train_speed(iter/s)": 1.438052 }, { "epoch": 3.6165974037102098, "grad_norm": 5.917965888977051, "learning_rate": 1.7728777633932688e-05, "loss": 2.4258491516113283, "memory(GiB)": 77.56, "step": 84415, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.438047 }, { "epoch": 3.6168116190394586, "grad_norm": 7.704404830932617, "learning_rate": 1.7723637563300256e-05, "loss": 2.1151952743530273, "memory(GiB)": 77.56, "step": 84420, "token_acc": 0.553030303030303, "train_speed(iter/s)": 1.438062 }, { "epoch": 3.6170258343687074, "grad_norm": 4.804319858551025, "learning_rate": 1.7718498077384406e-05, "loss": 2.4303009033203127, "memory(GiB)": 77.56, "step": 84425, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.438057 }, { "epoch": 3.6172400496979566, "grad_norm": 5.899385452270508, "learning_rate": 1.771335917627825e-05, "loss": 2.1928693771362306, "memory(GiB)": 77.56, "step": 84430, "token_acc": 0.5243055555555556, "train_speed(iter/s)": 1.438084 }, { "epoch": 3.6174542650272055, "grad_norm": 5.611340522766113, "learning_rate": 1.7708220860074858e-05, "loss": 2.2682941436767576, "memory(GiB)": 77.56, "step": 84435, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.438107 }, { "epoch": 3.6176684803564543, "grad_norm": 5.671706676483154, "learning_rate": 1.770308312886736e-05, "loss": 2.516790008544922, "memory(GiB)": 77.56, "step": 84440, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.438111 }, { "epoch": 3.6178826956857035, "grad_norm": 8.051642417907715, "learning_rate": 1.7697945982748786e-05, "loss": 2.4413047790527345, "memory(GiB)": 77.56, "step": 84445, "token_acc": 0.4766666666666667, "train_speed(iter/s)": 1.438125 }, { "epoch": 3.6180969110149523, "grad_norm": 6.651272773742676, "learning_rate": 1.7692809421812246e-05, "loss": 2.2289749145507813, "memory(GiB)": 77.56, "step": 84450, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.43812 }, { "epoch": 3.618311126344201, "grad_norm": 5.098021507263184, "learning_rate": 1.768767344615076e-05, "loss": 2.1872684478759767, "memory(GiB)": 77.56, "step": 84455, "token_acc": 0.519298245614035, "train_speed(iter/s)": 1.438137 }, { "epoch": 3.6185253416734504, "grad_norm": 6.062860488891602, "learning_rate": 1.768253805585739e-05, "loss": 2.5328508377075196, "memory(GiB)": 77.56, "step": 84460, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.438121 }, { "epoch": 3.6187395570026992, "grad_norm": 5.618769645690918, "learning_rate": 1.767740325102514e-05, "loss": 2.3572317123413087, "memory(GiB)": 77.56, "step": 84465, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.438114 }, { "epoch": 3.618953772331948, "grad_norm": 6.2001166343688965, "learning_rate": 1.7672269031747065e-05, "loss": 2.2721626281738283, "memory(GiB)": 77.56, "step": 84470, "token_acc": 0.5180327868852459, "train_speed(iter/s)": 1.438112 }, { "epoch": 3.6191679876611973, "grad_norm": 6.136464595794678, "learning_rate": 1.7667135398116163e-05, "loss": 2.1934186935424806, "memory(GiB)": 77.56, "step": 84475, "token_acc": 0.5282392026578073, "train_speed(iter/s)": 1.438122 }, { "epoch": 3.619382202990446, "grad_norm": 6.489969253540039, "learning_rate": 1.7662002350225437e-05, "loss": 2.4644866943359376, "memory(GiB)": 77.56, "step": 84480, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.438096 }, { "epoch": 3.619596418319695, "grad_norm": 5.997142791748047, "learning_rate": 1.765686988816787e-05, "loss": 2.6942161560058593, "memory(GiB)": 77.56, "step": 84485, "token_acc": 0.4426229508196721, "train_speed(iter/s)": 1.438074 }, { "epoch": 3.619810633648944, "grad_norm": 7.912371635437012, "learning_rate": 1.765173801203645e-05, "loss": 2.4193485260009764, "memory(GiB)": 77.56, "step": 84490, "token_acc": 0.44776119402985076, "train_speed(iter/s)": 1.438078 }, { "epoch": 3.620024848978193, "grad_norm": 6.506412029266357, "learning_rate": 1.7646606721924126e-05, "loss": 2.3007251739501955, "memory(GiB)": 77.56, "step": 84495, "token_acc": 0.5461538461538461, "train_speed(iter/s)": 1.438084 }, { "epoch": 3.620239064307442, "grad_norm": 5.071538925170898, "learning_rate": 1.764147601792388e-05, "loss": 2.6214832305908202, "memory(GiB)": 77.56, "step": 84500, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438106 }, { "epoch": 3.620239064307442, "eval_loss": 2.177626848220825, "eval_runtime": 14.2723, "eval_samples_per_second": 7.007, "eval_steps_per_second": 7.007, "eval_token_acc": 0.4843962008141113, "step": 84500 }, { "epoch": 3.620453279636691, "grad_norm": 6.3451008796691895, "learning_rate": 1.7636345900128654e-05, "loss": 2.256543731689453, "memory(GiB)": 77.56, "step": 84505, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.62066749496594, "grad_norm": 5.751153945922852, "learning_rate": 1.763121636863138e-05, "loss": 2.4733028411865234, "memory(GiB)": 77.56, "step": 84510, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.6208817102951887, "grad_norm": 8.732616424560547, "learning_rate": 1.762608742352499e-05, "loss": 2.6257314682006836, "memory(GiB)": 77.56, "step": 84515, "token_acc": 0.46645367412140576, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.621095925624438, "grad_norm": 7.366602897644043, "learning_rate": 1.7620959064902376e-05, "loss": 2.6168935775756834, "memory(GiB)": 77.56, "step": 84520, "token_acc": 0.46956521739130436, "train_speed(iter/s)": 1.437763 }, { "epoch": 3.6213101409536868, "grad_norm": 4.783069610595703, "learning_rate": 1.7615831292856468e-05, "loss": 2.33459358215332, "memory(GiB)": 77.56, "step": 84525, "token_acc": 0.5047923322683706, "train_speed(iter/s)": 1.437752 }, { "epoch": 3.6215243562829356, "grad_norm": 4.890158653259277, "learning_rate": 1.7610704107480165e-05, "loss": 2.083945465087891, "memory(GiB)": 77.56, "step": 84530, "token_acc": 0.5117647058823529, "train_speed(iter/s)": 1.437751 }, { "epoch": 3.621738571612185, "grad_norm": 6.235067844390869, "learning_rate": 1.760557750886634e-05, "loss": 2.520630645751953, "memory(GiB)": 77.56, "step": 84535, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.6219527869414336, "grad_norm": 5.803600311279297, "learning_rate": 1.7600451497107873e-05, "loss": 2.7453956604003906, "memory(GiB)": 77.56, "step": 84540, "token_acc": 0.42857142857142855, "train_speed(iter/s)": 1.43773 }, { "epoch": 3.6221670022706824, "grad_norm": 5.568145275115967, "learning_rate": 1.7595326072297615e-05, "loss": 2.4914594650268556, "memory(GiB)": 77.56, "step": 84545, "token_acc": 0.46496815286624205, "train_speed(iter/s)": 1.437738 }, { "epoch": 3.6223812175999317, "grad_norm": 4.958303451538086, "learning_rate": 1.759020123452841e-05, "loss": 2.3187862396240235, "memory(GiB)": 77.56, "step": 84550, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.437755 }, { "epoch": 3.6225954329291805, "grad_norm": 6.314785480499268, "learning_rate": 1.758507698389313e-05, "loss": 2.130345916748047, "memory(GiB)": 77.56, "step": 84555, "token_acc": 0.47440273037542663, "train_speed(iter/s)": 1.437768 }, { "epoch": 3.6228096482584293, "grad_norm": 4.95468807220459, "learning_rate": 1.757995332048458e-05, "loss": 2.4282060623168946, "memory(GiB)": 77.56, "step": 84560, "token_acc": 0.5016611295681063, "train_speed(iter/s)": 1.437762 }, { "epoch": 3.6230238635876786, "grad_norm": 6.215773105621338, "learning_rate": 1.7574830244395597e-05, "loss": 2.0673656463623047, "memory(GiB)": 77.56, "step": 84565, "token_acc": 0.5537848605577689, "train_speed(iter/s)": 1.437761 }, { "epoch": 3.6232380789169274, "grad_norm": 9.88821029663086, "learning_rate": 1.756970775571898e-05, "loss": 2.43021297454834, "memory(GiB)": 77.56, "step": 84570, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.437782 }, { "epoch": 3.623452294246176, "grad_norm": 5.7987060546875, "learning_rate": 1.7564585854547522e-05, "loss": 2.2280540466308594, "memory(GiB)": 77.56, "step": 84575, "token_acc": 0.5, "train_speed(iter/s)": 1.437783 }, { "epoch": 3.6236665095754255, "grad_norm": 6.403611660003662, "learning_rate": 1.755946454097401e-05, "loss": 2.66406307220459, "memory(GiB)": 77.56, "step": 84580, "token_acc": 0.46408839779005523, "train_speed(iter/s)": 1.437788 }, { "epoch": 3.6238807249046743, "grad_norm": 6.059589862823486, "learning_rate": 1.755434381509124e-05, "loss": 2.095827102661133, "memory(GiB)": 77.56, "step": 84585, "token_acc": 0.5202952029520295, "train_speed(iter/s)": 1.437806 }, { "epoch": 3.624094940233923, "grad_norm": 7.217907428741455, "learning_rate": 1.754922367699197e-05, "loss": 2.5607112884521483, "memory(GiB)": 77.56, "step": 84590, "token_acc": 0.4533333333333333, "train_speed(iter/s)": 1.437788 }, { "epoch": 3.6243091555631723, "grad_norm": 4.224649429321289, "learning_rate": 1.754410412676894e-05, "loss": 2.3494264602661135, "memory(GiB)": 77.56, "step": 84595, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.437785 }, { "epoch": 3.624523370892421, "grad_norm": 7.713440418243408, "learning_rate": 1.7538985164514925e-05, "loss": 2.3876441955566405, "memory(GiB)": 77.56, "step": 84600, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 1.437774 }, { "epoch": 3.62473758622167, "grad_norm": 4.943674087524414, "learning_rate": 1.7533866790322647e-05, "loss": 2.4379251480102537, "memory(GiB)": 77.56, "step": 84605, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.437794 }, { "epoch": 3.6249518015509192, "grad_norm": 5.520883560180664, "learning_rate": 1.7528749004284812e-05, "loss": 2.2836917877197265, "memory(GiB)": 77.56, "step": 84610, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.437798 }, { "epoch": 3.625166016880168, "grad_norm": 7.87253475189209, "learning_rate": 1.752363180649416e-05, "loss": 2.5776390075683593, "memory(GiB)": 77.56, "step": 84615, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 1.437794 }, { "epoch": 3.625380232209417, "grad_norm": 7.26462459564209, "learning_rate": 1.7518515197043383e-05, "loss": 2.5163551330566407, "memory(GiB)": 77.56, "step": 84620, "token_acc": 0.48, "train_speed(iter/s)": 1.437809 }, { "epoch": 3.625594447538666, "grad_norm": 4.841714382171631, "learning_rate": 1.751339917602518e-05, "loss": 2.1523181915283205, "memory(GiB)": 77.56, "step": 84625, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.437809 }, { "epoch": 3.625808662867915, "grad_norm": 5.723125457763672, "learning_rate": 1.7508283743532223e-05, "loss": 2.596650505065918, "memory(GiB)": 77.56, "step": 84630, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.437832 }, { "epoch": 3.6260228781971637, "grad_norm": 5.5952467918396, "learning_rate": 1.7503168899657184e-05, "loss": 2.447065544128418, "memory(GiB)": 77.56, "step": 84635, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.437814 }, { "epoch": 3.626237093526413, "grad_norm": 5.607433319091797, "learning_rate": 1.749805464449271e-05, "loss": 2.4382219314575195, "memory(GiB)": 77.56, "step": 84640, "token_acc": 0.46417445482866043, "train_speed(iter/s)": 1.437806 }, { "epoch": 3.626451308855662, "grad_norm": 7.0985918045043945, "learning_rate": 1.7492940978131478e-05, "loss": 2.4026002883911133, "memory(GiB)": 77.56, "step": 84645, "token_acc": 0.5, "train_speed(iter/s)": 1.437813 }, { "epoch": 3.6266655241849106, "grad_norm": 6.141039848327637, "learning_rate": 1.748782790066612e-05, "loss": 2.002517318725586, "memory(GiB)": 77.56, "step": 84650, "token_acc": 0.5990338164251208, "train_speed(iter/s)": 1.437823 }, { "epoch": 3.62687973951416, "grad_norm": 5.390111446380615, "learning_rate": 1.7482715412189253e-05, "loss": 2.1562688827514647, "memory(GiB)": 77.56, "step": 84655, "token_acc": 0.546031746031746, "train_speed(iter/s)": 1.437824 }, { "epoch": 3.6270939548434087, "grad_norm": 6.4040303230285645, "learning_rate": 1.7477603512793504e-05, "loss": 2.0433748245239256, "memory(GiB)": 77.56, "step": 84660, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.4378 }, { "epoch": 3.6273081701726575, "grad_norm": 6.350438594818115, "learning_rate": 1.747249220257146e-05, "loss": 2.2895782470703123, "memory(GiB)": 77.56, "step": 84665, "token_acc": 0.536741214057508, "train_speed(iter/s)": 1.437808 }, { "epoch": 3.6275223855019068, "grad_norm": 5.343311786651611, "learning_rate": 1.746738148161573e-05, "loss": 2.2991817474365233, "memory(GiB)": 77.56, "step": 84670, "token_acc": 0.5176056338028169, "train_speed(iter/s)": 1.437817 }, { "epoch": 3.6277366008311556, "grad_norm": 5.636126518249512, "learning_rate": 1.746227135001892e-05, "loss": 2.261979675292969, "memory(GiB)": 77.56, "step": 84675, "token_acc": 0.5311572700296736, "train_speed(iter/s)": 1.43783 }, { "epoch": 3.6279508161604044, "grad_norm": 5.327690124511719, "learning_rate": 1.745716180787359e-05, "loss": 2.309015083312988, "memory(GiB)": 77.56, "step": 84680, "token_acc": 0.5230263157894737, "train_speed(iter/s)": 1.437845 }, { "epoch": 3.6281650314896536, "grad_norm": 5.751166343688965, "learning_rate": 1.7452052855272298e-05, "loss": 2.213973617553711, "memory(GiB)": 77.56, "step": 84685, "token_acc": 0.48514851485148514, "train_speed(iter/s)": 1.437855 }, { "epoch": 3.6283792468189024, "grad_norm": 5.247895240783691, "learning_rate": 1.7446944492307605e-05, "loss": 2.385271453857422, "memory(GiB)": 77.56, "step": 84690, "token_acc": 0.5174825174825175, "train_speed(iter/s)": 1.437852 }, { "epoch": 3.6285934621481513, "grad_norm": 7.145521640777588, "learning_rate": 1.744183671907203e-05, "loss": 2.483266830444336, "memory(GiB)": 77.56, "step": 84695, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.437866 }, { "epoch": 3.6288076774774005, "grad_norm": 6.7839789390563965, "learning_rate": 1.743672953565814e-05, "loss": 2.3552433013916017, "memory(GiB)": 77.56, "step": 84700, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 1.437877 }, { "epoch": 3.6290218928066493, "grad_norm": 7.0621442794799805, "learning_rate": 1.7431622942158445e-05, "loss": 2.123543357849121, "memory(GiB)": 77.56, "step": 84705, "token_acc": 0.5403225806451613, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.629236108135898, "grad_norm": 6.648751258850098, "learning_rate": 1.742651693866545e-05, "loss": 2.3292205810546873, "memory(GiB)": 77.56, "step": 84710, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.437844 }, { "epoch": 3.6294503234651474, "grad_norm": 4.871461391448975, "learning_rate": 1.742141152527166e-05, "loss": 2.1896108627319335, "memory(GiB)": 77.56, "step": 84715, "token_acc": 0.5236486486486487, "train_speed(iter/s)": 1.437863 }, { "epoch": 3.629664538794396, "grad_norm": 5.25560188293457, "learning_rate": 1.7416306702069558e-05, "loss": 2.3394748687744142, "memory(GiB)": 77.56, "step": 84720, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.629878754123645, "grad_norm": 7.446652412414551, "learning_rate": 1.7411202469151606e-05, "loss": 2.706114959716797, "memory(GiB)": 77.56, "step": 84725, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.437895 }, { "epoch": 3.6300929694528943, "grad_norm": 10.241846084594727, "learning_rate": 1.7406098826610313e-05, "loss": 2.4451271057128907, "memory(GiB)": 77.56, "step": 84730, "token_acc": 0.5074074074074074, "train_speed(iter/s)": 1.437922 }, { "epoch": 3.630307184782143, "grad_norm": 5.8900933265686035, "learning_rate": 1.7400995774538116e-05, "loss": 2.5738042831420898, "memory(GiB)": 77.56, "step": 84735, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.437931 }, { "epoch": 3.630521400111392, "grad_norm": 6.171445846557617, "learning_rate": 1.739589331302744e-05, "loss": 2.5489086151123046, "memory(GiB)": 77.56, "step": 84740, "token_acc": 0.4461538461538462, "train_speed(iter/s)": 1.437929 }, { "epoch": 3.630735615440641, "grad_norm": 9.676703453063965, "learning_rate": 1.7390791442170763e-05, "loss": 2.5274946212768556, "memory(GiB)": 77.56, "step": 84745, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.437922 }, { "epoch": 3.63094983076989, "grad_norm": 6.300182819366455, "learning_rate": 1.7385690162060487e-05, "loss": 2.36279296875, "memory(GiB)": 77.56, "step": 84750, "token_acc": 0.5207547169811321, "train_speed(iter/s)": 1.437939 }, { "epoch": 3.631164046099139, "grad_norm": 7.311707973480225, "learning_rate": 1.7380589472789015e-05, "loss": 2.4290876388549805, "memory(GiB)": 77.56, "step": 84755, "token_acc": 0.5038461538461538, "train_speed(iter/s)": 1.437954 }, { "epoch": 3.631378261428388, "grad_norm": 5.435907363891602, "learning_rate": 1.7375489374448776e-05, "loss": 2.5090909957885743, "memory(GiB)": 77.56, "step": 84760, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437943 }, { "epoch": 3.631592476757637, "grad_norm": 6.9370222091674805, "learning_rate": 1.7370389867132154e-05, "loss": 2.445126533508301, "memory(GiB)": 77.56, "step": 84765, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 1.437963 }, { "epoch": 3.6318066920868857, "grad_norm": 5.734540939331055, "learning_rate": 1.7365290950931527e-05, "loss": 2.336766242980957, "memory(GiB)": 77.56, "step": 84770, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.43794 }, { "epoch": 3.632020907416135, "grad_norm": 6.48061990737915, "learning_rate": 1.736019262593927e-05, "loss": 2.2358119964599608, "memory(GiB)": 77.56, "step": 84775, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.437922 }, { "epoch": 3.6322351227453837, "grad_norm": 6.097958564758301, "learning_rate": 1.7355094892247725e-05, "loss": 2.314980316162109, "memory(GiB)": 77.56, "step": 84780, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.437911 }, { "epoch": 3.6324493380746325, "grad_norm": 4.919453144073486, "learning_rate": 1.7349997749949277e-05, "loss": 2.359080696105957, "memory(GiB)": 77.56, "step": 84785, "token_acc": 0.49842271293375395, "train_speed(iter/s)": 1.437904 }, { "epoch": 3.632663553403882, "grad_norm": 5.391432285308838, "learning_rate": 1.734490119913625e-05, "loss": 2.3844417572021483, "memory(GiB)": 77.56, "step": 84790, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.43789 }, { "epoch": 3.6328777687331306, "grad_norm": 6.963891983032227, "learning_rate": 1.7339805239900964e-05, "loss": 2.0690610885620115, "memory(GiB)": 77.56, "step": 84795, "token_acc": 0.562015503875969, "train_speed(iter/s)": 1.437895 }, { "epoch": 3.6330919840623794, "grad_norm": 4.100226879119873, "learning_rate": 1.7334709872335746e-05, "loss": 2.102535629272461, "memory(GiB)": 77.56, "step": 84800, "token_acc": 0.5482866043613707, "train_speed(iter/s)": 1.43789 }, { "epoch": 3.6333061993916287, "grad_norm": 5.69957971572876, "learning_rate": 1.7329615096532905e-05, "loss": 2.4984720230102537, "memory(GiB)": 77.56, "step": 84805, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 1.437909 }, { "epoch": 3.6335204147208775, "grad_norm": 6.949384689331055, "learning_rate": 1.7324520912584713e-05, "loss": 2.5807977676391602, "memory(GiB)": 77.56, "step": 84810, "token_acc": 0.48615384615384616, "train_speed(iter/s)": 1.437915 }, { "epoch": 3.6337346300501263, "grad_norm": 6.165522575378418, "learning_rate": 1.731942732058348e-05, "loss": 2.399928855895996, "memory(GiB)": 77.56, "step": 84815, "token_acc": 0.5015105740181269, "train_speed(iter/s)": 1.437905 }, { "epoch": 3.6339488453793756, "grad_norm": 5.983424663543701, "learning_rate": 1.7314334320621494e-05, "loss": 2.0674776077270507, "memory(GiB)": 77.56, "step": 84820, "token_acc": 0.5826086956521739, "train_speed(iter/s)": 1.43791 }, { "epoch": 3.6341630607086244, "grad_norm": 5.636471271514893, "learning_rate": 1.7309241912791e-05, "loss": 2.504780578613281, "memory(GiB)": 77.56, "step": 84825, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 1.437911 }, { "epoch": 3.634377276037873, "grad_norm": 8.242146492004395, "learning_rate": 1.7304150097184257e-05, "loss": 2.1323854446411135, "memory(GiB)": 77.56, "step": 84830, "token_acc": 0.4957983193277311, "train_speed(iter/s)": 1.437918 }, { "epoch": 3.6345914913671225, "grad_norm": 6.046113967895508, "learning_rate": 1.7299058873893507e-05, "loss": 2.3913726806640625, "memory(GiB)": 77.56, "step": 84835, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 1.437949 }, { "epoch": 3.6348057066963713, "grad_norm": 6.416695594787598, "learning_rate": 1.7293968243010967e-05, "loss": 2.4320699691772463, "memory(GiB)": 77.56, "step": 84840, "token_acc": 0.5387453874538746, "train_speed(iter/s)": 1.437972 }, { "epoch": 3.63501992202562, "grad_norm": 5.738819599151611, "learning_rate": 1.728887820462889e-05, "loss": 2.441440963745117, "memory(GiB)": 77.56, "step": 84845, "token_acc": 0.4676923076923077, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.6352341373548693, "grad_norm": 5.451039791107178, "learning_rate": 1.7283788758839463e-05, "loss": 2.337528419494629, "memory(GiB)": 77.56, "step": 84850, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.437993 }, { "epoch": 3.635448352684118, "grad_norm": 6.397180080413818, "learning_rate": 1.7278699905734897e-05, "loss": 2.381456184387207, "memory(GiB)": 77.56, "step": 84855, "token_acc": 0.5138461538461538, "train_speed(iter/s)": 1.437992 }, { "epoch": 3.635662568013367, "grad_norm": 5.56934118270874, "learning_rate": 1.727361164540738e-05, "loss": 2.3892595291137697, "memory(GiB)": 77.56, "step": 84860, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.635876783342616, "grad_norm": 7.340134620666504, "learning_rate": 1.7268523977949084e-05, "loss": 2.379293441772461, "memory(GiB)": 77.56, "step": 84865, "token_acc": 0.45180722891566266, "train_speed(iter/s)": 1.437977 }, { "epoch": 3.636090998671865, "grad_norm": 7.421058177947998, "learning_rate": 1.7263436903452162e-05, "loss": 2.2533777236938475, "memory(GiB)": 77.56, "step": 84870, "token_acc": 0.5164473684210527, "train_speed(iter/s)": 1.437988 }, { "epoch": 3.636305214001114, "grad_norm": 6.16189432144165, "learning_rate": 1.7258350422008813e-05, "loss": 2.3028507232666016, "memory(GiB)": 77.56, "step": 84875, "token_acc": 0.5419847328244275, "train_speed(iter/s)": 1.437983 }, { "epoch": 3.636519429330363, "grad_norm": 9.566047668457031, "learning_rate": 1.7253264533711155e-05, "loss": 2.36781005859375, "memory(GiB)": 77.56, "step": 84880, "token_acc": 0.5265151515151515, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.636733644659612, "grad_norm": 4.463603973388672, "learning_rate": 1.7248179238651334e-05, "loss": 2.5948728561401366, "memory(GiB)": 77.56, "step": 84885, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.6369478599888607, "grad_norm": 9.498019218444824, "learning_rate": 1.724309453692145e-05, "loss": 2.3666923522949217, "memory(GiB)": 77.56, "step": 84890, "token_acc": 0.4807017543859649, "train_speed(iter/s)": 1.437999 }, { "epoch": 3.63716207531811, "grad_norm": 5.189406394958496, "learning_rate": 1.7238010428613654e-05, "loss": 2.6930814743041993, "memory(GiB)": 77.56, "step": 84895, "token_acc": 0.47214076246334313, "train_speed(iter/s)": 1.437999 }, { "epoch": 3.637376290647359, "grad_norm": 6.054851531982422, "learning_rate": 1.7232926913820015e-05, "loss": 2.129806709289551, "memory(GiB)": 77.56, "step": 84900, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.438022 }, { "epoch": 3.6375905059766076, "grad_norm": 6.123406887054443, "learning_rate": 1.7227843992632658e-05, "loss": 2.1641653060913084, "memory(GiB)": 77.56, "step": 84905, "token_acc": 0.5573770491803278, "train_speed(iter/s)": 1.43804 }, { "epoch": 3.637804721305857, "grad_norm": 5.806746482849121, "learning_rate": 1.7222761665143654e-05, "loss": 2.186139678955078, "memory(GiB)": 77.56, "step": 84910, "token_acc": 0.49851632047477745, "train_speed(iter/s)": 1.438034 }, { "epoch": 3.6380189366351057, "grad_norm": 5.628856182098389, "learning_rate": 1.7217679931445068e-05, "loss": 2.370722770690918, "memory(GiB)": 77.56, "step": 84915, "token_acc": 0.5288461538461539, "train_speed(iter/s)": 1.438036 }, { "epoch": 3.6382331519643545, "grad_norm": 5.779271602630615, "learning_rate": 1.721259879162896e-05, "loss": 2.1186973571777346, "memory(GiB)": 77.56, "step": 84920, "token_acc": 0.5443425076452599, "train_speed(iter/s)": 1.438028 }, { "epoch": 3.6384473672936037, "grad_norm": 5.749781608581543, "learning_rate": 1.720751824578737e-05, "loss": 1.9772274017333984, "memory(GiB)": 77.56, "step": 84925, "token_acc": 0.5932835820895522, "train_speed(iter/s)": 1.438043 }, { "epoch": 3.6386615826228526, "grad_norm": 6.004026889801025, "learning_rate": 1.7202438294012363e-05, "loss": 2.396786117553711, "memory(GiB)": 77.56, "step": 84930, "token_acc": 0.4972067039106145, "train_speed(iter/s)": 1.438043 }, { "epoch": 3.6388757979521014, "grad_norm": 5.7338433265686035, "learning_rate": 1.719735893639595e-05, "loss": 2.284738540649414, "memory(GiB)": 77.56, "step": 84935, "token_acc": 0.46846846846846846, "train_speed(iter/s)": 1.438045 }, { "epoch": 3.6390900132813506, "grad_norm": 5.394063949584961, "learning_rate": 1.7192280173030156e-05, "loss": 2.44345703125, "memory(GiB)": 77.56, "step": 84940, "token_acc": 0.4364820846905538, "train_speed(iter/s)": 1.438024 }, { "epoch": 3.6393042286105994, "grad_norm": 6.275918006896973, "learning_rate": 1.718720200400698e-05, "loss": 2.5629690170288084, "memory(GiB)": 77.56, "step": 84945, "token_acc": 0.4477124183006536, "train_speed(iter/s)": 1.438007 }, { "epoch": 3.6395184439398482, "grad_norm": 5.952244281768799, "learning_rate": 1.718212442941842e-05, "loss": 2.2209205627441406, "memory(GiB)": 77.56, "step": 84950, "token_acc": 0.5536332179930796, "train_speed(iter/s)": 1.438023 }, { "epoch": 3.6397326592690975, "grad_norm": 6.568378448486328, "learning_rate": 1.7177047449356447e-05, "loss": 2.4038665771484373, "memory(GiB)": 77.56, "step": 84955, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.438035 }, { "epoch": 3.6399468745983463, "grad_norm": 6.705170154571533, "learning_rate": 1.7171971063913066e-05, "loss": 2.2902069091796875, "memory(GiB)": 77.56, "step": 84960, "token_acc": 0.5, "train_speed(iter/s)": 1.438019 }, { "epoch": 3.640161089927595, "grad_norm": 5.389596939086914, "learning_rate": 1.716689527318021e-05, "loss": 2.3979835510253906, "memory(GiB)": 77.56, "step": 84965, "token_acc": 0.5327102803738317, "train_speed(iter/s)": 1.438026 }, { "epoch": 3.6403753052568444, "grad_norm": 6.294544696807861, "learning_rate": 1.7161820077249856e-05, "loss": 2.026536750793457, "memory(GiB)": 77.56, "step": 84970, "token_acc": 0.5480427046263345, "train_speed(iter/s)": 1.43804 }, { "epoch": 3.640589520586093, "grad_norm": 5.4064621925354, "learning_rate": 1.715674547621394e-05, "loss": 1.979637908935547, "memory(GiB)": 77.56, "step": 84975, "token_acc": 0.5625, "train_speed(iter/s)": 1.438043 }, { "epoch": 3.640803735915342, "grad_norm": 7.404888153076172, "learning_rate": 1.7151671470164392e-05, "loss": 2.1301578521728515, "memory(GiB)": 77.56, "step": 84980, "token_acc": 0.5020746887966805, "train_speed(iter/s)": 1.438033 }, { "epoch": 3.6410179512445913, "grad_norm": 7.268360614776611, "learning_rate": 1.7146598059193113e-05, "loss": 2.24462890625, "memory(GiB)": 77.56, "step": 84985, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.438032 }, { "epoch": 3.64123216657384, "grad_norm": 6.248690128326416, "learning_rate": 1.714152524339204e-05, "loss": 2.2621837615966798, "memory(GiB)": 77.56, "step": 84990, "token_acc": 0.5340501792114696, "train_speed(iter/s)": 1.438039 }, { "epoch": 3.641446381903089, "grad_norm": 5.92620849609375, "learning_rate": 1.7136453022853067e-05, "loss": 2.410206413269043, "memory(GiB)": 77.56, "step": 84995, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.438051 }, { "epoch": 3.641660597232338, "grad_norm": 6.842756271362305, "learning_rate": 1.7131381397668072e-05, "loss": 2.3542247772216798, "memory(GiB)": 77.56, "step": 85000, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.438066 }, { "epoch": 3.641660597232338, "eval_loss": 2.196579694747925, "eval_runtime": 14.3309, "eval_samples_per_second": 6.978, "eval_steps_per_second": 6.978, "eval_token_acc": 0.48005698005698005, "step": 85000 }, { "epoch": 3.641874812561587, "grad_norm": 6.053701877593994, "learning_rate": 1.7126310367928937e-05, "loss": 2.034068298339844, "memory(GiB)": 77.56, "step": 85005, "token_acc": 0.49589322381930184, "train_speed(iter/s)": 1.437723 }, { "epoch": 3.6420890278908358, "grad_norm": 6.0152764320373535, "learning_rate": 1.712123993372753e-05, "loss": 2.191161346435547, "memory(GiB)": 77.56, "step": 85010, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.437729 }, { "epoch": 3.642303243220085, "grad_norm": 6.775215148925781, "learning_rate": 1.7116170095155687e-05, "loss": 2.4492094039916994, "memory(GiB)": 77.56, "step": 85015, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.437728 }, { "epoch": 3.642517458549334, "grad_norm": 6.784506320953369, "learning_rate": 1.7111100852305283e-05, "loss": 2.437163543701172, "memory(GiB)": 77.56, "step": 85020, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.437746 }, { "epoch": 3.6427316738785827, "grad_norm": 4.5357160568237305, "learning_rate": 1.710603220526814e-05, "loss": 2.5728427886962892, "memory(GiB)": 77.56, "step": 85025, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.642945889207832, "grad_norm": 5.133059024810791, "learning_rate": 1.7100964154136086e-05, "loss": 2.4496696472167967, "memory(GiB)": 77.56, "step": 85030, "token_acc": 0.5015479876160991, "train_speed(iter/s)": 1.437733 }, { "epoch": 3.6431601045370807, "grad_norm": 7.43684720993042, "learning_rate": 1.7095896699000908e-05, "loss": 2.41947078704834, "memory(GiB)": 77.56, "step": 85035, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.437732 }, { "epoch": 3.6433743198663295, "grad_norm": 5.835127830505371, "learning_rate": 1.709082983995444e-05, "loss": 2.408224868774414, "memory(GiB)": 77.56, "step": 85040, "token_acc": 0.47262247838616717, "train_speed(iter/s)": 1.437742 }, { "epoch": 3.643588535195579, "grad_norm": 7.0789995193481445, "learning_rate": 1.7085763577088453e-05, "loss": 2.179537773132324, "memory(GiB)": 77.56, "step": 85045, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.437763 }, { "epoch": 3.6438027505248276, "grad_norm": 4.962024688720703, "learning_rate": 1.708069791049475e-05, "loss": 2.0523859024047852, "memory(GiB)": 77.56, "step": 85050, "token_acc": 0.5477178423236515, "train_speed(iter/s)": 1.437765 }, { "epoch": 3.6440169658540764, "grad_norm": 6.24876594543457, "learning_rate": 1.7075632840265083e-05, "loss": 2.6402164459228517, "memory(GiB)": 77.56, "step": 85055, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.437759 }, { "epoch": 3.6442311811833257, "grad_norm": 5.73577356338501, "learning_rate": 1.7070568366491208e-05, "loss": 2.1512435913085937, "memory(GiB)": 77.56, "step": 85060, "token_acc": 0.5322033898305085, "train_speed(iter/s)": 1.437788 }, { "epoch": 3.6444453965125745, "grad_norm": 5.112829685211182, "learning_rate": 1.7065504489264884e-05, "loss": 2.4789907455444338, "memory(GiB)": 77.56, "step": 85065, "token_acc": 0.5043731778425656, "train_speed(iter/s)": 1.437793 }, { "epoch": 3.6446596118418233, "grad_norm": 4.949240684509277, "learning_rate": 1.7060441208677818e-05, "loss": 2.287200927734375, "memory(GiB)": 77.56, "step": 85070, "token_acc": 0.5449101796407185, "train_speed(iter/s)": 1.437785 }, { "epoch": 3.6448738271710726, "grad_norm": 4.728801727294922, "learning_rate": 1.705537852482178e-05, "loss": 2.389324188232422, "memory(GiB)": 77.56, "step": 85075, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.437767 }, { "epoch": 3.6450880425003214, "grad_norm": 5.657744884490967, "learning_rate": 1.7050316437788466e-05, "loss": 2.286519241333008, "memory(GiB)": 77.56, "step": 85080, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.437783 }, { "epoch": 3.64530225782957, "grad_norm": 7.365163326263428, "learning_rate": 1.704525494766958e-05, "loss": 2.3439968109130858, "memory(GiB)": 77.56, "step": 85085, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.437793 }, { "epoch": 3.6455164731588194, "grad_norm": 6.09114933013916, "learning_rate": 1.7040194054556813e-05, "loss": 2.3588485717773438, "memory(GiB)": 77.56, "step": 85090, "token_acc": 0.52, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.6457306884880682, "grad_norm": 4.982217788696289, "learning_rate": 1.703513375854185e-05, "loss": 2.2544700622558596, "memory(GiB)": 77.56, "step": 85095, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.437784 }, { "epoch": 3.645944903817317, "grad_norm": 6.5849609375, "learning_rate": 1.7030074059716338e-05, "loss": 2.165945625305176, "memory(GiB)": 77.56, "step": 85100, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 1.4378 }, { "epoch": 3.6461591191465663, "grad_norm": 6.173681259155273, "learning_rate": 1.7025014958171993e-05, "loss": 2.4783203125, "memory(GiB)": 77.56, "step": 85105, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.437786 }, { "epoch": 3.646373334475815, "grad_norm": 5.841230869293213, "learning_rate": 1.7019956454000406e-05, "loss": 2.4201770782470704, "memory(GiB)": 77.56, "step": 85110, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.437771 }, { "epoch": 3.646587549805064, "grad_norm": 6.165286064147949, "learning_rate": 1.7014898547293266e-05, "loss": 2.2794252395629884, "memory(GiB)": 77.56, "step": 85115, "token_acc": 0.5527156549520766, "train_speed(iter/s)": 1.437798 }, { "epoch": 3.646801765134313, "grad_norm": 8.642879486083984, "learning_rate": 1.7009841238142188e-05, "loss": 2.401658821105957, "memory(GiB)": 77.56, "step": 85120, "token_acc": 0.5316455696202531, "train_speed(iter/s)": 1.437816 }, { "epoch": 3.647015980463562, "grad_norm": 6.1140828132629395, "learning_rate": 1.7004784526638777e-05, "loss": 2.490953254699707, "memory(GiB)": 77.56, "step": 85125, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.43783 }, { "epoch": 3.647230195792811, "grad_norm": 5.107372283935547, "learning_rate": 1.6999728412874632e-05, "loss": 2.122241973876953, "memory(GiB)": 77.56, "step": 85130, "token_acc": 0.5328467153284672, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.64744441112206, "grad_norm": 7.959492206573486, "learning_rate": 1.6994672896941377e-05, "loss": 2.4018396377563476, "memory(GiB)": 77.56, "step": 85135, "token_acc": 0.5146443514644351, "train_speed(iter/s)": 1.437852 }, { "epoch": 3.647658626451309, "grad_norm": 6.337654113769531, "learning_rate": 1.6989617978930587e-05, "loss": 2.513269805908203, "memory(GiB)": 77.56, "step": 85140, "token_acc": 0.447887323943662, "train_speed(iter/s)": 1.437858 }, { "epoch": 3.6478728417805577, "grad_norm": 7.01861572265625, "learning_rate": 1.6984563658933834e-05, "loss": 2.5734073638916017, "memory(GiB)": 77.56, "step": 85145, "token_acc": 0.4402730375426621, "train_speed(iter/s)": 1.437843 }, { "epoch": 3.648087057109807, "grad_norm": 6.410262584686279, "learning_rate": 1.697950993704268e-05, "loss": 2.494732666015625, "memory(GiB)": 77.56, "step": 85150, "token_acc": 0.5169811320754717, "train_speed(iter/s)": 1.437848 }, { "epoch": 3.6483012724390558, "grad_norm": 6.100495338439941, "learning_rate": 1.6974456813348678e-05, "loss": 2.44498291015625, "memory(GiB)": 77.56, "step": 85155, "token_acc": 0.48299319727891155, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.6485154877683046, "grad_norm": 5.960434436798096, "learning_rate": 1.6969404287943356e-05, "loss": 2.3965576171875, "memory(GiB)": 77.56, "step": 85160, "token_acc": 0.5145985401459854, "train_speed(iter/s)": 1.437848 }, { "epoch": 3.648729703097554, "grad_norm": 5.544156074523926, "learning_rate": 1.6964352360918277e-05, "loss": 2.4419994354248047, "memory(GiB)": 77.56, "step": 85165, "token_acc": 0.4925925925925926, "train_speed(iter/s)": 1.437871 }, { "epoch": 3.6489439184268027, "grad_norm": 7.668413162231445, "learning_rate": 1.6959301032364944e-05, "loss": 2.1129375457763673, "memory(GiB)": 77.56, "step": 85170, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.437865 }, { "epoch": 3.6491581337560515, "grad_norm": 8.6959867477417, "learning_rate": 1.695425030237487e-05, "loss": 2.2139163970947267, "memory(GiB)": 77.56, "step": 85175, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.6493723490853007, "grad_norm": 6.109446048736572, "learning_rate": 1.6949200171039547e-05, "loss": 2.3366411209106444, "memory(GiB)": 77.56, "step": 85180, "token_acc": 0.4899328859060403, "train_speed(iter/s)": 1.437862 }, { "epoch": 3.6495865644145495, "grad_norm": 5.506080150604248, "learning_rate": 1.6944150638450457e-05, "loss": 2.542353057861328, "memory(GiB)": 77.56, "step": 85185, "token_acc": 0.446875, "train_speed(iter/s)": 1.437869 }, { "epoch": 3.6498007797437984, "grad_norm": 5.778169631958008, "learning_rate": 1.6939101704699077e-05, "loss": 2.217597007751465, "memory(GiB)": 77.56, "step": 85190, "token_acc": 0.5412186379928315, "train_speed(iter/s)": 1.437896 }, { "epoch": 3.6500149950730476, "grad_norm": 5.550141334533691, "learning_rate": 1.6934053369876912e-05, "loss": 2.356995964050293, "memory(GiB)": 77.56, "step": 85195, "token_acc": 0.4859154929577465, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.6502292104022964, "grad_norm": 4.3431715965271, "learning_rate": 1.692900563407538e-05, "loss": 2.5239662170410155, "memory(GiB)": 77.56, "step": 85200, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.437874 }, { "epoch": 3.6504434257315452, "grad_norm": 5.0271172523498535, "learning_rate": 1.692395849738594e-05, "loss": 2.307295227050781, "memory(GiB)": 77.56, "step": 85205, "token_acc": 0.5115511551155115, "train_speed(iter/s)": 1.437871 }, { "epoch": 3.6506576410607945, "grad_norm": 5.528509616851807, "learning_rate": 1.6918911959900015e-05, "loss": 2.1722105026245115, "memory(GiB)": 77.56, "step": 85210, "token_acc": 0.5127118644067796, "train_speed(iter/s)": 1.437895 }, { "epoch": 3.6508718563900433, "grad_norm": 5.842065811157227, "learning_rate": 1.691386602170902e-05, "loss": 2.196075439453125, "memory(GiB)": 77.56, "step": 85215, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.437903 }, { "epoch": 3.651086071719292, "grad_norm": 5.357964992523193, "learning_rate": 1.6908820682904387e-05, "loss": 2.408620071411133, "memory(GiB)": 77.56, "step": 85220, "token_acc": 0.4984126984126984, "train_speed(iter/s)": 1.43788 }, { "epoch": 3.6513002870485414, "grad_norm": 6.5117106437683105, "learning_rate": 1.6903775943577516e-05, "loss": 2.6389820098876955, "memory(GiB)": 77.56, "step": 85225, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.437896 }, { "epoch": 3.65151450237779, "grad_norm": 4.985795021057129, "learning_rate": 1.689873180381979e-05, "loss": 2.1541330337524416, "memory(GiB)": 77.56, "step": 85230, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.437909 }, { "epoch": 3.651728717707039, "grad_norm": 6.050500869750977, "learning_rate": 1.689368826372259e-05, "loss": 2.3743358612060548, "memory(GiB)": 77.56, "step": 85235, "token_acc": 0.5136186770428015, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.6519429330362883, "grad_norm": 4.540327072143555, "learning_rate": 1.6888645323377283e-05, "loss": 2.4179924011230467, "memory(GiB)": 77.56, "step": 85240, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.437913 }, { "epoch": 3.652157148365537, "grad_norm": 5.712406635284424, "learning_rate": 1.6883602982875207e-05, "loss": 2.0408262252807616, "memory(GiB)": 77.56, "step": 85245, "token_acc": 0.5481171548117155, "train_speed(iter/s)": 1.43791 }, { "epoch": 3.652371363694786, "grad_norm": 3.920994758605957, "learning_rate": 1.687856124230775e-05, "loss": 2.556990051269531, "memory(GiB)": 77.56, "step": 85250, "token_acc": 0.45931758530183725, "train_speed(iter/s)": 1.437913 }, { "epoch": 3.652585579024035, "grad_norm": 6.97440767288208, "learning_rate": 1.6873520101766223e-05, "loss": 2.2614572525024412, "memory(GiB)": 77.56, "step": 85255, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.4379 }, { "epoch": 3.652799794353284, "grad_norm": 4.942814826965332, "learning_rate": 1.6868479561341936e-05, "loss": 2.188820457458496, "memory(GiB)": 77.56, "step": 85260, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.437921 }, { "epoch": 3.6530140096825328, "grad_norm": 9.058539390563965, "learning_rate": 1.6863439621126243e-05, "loss": 2.0449899673461913, "memory(GiB)": 77.56, "step": 85265, "token_acc": 0.5117845117845118, "train_speed(iter/s)": 1.437927 }, { "epoch": 3.653228225011782, "grad_norm": 6.4220733642578125, "learning_rate": 1.6858400281210423e-05, "loss": 2.203816032409668, "memory(GiB)": 77.56, "step": 85270, "token_acc": 0.5280898876404494, "train_speed(iter/s)": 1.437927 }, { "epoch": 3.653442440341031, "grad_norm": 6.192942142486572, "learning_rate": 1.685336154168576e-05, "loss": 2.2622230529785154, "memory(GiB)": 77.56, "step": 85275, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 1.437935 }, { "epoch": 3.6536566556702796, "grad_norm": 5.6832661628723145, "learning_rate": 1.6848323402643558e-05, "loss": 2.5135812759399414, "memory(GiB)": 77.56, "step": 85280, "token_acc": 0.4937888198757764, "train_speed(iter/s)": 1.437952 }, { "epoch": 3.653870870999529, "grad_norm": 6.137045860290527, "learning_rate": 1.684328586417508e-05, "loss": 2.4684810638427734, "memory(GiB)": 77.56, "step": 85285, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.437941 }, { "epoch": 3.6540850863287777, "grad_norm": 5.741428852081299, "learning_rate": 1.6838248926371575e-05, "loss": 2.2643291473388674, "memory(GiB)": 77.56, "step": 85290, "token_acc": 0.5149501661129569, "train_speed(iter/s)": 1.437959 }, { "epoch": 3.6542993016580265, "grad_norm": 4.961923122406006, "learning_rate": 1.6833212589324304e-05, "loss": 2.289890098571777, "memory(GiB)": 77.56, "step": 85295, "token_acc": 0.5037878787878788, "train_speed(iter/s)": 1.437958 }, { "epoch": 3.654513516987276, "grad_norm": 5.9168171882629395, "learning_rate": 1.68281768531245e-05, "loss": 2.321359634399414, "memory(GiB)": 77.56, "step": 85300, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.437958 }, { "epoch": 3.6547277323165246, "grad_norm": 8.691854476928711, "learning_rate": 1.682314171786337e-05, "loss": 2.42355899810791, "memory(GiB)": 77.56, "step": 85305, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.437971 }, { "epoch": 3.6549419476457734, "grad_norm": 5.665338039398193, "learning_rate": 1.6818107183632176e-05, "loss": 2.434714698791504, "memory(GiB)": 77.56, "step": 85310, "token_acc": 0.48, "train_speed(iter/s)": 1.437951 }, { "epoch": 3.6551561629750227, "grad_norm": 7.878963947296143, "learning_rate": 1.681307325052209e-05, "loss": 2.2658031463623045, "memory(GiB)": 77.56, "step": 85315, "token_acc": 0.510548523206751, "train_speed(iter/s)": 1.437958 }, { "epoch": 3.6553703783042715, "grad_norm": 5.9954071044921875, "learning_rate": 1.6808039918624313e-05, "loss": 2.3356298446655273, "memory(GiB)": 77.56, "step": 85320, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.437962 }, { "epoch": 3.6555845936335203, "grad_norm": 6.986223220825195, "learning_rate": 1.6803007188030036e-05, "loss": 2.3986270904541014, "memory(GiB)": 77.56, "step": 85325, "token_acc": 0.4968152866242038, "train_speed(iter/s)": 1.437966 }, { "epoch": 3.6557988089627695, "grad_norm": 4.978462219238281, "learning_rate": 1.67979750588304e-05, "loss": 2.442804145812988, "memory(GiB)": 77.56, "step": 85330, "token_acc": 0.511326860841424, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.6560130242920184, "grad_norm": 5.887653350830078, "learning_rate": 1.6792943531116594e-05, "loss": 2.3946210861206056, "memory(GiB)": 77.56, "step": 85335, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.438006 }, { "epoch": 3.656227239621267, "grad_norm": 4.989517688751221, "learning_rate": 1.678791260497979e-05, "loss": 2.163958740234375, "memory(GiB)": 77.56, "step": 85340, "token_acc": 0.5298245614035088, "train_speed(iter/s)": 1.438013 }, { "epoch": 3.6564414549505164, "grad_norm": 5.966470241546631, "learning_rate": 1.6782882280511097e-05, "loss": 2.6729021072387695, "memory(GiB)": 77.56, "step": 85345, "token_acc": 0.4709480122324159, "train_speed(iter/s)": 1.437996 }, { "epoch": 3.6566556702797652, "grad_norm": 9.256418228149414, "learning_rate": 1.6777852557801655e-05, "loss": 2.7239952087402344, "memory(GiB)": 77.56, "step": 85350, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.437996 }, { "epoch": 3.656869885609014, "grad_norm": 6.300131797790527, "learning_rate": 1.6772823436942582e-05, "loss": 2.5542022705078127, "memory(GiB)": 77.56, "step": 85355, "token_acc": 0.49101796407185627, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.6570841009382633, "grad_norm": 5.623291969299316, "learning_rate": 1.676779491802496e-05, "loss": 2.6017124176025392, "memory(GiB)": 77.56, "step": 85360, "token_acc": 0.5032258064516129, "train_speed(iter/s)": 1.437973 }, { "epoch": 3.657298316267512, "grad_norm": 5.452791690826416, "learning_rate": 1.676276700113992e-05, "loss": 2.444034385681152, "memory(GiB)": 77.56, "step": 85365, "token_acc": 0.44879518072289154, "train_speed(iter/s)": 1.437982 }, { "epoch": 3.657512531596761, "grad_norm": 4.567123889923096, "learning_rate": 1.6757739686378543e-05, "loss": 2.0532804489135743, "memory(GiB)": 77.56, "step": 85370, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.43798 }, { "epoch": 3.65772674692601, "grad_norm": 7.203108787536621, "learning_rate": 1.6752712973831898e-05, "loss": 2.244427299499512, "memory(GiB)": 77.56, "step": 85375, "token_acc": 0.5497835497835498, "train_speed(iter/s)": 1.43798 }, { "epoch": 3.657940962255259, "grad_norm": 6.722211837768555, "learning_rate": 1.674768686359104e-05, "loss": 2.307176971435547, "memory(GiB)": 77.56, "step": 85380, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.658155177584508, "grad_norm": 4.666999816894531, "learning_rate": 1.6742661355747026e-05, "loss": 2.216294860839844, "memory(GiB)": 77.56, "step": 85385, "token_acc": 0.5133531157270029, "train_speed(iter/s)": 1.437975 }, { "epoch": 3.658369392913757, "grad_norm": 6.5407609939575195, "learning_rate": 1.6737636450390887e-05, "loss": 2.6667196273803713, "memory(GiB)": 77.56, "step": 85390, "token_acc": 0.460431654676259, "train_speed(iter/s)": 1.437983 }, { "epoch": 3.658583608243006, "grad_norm": 5.101613998413086, "learning_rate": 1.673261214761368e-05, "loss": 2.109746551513672, "memory(GiB)": 77.56, "step": 85395, "token_acc": 0.5445205479452054, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.6587978235722547, "grad_norm": 6.342676639556885, "learning_rate": 1.6727588447506408e-05, "loss": 2.3776596069335936, "memory(GiB)": 77.56, "step": 85400, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.659012038901504, "grad_norm": 5.256192684173584, "learning_rate": 1.6722565350160075e-05, "loss": 2.369375801086426, "memory(GiB)": 77.56, "step": 85405, "token_acc": 0.519163763066202, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.6592262542307528, "grad_norm": 5.332834720611572, "learning_rate": 1.6717542855665702e-05, "loss": 2.258794403076172, "memory(GiB)": 77.56, "step": 85410, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.438007 }, { "epoch": 3.6594404695600016, "grad_norm": 5.113658905029297, "learning_rate": 1.671252096411426e-05, "loss": 2.2794450759887694, "memory(GiB)": 77.56, "step": 85415, "token_acc": 0.5015673981191222, "train_speed(iter/s)": 1.438012 }, { "epoch": 3.659654684889251, "grad_norm": 5.8600029945373535, "learning_rate": 1.670749967559671e-05, "loss": 2.317844009399414, "memory(GiB)": 77.56, "step": 85420, "token_acc": 0.5089820359281437, "train_speed(iter/s)": 1.438014 }, { "epoch": 3.6598689002184996, "grad_norm": 5.123144626617432, "learning_rate": 1.670247899020405e-05, "loss": 2.144133186340332, "memory(GiB)": 77.56, "step": 85425, "token_acc": 0.5224913494809689, "train_speed(iter/s)": 1.438042 }, { "epoch": 3.6600831155477485, "grad_norm": 5.756959915161133, "learning_rate": 1.669745890802722e-05, "loss": 2.2468536376953123, "memory(GiB)": 77.56, "step": 85430, "token_acc": 0.5348101265822784, "train_speed(iter/s)": 1.43808 }, { "epoch": 3.6602973308769977, "grad_norm": 5.73333215713501, "learning_rate": 1.6692439429157164e-05, "loss": 2.167403221130371, "memory(GiB)": 77.56, "step": 85435, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.43807 }, { "epoch": 3.6605115462062465, "grad_norm": 7.354944705963135, "learning_rate": 1.6687420553684808e-05, "loss": 2.488819885253906, "memory(GiB)": 77.56, "step": 85440, "token_acc": 0.4389438943894389, "train_speed(iter/s)": 1.438054 }, { "epoch": 3.6607257615354953, "grad_norm": 8.660374641418457, "learning_rate": 1.668240228170108e-05, "loss": 2.0837434768676757, "memory(GiB)": 77.56, "step": 85445, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.438057 }, { "epoch": 3.6609399768647446, "grad_norm": 5.56748628616333, "learning_rate": 1.6677384613296875e-05, "loss": 2.0525686264038088, "memory(GiB)": 77.56, "step": 85450, "token_acc": 0.5536332179930796, "train_speed(iter/s)": 1.438045 }, { "epoch": 3.6611541921939934, "grad_norm": 6.961643218994141, "learning_rate": 1.6672367548563124e-05, "loss": 2.2162675857543945, "memory(GiB)": 77.56, "step": 85455, "token_acc": 0.5703703703703704, "train_speed(iter/s)": 1.43806 }, { "epoch": 3.661368407523242, "grad_norm": 5.4471845626831055, "learning_rate": 1.6667351087590693e-05, "loss": 2.3374263763427736, "memory(GiB)": 77.56, "step": 85460, "token_acc": 0.5460992907801419, "train_speed(iter/s)": 1.438057 }, { "epoch": 3.6615826228524915, "grad_norm": 6.084554672241211, "learning_rate": 1.6662335230470472e-05, "loss": 2.0502607345581056, "memory(GiB)": 77.56, "step": 85465, "token_acc": 0.5099337748344371, "train_speed(iter/s)": 1.43805 }, { "epoch": 3.6617968381817403, "grad_norm": 5.148804187774658, "learning_rate": 1.665731997729332e-05, "loss": 2.5644710540771483, "memory(GiB)": 77.56, "step": 85470, "token_acc": 0.487012987012987, "train_speed(iter/s)": 1.438059 }, { "epoch": 3.662011053510989, "grad_norm": 5.40590763092041, "learning_rate": 1.6652305328150074e-05, "loss": 2.338912582397461, "memory(GiB)": 77.56, "step": 85475, "token_acc": 0.49107142857142855, "train_speed(iter/s)": 1.438069 }, { "epoch": 3.6622252688402384, "grad_norm": 5.661278247833252, "learning_rate": 1.6647291283131606e-05, "loss": 2.632363700866699, "memory(GiB)": 77.56, "step": 85480, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.438083 }, { "epoch": 3.662439484169487, "grad_norm": 11.32270622253418, "learning_rate": 1.664227784232876e-05, "loss": 2.5806529998779295, "memory(GiB)": 77.56, "step": 85485, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.438093 }, { "epoch": 3.662653699498736, "grad_norm": 6.08767032623291, "learning_rate": 1.6637265005832335e-05, "loss": 2.323574256896973, "memory(GiB)": 77.56, "step": 85490, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.438101 }, { "epoch": 3.6628679148279852, "grad_norm": 5.566212177276611, "learning_rate": 1.6632252773733154e-05, "loss": 2.2979804992675783, "memory(GiB)": 77.56, "step": 85495, "token_acc": 0.47865853658536583, "train_speed(iter/s)": 1.438103 }, { "epoch": 3.663082130157234, "grad_norm": 5.454259872436523, "learning_rate": 1.6627241146122017e-05, "loss": 2.145988655090332, "memory(GiB)": 77.56, "step": 85500, "token_acc": 0.5095785440613027, "train_speed(iter/s)": 1.438092 }, { "epoch": 3.663082130157234, "eval_loss": 2.3091440200805664, "eval_runtime": 15.9932, "eval_samples_per_second": 6.253, "eval_steps_per_second": 6.253, "eval_token_acc": 0.4723404255319149, "step": 85500 }, { "epoch": 3.663296345486483, "grad_norm": 5.737945556640625, "learning_rate": 1.66222301230897e-05, "loss": 2.3726125717163087, "memory(GiB)": 77.56, "step": 85505, "token_acc": 0.48023715415019763, "train_speed(iter/s)": 1.437671 }, { "epoch": 3.663510560815732, "grad_norm": 5.8969316482543945, "learning_rate": 1.6617219704727004e-05, "loss": 2.004317855834961, "memory(GiB)": 77.56, "step": 85510, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 1.437685 }, { "epoch": 3.663724776144981, "grad_norm": 8.292634010314941, "learning_rate": 1.6612209891124696e-05, "loss": 2.125049018859863, "memory(GiB)": 77.56, "step": 85515, "token_acc": 0.5236363636363637, "train_speed(iter/s)": 1.437688 }, { "epoch": 3.6639389914742297, "grad_norm": 5.804937362670898, "learning_rate": 1.6607200682373526e-05, "loss": 2.5256513595581054, "memory(GiB)": 77.56, "step": 85520, "token_acc": 0.48562300319488816, "train_speed(iter/s)": 1.4377 }, { "epoch": 3.664153206803479, "grad_norm": 7.040986061096191, "learning_rate": 1.6602192078564237e-05, "loss": 2.322521781921387, "memory(GiB)": 77.56, "step": 85525, "token_acc": 0.4899598393574297, "train_speed(iter/s)": 1.437713 }, { "epoch": 3.664367422132728, "grad_norm": 5.853885650634766, "learning_rate": 1.659718407978757e-05, "loss": 2.239628219604492, "memory(GiB)": 77.56, "step": 85530, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.437712 }, { "epoch": 3.6645816374619766, "grad_norm": 5.997447490692139, "learning_rate": 1.6592176686134232e-05, "loss": 2.4890716552734373, "memory(GiB)": 77.56, "step": 85535, "token_acc": 0.4453125, "train_speed(iter/s)": 1.437721 }, { "epoch": 3.664795852791226, "grad_norm": 6.228614330291748, "learning_rate": 1.6587169897694965e-05, "loss": 2.4528972625732424, "memory(GiB)": 77.56, "step": 85540, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.437729 }, { "epoch": 3.6650100681204747, "grad_norm": 5.8728413581848145, "learning_rate": 1.658216371456046e-05, "loss": 2.0404239654541017, "memory(GiB)": 77.56, "step": 85545, "token_acc": 0.5670995670995671, "train_speed(iter/s)": 1.437736 }, { "epoch": 3.6652242834497235, "grad_norm": 6.953040599822998, "learning_rate": 1.6577158136821408e-05, "loss": 2.360011672973633, "memory(GiB)": 77.56, "step": 85550, "token_acc": 0.47075208913649025, "train_speed(iter/s)": 1.437706 }, { "epoch": 3.6654384987789728, "grad_norm": 6.1802077293396, "learning_rate": 1.6572153164568478e-05, "loss": 2.2948246002197266, "memory(GiB)": 77.56, "step": 85555, "token_acc": 0.49146757679180886, "train_speed(iter/s)": 1.437703 }, { "epoch": 3.6656527141082216, "grad_norm": 6.816431999206543, "learning_rate": 1.6567148797892362e-05, "loss": 2.474551963806152, "memory(GiB)": 77.56, "step": 85560, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.437683 }, { "epoch": 3.6658669294374704, "grad_norm": 6.462721347808838, "learning_rate": 1.6562145036883692e-05, "loss": 2.2315439224243163, "memory(GiB)": 77.56, "step": 85565, "token_acc": 0.5615384615384615, "train_speed(iter/s)": 1.437703 }, { "epoch": 3.6660811447667196, "grad_norm": 6.0261616706848145, "learning_rate": 1.6557141881633152e-05, "loss": 2.2492610931396486, "memory(GiB)": 77.56, "step": 85570, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 1.437711 }, { "epoch": 3.6662953600959685, "grad_norm": 5.113628387451172, "learning_rate": 1.6552139332231354e-05, "loss": 2.332870292663574, "memory(GiB)": 77.56, "step": 85575, "token_acc": 0.5232974910394266, "train_speed(iter/s)": 1.437725 }, { "epoch": 3.6665095754252173, "grad_norm": 4.488736152648926, "learning_rate": 1.6547137388768937e-05, "loss": 2.224553871154785, "memory(GiB)": 77.56, "step": 85580, "token_acc": 0.547550432276657, "train_speed(iter/s)": 1.437751 }, { "epoch": 3.6667237907544665, "grad_norm": 7.389726638793945, "learning_rate": 1.65421360513365e-05, "loss": 2.3076539993286134, "memory(GiB)": 77.56, "step": 85585, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.437734 }, { "epoch": 3.6669380060837153, "grad_norm": 6.888086318969727, "learning_rate": 1.6537135320024654e-05, "loss": 2.440826416015625, "memory(GiB)": 77.56, "step": 85590, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.437716 }, { "epoch": 3.667152221412964, "grad_norm": 6.0712385177612305, "learning_rate": 1.6532135194923982e-05, "loss": 2.1879508972167967, "memory(GiB)": 77.56, "step": 85595, "token_acc": 0.49140893470790376, "train_speed(iter/s)": 1.437727 }, { "epoch": 3.6673664367422134, "grad_norm": 5.9911603927612305, "learning_rate": 1.652713567612509e-05, "loss": 2.3903751373291016, "memory(GiB)": 77.56, "step": 85600, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.437731 }, { "epoch": 3.667580652071462, "grad_norm": 7.412249565124512, "learning_rate": 1.6522136763718533e-05, "loss": 2.385560989379883, "memory(GiB)": 77.56, "step": 85605, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.437735 }, { "epoch": 3.667794867400711, "grad_norm": 6.102590084075928, "learning_rate": 1.6517138457794873e-05, "loss": 2.241424560546875, "memory(GiB)": 77.56, "step": 85610, "token_acc": 0.5075528700906344, "train_speed(iter/s)": 1.437753 }, { "epoch": 3.6680090827299603, "grad_norm": 6.744265079498291, "learning_rate": 1.651214075844466e-05, "loss": 2.0103885650634767, "memory(GiB)": 77.56, "step": 85615, "token_acc": 0.5524193548387096, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.668223298059209, "grad_norm": 6.396752834320068, "learning_rate": 1.6507143665758417e-05, "loss": 2.261327362060547, "memory(GiB)": 77.56, "step": 85620, "token_acc": 0.49480968858131485, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.668437513388458, "grad_norm": 6.04730749130249, "learning_rate": 1.65021471798267e-05, "loss": 2.2555437088012695, "memory(GiB)": 77.56, "step": 85625, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.668651728717707, "grad_norm": 5.6761698722839355, "learning_rate": 1.6497151300739994e-05, "loss": 2.5593610763549806, "memory(GiB)": 77.56, "step": 85630, "token_acc": 0.5, "train_speed(iter/s)": 1.437773 }, { "epoch": 3.668865944046956, "grad_norm": 6.154073238372803, "learning_rate": 1.6492156028588833e-05, "loss": 2.2233081817626954, "memory(GiB)": 77.56, "step": 85635, "token_acc": 0.5236686390532544, "train_speed(iter/s)": 1.437767 }, { "epoch": 3.669080159376205, "grad_norm": 6.3652024269104, "learning_rate": 1.6487161363463705e-05, "loss": 2.3743534088134766, "memory(GiB)": 77.56, "step": 85640, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.437769 }, { "epoch": 3.669294374705454, "grad_norm": 4.999141693115234, "learning_rate": 1.6482167305455086e-05, "loss": 2.3318323135375976, "memory(GiB)": 77.56, "step": 85645, "token_acc": 0.4804270462633452, "train_speed(iter/s)": 1.437772 }, { "epoch": 3.669508590034703, "grad_norm": 5.1600518226623535, "learning_rate": 1.6477173854653433e-05, "loss": 2.5484424591064454, "memory(GiB)": 77.56, "step": 85650, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.437755 }, { "epoch": 3.6697228053639517, "grad_norm": 7.605493068695068, "learning_rate": 1.6472181011149233e-05, "loss": 2.486475944519043, "memory(GiB)": 77.56, "step": 85655, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.43775 }, { "epoch": 3.669937020693201, "grad_norm": 6.143488883972168, "learning_rate": 1.6467188775032925e-05, "loss": 2.3266164779663088, "memory(GiB)": 77.56, "step": 85660, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437736 }, { "epoch": 3.6701512360224497, "grad_norm": 6.265586853027344, "learning_rate": 1.646219714639496e-05, "loss": 2.1769920349121095, "memory(GiB)": 77.56, "step": 85665, "token_acc": 0.5898617511520737, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.6703654513516986, "grad_norm": 6.088485240936279, "learning_rate": 1.6457206125325747e-05, "loss": 2.313176727294922, "memory(GiB)": 77.56, "step": 85670, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.437742 }, { "epoch": 3.670579666680948, "grad_norm": 5.356929779052734, "learning_rate": 1.6452215711915708e-05, "loss": 2.166522216796875, "memory(GiB)": 77.56, "step": 85675, "token_acc": 0.5205479452054794, "train_speed(iter/s)": 1.437741 }, { "epoch": 3.6707938820101966, "grad_norm": 7.862729549407959, "learning_rate": 1.6447225906255237e-05, "loss": 2.045585060119629, "memory(GiB)": 77.56, "step": 85680, "token_acc": 0.5629139072847682, "train_speed(iter/s)": 1.43775 }, { "epoch": 3.6710080973394454, "grad_norm": 10.510821342468262, "learning_rate": 1.6442236708434767e-05, "loss": 2.264939880371094, "memory(GiB)": 77.56, "step": 85685, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.6712223126686947, "grad_norm": 4.985856056213379, "learning_rate": 1.643724811854465e-05, "loss": 2.147949981689453, "memory(GiB)": 77.56, "step": 85690, "token_acc": 0.5183946488294314, "train_speed(iter/s)": 1.43775 }, { "epoch": 3.6714365279979435, "grad_norm": 4.9990363121032715, "learning_rate": 1.643226013667527e-05, "loss": 2.415330505371094, "memory(GiB)": 77.56, "step": 85695, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.437759 }, { "epoch": 3.6716507433271923, "grad_norm": 5.873000144958496, "learning_rate": 1.6427272762916973e-05, "loss": 2.521626663208008, "memory(GiB)": 77.56, "step": 85700, "token_acc": 0.5477178423236515, "train_speed(iter/s)": 1.43776 }, { "epoch": 3.6718649586564416, "grad_norm": 5.7365922927856445, "learning_rate": 1.6422285997360136e-05, "loss": 2.1732915878295898, "memory(GiB)": 77.56, "step": 85705, "token_acc": 0.5225563909774437, "train_speed(iter/s)": 1.437766 }, { "epoch": 3.6720791739856904, "grad_norm": 6.276196479797363, "learning_rate": 1.6417299840095073e-05, "loss": 2.2730649948120116, "memory(GiB)": 77.56, "step": 85710, "token_acc": 0.5182724252491694, "train_speed(iter/s)": 1.437765 }, { "epoch": 3.672293389314939, "grad_norm": 5.621336936950684, "learning_rate": 1.6412314291212143e-05, "loss": 2.4270160675048826, "memory(GiB)": 77.56, "step": 85715, "token_acc": 0.46601941747572817, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.6725076046441885, "grad_norm": 6.542372226715088, "learning_rate": 1.6407329350801647e-05, "loss": 2.3589384078979494, "memory(GiB)": 77.56, "step": 85720, "token_acc": 0.4901315789473684, "train_speed(iter/s)": 1.437763 }, { "epoch": 3.6727218199734373, "grad_norm": 4.933358669281006, "learning_rate": 1.640234501895389e-05, "loss": 2.369892120361328, "memory(GiB)": 77.56, "step": 85725, "token_acc": 0.5014836795252225, "train_speed(iter/s)": 1.43778 }, { "epoch": 3.672936035302686, "grad_norm": 6.267177104949951, "learning_rate": 1.6397361295759172e-05, "loss": 2.6145603179931642, "memory(GiB)": 77.56, "step": 85730, "token_acc": 0.4326241134751773, "train_speed(iter/s)": 1.43779 }, { "epoch": 3.6731502506319353, "grad_norm": 7.575082778930664, "learning_rate": 1.6392378181307777e-05, "loss": 2.1985549926757812, "memory(GiB)": 77.56, "step": 85735, "token_acc": 0.5052264808362369, "train_speed(iter/s)": 1.437789 }, { "epoch": 3.673364465961184, "grad_norm": 5.519351959228516, "learning_rate": 1.6387395675689964e-05, "loss": 2.317628288269043, "memory(GiB)": 77.56, "step": 85740, "token_acc": 0.5098684210526315, "train_speed(iter/s)": 1.437783 }, { "epoch": 3.673578681290433, "grad_norm": 6.22172737121582, "learning_rate": 1.6382413778996018e-05, "loss": 2.307122230529785, "memory(GiB)": 77.56, "step": 85745, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.437807 }, { "epoch": 3.6737928966196822, "grad_norm": 6.005603790283203, "learning_rate": 1.637743249131619e-05, "loss": 2.0492132186889647, "memory(GiB)": 77.56, "step": 85750, "token_acc": 0.5583333333333333, "train_speed(iter/s)": 1.437797 }, { "epoch": 3.674007111948931, "grad_norm": 6.747081756591797, "learning_rate": 1.637245181274071e-05, "loss": 2.4059064865112303, "memory(GiB)": 77.56, "step": 85755, "token_acc": 0.5, "train_speed(iter/s)": 1.437813 }, { "epoch": 3.67422132727818, "grad_norm": 5.078756332397461, "learning_rate": 1.6367471743359814e-05, "loss": 2.4050907135009765, "memory(GiB)": 77.56, "step": 85760, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 1.437824 }, { "epoch": 3.674435542607429, "grad_norm": 4.663774013519287, "learning_rate": 1.636249228326369e-05, "loss": 2.221445083618164, "memory(GiB)": 77.56, "step": 85765, "token_acc": 0.5337620578778135, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.674649757936678, "grad_norm": 7.007661819458008, "learning_rate": 1.63575134325426e-05, "loss": 2.309585952758789, "memory(GiB)": 77.56, "step": 85770, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 1.437836 }, { "epoch": 3.6748639732659267, "grad_norm": 6.709556579589844, "learning_rate": 1.6352535191286693e-05, "loss": 2.2447441101074217, "memory(GiB)": 77.56, "step": 85775, "token_acc": 0.5080385852090032, "train_speed(iter/s)": 1.437826 }, { "epoch": 3.675078188595176, "grad_norm": 6.195633411407471, "learning_rate": 1.6347557559586192e-05, "loss": 2.3480348587036133, "memory(GiB)": 77.56, "step": 85780, "token_acc": 0.48909657320872274, "train_speed(iter/s)": 1.437851 }, { "epoch": 3.675292403924425, "grad_norm": 5.72022008895874, "learning_rate": 1.6342580537531256e-05, "loss": 2.477133560180664, "memory(GiB)": 77.56, "step": 85785, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.6755066192536736, "grad_norm": 7.236176013946533, "learning_rate": 1.633760412521204e-05, "loss": 2.2890499114990233, "memory(GiB)": 77.56, "step": 85790, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.675720834582923, "grad_norm": 6.310311317443848, "learning_rate": 1.6332628322718696e-05, "loss": 2.393000030517578, "memory(GiB)": 77.56, "step": 85795, "token_acc": 0.5, "train_speed(iter/s)": 1.437858 }, { "epoch": 3.6759350499121717, "grad_norm": 6.815532684326172, "learning_rate": 1.6327653130141385e-05, "loss": 2.3370025634765623, "memory(GiB)": 77.56, "step": 85800, "token_acc": 0.5369127516778524, "train_speed(iter/s)": 1.437856 }, { "epoch": 3.6761492652414205, "grad_norm": 7.008269309997559, "learning_rate": 1.6322678547570224e-05, "loss": 2.1994056701660156, "memory(GiB)": 77.56, "step": 85805, "token_acc": 0.5241935483870968, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.6763634805706698, "grad_norm": 8.137584686279297, "learning_rate": 1.6317704575095333e-05, "loss": 2.4131784439086914, "memory(GiB)": 77.56, "step": 85810, "token_acc": 0.46215139442231074, "train_speed(iter/s)": 1.437851 }, { "epoch": 3.6765776958999186, "grad_norm": 4.737418174743652, "learning_rate": 1.631273121280682e-05, "loss": 2.221118927001953, "memory(GiB)": 77.56, "step": 85815, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.6767919112291674, "grad_norm": 6.895242691040039, "learning_rate": 1.630775846079478e-05, "loss": 2.405704879760742, "memory(GiB)": 77.56, "step": 85820, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.6770061265584166, "grad_norm": 5.875631332397461, "learning_rate": 1.630278631914929e-05, "loss": 2.21718807220459, "memory(GiB)": 77.56, "step": 85825, "token_acc": 0.5268456375838926, "train_speed(iter/s)": 1.437858 }, { "epoch": 3.6772203418876654, "grad_norm": 5.645164489746094, "learning_rate": 1.6297814787960453e-05, "loss": 2.2740612030029297, "memory(GiB)": 77.56, "step": 85830, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.437858 }, { "epoch": 3.6774345572169143, "grad_norm": 4.974506378173828, "learning_rate": 1.6292843867318307e-05, "loss": 2.0515157699584963, "memory(GiB)": 77.56, "step": 85835, "token_acc": 0.5476190476190477, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.6776487725461635, "grad_norm": 7.125292778015137, "learning_rate": 1.6287873557312927e-05, "loss": 2.3242237091064455, "memory(GiB)": 77.56, "step": 85840, "token_acc": 0.5258064516129032, "train_speed(iter/s)": 1.437854 }, { "epoch": 3.6778629878754123, "grad_norm": 5.172096252441406, "learning_rate": 1.628290385803433e-05, "loss": 1.9990224838256836, "memory(GiB)": 77.56, "step": 85845, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 1.437888 }, { "epoch": 3.678077203204661, "grad_norm": 6.257838249206543, "learning_rate": 1.6277934769572552e-05, "loss": 2.3137283325195312, "memory(GiB)": 77.56, "step": 85850, "token_acc": 0.516, "train_speed(iter/s)": 1.437893 }, { "epoch": 3.6782914185339104, "grad_norm": 6.082180976867676, "learning_rate": 1.6272966292017616e-05, "loss": 2.568686866760254, "memory(GiB)": 77.56, "step": 85855, "token_acc": 0.45938375350140054, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.678505633863159, "grad_norm": 6.004775524139404, "learning_rate": 1.6267998425459552e-05, "loss": 2.5092388153076173, "memory(GiB)": 77.56, "step": 85860, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.437886 }, { "epoch": 3.678719849192408, "grad_norm": 7.182103157043457, "learning_rate": 1.6263031169988337e-05, "loss": 1.9721778869628905, "memory(GiB)": 77.56, "step": 85865, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.437892 }, { "epoch": 3.6789340645216573, "grad_norm": 5.52949857711792, "learning_rate": 1.625806452569396e-05, "loss": 2.7242692947387694, "memory(GiB)": 77.56, "step": 85870, "token_acc": 0.4539877300613497, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.679148279850906, "grad_norm": 6.061544895172119, "learning_rate": 1.6253098492666397e-05, "loss": 2.4043949127197264, "memory(GiB)": 77.56, "step": 85875, "token_acc": 0.49642857142857144, "train_speed(iter/s)": 1.437888 }, { "epoch": 3.679362495180155, "grad_norm": 5.408303260803223, "learning_rate": 1.6248133070995613e-05, "loss": 2.3352306365966795, "memory(GiB)": 77.56, "step": 85880, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.437869 }, { "epoch": 3.679576710509404, "grad_norm": 5.1435866355896, "learning_rate": 1.6243168260771547e-05, "loss": 2.291178512573242, "memory(GiB)": 77.56, "step": 85885, "token_acc": 0.554006968641115, "train_speed(iter/s)": 1.437881 }, { "epoch": 3.679790925838653, "grad_norm": 6.436489582061768, "learning_rate": 1.623820406208417e-05, "loss": 2.3680397033691407, "memory(GiB)": 77.56, "step": 85890, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.437884 }, { "epoch": 3.680005141167902, "grad_norm": 5.247585296630859, "learning_rate": 1.6233240475023394e-05, "loss": 2.155375862121582, "memory(GiB)": 77.56, "step": 85895, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437899 }, { "epoch": 3.680219356497151, "grad_norm": 7.1036376953125, "learning_rate": 1.622827749967914e-05, "loss": 2.5256114959716798, "memory(GiB)": 77.56, "step": 85900, "token_acc": 0.4979253112033195, "train_speed(iter/s)": 1.437895 }, { "epoch": 3.6804335718264, "grad_norm": 6.2489013671875, "learning_rate": 1.6223315136141327e-05, "loss": 2.473739242553711, "memory(GiB)": 77.56, "step": 85905, "token_acc": 0.458041958041958, "train_speed(iter/s)": 1.437899 }, { "epoch": 3.6806477871556487, "grad_norm": 5.340621471405029, "learning_rate": 1.6218353384499824e-05, "loss": 2.2262123107910154, "memory(GiB)": 77.56, "step": 85910, "token_acc": 0.5264705882352941, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.680862002484898, "grad_norm": 5.009217262268066, "learning_rate": 1.6213392244844554e-05, "loss": 2.2785408020019533, "memory(GiB)": 77.56, "step": 85915, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.6810762178141467, "grad_norm": 5.049722671508789, "learning_rate": 1.6208431717265382e-05, "loss": 2.5480831146240233, "memory(GiB)": 77.56, "step": 85920, "token_acc": 0.46686746987951805, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.6812904331433955, "grad_norm": 4.7372870445251465, "learning_rate": 1.6203471801852148e-05, "loss": 2.421305465698242, "memory(GiB)": 77.56, "step": 85925, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.437892 }, { "epoch": 3.681504648472645, "grad_norm": 8.707403182983398, "learning_rate": 1.6198512498694744e-05, "loss": 1.9206993103027343, "memory(GiB)": 77.56, "step": 85930, "token_acc": 0.56640625, "train_speed(iter/s)": 1.437907 }, { "epoch": 3.6817188638018936, "grad_norm": 4.758282661437988, "learning_rate": 1.6193553807882998e-05, "loss": 2.6549762725830077, "memory(GiB)": 77.56, "step": 85935, "token_acc": 0.4513715710723192, "train_speed(iter/s)": 1.437894 }, { "epoch": 3.6819330791311424, "grad_norm": 5.072307109832764, "learning_rate": 1.6188595729506722e-05, "loss": 2.264400291442871, "memory(GiB)": 77.56, "step": 85940, "token_acc": 0.5040214477211796, "train_speed(iter/s)": 1.437904 }, { "epoch": 3.6821472944603917, "grad_norm": 6.071218490600586, "learning_rate": 1.6183638263655765e-05, "loss": 2.5853748321533203, "memory(GiB)": 77.56, "step": 85945, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.437907 }, { "epoch": 3.6823615097896405, "grad_norm": 6.709692001342773, "learning_rate": 1.617868141041992e-05, "loss": 2.2545841217041014, "memory(GiB)": 77.56, "step": 85950, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.437895 }, { "epoch": 3.6825757251188893, "grad_norm": 4.431982040405273, "learning_rate": 1.6173725169888994e-05, "loss": 2.0270526885986326, "memory(GiB)": 77.56, "step": 85955, "token_acc": 0.5346534653465347, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.6827899404481386, "grad_norm": 5.886152744293213, "learning_rate": 1.6168769542152767e-05, "loss": 2.2336679458618165, "memory(GiB)": 77.56, "step": 85960, "token_acc": 0.5344827586206896, "train_speed(iter/s)": 1.437856 }, { "epoch": 3.6830041557773874, "grad_norm": 5.202999114990234, "learning_rate": 1.6163814527301013e-05, "loss": 2.024606704711914, "memory(GiB)": 77.56, "step": 85965, "token_acc": 0.5525291828793775, "train_speed(iter/s)": 1.437858 }, { "epoch": 3.683218371106636, "grad_norm": 5.611460208892822, "learning_rate": 1.6158860125423485e-05, "loss": 2.552311134338379, "memory(GiB)": 77.56, "step": 85970, "token_acc": 0.45857988165680474, "train_speed(iter/s)": 1.437871 }, { "epoch": 3.6834325864358854, "grad_norm": 5.390848159790039, "learning_rate": 1.615390633660997e-05, "loss": 2.3400020599365234, "memory(GiB)": 77.56, "step": 85975, "token_acc": 0.48314606741573035, "train_speed(iter/s)": 1.437876 }, { "epoch": 3.6836468017651343, "grad_norm": 7.039914131164551, "learning_rate": 1.6148953160950187e-05, "loss": 2.2798843383789062, "memory(GiB)": 77.56, "step": 85980, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.437852 }, { "epoch": 3.683861017094383, "grad_norm": 6.332791328430176, "learning_rate": 1.6144000598533877e-05, "loss": 2.4170612335205077, "memory(GiB)": 77.56, "step": 85985, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.6840752324236323, "grad_norm": 5.428787708282471, "learning_rate": 1.6139048649450756e-05, "loss": 2.3463172912597656, "memory(GiB)": 77.56, "step": 85990, "token_acc": 0.4701492537313433, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.684289447752881, "grad_norm": 5.748325824737549, "learning_rate": 1.6134097313790515e-05, "loss": 2.4689104080200197, "memory(GiB)": 77.56, "step": 85995, "token_acc": 0.4847560975609756, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.68450366308213, "grad_norm": 6.762945175170898, "learning_rate": 1.6129146591642868e-05, "loss": 2.4168350219726564, "memory(GiB)": 77.56, "step": 86000, "token_acc": 0.46875, "train_speed(iter/s)": 1.437848 }, { "epoch": 3.68450366308213, "eval_loss": 2.258465051651001, "eval_runtime": 14.8893, "eval_samples_per_second": 6.716, "eval_steps_per_second": 6.716, "eval_token_acc": 0.4717741935483871, "step": 86000 }, { "epoch": 3.684717878411379, "grad_norm": 5.709500312805176, "learning_rate": 1.612419648309752e-05, "loss": 2.455639457702637, "memory(GiB)": 77.56, "step": 86005, "token_acc": 0.4690522243713733, "train_speed(iter/s)": 1.437482 }, { "epoch": 3.684932093740628, "grad_norm": 5.775025844573975, "learning_rate": 1.6119246988244136e-05, "loss": 2.1570785522460936, "memory(GiB)": 77.56, "step": 86010, "token_acc": 0.5567375886524822, "train_speed(iter/s)": 1.437494 }, { "epoch": 3.685146309069877, "grad_norm": 5.722115516662598, "learning_rate": 1.6114298107172372e-05, "loss": 2.342878723144531, "memory(GiB)": 77.56, "step": 86015, "token_acc": 0.4703389830508475, "train_speed(iter/s)": 1.437513 }, { "epoch": 3.685360524399126, "grad_norm": 5.323507785797119, "learning_rate": 1.6109349839971883e-05, "loss": 2.594965362548828, "memory(GiB)": 77.56, "step": 86020, "token_acc": 0.45345345345345345, "train_speed(iter/s)": 1.437521 }, { "epoch": 3.685574739728375, "grad_norm": 5.1564717292785645, "learning_rate": 1.61044021867323e-05, "loss": 2.1144077301025392, "memory(GiB)": 77.56, "step": 86025, "token_acc": 0.5341614906832298, "train_speed(iter/s)": 1.437509 }, { "epoch": 3.6857889550576237, "grad_norm": 5.714221000671387, "learning_rate": 1.6099455147543284e-05, "loss": 2.1551139831542967, "memory(GiB)": 77.56, "step": 86030, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.437513 }, { "epoch": 3.686003170386873, "grad_norm": 10.161355018615723, "learning_rate": 1.609450872249444e-05, "loss": 2.054778289794922, "memory(GiB)": 77.56, "step": 86035, "token_acc": 0.5247148288973384, "train_speed(iter/s)": 1.437532 }, { "epoch": 3.686217385716122, "grad_norm": 5.03427791595459, "learning_rate": 1.6089562911675377e-05, "loss": 2.21038761138916, "memory(GiB)": 77.56, "step": 86040, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437538 }, { "epoch": 3.6864316010453706, "grad_norm": 3.9941518306732178, "learning_rate": 1.6084617715175686e-05, "loss": 2.171978759765625, "memory(GiB)": 77.56, "step": 86045, "token_acc": 0.5197368421052632, "train_speed(iter/s)": 1.437556 }, { "epoch": 3.68664581637462, "grad_norm": 5.85705041885376, "learning_rate": 1.6079673133084967e-05, "loss": 2.461191940307617, "memory(GiB)": 77.56, "step": 86050, "token_acc": 0.4713375796178344, "train_speed(iter/s)": 1.437547 }, { "epoch": 3.6868600317038687, "grad_norm": 5.6941142082214355, "learning_rate": 1.607472916549277e-05, "loss": 2.5281028747558594, "memory(GiB)": 77.56, "step": 86055, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.437546 }, { "epoch": 3.6870742470331175, "grad_norm": 5.876923084259033, "learning_rate": 1.6069785812488696e-05, "loss": 2.085440444946289, "memory(GiB)": 77.56, "step": 86060, "token_acc": 0.5675675675675675, "train_speed(iter/s)": 1.43756 }, { "epoch": 3.6872884623623667, "grad_norm": 5.975595951080322, "learning_rate": 1.6064843074162285e-05, "loss": 2.307909393310547, "memory(GiB)": 77.56, "step": 86065, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.437543 }, { "epoch": 3.6875026776916155, "grad_norm": 9.323893547058105, "learning_rate": 1.6059900950603052e-05, "loss": 2.2341079711914062, "memory(GiB)": 77.56, "step": 86070, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.437542 }, { "epoch": 3.6877168930208644, "grad_norm": 5.0056562423706055, "learning_rate": 1.6054959441900574e-05, "loss": 2.4665449142456053, "memory(GiB)": 77.56, "step": 86075, "token_acc": 0.4955223880597015, "train_speed(iter/s)": 1.437528 }, { "epoch": 3.6879311083501136, "grad_norm": 6.423130512237549, "learning_rate": 1.605001854814434e-05, "loss": 2.401065635681152, "memory(GiB)": 77.56, "step": 86080, "token_acc": 0.5060728744939271, "train_speed(iter/s)": 1.437518 }, { "epoch": 3.6881453236793624, "grad_norm": 5.566985130310059, "learning_rate": 1.6045078269423864e-05, "loss": 2.0344518661499023, "memory(GiB)": 77.56, "step": 86085, "token_acc": 0.5658914728682171, "train_speed(iter/s)": 1.43752 }, { "epoch": 3.6883595390086112, "grad_norm": 7.832764625549316, "learning_rate": 1.6040138605828653e-05, "loss": 2.373078727722168, "memory(GiB)": 77.56, "step": 86090, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.4375 }, { "epoch": 3.6885737543378605, "grad_norm": 4.745559215545654, "learning_rate": 1.6035199557448194e-05, "loss": 2.058857727050781, "memory(GiB)": 77.56, "step": 86095, "token_acc": 0.5244755244755245, "train_speed(iter/s)": 1.43751 }, { "epoch": 3.6887879696671093, "grad_norm": 5.882087707519531, "learning_rate": 1.6030261124371953e-05, "loss": 2.2976215362548826, "memory(GiB)": 77.56, "step": 86100, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.437522 }, { "epoch": 3.689002184996358, "grad_norm": 5.777613162994385, "learning_rate": 1.60253233066894e-05, "loss": 2.481217956542969, "memory(GiB)": 77.56, "step": 86105, "token_acc": 0.486404833836858, "train_speed(iter/s)": 1.437518 }, { "epoch": 3.6892164003256074, "grad_norm": 7.4842939376831055, "learning_rate": 1.6020386104489986e-05, "loss": 2.264832878112793, "memory(GiB)": 77.56, "step": 86110, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.437519 }, { "epoch": 3.689430615654856, "grad_norm": 5.6255083084106445, "learning_rate": 1.6015449517863133e-05, "loss": 2.1634780883789064, "memory(GiB)": 77.56, "step": 86115, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.437521 }, { "epoch": 3.689644830984105, "grad_norm": 6.240344524383545, "learning_rate": 1.6010513546898316e-05, "loss": 2.3222721099853514, "memory(GiB)": 77.56, "step": 86120, "token_acc": 0.546031746031746, "train_speed(iter/s)": 1.43753 }, { "epoch": 3.6898590463133543, "grad_norm": 6.719814300537109, "learning_rate": 1.600557819168493e-05, "loss": 2.174650955200195, "memory(GiB)": 77.56, "step": 86125, "token_acc": 0.5230125523012552, "train_speed(iter/s)": 1.437532 }, { "epoch": 3.690073261642603, "grad_norm": 5.500310897827148, "learning_rate": 1.600064345231238e-05, "loss": 2.5471914291381834, "memory(GiB)": 77.56, "step": 86130, "token_acc": 0.4562334217506631, "train_speed(iter/s)": 1.437539 }, { "epoch": 3.690287476971852, "grad_norm": 4.539886951446533, "learning_rate": 1.5995709328870067e-05, "loss": 2.1795543670654296, "memory(GiB)": 77.56, "step": 86135, "token_acc": 0.5437956204379562, "train_speed(iter/s)": 1.437531 }, { "epoch": 3.690501692301101, "grad_norm": 5.665561199188232, "learning_rate": 1.5990775821447363e-05, "loss": 2.526131248474121, "memory(GiB)": 77.56, "step": 86140, "token_acc": 0.44984802431610943, "train_speed(iter/s)": 1.437537 }, { "epoch": 3.69071590763035, "grad_norm": 6.106988906860352, "learning_rate": 1.598584293013366e-05, "loss": 2.3974456787109375, "memory(GiB)": 77.56, "step": 86145, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.437559 }, { "epoch": 3.6909301229595988, "grad_norm": 6.5132598876953125, "learning_rate": 1.5980910655018332e-05, "loss": 2.2827762603759765, "memory(GiB)": 77.56, "step": 86150, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.43758 }, { "epoch": 3.691144338288848, "grad_norm": 6.577299118041992, "learning_rate": 1.5975978996190727e-05, "loss": 2.463894081115723, "memory(GiB)": 77.56, "step": 86155, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.437586 }, { "epoch": 3.691358553618097, "grad_norm": 4.83075475692749, "learning_rate": 1.5971047953740174e-05, "loss": 2.4222484588623048, "memory(GiB)": 77.56, "step": 86160, "token_acc": 0.47953216374269003, "train_speed(iter/s)": 1.437574 }, { "epoch": 3.6915727689473457, "grad_norm": 6.76957893371582, "learning_rate": 1.5966117527756013e-05, "loss": 2.2134328842163087, "memory(GiB)": 77.56, "step": 86165, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.437585 }, { "epoch": 3.691786984276595, "grad_norm": 4.991066932678223, "learning_rate": 1.596118771832754e-05, "loss": 2.489685821533203, "memory(GiB)": 77.56, "step": 86170, "token_acc": 0.5229681978798587, "train_speed(iter/s)": 1.437597 }, { "epoch": 3.6920011996058437, "grad_norm": 7.68670129776001, "learning_rate": 1.59562585255441e-05, "loss": 2.3423063278198244, "memory(GiB)": 77.56, "step": 86175, "token_acc": 0.4626865671641791, "train_speed(iter/s)": 1.437614 }, { "epoch": 3.6922154149350925, "grad_norm": 4.818495750427246, "learning_rate": 1.5951329949494976e-05, "loss": 2.311749076843262, "memory(GiB)": 77.56, "step": 86180, "token_acc": 0.504424778761062, "train_speed(iter/s)": 1.437615 }, { "epoch": 3.692429630264342, "grad_norm": 6.807466506958008, "learning_rate": 1.5946401990269444e-05, "loss": 2.1719968795776365, "memory(GiB)": 77.56, "step": 86185, "token_acc": 0.5176056338028169, "train_speed(iter/s)": 1.437624 }, { "epoch": 3.6926438455935906, "grad_norm": 6.530341625213623, "learning_rate": 1.5941474647956788e-05, "loss": 2.429329681396484, "memory(GiB)": 77.56, "step": 86190, "token_acc": 0.47674418604651164, "train_speed(iter/s)": 1.43763 }, { "epoch": 3.6928580609228394, "grad_norm": 6.639760494232178, "learning_rate": 1.5936547922646268e-05, "loss": 2.506332015991211, "memory(GiB)": 77.56, "step": 86195, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.437623 }, { "epoch": 3.6930722762520887, "grad_norm": 6.662483215332031, "learning_rate": 1.5931621814427118e-05, "loss": 2.110951614379883, "memory(GiB)": 77.56, "step": 86200, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.437631 }, { "epoch": 3.6932864915813375, "grad_norm": 4.36579704284668, "learning_rate": 1.5926696323388618e-05, "loss": 2.3908714294433593, "memory(GiB)": 77.56, "step": 86205, "token_acc": 0.5097402597402597, "train_speed(iter/s)": 1.437625 }, { "epoch": 3.6935007069105863, "grad_norm": 6.042758941650391, "learning_rate": 1.5921771449619977e-05, "loss": 2.6326574325561523, "memory(GiB)": 77.56, "step": 86210, "token_acc": 0.46875, "train_speed(iter/s)": 1.437611 }, { "epoch": 3.6937149222398356, "grad_norm": 6.384288311004639, "learning_rate": 1.591684719321041e-05, "loss": 2.0806385040283204, "memory(GiB)": 77.56, "step": 86215, "token_acc": 0.5269230769230769, "train_speed(iter/s)": 1.437621 }, { "epoch": 3.6939291375690844, "grad_norm": 5.6147379875183105, "learning_rate": 1.5911923554249113e-05, "loss": 2.5784053802490234, "memory(GiB)": 77.56, "step": 86220, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.43764 }, { "epoch": 3.694143352898333, "grad_norm": 6.724876880645752, "learning_rate": 1.590700053282532e-05, "loss": 2.571179962158203, "memory(GiB)": 77.56, "step": 86225, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 1.43764 }, { "epoch": 3.6943575682275824, "grad_norm": 4.180628299713135, "learning_rate": 1.590207812902817e-05, "loss": 2.3681308746337892, "memory(GiB)": 77.56, "step": 86230, "token_acc": 0.5068027210884354, "train_speed(iter/s)": 1.437635 }, { "epoch": 3.6945717835568312, "grad_norm": 5.39839506149292, "learning_rate": 1.5897156342946883e-05, "loss": 2.4671966552734377, "memory(GiB)": 77.56, "step": 86235, "token_acc": 0.44981412639405205, "train_speed(iter/s)": 1.437657 }, { "epoch": 3.69478599888608, "grad_norm": 5.495696067810059, "learning_rate": 1.58922351746706e-05, "loss": 2.736591911315918, "memory(GiB)": 77.56, "step": 86240, "token_acc": 0.4702194357366771, "train_speed(iter/s)": 1.437674 }, { "epoch": 3.6950002142153293, "grad_norm": 7.17864465713501, "learning_rate": 1.5887314624288467e-05, "loss": 2.408966636657715, "memory(GiB)": 77.56, "step": 86245, "token_acc": 0.450354609929078, "train_speed(iter/s)": 1.437687 }, { "epoch": 3.695214429544578, "grad_norm": 6.017739295959473, "learning_rate": 1.5882394691889636e-05, "loss": 2.2471899032592773, "memory(GiB)": 77.56, "step": 86250, "token_acc": 0.5348101265822784, "train_speed(iter/s)": 1.437697 }, { "epoch": 3.695428644873827, "grad_norm": 6.043412685394287, "learning_rate": 1.587747537756323e-05, "loss": 2.664326858520508, "memory(GiB)": 77.56, "step": 86255, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.437672 }, { "epoch": 3.695642860203076, "grad_norm": 5.255742073059082, "learning_rate": 1.5872556681398342e-05, "loss": 2.5010602951049803, "memory(GiB)": 77.56, "step": 86260, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.437673 }, { "epoch": 3.695857075532325, "grad_norm": 4.185830593109131, "learning_rate": 1.5867638603484132e-05, "loss": 2.6247385025024412, "memory(GiB)": 77.56, "step": 86265, "token_acc": 0.48255813953488375, "train_speed(iter/s)": 1.437703 }, { "epoch": 3.696071290861574, "grad_norm": 6.7779998779296875, "learning_rate": 1.5862721143909658e-05, "loss": 2.274894905090332, "memory(GiB)": 77.56, "step": 86270, "token_acc": 0.5076335877862596, "train_speed(iter/s)": 1.43771 }, { "epoch": 3.696285506190823, "grad_norm": 6.136999607086182, "learning_rate": 1.585780430276402e-05, "loss": 2.0463333129882812, "memory(GiB)": 77.56, "step": 86275, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.437705 }, { "epoch": 3.696499721520072, "grad_norm": 4.349707126617432, "learning_rate": 1.585288808013628e-05, "loss": 2.291848564147949, "memory(GiB)": 77.56, "step": 86280, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.437726 }, { "epoch": 3.6967139368493207, "grad_norm": 6.033775329589844, "learning_rate": 1.584797247611549e-05, "loss": 2.2331987380981446, "memory(GiB)": 77.56, "step": 86285, "token_acc": 0.5394736842105263, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.69692815217857, "grad_norm": 7.3697662353515625, "learning_rate": 1.5843057490790737e-05, "loss": 2.4206491470336915, "memory(GiB)": 77.56, "step": 86290, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.437752 }, { "epoch": 3.6971423675078188, "grad_norm": 6.867831707000732, "learning_rate": 1.583814312425102e-05, "loss": 2.323303985595703, "memory(GiB)": 77.56, "step": 86295, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.437759 }, { "epoch": 3.6973565828370676, "grad_norm": 6.913043975830078, "learning_rate": 1.5833229376585402e-05, "loss": 2.6895069122314452, "memory(GiB)": 77.56, "step": 86300, "token_acc": 0.47307692307692306, "train_speed(iter/s)": 1.437766 }, { "epoch": 3.697570798166317, "grad_norm": 4.569516181945801, "learning_rate": 1.5828316247882884e-05, "loss": 2.1337255477905273, "memory(GiB)": 77.56, "step": 86305, "token_acc": 0.5206611570247934, "train_speed(iter/s)": 1.437753 }, { "epoch": 3.6977850134955657, "grad_norm": 7.96969747543335, "learning_rate": 1.582340373823248e-05, "loss": 2.3565837860107424, "memory(GiB)": 77.56, "step": 86310, "token_acc": 0.47249190938511326, "train_speed(iter/s)": 1.437757 }, { "epoch": 3.6979992288248145, "grad_norm": 4.90771484375, "learning_rate": 1.581849184772315e-05, "loss": 2.3359630584716795, "memory(GiB)": 77.56, "step": 86315, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.437758 }, { "epoch": 3.6982134441540637, "grad_norm": 5.176599979400635, "learning_rate": 1.581358057644393e-05, "loss": 2.3966684341430664, "memory(GiB)": 77.56, "step": 86320, "token_acc": 0.504950495049505, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.6984276594833125, "grad_norm": 4.707461357116699, "learning_rate": 1.5808669924483765e-05, "loss": 2.5782777786254885, "memory(GiB)": 77.56, "step": 86325, "token_acc": 0.5122699386503068, "train_speed(iter/s)": 1.437762 }, { "epoch": 3.6986418748125613, "grad_norm": 6.585312366485596, "learning_rate": 1.5803759891931613e-05, "loss": 2.290018081665039, "memory(GiB)": 77.56, "step": 86330, "token_acc": 0.5380577427821522, "train_speed(iter/s)": 1.437783 }, { "epoch": 3.6988560901418106, "grad_norm": 5.981418609619141, "learning_rate": 1.579885047887644e-05, "loss": 2.3021392822265625, "memory(GiB)": 77.56, "step": 86335, "token_acc": 0.5068027210884354, "train_speed(iter/s)": 1.437797 }, { "epoch": 3.6990703054710594, "grad_norm": 5.6736674308776855, "learning_rate": 1.5793941685407165e-05, "loss": 2.3263540267944336, "memory(GiB)": 77.56, "step": 86340, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.437795 }, { "epoch": 3.6992845208003082, "grad_norm": 4.543986797332764, "learning_rate": 1.5789033511612712e-05, "loss": 2.0895456314086913, "memory(GiB)": 77.56, "step": 86345, "token_acc": 0.5611510791366906, "train_speed(iter/s)": 1.437798 }, { "epoch": 3.6994987361295575, "grad_norm": 5.240317344665527, "learning_rate": 1.5784125957582025e-05, "loss": 2.2390132904052735, "memory(GiB)": 77.56, "step": 86350, "token_acc": 0.5, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.6997129514588063, "grad_norm": 6.601179599761963, "learning_rate": 1.577921902340399e-05, "loss": 2.477798271179199, "memory(GiB)": 77.56, "step": 86355, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437801 }, { "epoch": 3.699927166788055, "grad_norm": 4.889268398284912, "learning_rate": 1.5774312709167505e-05, "loss": 2.4026519775390627, "memory(GiB)": 77.56, "step": 86360, "token_acc": 0.46839080459770116, "train_speed(iter/s)": 1.437798 }, { "epoch": 3.7001413821173044, "grad_norm": 4.572332382202148, "learning_rate": 1.576940701496144e-05, "loss": 2.560747528076172, "memory(GiB)": 77.56, "step": 86365, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 1.437798 }, { "epoch": 3.700355597446553, "grad_norm": 6.059507369995117, "learning_rate": 1.5764501940874688e-05, "loss": 2.1256687164306642, "memory(GiB)": 77.56, "step": 86370, "token_acc": 0.5311475409836065, "train_speed(iter/s)": 1.437809 }, { "epoch": 3.700569812775802, "grad_norm": 4.444861888885498, "learning_rate": 1.5759597486996086e-05, "loss": 2.604166030883789, "memory(GiB)": 77.56, "step": 86375, "token_acc": 0.4954128440366973, "train_speed(iter/s)": 1.437802 }, { "epoch": 3.7007840281050512, "grad_norm": 7.19088888168335, "learning_rate": 1.5754693653414515e-05, "loss": 2.1890735626220703, "memory(GiB)": 77.56, "step": 86380, "token_acc": 0.5582329317269076, "train_speed(iter/s)": 1.437823 }, { "epoch": 3.7009982434343, "grad_norm": 7.390120029449463, "learning_rate": 1.5749790440218787e-05, "loss": 2.507330322265625, "memory(GiB)": 77.56, "step": 86385, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.437845 }, { "epoch": 3.701212458763549, "grad_norm": 5.598202228546143, "learning_rate": 1.5744887847497735e-05, "loss": 2.129720687866211, "memory(GiB)": 77.56, "step": 86390, "token_acc": 0.536, "train_speed(iter/s)": 1.437864 }, { "epoch": 3.701426674092798, "grad_norm": 5.509608268737793, "learning_rate": 1.573998587534018e-05, "loss": 2.264794921875, "memory(GiB)": 77.56, "step": 86395, "token_acc": 0.551948051948052, "train_speed(iter/s)": 1.437871 }, { "epoch": 3.701640889422047, "grad_norm": 8.979785919189453, "learning_rate": 1.5735084523834913e-05, "loss": 2.2470375061035157, "memory(GiB)": 77.56, "step": 86400, "token_acc": 0.4612794612794613, "train_speed(iter/s)": 1.43788 }, { "epoch": 3.7018551047512958, "grad_norm": 5.396533012390137, "learning_rate": 1.573018379307072e-05, "loss": 2.436020851135254, "memory(GiB)": 77.56, "step": 86405, "token_acc": 0.5244755244755245, "train_speed(iter/s)": 1.437893 }, { "epoch": 3.702069320080545, "grad_norm": 5.912084579467773, "learning_rate": 1.572528368313641e-05, "loss": 1.9977581024169921, "memory(GiB)": 77.56, "step": 86410, "token_acc": 0.5304659498207885, "train_speed(iter/s)": 1.437878 }, { "epoch": 3.702283535409794, "grad_norm": 6.608034610748291, "learning_rate": 1.572038419412074e-05, "loss": 2.1554019927978514, "memory(GiB)": 77.56, "step": 86415, "token_acc": 0.5, "train_speed(iter/s)": 1.4379 }, { "epoch": 3.7024977507390426, "grad_norm": 6.69238805770874, "learning_rate": 1.5715485326112467e-05, "loss": 2.2238237380981447, "memory(GiB)": 77.56, "step": 86420, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.437894 }, { "epoch": 3.702711966068292, "grad_norm": 7.633655071258545, "learning_rate": 1.5710587079200335e-05, "loss": 2.3384639739990236, "memory(GiB)": 77.56, "step": 86425, "token_acc": 0.5, "train_speed(iter/s)": 1.437914 }, { "epoch": 3.7029261813975407, "grad_norm": 5.715300559997559, "learning_rate": 1.5705689453473072e-05, "loss": 2.399251937866211, "memory(GiB)": 77.56, "step": 86430, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.437926 }, { "epoch": 3.7031403967267895, "grad_norm": 5.963386535644531, "learning_rate": 1.570079244901943e-05, "loss": 2.3137426376342773, "memory(GiB)": 77.56, "step": 86435, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.437927 }, { "epoch": 3.7033546120560388, "grad_norm": 11.06846809387207, "learning_rate": 1.569589606592809e-05, "loss": 2.2921144485473635, "memory(GiB)": 77.56, "step": 86440, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.437936 }, { "epoch": 3.7035688273852876, "grad_norm": 8.007438659667969, "learning_rate": 1.569100030428779e-05, "loss": 2.45764274597168, "memory(GiB)": 77.56, "step": 86445, "token_acc": 0.4900398406374502, "train_speed(iter/s)": 1.437931 }, { "epoch": 3.7037830427145364, "grad_norm": 6.153940200805664, "learning_rate": 1.568610516418721e-05, "loss": 2.4899049758911134, "memory(GiB)": 77.56, "step": 86450, "token_acc": 0.509375, "train_speed(iter/s)": 1.437918 }, { "epoch": 3.7039972580437857, "grad_norm": 6.3502397537231445, "learning_rate": 1.5681210645715017e-05, "loss": 2.237340545654297, "memory(GiB)": 77.56, "step": 86455, "token_acc": 0.5144694533762058, "train_speed(iter/s)": 1.437909 }, { "epoch": 3.7042114733730345, "grad_norm": 7.710557460784912, "learning_rate": 1.567631674895987e-05, "loss": 2.398152160644531, "memory(GiB)": 77.56, "step": 86460, "token_acc": 0.5063829787234042, "train_speed(iter/s)": 1.437938 }, { "epoch": 3.7044256887022833, "grad_norm": 4.601808547973633, "learning_rate": 1.567142347401046e-05, "loss": 2.2292415618896486, "memory(GiB)": 77.56, "step": 86465, "token_acc": 0.5, "train_speed(iter/s)": 1.437932 }, { "epoch": 3.7046399040315325, "grad_norm": 5.665591716766357, "learning_rate": 1.566653082095542e-05, "loss": 2.300118637084961, "memory(GiB)": 77.56, "step": 86470, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.437931 }, { "epoch": 3.7048541193607814, "grad_norm": 5.024373531341553, "learning_rate": 1.566163878988338e-05, "loss": 2.373953628540039, "memory(GiB)": 77.56, "step": 86475, "token_acc": 0.5047923322683706, "train_speed(iter/s)": 1.437906 }, { "epoch": 3.70506833469003, "grad_norm": 6.110803604125977, "learning_rate": 1.5656747380882965e-05, "loss": 2.3358633041381838, "memory(GiB)": 77.56, "step": 86480, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.43793 }, { "epoch": 3.7052825500192794, "grad_norm": 7.4865403175354, "learning_rate": 1.565185659404279e-05, "loss": 2.391068458557129, "memory(GiB)": 77.56, "step": 86485, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 1.437942 }, { "epoch": 3.7054967653485282, "grad_norm": 5.674342632293701, "learning_rate": 1.564696642945143e-05, "loss": 2.8004451751708985, "memory(GiB)": 77.56, "step": 86490, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.437942 }, { "epoch": 3.705710980677777, "grad_norm": 7.742238998413086, "learning_rate": 1.5642076887197527e-05, "loss": 2.6121700286865233, "memory(GiB)": 77.56, "step": 86495, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.437956 }, { "epoch": 3.7059251960070263, "grad_norm": 5.112504959106445, "learning_rate": 1.563718796736962e-05, "loss": 2.178631401062012, "memory(GiB)": 77.56, "step": 86500, "token_acc": 0.546031746031746, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.7059251960070263, "eval_loss": 2.3018243312835693, "eval_runtime": 14.2816, "eval_samples_per_second": 7.002, "eval_steps_per_second": 7.002, "eval_token_acc": 0.4769433465085639, "step": 86500 }, { "epoch": 3.706139411336275, "grad_norm": 7.568201065063477, "learning_rate": 1.5632299670056296e-05, "loss": 2.3550209045410155, "memory(GiB)": 77.56, "step": 86505, "token_acc": 0.4878286270691334, "train_speed(iter/s)": 1.437619 }, { "epoch": 3.706353626665524, "grad_norm": 5.062838554382324, "learning_rate": 1.5627411995346105e-05, "loss": 2.4407291412353516, "memory(GiB)": 77.56, "step": 86510, "token_acc": 0.5201342281879194, "train_speed(iter/s)": 1.437634 }, { "epoch": 3.706567841994773, "grad_norm": 5.365954875946045, "learning_rate": 1.5622524943327567e-05, "loss": 2.4155345916748048, "memory(GiB)": 77.56, "step": 86515, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437625 }, { "epoch": 3.706782057324022, "grad_norm": 5.632654190063477, "learning_rate": 1.5617638514089237e-05, "loss": 2.4387001037597655, "memory(GiB)": 77.56, "step": 86520, "token_acc": 0.5190311418685121, "train_speed(iter/s)": 1.437617 }, { "epoch": 3.706996272653271, "grad_norm": 5.162031173706055, "learning_rate": 1.561275270771966e-05, "loss": 2.467332458496094, "memory(GiB)": 77.56, "step": 86525, "token_acc": 0.4874551971326165, "train_speed(iter/s)": 1.437609 }, { "epoch": 3.70721048798252, "grad_norm": 5.403102397918701, "learning_rate": 1.5607867524307323e-05, "loss": 2.348347473144531, "memory(GiB)": 77.56, "step": 86530, "token_acc": 0.4747191011235955, "train_speed(iter/s)": 1.437607 }, { "epoch": 3.707424703311769, "grad_norm": 5.386386394500732, "learning_rate": 1.5602982963940726e-05, "loss": 1.9832290649414062, "memory(GiB)": 77.56, "step": 86535, "token_acc": 0.5397350993377483, "train_speed(iter/s)": 1.437617 }, { "epoch": 3.7076389186410177, "grad_norm": 8.20964527130127, "learning_rate": 1.559809902670836e-05, "loss": 2.6929420471191405, "memory(GiB)": 77.56, "step": 86540, "token_acc": 0.45064377682403434, "train_speed(iter/s)": 1.437603 }, { "epoch": 3.707853133970267, "grad_norm": 4.948558807373047, "learning_rate": 1.5593215712698705e-05, "loss": 2.3050155639648438, "memory(GiB)": 77.56, "step": 86545, "token_acc": 0.47678018575851394, "train_speed(iter/s)": 1.437601 }, { "epoch": 3.7080673492995158, "grad_norm": 5.862262725830078, "learning_rate": 1.5588333022000212e-05, "loss": 2.6217864990234374, "memory(GiB)": 77.56, "step": 86550, "token_acc": 0.4527027027027027, "train_speed(iter/s)": 1.437608 }, { "epoch": 3.7082815646287646, "grad_norm": 5.878262996673584, "learning_rate": 1.5583450954701356e-05, "loss": 2.3322378158569337, "memory(GiB)": 77.56, "step": 86555, "token_acc": 0.497907949790795, "train_speed(iter/s)": 1.437634 }, { "epoch": 3.708495779958014, "grad_norm": 7.144289493560791, "learning_rate": 1.5578569510890574e-05, "loss": 2.5948171615600586, "memory(GiB)": 77.56, "step": 86560, "token_acc": 0.4423076923076923, "train_speed(iter/s)": 1.437636 }, { "epoch": 3.7087099952872626, "grad_norm": 4.589893341064453, "learning_rate": 1.557368869065629e-05, "loss": 2.1795501708984375, "memory(GiB)": 77.56, "step": 86565, "token_acc": 0.49340369393139843, "train_speed(iter/s)": 1.437626 }, { "epoch": 3.7089242106165115, "grad_norm": 5.418041229248047, "learning_rate": 1.5568808494086933e-05, "loss": 2.303356742858887, "memory(GiB)": 77.56, "step": 86570, "token_acc": 0.5408805031446541, "train_speed(iter/s)": 1.437649 }, { "epoch": 3.7091384259457607, "grad_norm": 6.835787296295166, "learning_rate": 1.5563928921270898e-05, "loss": 2.056692123413086, "memory(GiB)": 77.56, "step": 86575, "token_acc": 0.5284810126582279, "train_speed(iter/s)": 1.437652 }, { "epoch": 3.7093526412750095, "grad_norm": 5.318356037139893, "learning_rate": 1.5559049972296606e-05, "loss": 2.4325708389282226, "memory(GiB)": 77.56, "step": 86580, "token_acc": 0.5016835016835017, "train_speed(iter/s)": 1.437662 }, { "epoch": 3.7095668566042583, "grad_norm": 5.4713311195373535, "learning_rate": 1.555417164725243e-05, "loss": 2.4357158660888674, "memory(GiB)": 77.56, "step": 86585, "token_acc": 0.4835820895522388, "train_speed(iter/s)": 1.437667 }, { "epoch": 3.7097810719335076, "grad_norm": 6.023533821105957, "learning_rate": 1.5549293946226734e-05, "loss": 2.059641456604004, "memory(GiB)": 77.56, "step": 86590, "token_acc": 0.5669014084507042, "train_speed(iter/s)": 1.437688 }, { "epoch": 3.7099952872627564, "grad_norm": 7.49127721786499, "learning_rate": 1.5544416869307915e-05, "loss": 2.3057533264160157, "memory(GiB)": 77.56, "step": 86595, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.437691 }, { "epoch": 3.710209502592005, "grad_norm": 6.071860313415527, "learning_rate": 1.5539540416584304e-05, "loss": 2.3259809494018553, "memory(GiB)": 77.56, "step": 86600, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.437672 }, { "epoch": 3.7104237179212545, "grad_norm": 4.546608924865723, "learning_rate": 1.5534664588144226e-05, "loss": 2.314710235595703, "memory(GiB)": 77.56, "step": 86605, "token_acc": 0.5258064516129032, "train_speed(iter/s)": 1.437664 }, { "epoch": 3.7106379332505033, "grad_norm": 11.2535982131958, "learning_rate": 1.552978938407605e-05, "loss": 2.4088861465454103, "memory(GiB)": 77.56, "step": 86610, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.710852148579752, "grad_norm": 5.248917102813721, "learning_rate": 1.552491480446807e-05, "loss": 2.4195835113525392, "memory(GiB)": 77.56, "step": 86615, "token_acc": 0.49673202614379086, "train_speed(iter/s)": 1.437678 }, { "epoch": 3.7110663639090014, "grad_norm": 6.9785051345825195, "learning_rate": 1.55200408494086e-05, "loss": 2.130632781982422, "memory(GiB)": 77.56, "step": 86620, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 1.437695 }, { "epoch": 3.71128057923825, "grad_norm": 5.133209228515625, "learning_rate": 1.5515167518985933e-05, "loss": 2.139729690551758, "memory(GiB)": 77.56, "step": 86625, "token_acc": 0.5462555066079295, "train_speed(iter/s)": 1.437708 }, { "epoch": 3.711494794567499, "grad_norm": 5.0500359535217285, "learning_rate": 1.551029481328836e-05, "loss": 2.1852933883666994, "memory(GiB)": 77.56, "step": 86630, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437708 }, { "epoch": 3.7117090098967482, "grad_norm": 5.2850518226623535, "learning_rate": 1.5505422732404134e-05, "loss": 2.4500160217285156, "memory(GiB)": 77.56, "step": 86635, "token_acc": 0.49458483754512633, "train_speed(iter/s)": 1.43771 }, { "epoch": 3.711923225225997, "grad_norm": 5.476200103759766, "learning_rate": 1.550055127642155e-05, "loss": 2.3460809707641603, "memory(GiB)": 77.56, "step": 86640, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437734 }, { "epoch": 3.712137440555246, "grad_norm": 4.466678142547607, "learning_rate": 1.549568044542884e-05, "loss": 2.41572265625, "memory(GiB)": 77.56, "step": 86645, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.437735 }, { "epoch": 3.712351655884495, "grad_norm": 9.083475112915039, "learning_rate": 1.5490810239514254e-05, "loss": 2.4300527572631836, "memory(GiB)": 77.56, "step": 86650, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.437734 }, { "epoch": 3.712565871213744, "grad_norm": 5.925046920776367, "learning_rate": 1.5485940658766e-05, "loss": 2.495985984802246, "memory(GiB)": 77.56, "step": 86655, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.437746 }, { "epoch": 3.7127800865429927, "grad_norm": 7.50604248046875, "learning_rate": 1.5481071703272304e-05, "loss": 2.566284942626953, "memory(GiB)": 77.56, "step": 86660, "token_acc": 0.4526627218934911, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.712994301872242, "grad_norm": 5.756147384643555, "learning_rate": 1.547620337312137e-05, "loss": 2.1826284408569334, "memory(GiB)": 77.56, "step": 86665, "token_acc": 0.5674603174603174, "train_speed(iter/s)": 1.437744 }, { "epoch": 3.713208517201491, "grad_norm": 4.954468727111816, "learning_rate": 1.547133566840141e-05, "loss": 2.473418045043945, "memory(GiB)": 77.56, "step": 86670, "token_acc": 0.5060975609756098, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.7134227325307396, "grad_norm": 8.725044250488281, "learning_rate": 1.54664685892006e-05, "loss": 2.4450586318969725, "memory(GiB)": 77.56, "step": 86675, "token_acc": 0.4748427672955975, "train_speed(iter/s)": 1.437756 }, { "epoch": 3.713636947859989, "grad_norm": 6.033003807067871, "learning_rate": 1.5461602135607105e-05, "loss": 2.224970245361328, "memory(GiB)": 77.56, "step": 86680, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.7138511631892377, "grad_norm": 5.881270408630371, "learning_rate": 1.545673630770909e-05, "loss": 2.233582305908203, "memory(GiB)": 77.56, "step": 86685, "token_acc": 0.4980694980694981, "train_speed(iter/s)": 1.437779 }, { "epoch": 3.714065378518487, "grad_norm": 5.8360514640808105, "learning_rate": 1.5451871105594696e-05, "loss": 2.558456611633301, "memory(GiB)": 77.56, "step": 86690, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.437799 }, { "epoch": 3.7142795938477358, "grad_norm": 5.247663974761963, "learning_rate": 1.5447006529352054e-05, "loss": 2.6439434051513673, "memory(GiB)": 77.56, "step": 86695, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.437828 }, { "epoch": 3.7144938091769846, "grad_norm": 7.151357173919678, "learning_rate": 1.5442142579069315e-05, "loss": 2.2906425476074217, "memory(GiB)": 77.56, "step": 86700, "token_acc": 0.5016835016835017, "train_speed(iter/s)": 1.437813 }, { "epoch": 3.714708024506234, "grad_norm": 6.617393493652344, "learning_rate": 1.543727925483458e-05, "loss": 2.353643226623535, "memory(GiB)": 77.56, "step": 86705, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.437793 }, { "epoch": 3.7149222398354826, "grad_norm": 6.362409591674805, "learning_rate": 1.5432416556735957e-05, "loss": 2.3665643692016602, "memory(GiB)": 77.56, "step": 86710, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.437795 }, { "epoch": 3.7151364551647315, "grad_norm": 6.409313201904297, "learning_rate": 1.5427554484861532e-05, "loss": 2.2252857208251955, "memory(GiB)": 77.56, "step": 86715, "token_acc": 0.5326460481099656, "train_speed(iter/s)": 1.437803 }, { "epoch": 3.7153506704939807, "grad_norm": 5.6080241203308105, "learning_rate": 1.542269303929938e-05, "loss": 2.3943241119384764, "memory(GiB)": 77.56, "step": 86720, "token_acc": 0.5373665480427047, "train_speed(iter/s)": 1.437837 }, { "epoch": 3.7155648858232295, "grad_norm": 6.60405158996582, "learning_rate": 1.541783222013759e-05, "loss": 2.6751174926757812, "memory(GiB)": 77.56, "step": 86725, "token_acc": 0.45993031358885017, "train_speed(iter/s)": 1.437835 }, { "epoch": 3.7157791011524783, "grad_norm": 6.042786121368408, "learning_rate": 1.5412972027464213e-05, "loss": 2.3408435821533202, "memory(GiB)": 77.56, "step": 86730, "token_acc": 0.5267379679144385, "train_speed(iter/s)": 1.437817 }, { "epoch": 3.7159933164817276, "grad_norm": 6.136815547943115, "learning_rate": 1.5408112461367268e-05, "loss": 2.488810348510742, "memory(GiB)": 77.56, "step": 86735, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.437833 }, { "epoch": 3.7162075318109764, "grad_norm": 6.867037296295166, "learning_rate": 1.540325352193484e-05, "loss": 2.4635574340820314, "memory(GiB)": 77.56, "step": 86740, "token_acc": 0.46630727762803237, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.716421747140225, "grad_norm": 7.39055871963501, "learning_rate": 1.539839520925493e-05, "loss": 2.1807117462158203, "memory(GiB)": 77.56, "step": 86745, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.43782 }, { "epoch": 3.7166359624694745, "grad_norm": 5.075364589691162, "learning_rate": 1.539353752341553e-05, "loss": 2.7658105850219727, "memory(GiB)": 77.56, "step": 86750, "token_acc": 0.4906832298136646, "train_speed(iter/s)": 1.43784 }, { "epoch": 3.7168501777987233, "grad_norm": 5.643764495849609, "learning_rate": 1.538868046450468e-05, "loss": 2.5875982284545898, "memory(GiB)": 77.56, "step": 86755, "token_acc": 0.4692556634304207, "train_speed(iter/s)": 1.437847 }, { "epoch": 3.717064393127972, "grad_norm": 6.702514171600342, "learning_rate": 1.538382403261035e-05, "loss": 2.336283493041992, "memory(GiB)": 77.56, "step": 86760, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.437855 }, { "epoch": 3.7172786084572214, "grad_norm": 4.063947677612305, "learning_rate": 1.5378968227820518e-05, "loss": 2.2570259094238283, "memory(GiB)": 77.56, "step": 86765, "token_acc": 0.4835820895522388, "train_speed(iter/s)": 1.43787 }, { "epoch": 3.71749282378647, "grad_norm": 5.202474117279053, "learning_rate": 1.5374113050223153e-05, "loss": 2.0593425750732424, "memory(GiB)": 77.56, "step": 86770, "token_acc": 0.5598591549295775, "train_speed(iter/s)": 1.43788 }, { "epoch": 3.717707039115719, "grad_norm": 5.654252052307129, "learning_rate": 1.5369258499906215e-05, "loss": 2.3037891387939453, "memory(GiB)": 77.56, "step": 86775, "token_acc": 0.5032051282051282, "train_speed(iter/s)": 1.437896 }, { "epoch": 3.7179212544449682, "grad_norm": 5.633025646209717, "learning_rate": 1.536440457695763e-05, "loss": 2.3663572311401366, "memory(GiB)": 77.56, "step": 86780, "token_acc": 0.4901315789473684, "train_speed(iter/s)": 1.437886 }, { "epoch": 3.718135469774217, "grad_norm": 5.767129421234131, "learning_rate": 1.5359551281465363e-05, "loss": 2.3048080444335937, "memory(GiB)": 77.56, "step": 86785, "token_acc": 0.49850746268656715, "train_speed(iter/s)": 1.437849 }, { "epoch": 3.718349685103466, "grad_norm": 6.4442057609558105, "learning_rate": 1.5354698613517316e-05, "loss": 2.4966196060180663, "memory(GiB)": 77.56, "step": 86790, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.437847 }, { "epoch": 3.718563900432715, "grad_norm": 5.920304298400879, "learning_rate": 1.534984657320141e-05, "loss": 2.462950897216797, "memory(GiB)": 77.56, "step": 86795, "token_acc": 0.4792332268370607, "train_speed(iter/s)": 1.437877 }, { "epoch": 3.718778115761964, "grad_norm": 4.767179012298584, "learning_rate": 1.534499516060553e-05, "loss": 2.178111267089844, "memory(GiB)": 77.56, "step": 86800, "token_acc": 0.5137931034482759, "train_speed(iter/s)": 1.437882 }, { "epoch": 3.7189923310912127, "grad_norm": 5.359717845916748, "learning_rate": 1.534014437581756e-05, "loss": 2.2563379287719725, "memory(GiB)": 77.56, "step": 86805, "token_acc": 0.5348101265822784, "train_speed(iter/s)": 1.437902 }, { "epoch": 3.719206546420462, "grad_norm": 7.186066150665283, "learning_rate": 1.5335294218925388e-05, "loss": 2.443486213684082, "memory(GiB)": 77.56, "step": 86810, "token_acc": 0.5141955835962145, "train_speed(iter/s)": 1.43792 }, { "epoch": 3.719420761749711, "grad_norm": 6.606617450714111, "learning_rate": 1.53304446900169e-05, "loss": 2.1772224426269533, "memory(GiB)": 77.56, "step": 86815, "token_acc": 0.5358649789029536, "train_speed(iter/s)": 1.437925 }, { "epoch": 3.7196349770789596, "grad_norm": 5.413238525390625, "learning_rate": 1.532559578917992e-05, "loss": 2.5105783462524416, "memory(GiB)": 77.56, "step": 86820, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 1.437936 }, { "epoch": 3.719849192408209, "grad_norm": 7.256813049316406, "learning_rate": 1.5320747516502303e-05, "loss": 2.2638626098632812, "memory(GiB)": 77.56, "step": 86825, "token_acc": 0.5338078291814946, "train_speed(iter/s)": 1.437947 }, { "epoch": 3.7200634077374577, "grad_norm": 5.982698440551758, "learning_rate": 1.5315899872071882e-05, "loss": 2.3710479736328125, "memory(GiB)": 77.56, "step": 86830, "token_acc": 0.47093023255813954, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.7202776230667065, "grad_norm": 5.053171634674072, "learning_rate": 1.5311052855976465e-05, "loss": 2.4563880920410157, "memory(GiB)": 77.56, "step": 86835, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437937 }, { "epoch": 3.7204918383959558, "grad_norm": 4.804139137268066, "learning_rate": 1.530620646830385e-05, "loss": 2.5838596343994142, "memory(GiB)": 77.56, "step": 86840, "token_acc": 0.4943181818181818, "train_speed(iter/s)": 1.437948 }, { "epoch": 3.7207060537252046, "grad_norm": 6.840320110321045, "learning_rate": 1.530136070914187e-05, "loss": 2.3054986953735352, "memory(GiB)": 77.56, "step": 86845, "token_acc": 0.48214285714285715, "train_speed(iter/s)": 1.437955 }, { "epoch": 3.7209202690544534, "grad_norm": 5.580686092376709, "learning_rate": 1.5296515578578285e-05, "loss": 2.542148208618164, "memory(GiB)": 77.56, "step": 86850, "token_acc": 0.48441926345609065, "train_speed(iter/s)": 1.437965 }, { "epoch": 3.7211344843837026, "grad_norm": 5.10511589050293, "learning_rate": 1.5291671076700882e-05, "loss": 1.8831974029541017, "memory(GiB)": 77.56, "step": 86855, "token_acc": 0.5597269624573379, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.7213486997129515, "grad_norm": 6.026157379150391, "learning_rate": 1.5286827203597414e-05, "loss": 2.4431621551513674, "memory(GiB)": 77.56, "step": 86860, "token_acc": 0.4731182795698925, "train_speed(iter/s)": 1.437971 }, { "epoch": 3.7215629150422003, "grad_norm": 5.7920098304748535, "learning_rate": 1.5281983959355615e-05, "loss": 2.1135372161865233, "memory(GiB)": 77.56, "step": 86865, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.7217771303714495, "grad_norm": 4.803773403167725, "learning_rate": 1.5277141344063267e-05, "loss": 2.298733711242676, "memory(GiB)": 77.56, "step": 86870, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.7219913457006983, "grad_norm": 6.743514537811279, "learning_rate": 1.5272299357808072e-05, "loss": 2.7616195678710938, "memory(GiB)": 77.56, "step": 86875, "token_acc": 0.46735395189003437, "train_speed(iter/s)": 1.438 }, { "epoch": 3.722205561029947, "grad_norm": 5.747242450714111, "learning_rate": 1.5267458000677753e-05, "loss": 2.2895477294921873, "memory(GiB)": 77.56, "step": 86880, "token_acc": 0.53125, "train_speed(iter/s)": 1.438017 }, { "epoch": 3.7224197763591964, "grad_norm": 4.466111660003662, "learning_rate": 1.5262617272759993e-05, "loss": 2.348099708557129, "memory(GiB)": 77.56, "step": 86885, "token_acc": 0.506896551724138, "train_speed(iter/s)": 1.438021 }, { "epoch": 3.722633991688445, "grad_norm": 6.285738468170166, "learning_rate": 1.5257777174142529e-05, "loss": 2.3186534881591796, "memory(GiB)": 77.56, "step": 86890, "token_acc": 0.5, "train_speed(iter/s)": 1.438019 }, { "epoch": 3.722848207017694, "grad_norm": 5.389781475067139, "learning_rate": 1.5252937704913006e-05, "loss": 2.3748050689697267, "memory(GiB)": 77.56, "step": 86895, "token_acc": 0.4520547945205479, "train_speed(iter/s)": 1.438004 }, { "epoch": 3.7230624223469433, "grad_norm": 8.105585098266602, "learning_rate": 1.5248098865159127e-05, "loss": 2.4184099197387696, "memory(GiB)": 77.56, "step": 86900, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.438 }, { "epoch": 3.723276637676192, "grad_norm": 7.014865875244141, "learning_rate": 1.5243260654968539e-05, "loss": 2.164982223510742, "memory(GiB)": 77.56, "step": 86905, "token_acc": 0.540650406504065, "train_speed(iter/s)": 1.438006 }, { "epoch": 3.723490853005441, "grad_norm": 7.801687717437744, "learning_rate": 1.5238423074428888e-05, "loss": 2.7186166763305666, "memory(GiB)": 77.56, "step": 86910, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.438 }, { "epoch": 3.72370506833469, "grad_norm": 6.729595184326172, "learning_rate": 1.5233586123627807e-05, "loss": 2.218162727355957, "memory(GiB)": 77.56, "step": 86915, "token_acc": 0.5181518151815182, "train_speed(iter/s)": 1.438007 }, { "epoch": 3.723919283663939, "grad_norm": 5.841404914855957, "learning_rate": 1.5228749802652932e-05, "loss": 2.3726444244384766, "memory(GiB)": 77.56, "step": 86920, "token_acc": 0.5, "train_speed(iter/s)": 1.438001 }, { "epoch": 3.724133498993188, "grad_norm": 5.373078346252441, "learning_rate": 1.5223914111591853e-05, "loss": 2.351149559020996, "memory(GiB)": 77.56, "step": 86925, "token_acc": 0.5031847133757962, "train_speed(iter/s)": 1.438016 }, { "epoch": 3.724347714322437, "grad_norm": 4.831430912017822, "learning_rate": 1.5219079050532209e-05, "loss": 2.024088478088379, "memory(GiB)": 77.56, "step": 86930, "token_acc": 0.5424836601307189, "train_speed(iter/s)": 1.437983 }, { "epoch": 3.724561929651686, "grad_norm": 5.509981155395508, "learning_rate": 1.521424461956158e-05, "loss": 2.1668506622314454, "memory(GiB)": 77.56, "step": 86935, "token_acc": 0.5132450331125827, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.7247761449809347, "grad_norm": 4.685031890869141, "learning_rate": 1.5209410818767539e-05, "loss": 2.3187753677368166, "memory(GiB)": 77.56, "step": 86940, "token_acc": 0.5018587360594795, "train_speed(iter/s)": 1.437981 }, { "epoch": 3.724990360310184, "grad_norm": 5.680825233459473, "learning_rate": 1.5204577648237656e-05, "loss": 2.278506278991699, "memory(GiB)": 77.56, "step": 86945, "token_acc": 0.4752475247524752, "train_speed(iter/s)": 1.437987 }, { "epoch": 3.7252045756394327, "grad_norm": 5.809874057769775, "learning_rate": 1.519974510805947e-05, "loss": 2.27001838684082, "memory(GiB)": 77.56, "step": 86950, "token_acc": 0.5139318885448917, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.7254187909686816, "grad_norm": 5.0600266456604, "learning_rate": 1.519491319832057e-05, "loss": 2.3480995178222654, "memory(GiB)": 77.56, "step": 86955, "token_acc": 0.495114006514658, "train_speed(iter/s)": 1.437994 }, { "epoch": 3.725633006297931, "grad_norm": 5.149813175201416, "learning_rate": 1.5190081919108445e-05, "loss": 2.400051307678223, "memory(GiB)": 77.56, "step": 86960, "token_acc": 0.5016835016835017, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.7258472216271796, "grad_norm": 6.670567989349365, "learning_rate": 1.5185251270510659e-05, "loss": 2.245740509033203, "memory(GiB)": 77.56, "step": 86965, "token_acc": 0.51953125, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.7260614369564284, "grad_norm": 7.417488098144531, "learning_rate": 1.5180421252614707e-05, "loss": 2.519170951843262, "memory(GiB)": 77.56, "step": 86970, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.437995 }, { "epoch": 3.7262756522856777, "grad_norm": 5.339607238769531, "learning_rate": 1.5175591865508082e-05, "loss": 2.417881965637207, "memory(GiB)": 77.56, "step": 86975, "token_acc": 0.50814332247557, "train_speed(iter/s)": 1.438014 }, { "epoch": 3.7264898676149265, "grad_norm": 5.728578567504883, "learning_rate": 1.5170763109278285e-05, "loss": 2.128196144104004, "memory(GiB)": 77.56, "step": 86980, "token_acc": 0.5648535564853556, "train_speed(iter/s)": 1.438012 }, { "epoch": 3.7267040829441753, "grad_norm": 5.405481338500977, "learning_rate": 1.5165934984012769e-05, "loss": 2.3688072204589843, "memory(GiB)": 77.56, "step": 86985, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.43801 }, { "epoch": 3.7269182982734246, "grad_norm": 6.180614471435547, "learning_rate": 1.5161107489799032e-05, "loss": 2.276973915100098, "memory(GiB)": 77.56, "step": 86990, "token_acc": 0.5278688524590164, "train_speed(iter/s)": 1.438017 }, { "epoch": 3.7271325136026734, "grad_norm": 5.205055236816406, "learning_rate": 1.5156280626724512e-05, "loss": 1.963690185546875, "memory(GiB)": 77.56, "step": 86995, "token_acc": 0.5903614457831325, "train_speed(iter/s)": 1.438043 }, { "epoch": 3.727346728931922, "grad_norm": 7.002117156982422, "learning_rate": 1.5151454394876658e-05, "loss": 2.2104793548583985, "memory(GiB)": 77.56, "step": 87000, "token_acc": 0.5405405405405406, "train_speed(iter/s)": 1.438045 }, { "epoch": 3.727346728931922, "eval_loss": 2.308195114135742, "eval_runtime": 14.3281, "eval_samples_per_second": 6.979, "eval_steps_per_second": 6.979, "eval_token_acc": 0.4654696132596685, "step": 87000 }, { "epoch": 3.7275609442611715, "grad_norm": 7.07882833480835, "learning_rate": 1.5146628794342898e-05, "loss": 2.3575599670410154, "memory(GiB)": 77.56, "step": 87005, "token_acc": 0.46875, "train_speed(iter/s)": 1.437699 }, { "epoch": 3.7277751595904203, "grad_norm": 5.945185661315918, "learning_rate": 1.5141803825210637e-05, "loss": 2.127720832824707, "memory(GiB)": 77.56, "step": 87010, "token_acc": 0.5450819672131147, "train_speed(iter/s)": 1.437725 }, { "epoch": 3.727989374919669, "grad_norm": 5.8800435066223145, "learning_rate": 1.5136979487567315e-05, "loss": 2.278525543212891, "memory(GiB)": 77.56, "step": 87015, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.437725 }, { "epoch": 3.7282035902489183, "grad_norm": 5.666601181030273, "learning_rate": 1.5132155781500313e-05, "loss": 2.3060993194580077, "memory(GiB)": 77.56, "step": 87020, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.437723 }, { "epoch": 3.728417805578167, "grad_norm": 5.1239705085754395, "learning_rate": 1.5127332707097013e-05, "loss": 2.422837257385254, "memory(GiB)": 77.56, "step": 87025, "token_acc": 0.486646884272997, "train_speed(iter/s)": 1.437713 }, { "epoch": 3.7286320209074164, "grad_norm": 7.413276672363281, "learning_rate": 1.5122510264444784e-05, "loss": 2.3034963607788086, "memory(GiB)": 77.56, "step": 87030, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 1.437714 }, { "epoch": 3.7288462362366652, "grad_norm": 5.47199821472168, "learning_rate": 1.5117688453631019e-05, "loss": 2.3623912811279295, "memory(GiB)": 77.56, "step": 87035, "token_acc": 0.48955223880597015, "train_speed(iter/s)": 1.437715 }, { "epoch": 3.729060451565914, "grad_norm": 6.05703592300415, "learning_rate": 1.5112867274743026e-05, "loss": 2.0504739761352537, "memory(GiB)": 77.56, "step": 87040, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.437733 }, { "epoch": 3.7292746668951633, "grad_norm": 7.642071723937988, "learning_rate": 1.5108046727868186e-05, "loss": 2.7841732025146486, "memory(GiB)": 77.56, "step": 87045, "token_acc": 0.4701195219123506, "train_speed(iter/s)": 1.437711 }, { "epoch": 3.729488882224412, "grad_norm": 5.824583053588867, "learning_rate": 1.5103226813093813e-05, "loss": 2.2717166900634767, "memory(GiB)": 77.56, "step": 87050, "token_acc": 0.4891304347826087, "train_speed(iter/s)": 1.437725 }, { "epoch": 3.729703097553661, "grad_norm": 5.409212589263916, "learning_rate": 1.5098407530507225e-05, "loss": 2.5261409759521483, "memory(GiB)": 77.56, "step": 87055, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437731 }, { "epoch": 3.72991731288291, "grad_norm": 8.594744682312012, "learning_rate": 1.509358888019572e-05, "loss": 2.2982528686523436, "memory(GiB)": 77.56, "step": 87060, "token_acc": 0.5095057034220533, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.730131528212159, "grad_norm": 8.690960884094238, "learning_rate": 1.5088770862246599e-05, "loss": 2.225796127319336, "memory(GiB)": 77.56, "step": 87065, "token_acc": 0.5285714285714286, "train_speed(iter/s)": 1.437762 }, { "epoch": 3.730345743541408, "grad_norm": 5.013505935668945, "learning_rate": 1.508395347674713e-05, "loss": 2.222603988647461, "memory(GiB)": 77.56, "step": 87070, "token_acc": 0.535593220338983, "train_speed(iter/s)": 1.437769 }, { "epoch": 3.730559958870657, "grad_norm": 5.395937442779541, "learning_rate": 1.507913672378461e-05, "loss": 2.2606481552124023, "memory(GiB)": 77.56, "step": 87075, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.437777 }, { "epoch": 3.730774174199906, "grad_norm": 6.910282135009766, "learning_rate": 1.507432060344629e-05, "loss": 2.3298110961914062, "memory(GiB)": 77.56, "step": 87080, "token_acc": 0.5164473684210527, "train_speed(iter/s)": 1.437781 }, { "epoch": 3.7309883895291547, "grad_norm": 5.100796699523926, "learning_rate": 1.506950511581941e-05, "loss": 2.4390256881713865, "memory(GiB)": 77.56, "step": 87085, "token_acc": 0.5014577259475219, "train_speed(iter/s)": 1.437799 }, { "epoch": 3.731202604858404, "grad_norm": 6.4759626388549805, "learning_rate": 1.5064690260991215e-05, "loss": 2.3322307586669924, "memory(GiB)": 77.56, "step": 87090, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.7314168201876527, "grad_norm": 6.136115550994873, "learning_rate": 1.5059876039048914e-05, "loss": 2.7116325378417967, "memory(GiB)": 77.56, "step": 87095, "token_acc": 0.4405797101449275, "train_speed(iter/s)": 1.437811 }, { "epoch": 3.7316310355169016, "grad_norm": 5.605592250823975, "learning_rate": 1.5055062450079749e-05, "loss": 2.574116516113281, "memory(GiB)": 77.56, "step": 87100, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.43781 }, { "epoch": 3.731845250846151, "grad_norm": 9.748913764953613, "learning_rate": 1.5050249494170893e-05, "loss": 2.479766845703125, "memory(GiB)": 77.56, "step": 87105, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.7320594661753996, "grad_norm": 5.765749931335449, "learning_rate": 1.5045437171409571e-05, "loss": 2.371507263183594, "memory(GiB)": 77.56, "step": 87110, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.437837 }, { "epoch": 3.7322736815046484, "grad_norm": 6.0616559982299805, "learning_rate": 1.5040625481882942e-05, "loss": 2.424093246459961, "memory(GiB)": 77.56, "step": 87115, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.437851 }, { "epoch": 3.7324878968338977, "grad_norm": 4.634643077850342, "learning_rate": 1.5035814425678174e-05, "loss": 2.2948190689086916, "memory(GiB)": 77.56, "step": 87120, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.7327021121631465, "grad_norm": 5.670689582824707, "learning_rate": 1.5031004002882431e-05, "loss": 2.2508188247680665, "memory(GiB)": 77.56, "step": 87125, "token_acc": 0.5309446254071661, "train_speed(iter/s)": 1.437877 }, { "epoch": 3.7329163274923953, "grad_norm": 7.1844801902771, "learning_rate": 1.502619421358284e-05, "loss": 2.478338623046875, "memory(GiB)": 77.56, "step": 87130, "token_acc": 0.48297213622291024, "train_speed(iter/s)": 1.437867 }, { "epoch": 3.7331305428216446, "grad_norm": 5.184489727020264, "learning_rate": 1.502138505786656e-05, "loss": 2.2576826095581053, "memory(GiB)": 77.56, "step": 87135, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.437878 }, { "epoch": 3.7333447581508934, "grad_norm": 5.93429708480835, "learning_rate": 1.5016576535820708e-05, "loss": 2.3770168304443358, "memory(GiB)": 77.56, "step": 87140, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.437874 }, { "epoch": 3.733558973480142, "grad_norm": 7.028170585632324, "learning_rate": 1.5011768647532382e-05, "loss": 2.397247314453125, "memory(GiB)": 77.56, "step": 87145, "token_acc": 0.4647058823529412, "train_speed(iter/s)": 1.437867 }, { "epoch": 3.7337731888093915, "grad_norm": 5.135748386383057, "learning_rate": 1.5006961393088692e-05, "loss": 2.169891357421875, "memory(GiB)": 77.56, "step": 87150, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.43787 }, { "epoch": 3.7339874041386403, "grad_norm": 5.808743953704834, "learning_rate": 1.5002154772576709e-05, "loss": 2.238406753540039, "memory(GiB)": 77.56, "step": 87155, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 1.43786 }, { "epoch": 3.734201619467889, "grad_norm": 6.619450569152832, "learning_rate": 1.4997348786083537e-05, "loss": 2.3706283569335938, "memory(GiB)": 77.56, "step": 87160, "token_acc": 0.5167173252279635, "train_speed(iter/s)": 1.437848 }, { "epoch": 3.7344158347971383, "grad_norm": 4.534168243408203, "learning_rate": 1.4992543433696228e-05, "loss": 2.4198450088500976, "memory(GiB)": 77.56, "step": 87165, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.437872 }, { "epoch": 3.734630050126387, "grad_norm": 5.226425647735596, "learning_rate": 1.4987738715501832e-05, "loss": 2.453997993469238, "memory(GiB)": 77.56, "step": 87170, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.437886 }, { "epoch": 3.734844265455636, "grad_norm": 7.435589790344238, "learning_rate": 1.4982934631587391e-05, "loss": 2.308641815185547, "memory(GiB)": 77.56, "step": 87175, "token_acc": 0.4691358024691358, "train_speed(iter/s)": 1.437877 }, { "epoch": 3.7350584807848852, "grad_norm": 4.9824910163879395, "learning_rate": 1.4978131182039928e-05, "loss": 2.2796201705932617, "memory(GiB)": 77.56, "step": 87180, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.437884 }, { "epoch": 3.735272696114134, "grad_norm": 5.569031715393066, "learning_rate": 1.4973328366946471e-05, "loss": 2.3210727691650392, "memory(GiB)": 77.56, "step": 87185, "token_acc": 0.5121212121212121, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.735486911443383, "grad_norm": 7.1741132736206055, "learning_rate": 1.4968526186394039e-05, "loss": 2.4107852935791017, "memory(GiB)": 77.56, "step": 87190, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.735701126772632, "grad_norm": 8.392370223999023, "learning_rate": 1.4963724640469622e-05, "loss": 2.515874481201172, "memory(GiB)": 77.56, "step": 87195, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.735915342101881, "grad_norm": 5.084343910217285, "learning_rate": 1.4958923729260198e-05, "loss": 2.220355415344238, "memory(GiB)": 77.56, "step": 87200, "token_acc": 0.490625, "train_speed(iter/s)": 1.437868 }, { "epoch": 3.7361295574311297, "grad_norm": 5.194983005523682, "learning_rate": 1.4954123452852742e-05, "loss": 2.4692991256713865, "memory(GiB)": 77.56, "step": 87205, "token_acc": 0.4507462686567164, "train_speed(iter/s)": 1.437884 }, { "epoch": 3.736343772760379, "grad_norm": 5.243771553039551, "learning_rate": 1.4949323811334214e-05, "loss": 2.344961166381836, "memory(GiB)": 77.56, "step": 87210, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.437902 }, { "epoch": 3.736557988089628, "grad_norm": 5.781335830688477, "learning_rate": 1.4944524804791554e-05, "loss": 2.2208852767944336, "memory(GiB)": 77.56, "step": 87215, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.437898 }, { "epoch": 3.7367722034188766, "grad_norm": 6.047597885131836, "learning_rate": 1.4939726433311723e-05, "loss": 2.0977304458618162, "memory(GiB)": 77.56, "step": 87220, "token_acc": 0.5391304347826087, "train_speed(iter/s)": 1.437899 }, { "epoch": 3.736986418748126, "grad_norm": 6.536790370941162, "learning_rate": 1.4934928696981636e-05, "loss": 2.4487327575683593, "memory(GiB)": 77.56, "step": 87225, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.43792 }, { "epoch": 3.7372006340773747, "grad_norm": 6.325838565826416, "learning_rate": 1.4930131595888213e-05, "loss": 2.370090866088867, "memory(GiB)": 77.56, "step": 87230, "token_acc": 0.5381818181818182, "train_speed(iter/s)": 1.437948 }, { "epoch": 3.7374148494066235, "grad_norm": 5.113385200500488, "learning_rate": 1.4925335130118357e-05, "loss": 2.2227386474609374, "memory(GiB)": 77.56, "step": 87235, "token_acc": 0.5339805825242718, "train_speed(iter/s)": 1.437962 }, { "epoch": 3.7376290647358728, "grad_norm": 6.892491340637207, "learning_rate": 1.4920539299758935e-05, "loss": 2.34505615234375, "memory(GiB)": 77.56, "step": 87240, "token_acc": 0.517799352750809, "train_speed(iter/s)": 1.437952 }, { "epoch": 3.7378432800651216, "grad_norm": 6.933368682861328, "learning_rate": 1.4915744104896872e-05, "loss": 2.5074304580688476, "memory(GiB)": 77.56, "step": 87245, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.437958 }, { "epoch": 3.7380574953943704, "grad_norm": 5.056584358215332, "learning_rate": 1.4910949545619013e-05, "loss": 2.4061588287353515, "memory(GiB)": 77.56, "step": 87250, "token_acc": 0.44135802469135804, "train_speed(iter/s)": 1.437959 }, { "epoch": 3.7382717107236196, "grad_norm": 5.506441116333008, "learning_rate": 1.4906155622012202e-05, "loss": 1.9020082473754882, "memory(GiB)": 77.56, "step": 87255, "token_acc": 0.6223175965665236, "train_speed(iter/s)": 1.437959 }, { "epoch": 3.7384859260528684, "grad_norm": 4.79023551940918, "learning_rate": 1.490136233416332e-05, "loss": 2.349620056152344, "memory(GiB)": 77.56, "step": 87260, "token_acc": 0.5220338983050847, "train_speed(iter/s)": 1.43796 }, { "epoch": 3.7387001413821173, "grad_norm": 6.383564472198486, "learning_rate": 1.489656968215919e-05, "loss": 2.4896823883056642, "memory(GiB)": 77.56, "step": 87265, "token_acc": 0.45263157894736844, "train_speed(iter/s)": 1.437951 }, { "epoch": 3.7389143567113665, "grad_norm": 7.178621292114258, "learning_rate": 1.4891777666086609e-05, "loss": 1.8542938232421875, "memory(GiB)": 77.56, "step": 87270, "token_acc": 0.5921052631578947, "train_speed(iter/s)": 1.437945 }, { "epoch": 3.7391285720406153, "grad_norm": 6.1213603019714355, "learning_rate": 1.4886986286032423e-05, "loss": 2.352415657043457, "memory(GiB)": 77.56, "step": 87275, "token_acc": 0.538961038961039, "train_speed(iter/s)": 1.437951 }, { "epoch": 3.739342787369864, "grad_norm": 6.763651371002197, "learning_rate": 1.4882195542083421e-05, "loss": 2.3152748107910157, "memory(GiB)": 77.56, "step": 87280, "token_acc": 0.5413223140495868, "train_speed(iter/s)": 1.437957 }, { "epoch": 3.7395570026991134, "grad_norm": 6.121639728546143, "learning_rate": 1.487740543432639e-05, "loss": 2.3437719345092773, "memory(GiB)": 77.56, "step": 87285, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.739771218028362, "grad_norm": 6.855044364929199, "learning_rate": 1.4872615962848113e-05, "loss": 2.464218521118164, "memory(GiB)": 77.56, "step": 87290, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.437956 }, { "epoch": 3.739985433357611, "grad_norm": 7.222046375274658, "learning_rate": 1.4867827127735346e-05, "loss": 2.4066207885742186, "memory(GiB)": 77.56, "step": 87295, "token_acc": 0.5090252707581228, "train_speed(iter/s)": 1.43794 }, { "epoch": 3.7401996486868603, "grad_norm": 5.3208231925964355, "learning_rate": 1.486303892907483e-05, "loss": 2.6841299057006838, "memory(GiB)": 77.56, "step": 87300, "token_acc": 0.498371335504886, "train_speed(iter/s)": 1.43792 }, { "epoch": 3.740413864016109, "grad_norm": 5.993175983428955, "learning_rate": 1.485825136695334e-05, "loss": 2.2455379486083986, "memory(GiB)": 77.56, "step": 87305, "token_acc": 0.527027027027027, "train_speed(iter/s)": 1.437946 }, { "epoch": 3.740628079345358, "grad_norm": 5.692388534545898, "learning_rate": 1.4853464441457593e-05, "loss": 2.338824653625488, "memory(GiB)": 77.56, "step": 87310, "token_acc": 0.4880546075085324, "train_speed(iter/s)": 1.437949 }, { "epoch": 3.740842294674607, "grad_norm": 6.645580768585205, "learning_rate": 1.4848678152674311e-05, "loss": 2.336594009399414, "memory(GiB)": 77.56, "step": 87315, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.437962 }, { "epoch": 3.741056510003856, "grad_norm": 3.9664742946624756, "learning_rate": 1.4843892500690193e-05, "loss": 2.3229135513305663, "memory(GiB)": 77.56, "step": 87320, "token_acc": 0.5066225165562914, "train_speed(iter/s)": 1.437976 }, { "epoch": 3.741270725333105, "grad_norm": 5.8854780197143555, "learning_rate": 1.483910748559193e-05, "loss": 2.331338310241699, "memory(GiB)": 77.56, "step": 87325, "token_acc": 0.5, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.741484940662354, "grad_norm": 8.000292778015137, "learning_rate": 1.4834323107466218e-05, "loss": 2.4718648910522463, "memory(GiB)": 77.56, "step": 87330, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.741699155991603, "grad_norm": 6.634707927703857, "learning_rate": 1.4829539366399747e-05, "loss": 2.270786666870117, "memory(GiB)": 77.56, "step": 87335, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.7419133713208517, "grad_norm": 5.0419416427612305, "learning_rate": 1.4824756262479161e-05, "loss": 2.4486883163452147, "memory(GiB)": 77.56, "step": 87340, "token_acc": 0.47560975609756095, "train_speed(iter/s)": 1.437976 }, { "epoch": 3.742127586650101, "grad_norm": 6.889967918395996, "learning_rate": 1.4819973795791115e-05, "loss": 2.3378862380981444, "memory(GiB)": 77.56, "step": 87345, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.437994 }, { "epoch": 3.7423418019793497, "grad_norm": 6.417646884918213, "learning_rate": 1.4815191966422243e-05, "loss": 2.720916748046875, "memory(GiB)": 77.56, "step": 87350, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.7425560173085985, "grad_norm": 5.461677551269531, "learning_rate": 1.4810410774459171e-05, "loss": 2.0022937774658205, "memory(GiB)": 77.56, "step": 87355, "token_acc": 0.556420233463035, "train_speed(iter/s)": 1.437996 }, { "epoch": 3.742770232637848, "grad_norm": 5.625781059265137, "learning_rate": 1.4805630219988508e-05, "loss": 2.521144485473633, "memory(GiB)": 77.56, "step": 87360, "token_acc": 0.47266881028938906, "train_speed(iter/s)": 1.438 }, { "epoch": 3.7429844479670966, "grad_norm": 7.273965358734131, "learning_rate": 1.4800850303096885e-05, "loss": 2.439421463012695, "memory(GiB)": 77.56, "step": 87365, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.438011 }, { "epoch": 3.7431986632963454, "grad_norm": 5.6455912590026855, "learning_rate": 1.4796071023870872e-05, "loss": 2.403054046630859, "memory(GiB)": 77.56, "step": 87370, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.438018 }, { "epoch": 3.7434128786255947, "grad_norm": 5.489974021911621, "learning_rate": 1.4791292382397064e-05, "loss": 2.015095901489258, "memory(GiB)": 77.56, "step": 87375, "token_acc": 0.5542168674698795, "train_speed(iter/s)": 1.438039 }, { "epoch": 3.7436270939548435, "grad_norm": 5.249879837036133, "learning_rate": 1.4786514378762017e-05, "loss": 2.2564437866210936, "memory(GiB)": 77.56, "step": 87380, "token_acc": 0.525, "train_speed(iter/s)": 1.438031 }, { "epoch": 3.7438413092840923, "grad_norm": 7.277451038360596, "learning_rate": 1.4781737013052282e-05, "loss": 2.1843563079833985, "memory(GiB)": 77.56, "step": 87385, "token_acc": 0.4796380090497738, "train_speed(iter/s)": 1.438038 }, { "epoch": 3.7440555246133416, "grad_norm": 6.090884685516357, "learning_rate": 1.4776960285354436e-05, "loss": 2.3574953079223633, "memory(GiB)": 77.56, "step": 87390, "token_acc": 0.5264900662251656, "train_speed(iter/s)": 1.438066 }, { "epoch": 3.7442697399425904, "grad_norm": 5.9975361824035645, "learning_rate": 1.4772184195754996e-05, "loss": 2.3810352325439452, "memory(GiB)": 77.56, "step": 87395, "token_acc": 0.49230769230769234, "train_speed(iter/s)": 1.438084 }, { "epoch": 3.744483955271839, "grad_norm": 6.648539066314697, "learning_rate": 1.4767408744340466e-05, "loss": 2.2519874572753906, "memory(GiB)": 77.56, "step": 87400, "token_acc": 0.504424778761062, "train_speed(iter/s)": 1.438067 }, { "epoch": 3.7446981706010884, "grad_norm": 7.1107964515686035, "learning_rate": 1.4762633931197395e-05, "loss": 2.27294807434082, "memory(GiB)": 77.56, "step": 87405, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.438056 }, { "epoch": 3.7449123859303373, "grad_norm": 6.525959014892578, "learning_rate": 1.4757859756412268e-05, "loss": 2.2874944686889647, "memory(GiB)": 77.56, "step": 87410, "token_acc": 0.4928571428571429, "train_speed(iter/s)": 1.438074 }, { "epoch": 3.745126601259586, "grad_norm": 6.147908687591553, "learning_rate": 1.475308622007155e-05, "loss": 2.3837844848632814, "memory(GiB)": 77.56, "step": 87415, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.438082 }, { "epoch": 3.7453408165888353, "grad_norm": 7.439899921417236, "learning_rate": 1.4748313322261758e-05, "loss": 2.318966293334961, "memory(GiB)": 77.56, "step": 87420, "token_acc": 0.525691699604743, "train_speed(iter/s)": 1.438079 }, { "epoch": 3.745555031918084, "grad_norm": 5.130218982696533, "learning_rate": 1.4743541063069339e-05, "loss": 2.0219400405883787, "memory(GiB)": 77.56, "step": 87425, "token_acc": 0.5078125, "train_speed(iter/s)": 1.438084 }, { "epoch": 3.745769247247333, "grad_norm": 5.853952407836914, "learning_rate": 1.4738769442580746e-05, "loss": 2.1071950912475588, "memory(GiB)": 77.56, "step": 87430, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.438073 }, { "epoch": 3.745983462576582, "grad_norm": 5.421910762786865, "learning_rate": 1.4733998460882425e-05, "loss": 2.549492073059082, "memory(GiB)": 77.56, "step": 87435, "token_acc": 0.4258241758241758, "train_speed(iter/s)": 1.438073 }, { "epoch": 3.746197677905831, "grad_norm": 7.759028434753418, "learning_rate": 1.4729228118060807e-05, "loss": 2.4368078231811525, "memory(GiB)": 77.56, "step": 87440, "token_acc": 0.49851632047477745, "train_speed(iter/s)": 1.438086 }, { "epoch": 3.74641189323508, "grad_norm": 10.274950981140137, "learning_rate": 1.4724458414202285e-05, "loss": 2.1819526672363283, "memory(GiB)": 77.56, "step": 87445, "token_acc": 0.5057034220532319, "train_speed(iter/s)": 1.438104 }, { "epoch": 3.746626108564329, "grad_norm": 6.6081132888793945, "learning_rate": 1.4719689349393312e-05, "loss": 2.124695587158203, "memory(GiB)": 77.56, "step": 87450, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.438104 }, { "epoch": 3.746840323893578, "grad_norm": 5.244287967681885, "learning_rate": 1.4714920923720254e-05, "loss": 2.07525577545166, "memory(GiB)": 77.56, "step": 87455, "token_acc": 0.5389830508474577, "train_speed(iter/s)": 1.438088 }, { "epoch": 3.7470545392228267, "grad_norm": 6.249682426452637, "learning_rate": 1.471015313726951e-05, "loss": 2.4163894653320312, "memory(GiB)": 77.56, "step": 87460, "token_acc": 0.45126353790613716, "train_speed(iter/s)": 1.438086 }, { "epoch": 3.747268754552076, "grad_norm": 5.447098255157471, "learning_rate": 1.4705385990127446e-05, "loss": 2.197842025756836, "memory(GiB)": 77.56, "step": 87465, "token_acc": 0.5285171102661597, "train_speed(iter/s)": 1.438079 }, { "epoch": 3.747482969881325, "grad_norm": 6.029251575469971, "learning_rate": 1.4700619482380406e-05, "loss": 2.540346145629883, "memory(GiB)": 77.56, "step": 87470, "token_acc": 0.4720496894409938, "train_speed(iter/s)": 1.438066 }, { "epoch": 3.7476971852105736, "grad_norm": 5.29820442199707, "learning_rate": 1.4695853614114763e-05, "loss": 2.3586051940917967, "memory(GiB)": 77.56, "step": 87475, "token_acc": 0.4812680115273775, "train_speed(iter/s)": 1.438076 }, { "epoch": 3.747911400539823, "grad_norm": 6.930717945098877, "learning_rate": 1.4691088385416857e-05, "loss": 2.41739559173584, "memory(GiB)": 77.56, "step": 87480, "token_acc": 0.49809885931558934, "train_speed(iter/s)": 1.438081 }, { "epoch": 3.7481256158690717, "grad_norm": 7.671302318572998, "learning_rate": 1.4686323796373014e-05, "loss": 2.2959407806396483, "memory(GiB)": 77.56, "step": 87485, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.438069 }, { "epoch": 3.7483398311983205, "grad_norm": 6.608582973480225, "learning_rate": 1.4681559847069537e-05, "loss": 2.269548034667969, "memory(GiB)": 77.56, "step": 87490, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.438066 }, { "epoch": 3.7485540465275697, "grad_norm": 6.8144965171813965, "learning_rate": 1.467679653759274e-05, "loss": 2.2238227844238283, "memory(GiB)": 77.56, "step": 87495, "token_acc": 0.5275590551181102, "train_speed(iter/s)": 1.438068 }, { "epoch": 3.7487682618568186, "grad_norm": 6.36323356628418, "learning_rate": 1.4672033868028907e-05, "loss": 2.4278650283813477, "memory(GiB)": 77.56, "step": 87500, "token_acc": 0.4948805460750853, "train_speed(iter/s)": 1.438077 }, { "epoch": 3.7487682618568186, "eval_loss": 2.2121448516845703, "eval_runtime": 14.2801, "eval_samples_per_second": 7.003, "eval_steps_per_second": 7.003, "eval_token_acc": 0.49159663865546216, "step": 87500 }, { "epoch": 3.7489824771860674, "grad_norm": 5.867633819580078, "learning_rate": 1.4667271838464303e-05, "loss": 2.487856864929199, "memory(GiB)": 77.56, "step": 87505, "token_acc": 0.4824824824824825, "train_speed(iter/s)": 1.437726 }, { "epoch": 3.7491966925153166, "grad_norm": 5.005818843841553, "learning_rate": 1.4662510448985234e-05, "loss": 2.4117219924926756, "memory(GiB)": 77.56, "step": 87510, "token_acc": 0.44765342960288806, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.7494109078445654, "grad_norm": 7.557612419128418, "learning_rate": 1.4657749699677937e-05, "loss": 2.1620864868164062, "memory(GiB)": 77.56, "step": 87515, "token_acc": 0.46715328467153283, "train_speed(iter/s)": 1.437721 }, { "epoch": 3.7496251231738142, "grad_norm": 5.773110866546631, "learning_rate": 1.4652989590628658e-05, "loss": 2.3477474212646485, "memory(GiB)": 77.56, "step": 87520, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.437739 }, { "epoch": 3.7498393385030635, "grad_norm": 6.7726006507873535, "learning_rate": 1.4648230121923629e-05, "loss": 2.340291404724121, "memory(GiB)": 77.56, "step": 87525, "token_acc": 0.5203252032520326, "train_speed(iter/s)": 1.437758 }, { "epoch": 3.7500535538323123, "grad_norm": 5.539238929748535, "learning_rate": 1.4643471293649059e-05, "loss": 2.4184734344482424, "memory(GiB)": 77.56, "step": 87530, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.750267769161561, "grad_norm": 6.12601900100708, "learning_rate": 1.4638713105891188e-05, "loss": 2.4049610137939452, "memory(GiB)": 77.56, "step": 87535, "token_acc": 0.5032467532467533, "train_speed(iter/s)": 1.437781 }, { "epoch": 3.7504819844908104, "grad_norm": 5.433905601501465, "learning_rate": 1.4633955558736201e-05, "loss": 2.3850824356079103, "memory(GiB)": 77.56, "step": 87540, "token_acc": 0.47079037800687284, "train_speed(iter/s)": 1.437799 }, { "epoch": 3.750696199820059, "grad_norm": 8.146924018859863, "learning_rate": 1.4629198652270288e-05, "loss": 2.3787258148193358, "memory(GiB)": 77.56, "step": 87545, "token_acc": 0.4983388704318937, "train_speed(iter/s)": 1.437794 }, { "epoch": 3.750910415149308, "grad_norm": 5.818040370941162, "learning_rate": 1.4624442386579601e-05, "loss": 2.308137893676758, "memory(GiB)": 77.56, "step": 87550, "token_acc": 0.5331491712707183, "train_speed(iter/s)": 1.437801 }, { "epoch": 3.7511246304785573, "grad_norm": 6.6491780281066895, "learning_rate": 1.4619686761750345e-05, "loss": 2.322003936767578, "memory(GiB)": 77.56, "step": 87555, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.751338845807806, "grad_norm": 7.675992965698242, "learning_rate": 1.4614931777868634e-05, "loss": 2.6120983123779298, "memory(GiB)": 77.56, "step": 87560, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.437825 }, { "epoch": 3.751553061137055, "grad_norm": 6.975713729858398, "learning_rate": 1.4610177435020645e-05, "loss": 2.3559783935546874, "memory(GiB)": 77.56, "step": 87565, "token_acc": 0.48132780082987553, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.751767276466304, "grad_norm": 4.986622333526611, "learning_rate": 1.4605423733292494e-05, "loss": 2.052254295349121, "memory(GiB)": 77.56, "step": 87570, "token_acc": 0.5254777070063694, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.751981491795553, "grad_norm": 5.038890838623047, "learning_rate": 1.460067067277029e-05, "loss": 2.4341768264770507, "memory(GiB)": 77.56, "step": 87575, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.7521957071248018, "grad_norm": 5.075716018676758, "learning_rate": 1.4595918253540147e-05, "loss": 2.3425437927246096, "memory(GiB)": 77.56, "step": 87580, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.437835 }, { "epoch": 3.752409922454051, "grad_norm": 6.498806953430176, "learning_rate": 1.459116647568815e-05, "loss": 2.683580207824707, "memory(GiB)": 77.56, "step": 87585, "token_acc": 0.4840989399293286, "train_speed(iter/s)": 1.437836 }, { "epoch": 3.7526241377833, "grad_norm": 5.05894136428833, "learning_rate": 1.458641533930038e-05, "loss": 2.3216506958007814, "memory(GiB)": 77.56, "step": 87590, "token_acc": 0.5089605734767025, "train_speed(iter/s)": 1.437835 }, { "epoch": 3.7528383531125487, "grad_norm": 4.817282676696777, "learning_rate": 1.4581664844462929e-05, "loss": 2.2896682739257814, "memory(GiB)": 77.56, "step": 87595, "token_acc": 0.5276073619631901, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.753052568441798, "grad_norm": 5.829711437225342, "learning_rate": 1.4576914991261848e-05, "loss": 2.559900093078613, "memory(GiB)": 77.56, "step": 87600, "token_acc": 0.5046439628482973, "train_speed(iter/s)": 1.437854 }, { "epoch": 3.7532667837710467, "grad_norm": 4.7632527351379395, "learning_rate": 1.4572165779783176e-05, "loss": 2.05426082611084, "memory(GiB)": 77.56, "step": 87605, "token_acc": 0.5518672199170125, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.7534809991002955, "grad_norm": 7.520813465118408, "learning_rate": 1.456741721011296e-05, "loss": 2.5381019592285154, "memory(GiB)": 77.56, "step": 87610, "token_acc": 0.432258064516129, "train_speed(iter/s)": 1.437828 }, { "epoch": 3.753695214429545, "grad_norm": 5.405460357666016, "learning_rate": 1.4562669282337198e-05, "loss": 2.052898406982422, "memory(GiB)": 77.56, "step": 87615, "token_acc": 0.5346153846153846, "train_speed(iter/s)": 1.437846 }, { "epoch": 3.7539094297587936, "grad_norm": 5.803069114685059, "learning_rate": 1.4557921996541946e-05, "loss": 2.5014766693115233, "memory(GiB)": 77.56, "step": 87620, "token_acc": 0.45592705167173253, "train_speed(iter/s)": 1.437866 }, { "epoch": 3.7541236450880424, "grad_norm": 5.596368312835693, "learning_rate": 1.455317535281317e-05, "loss": 2.3184619903564454, "memory(GiB)": 77.56, "step": 87625, "token_acc": 0.512280701754386, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.7543378604172917, "grad_norm": 6.604919910430908, "learning_rate": 1.4548429351236886e-05, "loss": 2.2990365982055665, "memory(GiB)": 77.56, "step": 87630, "token_acc": 0.525096525096525, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.7545520757465405, "grad_norm": 6.015281677246094, "learning_rate": 1.454368399189906e-05, "loss": 2.173585891723633, "memory(GiB)": 77.56, "step": 87635, "token_acc": 0.4948453608247423, "train_speed(iter/s)": 1.43789 }, { "epoch": 3.7547662910757893, "grad_norm": 6.053415775299072, "learning_rate": 1.4538939274885665e-05, "loss": 2.6174060821533205, "memory(GiB)": 77.56, "step": 87640, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.437882 }, { "epoch": 3.7549805064050386, "grad_norm": 6.9501051902771, "learning_rate": 1.4534195200282646e-05, "loss": 2.0804874420166017, "memory(GiB)": 77.56, "step": 87645, "token_acc": 0.5176991150442478, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.7551947217342874, "grad_norm": 5.198156356811523, "learning_rate": 1.4529451768175933e-05, "loss": 2.2638015747070312, "memory(GiB)": 77.56, "step": 87650, "token_acc": 0.5221238938053098, "train_speed(iter/s)": 1.437896 }, { "epoch": 3.755408937063536, "grad_norm": 4.92119836807251, "learning_rate": 1.4524708978651491e-05, "loss": 2.3373477935791014, "memory(GiB)": 77.56, "step": 87655, "token_acc": 0.5, "train_speed(iter/s)": 1.437899 }, { "epoch": 3.7556231523927854, "grad_norm": 7.089959621429443, "learning_rate": 1.4519966831795228e-05, "loss": 2.260489845275879, "memory(GiB)": 77.56, "step": 87660, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437918 }, { "epoch": 3.7558373677220342, "grad_norm": 5.744592189788818, "learning_rate": 1.4515225327693049e-05, "loss": 2.413991928100586, "memory(GiB)": 77.56, "step": 87665, "token_acc": 0.4691780821917808, "train_speed(iter/s)": 1.437908 }, { "epoch": 3.756051583051283, "grad_norm": 7.479886054992676, "learning_rate": 1.4510484466430846e-05, "loss": 2.0419031143188477, "memory(GiB)": 77.56, "step": 87670, "token_acc": 0.5051194539249146, "train_speed(iter/s)": 1.437899 }, { "epoch": 3.7562657983805323, "grad_norm": 6.407444953918457, "learning_rate": 1.45057442480945e-05, "loss": 2.071546936035156, "memory(GiB)": 77.56, "step": 87675, "token_acc": 0.5359712230215827, "train_speed(iter/s)": 1.43791 }, { "epoch": 3.756480013709781, "grad_norm": 8.050786972045898, "learning_rate": 1.4501004672769903e-05, "loss": 2.2287151336669924, "memory(GiB)": 77.56, "step": 87680, "token_acc": 0.5112540192926045, "train_speed(iter/s)": 1.437922 }, { "epoch": 3.75669422903903, "grad_norm": 7.456993579864502, "learning_rate": 1.4496265740542908e-05, "loss": 2.198100280761719, "memory(GiB)": 77.56, "step": 87685, "token_acc": 0.5625, "train_speed(iter/s)": 1.437923 }, { "epoch": 3.756908444368279, "grad_norm": 5.820798873901367, "learning_rate": 1.4491527451499365e-05, "loss": 2.502303123474121, "memory(GiB)": 77.56, "step": 87690, "token_acc": 0.48698884758364314, "train_speed(iter/s)": 1.437923 }, { "epoch": 3.757122659697528, "grad_norm": 5.178523540496826, "learning_rate": 1.448678980572511e-05, "loss": 2.2209280014038084, "memory(GiB)": 77.56, "step": 87695, "token_acc": 0.5325077399380805, "train_speed(iter/s)": 1.43794 }, { "epoch": 3.757336875026777, "grad_norm": 6.158017158508301, "learning_rate": 1.4482052803305962e-05, "loss": 2.2875513076782226, "memory(GiB)": 77.56, "step": 87700, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 1.437954 }, { "epoch": 3.757551090356026, "grad_norm": 10.388846397399902, "learning_rate": 1.4477316444327738e-05, "loss": 2.5663766860961914, "memory(GiB)": 77.56, "step": 87705, "token_acc": 0.4813664596273292, "train_speed(iter/s)": 1.43798 }, { "epoch": 3.757765305685275, "grad_norm": 5.621857166290283, "learning_rate": 1.4472580728876272e-05, "loss": 2.128612518310547, "memory(GiB)": 77.56, "step": 87710, "token_acc": 0.5427509293680297, "train_speed(iter/s)": 1.437983 }, { "epoch": 3.7579795210145237, "grad_norm": 5.60006046295166, "learning_rate": 1.4467845657037332e-05, "loss": 2.4085105895996093, "memory(GiB)": 77.56, "step": 87715, "token_acc": 0.46204620462046203, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.758193736343773, "grad_norm": 7.108492851257324, "learning_rate": 1.4463111228896697e-05, "loss": 2.2842512130737305, "memory(GiB)": 77.56, "step": 87720, "token_acc": 0.49416342412451364, "train_speed(iter/s)": 1.437979 }, { "epoch": 3.7584079516730218, "grad_norm": 5.585237979888916, "learning_rate": 1.4458377444540139e-05, "loss": 2.5472055435180665, "memory(GiB)": 77.56, "step": 87725, "token_acc": 0.48589341692789967, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.7586221670022706, "grad_norm": 7.364243507385254, "learning_rate": 1.4453644304053415e-05, "loss": 2.332600402832031, "memory(GiB)": 77.56, "step": 87730, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 1.437988 }, { "epoch": 3.75883638233152, "grad_norm": 5.167562961578369, "learning_rate": 1.4448911807522253e-05, "loss": 2.5560848236083986, "memory(GiB)": 77.56, "step": 87735, "token_acc": 0.46875, "train_speed(iter/s)": 1.437993 }, { "epoch": 3.7590505976607687, "grad_norm": 6.318556308746338, "learning_rate": 1.4444179955032422e-05, "loss": 2.283896636962891, "memory(GiB)": 77.56, "step": 87740, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.437996 }, { "epoch": 3.7592648129900175, "grad_norm": 6.268924713134766, "learning_rate": 1.443944874666962e-05, "loss": 2.2712738037109377, "memory(GiB)": 77.56, "step": 87745, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 1.437999 }, { "epoch": 3.7594790283192667, "grad_norm": 4.683320045471191, "learning_rate": 1.443471818251957e-05, "loss": 2.562276077270508, "memory(GiB)": 77.56, "step": 87750, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.438008 }, { "epoch": 3.7596932436485155, "grad_norm": 6.328271865844727, "learning_rate": 1.4429988262667959e-05, "loss": 2.044296646118164, "memory(GiB)": 77.56, "step": 87755, "token_acc": 0.5521885521885522, "train_speed(iter/s)": 1.438018 }, { "epoch": 3.7599074589777643, "grad_norm": 4.791431903839111, "learning_rate": 1.4425258987200463e-05, "loss": 2.4454938888549806, "memory(GiB)": 77.56, "step": 87760, "token_acc": 0.5, "train_speed(iter/s)": 1.437996 }, { "epoch": 3.7601216743070136, "grad_norm": 5.1604437828063965, "learning_rate": 1.4420530356202783e-05, "loss": 2.459474563598633, "memory(GiB)": 77.56, "step": 87765, "token_acc": 0.5032051282051282, "train_speed(iter/s)": 1.437973 }, { "epoch": 3.7603358896362624, "grad_norm": 5.239433765411377, "learning_rate": 1.4415802369760562e-05, "loss": 2.512373352050781, "memory(GiB)": 77.56, "step": 87770, "token_acc": 0.44, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.7605501049655112, "grad_norm": 5.4469075202941895, "learning_rate": 1.4411075027959475e-05, "loss": 2.4335300445556642, "memory(GiB)": 77.56, "step": 87775, "token_acc": 0.4260355029585799, "train_speed(iter/s)": 1.43801 }, { "epoch": 3.7607643202947605, "grad_norm": 5.976019859313965, "learning_rate": 1.440634833088515e-05, "loss": 2.391189384460449, "memory(GiB)": 77.56, "step": 87780, "token_acc": 0.5182724252491694, "train_speed(iter/s)": 1.438032 }, { "epoch": 3.7609785356240093, "grad_norm": 7.111571788787842, "learning_rate": 1.4401622278623217e-05, "loss": 2.307042694091797, "memory(GiB)": 77.56, "step": 87785, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.438038 }, { "epoch": 3.761192750953258, "grad_norm": 7.658970832824707, "learning_rate": 1.4396896871259286e-05, "loss": 2.1998964309692384, "memory(GiB)": 77.56, "step": 87790, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.438025 }, { "epoch": 3.7614069662825074, "grad_norm": 5.72540283203125, "learning_rate": 1.4392172108878954e-05, "loss": 2.263543891906738, "memory(GiB)": 77.56, "step": 87795, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.438016 }, { "epoch": 3.761621181611756, "grad_norm": 6.018642902374268, "learning_rate": 1.4387447991567838e-05, "loss": 2.223993110656738, "memory(GiB)": 77.56, "step": 87800, "token_acc": 0.49508196721311476, "train_speed(iter/s)": 1.438006 }, { "epoch": 3.761835396941005, "grad_norm": 5.77709436416626, "learning_rate": 1.438272451941151e-05, "loss": 2.0517341613769533, "memory(GiB)": 77.56, "step": 87805, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.438012 }, { "epoch": 3.7620496122702543, "grad_norm": 7.062778472900391, "learning_rate": 1.4378001692495546e-05, "loss": 2.3862701416015626, "memory(GiB)": 77.56, "step": 87810, "token_acc": 0.5347432024169184, "train_speed(iter/s)": 1.438009 }, { "epoch": 3.762263827599503, "grad_norm": 5.769493103027344, "learning_rate": 1.437327951090549e-05, "loss": 2.4958463668823243, "memory(GiB)": 77.56, "step": 87815, "token_acc": 0.46511627906976744, "train_speed(iter/s)": 1.438015 }, { "epoch": 3.762478042928752, "grad_norm": 7.23477840423584, "learning_rate": 1.4368557974726882e-05, "loss": 2.6645326614379883, "memory(GiB)": 77.56, "step": 87820, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.438029 }, { "epoch": 3.762692258258001, "grad_norm": 6.248107433319092, "learning_rate": 1.436383708404529e-05, "loss": 2.606052017211914, "memory(GiB)": 77.56, "step": 87825, "token_acc": 0.45565749235474007, "train_speed(iter/s)": 1.438031 }, { "epoch": 3.76290647358725, "grad_norm": 4.963569641113281, "learning_rate": 1.4359116838946219e-05, "loss": 2.334998893737793, "memory(GiB)": 77.56, "step": 87830, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 1.43805 }, { "epoch": 3.7631206889164988, "grad_norm": 7.157711029052734, "learning_rate": 1.4354397239515177e-05, "loss": 2.630050468444824, "memory(GiB)": 77.56, "step": 87835, "token_acc": 0.45808383233532934, "train_speed(iter/s)": 1.438045 }, { "epoch": 3.763334904245748, "grad_norm": 5.773626327514648, "learning_rate": 1.434967828583767e-05, "loss": 2.3394601821899412, "memory(GiB)": 77.56, "step": 87840, "token_acc": 0.5028735632183908, "train_speed(iter/s)": 1.438056 }, { "epoch": 3.763549119574997, "grad_norm": 6.297607898712158, "learning_rate": 1.4344959977999162e-05, "loss": 2.4435955047607423, "memory(GiB)": 77.56, "step": 87845, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.438065 }, { "epoch": 3.7637633349042456, "grad_norm": 5.704437732696533, "learning_rate": 1.4340242316085156e-05, "loss": 2.295665168762207, "memory(GiB)": 77.56, "step": 87850, "token_acc": 0.5428571428571428, "train_speed(iter/s)": 1.438082 }, { "epoch": 3.763977550233495, "grad_norm": 4.977346897125244, "learning_rate": 1.433552530018113e-05, "loss": 2.2148517608642577, "memory(GiB)": 77.56, "step": 87855, "token_acc": 0.5225225225225225, "train_speed(iter/s)": 1.438096 }, { "epoch": 3.7641917655627437, "grad_norm": 5.197224140167236, "learning_rate": 1.4330808930372514e-05, "loss": 2.2645254135131836, "memory(GiB)": 77.56, "step": 87860, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.438095 }, { "epoch": 3.7644059808919925, "grad_norm": 7.71748161315918, "learning_rate": 1.4326093206744756e-05, "loss": 2.5463199615478516, "memory(GiB)": 77.56, "step": 87865, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.438092 }, { "epoch": 3.7646201962212418, "grad_norm": 8.733891487121582, "learning_rate": 1.432137812938329e-05, "loss": 2.3446420669555663, "memory(GiB)": 77.56, "step": 87870, "token_acc": 0.5114155251141552, "train_speed(iter/s)": 1.438111 }, { "epoch": 3.7648344115504906, "grad_norm": 5.362898826599121, "learning_rate": 1.4316663698373522e-05, "loss": 2.435167694091797, "memory(GiB)": 77.56, "step": 87875, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.438105 }, { "epoch": 3.7650486268797394, "grad_norm": 5.609925270080566, "learning_rate": 1.4311949913800848e-05, "loss": 2.3758773803710938, "memory(GiB)": 77.56, "step": 87880, "token_acc": 0.48125, "train_speed(iter/s)": 1.438113 }, { "epoch": 3.7652628422089887, "grad_norm": 7.326563358306885, "learning_rate": 1.4307236775750699e-05, "loss": 2.249185562133789, "memory(GiB)": 77.56, "step": 87885, "token_acc": 0.5126582278481012, "train_speed(iter/s)": 1.438103 }, { "epoch": 3.7654770575382375, "grad_norm": 6.238636016845703, "learning_rate": 1.4302524284308438e-05, "loss": 2.2153274536132814, "memory(GiB)": 77.56, "step": 87890, "token_acc": 0.5344827586206896, "train_speed(iter/s)": 1.438101 }, { "epoch": 3.7656912728674863, "grad_norm": 4.449050426483154, "learning_rate": 1.4297812439559439e-05, "loss": 2.5964075088500977, "memory(GiB)": 77.56, "step": 87895, "token_acc": 0.46, "train_speed(iter/s)": 1.438109 }, { "epoch": 3.7659054881967355, "grad_norm": 6.41837215423584, "learning_rate": 1.4293101241589051e-05, "loss": 2.1801387786865236, "memory(GiB)": 77.56, "step": 87900, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.438091 }, { "epoch": 3.7661197035259844, "grad_norm": 5.493099212646484, "learning_rate": 1.4288390690482622e-05, "loss": 2.3284868240356444, "memory(GiB)": 77.56, "step": 87905, "token_acc": 0.5111821086261981, "train_speed(iter/s)": 1.438085 }, { "epoch": 3.766333918855233, "grad_norm": 6.2421674728393555, "learning_rate": 1.428368078632551e-05, "loss": 2.0088367462158203, "memory(GiB)": 77.56, "step": 87910, "token_acc": 0.5704697986577181, "train_speed(iter/s)": 1.438072 }, { "epoch": 3.7665481341844824, "grad_norm": 6.400420188903809, "learning_rate": 1.4278971529203023e-05, "loss": 2.339704895019531, "memory(GiB)": 77.56, "step": 87915, "token_acc": 0.5, "train_speed(iter/s)": 1.43808 }, { "epoch": 3.7667623495137312, "grad_norm": 5.7013115882873535, "learning_rate": 1.427426291920046e-05, "loss": 2.461294937133789, "memory(GiB)": 77.56, "step": 87920, "token_acc": 0.49491525423728816, "train_speed(iter/s)": 1.438065 }, { "epoch": 3.76697656484298, "grad_norm": 5.306664943695068, "learning_rate": 1.4269554956403153e-05, "loss": 2.3485349655151366, "memory(GiB)": 77.56, "step": 87925, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.438071 }, { "epoch": 3.7671907801722293, "grad_norm": 6.84415340423584, "learning_rate": 1.4264847640896378e-05, "loss": 2.546147346496582, "memory(GiB)": 77.56, "step": 87930, "token_acc": 0.47761194029850745, "train_speed(iter/s)": 1.438084 }, { "epoch": 3.767404995501478, "grad_norm": 6.763495445251465, "learning_rate": 1.4260140972765407e-05, "loss": 2.5340614318847656, "memory(GiB)": 77.56, "step": 87935, "token_acc": 0.46689895470383275, "train_speed(iter/s)": 1.438093 }, { "epoch": 3.767619210830727, "grad_norm": 5.658900260925293, "learning_rate": 1.4255434952095498e-05, "loss": 2.629825210571289, "memory(GiB)": 77.56, "step": 87940, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.438097 }, { "epoch": 3.767833426159976, "grad_norm": 4.1563801765441895, "learning_rate": 1.4250729578971927e-05, "loss": 2.2303306579589846, "memory(GiB)": 77.56, "step": 87945, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.438102 }, { "epoch": 3.768047641489225, "grad_norm": 8.799057960510254, "learning_rate": 1.4246024853479928e-05, "loss": 2.4237850189208983, "memory(GiB)": 77.56, "step": 87950, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 1.438105 }, { "epoch": 3.768261856818474, "grad_norm": 4.2870988845825195, "learning_rate": 1.4241320775704725e-05, "loss": 2.3504398345947264, "memory(GiB)": 77.56, "step": 87955, "token_acc": 0.4937888198757764, "train_speed(iter/s)": 1.438111 }, { "epoch": 3.768476072147723, "grad_norm": 7.406762599945068, "learning_rate": 1.4236617345731546e-05, "loss": 2.437298393249512, "memory(GiB)": 77.56, "step": 87960, "token_acc": 0.5194805194805194, "train_speed(iter/s)": 1.438132 }, { "epoch": 3.768690287476972, "grad_norm": 6.2104597091674805, "learning_rate": 1.4231914563645576e-05, "loss": 2.2796226501464845, "memory(GiB)": 77.56, "step": 87965, "token_acc": 0.5352697095435685, "train_speed(iter/s)": 1.438133 }, { "epoch": 3.7689045028062207, "grad_norm": 5.935835361480713, "learning_rate": 1.4227212429532038e-05, "loss": 2.283091926574707, "memory(GiB)": 77.56, "step": 87970, "token_acc": 0.5296167247386759, "train_speed(iter/s)": 1.438121 }, { "epoch": 3.76911871813547, "grad_norm": 5.743374347686768, "learning_rate": 1.4222510943476109e-05, "loss": 2.2220298767089846, "memory(GiB)": 77.56, "step": 87975, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.438121 }, { "epoch": 3.7693329334647188, "grad_norm": 5.496264934539795, "learning_rate": 1.4217810105562957e-05, "loss": 2.2633298873901366, "memory(GiB)": 77.56, "step": 87980, "token_acc": 0.5345911949685535, "train_speed(iter/s)": 1.43812 }, { "epoch": 3.7695471487939676, "grad_norm": 6.349199295043945, "learning_rate": 1.4213109915877736e-05, "loss": 2.056529426574707, "memory(GiB)": 77.56, "step": 87985, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.438105 }, { "epoch": 3.769761364123217, "grad_norm": 4.894119739532471, "learning_rate": 1.4208410374505587e-05, "loss": 2.4239370346069338, "memory(GiB)": 77.56, "step": 87990, "token_acc": 0.5160142348754448, "train_speed(iter/s)": 1.438103 }, { "epoch": 3.7699755794524656, "grad_norm": 5.657999038696289, "learning_rate": 1.4203711481531662e-05, "loss": 2.5051578521728515, "memory(GiB)": 77.56, "step": 87995, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.438118 }, { "epoch": 3.7701897947817145, "grad_norm": 6.166049957275391, "learning_rate": 1.4199013237041093e-05, "loss": 2.047438049316406, "memory(GiB)": 77.56, "step": 88000, "token_acc": 0.5769230769230769, "train_speed(iter/s)": 1.438126 }, { "epoch": 3.7701897947817145, "eval_loss": 2.046614646911621, "eval_runtime": 14.8782, "eval_samples_per_second": 6.721, "eval_steps_per_second": 6.721, "eval_token_acc": 0.4853146853146853, "step": 88000 }, { "epoch": 3.7704040101109637, "grad_norm": 6.574954032897949, "learning_rate": 1.4194315641118989e-05, "loss": 2.451774787902832, "memory(GiB)": 77.56, "step": 88005, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.7706182254402125, "grad_norm": 5.201181888580322, "learning_rate": 1.4189618693850444e-05, "loss": 2.38657169342041, "memory(GiB)": 77.56, "step": 88010, "token_acc": 0.5119047619047619, "train_speed(iter/s)": 1.437777 }, { "epoch": 3.7708324407694613, "grad_norm": 4.677194118499756, "learning_rate": 1.4184922395320543e-05, "loss": 2.497384452819824, "memory(GiB)": 77.56, "step": 88015, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.7710466560987106, "grad_norm": 7.468648910522461, "learning_rate": 1.4180226745614379e-05, "loss": 2.3117725372314455, "memory(GiB)": 77.56, "step": 88020, "token_acc": 0.5095541401273885, "train_speed(iter/s)": 1.437771 }, { "epoch": 3.7712608714279594, "grad_norm": 6.285245895385742, "learning_rate": 1.4175531744816989e-05, "loss": 2.35408992767334, "memory(GiB)": 77.56, "step": 88025, "token_acc": 0.493006993006993, "train_speed(iter/s)": 1.437794 }, { "epoch": 3.771475086757208, "grad_norm": 6.402484893798828, "learning_rate": 1.4170837393013465e-05, "loss": 2.3732528686523438, "memory(GiB)": 77.56, "step": 88030, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.437807 }, { "epoch": 3.7716893020864575, "grad_norm": 5.333975315093994, "learning_rate": 1.416614369028883e-05, "loss": 2.1019189834594725, "memory(GiB)": 77.56, "step": 88035, "token_acc": 0.5189873417721519, "train_speed(iter/s)": 1.437818 }, { "epoch": 3.7719035174157063, "grad_norm": 5.57673978805542, "learning_rate": 1.4161450636728119e-05, "loss": 2.4872100830078123, "memory(GiB)": 77.56, "step": 88040, "token_acc": 0.48405797101449277, "train_speed(iter/s)": 1.43782 }, { "epoch": 3.772117732744955, "grad_norm": 5.9250168800354, "learning_rate": 1.4156758232416345e-05, "loss": 2.40474910736084, "memory(GiB)": 77.56, "step": 88045, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.437815 }, { "epoch": 3.7723319480742044, "grad_norm": 5.628423690795898, "learning_rate": 1.4152066477438508e-05, "loss": 2.221597099304199, "memory(GiB)": 77.56, "step": 88050, "token_acc": 0.5175718849840255, "train_speed(iter/s)": 1.437821 }, { "epoch": 3.772546163403453, "grad_norm": 9.546796798706055, "learning_rate": 1.4147375371879628e-05, "loss": 2.32208137512207, "memory(GiB)": 77.56, "step": 88055, "token_acc": 0.49814126394052044, "train_speed(iter/s)": 1.437816 }, { "epoch": 3.772760378732702, "grad_norm": 5.79128360748291, "learning_rate": 1.4142684915824678e-05, "loss": 2.3078880310058594, "memory(GiB)": 77.56, "step": 88060, "token_acc": 0.5148148148148148, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.7729745940619512, "grad_norm": 6.553867816925049, "learning_rate": 1.413799510935861e-05, "loss": 2.237506866455078, "memory(GiB)": 77.56, "step": 88065, "token_acc": 0.5245901639344263, "train_speed(iter/s)": 1.437832 }, { "epoch": 3.7731888093912, "grad_norm": 7.1847662925720215, "learning_rate": 1.4133305952566416e-05, "loss": 2.5017158508300783, "memory(GiB)": 77.56, "step": 88070, "token_acc": 0.5145985401459854, "train_speed(iter/s)": 1.437852 }, { "epoch": 3.773403024720449, "grad_norm": 5.464165687561035, "learning_rate": 1.4128617445533037e-05, "loss": 2.3889543533325197, "memory(GiB)": 77.56, "step": 88075, "token_acc": 0.5503875968992248, "train_speed(iter/s)": 1.437869 }, { "epoch": 3.773617240049698, "grad_norm": 5.415406227111816, "learning_rate": 1.4123929588343398e-05, "loss": 2.1404388427734373, "memory(GiB)": 77.56, "step": 88080, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.437864 }, { "epoch": 3.773831455378947, "grad_norm": 7.293788433074951, "learning_rate": 1.4119242381082415e-05, "loss": 2.301029014587402, "memory(GiB)": 77.56, "step": 88085, "token_acc": 0.5402298850574713, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.7740456707081957, "grad_norm": 7.515618324279785, "learning_rate": 1.411455582383503e-05, "loss": 2.2081398010253905, "memory(GiB)": 77.56, "step": 88090, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.43787 }, { "epoch": 3.774259886037445, "grad_norm": 6.695063591003418, "learning_rate": 1.4109869916686125e-05, "loss": 2.241544723510742, "memory(GiB)": 77.56, "step": 88095, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.43787 }, { "epoch": 3.774474101366694, "grad_norm": 4.579066753387451, "learning_rate": 1.41051846597206e-05, "loss": 2.43978214263916, "memory(GiB)": 77.56, "step": 88100, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.437888 }, { "epoch": 3.7746883166959426, "grad_norm": 10.251058578491211, "learning_rate": 1.4100500053023324e-05, "loss": 2.345491409301758, "memory(GiB)": 77.56, "step": 88105, "token_acc": 0.5241935483870968, "train_speed(iter/s)": 1.437908 }, { "epoch": 3.774902532025192, "grad_norm": 5.597171306610107, "learning_rate": 1.4095816096679155e-05, "loss": 2.4285993576049805, "memory(GiB)": 77.56, "step": 88110, "token_acc": 0.46607669616519176, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.7751167473544407, "grad_norm": 5.961618423461914, "learning_rate": 1.409113279077297e-05, "loss": 2.603607940673828, "memory(GiB)": 77.56, "step": 88115, "token_acc": 0.44086021505376344, "train_speed(iter/s)": 1.437856 }, { "epoch": 3.7753309626836895, "grad_norm": 5.908853054046631, "learning_rate": 1.40864501353896e-05, "loss": 2.3162328720092775, "memory(GiB)": 77.56, "step": 88120, "token_acc": 0.46179401993355484, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.7755451780129388, "grad_norm": 5.879225254058838, "learning_rate": 1.4081768130613877e-05, "loss": 2.304482650756836, "memory(GiB)": 77.56, "step": 88125, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.43788 }, { "epoch": 3.7757593933421876, "grad_norm": 6.7951436042785645, "learning_rate": 1.4077086776530618e-05, "loss": 2.4009750366210936, "memory(GiB)": 77.56, "step": 88130, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.7759736086714364, "grad_norm": 5.3484086990356445, "learning_rate": 1.4072406073224608e-05, "loss": 2.2523502349853515, "memory(GiB)": 77.56, "step": 88135, "token_acc": 0.5014925373134328, "train_speed(iter/s)": 1.437822 }, { "epoch": 3.7761878240006856, "grad_norm": 4.904800891876221, "learning_rate": 1.4067726020780675e-05, "loss": 2.3921941757202148, "memory(GiB)": 77.56, "step": 88140, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.437836 }, { "epoch": 3.7764020393299345, "grad_norm": 5.1627349853515625, "learning_rate": 1.4063046619283604e-05, "loss": 2.3879465103149413, "memory(GiB)": 77.56, "step": 88145, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 1.437838 }, { "epoch": 3.7766162546591833, "grad_norm": 6.091917514801025, "learning_rate": 1.4058367868818156e-05, "loss": 2.4942670822143556, "memory(GiB)": 77.56, "step": 88150, "token_acc": 0.4634920634920635, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.7768304699884325, "grad_norm": 6.140024662017822, "learning_rate": 1.4053689769469085e-05, "loss": 2.0714902877807617, "memory(GiB)": 77.56, "step": 88155, "token_acc": 0.5246478873239436, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.7770446853176813, "grad_norm": 6.162842750549316, "learning_rate": 1.4049012321321147e-05, "loss": 2.60401611328125, "memory(GiB)": 77.56, "step": 88160, "token_acc": 0.479020979020979, "train_speed(iter/s)": 1.437856 }, { "epoch": 3.77725890064693, "grad_norm": 6.035883903503418, "learning_rate": 1.4044335524459078e-05, "loss": 2.2452037811279295, "memory(GiB)": 77.56, "step": 88165, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.437876 }, { "epoch": 3.7774731159761794, "grad_norm": 5.5866193771362305, "learning_rate": 1.4039659378967584e-05, "loss": 2.4295591354370116, "memory(GiB)": 77.56, "step": 88170, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.777687331305428, "grad_norm": 7.191069602966309, "learning_rate": 1.4034983884931407e-05, "loss": 2.3771324157714844, "memory(GiB)": 77.56, "step": 88175, "token_acc": 0.5099601593625498, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.777901546634677, "grad_norm": 8.728489875793457, "learning_rate": 1.4030309042435236e-05, "loss": 2.326151466369629, "memory(GiB)": 77.56, "step": 88180, "token_acc": 0.506896551724138, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.7781157619639263, "grad_norm": 5.650343418121338, "learning_rate": 1.4025634851563762e-05, "loss": 2.416168975830078, "memory(GiB)": 77.56, "step": 88185, "token_acc": 0.4622093023255814, "train_speed(iter/s)": 1.437896 }, { "epoch": 3.778329977293175, "grad_norm": 5.094841957092285, "learning_rate": 1.4020961312401653e-05, "loss": 2.3069131851196287, "memory(GiB)": 77.56, "step": 88190, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.437893 }, { "epoch": 3.778544192622424, "grad_norm": 8.147745132446289, "learning_rate": 1.4016288425033574e-05, "loss": 2.1368404388427735, "memory(GiB)": 77.56, "step": 88195, "token_acc": 0.5117845117845118, "train_speed(iter/s)": 1.437881 }, { "epoch": 3.778758407951673, "grad_norm": 8.104848861694336, "learning_rate": 1.4011616189544197e-05, "loss": 2.40368709564209, "memory(GiB)": 77.56, "step": 88200, "token_acc": 0.4521072796934866, "train_speed(iter/s)": 1.437892 }, { "epoch": 3.778972623280922, "grad_norm": 7.374303817749023, "learning_rate": 1.4006944606018147e-05, "loss": 2.216788101196289, "memory(GiB)": 77.56, "step": 88205, "token_acc": 0.45016077170418006, "train_speed(iter/s)": 1.437878 }, { "epoch": 3.779186838610171, "grad_norm": 4.357128620147705, "learning_rate": 1.4002273674540067e-05, "loss": 2.1201887130737305, "memory(GiB)": 77.56, "step": 88210, "token_acc": 0.5645756457564576, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.77940105393942, "grad_norm": 5.161882400512695, "learning_rate": 1.3997603395194548e-05, "loss": 2.2386753082275392, "memory(GiB)": 77.56, "step": 88215, "token_acc": 0.5095057034220533, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.779615269268669, "grad_norm": 4.354639530181885, "learning_rate": 1.3992933768066235e-05, "loss": 2.0564577102661135, "memory(GiB)": 77.56, "step": 88220, "token_acc": 0.541095890410959, "train_speed(iter/s)": 1.43786 }, { "epoch": 3.7798294845979177, "grad_norm": 4.925902843475342, "learning_rate": 1.3988264793239702e-05, "loss": 2.4225868225097655, "memory(GiB)": 77.56, "step": 88225, "token_acc": 0.46745562130177515, "train_speed(iter/s)": 1.437869 }, { "epoch": 3.780043699927167, "grad_norm": 5.911545276641846, "learning_rate": 1.398359647079952e-05, "loss": 2.2908609390258787, "memory(GiB)": 77.56, "step": 88230, "token_acc": 0.5, "train_speed(iter/s)": 1.437875 }, { "epoch": 3.7802579152564157, "grad_norm": 4.6984381675720215, "learning_rate": 1.3978928800830288e-05, "loss": 2.4360822677612304, "memory(GiB)": 77.56, "step": 88235, "token_acc": 0.5042492917847026, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.7804721305856646, "grad_norm": 5.815092086791992, "learning_rate": 1.3974261783416554e-05, "loss": 2.421206474304199, "memory(GiB)": 77.56, "step": 88240, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.437901 }, { "epoch": 3.780686345914914, "grad_norm": 5.670464038848877, "learning_rate": 1.3969595418642862e-05, "loss": 2.2015037536621094, "memory(GiB)": 77.56, "step": 88245, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.7809005612441626, "grad_norm": 5.267561912536621, "learning_rate": 1.396492970659375e-05, "loss": 2.372382164001465, "memory(GiB)": 77.56, "step": 88250, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.437881 }, { "epoch": 3.7811147765734114, "grad_norm": 6.126773357391357, "learning_rate": 1.3960264647353721e-05, "loss": 2.099675941467285, "memory(GiB)": 77.56, "step": 88255, "token_acc": 0.5, "train_speed(iter/s)": 1.437875 }, { "epoch": 3.7813289919026607, "grad_norm": 5.632686138153076, "learning_rate": 1.3955600241007322e-05, "loss": 2.4065338134765626, "memory(GiB)": 77.56, "step": 88260, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.7815432072319095, "grad_norm": 6.635392189025879, "learning_rate": 1.3950936487639039e-05, "loss": 2.5109167098999023, "memory(GiB)": 77.56, "step": 88265, "token_acc": 0.47297297297297297, "train_speed(iter/s)": 1.437919 }, { "epoch": 3.7817574225611583, "grad_norm": 6.3631744384765625, "learning_rate": 1.3946273387333352e-05, "loss": 2.2108951568603517, "memory(GiB)": 77.56, "step": 88270, "token_acc": 0.5228215767634855, "train_speed(iter/s)": 1.437905 }, { "epoch": 3.7819716378904076, "grad_norm": 5.487986087799072, "learning_rate": 1.3941610940174748e-05, "loss": 2.2736156463623045, "memory(GiB)": 77.56, "step": 88275, "token_acc": 0.5179153094462541, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.7821858532196564, "grad_norm": 5.851731777191162, "learning_rate": 1.3936949146247675e-05, "loss": 2.303765869140625, "memory(GiB)": 77.56, "step": 88280, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 1.437905 }, { "epoch": 3.782400068548905, "grad_norm": 5.260596752166748, "learning_rate": 1.3932288005636607e-05, "loss": 2.221438407897949, "memory(GiB)": 77.56, "step": 88285, "token_acc": 0.5355805243445693, "train_speed(iter/s)": 1.437912 }, { "epoch": 3.7826142838781545, "grad_norm": 8.172385215759277, "learning_rate": 1.3927627518425967e-05, "loss": 2.1053960800170897, "memory(GiB)": 77.56, "step": 88290, "token_acc": 0.5342960288808665, "train_speed(iter/s)": 1.437918 }, { "epoch": 3.7828284992074033, "grad_norm": 6.352456569671631, "learning_rate": 1.3922967684700206e-05, "loss": 2.5014036178588865, "memory(GiB)": 77.56, "step": 88295, "token_acc": 0.5148148148148148, "train_speed(iter/s)": 1.437936 }, { "epoch": 3.783042714536652, "grad_norm": 6.75061559677124, "learning_rate": 1.3918308504543725e-05, "loss": 2.3760402679443358, "memory(GiB)": 77.56, "step": 88300, "token_acc": 0.5050167224080268, "train_speed(iter/s)": 1.437945 }, { "epoch": 3.7832569298659013, "grad_norm": 7.905178546905518, "learning_rate": 1.3913649978040939e-05, "loss": 2.3905990600585936, "memory(GiB)": 77.56, "step": 88305, "token_acc": 0.4831081081081081, "train_speed(iter/s)": 1.437921 }, { "epoch": 3.78347114519515, "grad_norm": 8.97487735748291, "learning_rate": 1.390899210527623e-05, "loss": 2.5207069396972654, "memory(GiB)": 77.56, "step": 88310, "token_acc": 0.44656488549618323, "train_speed(iter/s)": 1.437931 }, { "epoch": 3.783685360524399, "grad_norm": 4.976228713989258, "learning_rate": 1.3904334886333975e-05, "loss": 2.1794254302978517, "memory(GiB)": 77.56, "step": 88315, "token_acc": 0.5314465408805031, "train_speed(iter/s)": 1.437925 }, { "epoch": 3.7838995758536482, "grad_norm": 5.2152204513549805, "learning_rate": 1.3899678321298565e-05, "loss": 2.1674293518066405, "memory(GiB)": 77.56, "step": 88320, "token_acc": 0.5873015873015873, "train_speed(iter/s)": 1.437925 }, { "epoch": 3.784113791182897, "grad_norm": 5.7985382080078125, "learning_rate": 1.3895022410254348e-05, "loss": 2.329416275024414, "memory(GiB)": 77.56, "step": 88325, "token_acc": 0.5117845117845118, "train_speed(iter/s)": 1.437941 }, { "epoch": 3.784328006512146, "grad_norm": 4.425474643707275, "learning_rate": 1.3890367153285672e-05, "loss": 2.517572021484375, "memory(GiB)": 77.56, "step": 88330, "token_acc": 0.5168539325842697, "train_speed(iter/s)": 1.437966 }, { "epoch": 3.784542221841395, "grad_norm": 7.013037204742432, "learning_rate": 1.3885712550476864e-05, "loss": 2.269378662109375, "memory(GiB)": 77.56, "step": 88335, "token_acc": 0.5147540983606558, "train_speed(iter/s)": 1.437956 }, { "epoch": 3.784756437170644, "grad_norm": 4.694156169891357, "learning_rate": 1.388105860191224e-05, "loss": 2.3789909362792967, "memory(GiB)": 77.56, "step": 88340, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.437963 }, { "epoch": 3.7849706524998927, "grad_norm": 9.157238006591797, "learning_rate": 1.3876405307676133e-05, "loss": 2.2227705001831053, "memory(GiB)": 77.56, "step": 88345, "token_acc": 0.5450980392156862, "train_speed(iter/s)": 1.437961 }, { "epoch": 3.785184867829142, "grad_norm": 5.300726890563965, "learning_rate": 1.3871752667852833e-05, "loss": 2.2756267547607423, "memory(GiB)": 77.56, "step": 88350, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 1.437964 }, { "epoch": 3.785399083158391, "grad_norm": 7.77399206161499, "learning_rate": 1.3867100682526624e-05, "loss": 2.1836641311645506, "memory(GiB)": 77.56, "step": 88355, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.437932 }, { "epoch": 3.7856132984876396, "grad_norm": 6.013106346130371, "learning_rate": 1.386244935178178e-05, "loss": 2.4205345153808593, "memory(GiB)": 77.56, "step": 88360, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.437947 }, { "epoch": 3.785827513816889, "grad_norm": 6.620067119598389, "learning_rate": 1.3857798675702555e-05, "loss": 2.2716690063476563, "memory(GiB)": 77.56, "step": 88365, "token_acc": 0.5402298850574713, "train_speed(iter/s)": 1.43791 }, { "epoch": 3.7860417291461377, "grad_norm": 5.683618068695068, "learning_rate": 1.385314865437322e-05, "loss": 2.1879451751708983, "memory(GiB)": 77.56, "step": 88370, "token_acc": 0.5567765567765568, "train_speed(iter/s)": 1.437921 }, { "epoch": 3.7862559444753865, "grad_norm": 7.9503679275512695, "learning_rate": 1.3848499287877998e-05, "loss": 2.2505825042724608, "memory(GiB)": 77.56, "step": 88375, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.7864701598046357, "grad_norm": 8.003528594970703, "learning_rate": 1.3843850576301137e-05, "loss": 2.463016891479492, "memory(GiB)": 77.56, "step": 88380, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.437913 }, { "epoch": 3.7866843751338846, "grad_norm": 4.881189823150635, "learning_rate": 1.3839202519726835e-05, "loss": 2.2944822311401367, "memory(GiB)": 77.56, "step": 88385, "token_acc": 0.547945205479452, "train_speed(iter/s)": 1.437924 }, { "epoch": 3.7868985904631334, "grad_norm": 7.816769123077393, "learning_rate": 1.3834555118239306e-05, "loss": 2.4817783355712892, "memory(GiB)": 77.56, "step": 88390, "token_acc": 0.49480968858131485, "train_speed(iter/s)": 1.437934 }, { "epoch": 3.7871128057923826, "grad_norm": 6.9696221351623535, "learning_rate": 1.3829908371922734e-05, "loss": 2.532012176513672, "memory(GiB)": 77.56, "step": 88395, "token_acc": 0.5125786163522013, "train_speed(iter/s)": 1.437959 }, { "epoch": 3.7873270211216314, "grad_norm": 6.02717399597168, "learning_rate": 1.382526228086129e-05, "loss": 2.4970827102661133, "memory(GiB)": 77.56, "step": 88400, "token_acc": 0.4826388888888889, "train_speed(iter/s)": 1.437964 }, { "epoch": 3.7875412364508803, "grad_norm": 10.718117713928223, "learning_rate": 1.3820616845139168e-05, "loss": 2.0928085327148436, "memory(GiB)": 77.56, "step": 88405, "token_acc": 0.5300353356890459, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.7877554517801295, "grad_norm": 5.691068172454834, "learning_rate": 1.381597206484051e-05, "loss": 2.388553237915039, "memory(GiB)": 77.56, "step": 88410, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.7879696671093783, "grad_norm": 6.725213527679443, "learning_rate": 1.3811327940049462e-05, "loss": 2.387480926513672, "memory(GiB)": 77.56, "step": 88415, "token_acc": 0.5059760956175299, "train_speed(iter/s)": 1.437999 }, { "epoch": 3.788183882438627, "grad_norm": 6.097580432891846, "learning_rate": 1.3806684470850156e-05, "loss": 2.1837238311767577, "memory(GiB)": 77.56, "step": 88420, "token_acc": 0.5320754716981132, "train_speed(iter/s)": 1.438009 }, { "epoch": 3.7883980977678764, "grad_norm": 7.747653007507324, "learning_rate": 1.3802041657326698e-05, "loss": 2.3544782638549804, "memory(GiB)": 77.56, "step": 88425, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.437997 }, { "epoch": 3.788612313097125, "grad_norm": 6.273040294647217, "learning_rate": 1.3797399499563229e-05, "loss": 2.4027923583984374, "memory(GiB)": 77.56, "step": 88430, "token_acc": 0.4602649006622517, "train_speed(iter/s)": 1.438004 }, { "epoch": 3.788826528426374, "grad_norm": 5.949589729309082, "learning_rate": 1.3792757997643806e-05, "loss": 2.433192825317383, "memory(GiB)": 77.56, "step": 88435, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.438028 }, { "epoch": 3.7890407437556233, "grad_norm": 5.2899861335754395, "learning_rate": 1.3788117151652563e-05, "loss": 2.3401731491088866, "memory(GiB)": 77.56, "step": 88440, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.438032 }, { "epoch": 3.789254959084872, "grad_norm": 6.018672943115234, "learning_rate": 1.3783476961673541e-05, "loss": 2.3252368927001954, "memory(GiB)": 77.56, "step": 88445, "token_acc": 0.49544072948328266, "train_speed(iter/s)": 1.438048 }, { "epoch": 3.789469174414121, "grad_norm": 6.2637858390808105, "learning_rate": 1.3778837427790809e-05, "loss": 2.417571258544922, "memory(GiB)": 77.56, "step": 88450, "token_acc": 0.4671280276816609, "train_speed(iter/s)": 1.438051 }, { "epoch": 3.78968338974337, "grad_norm": 6.149107456207275, "learning_rate": 1.3774198550088413e-05, "loss": 2.4179168701171876, "memory(GiB)": 77.56, "step": 88455, "token_acc": 0.5160256410256411, "train_speed(iter/s)": 1.438068 }, { "epoch": 3.789897605072619, "grad_norm": 7.0848894119262695, "learning_rate": 1.3769560328650383e-05, "loss": 2.5141664505004884, "memory(GiB)": 77.56, "step": 88460, "token_acc": 0.4911660777385159, "train_speed(iter/s)": 1.438042 }, { "epoch": 3.790111820401868, "grad_norm": 5.412406921386719, "learning_rate": 1.3764922763560767e-05, "loss": 2.7039491653442385, "memory(GiB)": 77.56, "step": 88465, "token_acc": 0.46742209631728043, "train_speed(iter/s)": 1.438044 }, { "epoch": 3.790326035731117, "grad_norm": 8.049283027648926, "learning_rate": 1.3760285854903566e-05, "loss": 2.669767379760742, "memory(GiB)": 77.56, "step": 88470, "token_acc": 0.476038338658147, "train_speed(iter/s)": 1.438064 }, { "epoch": 3.790540251060366, "grad_norm": 6.3365044593811035, "learning_rate": 1.3755649602762777e-05, "loss": 2.130634880065918, "memory(GiB)": 77.56, "step": 88475, "token_acc": 0.5358361774744027, "train_speed(iter/s)": 1.438081 }, { "epoch": 3.7907544663896147, "grad_norm": 5.440357208251953, "learning_rate": 1.3751014007222402e-05, "loss": 2.2859848022460936, "memory(GiB)": 77.56, "step": 88480, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438092 }, { "epoch": 3.790968681718864, "grad_norm": 4.9004998207092285, "learning_rate": 1.3746379068366394e-05, "loss": 2.5116214752197266, "memory(GiB)": 77.56, "step": 88485, "token_acc": 0.46419098143236076, "train_speed(iter/s)": 1.438115 }, { "epoch": 3.7911828970481127, "grad_norm": 6.595088005065918, "learning_rate": 1.374174478627876e-05, "loss": 2.4399547576904297, "memory(GiB)": 77.56, "step": 88490, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.438113 }, { "epoch": 3.7913971123773615, "grad_norm": 5.438348293304443, "learning_rate": 1.3737111161043426e-05, "loss": 2.25557861328125, "memory(GiB)": 77.56, "step": 88495, "token_acc": 0.5272108843537415, "train_speed(iter/s)": 1.438127 }, { "epoch": 3.791611327706611, "grad_norm": 5.495586395263672, "learning_rate": 1.3732478192744342e-05, "loss": 2.315243148803711, "memory(GiB)": 77.56, "step": 88500, "token_acc": 0.5622317596566524, "train_speed(iter/s)": 1.438139 }, { "epoch": 3.791611327706611, "eval_loss": 2.179872512817383, "eval_runtime": 14.6266, "eval_samples_per_second": 6.837, "eval_steps_per_second": 6.837, "eval_token_acc": 0.4811443433029909, "step": 88500 }, { "epoch": 3.7918255430358596, "grad_norm": 5.515460014343262, "learning_rate": 1.3727845881465434e-05, "loss": 2.4595081329345705, "memory(GiB)": 77.56, "step": 88505, "token_acc": 0.48402255639097747, "train_speed(iter/s)": 1.437765 }, { "epoch": 3.7920397583651084, "grad_norm": 6.716407299041748, "learning_rate": 1.3723214227290615e-05, "loss": 2.259731101989746, "memory(GiB)": 77.56, "step": 88510, "token_acc": 0.5371024734982333, "train_speed(iter/s)": 1.437758 }, { "epoch": 3.7922539736943577, "grad_norm": 4.956208229064941, "learning_rate": 1.3718583230303794e-05, "loss": 1.9834444046020507, "memory(GiB)": 77.56, "step": 88515, "token_acc": 0.5389221556886228, "train_speed(iter/s)": 1.437758 }, { "epoch": 3.7924681890236065, "grad_norm": 5.995739936828613, "learning_rate": 1.3713952890588894e-05, "loss": 2.6465744018554687, "memory(GiB)": 77.56, "step": 88520, "token_acc": 0.46846846846846846, "train_speed(iter/s)": 1.437774 }, { "epoch": 3.7926824043528553, "grad_norm": 5.004510879516602, "learning_rate": 1.370932320822977e-05, "loss": 2.5779163360595705, "memory(GiB)": 77.56, "step": 88525, "token_acc": 0.4966216216216216, "train_speed(iter/s)": 1.437748 }, { "epoch": 3.7928966196821046, "grad_norm": 5.810140132904053, "learning_rate": 1.3704694183310301e-05, "loss": 2.3363718032836913, "memory(GiB)": 77.56, "step": 88530, "token_acc": 0.5487804878048781, "train_speed(iter/s)": 1.437774 }, { "epoch": 3.7931108350113534, "grad_norm": 4.239035606384277, "learning_rate": 1.3700065815914348e-05, "loss": 2.233226776123047, "memory(GiB)": 77.56, "step": 88535, "token_acc": 0.5046153846153846, "train_speed(iter/s)": 1.437796 }, { "epoch": 3.793325050340602, "grad_norm": 6.235622406005859, "learning_rate": 1.3695438106125757e-05, "loss": 2.3486305236816407, "memory(GiB)": 77.56, "step": 88540, "token_acc": 0.5176056338028169, "train_speed(iter/s)": 1.437793 }, { "epoch": 3.7935392656698514, "grad_norm": 5.882605075836182, "learning_rate": 1.369081105402834e-05, "loss": 2.4421903610229494, "memory(GiB)": 77.56, "step": 88545, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 1.43781 }, { "epoch": 3.7937534809991003, "grad_norm": 6.603424072265625, "learning_rate": 1.3686184659705959e-05, "loss": 2.723403549194336, "memory(GiB)": 77.56, "step": 88550, "token_acc": 0.4377224199288256, "train_speed(iter/s)": 1.43783 }, { "epoch": 3.793967696328349, "grad_norm": 6.497887134552002, "learning_rate": 1.368155892324241e-05, "loss": 2.0882404327392576, "memory(GiB)": 77.56, "step": 88555, "token_acc": 0.5849056603773585, "train_speed(iter/s)": 1.437823 }, { "epoch": 3.7941819116575983, "grad_norm": 5.324814796447754, "learning_rate": 1.3676933844721484e-05, "loss": 2.221200942993164, "memory(GiB)": 77.56, "step": 88560, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.794396126986847, "grad_norm": 6.296723365783691, "learning_rate": 1.3672309424226981e-05, "loss": 2.426207733154297, "memory(GiB)": 77.56, "step": 88565, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.43782 }, { "epoch": 3.794610342316096, "grad_norm": 5.804182529449463, "learning_rate": 1.3667685661842655e-05, "loss": 2.2590354919433593, "memory(GiB)": 77.56, "step": 88570, "token_acc": 0.5131578947368421, "train_speed(iter/s)": 1.437823 }, { "epoch": 3.794824557645345, "grad_norm": 5.601439476013184, "learning_rate": 1.36630625576523e-05, "loss": 2.4765195846557617, "memory(GiB)": 77.56, "step": 88575, "token_acc": 0.49085365853658536, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.795038772974594, "grad_norm": 6.061161518096924, "learning_rate": 1.3658440111739657e-05, "loss": 2.0731300354003905, "memory(GiB)": 77.56, "step": 88580, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 1.437826 }, { "epoch": 3.795252988303843, "grad_norm": 6.686573505401611, "learning_rate": 1.3653818324188444e-05, "loss": 2.359030532836914, "memory(GiB)": 77.56, "step": 88585, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 1.437836 }, { "epoch": 3.795467203633092, "grad_norm": 6.2744364738464355, "learning_rate": 1.364919719508242e-05, "loss": 2.588063430786133, "memory(GiB)": 77.56, "step": 88590, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.437836 }, { "epoch": 3.795681418962341, "grad_norm": 7.436146259307861, "learning_rate": 1.3644576724505292e-05, "loss": 2.6732555389404298, "memory(GiB)": 77.56, "step": 88595, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.7958956342915897, "grad_norm": 5.4840288162231445, "learning_rate": 1.3639956912540764e-05, "loss": 2.4047895431518556, "memory(GiB)": 77.56, "step": 88600, "token_acc": 0.5222222222222223, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.796109849620839, "grad_norm": 5.893927574157715, "learning_rate": 1.3635337759272503e-05, "loss": 2.2417497634887695, "memory(GiB)": 77.56, "step": 88605, "token_acc": 0.5235109717868338, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.796324064950088, "grad_norm": 5.517001628875732, "learning_rate": 1.363071926478423e-05, "loss": 2.401530647277832, "memory(GiB)": 77.56, "step": 88610, "token_acc": 0.471976401179941, "train_speed(iter/s)": 1.437857 }, { "epoch": 3.7965382802793366, "grad_norm": 6.061408519744873, "learning_rate": 1.3626101429159593e-05, "loss": 2.487696075439453, "memory(GiB)": 77.56, "step": 88615, "token_acc": 0.46619217081850534, "train_speed(iter/s)": 1.437872 }, { "epoch": 3.796752495608586, "grad_norm": 6.266184329986572, "learning_rate": 1.3621484252482252e-05, "loss": 2.2935035705566404, "memory(GiB)": 77.56, "step": 88620, "token_acc": 0.5343511450381679, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.7969667109378347, "grad_norm": 5.447831630706787, "learning_rate": 1.3616867734835854e-05, "loss": 2.3129610061645507, "memory(GiB)": 77.56, "step": 88625, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.437891 }, { "epoch": 3.7971809262670835, "grad_norm": 5.633322238922119, "learning_rate": 1.3612251876304005e-05, "loss": 2.3474416732788086, "memory(GiB)": 77.56, "step": 88630, "token_acc": 0.5258358662613982, "train_speed(iter/s)": 1.437881 }, { "epoch": 3.7973951415963327, "grad_norm": 6.463729381561279, "learning_rate": 1.3607636676970365e-05, "loss": 2.555350494384766, "memory(GiB)": 77.56, "step": 88635, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 1.437879 }, { "epoch": 3.7976093569255815, "grad_norm": 7.855784893035889, "learning_rate": 1.3603022136918526e-05, "loss": 2.4156044006347654, "memory(GiB)": 77.56, "step": 88640, "token_acc": 0.4944649446494465, "train_speed(iter/s)": 1.437879 }, { "epoch": 3.7978235722548304, "grad_norm": 4.906198024749756, "learning_rate": 1.3598408256232082e-05, "loss": 2.0094844818115236, "memory(GiB)": 77.56, "step": 88645, "token_acc": 0.554006968641115, "train_speed(iter/s)": 1.43789 }, { "epoch": 3.7980377875840796, "grad_norm": 5.530091285705566, "learning_rate": 1.3593795034994622e-05, "loss": 2.4955734252929687, "memory(GiB)": 77.56, "step": 88650, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.7982520029133284, "grad_norm": 5.036495685577393, "learning_rate": 1.3589182473289703e-05, "loss": 2.515034866333008, "memory(GiB)": 77.56, "step": 88655, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.437894 }, { "epoch": 3.7984662182425772, "grad_norm": 6.222428798675537, "learning_rate": 1.3584570571200895e-05, "loss": 2.318907928466797, "memory(GiB)": 77.56, "step": 88660, "token_acc": 0.48348348348348347, "train_speed(iter/s)": 1.437907 }, { "epoch": 3.7986804335718265, "grad_norm": 5.604467868804932, "learning_rate": 1.357995932881177e-05, "loss": 2.1846290588378907, "memory(GiB)": 77.56, "step": 88665, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.437912 }, { "epoch": 3.7988946489010753, "grad_norm": 5.464486122131348, "learning_rate": 1.3575348746205847e-05, "loss": 2.1534345626831053, "memory(GiB)": 77.56, "step": 88670, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.437913 }, { "epoch": 3.799108864230324, "grad_norm": 5.152063846588135, "learning_rate": 1.3570738823466645e-05, "loss": 2.215536117553711, "memory(GiB)": 77.56, "step": 88675, "token_acc": 0.556910569105691, "train_speed(iter/s)": 1.437909 }, { "epoch": 3.7993230795595734, "grad_norm": 6.006877899169922, "learning_rate": 1.3566129560677687e-05, "loss": 2.5528993606567383, "memory(GiB)": 77.56, "step": 88680, "token_acc": 0.46355685131195334, "train_speed(iter/s)": 1.437908 }, { "epoch": 3.799537294888822, "grad_norm": 5.099438667297363, "learning_rate": 1.3561520957922469e-05, "loss": 2.391332244873047, "memory(GiB)": 77.56, "step": 88685, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.437925 }, { "epoch": 3.799751510218071, "grad_norm": 6.159640312194824, "learning_rate": 1.3556913015284467e-05, "loss": 2.172776222229004, "memory(GiB)": 77.56, "step": 88690, "token_acc": 0.4931506849315068, "train_speed(iter/s)": 1.437941 }, { "epoch": 3.7999657255473203, "grad_norm": 4.8566718101501465, "learning_rate": 1.3552305732847186e-05, "loss": 2.5580873489379883, "memory(GiB)": 77.56, "step": 88695, "token_acc": 0.4956268221574344, "train_speed(iter/s)": 1.437955 }, { "epoch": 3.800179940876569, "grad_norm": 9.976699829101562, "learning_rate": 1.3547699110694079e-05, "loss": 2.597652626037598, "memory(GiB)": 77.56, "step": 88700, "token_acc": 0.4948805460750853, "train_speed(iter/s)": 1.437972 }, { "epoch": 3.800394156205818, "grad_norm": 5.338417053222656, "learning_rate": 1.3543093148908597e-05, "loss": 2.345846176147461, "memory(GiB)": 77.56, "step": 88705, "token_acc": 0.5101351351351351, "train_speed(iter/s)": 1.437965 }, { "epoch": 3.800608371535067, "grad_norm": 8.344489097595215, "learning_rate": 1.3538487847574183e-05, "loss": 2.6195497512817383, "memory(GiB)": 77.56, "step": 88710, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.437972 }, { "epoch": 3.800822586864316, "grad_norm": 6.010809421539307, "learning_rate": 1.353388320677425e-05, "loss": 2.454133987426758, "memory(GiB)": 77.56, "step": 88715, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.8010368021935648, "grad_norm": 5.5802226066589355, "learning_rate": 1.3529279226592245e-05, "loss": 2.148936653137207, "memory(GiB)": 77.56, "step": 88720, "token_acc": 0.5433962264150943, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.801251017522814, "grad_norm": 5.8756208419799805, "learning_rate": 1.3524675907111555e-05, "loss": 2.5548810958862305, "memory(GiB)": 77.56, "step": 88725, "token_acc": 0.4807121661721068, "train_speed(iter/s)": 1.438007 }, { "epoch": 3.801465232852063, "grad_norm": 6.6932172775268555, "learning_rate": 1.3520073248415565e-05, "loss": 2.5040332794189455, "memory(GiB)": 77.56, "step": 88730, "token_acc": 0.4564459930313589, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.8016794481813116, "grad_norm": 6.522132873535156, "learning_rate": 1.3515471250587686e-05, "loss": 2.134455108642578, "memory(GiB)": 77.56, "step": 88735, "token_acc": 0.5510204081632653, "train_speed(iter/s)": 1.437982 }, { "epoch": 3.801893663510561, "grad_norm": 5.06507682800293, "learning_rate": 1.3510869913711272e-05, "loss": 2.4543106079101564, "memory(GiB)": 77.56, "step": 88740, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.437983 }, { "epoch": 3.8021078788398097, "grad_norm": 5.750798225402832, "learning_rate": 1.3506269237869679e-05, "loss": 2.3681476593017576, "memory(GiB)": 77.56, "step": 88745, "token_acc": 0.5127118644067796, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.8023220941690585, "grad_norm": 6.647711277008057, "learning_rate": 1.3501669223146236e-05, "loss": 2.441213035583496, "memory(GiB)": 77.56, "step": 88750, "token_acc": 0.5298013245033113, "train_speed(iter/s)": 1.438001 }, { "epoch": 3.802536309498308, "grad_norm": 6.817627429962158, "learning_rate": 1.3497069869624307e-05, "loss": 2.697922134399414, "memory(GiB)": 77.56, "step": 88755, "token_acc": 0.45689655172413796, "train_speed(iter/s)": 1.438005 }, { "epoch": 3.8027505248275566, "grad_norm": 6.088765621185303, "learning_rate": 1.34924711773872e-05, "loss": 2.2126121520996094, "memory(GiB)": 77.56, "step": 88760, "token_acc": 0.5101351351351351, "train_speed(iter/s)": 1.438019 }, { "epoch": 3.8029647401568054, "grad_norm": 6.300668716430664, "learning_rate": 1.348787314651822e-05, "loss": 2.510703468322754, "memory(GiB)": 77.56, "step": 88765, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.438009 }, { "epoch": 3.8031789554860547, "grad_norm": 6.420996189117432, "learning_rate": 1.3483275777100679e-05, "loss": 2.6871437072753905, "memory(GiB)": 77.56, "step": 88770, "token_acc": 0.4365079365079365, "train_speed(iter/s)": 1.438005 }, { "epoch": 3.8033931708153035, "grad_norm": 4.696811676025391, "learning_rate": 1.3478679069217831e-05, "loss": 2.2402273178100587, "memory(GiB)": 77.56, "step": 88775, "token_acc": 0.5, "train_speed(iter/s)": 1.438006 }, { "epoch": 3.8036073861445523, "grad_norm": 5.776646614074707, "learning_rate": 1.3474083022952987e-05, "loss": 1.9888893127441407, "memory(GiB)": 77.56, "step": 88780, "token_acc": 0.5607843137254902, "train_speed(iter/s)": 1.438033 }, { "epoch": 3.8038216014738016, "grad_norm": 4.437472820281982, "learning_rate": 1.3469487638389394e-05, "loss": 2.373304557800293, "memory(GiB)": 77.56, "step": 88785, "token_acc": 0.4756944444444444, "train_speed(iter/s)": 1.438048 }, { "epoch": 3.8040358168030504, "grad_norm": 6.797004222869873, "learning_rate": 1.3464892915610305e-05, "loss": 2.1475845336914063, "memory(GiB)": 77.56, "step": 88790, "token_acc": 0.49812734082397003, "train_speed(iter/s)": 1.438054 }, { "epoch": 3.804250032132299, "grad_norm": 5.936126232147217, "learning_rate": 1.346029885469895e-05, "loss": 2.401422882080078, "memory(GiB)": 77.56, "step": 88795, "token_acc": 0.45821325648414984, "train_speed(iter/s)": 1.438073 }, { "epoch": 3.8044642474615484, "grad_norm": 6.225060939788818, "learning_rate": 1.3455705455738543e-05, "loss": 2.261306381225586, "memory(GiB)": 77.56, "step": 88800, "token_acc": 0.4778481012658228, "train_speed(iter/s)": 1.438081 }, { "epoch": 3.8046784627907972, "grad_norm": 5.280925273895264, "learning_rate": 1.3451112718812308e-05, "loss": 2.239191436767578, "memory(GiB)": 77.56, "step": 88805, "token_acc": 0.5064102564102564, "train_speed(iter/s)": 1.438077 }, { "epoch": 3.804892678120046, "grad_norm": 5.71987247467041, "learning_rate": 1.3446520644003474e-05, "loss": 2.279434585571289, "memory(GiB)": 77.56, "step": 88810, "token_acc": 0.5105105105105106, "train_speed(iter/s)": 1.438073 }, { "epoch": 3.8051068934492953, "grad_norm": 5.1327805519104, "learning_rate": 1.3441929231395207e-05, "loss": 2.4025634765625, "memory(GiB)": 77.56, "step": 88815, "token_acc": 0.5124223602484472, "train_speed(iter/s)": 1.43807 }, { "epoch": 3.805321108778544, "grad_norm": 6.3011016845703125, "learning_rate": 1.3437338481070689e-05, "loss": 2.3850595474243166, "memory(GiB)": 77.56, "step": 88820, "token_acc": 0.4921875, "train_speed(iter/s)": 1.438074 }, { "epoch": 3.805535324107793, "grad_norm": 5.278100490570068, "learning_rate": 1.3432748393113075e-05, "loss": 2.1128021240234376, "memory(GiB)": 77.56, "step": 88825, "token_acc": 0.5419354838709678, "train_speed(iter/s)": 1.438077 }, { "epoch": 3.805749539437042, "grad_norm": 5.199338912963867, "learning_rate": 1.342815896760553e-05, "loss": 2.3207975387573243, "memory(GiB)": 77.56, "step": 88830, "token_acc": 0.494949494949495, "train_speed(iter/s)": 1.438074 }, { "epoch": 3.805963754766291, "grad_norm": 6.1854729652404785, "learning_rate": 1.342357020463118e-05, "loss": 2.428904724121094, "memory(GiB)": 77.56, "step": 88835, "token_acc": 0.49615384615384617, "train_speed(iter/s)": 1.438088 }, { "epoch": 3.80617797009554, "grad_norm": 5.287379741668701, "learning_rate": 1.3418982104273182e-05, "loss": 2.2304576873779296, "memory(GiB)": 77.56, "step": 88840, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.438052 }, { "epoch": 3.806392185424789, "grad_norm": 6.720423698425293, "learning_rate": 1.3414394666614638e-05, "loss": 2.480998229980469, "memory(GiB)": 77.56, "step": 88845, "token_acc": 0.4606741573033708, "train_speed(iter/s)": 1.438055 }, { "epoch": 3.806606400754038, "grad_norm": 7.8951592445373535, "learning_rate": 1.3409807891738652e-05, "loss": 2.660314178466797, "memory(GiB)": 77.56, "step": 88850, "token_acc": 0.45987654320987653, "train_speed(iter/s)": 1.438065 }, { "epoch": 3.8068206160832867, "grad_norm": 5.180807590484619, "learning_rate": 1.3405221779728327e-05, "loss": 2.4456287384033204, "memory(GiB)": 77.56, "step": 88855, "token_acc": 0.45588235294117646, "train_speed(iter/s)": 1.438068 }, { "epoch": 3.807034831412536, "grad_norm": 6.398998260498047, "learning_rate": 1.3400636330666716e-05, "loss": 2.436860466003418, "memory(GiB)": 77.56, "step": 88860, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.438055 }, { "epoch": 3.8072490467417848, "grad_norm": 4.665395736694336, "learning_rate": 1.3396051544636928e-05, "loss": 2.452156639099121, "memory(GiB)": 77.56, "step": 88865, "token_acc": 0.4533333333333333, "train_speed(iter/s)": 1.438062 }, { "epoch": 3.8074632620710336, "grad_norm": 8.299912452697754, "learning_rate": 1.3391467421722004e-05, "loss": 2.2736478805541993, "memory(GiB)": 77.56, "step": 88870, "token_acc": 0.5089820359281437, "train_speed(iter/s)": 1.438085 }, { "epoch": 3.807677477400283, "grad_norm": 4.993086814880371, "learning_rate": 1.3386883962004992e-05, "loss": 1.9588802337646485, "memory(GiB)": 77.56, "step": 88875, "token_acc": 0.5851528384279476, "train_speed(iter/s)": 1.43808 }, { "epoch": 3.8078916927295317, "grad_norm": 4.616026401519775, "learning_rate": 1.3382301165568905e-05, "loss": 2.0985668182373045, "memory(GiB)": 77.56, "step": 88880, "token_acc": 0.5548780487804879, "train_speed(iter/s)": 1.438089 }, { "epoch": 3.8081059080587805, "grad_norm": 6.2056121826171875, "learning_rate": 1.33777190324968e-05, "loss": 2.611598587036133, "memory(GiB)": 77.56, "step": 88885, "token_acc": 0.45045045045045046, "train_speed(iter/s)": 1.438086 }, { "epoch": 3.8083201233880297, "grad_norm": 5.870640754699707, "learning_rate": 1.3373137562871668e-05, "loss": 2.4061965942382812, "memory(GiB)": 77.56, "step": 88890, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.438086 }, { "epoch": 3.8085343387172785, "grad_norm": 5.236010551452637, "learning_rate": 1.336855675677649e-05, "loss": 2.472321701049805, "memory(GiB)": 77.56, "step": 88895, "token_acc": 0.4588607594936709, "train_speed(iter/s)": 1.438088 }, { "epoch": 3.8087485540465273, "grad_norm": 6.35364294052124, "learning_rate": 1.3363976614294288e-05, "loss": 2.48392333984375, "memory(GiB)": 77.56, "step": 88900, "token_acc": 0.47468354430379744, "train_speed(iter/s)": 1.438091 }, { "epoch": 3.8089627693757766, "grad_norm": 4.6643781661987305, "learning_rate": 1.3359397135508017e-05, "loss": 2.2396944046020506, "memory(GiB)": 77.56, "step": 88905, "token_acc": 0.5016611295681063, "train_speed(iter/s)": 1.438059 }, { "epoch": 3.8091769847050254, "grad_norm": 5.817845344543457, "learning_rate": 1.3354818320500634e-05, "loss": 2.1948976516723633, "memory(GiB)": 77.56, "step": 88910, "token_acc": 0.5444444444444444, "train_speed(iter/s)": 1.438056 }, { "epoch": 3.8093912000342742, "grad_norm": 6.068362236022949, "learning_rate": 1.3350240169355099e-05, "loss": 2.5008708953857424, "memory(GiB)": 77.56, "step": 88915, "token_acc": 0.4828571428571429, "train_speed(iter/s)": 1.438052 }, { "epoch": 3.8096054153635235, "grad_norm": 5.653886795043945, "learning_rate": 1.3345662682154326e-05, "loss": 2.330107307434082, "memory(GiB)": 77.56, "step": 88920, "token_acc": 0.4967741935483871, "train_speed(iter/s)": 1.438069 }, { "epoch": 3.8098196306927723, "grad_norm": 6.844162464141846, "learning_rate": 1.3341085858981273e-05, "loss": 2.1554405212402346, "memory(GiB)": 77.56, "step": 88925, "token_acc": 0.5158730158730159, "train_speed(iter/s)": 1.438077 }, { "epoch": 3.810033846022021, "grad_norm": 5.717306137084961, "learning_rate": 1.3336509699918837e-05, "loss": 2.688567543029785, "memory(GiB)": 77.56, "step": 88930, "token_acc": 0.4588607594936709, "train_speed(iter/s)": 1.438086 }, { "epoch": 3.8102480613512704, "grad_norm": 7.388886451721191, "learning_rate": 1.3331934205049924e-05, "loss": 2.192700004577637, "memory(GiB)": 77.56, "step": 88935, "token_acc": 0.5033112582781457, "train_speed(iter/s)": 1.4381 }, { "epoch": 3.810462276680519, "grad_norm": 4.905337333679199, "learning_rate": 1.3327359374457415e-05, "loss": 2.3151618957519533, "memory(GiB)": 77.56, "step": 88940, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.438115 }, { "epoch": 3.810676492009768, "grad_norm": 6.05534029006958, "learning_rate": 1.3322785208224181e-05, "loss": 2.3507307052612303, "memory(GiB)": 77.56, "step": 88945, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.438125 }, { "epoch": 3.8108907073390172, "grad_norm": 5.012078285217285, "learning_rate": 1.3318211706433114e-05, "loss": 2.1554950714111327, "memory(GiB)": 77.56, "step": 88950, "token_acc": 0.5415162454873647, "train_speed(iter/s)": 1.438131 }, { "epoch": 3.811104922668266, "grad_norm": 5.602942943572998, "learning_rate": 1.331363886916704e-05, "loss": 2.5066082000732424, "memory(GiB)": 77.56, "step": 88955, "token_acc": 0.49544072948328266, "train_speed(iter/s)": 1.438123 }, { "epoch": 3.811319137997515, "grad_norm": 6.411240100860596, "learning_rate": 1.3309066696508826e-05, "loss": 2.576996612548828, "memory(GiB)": 77.56, "step": 88960, "token_acc": 0.504950495049505, "train_speed(iter/s)": 1.438137 }, { "epoch": 3.811533353326764, "grad_norm": 4.697442054748535, "learning_rate": 1.3304495188541283e-05, "loss": 2.3949050903320312, "memory(GiB)": 77.56, "step": 88965, "token_acc": 0.5083798882681564, "train_speed(iter/s)": 1.438101 }, { "epoch": 3.811747568656013, "grad_norm": 6.031418323516846, "learning_rate": 1.3299924345347237e-05, "loss": 2.269063949584961, "memory(GiB)": 77.56, "step": 88970, "token_acc": 0.5346534653465347, "train_speed(iter/s)": 1.4381 }, { "epoch": 3.8119617839852618, "grad_norm": 5.140743732452393, "learning_rate": 1.3295354167009489e-05, "loss": 2.3306386947631834, "memory(GiB)": 77.56, "step": 88975, "token_acc": 0.49707602339181284, "train_speed(iter/s)": 1.438087 }, { "epoch": 3.812175999314511, "grad_norm": 6.081470012664795, "learning_rate": 1.3290784653610817e-05, "loss": 2.3037765502929686, "memory(GiB)": 77.56, "step": 88980, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.438103 }, { "epoch": 3.81239021464376, "grad_norm": 7.274415493011475, "learning_rate": 1.328621580523403e-05, "loss": 2.193290138244629, "memory(GiB)": 77.56, "step": 88985, "token_acc": 0.5348837209302325, "train_speed(iter/s)": 1.438109 }, { "epoch": 3.8126044299730086, "grad_norm": 6.503421306610107, "learning_rate": 1.3281647621961885e-05, "loss": 1.9087289810180663, "memory(GiB)": 77.56, "step": 88990, "token_acc": 0.5529953917050692, "train_speed(iter/s)": 1.438108 }, { "epoch": 3.812818645302258, "grad_norm": 5.625267028808594, "learning_rate": 1.3277080103877138e-05, "loss": 2.590570831298828, "memory(GiB)": 77.56, "step": 88995, "token_acc": 0.46788990825688076, "train_speed(iter/s)": 1.438119 }, { "epoch": 3.8130328606315067, "grad_norm": 6.017382621765137, "learning_rate": 1.3272513251062535e-05, "loss": 2.3766555786132812, "memory(GiB)": 77.56, "step": 89000, "token_acc": 0.498371335504886, "train_speed(iter/s)": 1.438116 }, { "epoch": 3.8130328606315067, "eval_loss": 2.049919843673706, "eval_runtime": 13.7157, "eval_samples_per_second": 7.291, "eval_steps_per_second": 7.291, "eval_token_acc": 0.4951590594744122, "step": 89000 }, { "epoch": 3.8132470759607555, "grad_norm": 4.624563217163086, "learning_rate": 1.3267947063600794e-05, "loss": 2.003373146057129, "memory(GiB)": 77.56, "step": 89005, "token_acc": 0.517, "train_speed(iter/s)": 1.437786 }, { "epoch": 3.8134612912900048, "grad_norm": 5.108028888702393, "learning_rate": 1.3263381541574659e-05, "loss": 2.4233119964599608, "memory(GiB)": 77.56, "step": 89010, "token_acc": 0.5030120481927711, "train_speed(iter/s)": 1.437801 }, { "epoch": 3.8136755066192536, "grad_norm": 5.897947311401367, "learning_rate": 1.3258816685066832e-05, "loss": 2.368221664428711, "memory(GiB)": 77.56, "step": 89015, "token_acc": 0.4956521739130435, "train_speed(iter/s)": 1.43781 }, { "epoch": 3.8138897219485024, "grad_norm": 5.4629011154174805, "learning_rate": 1.3254252494160007e-05, "loss": 2.1324527740478514, "memory(GiB)": 77.56, "step": 89020, "token_acc": 0.5413533834586466, "train_speed(iter/s)": 1.437821 }, { "epoch": 3.8141039372777517, "grad_norm": 6.022658348083496, "learning_rate": 1.3249688968936869e-05, "loss": 2.297369194030762, "memory(GiB)": 77.56, "step": 89025, "token_acc": 0.47101449275362317, "train_speed(iter/s)": 1.437815 }, { "epoch": 3.8143181526070005, "grad_norm": 4.894735336303711, "learning_rate": 1.3245126109480076e-05, "loss": 2.674047088623047, "memory(GiB)": 77.56, "step": 89030, "token_acc": 0.4447852760736196, "train_speed(iter/s)": 1.437828 }, { "epoch": 3.8145323679362493, "grad_norm": 5.074991703033447, "learning_rate": 1.3240563915872316e-05, "loss": 2.2101449966430664, "memory(GiB)": 77.56, "step": 89035, "token_acc": 0.5601374570446735, "train_speed(iter/s)": 1.43783 }, { "epoch": 3.8147465832654985, "grad_norm": 4.689578056335449, "learning_rate": 1.3236002388196206e-05, "loss": 2.1054061889648437, "memory(GiB)": 77.56, "step": 89040, "token_acc": 0.5363636363636364, "train_speed(iter/s)": 1.437801 }, { "epoch": 3.8149607985947473, "grad_norm": 6.7528886795043945, "learning_rate": 1.3231441526534422e-05, "loss": 2.1909435272216795, "memory(GiB)": 77.56, "step": 89045, "token_acc": 0.5655737704918032, "train_speed(iter/s)": 1.437804 }, { "epoch": 3.815175013923996, "grad_norm": 6.765122413635254, "learning_rate": 1.3226881330969559e-05, "loss": 2.552584648132324, "memory(GiB)": 77.56, "step": 89050, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.8153892292532454, "grad_norm": 6.50324821472168, "learning_rate": 1.3222321801584242e-05, "loss": 2.1903121948242186, "memory(GiB)": 77.56, "step": 89055, "token_acc": 0.5274725274725275, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.8156034445824942, "grad_norm": 6.8679890632629395, "learning_rate": 1.3217762938461065e-05, "loss": 2.3075584411621093, "memory(GiB)": 77.56, "step": 89060, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.815817659911743, "grad_norm": 5.843669891357422, "learning_rate": 1.32132047416826e-05, "loss": 2.451921844482422, "memory(GiB)": 77.56, "step": 89065, "token_acc": 0.4984894259818731, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.8160318752409923, "grad_norm": 4.5095367431640625, "learning_rate": 1.3208647211331454e-05, "loss": 2.275189208984375, "memory(GiB)": 77.56, "step": 89070, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.437857 }, { "epoch": 3.816246090570241, "grad_norm": 5.4449968338012695, "learning_rate": 1.3204090347490178e-05, "loss": 2.4361053466796876, "memory(GiB)": 77.56, "step": 89075, "token_acc": 0.503448275862069, "train_speed(iter/s)": 1.437847 }, { "epoch": 3.81646030589949, "grad_norm": 5.6485514640808105, "learning_rate": 1.3199534150241321e-05, "loss": 2.132061767578125, "memory(GiB)": 77.56, "step": 89080, "token_acc": 0.5708955223880597, "train_speed(iter/s)": 1.437866 }, { "epoch": 3.816674521228739, "grad_norm": 5.526097297668457, "learning_rate": 1.319497861966743e-05, "loss": 2.0641841888427734, "memory(GiB)": 77.56, "step": 89085, "token_acc": 0.5393258426966292, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.816888736557988, "grad_norm": 5.482429027557373, "learning_rate": 1.3190423755851005e-05, "loss": 2.203058052062988, "memory(GiB)": 77.56, "step": 89090, "token_acc": 0.5186335403726708, "train_speed(iter/s)": 1.437864 }, { "epoch": 3.817102951887237, "grad_norm": 8.089812278747559, "learning_rate": 1.3185869558874602e-05, "loss": 2.386006164550781, "memory(GiB)": 77.56, "step": 89095, "token_acc": 0.49185667752442996, "train_speed(iter/s)": 1.437884 }, { "epoch": 3.817317167216486, "grad_norm": 5.60097599029541, "learning_rate": 1.3181316028820695e-05, "loss": 2.5724180221557615, "memory(GiB)": 77.56, "step": 89100, "token_acc": 0.4306784660766962, "train_speed(iter/s)": 1.437886 }, { "epoch": 3.817531382545735, "grad_norm": 11.27708625793457, "learning_rate": 1.31767631657718e-05, "loss": 2.1788700103759764, "memory(GiB)": 77.56, "step": 89105, "token_acc": 0.5639097744360902, "train_speed(iter/s)": 1.4379 }, { "epoch": 3.8177455978749837, "grad_norm": 5.431806564331055, "learning_rate": 1.3172210969810378e-05, "loss": 2.6047641754150392, "memory(GiB)": 77.56, "step": 89110, "token_acc": 0.47104247104247104, "train_speed(iter/s)": 1.437911 }, { "epoch": 3.817959813204233, "grad_norm": 5.5743088722229, "learning_rate": 1.3167659441018909e-05, "loss": 2.4898139953613283, "memory(GiB)": 77.56, "step": 89115, "token_acc": 0.4542483660130719, "train_speed(iter/s)": 1.437923 }, { "epoch": 3.8181740285334818, "grad_norm": 5.825106143951416, "learning_rate": 1.3163108579479838e-05, "loss": 2.3108795166015623, "memory(GiB)": 77.56, "step": 89120, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.437929 }, { "epoch": 3.8183882438627306, "grad_norm": 8.026784896850586, "learning_rate": 1.3158558385275594e-05, "loss": 2.4738929748535154, "memory(GiB)": 77.56, "step": 89125, "token_acc": 0.4610169491525424, "train_speed(iter/s)": 1.437952 }, { "epoch": 3.81860245919198, "grad_norm": 4.974749565124512, "learning_rate": 1.3154008858488643e-05, "loss": 2.300914001464844, "memory(GiB)": 77.56, "step": 89130, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.43796 }, { "epoch": 3.8188166745212286, "grad_norm": 5.531448841094971, "learning_rate": 1.314945999920138e-05, "loss": 2.135707664489746, "memory(GiB)": 77.56, "step": 89135, "token_acc": 0.5703422053231939, "train_speed(iter/s)": 1.437964 }, { "epoch": 3.8190308898504775, "grad_norm": 5.284693717956543, "learning_rate": 1.3144911807496224e-05, "loss": 2.478932571411133, "memory(GiB)": 77.56, "step": 89140, "token_acc": 0.5316455696202531, "train_speed(iter/s)": 1.437958 }, { "epoch": 3.8192451051797267, "grad_norm": 6.723596096038818, "learning_rate": 1.3140364283455558e-05, "loss": 2.5755098342895506, "memory(GiB)": 77.56, "step": 89145, "token_acc": 0.4872611464968153, "train_speed(iter/s)": 1.437947 }, { "epoch": 3.8194593205089755, "grad_norm": 6.281871318817139, "learning_rate": 1.3135817427161762e-05, "loss": 2.6059696197509767, "memory(GiB)": 77.56, "step": 89150, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.437936 }, { "epoch": 3.8196735358382243, "grad_norm": 6.7038702964782715, "learning_rate": 1.3131271238697223e-05, "loss": 2.353596305847168, "memory(GiB)": 77.56, "step": 89155, "token_acc": 0.5077399380804953, "train_speed(iter/s)": 1.43795 }, { "epoch": 3.8198877511674736, "grad_norm": 5.308282375335693, "learning_rate": 1.3126725718144295e-05, "loss": 2.651363563537598, "memory(GiB)": 77.56, "step": 89160, "token_acc": 0.44025157232704404, "train_speed(iter/s)": 1.437957 }, { "epoch": 3.8201019664967224, "grad_norm": 4.589012622833252, "learning_rate": 1.312218086558532e-05, "loss": 2.337087631225586, "memory(GiB)": 77.56, "step": 89165, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.43796 }, { "epoch": 3.820316181825971, "grad_norm": 7.262591361999512, "learning_rate": 1.3117636681102635e-05, "loss": 2.4449745178222657, "memory(GiB)": 77.56, "step": 89170, "token_acc": 0.4916387959866221, "train_speed(iter/s)": 1.437981 }, { "epoch": 3.8205303971552205, "grad_norm": 8.451205253601074, "learning_rate": 1.311309316477854e-05, "loss": 2.1115385055541993, "memory(GiB)": 77.56, "step": 89175, "token_acc": 0.5211726384364821, "train_speed(iter/s)": 1.437995 }, { "epoch": 3.8207446124844693, "grad_norm": 5.48556661605835, "learning_rate": 1.3108550316695383e-05, "loss": 2.4338003158569337, "memory(GiB)": 77.56, "step": 89180, "token_acc": 0.48138297872340424, "train_speed(iter/s)": 1.437987 }, { "epoch": 3.820958827813718, "grad_norm": 5.887182712554932, "learning_rate": 1.3104008136935431e-05, "loss": 2.2330379486083984, "memory(GiB)": 77.56, "step": 89185, "token_acc": 0.525, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.8211730431429674, "grad_norm": 5.910205841064453, "learning_rate": 1.3099466625580992e-05, "loss": 1.8738798141479491, "memory(GiB)": 77.56, "step": 89190, "token_acc": 0.5563380281690141, "train_speed(iter/s)": 1.437997 }, { "epoch": 3.821387258472216, "grad_norm": 8.237466812133789, "learning_rate": 1.3094925782714335e-05, "loss": 2.2999019622802734, "memory(GiB)": 77.56, "step": 89195, "token_acc": 0.483271375464684, "train_speed(iter/s)": 1.437992 }, { "epoch": 3.821601473801465, "grad_norm": 5.856817245483398, "learning_rate": 1.3090385608417716e-05, "loss": 2.5226097106933594, "memory(GiB)": 77.56, "step": 89200, "token_acc": 0.49063670411985016, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.8218156891307142, "grad_norm": 5.237555503845215, "learning_rate": 1.3085846102773386e-05, "loss": 2.376255989074707, "memory(GiB)": 77.56, "step": 89205, "token_acc": 0.48253968253968255, "train_speed(iter/s)": 1.438011 }, { "epoch": 3.822029904459963, "grad_norm": 5.246608734130859, "learning_rate": 1.3081307265863568e-05, "loss": 2.372589111328125, "memory(GiB)": 77.56, "step": 89210, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.438001 }, { "epoch": 3.822244119789212, "grad_norm": 5.699592590332031, "learning_rate": 1.3076769097770514e-05, "loss": 2.2755014419555666, "memory(GiB)": 77.56, "step": 89215, "token_acc": 0.5176848874598071, "train_speed(iter/s)": 1.437999 }, { "epoch": 3.822458335118461, "grad_norm": 5.594287872314453, "learning_rate": 1.3072231598576423e-05, "loss": 2.566649627685547, "memory(GiB)": 77.56, "step": 89220, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.82267255044771, "grad_norm": 5.3272881507873535, "learning_rate": 1.3067694768363502e-05, "loss": 2.299101638793945, "memory(GiB)": 77.56, "step": 89225, "token_acc": 0.5030120481927711, "train_speed(iter/s)": 1.437999 }, { "epoch": 3.8228867657769587, "grad_norm": 7.6558661460876465, "learning_rate": 1.3063158607213938e-05, "loss": 2.296774482727051, "memory(GiB)": 77.56, "step": 89230, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.438001 }, { "epoch": 3.823100981106208, "grad_norm": 5.500546455383301, "learning_rate": 1.3058623115209884e-05, "loss": 2.2983016967773438, "memory(GiB)": 77.56, "step": 89235, "token_acc": 0.516245487364621, "train_speed(iter/s)": 1.438022 }, { "epoch": 3.823315196435457, "grad_norm": 7.9366044998168945, "learning_rate": 1.3054088292433542e-05, "loss": 2.2561222076416017, "memory(GiB)": 77.56, "step": 89240, "token_acc": 0.5, "train_speed(iter/s)": 1.438031 }, { "epoch": 3.8235294117647056, "grad_norm": 6.5696258544921875, "learning_rate": 1.3049554138967051e-05, "loss": 1.9168405532836914, "memory(GiB)": 77.56, "step": 89245, "token_acc": 0.5601374570446735, "train_speed(iter/s)": 1.438027 }, { "epoch": 3.823743627093955, "grad_norm": 7.222368240356445, "learning_rate": 1.3045020654892537e-05, "loss": 2.2084190368652346, "memory(GiB)": 77.56, "step": 89250, "token_acc": 0.5207667731629393, "train_speed(iter/s)": 1.43805 }, { "epoch": 3.8239578424232037, "grad_norm": 6.450833797454834, "learning_rate": 1.3040487840292148e-05, "loss": 2.4299835205078124, "memory(GiB)": 77.56, "step": 89255, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.438048 }, { "epoch": 3.8241720577524525, "grad_norm": 5.956780910491943, "learning_rate": 1.3035955695247998e-05, "loss": 2.3340259552001954, "memory(GiB)": 77.56, "step": 89260, "token_acc": 0.5, "train_speed(iter/s)": 1.438061 }, { "epoch": 3.8243862730817018, "grad_norm": 6.589070796966553, "learning_rate": 1.3031424219842186e-05, "loss": 2.348227691650391, "memory(GiB)": 77.56, "step": 89265, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.43806 }, { "epoch": 3.8246004884109506, "grad_norm": 6.042869567871094, "learning_rate": 1.3026893414156783e-05, "loss": 2.578312301635742, "memory(GiB)": 77.56, "step": 89270, "token_acc": 0.47904191616766467, "train_speed(iter/s)": 1.438064 }, { "epoch": 3.8248147037401994, "grad_norm": 5.407898426055908, "learning_rate": 1.3022363278273908e-05, "loss": 2.3430669784545897, "memory(GiB)": 77.56, "step": 89275, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.438077 }, { "epoch": 3.8250289190694486, "grad_norm": 6.5342535972595215, "learning_rate": 1.3017833812275609e-05, "loss": 2.171712112426758, "memory(GiB)": 77.56, "step": 89280, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 1.438079 }, { "epoch": 3.8252431343986975, "grad_norm": 5.928194046020508, "learning_rate": 1.3013305016243949e-05, "loss": 2.6452568054199217, "memory(GiB)": 77.56, "step": 89285, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.438082 }, { "epoch": 3.8254573497279463, "grad_norm": 7.538603782653809, "learning_rate": 1.300877689026096e-05, "loss": 2.165818786621094, "memory(GiB)": 77.56, "step": 89290, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.438077 }, { "epoch": 3.8256715650571955, "grad_norm": 5.8745903968811035, "learning_rate": 1.300424943440866e-05, "loss": 2.473246192932129, "memory(GiB)": 77.56, "step": 89295, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.438085 }, { "epoch": 3.8258857803864443, "grad_norm": 6.3269805908203125, "learning_rate": 1.2999722648769103e-05, "loss": 2.387524223327637, "memory(GiB)": 77.56, "step": 89300, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.438102 }, { "epoch": 3.826099995715693, "grad_norm": 6.30497932434082, "learning_rate": 1.299519653342428e-05, "loss": 2.3783422470092774, "memory(GiB)": 77.56, "step": 89305, "token_acc": 0.5168539325842697, "train_speed(iter/s)": 1.4381 }, { "epoch": 3.8263142110449424, "grad_norm": 5.20912504196167, "learning_rate": 1.2990671088456186e-05, "loss": 2.3772661209106447, "memory(GiB)": 77.56, "step": 89310, "token_acc": 0.5408805031446541, "train_speed(iter/s)": 1.438119 }, { "epoch": 3.826528426374191, "grad_norm": 6.4588117599487305, "learning_rate": 1.2986146313946807e-05, "loss": 2.344211959838867, "memory(GiB)": 77.56, "step": 89315, "token_acc": 0.5180327868852459, "train_speed(iter/s)": 1.438103 }, { "epoch": 3.82674264170344, "grad_norm": 5.583101272583008, "learning_rate": 1.298162220997809e-05, "loss": 2.261591339111328, "memory(GiB)": 77.56, "step": 89320, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.438104 }, { "epoch": 3.8269568570326893, "grad_norm": 4.968259334564209, "learning_rate": 1.2977098776632023e-05, "loss": 2.3683284759521483, "memory(GiB)": 77.56, "step": 89325, "token_acc": 0.47752808988764045, "train_speed(iter/s)": 1.438092 }, { "epoch": 3.827171072361938, "grad_norm": 7.383714199066162, "learning_rate": 1.2972576013990534e-05, "loss": 2.296504020690918, "memory(GiB)": 77.56, "step": 89330, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.438089 }, { "epoch": 3.827385287691187, "grad_norm": 6.4993577003479, "learning_rate": 1.2968053922135576e-05, "loss": 2.5657075881958007, "memory(GiB)": 77.56, "step": 89335, "token_acc": 0.47388059701492535, "train_speed(iter/s)": 1.438081 }, { "epoch": 3.827599503020436, "grad_norm": 5.624339580535889, "learning_rate": 1.2963532501149062e-05, "loss": 2.41269588470459, "memory(GiB)": 77.56, "step": 89340, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.438102 }, { "epoch": 3.827813718349685, "grad_norm": 8.805702209472656, "learning_rate": 1.2959011751112898e-05, "loss": 2.1869312286376954, "memory(GiB)": 77.56, "step": 89345, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.438124 }, { "epoch": 3.828027933678934, "grad_norm": 6.802753925323486, "learning_rate": 1.2954491672108987e-05, "loss": 2.471541976928711, "memory(GiB)": 77.56, "step": 89350, "token_acc": 0.47720364741641336, "train_speed(iter/s)": 1.438126 }, { "epoch": 3.828242149008183, "grad_norm": 6.480319976806641, "learning_rate": 1.294997226421919e-05, "loss": 2.4483322143554687, "memory(GiB)": 77.56, "step": 89355, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.438121 }, { "epoch": 3.828456364337432, "grad_norm": 7.437613487243652, "learning_rate": 1.294545352752542e-05, "loss": 2.380429267883301, "memory(GiB)": 77.56, "step": 89360, "token_acc": 0.48548812664907653, "train_speed(iter/s)": 1.438122 }, { "epoch": 3.8286705796666807, "grad_norm": 8.145204544067383, "learning_rate": 1.2940935462109522e-05, "loss": 2.637093353271484, "memory(GiB)": 77.56, "step": 89365, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.438116 }, { "epoch": 3.82888479499593, "grad_norm": 7.535129547119141, "learning_rate": 1.2936418068053335e-05, "loss": 2.110073280334473, "memory(GiB)": 77.56, "step": 89370, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.438116 }, { "epoch": 3.8290990103251787, "grad_norm": 4.610408782958984, "learning_rate": 1.2931901345438708e-05, "loss": 2.3601507186889648, "memory(GiB)": 77.56, "step": 89375, "token_acc": 0.49193548387096775, "train_speed(iter/s)": 1.438141 }, { "epoch": 3.8293132256544276, "grad_norm": 7.503290176391602, "learning_rate": 1.2927385294347444e-05, "loss": 2.346113586425781, "memory(GiB)": 77.56, "step": 89380, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.438149 }, { "epoch": 3.829527440983677, "grad_norm": 6.240098476409912, "learning_rate": 1.292286991486139e-05, "loss": 2.2731552124023438, "memory(GiB)": 77.56, "step": 89385, "token_acc": 0.5641891891891891, "train_speed(iter/s)": 1.438138 }, { "epoch": 3.8297416563129256, "grad_norm": 6.197078227996826, "learning_rate": 1.2918355207062327e-05, "loss": 2.187102699279785, "memory(GiB)": 77.56, "step": 89390, "token_acc": 0.5328185328185329, "train_speed(iter/s)": 1.438128 }, { "epoch": 3.829955871642175, "grad_norm": 7.338923931121826, "learning_rate": 1.2913841171032033e-05, "loss": 2.3082998275756834, "memory(GiB)": 77.56, "step": 89395, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.438131 }, { "epoch": 3.8301700869714237, "grad_norm": 5.5944623947143555, "learning_rate": 1.2909327806852305e-05, "loss": 2.19866886138916, "memory(GiB)": 77.56, "step": 89400, "token_acc": 0.5860655737704918, "train_speed(iter/s)": 1.438131 }, { "epoch": 3.8303843023006725, "grad_norm": 5.738093852996826, "learning_rate": 1.2904815114604901e-05, "loss": 2.348261260986328, "memory(GiB)": 77.56, "step": 89405, "token_acc": 0.5079365079365079, "train_speed(iter/s)": 1.438123 }, { "epoch": 3.8305985176299218, "grad_norm": 5.5591511726379395, "learning_rate": 1.2900303094371568e-05, "loss": 1.9802574157714843, "memory(GiB)": 77.56, "step": 89410, "token_acc": 0.5656108597285068, "train_speed(iter/s)": 1.438126 }, { "epoch": 3.8308127329591706, "grad_norm": 7.392581939697266, "learning_rate": 1.2895791746234032e-05, "loss": 2.502816390991211, "memory(GiB)": 77.56, "step": 89415, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.438112 }, { "epoch": 3.8310269482884194, "grad_norm": 6.14442777633667, "learning_rate": 1.2891281070274052e-05, "loss": 2.414131927490234, "memory(GiB)": 77.56, "step": 89420, "token_acc": 0.49696969696969695, "train_speed(iter/s)": 1.438098 }, { "epoch": 3.8312411636176686, "grad_norm": 6.514246463775635, "learning_rate": 1.2886771066573323e-05, "loss": 2.4701881408691406, "memory(GiB)": 77.56, "step": 89425, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.438103 }, { "epoch": 3.8314553789469175, "grad_norm": 6.374442100524902, "learning_rate": 1.2882261735213552e-05, "loss": 2.2698043823242187, "memory(GiB)": 77.56, "step": 89430, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.43812 }, { "epoch": 3.8316695942761663, "grad_norm": 6.142033576965332, "learning_rate": 1.2877753076276434e-05, "loss": 2.425623321533203, "memory(GiB)": 77.56, "step": 89435, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.438113 }, { "epoch": 3.8318838096054155, "grad_norm": 6.489376068115234, "learning_rate": 1.2873245089843621e-05, "loss": 2.5443748474121093, "memory(GiB)": 77.56, "step": 89440, "token_acc": 0.4691358024691358, "train_speed(iter/s)": 1.438102 }, { "epoch": 3.8320980249346643, "grad_norm": 5.969970226287842, "learning_rate": 1.286873777599682e-05, "loss": 2.1370635986328126, "memory(GiB)": 77.56, "step": 89445, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.438096 }, { "epoch": 3.832312240263913, "grad_norm": 4.6389594078063965, "learning_rate": 1.2864231134817661e-05, "loss": 2.4667181015014648, "memory(GiB)": 77.56, "step": 89450, "token_acc": 0.43564356435643564, "train_speed(iter/s)": 1.438099 }, { "epoch": 3.8325264555931624, "grad_norm": 8.391698837280273, "learning_rate": 1.2859725166387798e-05, "loss": 2.4705230712890627, "memory(GiB)": 77.56, "step": 89455, "token_acc": 0.5170068027210885, "train_speed(iter/s)": 1.438104 }, { "epoch": 3.832740670922411, "grad_norm": 6.3625922203063965, "learning_rate": 1.2855219870788849e-05, "loss": 2.1654340744018556, "memory(GiB)": 77.56, "step": 89460, "token_acc": 0.5425101214574899, "train_speed(iter/s)": 1.438109 }, { "epoch": 3.83295488625166, "grad_norm": 5.567348957061768, "learning_rate": 1.2850715248102425e-05, "loss": 2.2136323928833006, "memory(GiB)": 77.56, "step": 89465, "token_acc": 0.49523809523809526, "train_speed(iter/s)": 1.438138 }, { "epoch": 3.8331691015809093, "grad_norm": 5.1307759284973145, "learning_rate": 1.2846211298410155e-05, "loss": 2.3576023101806642, "memory(GiB)": 77.56, "step": 89470, "token_acc": 0.48031496062992124, "train_speed(iter/s)": 1.438129 }, { "epoch": 3.833383316910158, "grad_norm": 6.851169109344482, "learning_rate": 1.2841708021793608e-05, "loss": 2.3290983200073243, "memory(GiB)": 77.56, "step": 89475, "token_acc": 0.48226950354609927, "train_speed(iter/s)": 1.438127 }, { "epoch": 3.833597532239407, "grad_norm": 5.531679630279541, "learning_rate": 1.2837205418334392e-05, "loss": 2.3891551971435545, "memory(GiB)": 77.56, "step": 89480, "token_acc": 0.5366666666666666, "train_speed(iter/s)": 1.43812 }, { "epoch": 3.833811747568656, "grad_norm": 9.474173545837402, "learning_rate": 1.2832703488114055e-05, "loss": 2.4122554779052736, "memory(GiB)": 77.56, "step": 89485, "token_acc": 0.5347222222222222, "train_speed(iter/s)": 1.438121 }, { "epoch": 3.834025962897905, "grad_norm": 8.35528564453125, "learning_rate": 1.2828202231214171e-05, "loss": 2.1345138549804688, "memory(GiB)": 77.56, "step": 89490, "token_acc": 0.5197368421052632, "train_speed(iter/s)": 1.438097 }, { "epoch": 3.834240178227154, "grad_norm": 4.959789276123047, "learning_rate": 1.2823701647716269e-05, "loss": 2.350536346435547, "memory(GiB)": 77.56, "step": 89495, "token_acc": 0.5, "train_speed(iter/s)": 1.438088 }, { "epoch": 3.834454393556403, "grad_norm": 7.224593162536621, "learning_rate": 1.281920173770187e-05, "loss": 2.2063188552856445, "memory(GiB)": 77.56, "step": 89500, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.438095 }, { "epoch": 3.834454393556403, "eval_loss": 2.2880513668060303, "eval_runtime": 14.1879, "eval_samples_per_second": 7.048, "eval_steps_per_second": 7.048, "eval_token_acc": 0.4837837837837838, "step": 89500 }, { "epoch": 3.834668608885652, "grad_norm": 5.8053412437438965, "learning_rate": 1.2814702501252524e-05, "loss": 2.4667795181274412, "memory(GiB)": 77.56, "step": 89505, "token_acc": 0.48141263940520446, "train_speed(iter/s)": 1.437732 }, { "epoch": 3.8348828242149007, "grad_norm": 6.08140754699707, "learning_rate": 1.2810203938449721e-05, "loss": 2.016112518310547, "memory(GiB)": 77.56, "step": 89510, "token_acc": 0.5461538461538461, "train_speed(iter/s)": 1.437748 }, { "epoch": 3.83509703954415, "grad_norm": 8.658079147338867, "learning_rate": 1.2805706049374967e-05, "loss": 2.3258127212524413, "memory(GiB)": 77.56, "step": 89515, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.437762 }, { "epoch": 3.8353112548733987, "grad_norm": 5.1748151779174805, "learning_rate": 1.280120883410974e-05, "loss": 2.5754547119140625, "memory(GiB)": 77.56, "step": 89520, "token_acc": 0.4305555555555556, "train_speed(iter/s)": 1.437773 }, { "epoch": 3.8355254702026476, "grad_norm": 19.345731735229492, "learning_rate": 1.2796712292735491e-05, "loss": 2.574457550048828, "memory(GiB)": 77.56, "step": 89525, "token_acc": 0.4670846394984326, "train_speed(iter/s)": 1.437775 }, { "epoch": 3.835739685531897, "grad_norm": 6.9523138999938965, "learning_rate": 1.2792216425333715e-05, "loss": 2.2901451110839846, "memory(GiB)": 77.56, "step": 89530, "token_acc": 0.484375, "train_speed(iter/s)": 1.437781 }, { "epoch": 3.8359539008611456, "grad_norm": 6.754496097564697, "learning_rate": 1.2787721231985845e-05, "loss": 2.398523712158203, "memory(GiB)": 77.56, "step": 89535, "token_acc": 0.4557377049180328, "train_speed(iter/s)": 1.437754 }, { "epoch": 3.8361681161903944, "grad_norm": 5.389326572418213, "learning_rate": 1.2783226712773305e-05, "loss": 2.414544105529785, "memory(GiB)": 77.56, "step": 89540, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.8363823315196437, "grad_norm": 6.015725612640381, "learning_rate": 1.2778732867777515e-05, "loss": 2.461769866943359, "memory(GiB)": 77.56, "step": 89545, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.8365965468488925, "grad_norm": 6.404618740081787, "learning_rate": 1.2774239697079915e-05, "loss": 2.221240425109863, "memory(GiB)": 77.56, "step": 89550, "token_acc": 0.5198412698412699, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.8368107621781413, "grad_norm": 6.301713466644287, "learning_rate": 1.2769747200761878e-05, "loss": 2.2808141708374023, "memory(GiB)": 77.56, "step": 89555, "token_acc": 0.4872881355932203, "train_speed(iter/s)": 1.437784 }, { "epoch": 3.8370249775073906, "grad_norm": 6.150553226470947, "learning_rate": 1.2765255378904778e-05, "loss": 2.2991519927978517, "memory(GiB)": 77.56, "step": 89560, "token_acc": 0.5177304964539007, "train_speed(iter/s)": 1.437789 }, { "epoch": 3.8372391928366394, "grad_norm": 6.455543518066406, "learning_rate": 1.2760764231590027e-05, "loss": 1.9785884857177733, "memory(GiB)": 77.56, "step": 89565, "token_acc": 0.5962962962962963, "train_speed(iter/s)": 1.437801 }, { "epoch": 3.837453408165888, "grad_norm": 5.895117282867432, "learning_rate": 1.2756273758898962e-05, "loss": 2.1818775177001952, "memory(GiB)": 77.56, "step": 89570, "token_acc": 0.5244755244755245, "train_speed(iter/s)": 1.437811 }, { "epoch": 3.8376676234951375, "grad_norm": 4.784303665161133, "learning_rate": 1.2751783960912933e-05, "loss": 2.532892417907715, "memory(GiB)": 77.56, "step": 89575, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.437822 }, { "epoch": 3.8378818388243863, "grad_norm": 5.0392913818359375, "learning_rate": 1.2747294837713285e-05, "loss": 2.581121253967285, "memory(GiB)": 77.56, "step": 89580, "token_acc": 0.517799352750809, "train_speed(iter/s)": 1.437817 }, { "epoch": 3.838096054153635, "grad_norm": 5.183769702911377, "learning_rate": 1.2742806389381318e-05, "loss": 2.112078094482422, "memory(GiB)": 77.56, "step": 89585, "token_acc": 0.5377049180327869, "train_speed(iter/s)": 1.437817 }, { "epoch": 3.8383102694828843, "grad_norm": 5.146134853363037, "learning_rate": 1.2738318615998379e-05, "loss": 2.3564823150634764, "memory(GiB)": 77.56, "step": 89590, "token_acc": 0.5131964809384164, "train_speed(iter/s)": 1.437825 }, { "epoch": 3.838524484812133, "grad_norm": 6.693490505218506, "learning_rate": 1.273383151764575e-05, "loss": 2.170207977294922, "memory(GiB)": 77.56, "step": 89595, "token_acc": 0.5518394648829431, "train_speed(iter/s)": 1.437847 }, { "epoch": 3.838738700141382, "grad_norm": 5.216087341308594, "learning_rate": 1.2729345094404727e-05, "loss": 2.466537857055664, "memory(GiB)": 77.56, "step": 89600, "token_acc": 0.49846153846153846, "train_speed(iter/s)": 1.437851 }, { "epoch": 3.838952915470631, "grad_norm": 4.496460914611816, "learning_rate": 1.2724859346356577e-05, "loss": 2.5224403381347655, "memory(GiB)": 77.56, "step": 89605, "token_acc": 0.49707602339181284, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.83916713079988, "grad_norm": 4.567159175872803, "learning_rate": 1.2720374273582553e-05, "loss": 2.3182855606079102, "memory(GiB)": 77.56, "step": 89610, "token_acc": 0.5242718446601942, "train_speed(iter/s)": 1.437843 }, { "epoch": 3.839381346129129, "grad_norm": 5.612655162811279, "learning_rate": 1.2715889876163933e-05, "loss": 2.3745309829711916, "memory(GiB)": 77.56, "step": 89615, "token_acc": 0.4806451612903226, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.839595561458378, "grad_norm": 7.709139347076416, "learning_rate": 1.2711406154181933e-05, "loss": 2.1623903274536134, "memory(GiB)": 77.56, "step": 89620, "token_acc": 0.5205882352941177, "train_speed(iter/s)": 1.437845 }, { "epoch": 3.839809776787627, "grad_norm": 5.680721759796143, "learning_rate": 1.27069231077178e-05, "loss": 1.8974676132202148, "memory(GiB)": 77.56, "step": 89625, "token_acc": 0.5204460966542751, "train_speed(iter/s)": 1.437847 }, { "epoch": 3.8400239921168757, "grad_norm": 5.09709358215332, "learning_rate": 1.270244073685274e-05, "loss": 2.3554325103759766, "memory(GiB)": 77.56, "step": 89630, "token_acc": 0.528169014084507, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.840238207446125, "grad_norm": 4.189144134521484, "learning_rate": 1.269795904166795e-05, "loss": 2.1337955474853514, "memory(GiB)": 77.56, "step": 89635, "token_acc": 0.5225563909774437, "train_speed(iter/s)": 1.437869 }, { "epoch": 3.840452422775374, "grad_norm": 4.511162281036377, "learning_rate": 1.2693478022244631e-05, "loss": 2.325242042541504, "memory(GiB)": 77.56, "step": 89640, "token_acc": 0.5, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.8406666381046226, "grad_norm": 5.370865821838379, "learning_rate": 1.268899767866394e-05, "loss": 2.1646139144897463, "memory(GiB)": 77.56, "step": 89645, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.437879 }, { "epoch": 3.840880853433872, "grad_norm": 5.54640007019043, "learning_rate": 1.2684518011007062e-05, "loss": 2.289965057373047, "memory(GiB)": 77.56, "step": 89650, "token_acc": 0.4831804281345566, "train_speed(iter/s)": 1.437883 }, { "epoch": 3.8410950687631207, "grad_norm": 6.399964809417725, "learning_rate": 1.2680039019355156e-05, "loss": 2.238947105407715, "memory(GiB)": 77.56, "step": 89655, "token_acc": 0.5369127516778524, "train_speed(iter/s)": 1.437884 }, { "epoch": 3.8413092840923695, "grad_norm": 6.106026649475098, "learning_rate": 1.2675560703789347e-05, "loss": 2.300439643859863, "memory(GiB)": 77.56, "step": 89660, "token_acc": 0.5192878338278932, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.8415234994216187, "grad_norm": 6.112517356872559, "learning_rate": 1.2671083064390765e-05, "loss": 2.6393333435058595, "memory(GiB)": 77.56, "step": 89665, "token_acc": 0.47572815533980584, "train_speed(iter/s)": 1.437898 }, { "epoch": 3.8417377147508676, "grad_norm": 6.421717166900635, "learning_rate": 1.2666606101240525e-05, "loss": 2.2042654037475584, "memory(GiB)": 77.56, "step": 89670, "token_acc": 0.5, "train_speed(iter/s)": 1.437918 }, { "epoch": 3.8419519300801164, "grad_norm": 4.967819690704346, "learning_rate": 1.2662129814419742e-05, "loss": 2.5706480026245115, "memory(GiB)": 77.56, "step": 89675, "token_acc": 0.4739583333333333, "train_speed(iter/s)": 1.437919 }, { "epoch": 3.8421661454093656, "grad_norm": 5.273562431335449, "learning_rate": 1.2657654204009512e-05, "loss": 2.1526462554931642, "memory(GiB)": 77.56, "step": 89680, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.8423803607386144, "grad_norm": 5.4288554191589355, "learning_rate": 1.2653179270090897e-05, "loss": 2.146344757080078, "memory(GiB)": 77.56, "step": 89685, "token_acc": 0.5298804780876494, "train_speed(iter/s)": 1.43794 }, { "epoch": 3.8425945760678633, "grad_norm": 4.961272239685059, "learning_rate": 1.2648705012744983e-05, "loss": 2.5688634872436524, "memory(GiB)": 77.56, "step": 89690, "token_acc": 0.43197278911564624, "train_speed(iter/s)": 1.437931 }, { "epoch": 3.8428087913971125, "grad_norm": 5.9478068351745605, "learning_rate": 1.2644231432052794e-05, "loss": 2.3850337982177736, "memory(GiB)": 77.56, "step": 89695, "token_acc": 0.5167173252279635, "train_speed(iter/s)": 1.437937 }, { "epoch": 3.8430230067263613, "grad_norm": 6.486205101013184, "learning_rate": 1.2639758528095414e-05, "loss": 2.3262929916381836, "memory(GiB)": 77.56, "step": 89700, "token_acc": 0.5409252669039146, "train_speed(iter/s)": 1.43791 }, { "epoch": 3.84323722205561, "grad_norm": 5.780078887939453, "learning_rate": 1.2635286300953841e-05, "loss": 2.70859375, "memory(GiB)": 77.56, "step": 89705, "token_acc": 0.49142857142857144, "train_speed(iter/s)": 1.437912 }, { "epoch": 3.8434514373848594, "grad_norm": 7.040881633758545, "learning_rate": 1.2630814750709119e-05, "loss": 2.346533012390137, "memory(GiB)": 77.56, "step": 89710, "token_acc": 0.4789272030651341, "train_speed(iter/s)": 1.437922 }, { "epoch": 3.843665652714108, "grad_norm": 7.47244119644165, "learning_rate": 1.2626343877442242e-05, "loss": 2.394953155517578, "memory(GiB)": 77.56, "step": 89715, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.843879868043357, "grad_norm": 5.17090368270874, "learning_rate": 1.2621873681234208e-05, "loss": 2.286424255371094, "memory(GiB)": 77.56, "step": 89720, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437908 }, { "epoch": 3.8440940833726063, "grad_norm": 5.325435161590576, "learning_rate": 1.2617404162165997e-05, "loss": 2.4823562622070314, "memory(GiB)": 77.56, "step": 89725, "token_acc": 0.4807121661721068, "train_speed(iter/s)": 1.437914 }, { "epoch": 3.844308298701855, "grad_norm": 6.1892619132995605, "learning_rate": 1.2612935320318558e-05, "loss": 1.9413860321044922, "memory(GiB)": 77.56, "step": 89730, "token_acc": 0.584192439862543, "train_speed(iter/s)": 1.437894 }, { "epoch": 3.8445225140311043, "grad_norm": 4.755699157714844, "learning_rate": 1.2608467155772885e-05, "loss": 2.2821407318115234, "memory(GiB)": 77.56, "step": 89735, "token_acc": 0.5317725752508361, "train_speed(iter/s)": 1.437895 }, { "epoch": 3.844736729360353, "grad_norm": 5.3083271980285645, "learning_rate": 1.2603999668609906e-05, "loss": 2.456380844116211, "memory(GiB)": 77.56, "step": 89740, "token_acc": 0.47076023391812866, "train_speed(iter/s)": 1.437895 }, { "epoch": 3.844950944689602, "grad_norm": 6.417150020599365, "learning_rate": 1.2599532858910556e-05, "loss": 2.306637001037598, "memory(GiB)": 77.56, "step": 89745, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.437903 }, { "epoch": 3.8451651600188512, "grad_norm": 6.258324146270752, "learning_rate": 1.259506672675575e-05, "loss": 2.401631736755371, "memory(GiB)": 77.56, "step": 89750, "token_acc": 0.4981684981684982, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.8453793753481, "grad_norm": 5.937260150909424, "learning_rate": 1.2590601272226383e-05, "loss": 2.233934783935547, "memory(GiB)": 77.56, "step": 89755, "token_acc": 0.5360824742268041, "train_speed(iter/s)": 1.43792 }, { "epoch": 3.845593590677349, "grad_norm": 6.198304653167725, "learning_rate": 1.2586136495403384e-05, "loss": 1.954640769958496, "memory(GiB)": 77.56, "step": 89760, "token_acc": 0.596, "train_speed(iter/s)": 1.437917 }, { "epoch": 3.845807806006598, "grad_norm": 6.78243350982666, "learning_rate": 1.25816723963676e-05, "loss": 2.422947883605957, "memory(GiB)": 77.56, "step": 89765, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 1.43794 }, { "epoch": 3.846022021335847, "grad_norm": 6.163959980010986, "learning_rate": 1.257720897519994e-05, "loss": 2.6398540496826173, "memory(GiB)": 77.56, "step": 89770, "token_acc": 0.45671641791044776, "train_speed(iter/s)": 1.437946 }, { "epoch": 3.8462362366650957, "grad_norm": 6.760027885437012, "learning_rate": 1.2572746231981247e-05, "loss": 2.0449737548828124, "memory(GiB)": 77.56, "step": 89775, "token_acc": 0.5080645161290323, "train_speed(iter/s)": 1.437962 }, { "epoch": 3.846450451994345, "grad_norm": 7.761449337005615, "learning_rate": 1.256828416679236e-05, "loss": 2.4073207855224608, "memory(GiB)": 77.56, "step": 89780, "token_acc": 0.5, "train_speed(iter/s)": 1.437966 }, { "epoch": 3.846664667323594, "grad_norm": 5.966902256011963, "learning_rate": 1.2563822779714125e-05, "loss": 2.4390159606933595, "memory(GiB)": 77.56, "step": 89785, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.437971 }, { "epoch": 3.8468788826528426, "grad_norm": 7.9838995933532715, "learning_rate": 1.2559362070827341e-05, "loss": 2.053150939941406, "memory(GiB)": 77.56, "step": 89790, "token_acc": 0.5392953929539296, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.847093097982092, "grad_norm": 5.227750778198242, "learning_rate": 1.2554902040212846e-05, "loss": 2.272739219665527, "memory(GiB)": 77.56, "step": 89795, "token_acc": 0.5291970802919708, "train_speed(iter/s)": 1.437979 }, { "epoch": 3.8473073133113407, "grad_norm": 7.420921325683594, "learning_rate": 1.2550442687951431e-05, "loss": 2.548366928100586, "memory(GiB)": 77.56, "step": 89800, "token_acc": 0.4601226993865031, "train_speed(iter/s)": 1.437971 }, { "epoch": 3.8475215286405895, "grad_norm": 6.045298099517822, "learning_rate": 1.2545984014123879e-05, "loss": 2.469654083251953, "memory(GiB)": 77.56, "step": 89805, "token_acc": 0.4668769716088328, "train_speed(iter/s)": 1.437968 }, { "epoch": 3.8477357439698388, "grad_norm": 5.813005447387695, "learning_rate": 1.2541526018810956e-05, "loss": 2.4589685440063476, "memory(GiB)": 77.56, "step": 89810, "token_acc": 0.4934640522875817, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.8479499592990876, "grad_norm": 7.683053493499756, "learning_rate": 1.253706870209342e-05, "loss": 2.4205291748046873, "memory(GiB)": 77.56, "step": 89815, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.437972 }, { "epoch": 3.8481641746283364, "grad_norm": 5.512688159942627, "learning_rate": 1.2532612064052036e-05, "loss": 2.481915283203125, "memory(GiB)": 77.56, "step": 89820, "token_acc": 0.4746268656716418, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.8483783899575856, "grad_norm": 6.90846586227417, "learning_rate": 1.2528156104767536e-05, "loss": 2.2012306213378907, "memory(GiB)": 77.56, "step": 89825, "token_acc": 0.5254901960784314, "train_speed(iter/s)": 1.437995 }, { "epoch": 3.8485926052868344, "grad_norm": 5.92537784576416, "learning_rate": 1.2523700824320638e-05, "loss": 2.336228942871094, "memory(GiB)": 77.56, "step": 89830, "token_acc": 0.5422077922077922, "train_speed(iter/s)": 1.437994 }, { "epoch": 3.8488068206160833, "grad_norm": 5.809115886688232, "learning_rate": 1.2519246222792053e-05, "loss": 2.655837821960449, "memory(GiB)": 77.56, "step": 89835, "token_acc": 0.47315436241610737, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.8490210359453325, "grad_norm": 4.90449333190918, "learning_rate": 1.2514792300262474e-05, "loss": 2.62183837890625, "memory(GiB)": 77.56, "step": 89840, "token_acc": 0.4768392370572207, "train_speed(iter/s)": 1.437993 }, { "epoch": 3.8492352512745813, "grad_norm": 6.031124591827393, "learning_rate": 1.2510339056812603e-05, "loss": 2.1152647018432615, "memory(GiB)": 77.56, "step": 89845, "token_acc": 0.568, "train_speed(iter/s)": 1.438005 }, { "epoch": 3.84944946660383, "grad_norm": 6.151834011077881, "learning_rate": 1.25058864925231e-05, "loss": 2.4015045166015625, "memory(GiB)": 77.56, "step": 89850, "token_acc": 0.5239852398523985, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.8496636819330794, "grad_norm": 5.381189823150635, "learning_rate": 1.2501434607474644e-05, "loss": 2.150976371765137, "memory(GiB)": 77.56, "step": 89855, "token_acc": 0.5086705202312138, "train_speed(iter/s)": 1.437994 }, { "epoch": 3.849877897262328, "grad_norm": 7.384481906890869, "learning_rate": 1.249698340174788e-05, "loss": 2.34448127746582, "memory(GiB)": 77.56, "step": 89860, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.43799 }, { "epoch": 3.850092112591577, "grad_norm": 4.894954204559326, "learning_rate": 1.2492532875423445e-05, "loss": 1.7909559249877929, "memory(GiB)": 77.56, "step": 89865, "token_acc": 0.5582329317269076, "train_speed(iter/s)": 1.438003 }, { "epoch": 3.8503063279208263, "grad_norm": 6.887175559997559, "learning_rate": 1.248808302858196e-05, "loss": 2.219059371948242, "memory(GiB)": 77.56, "step": 89870, "token_acc": 0.5166666666666667, "train_speed(iter/s)": 1.438008 }, { "epoch": 3.850520543250075, "grad_norm": 7.515207290649414, "learning_rate": 1.2483633861304023e-05, "loss": 2.2352569580078123, "memory(GiB)": 77.56, "step": 89875, "token_acc": 0.5451388888888888, "train_speed(iter/s)": 1.438016 }, { "epoch": 3.850734758579324, "grad_norm": 6.325348854064941, "learning_rate": 1.2479185373670272e-05, "loss": 2.1009830474853515, "memory(GiB)": 77.56, "step": 89880, "token_acc": 0.48672566371681414, "train_speed(iter/s)": 1.438027 }, { "epoch": 3.850948973908573, "grad_norm": 6.187824726104736, "learning_rate": 1.2474737565761268e-05, "loss": 2.4757076263427735, "memory(GiB)": 77.56, "step": 89885, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438033 }, { "epoch": 3.851163189237822, "grad_norm": 5.819389343261719, "learning_rate": 1.24702904376576e-05, "loss": 2.7836130142211912, "memory(GiB)": 77.56, "step": 89890, "token_acc": 0.41919191919191917, "train_speed(iter/s)": 1.438024 }, { "epoch": 3.851377404567071, "grad_norm": 5.284809589385986, "learning_rate": 1.2465843989439824e-05, "loss": 2.3953598022460936, "memory(GiB)": 77.56, "step": 89895, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.438032 }, { "epoch": 3.85159161989632, "grad_norm": 5.492016792297363, "learning_rate": 1.246139822118848e-05, "loss": 2.2401538848876954, "memory(GiB)": 77.56, "step": 89900, "token_acc": 0.5939849624060151, "train_speed(iter/s)": 1.438042 }, { "epoch": 3.851805835225569, "grad_norm": 6.976442337036133, "learning_rate": 1.2456953132984133e-05, "loss": 2.1454036712646483, "memory(GiB)": 77.56, "step": 89905, "token_acc": 0.5045045045045045, "train_speed(iter/s)": 1.43805 }, { "epoch": 3.8520200505548177, "grad_norm": 5.561842918395996, "learning_rate": 1.2452508724907303e-05, "loss": 2.2576921463012694, "memory(GiB)": 77.56, "step": 89910, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.438052 }, { "epoch": 3.852234265884067, "grad_norm": 6.199827194213867, "learning_rate": 1.2448064997038483e-05, "loss": 2.152077484130859, "memory(GiB)": 77.56, "step": 89915, "token_acc": 0.5053763440860215, "train_speed(iter/s)": 1.438052 }, { "epoch": 3.8524484812133157, "grad_norm": 8.509712219238281, "learning_rate": 1.2443621949458206e-05, "loss": 2.496550178527832, "memory(GiB)": 77.56, "step": 89920, "token_acc": 0.48464163822525597, "train_speed(iter/s)": 1.438059 }, { "epoch": 3.8526626965425645, "grad_norm": 6.870933532714844, "learning_rate": 1.2439179582246952e-05, "loss": 2.1269506454467773, "memory(GiB)": 77.56, "step": 89925, "token_acc": 0.5291970802919708, "train_speed(iter/s)": 1.438049 }, { "epoch": 3.852876911871814, "grad_norm": 9.983698844909668, "learning_rate": 1.2434737895485188e-05, "loss": 2.5557470321655273, "memory(GiB)": 77.56, "step": 89930, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 1.438062 }, { "epoch": 3.8530911272010626, "grad_norm": 7.707968711853027, "learning_rate": 1.2430296889253378e-05, "loss": 2.474230194091797, "memory(GiB)": 77.56, "step": 89935, "token_acc": 0.4962121212121212, "train_speed(iter/s)": 1.43806 }, { "epoch": 3.8533053425303114, "grad_norm": 5.554915904998779, "learning_rate": 1.2425856563631993e-05, "loss": 2.271873664855957, "memory(GiB)": 77.56, "step": 89940, "token_acc": 0.5535168195718655, "train_speed(iter/s)": 1.438056 }, { "epoch": 3.8535195578595607, "grad_norm": 7.130796432495117, "learning_rate": 1.2421416918701467e-05, "loss": 2.307566261291504, "memory(GiB)": 77.56, "step": 89945, "token_acc": 0.5368421052631579, "train_speed(iter/s)": 1.438065 }, { "epoch": 3.8537337731888095, "grad_norm": 6.80617618560791, "learning_rate": 1.2416977954542225e-05, "loss": 2.395248031616211, "memory(GiB)": 77.56, "step": 89950, "token_acc": 0.5075757575757576, "train_speed(iter/s)": 1.438075 }, { "epoch": 3.8539479885180583, "grad_norm": 6.161218166351318, "learning_rate": 1.241253967123468e-05, "loss": 2.507639694213867, "memory(GiB)": 77.56, "step": 89955, "token_acc": 0.4621212121212121, "train_speed(iter/s)": 1.438076 }, { "epoch": 3.8541622038473076, "grad_norm": 4.90719747543335, "learning_rate": 1.2408102068859228e-05, "loss": 2.6448198318481446, "memory(GiB)": 77.56, "step": 89960, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.438078 }, { "epoch": 3.8543764191765564, "grad_norm": 8.31674575805664, "learning_rate": 1.2403665147496285e-05, "loss": 2.3424072265625, "memory(GiB)": 77.56, "step": 89965, "token_acc": 0.4765625, "train_speed(iter/s)": 1.438084 }, { "epoch": 3.854590634505805, "grad_norm": 5.838057994842529, "learning_rate": 1.2399228907226212e-05, "loss": 2.4374057769775392, "memory(GiB)": 77.56, "step": 89970, "token_acc": 0.48623853211009177, "train_speed(iter/s)": 1.438097 }, { "epoch": 3.8548048498350544, "grad_norm": 4.9149956703186035, "learning_rate": 1.2394793348129385e-05, "loss": 2.411842155456543, "memory(GiB)": 77.56, "step": 89975, "token_acc": 0.48546511627906974, "train_speed(iter/s)": 1.438115 }, { "epoch": 3.8550190651643033, "grad_norm": 7.946128845214844, "learning_rate": 1.2390358470286151e-05, "loss": 2.6184871673583983, "memory(GiB)": 77.56, "step": 89980, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.438118 }, { "epoch": 3.855233280493552, "grad_norm": 11.506965637207031, "learning_rate": 1.2385924273776839e-05, "loss": 2.282339859008789, "memory(GiB)": 77.56, "step": 89985, "token_acc": 0.4962686567164179, "train_speed(iter/s)": 1.438117 }, { "epoch": 3.8554474958228013, "grad_norm": 4.797327041625977, "learning_rate": 1.2381490758681808e-05, "loss": 2.4253971099853517, "memory(GiB)": 77.56, "step": 89990, "token_acc": 0.534965034965035, "train_speed(iter/s)": 1.438104 }, { "epoch": 3.85566171115205, "grad_norm": 6.015229225158691, "learning_rate": 1.2377057925081342e-05, "loss": 2.24902286529541, "memory(GiB)": 77.56, "step": 89995, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 1.438101 }, { "epoch": 3.855875926481299, "grad_norm": 6.512338161468506, "learning_rate": 1.2372625773055784e-05, "loss": 2.1010799407958984, "memory(GiB)": 77.56, "step": 90000, "token_acc": 0.5125, "train_speed(iter/s)": 1.438098 }, { "epoch": 3.855875926481299, "eval_loss": 2.1195545196533203, "eval_runtime": 14.1336, "eval_samples_per_second": 7.075, "eval_steps_per_second": 7.075, "eval_token_acc": 0.47844228094575797, "step": 90000 }, { "epoch": 3.856090141810548, "grad_norm": 5.665398597717285, "learning_rate": 1.236819430268541e-05, "loss": 2.787649726867676, "memory(GiB)": 77.56, "step": 90005, "token_acc": 0.485456369107322, "train_speed(iter/s)": 1.437773 }, { "epoch": 3.856304357139797, "grad_norm": 5.269782066345215, "learning_rate": 1.2363763514050496e-05, "loss": 2.5828184127807616, "memory(GiB)": 77.56, "step": 90010, "token_acc": 0.4969512195121951, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.856518572469046, "grad_norm": 6.276030540466309, "learning_rate": 1.235933340723131e-05, "loss": 2.0275732040405274, "memory(GiB)": 77.56, "step": 90015, "token_acc": 0.575, "train_speed(iter/s)": 1.437781 }, { "epoch": 3.856732787798295, "grad_norm": 6.831111431121826, "learning_rate": 1.2354903982308098e-05, "loss": 2.207332992553711, "memory(GiB)": 77.56, "step": 90020, "token_acc": 0.4889867841409692, "train_speed(iter/s)": 1.437796 }, { "epoch": 3.856947003127544, "grad_norm": 7.143763542175293, "learning_rate": 1.2350475239361126e-05, "loss": 2.3331787109375, "memory(GiB)": 77.56, "step": 90025, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.437801 }, { "epoch": 3.8571612184567927, "grad_norm": 6.3429436683654785, "learning_rate": 1.234604717847061e-05, "loss": 2.441649627685547, "memory(GiB)": 77.56, "step": 90030, "token_acc": 0.48344370860927155, "train_speed(iter/s)": 1.437811 }, { "epoch": 3.857375433786042, "grad_norm": 5.202672481536865, "learning_rate": 1.2341619799716775e-05, "loss": 2.5058883666992187, "memory(GiB)": 77.56, "step": 90035, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.437814 }, { "epoch": 3.857589649115291, "grad_norm": 6.648130893707275, "learning_rate": 1.2337193103179823e-05, "loss": 2.1851377487182617, "memory(GiB)": 77.56, "step": 90040, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437812 }, { "epoch": 3.8578038644445396, "grad_norm": 6.269743919372559, "learning_rate": 1.2332767088939935e-05, "loss": 2.31964168548584, "memory(GiB)": 77.56, "step": 90045, "token_acc": 0.5300751879699248, "train_speed(iter/s)": 1.437822 }, { "epoch": 3.858018079773789, "grad_norm": 4.7640604972839355, "learning_rate": 1.2328341757077322e-05, "loss": 2.395347785949707, "memory(GiB)": 77.56, "step": 90050, "token_acc": 0.5288753799392097, "train_speed(iter/s)": 1.437806 }, { "epoch": 3.8582322951030377, "grad_norm": 6.6956377029418945, "learning_rate": 1.2323917107672134e-05, "loss": 2.1309988021850588, "memory(GiB)": 77.56, "step": 90055, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.437808 }, { "epoch": 3.8584465104322865, "grad_norm": 5.487354278564453, "learning_rate": 1.2319493140804534e-05, "loss": 2.440011978149414, "memory(GiB)": 77.56, "step": 90060, "token_acc": 0.4425087108013937, "train_speed(iter/s)": 1.437796 }, { "epoch": 3.8586607257615357, "grad_norm": 7.776607513427734, "learning_rate": 1.2315069856554645e-05, "loss": 2.5134429931640625, "memory(GiB)": 77.56, "step": 90065, "token_acc": 0.4956521739130435, "train_speed(iter/s)": 1.437802 }, { "epoch": 3.8588749410907845, "grad_norm": 7.0737409591674805, "learning_rate": 1.2310647255002634e-05, "loss": 2.5011013031005858, "memory(GiB)": 77.56, "step": 90070, "token_acc": 0.5029411764705882, "train_speed(iter/s)": 1.437819 }, { "epoch": 3.8590891564200334, "grad_norm": 7.587624549865723, "learning_rate": 1.23062253362286e-05, "loss": 2.2718746185302736, "memory(GiB)": 77.56, "step": 90075, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.437813 }, { "epoch": 3.8593033717492826, "grad_norm": 5.467663288116455, "learning_rate": 1.2301804100312642e-05, "loss": 2.3961555480957033, "memory(GiB)": 77.56, "step": 90080, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.437809 }, { "epoch": 3.8595175870785314, "grad_norm": 5.291189193725586, "learning_rate": 1.229738354733488e-05, "loss": 2.373830795288086, "memory(GiB)": 77.56, "step": 90085, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.437819 }, { "epoch": 3.8597318024077802, "grad_norm": 4.977430820465088, "learning_rate": 1.229296367737538e-05, "loss": 2.430828666687012, "memory(GiB)": 77.56, "step": 90090, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.8599460177370295, "grad_norm": 6.836342811584473, "learning_rate": 1.228854449051422e-05, "loss": 2.0693782806396483, "memory(GiB)": 77.56, "step": 90095, "token_acc": 0.5222672064777328, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.8601602330662783, "grad_norm": 4.87190055847168, "learning_rate": 1.2284125986831452e-05, "loss": 2.5876340866088867, "memory(GiB)": 77.56, "step": 90100, "token_acc": 0.46229508196721314, "train_speed(iter/s)": 1.437846 }, { "epoch": 3.860374448395527, "grad_norm": 5.780139923095703, "learning_rate": 1.2279708166407106e-05, "loss": 2.474539947509766, "memory(GiB)": 77.56, "step": 90105, "token_acc": 0.4962962962962963, "train_speed(iter/s)": 1.437823 }, { "epoch": 3.8605886637247764, "grad_norm": 4.127979278564453, "learning_rate": 1.2275291029321246e-05, "loss": 2.362444305419922, "memory(GiB)": 77.56, "step": 90110, "token_acc": 0.4801223241590214, "train_speed(iter/s)": 1.43781 }, { "epoch": 3.860802879054025, "grad_norm": 5.123636245727539, "learning_rate": 1.227087457565388e-05, "loss": 2.400991439819336, "memory(GiB)": 77.56, "step": 90115, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.437828 }, { "epoch": 3.861017094383274, "grad_norm": 4.854820251464844, "learning_rate": 1.2266458805485009e-05, "loss": 2.28357048034668, "memory(GiB)": 77.56, "step": 90120, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.437827 }, { "epoch": 3.8612313097125233, "grad_norm": 5.11182975769043, "learning_rate": 1.2262043718894633e-05, "loss": 2.468713569641113, "memory(GiB)": 77.56, "step": 90125, "token_acc": 0.47096774193548385, "train_speed(iter/s)": 1.437826 }, { "epoch": 3.861445525041772, "grad_norm": 6.18702507019043, "learning_rate": 1.2257629315962726e-05, "loss": 2.417119598388672, "memory(GiB)": 77.56, "step": 90130, "token_acc": 0.4983388704318937, "train_speed(iter/s)": 1.437822 }, { "epoch": 3.861659740371021, "grad_norm": 5.6934051513671875, "learning_rate": 1.2253215596769285e-05, "loss": 2.258602523803711, "memory(GiB)": 77.56, "step": 90135, "token_acc": 0.4820359281437126, "train_speed(iter/s)": 1.437835 }, { "epoch": 3.86187395570027, "grad_norm": 5.434886455535889, "learning_rate": 1.2248802561394234e-05, "loss": 2.2361404418945314, "memory(GiB)": 77.56, "step": 90140, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.437855 }, { "epoch": 3.862088171029519, "grad_norm": 7.693119525909424, "learning_rate": 1.2244390209917555e-05, "loss": 2.6497882843017577, "memory(GiB)": 77.56, "step": 90145, "token_acc": 0.5072463768115942, "train_speed(iter/s)": 1.437879 }, { "epoch": 3.8623023863587678, "grad_norm": 9.227799415588379, "learning_rate": 1.2239978542419162e-05, "loss": 2.240663528442383, "memory(GiB)": 77.56, "step": 90150, "token_acc": 0.5346153846153846, "train_speed(iter/s)": 1.437865 }, { "epoch": 3.862516601688017, "grad_norm": 6.612579822540283, "learning_rate": 1.2235567558978983e-05, "loss": 2.1815208435058593, "memory(GiB)": 77.56, "step": 90155, "token_acc": 0.5258358662613982, "train_speed(iter/s)": 1.437849 }, { "epoch": 3.862730817017266, "grad_norm": 4.85897970199585, "learning_rate": 1.2231157259676923e-05, "loss": 2.305440330505371, "memory(GiB)": 77.56, "step": 90160, "token_acc": 0.5220338983050847, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.8629450323465147, "grad_norm": 8.610416412353516, "learning_rate": 1.2226747644592862e-05, "loss": 2.2540578842163086, "memory(GiB)": 77.56, "step": 90165, "token_acc": 0.4816053511705686, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.863159247675764, "grad_norm": 7.405158042907715, "learning_rate": 1.2222338713806718e-05, "loss": 2.297007751464844, "memory(GiB)": 77.56, "step": 90170, "token_acc": 0.5201238390092879, "train_speed(iter/s)": 1.437879 }, { "epoch": 3.8633734630050127, "grad_norm": 7.38932466506958, "learning_rate": 1.2217930467398348e-05, "loss": 2.5066806793212892, "memory(GiB)": 77.56, "step": 90175, "token_acc": 0.46885245901639344, "train_speed(iter/s)": 1.437889 }, { "epoch": 3.8635876783342615, "grad_norm": 6.427505016326904, "learning_rate": 1.2213522905447612e-05, "loss": 2.128851890563965, "memory(GiB)": 77.56, "step": 90180, "token_acc": 0.5157232704402516, "train_speed(iter/s)": 1.43788 }, { "epoch": 3.863801893663511, "grad_norm": 5.607212066650391, "learning_rate": 1.2209116028034351e-05, "loss": 2.5071102142333985, "memory(GiB)": 77.56, "step": 90185, "token_acc": 0.4980544747081712, "train_speed(iter/s)": 1.437881 }, { "epoch": 3.8640161089927596, "grad_norm": 6.2990264892578125, "learning_rate": 1.2204709835238388e-05, "loss": 2.4463857650756835, "memory(GiB)": 77.56, "step": 90190, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.8642303243220084, "grad_norm": 6.721704006195068, "learning_rate": 1.2200304327139578e-05, "loss": 2.769192123413086, "memory(GiB)": 77.56, "step": 90195, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.437883 }, { "epoch": 3.8644445396512577, "grad_norm": 6.095344543457031, "learning_rate": 1.2195899503817715e-05, "loss": 2.329058074951172, "memory(GiB)": 77.56, "step": 90200, "token_acc": 0.5134099616858238, "train_speed(iter/s)": 1.437893 }, { "epoch": 3.8646587549805065, "grad_norm": 6.428568363189697, "learning_rate": 1.2191495365352596e-05, "loss": 2.379052925109863, "memory(GiB)": 77.56, "step": 90205, "token_acc": 0.4953271028037383, "train_speed(iter/s)": 1.437881 }, { "epoch": 3.8648729703097553, "grad_norm": 8.200699806213379, "learning_rate": 1.2187091911823989e-05, "loss": 2.381650924682617, "memory(GiB)": 77.56, "step": 90210, "token_acc": 0.545816733067729, "train_speed(iter/s)": 1.437905 }, { "epoch": 3.8650871856390046, "grad_norm": 6.794419765472412, "learning_rate": 1.21826891433117e-05, "loss": 2.5461925506591796, "memory(GiB)": 77.56, "step": 90215, "token_acc": 0.46017699115044247, "train_speed(iter/s)": 1.437878 }, { "epoch": 3.8653014009682534, "grad_norm": 7.485188961029053, "learning_rate": 1.2178287059895471e-05, "loss": 2.2980983734130858, "memory(GiB)": 77.56, "step": 90220, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.437883 }, { "epoch": 3.865515616297502, "grad_norm": 5.096858024597168, "learning_rate": 1.2173885661655038e-05, "loss": 2.5063623428344726, "memory(GiB)": 77.56, "step": 90225, "token_acc": 0.5015479876160991, "train_speed(iter/s)": 1.437875 }, { "epoch": 3.8657298316267514, "grad_norm": 5.671842575073242, "learning_rate": 1.2169484948670162e-05, "loss": 1.9846601486206055, "memory(GiB)": 77.56, "step": 90230, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.437882 }, { "epoch": 3.8659440469560002, "grad_norm": 5.549198627471924, "learning_rate": 1.2165084921020558e-05, "loss": 2.1314945220947266, "memory(GiB)": 77.56, "step": 90235, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.437878 }, { "epoch": 3.866158262285249, "grad_norm": 6.721835613250732, "learning_rate": 1.216068557878593e-05, "loss": 2.602103042602539, "memory(GiB)": 77.56, "step": 90240, "token_acc": 0.4551083591331269, "train_speed(iter/s)": 1.437886 }, { "epoch": 3.8663724776144983, "grad_norm": 4.766403675079346, "learning_rate": 1.215628692204598e-05, "loss": 2.385386085510254, "memory(GiB)": 77.56, "step": 90245, "token_acc": 0.4765625, "train_speed(iter/s)": 1.437912 }, { "epoch": 3.866586692943747, "grad_norm": 7.1068596839904785, "learning_rate": 1.2151888950880374e-05, "loss": 2.377451705932617, "memory(GiB)": 77.56, "step": 90250, "token_acc": 0.5252100840336135, "train_speed(iter/s)": 1.437918 }, { "epoch": 3.866800908272996, "grad_norm": 8.329933166503906, "learning_rate": 1.2147491665368826e-05, "loss": 2.2984407424926756, "memory(GiB)": 77.56, "step": 90255, "token_acc": 0.496, "train_speed(iter/s)": 1.437925 }, { "epoch": 3.867015123602245, "grad_norm": 8.43309211730957, "learning_rate": 1.214309506559097e-05, "loss": 2.626738166809082, "memory(GiB)": 77.56, "step": 90260, "token_acc": 0.47232472324723246, "train_speed(iter/s)": 1.437939 }, { "epoch": 3.867229338931494, "grad_norm": 7.081747531890869, "learning_rate": 1.213869915162646e-05, "loss": 2.3862384796142577, "memory(GiB)": 77.56, "step": 90265, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.437934 }, { "epoch": 3.867443554260743, "grad_norm": 10.188179016113281, "learning_rate": 1.2134303923554935e-05, "loss": 2.241412353515625, "memory(GiB)": 77.56, "step": 90270, "token_acc": 0.47333333333333333, "train_speed(iter/s)": 1.437944 }, { "epoch": 3.867657769589992, "grad_norm": 5.070733070373535, "learning_rate": 1.2129909381455996e-05, "loss": 2.3196861267089846, "memory(GiB)": 77.56, "step": 90275, "token_acc": 0.5448028673835126, "train_speed(iter/s)": 1.437943 }, { "epoch": 3.867871984919241, "grad_norm": 9.10721492767334, "learning_rate": 1.2125515525409293e-05, "loss": 2.4494951248168944, "memory(GiB)": 77.56, "step": 90280, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.437957 }, { "epoch": 3.8680862002484897, "grad_norm": 4.724442958831787, "learning_rate": 1.2121122355494385e-05, "loss": 2.010164260864258, "memory(GiB)": 77.56, "step": 90285, "token_acc": 0.5574324324324325, "train_speed(iter/s)": 1.437947 }, { "epoch": 3.868300415577739, "grad_norm": 6.833724021911621, "learning_rate": 1.21167298717909e-05, "loss": 2.4033851623535156, "memory(GiB)": 77.56, "step": 90290, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.437937 }, { "epoch": 3.8685146309069878, "grad_norm": 6.595108985900879, "learning_rate": 1.2112338074378381e-05, "loss": 2.6386344909667967, "memory(GiB)": 77.56, "step": 90295, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.437946 }, { "epoch": 3.8687288462362366, "grad_norm": 4.952311992645264, "learning_rate": 1.210794696333641e-05, "loss": 2.2181528091430662, "memory(GiB)": 77.56, "step": 90300, "token_acc": 0.5, "train_speed(iter/s)": 1.437955 }, { "epoch": 3.868943061565486, "grad_norm": 5.8525567054748535, "learning_rate": 1.210355653874452e-05, "loss": 2.2108102798461915, "memory(GiB)": 77.56, "step": 90305, "token_acc": 0.5225806451612903, "train_speed(iter/s)": 1.437944 }, { "epoch": 3.8691572768947347, "grad_norm": 6.393714427947998, "learning_rate": 1.2099166800682238e-05, "loss": 2.5033590316772463, "memory(GiB)": 77.56, "step": 90310, "token_acc": 0.4558303886925795, "train_speed(iter/s)": 1.437938 }, { "epoch": 3.8693714922239835, "grad_norm": 5.960321426391602, "learning_rate": 1.209477774922912e-05, "loss": 2.2494022369384767, "memory(GiB)": 77.56, "step": 90315, "token_acc": 0.545816733067729, "train_speed(iter/s)": 1.437941 }, { "epoch": 3.8695857075532327, "grad_norm": 7.149394989013672, "learning_rate": 1.2090389384464662e-05, "loss": 2.303862380981445, "memory(GiB)": 77.56, "step": 90320, "token_acc": 0.5406360424028268, "train_speed(iter/s)": 1.437944 }, { "epoch": 3.8697999228824815, "grad_norm": 7.303092956542969, "learning_rate": 1.2086001706468358e-05, "loss": 2.2746608734130858, "memory(GiB)": 77.56, "step": 90325, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.43796 }, { "epoch": 3.8700141382117303, "grad_norm": 5.146965980529785, "learning_rate": 1.2081614715319706e-05, "loss": 2.027683639526367, "memory(GiB)": 77.56, "step": 90330, "token_acc": 0.5528455284552846, "train_speed(iter/s)": 1.437951 }, { "epoch": 3.8702283535409796, "grad_norm": 6.2376203536987305, "learning_rate": 1.207722841109815e-05, "loss": 2.205574798583984, "memory(GiB)": 77.56, "step": 90335, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437945 }, { "epoch": 3.8704425688702284, "grad_norm": 6.945105075836182, "learning_rate": 1.2072842793883199e-05, "loss": 2.2663864135742187, "memory(GiB)": 77.56, "step": 90340, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.437945 }, { "epoch": 3.8706567841994772, "grad_norm": 7.281012058258057, "learning_rate": 1.2068457863754273e-05, "loss": 2.123605728149414, "memory(GiB)": 77.56, "step": 90345, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.437934 }, { "epoch": 3.8708709995287265, "grad_norm": 5.775146007537842, "learning_rate": 1.2064073620790823e-05, "loss": 2.2103017807006835, "memory(GiB)": 77.56, "step": 90350, "token_acc": 0.5179153094462541, "train_speed(iter/s)": 1.437912 }, { "epoch": 3.8710852148579753, "grad_norm": 5.414769172668457, "learning_rate": 1.205969006507226e-05, "loss": 2.1214412689208983, "memory(GiB)": 77.56, "step": 90355, "token_acc": 0.5110294117647058, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.871299430187224, "grad_norm": 6.277258396148682, "learning_rate": 1.205530719667799e-05, "loss": 2.454164505004883, "memory(GiB)": 77.56, "step": 90360, "token_acc": 0.4867021276595745, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.8715136455164734, "grad_norm": 5.787603378295898, "learning_rate": 1.2050925015687437e-05, "loss": 2.143338203430176, "memory(GiB)": 77.56, "step": 90365, "token_acc": 0.554006968641115, "train_speed(iter/s)": 1.437942 }, { "epoch": 3.871727860845722, "grad_norm": 4.927225112915039, "learning_rate": 1.2046543522179966e-05, "loss": 2.135004997253418, "memory(GiB)": 77.56, "step": 90370, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.437949 }, { "epoch": 3.871942076174971, "grad_norm": 5.743471622467041, "learning_rate": 1.2042162716234972e-05, "loss": 2.243510437011719, "memory(GiB)": 77.56, "step": 90375, "token_acc": 0.5340501792114696, "train_speed(iter/s)": 1.437956 }, { "epoch": 3.8721562915042202, "grad_norm": 6.920787811279297, "learning_rate": 1.2037782597931812e-05, "loss": 2.5593738555908203, "memory(GiB)": 77.56, "step": 90380, "token_acc": 0.47474747474747475, "train_speed(iter/s)": 1.437965 }, { "epoch": 3.872370506833469, "grad_norm": 5.611932754516602, "learning_rate": 1.2033403167349833e-05, "loss": 2.5393428802490234, "memory(GiB)": 77.56, "step": 90385, "token_acc": 0.45794392523364486, "train_speed(iter/s)": 1.437982 }, { "epoch": 3.872584722162718, "grad_norm": 7.454343318939209, "learning_rate": 1.2029024424568363e-05, "loss": 2.1756834030151366, "memory(GiB)": 77.56, "step": 90390, "token_acc": 0.5405405405405406, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.872798937491967, "grad_norm": 7.0108256340026855, "learning_rate": 1.2024646369666731e-05, "loss": 2.5703283309936524, "memory(GiB)": 77.56, "step": 90395, "token_acc": 0.516245487364621, "train_speed(iter/s)": 1.437959 }, { "epoch": 3.873013152821216, "grad_norm": 6.987051486968994, "learning_rate": 1.202026900272426e-05, "loss": 2.0040735244750976, "memory(GiB)": 77.56, "step": 90400, "token_acc": 0.5283842794759825, "train_speed(iter/s)": 1.437972 }, { "epoch": 3.8732273681504648, "grad_norm": 4.960381031036377, "learning_rate": 1.201589232382025e-05, "loss": 2.2686237335205077, "memory(GiB)": 77.56, "step": 90405, "token_acc": 0.5289855072463768, "train_speed(iter/s)": 1.43797 }, { "epoch": 3.873441583479714, "grad_norm": 6.1224684715271, "learning_rate": 1.2011516333033979e-05, "loss": 2.3726947784423826, "memory(GiB)": 77.56, "step": 90410, "token_acc": 0.47058823529411764, "train_speed(iter/s)": 1.43798 }, { "epoch": 3.873655798808963, "grad_norm": 7.679008960723877, "learning_rate": 1.2007141030444723e-05, "loss": 2.2848957061767576, "memory(GiB)": 77.56, "step": 90415, "token_acc": 0.5062240663900415, "train_speed(iter/s)": 1.437973 }, { "epoch": 3.8738700141382116, "grad_norm": 5.951594352722168, "learning_rate": 1.2002766416131739e-05, "loss": 2.4400360107421877, "memory(GiB)": 77.56, "step": 90420, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.437972 }, { "epoch": 3.874084229467461, "grad_norm": 6.499587059020996, "learning_rate": 1.1998392490174299e-05, "loss": 2.2784664154052736, "memory(GiB)": 77.56, "step": 90425, "token_acc": 0.5274261603375527, "train_speed(iter/s)": 1.43799 }, { "epoch": 3.8742984447967097, "grad_norm": 5.811992645263672, "learning_rate": 1.1994019252651611e-05, "loss": 2.1423919677734373, "memory(GiB)": 77.56, "step": 90430, "token_acc": 0.5662251655629139, "train_speed(iter/s)": 1.437981 }, { "epoch": 3.8745126601259585, "grad_norm": 5.947575092315674, "learning_rate": 1.1989646703642931e-05, "loss": 2.2671558380126955, "memory(GiB)": 77.56, "step": 90435, "token_acc": 0.5408805031446541, "train_speed(iter/s)": 1.437996 }, { "epoch": 3.8747268754552078, "grad_norm": 6.770403861999512, "learning_rate": 1.1985274843227456e-05, "loss": 2.4048460006713865, "memory(GiB)": 77.56, "step": 90440, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.438 }, { "epoch": 3.8749410907844566, "grad_norm": 6.454441547393799, "learning_rate": 1.1980903671484389e-05, "loss": 2.4436107635498048, "memory(GiB)": 77.56, "step": 90445, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.438003 }, { "epoch": 3.8751553061137054, "grad_norm": 7.0906548500061035, "learning_rate": 1.1976533188492922e-05, "loss": 2.0948753356933594, "memory(GiB)": 77.56, "step": 90450, "token_acc": 0.5934065934065934, "train_speed(iter/s)": 1.438021 }, { "epoch": 3.8753695214429547, "grad_norm": 5.974190711975098, "learning_rate": 1.1972163394332203e-05, "loss": 2.400987243652344, "memory(GiB)": 77.56, "step": 90455, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.438013 }, { "epoch": 3.8755837367722035, "grad_norm": 5.458359241485596, "learning_rate": 1.1967794289081436e-05, "loss": 2.177106475830078, "memory(GiB)": 77.56, "step": 90460, "token_acc": 0.5110410094637224, "train_speed(iter/s)": 1.438016 }, { "epoch": 3.8757979521014523, "grad_norm": 6.068882465362549, "learning_rate": 1.1963425872819755e-05, "loss": 2.225819206237793, "memory(GiB)": 77.56, "step": 90465, "token_acc": 0.4752475247524752, "train_speed(iter/s)": 1.43802 }, { "epoch": 3.8760121674307015, "grad_norm": 5.950623989105225, "learning_rate": 1.1959058145626289e-05, "loss": 2.334121322631836, "memory(GiB)": 77.56, "step": 90470, "token_acc": 0.5429553264604811, "train_speed(iter/s)": 1.438034 }, { "epoch": 3.8762263827599504, "grad_norm": 6.657140254974365, "learning_rate": 1.1954691107580174e-05, "loss": 2.392098808288574, "memory(GiB)": 77.56, "step": 90475, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438037 }, { "epoch": 3.876440598089199, "grad_norm": 5.313343048095703, "learning_rate": 1.1950324758760507e-05, "loss": 2.3941287994384766, "memory(GiB)": 77.56, "step": 90480, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.438028 }, { "epoch": 3.8766548134184484, "grad_norm": 7.793916702270508, "learning_rate": 1.1945959099246407e-05, "loss": 2.2018304824829102, "memory(GiB)": 77.56, "step": 90485, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.438024 }, { "epoch": 3.8768690287476972, "grad_norm": 7.034725666046143, "learning_rate": 1.1941594129116962e-05, "loss": 2.4903772354125975, "memory(GiB)": 77.56, "step": 90490, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 1.438023 }, { "epoch": 3.877083244076946, "grad_norm": 5.782181262969971, "learning_rate": 1.1937229848451237e-05, "loss": 2.2202796936035156, "memory(GiB)": 77.56, "step": 90495, "token_acc": 0.5233333333333333, "train_speed(iter/s)": 1.438035 }, { "epoch": 3.8772974594061953, "grad_norm": 4.895261764526367, "learning_rate": 1.1932866257328302e-05, "loss": 2.280509376525879, "memory(GiB)": 77.56, "step": 90500, "token_acc": 0.4983164983164983, "train_speed(iter/s)": 1.438029 }, { "epoch": 3.8772974594061953, "eval_loss": 2.1616945266723633, "eval_runtime": 14.2274, "eval_samples_per_second": 7.029, "eval_steps_per_second": 7.029, "eval_token_acc": 0.4872881355932203, "step": 90500 }, { "epoch": 3.877511674735444, "grad_norm": 6.078415393829346, "learning_rate": 1.1928503355827192e-05, "loss": 2.3752138137817385, "memory(GiB)": 77.56, "step": 90505, "token_acc": 0.49950641658440276, "train_speed(iter/s)": 1.437681 }, { "epoch": 3.877725890064693, "grad_norm": 8.73322582244873, "learning_rate": 1.1924141144026969e-05, "loss": 2.322062873840332, "memory(GiB)": 77.56, "step": 90510, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.437668 }, { "epoch": 3.877940105393942, "grad_norm": 6.695369243621826, "learning_rate": 1.1919779622006632e-05, "loss": 2.640370750427246, "memory(GiB)": 77.56, "step": 90515, "token_acc": 0.4626334519572954, "train_speed(iter/s)": 1.437669 }, { "epoch": 3.878154320723191, "grad_norm": 4.382381916046143, "learning_rate": 1.1915418789845229e-05, "loss": 2.3129093170166017, "memory(GiB)": 77.56, "step": 90520, "token_acc": 0.46706586826347307, "train_speed(iter/s)": 1.437676 }, { "epoch": 3.87836853605244, "grad_norm": 6.578730583190918, "learning_rate": 1.1911058647621737e-05, "loss": 2.3238664627075196, "memory(GiB)": 77.56, "step": 90525, "token_acc": 0.5413793103448276, "train_speed(iter/s)": 1.437672 }, { "epoch": 3.878582751381689, "grad_norm": 6.443264484405518, "learning_rate": 1.1906699195415144e-05, "loss": 1.986534309387207, "memory(GiB)": 77.56, "step": 90530, "token_acc": 0.5461254612546126, "train_speed(iter/s)": 1.437662 }, { "epoch": 3.878796966710938, "grad_norm": 5.691235542297363, "learning_rate": 1.1902340433304431e-05, "loss": 2.3310657501220704, "memory(GiB)": 77.56, "step": 90535, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.43767 }, { "epoch": 3.8790111820401867, "grad_norm": 6.651580333709717, "learning_rate": 1.1897982361368548e-05, "loss": 2.007834053039551, "memory(GiB)": 77.56, "step": 90540, "token_acc": 0.5470383275261324, "train_speed(iter/s)": 1.437666 }, { "epoch": 3.879225397369436, "grad_norm": 4.5281195640563965, "learning_rate": 1.1893624979686474e-05, "loss": 2.4487594604492187, "memory(GiB)": 77.56, "step": 90545, "token_acc": 0.4781420765027322, "train_speed(iter/s)": 1.437674 }, { "epoch": 3.8794396126986848, "grad_norm": 5.499750137329102, "learning_rate": 1.1889268288337124e-05, "loss": 2.4119199752807616, "memory(GiB)": 77.56, "step": 90550, "token_acc": 0.4783861671469741, "train_speed(iter/s)": 1.437678 }, { "epoch": 3.8796538280279336, "grad_norm": 5.003000736236572, "learning_rate": 1.1884912287399436e-05, "loss": 2.289682960510254, "memory(GiB)": 77.56, "step": 90555, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.437682 }, { "epoch": 3.879868043357183, "grad_norm": 4.916240215301514, "learning_rate": 1.1880556976952312e-05, "loss": 2.5092018127441404, "memory(GiB)": 77.56, "step": 90560, "token_acc": 0.5, "train_speed(iter/s)": 1.437691 }, { "epoch": 3.8800822586864316, "grad_norm": 5.766725063323975, "learning_rate": 1.1876202357074645e-05, "loss": 2.514948844909668, "memory(GiB)": 77.56, "step": 90565, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.437695 }, { "epoch": 3.8802964740156805, "grad_norm": 5.398946285247803, "learning_rate": 1.187184842784535e-05, "loss": 2.502707290649414, "memory(GiB)": 77.56, "step": 90570, "token_acc": 0.46449704142011833, "train_speed(iter/s)": 1.437676 }, { "epoch": 3.8805106893449297, "grad_norm": 6.373363494873047, "learning_rate": 1.1867495189343286e-05, "loss": 2.2389551162719727, "memory(GiB)": 77.56, "step": 90575, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.437691 }, { "epoch": 3.8807249046741785, "grad_norm": 6.4101338386535645, "learning_rate": 1.1863142641647307e-05, "loss": 2.285857009887695, "memory(GiB)": 77.56, "step": 90580, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.437711 }, { "epoch": 3.8809391200034273, "grad_norm": 4.727756500244141, "learning_rate": 1.1858790784836282e-05, "loss": 2.316763687133789, "memory(GiB)": 77.56, "step": 90585, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.437703 }, { "epoch": 3.8811533353326766, "grad_norm": 5.3445820808410645, "learning_rate": 1.185443961898905e-05, "loss": 2.4759702682495117, "memory(GiB)": 77.56, "step": 90590, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.437706 }, { "epoch": 3.8813675506619254, "grad_norm": 6.157519340515137, "learning_rate": 1.1850089144184423e-05, "loss": 2.279096984863281, "memory(GiB)": 77.56, "step": 90595, "token_acc": 0.47586206896551725, "train_speed(iter/s)": 1.437723 }, { "epoch": 3.881581765991174, "grad_norm": 6.4327874183654785, "learning_rate": 1.18457393605012e-05, "loss": 2.179935646057129, "memory(GiB)": 77.56, "step": 90600, "token_acc": 0.5080645161290323, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.8817959813204235, "grad_norm": 6.15175724029541, "learning_rate": 1.184139026801822e-05, "loss": 2.4754737854003905, "memory(GiB)": 77.56, "step": 90605, "token_acc": 0.48846153846153845, "train_speed(iter/s)": 1.43773 }, { "epoch": 3.8820101966496723, "grad_norm": 5.153484344482422, "learning_rate": 1.1837041866814252e-05, "loss": 2.2230756759643553, "memory(GiB)": 77.56, "step": 90610, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437726 }, { "epoch": 3.882224411978921, "grad_norm": 9.239290237426758, "learning_rate": 1.1832694156968066e-05, "loss": 2.6743162155151365, "memory(GiB)": 77.56, "step": 90615, "token_acc": 0.4978723404255319, "train_speed(iter/s)": 1.437729 }, { "epoch": 3.8824386273081704, "grad_norm": 6.394015789031982, "learning_rate": 1.1828347138558432e-05, "loss": 2.467215347290039, "memory(GiB)": 77.56, "step": 90620, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.437725 }, { "epoch": 3.882652842637419, "grad_norm": 6.586813449859619, "learning_rate": 1.1824000811664083e-05, "loss": 2.1649574279785155, "memory(GiB)": 77.56, "step": 90625, "token_acc": 0.4968944099378882, "train_speed(iter/s)": 1.437735 }, { "epoch": 3.882867057966668, "grad_norm": 5.818645000457764, "learning_rate": 1.1819655176363786e-05, "loss": 2.3574411392211916, "memory(GiB)": 77.56, "step": 90630, "token_acc": 0.5404411764705882, "train_speed(iter/s)": 1.437724 }, { "epoch": 3.8830812732959172, "grad_norm": 8.16355037689209, "learning_rate": 1.1815310232736249e-05, "loss": 2.251815605163574, "memory(GiB)": 77.56, "step": 90635, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.437727 }, { "epoch": 3.883295488625166, "grad_norm": 6.806107997894287, "learning_rate": 1.1810965980860189e-05, "loss": 2.5406349182128904, "memory(GiB)": 77.56, "step": 90640, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 1.437736 }, { "epoch": 3.883509703954415, "grad_norm": 8.11182975769043, "learning_rate": 1.1806622420814306e-05, "loss": 2.552924919128418, "memory(GiB)": 77.56, "step": 90645, "token_acc": 0.5103448275862069, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.883723919283664, "grad_norm": 10.275968551635742, "learning_rate": 1.1802279552677282e-05, "loss": 2.2463611602783202, "memory(GiB)": 77.56, "step": 90650, "token_acc": 0.5120274914089347, "train_speed(iter/s)": 1.437773 }, { "epoch": 3.883938134612913, "grad_norm": 6.112062454223633, "learning_rate": 1.1797937376527784e-05, "loss": 2.1707592010498047, "memory(GiB)": 77.56, "step": 90655, "token_acc": 0.5350553505535055, "train_speed(iter/s)": 1.437761 }, { "epoch": 3.8841523499421617, "grad_norm": 5.287235260009766, "learning_rate": 1.1793595892444492e-05, "loss": 2.559587860107422, "memory(GiB)": 77.56, "step": 90660, "token_acc": 0.49107142857142855, "train_speed(iter/s)": 1.437764 }, { "epoch": 3.884366565271411, "grad_norm": 6.164012908935547, "learning_rate": 1.1789255100506057e-05, "loss": 2.5242151260375976, "memory(GiB)": 77.56, "step": 90665, "token_acc": 0.49146757679180886, "train_speed(iter/s)": 1.437764 }, { "epoch": 3.88458078060066, "grad_norm": 4.857330799102783, "learning_rate": 1.1784915000791114e-05, "loss": 2.275299072265625, "memory(GiB)": 77.56, "step": 90670, "token_acc": 0.5425531914893617, "train_speed(iter/s)": 1.437778 }, { "epoch": 3.8847949959299086, "grad_norm": 6.191040992736816, "learning_rate": 1.1780575593378284e-05, "loss": 2.5952877044677733, "memory(GiB)": 77.56, "step": 90675, "token_acc": 0.46923076923076923, "train_speed(iter/s)": 1.437777 }, { "epoch": 3.885009211259158, "grad_norm": 4.9005303382873535, "learning_rate": 1.177623687834618e-05, "loss": 2.3590728759765627, "memory(GiB)": 77.56, "step": 90680, "token_acc": 0.4954954954954955, "train_speed(iter/s)": 1.437765 }, { "epoch": 3.8852234265884067, "grad_norm": 6.755948066711426, "learning_rate": 1.1771898855773388e-05, "loss": 2.129837226867676, "memory(GiB)": 77.56, "step": 90685, "token_acc": 0.5787671232876712, "train_speed(iter/s)": 1.437783 }, { "epoch": 3.8854376419176555, "grad_norm": 5.061296463012695, "learning_rate": 1.1767561525738525e-05, "loss": 1.975636863708496, "memory(GiB)": 77.56, "step": 90690, "token_acc": 0.5418326693227091, "train_speed(iter/s)": 1.437783 }, { "epoch": 3.8856518572469048, "grad_norm": 5.949632167816162, "learning_rate": 1.1763224888320145e-05, "loss": 2.3912097930908205, "memory(GiB)": 77.56, "step": 90695, "token_acc": 0.47388059701492535, "train_speed(iter/s)": 1.437783 }, { "epoch": 3.8858660725761536, "grad_norm": 5.824096202850342, "learning_rate": 1.1758888943596818e-05, "loss": 2.5480459213256834, "memory(GiB)": 77.56, "step": 90700, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.437781 }, { "epoch": 3.8860802879054024, "grad_norm": 6.116512775421143, "learning_rate": 1.1754553691647092e-05, "loss": 2.3607730865478516, "memory(GiB)": 77.56, "step": 90705, "token_acc": 0.5434083601286174, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.8862945032346516, "grad_norm": 6.583705902099609, "learning_rate": 1.1750219132549489e-05, "loss": 2.4126081466674805, "memory(GiB)": 77.56, "step": 90710, "token_acc": 0.44482758620689655, "train_speed(iter/s)": 1.437786 }, { "epoch": 3.8865087185639005, "grad_norm": 5.833676338195801, "learning_rate": 1.1745885266382561e-05, "loss": 2.072280693054199, "memory(GiB)": 77.56, "step": 90715, "token_acc": 0.5229681978798587, "train_speed(iter/s)": 1.437781 }, { "epoch": 3.8867229338931493, "grad_norm": 5.618956089019775, "learning_rate": 1.1741552093224805e-05, "loss": 2.498998260498047, "memory(GiB)": 77.56, "step": 90720, "token_acc": 0.45017182130584193, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.8869371492223985, "grad_norm": 5.718469619750977, "learning_rate": 1.1737219613154727e-05, "loss": 2.206536293029785, "memory(GiB)": 77.56, "step": 90725, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437787 }, { "epoch": 3.8871513645516473, "grad_norm": 5.852831840515137, "learning_rate": 1.1732887826250787e-05, "loss": 2.202661895751953, "memory(GiB)": 77.56, "step": 90730, "token_acc": 0.5168539325842697, "train_speed(iter/s)": 1.437808 }, { "epoch": 3.887365579880896, "grad_norm": 7.852920055389404, "learning_rate": 1.1728556732591501e-05, "loss": 2.517822265625, "memory(GiB)": 77.56, "step": 90735, "token_acc": 0.528957528957529, "train_speed(iter/s)": 1.43781 }, { "epoch": 3.8875797952101454, "grad_norm": 6.729727268218994, "learning_rate": 1.172422633225531e-05, "loss": 2.0631500244140626, "memory(GiB)": 77.56, "step": 90740, "token_acc": 0.5487804878048781, "train_speed(iter/s)": 1.437827 }, { "epoch": 3.887794010539394, "grad_norm": 5.046313285827637, "learning_rate": 1.1719896625320654e-05, "loss": 2.3603225708007813, "memory(GiB)": 77.56, "step": 90745, "token_acc": 0.5032258064516129, "train_speed(iter/s)": 1.437836 }, { "epoch": 3.888008225868643, "grad_norm": 6.943776607513428, "learning_rate": 1.1715567611865991e-05, "loss": 2.4751752853393554, "memory(GiB)": 77.56, "step": 90750, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.43783 }, { "epoch": 3.8882224411978923, "grad_norm": 5.720273494720459, "learning_rate": 1.1711239291969733e-05, "loss": 2.7610435485839844, "memory(GiB)": 77.56, "step": 90755, "token_acc": 0.4967532467532468, "train_speed(iter/s)": 1.437816 }, { "epoch": 3.888436656527141, "grad_norm": 5.783408164978027, "learning_rate": 1.1706911665710296e-05, "loss": 2.680044937133789, "memory(GiB)": 77.56, "step": 90760, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.88865087185639, "grad_norm": 6.125732898712158, "learning_rate": 1.1702584733166073e-05, "loss": 2.442182922363281, "memory(GiB)": 77.56, "step": 90765, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.888865087185639, "grad_norm": 6.588894367218018, "learning_rate": 1.1698258494415443e-05, "loss": 2.413641357421875, "memory(GiB)": 77.56, "step": 90770, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.889079302514888, "grad_norm": 7.6249895095825195, "learning_rate": 1.1693932949536801e-05, "loss": 2.6100582122802733, "memory(GiB)": 77.56, "step": 90775, "token_acc": 0.43956043956043955, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.889293517844137, "grad_norm": 5.18500280380249, "learning_rate": 1.1689608098608495e-05, "loss": 2.8090087890625, "memory(GiB)": 77.56, "step": 90780, "token_acc": 0.4809688581314879, "train_speed(iter/s)": 1.43786 }, { "epoch": 3.889507733173386, "grad_norm": 8.355817794799805, "learning_rate": 1.168528394170888e-05, "loss": 2.4034042358398438, "memory(GiB)": 77.56, "step": 90785, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.437845 }, { "epoch": 3.889721948502635, "grad_norm": 5.919632434844971, "learning_rate": 1.1680960478916292e-05, "loss": 2.3702533721923826, "memory(GiB)": 77.56, "step": 90790, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.8899361638318837, "grad_norm": 7.269247531890869, "learning_rate": 1.1676637710309047e-05, "loss": 2.7270795822143556, "memory(GiB)": 77.56, "step": 90795, "token_acc": 0.4540229885057471, "train_speed(iter/s)": 1.437856 }, { "epoch": 3.890150379161133, "grad_norm": 6.30020809173584, "learning_rate": 1.1672315635965447e-05, "loss": 2.520612335205078, "memory(GiB)": 77.56, "step": 90800, "token_acc": 0.47840531561461797, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.8903645944903817, "grad_norm": 7.151754379272461, "learning_rate": 1.1667994255963805e-05, "loss": 2.196531867980957, "memory(GiB)": 77.56, "step": 90805, "token_acc": 0.5354609929078015, "train_speed(iter/s)": 1.43783 }, { "epoch": 3.8905788098196306, "grad_norm": 5.397918701171875, "learning_rate": 1.1663673570382416e-05, "loss": 2.2160144805908204, "memory(GiB)": 77.56, "step": 90810, "token_acc": 0.5105105105105106, "train_speed(iter/s)": 1.437843 }, { "epoch": 3.89079302514888, "grad_norm": 6.126784801483154, "learning_rate": 1.1659353579299543e-05, "loss": 2.443135452270508, "memory(GiB)": 77.56, "step": 90815, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 1.437841 }, { "epoch": 3.8910072404781286, "grad_norm": 7.230842590332031, "learning_rate": 1.1655034282793448e-05, "loss": 2.0986690521240234, "memory(GiB)": 77.56, "step": 90820, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.8912214558073774, "grad_norm": 5.642878532409668, "learning_rate": 1.1650715680942381e-05, "loss": 2.4565019607543945, "memory(GiB)": 77.56, "step": 90825, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.437851 }, { "epoch": 3.8914356711366267, "grad_norm": 5.745751857757568, "learning_rate": 1.1646397773824553e-05, "loss": 2.2658226013183596, "memory(GiB)": 77.56, "step": 90830, "token_acc": 0.5300751879699248, "train_speed(iter/s)": 1.437851 }, { "epoch": 3.8916498864658755, "grad_norm": 5.3980512619018555, "learning_rate": 1.1642080561518226e-05, "loss": 2.2125831604003907, "memory(GiB)": 77.56, "step": 90835, "token_acc": 0.5086705202312138, "train_speed(iter/s)": 1.437857 }, { "epoch": 3.8918641017951243, "grad_norm": 6.2098517417907715, "learning_rate": 1.163776404410159e-05, "loss": 2.423111152648926, "memory(GiB)": 77.56, "step": 90840, "token_acc": 0.4944649446494465, "train_speed(iter/s)": 1.437852 }, { "epoch": 3.8920783171243736, "grad_norm": 6.400033473968506, "learning_rate": 1.1633448221652848e-05, "loss": 1.8792903900146485, "memory(GiB)": 77.56, "step": 90845, "token_acc": 0.591743119266055, "train_speed(iter/s)": 1.437853 }, { "epoch": 3.8922925324536224, "grad_norm": 6.435670852661133, "learning_rate": 1.1629133094250183e-05, "loss": 2.0793975830078124, "memory(GiB)": 77.56, "step": 90850, "token_acc": 0.5631399317406144, "train_speed(iter/s)": 1.437867 }, { "epoch": 3.892506747782871, "grad_norm": 5.783316135406494, "learning_rate": 1.1624818661971747e-05, "loss": 2.5071504592895506, "memory(GiB)": 77.56, "step": 90855, "token_acc": 0.5216049382716049, "train_speed(iter/s)": 1.437878 }, { "epoch": 3.8927209631121205, "grad_norm": 5.509244918823242, "learning_rate": 1.1620504924895737e-05, "loss": 2.2704851150512697, "memory(GiB)": 77.56, "step": 90860, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.8929351784413693, "grad_norm": 5.796581745147705, "learning_rate": 1.1616191883100285e-05, "loss": 2.470877265930176, "memory(GiB)": 77.56, "step": 90865, "token_acc": 0.46355685131195334, "train_speed(iter/s)": 1.437888 }, { "epoch": 3.893149393770618, "grad_norm": 6.25365686416626, "learning_rate": 1.1611879536663523e-05, "loss": 2.378495979309082, "memory(GiB)": 77.56, "step": 90870, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.437894 }, { "epoch": 3.8933636090998673, "grad_norm": 6.276653289794922, "learning_rate": 1.1607567885663555e-05, "loss": 2.371576690673828, "memory(GiB)": 77.56, "step": 90875, "token_acc": 0.5, "train_speed(iter/s)": 1.437901 }, { "epoch": 3.893577824429116, "grad_norm": 5.147952556610107, "learning_rate": 1.160325693017853e-05, "loss": 2.3934024810791015, "memory(GiB)": 77.56, "step": 90880, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.437919 }, { "epoch": 3.893792039758365, "grad_norm": 6.9699578285217285, "learning_rate": 1.1598946670286525e-05, "loss": 2.1941316604614256, "memory(GiB)": 77.56, "step": 90885, "token_acc": 0.54421768707483, "train_speed(iter/s)": 1.437915 }, { "epoch": 3.894006255087614, "grad_norm": 4.8469390869140625, "learning_rate": 1.1594637106065608e-05, "loss": 2.3624467849731445, "memory(GiB)": 77.56, "step": 90890, "token_acc": 0.518796992481203, "train_speed(iter/s)": 1.437927 }, { "epoch": 3.894220470416863, "grad_norm": 5.040021896362305, "learning_rate": 1.1590328237593878e-05, "loss": 2.3942983627319334, "memory(GiB)": 77.56, "step": 90895, "token_acc": 0.5, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.894434685746112, "grad_norm": 6.077275276184082, "learning_rate": 1.1586020064949387e-05, "loss": 2.603710174560547, "memory(GiB)": 77.56, "step": 90900, "token_acc": 0.47878787878787876, "train_speed(iter/s)": 1.437942 }, { "epoch": 3.894648901075361, "grad_norm": 7.9805498123168945, "learning_rate": 1.1581712588210181e-05, "loss": 2.2039958953857424, "memory(GiB)": 77.56, "step": 90905, "token_acc": 0.5404411764705882, "train_speed(iter/s)": 1.437964 }, { "epoch": 3.89486311640461, "grad_norm": 5.799496650695801, "learning_rate": 1.1577405807454283e-05, "loss": 2.0657562255859374, "memory(GiB)": 77.56, "step": 90910, "token_acc": 0.5458015267175572, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.8950773317338587, "grad_norm": 6.915517330169678, "learning_rate": 1.1573099722759712e-05, "loss": 2.485106658935547, "memory(GiB)": 77.56, "step": 90915, "token_acc": 0.49469964664310956, "train_speed(iter/s)": 1.437979 }, { "epoch": 3.895291547063108, "grad_norm": 7.066059589385986, "learning_rate": 1.1568794334204502e-05, "loss": 2.0547019958496096, "memory(GiB)": 77.56, "step": 90920, "token_acc": 0.5720930232558139, "train_speed(iter/s)": 1.437993 }, { "epoch": 3.895505762392357, "grad_norm": 5.3980393409729, "learning_rate": 1.1564489641866633e-05, "loss": 2.7890256881713866, "memory(GiB)": 77.56, "step": 90925, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.438009 }, { "epoch": 3.8957199777216056, "grad_norm": 5.78582239151001, "learning_rate": 1.156018564582409e-05, "loss": 2.256192207336426, "memory(GiB)": 77.56, "step": 90930, "token_acc": 0.515625, "train_speed(iter/s)": 1.438014 }, { "epoch": 3.895934193050855, "grad_norm": 6.013241767883301, "learning_rate": 1.1555882346154845e-05, "loss": 2.1658794403076174, "memory(GiB)": 77.56, "step": 90935, "token_acc": 0.5761316872427984, "train_speed(iter/s)": 1.438035 }, { "epoch": 3.8961484083801037, "grad_norm": 6.3571906089782715, "learning_rate": 1.1551579742936852e-05, "loss": 2.250625228881836, "memory(GiB)": 77.56, "step": 90940, "token_acc": 0.5018867924528302, "train_speed(iter/s)": 1.438043 }, { "epoch": 3.8963626237093525, "grad_norm": 4.944669723510742, "learning_rate": 1.1547277836248043e-05, "loss": 2.237442398071289, "memory(GiB)": 77.56, "step": 90945, "token_acc": 0.5398550724637681, "train_speed(iter/s)": 1.438053 }, { "epoch": 3.8965768390386017, "grad_norm": 8.360963821411133, "learning_rate": 1.1542976626166374e-05, "loss": 2.467558670043945, "memory(GiB)": 77.56, "step": 90950, "token_acc": 0.5190839694656488, "train_speed(iter/s)": 1.438058 }, { "epoch": 3.8967910543678506, "grad_norm": 6.620720863342285, "learning_rate": 1.153867611276977e-05, "loss": 2.363943099975586, "memory(GiB)": 77.56, "step": 90955, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 1.43806 }, { "epoch": 3.8970052696970994, "grad_norm": 6.948920726776123, "learning_rate": 1.1534376296136124e-05, "loss": 2.6090906143188475, "memory(GiB)": 77.56, "step": 90960, "token_acc": 0.48616600790513836, "train_speed(iter/s)": 1.438068 }, { "epoch": 3.8972194850263486, "grad_norm": 6.90181303024292, "learning_rate": 1.153007717634334e-05, "loss": 2.002796745300293, "memory(GiB)": 77.56, "step": 90965, "token_acc": 0.5675675675675675, "train_speed(iter/s)": 1.438066 }, { "epoch": 3.8974337003555974, "grad_norm": 8.537093162536621, "learning_rate": 1.1525778753469297e-05, "loss": 2.2156164169311525, "memory(GiB)": 77.56, "step": 90970, "token_acc": 0.5229681978798587, "train_speed(iter/s)": 1.438029 }, { "epoch": 3.8976479156848463, "grad_norm": 5.488020420074463, "learning_rate": 1.1521481027591847e-05, "loss": 2.207258605957031, "memory(GiB)": 77.56, "step": 90975, "token_acc": 0.5037037037037037, "train_speed(iter/s)": 1.438042 }, { "epoch": 3.8978621310140955, "grad_norm": 5.44802188873291, "learning_rate": 1.1517183998788877e-05, "loss": 2.4842533111572265, "memory(GiB)": 77.56, "step": 90980, "token_acc": 0.5, "train_speed(iter/s)": 1.438045 }, { "epoch": 3.8980763463433443, "grad_norm": 7.082776069641113, "learning_rate": 1.1512887667138217e-05, "loss": 2.6597930908203127, "memory(GiB)": 77.56, "step": 90985, "token_acc": 0.4798657718120805, "train_speed(iter/s)": 1.438049 }, { "epoch": 3.898290561672593, "grad_norm": 6.406974792480469, "learning_rate": 1.1508592032717701e-05, "loss": 2.706146240234375, "memory(GiB)": 77.56, "step": 90990, "token_acc": 0.4409937888198758, "train_speed(iter/s)": 1.438055 }, { "epoch": 3.8985047770018424, "grad_norm": 5.782268047332764, "learning_rate": 1.1504297095605154e-05, "loss": 2.374405288696289, "memory(GiB)": 77.56, "step": 90995, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 1.438071 }, { "epoch": 3.898718992331091, "grad_norm": 8.141237258911133, "learning_rate": 1.1500002855878362e-05, "loss": 2.6752086639404298, "memory(GiB)": 77.56, "step": 91000, "token_acc": 0.5, "train_speed(iter/s)": 1.438069 }, { "epoch": 3.898718992331091, "eval_loss": 2.138394594192505, "eval_runtime": 14.4932, "eval_samples_per_second": 6.9, "eval_steps_per_second": 6.9, "eval_token_acc": 0.46196403872752423, "step": 91000 }, { "epoch": 3.89893320766034, "grad_norm": 4.859127044677734, "learning_rate": 1.1495709313615143e-05, "loss": 2.604720687866211, "memory(GiB)": 77.56, "step": 91005, "token_acc": 0.46509341199606685, "train_speed(iter/s)": 1.437715 }, { "epoch": 3.8991474229895893, "grad_norm": 6.947793006896973, "learning_rate": 1.1491416468893274e-05, "loss": 2.5877716064453127, "memory(GiB)": 77.56, "step": 91010, "token_acc": 0.471875, "train_speed(iter/s)": 1.43773 }, { "epoch": 3.899361638318838, "grad_norm": 7.862221717834473, "learning_rate": 1.1487124321790515e-05, "loss": 2.2432907104492186, "memory(GiB)": 77.56, "step": 91015, "token_acc": 0.5390070921985816, "train_speed(iter/s)": 1.437743 }, { "epoch": 3.899575853648087, "grad_norm": 7.062439918518066, "learning_rate": 1.148283287238463e-05, "loss": 2.412496566772461, "memory(GiB)": 77.56, "step": 91020, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.437757 }, { "epoch": 3.899790068977336, "grad_norm": 6.912678241729736, "learning_rate": 1.1478542120753344e-05, "loss": 2.351667785644531, "memory(GiB)": 77.56, "step": 91025, "token_acc": 0.49814126394052044, "train_speed(iter/s)": 1.437781 }, { "epoch": 3.900004284306585, "grad_norm": 5.951408863067627, "learning_rate": 1.1474252066974422e-05, "loss": 2.1137380599975586, "memory(GiB)": 77.56, "step": 91030, "token_acc": 0.558303886925795, "train_speed(iter/s)": 1.437777 }, { "epoch": 3.900218499635834, "grad_norm": 7.651566505432129, "learning_rate": 1.1469962711125548e-05, "loss": 2.350306510925293, "memory(GiB)": 77.56, "step": 91035, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 1.437795 }, { "epoch": 3.900432714965083, "grad_norm": 6.07174015045166, "learning_rate": 1.1465674053284452e-05, "loss": 2.4441177368164064, "memory(GiB)": 77.56, "step": 91040, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 1.437812 }, { "epoch": 3.900646930294332, "grad_norm": 8.041930198669434, "learning_rate": 1.1461386093528826e-05, "loss": 2.739452362060547, "memory(GiB)": 77.56, "step": 91045, "token_acc": 0.4337748344370861, "train_speed(iter/s)": 1.437798 }, { "epoch": 3.9008611456235807, "grad_norm": 6.146678447723389, "learning_rate": 1.1457098831936342e-05, "loss": 2.389677810668945, "memory(GiB)": 77.56, "step": 91050, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.437819 }, { "epoch": 3.90107536095283, "grad_norm": 5.0948686599731445, "learning_rate": 1.1452812268584667e-05, "loss": 2.227981758117676, "memory(GiB)": 77.56, "step": 91055, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.437832 }, { "epoch": 3.9012895762820787, "grad_norm": 5.616827011108398, "learning_rate": 1.1448526403551441e-05, "loss": 2.4123701095581054, "memory(GiB)": 77.56, "step": 91060, "token_acc": 0.5112540192926045, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.9015037916113275, "grad_norm": 5.040546894073486, "learning_rate": 1.1444241236914343e-05, "loss": 2.4961627960205077, "memory(GiB)": 77.56, "step": 91065, "token_acc": 0.4969512195121951, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.901718006940577, "grad_norm": 5.902012825012207, "learning_rate": 1.143995676875098e-05, "loss": 2.435586357116699, "memory(GiB)": 77.56, "step": 91070, "token_acc": 0.5278810408921933, "train_speed(iter/s)": 1.437864 }, { "epoch": 3.9019322222698256, "grad_norm": 5.462906360626221, "learning_rate": 1.1435672999138975e-05, "loss": 2.4297624588012696, "memory(GiB)": 77.56, "step": 91075, "token_acc": 0.5132450331125827, "train_speed(iter/s)": 1.437892 }, { "epoch": 3.9021464375990744, "grad_norm": 5.44817590713501, "learning_rate": 1.1431389928155922e-05, "loss": 2.2389373779296875, "memory(GiB)": 77.56, "step": 91080, "token_acc": 0.5409836065573771, "train_speed(iter/s)": 1.437907 }, { "epoch": 3.9023606529283237, "grad_norm": 6.258960723876953, "learning_rate": 1.1427107555879412e-05, "loss": 2.303700637817383, "memory(GiB)": 77.56, "step": 91085, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.437919 }, { "epoch": 3.9025748682575725, "grad_norm": 4.524411201477051, "learning_rate": 1.142282588238705e-05, "loss": 2.0361377716064455, "memory(GiB)": 77.56, "step": 91090, "token_acc": 0.5348837209302325, "train_speed(iter/s)": 1.437911 }, { "epoch": 3.9027890835868213, "grad_norm": 5.329421043395996, "learning_rate": 1.1418544907756368e-05, "loss": 2.173225975036621, "memory(GiB)": 77.56, "step": 91095, "token_acc": 0.5286195286195287, "train_speed(iter/s)": 1.437907 }, { "epoch": 3.9030032989160706, "grad_norm": 6.68475866317749, "learning_rate": 1.1414264632064952e-05, "loss": 2.1077871322631836, "memory(GiB)": 77.56, "step": 91100, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.437909 }, { "epoch": 3.9032175142453194, "grad_norm": 5.613337993621826, "learning_rate": 1.1409985055390332e-05, "loss": 2.537759208679199, "memory(GiB)": 77.56, "step": 91105, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.437924 }, { "epoch": 3.903431729574568, "grad_norm": 5.314393043518066, "learning_rate": 1.1405706177810027e-05, "loss": 2.444041633605957, "memory(GiB)": 77.56, "step": 91110, "token_acc": 0.49146757679180886, "train_speed(iter/s)": 1.437931 }, { "epoch": 3.9036459449038174, "grad_norm": 6.4607744216918945, "learning_rate": 1.1401427999401565e-05, "loss": 2.474997901916504, "memory(GiB)": 77.56, "step": 91115, "token_acc": 0.45325779036827196, "train_speed(iter/s)": 1.437938 }, { "epoch": 3.9038601602330663, "grad_norm": 5.1067585945129395, "learning_rate": 1.1397150520242422e-05, "loss": 2.2475372314453126, "memory(GiB)": 77.56, "step": 91120, "token_acc": 0.5034722222222222, "train_speed(iter/s)": 1.437957 }, { "epoch": 3.904074375562315, "grad_norm": 6.846627712249756, "learning_rate": 1.1392873740410132e-05, "loss": 2.4637718200683594, "memory(GiB)": 77.56, "step": 91125, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.43797 }, { "epoch": 3.9042885908915643, "grad_norm": 4.901951789855957, "learning_rate": 1.1388597659982148e-05, "loss": 2.3763832092285155, "memory(GiB)": 77.56, "step": 91130, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.437969 }, { "epoch": 3.904502806220813, "grad_norm": 5.6291728019714355, "learning_rate": 1.138432227903593e-05, "loss": 2.178856086730957, "memory(GiB)": 77.56, "step": 91135, "token_acc": 0.5525291828793775, "train_speed(iter/s)": 1.437956 }, { "epoch": 3.904717021550062, "grad_norm": 5.030359745025635, "learning_rate": 1.1380047597648946e-05, "loss": 1.9573993682861328, "memory(GiB)": 77.56, "step": 91140, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 1.437968 }, { "epoch": 3.904931236879311, "grad_norm": 6.37743616104126, "learning_rate": 1.1375773615898617e-05, "loss": 2.0662460327148438, "memory(GiB)": 77.56, "step": 91145, "token_acc": 0.5060240963855421, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.90514545220856, "grad_norm": 6.363149166107178, "learning_rate": 1.137150033386239e-05, "loss": 2.193059539794922, "memory(GiB)": 77.56, "step": 91150, "token_acc": 0.510548523206751, "train_speed(iter/s)": 1.438011 }, { "epoch": 3.905359667537809, "grad_norm": 4.6900434494018555, "learning_rate": 1.1367227751617671e-05, "loss": 2.1674001693725584, "memory(GiB)": 77.56, "step": 91155, "token_acc": 0.546031746031746, "train_speed(iter/s)": 1.438022 }, { "epoch": 3.905573882867058, "grad_norm": 9.145657539367676, "learning_rate": 1.1362955869241865e-05, "loss": 2.56341609954834, "memory(GiB)": 77.56, "step": 91160, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.438028 }, { "epoch": 3.905788098196307, "grad_norm": 5.5173845291137695, "learning_rate": 1.1358684686812354e-05, "loss": 2.37689208984375, "memory(GiB)": 77.56, "step": 91165, "token_acc": 0.504, "train_speed(iter/s)": 1.438024 }, { "epoch": 3.9060023135255557, "grad_norm": 6.67338752746582, "learning_rate": 1.1354414204406505e-05, "loss": 2.1870487213134764, "memory(GiB)": 77.56, "step": 91170, "token_acc": 0.5338645418326693, "train_speed(iter/s)": 1.437988 }, { "epoch": 3.906216528854805, "grad_norm": 5.9065656661987305, "learning_rate": 1.135014442210171e-05, "loss": 2.274699020385742, "memory(GiB)": 77.56, "step": 91175, "token_acc": 0.5146579804560261, "train_speed(iter/s)": 1.438002 }, { "epoch": 3.906430744184054, "grad_norm": 7.112194061279297, "learning_rate": 1.1345875339975292e-05, "loss": 2.0654006958007813, "memory(GiB)": 77.56, "step": 91180, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.437994 }, { "epoch": 3.9066449595133026, "grad_norm": 9.399968147277832, "learning_rate": 1.1341606958104616e-05, "loss": 2.1300832748413088, "memory(GiB)": 77.56, "step": 91185, "token_acc": 0.5107142857142857, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.906859174842552, "grad_norm": 5.284914970397949, "learning_rate": 1.1337339276566993e-05, "loss": 2.343291473388672, "memory(GiB)": 77.56, "step": 91190, "token_acc": 0.5097402597402597, "train_speed(iter/s)": 1.437992 }, { "epoch": 3.9070733901718007, "grad_norm": 5.782185077667236, "learning_rate": 1.1333072295439739e-05, "loss": 2.6387989044189455, "memory(GiB)": 77.56, "step": 91195, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.438 }, { "epoch": 3.9072876055010495, "grad_norm": 6.247674942016602, "learning_rate": 1.1328806014800158e-05, "loss": 2.450332260131836, "memory(GiB)": 77.56, "step": 91200, "token_acc": 0.5131578947368421, "train_speed(iter/s)": 1.437979 }, { "epoch": 3.9075018208302987, "grad_norm": 5.871550559997559, "learning_rate": 1.132454043472551e-05, "loss": 2.271565818786621, "memory(GiB)": 77.56, "step": 91205, "token_acc": 0.476027397260274, "train_speed(iter/s)": 1.437963 }, { "epoch": 3.9077160361595475, "grad_norm": 4.952987194061279, "learning_rate": 1.1320275555293113e-05, "loss": 2.229094314575195, "memory(GiB)": 77.56, "step": 91210, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.437967 }, { "epoch": 3.9079302514887964, "grad_norm": 6.041511535644531, "learning_rate": 1.1316011376580204e-05, "loss": 2.387455177307129, "memory(GiB)": 77.56, "step": 91215, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.437958 }, { "epoch": 3.9081444668180456, "grad_norm": 5.558559417724609, "learning_rate": 1.1311747898664038e-05, "loss": 2.146357536315918, "memory(GiB)": 77.56, "step": 91220, "token_acc": 0.503125, "train_speed(iter/s)": 1.43797 }, { "epoch": 3.9083586821472944, "grad_norm": 6.989698886871338, "learning_rate": 1.1307485121621858e-05, "loss": 2.5166481018066404, "memory(GiB)": 77.56, "step": 91225, "token_acc": 0.47653429602888087, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.9085728974765432, "grad_norm": 5.662220001220703, "learning_rate": 1.1303223045530859e-05, "loss": 2.1686641693115236, "memory(GiB)": 77.56, "step": 91230, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.437977 }, { "epoch": 3.9087871128057925, "grad_norm": 5.485433101654053, "learning_rate": 1.1298961670468294e-05, "loss": 2.0881086349487306, "memory(GiB)": 77.56, "step": 91235, "token_acc": 0.5036231884057971, "train_speed(iter/s)": 1.437977 }, { "epoch": 3.9090013281350413, "grad_norm": 5.765471935272217, "learning_rate": 1.1294700996511342e-05, "loss": 2.393413543701172, "memory(GiB)": 77.56, "step": 91240, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.90921554346429, "grad_norm": 7.634309768676758, "learning_rate": 1.1290441023737175e-05, "loss": 2.218937301635742, "memory(GiB)": 77.56, "step": 91245, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 1.437986 }, { "epoch": 3.9094297587935394, "grad_norm": 5.171670436859131, "learning_rate": 1.1286181752222996e-05, "loss": 2.226266860961914, "memory(GiB)": 77.56, "step": 91250, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.437976 }, { "epoch": 3.909643974122788, "grad_norm": 4.843559265136719, "learning_rate": 1.128192318204595e-05, "loss": 2.491107177734375, "memory(GiB)": 77.56, "step": 91255, "token_acc": 0.459214501510574, "train_speed(iter/s)": 1.437973 }, { "epoch": 3.909858189452037, "grad_norm": 7.1399149894714355, "learning_rate": 1.1277665313283187e-05, "loss": 2.610714149475098, "memory(GiB)": 77.56, "step": 91260, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437968 }, { "epoch": 3.9100724047812863, "grad_norm": 6.221587657928467, "learning_rate": 1.1273408146011827e-05, "loss": 2.4690521240234373, "memory(GiB)": 77.56, "step": 91265, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.437987 }, { "epoch": 3.910286620110535, "grad_norm": 6.32486629486084, "learning_rate": 1.1269151680309021e-05, "loss": 2.415798568725586, "memory(GiB)": 77.56, "step": 91270, "token_acc": 0.46788990825688076, "train_speed(iter/s)": 1.438012 }, { "epoch": 3.910500835439784, "grad_norm": 7.332726001739502, "learning_rate": 1.126489591625186e-05, "loss": 2.42496337890625, "memory(GiB)": 77.56, "step": 91275, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.438012 }, { "epoch": 3.910715050769033, "grad_norm": 5.7802205085754395, "learning_rate": 1.1260640853917453e-05, "loss": 2.511993980407715, "memory(GiB)": 77.56, "step": 91280, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 1.438017 }, { "epoch": 3.910929266098282, "grad_norm": 5.3685832023620605, "learning_rate": 1.1256386493382876e-05, "loss": 2.085711860656738, "memory(GiB)": 77.56, "step": 91285, "token_acc": 0.5077519379844961, "train_speed(iter/s)": 1.438015 }, { "epoch": 3.9111434814275308, "grad_norm": 5.690767765045166, "learning_rate": 1.1252132834725187e-05, "loss": 2.3168817520141602, "memory(GiB)": 77.56, "step": 91290, "token_acc": 0.5285714285714286, "train_speed(iter/s)": 1.438024 }, { "epoch": 3.91135769675678, "grad_norm": 6.205465316772461, "learning_rate": 1.1247879878021472e-05, "loss": 2.419499969482422, "memory(GiB)": 77.56, "step": 91295, "token_acc": 0.5162337662337663, "train_speed(iter/s)": 1.438033 }, { "epoch": 3.911571912086029, "grad_norm": 5.981412410736084, "learning_rate": 1.1243627623348769e-05, "loss": 2.463705635070801, "memory(GiB)": 77.56, "step": 91300, "token_acc": 0.5028409090909091, "train_speed(iter/s)": 1.438033 }, { "epoch": 3.9117861274152776, "grad_norm": 6.188590049743652, "learning_rate": 1.1239376070784108e-05, "loss": 2.2529827117919923, "memory(GiB)": 77.56, "step": 91305, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.43803 }, { "epoch": 3.912000342744527, "grad_norm": 5.830347537994385, "learning_rate": 1.1235125220404507e-05, "loss": 2.4111351013183593, "memory(GiB)": 77.56, "step": 91310, "token_acc": 0.5080645161290323, "train_speed(iter/s)": 1.438043 }, { "epoch": 3.9122145580737757, "grad_norm": 7.293909549713135, "learning_rate": 1.1230875072286979e-05, "loss": 2.2350406646728516, "memory(GiB)": 77.56, "step": 91315, "token_acc": 0.5, "train_speed(iter/s)": 1.438045 }, { "epoch": 3.9124287734030245, "grad_norm": 7.565934181213379, "learning_rate": 1.1226625626508502e-05, "loss": 2.6229209899902344, "memory(GiB)": 77.56, "step": 91320, "token_acc": 0.4280701754385965, "train_speed(iter/s)": 1.438068 }, { "epoch": 3.912642988732274, "grad_norm": 7.400323867797852, "learning_rate": 1.1222376883146079e-05, "loss": 2.283451461791992, "memory(GiB)": 77.56, "step": 91325, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.438076 }, { "epoch": 3.9128572040615226, "grad_norm": 5.257184028625488, "learning_rate": 1.1218128842276688e-05, "loss": 2.4757291793823244, "memory(GiB)": 77.56, "step": 91330, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 1.438079 }, { "epoch": 3.9130714193907714, "grad_norm": 6.685637950897217, "learning_rate": 1.121388150397727e-05, "loss": 2.557989501953125, "memory(GiB)": 77.56, "step": 91335, "token_acc": 0.4774193548387097, "train_speed(iter/s)": 1.43809 }, { "epoch": 3.9132856347200207, "grad_norm": 5.636646747589111, "learning_rate": 1.120963486832477e-05, "loss": 2.63091983795166, "memory(GiB)": 77.56, "step": 91340, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.438102 }, { "epoch": 3.9134998500492695, "grad_norm": 10.529409408569336, "learning_rate": 1.1205388935396127e-05, "loss": 2.3805414199829102, "memory(GiB)": 77.56, "step": 91345, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.438096 }, { "epoch": 3.9137140653785183, "grad_norm": 7.061811447143555, "learning_rate": 1.120114370526824e-05, "loss": 2.4556888580322265, "memory(GiB)": 77.56, "step": 91350, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.438094 }, { "epoch": 3.9139282807077675, "grad_norm": 6.291705131530762, "learning_rate": 1.119689917801805e-05, "loss": 2.3719932556152346, "memory(GiB)": 77.56, "step": 91355, "token_acc": 0.49049429657794674, "train_speed(iter/s)": 1.438094 }, { "epoch": 3.9141424960370164, "grad_norm": 5.4585280418396, "learning_rate": 1.1192655353722425e-05, "loss": 2.46896915435791, "memory(GiB)": 77.56, "step": 91360, "token_acc": 0.47865853658536583, "train_speed(iter/s)": 1.438106 }, { "epoch": 3.914356711366265, "grad_norm": 5.959686279296875, "learning_rate": 1.1188412232458256e-05, "loss": 2.0860803604125975, "memory(GiB)": 77.56, "step": 91365, "token_acc": 0.5576923076923077, "train_speed(iter/s)": 1.438102 }, { "epoch": 3.9145709266955144, "grad_norm": 6.7772698402404785, "learning_rate": 1.1184169814302409e-05, "loss": 1.9571212768554687, "memory(GiB)": 77.56, "step": 91370, "token_acc": 0.576271186440678, "train_speed(iter/s)": 1.438094 }, { "epoch": 3.9147851420247632, "grad_norm": 5.908357620239258, "learning_rate": 1.117992809933172e-05, "loss": 2.389080619812012, "memory(GiB)": 77.56, "step": 91375, "token_acc": 0.5202952029520295, "train_speed(iter/s)": 1.438099 }, { "epoch": 3.914999357354012, "grad_norm": 5.820104598999023, "learning_rate": 1.1175687087623066e-05, "loss": 2.241306686401367, "memory(GiB)": 77.56, "step": 91380, "token_acc": 0.5448504983388704, "train_speed(iter/s)": 1.438097 }, { "epoch": 3.9152135726832613, "grad_norm": 4.487586498260498, "learning_rate": 1.1171446779253258e-05, "loss": 2.4218076705932616, "memory(GiB)": 77.56, "step": 91385, "token_acc": 0.5049833887043189, "train_speed(iter/s)": 1.438117 }, { "epoch": 3.91542778801251, "grad_norm": 5.582045078277588, "learning_rate": 1.116720717429912e-05, "loss": 2.506919097900391, "memory(GiB)": 77.56, "step": 91390, "token_acc": 0.49324324324324326, "train_speed(iter/s)": 1.438127 }, { "epoch": 3.915642003341759, "grad_norm": 5.908291816711426, "learning_rate": 1.1162968272837438e-05, "loss": 2.4093696594238283, "memory(GiB)": 77.56, "step": 91395, "token_acc": 0.4693140794223827, "train_speed(iter/s)": 1.438115 }, { "epoch": 3.915856218671008, "grad_norm": 5.138401508331299, "learning_rate": 1.1158730074945029e-05, "loss": 2.358805274963379, "memory(GiB)": 77.56, "step": 91400, "token_acc": 0.5389408099688473, "train_speed(iter/s)": 1.438105 }, { "epoch": 3.916070434000257, "grad_norm": 5.670218467712402, "learning_rate": 1.1154492580698661e-05, "loss": 2.3675537109375, "memory(GiB)": 77.56, "step": 91405, "token_acc": 0.5181818181818182, "train_speed(iter/s)": 1.438118 }, { "epoch": 3.916284649329506, "grad_norm": 5.988509178161621, "learning_rate": 1.1150255790175086e-05, "loss": 2.559895324707031, "memory(GiB)": 77.56, "step": 91410, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.438126 }, { "epoch": 3.916498864658755, "grad_norm": 5.357651710510254, "learning_rate": 1.1146019703451083e-05, "loss": 2.346920394897461, "memory(GiB)": 77.56, "step": 91415, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.438124 }, { "epoch": 3.916713079988004, "grad_norm": 6.376716613769531, "learning_rate": 1.1141784320603382e-05, "loss": 2.4762502670288087, "memory(GiB)": 77.56, "step": 91420, "token_acc": 0.4859154929577465, "train_speed(iter/s)": 1.438118 }, { "epoch": 3.9169272953172527, "grad_norm": 5.306493282318115, "learning_rate": 1.1137549641708717e-05, "loss": 2.338175964355469, "memory(GiB)": 77.56, "step": 91425, "token_acc": 0.505338078291815, "train_speed(iter/s)": 1.438138 }, { "epoch": 3.917141510646502, "grad_norm": 6.561230182647705, "learning_rate": 1.1133315666843791e-05, "loss": 2.5423145294189453, "memory(GiB)": 77.56, "step": 91430, "token_acc": 0.4937106918238994, "train_speed(iter/s)": 1.438138 }, { "epoch": 3.9173557259757508, "grad_norm": 6.005884647369385, "learning_rate": 1.1129082396085294e-05, "loss": 2.125341606140137, "memory(GiB)": 77.56, "step": 91435, "token_acc": 0.5596707818930041, "train_speed(iter/s)": 1.438155 }, { "epoch": 3.9175699413049996, "grad_norm": 6.431205749511719, "learning_rate": 1.1124849829509953e-05, "loss": 2.228074836730957, "memory(GiB)": 77.56, "step": 91440, "token_acc": 0.525, "train_speed(iter/s)": 1.43816 }, { "epoch": 3.917784156634249, "grad_norm": 6.310107231140137, "learning_rate": 1.1120617967194425e-05, "loss": 2.296904182434082, "memory(GiB)": 77.56, "step": 91445, "token_acc": 0.5, "train_speed(iter/s)": 1.438188 }, { "epoch": 3.9179983719634977, "grad_norm": 5.776144027709961, "learning_rate": 1.1116386809215368e-05, "loss": 2.4614768981933595, "memory(GiB)": 77.56, "step": 91450, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.438189 }, { "epoch": 3.9182125872927465, "grad_norm": 5.911812782287598, "learning_rate": 1.1112156355649444e-05, "loss": 2.2575763702392577, "memory(GiB)": 77.56, "step": 91455, "token_acc": 0.5047923322683706, "train_speed(iter/s)": 1.438197 }, { "epoch": 3.9184268026219957, "grad_norm": 4.981050491333008, "learning_rate": 1.1107926606573287e-05, "loss": 2.4948247909545898, "memory(GiB)": 77.56, "step": 91460, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.438206 }, { "epoch": 3.9186410179512445, "grad_norm": 5.084926128387451, "learning_rate": 1.1103697562063513e-05, "loss": 2.058063507080078, "memory(GiB)": 77.56, "step": 91465, "token_acc": 0.5505226480836237, "train_speed(iter/s)": 1.438204 }, { "epoch": 3.9188552332804933, "grad_norm": 5.6330132484436035, "learning_rate": 1.109946922219674e-05, "loss": 2.3477508544921877, "memory(GiB)": 77.56, "step": 91470, "token_acc": 0.5253164556962026, "train_speed(iter/s)": 1.438212 }, { "epoch": 3.9190694486097426, "grad_norm": 6.594236373901367, "learning_rate": 1.1095241587049593e-05, "loss": 2.533837890625, "memory(GiB)": 77.56, "step": 91475, "token_acc": 0.4872611464968153, "train_speed(iter/s)": 1.438226 }, { "epoch": 3.9192836639389914, "grad_norm": 7.738058567047119, "learning_rate": 1.1091014656698634e-05, "loss": 2.3503273010253904, "memory(GiB)": 77.56, "step": 91480, "token_acc": 0.4830188679245283, "train_speed(iter/s)": 1.438223 }, { "epoch": 3.9194978792682402, "grad_norm": 6.766422271728516, "learning_rate": 1.1086788431220446e-05, "loss": 2.350724792480469, "memory(GiB)": 77.56, "step": 91485, "token_acc": 0.531986531986532, "train_speed(iter/s)": 1.438213 }, { "epoch": 3.9197120945974895, "grad_norm": 9.74452018737793, "learning_rate": 1.108256291069159e-05, "loss": 2.315723419189453, "memory(GiB)": 77.56, "step": 91490, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438222 }, { "epoch": 3.9199263099267383, "grad_norm": 6.988160133361816, "learning_rate": 1.1078338095188601e-05, "loss": 2.458779716491699, "memory(GiB)": 77.56, "step": 91495, "token_acc": 0.4929078014184397, "train_speed(iter/s)": 1.438215 }, { "epoch": 3.920140525255987, "grad_norm": 6.007402420043945, "learning_rate": 1.1074113984788043e-05, "loss": 2.2051992416381836, "memory(GiB)": 77.56, "step": 91500, "token_acc": 0.5134099616858238, "train_speed(iter/s)": 1.43822 }, { "epoch": 3.920140525255987, "eval_loss": 2.230638027191162, "eval_runtime": 14.1072, "eval_samples_per_second": 7.089, "eval_steps_per_second": 7.089, "eval_token_acc": 0.4462915601023018, "step": 91500 }, { "epoch": 3.9203547405852364, "grad_norm": 6.819450378417969, "learning_rate": 1.1069890579566427e-05, "loss": 2.343904495239258, "memory(GiB)": 77.56, "step": 91505, "token_acc": 0.4606946983546618, "train_speed(iter/s)": 1.437878 }, { "epoch": 3.920568955914485, "grad_norm": 8.59701156616211, "learning_rate": 1.106566787960026e-05, "loss": 2.2338972091674805, "memory(GiB)": 77.56, "step": 91510, "token_acc": 0.515625, "train_speed(iter/s)": 1.437875 }, { "epoch": 3.920783171243734, "grad_norm": 5.157470703125, "learning_rate": 1.1061445884966042e-05, "loss": 2.673126983642578, "memory(GiB)": 77.56, "step": 91515, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.437872 }, { "epoch": 3.9209973865729832, "grad_norm": 5.757594585418701, "learning_rate": 1.1057224595740246e-05, "loss": 2.493881607055664, "memory(GiB)": 77.56, "step": 91520, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.437877 }, { "epoch": 3.921211601902232, "grad_norm": 6.437959671020508, "learning_rate": 1.1053004011999374e-05, "loss": 2.334260940551758, "memory(GiB)": 77.56, "step": 91525, "token_acc": 0.5458333333333333, "train_speed(iter/s)": 1.437894 }, { "epoch": 3.921425817231481, "grad_norm": 4.9855875968933105, "learning_rate": 1.1048784133819867e-05, "loss": 2.3796356201171873, "memory(GiB)": 77.56, "step": 91530, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.92164003256073, "grad_norm": 6.146213531494141, "learning_rate": 1.1044564961278175e-05, "loss": 2.1525810241699217, "memory(GiB)": 77.56, "step": 91535, "token_acc": 0.5321428571428571, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.921854247889979, "grad_norm": 5.222489356994629, "learning_rate": 1.104034649445072e-05, "loss": 2.2984035491943358, "memory(GiB)": 77.56, "step": 91540, "token_acc": 0.48348348348348347, "train_speed(iter/s)": 1.437875 }, { "epoch": 3.9220684632192278, "grad_norm": 6.565844535827637, "learning_rate": 1.1036128733413948e-05, "loss": 2.125788688659668, "memory(GiB)": 77.56, "step": 91545, "token_acc": 0.5517241379310345, "train_speed(iter/s)": 1.437866 }, { "epoch": 3.922282678548477, "grad_norm": 5.705295085906982, "learning_rate": 1.1031911678244255e-05, "loss": 2.2338294982910156, "memory(GiB)": 77.56, "step": 91550, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437868 }, { "epoch": 3.922496893877726, "grad_norm": 6.459292411804199, "learning_rate": 1.1027695329018023e-05, "loss": 2.3540958404541015, "memory(GiB)": 77.56, "step": 91555, "token_acc": 0.4932249322493225, "train_speed(iter/s)": 1.437886 }, { "epoch": 3.9227111092069746, "grad_norm": 7.275162220001221, "learning_rate": 1.102347968581166e-05, "loss": 1.8346271514892578, "memory(GiB)": 77.56, "step": 91560, "token_acc": 0.5487179487179488, "train_speed(iter/s)": 1.437884 }, { "epoch": 3.922925324536224, "grad_norm": 6.538516044616699, "learning_rate": 1.101926474870153e-05, "loss": 2.2621707916259766, "memory(GiB)": 77.56, "step": 91565, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437881 }, { "epoch": 3.9231395398654727, "grad_norm": 6.015599727630615, "learning_rate": 1.1015050517763987e-05, "loss": 2.421868324279785, "memory(GiB)": 77.56, "step": 91570, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.9233537551947215, "grad_norm": 5.973509311676025, "learning_rate": 1.101083699307537e-05, "loss": 2.1814422607421875, "memory(GiB)": 77.56, "step": 91575, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.437846 }, { "epoch": 3.9235679705239708, "grad_norm": 7.527085781097412, "learning_rate": 1.1006624174711999e-05, "loss": 2.491049385070801, "memory(GiB)": 77.56, "step": 91580, "token_acc": 0.4536082474226804, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.9237821858532196, "grad_norm": 5.906025409698486, "learning_rate": 1.1002412062750233e-05, "loss": 2.557822418212891, "memory(GiB)": 77.56, "step": 91585, "token_acc": 0.4448669201520912, "train_speed(iter/s)": 1.437856 }, { "epoch": 3.9239964011824684, "grad_norm": 5.4656829833984375, "learning_rate": 1.099820065726635e-05, "loss": 2.3741434097290037, "memory(GiB)": 77.56, "step": 91590, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.437857 }, { "epoch": 3.9242106165117177, "grad_norm": 6.288724422454834, "learning_rate": 1.0993989958336648e-05, "loss": 2.0333003997802734, "memory(GiB)": 77.56, "step": 91595, "token_acc": 0.536, "train_speed(iter/s)": 1.437849 }, { "epoch": 3.9244248318409665, "grad_norm": 6.021236896514893, "learning_rate": 1.098977996603741e-05, "loss": 2.583403778076172, "memory(GiB)": 77.56, "step": 91600, "token_acc": 0.46254071661237783, "train_speed(iter/s)": 1.43787 }, { "epoch": 3.9246390471702153, "grad_norm": 4.439737319946289, "learning_rate": 1.09855706804449e-05, "loss": 2.3207374572753907, "memory(GiB)": 77.56, "step": 91605, "token_acc": 0.4585635359116022, "train_speed(iter/s)": 1.437884 }, { "epoch": 3.9248532624994645, "grad_norm": 7.106756687164307, "learning_rate": 1.0981362101635367e-05, "loss": 2.422466278076172, "memory(GiB)": 77.56, "step": 91610, "token_acc": 0.47854785478547857, "train_speed(iter/s)": 1.437895 }, { "epoch": 3.9250674778287133, "grad_norm": 7.427601337432861, "learning_rate": 1.0977154229685055e-05, "loss": 2.144119644165039, "memory(GiB)": 77.56, "step": 91615, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.437906 }, { "epoch": 3.925281693157962, "grad_norm": 6.106443881988525, "learning_rate": 1.097294706467022e-05, "loss": 2.0666793823242187, "memory(GiB)": 77.56, "step": 91620, "token_acc": 0.5321428571428571, "train_speed(iter/s)": 1.437906 }, { "epoch": 3.9254959084872114, "grad_norm": 7.301298141479492, "learning_rate": 1.096874060666706e-05, "loss": 2.153365135192871, "memory(GiB)": 77.56, "step": 91625, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.437913 }, { "epoch": 3.9257101238164602, "grad_norm": 8.789388656616211, "learning_rate": 1.0964534855751774e-05, "loss": 2.432993698120117, "memory(GiB)": 77.56, "step": 91630, "token_acc": 0.4847328244274809, "train_speed(iter/s)": 1.437929 }, { "epoch": 3.925924339145709, "grad_norm": 7.694841384887695, "learning_rate": 1.0960329812000557e-05, "loss": 2.319536018371582, "memory(GiB)": 77.56, "step": 91635, "token_acc": 0.5403508771929825, "train_speed(iter/s)": 1.437929 }, { "epoch": 3.9261385544749583, "grad_norm": 7.1682820320129395, "learning_rate": 1.0956125475489575e-05, "loss": 2.5165098190307615, "memory(GiB)": 77.56, "step": 91640, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.437937 }, { "epoch": 3.926352769804207, "grad_norm": 5.862178802490234, "learning_rate": 1.095192184629502e-05, "loss": 2.5554582595825197, "memory(GiB)": 77.56, "step": 91645, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437943 }, { "epoch": 3.926566985133456, "grad_norm": 7.210686206817627, "learning_rate": 1.0947718924493034e-05, "loss": 2.2724714279174805, "memory(GiB)": 77.56, "step": 91650, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.437958 }, { "epoch": 3.926781200462705, "grad_norm": 5.732788562774658, "learning_rate": 1.0943516710159751e-05, "loss": 2.31829833984375, "memory(GiB)": 77.56, "step": 91655, "token_acc": 0.5, "train_speed(iter/s)": 1.43797 }, { "epoch": 3.926995415791954, "grad_norm": 5.065240383148193, "learning_rate": 1.0939315203371303e-05, "loss": 2.5158546447753904, "memory(GiB)": 77.56, "step": 91660, "token_acc": 0.46956521739130436, "train_speed(iter/s)": 1.437972 }, { "epoch": 3.927209631121203, "grad_norm": 6.3762736320495605, "learning_rate": 1.0935114404203783e-05, "loss": 2.3252967834472655, "memory(GiB)": 77.56, "step": 91665, "token_acc": 0.4711864406779661, "train_speed(iter/s)": 1.43796 }, { "epoch": 3.927423846450452, "grad_norm": 4.754208087921143, "learning_rate": 1.0930914312733332e-05, "loss": 2.14044132232666, "memory(GiB)": 77.56, "step": 91670, "token_acc": 0.5359477124183006, "train_speed(iter/s)": 1.437963 }, { "epoch": 3.927638061779701, "grad_norm": 5.205066680908203, "learning_rate": 1.0926714929036014e-05, "loss": 2.4429603576660157, "memory(GiB)": 77.56, "step": 91675, "token_acc": 0.4630225080385852, "train_speed(iter/s)": 1.437979 }, { "epoch": 3.9278522771089497, "grad_norm": 5.333390235900879, "learning_rate": 1.0922516253187909e-05, "loss": 2.2542984008789064, "memory(GiB)": 77.56, "step": 91680, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.928066492438199, "grad_norm": 5.958066463470459, "learning_rate": 1.0918318285265078e-05, "loss": 2.263372802734375, "memory(GiB)": 77.56, "step": 91685, "token_acc": 0.5607142857142857, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.9282807077674478, "grad_norm": 5.380476474761963, "learning_rate": 1.091412102534356e-05, "loss": 2.413420867919922, "memory(GiB)": 77.56, "step": 91690, "token_acc": 0.49698795180722893, "train_speed(iter/s)": 1.438018 }, { "epoch": 3.9284949230966966, "grad_norm": 4.951543807983398, "learning_rate": 1.0909924473499423e-05, "loss": 2.372220993041992, "memory(GiB)": 77.56, "step": 91695, "token_acc": 0.47701149425287354, "train_speed(iter/s)": 1.437997 }, { "epoch": 3.928709138425946, "grad_norm": 6.767967224121094, "learning_rate": 1.0905728629808654e-05, "loss": 2.1728693008422852, "memory(GiB)": 77.56, "step": 91700, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.437975 }, { "epoch": 3.9289233537551946, "grad_norm": 7.530840873718262, "learning_rate": 1.0901533494347304e-05, "loss": 2.5382482528686525, "memory(GiB)": 77.56, "step": 91705, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 1.43797 }, { "epoch": 3.9291375690844434, "grad_norm": 5.2336554527282715, "learning_rate": 1.0897339067191347e-05, "loss": 2.3763662338256837, "memory(GiB)": 77.56, "step": 91710, "token_acc": 0.503731343283582, "train_speed(iter/s)": 1.43798 }, { "epoch": 3.9293517844136927, "grad_norm": 4.730438232421875, "learning_rate": 1.0893145348416778e-05, "loss": 2.222860336303711, "memory(GiB)": 77.56, "step": 91715, "token_acc": 0.491869918699187, "train_speed(iter/s)": 1.438 }, { "epoch": 3.9295659997429415, "grad_norm": 5.695847034454346, "learning_rate": 1.0888952338099561e-05, "loss": 2.4839725494384766, "memory(GiB)": 77.56, "step": 91720, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.438018 }, { "epoch": 3.9297802150721903, "grad_norm": 5.649783611297607, "learning_rate": 1.088476003631565e-05, "loss": 2.373153305053711, "memory(GiB)": 77.56, "step": 91725, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.438013 }, { "epoch": 3.9299944304014396, "grad_norm": 14.608512878417969, "learning_rate": 1.088056844314102e-05, "loss": 2.4498228073120116, "memory(GiB)": 77.56, "step": 91730, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.438005 }, { "epoch": 3.9302086457306884, "grad_norm": 7.321146011352539, "learning_rate": 1.087637755865159e-05, "loss": 2.3396718978881834, "memory(GiB)": 77.56, "step": 91735, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.438023 }, { "epoch": 3.930422861059937, "grad_norm": 5.216070175170898, "learning_rate": 1.0872187382923277e-05, "loss": 2.3084171295166014, "memory(GiB)": 77.56, "step": 91740, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438032 }, { "epoch": 3.9306370763891865, "grad_norm": 6.924057483673096, "learning_rate": 1.0867997916031997e-05, "loss": 2.1397930145263673, "memory(GiB)": 77.56, "step": 91745, "token_acc": 0.5617977528089888, "train_speed(iter/s)": 1.438029 }, { "epoch": 3.9308512917184353, "grad_norm": 6.8544769287109375, "learning_rate": 1.0863809158053645e-05, "loss": 2.2442209243774416, "memory(GiB)": 77.56, "step": 91750, "token_acc": 0.5267175572519084, "train_speed(iter/s)": 1.438042 }, { "epoch": 3.931065507047684, "grad_norm": 5.260197639465332, "learning_rate": 1.0859621109064089e-05, "loss": 2.656650733947754, "memory(GiB)": 77.56, "step": 91755, "token_acc": 0.46418338108882523, "train_speed(iter/s)": 1.438048 }, { "epoch": 3.9312797223769333, "grad_norm": 6.848014831542969, "learning_rate": 1.0855433769139223e-05, "loss": 2.2859222412109377, "memory(GiB)": 77.56, "step": 91760, "token_acc": 0.5126353790613718, "train_speed(iter/s)": 1.438045 }, { "epoch": 3.931493937706182, "grad_norm": 5.351479530334473, "learning_rate": 1.0851247138354886e-05, "loss": 2.648614501953125, "memory(GiB)": 77.56, "step": 91765, "token_acc": 0.46407185628742514, "train_speed(iter/s)": 1.438044 }, { "epoch": 3.931708153035431, "grad_norm": 6.796504497528076, "learning_rate": 1.084706121678694e-05, "loss": 2.270510673522949, "memory(GiB)": 77.56, "step": 91770, "token_acc": 0.5157894736842106, "train_speed(iter/s)": 1.438059 }, { "epoch": 3.9319223683646802, "grad_norm": 7.183225631713867, "learning_rate": 1.0842876004511215e-05, "loss": 2.47519474029541, "memory(GiB)": 77.56, "step": 91775, "token_acc": 0.45878136200716846, "train_speed(iter/s)": 1.438079 }, { "epoch": 3.932136583693929, "grad_norm": 5.062354564666748, "learning_rate": 1.083869150160352e-05, "loss": 2.3480819702148437, "memory(GiB)": 77.56, "step": 91780, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.438058 }, { "epoch": 3.932350799023178, "grad_norm": 5.611119747161865, "learning_rate": 1.0834507708139651e-05, "loss": 2.3739879608154295, "memory(GiB)": 77.56, "step": 91785, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.438077 }, { "epoch": 3.932565014352427, "grad_norm": 4.909784317016602, "learning_rate": 1.083032462419543e-05, "loss": 2.1236831665039064, "memory(GiB)": 77.56, "step": 91790, "token_acc": 0.5348101265822784, "train_speed(iter/s)": 1.438083 }, { "epoch": 3.932779229681676, "grad_norm": 5.592144966125488, "learning_rate": 1.0826142249846621e-05, "loss": 2.3189544677734375, "memory(GiB)": 77.56, "step": 91795, "token_acc": 0.5062893081761006, "train_speed(iter/s)": 1.438096 }, { "epoch": 3.9329934450109247, "grad_norm": 6.355260372161865, "learning_rate": 1.0821960585168994e-05, "loss": 2.4610824584960938, "memory(GiB)": 77.56, "step": 91800, "token_acc": 0.48638132295719844, "train_speed(iter/s)": 1.438099 }, { "epoch": 3.933207660340174, "grad_norm": 5.820274353027344, "learning_rate": 1.0817779630238301e-05, "loss": 2.5243928909301756, "memory(GiB)": 77.56, "step": 91805, "token_acc": 0.45231607629427795, "train_speed(iter/s)": 1.438099 }, { "epoch": 3.933421875669423, "grad_norm": 7.225492000579834, "learning_rate": 1.0813599385130274e-05, "loss": 2.2347366333007814, "memory(GiB)": 77.56, "step": 91810, "token_acc": 0.5486381322957199, "train_speed(iter/s)": 1.438098 }, { "epoch": 3.9336360909986716, "grad_norm": 6.190768241882324, "learning_rate": 1.0809419849920671e-05, "loss": 2.237508010864258, "memory(GiB)": 77.56, "step": 91815, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.438114 }, { "epoch": 3.933850306327921, "grad_norm": 5.9196624755859375, "learning_rate": 1.0805241024685186e-05, "loss": 2.134809112548828, "memory(GiB)": 77.56, "step": 91820, "token_acc": 0.5567765567765568, "train_speed(iter/s)": 1.438105 }, { "epoch": 3.9340645216571697, "grad_norm": 6.857304573059082, "learning_rate": 1.0801062909499526e-05, "loss": 2.034542274475098, "memory(GiB)": 77.56, "step": 91825, "token_acc": 0.5677655677655677, "train_speed(iter/s)": 1.438059 }, { "epoch": 3.9342787369864185, "grad_norm": 5.46055793762207, "learning_rate": 1.0796885504439391e-05, "loss": 2.2587892532348635, "memory(GiB)": 77.56, "step": 91830, "token_acc": 0.5606694560669456, "train_speed(iter/s)": 1.438054 }, { "epoch": 3.9344929523156678, "grad_norm": 6.6425700187683105, "learning_rate": 1.0792708809580431e-05, "loss": 2.245984649658203, "memory(GiB)": 77.56, "step": 91835, "token_acc": 0.49415204678362573, "train_speed(iter/s)": 1.438038 }, { "epoch": 3.9347071676449166, "grad_norm": 5.84804630279541, "learning_rate": 1.0788532824998343e-05, "loss": 2.185314178466797, "memory(GiB)": 77.56, "step": 91840, "token_acc": 0.5203761755485894, "train_speed(iter/s)": 1.438053 }, { "epoch": 3.9349213829741654, "grad_norm": 6.235397815704346, "learning_rate": 1.0784357550768753e-05, "loss": 2.2381511688232423, "memory(GiB)": 77.56, "step": 91845, "token_acc": 0.5126582278481012, "train_speed(iter/s)": 1.438075 }, { "epoch": 3.9351355983034146, "grad_norm": 5.820837020874023, "learning_rate": 1.078018298696733e-05, "loss": 2.3817996978759766, "memory(GiB)": 77.56, "step": 91850, "token_acc": 0.5508771929824562, "train_speed(iter/s)": 1.438096 }, { "epoch": 3.9353498136326635, "grad_norm": 5.505218982696533, "learning_rate": 1.0776009133669684e-05, "loss": 2.1380382537841798, "memory(GiB)": 77.56, "step": 91855, "token_acc": 0.5127388535031847, "train_speed(iter/s)": 1.438089 }, { "epoch": 3.9355640289619123, "grad_norm": 7.809144973754883, "learning_rate": 1.0771835990951424e-05, "loss": 2.6247671127319334, "memory(GiB)": 77.56, "step": 91860, "token_acc": 0.47194719471947194, "train_speed(iter/s)": 1.438113 }, { "epoch": 3.9357782442911615, "grad_norm": 6.382263660430908, "learning_rate": 1.0767663558888159e-05, "loss": 2.4069332122802733, "memory(GiB)": 77.56, "step": 91865, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.438123 }, { "epoch": 3.9359924596204103, "grad_norm": 7.607944488525391, "learning_rate": 1.076349183755545e-05, "loss": 2.3623115539550783, "memory(GiB)": 77.56, "step": 91870, "token_acc": 0.4962962962962963, "train_speed(iter/s)": 1.438129 }, { "epoch": 3.936206674949659, "grad_norm": 6.977862358093262, "learning_rate": 1.0759320827028913e-05, "loss": 2.205623435974121, "memory(GiB)": 77.56, "step": 91875, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.43813 }, { "epoch": 3.9364208902789084, "grad_norm": 10.510127067565918, "learning_rate": 1.0755150527384089e-05, "loss": 2.475603485107422, "memory(GiB)": 77.56, "step": 91880, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.438126 }, { "epoch": 3.936635105608157, "grad_norm": 5.402792930603027, "learning_rate": 1.0750980938696525e-05, "loss": 2.612287902832031, "memory(GiB)": 77.56, "step": 91885, "token_acc": 0.483974358974359, "train_speed(iter/s)": 1.438141 }, { "epoch": 3.936849320937406, "grad_norm": 5.810826778411865, "learning_rate": 1.0746812061041761e-05, "loss": 2.2637868881225587, "memory(GiB)": 77.56, "step": 91890, "token_acc": 0.5059760956175299, "train_speed(iter/s)": 1.438147 }, { "epoch": 3.9370635362666553, "grad_norm": 6.126474857330322, "learning_rate": 1.0742643894495319e-05, "loss": 2.1765499114990234, "memory(GiB)": 77.56, "step": 91895, "token_acc": 0.53515625, "train_speed(iter/s)": 1.438148 }, { "epoch": 3.937277751595904, "grad_norm": 7.01622200012207, "learning_rate": 1.07384764391327e-05, "loss": 2.138622283935547, "memory(GiB)": 77.56, "step": 91900, "token_acc": 0.5236363636363637, "train_speed(iter/s)": 1.438157 }, { "epoch": 3.937491966925153, "grad_norm": 5.674077033996582, "learning_rate": 1.0734309695029422e-05, "loss": 2.2281938552856446, "memory(GiB)": 77.56, "step": 91905, "token_acc": 0.49612403100775193, "train_speed(iter/s)": 1.438173 }, { "epoch": 3.937706182254402, "grad_norm": 6.738650321960449, "learning_rate": 1.0730143662260938e-05, "loss": 2.4492055892944338, "memory(GiB)": 77.56, "step": 91910, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.438183 }, { "epoch": 3.937920397583651, "grad_norm": 8.028721809387207, "learning_rate": 1.0725978340902758e-05, "loss": 2.5546566009521485, "memory(GiB)": 77.56, "step": 91915, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438192 }, { "epoch": 3.9381346129129, "grad_norm": 5.925221920013428, "learning_rate": 1.0721813731030328e-05, "loss": 2.3093683242797853, "memory(GiB)": 77.56, "step": 91920, "token_acc": 0.5096774193548387, "train_speed(iter/s)": 1.4382 }, { "epoch": 3.938348828242149, "grad_norm": 6.920371055603027, "learning_rate": 1.071764983271908e-05, "loss": 2.6063796997070314, "memory(GiB)": 77.56, "step": 91925, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.438227 }, { "epoch": 3.938563043571398, "grad_norm": 6.748383045196533, "learning_rate": 1.0713486646044446e-05, "loss": 2.2467294692993165, "memory(GiB)": 77.56, "step": 91930, "token_acc": 0.5182724252491694, "train_speed(iter/s)": 1.438218 }, { "epoch": 3.9387772589006467, "grad_norm": 5.717504501342773, "learning_rate": 1.0709324171081863e-05, "loss": 2.2911935806274415, "memory(GiB)": 77.56, "step": 91935, "token_acc": 0.541958041958042, "train_speed(iter/s)": 1.438223 }, { "epoch": 3.938991474229896, "grad_norm": 5.653698921203613, "learning_rate": 1.0705162407906739e-05, "loss": 2.173803520202637, "memory(GiB)": 77.56, "step": 91940, "token_acc": 0.5189873417721519, "train_speed(iter/s)": 1.43822 }, { "epoch": 3.9392056895591447, "grad_norm": 6.838394641876221, "learning_rate": 1.0701001356594453e-05, "loss": 2.5400789260864256, "memory(GiB)": 77.56, "step": 91945, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.438236 }, { "epoch": 3.9394199048883936, "grad_norm": 6.167585372924805, "learning_rate": 1.0696841017220394e-05, "loss": 2.295496940612793, "memory(GiB)": 77.56, "step": 91950, "token_acc": 0.4921875, "train_speed(iter/s)": 1.438235 }, { "epoch": 3.939634120217643, "grad_norm": 5.840410232543945, "learning_rate": 1.0692681389859916e-05, "loss": 2.1997032165527344, "memory(GiB)": 77.56, "step": 91955, "token_acc": 0.5061728395061729, "train_speed(iter/s)": 1.438241 }, { "epoch": 3.9398483355468916, "grad_norm": 7.691062927246094, "learning_rate": 1.0688522474588403e-05, "loss": 2.2673009872436523, "memory(GiB)": 77.56, "step": 91960, "token_acc": 0.5, "train_speed(iter/s)": 1.438242 }, { "epoch": 3.9400625508761404, "grad_norm": 4.867900371551514, "learning_rate": 1.0684364271481184e-05, "loss": 2.280361557006836, "memory(GiB)": 77.56, "step": 91965, "token_acc": 0.5337423312883436, "train_speed(iter/s)": 1.438237 }, { "epoch": 3.9402767662053897, "grad_norm": 6.0499091148376465, "learning_rate": 1.0680206780613583e-05, "loss": 2.5694808959960938, "memory(GiB)": 77.56, "step": 91970, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.438242 }, { "epoch": 3.9404909815346385, "grad_norm": 7.973316192626953, "learning_rate": 1.0676050002060922e-05, "loss": 2.3693368911743162, "memory(GiB)": 77.56, "step": 91975, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.438235 }, { "epoch": 3.9407051968638873, "grad_norm": 5.942612648010254, "learning_rate": 1.0671893935898509e-05, "loss": 2.1118722915649415, "memory(GiB)": 77.56, "step": 91980, "token_acc": 0.5290102389078498, "train_speed(iter/s)": 1.438227 }, { "epoch": 3.9409194121931366, "grad_norm": 5.290731906890869, "learning_rate": 1.0667738582201608e-05, "loss": 2.2850690841674806, "memory(GiB)": 77.56, "step": 91985, "token_acc": 0.5451127819548872, "train_speed(iter/s)": 1.438227 }, { "epoch": 3.9411336275223854, "grad_norm": 6.314393997192383, "learning_rate": 1.0663583941045525e-05, "loss": 2.239452362060547, "memory(GiB)": 77.56, "step": 91990, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.438225 }, { "epoch": 3.941347842851634, "grad_norm": 6.988499641418457, "learning_rate": 1.0659430012505534e-05, "loss": 2.1616661071777346, "memory(GiB)": 77.56, "step": 91995, "token_acc": 0.5269230769230769, "train_speed(iter/s)": 1.438225 }, { "epoch": 3.9415620581808835, "grad_norm": 6.156588554382324, "learning_rate": 1.0655276796656871e-05, "loss": 2.474592399597168, "memory(GiB)": 77.56, "step": 92000, "token_acc": 0.504950495049505, "train_speed(iter/s)": 1.438251 }, { "epoch": 3.9415620581808835, "eval_loss": 2.3511781692504883, "eval_runtime": 14.3955, "eval_samples_per_second": 6.947, "eval_steps_per_second": 6.947, "eval_token_acc": 0.4568764568764569, "step": 92000 }, { "epoch": 3.9417762735101323, "grad_norm": 5.084593772888184, "learning_rate": 1.0651124293574777e-05, "loss": 2.261703300476074, "memory(GiB)": 77.56, "step": 92005, "token_acc": 0.4794168096054888, "train_speed(iter/s)": 1.437878 }, { "epoch": 3.941990488839381, "grad_norm": 5.4553680419921875, "learning_rate": 1.0646972503334474e-05, "loss": 2.407316207885742, "memory(GiB)": 77.56, "step": 92010, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 1.437871 }, { "epoch": 3.9422047041686303, "grad_norm": 5.671087265014648, "learning_rate": 1.0642821426011174e-05, "loss": 2.3135040283203123, "memory(GiB)": 77.56, "step": 92015, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.437872 }, { "epoch": 3.942418919497879, "grad_norm": 6.357110023498535, "learning_rate": 1.0638671061680095e-05, "loss": 2.278458023071289, "memory(GiB)": 77.56, "step": 92020, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 1.437862 }, { "epoch": 3.942633134827128, "grad_norm": 7.281513214111328, "learning_rate": 1.063452141041641e-05, "loss": 2.2567901611328125, "memory(GiB)": 77.56, "step": 92025, "token_acc": 0.5102564102564102, "train_speed(iter/s)": 1.437856 }, { "epoch": 3.942847350156377, "grad_norm": 6.84661340713501, "learning_rate": 1.0630372472295303e-05, "loss": 2.1634395599365233, "memory(GiB)": 77.56, "step": 92030, "token_acc": 0.5177304964539007, "train_speed(iter/s)": 1.437867 }, { "epoch": 3.943061565485626, "grad_norm": 7.569800853729248, "learning_rate": 1.0626224247391924e-05, "loss": 2.3042369842529298, "memory(GiB)": 77.56, "step": 92035, "token_acc": 0.5029761904761905, "train_speed(iter/s)": 1.437861 }, { "epoch": 3.943275780814875, "grad_norm": 7.012302398681641, "learning_rate": 1.0622076735781433e-05, "loss": 2.3331539154052736, "memory(GiB)": 77.56, "step": 92040, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.437875 }, { "epoch": 3.943489996144124, "grad_norm": 7.040589809417725, "learning_rate": 1.0617929937538951e-05, "loss": 2.262278366088867, "memory(GiB)": 77.56, "step": 92045, "token_acc": 0.5362903225806451, "train_speed(iter/s)": 1.437886 }, { "epoch": 3.943704211473373, "grad_norm": 5.6781415939331055, "learning_rate": 1.0613783852739617e-05, "loss": 2.512264442443848, "memory(GiB)": 77.56, "step": 92050, "token_acc": 0.5016949152542373, "train_speed(iter/s)": 1.437869 }, { "epoch": 3.9439184268026217, "grad_norm": 5.332988262176514, "learning_rate": 1.0609638481458545e-05, "loss": 2.597708511352539, "memory(GiB)": 77.56, "step": 92055, "token_acc": 0.47474747474747475, "train_speed(iter/s)": 1.437866 }, { "epoch": 3.944132642131871, "grad_norm": 5.484576225280762, "learning_rate": 1.0605493823770801e-05, "loss": 2.546600914001465, "memory(GiB)": 77.56, "step": 92060, "token_acc": 0.48059701492537316, "train_speed(iter/s)": 1.437885 }, { "epoch": 3.94434685746112, "grad_norm": 4.486135005950928, "learning_rate": 1.060134987975151e-05, "loss": 2.3396984100341798, "memory(GiB)": 77.56, "step": 92065, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 1.43789 }, { "epoch": 3.9445610727903686, "grad_norm": 7.204801559448242, "learning_rate": 1.0597206649475721e-05, "loss": 2.7304920196533202, "memory(GiB)": 77.56, "step": 92070, "token_acc": 0.45195729537366547, "train_speed(iter/s)": 1.437891 }, { "epoch": 3.944775288119618, "grad_norm": 4.8624138832092285, "learning_rate": 1.0593064133018488e-05, "loss": 2.402364158630371, "memory(GiB)": 77.56, "step": 92075, "token_acc": 0.5105105105105106, "train_speed(iter/s)": 1.437888 }, { "epoch": 3.9449895034488667, "grad_norm": 5.270648002624512, "learning_rate": 1.0588922330454876e-05, "loss": 2.148133087158203, "memory(GiB)": 77.56, "step": 92080, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 1.437879 }, { "epoch": 3.9452037187781155, "grad_norm": 5.261557579040527, "learning_rate": 1.0584781241859903e-05, "loss": 2.6369842529296874, "memory(GiB)": 77.56, "step": 92085, "token_acc": 0.48494983277591974, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.9454179341073647, "grad_norm": 5.2815704345703125, "learning_rate": 1.05806408673086e-05, "loss": 2.385141372680664, "memory(GiB)": 77.56, "step": 92090, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.437882 }, { "epoch": 3.9456321494366136, "grad_norm": 5.734361171722412, "learning_rate": 1.057650120687596e-05, "loss": 2.3255592346191407, "memory(GiB)": 77.56, "step": 92095, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.437843 }, { "epoch": 3.9458463647658624, "grad_norm": 7.409005641937256, "learning_rate": 1.057236226063697e-05, "loss": 2.0789005279541017, "memory(GiB)": 77.56, "step": 92100, "token_acc": 0.5568862275449101, "train_speed(iter/s)": 1.437846 }, { "epoch": 3.9460605800951116, "grad_norm": 7.198379039764404, "learning_rate": 1.056822402866664e-05, "loss": 2.493305969238281, "memory(GiB)": 77.56, "step": 92105, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.9462747954243604, "grad_norm": 6.27956485748291, "learning_rate": 1.0564086511039917e-05, "loss": 2.3432601928710937, "memory(GiB)": 77.56, "step": 92110, "token_acc": 0.5124223602484472, "train_speed(iter/s)": 1.437863 }, { "epoch": 3.9464890107536097, "grad_norm": 6.296996593475342, "learning_rate": 1.0559949707831762e-05, "loss": 2.2563201904296877, "memory(GiB)": 77.56, "step": 92115, "token_acc": 0.5283018867924528, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.9467032260828585, "grad_norm": 11.899503707885742, "learning_rate": 1.0555813619117122e-05, "loss": 2.2604516983032226, "memory(GiB)": 77.56, "step": 92120, "token_acc": 0.5367647058823529, "train_speed(iter/s)": 1.437866 }, { "epoch": 3.9469174414121073, "grad_norm": 7.101419925689697, "learning_rate": 1.0551678244970914e-05, "loss": 2.4868316650390625, "memory(GiB)": 77.56, "step": 92125, "token_acc": 0.47079037800687284, "train_speed(iter/s)": 1.437873 }, { "epoch": 3.9471316567413566, "grad_norm": 6.956063747406006, "learning_rate": 1.0547543585468044e-05, "loss": 2.3549430847167967, "memory(GiB)": 77.56, "step": 92130, "token_acc": 0.4855305466237942, "train_speed(iter/s)": 1.437875 }, { "epoch": 3.9473458720706054, "grad_norm": 5.81870174407959, "learning_rate": 1.0543409640683439e-05, "loss": 2.129019927978516, "memory(GiB)": 77.56, "step": 92135, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.437888 }, { "epoch": 3.947560087399854, "grad_norm": 6.7818989753723145, "learning_rate": 1.053927641069199e-05, "loss": 2.2010318756103517, "memory(GiB)": 77.56, "step": 92140, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.437868 }, { "epoch": 3.9477743027291035, "grad_norm": 5.320273399353027, "learning_rate": 1.0535143895568566e-05, "loss": 2.4698165893554687, "memory(GiB)": 77.56, "step": 92145, "token_acc": 0.47701149425287354, "train_speed(iter/s)": 1.437861 }, { "epoch": 3.9479885180583523, "grad_norm": 6.498271942138672, "learning_rate": 1.0531012095388032e-05, "loss": 2.415632629394531, "memory(GiB)": 77.56, "step": 92150, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.948202733387601, "grad_norm": 6.241724967956543, "learning_rate": 1.0526881010225242e-05, "loss": 2.2276315689086914, "memory(GiB)": 77.56, "step": 92155, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 1.437866 }, { "epoch": 3.9484169487168503, "grad_norm": 5.565863132476807, "learning_rate": 1.0522750640155016e-05, "loss": 2.213591384887695, "memory(GiB)": 77.56, "step": 92160, "token_acc": 0.5524691358024691, "train_speed(iter/s)": 1.437872 }, { "epoch": 3.948631164046099, "grad_norm": 11.722769737243652, "learning_rate": 1.0518620985252208e-05, "loss": 2.277585029602051, "memory(GiB)": 77.56, "step": 92165, "token_acc": 0.5126353790613718, "train_speed(iter/s)": 1.437863 }, { "epoch": 3.948845379375348, "grad_norm": 6.0761237144470215, "learning_rate": 1.0514492045591617e-05, "loss": 2.217298126220703, "memory(GiB)": 77.56, "step": 92170, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.43784 }, { "epoch": 3.949059594704597, "grad_norm": 6.448444843292236, "learning_rate": 1.0510363821248042e-05, "loss": 2.4353891372680665, "memory(GiB)": 77.56, "step": 92175, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 1.437851 }, { "epoch": 3.949273810033846, "grad_norm": 4.6907958984375, "learning_rate": 1.050623631229627e-05, "loss": 2.1077457427978517, "memory(GiB)": 77.56, "step": 92180, "token_acc": 0.5296296296296297, "train_speed(iter/s)": 1.437862 }, { "epoch": 3.949488025363095, "grad_norm": 7.817314624786377, "learning_rate": 1.0502109518811065e-05, "loss": 2.628652572631836, "memory(GiB)": 77.56, "step": 92185, "token_acc": 0.48507462686567165, "train_speed(iter/s)": 1.437877 }, { "epoch": 3.949702240692344, "grad_norm": 4.980259418487549, "learning_rate": 1.049798344086721e-05, "loss": 2.049240303039551, "memory(GiB)": 77.56, "step": 92190, "token_acc": 0.5265306122448979, "train_speed(iter/s)": 1.437875 }, { "epoch": 3.949916456021593, "grad_norm": 4.446688175201416, "learning_rate": 1.0493858078539442e-05, "loss": 2.167164993286133, "memory(GiB)": 77.56, "step": 92195, "token_acc": 0.5, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.9501306713508417, "grad_norm": 5.079987049102783, "learning_rate": 1.0489733431902494e-05, "loss": 2.47097110748291, "memory(GiB)": 77.56, "step": 92200, "token_acc": 0.5143769968051118, "train_speed(iter/s)": 1.437855 }, { "epoch": 3.950344886680091, "grad_norm": 6.935995578765869, "learning_rate": 1.0485609501031074e-05, "loss": 2.2259145736694337, "memory(GiB)": 77.56, "step": 92205, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.43785 }, { "epoch": 3.95055910200934, "grad_norm": 4.849509239196777, "learning_rate": 1.048148628599992e-05, "loss": 2.2015085220336914, "memory(GiB)": 77.56, "step": 92210, "token_acc": 0.5746268656716418, "train_speed(iter/s)": 1.437835 }, { "epoch": 3.9507733173385886, "grad_norm": 4.599462985992432, "learning_rate": 1.0477363786883709e-05, "loss": 2.5279514312744142, "memory(GiB)": 77.56, "step": 92215, "token_acc": 0.4881656804733728, "train_speed(iter/s)": 1.437825 }, { "epoch": 3.950987532667838, "grad_norm": 5.7188401222229, "learning_rate": 1.0473242003757123e-05, "loss": 2.378571319580078, "memory(GiB)": 77.56, "step": 92220, "token_acc": 0.5, "train_speed(iter/s)": 1.437839 }, { "epoch": 3.9512017479970867, "grad_norm": 6.001686096191406, "learning_rate": 1.0469120936694843e-05, "loss": 2.128190803527832, "memory(GiB)": 77.56, "step": 92225, "token_acc": 0.5301204819277109, "train_speed(iter/s)": 1.437837 }, { "epoch": 3.9514159633263355, "grad_norm": 6.048079013824463, "learning_rate": 1.0465000585771522e-05, "loss": 2.2242752075195313, "memory(GiB)": 77.56, "step": 92230, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.437848 }, { "epoch": 3.9516301786555847, "grad_norm": 4.913375377655029, "learning_rate": 1.0460880951061807e-05, "loss": 2.4028850555419923, "memory(GiB)": 77.56, "step": 92235, "token_acc": 0.476056338028169, "train_speed(iter/s)": 1.437865 }, { "epoch": 3.9518443939848336, "grad_norm": 4.643843173980713, "learning_rate": 1.0456762032640322e-05, "loss": 2.3900989532470702, "memory(GiB)": 77.56, "step": 92240, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.437863 }, { "epoch": 3.9520586093140824, "grad_norm": 7.745403289794922, "learning_rate": 1.0452643830581672e-05, "loss": 2.1904064178466798, "memory(GiB)": 77.56, "step": 92245, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.437882 }, { "epoch": 3.9522728246433316, "grad_norm": 5.026733875274658, "learning_rate": 1.0448526344960491e-05, "loss": 2.5203929901123048, "memory(GiB)": 77.56, "step": 92250, "token_acc": 0.5031645569620253, "train_speed(iter/s)": 1.4379 }, { "epoch": 3.9524870399725804, "grad_norm": 7.009357929229736, "learning_rate": 1.0444409575851355e-05, "loss": 2.217828369140625, "memory(GiB)": 77.56, "step": 92255, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.437896 }, { "epoch": 3.9527012553018293, "grad_norm": 6.079318046569824, "learning_rate": 1.0440293523328847e-05, "loss": 2.33267822265625, "memory(GiB)": 77.56, "step": 92260, "token_acc": 0.5086206896551724, "train_speed(iter/s)": 1.437908 }, { "epoch": 3.9529154706310785, "grad_norm": 6.795281410217285, "learning_rate": 1.0436178187467538e-05, "loss": 2.4412967681884767, "memory(GiB)": 77.56, "step": 92265, "token_acc": 0.5018050541516246, "train_speed(iter/s)": 1.437918 }, { "epoch": 3.9531296859603273, "grad_norm": 5.610692501068115, "learning_rate": 1.043206356834197e-05, "loss": 2.1294960021972655, "memory(GiB)": 77.56, "step": 92270, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.437929 }, { "epoch": 3.953343901289576, "grad_norm": 9.385631561279297, "learning_rate": 1.0427949666026677e-05, "loss": 2.221689987182617, "memory(GiB)": 77.56, "step": 92275, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 1.437938 }, { "epoch": 3.9535581166188254, "grad_norm": 5.482063293457031, "learning_rate": 1.0423836480596195e-05, "loss": 2.5405521392822266, "memory(GiB)": 77.56, "step": 92280, "token_acc": 0.468013468013468, "train_speed(iter/s)": 1.437956 }, { "epoch": 3.953772331948074, "grad_norm": 8.201769828796387, "learning_rate": 1.0419724012125053e-05, "loss": 2.4377384185791016, "memory(GiB)": 77.56, "step": 92285, "token_acc": 0.4758842443729904, "train_speed(iter/s)": 1.437944 }, { "epoch": 3.953986547277323, "grad_norm": 6.548315048217773, "learning_rate": 1.041561226068774e-05, "loss": 2.113050842285156, "memory(GiB)": 77.56, "step": 92290, "token_acc": 0.53156146179402, "train_speed(iter/s)": 1.437955 }, { "epoch": 3.9542007626065723, "grad_norm": 5.980075836181641, "learning_rate": 1.041150122635875e-05, "loss": 2.5393014907836915, "memory(GiB)": 77.56, "step": 92295, "token_acc": 0.4628099173553719, "train_speed(iter/s)": 1.437961 }, { "epoch": 3.954414977935821, "grad_norm": 5.306587219238281, "learning_rate": 1.0407390909212544e-05, "loss": 2.3953643798828126, "memory(GiB)": 77.56, "step": 92300, "token_acc": 0.5115511551155115, "train_speed(iter/s)": 1.437963 }, { "epoch": 3.95462919326507, "grad_norm": 4.704372882843018, "learning_rate": 1.0403281309323587e-05, "loss": 2.6436405181884766, "memory(GiB)": 77.56, "step": 92305, "token_acc": 0.43440233236151604, "train_speed(iter/s)": 1.437965 }, { "epoch": 3.954843408594319, "grad_norm": 5.839838027954102, "learning_rate": 1.0399172426766341e-05, "loss": 2.354899597167969, "memory(GiB)": 77.56, "step": 92310, "token_acc": 0.5105740181268882, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.955057623923568, "grad_norm": 7.160571575164795, "learning_rate": 1.039506426161524e-05, "loss": 2.512831115722656, "memory(GiB)": 77.56, "step": 92315, "token_acc": 0.49070631970260226, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.955271839252817, "grad_norm": 6.297356605529785, "learning_rate": 1.0390956813944697e-05, "loss": 2.2826522827148437, "memory(GiB)": 77.56, "step": 92320, "token_acc": 0.5016077170418006, "train_speed(iter/s)": 1.437976 }, { "epoch": 3.955486054582066, "grad_norm": 6.495451927185059, "learning_rate": 1.038685008382913e-05, "loss": 2.4069850921630858, "memory(GiB)": 77.56, "step": 92325, "token_acc": 0.5, "train_speed(iter/s)": 1.437988 }, { "epoch": 3.955700269911315, "grad_norm": 5.984065055847168, "learning_rate": 1.038274407134292e-05, "loss": 2.331298065185547, "memory(GiB)": 77.56, "step": 92330, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.437974 }, { "epoch": 3.9559144852405637, "grad_norm": 12.185142517089844, "learning_rate": 1.0378638776560483e-05, "loss": 2.315629577636719, "memory(GiB)": 77.56, "step": 92335, "token_acc": 0.48616600790513836, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.956128700569813, "grad_norm": 5.467767715454102, "learning_rate": 1.0374534199556168e-05, "loss": 2.4507892608642576, "memory(GiB)": 77.56, "step": 92340, "token_acc": 0.5047619047619047, "train_speed(iter/s)": 1.437976 }, { "epoch": 3.9563429158990617, "grad_norm": 7.182713985443115, "learning_rate": 1.0370430340404336e-05, "loss": 2.116982078552246, "memory(GiB)": 77.56, "step": 92345, "token_acc": 0.5634920634920635, "train_speed(iter/s)": 1.437966 }, { "epoch": 3.9565571312283105, "grad_norm": 5.4907755851745605, "learning_rate": 1.0366327199179338e-05, "loss": 2.619407844543457, "memory(GiB)": 77.56, "step": 92350, "token_acc": 0.45652173913043476, "train_speed(iter/s)": 1.437969 }, { "epoch": 3.95677134655756, "grad_norm": 5.2013421058654785, "learning_rate": 1.0362224775955486e-05, "loss": 2.4770137786865236, "memory(GiB)": 77.56, "step": 92355, "token_acc": 0.43727598566308246, "train_speed(iter/s)": 1.43797 }, { "epoch": 3.9569855618868086, "grad_norm": 5.84731912612915, "learning_rate": 1.035812307080713e-05, "loss": 2.2171327590942385, "memory(GiB)": 77.56, "step": 92360, "token_acc": 0.5080385852090032, "train_speed(iter/s)": 1.437999 }, { "epoch": 3.9571997772160574, "grad_norm": 7.181849479675293, "learning_rate": 1.0354022083808551e-05, "loss": 2.052858352661133, "memory(GiB)": 77.56, "step": 92365, "token_acc": 0.549520766773163, "train_speed(iter/s)": 1.438006 }, { "epoch": 3.9574139925453067, "grad_norm": 5.451623916625977, "learning_rate": 1.0349921815034063e-05, "loss": 2.466476821899414, "memory(GiB)": 77.56, "step": 92370, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.437994 }, { "epoch": 3.9576282078745555, "grad_norm": 5.480225086212158, "learning_rate": 1.0345822264557936e-05, "loss": 2.6819793701171877, "memory(GiB)": 77.56, "step": 92375, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.437994 }, { "epoch": 3.9578424232038043, "grad_norm": 6.137053966522217, "learning_rate": 1.0341723432454437e-05, "loss": 2.3277715682983398, "memory(GiB)": 77.56, "step": 92380, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.437987 }, { "epoch": 3.9580566385330536, "grad_norm": 6.884665489196777, "learning_rate": 1.0337625318797822e-05, "loss": 2.2621673583984374, "memory(GiB)": 77.56, "step": 92385, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.9582708538623024, "grad_norm": 5.110538005828857, "learning_rate": 1.0333527923662318e-05, "loss": 2.251437187194824, "memory(GiB)": 77.56, "step": 92390, "token_acc": 0.5226586102719033, "train_speed(iter/s)": 1.437959 }, { "epoch": 3.958485069191551, "grad_norm": 7.737443923950195, "learning_rate": 1.0329431247122179e-05, "loss": 2.5528079986572267, "memory(GiB)": 77.56, "step": 92395, "token_acc": 0.45104895104895104, "train_speed(iter/s)": 1.437965 }, { "epoch": 3.9586992845208004, "grad_norm": 6.978601932525635, "learning_rate": 1.0325335289251602e-05, "loss": 2.841652679443359, "memory(GiB)": 77.56, "step": 92400, "token_acc": 0.4268292682926829, "train_speed(iter/s)": 1.437964 }, { "epoch": 3.9589134998500493, "grad_norm": 5.924506664276123, "learning_rate": 1.0321240050124797e-05, "loss": 2.1365234375, "memory(GiB)": 77.56, "step": 92405, "token_acc": 0.5305164319248826, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.959127715179298, "grad_norm": 8.098037719726562, "learning_rate": 1.0317145529815952e-05, "loss": 2.109018898010254, "memory(GiB)": 77.56, "step": 92410, "token_acc": 0.548, "train_speed(iter/s)": 1.437993 }, { "epoch": 3.9593419305085473, "grad_norm": 6.439405918121338, "learning_rate": 1.0313051728399243e-05, "loss": 2.1036882400512695, "memory(GiB)": 77.56, "step": 92415, "token_acc": 0.5017064846416383, "train_speed(iter/s)": 1.437999 }, { "epoch": 3.959556145837796, "grad_norm": 6.563499927520752, "learning_rate": 1.0308958645948814e-05, "loss": 2.0365978240966798, "memory(GiB)": 77.56, "step": 92420, "token_acc": 0.5909090909090909, "train_speed(iter/s)": 1.438011 }, { "epoch": 3.959770361167045, "grad_norm": 6.739819049835205, "learning_rate": 1.030486628253885e-05, "loss": 2.4002443313598634, "memory(GiB)": 77.56, "step": 92425, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.438022 }, { "epoch": 3.959984576496294, "grad_norm": 5.5007781982421875, "learning_rate": 1.0300774638243454e-05, "loss": 2.4650896072387694, "memory(GiB)": 77.56, "step": 92430, "token_acc": 0.501628664495114, "train_speed(iter/s)": 1.438004 }, { "epoch": 3.960198791825543, "grad_norm": 4.279637813568115, "learning_rate": 1.0296683713136779e-05, "loss": 1.9827146530151367, "memory(GiB)": 77.56, "step": 92435, "token_acc": 0.5673758865248227, "train_speed(iter/s)": 1.438005 }, { "epoch": 3.960413007154792, "grad_norm": 7.978905200958252, "learning_rate": 1.0292593507292924e-05, "loss": 2.2765472412109373, "memory(GiB)": 77.56, "step": 92440, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.437978 }, { "epoch": 3.960627222484041, "grad_norm": 5.935574054718018, "learning_rate": 1.0288504020785983e-05, "loss": 2.584377670288086, "memory(GiB)": 77.56, "step": 92445, "token_acc": 0.4402332361516035, "train_speed(iter/s)": 1.437981 }, { "epoch": 3.96084143781329, "grad_norm": 5.890320777893066, "learning_rate": 1.0284415253690032e-05, "loss": 2.4728418350219727, "memory(GiB)": 77.56, "step": 92450, "token_acc": 0.5078864353312302, "train_speed(iter/s)": 1.437976 }, { "epoch": 3.961055653142539, "grad_norm": 4.8769683837890625, "learning_rate": 1.0280327206079172e-05, "loss": 1.948649787902832, "memory(GiB)": 77.56, "step": 92455, "token_acc": 0.5642857142857143, "train_speed(iter/s)": 1.437983 }, { "epoch": 3.961269868471788, "grad_norm": 7.111137390136719, "learning_rate": 1.0276239878027443e-05, "loss": 2.459162712097168, "memory(GiB)": 77.56, "step": 92460, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 1.437991 }, { "epoch": 3.961484083801037, "grad_norm": 5.310895919799805, "learning_rate": 1.0272153269608892e-05, "loss": 2.1758729934692385, "memory(GiB)": 77.56, "step": 92465, "token_acc": 0.5396341463414634, "train_speed(iter/s)": 1.438016 }, { "epoch": 3.961698299130286, "grad_norm": 6.717132091522217, "learning_rate": 1.0268067380897556e-05, "loss": 2.1248960494995117, "memory(GiB)": 77.56, "step": 92470, "token_acc": 0.5321428571428571, "train_speed(iter/s)": 1.438009 }, { "epoch": 3.961912514459535, "grad_norm": 5.88025426864624, "learning_rate": 1.0263982211967433e-05, "loss": 2.3237077713012697, "memory(GiB)": 77.56, "step": 92475, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.43801 }, { "epoch": 3.9621267297887837, "grad_norm": 6.892989635467529, "learning_rate": 1.0259897762892562e-05, "loss": 2.2981101989746096, "memory(GiB)": 77.56, "step": 92480, "token_acc": 0.5353159851301115, "train_speed(iter/s)": 1.438019 }, { "epoch": 3.962340945118033, "grad_norm": 5.83104133605957, "learning_rate": 1.0255814033746925e-05, "loss": 2.2894113540649412, "memory(GiB)": 77.56, "step": 92485, "token_acc": 0.48179271708683474, "train_speed(iter/s)": 1.438026 }, { "epoch": 3.9625551604472817, "grad_norm": 6.483590602874756, "learning_rate": 1.0251731024604495e-05, "loss": 2.3141902923583983, "memory(GiB)": 77.56, "step": 92490, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.438018 }, { "epoch": 3.9627693757765305, "grad_norm": 5.813948631286621, "learning_rate": 1.0247648735539245e-05, "loss": 2.0924434661865234, "memory(GiB)": 77.56, "step": 92495, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.438018 }, { "epoch": 3.96298359110578, "grad_norm": 7.791220664978027, "learning_rate": 1.0243567166625117e-05, "loss": 2.3897720336914063, "memory(GiB)": 77.56, "step": 92500, "token_acc": 0.458041958041958, "train_speed(iter/s)": 1.43804 }, { "epoch": 3.96298359110578, "eval_loss": 1.9803563356399536, "eval_runtime": 14.5488, "eval_samples_per_second": 6.873, "eval_steps_per_second": 6.873, "eval_token_acc": 0.5121212121212121, "step": 92500 }, { "epoch": 3.9631978064350286, "grad_norm": 5.973801136016846, "learning_rate": 1.0239486317936081e-05, "loss": 2.1581859588623047, "memory(GiB)": 77.56, "step": 92505, "token_acc": 0.5138004246284501, "train_speed(iter/s)": 1.437695 }, { "epoch": 3.9634120217642774, "grad_norm": 5.572224140167236, "learning_rate": 1.0235406189546031e-05, "loss": 2.3361495971679687, "memory(GiB)": 77.56, "step": 92510, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.437706 }, { "epoch": 3.9636262370935267, "grad_norm": 5.9734978675842285, "learning_rate": 1.0231326781528916e-05, "loss": 2.18347225189209, "memory(GiB)": 77.56, "step": 92515, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 1.437718 }, { "epoch": 3.9638404524227755, "grad_norm": 8.143786430358887, "learning_rate": 1.022724809395862e-05, "loss": 2.1307796478271483, "memory(GiB)": 77.56, "step": 92520, "token_acc": 0.5337837837837838, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.9640546677520243, "grad_norm": 5.335572242736816, "learning_rate": 1.0223170126909038e-05, "loss": 2.262030601501465, "memory(GiB)": 77.56, "step": 92525, "token_acc": 0.5349544072948328, "train_speed(iter/s)": 1.437721 }, { "epoch": 3.9642688830812736, "grad_norm": 6.198302268981934, "learning_rate": 1.021909288045404e-05, "loss": 2.2171550750732423, "memory(GiB)": 77.56, "step": 92530, "token_acc": 0.5402298850574713, "train_speed(iter/s)": 1.437723 }, { "epoch": 3.9644830984105224, "grad_norm": 4.79813814163208, "learning_rate": 1.0215016354667477e-05, "loss": 2.2162036895751953, "memory(GiB)": 77.56, "step": 92535, "token_acc": 0.546583850931677, "train_speed(iter/s)": 1.437714 }, { "epoch": 3.964697313739771, "grad_norm": 6.9610514640808105, "learning_rate": 1.0210940549623227e-05, "loss": 2.520808792114258, "memory(GiB)": 77.56, "step": 92540, "token_acc": 0.4778156996587031, "train_speed(iter/s)": 1.437707 }, { "epoch": 3.9649115290690204, "grad_norm": 6.620671272277832, "learning_rate": 1.020686546539512e-05, "loss": 2.3940929412841796, "memory(GiB)": 77.56, "step": 92545, "token_acc": 0.4866666666666667, "train_speed(iter/s)": 1.437721 }, { "epoch": 3.9651257443982693, "grad_norm": 6.23718786239624, "learning_rate": 1.0202791102056969e-05, "loss": 2.2938634872436525, "memory(GiB)": 77.56, "step": 92550, "token_acc": 0.4885057471264368, "train_speed(iter/s)": 1.437728 }, { "epoch": 3.965339959727518, "grad_norm": 5.441056251525879, "learning_rate": 1.0198717459682594e-05, "loss": 2.1483461380004885, "memory(GiB)": 77.56, "step": 92555, "token_acc": 0.5405405405405406, "train_speed(iter/s)": 1.437734 }, { "epoch": 3.9655541750567673, "grad_norm": 6.096147537231445, "learning_rate": 1.0194644538345787e-05, "loss": 2.575356674194336, "memory(GiB)": 77.56, "step": 92560, "token_acc": 0.47865853658536583, "train_speed(iter/s)": 1.437737 }, { "epoch": 3.965768390386016, "grad_norm": 5.840301036834717, "learning_rate": 1.0190572338120318e-05, "loss": 2.2437034606933595, "memory(GiB)": 77.56, "step": 92565, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 1.43775 }, { "epoch": 3.965982605715265, "grad_norm": 6.897130966186523, "learning_rate": 1.0186500859079995e-05, "loss": 2.32926082611084, "memory(GiB)": 77.56, "step": 92570, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.966196821044514, "grad_norm": 5.886148452758789, "learning_rate": 1.018243010129854e-05, "loss": 2.294659233093262, "memory(GiB)": 77.56, "step": 92575, "token_acc": 0.5052264808362369, "train_speed(iter/s)": 1.437743 }, { "epoch": 3.966411036373763, "grad_norm": 5.895969867706299, "learning_rate": 1.017836006484973e-05, "loss": 2.4033576965332033, "memory(GiB)": 77.56, "step": 92580, "token_acc": 0.47634069400630913, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.966625251703012, "grad_norm": 7.525404930114746, "learning_rate": 1.0174290749807281e-05, "loss": 2.4287345886230467, "memory(GiB)": 77.56, "step": 92585, "token_acc": 0.44360902255639095, "train_speed(iter/s)": 1.437755 }, { "epoch": 3.966839467032261, "grad_norm": 5.77642297744751, "learning_rate": 1.0170222156244914e-05, "loss": 2.517479133605957, "memory(GiB)": 77.56, "step": 92590, "token_acc": 0.46, "train_speed(iter/s)": 1.437767 }, { "epoch": 3.96705368236151, "grad_norm": 7.495279788970947, "learning_rate": 1.0166154284236324e-05, "loss": 2.8124372482299806, "memory(GiB)": 77.56, "step": 92595, "token_acc": 0.4411764705882353, "train_speed(iter/s)": 1.437768 }, { "epoch": 3.9672678976907587, "grad_norm": 4.941127777099609, "learning_rate": 1.016208713385523e-05, "loss": 2.3981298446655273, "memory(GiB)": 77.56, "step": 92600, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.437783 }, { "epoch": 3.967482113020008, "grad_norm": 6.305639743804932, "learning_rate": 1.0158020705175298e-05, "loss": 2.4088714599609373, "memory(GiB)": 77.56, "step": 92605, "token_acc": 0.4876325088339223, "train_speed(iter/s)": 1.437787 }, { "epoch": 3.967696328349257, "grad_norm": 6.702791690826416, "learning_rate": 1.01539549982702e-05, "loss": 2.3219625473022463, "memory(GiB)": 77.56, "step": 92610, "token_acc": 0.5434083601286174, "train_speed(iter/s)": 1.437788 }, { "epoch": 3.9679105436785056, "grad_norm": 5.356317520141602, "learning_rate": 1.014989001321358e-05, "loss": 2.5029346466064455, "memory(GiB)": 77.56, "step": 92615, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.437779 }, { "epoch": 3.968124759007755, "grad_norm": 5.056755065917969, "learning_rate": 1.0145825750079075e-05, "loss": 2.2346929550170898, "memory(GiB)": 77.56, "step": 92620, "token_acc": 0.5064935064935064, "train_speed(iter/s)": 1.437791 }, { "epoch": 3.9683389743370037, "grad_norm": 4.774623870849609, "learning_rate": 1.0141762208940337e-05, "loss": 2.6570507049560548, "memory(GiB)": 77.56, "step": 92625, "token_acc": 0.4778481012658228, "train_speed(iter/s)": 1.437789 }, { "epoch": 3.9685531896662525, "grad_norm": 6.32444429397583, "learning_rate": 1.0137699389870963e-05, "loss": 2.4473831176757814, "memory(GiB)": 77.56, "step": 92630, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.437799 }, { "epoch": 3.9687674049955017, "grad_norm": 7.622020721435547, "learning_rate": 1.013363729294456e-05, "loss": 2.42104434967041, "memory(GiB)": 77.56, "step": 92635, "token_acc": 0.49800796812749004, "train_speed(iter/s)": 1.437795 }, { "epoch": 3.9689816203247505, "grad_norm": 6.257937908172607, "learning_rate": 1.0129575918234713e-05, "loss": 2.2202287673950196, "memory(GiB)": 77.56, "step": 92640, "token_acc": 0.5435435435435435, "train_speed(iter/s)": 1.437797 }, { "epoch": 3.9691958356539994, "grad_norm": 5.764307498931885, "learning_rate": 1.0125515265815005e-05, "loss": 2.311808395385742, "memory(GiB)": 77.56, "step": 92645, "token_acc": 0.5323741007194245, "train_speed(iter/s)": 1.437795 }, { "epoch": 3.9694100509832486, "grad_norm": 7.226253032684326, "learning_rate": 1.0121455335758978e-05, "loss": 2.1958854675292967, "memory(GiB)": 77.56, "step": 92650, "token_acc": 0.506578947368421, "train_speed(iter/s)": 1.437795 }, { "epoch": 3.9696242663124974, "grad_norm": 6.088156223297119, "learning_rate": 1.0117396128140194e-05, "loss": 2.3840669631958007, "memory(GiB)": 77.56, "step": 92655, "token_acc": 0.49814126394052044, "train_speed(iter/s)": 1.437806 }, { "epoch": 3.9698384816417462, "grad_norm": 6.461009979248047, "learning_rate": 1.0113337643032206e-05, "loss": 2.5208641052246095, "memory(GiB)": 77.56, "step": 92660, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 1.437806 }, { "epoch": 3.9700526969709955, "grad_norm": 5.879950046539307, "learning_rate": 1.0109279880508527e-05, "loss": 2.29910945892334, "memory(GiB)": 77.56, "step": 92665, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.43781 }, { "epoch": 3.9702669123002443, "grad_norm": 5.410061836242676, "learning_rate": 1.010522284064266e-05, "loss": 2.1926912307739257, "memory(GiB)": 77.56, "step": 92670, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 1.437813 }, { "epoch": 3.970481127629493, "grad_norm": 5.707328796386719, "learning_rate": 1.0101166523508104e-05, "loss": 2.2371553421020507, "memory(GiB)": 77.56, "step": 92675, "token_acc": 0.48582995951417, "train_speed(iter/s)": 1.43782 }, { "epoch": 3.9706953429587424, "grad_norm": 5.6087846755981445, "learning_rate": 1.0097110929178332e-05, "loss": 2.560465431213379, "memory(GiB)": 77.56, "step": 92680, "token_acc": 0.4856115107913669, "train_speed(iter/s)": 1.437842 }, { "epoch": 3.970909558287991, "grad_norm": 6.74299430847168, "learning_rate": 1.009305605772684e-05, "loss": 2.0412059783935548, "memory(GiB)": 77.56, "step": 92685, "token_acc": 0.5539033457249071, "train_speed(iter/s)": 1.437831 }, { "epoch": 3.97112377361724, "grad_norm": 5.704792499542236, "learning_rate": 1.008900190922708e-05, "loss": 2.361504554748535, "memory(GiB)": 77.56, "step": 92690, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.437829 }, { "epoch": 3.9713379889464893, "grad_norm": 5.117986679077148, "learning_rate": 1.0084948483752482e-05, "loss": 2.521312713623047, "memory(GiB)": 77.56, "step": 92695, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 1.437825 }, { "epoch": 3.971552204275738, "grad_norm": 8.231768608093262, "learning_rate": 1.0080895781376493e-05, "loss": 2.7389270782470705, "memory(GiB)": 77.56, "step": 92700, "token_acc": 0.4358974358974359, "train_speed(iter/s)": 1.437851 }, { "epoch": 3.971766419604987, "grad_norm": 6.550184726715088, "learning_rate": 1.007684380217252e-05, "loss": 2.3109886169433596, "memory(GiB)": 77.56, "step": 92705, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.43787 }, { "epoch": 3.971980634934236, "grad_norm": 5.8178629875183105, "learning_rate": 1.007279254621396e-05, "loss": 2.345347785949707, "memory(GiB)": 77.56, "step": 92710, "token_acc": 0.5030120481927711, "train_speed(iter/s)": 1.437864 }, { "epoch": 3.972194850263485, "grad_norm": 6.1667866706848145, "learning_rate": 1.0068742013574229e-05, "loss": 2.200144577026367, "memory(GiB)": 77.56, "step": 92715, "token_acc": 0.5234375, "train_speed(iter/s)": 1.437864 }, { "epoch": 3.9724090655927338, "grad_norm": 5.679183006286621, "learning_rate": 1.0064692204326699e-05, "loss": 2.0072067260742186, "memory(GiB)": 77.56, "step": 92720, "token_acc": 0.5838926174496645, "train_speed(iter/s)": 1.437871 }, { "epoch": 3.972623280921983, "grad_norm": 6.569579601287842, "learning_rate": 1.006064311854471e-05, "loss": 2.315715026855469, "memory(GiB)": 77.56, "step": 92725, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.437859 }, { "epoch": 3.972837496251232, "grad_norm": 5.709552764892578, "learning_rate": 1.0056594756301658e-05, "loss": 2.257454681396484, "memory(GiB)": 77.56, "step": 92730, "token_acc": 0.5214007782101168, "train_speed(iter/s)": 1.437866 }, { "epoch": 3.9730517115804806, "grad_norm": 5.594107627868652, "learning_rate": 1.0052547117670863e-05, "loss": 2.382841110229492, "memory(GiB)": 77.56, "step": 92735, "token_acc": 0.4896142433234421, "train_speed(iter/s)": 1.437876 }, { "epoch": 3.97326592690973, "grad_norm": 5.784968852996826, "learning_rate": 1.004850020272563e-05, "loss": 2.4212446212768555, "memory(GiB)": 77.56, "step": 92740, "token_acc": 0.4785100286532951, "train_speed(iter/s)": 1.437865 }, { "epoch": 3.9734801422389787, "grad_norm": 6.823126316070557, "learning_rate": 1.0044454011539317e-05, "loss": 2.487283706665039, "memory(GiB)": 77.56, "step": 92745, "token_acc": 0.46308724832214765, "train_speed(iter/s)": 1.437876 }, { "epoch": 3.9736943575682275, "grad_norm": 6.894071102142334, "learning_rate": 1.0040408544185192e-05, "loss": 2.0663173675537108, "memory(GiB)": 77.56, "step": 92750, "token_acc": 0.54, "train_speed(iter/s)": 1.437868 }, { "epoch": 3.973908572897477, "grad_norm": 5.159613132476807, "learning_rate": 1.0036363800736554e-05, "loss": 2.4883251190185547, "memory(GiB)": 77.56, "step": 92755, "token_acc": 0.5046728971962616, "train_speed(iter/s)": 1.437863 }, { "epoch": 3.9741227882267256, "grad_norm": 6.901095390319824, "learning_rate": 1.0032319781266674e-05, "loss": 2.4273557662963867, "memory(GiB)": 77.56, "step": 92760, "token_acc": 0.4912891986062718, "train_speed(iter/s)": 1.437861 }, { "epoch": 3.9743370035559744, "grad_norm": 4.95836877822876, "learning_rate": 1.0028276485848803e-05, "loss": 2.6295169830322265, "memory(GiB)": 77.56, "step": 92765, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 1.437881 }, { "epoch": 3.9745512188852237, "grad_norm": 9.252575874328613, "learning_rate": 1.0024233914556213e-05, "loss": 2.670606994628906, "memory(GiB)": 77.56, "step": 92770, "token_acc": 0.44054054054054054, "train_speed(iter/s)": 1.437874 }, { "epoch": 3.9747654342144725, "grad_norm": 5.035546779632568, "learning_rate": 1.0020192067462125e-05, "loss": 2.4046119689941405, "memory(GiB)": 77.56, "step": 92775, "token_acc": 0.4632768361581921, "train_speed(iter/s)": 1.43788 }, { "epoch": 3.9749796495437213, "grad_norm": 5.519577503204346, "learning_rate": 1.0016150944639763e-05, "loss": 2.6053440093994142, "memory(GiB)": 77.56, "step": 92780, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 1.437891 }, { "epoch": 3.9751938648729706, "grad_norm": 5.951094150543213, "learning_rate": 1.0012110546162334e-05, "loss": 2.1103424072265624, "memory(GiB)": 77.56, "step": 92785, "token_acc": 0.5580736543909348, "train_speed(iter/s)": 1.437887 }, { "epoch": 3.9754080802022194, "grad_norm": 5.941181659698486, "learning_rate": 1.0008070872103032e-05, "loss": 2.21679630279541, "memory(GiB)": 77.56, "step": 92790, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.437898 }, { "epoch": 3.975622295531468, "grad_norm": 5.627597332000732, "learning_rate": 1.0004031922535029e-05, "loss": 2.382151412963867, "memory(GiB)": 77.56, "step": 92795, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.437899 }, { "epoch": 3.9758365108607174, "grad_norm": 5.793582916259766, "learning_rate": 9.999993697531512e-06, "loss": 2.309271240234375, "memory(GiB)": 77.56, "step": 92800, "token_acc": 0.48909657320872274, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.9760507261899662, "grad_norm": 5.2836833000183105, "learning_rate": 9.99595619716564e-06, "loss": 2.372360038757324, "memory(GiB)": 77.56, "step": 92805, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.437915 }, { "epoch": 3.976264941519215, "grad_norm": 7.119791030883789, "learning_rate": 9.991919421510547e-06, "loss": 2.4161401748657227, "memory(GiB)": 77.56, "step": 92810, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.437893 }, { "epoch": 3.9764791568484643, "grad_norm": 5.952172756195068, "learning_rate": 9.987883370639368e-06, "loss": 1.9757646560668944, "memory(GiB)": 77.56, "step": 92815, "token_acc": 0.5891472868217055, "train_speed(iter/s)": 1.437897 }, { "epoch": 3.976693372177713, "grad_norm": 7.871514320373535, "learning_rate": 9.983848044625216e-06, "loss": 2.3188039779663088, "memory(GiB)": 77.56, "step": 92820, "token_acc": 0.49201277955271566, "train_speed(iter/s)": 1.437911 }, { "epoch": 3.976907587506962, "grad_norm": 4.459116458892822, "learning_rate": 9.97981344354118e-06, "loss": 2.309185028076172, "memory(GiB)": 77.56, "step": 92825, "token_acc": 0.5210727969348659, "train_speed(iter/s)": 1.437908 }, { "epoch": 3.977121802836211, "grad_norm": 4.845858573913574, "learning_rate": 9.97577956746038e-06, "loss": 2.2914608001708983, "memory(GiB)": 77.56, "step": 92830, "token_acc": 0.4969512195121951, "train_speed(iter/s)": 1.437915 }, { "epoch": 3.97733601816546, "grad_norm": 6.601243495941162, "learning_rate": 9.971746416455879e-06, "loss": 2.3751863479614257, "memory(GiB)": 77.56, "step": 92835, "token_acc": 0.5308219178082192, "train_speed(iter/s)": 1.437911 }, { "epoch": 3.977550233494709, "grad_norm": 7.530718803405762, "learning_rate": 9.96771399060074e-06, "loss": 2.3008981704711915, "memory(GiB)": 77.56, "step": 92840, "token_acc": 0.5055762081784386, "train_speed(iter/s)": 1.437927 }, { "epoch": 3.977764448823958, "grad_norm": 7.323751449584961, "learning_rate": 9.963682289968018e-06, "loss": 2.1395172119140624, "memory(GiB)": 77.56, "step": 92845, "token_acc": 0.5019762845849802, "train_speed(iter/s)": 1.437928 }, { "epoch": 3.977978664153207, "grad_norm": 5.858083248138428, "learning_rate": 9.959651314630747e-06, "loss": 2.4996669769287108, "memory(GiB)": 77.56, "step": 92850, "token_acc": 0.4690909090909091, "train_speed(iter/s)": 1.437952 }, { "epoch": 3.9781928794824557, "grad_norm": 5.75156307220459, "learning_rate": 9.955621064661935e-06, "loss": 2.37884578704834, "memory(GiB)": 77.56, "step": 92855, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.437977 }, { "epoch": 3.978407094811705, "grad_norm": 6.420896530151367, "learning_rate": 9.95159154013463e-06, "loss": 2.270208168029785, "memory(GiB)": 77.56, "step": 92860, "token_acc": 0.5034246575342466, "train_speed(iter/s)": 1.437959 }, { "epoch": 3.9786213101409538, "grad_norm": 5.730321407318115, "learning_rate": 9.947562741121813e-06, "loss": 2.3636148452758787, "memory(GiB)": 77.56, "step": 92865, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.437961 }, { "epoch": 3.9788355254702026, "grad_norm": 6.030406951904297, "learning_rate": 9.943534667696459e-06, "loss": 2.534542274475098, "memory(GiB)": 77.56, "step": 92870, "token_acc": 0.4576271186440678, "train_speed(iter/s)": 1.437956 }, { "epoch": 3.979049740799452, "grad_norm": 4.293148994445801, "learning_rate": 9.939507319931557e-06, "loss": 2.248686981201172, "memory(GiB)": 77.56, "step": 92875, "token_acc": 0.5, "train_speed(iter/s)": 1.437951 }, { "epoch": 3.9792639561287007, "grad_norm": 6.239877223968506, "learning_rate": 9.935480697900063e-06, "loss": 2.353713607788086, "memory(GiB)": 77.56, "step": 92880, "token_acc": 0.5111821086261981, "train_speed(iter/s)": 1.43796 }, { "epoch": 3.9794781714579495, "grad_norm": 6.3617682456970215, "learning_rate": 9.931454801674906e-06, "loss": 2.4965335845947267, "memory(GiB)": 77.56, "step": 92885, "token_acc": 0.4664429530201342, "train_speed(iter/s)": 1.437968 }, { "epoch": 3.9796923867871987, "grad_norm": 7.433448314666748, "learning_rate": 9.92742963132905e-06, "loss": 2.303684616088867, "memory(GiB)": 77.56, "step": 92890, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.437968 }, { "epoch": 3.9799066021164475, "grad_norm": 6.065186977386475, "learning_rate": 9.923405186935392e-06, "loss": 2.179836654663086, "memory(GiB)": 77.56, "step": 92895, "token_acc": 0.5336134453781513, "train_speed(iter/s)": 1.437971 }, { "epoch": 3.9801208174456963, "grad_norm": 5.21151876449585, "learning_rate": 9.919381468566846e-06, "loss": 2.4848758697509767, "memory(GiB)": 77.56, "step": 92900, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.437981 }, { "epoch": 3.9803350327749456, "grad_norm": 6.633542060852051, "learning_rate": 9.9153584762963e-06, "loss": 2.4606199264526367, "memory(GiB)": 77.56, "step": 92905, "token_acc": 0.4630225080385852, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.9805492481041944, "grad_norm": 5.363124847412109, "learning_rate": 9.911336210196626e-06, "loss": 2.0713830947875977, "memory(GiB)": 77.56, "step": 92910, "token_acc": 0.5266666666666666, "train_speed(iter/s)": 1.437989 }, { "epoch": 3.9807634634334432, "grad_norm": 6.806240081787109, "learning_rate": 9.907314670340717e-06, "loss": 2.5236743927001952, "memory(GiB)": 77.56, "step": 92915, "token_acc": 0.4676923076923077, "train_speed(iter/s)": 1.43798 }, { "epoch": 3.9809776787626925, "grad_norm": 5.791401386260986, "learning_rate": 9.903293856801415e-06, "loss": 2.592616081237793, "memory(GiB)": 77.56, "step": 92920, "token_acc": 0.444794952681388, "train_speed(iter/s)": 1.437968 }, { "epoch": 3.9811918940919413, "grad_norm": 8.250869750976562, "learning_rate": 9.899273769651557e-06, "loss": 2.500656509399414, "memory(GiB)": 77.56, "step": 92925, "token_acc": 0.46863468634686345, "train_speed(iter/s)": 1.437976 }, { "epoch": 3.98140610942119, "grad_norm": 6.224512577056885, "learning_rate": 9.895254408963972e-06, "loss": 2.297273063659668, "memory(GiB)": 77.56, "step": 92930, "token_acc": 0.5077881619937694, "train_speed(iter/s)": 1.437992 }, { "epoch": 3.9816203247504394, "grad_norm": 5.6414923667907715, "learning_rate": 9.891235774811474e-06, "loss": 2.342025566101074, "memory(GiB)": 77.56, "step": 92935, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.437993 }, { "epoch": 3.981834540079688, "grad_norm": 6.357151985168457, "learning_rate": 9.88721786726685e-06, "loss": 2.497633934020996, "memory(GiB)": 77.56, "step": 92940, "token_acc": 0.5178571428571429, "train_speed(iter/s)": 1.43799 }, { "epoch": 3.982048755408937, "grad_norm": 5.937796592712402, "learning_rate": 9.883200686402905e-06, "loss": 2.355463981628418, "memory(GiB)": 77.56, "step": 92945, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.43798 }, { "epoch": 3.9822629707381862, "grad_norm": 5.1598124504089355, "learning_rate": 9.879184232292426e-06, "loss": 2.229428291320801, "memory(GiB)": 77.56, "step": 92950, "token_acc": 0.49673202614379086, "train_speed(iter/s)": 1.437983 }, { "epoch": 3.982477186067435, "grad_norm": 7.349733352661133, "learning_rate": 9.875168505008159e-06, "loss": 2.4955867767333983, "memory(GiB)": 77.56, "step": 92955, "token_acc": 0.5246478873239436, "train_speed(iter/s)": 1.437984 }, { "epoch": 3.982691401396684, "grad_norm": 5.181321620941162, "learning_rate": 9.87115350462286e-06, "loss": 2.5198835372924804, "memory(GiB)": 77.56, "step": 92960, "token_acc": 0.4597315436241611, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.982905616725933, "grad_norm": 8.551186561584473, "learning_rate": 9.867139231209255e-06, "loss": 2.369637680053711, "memory(GiB)": 77.56, "step": 92965, "token_acc": 0.5, "train_speed(iter/s)": 1.437995 }, { "epoch": 3.983119832055182, "grad_norm": 5.37523078918457, "learning_rate": 9.863125684840059e-06, "loss": 2.2184019088745117, "memory(GiB)": 77.56, "step": 92970, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.9833340473844308, "grad_norm": 5.884568214416504, "learning_rate": 9.859112865588e-06, "loss": 2.433843994140625, "memory(GiB)": 77.56, "step": 92975, "token_acc": 0.5206349206349207, "train_speed(iter/s)": 1.438013 }, { "epoch": 3.98354826271368, "grad_norm": 4.879148006439209, "learning_rate": 9.855100773525771e-06, "loss": 2.370548439025879, "memory(GiB)": 77.56, "step": 92980, "token_acc": 0.49441340782122906, "train_speed(iter/s)": 1.438009 }, { "epoch": 3.983762478042929, "grad_norm": 5.4018330574035645, "learning_rate": 9.851089408726055e-06, "loss": 2.40179328918457, "memory(GiB)": 77.56, "step": 92985, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 1.437993 }, { "epoch": 3.9839766933721776, "grad_norm": 5.380850315093994, "learning_rate": 9.847078771261509e-06, "loss": 2.7432432174682617, "memory(GiB)": 77.56, "step": 92990, "token_acc": 0.41317365269461076, "train_speed(iter/s)": 1.437998 }, { "epoch": 3.984190908701427, "grad_norm": 7.40653133392334, "learning_rate": 9.843068861204801e-06, "loss": 2.4660552978515624, "memory(GiB)": 77.56, "step": 92995, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.438005 }, { "epoch": 3.9844051240306757, "grad_norm": 6.670955181121826, "learning_rate": 9.839059678628559e-06, "loss": 2.178139877319336, "memory(GiB)": 77.56, "step": 93000, "token_acc": 0.528169014084507, "train_speed(iter/s)": 1.43801 }, { "epoch": 3.9844051240306757, "eval_loss": 2.020324945449829, "eval_runtime": 14.5715, "eval_samples_per_second": 6.863, "eval_steps_per_second": 6.863, "eval_token_acc": 0.510989010989011, "step": 93000 }, { "epoch": 3.9846193393599245, "grad_norm": 6.374236583709717, "learning_rate": 9.83505122360544e-06, "loss": 2.3164394378662108, "memory(GiB)": 77.56, "step": 93005, "token_acc": 0.5042735042735043, "train_speed(iter/s)": 1.437648 }, { "epoch": 3.9848335546891738, "grad_norm": 5.763950347900391, "learning_rate": 9.831043496208042e-06, "loss": 2.2330081939697264, "memory(GiB)": 77.56, "step": 93010, "token_acc": 0.5268817204301075, "train_speed(iter/s)": 1.437643 }, { "epoch": 3.9850477700184226, "grad_norm": 5.373263359069824, "learning_rate": 9.827036496508973e-06, "loss": 2.189457321166992, "memory(GiB)": 77.56, "step": 93015, "token_acc": 0.5236486486486487, "train_speed(iter/s)": 1.437655 }, { "epoch": 3.9852619853476714, "grad_norm": 5.882124423980713, "learning_rate": 9.823030224580809e-06, "loss": 2.4792526245117186, "memory(GiB)": 77.56, "step": 93020, "token_acc": 0.48242811501597443, "train_speed(iter/s)": 1.437637 }, { "epoch": 3.9854762006769207, "grad_norm": 6.168560028076172, "learning_rate": 9.819024680496159e-06, "loss": 2.3294500350952148, "memory(GiB)": 77.56, "step": 93025, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.437635 }, { "epoch": 3.9856904160061695, "grad_norm": 4.794861793518066, "learning_rate": 9.815019864327551e-06, "loss": 2.4881208419799803, "memory(GiB)": 77.56, "step": 93030, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.437648 }, { "epoch": 3.9859046313354183, "grad_norm": 4.418397426605225, "learning_rate": 9.81101577614757e-06, "loss": 2.4073436737060545, "memory(GiB)": 77.56, "step": 93035, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.437645 }, { "epoch": 3.9861188466646675, "grad_norm": 5.007444381713867, "learning_rate": 9.807012416028739e-06, "loss": 2.1383548736572267, "memory(GiB)": 77.56, "step": 93040, "token_acc": 0.5189393939393939, "train_speed(iter/s)": 1.437648 }, { "epoch": 3.9863330619939163, "grad_norm": 6.416004657745361, "learning_rate": 9.803009784043581e-06, "loss": 2.5978683471679687, "memory(GiB)": 77.56, "step": 93045, "token_acc": 0.47337278106508873, "train_speed(iter/s)": 1.437653 }, { "epoch": 3.986547277323165, "grad_norm": 5.443553447723389, "learning_rate": 9.799007880264611e-06, "loss": 2.232929801940918, "memory(GiB)": 77.56, "step": 93050, "token_acc": 0.5195195195195195, "train_speed(iter/s)": 1.437666 }, { "epoch": 3.9867614926524144, "grad_norm": 7.6471381187438965, "learning_rate": 9.79500670476431e-06, "loss": 2.5438961029052733, "memory(GiB)": 77.56, "step": 93055, "token_acc": 0.48641304347826086, "train_speed(iter/s)": 1.437676 }, { "epoch": 3.9869757079816632, "grad_norm": 5.956918239593506, "learning_rate": 9.791006257615192e-06, "loss": 2.233340835571289, "memory(GiB)": 77.56, "step": 93060, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.437676 }, { "epoch": 3.987189923310912, "grad_norm": 7.244227886199951, "learning_rate": 9.787006538889714e-06, "loss": 2.315345573425293, "memory(GiB)": 77.56, "step": 93065, "token_acc": 0.5256410256410257, "train_speed(iter/s)": 1.437685 }, { "epoch": 3.9874041386401613, "grad_norm": 5.253133296966553, "learning_rate": 9.783007548660338e-06, "loss": 2.043259048461914, "memory(GiB)": 77.56, "step": 93070, "token_acc": 0.5480427046263345, "train_speed(iter/s)": 1.4377 }, { "epoch": 3.98761835396941, "grad_norm": 5.82855749130249, "learning_rate": 9.779009286999501e-06, "loss": 2.2444684982299803, "memory(GiB)": 77.56, "step": 93075, "token_acc": 0.5487804878048781, "train_speed(iter/s)": 1.437697 }, { "epoch": 3.987832569298659, "grad_norm": 4.557111740112305, "learning_rate": 9.775011753979646e-06, "loss": 2.2299251556396484, "memory(GiB)": 77.56, "step": 93080, "token_acc": 0.562962962962963, "train_speed(iter/s)": 1.437693 }, { "epoch": 3.988046784627908, "grad_norm": 6.188343524932861, "learning_rate": 9.771014949673174e-06, "loss": 2.198384666442871, "memory(GiB)": 77.56, "step": 93085, "token_acc": 0.47509578544061304, "train_speed(iter/s)": 1.437691 }, { "epoch": 3.988260999957157, "grad_norm": 7.0401082038879395, "learning_rate": 9.767018874152517e-06, "loss": 2.136541748046875, "memory(GiB)": 77.56, "step": 93090, "token_acc": 0.5, "train_speed(iter/s)": 1.437709 }, { "epoch": 3.988475215286406, "grad_norm": 5.74680757522583, "learning_rate": 9.76302352749004e-06, "loss": 2.3570327758789062, "memory(GiB)": 77.56, "step": 93095, "token_acc": 0.5213414634146342, "train_speed(iter/s)": 1.437723 }, { "epoch": 3.988689430615655, "grad_norm": 19.408462524414062, "learning_rate": 9.75902890975815e-06, "loss": 2.376866912841797, "memory(GiB)": 77.56, "step": 93100, "token_acc": 0.532319391634981, "train_speed(iter/s)": 1.437731 }, { "epoch": 3.988903645944904, "grad_norm": 5.173044204711914, "learning_rate": 9.755035021029202e-06, "loss": 2.2135377883911134, "memory(GiB)": 77.56, "step": 93105, "token_acc": 0.525974025974026, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.9891178612741527, "grad_norm": 6.243752956390381, "learning_rate": 9.751041861375549e-06, "loss": 2.541492462158203, "memory(GiB)": 77.56, "step": 93110, "token_acc": 0.5160256410256411, "train_speed(iter/s)": 1.437724 }, { "epoch": 3.989332076603402, "grad_norm": 8.36265754699707, "learning_rate": 9.74704943086951e-06, "loss": 2.5464977264404296, "memory(GiB)": 77.56, "step": 93115, "token_acc": 0.475177304964539, "train_speed(iter/s)": 1.437718 }, { "epoch": 3.9895462919326508, "grad_norm": 5.7020978927612305, "learning_rate": 9.74305772958345e-06, "loss": 2.017613983154297, "memory(GiB)": 77.56, "step": 93120, "token_acc": 0.5720338983050848, "train_speed(iter/s)": 1.437715 }, { "epoch": 3.9897605072618996, "grad_norm": 6.958425998687744, "learning_rate": 9.73906675758966e-06, "loss": 2.670576477050781, "memory(GiB)": 77.56, "step": 93125, "token_acc": 0.4405594405594406, "train_speed(iter/s)": 1.437698 }, { "epoch": 3.989974722591149, "grad_norm": 6.194924354553223, "learning_rate": 9.735076514960445e-06, "loss": 2.3503650665283202, "memory(GiB)": 77.56, "step": 93130, "token_acc": 0.5136186770428015, "train_speed(iter/s)": 1.437712 }, { "epoch": 3.9901889379203976, "grad_norm": 5.379251003265381, "learning_rate": 9.731087001768085e-06, "loss": 2.2071304321289062, "memory(GiB)": 77.56, "step": 93135, "token_acc": 0.53156146179402, "train_speed(iter/s)": 1.437718 }, { "epoch": 3.9904031532496465, "grad_norm": 7.103784561157227, "learning_rate": 9.727098218084868e-06, "loss": 2.3284521102905273, "memory(GiB)": 77.56, "step": 93140, "token_acc": 0.5121212121212121, "train_speed(iter/s)": 1.437719 }, { "epoch": 3.9906173685788957, "grad_norm": 4.897048473358154, "learning_rate": 9.72311016398303e-06, "loss": 2.3074663162231444, "memory(GiB)": 77.56, "step": 93145, "token_acc": 0.514018691588785, "train_speed(iter/s)": 1.437746 }, { "epoch": 3.9908315839081445, "grad_norm": 6.873358249664307, "learning_rate": 9.719122839534844e-06, "loss": 2.3732961654663085, "memory(GiB)": 77.56, "step": 93150, "token_acc": 0.48985507246376814, "train_speed(iter/s)": 1.437747 }, { "epoch": 3.9910457992373933, "grad_norm": 5.830979824066162, "learning_rate": 9.715136244812533e-06, "loss": 2.592213821411133, "memory(GiB)": 77.56, "step": 93155, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.437744 }, { "epoch": 3.9912600145666426, "grad_norm": 5.25096321105957, "learning_rate": 9.711150379888328e-06, "loss": 2.093849945068359, "memory(GiB)": 77.56, "step": 93160, "token_acc": 0.5298245614035088, "train_speed(iter/s)": 1.437754 }, { "epoch": 3.9914742298958914, "grad_norm": 5.936777591705322, "learning_rate": 9.70716524483441e-06, "loss": 2.481352424621582, "memory(GiB)": 77.56, "step": 93165, "token_acc": 0.47126436781609193, "train_speed(iter/s)": 1.437752 }, { "epoch": 3.99168844522514, "grad_norm": 9.324991226196289, "learning_rate": 9.703180839723003e-06, "loss": 2.2618932723999023, "memory(GiB)": 77.56, "step": 93170, "token_acc": 0.5049833887043189, "train_speed(iter/s)": 1.437752 }, { "epoch": 3.9919026605543895, "grad_norm": 5.5073041915893555, "learning_rate": 9.699197164626261e-06, "loss": 2.340204048156738, "memory(GiB)": 77.56, "step": 93175, "token_acc": 0.4723926380368098, "train_speed(iter/s)": 1.437725 }, { "epoch": 3.9921168758836383, "grad_norm": 5.616020202636719, "learning_rate": 9.695214219616383e-06, "loss": 2.6657670974731444, "memory(GiB)": 77.56, "step": 93180, "token_acc": 0.4300341296928328, "train_speed(iter/s)": 1.43772 }, { "epoch": 3.992331091212887, "grad_norm": 4.7828369140625, "learning_rate": 9.69123200476551e-06, "loss": 2.3973552703857424, "memory(GiB)": 77.56, "step": 93185, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.437738 }, { "epoch": 3.9925453065421364, "grad_norm": 5.4492573738098145, "learning_rate": 9.687250520145785e-06, "loss": 2.4097145080566404, "memory(GiB)": 77.56, "step": 93190, "token_acc": 0.5092250922509225, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.992759521871385, "grad_norm": 4.831192493438721, "learning_rate": 9.683269765829328e-06, "loss": 2.4074705123901365, "memory(GiB)": 77.56, "step": 93195, "token_acc": 0.46417445482866043, "train_speed(iter/s)": 1.437721 }, { "epoch": 3.992973737200634, "grad_norm": 5.701098918914795, "learning_rate": 9.679289741888248e-06, "loss": 2.3720630645751952, "memory(GiB)": 77.56, "step": 93200, "token_acc": 0.5162337662337663, "train_speed(iter/s)": 1.437715 }, { "epoch": 3.9931879525298832, "grad_norm": 5.549937725067139, "learning_rate": 9.67531044839467e-06, "loss": 2.468387794494629, "memory(GiB)": 77.56, "step": 93205, "token_acc": 0.4296028880866426, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.993402167859132, "grad_norm": 6.998640537261963, "learning_rate": 9.67133188542067e-06, "loss": 2.3003664016723633, "memory(GiB)": 77.56, "step": 93210, "token_acc": 0.5019157088122606, "train_speed(iter/s)": 1.43772 }, { "epoch": 3.993616383188381, "grad_norm": 6.147166728973389, "learning_rate": 9.667354053038329e-06, "loss": 2.1802204132080076, "memory(GiB)": 77.56, "step": 93215, "token_acc": 0.5493421052631579, "train_speed(iter/s)": 1.437736 }, { "epoch": 3.99383059851763, "grad_norm": 5.608270645141602, "learning_rate": 9.663376951319702e-06, "loss": 1.9577875137329102, "memory(GiB)": 77.56, "step": 93220, "token_acc": 0.5537190082644629, "train_speed(iter/s)": 1.43774 }, { "epoch": 3.994044813846879, "grad_norm": 5.10335636138916, "learning_rate": 9.659400580336836e-06, "loss": 2.0693607330322266, "memory(GiB)": 77.56, "step": 93225, "token_acc": 0.5616438356164384, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.9942590291761277, "grad_norm": 5.3745436668396, "learning_rate": 9.655424940161761e-06, "loss": 2.2429805755615235, "memory(GiB)": 77.56, "step": 93230, "token_acc": 0.5016501650165016, "train_speed(iter/s)": 1.437719 }, { "epoch": 3.994473244505377, "grad_norm": 4.842333793640137, "learning_rate": 9.651450030866527e-06, "loss": 2.092479705810547, "memory(GiB)": 77.56, "step": 93235, "token_acc": 0.5173501577287066, "train_speed(iter/s)": 1.437728 }, { "epoch": 3.994687459834626, "grad_norm": 6.195989608764648, "learning_rate": 9.647475852523109e-06, "loss": 2.0949399948120115, "memory(GiB)": 77.56, "step": 93240, "token_acc": 0.5303030303030303, "train_speed(iter/s)": 1.437717 }, { "epoch": 3.9949016751638746, "grad_norm": 5.4378275871276855, "learning_rate": 9.64350240520353e-06, "loss": 2.0547292709350584, "memory(GiB)": 77.56, "step": 93245, "token_acc": 0.5273972602739726, "train_speed(iter/s)": 1.437716 }, { "epoch": 3.995115890493124, "grad_norm": 4.592390060424805, "learning_rate": 9.639529688979765e-06, "loss": 2.140955352783203, "memory(GiB)": 77.56, "step": 93250, "token_acc": 0.5402298850574713, "train_speed(iter/s)": 1.437719 }, { "epoch": 3.9953301058223727, "grad_norm": 5.867456436157227, "learning_rate": 9.63555770392378e-06, "loss": 2.4671438217163084, "memory(GiB)": 77.56, "step": 93255, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.437727 }, { "epoch": 3.9955443211516215, "grad_norm": 4.547386169433594, "learning_rate": 9.63158645010752e-06, "loss": 2.209004592895508, "memory(GiB)": 77.56, "step": 93260, "token_acc": 0.5, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.9957585364808708, "grad_norm": 4.398655414581299, "learning_rate": 9.627615927602957e-06, "loss": 2.3498098373413088, "memory(GiB)": 77.56, "step": 93265, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.437708 }, { "epoch": 3.9959727518101196, "grad_norm": 7.089205265045166, "learning_rate": 9.623646136482e-06, "loss": 2.0108156204223633, "memory(GiB)": 77.56, "step": 93270, "token_acc": 0.5541666666666667, "train_speed(iter/s)": 1.437728 }, { "epoch": 3.9961869671393684, "grad_norm": 5.16269063949585, "learning_rate": 9.619677076816568e-06, "loss": 2.542580223083496, "memory(GiB)": 77.56, "step": 93275, "token_acc": 0.4822485207100592, "train_speed(iter/s)": 1.437707 }, { "epoch": 3.9964011824686176, "grad_norm": 6.381258964538574, "learning_rate": 9.615708748678565e-06, "loss": 2.168239974975586, "memory(GiB)": 77.56, "step": 93280, "token_acc": 0.5467625899280576, "train_speed(iter/s)": 1.437722 }, { "epoch": 3.9966153977978665, "grad_norm": 5.963865756988525, "learning_rate": 9.611741152139886e-06, "loss": 2.6363449096679688, "memory(GiB)": 77.56, "step": 93285, "token_acc": 0.4588235294117647, "train_speed(iter/s)": 1.437714 }, { "epoch": 3.9968296131271153, "grad_norm": 7.664267539978027, "learning_rate": 9.607774287272386e-06, "loss": 2.522052001953125, "memory(GiB)": 77.56, "step": 93290, "token_acc": 0.48945147679324896, "train_speed(iter/s)": 1.437712 }, { "epoch": 3.9970438284563645, "grad_norm": 4.96965217590332, "learning_rate": 9.603808154147958e-06, "loss": 2.44207763671875, "memory(GiB)": 77.56, "step": 93295, "token_acc": 0.4358974358974359, "train_speed(iter/s)": 1.437737 }, { "epoch": 3.9972580437856133, "grad_norm": 5.467095851898193, "learning_rate": 9.599842752838444e-06, "loss": 2.2445674896240235, "memory(GiB)": 77.56, "step": 93300, "token_acc": 0.5376044568245125, "train_speed(iter/s)": 1.437736 }, { "epoch": 3.997472259114862, "grad_norm": 6.500300407409668, "learning_rate": 9.595878083415676e-06, "loss": 2.3761457443237304, "memory(GiB)": 77.56, "step": 93305, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.437745 }, { "epoch": 3.9976864744441114, "grad_norm": 5.187769889831543, "learning_rate": 9.59191414595148e-06, "loss": 2.103641700744629, "memory(GiB)": 77.56, "step": 93310, "token_acc": 0.5311475409836065, "train_speed(iter/s)": 1.437738 }, { "epoch": 3.99790068977336, "grad_norm": 6.747546195983887, "learning_rate": 9.587950940517648e-06, "loss": 2.561017608642578, "memory(GiB)": 77.56, "step": 93315, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.437749 }, { "epoch": 3.998114905102609, "grad_norm": 7.278005123138428, "learning_rate": 9.583988467185995e-06, "loss": 2.38443603515625, "memory(GiB)": 77.56, "step": 93320, "token_acc": 0.4730878186968839, "train_speed(iter/s)": 1.43777 }, { "epoch": 3.9983291204318583, "grad_norm": 8.352123260498047, "learning_rate": 9.580026726028318e-06, "loss": 2.7073802947998047, "memory(GiB)": 77.56, "step": 93325, "token_acc": 0.4605678233438486, "train_speed(iter/s)": 1.437769 }, { "epoch": 3.998543335761107, "grad_norm": 6.051680088043213, "learning_rate": 9.576065717116378e-06, "loss": 2.3547239303588867, "memory(GiB)": 77.56, "step": 93330, "token_acc": 0.48299319727891155, "train_speed(iter/s)": 1.437758 }, { "epoch": 3.998757551090356, "grad_norm": 7.034042835235596, "learning_rate": 9.572105440521922e-06, "loss": 2.5420087814331054, "memory(GiB)": 77.56, "step": 93335, "token_acc": 0.5, "train_speed(iter/s)": 1.437774 }, { "epoch": 3.998971766419605, "grad_norm": 6.215058326721191, "learning_rate": 9.568145896316704e-06, "loss": 2.1047475814819334, "memory(GiB)": 77.56, "step": 93340, "token_acc": 0.5130718954248366, "train_speed(iter/s)": 1.43776 }, { "epoch": 3.999185981748854, "grad_norm": 7.614290714263916, "learning_rate": 9.56418708457244e-06, "loss": 2.460840034484863, "memory(GiB)": 77.56, "step": 93345, "token_acc": 0.5153846153846153, "train_speed(iter/s)": 1.43779 }, { "epoch": 3.999400197078103, "grad_norm": 6.198672771453857, "learning_rate": 9.560229005360866e-06, "loss": 2.296146583557129, "memory(GiB)": 77.56, "step": 93350, "token_acc": 0.47333333333333333, "train_speed(iter/s)": 1.437805 }, { "epoch": 3.999614412407352, "grad_norm": 5.386726379394531, "learning_rate": 9.556271658753685e-06, "loss": 2.3075128555297852, "memory(GiB)": 77.56, "step": 93355, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.437798 }, { "epoch": 3.999828627736601, "grad_norm": 5.895130634307861, "learning_rate": 9.552315044822579e-06, "loss": 2.198731231689453, "memory(GiB)": 77.56, "step": 93360, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 1.437814 }, { "epoch": 4.00004284306585, "grad_norm": 5.6926727294921875, "learning_rate": 9.548359163639226e-06, "loss": 2.057406997680664, "memory(GiB)": 77.56, "step": 93365, "token_acc": 0.5617977528089888, "train_speed(iter/s)": 1.437809 }, { "epoch": 4.000257058395099, "grad_norm": 6.77676248550415, "learning_rate": 9.5444040152753e-06, "loss": 2.1082324981689453, "memory(GiB)": 77.56, "step": 93370, "token_acc": 0.5057915057915058, "train_speed(iter/s)": 1.437811 }, { "epoch": 4.000471273724347, "grad_norm": 6.574366092681885, "learning_rate": 9.540449599802426e-06, "loss": 2.222467041015625, "memory(GiB)": 77.56, "step": 93375, "token_acc": 0.5288135593220339, "train_speed(iter/s)": 1.437821 }, { "epoch": 4.0006854890535966, "grad_norm": 6.4058027267456055, "learning_rate": 9.536495917292283e-06, "loss": 2.3530227661132814, "memory(GiB)": 77.56, "step": 93380, "token_acc": 0.47333333333333333, "train_speed(iter/s)": 1.437837 }, { "epoch": 4.000899704382846, "grad_norm": 4.862844467163086, "learning_rate": 9.532542967816466e-06, "loss": 2.409433937072754, "memory(GiB)": 77.56, "step": 93385, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.437845 }, { "epoch": 4.001113919712094, "grad_norm": 5.503967761993408, "learning_rate": 9.528590751446587e-06, "loss": 2.4542724609375, "memory(GiB)": 77.56, "step": 93390, "token_acc": 0.49557522123893805, "train_speed(iter/s)": 1.437864 }, { "epoch": 4.001328135041343, "grad_norm": 7.940217971801758, "learning_rate": 9.524639268254259e-06, "loss": 2.7267017364501953, "memory(GiB)": 77.56, "step": 93395, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.437871 }, { "epoch": 4.001542350370593, "grad_norm": 5.918318271636963, "learning_rate": 9.520688518311065e-06, "loss": 2.1060964584350588, "memory(GiB)": 77.56, "step": 93400, "token_acc": 0.5583941605839416, "train_speed(iter/s)": 1.43788 }, { "epoch": 4.001756565699841, "grad_norm": 7.177742958068848, "learning_rate": 9.51673850168856e-06, "loss": 2.380371856689453, "memory(GiB)": 77.56, "step": 93405, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.437889 }, { "epoch": 4.00197078102909, "grad_norm": 4.978150844573975, "learning_rate": 9.512789218458323e-06, "loss": 2.395379638671875, "memory(GiB)": 77.56, "step": 93410, "token_acc": 0.4774774774774775, "train_speed(iter/s)": 1.437904 }, { "epoch": 4.00218499635834, "grad_norm": 7.750488758087158, "learning_rate": 9.508840668691887e-06, "loss": 2.1341812133789064, "memory(GiB)": 77.56, "step": 93415, "token_acc": 0.5531135531135531, "train_speed(iter/s)": 1.437911 }, { "epoch": 4.002399211687588, "grad_norm": 5.484640121459961, "learning_rate": 9.50489285246079e-06, "loss": 2.158664512634277, "memory(GiB)": 77.56, "step": 93420, "token_acc": 0.6095890410958904, "train_speed(iter/s)": 1.43792 }, { "epoch": 4.002613427016837, "grad_norm": 5.87212610244751, "learning_rate": 9.500945769836545e-06, "loss": 2.2428680419921876, "memory(GiB)": 77.56, "step": 93425, "token_acc": 0.5257352941176471, "train_speed(iter/s)": 1.437921 }, { "epoch": 4.0028276423460865, "grad_norm": 5.944907188415527, "learning_rate": 9.496999420890646e-06, "loss": 1.8895322799682617, "memory(GiB)": 77.56, "step": 93430, "token_acc": 0.5977443609022557, "train_speed(iter/s)": 1.437894 }, { "epoch": 4.003041857675335, "grad_norm": 4.8054633140563965, "learning_rate": 9.493053805694608e-06, "loss": 2.293653106689453, "memory(GiB)": 77.56, "step": 93435, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.437904 }, { "epoch": 4.003256073004584, "grad_norm": 8.297558784484863, "learning_rate": 9.4891089243199e-06, "loss": 2.5106374740600588, "memory(GiB)": 77.56, "step": 93440, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.437902 }, { "epoch": 4.003470288333833, "grad_norm": 6.737578868865967, "learning_rate": 9.48516477683799e-06, "loss": 2.484136199951172, "memory(GiB)": 77.56, "step": 93445, "token_acc": 0.49038461538461536, "train_speed(iter/s)": 1.437901 }, { "epoch": 4.003684503663082, "grad_norm": 7.1392107009887695, "learning_rate": 9.48122136332032e-06, "loss": 2.4458948135375977, "memory(GiB)": 77.56, "step": 93450, "token_acc": 0.5246478873239436, "train_speed(iter/s)": 1.437908 }, { "epoch": 4.003898718992331, "grad_norm": 7.190907955169678, "learning_rate": 9.477278683838341e-06, "loss": 2.170952796936035, "memory(GiB)": 77.56, "step": 93455, "token_acc": 0.5719844357976653, "train_speed(iter/s)": 1.437916 }, { "epoch": 4.00411293432158, "grad_norm": 4.597238063812256, "learning_rate": 9.473336738463457e-06, "loss": 2.47666015625, "memory(GiB)": 77.56, "step": 93460, "token_acc": 0.5159420289855072, "train_speed(iter/s)": 1.437924 }, { "epoch": 4.004327149650829, "grad_norm": 8.489082336425781, "learning_rate": 9.469395527267089e-06, "loss": 2.493121337890625, "memory(GiB)": 77.56, "step": 93465, "token_acc": 0.4940119760479042, "train_speed(iter/s)": 1.437931 }, { "epoch": 4.004541364980078, "grad_norm": 6.523462295532227, "learning_rate": 9.46545505032066e-06, "loss": 2.1725048065185546, "memory(GiB)": 77.56, "step": 93470, "token_acc": 0.5358490566037736, "train_speed(iter/s)": 1.437932 }, { "epoch": 4.004755580309327, "grad_norm": 4.0298237800598145, "learning_rate": 9.46151530769554e-06, "loss": 2.107798194885254, "memory(GiB)": 77.56, "step": 93475, "token_acc": 0.5449275362318841, "train_speed(iter/s)": 1.437941 }, { "epoch": 4.0049697956385755, "grad_norm": 4.946781158447266, "learning_rate": 9.457576299463094e-06, "loss": 2.421368217468262, "memory(GiB)": 77.56, "step": 93480, "token_acc": 0.47752808988764045, "train_speed(iter/s)": 1.437939 }, { "epoch": 4.005184010967825, "grad_norm": 5.446173191070557, "learning_rate": 9.45363802569469e-06, "loss": 2.1397424697875977, "memory(GiB)": 77.56, "step": 93485, "token_acc": 0.535593220338983, "train_speed(iter/s)": 1.437913 }, { "epoch": 4.005398226297074, "grad_norm": 6.718372344970703, "learning_rate": 9.449700486461649e-06, "loss": 2.100190353393555, "memory(GiB)": 77.56, "step": 93490, "token_acc": 0.5524475524475524, "train_speed(iter/s)": 1.437911 }, { "epoch": 4.005612441626322, "grad_norm": 5.494898796081543, "learning_rate": 9.44576368183534e-06, "loss": 2.1900152206420898, "memory(GiB)": 77.56, "step": 93495, "token_acc": 0.5046728971962616, "train_speed(iter/s)": 1.437921 }, { "epoch": 4.005826656955572, "grad_norm": 6.694465637207031, "learning_rate": 9.44182761188706e-06, "loss": 2.4290599822998047, "memory(GiB)": 77.56, "step": 93500, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.437896 }, { "epoch": 4.005826656955572, "eval_loss": 2.255371332168579, "eval_runtime": 13.7707, "eval_samples_per_second": 7.262, "eval_steps_per_second": 7.262, "eval_token_acc": 0.4885386819484241, "step": 93500 }, { "epoch": 4.006040872284821, "grad_norm": 6.061179161071777, "learning_rate": 9.437892276688126e-06, "loss": 2.2897716522216798, "memory(GiB)": 77.56, "step": 93505, "token_acc": 0.5053974484789009, "train_speed(iter/s)": 1.437563 }, { "epoch": 4.006255087614069, "grad_norm": 6.244830131530762, "learning_rate": 9.433957676309823e-06, "loss": 2.6214229583740236, "memory(GiB)": 77.56, "step": 93510, "token_acc": 0.4588235294117647, "train_speed(iter/s)": 1.437578 }, { "epoch": 4.0064693029433185, "grad_norm": 6.47458553314209, "learning_rate": 9.43002381082343e-06, "loss": 2.2354877471923826, "memory(GiB)": 77.56, "step": 93515, "token_acc": 0.517799352750809, "train_speed(iter/s)": 1.4376 }, { "epoch": 4.006683518272568, "grad_norm": 5.488643169403076, "learning_rate": 9.426090680300197e-06, "loss": 2.1180740356445313, "memory(GiB)": 77.56, "step": 93520, "token_acc": 0.55078125, "train_speed(iter/s)": 1.437584 }, { "epoch": 4.006897733601816, "grad_norm": 6.219763278961182, "learning_rate": 9.422158284811406e-06, "loss": 2.352944183349609, "memory(GiB)": 77.56, "step": 93525, "token_acc": 0.5492063492063493, "train_speed(iter/s)": 1.437569 }, { "epoch": 4.007111948931065, "grad_norm": 5.979735851287842, "learning_rate": 9.41822662442829e-06, "loss": 2.187449264526367, "memory(GiB)": 77.56, "step": 93530, "token_acc": 0.5228758169934641, "train_speed(iter/s)": 1.437581 }, { "epoch": 4.007326164260315, "grad_norm": 5.848051071166992, "learning_rate": 9.414295699222048e-06, "loss": 2.336979103088379, "memory(GiB)": 77.56, "step": 93535, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.437595 }, { "epoch": 4.007540379589563, "grad_norm": 5.259042739868164, "learning_rate": 9.410365509263924e-06, "loss": 2.3839765548706056, "memory(GiB)": 77.56, "step": 93540, "token_acc": 0.5088235294117647, "train_speed(iter/s)": 1.437595 }, { "epoch": 4.007754594918812, "grad_norm": 7.334307670593262, "learning_rate": 9.406436054625106e-06, "loss": 2.255917549133301, "memory(GiB)": 77.56, "step": 93545, "token_acc": 0.49242424242424243, "train_speed(iter/s)": 1.437612 }, { "epoch": 4.0079688102480615, "grad_norm": 5.4149322509765625, "learning_rate": 9.40250733537677e-06, "loss": 2.18704891204834, "memory(GiB)": 77.56, "step": 93550, "token_acc": 0.5064516129032258, "train_speed(iter/s)": 1.437637 }, { "epoch": 4.00818302557731, "grad_norm": 5.233251094818115, "learning_rate": 9.398579351590103e-06, "loss": 2.2285476684570313, "memory(GiB)": 77.56, "step": 93555, "token_acc": 0.5016181229773463, "train_speed(iter/s)": 1.43764 }, { "epoch": 4.008397240906559, "grad_norm": 7.4715800285339355, "learning_rate": 9.394652103336265e-06, "loss": 2.255872344970703, "memory(GiB)": 77.56, "step": 93560, "token_acc": 0.5298804780876494, "train_speed(iter/s)": 1.437657 }, { "epoch": 4.008611456235808, "grad_norm": 7.020638465881348, "learning_rate": 9.39072559068639e-06, "loss": 2.1696699142456053, "memory(GiB)": 77.56, "step": 93565, "token_acc": 0.5379537953795379, "train_speed(iter/s)": 1.437671 }, { "epoch": 4.008825671565057, "grad_norm": 5.364024639129639, "learning_rate": 9.386799813711617e-06, "loss": 2.167232704162598, "memory(GiB)": 77.56, "step": 93570, "token_acc": 0.5307692307692308, "train_speed(iter/s)": 1.437667 }, { "epoch": 4.009039886894306, "grad_norm": 6.392496585845947, "learning_rate": 9.382874772483052e-06, "loss": 2.1624725341796873, "memory(GiB)": 77.56, "step": 93575, "token_acc": 0.5205479452054794, "train_speed(iter/s)": 1.437678 }, { "epoch": 4.009254102223555, "grad_norm": 4.925895690917969, "learning_rate": 9.378950467071829e-06, "loss": 1.9535303115844727, "memory(GiB)": 77.56, "step": 93580, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.437681 }, { "epoch": 4.009468317552804, "grad_norm": 6.108914852142334, "learning_rate": 9.375026897549017e-06, "loss": 2.4048938751220703, "memory(GiB)": 77.56, "step": 93585, "token_acc": 0.5203488372093024, "train_speed(iter/s)": 1.437692 }, { "epoch": 4.009682532882053, "grad_norm": 7.15532922744751, "learning_rate": 9.371104063985702e-06, "loss": 2.3725967407226562, "memory(GiB)": 77.56, "step": 93590, "token_acc": 0.5154929577464789, "train_speed(iter/s)": 1.437704 }, { "epoch": 4.009896748211302, "grad_norm": 5.323001861572266, "learning_rate": 9.367181966452953e-06, "loss": 2.1099403381347654, "memory(GiB)": 77.56, "step": 93595, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.437703 }, { "epoch": 4.0101109635405505, "grad_norm": 5.8606085777282715, "learning_rate": 9.363260605021824e-06, "loss": 2.096474838256836, "memory(GiB)": 77.56, "step": 93600, "token_acc": 0.5278810408921933, "train_speed(iter/s)": 1.437717 }, { "epoch": 4.0103251788698, "grad_norm": 5.25267219543457, "learning_rate": 9.359339979763326e-06, "loss": 2.1067861557006835, "memory(GiB)": 77.56, "step": 93605, "token_acc": 0.5127388535031847, "train_speed(iter/s)": 1.437721 }, { "epoch": 4.010539394199049, "grad_norm": 4.698491096496582, "learning_rate": 9.355420090748518e-06, "loss": 2.2778825759887695, "memory(GiB)": 77.56, "step": 93610, "token_acc": 0.5337620578778135, "train_speed(iter/s)": 1.43772 }, { "epoch": 4.010753609528297, "grad_norm": 5.380602836608887, "learning_rate": 9.351500938048408e-06, "loss": 2.276262092590332, "memory(GiB)": 77.56, "step": 93615, "token_acc": 0.5287769784172662, "train_speed(iter/s)": 1.437728 }, { "epoch": 4.010967824857547, "grad_norm": 5.973597526550293, "learning_rate": 9.347582521733994e-06, "loss": 2.2164852142333986, "memory(GiB)": 77.56, "step": 93620, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 1.437714 }, { "epoch": 4.011182040186796, "grad_norm": 9.466315269470215, "learning_rate": 9.34366484187626e-06, "loss": 2.203280448913574, "memory(GiB)": 77.56, "step": 93625, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.437731 }, { "epoch": 4.011396255516045, "grad_norm": 7.253759384155273, "learning_rate": 9.339747898546169e-06, "loss": 2.2978134155273438, "memory(GiB)": 77.56, "step": 93630, "token_acc": 0.5181518151815182, "train_speed(iter/s)": 1.437718 }, { "epoch": 4.0116104708452935, "grad_norm": 5.4126410484313965, "learning_rate": 9.335831691814684e-06, "loss": 2.337779998779297, "memory(GiB)": 77.56, "step": 93635, "token_acc": 0.498220640569395, "train_speed(iter/s)": 1.437729 }, { "epoch": 4.011824686174543, "grad_norm": 8.379061698913574, "learning_rate": 9.331916221752756e-06, "loss": 2.4290111541748045, "memory(GiB)": 77.56, "step": 93640, "token_acc": 0.48615384615384616, "train_speed(iter/s)": 1.437729 }, { "epoch": 4.012038901503792, "grad_norm": 5.679766654968262, "learning_rate": 9.328001488431326e-06, "loss": 2.4866424560546876, "memory(GiB)": 77.56, "step": 93645, "token_acc": 0.5047923322683706, "train_speed(iter/s)": 1.437711 }, { "epoch": 4.01225311683304, "grad_norm": 4.610808372497559, "learning_rate": 9.324087491921296e-06, "loss": 2.3665496826171877, "memory(GiB)": 77.56, "step": 93650, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 1.437719 }, { "epoch": 4.01246733216229, "grad_norm": 6.19395637512207, "learning_rate": 9.320174232293576e-06, "loss": 2.4105527877807615, "memory(GiB)": 77.56, "step": 93655, "token_acc": 0.46959459459459457, "train_speed(iter/s)": 1.437728 }, { "epoch": 4.012681547491539, "grad_norm": 7.039161205291748, "learning_rate": 9.316261709619068e-06, "loss": 2.5016502380371093, "memory(GiB)": 77.56, "step": 93660, "token_acc": 0.4785714285714286, "train_speed(iter/s)": 1.437736 }, { "epoch": 4.012895762820787, "grad_norm": 7.270907878875732, "learning_rate": 9.312349923968628e-06, "loss": 2.511243438720703, "memory(GiB)": 77.56, "step": 93665, "token_acc": 0.4697802197802198, "train_speed(iter/s)": 1.437757 }, { "epoch": 4.013109978150037, "grad_norm": 6.870691299438477, "learning_rate": 9.308438875413145e-06, "loss": 2.474112701416016, "memory(GiB)": 77.56, "step": 93670, "token_acc": 0.4733893557422969, "train_speed(iter/s)": 1.437736 }, { "epoch": 4.013324193479286, "grad_norm": 5.674384593963623, "learning_rate": 9.304528564023468e-06, "loss": 2.4112449645996095, "memory(GiB)": 77.56, "step": 93675, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.437743 }, { "epoch": 4.013538408808534, "grad_norm": 5.993567943572998, "learning_rate": 9.30061898987043e-06, "loss": 2.24066162109375, "memory(GiB)": 77.56, "step": 93680, "token_acc": 0.5122699386503068, "train_speed(iter/s)": 1.437761 }, { "epoch": 4.013752624137783, "grad_norm": 6.70174503326416, "learning_rate": 9.296710153024846e-06, "loss": 2.271550941467285, "memory(GiB)": 77.56, "step": 93685, "token_acc": 0.5096525096525096, "train_speed(iter/s)": 1.437782 }, { "epoch": 4.013966839467033, "grad_norm": 5.078118324279785, "learning_rate": 9.292802053557553e-06, "loss": 2.5433586120605467, "memory(GiB)": 77.56, "step": 93690, "token_acc": 0.5234375, "train_speed(iter/s)": 1.437778 }, { "epoch": 4.014181054796281, "grad_norm": 6.76052188873291, "learning_rate": 9.288894691539323e-06, "loss": 2.1711614608764647, "memory(GiB)": 77.56, "step": 93695, "token_acc": 0.5687022900763359, "train_speed(iter/s)": 1.437772 }, { "epoch": 4.01439527012553, "grad_norm": 5.271911144256592, "learning_rate": 9.284988067040973e-06, "loss": 2.108733367919922, "memory(GiB)": 77.56, "step": 93700, "token_acc": 0.5622775800711743, "train_speed(iter/s)": 1.437771 }, { "epoch": 4.01460948545478, "grad_norm": 7.146370887756348, "learning_rate": 9.281082180133254e-06, "loss": 2.4907913208007812, "memory(GiB)": 77.56, "step": 93705, "token_acc": 0.4721311475409836, "train_speed(iter/s)": 1.437772 }, { "epoch": 4.014823700784028, "grad_norm": 4.833681583404541, "learning_rate": 9.27717703088693e-06, "loss": 2.4081165313720705, "memory(GiB)": 77.56, "step": 93710, "token_acc": 0.5064102564102564, "train_speed(iter/s)": 1.437783 }, { "epoch": 4.015037916113277, "grad_norm": 5.994205951690674, "learning_rate": 9.273272619372748e-06, "loss": 2.372846984863281, "memory(GiB)": 77.56, "step": 93715, "token_acc": 0.5033333333333333, "train_speed(iter/s)": 1.437774 }, { "epoch": 4.0152521314425265, "grad_norm": 8.07086181640625, "learning_rate": 9.269368945661422e-06, "loss": 2.5486459732055664, "memory(GiB)": 77.56, "step": 93720, "token_acc": 0.4468864468864469, "train_speed(iter/s)": 1.437797 }, { "epoch": 4.015466346771775, "grad_norm": 6.291993141174316, "learning_rate": 9.265466009823698e-06, "loss": 2.392086982727051, "memory(GiB)": 77.56, "step": 93725, "token_acc": 0.46418338108882523, "train_speed(iter/s)": 1.437809 }, { "epoch": 4.015680562101024, "grad_norm": 6.247165679931641, "learning_rate": 9.261563811930269e-06, "loss": 2.0815719604492187, "memory(GiB)": 77.56, "step": 93730, "token_acc": 0.5604395604395604, "train_speed(iter/s)": 1.437791 }, { "epoch": 4.015894777430273, "grad_norm": 7.932577133178711, "learning_rate": 9.257662352051827e-06, "loss": 2.198335075378418, "memory(GiB)": 77.56, "step": 93735, "token_acc": 0.55859375, "train_speed(iter/s)": 1.437781 }, { "epoch": 4.016108992759522, "grad_norm": 5.848824501037598, "learning_rate": 9.25376163025905e-06, "loss": 2.5626031875610353, "memory(GiB)": 77.56, "step": 93740, "token_acc": 0.4887005649717514, "train_speed(iter/s)": 1.437787 }, { "epoch": 4.016323208088771, "grad_norm": 5.185723304748535, "learning_rate": 9.24986164662261e-06, "loss": 2.2850088119506835, "memory(GiB)": 77.56, "step": 93745, "token_acc": 0.5213414634146342, "train_speed(iter/s)": 1.437796 }, { "epoch": 4.01653742341802, "grad_norm": 7.207386493682861, "learning_rate": 9.245962401213132e-06, "loss": 2.306472969055176, "memory(GiB)": 77.56, "step": 93750, "token_acc": 0.5224489795918368, "train_speed(iter/s)": 1.437797 }, { "epoch": 4.016751638747269, "grad_norm": 6.612720012664795, "learning_rate": 9.242063894101294e-06, "loss": 2.2329856872558596, "memory(GiB)": 77.56, "step": 93755, "token_acc": 0.5225806451612903, "train_speed(iter/s)": 1.437798 }, { "epoch": 4.016965854076518, "grad_norm": 7.3874192237854, "learning_rate": 9.238166125357689e-06, "loss": 2.7943016052246095, "memory(GiB)": 77.56, "step": 93760, "token_acc": 0.4260869565217391, "train_speed(iter/s)": 1.437808 }, { "epoch": 4.017180069405767, "grad_norm": 7.883175373077393, "learning_rate": 9.234269095052956e-06, "loss": 2.1302608489990233, "memory(GiB)": 77.56, "step": 93765, "token_acc": 0.5139442231075697, "train_speed(iter/s)": 1.437813 }, { "epoch": 4.0173942847350155, "grad_norm": 4.341423988342285, "learning_rate": 9.230372803257675e-06, "loss": 2.2670223236083986, "memory(GiB)": 77.56, "step": 93770, "token_acc": 0.5078125, "train_speed(iter/s)": 1.437814 }, { "epoch": 4.017608500064265, "grad_norm": 6.676019191741943, "learning_rate": 9.226477250042443e-06, "loss": 2.3452211380004884, "memory(GiB)": 77.56, "step": 93775, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.437821 }, { "epoch": 4.017822715393514, "grad_norm": 6.873841762542725, "learning_rate": 9.222582435477805e-06, "loss": 1.9828811645507813, "memory(GiB)": 77.56, "step": 93780, "token_acc": 0.5460526315789473, "train_speed(iter/s)": 1.43782 }, { "epoch": 4.018036930722762, "grad_norm": 7.752004146575928, "learning_rate": 9.218688359634352e-06, "loss": 2.4788864135742186, "memory(GiB)": 77.56, "step": 93785, "token_acc": 0.4758364312267658, "train_speed(iter/s)": 1.437824 }, { "epoch": 4.018251146052012, "grad_norm": 5.326835632324219, "learning_rate": 9.214795022582617e-06, "loss": 2.3073713302612306, "memory(GiB)": 77.56, "step": 93790, "token_acc": 0.49848024316109424, "train_speed(iter/s)": 1.437815 }, { "epoch": 4.018465361381261, "grad_norm": 5.026467323303223, "learning_rate": 9.210902424393131e-06, "loss": 2.0833648681640624, "memory(GiB)": 77.56, "step": 93795, "token_acc": 0.5373134328358209, "train_speed(iter/s)": 1.437812 }, { "epoch": 4.018679576710509, "grad_norm": 6.776566505432129, "learning_rate": 9.207010565136409e-06, "loss": 1.9410991668701172, "memory(GiB)": 77.56, "step": 93800, "token_acc": 0.5875486381322957, "train_speed(iter/s)": 1.437819 }, { "epoch": 4.0188937920397585, "grad_norm": 5.776382923126221, "learning_rate": 9.203119444882958e-06, "loss": 2.6139245986938477, "memory(GiB)": 77.56, "step": 93805, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.437822 }, { "epoch": 4.019108007369008, "grad_norm": 4.514953136444092, "learning_rate": 9.199229063703257e-06, "loss": 2.11966552734375, "memory(GiB)": 77.56, "step": 93810, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.437828 }, { "epoch": 4.019322222698256, "grad_norm": 5.997488975524902, "learning_rate": 9.195339421667804e-06, "loss": 2.1361522674560547, "memory(GiB)": 77.56, "step": 93815, "token_acc": 0.523943661971831, "train_speed(iter/s)": 1.437837 }, { "epoch": 4.019536438027505, "grad_norm": 8.49476432800293, "learning_rate": 9.191450518847062e-06, "loss": 2.173126983642578, "memory(GiB)": 77.56, "step": 93820, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.437846 }, { "epoch": 4.019750653356755, "grad_norm": 6.412816524505615, "learning_rate": 9.187562355311469e-06, "loss": 2.045247268676758, "memory(GiB)": 77.56, "step": 93825, "token_acc": 0.5335968379446641, "train_speed(iter/s)": 1.437855 }, { "epoch": 4.019964868686003, "grad_norm": 6.843888282775879, "learning_rate": 9.183674931131464e-06, "loss": 2.1562389373779296, "memory(GiB)": 77.56, "step": 93830, "token_acc": 0.5461538461538461, "train_speed(iter/s)": 1.437879 }, { "epoch": 4.020179084015252, "grad_norm": 7.188787460327148, "learning_rate": 9.17978824637748e-06, "loss": 2.0955738067626952, "memory(GiB)": 77.56, "step": 93835, "token_acc": 0.5481481481481482, "train_speed(iter/s)": 1.437885 }, { "epoch": 4.0203932993445015, "grad_norm": 6.331221580505371, "learning_rate": 9.175902301119922e-06, "loss": 2.2976964950561523, "memory(GiB)": 77.56, "step": 93840, "token_acc": 0.565068493150685, "train_speed(iter/s)": 1.437905 }, { "epoch": 4.02060751467375, "grad_norm": 5.83984375, "learning_rate": 9.172017095429191e-06, "loss": 2.0562944412231445, "memory(GiB)": 77.56, "step": 93845, "token_acc": 0.5830258302583026, "train_speed(iter/s)": 1.437926 }, { "epoch": 4.020821730002999, "grad_norm": 4.872024059295654, "learning_rate": 9.16813262937568e-06, "loss": 2.4306571960449217, "memory(GiB)": 77.56, "step": 93850, "token_acc": 0.4879518072289157, "train_speed(iter/s)": 1.437909 }, { "epoch": 4.021035945332248, "grad_norm": 11.080470085144043, "learning_rate": 9.164248903029742e-06, "loss": 2.336315155029297, "memory(GiB)": 77.56, "step": 93855, "token_acc": 0.49110320284697506, "train_speed(iter/s)": 1.437935 }, { "epoch": 4.021250160661497, "grad_norm": 7.394295692443848, "learning_rate": 9.16036591646175e-06, "loss": 2.073668670654297, "memory(GiB)": 77.56, "step": 93860, "token_acc": 0.547244094488189, "train_speed(iter/s)": 1.437947 }, { "epoch": 4.021464375990746, "grad_norm": 5.520383834838867, "learning_rate": 9.156483669742022e-06, "loss": 2.4991296768188476, "memory(GiB)": 77.56, "step": 93865, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.437927 }, { "epoch": 4.021678591319995, "grad_norm": 6.500822067260742, "learning_rate": 9.152602162940921e-06, "loss": 2.2651416778564455, "memory(GiB)": 77.56, "step": 93870, "token_acc": 0.5152542372881356, "train_speed(iter/s)": 1.437933 }, { "epoch": 4.021892806649244, "grad_norm": 5.594616889953613, "learning_rate": 9.148721396128751e-06, "loss": 2.369472885131836, "memory(GiB)": 77.56, "step": 93875, "token_acc": 0.48955223880597015, "train_speed(iter/s)": 1.437943 }, { "epoch": 4.022107021978493, "grad_norm": 6.3511223793029785, "learning_rate": 9.144841369375811e-06, "loss": 2.4668338775634764, "memory(GiB)": 77.56, "step": 93880, "token_acc": 0.48746518105849584, "train_speed(iter/s)": 1.437941 }, { "epoch": 4.022321237307742, "grad_norm": 5.558446884155273, "learning_rate": 9.140962082752397e-06, "loss": 2.4822521209716797, "memory(GiB)": 77.56, "step": 93885, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.437964 }, { "epoch": 4.0225354526369905, "grad_norm": 8.363835334777832, "learning_rate": 9.137083536328783e-06, "loss": 2.3325389862060546, "memory(GiB)": 77.56, "step": 93890, "token_acc": 0.5117845117845118, "train_speed(iter/s)": 1.437956 }, { "epoch": 4.02274966796624, "grad_norm": 6.519898891448975, "learning_rate": 9.133205730175221e-06, "loss": 2.4897390365600587, "memory(GiB)": 77.56, "step": 93895, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.437958 }, { "epoch": 4.022963883295489, "grad_norm": 4.96488094329834, "learning_rate": 9.129328664361985e-06, "loss": 2.0881967544555664, "memory(GiB)": 77.56, "step": 93900, "token_acc": 0.5475409836065573, "train_speed(iter/s)": 1.437952 }, { "epoch": 4.023178098624737, "grad_norm": 6.539687633514404, "learning_rate": 9.12545233895929e-06, "loss": 2.14547061920166, "memory(GiB)": 77.56, "step": 93905, "token_acc": 0.5394190871369294, "train_speed(iter/s)": 1.437955 }, { "epoch": 4.023392313953987, "grad_norm": 6.46572208404541, "learning_rate": 9.121576754037381e-06, "loss": 2.5262537002563477, "memory(GiB)": 77.56, "step": 93910, "token_acc": 0.486013986013986, "train_speed(iter/s)": 1.437944 }, { "epoch": 4.023606529283236, "grad_norm": 5.656039714813232, "learning_rate": 9.117701909666453e-06, "loss": 1.9382194519042968, "memory(GiB)": 77.56, "step": 93915, "token_acc": 0.5793650793650794, "train_speed(iter/s)": 1.437941 }, { "epoch": 4.023820744612484, "grad_norm": 5.459254264831543, "learning_rate": 9.113827805916714e-06, "loss": 2.266299247741699, "memory(GiB)": 77.56, "step": 93920, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.437903 }, { "epoch": 4.0240349599417335, "grad_norm": 6.356588363647461, "learning_rate": 9.109954442858315e-06, "loss": 2.159096336364746, "memory(GiB)": 77.56, "step": 93925, "token_acc": 0.5322033898305085, "train_speed(iter/s)": 1.43792 }, { "epoch": 4.024249175270983, "grad_norm": 6.883572578430176, "learning_rate": 9.106081820561469e-06, "loss": 2.130754852294922, "memory(GiB)": 77.56, "step": 93930, "token_acc": 0.48417721518987344, "train_speed(iter/s)": 1.437927 }, { "epoch": 4.024463390600231, "grad_norm": 6.1585493087768555, "learning_rate": 9.102209939096311e-06, "loss": 2.3141834259033205, "memory(GiB)": 77.56, "step": 93935, "token_acc": 0.528125, "train_speed(iter/s)": 1.437918 }, { "epoch": 4.02467760592948, "grad_norm": 4.8613691329956055, "learning_rate": 9.098338798532985e-06, "loss": 2.335850715637207, "memory(GiB)": 77.56, "step": 93940, "token_acc": 0.5034965034965035, "train_speed(iter/s)": 1.437926 }, { "epoch": 4.02489182125873, "grad_norm": 5.536412239074707, "learning_rate": 9.094468398941618e-06, "loss": 2.4390344619750977, "memory(GiB)": 77.56, "step": 93945, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.437927 }, { "epoch": 4.025106036587978, "grad_norm": 5.811814308166504, "learning_rate": 9.090598740392331e-06, "loss": 2.358005905151367, "memory(GiB)": 77.56, "step": 93950, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.437941 }, { "epoch": 4.025320251917227, "grad_norm": 6.258542537689209, "learning_rate": 9.086729822955214e-06, "loss": 2.34885311126709, "memory(GiB)": 77.56, "step": 93955, "token_acc": 0.5138339920948617, "train_speed(iter/s)": 1.43793 }, { "epoch": 4.025534467246477, "grad_norm": 4.842301845550537, "learning_rate": 9.082861646700374e-06, "loss": 2.214085006713867, "memory(GiB)": 77.56, "step": 93960, "token_acc": 0.5197568389057751, "train_speed(iter/s)": 1.437919 }, { "epoch": 4.025748682575725, "grad_norm": 4.775691032409668, "learning_rate": 9.078994211697883e-06, "loss": 2.193232536315918, "memory(GiB)": 77.56, "step": 93965, "token_acc": 0.5544554455445545, "train_speed(iter/s)": 1.437927 }, { "epoch": 4.025962897904974, "grad_norm": 5.557093143463135, "learning_rate": 9.075127518017796e-06, "loss": 2.130133628845215, "memory(GiB)": 77.56, "step": 93970, "token_acc": 0.535593220338983, "train_speed(iter/s)": 1.437931 }, { "epoch": 4.0261771132342234, "grad_norm": 7.165945053100586, "learning_rate": 9.07126156573016e-06, "loss": 1.7786710739135743, "memory(GiB)": 77.56, "step": 93975, "token_acc": 0.596, "train_speed(iter/s)": 1.437931 }, { "epoch": 4.026391328563472, "grad_norm": 6.821719169616699, "learning_rate": 9.06739635490501e-06, "loss": 2.408656120300293, "memory(GiB)": 77.56, "step": 93980, "token_acc": 0.5033783783783784, "train_speed(iter/s)": 1.437937 }, { "epoch": 4.026605543892721, "grad_norm": 6.899848937988281, "learning_rate": 9.06353188561237e-06, "loss": 2.173081970214844, "memory(GiB)": 77.56, "step": 93985, "token_acc": 0.5335463258785943, "train_speed(iter/s)": 1.43795 }, { "epoch": 4.02681975922197, "grad_norm": 5.928413391113281, "learning_rate": 9.059668157922262e-06, "loss": 2.167915916442871, "memory(GiB)": 77.56, "step": 93990, "token_acc": 0.5222222222222223, "train_speed(iter/s)": 1.437963 }, { "epoch": 4.027033974551219, "grad_norm": 5.2138190269470215, "learning_rate": 9.055805171904674e-06, "loss": 2.347804641723633, "memory(GiB)": 77.56, "step": 93995, "token_acc": 0.5224913494809689, "train_speed(iter/s)": 1.437963 }, { "epoch": 4.027248189880468, "grad_norm": 6.200860500335693, "learning_rate": 9.051942927629586e-06, "loss": 2.168229675292969, "memory(GiB)": 77.56, "step": 94000, "token_acc": 0.5480427046263345, "train_speed(iter/s)": 1.437982 }, { "epoch": 4.027248189880468, "eval_loss": 2.3073084354400635, "eval_runtime": 13.6052, "eval_samples_per_second": 7.35, "eval_steps_per_second": 7.35, "eval_token_acc": 0.47229551451187335, "step": 94000 }, { "epoch": 4.027462405209717, "grad_norm": 6.543453693389893, "learning_rate": 9.04808142516696e-06, "loss": 2.0507335662841797, "memory(GiB)": 77.56, "step": 94005, "token_acc": 0.48221343873517786, "train_speed(iter/s)": 1.437684 }, { "epoch": 4.027676620538966, "grad_norm": 6.228626251220703, "learning_rate": 9.044220664586745e-06, "loss": 2.1440031051635744, "memory(GiB)": 77.56, "step": 94010, "token_acc": 0.5084175084175084, "train_speed(iter/s)": 1.4377 }, { "epoch": 4.027890835868215, "grad_norm": 7.446531295776367, "learning_rate": 9.0403606459589e-06, "loss": 2.4961212158203123, "memory(GiB)": 77.56, "step": 94015, "token_acc": 0.47530864197530864, "train_speed(iter/s)": 1.437693 }, { "epoch": 4.028105051197464, "grad_norm": 6.428940296173096, "learning_rate": 9.03650136935335e-06, "loss": 2.323657989501953, "memory(GiB)": 77.56, "step": 94020, "token_acc": 0.5247524752475248, "train_speed(iter/s)": 1.4377 }, { "epoch": 4.0283192665267125, "grad_norm": 5.986566066741943, "learning_rate": 9.03264283484e-06, "loss": 2.451040840148926, "memory(GiB)": 77.56, "step": 94025, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.437691 }, { "epoch": 4.028533481855962, "grad_norm": 5.285022735595703, "learning_rate": 9.028785042488758e-06, "loss": 2.0093597412109374, "memory(GiB)": 77.56, "step": 94030, "token_acc": 0.5622895622895623, "train_speed(iter/s)": 1.437685 }, { "epoch": 4.028747697185211, "grad_norm": 5.937748432159424, "learning_rate": 9.024927992369514e-06, "loss": 2.5239925384521484, "memory(GiB)": 77.56, "step": 94035, "token_acc": 0.5084175084175084, "train_speed(iter/s)": 1.437704 }, { "epoch": 4.028961912514459, "grad_norm": 9.210309982299805, "learning_rate": 9.021071684552117e-06, "loss": 2.227867317199707, "memory(GiB)": 77.56, "step": 94040, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.437711 }, { "epoch": 4.029176127843709, "grad_norm": 5.284056186676025, "learning_rate": 9.017216119106464e-06, "loss": 1.9757549285888671, "memory(GiB)": 77.56, "step": 94045, "token_acc": 0.5704225352112676, "train_speed(iter/s)": 1.437693 }, { "epoch": 4.029390343172958, "grad_norm": 8.236066818237305, "learning_rate": 9.013361296102385e-06, "loss": 2.0382003784179688, "memory(GiB)": 77.56, "step": 94050, "token_acc": 0.5403508771929825, "train_speed(iter/s)": 1.437692 }, { "epoch": 4.029604558502206, "grad_norm": 5.8400702476501465, "learning_rate": 9.009507215609702e-06, "loss": 2.4904153823852537, "memory(GiB)": 77.56, "step": 94055, "token_acc": 0.4738292011019284, "train_speed(iter/s)": 1.437695 }, { "epoch": 4.0298187738314555, "grad_norm": 6.436703681945801, "learning_rate": 9.005653877698261e-06, "loss": 2.241098403930664, "memory(GiB)": 77.56, "step": 94060, "token_acc": 0.47987616099071206, "train_speed(iter/s)": 1.4377 }, { "epoch": 4.030032989160705, "grad_norm": 7.1326727867126465, "learning_rate": 9.001801282437854e-06, "loss": 2.3208244323730467, "memory(GiB)": 77.56, "step": 94065, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 1.437725 }, { "epoch": 4.030247204489953, "grad_norm": 6.026688098907471, "learning_rate": 8.997949429898267e-06, "loss": 2.549468231201172, "memory(GiB)": 77.56, "step": 94070, "token_acc": 0.4784172661870504, "train_speed(iter/s)": 1.437724 }, { "epoch": 4.030461419819202, "grad_norm": 6.105508327484131, "learning_rate": 8.994098320149303e-06, "loss": 2.3366880416870117, "memory(GiB)": 77.56, "step": 94075, "token_acc": 0.5034965034965035, "train_speed(iter/s)": 1.437721 }, { "epoch": 4.030675635148452, "grad_norm": 5.545905113220215, "learning_rate": 8.99024795326071e-06, "loss": 2.3433456420898438, "memory(GiB)": 77.56, "step": 94080, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.437698 }, { "epoch": 4.0308898504777, "grad_norm": 5.896813869476318, "learning_rate": 8.986398329302248e-06, "loss": 2.2935924530029297, "memory(GiB)": 77.56, "step": 94085, "token_acc": 0.4906832298136646, "train_speed(iter/s)": 1.437692 }, { "epoch": 4.031104065806949, "grad_norm": 6.328481197357178, "learning_rate": 8.982549448343653e-06, "loss": 2.0394792556762695, "memory(GiB)": 77.56, "step": 94090, "token_acc": 0.55078125, "train_speed(iter/s)": 1.437695 }, { "epoch": 4.0313182811361985, "grad_norm": 6.472174167633057, "learning_rate": 8.978701310454652e-06, "loss": 2.1981388092041017, "memory(GiB)": 77.56, "step": 94095, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 1.437712 }, { "epoch": 4.031532496465447, "grad_norm": 6.0419793128967285, "learning_rate": 8.974853915704945e-06, "loss": 2.242582893371582, "memory(GiB)": 77.56, "step": 94100, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 1.437716 }, { "epoch": 4.031746711794696, "grad_norm": 6.998654842376709, "learning_rate": 8.97100726416426e-06, "loss": 2.5070701599121095, "memory(GiB)": 77.56, "step": 94105, "token_acc": 0.5048231511254019, "train_speed(iter/s)": 1.437699 }, { "epoch": 4.031960927123945, "grad_norm": 5.389399528503418, "learning_rate": 8.967161355902265e-06, "loss": 2.485128402709961, "memory(GiB)": 77.56, "step": 94110, "token_acc": 0.5278688524590164, "train_speed(iter/s)": 1.437689 }, { "epoch": 4.032175142453194, "grad_norm": 6.884333610534668, "learning_rate": 8.96331619098863e-06, "loss": 2.2493865966796873, "memory(GiB)": 77.56, "step": 94115, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 1.437694 }, { "epoch": 4.032389357782443, "grad_norm": 5.905115604400635, "learning_rate": 8.959471769493022e-06, "loss": 2.235182189941406, "memory(GiB)": 77.56, "step": 94120, "token_acc": 0.4937888198757764, "train_speed(iter/s)": 1.437681 }, { "epoch": 4.032603573111692, "grad_norm": 6.828675746917725, "learning_rate": 8.955628091485068e-06, "loss": 1.9574359893798827, "memory(GiB)": 77.56, "step": 94125, "token_acc": 0.531986531986532, "train_speed(iter/s)": 1.437702 }, { "epoch": 4.032817788440941, "grad_norm": 5.532287120819092, "learning_rate": 8.951785157034415e-06, "loss": 1.9771404266357422, "memory(GiB)": 77.56, "step": 94130, "token_acc": 0.5559322033898305, "train_speed(iter/s)": 1.437713 }, { "epoch": 4.03303200377019, "grad_norm": 6.384617328643799, "learning_rate": 8.947942966210692e-06, "loss": 2.2690092086791993, "memory(GiB)": 77.56, "step": 94135, "token_acc": 0.49280575539568344, "train_speed(iter/s)": 1.437723 }, { "epoch": 4.033246219099439, "grad_norm": 5.13917350769043, "learning_rate": 8.944101519083492e-06, "loss": 2.5535118103027346, "memory(GiB)": 77.56, "step": 94140, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.437731 }, { "epoch": 4.0334604344286875, "grad_norm": 6.78342342376709, "learning_rate": 8.940260815722407e-06, "loss": 2.173914337158203, "memory(GiB)": 77.56, "step": 94145, "token_acc": 0.563573883161512, "train_speed(iter/s)": 1.437745 }, { "epoch": 4.033674649757937, "grad_norm": 7.350124359130859, "learning_rate": 8.936420856197015e-06, "loss": 2.1620296478271483, "memory(GiB)": 77.56, "step": 94150, "token_acc": 0.5, "train_speed(iter/s)": 1.437743 }, { "epoch": 4.033888865087186, "grad_norm": 7.334506988525391, "learning_rate": 8.932581640576865e-06, "loss": 2.124253273010254, "memory(GiB)": 77.56, "step": 94155, "token_acc": 0.5351681957186545, "train_speed(iter/s)": 1.437739 }, { "epoch": 4.034103080416434, "grad_norm": 5.405720233917236, "learning_rate": 8.928743168931535e-06, "loss": 2.214800262451172, "memory(GiB)": 77.56, "step": 94160, "token_acc": 0.5426621160409556, "train_speed(iter/s)": 1.437762 }, { "epoch": 4.034317295745684, "grad_norm": 7.348898410797119, "learning_rate": 8.924905441330556e-06, "loss": 2.107699966430664, "memory(GiB)": 77.56, "step": 94165, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.437777 }, { "epoch": 4.034531511074933, "grad_norm": 6.633355617523193, "learning_rate": 8.921068457843445e-06, "loss": 2.515640640258789, "memory(GiB)": 77.56, "step": 94170, "token_acc": 0.46178343949044587, "train_speed(iter/s)": 1.43778 }, { "epoch": 4.034745726404181, "grad_norm": 7.168416976928711, "learning_rate": 8.917232218539712e-06, "loss": 2.4323757171630858, "memory(GiB)": 77.56, "step": 94175, "token_acc": 0.47333333333333333, "train_speed(iter/s)": 1.437797 }, { "epoch": 4.0349599417334305, "grad_norm": 5.7977094650268555, "learning_rate": 8.913396723488854e-06, "loss": 2.229866600036621, "memory(GiB)": 77.56, "step": 94180, "token_acc": 0.525, "train_speed(iter/s)": 1.43781 }, { "epoch": 4.03517415706268, "grad_norm": 5.585672378540039, "learning_rate": 8.909561972760343e-06, "loss": 2.4388826370239256, "memory(GiB)": 77.56, "step": 94185, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.437827 }, { "epoch": 4.035388372391928, "grad_norm": 6.51699161529541, "learning_rate": 8.905727966423682e-06, "loss": 2.2358789443969727, "memory(GiB)": 77.56, "step": 94190, "token_acc": 0.5062111801242236, "train_speed(iter/s)": 1.437825 }, { "epoch": 4.035602587721177, "grad_norm": 8.528943061828613, "learning_rate": 8.901894704548308e-06, "loss": 2.3063486099243162, "memory(GiB)": 77.56, "step": 94195, "token_acc": 0.49615384615384617, "train_speed(iter/s)": 1.437819 }, { "epoch": 4.035816803050427, "grad_norm": 5.621964931488037, "learning_rate": 8.89806218720365e-06, "loss": 2.2568063735961914, "memory(GiB)": 77.56, "step": 94200, "token_acc": 0.5015576323987538, "train_speed(iter/s)": 1.437817 }, { "epoch": 4.036031018379675, "grad_norm": 7.0758562088012695, "learning_rate": 8.89423041445916e-06, "loss": 2.210978698730469, "memory(GiB)": 77.56, "step": 94205, "token_acc": 0.5486381322957199, "train_speed(iter/s)": 1.437818 }, { "epoch": 4.036245233708924, "grad_norm": 6.545154571533203, "learning_rate": 8.890399386384251e-06, "loss": 2.154315948486328, "memory(GiB)": 77.56, "step": 94210, "token_acc": 0.5083056478405316, "train_speed(iter/s)": 1.437822 }, { "epoch": 4.0364594490381736, "grad_norm": 6.9491729736328125, "learning_rate": 8.88656910304831e-06, "loss": 2.5408910751342773, "memory(GiB)": 77.56, "step": 94215, "token_acc": 0.47735191637630664, "train_speed(iter/s)": 1.437829 }, { "epoch": 4.036673664367422, "grad_norm": 9.270106315612793, "learning_rate": 8.882739564520743e-06, "loss": 2.3467565536499024, "memory(GiB)": 77.56, "step": 94220, "token_acc": 0.5298804780876494, "train_speed(iter/s)": 1.437831 }, { "epoch": 4.036887879696671, "grad_norm": 5.729696273803711, "learning_rate": 8.878910770870924e-06, "loss": 2.2115558624267577, "memory(GiB)": 77.56, "step": 94225, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 1.437833 }, { "epoch": 4.03710209502592, "grad_norm": 4.764277935028076, "learning_rate": 8.875082722168215e-06, "loss": 2.413976287841797, "memory(GiB)": 77.56, "step": 94230, "token_acc": 0.4691011235955056, "train_speed(iter/s)": 1.43784 }, { "epoch": 4.037316310355169, "grad_norm": 5.905101776123047, "learning_rate": 8.871255418481955e-06, "loss": 2.2787710189819337, "memory(GiB)": 77.56, "step": 94235, "token_acc": 0.5253623188405797, "train_speed(iter/s)": 1.437845 }, { "epoch": 4.037530525684418, "grad_norm": 7.858553886413574, "learning_rate": 8.867428859881487e-06, "loss": 2.1349054336547852, "memory(GiB)": 77.56, "step": 94240, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.437854 }, { "epoch": 4.037744741013667, "grad_norm": 5.127308368682861, "learning_rate": 8.863603046436119e-06, "loss": 2.185748291015625, "memory(GiB)": 77.56, "step": 94245, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.437864 }, { "epoch": 4.037958956342916, "grad_norm": 6.771142482757568, "learning_rate": 8.859777978215184e-06, "loss": 2.623724365234375, "memory(GiB)": 77.56, "step": 94250, "token_acc": 0.46855345911949686, "train_speed(iter/s)": 1.437869 }, { "epoch": 4.038173171672165, "grad_norm": 6.034451484680176, "learning_rate": 8.855953655287962e-06, "loss": 2.1346485137939455, "memory(GiB)": 77.56, "step": 94255, "token_acc": 0.5110294117647058, "train_speed(iter/s)": 1.437889 }, { "epoch": 4.038387387001414, "grad_norm": 7.054361343383789, "learning_rate": 8.852130077723736e-06, "loss": 2.2240100860595704, "memory(GiB)": 77.56, "step": 94260, "token_acc": 0.515625, "train_speed(iter/s)": 1.437889 }, { "epoch": 4.038601602330663, "grad_norm": 6.232862949371338, "learning_rate": 8.848307245591774e-06, "loss": 2.259897994995117, "memory(GiB)": 77.56, "step": 94265, "token_acc": 0.5390070921985816, "train_speed(iter/s)": 1.437895 }, { "epoch": 4.038815817659912, "grad_norm": 5.6272149085998535, "learning_rate": 8.844485158961318e-06, "loss": 2.2962581634521486, "memory(GiB)": 77.56, "step": 94270, "token_acc": 0.49852507374631266, "train_speed(iter/s)": 1.437912 }, { "epoch": 4.039030032989161, "grad_norm": 7.337576866149902, "learning_rate": 8.840663817901618e-06, "loss": 2.1763942718505858, "memory(GiB)": 77.56, "step": 94275, "token_acc": 0.5251798561151079, "train_speed(iter/s)": 1.437912 }, { "epoch": 4.0392442483184094, "grad_norm": 5.830558776855469, "learning_rate": 8.836843222481917e-06, "loss": 2.210898208618164, "memory(GiB)": 77.56, "step": 94280, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 1.437915 }, { "epoch": 4.039458463647659, "grad_norm": 6.63686990737915, "learning_rate": 8.833023372771415e-06, "loss": 2.1141281127929688, "memory(GiB)": 77.56, "step": 94285, "token_acc": 0.5627376425855514, "train_speed(iter/s)": 1.437938 }, { "epoch": 4.039672678976908, "grad_norm": 6.3502020835876465, "learning_rate": 8.829204268839314e-06, "loss": 2.2010631561279297, "memory(GiB)": 77.56, "step": 94290, "token_acc": 0.5532646048109966, "train_speed(iter/s)": 1.437937 }, { "epoch": 4.039886894306156, "grad_norm": 9.333070755004883, "learning_rate": 8.825385910754796e-06, "loss": 2.044743537902832, "memory(GiB)": 77.56, "step": 94295, "token_acc": 0.5625, "train_speed(iter/s)": 1.437934 }, { "epoch": 4.040101109635406, "grad_norm": 6.368394374847412, "learning_rate": 8.82156829858703e-06, "loss": 2.2585208892822264, "memory(GiB)": 77.56, "step": 94300, "token_acc": 0.50187265917603, "train_speed(iter/s)": 1.437931 }, { "epoch": 4.040315324964655, "grad_norm": 5.39070987701416, "learning_rate": 8.817751432405186e-06, "loss": 2.18239803314209, "memory(GiB)": 77.56, "step": 94305, "token_acc": 0.5197368421052632, "train_speed(iter/s)": 1.437945 }, { "epoch": 4.040529540293903, "grad_norm": 7.199963092803955, "learning_rate": 8.813935312278414e-06, "loss": 2.1374454498291016, "memory(GiB)": 77.56, "step": 94310, "token_acc": 0.5467128027681661, "train_speed(iter/s)": 1.437951 }, { "epoch": 4.0407437556231525, "grad_norm": 10.056708335876465, "learning_rate": 8.81011993827584e-06, "loss": 2.0617654800415037, "memory(GiB)": 77.56, "step": 94315, "token_acc": 0.5642201834862385, "train_speed(iter/s)": 1.437973 }, { "epoch": 4.040957970952402, "grad_norm": 5.356524467468262, "learning_rate": 8.806305310466579e-06, "loss": 2.0068901062011717, "memory(GiB)": 77.56, "step": 94320, "token_acc": 0.564, "train_speed(iter/s)": 1.437991 }, { "epoch": 4.04117218628165, "grad_norm": 6.368059158325195, "learning_rate": 8.80249142891974e-06, "loss": 2.504183769226074, "memory(GiB)": 77.56, "step": 94325, "token_acc": 0.516, "train_speed(iter/s)": 1.43799 }, { "epoch": 4.041386401610899, "grad_norm": 7.350532054901123, "learning_rate": 8.798678293704405e-06, "loss": 2.2723342895507814, "memory(GiB)": 77.56, "step": 94330, "token_acc": 0.4961832061068702, "train_speed(iter/s)": 1.438 }, { "epoch": 4.041600616940149, "grad_norm": 4.41812801361084, "learning_rate": 8.794865904889677e-06, "loss": 1.8743091583251954, "memory(GiB)": 77.56, "step": 94335, "token_acc": 0.6118881118881119, "train_speed(iter/s)": 1.438015 }, { "epoch": 4.041814832269397, "grad_norm": 7.434039115905762, "learning_rate": 8.791054262544601e-06, "loss": 2.0234777450561525, "memory(GiB)": 77.56, "step": 94340, "token_acc": 0.5508771929824562, "train_speed(iter/s)": 1.438015 }, { "epoch": 4.042029047598646, "grad_norm": 8.150466918945312, "learning_rate": 8.78724336673824e-06, "loss": 2.347923469543457, "memory(GiB)": 77.56, "step": 94345, "token_acc": 0.5016181229773463, "train_speed(iter/s)": 1.438016 }, { "epoch": 4.0422432629278955, "grad_norm": 5.66418981552124, "learning_rate": 8.783433217539611e-06, "loss": 2.379938507080078, "memory(GiB)": 77.56, "step": 94350, "token_acc": 0.484375, "train_speed(iter/s)": 1.438005 }, { "epoch": 4.042457478257144, "grad_norm": 7.125446796417236, "learning_rate": 8.779623815017763e-06, "loss": 2.1093027114868166, "memory(GiB)": 77.56, "step": 94355, "token_acc": 0.5347222222222222, "train_speed(iter/s)": 1.437997 }, { "epoch": 4.042671693586393, "grad_norm": 7.883285999298096, "learning_rate": 8.77581515924169e-06, "loss": 2.2761085510253904, "memory(GiB)": 77.56, "step": 94360, "token_acc": 0.4967105263157895, "train_speed(iter/s)": 1.437998 }, { "epoch": 4.042885908915642, "grad_norm": 7.811742305755615, "learning_rate": 8.772007250280407e-06, "loss": 2.2774017333984373, "memory(GiB)": 77.56, "step": 94365, "token_acc": 0.5342019543973942, "train_speed(iter/s)": 1.437994 }, { "epoch": 4.043100124244891, "grad_norm": 5.259608745574951, "learning_rate": 8.768200088202888e-06, "loss": 2.2066184997558596, "memory(GiB)": 77.56, "step": 94370, "token_acc": 0.5570934256055363, "train_speed(iter/s)": 1.437979 }, { "epoch": 4.04331433957414, "grad_norm": 7.543445587158203, "learning_rate": 8.764393673078102e-06, "loss": 2.4556665420532227, "memory(GiB)": 77.56, "step": 94375, "token_acc": 0.45317220543806647, "train_speed(iter/s)": 1.437984 }, { "epoch": 4.043528554903389, "grad_norm": 5.840571880340576, "learning_rate": 8.760588004975007e-06, "loss": 2.3214370727539064, "memory(GiB)": 77.56, "step": 94380, "token_acc": 0.5269121813031161, "train_speed(iter/s)": 1.437995 }, { "epoch": 4.043742770232638, "grad_norm": 5.127938270568848, "learning_rate": 8.756783083962545e-06, "loss": 2.1085704803466796, "memory(GiB)": 77.56, "step": 94385, "token_acc": 0.5494880546075085, "train_speed(iter/s)": 1.438006 }, { "epoch": 4.043956985561887, "grad_norm": 6.106111526489258, "learning_rate": 8.752978910109638e-06, "loss": 1.9957218170166016, "memory(GiB)": 77.56, "step": 94390, "token_acc": 0.5266903914590747, "train_speed(iter/s)": 1.438 }, { "epoch": 4.044171200891136, "grad_norm": 5.976374626159668, "learning_rate": 8.749175483485223e-06, "loss": 2.20544490814209, "memory(GiB)": 77.56, "step": 94395, "token_acc": 0.5342465753424658, "train_speed(iter/s)": 1.437986 }, { "epoch": 4.0443854162203845, "grad_norm": 5.485307216644287, "learning_rate": 8.745372804158187e-06, "loss": 2.097411346435547, "memory(GiB)": 77.56, "step": 94400, "token_acc": 0.5338078291814946, "train_speed(iter/s)": 1.437989 }, { "epoch": 4.044599631549634, "grad_norm": 5.8869147300720215, "learning_rate": 8.741570872197424e-06, "loss": 1.9910985946655273, "memory(GiB)": 77.56, "step": 94405, "token_acc": 0.5559440559440559, "train_speed(iter/s)": 1.437975 }, { "epoch": 4.044813846878883, "grad_norm": 5.397964000701904, "learning_rate": 8.73776968767181e-06, "loss": 2.1720096588134767, "memory(GiB)": 77.56, "step": 94410, "token_acc": 0.5754385964912281, "train_speed(iter/s)": 1.437986 }, { "epoch": 4.045028062208131, "grad_norm": 7.197945594787598, "learning_rate": 8.733969250650187e-06, "loss": 2.190837860107422, "memory(GiB)": 77.56, "step": 94415, "token_acc": 0.5326797385620915, "train_speed(iter/s)": 1.438002 }, { "epoch": 4.045242277537381, "grad_norm": 6.3045196533203125, "learning_rate": 8.73016956120144e-06, "loss": 2.2060741424560546, "memory(GiB)": 77.56, "step": 94420, "token_acc": 0.5245283018867924, "train_speed(iter/s)": 1.437992 }, { "epoch": 4.04545649286663, "grad_norm": 5.721884727478027, "learning_rate": 8.72637061939437e-06, "loss": 2.2531305313110352, "memory(GiB)": 77.56, "step": 94425, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.437995 }, { "epoch": 4.045670708195878, "grad_norm": 6.655179023742676, "learning_rate": 8.722572425297832e-06, "loss": 2.2428781509399416, "memory(GiB)": 77.56, "step": 94430, "token_acc": 0.49842271293375395, "train_speed(iter/s)": 1.437986 }, { "epoch": 4.0458849235251275, "grad_norm": 7.368620872497559, "learning_rate": 8.71877497898061e-06, "loss": 2.453813934326172, "memory(GiB)": 77.56, "step": 94435, "token_acc": 0.5, "train_speed(iter/s)": 1.438002 }, { "epoch": 4.046099138854377, "grad_norm": 9.210283279418945, "learning_rate": 8.714978280511505e-06, "loss": 2.456880569458008, "memory(GiB)": 77.56, "step": 94440, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.438012 }, { "epoch": 4.046313354183625, "grad_norm": 5.5849456787109375, "learning_rate": 8.711182329959284e-06, "loss": 2.386907196044922, "memory(GiB)": 77.56, "step": 94445, "token_acc": 0.4755700325732899, "train_speed(iter/s)": 1.438016 }, { "epoch": 4.046527569512874, "grad_norm": 5.408041954040527, "learning_rate": 8.70738712739274e-06, "loss": 2.325563430786133, "memory(GiB)": 77.56, "step": 94450, "token_acc": 0.4702549575070821, "train_speed(iter/s)": 1.438037 }, { "epoch": 4.046741784842124, "grad_norm": 5.731386661529541, "learning_rate": 8.703592672880617e-06, "loss": 2.483907699584961, "memory(GiB)": 77.56, "step": 94455, "token_acc": 0.4609164420485175, "train_speed(iter/s)": 1.43804 }, { "epoch": 4.046956000171372, "grad_norm": 7.373782634735107, "learning_rate": 8.699798966491651e-06, "loss": 2.0968038558959963, "memory(GiB)": 77.56, "step": 94460, "token_acc": 0.5520504731861199, "train_speed(iter/s)": 1.438048 }, { "epoch": 4.047170215500621, "grad_norm": 7.003344535827637, "learning_rate": 8.696006008294566e-06, "loss": 2.1224441528320312, "memory(GiB)": 77.56, "step": 94465, "token_acc": 0.5379310344827586, "train_speed(iter/s)": 1.438032 }, { "epoch": 4.0473844308298705, "grad_norm": 6.923903942108154, "learning_rate": 8.692213798358084e-06, "loss": 2.5746427536010743, "memory(GiB)": 77.56, "step": 94470, "token_acc": 0.43731778425655976, "train_speed(iter/s)": 1.438048 }, { "epoch": 4.047598646159119, "grad_norm": 5.497438907623291, "learning_rate": 8.688422336750879e-06, "loss": 2.2300783157348634, "memory(GiB)": 77.56, "step": 94475, "token_acc": 0.5408163265306123, "train_speed(iter/s)": 1.438061 }, { "epoch": 4.047812861488368, "grad_norm": 8.704301834106445, "learning_rate": 8.684631623541672e-06, "loss": 2.0712379455566405, "memory(GiB)": 77.56, "step": 94480, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 1.438072 }, { "epoch": 4.048027076817617, "grad_norm": 6.696622371673584, "learning_rate": 8.680841658799127e-06, "loss": 2.2620059967041017, "memory(GiB)": 77.56, "step": 94485, "token_acc": 0.5389408099688473, "train_speed(iter/s)": 1.438097 }, { "epoch": 4.048241292146866, "grad_norm": 6.093163013458252, "learning_rate": 8.677052442591888e-06, "loss": 2.1327383041381838, "memory(GiB)": 77.56, "step": 94490, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 1.438107 }, { "epoch": 4.048455507476115, "grad_norm": 6.234643459320068, "learning_rate": 8.6732639749886e-06, "loss": 1.9161249160766602, "memory(GiB)": 77.56, "step": 94495, "token_acc": 0.5575757575757576, "train_speed(iter/s)": 1.438114 }, { "epoch": 4.048669722805364, "grad_norm": 6.737776756286621, "learning_rate": 8.669476256057913e-06, "loss": 2.289379119873047, "memory(GiB)": 77.56, "step": 94500, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.43813 }, { "epoch": 4.048669722805364, "eval_loss": 2.2524924278259277, "eval_runtime": 13.7648, "eval_samples_per_second": 7.265, "eval_steps_per_second": 7.265, "eval_token_acc": 0.45232273838630804, "step": 94500 }, { "epoch": 4.048883938134613, "grad_norm": 6.0029425621032715, "learning_rate": 8.665689285868416e-06, "loss": 2.143111801147461, "memory(GiB)": 77.56, "step": 94505, "token_acc": 0.47696476964769646, "train_speed(iter/s)": 1.437812 }, { "epoch": 4.049098153463862, "grad_norm": 7.486948490142822, "learning_rate": 8.661903064488753e-06, "loss": 2.164644241333008, "memory(GiB)": 77.56, "step": 94510, "token_acc": 0.5214521452145214, "train_speed(iter/s)": 1.43782 }, { "epoch": 4.049312368793111, "grad_norm": 5.806775093078613, "learning_rate": 8.65811759198749e-06, "loss": 2.0831167221069338, "memory(GiB)": 77.56, "step": 94515, "token_acc": 0.5513698630136986, "train_speed(iter/s)": 1.437837 }, { "epoch": 4.0495265841223596, "grad_norm": 6.2082343101501465, "learning_rate": 8.654332868433213e-06, "loss": 2.0956356048583986, "memory(GiB)": 77.56, "step": 94520, "token_acc": 0.5315985130111525, "train_speed(iter/s)": 1.437823 }, { "epoch": 4.049740799451609, "grad_norm": 4.95619535446167, "learning_rate": 8.650548893894484e-06, "loss": 2.246072006225586, "memory(GiB)": 77.56, "step": 94525, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 1.437809 }, { "epoch": 4.049955014780858, "grad_norm": 7.168889999389648, "learning_rate": 8.646765668439844e-06, "loss": 2.323457717895508, "memory(GiB)": 77.56, "step": 94530, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.437795 }, { "epoch": 4.050169230110106, "grad_norm": 5.401705741882324, "learning_rate": 8.642983192137827e-06, "loss": 2.381578063964844, "memory(GiB)": 77.56, "step": 94535, "token_acc": 0.5060606060606061, "train_speed(iter/s)": 1.43778 }, { "epoch": 4.050383445439356, "grad_norm": 6.20339298248291, "learning_rate": 8.639201465056973e-06, "loss": 2.394364356994629, "memory(GiB)": 77.56, "step": 94540, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 1.437798 }, { "epoch": 4.050597660768605, "grad_norm": 7.206087589263916, "learning_rate": 8.635420487265788e-06, "loss": 2.1705791473388674, "memory(GiB)": 77.56, "step": 94545, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.437805 }, { "epoch": 4.050811876097853, "grad_norm": 6.113037586212158, "learning_rate": 8.631640258832769e-06, "loss": 2.38806037902832, "memory(GiB)": 77.56, "step": 94550, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.437824 }, { "epoch": 4.051026091427103, "grad_norm": 7.499508857727051, "learning_rate": 8.627860779826385e-06, "loss": 2.2373558044433595, "memory(GiB)": 77.56, "step": 94555, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.437844 }, { "epoch": 4.051240306756352, "grad_norm": 6.184325695037842, "learning_rate": 8.624082050315107e-06, "loss": 2.2075767517089844, "memory(GiB)": 77.56, "step": 94560, "token_acc": 0.5404411764705882, "train_speed(iter/s)": 1.437845 }, { "epoch": 4.0514545220856, "grad_norm": 5.859940528869629, "learning_rate": 8.620304070367401e-06, "loss": 2.411662292480469, "memory(GiB)": 77.56, "step": 94565, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.437857 }, { "epoch": 4.0516687374148495, "grad_norm": 5.652826309204102, "learning_rate": 8.616526840051697e-06, "loss": 2.1072874069213867, "memory(GiB)": 77.56, "step": 94570, "token_acc": 0.519434628975265, "train_speed(iter/s)": 1.437864 }, { "epoch": 4.051882952744099, "grad_norm": 7.250621795654297, "learning_rate": 8.612750359436444e-06, "loss": 2.299967575073242, "memory(GiB)": 77.56, "step": 94575, "token_acc": 0.4903581267217631, "train_speed(iter/s)": 1.437865 }, { "epoch": 4.052097168073347, "grad_norm": 5.8685784339904785, "learning_rate": 8.608974628590044e-06, "loss": 2.617667388916016, "memory(GiB)": 77.56, "step": 94580, "token_acc": 0.47468354430379744, "train_speed(iter/s)": 1.437885 }, { "epoch": 4.052311383402596, "grad_norm": 5.469445705413818, "learning_rate": 8.605199647580892e-06, "loss": 1.961815643310547, "memory(GiB)": 77.56, "step": 94585, "token_acc": 0.5443425076452599, "train_speed(iter/s)": 1.437894 }, { "epoch": 4.052525598731846, "grad_norm": 5.220660209655762, "learning_rate": 8.601425416477366e-06, "loss": 2.045301055908203, "memory(GiB)": 77.56, "step": 94590, "token_acc": 0.5451388888888888, "train_speed(iter/s)": 1.437881 }, { "epoch": 4.052739814061094, "grad_norm": 6.873932838439941, "learning_rate": 8.597651935347872e-06, "loss": 2.3389307022094727, "memory(GiB)": 77.56, "step": 94595, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.437891 }, { "epoch": 4.052954029390343, "grad_norm": 5.854475975036621, "learning_rate": 8.593879204260747e-06, "loss": 2.2878625869750975, "memory(GiB)": 77.56, "step": 94600, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.43789 }, { "epoch": 4.0531682447195925, "grad_norm": 5.454529285430908, "learning_rate": 8.590107223284344e-06, "loss": 2.299834442138672, "memory(GiB)": 77.56, "step": 94605, "token_acc": 0.48161764705882354, "train_speed(iter/s)": 1.437896 }, { "epoch": 4.053382460048841, "grad_norm": 6.291232109069824, "learning_rate": 8.586335992486994e-06, "loss": 2.377471923828125, "memory(GiB)": 77.56, "step": 94610, "token_acc": 0.49411764705882355, "train_speed(iter/s)": 1.437895 }, { "epoch": 4.05359667537809, "grad_norm": 5.7743072509765625, "learning_rate": 8.582565511937019e-06, "loss": 2.4286264419555663, "memory(GiB)": 77.56, "step": 94615, "token_acc": 0.5376712328767124, "train_speed(iter/s)": 1.437903 }, { "epoch": 4.053810890707339, "grad_norm": 5.726933002471924, "learning_rate": 8.578795781702703e-06, "loss": 2.1536651611328126, "memory(GiB)": 77.56, "step": 94620, "token_acc": 0.4924812030075188, "train_speed(iter/s)": 1.43787 }, { "epoch": 4.054025106036588, "grad_norm": 5.126599311828613, "learning_rate": 8.575026801852375e-06, "loss": 2.1796525955200194, "memory(GiB)": 77.56, "step": 94625, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 1.437866 }, { "epoch": 4.054239321365837, "grad_norm": 7.860447406768799, "learning_rate": 8.571258572454299e-06, "loss": 2.178411674499512, "memory(GiB)": 77.56, "step": 94630, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.437885 }, { "epoch": 4.054453536695086, "grad_norm": 5.048957824707031, "learning_rate": 8.567491093576735e-06, "loss": 2.1166738510131835, "memory(GiB)": 77.56, "step": 94635, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 1.437879 }, { "epoch": 4.054667752024335, "grad_norm": 4.927050590515137, "learning_rate": 8.563724365287934e-06, "loss": 2.2646278381347655, "memory(GiB)": 77.56, "step": 94640, "token_acc": 0.5075187969924813, "train_speed(iter/s)": 1.437896 }, { "epoch": 4.054881967353584, "grad_norm": 5.999354839324951, "learning_rate": 8.559958387656125e-06, "loss": 2.2153263092041016, "memory(GiB)": 77.56, "step": 94645, "token_acc": 0.5373665480427047, "train_speed(iter/s)": 1.437914 }, { "epoch": 4.055096182682833, "grad_norm": 7.3741374015808105, "learning_rate": 8.556193160749549e-06, "loss": 2.485906982421875, "memory(GiB)": 77.56, "step": 94650, "token_acc": 0.5091463414634146, "train_speed(iter/s)": 1.437926 }, { "epoch": 4.0553103980120815, "grad_norm": 4.946549892425537, "learning_rate": 8.552428684636416e-06, "loss": 2.4336305618286134, "memory(GiB)": 77.56, "step": 94655, "token_acc": 0.4907749077490775, "train_speed(iter/s)": 1.437949 }, { "epoch": 4.055524613341331, "grad_norm": 6.874972343444824, "learning_rate": 8.548664959384922e-06, "loss": 2.3196725845336914, "memory(GiB)": 77.56, "step": 94660, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.437943 }, { "epoch": 4.05573882867058, "grad_norm": 6.6315388679504395, "learning_rate": 8.544901985063247e-06, "loss": 2.160009765625, "memory(GiB)": 77.56, "step": 94665, "token_acc": 0.5318471337579618, "train_speed(iter/s)": 1.437963 }, { "epoch": 4.055953043999828, "grad_norm": 6.8424272537231445, "learning_rate": 8.541139761739559e-06, "loss": 2.456861877441406, "memory(GiB)": 77.56, "step": 94670, "token_acc": 0.49538461538461537, "train_speed(iter/s)": 1.437979 }, { "epoch": 4.056167259329078, "grad_norm": 5.514702796936035, "learning_rate": 8.537378289482006e-06, "loss": 2.2611709594726563, "memory(GiB)": 77.56, "step": 94675, "token_acc": 0.56, "train_speed(iter/s)": 1.437997 }, { "epoch": 4.056381474658327, "grad_norm": 5.307720184326172, "learning_rate": 8.533617568358749e-06, "loss": 1.9555061340332032, "memory(GiB)": 77.56, "step": 94680, "token_acc": 0.5359712230215827, "train_speed(iter/s)": 1.438005 }, { "epoch": 4.056595689987575, "grad_norm": 6.721695423126221, "learning_rate": 8.529857598437912e-06, "loss": 2.4033859252929686, "memory(GiB)": 77.56, "step": 94685, "token_acc": 0.4473684210526316, "train_speed(iter/s)": 1.437994 }, { "epoch": 4.0568099053168245, "grad_norm": 5.998652935028076, "learning_rate": 8.526098379787606e-06, "loss": 2.483084297180176, "memory(GiB)": 77.56, "step": 94690, "token_acc": 0.4688427299703264, "train_speed(iter/s)": 1.43801 }, { "epoch": 4.057024120646074, "grad_norm": 5.612516403198242, "learning_rate": 8.522339912475936e-06, "loss": 2.3897886276245117, "memory(GiB)": 77.56, "step": 94695, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.438015 }, { "epoch": 4.057238335975322, "grad_norm": 6.510907173156738, "learning_rate": 8.51858219657098e-06, "loss": 2.3343429565429688, "memory(GiB)": 77.56, "step": 94700, "token_acc": 0.47191011235955055, "train_speed(iter/s)": 1.438023 }, { "epoch": 4.057452551304571, "grad_norm": 5.609668254852295, "learning_rate": 8.514825232140816e-06, "loss": 2.215006637573242, "memory(GiB)": 77.56, "step": 94705, "token_acc": 0.5098684210526315, "train_speed(iter/s)": 1.438031 }, { "epoch": 4.057666766633821, "grad_norm": 5.9900078773498535, "learning_rate": 8.511069019253515e-06, "loss": 2.2272533416748046, "memory(GiB)": 77.56, "step": 94710, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.438022 }, { "epoch": 4.057880981963069, "grad_norm": 6.236502647399902, "learning_rate": 8.507313557977121e-06, "loss": 2.3736949920654298, "memory(GiB)": 77.56, "step": 94715, "token_acc": 0.49221183800623053, "train_speed(iter/s)": 1.438006 }, { "epoch": 4.058095197292318, "grad_norm": 6.645170211791992, "learning_rate": 8.503558848379656e-06, "loss": 2.1365692138671877, "memory(GiB)": 77.56, "step": 94720, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.437996 }, { "epoch": 4.0583094126215675, "grad_norm": 6.495617389678955, "learning_rate": 8.499804890529157e-06, "loss": 2.329438400268555, "memory(GiB)": 77.56, "step": 94725, "token_acc": 0.47634069400630913, "train_speed(iter/s)": 1.437998 }, { "epoch": 4.058523627950816, "grad_norm": 6.9974164962768555, "learning_rate": 8.496051684493627e-06, "loss": 2.318832206726074, "memory(GiB)": 77.56, "step": 94730, "token_acc": 0.5236686390532544, "train_speed(iter/s)": 1.438008 }, { "epoch": 4.058737843280065, "grad_norm": 5.69292688369751, "learning_rate": 8.492299230341039e-06, "loss": 2.136651611328125, "memory(GiB)": 77.56, "step": 94735, "token_acc": 0.5449101796407185, "train_speed(iter/s)": 1.438013 }, { "epoch": 4.058952058609314, "grad_norm": 6.1332688331604, "learning_rate": 8.488547528139402e-06, "loss": 2.2839872360229494, "memory(GiB)": 77.56, "step": 94740, "token_acc": 0.5015197568389058, "train_speed(iter/s)": 1.43802 }, { "epoch": 4.059166273938563, "grad_norm": 6.985415458679199, "learning_rate": 8.484796577956666e-06, "loss": 1.9944267272949219, "memory(GiB)": 77.56, "step": 94745, "token_acc": 0.58203125, "train_speed(iter/s)": 1.438008 }, { "epoch": 4.059380489267812, "grad_norm": 6.244665145874023, "learning_rate": 8.481046379860785e-06, "loss": 2.244589614868164, "memory(GiB)": 77.56, "step": 94750, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.438008 }, { "epoch": 4.059594704597061, "grad_norm": 5.623499870300293, "learning_rate": 8.477296933919693e-06, "loss": 2.019356918334961, "memory(GiB)": 77.56, "step": 94755, "token_acc": 0.5601503759398496, "train_speed(iter/s)": 1.438018 }, { "epoch": 4.05980891992631, "grad_norm": 6.117869853973389, "learning_rate": 8.473548240201323e-06, "loss": 2.0456363677978517, "memory(GiB)": 77.56, "step": 94760, "token_acc": 0.532051282051282, "train_speed(iter/s)": 1.438017 }, { "epoch": 4.060023135255559, "grad_norm": 7.197115898132324, "learning_rate": 8.469800298773567e-06, "loss": 2.1761476516723635, "memory(GiB)": 77.56, "step": 94765, "token_acc": 0.49145299145299143, "train_speed(iter/s)": 1.438015 }, { "epoch": 4.060237350584808, "grad_norm": 5.124648094177246, "learning_rate": 8.466053109704353e-06, "loss": 2.4700817108154296, "memory(GiB)": 77.56, "step": 94770, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.438024 }, { "epoch": 4.0604515659140565, "grad_norm": 5.926012992858887, "learning_rate": 8.462306673061548e-06, "loss": 2.3996227264404295, "memory(GiB)": 77.56, "step": 94775, "token_acc": 0.5195195195195195, "train_speed(iter/s)": 1.438004 }, { "epoch": 4.060665781243306, "grad_norm": 6.4998369216918945, "learning_rate": 8.458560988913022e-06, "loss": 2.168090057373047, "memory(GiB)": 77.56, "step": 94780, "token_acc": 0.5298804780876494, "train_speed(iter/s)": 1.438004 }, { "epoch": 4.060879996572555, "grad_norm": 5.087442398071289, "learning_rate": 8.45481605732663e-06, "loss": 2.2478397369384764, "memory(GiB)": 77.56, "step": 94785, "token_acc": 0.5396825396825397, "train_speed(iter/s)": 1.43801 }, { "epoch": 4.061094211901803, "grad_norm": 7.737916469573975, "learning_rate": 8.451071878370209e-06, "loss": 1.9614067077636719, "memory(GiB)": 77.56, "step": 94790, "token_acc": 0.5794392523364486, "train_speed(iter/s)": 1.438014 }, { "epoch": 4.061308427231053, "grad_norm": 6.445511817932129, "learning_rate": 8.447328452111597e-06, "loss": 2.3380258560180662, "memory(GiB)": 77.56, "step": 94795, "token_acc": 0.519163763066202, "train_speed(iter/s)": 1.438008 }, { "epoch": 4.061522642560302, "grad_norm": 6.61165714263916, "learning_rate": 8.443585778618618e-06, "loss": 2.4090763092041017, "memory(GiB)": 77.56, "step": 94800, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.438009 }, { "epoch": 4.06173685788955, "grad_norm": 4.864951133728027, "learning_rate": 8.439843857959074e-06, "loss": 2.5200191497802735, "memory(GiB)": 77.56, "step": 94805, "token_acc": 0.4803921568627451, "train_speed(iter/s)": 1.437995 }, { "epoch": 4.0619510732188, "grad_norm": 8.043155670166016, "learning_rate": 8.436102690200737e-06, "loss": 2.285897445678711, "memory(GiB)": 77.56, "step": 94810, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.437988 }, { "epoch": 4.062165288548049, "grad_norm": 5.587471961975098, "learning_rate": 8.432362275411392e-06, "loss": 2.5362762451171874, "memory(GiB)": 77.56, "step": 94815, "token_acc": 0.46407185628742514, "train_speed(iter/s)": 1.438002 }, { "epoch": 4.062379503877297, "grad_norm": 6.805018901824951, "learning_rate": 8.428622613658788e-06, "loss": 2.217032623291016, "memory(GiB)": 77.56, "step": 94820, "token_acc": 0.5048543689320388, "train_speed(iter/s)": 1.438021 }, { "epoch": 4.062593719206546, "grad_norm": 9.164910316467285, "learning_rate": 8.424883705010689e-06, "loss": 2.153001403808594, "memory(GiB)": 77.56, "step": 94825, "token_acc": 0.5323529411764706, "train_speed(iter/s)": 1.438033 }, { "epoch": 4.062807934535796, "grad_norm": 5.749707221984863, "learning_rate": 8.421145549534826e-06, "loss": 2.01830997467041, "memory(GiB)": 77.56, "step": 94830, "token_acc": 0.5645161290322581, "train_speed(iter/s)": 1.43805 }, { "epoch": 4.063022149865044, "grad_norm": 5.759202003479004, "learning_rate": 8.417408147298916e-06, "loss": 2.1515344619750976, "memory(GiB)": 77.56, "step": 94835, "token_acc": 0.5316455696202531, "train_speed(iter/s)": 1.438026 }, { "epoch": 4.063236365194293, "grad_norm": 6.5611772537231445, "learning_rate": 8.413671498370667e-06, "loss": 2.1368858337402346, "memory(GiB)": 77.56, "step": 94840, "token_acc": 0.5470383275261324, "train_speed(iter/s)": 1.438019 }, { "epoch": 4.063450580523543, "grad_norm": 4.8147292137146, "learning_rate": 8.409935602817764e-06, "loss": 2.245065689086914, "memory(GiB)": 77.56, "step": 94845, "token_acc": 0.5323076923076923, "train_speed(iter/s)": 1.438031 }, { "epoch": 4.063664795852791, "grad_norm": 6.519495964050293, "learning_rate": 8.406200460707885e-06, "loss": 2.3183284759521485, "memory(GiB)": 77.56, "step": 94850, "token_acc": 0.5242718446601942, "train_speed(iter/s)": 1.438026 }, { "epoch": 4.06387901118204, "grad_norm": 4.928180694580078, "learning_rate": 8.402466072108716e-06, "loss": 2.3318574905395506, "memory(GiB)": 77.56, "step": 94855, "token_acc": 0.4970588235294118, "train_speed(iter/s)": 1.438017 }, { "epoch": 4.0640932265112895, "grad_norm": 6.480074405670166, "learning_rate": 8.398732437087892e-06, "loss": 2.319177436828613, "memory(GiB)": 77.56, "step": 94860, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.438027 }, { "epoch": 4.064307441840538, "grad_norm": 10.078136444091797, "learning_rate": 8.394999555713045e-06, "loss": 2.37449951171875, "memory(GiB)": 77.56, "step": 94865, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.438043 }, { "epoch": 4.064521657169787, "grad_norm": 5.725994110107422, "learning_rate": 8.391267428051819e-06, "loss": 2.318703842163086, "memory(GiB)": 77.56, "step": 94870, "token_acc": 0.5105105105105106, "train_speed(iter/s)": 1.438055 }, { "epoch": 4.064735872499036, "grad_norm": 8.054520606994629, "learning_rate": 8.387536054171818e-06, "loss": 2.2320878982543944, "memory(GiB)": 77.56, "step": 94875, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 1.438065 }, { "epoch": 4.064950087828285, "grad_norm": 8.072264671325684, "learning_rate": 8.383805434140623e-06, "loss": 2.4040828704833985, "memory(GiB)": 77.56, "step": 94880, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.438064 }, { "epoch": 4.065164303157534, "grad_norm": 5.176023960113525, "learning_rate": 8.380075568025847e-06, "loss": 2.440072250366211, "memory(GiB)": 77.56, "step": 94885, "token_acc": 0.46440677966101696, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.065378518486783, "grad_norm": 6.106320381164551, "learning_rate": 8.376346455895046e-06, "loss": 2.0221248626708985, "memory(GiB)": 77.56, "step": 94890, "token_acc": 0.5494505494505495, "train_speed(iter/s)": 1.438072 }, { "epoch": 4.065592733816032, "grad_norm": 7.271626949310303, "learning_rate": 8.37261809781577e-06, "loss": 2.5189109802246095, "memory(GiB)": 77.56, "step": 94895, "token_acc": 0.46567164179104475, "train_speed(iter/s)": 1.438076 }, { "epoch": 4.065806949145281, "grad_norm": 4.877325057983398, "learning_rate": 8.368890493855568e-06, "loss": 2.062717819213867, "memory(GiB)": 77.56, "step": 94900, "token_acc": 0.5578231292517006, "train_speed(iter/s)": 1.438055 }, { "epoch": 4.06602116447453, "grad_norm": 7.08648157119751, "learning_rate": 8.365163644081969e-06, "loss": 2.3857030868530273, "memory(GiB)": 77.56, "step": 94905, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.43805 }, { "epoch": 4.0662353798037785, "grad_norm": 8.799745559692383, "learning_rate": 8.361437548562478e-06, "loss": 2.10213623046875, "memory(GiB)": 77.56, "step": 94910, "token_acc": 0.5481481481481482, "train_speed(iter/s)": 1.438072 }, { "epoch": 4.066449595133028, "grad_norm": 6.067258834838867, "learning_rate": 8.357712207364615e-06, "loss": 2.6216259002685547, "memory(GiB)": 77.56, "step": 94915, "token_acc": 0.4670846394984326, "train_speed(iter/s)": 1.438087 }, { "epoch": 4.066663810462277, "grad_norm": 5.18054723739624, "learning_rate": 8.353987620555859e-06, "loss": 2.0616338729858397, "memory(GiB)": 77.56, "step": 94920, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.438096 }, { "epoch": 4.066878025791525, "grad_norm": 12.234222412109375, "learning_rate": 8.350263788203682e-06, "loss": 2.207724189758301, "memory(GiB)": 77.56, "step": 94925, "token_acc": 0.5517241379310345, "train_speed(iter/s)": 1.438092 }, { "epoch": 4.067092241120775, "grad_norm": 7.175808906555176, "learning_rate": 8.34654071037555e-06, "loss": 2.473559761047363, "memory(GiB)": 77.56, "step": 94930, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.438111 }, { "epoch": 4.067306456450024, "grad_norm": 7.485994338989258, "learning_rate": 8.342818387138895e-06, "loss": 2.137923240661621, "memory(GiB)": 77.56, "step": 94935, "token_acc": 0.5397350993377483, "train_speed(iter/s)": 1.438097 }, { "epoch": 4.067520671779272, "grad_norm": 5.164398670196533, "learning_rate": 8.339096818561165e-06, "loss": 2.4431785583496093, "memory(GiB)": 77.56, "step": 94940, "token_acc": 0.5162337662337663, "train_speed(iter/s)": 1.43809 }, { "epoch": 4.0677348871085215, "grad_norm": 5.50698184967041, "learning_rate": 8.335376004709784e-06, "loss": 2.2709802627563476, "memory(GiB)": 77.56, "step": 94945, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.438077 }, { "epoch": 4.067949102437771, "grad_norm": 8.534979820251465, "learning_rate": 8.331655945652155e-06, "loss": 2.074360466003418, "memory(GiB)": 77.56, "step": 94950, "token_acc": 0.52046783625731, "train_speed(iter/s)": 1.438089 }, { "epoch": 4.068163317767019, "grad_norm": 8.147920608520508, "learning_rate": 8.327936641455663e-06, "loss": 2.441698455810547, "memory(GiB)": 77.56, "step": 94955, "token_acc": 0.5148936170212766, "train_speed(iter/s)": 1.438096 }, { "epoch": 4.068377533096268, "grad_norm": 6.128410339355469, "learning_rate": 8.324218092187696e-06, "loss": 2.061117935180664, "memory(GiB)": 77.56, "step": 94960, "token_acc": 0.582995951417004, "train_speed(iter/s)": 1.438083 }, { "epoch": 4.068591748425518, "grad_norm": 5.585254669189453, "learning_rate": 8.320500297915595e-06, "loss": 2.482101821899414, "memory(GiB)": 77.56, "step": 94965, "token_acc": 0.4716312056737589, "train_speed(iter/s)": 1.438076 }, { "epoch": 4.068805963754766, "grad_norm": 5.491774082183838, "learning_rate": 8.316783258706745e-06, "loss": 2.174637222290039, "memory(GiB)": 77.56, "step": 94970, "token_acc": 0.5292207792207793, "train_speed(iter/s)": 1.438066 }, { "epoch": 4.069020179084015, "grad_norm": 7.608911991119385, "learning_rate": 8.31306697462847e-06, "loss": 2.291824150085449, "memory(GiB)": 77.56, "step": 94975, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.43807 }, { "epoch": 4.0692343944132645, "grad_norm": 6.652978420257568, "learning_rate": 8.309351445748086e-06, "loss": 2.495310592651367, "memory(GiB)": 77.56, "step": 94980, "token_acc": 0.5034013605442177, "train_speed(iter/s)": 1.438083 }, { "epoch": 4.069448609742513, "grad_norm": 4.828470706939697, "learning_rate": 8.305636672132916e-06, "loss": 2.0697349548339843, "memory(GiB)": 77.56, "step": 94985, "token_acc": 0.562962962962963, "train_speed(iter/s)": 1.438089 }, { "epoch": 4.069662825071762, "grad_norm": 6.481742858886719, "learning_rate": 8.301922653850247e-06, "loss": 2.1012456893920897, "memory(GiB)": 77.56, "step": 94990, "token_acc": 0.5328467153284672, "train_speed(iter/s)": 1.438105 }, { "epoch": 4.069877040401011, "grad_norm": 6.6270246505737305, "learning_rate": 8.298209390967355e-06, "loss": 2.248515319824219, "memory(GiB)": 77.56, "step": 94995, "token_acc": 0.5282258064516129, "train_speed(iter/s)": 1.438102 }, { "epoch": 4.07009125573026, "grad_norm": 5.540470123291016, "learning_rate": 8.294496883551528e-06, "loss": 2.281123161315918, "memory(GiB)": 77.56, "step": 95000, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 1.438103 }, { "epoch": 4.07009125573026, "eval_loss": 2.3116610050201416, "eval_runtime": 14.0226, "eval_samples_per_second": 7.131, "eval_steps_per_second": 7.131, "eval_token_acc": 0.44823232323232326, "step": 95000 }, { "epoch": 4.070305471059509, "grad_norm": 5.085848331451416, "learning_rate": 8.290785131670014e-06, "loss": 2.211634635925293, "memory(GiB)": 77.56, "step": 95005, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.43778 }, { "epoch": 4.070519686388758, "grad_norm": 6.199611186981201, "learning_rate": 8.287074135390061e-06, "loss": 2.1248369216918945, "memory(GiB)": 77.56, "step": 95010, "token_acc": 0.5701357466063348, "train_speed(iter/s)": 1.437784 }, { "epoch": 4.070733901718007, "grad_norm": 7.530384063720703, "learning_rate": 8.283363894778868e-06, "loss": 2.291860580444336, "memory(GiB)": 77.56, "step": 95015, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.437786 }, { "epoch": 4.070948117047256, "grad_norm": 5.3922576904296875, "learning_rate": 8.279654409903687e-06, "loss": 2.3500675201416015, "memory(GiB)": 77.56, "step": 95020, "token_acc": 0.508833922261484, "train_speed(iter/s)": 1.437787 }, { "epoch": 4.071162332376505, "grad_norm": 4.7777557373046875, "learning_rate": 8.275945680831693e-06, "loss": 2.093250846862793, "memory(GiB)": 77.56, "step": 95025, "token_acc": 0.5708502024291497, "train_speed(iter/s)": 1.43778 }, { "epoch": 4.0713765477057535, "grad_norm": 5.861032485961914, "learning_rate": 8.272237707630092e-06, "loss": 2.6367023468017576, "memory(GiB)": 77.56, "step": 95030, "token_acc": 0.4919093851132686, "train_speed(iter/s)": 1.437769 }, { "epoch": 4.071590763035003, "grad_norm": 5.4793877601623535, "learning_rate": 8.26853049036605e-06, "loss": 2.145457458496094, "memory(GiB)": 77.56, "step": 95035, "token_acc": 0.5575539568345323, "train_speed(iter/s)": 1.43777 }, { "epoch": 4.071804978364252, "grad_norm": 8.698028564453125, "learning_rate": 8.264824029106727e-06, "loss": 2.360030746459961, "memory(GiB)": 77.56, "step": 95040, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 1.437766 }, { "epoch": 4.0720191936935, "grad_norm": 6.546703815460205, "learning_rate": 8.261118323919265e-06, "loss": 2.737155532836914, "memory(GiB)": 77.56, "step": 95045, "token_acc": 0.4881889763779528, "train_speed(iter/s)": 1.437757 }, { "epoch": 4.07223340902275, "grad_norm": 8.735042572021484, "learning_rate": 8.257413374870804e-06, "loss": 2.4486717224121093, "memory(GiB)": 77.56, "step": 95050, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.437774 }, { "epoch": 4.072447624351999, "grad_norm": 7.635506629943848, "learning_rate": 8.25370918202844e-06, "loss": 2.271725654602051, "memory(GiB)": 77.56, "step": 95055, "token_acc": 0.5643939393939394, "train_speed(iter/s)": 1.437795 }, { "epoch": 4.072661839681247, "grad_norm": 6.147125244140625, "learning_rate": 8.25000574545931e-06, "loss": 2.3958587646484375, "memory(GiB)": 77.56, "step": 95060, "token_acc": 0.5033333333333333, "train_speed(iter/s)": 1.437809 }, { "epoch": 4.0728760550104965, "grad_norm": 7.892317295074463, "learning_rate": 8.246303065230488e-06, "loss": 2.2090301513671875, "memory(GiB)": 77.56, "step": 95065, "token_acc": 0.5358490566037736, "train_speed(iter/s)": 1.437815 }, { "epoch": 4.073090270339746, "grad_norm": 6.296213626861572, "learning_rate": 8.242601141409056e-06, "loss": 2.2790971755981446, "memory(GiB)": 77.56, "step": 95070, "token_acc": 0.532608695652174, "train_speed(iter/s)": 1.437835 }, { "epoch": 4.073304485668994, "grad_norm": 5.616121768951416, "learning_rate": 8.238899974062076e-06, "loss": 2.07140998840332, "memory(GiB)": 77.56, "step": 95075, "token_acc": 0.5268456375838926, "train_speed(iter/s)": 1.43783 }, { "epoch": 4.073518700998243, "grad_norm": 6.632471084594727, "learning_rate": 8.235199563256585e-06, "loss": 2.316982460021973, "memory(GiB)": 77.56, "step": 95080, "token_acc": 0.49836065573770494, "train_speed(iter/s)": 1.437819 }, { "epoch": 4.073732916327493, "grad_norm": 6.675464630126953, "learning_rate": 8.231499909059642e-06, "loss": 2.4138263702392577, "memory(GiB)": 77.56, "step": 95085, "token_acc": 0.47019867549668876, "train_speed(iter/s)": 1.437834 }, { "epoch": 4.073947131656741, "grad_norm": 5.686821460723877, "learning_rate": 8.227801011538256e-06, "loss": 2.0893932342529298, "memory(GiB)": 77.56, "step": 95090, "token_acc": 0.5178571428571429, "train_speed(iter/s)": 1.437832 }, { "epoch": 4.07416134698599, "grad_norm": 7.414421081542969, "learning_rate": 8.224102870759448e-06, "loss": 2.199736785888672, "memory(GiB)": 77.56, "step": 95095, "token_acc": 0.5283018867924528, "train_speed(iter/s)": 1.437835 }, { "epoch": 4.07437556231524, "grad_norm": 7.159857273101807, "learning_rate": 8.2204054867902e-06, "loss": 2.5327426910400392, "memory(GiB)": 77.56, "step": 95100, "token_acc": 0.48135593220338985, "train_speed(iter/s)": 1.437841 }, { "epoch": 4.074589777644488, "grad_norm": 7.560205459594727, "learning_rate": 8.216708859697508e-06, "loss": 2.262774658203125, "memory(GiB)": 77.56, "step": 95105, "token_acc": 0.49056603773584906, "train_speed(iter/s)": 1.437824 }, { "epoch": 4.074803992973737, "grad_norm": 4.729887962341309, "learning_rate": 8.213012989548313e-06, "loss": 2.0913330078125, "memory(GiB)": 77.56, "step": 95110, "token_acc": 0.5749128919860628, "train_speed(iter/s)": 1.437839 }, { "epoch": 4.0750182083029864, "grad_norm": 5.275120735168457, "learning_rate": 8.209317876409601e-06, "loss": 2.399083709716797, "memory(GiB)": 77.56, "step": 95115, "token_acc": 0.5071942446043165, "train_speed(iter/s)": 1.437854 }, { "epoch": 4.075232423632235, "grad_norm": 5.846557140350342, "learning_rate": 8.205623520348299e-06, "loss": 2.39022216796875, "memory(GiB)": 77.56, "step": 95120, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 1.437867 }, { "epoch": 4.075446638961484, "grad_norm": 8.769886016845703, "learning_rate": 8.201929921431334e-06, "loss": 2.584098052978516, "memory(GiB)": 77.56, "step": 95125, "token_acc": 0.43956043956043955, "train_speed(iter/s)": 1.437883 }, { "epoch": 4.075660854290733, "grad_norm": 6.7576212882995605, "learning_rate": 8.198237079725613e-06, "loss": 2.396385383605957, "memory(GiB)": 77.56, "step": 95130, "token_acc": 0.5303030303030303, "train_speed(iter/s)": 1.437884 }, { "epoch": 4.075875069619982, "grad_norm": 5.972720146179199, "learning_rate": 8.194544995298048e-06, "loss": 2.2690196990966798, "memory(GiB)": 77.56, "step": 95135, "token_acc": 0.5469255663430421, "train_speed(iter/s)": 1.437879 }, { "epoch": 4.076089284949231, "grad_norm": 7.026678085327148, "learning_rate": 8.190853668215498e-06, "loss": 2.2759994506835937, "memory(GiB)": 77.56, "step": 95140, "token_acc": 0.5424354243542435, "train_speed(iter/s)": 1.437874 }, { "epoch": 4.07630350027848, "grad_norm": 7.014134883880615, "learning_rate": 8.18716309854487e-06, "loss": 2.3330326080322266, "memory(GiB)": 77.56, "step": 95145, "token_acc": 0.5, "train_speed(iter/s)": 1.437891 }, { "epoch": 4.076517715607729, "grad_norm": 7.18118143081665, "learning_rate": 8.183473286353005e-06, "loss": 2.4699195861816405, "memory(GiB)": 77.56, "step": 95150, "token_acc": 0.46534653465346537, "train_speed(iter/s)": 1.43791 }, { "epoch": 4.076731930936978, "grad_norm": 6.186906814575195, "learning_rate": 8.179784231706749e-06, "loss": 2.1483314514160154, "memory(GiB)": 77.56, "step": 95155, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.437905 }, { "epoch": 4.076946146266227, "grad_norm": 5.541762828826904, "learning_rate": 8.176095934672933e-06, "loss": 2.1984859466552735, "memory(GiB)": 77.56, "step": 95160, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 1.437929 }, { "epoch": 4.0771603615954755, "grad_norm": 5.22767448425293, "learning_rate": 8.172408395318359e-06, "loss": 2.2991353988647463, "memory(GiB)": 77.56, "step": 95165, "token_acc": 0.5207547169811321, "train_speed(iter/s)": 1.437926 }, { "epoch": 4.077374576924725, "grad_norm": 6.797808647155762, "learning_rate": 8.168721613709846e-06, "loss": 2.3800064086914063, "memory(GiB)": 77.56, "step": 95170, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.437926 }, { "epoch": 4.077588792253974, "grad_norm": 6.5170578956604, "learning_rate": 8.165035589914193e-06, "loss": 2.2007110595703123, "memory(GiB)": 77.56, "step": 95175, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 1.43794 }, { "epoch": 4.077803007583222, "grad_norm": 6.947213172912598, "learning_rate": 8.161350323998169e-06, "loss": 2.13214168548584, "memory(GiB)": 77.56, "step": 95180, "token_acc": 0.5136186770428015, "train_speed(iter/s)": 1.437945 }, { "epoch": 4.078017222912472, "grad_norm": 7.823688507080078, "learning_rate": 8.157665816028525e-06, "loss": 2.567250061035156, "memory(GiB)": 77.56, "step": 95185, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.437949 }, { "epoch": 4.078231438241721, "grad_norm": 5.183547019958496, "learning_rate": 8.153982066072018e-06, "loss": 2.184250068664551, "memory(GiB)": 77.56, "step": 95190, "token_acc": 0.5348837209302325, "train_speed(iter/s)": 1.437942 }, { "epoch": 4.078445653570969, "grad_norm": 6.618625164031982, "learning_rate": 8.150299074195383e-06, "loss": 2.26678352355957, "memory(GiB)": 77.56, "step": 95195, "token_acc": 0.5665399239543726, "train_speed(iter/s)": 1.437958 }, { "epoch": 4.0786598689002185, "grad_norm": 5.132879734039307, "learning_rate": 8.146616840465326e-06, "loss": 2.4960214614868166, "memory(GiB)": 77.56, "step": 95200, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.437952 }, { "epoch": 4.078874084229468, "grad_norm": 7.099595069885254, "learning_rate": 8.142935364948579e-06, "loss": 2.251189041137695, "memory(GiB)": 77.56, "step": 95205, "token_acc": 0.5431034482758621, "train_speed(iter/s)": 1.437951 }, { "epoch": 4.079088299558716, "grad_norm": 5.392854690551758, "learning_rate": 8.13925464771182e-06, "loss": 2.2247493743896483, "memory(GiB)": 77.56, "step": 95210, "token_acc": 0.51953125, "train_speed(iter/s)": 1.43796 }, { "epoch": 4.079302514887965, "grad_norm": 6.9510297775268555, "learning_rate": 8.135574688821734e-06, "loss": 2.124391555786133, "memory(GiB)": 77.56, "step": 95215, "token_acc": 0.4936708860759494, "train_speed(iter/s)": 1.437965 }, { "epoch": 4.079516730217215, "grad_norm": 6.948002338409424, "learning_rate": 8.131895488344981e-06, "loss": 2.3184295654296876, "memory(GiB)": 77.56, "step": 95220, "token_acc": 0.5, "train_speed(iter/s)": 1.437986 }, { "epoch": 4.079730945546463, "grad_norm": 6.519289493560791, "learning_rate": 8.128217046348207e-06, "loss": 2.0918804168701173, "memory(GiB)": 77.56, "step": 95225, "token_acc": 0.5, "train_speed(iter/s)": 1.437995 }, { "epoch": 4.079945160875712, "grad_norm": 5.949084758758545, "learning_rate": 8.124539362898076e-06, "loss": 2.291040802001953, "memory(GiB)": 77.56, "step": 95230, "token_acc": 0.5723905723905723, "train_speed(iter/s)": 1.438 }, { "epoch": 4.0801593762049615, "grad_norm": 6.39797306060791, "learning_rate": 8.120862438061183e-06, "loss": 2.545256996154785, "memory(GiB)": 77.56, "step": 95235, "token_acc": 0.4570446735395189, "train_speed(iter/s)": 1.438013 }, { "epoch": 4.08037359153421, "grad_norm": 7.161879539489746, "learning_rate": 8.117186271904158e-06, "loss": 2.200242042541504, "memory(GiB)": 77.56, "step": 95240, "token_acc": 0.5627118644067797, "train_speed(iter/s)": 1.43803 }, { "epoch": 4.080587806863459, "grad_norm": 6.7964582443237305, "learning_rate": 8.113510864493602e-06, "loss": 2.7155387878417967, "memory(GiB)": 77.56, "step": 95245, "token_acc": 0.47474747474747475, "train_speed(iter/s)": 1.438041 }, { "epoch": 4.080802022192708, "grad_norm": 7.678805351257324, "learning_rate": 8.109836215896083e-06, "loss": 2.2545888900756834, "memory(GiB)": 77.56, "step": 95250, "token_acc": 0.5177865612648221, "train_speed(iter/s)": 1.438045 }, { "epoch": 4.081016237521957, "grad_norm": 5.471217155456543, "learning_rate": 8.106162326178168e-06, "loss": 2.272845458984375, "memory(GiB)": 77.56, "step": 95255, "token_acc": 0.5429553264604811, "train_speed(iter/s)": 1.438054 }, { "epoch": 4.081230452851206, "grad_norm": 6.315338611602783, "learning_rate": 8.102489195406432e-06, "loss": 2.491302490234375, "memory(GiB)": 77.56, "step": 95260, "token_acc": 0.5350553505535055, "train_speed(iter/s)": 1.438044 }, { "epoch": 4.081444668180455, "grad_norm": 5.879712104797363, "learning_rate": 8.09881682364741e-06, "loss": 2.1852466583251955, "memory(GiB)": 77.56, "step": 95265, "token_acc": 0.532319391634981, "train_speed(iter/s)": 1.438017 }, { "epoch": 4.081658883509704, "grad_norm": 7.742741584777832, "learning_rate": 8.095145210967624e-06, "loss": 2.003104400634766, "memory(GiB)": 77.56, "step": 95270, "token_acc": 0.5421245421245421, "train_speed(iter/s)": 1.438035 }, { "epoch": 4.081873098838953, "grad_norm": 5.597198963165283, "learning_rate": 8.091474357433598e-06, "loss": 2.4399093627929687, "memory(GiB)": 77.56, "step": 95275, "token_acc": 0.5043103448275862, "train_speed(iter/s)": 1.438047 }, { "epoch": 4.082087314168202, "grad_norm": 5.738831996917725, "learning_rate": 8.087804263111825e-06, "loss": 1.9840042114257812, "memory(GiB)": 77.56, "step": 95280, "token_acc": 0.5518394648829431, "train_speed(iter/s)": 1.438066 }, { "epoch": 4.0823015294974505, "grad_norm": 8.161035537719727, "learning_rate": 8.084134928068782e-06, "loss": 2.3944358825683594, "memory(GiB)": 77.56, "step": 95285, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.438077 }, { "epoch": 4.0825157448267, "grad_norm": 5.912302017211914, "learning_rate": 8.080466352370968e-06, "loss": 2.1009111404418945, "memory(GiB)": 77.56, "step": 95290, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438084 }, { "epoch": 4.082729960155949, "grad_norm": 6.509933948516846, "learning_rate": 8.076798536084828e-06, "loss": 2.1947967529296877, "memory(GiB)": 77.56, "step": 95295, "token_acc": 0.5368098159509203, "train_speed(iter/s)": 1.438084 }, { "epoch": 4.082944175485197, "grad_norm": 7.838466644287109, "learning_rate": 8.073131479276807e-06, "loss": 2.0913082122802735, "memory(GiB)": 77.56, "step": 95300, "token_acc": 0.5543478260869565, "train_speed(iter/s)": 1.438089 }, { "epoch": 4.083158390814447, "grad_norm": 6.435009479522705, "learning_rate": 8.069465182013347e-06, "loss": 2.442860984802246, "memory(GiB)": 77.56, "step": 95305, "token_acc": 0.5273972602739726, "train_speed(iter/s)": 1.438072 }, { "epoch": 4.083372606143696, "grad_norm": 5.454482555389404, "learning_rate": 8.065799644360844e-06, "loss": 2.46114501953125, "memory(GiB)": 77.56, "step": 95310, "token_acc": 0.48633879781420764, "train_speed(iter/s)": 1.438072 }, { "epoch": 4.083586821472944, "grad_norm": 5.948737621307373, "learning_rate": 8.062134866385717e-06, "loss": 2.3383861541748048, "memory(GiB)": 77.56, "step": 95315, "token_acc": 0.5221843003412969, "train_speed(iter/s)": 1.438078 }, { "epoch": 4.0838010368021935, "grad_norm": 7.446720123291016, "learning_rate": 8.05847084815437e-06, "loss": 2.4039730072021483, "memory(GiB)": 77.56, "step": 95320, "token_acc": 0.4754601226993865, "train_speed(iter/s)": 1.43807 }, { "epoch": 4.084015252131443, "grad_norm": 8.093862533569336, "learning_rate": 8.054807589733166e-06, "loss": 2.3856197357177735, "memory(GiB)": 77.56, "step": 95325, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.438064 }, { "epoch": 4.084229467460691, "grad_norm": 4.595720291137695, "learning_rate": 8.051145091188472e-06, "loss": 2.3056467056274412, "memory(GiB)": 77.56, "step": 95330, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 1.438067 }, { "epoch": 4.08444368278994, "grad_norm": 6.777374267578125, "learning_rate": 8.047483352586638e-06, "loss": 2.122158432006836, "memory(GiB)": 77.56, "step": 95335, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.438072 }, { "epoch": 4.08465789811919, "grad_norm": 5.51237154006958, "learning_rate": 8.043822373993997e-06, "loss": 2.2516345977783203, "memory(GiB)": 77.56, "step": 95340, "token_acc": 0.5633802816901409, "train_speed(iter/s)": 1.438068 }, { "epoch": 4.084872113448438, "grad_norm": 8.379335403442383, "learning_rate": 8.04016215547686e-06, "loss": 2.58554630279541, "memory(GiB)": 77.56, "step": 95345, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.438089 }, { "epoch": 4.085086328777687, "grad_norm": 6.422924041748047, "learning_rate": 8.036502697101555e-06, "loss": 2.3697158813476564, "memory(GiB)": 77.56, "step": 95350, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 1.438094 }, { "epoch": 4.0853005441069365, "grad_norm": 6.77065896987915, "learning_rate": 8.032843998934369e-06, "loss": 2.2962230682373046, "memory(GiB)": 77.56, "step": 95355, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.438105 }, { "epoch": 4.085514759436185, "grad_norm": 5.855922698974609, "learning_rate": 8.029186061041588e-06, "loss": 2.3217077255249023, "memory(GiB)": 77.56, "step": 95360, "token_acc": 0.49221183800623053, "train_speed(iter/s)": 1.438086 }, { "epoch": 4.085728974765434, "grad_norm": 7.765806198120117, "learning_rate": 8.025528883489468e-06, "loss": 2.5206104278564454, "memory(GiB)": 77.56, "step": 95365, "token_acc": 0.44966442953020136, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.085943190094683, "grad_norm": 9.420412063598633, "learning_rate": 8.02187246634426e-06, "loss": 2.0320037841796874, "memory(GiB)": 77.56, "step": 95370, "token_acc": 0.5256410256410257, "train_speed(iter/s)": 1.438086 }, { "epoch": 4.086157405423932, "grad_norm": 5.942191123962402, "learning_rate": 8.018216809672219e-06, "loss": 2.470602607727051, "memory(GiB)": 77.56, "step": 95375, "token_acc": 0.4811594202898551, "train_speed(iter/s)": 1.43809 }, { "epoch": 4.086371620753181, "grad_norm": 5.013623237609863, "learning_rate": 8.014561913539565e-06, "loss": 2.254461097717285, "memory(GiB)": 77.56, "step": 95380, "token_acc": 0.5, "train_speed(iter/s)": 1.438102 }, { "epoch": 4.08658583608243, "grad_norm": 5.06242036819458, "learning_rate": 8.010907778012494e-06, "loss": 2.2041616439819336, "memory(GiB)": 77.56, "step": 95385, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.438118 }, { "epoch": 4.086800051411679, "grad_norm": 4.987133502960205, "learning_rate": 8.007254403157233e-06, "loss": 2.111385726928711, "memory(GiB)": 77.56, "step": 95390, "token_acc": 0.5672131147540984, "train_speed(iter/s)": 1.438125 }, { "epoch": 4.087014266740928, "grad_norm": 6.158591270446777, "learning_rate": 8.003601789039944e-06, "loss": 2.337290573120117, "memory(GiB)": 77.56, "step": 95395, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.43814 }, { "epoch": 4.087228482070177, "grad_norm": 8.32453441619873, "learning_rate": 7.999949935726797e-06, "loss": 2.31163330078125, "memory(GiB)": 77.56, "step": 95400, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.438143 }, { "epoch": 4.087442697399426, "grad_norm": 7.872544765472412, "learning_rate": 7.996298843283967e-06, "loss": 2.3166744232177736, "memory(GiB)": 77.56, "step": 95405, "token_acc": 0.5276872964169381, "train_speed(iter/s)": 1.438148 }, { "epoch": 4.087656912728675, "grad_norm": 5.44281530380249, "learning_rate": 7.99264851177759e-06, "loss": 2.1454681396484374, "memory(GiB)": 77.56, "step": 95410, "token_acc": 0.5422535211267606, "train_speed(iter/s)": 1.438161 }, { "epoch": 4.087871128057924, "grad_norm": 5.412315845489502, "learning_rate": 7.988998941273784e-06, "loss": 2.164523124694824, "memory(GiB)": 77.56, "step": 95415, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 1.438161 }, { "epoch": 4.088085343387172, "grad_norm": 5.925240993499756, "learning_rate": 7.985350131838676e-06, "loss": 2.5320085525512694, "memory(GiB)": 77.56, "step": 95420, "token_acc": 0.48056537102473496, "train_speed(iter/s)": 1.43817 }, { "epoch": 4.088299558716422, "grad_norm": 9.179483413696289, "learning_rate": 7.981702083538368e-06, "loss": 2.29078369140625, "memory(GiB)": 77.56, "step": 95425, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 1.438184 }, { "epoch": 4.088513774045671, "grad_norm": 6.44094181060791, "learning_rate": 7.978054796438922e-06, "loss": 2.2283884048461915, "memory(GiB)": 77.56, "step": 95430, "token_acc": 0.5271317829457365, "train_speed(iter/s)": 1.438188 }, { "epoch": 4.088727989374919, "grad_norm": 5.382179260253906, "learning_rate": 7.974408270606448e-06, "loss": 2.0547115325927736, "memory(GiB)": 77.56, "step": 95435, "token_acc": 0.5709219858156028, "train_speed(iter/s)": 1.438188 }, { "epoch": 4.088942204704169, "grad_norm": 5.694599151611328, "learning_rate": 7.970762506106993e-06, "loss": 2.3554187774658204, "memory(GiB)": 77.56, "step": 95440, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.4382 }, { "epoch": 4.089156420033418, "grad_norm": 6.321893692016602, "learning_rate": 7.967117503006604e-06, "loss": 2.4103702545166015, "memory(GiB)": 77.56, "step": 95445, "token_acc": 0.4868913857677903, "train_speed(iter/s)": 1.438216 }, { "epoch": 4.089370635362666, "grad_norm": 6.720142364501953, "learning_rate": 7.963473261371307e-06, "loss": 2.3507190704345704, "memory(GiB)": 77.56, "step": 95450, "token_acc": 0.5314465408805031, "train_speed(iter/s)": 1.438202 }, { "epoch": 4.0895848506919155, "grad_norm": 7.978154182434082, "learning_rate": 7.959829781267114e-06, "loss": 2.2015880584716796, "memory(GiB)": 77.56, "step": 95455, "token_acc": 0.5072992700729927, "train_speed(iter/s)": 1.438195 }, { "epoch": 4.089799066021165, "grad_norm": 4.7595415115356445, "learning_rate": 7.956187062760045e-06, "loss": 2.0481498718261717, "memory(GiB)": 77.56, "step": 95460, "token_acc": 0.5399239543726235, "train_speed(iter/s)": 1.438206 }, { "epoch": 4.090013281350413, "grad_norm": 7.336482048034668, "learning_rate": 7.952545105916098e-06, "loss": 2.499127960205078, "memory(GiB)": 77.56, "step": 95465, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.438227 }, { "epoch": 4.090227496679662, "grad_norm": 6.1849365234375, "learning_rate": 7.948903910801236e-06, "loss": 2.1779363632202147, "memory(GiB)": 77.56, "step": 95470, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.438226 }, { "epoch": 4.090441712008912, "grad_norm": 5.000257968902588, "learning_rate": 7.945263477481434e-06, "loss": 2.1321340560913087, "memory(GiB)": 77.56, "step": 95475, "token_acc": 0.5507246376811594, "train_speed(iter/s)": 1.438216 }, { "epoch": 4.09065592733816, "grad_norm": 5.467508792877197, "learning_rate": 7.94162380602263e-06, "loss": 2.602534294128418, "memory(GiB)": 77.56, "step": 95480, "token_acc": 0.46464646464646464, "train_speed(iter/s)": 1.438223 }, { "epoch": 4.090870142667409, "grad_norm": 7.177868366241455, "learning_rate": 7.937984896490763e-06, "loss": 2.151432418823242, "memory(GiB)": 77.56, "step": 95485, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.438229 }, { "epoch": 4.0910843579966585, "grad_norm": 5.606530666351318, "learning_rate": 7.934346748951748e-06, "loss": 2.462385559082031, "memory(GiB)": 77.56, "step": 95490, "token_acc": 0.49843260188087773, "train_speed(iter/s)": 1.438235 }, { "epoch": 4.091298573325907, "grad_norm": 5.938510417938232, "learning_rate": 7.930709363471517e-06, "loss": 2.3911277770996096, "memory(GiB)": 77.56, "step": 95495, "token_acc": 0.49240121580547114, "train_speed(iter/s)": 1.438229 }, { "epoch": 4.091512788655156, "grad_norm": 5.3229217529296875, "learning_rate": 7.927072740115943e-06, "loss": 2.1502012252807616, "memory(GiB)": 77.56, "step": 95500, "token_acc": 0.5331230283911672, "train_speed(iter/s)": 1.438228 }, { "epoch": 4.091512788655156, "eval_loss": 2.348863124847412, "eval_runtime": 13.8788, "eval_samples_per_second": 7.205, "eval_steps_per_second": 7.205, "eval_token_acc": 0.4654498044328553, "step": 95500 }, { "epoch": 4.091727003984405, "grad_norm": 6.539514541625977, "learning_rate": 7.923436878950919e-06, "loss": 2.063608169555664, "memory(GiB)": 77.56, "step": 95505, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.437913 }, { "epoch": 4.091941219313654, "grad_norm": 6.420368194580078, "learning_rate": 7.919801780042307e-06, "loss": 2.0076530456542967, "memory(GiB)": 77.56, "step": 95510, "token_acc": 0.5300353356890459, "train_speed(iter/s)": 1.437926 }, { "epoch": 4.092155434642903, "grad_norm": 7.928882122039795, "learning_rate": 7.916167443455946e-06, "loss": 2.3196598052978517, "memory(GiB)": 77.56, "step": 95515, "token_acc": 0.5046153846153846, "train_speed(iter/s)": 1.437938 }, { "epoch": 4.092369649972152, "grad_norm": 5.978614807128906, "learning_rate": 7.912533869257704e-06, "loss": 2.494196128845215, "memory(GiB)": 77.56, "step": 95520, "token_acc": 0.4833948339483395, "train_speed(iter/s)": 1.43796 }, { "epoch": 4.092583865301401, "grad_norm": 5.221742630004883, "learning_rate": 7.90890105751339e-06, "loss": 2.2820737838745115, "memory(GiB)": 77.56, "step": 95525, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.43797 }, { "epoch": 4.09279808063065, "grad_norm": 6.007336139678955, "learning_rate": 7.905269008288807e-06, "loss": 2.6127859115600587, "memory(GiB)": 77.56, "step": 95530, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.43797 }, { "epoch": 4.093012295959899, "grad_norm": 7.333731651306152, "learning_rate": 7.901637721649774e-06, "loss": 2.227700424194336, "memory(GiB)": 77.56, "step": 95535, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.437976 }, { "epoch": 4.0932265112891475, "grad_norm": 5.782073020935059, "learning_rate": 7.898007197662066e-06, "loss": 2.4427804946899414, "memory(GiB)": 77.56, "step": 95540, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.437967 }, { "epoch": 4.093440726618397, "grad_norm": 5.386437892913818, "learning_rate": 7.894377436391443e-06, "loss": 2.1094970703125, "memory(GiB)": 77.56, "step": 95545, "token_acc": 0.5207667731629393, "train_speed(iter/s)": 1.43796 }, { "epoch": 4.093654941947646, "grad_norm": 6.629706859588623, "learning_rate": 7.890748437903677e-06, "loss": 2.4512868881225587, "memory(GiB)": 77.56, "step": 95550, "token_acc": 0.5171102661596958, "train_speed(iter/s)": 1.437943 }, { "epoch": 4.093869157276894, "grad_norm": 6.183679580688477, "learning_rate": 7.887120202264514e-06, "loss": 2.1981393814086916, "memory(GiB)": 77.56, "step": 95555, "token_acc": 0.49640287769784175, "train_speed(iter/s)": 1.43792 }, { "epoch": 4.094083372606144, "grad_norm": 8.030348777770996, "learning_rate": 7.883492729539665e-06, "loss": 2.1019378662109376, "memory(GiB)": 77.56, "step": 95560, "token_acc": 0.5289256198347108, "train_speed(iter/s)": 1.437935 }, { "epoch": 4.094297587935393, "grad_norm": 5.193393707275391, "learning_rate": 7.879866019794858e-06, "loss": 2.3798805236816407, "memory(GiB)": 77.56, "step": 95565, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.437952 }, { "epoch": 4.094511803264641, "grad_norm": 8.052562713623047, "learning_rate": 7.876240073095793e-06, "loss": 2.3297948837280273, "memory(GiB)": 77.56, "step": 95570, "token_acc": 0.5060606060606061, "train_speed(iter/s)": 1.437971 }, { "epoch": 4.0947260185938905, "grad_norm": 6.5791401863098145, "learning_rate": 7.872614889508134e-06, "loss": 2.3634870529174803, "memory(GiB)": 77.56, "step": 95575, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.437967 }, { "epoch": 4.09494023392314, "grad_norm": 5.681127548217773, "learning_rate": 7.868990469097593e-06, "loss": 2.1815195083618164, "memory(GiB)": 77.56, "step": 95580, "token_acc": 0.5306859205776173, "train_speed(iter/s)": 1.437956 }, { "epoch": 4.095154449252388, "grad_norm": 6.2353925704956055, "learning_rate": 7.86536681192981e-06, "loss": 2.1699281692504884, "memory(GiB)": 77.56, "step": 95585, "token_acc": 0.5482758620689655, "train_speed(iter/s)": 1.437968 }, { "epoch": 4.095368664581637, "grad_norm": 7.309560298919678, "learning_rate": 7.861743918070435e-06, "loss": 2.4608787536621093, "memory(GiB)": 77.56, "step": 95590, "token_acc": 0.49258160237388726, "train_speed(iter/s)": 1.43798 }, { "epoch": 4.095582879910887, "grad_norm": 7.188143730163574, "learning_rate": 7.858121787585093e-06, "loss": 1.9250286102294922, "memory(GiB)": 77.56, "step": 95595, "token_acc": 0.588, "train_speed(iter/s)": 1.437992 }, { "epoch": 4.095797095240135, "grad_norm": 6.157402515411377, "learning_rate": 7.854500420539401e-06, "loss": 2.354156494140625, "memory(GiB)": 77.56, "step": 95600, "token_acc": 0.5194805194805194, "train_speed(iter/s)": 1.438002 }, { "epoch": 4.096011310569384, "grad_norm": 7.745563507080078, "learning_rate": 7.850879816998962e-06, "loss": 2.1147571563720704, "memory(GiB)": 77.56, "step": 95605, "token_acc": 0.5211864406779662, "train_speed(iter/s)": 1.438012 }, { "epoch": 4.0962255258986335, "grad_norm": 6.6154704093933105, "learning_rate": 7.847259977029392e-06, "loss": 2.508718490600586, "memory(GiB)": 77.56, "step": 95610, "token_acc": 0.4430769230769231, "train_speed(iter/s)": 1.437996 }, { "epoch": 4.096439741227882, "grad_norm": 6.540365219116211, "learning_rate": 7.843640900696247e-06, "loss": 1.9651012420654297, "memory(GiB)": 77.56, "step": 95615, "token_acc": 0.581081081081081, "train_speed(iter/s)": 1.43798 }, { "epoch": 4.096653956557131, "grad_norm": 4.798427104949951, "learning_rate": 7.840022588065098e-06, "loss": 2.5056270599365233, "memory(GiB)": 77.56, "step": 95620, "token_acc": 0.47572815533980584, "train_speed(iter/s)": 1.437989 }, { "epoch": 4.09686817188638, "grad_norm": 6.073946475982666, "learning_rate": 7.836405039201483e-06, "loss": 2.686945343017578, "memory(GiB)": 77.56, "step": 95625, "token_acc": 0.4906832298136646, "train_speed(iter/s)": 1.437987 }, { "epoch": 4.097082387215629, "grad_norm": 5.64340877532959, "learning_rate": 7.832788254170948e-06, "loss": 2.186273956298828, "memory(GiB)": 77.56, "step": 95630, "token_acc": 0.5478260869565217, "train_speed(iter/s)": 1.437995 }, { "epoch": 4.097296602544878, "grad_norm": 9.057547569274902, "learning_rate": 7.829172233038996e-06, "loss": 2.23011474609375, "memory(GiB)": 77.56, "step": 95635, "token_acc": 0.5331230283911672, "train_speed(iter/s)": 1.438001 }, { "epoch": 4.097510817874127, "grad_norm": 4.938108444213867, "learning_rate": 7.825556975871156e-06, "loss": 2.4074224472045898, "memory(GiB)": 77.56, "step": 95640, "token_acc": 0.513677811550152, "train_speed(iter/s)": 1.437999 }, { "epoch": 4.097725033203376, "grad_norm": 6.268854141235352, "learning_rate": 7.821942482732918e-06, "loss": 2.087177276611328, "memory(GiB)": 77.56, "step": 95645, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 1.437996 }, { "epoch": 4.097939248532625, "grad_norm": 5.036936283111572, "learning_rate": 7.818328753689763e-06, "loss": 2.321675109863281, "memory(GiB)": 77.56, "step": 95650, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.437987 }, { "epoch": 4.098153463861874, "grad_norm": 7.010135173797607, "learning_rate": 7.814715788807148e-06, "loss": 2.2630308151245115, "memory(GiB)": 77.56, "step": 95655, "token_acc": 0.5158227848101266, "train_speed(iter/s)": 1.437983 }, { "epoch": 4.0983676791911225, "grad_norm": 7.799376964569092, "learning_rate": 7.811103588150514e-06, "loss": 2.4632816314697266, "memory(GiB)": 77.56, "step": 95660, "token_acc": 0.4613003095975232, "train_speed(iter/s)": 1.437979 }, { "epoch": 4.098581894520372, "grad_norm": 6.319108963012695, "learning_rate": 7.807492151785334e-06, "loss": 2.0347557067871094, "memory(GiB)": 77.56, "step": 95665, "token_acc": 0.5224489795918368, "train_speed(iter/s)": 1.438003 }, { "epoch": 4.098796109849621, "grad_norm": 5.453060626983643, "learning_rate": 7.803881479777008e-06, "loss": 2.3898683547973634, "memory(GiB)": 77.56, "step": 95670, "token_acc": 0.495114006514658, "train_speed(iter/s)": 1.437994 }, { "epoch": 4.099010325178869, "grad_norm": 6.550273418426514, "learning_rate": 7.800271572190954e-06, "loss": 2.3743898391723635, "memory(GiB)": 77.56, "step": 95675, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 1.437999 }, { "epoch": 4.099224540508119, "grad_norm": 6.600028991699219, "learning_rate": 7.796662429092555e-06, "loss": 2.088513946533203, "memory(GiB)": 77.56, "step": 95680, "token_acc": 0.5443037974683544, "train_speed(iter/s)": 1.438018 }, { "epoch": 4.099438755837368, "grad_norm": 5.347016334533691, "learning_rate": 7.793054050547215e-06, "loss": 2.2152591705322267, "memory(GiB)": 77.56, "step": 95685, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.438031 }, { "epoch": 4.099652971166616, "grad_norm": 5.001875877380371, "learning_rate": 7.78944643662029e-06, "loss": 2.079903221130371, "memory(GiB)": 77.56, "step": 95690, "token_acc": 0.532258064516129, "train_speed(iter/s)": 1.438043 }, { "epoch": 4.099867186495866, "grad_norm": 6.1742682456970215, "learning_rate": 7.785839587377141e-06, "loss": 2.180209922790527, "memory(GiB)": 77.56, "step": 95695, "token_acc": 0.5306859205776173, "train_speed(iter/s)": 1.43804 }, { "epoch": 4.100081401825115, "grad_norm": 7.490435600280762, "learning_rate": 7.782233502883119e-06, "loss": 2.44027042388916, "memory(GiB)": 77.56, "step": 95700, "token_acc": 0.4416961130742049, "train_speed(iter/s)": 1.438058 }, { "epoch": 4.100295617154363, "grad_norm": 5.422607898712158, "learning_rate": 7.778628183203535e-06, "loss": 2.24871826171875, "memory(GiB)": 77.56, "step": 95705, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 1.438067 }, { "epoch": 4.1005098324836124, "grad_norm": 5.190501689910889, "learning_rate": 7.775023628403705e-06, "loss": 2.4379295349121093, "memory(GiB)": 77.56, "step": 95710, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.100724047812862, "grad_norm": 5.1958723068237305, "learning_rate": 7.771419838548938e-06, "loss": 2.0504478454589843, "memory(GiB)": 77.56, "step": 95715, "token_acc": 0.5514705882352942, "train_speed(iter/s)": 1.438079 }, { "epoch": 4.10093826314211, "grad_norm": 5.958120346069336, "learning_rate": 7.767816813704499e-06, "loss": 2.1624988555908202, "memory(GiB)": 77.56, "step": 95720, "token_acc": 0.4961832061068702, "train_speed(iter/s)": 1.438096 }, { "epoch": 4.101152478471359, "grad_norm": 5.108565807342529, "learning_rate": 7.76421455393569e-06, "loss": 2.1604690551757812, "memory(GiB)": 77.56, "step": 95725, "token_acc": 0.5089285714285714, "train_speed(iter/s)": 1.438108 }, { "epoch": 4.101366693800609, "grad_norm": 6.5323286056518555, "learning_rate": 7.760613059307748e-06, "loss": 2.244904708862305, "memory(GiB)": 77.56, "step": 95730, "token_acc": 0.5447761194029851, "train_speed(iter/s)": 1.438131 }, { "epoch": 4.101580909129857, "grad_norm": 5.427577018737793, "learning_rate": 7.757012329885933e-06, "loss": 2.2815195083618165, "memory(GiB)": 77.56, "step": 95735, "token_acc": 0.5239520958083832, "train_speed(iter/s)": 1.438148 }, { "epoch": 4.101795124459106, "grad_norm": 6.407330513000488, "learning_rate": 7.753412365735463e-06, "loss": 2.197022247314453, "memory(GiB)": 77.56, "step": 95740, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.438152 }, { "epoch": 4.1020093397883555, "grad_norm": 5.737429141998291, "learning_rate": 7.749813166921543e-06, "loss": 2.3980236053466797, "memory(GiB)": 77.56, "step": 95745, "token_acc": 0.4709480122324159, "train_speed(iter/s)": 1.43817 }, { "epoch": 4.102223555117604, "grad_norm": 5.706313610076904, "learning_rate": 7.746214733509411e-06, "loss": 2.078744125366211, "memory(GiB)": 77.56, "step": 95750, "token_acc": 0.534965034965035, "train_speed(iter/s)": 1.438184 }, { "epoch": 4.102437770446853, "grad_norm": 5.231309413909912, "learning_rate": 7.742617065564217e-06, "loss": 2.2042591094970705, "memory(GiB)": 77.56, "step": 95755, "token_acc": 0.5351351351351351, "train_speed(iter/s)": 1.438175 }, { "epoch": 4.102651985776102, "grad_norm": 5.8632378578186035, "learning_rate": 7.739020163151173e-06, "loss": 2.340458297729492, "memory(GiB)": 77.56, "step": 95760, "token_acc": 0.5, "train_speed(iter/s)": 1.438177 }, { "epoch": 4.102866201105351, "grad_norm": 4.819559097290039, "learning_rate": 7.73542402633542e-06, "loss": 2.2115365982055666, "memory(GiB)": 77.56, "step": 95765, "token_acc": 0.5285714285714286, "train_speed(iter/s)": 1.4382 }, { "epoch": 4.1030804164346, "grad_norm": 5.6769537925720215, "learning_rate": 7.731828655182105e-06, "loss": 2.208267402648926, "memory(GiB)": 77.56, "step": 95770, "token_acc": 0.5418060200668896, "train_speed(iter/s)": 1.438197 }, { "epoch": 4.103294631763849, "grad_norm": 6.025320529937744, "learning_rate": 7.728234049756372e-06, "loss": 2.002666473388672, "memory(GiB)": 77.56, "step": 95775, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.43821 }, { "epoch": 4.103508847093098, "grad_norm": 6.932184219360352, "learning_rate": 7.724640210123312e-06, "loss": 2.4075464248657226, "memory(GiB)": 77.56, "step": 95780, "token_acc": 0.46417445482866043, "train_speed(iter/s)": 1.43822 }, { "epoch": 4.103723062422347, "grad_norm": 6.757833003997803, "learning_rate": 7.721047136348076e-06, "loss": 2.2383880615234375, "memory(GiB)": 77.56, "step": 95785, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 1.438218 }, { "epoch": 4.103937277751596, "grad_norm": 5.736269950866699, "learning_rate": 7.717454828495724e-06, "loss": 2.28125114440918, "memory(GiB)": 77.56, "step": 95790, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.438219 }, { "epoch": 4.1041514930808445, "grad_norm": 5.992380619049072, "learning_rate": 7.71386328663134e-06, "loss": 2.301872444152832, "memory(GiB)": 77.56, "step": 95795, "token_acc": 0.5198776758409785, "train_speed(iter/s)": 1.438225 }, { "epoch": 4.104365708410094, "grad_norm": 9.938249588012695, "learning_rate": 7.710272510819993e-06, "loss": 1.8846904754638671, "memory(GiB)": 77.56, "step": 95800, "token_acc": 0.5397489539748954, "train_speed(iter/s)": 1.438225 }, { "epoch": 4.104579923739343, "grad_norm": 7.297255992889404, "learning_rate": 7.706682501126722e-06, "loss": 2.024399185180664, "memory(GiB)": 77.56, "step": 95805, "token_acc": 0.5242718446601942, "train_speed(iter/s)": 1.438224 }, { "epoch": 4.104794139068591, "grad_norm": 5.847188472747803, "learning_rate": 7.703093257616579e-06, "loss": 2.338427734375, "memory(GiB)": 77.56, "step": 95810, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438226 }, { "epoch": 4.105008354397841, "grad_norm": 5.563513278961182, "learning_rate": 7.699504780354582e-06, "loss": 2.284145736694336, "memory(GiB)": 77.56, "step": 95815, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.438242 }, { "epoch": 4.10522256972709, "grad_norm": 6.084682464599609, "learning_rate": 7.69591706940574e-06, "loss": 2.309156036376953, "memory(GiB)": 77.56, "step": 95820, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.438257 }, { "epoch": 4.105436785056338, "grad_norm": 6.657647609710693, "learning_rate": 7.69233012483504e-06, "loss": 2.151595115661621, "memory(GiB)": 77.56, "step": 95825, "token_acc": 0.5534351145038168, "train_speed(iter/s)": 1.438268 }, { "epoch": 4.1056510003855875, "grad_norm": 6.458507061004639, "learning_rate": 7.688743946707455e-06, "loss": 2.043415069580078, "memory(GiB)": 77.56, "step": 95830, "token_acc": 0.5304347826086957, "train_speed(iter/s)": 1.438267 }, { "epoch": 4.105865215714837, "grad_norm": 6.901049613952637, "learning_rate": 7.685158535087966e-06, "loss": 2.2042892456054686, "memory(GiB)": 77.56, "step": 95835, "token_acc": 0.5670103092783505, "train_speed(iter/s)": 1.438262 }, { "epoch": 4.106079431044085, "grad_norm": 5.083637714385986, "learning_rate": 7.681573890041539e-06, "loss": 2.165730094909668, "memory(GiB)": 77.56, "step": 95840, "token_acc": 0.5508196721311476, "train_speed(iter/s)": 1.438262 }, { "epoch": 4.106293646373334, "grad_norm": 8.376875877380371, "learning_rate": 7.677990011633096e-06, "loss": 2.294547271728516, "memory(GiB)": 77.56, "step": 95845, "token_acc": 0.483271375464684, "train_speed(iter/s)": 1.438274 }, { "epoch": 4.106507861702584, "grad_norm": 6.624063491821289, "learning_rate": 7.674406899927567e-06, "loss": 2.3447238922119142, "memory(GiB)": 77.56, "step": 95850, "token_acc": 0.5146443514644351, "train_speed(iter/s)": 1.438287 }, { "epoch": 4.106722077031832, "grad_norm": 8.687882423400879, "learning_rate": 7.670824554989858e-06, "loss": 2.3343679428100588, "memory(GiB)": 77.56, "step": 95855, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.438302 }, { "epoch": 4.106936292361081, "grad_norm": 5.211456298828125, "learning_rate": 7.667242976884875e-06, "loss": 2.2146095275878905, "memory(GiB)": 77.56, "step": 95860, "token_acc": 0.5668789808917197, "train_speed(iter/s)": 1.438301 }, { "epoch": 4.1071505076903305, "grad_norm": 6.3015217781066895, "learning_rate": 7.663662165677481e-06, "loss": 2.1318674087524414, "memory(GiB)": 77.56, "step": 95865, "token_acc": 0.5165165165165165, "train_speed(iter/s)": 1.438315 }, { "epoch": 4.107364723019579, "grad_norm": 5.2183027267456055, "learning_rate": 7.660082121432577e-06, "loss": 2.179535675048828, "memory(GiB)": 77.56, "step": 95870, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.438326 }, { "epoch": 4.107578938348828, "grad_norm": 6.747520923614502, "learning_rate": 7.656502844214996e-06, "loss": 2.480525588989258, "memory(GiB)": 77.56, "step": 95875, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.438313 }, { "epoch": 4.107793153678077, "grad_norm": 8.101838111877441, "learning_rate": 7.652924334089594e-06, "loss": 2.334914779663086, "memory(GiB)": 77.56, "step": 95880, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.438301 }, { "epoch": 4.108007369007326, "grad_norm": 6.330286026000977, "learning_rate": 7.649346591121193e-06, "loss": 2.2796926498413086, "memory(GiB)": 77.56, "step": 95885, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.438311 }, { "epoch": 4.108221584336575, "grad_norm": 6.005471706390381, "learning_rate": 7.645769615374592e-06, "loss": 2.2894432067871096, "memory(GiB)": 77.56, "step": 95890, "token_acc": 0.5425867507886435, "train_speed(iter/s)": 1.438319 }, { "epoch": 4.108435799665824, "grad_norm": 7.703975677490234, "learning_rate": 7.642193406914621e-06, "loss": 2.3478599548339845, "memory(GiB)": 77.56, "step": 95895, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.438322 }, { "epoch": 4.108650014995073, "grad_norm": 6.016992092132568, "learning_rate": 7.638617965806038e-06, "loss": 2.451349639892578, "memory(GiB)": 77.56, "step": 95900, "token_acc": 0.5111940298507462, "train_speed(iter/s)": 1.438318 }, { "epoch": 4.108864230324322, "grad_norm": 6.158310413360596, "learning_rate": 7.635043292113637e-06, "loss": 2.1001445770263674, "memory(GiB)": 77.56, "step": 95905, "token_acc": 0.5361216730038023, "train_speed(iter/s)": 1.438304 }, { "epoch": 4.109078445653571, "grad_norm": 6.671321392059326, "learning_rate": 7.631469385902173e-06, "loss": 2.239193344116211, "memory(GiB)": 77.56, "step": 95910, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 1.438311 }, { "epoch": 4.1092926609828195, "grad_norm": 5.9768829345703125, "learning_rate": 7.627896247236382e-06, "loss": 2.190685844421387, "memory(GiB)": 77.56, "step": 95915, "token_acc": 0.5109034267912772, "train_speed(iter/s)": 1.438305 }, { "epoch": 4.109506876312069, "grad_norm": 7.609327793121338, "learning_rate": 7.624323876180989e-06, "loss": 2.5587928771972654, "memory(GiB)": 77.56, "step": 95920, "token_acc": 0.43490304709141275, "train_speed(iter/s)": 1.438326 }, { "epoch": 4.109721091641318, "grad_norm": 5.801846027374268, "learning_rate": 7.6207522728007354e-06, "loss": 2.462677764892578, "memory(GiB)": 77.56, "step": 95925, "token_acc": 0.4843205574912892, "train_speed(iter/s)": 1.438323 }, { "epoch": 4.109935306970566, "grad_norm": 6.480906963348389, "learning_rate": 7.6171814371603035e-06, "loss": 2.3157093048095705, "memory(GiB)": 77.56, "step": 95930, "token_acc": 0.5304659498207885, "train_speed(iter/s)": 1.438322 }, { "epoch": 4.110149522299816, "grad_norm": 5.38148307800293, "learning_rate": 7.613611369324392e-06, "loss": 2.2307273864746096, "memory(GiB)": 77.56, "step": 95935, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.438307 }, { "epoch": 4.110363737629065, "grad_norm": 6.86534309387207, "learning_rate": 7.61004206935767e-06, "loss": 2.126034736633301, "memory(GiB)": 77.56, "step": 95940, "token_acc": 0.5255972696245734, "train_speed(iter/s)": 1.438322 }, { "epoch": 4.110577952958313, "grad_norm": 7.398293495178223, "learning_rate": 7.606473537324804e-06, "loss": 2.361636734008789, "memory(GiB)": 77.56, "step": 95945, "token_acc": 0.5202702702702703, "train_speed(iter/s)": 1.438334 }, { "epoch": 4.1107921682875626, "grad_norm": 5.546005725860596, "learning_rate": 7.602905773290425e-06, "loss": 2.291697120666504, "memory(GiB)": 77.56, "step": 95950, "token_acc": 0.5035211267605634, "train_speed(iter/s)": 1.438326 }, { "epoch": 4.111006383616812, "grad_norm": 9.449408531188965, "learning_rate": 7.599338777319192e-06, "loss": 2.116642951965332, "memory(GiB)": 77.56, "step": 95955, "token_acc": 0.5528455284552846, "train_speed(iter/s)": 1.43834 }, { "epoch": 4.11122059894606, "grad_norm": 7.142069339752197, "learning_rate": 7.59577254947571e-06, "loss": 2.2153013229370115, "memory(GiB)": 77.56, "step": 95960, "token_acc": 0.5390625, "train_speed(iter/s)": 1.43835 }, { "epoch": 4.111434814275309, "grad_norm": 7.215594291687012, "learning_rate": 7.592207089824588e-06, "loss": 1.9601696014404297, "memory(GiB)": 77.56, "step": 95965, "token_acc": 0.564, "train_speed(iter/s)": 1.438337 }, { "epoch": 4.111649029604559, "grad_norm": 6.804347515106201, "learning_rate": 7.5886423984304125e-06, "loss": 2.282740592956543, "memory(GiB)": 77.56, "step": 95970, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 1.43834 }, { "epoch": 4.111863244933807, "grad_norm": 5.42451286315918, "learning_rate": 7.58507847535776e-06, "loss": 2.1462989807128907, "memory(GiB)": 77.56, "step": 95975, "token_acc": 0.5443037974683544, "train_speed(iter/s)": 1.438356 }, { "epoch": 4.112077460263056, "grad_norm": 4.953402042388916, "learning_rate": 7.5815153206711955e-06, "loss": 1.9254533767700195, "memory(GiB)": 77.56, "step": 95980, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.438358 }, { "epoch": 4.112291675592306, "grad_norm": 7.234443187713623, "learning_rate": 7.577952934435284e-06, "loss": 2.3278614044189454, "memory(GiB)": 77.56, "step": 95985, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.438373 }, { "epoch": 4.112505890921554, "grad_norm": 6.257533073425293, "learning_rate": 7.574391316714552e-06, "loss": 2.1003753662109377, "memory(GiB)": 77.56, "step": 95990, "token_acc": 0.5436507936507936, "train_speed(iter/s)": 1.438397 }, { "epoch": 4.112720106250803, "grad_norm": 7.7184953689575195, "learning_rate": 7.57083046757352e-06, "loss": 2.2016576766967773, "memory(GiB)": 77.56, "step": 95995, "token_acc": 0.5159010600706714, "train_speed(iter/s)": 1.438376 }, { "epoch": 4.1129343215800525, "grad_norm": 7.000500202178955, "learning_rate": 7.567270387076692e-06, "loss": 2.1520095825195313, "memory(GiB)": 77.56, "step": 96000, "token_acc": 0.5143884892086331, "train_speed(iter/s)": 1.438385 }, { "epoch": 4.1129343215800525, "eval_loss": 2.0750722885131836, "eval_runtime": 14.6451, "eval_samples_per_second": 6.828, "eval_steps_per_second": 6.828, "eval_token_acc": 0.47690217391304346, "step": 96000 }, { "epoch": 4.113148536909301, "grad_norm": 5.765586853027344, "learning_rate": 7.563711075288571e-06, "loss": 2.3592966079711912, "memory(GiB)": 77.56, "step": 96005, "token_acc": 0.4831358249772106, "train_speed(iter/s)": 1.438051 }, { "epoch": 4.11336275223855, "grad_norm": 8.4855375289917, "learning_rate": 7.560152532273618e-06, "loss": 2.289281463623047, "memory(GiB)": 77.56, "step": 96010, "token_acc": 0.5078125, "train_speed(iter/s)": 1.438074 }, { "epoch": 4.113576967567799, "grad_norm": 12.849916458129883, "learning_rate": 7.556594758096325e-06, "loss": 2.4361392974853517, "memory(GiB)": 77.56, "step": 96015, "token_acc": 0.476056338028169, "train_speed(iter/s)": 1.438061 }, { "epoch": 4.113791182897048, "grad_norm": 6.677891254425049, "learning_rate": 7.553037752821135e-06, "loss": 2.2970102310180662, "memory(GiB)": 77.56, "step": 96020, "token_acc": 0.5126353790613718, "train_speed(iter/s)": 1.438069 }, { "epoch": 4.114005398226297, "grad_norm": 7.1309990882873535, "learning_rate": 7.549481516512485e-06, "loss": 2.2121131896972654, "memory(GiB)": 77.56, "step": 96025, "token_acc": 0.5389830508474577, "train_speed(iter/s)": 1.438089 }, { "epoch": 4.114219613555546, "grad_norm": 4.925137042999268, "learning_rate": 7.5459260492347975e-06, "loss": 2.6382211685180663, "memory(GiB)": 77.56, "step": 96030, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 1.438098 }, { "epoch": 4.114433828884795, "grad_norm": 5.852963447570801, "learning_rate": 7.5423713510524745e-06, "loss": 2.304315185546875, "memory(GiB)": 77.56, "step": 96035, "token_acc": 0.5285714285714286, "train_speed(iter/s)": 1.438099 }, { "epoch": 4.114648044214044, "grad_norm": 7.86866569519043, "learning_rate": 7.538817422029931e-06, "loss": 2.355795478820801, "memory(GiB)": 77.56, "step": 96040, "token_acc": 0.45, "train_speed(iter/s)": 1.438094 }, { "epoch": 4.114862259543293, "grad_norm": 6.939375400543213, "learning_rate": 7.535264262231545e-06, "loss": 1.9418508529663085, "memory(GiB)": 77.56, "step": 96045, "token_acc": 0.5752508361204013, "train_speed(iter/s)": 1.438087 }, { "epoch": 4.1150764748725415, "grad_norm": 5.743912696838379, "learning_rate": 7.531711871721669e-06, "loss": 2.222469711303711, "memory(GiB)": 77.56, "step": 96050, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.438093 }, { "epoch": 4.115290690201791, "grad_norm": 5.764291763305664, "learning_rate": 7.528160250564681e-06, "loss": 2.367168998718262, "memory(GiB)": 77.56, "step": 96055, "token_acc": 0.47003154574132494, "train_speed(iter/s)": 1.438089 }, { "epoch": 4.11550490553104, "grad_norm": 8.050865173339844, "learning_rate": 7.524609398824916e-06, "loss": 2.517007827758789, "memory(GiB)": 77.56, "step": 96060, "token_acc": 0.4984423676012461, "train_speed(iter/s)": 1.438095 }, { "epoch": 4.115719120860288, "grad_norm": 6.087296485900879, "learning_rate": 7.521059316566681e-06, "loss": 1.9852994918823241, "memory(GiB)": 77.56, "step": 96065, "token_acc": 0.5376344086021505, "train_speed(iter/s)": 1.438069 }, { "epoch": 4.115933336189538, "grad_norm": 7.634925842285156, "learning_rate": 7.517510003854322e-06, "loss": 2.205913543701172, "memory(GiB)": 77.56, "step": 96070, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.438092 }, { "epoch": 4.116147551518787, "grad_norm": 6.784355163574219, "learning_rate": 7.513961460752117e-06, "loss": 2.393558883666992, "memory(GiB)": 77.56, "step": 96075, "token_acc": 0.4984423676012461, "train_speed(iter/s)": 1.438111 }, { "epoch": 4.116361766848035, "grad_norm": 6.767854690551758, "learning_rate": 7.510413687324358e-06, "loss": 2.4951629638671875, "memory(GiB)": 77.56, "step": 96080, "token_acc": 0.4797297297297297, "train_speed(iter/s)": 1.438096 }, { "epoch": 4.1165759821772845, "grad_norm": 6.6120758056640625, "learning_rate": 7.506866683635311e-06, "loss": 2.4073081970214845, "memory(GiB)": 77.56, "step": 96085, "token_acc": 0.4900662251655629, "train_speed(iter/s)": 1.438106 }, { "epoch": 4.116790197506534, "grad_norm": 5.49633264541626, "learning_rate": 7.503320449749235e-06, "loss": 2.291116142272949, "memory(GiB)": 77.56, "step": 96090, "token_acc": 0.4963768115942029, "train_speed(iter/s)": 1.43811 }, { "epoch": 4.117004412835782, "grad_norm": 7.807966232299805, "learning_rate": 7.499774985730363e-06, "loss": 2.0695318222045898, "memory(GiB)": 77.56, "step": 96095, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.438105 }, { "epoch": 4.117218628165031, "grad_norm": 7.62118673324585, "learning_rate": 7.496230291642947e-06, "loss": 2.378623199462891, "memory(GiB)": 77.56, "step": 96100, "token_acc": 0.50390625, "train_speed(iter/s)": 1.438087 }, { "epoch": 4.117432843494281, "grad_norm": 7.91900634765625, "learning_rate": 7.492686367551194e-06, "loss": 2.160654067993164, "memory(GiB)": 77.56, "step": 96105, "token_acc": 0.5568181818181818, "train_speed(iter/s)": 1.438076 }, { "epoch": 4.117647058823529, "grad_norm": 6.8633503913879395, "learning_rate": 7.489143213519301e-06, "loss": 2.240801239013672, "memory(GiB)": 77.56, "step": 96110, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.43808 }, { "epoch": 4.117861274152778, "grad_norm": 5.698827266693115, "learning_rate": 7.485600829611456e-06, "loss": 2.2253496170043947, "memory(GiB)": 77.56, "step": 96115, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438083 }, { "epoch": 4.1180754894820275, "grad_norm": 5.872425079345703, "learning_rate": 7.482059215891823e-06, "loss": 2.0663137435913086, "memory(GiB)": 77.56, "step": 96120, "token_acc": 0.5353982300884956, "train_speed(iter/s)": 1.438076 }, { "epoch": 4.118289704811276, "grad_norm": 6.3538055419921875, "learning_rate": 7.478518372424576e-06, "loss": 2.2023794174194338, "memory(GiB)": 77.56, "step": 96125, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 1.438077 }, { "epoch": 4.118503920140525, "grad_norm": 6.036527633666992, "learning_rate": 7.474978299273861e-06, "loss": 2.133736991882324, "memory(GiB)": 77.56, "step": 96130, "token_acc": 0.5282392026578073, "train_speed(iter/s)": 1.438077 }, { "epoch": 4.118718135469774, "grad_norm": 5.533919334411621, "learning_rate": 7.47143899650381e-06, "loss": 2.243769073486328, "memory(GiB)": 77.56, "step": 96135, "token_acc": 0.5213414634146342, "train_speed(iter/s)": 1.438069 }, { "epoch": 4.118932350799023, "grad_norm": 8.145271301269531, "learning_rate": 7.467900464178534e-06, "loss": 2.3492404937744142, "memory(GiB)": 77.56, "step": 96140, "token_acc": 0.4934640522875817, "train_speed(iter/s)": 1.438069 }, { "epoch": 4.119146566128272, "grad_norm": 5.510066032409668, "learning_rate": 7.4643627023621435e-06, "loss": 1.8543930053710938, "memory(GiB)": 77.56, "step": 96145, "token_acc": 0.5438066465256798, "train_speed(iter/s)": 1.438053 }, { "epoch": 4.119360781457521, "grad_norm": 8.078634262084961, "learning_rate": 7.460825711118724e-06, "loss": 2.5635005950927736, "memory(GiB)": 77.56, "step": 96150, "token_acc": 0.49477351916376305, "train_speed(iter/s)": 1.438054 }, { "epoch": 4.11957499678677, "grad_norm": 6.9337239265441895, "learning_rate": 7.457289490512337e-06, "loss": 2.4758447647094726, "memory(GiB)": 77.56, "step": 96155, "token_acc": 0.46105919003115264, "train_speed(iter/s)": 1.438059 }, { "epoch": 4.119789212116019, "grad_norm": 6.736894607543945, "learning_rate": 7.453754040607075e-06, "loss": 2.6031646728515625, "memory(GiB)": 77.56, "step": 96160, "token_acc": 0.48134328358208955, "train_speed(iter/s)": 1.438057 }, { "epoch": 4.120003427445268, "grad_norm": 6.146721363067627, "learning_rate": 7.450219361466965e-06, "loss": 2.0872114181518553, "memory(GiB)": 77.56, "step": 96165, "token_acc": 0.5278810408921933, "train_speed(iter/s)": 1.43806 }, { "epoch": 4.1202176427745165, "grad_norm": 6.551029682159424, "learning_rate": 7.44668545315605e-06, "loss": 2.1993005752563475, "memory(GiB)": 77.56, "step": 96170, "token_acc": 0.5533980582524272, "train_speed(iter/s)": 1.438061 }, { "epoch": 4.120431858103766, "grad_norm": 6.693967819213867, "learning_rate": 7.443152315738344e-06, "loss": 2.2617431640625, "memory(GiB)": 77.56, "step": 96175, "token_acc": 0.48760330578512395, "train_speed(iter/s)": 1.438059 }, { "epoch": 4.120646073433015, "grad_norm": 5.57351016998291, "learning_rate": 7.439619949277848e-06, "loss": 2.294831466674805, "memory(GiB)": 77.56, "step": 96180, "token_acc": 0.5, "train_speed(iter/s)": 1.43807 }, { "epoch": 4.120860288762263, "grad_norm": 11.081794738769531, "learning_rate": 7.436088353838566e-06, "loss": 2.3934621810913086, "memory(GiB)": 77.56, "step": 96185, "token_acc": 0.509090909090909, "train_speed(iter/s)": 1.438064 }, { "epoch": 4.121074504091513, "grad_norm": 7.196897983551025, "learning_rate": 7.432557529484479e-06, "loss": 2.1505853652954103, "memory(GiB)": 77.56, "step": 96190, "token_acc": 0.5197568389057751, "train_speed(iter/s)": 1.438078 }, { "epoch": 4.121288719420762, "grad_norm": 9.193681716918945, "learning_rate": 7.4290274762795265e-06, "loss": 2.4362371444702147, "memory(GiB)": 77.56, "step": 96195, "token_acc": 0.4765625, "train_speed(iter/s)": 1.43808 }, { "epoch": 4.12150293475001, "grad_norm": 4.986240863800049, "learning_rate": 7.425498194287689e-06, "loss": 2.081932258605957, "memory(GiB)": 77.56, "step": 96200, "token_acc": 0.5168195718654435, "train_speed(iter/s)": 1.438096 }, { "epoch": 4.1217171500792595, "grad_norm": 5.693068504333496, "learning_rate": 7.421969683572894e-06, "loss": 2.636934280395508, "memory(GiB)": 77.56, "step": 96205, "token_acc": 0.47079037800687284, "train_speed(iter/s)": 1.438102 }, { "epoch": 4.121931365408509, "grad_norm": 5.989151954650879, "learning_rate": 7.418441944199045e-06, "loss": 2.4715244293212892, "memory(GiB)": 77.56, "step": 96210, "token_acc": 0.49591280653950953, "train_speed(iter/s)": 1.438105 }, { "epoch": 4.122145580737757, "grad_norm": 5.698012828826904, "learning_rate": 7.414914976230075e-06, "loss": 2.150025177001953, "memory(GiB)": 77.56, "step": 96215, "token_acc": 0.5316455696202531, "train_speed(iter/s)": 1.438116 }, { "epoch": 4.122359796067006, "grad_norm": 6.786555290222168, "learning_rate": 7.411388779729872e-06, "loss": 2.3515079498291014, "memory(GiB)": 77.56, "step": 96220, "token_acc": 0.5035460992907801, "train_speed(iter/s)": 1.438104 }, { "epoch": 4.122574011396256, "grad_norm": 5.212714672088623, "learning_rate": 7.4078633547623065e-06, "loss": 2.3681928634643556, "memory(GiB)": 77.56, "step": 96225, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.438099 }, { "epoch": 4.122788226725504, "grad_norm": 6.217240810394287, "learning_rate": 7.404338701391256e-06, "loss": 2.2296422958374023, "memory(GiB)": 77.56, "step": 96230, "token_acc": 0.5311355311355311, "train_speed(iter/s)": 1.438124 }, { "epoch": 4.123002442054753, "grad_norm": 6.141689777374268, "learning_rate": 7.400814819680568e-06, "loss": 2.519417572021484, "memory(GiB)": 77.56, "step": 96235, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.438136 }, { "epoch": 4.123216657384003, "grad_norm": 6.020229339599609, "learning_rate": 7.397291709694065e-06, "loss": 2.364508628845215, "memory(GiB)": 77.56, "step": 96240, "token_acc": 0.4914772727272727, "train_speed(iter/s)": 1.438126 }, { "epoch": 4.123430872713251, "grad_norm": 5.66908597946167, "learning_rate": 7.393769371495602e-06, "loss": 2.565126419067383, "memory(GiB)": 77.56, "step": 96245, "token_acc": 0.47107438016528924, "train_speed(iter/s)": 1.438136 }, { "epoch": 4.1236450880425, "grad_norm": 5.747981548309326, "learning_rate": 7.390247805148976e-06, "loss": 1.9654863357543946, "memory(GiB)": 77.56, "step": 96250, "token_acc": 0.5907172995780591, "train_speed(iter/s)": 1.438133 }, { "epoch": 4.123859303371749, "grad_norm": 5.34492301940918, "learning_rate": 7.386727010717981e-06, "loss": 2.502433013916016, "memory(GiB)": 77.56, "step": 96255, "token_acc": 0.5070921985815603, "train_speed(iter/s)": 1.43814 }, { "epoch": 4.124073518700998, "grad_norm": 5.8116607666015625, "learning_rate": 7.3832069882664e-06, "loss": 2.4343244552612306, "memory(GiB)": 77.56, "step": 96260, "token_acc": 0.49206349206349204, "train_speed(iter/s)": 1.43814 }, { "epoch": 4.124287734030247, "grad_norm": 7.909563064575195, "learning_rate": 7.379687737857987e-06, "loss": 2.525436210632324, "memory(GiB)": 77.56, "step": 96265, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 1.438134 }, { "epoch": 4.124501949359496, "grad_norm": 6.475271701812744, "learning_rate": 7.37616925955652e-06, "loss": 2.166606903076172, "memory(GiB)": 77.56, "step": 96270, "token_acc": 0.5636363636363636, "train_speed(iter/s)": 1.438124 }, { "epoch": 4.124716164688745, "grad_norm": 4.453949928283691, "learning_rate": 7.372651553425735e-06, "loss": 2.191685676574707, "memory(GiB)": 77.56, "step": 96275, "token_acc": 0.5324675324675324, "train_speed(iter/s)": 1.438125 }, { "epoch": 4.124930380017994, "grad_norm": 6.0995306968688965, "learning_rate": 7.369134619529361e-06, "loss": 2.020674133300781, "memory(GiB)": 77.56, "step": 96280, "token_acc": 0.5579710144927537, "train_speed(iter/s)": 1.438135 }, { "epoch": 4.125144595347243, "grad_norm": 9.233741760253906, "learning_rate": 7.365618457931101e-06, "loss": 2.518416976928711, "memory(GiB)": 77.56, "step": 96285, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 1.438151 }, { "epoch": 4.125358810676492, "grad_norm": 11.09237003326416, "learning_rate": 7.362103068694653e-06, "loss": 2.557550048828125, "memory(GiB)": 77.56, "step": 96290, "token_acc": 0.5231788079470199, "train_speed(iter/s)": 1.438159 }, { "epoch": 4.125573026005741, "grad_norm": 5.068997383117676, "learning_rate": 7.358588451883714e-06, "loss": 2.4432735443115234, "memory(GiB)": 77.56, "step": 96295, "token_acc": 0.4811320754716981, "train_speed(iter/s)": 1.438152 }, { "epoch": 4.12578724133499, "grad_norm": 6.230804920196533, "learning_rate": 7.355074607561929e-06, "loss": 2.265390396118164, "memory(GiB)": 77.56, "step": 96300, "token_acc": 0.5248447204968945, "train_speed(iter/s)": 1.438161 }, { "epoch": 4.126001456664239, "grad_norm": 11.891359329223633, "learning_rate": 7.351561535792984e-06, "loss": 2.3951742172241213, "memory(GiB)": 77.56, "step": 96305, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 1.438163 }, { "epoch": 4.126215671993488, "grad_norm": 6.688968658447266, "learning_rate": 7.348049236640509e-06, "loss": 2.3005237579345703, "memory(GiB)": 77.56, "step": 96310, "token_acc": 0.4589041095890411, "train_speed(iter/s)": 1.438184 }, { "epoch": 4.126429887322737, "grad_norm": 7.9630279541015625, "learning_rate": 7.344537710168136e-06, "loss": 2.3335906982421877, "memory(GiB)": 77.56, "step": 96315, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.438193 }, { "epoch": 4.126644102651985, "grad_norm": 6.251548767089844, "learning_rate": 7.3410269564394725e-06, "loss": 2.216363525390625, "memory(GiB)": 77.56, "step": 96320, "token_acc": 0.538135593220339, "train_speed(iter/s)": 1.438198 }, { "epoch": 4.126858317981235, "grad_norm": 6.065381050109863, "learning_rate": 7.337516975518116e-06, "loss": 2.044463348388672, "memory(GiB)": 77.56, "step": 96325, "token_acc": 0.5126353790613718, "train_speed(iter/s)": 1.438206 }, { "epoch": 4.127072533310484, "grad_norm": 6.984500408172607, "learning_rate": 7.334007767467666e-06, "loss": 2.2201511383056642, "memory(GiB)": 77.56, "step": 96330, "token_acc": 0.49085365853658536, "train_speed(iter/s)": 1.438199 }, { "epoch": 4.127286748639733, "grad_norm": 7.817566871643066, "learning_rate": 7.330499332351692e-06, "loss": 2.153267669677734, "memory(GiB)": 77.56, "step": 96335, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.438213 }, { "epoch": 4.1275009639689815, "grad_norm": 7.226218223571777, "learning_rate": 7.326991670233751e-06, "loss": 2.1477020263671873, "memory(GiB)": 77.56, "step": 96340, "token_acc": 0.5301587301587302, "train_speed(iter/s)": 1.438211 }, { "epoch": 4.127715179298231, "grad_norm": 7.403777599334717, "learning_rate": 7.3234847811773755e-06, "loss": 2.1869073867797852, "memory(GiB)": 77.56, "step": 96345, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.438212 }, { "epoch": 4.127929394627479, "grad_norm": 8.69003963470459, "learning_rate": 7.319978665246113e-06, "loss": 2.5224687576293947, "memory(GiB)": 77.56, "step": 96350, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.438213 }, { "epoch": 4.128143609956728, "grad_norm": 5.238253116607666, "learning_rate": 7.31647332250347e-06, "loss": 2.2845026016235352, "memory(GiB)": 77.56, "step": 96355, "token_acc": 0.5364963503649635, "train_speed(iter/s)": 1.438218 }, { "epoch": 4.128357825285978, "grad_norm": 5.36712121963501, "learning_rate": 7.312968753012961e-06, "loss": 2.2127079010009765, "memory(GiB)": 77.56, "step": 96360, "token_acc": 0.5109034267912772, "train_speed(iter/s)": 1.43822 }, { "epoch": 4.128572040615227, "grad_norm": 5.671230792999268, "learning_rate": 7.309464956838063e-06, "loss": 2.3613243103027344, "memory(GiB)": 77.56, "step": 96365, "token_acc": 0.5079872204472844, "train_speed(iter/s)": 1.438233 }, { "epoch": 4.128786255944475, "grad_norm": 8.403008460998535, "learning_rate": 7.30596193404226e-06, "loss": 2.4431825637817384, "memory(GiB)": 77.56, "step": 96370, "token_acc": 0.5, "train_speed(iter/s)": 1.438244 }, { "epoch": 4.1290004712737245, "grad_norm": 8.125631332397461, "learning_rate": 7.302459684689006e-06, "loss": 1.9763547897338867, "memory(GiB)": 77.56, "step": 96375, "token_acc": 0.5163636363636364, "train_speed(iter/s)": 1.43825 }, { "epoch": 4.129214686602973, "grad_norm": 8.112006187438965, "learning_rate": 7.298958208841744e-06, "loss": 2.1689355850219725, "memory(GiB)": 77.56, "step": 96380, "token_acc": 0.5598290598290598, "train_speed(iter/s)": 1.438259 }, { "epoch": 4.129428901932222, "grad_norm": 5.576782703399658, "learning_rate": 7.295457506563902e-06, "loss": 2.2749116897583006, "memory(GiB)": 77.56, "step": 96385, "token_acc": 0.5059288537549407, "train_speed(iter/s)": 1.438269 }, { "epoch": 4.129643117261471, "grad_norm": 7.517724514007568, "learning_rate": 7.291957577918923e-06, "loss": 2.2987342834472657, "memory(GiB)": 77.56, "step": 96390, "token_acc": 0.5234899328859061, "train_speed(iter/s)": 1.438272 }, { "epoch": 4.129857332590721, "grad_norm": 6.646448612213135, "learning_rate": 7.288458422970191e-06, "loss": 2.3035144805908203, "memory(GiB)": 77.56, "step": 96395, "token_acc": 0.5317725752508361, "train_speed(iter/s)": 1.438281 }, { "epoch": 4.130071547919969, "grad_norm": 5.043950080871582, "learning_rate": 7.284960041781102e-06, "loss": 2.1844324111938476, "memory(GiB)": 77.56, "step": 96400, "token_acc": 0.5355805243445693, "train_speed(iter/s)": 1.438276 }, { "epoch": 4.130285763249218, "grad_norm": 6.114025592803955, "learning_rate": 7.281462434415032e-06, "loss": 2.2071548461914063, "memory(GiB)": 77.56, "step": 96405, "token_acc": 0.5421686746987951, "train_speed(iter/s)": 1.438286 }, { "epoch": 4.130499978578467, "grad_norm": 5.625747203826904, "learning_rate": 7.277965600935333e-06, "loss": 2.4538200378417967, "memory(GiB)": 77.56, "step": 96410, "token_acc": 0.4783950617283951, "train_speed(iter/s)": 1.438285 }, { "epoch": 4.130714193907716, "grad_norm": 7.017910480499268, "learning_rate": 7.274469541405376e-06, "loss": 2.3580970764160156, "memory(GiB)": 77.56, "step": 96415, "token_acc": 0.5401929260450161, "train_speed(iter/s)": 1.438276 }, { "epoch": 4.130928409236965, "grad_norm": 5.083176612854004, "learning_rate": 7.270974255888469e-06, "loss": 2.1233327865600584, "memory(GiB)": 77.56, "step": 96420, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.438286 }, { "epoch": 4.131142624566214, "grad_norm": 7.452576637268066, "learning_rate": 7.267479744447958e-06, "loss": 2.0950721740722655, "memory(GiB)": 77.56, "step": 96425, "token_acc": 0.5398773006134969, "train_speed(iter/s)": 1.438276 }, { "epoch": 4.131356839895463, "grad_norm": 6.489446640014648, "learning_rate": 7.263986007147139e-06, "loss": 2.294108200073242, "memory(GiB)": 77.56, "step": 96430, "token_acc": 0.5363984674329502, "train_speed(iter/s)": 1.438289 }, { "epoch": 4.131571055224712, "grad_norm": 5.427678108215332, "learning_rate": 7.260493044049299e-06, "loss": 2.200992774963379, "memory(GiB)": 77.56, "step": 96435, "token_acc": 0.5412844036697247, "train_speed(iter/s)": 1.438287 }, { "epoch": 4.13178527055396, "grad_norm": 6.177824974060059, "learning_rate": 7.257000855217727e-06, "loss": 2.490162467956543, "memory(GiB)": 77.56, "step": 96440, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.438294 }, { "epoch": 4.13199948588321, "grad_norm": 9.818201065063477, "learning_rate": 7.253509440715667e-06, "loss": 2.3024213790893553, "memory(GiB)": 77.56, "step": 96445, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 1.438285 }, { "epoch": 4.132213701212459, "grad_norm": 6.740192413330078, "learning_rate": 7.25001880060639e-06, "loss": 2.364698028564453, "memory(GiB)": 77.56, "step": 96450, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 1.438252 }, { "epoch": 4.132427916541708, "grad_norm": 6.426738739013672, "learning_rate": 7.246528934953128e-06, "loss": 2.471163749694824, "memory(GiB)": 77.56, "step": 96455, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.438273 }, { "epoch": 4.1326421318709565, "grad_norm": 6.708327293395996, "learning_rate": 7.243039843819105e-06, "loss": 2.3847736358642577, "memory(GiB)": 77.56, "step": 96460, "token_acc": 0.503968253968254, "train_speed(iter/s)": 1.438269 }, { "epoch": 4.132856347200206, "grad_norm": 7.480318546295166, "learning_rate": 7.2395515272675186e-06, "loss": 2.181375503540039, "memory(GiB)": 77.56, "step": 96465, "token_acc": 0.547244094488189, "train_speed(iter/s)": 1.438272 }, { "epoch": 4.133070562529454, "grad_norm": 6.168023586273193, "learning_rate": 7.236063985361563e-06, "loss": 2.1938461303710937, "memory(GiB)": 77.56, "step": 96470, "token_acc": 0.5112540192926045, "train_speed(iter/s)": 1.438283 }, { "epoch": 4.133284777858703, "grad_norm": 4.6516313552856445, "learning_rate": 7.232577218164427e-06, "loss": 2.408511734008789, "memory(GiB)": 77.56, "step": 96475, "token_acc": 0.5015974440894568, "train_speed(iter/s)": 1.438278 }, { "epoch": 4.133498993187953, "grad_norm": 5.809952259063721, "learning_rate": 7.229091225739282e-06, "loss": 2.2135032653808593, "memory(GiB)": 77.56, "step": 96480, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.438293 }, { "epoch": 4.133713208517202, "grad_norm": 6.514778137207031, "learning_rate": 7.225606008149266e-06, "loss": 2.145332908630371, "memory(GiB)": 77.56, "step": 96485, "token_acc": 0.528169014084507, "train_speed(iter/s)": 1.438294 }, { "epoch": 4.13392742384645, "grad_norm": 5.6674065589904785, "learning_rate": 7.222121565457529e-06, "loss": 2.0490707397460937, "memory(GiB)": 77.56, "step": 96490, "token_acc": 0.5305343511450382, "train_speed(iter/s)": 1.438278 }, { "epoch": 4.1341416391756995, "grad_norm": 4.809019565582275, "learning_rate": 7.2186378977271705e-06, "loss": 2.199611854553223, "memory(GiB)": 77.56, "step": 96495, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.438264 }, { "epoch": 4.134355854504949, "grad_norm": 5.550069808959961, "learning_rate": 7.215155005021323e-06, "loss": 2.290171432495117, "memory(GiB)": 77.56, "step": 96500, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 1.438257 }, { "epoch": 4.134355854504949, "eval_loss": 2.1640238761901855, "eval_runtime": 14.0832, "eval_samples_per_second": 7.101, "eval_steps_per_second": 7.101, "eval_token_acc": 0.4731182795698925, "step": 96500 }, { "epoch": 4.134570069834197, "grad_norm": 6.027403354644775, "learning_rate": 7.211672887403087e-06, "loss": 2.3687294006347654, "memory(GiB)": 77.56, "step": 96505, "token_acc": 0.4844167408726625, "train_speed(iter/s)": 1.437937 }, { "epoch": 4.134784285163446, "grad_norm": 5.739585876464844, "learning_rate": 7.208191544935538e-06, "loss": 2.2002351760864256, "memory(GiB)": 77.56, "step": 96510, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.437952 }, { "epoch": 4.134998500492696, "grad_norm": 4.584177494049072, "learning_rate": 7.204710977681739e-06, "loss": 2.3370132446289062, "memory(GiB)": 77.56, "step": 96515, "token_acc": 0.546583850931677, "train_speed(iter/s)": 1.437925 }, { "epoch": 4.135212715821944, "grad_norm": 6.653926372528076, "learning_rate": 7.201231185704749e-06, "loss": 2.158795928955078, "memory(GiB)": 77.56, "step": 96520, "token_acc": 0.5636363636363636, "train_speed(iter/s)": 1.437918 }, { "epoch": 4.135426931151193, "grad_norm": 8.364823341369629, "learning_rate": 7.197752169067601e-06, "loss": 2.457591438293457, "memory(GiB)": 77.56, "step": 96525, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.437933 }, { "epoch": 4.135641146480443, "grad_norm": 5.409210681915283, "learning_rate": 7.194273927833312e-06, "loss": 2.1754566192626954, "memory(GiB)": 77.56, "step": 96530, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.437929 }, { "epoch": 4.135855361809691, "grad_norm": 5.88220739364624, "learning_rate": 7.190796462064919e-06, "loss": 2.0327999114990236, "memory(GiB)": 77.56, "step": 96535, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 1.437908 }, { "epoch": 4.13606957713894, "grad_norm": 7.667297840118408, "learning_rate": 7.187319771825407e-06, "loss": 2.2790462493896486, "memory(GiB)": 77.56, "step": 96540, "token_acc": 0.5115384615384615, "train_speed(iter/s)": 1.437877 }, { "epoch": 4.1362837924681894, "grad_norm": 6.999119758605957, "learning_rate": 7.183843857177757e-06, "loss": 2.698093605041504, "memory(GiB)": 77.56, "step": 96545, "token_acc": 0.48424068767908307, "train_speed(iter/s)": 1.437885 }, { "epoch": 4.136498007797438, "grad_norm": 5.225920677185059, "learning_rate": 7.180368718184943e-06, "loss": 2.40865478515625, "memory(GiB)": 77.56, "step": 96550, "token_acc": 0.5017921146953405, "train_speed(iter/s)": 1.437899 }, { "epoch": 4.136712223126687, "grad_norm": 5.234836578369141, "learning_rate": 7.176894354909908e-06, "loss": 2.2354469299316406, "memory(GiB)": 77.56, "step": 96555, "token_acc": 0.5206611570247934, "train_speed(iter/s)": 1.437909 }, { "epoch": 4.136926438455936, "grad_norm": 7.849437236785889, "learning_rate": 7.173420767415611e-06, "loss": 2.4241825103759767, "memory(GiB)": 77.56, "step": 96560, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.437909 }, { "epoch": 4.137140653785185, "grad_norm": 6.073554039001465, "learning_rate": 7.169947955764961e-06, "loss": 2.149165153503418, "memory(GiB)": 77.56, "step": 96565, "token_acc": 0.5327102803738317, "train_speed(iter/s)": 1.4379 }, { "epoch": 4.137354869114434, "grad_norm": 5.625076770782471, "learning_rate": 7.166475920020888e-06, "loss": 2.331562805175781, "memory(GiB)": 77.56, "step": 96570, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 1.437911 }, { "epoch": 4.137569084443683, "grad_norm": 8.233735084533691, "learning_rate": 7.163004660246292e-06, "loss": 2.16846923828125, "memory(GiB)": 77.56, "step": 96575, "token_acc": 0.4883720930232558, "train_speed(iter/s)": 1.437924 }, { "epoch": 4.137783299772932, "grad_norm": 5.690011501312256, "learning_rate": 7.159534176504046e-06, "loss": 2.2551654815673827, "memory(GiB)": 77.56, "step": 96580, "token_acc": 0.547945205479452, "train_speed(iter/s)": 1.437924 }, { "epoch": 4.137997515102181, "grad_norm": 6.119157314300537, "learning_rate": 7.1560644688570324e-06, "loss": 2.2950843811035155, "memory(GiB)": 77.56, "step": 96585, "token_acc": 0.49848024316109424, "train_speed(iter/s)": 1.437946 }, { "epoch": 4.13821173043143, "grad_norm": 5.932070732116699, "learning_rate": 7.152595537368084e-06, "loss": 2.178785705566406, "memory(GiB)": 77.56, "step": 96590, "token_acc": 0.5406360424028268, "train_speed(iter/s)": 1.437956 }, { "epoch": 4.1384259457606785, "grad_norm": 5.11445426940918, "learning_rate": 7.149127382100074e-06, "loss": 2.651362991333008, "memory(GiB)": 77.56, "step": 96595, "token_acc": 0.46397694524495675, "train_speed(iter/s)": 1.437956 }, { "epoch": 4.138640161089928, "grad_norm": 8.318365097045898, "learning_rate": 7.145660003115822e-06, "loss": 1.9505754470825196, "memory(GiB)": 77.56, "step": 96600, "token_acc": 0.572992700729927, "train_speed(iter/s)": 1.437949 }, { "epoch": 4.138854376419177, "grad_norm": 9.093037605285645, "learning_rate": 7.142193400478136e-06, "loss": 2.238535499572754, "memory(GiB)": 77.56, "step": 96605, "token_acc": 0.5320754716981132, "train_speed(iter/s)": 1.437958 }, { "epoch": 4.139068591748425, "grad_norm": 5.706610202789307, "learning_rate": 7.138727574249821e-06, "loss": 2.4575634002685547, "memory(GiB)": 77.56, "step": 96610, "token_acc": 0.4759036144578313, "train_speed(iter/s)": 1.437954 }, { "epoch": 4.139282807077675, "grad_norm": 5.96674919128418, "learning_rate": 7.135262524493652e-06, "loss": 2.4098133087158202, "memory(GiB)": 77.56, "step": 96615, "token_acc": 0.512987012987013, "train_speed(iter/s)": 1.437956 }, { "epoch": 4.139497022406924, "grad_norm": 5.251148223876953, "learning_rate": 7.131798251272426e-06, "loss": 2.6687137603759767, "memory(GiB)": 77.56, "step": 96620, "token_acc": 0.4594594594594595, "train_speed(iter/s)": 1.437946 }, { "epoch": 4.139711237736172, "grad_norm": 6.330445289611816, "learning_rate": 7.128334754648891e-06, "loss": 1.9899627685546875, "memory(GiB)": 77.56, "step": 96625, "token_acc": 0.5378486055776892, "train_speed(iter/s)": 1.437941 }, { "epoch": 4.1399254530654215, "grad_norm": 6.628699779510498, "learning_rate": 7.124872034685781e-06, "loss": 2.2640108108520507, "memory(GiB)": 77.56, "step": 96630, "token_acc": 0.5290519877675841, "train_speed(iter/s)": 1.437944 }, { "epoch": 4.140139668394671, "grad_norm": 6.793025016784668, "learning_rate": 7.1214100914458404e-06, "loss": 2.4432655334472657, "memory(GiB)": 77.56, "step": 96635, "token_acc": 0.5107033639143731, "train_speed(iter/s)": 1.437958 }, { "epoch": 4.140353883723919, "grad_norm": 7.4457855224609375, "learning_rate": 7.117948924991769e-06, "loss": 2.290312385559082, "memory(GiB)": 77.56, "step": 96640, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.437951 }, { "epoch": 4.140568099053168, "grad_norm": 6.8137359619140625, "learning_rate": 7.1144885353862714e-06, "loss": 2.3147441864013674, "memory(GiB)": 77.56, "step": 96645, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.437969 }, { "epoch": 4.140782314382418, "grad_norm": 7.0620245933532715, "learning_rate": 7.111028922692065e-06, "loss": 2.5058111190795898, "memory(GiB)": 77.56, "step": 96650, "token_acc": 0.4966442953020134, "train_speed(iter/s)": 1.437976 }, { "epoch": 4.140996529711666, "grad_norm": 5.347829818725586, "learning_rate": 7.107570086971793e-06, "loss": 2.2791021347045897, "memory(GiB)": 77.56, "step": 96655, "token_acc": 0.511326860841424, "train_speed(iter/s)": 1.437984 }, { "epoch": 4.141210745040915, "grad_norm": 6.416926860809326, "learning_rate": 7.104112028288135e-06, "loss": 2.041130256652832, "memory(GiB)": 77.56, "step": 96660, "token_acc": 0.5423728813559322, "train_speed(iter/s)": 1.438006 }, { "epoch": 4.1414249603701645, "grad_norm": 6.40525484085083, "learning_rate": 7.100654746703722e-06, "loss": 2.4706687927246094, "memory(GiB)": 77.56, "step": 96665, "token_acc": 0.5189873417721519, "train_speed(iter/s)": 1.438015 }, { "epoch": 4.141639175699413, "grad_norm": 5.112632751464844, "learning_rate": 7.097198242281189e-06, "loss": 2.180733299255371, "memory(GiB)": 77.56, "step": 96670, "token_acc": 0.5, "train_speed(iter/s)": 1.438014 }, { "epoch": 4.141853391028662, "grad_norm": 5.637341499328613, "learning_rate": 7.093742515083146e-06, "loss": 2.2847274780273437, "memory(GiB)": 77.56, "step": 96675, "token_acc": 0.47202797202797203, "train_speed(iter/s)": 1.43801 }, { "epoch": 4.142067606357911, "grad_norm": 6.188724994659424, "learning_rate": 7.0902875651722215e-06, "loss": 2.177390480041504, "memory(GiB)": 77.56, "step": 96680, "token_acc": 0.5631399317406144, "train_speed(iter/s)": 1.438017 }, { "epoch": 4.14228182168716, "grad_norm": 5.32813024520874, "learning_rate": 7.0868333926109865e-06, "loss": 2.1966196060180665, "memory(GiB)": 77.56, "step": 96685, "token_acc": 0.5379310344827586, "train_speed(iter/s)": 1.438015 }, { "epoch": 4.142496037016409, "grad_norm": 5.9052934646606445, "learning_rate": 7.08337999746202e-06, "loss": 2.3645824432373046, "memory(GiB)": 77.56, "step": 96690, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.438029 }, { "epoch": 4.142710252345658, "grad_norm": 5.724667072296143, "learning_rate": 7.079927379787887e-06, "loss": 2.331894111633301, "memory(GiB)": 77.56, "step": 96695, "token_acc": 0.4756756756756757, "train_speed(iter/s)": 1.43803 }, { "epoch": 4.142924467674907, "grad_norm": 5.6233134269714355, "learning_rate": 7.076475539651117e-06, "loss": 2.080643081665039, "memory(GiB)": 77.56, "step": 96700, "token_acc": 0.5288753799392097, "train_speed(iter/s)": 1.43805 }, { "epoch": 4.143138683004156, "grad_norm": 5.347681999206543, "learning_rate": 7.073024477114276e-06, "loss": 1.9573766708374023, "memory(GiB)": 77.56, "step": 96705, "token_acc": 0.5647058823529412, "train_speed(iter/s)": 1.438056 }, { "epoch": 4.143352898333405, "grad_norm": 5.899239540100098, "learning_rate": 7.069574192239858e-06, "loss": 2.225295639038086, "memory(GiB)": 77.56, "step": 96710, "token_acc": 0.551094890510949, "train_speed(iter/s)": 1.438067 }, { "epoch": 4.1435671136626535, "grad_norm": 6.060286045074463, "learning_rate": 7.06612468509037e-06, "loss": 2.174710273742676, "memory(GiB)": 77.56, "step": 96715, "token_acc": 0.5825242718446602, "train_speed(iter/s)": 1.438083 }, { "epoch": 4.143781328991903, "grad_norm": 6.017674922943115, "learning_rate": 7.062675955728315e-06, "loss": 2.4469844818115236, "memory(GiB)": 77.56, "step": 96720, "token_acc": 0.5124555160142349, "train_speed(iter/s)": 1.43808 }, { "epoch": 4.143995544321152, "grad_norm": 4.882197856903076, "learning_rate": 7.0592280042161675e-06, "loss": 2.641741180419922, "memory(GiB)": 77.56, "step": 96725, "token_acc": 0.4620253164556962, "train_speed(iter/s)": 1.438079 }, { "epoch": 4.1442097596504, "grad_norm": 5.449734687805176, "learning_rate": 7.055780830616382e-06, "loss": 2.0512874603271483, "memory(GiB)": 77.56, "step": 96730, "token_acc": 0.5447761194029851, "train_speed(iter/s)": 1.438078 }, { "epoch": 4.14442397497965, "grad_norm": 7.146668910980225, "learning_rate": 7.052334434991403e-06, "loss": 2.209654998779297, "memory(GiB)": 77.56, "step": 96735, "token_acc": 0.5503875968992248, "train_speed(iter/s)": 1.438091 }, { "epoch": 4.144638190308899, "grad_norm": 7.1566057205200195, "learning_rate": 7.048888817403687e-06, "loss": 2.210976409912109, "memory(GiB)": 77.56, "step": 96740, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.438103 }, { "epoch": 4.144852405638147, "grad_norm": 7.269792556762695, "learning_rate": 7.045443977915639e-06, "loss": 2.459884834289551, "memory(GiB)": 77.56, "step": 96745, "token_acc": 0.46488294314381273, "train_speed(iter/s)": 1.438116 }, { "epoch": 4.1450666209673965, "grad_norm": 7.2651591300964355, "learning_rate": 7.04199991658967e-06, "loss": 2.309665489196777, "memory(GiB)": 77.56, "step": 96750, "token_acc": 0.5016611295681063, "train_speed(iter/s)": 1.438128 }, { "epoch": 4.145280836296646, "grad_norm": 7.6188225746154785, "learning_rate": 7.038556633488169e-06, "loss": 2.226517677307129, "memory(GiB)": 77.56, "step": 96755, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.438129 }, { "epoch": 4.145495051625894, "grad_norm": 6.360135078430176, "learning_rate": 7.035114128673503e-06, "loss": 2.3225950241088866, "memory(GiB)": 77.56, "step": 96760, "token_acc": 0.48363636363636364, "train_speed(iter/s)": 1.438132 }, { "epoch": 4.145709266955143, "grad_norm": 5.371589183807373, "learning_rate": 7.031672402208061e-06, "loss": 2.3779829025268553, "memory(GiB)": 77.56, "step": 96765, "token_acc": 0.4946236559139785, "train_speed(iter/s)": 1.438137 }, { "epoch": 4.145923482284393, "grad_norm": 5.895726203918457, "learning_rate": 7.028231454154183e-06, "loss": 2.240631103515625, "memory(GiB)": 77.56, "step": 96770, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.438147 }, { "epoch": 4.146137697613641, "grad_norm": 6.390429973602295, "learning_rate": 7.0247912845742005e-06, "loss": 2.4046411514282227, "memory(GiB)": 77.56, "step": 96775, "token_acc": 0.46474358974358976, "train_speed(iter/s)": 1.438166 }, { "epoch": 4.14635191294289, "grad_norm": 5.912595272064209, "learning_rate": 7.021351893530437e-06, "loss": 2.308026885986328, "memory(GiB)": 77.56, "step": 96780, "token_acc": 0.5457063711911357, "train_speed(iter/s)": 1.438171 }, { "epoch": 4.1465661282721396, "grad_norm": 5.500983238220215, "learning_rate": 7.0179132810851924e-06, "loss": 2.4201290130615236, "memory(GiB)": 77.56, "step": 96785, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.438191 }, { "epoch": 4.146780343601388, "grad_norm": 5.839399337768555, "learning_rate": 7.014475447300767e-06, "loss": 2.5619298934936525, "memory(GiB)": 77.56, "step": 96790, "token_acc": 0.46648793565683644, "train_speed(iter/s)": 1.43819 }, { "epoch": 4.146994558930637, "grad_norm": 6.6061201095581055, "learning_rate": 7.011038392239455e-06, "loss": 2.0807096481323244, "memory(GiB)": 77.56, "step": 96795, "token_acc": 0.5783582089552238, "train_speed(iter/s)": 1.438193 }, { "epoch": 4.147208774259886, "grad_norm": 7.838625431060791, "learning_rate": 7.007602115963513e-06, "loss": 2.33133487701416, "memory(GiB)": 77.56, "step": 96800, "token_acc": 0.4713804713804714, "train_speed(iter/s)": 1.438201 }, { "epoch": 4.147422989589135, "grad_norm": 6.6929168701171875, "learning_rate": 7.004166618535185e-06, "loss": 2.432495880126953, "memory(GiB)": 77.56, "step": 96805, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.438218 }, { "epoch": 4.147637204918384, "grad_norm": 6.055613994598389, "learning_rate": 7.00073190001671e-06, "loss": 2.3883293151855467, "memory(GiB)": 77.56, "step": 96810, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.438215 }, { "epoch": 4.147851420247633, "grad_norm": 8.714577674865723, "learning_rate": 6.997297960470317e-06, "loss": 2.3065279006958006, "memory(GiB)": 77.56, "step": 96815, "token_acc": 0.5066225165562914, "train_speed(iter/s)": 1.438213 }, { "epoch": 4.148065635576882, "grad_norm": 6.251741886138916, "learning_rate": 6.9938647999582e-06, "loss": 2.1662261962890623, "memory(GiB)": 77.56, "step": 96820, "token_acc": 0.5551601423487544, "train_speed(iter/s)": 1.438209 }, { "epoch": 4.148279850906131, "grad_norm": 8.280163764953613, "learning_rate": 6.990432418542575e-06, "loss": 2.1520479202270506, "memory(GiB)": 77.56, "step": 96825, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 1.438209 }, { "epoch": 4.14849406623538, "grad_norm": 6.963266849517822, "learning_rate": 6.987000816285611e-06, "loss": 2.1842084884643556, "memory(GiB)": 77.56, "step": 96830, "token_acc": 0.5058823529411764, "train_speed(iter/s)": 1.43821 }, { "epoch": 4.148708281564629, "grad_norm": 8.484952926635742, "learning_rate": 6.983569993249478e-06, "loss": 2.486441421508789, "memory(GiB)": 77.56, "step": 96835, "token_acc": 0.4563106796116505, "train_speed(iter/s)": 1.438228 }, { "epoch": 4.148922496893878, "grad_norm": 6.30465841293335, "learning_rate": 6.9801399494963285e-06, "loss": 2.355697822570801, "memory(GiB)": 77.56, "step": 96840, "token_acc": 0.52046783625731, "train_speed(iter/s)": 1.438224 }, { "epoch": 4.149136712223127, "grad_norm": 6.986323833465576, "learning_rate": 6.976710685088289e-06, "loss": 2.2218799591064453, "memory(GiB)": 77.56, "step": 96845, "token_acc": 0.5446808510638298, "train_speed(iter/s)": 1.438228 }, { "epoch": 4.1493509275523754, "grad_norm": 7.946080207824707, "learning_rate": 6.973282200087506e-06, "loss": 2.2127965927124023, "memory(GiB)": 77.56, "step": 96850, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 1.438242 }, { "epoch": 4.149565142881625, "grad_norm": 7.758993148803711, "learning_rate": 6.969854494556077e-06, "loss": 2.3757564544677736, "memory(GiB)": 77.56, "step": 96855, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.438243 }, { "epoch": 4.149779358210874, "grad_norm": 7.61237096786499, "learning_rate": 6.966427568556089e-06, "loss": 2.3386470794677736, "memory(GiB)": 77.56, "step": 96860, "token_acc": 0.4984126984126984, "train_speed(iter/s)": 1.438253 }, { "epoch": 4.149993573540122, "grad_norm": 5.416196823120117, "learning_rate": 6.963001422149646e-06, "loss": 2.200771522521973, "memory(GiB)": 77.56, "step": 96865, "token_acc": 0.5387205387205387, "train_speed(iter/s)": 1.438252 }, { "epoch": 4.150207788869372, "grad_norm": 8.396271705627441, "learning_rate": 6.959576055398798e-06, "loss": 2.2988351821899413, "memory(GiB)": 77.56, "step": 96870, "token_acc": 0.49504950495049505, "train_speed(iter/s)": 1.43823 }, { "epoch": 4.150422004198621, "grad_norm": 7.2163825035095215, "learning_rate": 6.956151468365613e-06, "loss": 2.447060394287109, "memory(GiB)": 77.56, "step": 96875, "token_acc": 0.4603174603174603, "train_speed(iter/s)": 1.438237 }, { "epoch": 4.150636219527869, "grad_norm": 5.654470443725586, "learning_rate": 6.952727661112107e-06, "loss": 2.118244171142578, "memory(GiB)": 77.56, "step": 96880, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.438236 }, { "epoch": 4.1508504348571185, "grad_norm": 6.6399078369140625, "learning_rate": 6.949304633700332e-06, "loss": 2.211338424682617, "memory(GiB)": 77.56, "step": 96885, "token_acc": 0.5617529880478087, "train_speed(iter/s)": 1.438218 }, { "epoch": 4.151064650186368, "grad_norm": 8.192840576171875, "learning_rate": 6.945882386192293e-06, "loss": 2.1926952362060548, "memory(GiB)": 77.56, "step": 96890, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 1.438221 }, { "epoch": 4.151278865515616, "grad_norm": 7.22297477722168, "learning_rate": 6.942460918649979e-06, "loss": 2.6257568359375, "memory(GiB)": 77.56, "step": 96895, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.43822 }, { "epoch": 4.151493080844865, "grad_norm": 5.9305243492126465, "learning_rate": 6.939040231135374e-06, "loss": 2.276951026916504, "memory(GiB)": 77.56, "step": 96900, "token_acc": 0.5076335877862596, "train_speed(iter/s)": 1.438207 }, { "epoch": 4.151707296174115, "grad_norm": 6.744251728057861, "learning_rate": 6.935620323710446e-06, "loss": 2.4633607864379883, "memory(GiB)": 77.56, "step": 96905, "token_acc": 0.46200607902735563, "train_speed(iter/s)": 1.438217 }, { "epoch": 4.151921511503363, "grad_norm": 6.1679534912109375, "learning_rate": 6.9322011964371605e-06, "loss": 2.0164342880249024, "memory(GiB)": 77.56, "step": 96910, "token_acc": 0.5588235294117647, "train_speed(iter/s)": 1.438231 }, { "epoch": 4.152135726832612, "grad_norm": 5.559601306915283, "learning_rate": 6.928782849377447e-06, "loss": 2.392054557800293, "memory(GiB)": 77.56, "step": 96915, "token_acc": 0.5168539325842697, "train_speed(iter/s)": 1.438225 }, { "epoch": 4.1523499421618615, "grad_norm": 6.203959941864014, "learning_rate": 6.925365282593244e-06, "loss": 2.1733688354492187, "memory(GiB)": 77.56, "step": 96920, "token_acc": 0.5368421052631579, "train_speed(iter/s)": 1.438234 }, { "epoch": 4.15256415749111, "grad_norm": 9.566116333007812, "learning_rate": 6.921948496146452e-06, "loss": 2.2709945678710937, "memory(GiB)": 77.56, "step": 96925, "token_acc": 0.4880952380952381, "train_speed(iter/s)": 1.438238 }, { "epoch": 4.152778372820359, "grad_norm": 6.782430171966553, "learning_rate": 6.918532490098961e-06, "loss": 2.418376350402832, "memory(GiB)": 77.56, "step": 96930, "token_acc": 0.5072046109510087, "train_speed(iter/s)": 1.438242 }, { "epoch": 4.152992588149608, "grad_norm": 6.6341872215271, "learning_rate": 6.915117264512672e-06, "loss": 2.424847221374512, "memory(GiB)": 77.56, "step": 96935, "token_acc": 0.5045871559633027, "train_speed(iter/s)": 1.438256 }, { "epoch": 4.153206803478857, "grad_norm": 8.093936920166016, "learning_rate": 6.911702819449456e-06, "loss": 1.952655029296875, "memory(GiB)": 77.56, "step": 96940, "token_acc": 0.5662100456621004, "train_speed(iter/s)": 1.438273 }, { "epoch": 4.153421018808106, "grad_norm": 6.486035346984863, "learning_rate": 6.9082891549711705e-06, "loss": 2.4411787033081054, "memory(GiB)": 77.56, "step": 96945, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.438274 }, { "epoch": 4.153635234137355, "grad_norm": 7.692143440246582, "learning_rate": 6.904876271139643e-06, "loss": 2.8277603149414063, "memory(GiB)": 77.56, "step": 96950, "token_acc": 0.43042071197411, "train_speed(iter/s)": 1.438285 }, { "epoch": 4.153849449466604, "grad_norm": 7.961983680725098, "learning_rate": 6.901464168016713e-06, "loss": 2.3008981704711915, "memory(GiB)": 77.56, "step": 96955, "token_acc": 0.5390625, "train_speed(iter/s)": 1.438297 }, { "epoch": 4.154063664795853, "grad_norm": 5.8426594734191895, "learning_rate": 6.898052845664188e-06, "loss": 2.1396427154541016, "memory(GiB)": 77.56, "step": 96960, "token_acc": 0.5379537953795379, "train_speed(iter/s)": 1.438279 }, { "epoch": 4.154277880125102, "grad_norm": 7.845702648162842, "learning_rate": 6.894642304143856e-06, "loss": 2.1792814254760744, "memory(GiB)": 77.56, "step": 96965, "token_acc": 0.5265306122448979, "train_speed(iter/s)": 1.438293 }, { "epoch": 4.1544920954543505, "grad_norm": 7.331331729888916, "learning_rate": 6.891232543517529e-06, "loss": 1.9727485656738282, "memory(GiB)": 77.56, "step": 96970, "token_acc": 0.5252525252525253, "train_speed(iter/s)": 1.438295 }, { "epoch": 4.1547063107836, "grad_norm": 5.405606746673584, "learning_rate": 6.887823563846962e-06, "loss": 2.4060237884521483, "memory(GiB)": 77.56, "step": 96975, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.438295 }, { "epoch": 4.154920526112849, "grad_norm": 6.257388591766357, "learning_rate": 6.884415365193913e-06, "loss": 2.492660331726074, "memory(GiB)": 77.56, "step": 96980, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.438305 }, { "epoch": 4.155134741442097, "grad_norm": 7.470952987670898, "learning_rate": 6.881007947620127e-06, "loss": 2.0173486709594726, "memory(GiB)": 77.56, "step": 96985, "token_acc": 0.5942492012779552, "train_speed(iter/s)": 1.438313 }, { "epoch": 4.155348956771347, "grad_norm": 6.3645548820495605, "learning_rate": 6.877601311187321e-06, "loss": 2.078144073486328, "memory(GiB)": 77.56, "step": 96990, "token_acc": 0.5274390243902439, "train_speed(iter/s)": 1.43831 }, { "epoch": 4.155563172100596, "grad_norm": 5.185481071472168, "learning_rate": 6.874195455957227e-06, "loss": 2.1442182540893553, "memory(GiB)": 77.56, "step": 96995, "token_acc": 0.5112781954887218, "train_speed(iter/s)": 1.438319 }, { "epoch": 4.155777387429844, "grad_norm": 5.954684257507324, "learning_rate": 6.870790381991538e-06, "loss": 2.166065216064453, "memory(GiB)": 77.56, "step": 97000, "token_acc": 0.5612648221343873, "train_speed(iter/s)": 1.438328 }, { "epoch": 4.155777387429844, "eval_loss": 2.2515976428985596, "eval_runtime": 15.265, "eval_samples_per_second": 6.551, "eval_steps_per_second": 6.551, "eval_token_acc": 0.4630541871921182, "step": 97000 }, { "epoch": 4.1559916027590935, "grad_norm": 6.8432512283325195, "learning_rate": 6.86738608935194e-06, "loss": 2.371525192260742, "memory(GiB)": 77.56, "step": 97005, "token_acc": 0.46616541353383456, "train_speed(iter/s)": 1.437988 }, { "epoch": 4.156205818088343, "grad_norm": 6.098437309265137, "learning_rate": 6.863982578100098e-06, "loss": 2.3131662368774415, "memory(GiB)": 77.56, "step": 97010, "token_acc": 0.5130718954248366, "train_speed(iter/s)": 1.437991 }, { "epoch": 4.156420033417591, "grad_norm": 6.7221550941467285, "learning_rate": 6.860579848297683e-06, "loss": 2.4015438079833986, "memory(GiB)": 77.56, "step": 97015, "token_acc": 0.48231511254019294, "train_speed(iter/s)": 1.437995 }, { "epoch": 4.15663424874684, "grad_norm": 6.369801998138428, "learning_rate": 6.857177900006317e-06, "loss": 2.290873718261719, "memory(GiB)": 77.56, "step": 97020, "token_acc": 0.5244299674267101, "train_speed(iter/s)": 1.438003 }, { "epoch": 4.15684846407609, "grad_norm": 7.332221984863281, "learning_rate": 6.853776733287665e-06, "loss": 2.112679862976074, "memory(GiB)": 77.56, "step": 97025, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 1.438025 }, { "epoch": 4.157062679405338, "grad_norm": 7.508343696594238, "learning_rate": 6.850376348203313e-06, "loss": 2.262190818786621, "memory(GiB)": 77.56, "step": 97030, "token_acc": 0.5250836120401338, "train_speed(iter/s)": 1.438019 }, { "epoch": 4.157276894734587, "grad_norm": 6.953838348388672, "learning_rate": 6.8469767448148755e-06, "loss": 2.7071453094482423, "memory(GiB)": 77.56, "step": 97035, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 1.438011 }, { "epoch": 4.1574911100638365, "grad_norm": 6.497248649597168, "learning_rate": 6.843577923183936e-06, "loss": 1.9916418075561524, "memory(GiB)": 77.56, "step": 97040, "token_acc": 0.5538461538461539, "train_speed(iter/s)": 1.438013 }, { "epoch": 4.157705325393085, "grad_norm": 8.44781494140625, "learning_rate": 6.840179883372066e-06, "loss": 2.198017120361328, "memory(GiB)": 77.56, "step": 97045, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.438022 }, { "epoch": 4.157919540722334, "grad_norm": 6.003915309906006, "learning_rate": 6.836782625440813e-06, "loss": 2.099190521240234, "memory(GiB)": 77.56, "step": 97050, "token_acc": 0.5662251655629139, "train_speed(iter/s)": 1.438025 }, { "epoch": 4.158133756051583, "grad_norm": 5.657415866851807, "learning_rate": 6.833386149451748e-06, "loss": 2.389454650878906, "memory(GiB)": 77.56, "step": 97055, "token_acc": 0.4853801169590643, "train_speed(iter/s)": 1.438044 }, { "epoch": 4.158347971380832, "grad_norm": 5.5086140632629395, "learning_rate": 6.8299904554663785e-06, "loss": 2.06239013671875, "memory(GiB)": 77.56, "step": 97060, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.438035 }, { "epoch": 4.158562186710081, "grad_norm": 6.3214263916015625, "learning_rate": 6.8265955435462325e-06, "loss": 2.307545471191406, "memory(GiB)": 77.56, "step": 97065, "token_acc": 0.5, "train_speed(iter/s)": 1.438028 }, { "epoch": 4.15877640203933, "grad_norm": 7.249273777008057, "learning_rate": 6.823201413752811e-06, "loss": 2.0276695251464845, "memory(GiB)": 77.56, "step": 97070, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.438037 }, { "epoch": 4.158990617368579, "grad_norm": 7.243667125701904, "learning_rate": 6.819808066147587e-06, "loss": 2.564301681518555, "memory(GiB)": 77.56, "step": 97075, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 1.438021 }, { "epoch": 4.159204832697828, "grad_norm": 9.378226280212402, "learning_rate": 6.816415500792056e-06, "loss": 2.2360790252685545, "memory(GiB)": 77.56, "step": 97080, "token_acc": 0.5365853658536586, "train_speed(iter/s)": 1.438018 }, { "epoch": 4.159419048027077, "grad_norm": 6.667816162109375, "learning_rate": 6.813023717747652e-06, "loss": 2.4280372619628907, "memory(GiB)": 77.56, "step": 97085, "token_acc": 0.532608695652174, "train_speed(iter/s)": 1.438003 }, { "epoch": 4.1596332633563255, "grad_norm": 5.489818572998047, "learning_rate": 6.8096327170758535e-06, "loss": 2.134889030456543, "memory(GiB)": 77.56, "step": 97090, "token_acc": 0.5346153846153846, "train_speed(iter/s)": 1.438001 }, { "epoch": 4.159847478685575, "grad_norm": 5.1787109375, "learning_rate": 6.806242498838072e-06, "loss": 2.403506851196289, "memory(GiB)": 77.56, "step": 97095, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.438002 }, { "epoch": 4.160061694014824, "grad_norm": 5.666685104370117, "learning_rate": 6.802853063095727e-06, "loss": 1.9356639862060547, "memory(GiB)": 77.56, "step": 97100, "token_acc": 0.5779467680608364, "train_speed(iter/s)": 1.438012 }, { "epoch": 4.160275909344072, "grad_norm": 6.130240440368652, "learning_rate": 6.799464409910222e-06, "loss": 2.1629528045654296, "memory(GiB)": 77.56, "step": 97105, "token_acc": 0.5436893203883495, "train_speed(iter/s)": 1.438006 }, { "epoch": 4.160490124673322, "grad_norm": 5.2250447273254395, "learning_rate": 6.796076539342933e-06, "loss": 2.1709333419799806, "memory(GiB)": 77.56, "step": 97110, "token_acc": 0.5544871794871795, "train_speed(iter/s)": 1.438011 }, { "epoch": 4.160704340002571, "grad_norm": 5.822183609008789, "learning_rate": 6.792689451455253e-06, "loss": 2.3373680114746094, "memory(GiB)": 77.56, "step": 97115, "token_acc": 0.5013850415512465, "train_speed(iter/s)": 1.438 }, { "epoch": 4.160918555331819, "grad_norm": 6.231523513793945, "learning_rate": 6.78930314630854e-06, "loss": 2.2892728805541993, "memory(GiB)": 77.56, "step": 97120, "token_acc": 0.49333333333333335, "train_speed(iter/s)": 1.438004 }, { "epoch": 4.161132770661069, "grad_norm": 5.831514358520508, "learning_rate": 6.785917623964133e-06, "loss": 2.402569580078125, "memory(GiB)": 77.56, "step": 97125, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 1.437989 }, { "epoch": 4.161346985990318, "grad_norm": 7.194214344024658, "learning_rate": 6.782532884483367e-06, "loss": 2.309739112854004, "memory(GiB)": 77.56, "step": 97130, "token_acc": 0.521613832853026, "train_speed(iter/s)": 1.438003 }, { "epoch": 4.161561201319566, "grad_norm": 6.924748420715332, "learning_rate": 6.779148927927548e-06, "loss": 2.4431758880615235, "memory(GiB)": 77.56, "step": 97135, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.438022 }, { "epoch": 4.1617754166488155, "grad_norm": 5.608720302581787, "learning_rate": 6.775765754358004e-06, "loss": 2.1150487899780273, "memory(GiB)": 77.56, "step": 97140, "token_acc": 0.540625, "train_speed(iter/s)": 1.438048 }, { "epoch": 4.161989631978065, "grad_norm": 8.364985466003418, "learning_rate": 6.77238336383601e-06, "loss": 2.1854007720947264, "memory(GiB)": 77.56, "step": 97145, "token_acc": 0.5198412698412699, "train_speed(iter/s)": 1.438051 }, { "epoch": 4.162203847307313, "grad_norm": 6.343231201171875, "learning_rate": 6.769001756422838e-06, "loss": 2.1295557022094727, "memory(GiB)": 77.56, "step": 97150, "token_acc": 0.5414012738853503, "train_speed(iter/s)": 1.438045 }, { "epoch": 4.162418062636562, "grad_norm": 7.365429878234863, "learning_rate": 6.765620932179756e-06, "loss": 2.4055419921875, "memory(GiB)": 77.56, "step": 97155, "token_acc": 0.5076335877862596, "train_speed(iter/s)": 1.438032 }, { "epoch": 4.162632277965812, "grad_norm": 7.216892242431641, "learning_rate": 6.7622408911679976e-06, "loss": 2.376376724243164, "memory(GiB)": 77.56, "step": 97160, "token_acc": 0.5015384615384615, "train_speed(iter/s)": 1.438011 }, { "epoch": 4.16284649329506, "grad_norm": 5.5754594802856445, "learning_rate": 6.7588616334488045e-06, "loss": 2.0337640762329103, "memory(GiB)": 77.56, "step": 97165, "token_acc": 0.5785123966942148, "train_speed(iter/s)": 1.43801 }, { "epoch": 4.163060708624309, "grad_norm": 5.141740798950195, "learning_rate": 6.7554831590834086e-06, "loss": 2.223271942138672, "memory(GiB)": 77.56, "step": 97170, "token_acc": 0.512280701754386, "train_speed(iter/s)": 1.438006 }, { "epoch": 4.1632749239535585, "grad_norm": 7.582283973693848, "learning_rate": 6.752105468133002e-06, "loss": 2.301517105102539, "memory(GiB)": 77.56, "step": 97175, "token_acc": 0.5087108013937283, "train_speed(iter/s)": 1.438003 }, { "epoch": 4.163489139282807, "grad_norm": 5.5994391441345215, "learning_rate": 6.748728560658774e-06, "loss": 2.0906661987304687, "memory(GiB)": 77.56, "step": 97180, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.438012 }, { "epoch": 4.163703354612056, "grad_norm": 7.829991340637207, "learning_rate": 6.745352436721902e-06, "loss": 2.427323913574219, "memory(GiB)": 77.56, "step": 97185, "token_acc": 0.5152671755725191, "train_speed(iter/s)": 1.438017 }, { "epoch": 4.163917569941305, "grad_norm": 6.009195327758789, "learning_rate": 6.741977096383545e-06, "loss": 2.268332099914551, "memory(GiB)": 77.56, "step": 97190, "token_acc": 0.49843260188087773, "train_speed(iter/s)": 1.438031 }, { "epoch": 4.164131785270554, "grad_norm": 10.224699974060059, "learning_rate": 6.738602539704842e-06, "loss": 2.638759231567383, "memory(GiB)": 77.56, "step": 97195, "token_acc": 0.5117056856187291, "train_speed(iter/s)": 1.438054 }, { "epoch": 4.164346000599803, "grad_norm": 9.838624954223633, "learning_rate": 6.735228766746948e-06, "loss": 2.439218521118164, "memory(GiB)": 77.56, "step": 97200, "token_acc": 0.5076923076923077, "train_speed(iter/s)": 1.438059 }, { "epoch": 4.164560215929052, "grad_norm": 6.8448710441589355, "learning_rate": 6.731855777570972e-06, "loss": 2.155548095703125, "memory(GiB)": 77.56, "step": 97205, "token_acc": 0.541958041958042, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.164774431258301, "grad_norm": 6.567218780517578, "learning_rate": 6.728483572238015e-06, "loss": 2.27388858795166, "memory(GiB)": 77.56, "step": 97210, "token_acc": 0.5175097276264592, "train_speed(iter/s)": 1.438095 }, { "epoch": 4.16498864658755, "grad_norm": 5.403478622436523, "learning_rate": 6.725112150809171e-06, "loss": 2.580466461181641, "memory(GiB)": 77.56, "step": 97215, "token_acc": 0.4702702702702703, "train_speed(iter/s)": 1.438118 }, { "epoch": 4.165202861916799, "grad_norm": 8.584498405456543, "learning_rate": 6.721741513345503e-06, "loss": 2.326335144042969, "memory(GiB)": 77.56, "step": 97220, "token_acc": 0.4560810810810811, "train_speed(iter/s)": 1.438127 }, { "epoch": 4.1654170772460475, "grad_norm": 8.195961952209473, "learning_rate": 6.718371659908101e-06, "loss": 2.3034795761108398, "memory(GiB)": 77.56, "step": 97225, "token_acc": 0.4792243767313019, "train_speed(iter/s)": 1.438129 }, { "epoch": 4.165631292575297, "grad_norm": 5.719882488250732, "learning_rate": 6.715002590557984e-06, "loss": 2.191481018066406, "memory(GiB)": 77.56, "step": 97230, "token_acc": 0.5443786982248521, "train_speed(iter/s)": 1.438123 }, { "epoch": 4.165845507904546, "grad_norm": 10.552340507507324, "learning_rate": 6.711634305356207e-06, "loss": 2.485573959350586, "memory(GiB)": 77.56, "step": 97235, "token_acc": 0.4753521126760563, "train_speed(iter/s)": 1.438118 }, { "epoch": 4.166059723233794, "grad_norm": 6.061198711395264, "learning_rate": 6.708266804363789e-06, "loss": 1.971318817138672, "memory(GiB)": 77.56, "step": 97240, "token_acc": 0.5469798657718121, "train_speed(iter/s)": 1.438113 }, { "epoch": 4.166273938563044, "grad_norm": 6.476930141448975, "learning_rate": 6.704900087641725e-06, "loss": 2.3940757751464843, "memory(GiB)": 77.56, "step": 97245, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.438116 }, { "epoch": 4.166488153892293, "grad_norm": 5.970035552978516, "learning_rate": 6.70153415525101e-06, "loss": 2.409836769104004, "memory(GiB)": 77.56, "step": 97250, "token_acc": 0.4773413897280967, "train_speed(iter/s)": 1.438113 }, { "epoch": 4.166702369221541, "grad_norm": 7.412076473236084, "learning_rate": 6.6981690072526085e-06, "loss": 2.2710716247558596, "memory(GiB)": 77.56, "step": 97255, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.438108 }, { "epoch": 4.1669165845507905, "grad_norm": 6.00469446182251, "learning_rate": 6.694804643707509e-06, "loss": 2.3177921295166017, "memory(GiB)": 77.56, "step": 97260, "token_acc": 0.5129032258064516, "train_speed(iter/s)": 1.438119 }, { "epoch": 4.16713079988004, "grad_norm": 7.901450157165527, "learning_rate": 6.691441064676651e-06, "loss": 2.195972442626953, "memory(GiB)": 77.56, "step": 97265, "token_acc": 0.5427631578947368, "train_speed(iter/s)": 1.438107 }, { "epoch": 4.167345015209288, "grad_norm": 6.463228702545166, "learning_rate": 6.68807827022096e-06, "loss": 2.4190383911132813, "memory(GiB)": 77.56, "step": 97270, "token_acc": 0.5163636363636364, "train_speed(iter/s)": 1.438111 }, { "epoch": 4.167559230538537, "grad_norm": 6.3011860847473145, "learning_rate": 6.684716260401358e-06, "loss": 1.8667572021484375, "memory(GiB)": 77.56, "step": 97275, "token_acc": 0.5923076923076923, "train_speed(iter/s)": 1.43811 }, { "epoch": 4.167773445867787, "grad_norm": 5.087489128112793, "learning_rate": 6.681355035278747e-06, "loss": 2.281972122192383, "memory(GiB)": 77.56, "step": 97280, "token_acc": 0.5055555555555555, "train_speed(iter/s)": 1.438112 }, { "epoch": 4.167987661197035, "grad_norm": 8.16202163696289, "learning_rate": 6.677994594914039e-06, "loss": 2.288997840881348, "memory(GiB)": 77.56, "step": 97285, "token_acc": 0.5121107266435986, "train_speed(iter/s)": 1.438126 }, { "epoch": 4.168201876526284, "grad_norm": 8.033756256103516, "learning_rate": 6.674634939368096e-06, "loss": 2.1416812896728517, "memory(GiB)": 77.56, "step": 97290, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.438148 }, { "epoch": 4.1684160918555335, "grad_norm": 7.147014617919922, "learning_rate": 6.671276068701782e-06, "loss": 2.0187665939331056, "memory(GiB)": 77.56, "step": 97295, "token_acc": 0.5411255411255411, "train_speed(iter/s)": 1.438157 }, { "epoch": 4.168630307184782, "grad_norm": 6.444074630737305, "learning_rate": 6.667917982975947e-06, "loss": 2.212594223022461, "memory(GiB)": 77.56, "step": 97300, "token_acc": 0.5604395604395604, "train_speed(iter/s)": 1.438153 }, { "epoch": 4.168844522514031, "grad_norm": 5.972939968109131, "learning_rate": 6.66456068225142e-06, "loss": 2.571119689941406, "memory(GiB)": 77.56, "step": 97305, "token_acc": 0.43843843843843844, "train_speed(iter/s)": 1.438143 }, { "epoch": 4.16905873784328, "grad_norm": 6.782871723175049, "learning_rate": 6.661204166589025e-06, "loss": 2.6636266708374023, "memory(GiB)": 77.56, "step": 97310, "token_acc": 0.4797047970479705, "train_speed(iter/s)": 1.438155 }, { "epoch": 4.169272953172529, "grad_norm": 5.814826965332031, "learning_rate": 6.657848436049585e-06, "loss": 2.2265905380249023, "memory(GiB)": 77.56, "step": 97315, "token_acc": 0.5512367491166078, "train_speed(iter/s)": 1.43817 }, { "epoch": 4.169487168501778, "grad_norm": 8.261284828186035, "learning_rate": 6.654493490693875e-06, "loss": 2.132475471496582, "memory(GiB)": 77.56, "step": 97320, "token_acc": 0.5515695067264574, "train_speed(iter/s)": 1.438193 }, { "epoch": 4.169701383831027, "grad_norm": 6.549203395843506, "learning_rate": 6.651139330582679e-06, "loss": 1.973699951171875, "memory(GiB)": 77.56, "step": 97325, "token_acc": 0.5362318840579711, "train_speed(iter/s)": 1.438184 }, { "epoch": 4.169915599160276, "grad_norm": 7.741466999053955, "learning_rate": 6.647785955776759e-06, "loss": 2.3276317596435545, "memory(GiB)": 77.56, "step": 97330, "token_acc": 0.46691176470588236, "train_speed(iter/s)": 1.438197 }, { "epoch": 4.170129814489525, "grad_norm": 5.569271087646484, "learning_rate": 6.644433366336861e-06, "loss": 2.323288345336914, "memory(GiB)": 77.56, "step": 97335, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.43821 }, { "epoch": 4.170344029818774, "grad_norm": 7.044909477233887, "learning_rate": 6.6410815623237145e-06, "loss": 2.2888736724853516, "memory(GiB)": 77.56, "step": 97340, "token_acc": 0.4880952380952381, "train_speed(iter/s)": 1.438202 }, { "epoch": 4.1705582451480225, "grad_norm": 6.836385250091553, "learning_rate": 6.6377305437980596e-06, "loss": 2.1167577743530273, "memory(GiB)": 77.56, "step": 97345, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.438211 }, { "epoch": 4.170772460477272, "grad_norm": 7.373457431793213, "learning_rate": 6.634380310820598e-06, "loss": 2.291274642944336, "memory(GiB)": 77.56, "step": 97350, "token_acc": 0.5441696113074205, "train_speed(iter/s)": 1.438214 }, { "epoch": 4.170986675806521, "grad_norm": 5.679426670074463, "learning_rate": 6.6310308634520144e-06, "loss": 2.4195085525512696, "memory(GiB)": 77.56, "step": 97355, "token_acc": 0.4930747922437673, "train_speed(iter/s)": 1.43821 }, { "epoch": 4.171200891135769, "grad_norm": 8.170836448669434, "learning_rate": 6.627682201752988e-06, "loss": 2.4505640029907227, "memory(GiB)": 77.56, "step": 97360, "token_acc": 0.45874587458745875, "train_speed(iter/s)": 1.438215 }, { "epoch": 4.171415106465019, "grad_norm": 5.908185958862305, "learning_rate": 6.6243343257841765e-06, "loss": 2.109127998352051, "memory(GiB)": 77.56, "step": 97365, "token_acc": 0.5523809523809524, "train_speed(iter/s)": 1.438178 }, { "epoch": 4.171629321794268, "grad_norm": 6.141332626342773, "learning_rate": 6.620987235606246e-06, "loss": 2.1970733642578124, "memory(GiB)": 77.56, "step": 97370, "token_acc": 0.5287356321839081, "train_speed(iter/s)": 1.438164 }, { "epoch": 4.171843537123516, "grad_norm": 7.779545307159424, "learning_rate": 6.617640931279828e-06, "loss": 2.199837303161621, "memory(GiB)": 77.56, "step": 97375, "token_acc": 0.5264797507788161, "train_speed(iter/s)": 1.438181 }, { "epoch": 4.1720577524527656, "grad_norm": 5.421979904174805, "learning_rate": 6.614295412865523e-06, "loss": 2.2061885833740233, "memory(GiB)": 77.56, "step": 97380, "token_acc": 0.5523465703971119, "train_speed(iter/s)": 1.438177 }, { "epoch": 4.172271967782015, "grad_norm": 7.4345479011535645, "learning_rate": 6.610950680423972e-06, "loss": 2.268479919433594, "memory(GiB)": 77.56, "step": 97385, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 1.438193 }, { "epoch": 4.172486183111263, "grad_norm": 5.9489336013793945, "learning_rate": 6.607606734015753e-06, "loss": 2.3969257354736326, "memory(GiB)": 77.56, "step": 97390, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.438194 }, { "epoch": 4.172700398440512, "grad_norm": 6.3637919425964355, "learning_rate": 6.604263573701441e-06, "loss": 2.112289047241211, "memory(GiB)": 77.56, "step": 97395, "token_acc": 0.5512367491166078, "train_speed(iter/s)": 1.438205 }, { "epoch": 4.172914613769762, "grad_norm": 6.1700053215026855, "learning_rate": 6.600921199541593e-06, "loss": 2.129522705078125, "memory(GiB)": 77.56, "step": 97400, "token_acc": 0.515358361774744, "train_speed(iter/s)": 1.438225 }, { "epoch": 4.17312882909901, "grad_norm": 4.967783451080322, "learning_rate": 6.597579611596782e-06, "loss": 2.297202491760254, "memory(GiB)": 77.56, "step": 97405, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438235 }, { "epoch": 4.173343044428259, "grad_norm": 6.289876937866211, "learning_rate": 6.594238809927527e-06, "loss": 2.1155399322509765, "memory(GiB)": 77.56, "step": 97410, "token_acc": 0.5232558139534884, "train_speed(iter/s)": 1.438236 }, { "epoch": 4.173557259757509, "grad_norm": 5.384018421173096, "learning_rate": 6.590898794594358e-06, "loss": 2.0759410858154297, "memory(GiB)": 77.56, "step": 97415, "token_acc": 0.5406360424028268, "train_speed(iter/s)": 1.438245 }, { "epoch": 4.173771475086757, "grad_norm": 6.808653354644775, "learning_rate": 6.587559565657775e-06, "loss": 1.897985076904297, "memory(GiB)": 77.56, "step": 97420, "token_acc": 0.5285714285714286, "train_speed(iter/s)": 1.438261 }, { "epoch": 4.173985690416006, "grad_norm": 5.335715293884277, "learning_rate": 6.584221123178263e-06, "loss": 2.418997383117676, "memory(GiB)": 77.56, "step": 97425, "token_acc": 0.4937888198757764, "train_speed(iter/s)": 1.438263 }, { "epoch": 4.1741999057452555, "grad_norm": 6.962917804718018, "learning_rate": 6.580883467216326e-06, "loss": 2.406814193725586, "memory(GiB)": 77.56, "step": 97430, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.438263 }, { "epoch": 4.174414121074504, "grad_norm": 6.551785945892334, "learning_rate": 6.577546597832413e-06, "loss": 2.2140954971313476, "memory(GiB)": 77.56, "step": 97435, "token_acc": 0.5, "train_speed(iter/s)": 1.438246 }, { "epoch": 4.174628336403753, "grad_norm": 9.075545310974121, "learning_rate": 6.574210515086982e-06, "loss": 2.3686283111572264, "memory(GiB)": 77.56, "step": 97440, "token_acc": 0.5034722222222222, "train_speed(iter/s)": 1.438244 }, { "epoch": 4.174842551733002, "grad_norm": 5.752353191375732, "learning_rate": 6.570875219040457e-06, "loss": 2.3777570724487305, "memory(GiB)": 77.56, "step": 97445, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 1.438259 }, { "epoch": 4.175056767062251, "grad_norm": 9.251683235168457, "learning_rate": 6.567540709753262e-06, "loss": 2.174790382385254, "memory(GiB)": 77.56, "step": 97450, "token_acc": 0.5531135531135531, "train_speed(iter/s)": 1.438251 }, { "epoch": 4.1752709823915, "grad_norm": 7.3459296226501465, "learning_rate": 6.564206987285809e-06, "loss": 2.243643379211426, "memory(GiB)": 77.56, "step": 97455, "token_acc": 0.5135135135135135, "train_speed(iter/s)": 1.43826 }, { "epoch": 4.175485197720749, "grad_norm": 5.697394371032715, "learning_rate": 6.560874051698502e-06, "loss": 2.3248973846435548, "memory(GiB)": 77.56, "step": 97460, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.438272 }, { "epoch": 4.175699413049998, "grad_norm": 7.8997673988342285, "learning_rate": 6.557541903051712e-06, "loss": 2.3239925384521483, "memory(GiB)": 77.56, "step": 97465, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.438279 }, { "epoch": 4.175913628379247, "grad_norm": 6.572592258453369, "learning_rate": 6.554210541405803e-06, "loss": 2.3544687271118163, "memory(GiB)": 77.56, "step": 97470, "token_acc": 0.5018450184501845, "train_speed(iter/s)": 1.438275 }, { "epoch": 4.176127843708496, "grad_norm": 9.373518943786621, "learning_rate": 6.550879966821122e-06, "loss": 2.298045349121094, "memory(GiB)": 77.56, "step": 97475, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438271 }, { "epoch": 4.1763420590377445, "grad_norm": 9.53109073638916, "learning_rate": 6.547550179358014e-06, "loss": 2.5055658340454103, "memory(GiB)": 77.56, "step": 97480, "token_acc": 0.501779359430605, "train_speed(iter/s)": 1.438271 }, { "epoch": 4.176556274366994, "grad_norm": 5.433248043060303, "learning_rate": 6.544221179076782e-06, "loss": 2.0234601974487303, "memory(GiB)": 77.56, "step": 97485, "token_acc": 0.49612403100775193, "train_speed(iter/s)": 1.438262 }, { "epoch": 4.176770489696243, "grad_norm": 7.148240566253662, "learning_rate": 6.5408929660377595e-06, "loss": 2.3061311721801756, "memory(GiB)": 77.56, "step": 97490, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 1.438277 }, { "epoch": 4.176984705025491, "grad_norm": 5.337233543395996, "learning_rate": 6.537565540301227e-06, "loss": 2.241322708129883, "memory(GiB)": 77.56, "step": 97495, "token_acc": 0.5379061371841155, "train_speed(iter/s)": 1.438294 }, { "epoch": 4.177198920354741, "grad_norm": 13.385344505310059, "learning_rate": 6.534238901927469e-06, "loss": 2.5765384674072265, "memory(GiB)": 77.56, "step": 97500, "token_acc": 0.4503105590062112, "train_speed(iter/s)": 1.438301 }, { "epoch": 4.177198920354741, "eval_loss": 2.1568682193756104, "eval_runtime": 13.9265, "eval_samples_per_second": 7.181, "eval_steps_per_second": 7.181, "eval_token_acc": 0.48476454293628807, "step": 97500 }, { "epoch": 4.17741313568399, "grad_norm": 5.83855676651001, "learning_rate": 6.530913050976744e-06, "loss": 2.324435997009277, "memory(GiB)": 77.56, "step": 97505, "token_acc": 0.4935960591133005, "train_speed(iter/s)": 1.438004 }, { "epoch": 4.177627351013238, "grad_norm": 8.217616081237793, "learning_rate": 6.527587987509299e-06, "loss": 2.521082878112793, "memory(GiB)": 77.56, "step": 97510, "token_acc": 0.4769874476987448, "train_speed(iter/s)": 1.438018 }, { "epoch": 4.1778415663424875, "grad_norm": 6.036564826965332, "learning_rate": 6.5242637115853876e-06, "loss": 2.2948129653930662, "memory(GiB)": 77.56, "step": 97515, "token_acc": 0.5035211267605634, "train_speed(iter/s)": 1.438018 }, { "epoch": 4.178055781671737, "grad_norm": 5.985663414001465, "learning_rate": 6.520940223265226e-06, "loss": 2.241323471069336, "memory(GiB)": 77.56, "step": 97520, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 1.438014 }, { "epoch": 4.178269997000985, "grad_norm": 6.278834342956543, "learning_rate": 6.517617522609015e-06, "loss": 2.413946533203125, "memory(GiB)": 77.56, "step": 97525, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 1.438008 }, { "epoch": 4.178484212330234, "grad_norm": 9.926697731018066, "learning_rate": 6.51429560967694e-06, "loss": 2.7918434143066406, "memory(GiB)": 77.56, "step": 97530, "token_acc": 0.4568345323741007, "train_speed(iter/s)": 1.438033 }, { "epoch": 4.178698427659484, "grad_norm": 6.260964393615723, "learning_rate": 6.510974484529209e-06, "loss": 2.324381637573242, "memory(GiB)": 77.56, "step": 97535, "token_acc": 0.504885993485342, "train_speed(iter/s)": 1.43803 }, { "epoch": 4.178912642988732, "grad_norm": 4.834085941314697, "learning_rate": 6.5076541472259666e-06, "loss": 2.2774003982543944, "memory(GiB)": 77.56, "step": 97540, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 1.438038 }, { "epoch": 4.179126858317981, "grad_norm": 6.412882328033447, "learning_rate": 6.50433459782736e-06, "loss": 2.52331600189209, "memory(GiB)": 77.56, "step": 97545, "token_acc": 0.4489795918367347, "train_speed(iter/s)": 1.438046 }, { "epoch": 4.1793410736472305, "grad_norm": 4.971535682678223, "learning_rate": 6.501015836393543e-06, "loss": 2.296042251586914, "memory(GiB)": 77.56, "step": 97550, "token_acc": 0.5026737967914439, "train_speed(iter/s)": 1.438053 }, { "epoch": 4.179555288976479, "grad_norm": 6.823814392089844, "learning_rate": 6.497697862984631e-06, "loss": 2.1310056686401366, "memory(GiB)": 77.56, "step": 97555, "token_acc": 0.5774647887323944, "train_speed(iter/s)": 1.438066 }, { "epoch": 4.179769504305728, "grad_norm": 13.579385757446289, "learning_rate": 6.494380677660733e-06, "loss": 2.1263118743896485, "memory(GiB)": 77.56, "step": 97560, "token_acc": 0.5327868852459017, "train_speed(iter/s)": 1.438073 }, { "epoch": 4.179983719634977, "grad_norm": 6.955369472503662, "learning_rate": 6.491064280481934e-06, "loss": 2.122196006774902, "memory(GiB)": 77.56, "step": 97565, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.438067 }, { "epoch": 4.180197934964226, "grad_norm": 5.315395832061768, "learning_rate": 6.4877486715083144e-06, "loss": 2.205295753479004, "memory(GiB)": 77.56, "step": 97570, "token_acc": 0.5517241379310345, "train_speed(iter/s)": 1.438065 }, { "epoch": 4.180412150293475, "grad_norm": 6.053256034851074, "learning_rate": 6.484433850799959e-06, "loss": 2.146514129638672, "memory(GiB)": 77.56, "step": 97575, "token_acc": 0.5387205387205387, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.180626365622724, "grad_norm": 6.181568622589111, "learning_rate": 6.481119818416898e-06, "loss": 2.221596527099609, "memory(GiB)": 77.56, "step": 97580, "token_acc": 0.5382165605095541, "train_speed(iter/s)": 1.438074 }, { "epoch": 4.180840580951973, "grad_norm": 5.924911022186279, "learning_rate": 6.477806574419182e-06, "loss": 2.24060173034668, "memory(GiB)": 77.56, "step": 97585, "token_acc": 0.49795918367346936, "train_speed(iter/s)": 1.438068 }, { "epoch": 4.181054796281222, "grad_norm": 6.827852725982666, "learning_rate": 6.474494118866825e-06, "loss": 1.9550445556640625, "memory(GiB)": 77.56, "step": 97590, "token_acc": 0.5991189427312775, "train_speed(iter/s)": 1.438067 }, { "epoch": 4.181269011610471, "grad_norm": 8.712571144104004, "learning_rate": 6.471182451819824e-06, "loss": 2.5854272842407227, "memory(GiB)": 77.56, "step": 97595, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 1.438049 }, { "epoch": 4.1814832269397195, "grad_norm": 10.523072242736816, "learning_rate": 6.467871573338186e-06, "loss": 2.2612483978271483, "memory(GiB)": 77.56, "step": 97600, "token_acc": 0.5347222222222222, "train_speed(iter/s)": 1.438035 }, { "epoch": 4.181697442268969, "grad_norm": 6.188942909240723, "learning_rate": 6.464561483481907e-06, "loss": 2.011410713195801, "memory(GiB)": 77.56, "step": 97605, "token_acc": 0.5547703180212014, "train_speed(iter/s)": 1.438048 }, { "epoch": 4.181911657598218, "grad_norm": 6.513525485992432, "learning_rate": 6.461252182310929e-06, "loss": 2.52322998046875, "memory(GiB)": 77.56, "step": 97610, "token_acc": 0.49838187702265374, "train_speed(iter/s)": 1.438065 }, { "epoch": 4.182125872927466, "grad_norm": 5.843857288360596, "learning_rate": 6.457943669885219e-06, "loss": 1.9907096862792968, "memory(GiB)": 77.56, "step": 97615, "token_acc": 0.5400696864111498, "train_speed(iter/s)": 1.438071 }, { "epoch": 4.182340088256716, "grad_norm": 4.699541091918945, "learning_rate": 6.4546359462646975e-06, "loss": 2.001882553100586, "memory(GiB)": 77.56, "step": 97620, "token_acc": 0.5304659498207885, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.182554303585965, "grad_norm": 7.146362781524658, "learning_rate": 6.4513290115093e-06, "loss": 2.1803197860717773, "memory(GiB)": 77.56, "step": 97625, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.438076 }, { "epoch": 4.182768518915213, "grad_norm": 5.4522881507873535, "learning_rate": 6.448022865678916e-06, "loss": 2.0480764389038084, "memory(GiB)": 77.56, "step": 97630, "token_acc": 0.553030303030303, "train_speed(iter/s)": 1.438084 }, { "epoch": 4.1829827342444625, "grad_norm": 7.047962665557861, "learning_rate": 6.444717508833464e-06, "loss": 2.206417465209961, "memory(GiB)": 77.56, "step": 97635, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.43808 }, { "epoch": 4.183196949573712, "grad_norm": 8.34561538696289, "learning_rate": 6.441412941032809e-06, "loss": 2.111255073547363, "memory(GiB)": 77.56, "step": 97640, "token_acc": 0.5482456140350878, "train_speed(iter/s)": 1.438079 }, { "epoch": 4.18341116490296, "grad_norm": 7.390643119812012, "learning_rate": 6.4381091623368254e-06, "loss": 2.3486318588256836, "memory(GiB)": 77.56, "step": 97645, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 1.438083 }, { "epoch": 4.183625380232209, "grad_norm": 7.53248405456543, "learning_rate": 6.434806172805358e-06, "loss": 2.304783248901367, "memory(GiB)": 77.56, "step": 97650, "token_acc": 0.4612676056338028, "train_speed(iter/s)": 1.438094 }, { "epoch": 4.183839595561459, "grad_norm": 5.743891716003418, "learning_rate": 6.431503972498232e-06, "loss": 2.2595176696777344, "memory(GiB)": 77.56, "step": 97655, "token_acc": 0.5032258064516129, "train_speed(iter/s)": 1.438106 }, { "epoch": 4.184053810890707, "grad_norm": 6.216049671173096, "learning_rate": 6.428202561475288e-06, "loss": 2.3658281326293946, "memory(GiB)": 77.56, "step": 97660, "token_acc": 0.5301724137931034, "train_speed(iter/s)": 1.438122 }, { "epoch": 4.184268026219956, "grad_norm": 13.739350318908691, "learning_rate": 6.424901939796335e-06, "loss": 2.116403579711914, "memory(GiB)": 77.56, "step": 97665, "token_acc": 0.53, "train_speed(iter/s)": 1.438127 }, { "epoch": 4.184482241549206, "grad_norm": 8.467107772827148, "learning_rate": 6.421602107521157e-06, "loss": 2.258096694946289, "memory(GiB)": 77.56, "step": 97670, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.438125 }, { "epoch": 4.184696456878454, "grad_norm": 7.090216159820557, "learning_rate": 6.41830306470953e-06, "loss": 2.169106864929199, "memory(GiB)": 77.56, "step": 97675, "token_acc": 0.5591397849462365, "train_speed(iter/s)": 1.438141 }, { "epoch": 4.184910672207703, "grad_norm": 8.354065895080566, "learning_rate": 6.415004811421232e-06, "loss": 1.9704107284545898, "memory(GiB)": 77.56, "step": 97680, "token_acc": 0.56640625, "train_speed(iter/s)": 1.438139 }, { "epoch": 4.185124887536952, "grad_norm": 5.548110008239746, "learning_rate": 6.411707347716012e-06, "loss": 2.150985527038574, "memory(GiB)": 77.56, "step": 97685, "token_acc": 0.5508196721311476, "train_speed(iter/s)": 1.438143 }, { "epoch": 4.185339102866201, "grad_norm": 6.08723783493042, "learning_rate": 6.408410673653586e-06, "loss": 2.199851417541504, "memory(GiB)": 77.56, "step": 97690, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 1.43815 }, { "epoch": 4.18555331819545, "grad_norm": 6.072889804840088, "learning_rate": 6.405114789293709e-06, "loss": 2.3310268402099608, "memory(GiB)": 77.56, "step": 97695, "token_acc": 0.5204081632653061, "train_speed(iter/s)": 1.438156 }, { "epoch": 4.185767533524699, "grad_norm": 6.160730361938477, "learning_rate": 6.40181969469607e-06, "loss": 2.2882965087890623, "memory(GiB)": 77.56, "step": 97700, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 1.438163 }, { "epoch": 4.185981748853948, "grad_norm": 7.00139045715332, "learning_rate": 6.398525389920368e-06, "loss": 2.1385366439819338, "memory(GiB)": 77.56, "step": 97705, "token_acc": 0.5390334572490706, "train_speed(iter/s)": 1.438176 }, { "epoch": 4.186195964183197, "grad_norm": 6.793406963348389, "learning_rate": 6.395231875026275e-06, "loss": 2.4661527633666993, "memory(GiB)": 77.56, "step": 97710, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.438172 }, { "epoch": 4.186410179512446, "grad_norm": 8.219132423400879, "learning_rate": 6.391939150073456e-06, "loss": 2.453174591064453, "memory(GiB)": 77.56, "step": 97715, "token_acc": 0.5115511551155115, "train_speed(iter/s)": 1.438166 }, { "epoch": 4.186624394841695, "grad_norm": 7.5466156005859375, "learning_rate": 6.388647215121579e-06, "loss": 2.2419387817382814, "memory(GiB)": 77.56, "step": 97720, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.438174 }, { "epoch": 4.186838610170944, "grad_norm": 7.3004326820373535, "learning_rate": 6.385356070230264e-06, "loss": 2.2508113861083983, "memory(GiB)": 77.56, "step": 97725, "token_acc": 0.47017543859649125, "train_speed(iter/s)": 1.438168 }, { "epoch": 4.187052825500193, "grad_norm": 6.568329811096191, "learning_rate": 6.38206571545914e-06, "loss": 2.0413450241088866, "memory(GiB)": 77.56, "step": 97730, "token_acc": 0.546875, "train_speed(iter/s)": 1.438171 }, { "epoch": 4.1872670408294415, "grad_norm": 10.363677978515625, "learning_rate": 6.37877615086781e-06, "loss": 2.472368621826172, "memory(GiB)": 77.56, "step": 97735, "token_acc": 0.4591549295774648, "train_speed(iter/s)": 1.438153 }, { "epoch": 4.187481256158691, "grad_norm": 8.492854118347168, "learning_rate": 6.375487376515859e-06, "loss": 2.3931474685668945, "memory(GiB)": 77.56, "step": 97740, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.438161 }, { "epoch": 4.18769547148794, "grad_norm": 5.212835311889648, "learning_rate": 6.372199392462891e-06, "loss": 1.8894960403442382, "memory(GiB)": 77.56, "step": 97745, "token_acc": 0.5466101694915254, "train_speed(iter/s)": 1.438182 }, { "epoch": 4.187909686817188, "grad_norm": 8.418830871582031, "learning_rate": 6.368912198768445e-06, "loss": 2.198039245605469, "memory(GiB)": 77.56, "step": 97750, "token_acc": 0.4773413897280967, "train_speed(iter/s)": 1.438171 }, { "epoch": 4.188123902146438, "grad_norm": 8.315765380859375, "learning_rate": 6.365625795492092e-06, "loss": 2.1980485916137695, "memory(GiB)": 77.56, "step": 97755, "token_acc": 0.5132075471698113, "train_speed(iter/s)": 1.43818 }, { "epoch": 4.188338117475687, "grad_norm": 9.58134651184082, "learning_rate": 6.362340182693366e-06, "loss": 2.1204824447631836, "memory(GiB)": 77.56, "step": 97760, "token_acc": 0.49264705882352944, "train_speed(iter/s)": 1.438172 }, { "epoch": 4.188552332804935, "grad_norm": 5.854559898376465, "learning_rate": 6.3590553604317756e-06, "loss": 2.447714996337891, "memory(GiB)": 77.56, "step": 97765, "token_acc": 0.5143769968051118, "train_speed(iter/s)": 1.438187 }, { "epoch": 4.1887665481341845, "grad_norm": 7.2104363441467285, "learning_rate": 6.3557713287668405e-06, "loss": 2.108713150024414, "memory(GiB)": 77.56, "step": 97770, "token_acc": 0.5732899022801303, "train_speed(iter/s)": 1.438199 }, { "epoch": 4.188980763463434, "grad_norm": 5.27855920791626, "learning_rate": 6.3524880877580405e-06, "loss": 1.8638744354248047, "memory(GiB)": 77.56, "step": 97775, "token_acc": 0.5415162454873647, "train_speed(iter/s)": 1.438193 }, { "epoch": 4.189194978792682, "grad_norm": 6.88019323348999, "learning_rate": 6.349205637464872e-06, "loss": 2.152066230773926, "memory(GiB)": 77.56, "step": 97780, "token_acc": 0.5234375, "train_speed(iter/s)": 1.438186 }, { "epoch": 4.189409194121931, "grad_norm": 6.190378665924072, "learning_rate": 6.345923977946789e-06, "loss": 2.0639495849609375, "memory(GiB)": 77.56, "step": 97785, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.438175 }, { "epoch": 4.189623409451181, "grad_norm": 6.240383625030518, "learning_rate": 6.3426431092632465e-06, "loss": 2.1495569229125975, "memory(GiB)": 77.56, "step": 97790, "token_acc": 0.5364238410596026, "train_speed(iter/s)": 1.438173 }, { "epoch": 4.189837624780429, "grad_norm": 7.886990070343018, "learning_rate": 6.339363031473677e-06, "loss": 2.1694637298583985, "memory(GiB)": 77.56, "step": 97795, "token_acc": 0.4899135446685879, "train_speed(iter/s)": 1.438139 }, { "epoch": 4.190051840109678, "grad_norm": 9.617639541625977, "learning_rate": 6.3360837446374945e-06, "loss": 2.3558122634887697, "memory(GiB)": 77.56, "step": 97800, "token_acc": 0.5, "train_speed(iter/s)": 1.438146 }, { "epoch": 4.1902660554389275, "grad_norm": 5.966063976287842, "learning_rate": 6.3328052488141225e-06, "loss": 2.286698913574219, "memory(GiB)": 77.56, "step": 97805, "token_acc": 0.4981549815498155, "train_speed(iter/s)": 1.438145 }, { "epoch": 4.190480270768176, "grad_norm": 7.710631370544434, "learning_rate": 6.329527544062952e-06, "loss": 2.4361454010009767, "memory(GiB)": 77.56, "step": 97810, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.43816 }, { "epoch": 4.190694486097425, "grad_norm": 5.625353813171387, "learning_rate": 6.326250630443348e-06, "loss": 2.374558448791504, "memory(GiB)": 77.56, "step": 97815, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.438171 }, { "epoch": 4.190908701426674, "grad_norm": 5.857768535614014, "learning_rate": 6.32297450801469e-06, "loss": 2.1345306396484376, "memory(GiB)": 77.56, "step": 97820, "token_acc": 0.5030674846625767, "train_speed(iter/s)": 1.438184 }, { "epoch": 4.191122916755923, "grad_norm": 5.928982257843018, "learning_rate": 6.319699176836308e-06, "loss": 2.4232322692871096, "memory(GiB)": 77.56, "step": 97825, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.4382 }, { "epoch": 4.191337132085172, "grad_norm": 6.280635356903076, "learning_rate": 6.316424636967561e-06, "loss": 2.127361297607422, "memory(GiB)": 77.56, "step": 97830, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.438216 }, { "epoch": 4.191551347414421, "grad_norm": 6.911950588226318, "learning_rate": 6.313150888467751e-06, "loss": 2.2145158767700197, "memory(GiB)": 77.56, "step": 97835, "token_acc": 0.5469798657718121, "train_speed(iter/s)": 1.438214 }, { "epoch": 4.19176556274367, "grad_norm": 7.9430623054504395, "learning_rate": 6.309877931396202e-06, "loss": 2.287306213378906, "memory(GiB)": 77.56, "step": 97840, "token_acc": 0.49642857142857144, "train_speed(iter/s)": 1.438232 }, { "epoch": 4.191979778072919, "grad_norm": 7.1939697265625, "learning_rate": 6.306605765812202e-06, "loss": 2.0288459777832033, "memory(GiB)": 77.56, "step": 97845, "token_acc": 0.5517241379310345, "train_speed(iter/s)": 1.438224 }, { "epoch": 4.192193993402168, "grad_norm": 8.208932876586914, "learning_rate": 6.3033343917750235e-06, "loss": 1.899272346496582, "memory(GiB)": 77.56, "step": 97850, "token_acc": 0.581081081081081, "train_speed(iter/s)": 1.438229 }, { "epoch": 4.1924082087314165, "grad_norm": 6.498878479003906, "learning_rate": 6.300063809343936e-06, "loss": 2.156787109375, "memory(GiB)": 77.56, "step": 97855, "token_acc": 0.5234657039711191, "train_speed(iter/s)": 1.438243 }, { "epoch": 4.192622424060666, "grad_norm": 5.411504745483398, "learning_rate": 6.2967940185781785e-06, "loss": 2.2550283432006837, "memory(GiB)": 77.56, "step": 97860, "token_acc": 0.5, "train_speed(iter/s)": 1.438255 }, { "epoch": 4.192836639389915, "grad_norm": 7.424903392791748, "learning_rate": 6.293525019537e-06, "loss": 2.3128299713134766, "memory(GiB)": 77.56, "step": 97865, "token_acc": 0.5077881619937694, "train_speed(iter/s)": 1.438269 }, { "epoch": 4.193050854719163, "grad_norm": 5.775293827056885, "learning_rate": 6.290256812279616e-06, "loss": 2.1359560012817385, "memory(GiB)": 77.56, "step": 97870, "token_acc": 0.4944237918215613, "train_speed(iter/s)": 1.438273 }, { "epoch": 4.193265070048413, "grad_norm": 6.167384624481201, "learning_rate": 6.286989396865234e-06, "loss": 2.305561065673828, "memory(GiB)": 77.56, "step": 97875, "token_acc": 0.5290102389078498, "train_speed(iter/s)": 1.438273 }, { "epoch": 4.193479285377662, "grad_norm": 4.649319648742676, "learning_rate": 6.283722773353046e-06, "loss": 2.201676368713379, "memory(GiB)": 77.56, "step": 97880, "token_acc": 0.5212121212121212, "train_speed(iter/s)": 1.438269 }, { "epoch": 4.19369350070691, "grad_norm": 7.607011795043945, "learning_rate": 6.280456941802215e-06, "loss": 2.393903923034668, "memory(GiB)": 77.56, "step": 97885, "token_acc": 0.5321100917431193, "train_speed(iter/s)": 1.438282 }, { "epoch": 4.1939077160361595, "grad_norm": 6.558903217315674, "learning_rate": 6.277191902271934e-06, "loss": 2.205317497253418, "memory(GiB)": 77.56, "step": 97890, "token_acc": 0.5387596899224806, "train_speed(iter/s)": 1.438305 }, { "epoch": 4.194121931365409, "grad_norm": 5.9045796394348145, "learning_rate": 6.273927654821321e-06, "loss": 2.454580307006836, "memory(GiB)": 77.56, "step": 97895, "token_acc": 0.44936708860759494, "train_speed(iter/s)": 1.43832 }, { "epoch": 4.194336146694657, "grad_norm": 6.074865341186523, "learning_rate": 6.2706641995095405e-06, "loss": 2.208053207397461, "memory(GiB)": 77.56, "step": 97900, "token_acc": 0.5136186770428015, "train_speed(iter/s)": 1.438322 }, { "epoch": 4.194550362023906, "grad_norm": 6.615030288696289, "learning_rate": 6.267401536395701e-06, "loss": 2.2229873657226564, "memory(GiB)": 77.56, "step": 97905, "token_acc": 0.5093632958801498, "train_speed(iter/s)": 1.438318 }, { "epoch": 4.194764577353156, "grad_norm": 5.493546009063721, "learning_rate": 6.2641396655389095e-06, "loss": 2.2882667541503907, "memory(GiB)": 77.56, "step": 97910, "token_acc": 0.5252225519287834, "train_speed(iter/s)": 1.438315 }, { "epoch": 4.194978792682404, "grad_norm": 5.833301544189453, "learning_rate": 6.2608785869982475e-06, "loss": 2.1958927154541015, "memory(GiB)": 77.56, "step": 97915, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 1.438317 }, { "epoch": 4.195193008011653, "grad_norm": 5.743492126464844, "learning_rate": 6.257618300832796e-06, "loss": 2.340970993041992, "memory(GiB)": 77.56, "step": 97920, "token_acc": 0.5229681978798587, "train_speed(iter/s)": 1.438321 }, { "epoch": 4.1954072233409025, "grad_norm": 6.084545612335205, "learning_rate": 6.254358807101635e-06, "loss": 2.1390939712524415, "memory(GiB)": 77.56, "step": 97925, "token_acc": 0.545774647887324, "train_speed(iter/s)": 1.438317 }, { "epoch": 4.195621438670151, "grad_norm": 6.5965352058410645, "learning_rate": 6.251100105863794e-06, "loss": 2.244105339050293, "memory(GiB)": 77.56, "step": 97930, "token_acc": 0.5547445255474452, "train_speed(iter/s)": 1.438319 }, { "epoch": 4.1958356539994, "grad_norm": 5.935375690460205, "learning_rate": 6.247842197178317e-06, "loss": 2.026915740966797, "memory(GiB)": 77.56, "step": 97935, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.438295 }, { "epoch": 4.196049869328649, "grad_norm": 5.711519241333008, "learning_rate": 6.2445850811042264e-06, "loss": 2.078433609008789, "memory(GiB)": 77.56, "step": 97940, "token_acc": 0.5261044176706827, "train_speed(iter/s)": 1.438309 }, { "epoch": 4.196264084657898, "grad_norm": 5.879454135894775, "learning_rate": 6.241328757700505e-06, "loss": 2.2151498794555664, "memory(GiB)": 77.56, "step": 97945, "token_acc": 0.5399239543726235, "train_speed(iter/s)": 1.438329 }, { "epoch": 4.196478299987147, "grad_norm": 5.962684154510498, "learning_rate": 6.238073227026176e-06, "loss": 2.297053909301758, "memory(GiB)": 77.56, "step": 97950, "token_acc": 0.5, "train_speed(iter/s)": 1.438335 }, { "epoch": 4.196692515316396, "grad_norm": 6.07538366317749, "learning_rate": 6.234818489140204e-06, "loss": 2.4884803771972654, "memory(GiB)": 77.56, "step": 97955, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.438328 }, { "epoch": 4.196906730645645, "grad_norm": 7.4774932861328125, "learning_rate": 6.231564544101548e-06, "loss": 2.5031810760498048, "memory(GiB)": 77.56, "step": 97960, "token_acc": 0.5, "train_speed(iter/s)": 1.438321 }, { "epoch": 4.197120945974894, "grad_norm": 5.270471096038818, "learning_rate": 6.22831139196916e-06, "loss": 2.3547115325927734, "memory(GiB)": 77.56, "step": 97965, "token_acc": 0.45751633986928103, "train_speed(iter/s)": 1.438317 }, { "epoch": 4.197335161304143, "grad_norm": 7.197223663330078, "learning_rate": 6.225059032801961e-06, "loss": 2.2638641357421876, "memory(GiB)": 77.56, "step": 97970, "token_acc": 0.5364431486880467, "train_speed(iter/s)": 1.438324 }, { "epoch": 4.197549376633392, "grad_norm": 5.827846050262451, "learning_rate": 6.221807466658891e-06, "loss": 2.5818321228027346, "memory(GiB)": 77.56, "step": 97975, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.438332 }, { "epoch": 4.197763591962641, "grad_norm": 7.089547157287598, "learning_rate": 6.2185566935988375e-06, "loss": 1.9585094451904297, "memory(GiB)": 77.56, "step": 97980, "token_acc": 0.5866141732283464, "train_speed(iter/s)": 1.43833 }, { "epoch": 4.19797780729189, "grad_norm": 6.133625030517578, "learning_rate": 6.2153067136807076e-06, "loss": 2.40917854309082, "memory(GiB)": 77.56, "step": 97985, "token_acc": 0.4876543209876543, "train_speed(iter/s)": 1.438353 }, { "epoch": 4.198192022621138, "grad_norm": 5.699099063873291, "learning_rate": 6.212057526963372e-06, "loss": 2.2886009216308594, "memory(GiB)": 77.56, "step": 97990, "token_acc": 0.4887640449438202, "train_speed(iter/s)": 1.438359 }, { "epoch": 4.198406237950388, "grad_norm": 5.499150276184082, "learning_rate": 6.2088091335056945e-06, "loss": 2.3925298690795898, "memory(GiB)": 77.56, "step": 97995, "token_acc": 0.5226586102719033, "train_speed(iter/s)": 1.438378 }, { "epoch": 4.198620453279637, "grad_norm": 5.269199848175049, "learning_rate": 6.205561533366511e-06, "loss": 2.0267637252807615, "memory(GiB)": 77.56, "step": 98000, "token_acc": 0.551829268292683, "train_speed(iter/s)": 1.43839 }, { "epoch": 4.198620453279637, "eval_loss": 2.1447606086730957, "eval_runtime": 13.7184, "eval_samples_per_second": 7.289, "eval_steps_per_second": 7.289, "eval_token_acc": 0.5054945054945055, "step": 98000 }, { "epoch": 4.198834668608885, "grad_norm": 6.185956001281738, "learning_rate": 6.202314726604658e-06, "loss": 2.3849361419677733, "memory(GiB)": 77.56, "step": 98005, "token_acc": 0.5071907957813998, "train_speed(iter/s)": 1.438086 }, { "epoch": 4.199048883938135, "grad_norm": 7.049850940704346, "learning_rate": 6.199068713278966e-06, "loss": 2.6123046875, "memory(GiB)": 77.56, "step": 98010, "token_acc": 0.4548736462093863, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.199263099267384, "grad_norm": 6.718698024749756, "learning_rate": 6.1958234934482384e-06, "loss": 2.262205123901367, "memory(GiB)": 77.56, "step": 98015, "token_acc": 0.5111111111111111, "train_speed(iter/s)": 1.438081 }, { "epoch": 4.199477314596632, "grad_norm": 6.537539958953857, "learning_rate": 6.192579067171256e-06, "loss": 2.694133758544922, "memory(GiB)": 77.56, "step": 98020, "token_acc": 0.45733788395904434, "train_speed(iter/s)": 1.438085 }, { "epoch": 4.1996915299258815, "grad_norm": 5.617667198181152, "learning_rate": 6.189335434506799e-06, "loss": 2.2067539215087892, "memory(GiB)": 77.56, "step": 98025, "token_acc": 0.5698924731182796, "train_speed(iter/s)": 1.438084 }, { "epoch": 4.199905745255131, "grad_norm": 9.685507774353027, "learning_rate": 6.186092595513615e-06, "loss": 2.10714054107666, "memory(GiB)": 77.56, "step": 98030, "token_acc": 0.5547445255474452, "train_speed(iter/s)": 1.438103 }, { "epoch": 4.200119960584379, "grad_norm": 9.102299690246582, "learning_rate": 6.18285055025048e-06, "loss": 2.2809188842773436, "memory(GiB)": 77.56, "step": 98035, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.438114 }, { "epoch": 4.200334175913628, "grad_norm": 5.823385715484619, "learning_rate": 6.179609298776106e-06, "loss": 2.501713180541992, "memory(GiB)": 77.56, "step": 98040, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.438102 }, { "epoch": 4.200548391242878, "grad_norm": 6.058045864105225, "learning_rate": 6.176368841149211e-06, "loss": 2.2380088806152343, "memory(GiB)": 77.56, "step": 98045, "token_acc": 0.5564516129032258, "train_speed(iter/s)": 1.438108 }, { "epoch": 4.200762606572126, "grad_norm": 6.595065593719482, "learning_rate": 6.17312917742851e-06, "loss": 2.140004348754883, "memory(GiB)": 77.56, "step": 98050, "token_acc": 0.5681818181818182, "train_speed(iter/s)": 1.438093 }, { "epoch": 4.200976821901375, "grad_norm": 7.752303123474121, "learning_rate": 6.169890307672693e-06, "loss": 2.2860483169555663, "memory(GiB)": 77.56, "step": 98055, "token_acc": 0.4826254826254826, "train_speed(iter/s)": 1.438106 }, { "epoch": 4.2011910372306245, "grad_norm": 7.3059844970703125, "learning_rate": 6.166652231940423e-06, "loss": 2.3380111694335937, "memory(GiB)": 77.56, "step": 98060, "token_acc": 0.49836065573770494, "train_speed(iter/s)": 1.438126 }, { "epoch": 4.201405252559873, "grad_norm": 6.263556957244873, "learning_rate": 6.163414950290358e-06, "loss": 2.2195613861083983, "memory(GiB)": 77.56, "step": 98065, "token_acc": 0.5288135593220339, "train_speed(iter/s)": 1.438136 }, { "epoch": 4.201619467889122, "grad_norm": 7.176837921142578, "learning_rate": 6.1601784627811684e-06, "loss": 2.513922691345215, "memory(GiB)": 77.56, "step": 98070, "token_acc": 0.48175182481751827, "train_speed(iter/s)": 1.43815 }, { "epoch": 4.201833683218371, "grad_norm": 5.722093105316162, "learning_rate": 6.1569427694714635e-06, "loss": 2.1636272430419923, "memory(GiB)": 77.56, "step": 98075, "token_acc": 0.513986013986014, "train_speed(iter/s)": 1.438152 }, { "epoch": 4.20204789854762, "grad_norm": 5.712563991546631, "learning_rate": 6.1537078704198726e-06, "loss": 2.490376663208008, "memory(GiB)": 77.56, "step": 98080, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.43815 }, { "epoch": 4.202262113876869, "grad_norm": 6.3566575050354, "learning_rate": 6.150473765684994e-06, "loss": 2.4727546691894533, "memory(GiB)": 77.56, "step": 98085, "token_acc": 0.49175824175824173, "train_speed(iter/s)": 1.438159 }, { "epoch": 4.202476329206118, "grad_norm": 5.163562297821045, "learning_rate": 6.147240455325409e-06, "loss": 2.1134862899780273, "memory(GiB)": 77.56, "step": 98090, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.43818 }, { "epoch": 4.202690544535367, "grad_norm": 7.16831636428833, "learning_rate": 6.14400793939971e-06, "loss": 2.202798843383789, "memory(GiB)": 77.56, "step": 98095, "token_acc": 0.512, "train_speed(iter/s)": 1.438183 }, { "epoch": 4.202904759864616, "grad_norm": 7.538551330566406, "learning_rate": 6.140776217966443e-06, "loss": 2.4330799102783205, "memory(GiB)": 77.56, "step": 98100, "token_acc": 0.49044585987261147, "train_speed(iter/s)": 1.438191 }, { "epoch": 4.203118975193865, "grad_norm": 5.994205951690674, "learning_rate": 6.137545291084162e-06, "loss": 2.0087726593017576, "memory(GiB)": 77.56, "step": 98105, "token_acc": 0.5480769230769231, "train_speed(iter/s)": 1.438214 }, { "epoch": 4.2033331905231135, "grad_norm": 5.978413105010986, "learning_rate": 6.134315158811393e-06, "loss": 2.176012420654297, "memory(GiB)": 77.56, "step": 98110, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.438241 }, { "epoch": 4.203547405852363, "grad_norm": 6.383014678955078, "learning_rate": 6.1310858212066445e-06, "loss": 2.2739749908447267, "memory(GiB)": 77.56, "step": 98115, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.438252 }, { "epoch": 4.203761621181612, "grad_norm": 5.5930633544921875, "learning_rate": 6.127857278328442e-06, "loss": 2.281202697753906, "memory(GiB)": 77.56, "step": 98120, "token_acc": 0.5434083601286174, "train_speed(iter/s)": 1.43827 }, { "epoch": 4.20397583651086, "grad_norm": 5.5748772621154785, "learning_rate": 6.124629530235249e-06, "loss": 2.2819797515869142, "memory(GiB)": 77.56, "step": 98125, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.438267 }, { "epoch": 4.20419005184011, "grad_norm": 5.827408790588379, "learning_rate": 6.121402576985558e-06, "loss": 2.1849925994873045, "memory(GiB)": 77.56, "step": 98130, "token_acc": 0.5181818181818182, "train_speed(iter/s)": 1.438252 }, { "epoch": 4.204404267169359, "grad_norm": 7.5553202629089355, "learning_rate": 6.118176418637822e-06, "loss": 2.213795471191406, "memory(GiB)": 77.56, "step": 98135, "token_acc": 0.5303643724696356, "train_speed(iter/s)": 1.438262 }, { "epoch": 4.204618482498607, "grad_norm": 5.586918354034424, "learning_rate": 6.114951055250484e-06, "loss": 2.4456783294677735, "memory(GiB)": 77.56, "step": 98140, "token_acc": 0.47580645161290325, "train_speed(iter/s)": 1.438272 }, { "epoch": 4.2048326978278565, "grad_norm": 5.2323899269104, "learning_rate": 6.111726486881975e-06, "loss": 2.0326816558837892, "memory(GiB)": 77.56, "step": 98145, "token_acc": 0.5570469798657718, "train_speed(iter/s)": 1.438278 }, { "epoch": 4.205046913157106, "grad_norm": 5.317123889923096, "learning_rate": 6.108502713590702e-06, "loss": 2.2312458038330076, "memory(GiB)": 77.56, "step": 98150, "token_acc": 0.4935483870967742, "train_speed(iter/s)": 1.438282 }, { "epoch": 4.205261128486354, "grad_norm": 6.194159507751465, "learning_rate": 6.105279735435082e-06, "loss": 2.4072303771972656, "memory(GiB)": 77.56, "step": 98155, "token_acc": 0.47115384615384615, "train_speed(iter/s)": 1.438296 }, { "epoch": 4.205475343815603, "grad_norm": 7.651185989379883, "learning_rate": 6.1020575524735005e-06, "loss": 2.192337989807129, "memory(GiB)": 77.56, "step": 98160, "token_acc": 0.5698529411764706, "train_speed(iter/s)": 1.438309 }, { "epoch": 4.205689559144853, "grad_norm": 6.0030717849731445, "learning_rate": 6.098836164764326e-06, "loss": 2.5083805084228517, "memory(GiB)": 77.56, "step": 98165, "token_acc": 0.5068027210884354, "train_speed(iter/s)": 1.438313 }, { "epoch": 4.205903774474101, "grad_norm": 5.139372825622559, "learning_rate": 6.095615572365921e-06, "loss": 2.4234233856201173, "memory(GiB)": 77.56, "step": 98170, "token_acc": 0.5299401197604791, "train_speed(iter/s)": 1.438324 }, { "epoch": 4.20611798980335, "grad_norm": 5.935065269470215, "learning_rate": 6.092395775336607e-06, "loss": 2.074169158935547, "memory(GiB)": 77.56, "step": 98175, "token_acc": 0.5476190476190477, "train_speed(iter/s)": 1.438337 }, { "epoch": 4.2063322051325995, "grad_norm": 6.742197036743164, "learning_rate": 6.089176773734745e-06, "loss": 2.3369895935058596, "memory(GiB)": 77.56, "step": 98180, "token_acc": 0.5152542372881356, "train_speed(iter/s)": 1.438345 }, { "epoch": 4.206546420461848, "grad_norm": 7.1738457679748535, "learning_rate": 6.085958567618638e-06, "loss": 2.0658105850219726, "memory(GiB)": 77.56, "step": 98185, "token_acc": 0.50199203187251, "train_speed(iter/s)": 1.438354 }, { "epoch": 4.206760635791097, "grad_norm": 6.9448652267456055, "learning_rate": 6.08274115704659e-06, "loss": 2.035039520263672, "memory(GiB)": 77.56, "step": 98190, "token_acc": 0.5287356321839081, "train_speed(iter/s)": 1.438354 }, { "epoch": 4.206974851120346, "grad_norm": 5.532359600067139, "learning_rate": 6.07952454207687e-06, "loss": 2.4549732208251953, "memory(GiB)": 77.56, "step": 98195, "token_acc": 0.5297297297297298, "train_speed(iter/s)": 1.438353 }, { "epoch": 4.207189066449595, "grad_norm": 8.086493492126465, "learning_rate": 6.076308722767776e-06, "loss": 2.3581161499023438, "memory(GiB)": 77.56, "step": 98200, "token_acc": 0.5197132616487455, "train_speed(iter/s)": 1.438359 }, { "epoch": 4.207403281778844, "grad_norm": 7.090773582458496, "learning_rate": 6.073093699177551e-06, "loss": 2.1348033905029298, "memory(GiB)": 77.56, "step": 98205, "token_acc": 0.5469798657718121, "train_speed(iter/s)": 1.438346 }, { "epoch": 4.207617497108093, "grad_norm": 6.109488487243652, "learning_rate": 6.069879471364426e-06, "loss": 2.171565818786621, "memory(GiB)": 77.56, "step": 98210, "token_acc": 0.583969465648855, "train_speed(iter/s)": 1.43834 }, { "epoch": 4.207831712437342, "grad_norm": 5.301544189453125, "learning_rate": 6.0666660393866596e-06, "loss": 2.2061752319335937, "memory(GiB)": 77.56, "step": 98215, "token_acc": 0.5, "train_speed(iter/s)": 1.438338 }, { "epoch": 4.208045927766591, "grad_norm": 4.8507795333862305, "learning_rate": 6.063453403302449e-06, "loss": 2.079071044921875, "memory(GiB)": 77.56, "step": 98220, "token_acc": 0.5472972972972973, "train_speed(iter/s)": 1.438344 }, { "epoch": 4.20826014309584, "grad_norm": 4.961392879486084, "learning_rate": 6.06024156317e-06, "loss": 2.2174943923950194, "memory(GiB)": 77.56, "step": 98225, "token_acc": 0.5168918918918919, "train_speed(iter/s)": 1.438343 }, { "epoch": 4.2084743584250885, "grad_norm": 5.052983283996582, "learning_rate": 6.057030519047491e-06, "loss": 2.2127986907958985, "memory(GiB)": 77.56, "step": 98230, "token_acc": 0.5046439628482973, "train_speed(iter/s)": 1.438354 }, { "epoch": 4.208688573754338, "grad_norm": 5.836075305938721, "learning_rate": 6.053820270993082e-06, "loss": 2.2800180435180666, "memory(GiB)": 77.56, "step": 98235, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.438351 }, { "epoch": 4.208902789083587, "grad_norm": 6.770490646362305, "learning_rate": 6.050610819064961e-06, "loss": 2.3001054763793944, "memory(GiB)": 77.56, "step": 98240, "token_acc": 0.5148936170212766, "train_speed(iter/s)": 1.438354 }, { "epoch": 4.209117004412835, "grad_norm": 8.350386619567871, "learning_rate": 6.047402163321248e-06, "loss": 2.344517135620117, "memory(GiB)": 77.56, "step": 98245, "token_acc": 0.4746376811594203, "train_speed(iter/s)": 1.438367 }, { "epoch": 4.209331219742085, "grad_norm": 6.6530985832214355, "learning_rate": 6.044194303820078e-06, "loss": 2.306108856201172, "memory(GiB)": 77.56, "step": 98250, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.438371 }, { "epoch": 4.209545435071334, "grad_norm": 7.232362747192383, "learning_rate": 6.040987240619561e-06, "loss": 2.0119325637817385, "memory(GiB)": 77.56, "step": 98255, "token_acc": 0.5643153526970954, "train_speed(iter/s)": 1.438369 }, { "epoch": 4.209759650400582, "grad_norm": 5.363946437835693, "learning_rate": 6.037780973777785e-06, "loss": 2.3746849060058595, "memory(GiB)": 77.56, "step": 98260, "token_acc": 0.5, "train_speed(iter/s)": 1.438365 }, { "epoch": 4.209973865729832, "grad_norm": 5.251760959625244, "learning_rate": 6.0345755033528514e-06, "loss": 1.975484848022461, "memory(GiB)": 77.56, "step": 98265, "token_acc": 0.6, "train_speed(iter/s)": 1.438374 }, { "epoch": 4.210188081059081, "grad_norm": 7.701329708099365, "learning_rate": 6.031370829402832e-06, "loss": 2.2395561218261717, "memory(GiB)": 77.56, "step": 98270, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 1.438373 }, { "epoch": 4.210402296388329, "grad_norm": 5.9888691902160645, "learning_rate": 6.0281669519857755e-06, "loss": 2.4060972213745115, "memory(GiB)": 77.56, "step": 98275, "token_acc": 0.52, "train_speed(iter/s)": 1.438378 }, { "epoch": 4.2106165117175784, "grad_norm": 5.402688980102539, "learning_rate": 6.024963871159722e-06, "loss": 2.3136241912841795, "memory(GiB)": 77.56, "step": 98280, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.438394 }, { "epoch": 4.210830727046828, "grad_norm": 7.225913047790527, "learning_rate": 6.021761586982705e-06, "loss": 2.4117456436157227, "memory(GiB)": 77.56, "step": 98285, "token_acc": 0.45627376425855515, "train_speed(iter/s)": 1.438403 }, { "epoch": 4.211044942376076, "grad_norm": 8.282779693603516, "learning_rate": 6.018560099512732e-06, "loss": 2.504245376586914, "memory(GiB)": 77.56, "step": 98290, "token_acc": 0.4865771812080537, "train_speed(iter/s)": 1.438405 }, { "epoch": 4.211259157705325, "grad_norm": 5.624424934387207, "learning_rate": 6.015359408807786e-06, "loss": 2.26424560546875, "memory(GiB)": 77.56, "step": 98295, "token_acc": 0.5101351351351351, "train_speed(iter/s)": 1.438421 }, { "epoch": 4.211473373034575, "grad_norm": 5.580746650695801, "learning_rate": 6.012159514925875e-06, "loss": 2.319450569152832, "memory(GiB)": 77.56, "step": 98300, "token_acc": 0.5431309904153354, "train_speed(iter/s)": 1.438429 }, { "epoch": 4.211687588363823, "grad_norm": 8.379209518432617, "learning_rate": 6.008960417924964e-06, "loss": 2.3237228393554688, "memory(GiB)": 77.56, "step": 98305, "token_acc": 0.466403162055336, "train_speed(iter/s)": 1.438434 }, { "epoch": 4.211901803693072, "grad_norm": 6.290983200073242, "learning_rate": 6.005762117862995e-06, "loss": 2.3621248245239257, "memory(GiB)": 77.56, "step": 98310, "token_acc": 0.4982078853046595, "train_speed(iter/s)": 1.438429 }, { "epoch": 4.2121160190223215, "grad_norm": 7.093401908874512, "learning_rate": 6.002564614797923e-06, "loss": 2.3215509414672852, "memory(GiB)": 77.56, "step": 98315, "token_acc": 0.483739837398374, "train_speed(iter/s)": 1.438416 }, { "epoch": 4.21233023435157, "grad_norm": 8.208555221557617, "learning_rate": 5.999367908787651e-06, "loss": 2.142250633239746, "memory(GiB)": 77.56, "step": 98320, "token_acc": 0.5368421052631579, "train_speed(iter/s)": 1.438417 }, { "epoch": 4.212544449680819, "grad_norm": 5.318323135375977, "learning_rate": 5.996171999890116e-06, "loss": 2.1883272171020507, "memory(GiB)": 77.56, "step": 98325, "token_acc": 0.55, "train_speed(iter/s)": 1.438418 }, { "epoch": 4.212758665010068, "grad_norm": 7.310027599334717, "learning_rate": 5.992976888163204e-06, "loss": 1.9704902648925782, "memory(GiB)": 77.56, "step": 98330, "token_acc": 0.6059322033898306, "train_speed(iter/s)": 1.438433 }, { "epoch": 4.212972880339317, "grad_norm": 6.041313171386719, "learning_rate": 5.989782573664798e-06, "loss": 2.436429977416992, "memory(GiB)": 77.56, "step": 98335, "token_acc": 0.47416413373860183, "train_speed(iter/s)": 1.438408 }, { "epoch": 4.213187095668566, "grad_norm": 5.936883926391602, "learning_rate": 5.9865890564527515e-06, "loss": 2.4405813217163086, "memory(GiB)": 77.56, "step": 98340, "token_acc": 0.5053003533568905, "train_speed(iter/s)": 1.4384 }, { "epoch": 4.213401310997815, "grad_norm": 6.204477787017822, "learning_rate": 5.983396336584945e-06, "loss": 2.279122734069824, "memory(GiB)": 77.56, "step": 98345, "token_acc": 0.5, "train_speed(iter/s)": 1.4384 }, { "epoch": 4.213615526327064, "grad_norm": 9.50565242767334, "learning_rate": 5.9802044141192056e-06, "loss": 2.4103214263916017, "memory(GiB)": 77.56, "step": 98350, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.438393 }, { "epoch": 4.213829741656313, "grad_norm": 6.365574359893799, "learning_rate": 5.977013289113348e-06, "loss": 2.070890426635742, "memory(GiB)": 77.56, "step": 98355, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.438387 }, { "epoch": 4.214043956985562, "grad_norm": 5.899397850036621, "learning_rate": 5.973822961625203e-06, "loss": 2.256831932067871, "memory(GiB)": 77.56, "step": 98360, "token_acc": 0.4742647058823529, "train_speed(iter/s)": 1.438397 }, { "epoch": 4.2142581723148105, "grad_norm": 7.603686809539795, "learning_rate": 5.970633431712552e-06, "loss": 2.135301971435547, "memory(GiB)": 77.56, "step": 98365, "token_acc": 0.5206611570247934, "train_speed(iter/s)": 1.438404 }, { "epoch": 4.21447238764406, "grad_norm": 7.04205846786499, "learning_rate": 5.967444699433183e-06, "loss": 2.2547868728637694, "memory(GiB)": 77.56, "step": 98370, "token_acc": 0.549618320610687, "train_speed(iter/s)": 1.438402 }, { "epoch": 4.214686602973309, "grad_norm": 9.290288925170898, "learning_rate": 5.964256764844855e-06, "loss": 2.0210039138793947, "memory(GiB)": 77.56, "step": 98375, "token_acc": 0.5750798722044729, "train_speed(iter/s)": 1.438395 }, { "epoch": 4.214900818302557, "grad_norm": 6.7043046951293945, "learning_rate": 5.961069628005317e-06, "loss": 2.5747379302978515, "memory(GiB)": 77.56, "step": 98380, "token_acc": 0.46078431372549017, "train_speed(iter/s)": 1.438374 }, { "epoch": 4.215115033631807, "grad_norm": 7.077681064605713, "learning_rate": 5.9578832889723215e-06, "loss": 2.150142860412598, "memory(GiB)": 77.56, "step": 98385, "token_acc": 0.5304054054054054, "train_speed(iter/s)": 1.438376 }, { "epoch": 4.215329248961056, "grad_norm": 10.056730270385742, "learning_rate": 5.954697747803584e-06, "loss": 2.591453742980957, "memory(GiB)": 77.56, "step": 98390, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 1.438368 }, { "epoch": 4.215543464290304, "grad_norm": 8.434317588806152, "learning_rate": 5.951513004556819e-06, "loss": 2.3313249588012694, "memory(GiB)": 77.56, "step": 98395, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.438365 }, { "epoch": 4.2157576796195535, "grad_norm": 6.500161170959473, "learning_rate": 5.948329059289715e-06, "loss": 2.2964502334594727, "memory(GiB)": 77.56, "step": 98400, "token_acc": 0.5193798449612403, "train_speed(iter/s)": 1.438383 }, { "epoch": 4.215971894948803, "grad_norm": 5.685385227203369, "learning_rate": 5.945145912059946e-06, "loss": 2.4756072998046874, "memory(GiB)": 77.56, "step": 98405, "token_acc": 0.4525316455696203, "train_speed(iter/s)": 1.438395 }, { "epoch": 4.216186110278051, "grad_norm": 4.890557765960693, "learning_rate": 5.94196356292519e-06, "loss": 2.0542274475097657, "memory(GiB)": 77.56, "step": 98410, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 1.438416 }, { "epoch": 4.2164003256073, "grad_norm": 6.49935245513916, "learning_rate": 5.938782011943089e-06, "loss": 2.075295066833496, "memory(GiB)": 77.56, "step": 98415, "token_acc": 0.5219123505976095, "train_speed(iter/s)": 1.43843 }, { "epoch": 4.21661454093655, "grad_norm": 7.217024803161621, "learning_rate": 5.935601259171292e-06, "loss": 2.615322303771973, "memory(GiB)": 77.56, "step": 98420, "token_acc": 0.4297082228116711, "train_speed(iter/s)": 1.438441 }, { "epoch": 4.216828756265798, "grad_norm": 5.824163913726807, "learning_rate": 5.932421304667418e-06, "loss": 2.466215896606445, "memory(GiB)": 77.56, "step": 98425, "token_acc": 0.459546925566343, "train_speed(iter/s)": 1.438423 }, { "epoch": 4.217042971595047, "grad_norm": 8.371705055236816, "learning_rate": 5.9292421484890674e-06, "loss": 2.3368967056274412, "memory(GiB)": 77.56, "step": 98430, "token_acc": 0.49404761904761907, "train_speed(iter/s)": 1.438412 }, { "epoch": 4.2172571869242965, "grad_norm": 5.895653247833252, "learning_rate": 5.926063790693837e-06, "loss": 2.2445981979370115, "memory(GiB)": 77.56, "step": 98435, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.438424 }, { "epoch": 4.217471402253545, "grad_norm": 8.02182388305664, "learning_rate": 5.922886231339297e-06, "loss": 1.728099250793457, "memory(GiB)": 77.56, "step": 98440, "token_acc": 0.6163265306122448, "train_speed(iter/s)": 1.438419 }, { "epoch": 4.217685617582794, "grad_norm": 5.7319135665893555, "learning_rate": 5.919709470483032e-06, "loss": 2.115361785888672, "memory(GiB)": 77.56, "step": 98445, "token_acc": 0.5150602409638554, "train_speed(iter/s)": 1.438422 }, { "epoch": 4.217899832912043, "grad_norm": 5.47056245803833, "learning_rate": 5.9165335081825754e-06, "loss": 2.3248260498046873, "memory(GiB)": 77.56, "step": 98450, "token_acc": 0.5045317220543807, "train_speed(iter/s)": 1.438393 }, { "epoch": 4.218114048241292, "grad_norm": 5.632694721221924, "learning_rate": 5.913358344495473e-06, "loss": 2.342509460449219, "memory(GiB)": 77.56, "step": 98455, "token_acc": 0.5056818181818182, "train_speed(iter/s)": 1.438398 }, { "epoch": 4.218328263570541, "grad_norm": 5.1728129386901855, "learning_rate": 5.910183979479239e-06, "loss": 1.9806270599365234, "memory(GiB)": 77.56, "step": 98460, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438412 }, { "epoch": 4.21854247889979, "grad_norm": 7.188817501068115, "learning_rate": 5.907010413191372e-06, "loss": 2.3121814727783203, "memory(GiB)": 77.56, "step": 98465, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 1.43842 }, { "epoch": 4.218756694229039, "grad_norm": 6.093756675720215, "learning_rate": 5.903837645689381e-06, "loss": 2.184810447692871, "memory(GiB)": 77.56, "step": 98470, "token_acc": 0.4714285714285714, "train_speed(iter/s)": 1.438432 }, { "epoch": 4.218970909558288, "grad_norm": 5.1576337814331055, "learning_rate": 5.900665677030742e-06, "loss": 2.134235954284668, "memory(GiB)": 77.56, "step": 98475, "token_acc": 0.5548780487804879, "train_speed(iter/s)": 1.438437 }, { "epoch": 4.219185124887537, "grad_norm": 4.905875205993652, "learning_rate": 5.89749450727291e-06, "loss": 2.8184064865112304, "memory(GiB)": 77.56, "step": 98480, "token_acc": 0.44932432432432434, "train_speed(iter/s)": 1.438442 }, { "epoch": 4.2193993402167855, "grad_norm": 8.835637092590332, "learning_rate": 5.894324136473334e-06, "loss": 1.9978958129882813, "memory(GiB)": 77.56, "step": 98485, "token_acc": 0.5540983606557377, "train_speed(iter/s)": 1.438436 }, { "epoch": 4.219613555546035, "grad_norm": 7.472249984741211, "learning_rate": 5.891154564689444e-06, "loss": 2.3475620269775392, "memory(GiB)": 77.56, "step": 98490, "token_acc": 0.48172757475083056, "train_speed(iter/s)": 1.438432 }, { "epoch": 4.219827770875284, "grad_norm": 4.666188716888428, "learning_rate": 5.887985791978673e-06, "loss": 1.9814159393310546, "memory(GiB)": 77.56, "step": 98495, "token_acc": 0.5874587458745875, "train_speed(iter/s)": 1.438432 }, { "epoch": 4.220041986204532, "grad_norm": 9.00847053527832, "learning_rate": 5.884817818398414e-06, "loss": 2.3205339431762697, "memory(GiB)": 77.56, "step": 98500, "token_acc": 0.4868913857677903, "train_speed(iter/s)": 1.438444 }, { "epoch": 4.220041986204532, "eval_loss": 2.128352403640747, "eval_runtime": 14.862, "eval_samples_per_second": 6.729, "eval_steps_per_second": 6.729, "eval_token_acc": 0.47406340057636887, "step": 98500 }, { "epoch": 4.220256201533782, "grad_norm": 5.496734142303467, "learning_rate": 5.881650644006071e-06, "loss": 2.1217145919799805, "memory(GiB)": 77.56, "step": 98505, "token_acc": 0.5024485798237023, "train_speed(iter/s)": 1.438106 }, { "epoch": 4.220470416863031, "grad_norm": 5.40130090713501, "learning_rate": 5.878484268859013e-06, "loss": 2.4276966094970702, "memory(GiB)": 77.56, "step": 98510, "token_acc": 0.48493975903614456, "train_speed(iter/s)": 1.438105 }, { "epoch": 4.220684632192279, "grad_norm": 6.0814032554626465, "learning_rate": 5.875318693014603e-06, "loss": 2.6082441329956056, "memory(GiB)": 77.56, "step": 98515, "token_acc": 0.4849624060150376, "train_speed(iter/s)": 1.438112 }, { "epoch": 4.2208988475215286, "grad_norm": 5.476480484008789, "learning_rate": 5.872153916530187e-06, "loss": 2.203523635864258, "memory(GiB)": 77.56, "step": 98520, "token_acc": 0.5236220472440944, "train_speed(iter/s)": 1.438125 }, { "epoch": 4.221113062850778, "grad_norm": 6.536108493804932, "learning_rate": 5.868989939463087e-06, "loss": 1.9606212615966796, "memory(GiB)": 77.56, "step": 98525, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 1.438128 }, { "epoch": 4.221327278180026, "grad_norm": 9.845624923706055, "learning_rate": 5.865826761870646e-06, "loss": 2.414236068725586, "memory(GiB)": 77.56, "step": 98530, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 1.43812 }, { "epoch": 4.221541493509275, "grad_norm": 5.5067620277404785, "learning_rate": 5.862664383810151e-06, "loss": 2.319127655029297, "memory(GiB)": 77.56, "step": 98535, "token_acc": 0.5364238410596026, "train_speed(iter/s)": 1.438133 }, { "epoch": 4.221755708838525, "grad_norm": 8.395221710205078, "learning_rate": 5.859502805338896e-06, "loss": 2.334292984008789, "memory(GiB)": 77.56, "step": 98540, "token_acc": 0.5358649789029536, "train_speed(iter/s)": 1.438138 }, { "epoch": 4.221969924167773, "grad_norm": 9.597164154052734, "learning_rate": 5.856342026514156e-06, "loss": 2.358758544921875, "memory(GiB)": 77.56, "step": 98545, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.438142 }, { "epoch": 4.222184139497022, "grad_norm": 5.924988746643066, "learning_rate": 5.85318204739318e-06, "loss": 2.1829099655151367, "memory(GiB)": 77.56, "step": 98550, "token_acc": 0.5510204081632653, "train_speed(iter/s)": 1.438136 }, { "epoch": 4.222398354826272, "grad_norm": 6.551088333129883, "learning_rate": 5.850022868033234e-06, "loss": 2.112716293334961, "memory(GiB)": 77.56, "step": 98555, "token_acc": 0.5375494071146245, "train_speed(iter/s)": 1.438142 }, { "epoch": 4.22261257015552, "grad_norm": 6.376165866851807, "learning_rate": 5.846864488491527e-06, "loss": 2.368951606750488, "memory(GiB)": 77.56, "step": 98560, "token_acc": 0.5100671140939598, "train_speed(iter/s)": 1.438148 }, { "epoch": 4.222826785484769, "grad_norm": 5.307037353515625, "learning_rate": 5.843706908825303e-06, "loss": 2.016937255859375, "memory(GiB)": 77.56, "step": 98565, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 1.438143 }, { "epoch": 4.2230410008140185, "grad_norm": 6.171824932098389, "learning_rate": 5.840550129091743e-06, "loss": 2.499873924255371, "memory(GiB)": 77.56, "step": 98570, "token_acc": 0.502906976744186, "train_speed(iter/s)": 1.43813 }, { "epoch": 4.223255216143267, "grad_norm": 6.762292861938477, "learning_rate": 5.837394149348052e-06, "loss": 2.5716716766357424, "memory(GiB)": 77.56, "step": 98575, "token_acc": 0.4393939393939394, "train_speed(iter/s)": 1.438143 }, { "epoch": 4.223469431472516, "grad_norm": 5.51644229888916, "learning_rate": 5.83423896965139e-06, "loss": 2.445097732543945, "memory(GiB)": 77.56, "step": 98580, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 1.438148 }, { "epoch": 4.223683646801765, "grad_norm": 6.734059810638428, "learning_rate": 5.831084590058905e-06, "loss": 2.457259178161621, "memory(GiB)": 77.56, "step": 98585, "token_acc": 0.48255813953488375, "train_speed(iter/s)": 1.438157 }, { "epoch": 4.223897862131014, "grad_norm": 5.5849103927612305, "learning_rate": 5.827931010627774e-06, "loss": 2.0584590911865233, "memory(GiB)": 77.56, "step": 98590, "token_acc": 0.5528169014084507, "train_speed(iter/s)": 1.438164 }, { "epoch": 4.224112077460263, "grad_norm": 6.087093830108643, "learning_rate": 5.824778231415106e-06, "loss": 2.3791933059692383, "memory(GiB)": 77.56, "step": 98595, "token_acc": 0.4682274247491639, "train_speed(iter/s)": 1.438166 }, { "epoch": 4.224326292789512, "grad_norm": 5.346545696258545, "learning_rate": 5.821626252478018e-06, "loss": 2.2676155090332033, "memory(GiB)": 77.56, "step": 98600, "token_acc": 0.4566929133858268, "train_speed(iter/s)": 1.438172 }, { "epoch": 4.224540508118761, "grad_norm": 6.784097194671631, "learning_rate": 5.818475073873614e-06, "loss": 2.5720359802246096, "memory(GiB)": 77.56, "step": 98605, "token_acc": 0.45, "train_speed(iter/s)": 1.438171 }, { "epoch": 4.22475472344801, "grad_norm": 6.069586753845215, "learning_rate": 5.815324695658975e-06, "loss": 2.101156806945801, "memory(GiB)": 77.56, "step": 98610, "token_acc": 0.5467128027681661, "train_speed(iter/s)": 1.438168 }, { "epoch": 4.224968938777259, "grad_norm": 6.479527950286865, "learning_rate": 5.812175117891183e-06, "loss": 2.158317756652832, "memory(GiB)": 77.56, "step": 98615, "token_acc": 0.5425101214574899, "train_speed(iter/s)": 1.438174 }, { "epoch": 4.2251831541065075, "grad_norm": 5.19526481628418, "learning_rate": 5.809026340627288e-06, "loss": 2.595577621459961, "memory(GiB)": 77.56, "step": 98620, "token_acc": 0.4509090909090909, "train_speed(iter/s)": 1.438181 }, { "epoch": 4.225397369435757, "grad_norm": 6.5625481605529785, "learning_rate": 5.805878363924338e-06, "loss": 2.572738456726074, "memory(GiB)": 77.56, "step": 98625, "token_acc": 0.4875, "train_speed(iter/s)": 1.438194 }, { "epoch": 4.225611584765006, "grad_norm": 7.128811359405518, "learning_rate": 5.802731187839361e-06, "loss": 2.055022430419922, "memory(GiB)": 77.56, "step": 98630, "token_acc": 0.5895522388059702, "train_speed(iter/s)": 1.438193 }, { "epoch": 4.225825800094254, "grad_norm": 7.126205921173096, "learning_rate": 5.799584812429354e-06, "loss": 2.409199523925781, "memory(GiB)": 77.56, "step": 98635, "token_acc": 0.46, "train_speed(iter/s)": 1.438175 }, { "epoch": 4.226040015423504, "grad_norm": 7.448665618896484, "learning_rate": 5.7964392377513445e-06, "loss": 2.297069549560547, "memory(GiB)": 77.56, "step": 98640, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 1.438181 }, { "epoch": 4.226254230752753, "grad_norm": 6.745507717132568, "learning_rate": 5.793294463862297e-06, "loss": 2.4201173782348633, "memory(GiB)": 77.56, "step": 98645, "token_acc": 0.5272108843537415, "train_speed(iter/s)": 1.438179 }, { "epoch": 4.226468446082001, "grad_norm": 7.347485542297363, "learning_rate": 5.790150490819196e-06, "loss": 2.2076059341430665, "memory(GiB)": 77.56, "step": 98650, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 1.438188 }, { "epoch": 4.2266826614112505, "grad_norm": 8.259467124938965, "learning_rate": 5.787007318678994e-06, "loss": 2.267050552368164, "memory(GiB)": 77.56, "step": 98655, "token_acc": 0.5435540069686411, "train_speed(iter/s)": 1.438186 }, { "epoch": 4.2268968767405, "grad_norm": 5.506394386291504, "learning_rate": 5.7838649474986235e-06, "loss": 2.2759540557861326, "memory(GiB)": 77.56, "step": 98660, "token_acc": 0.5097402597402597, "train_speed(iter/s)": 1.438177 }, { "epoch": 4.227111092069748, "grad_norm": 6.584667205810547, "learning_rate": 5.7807233773350235e-06, "loss": 2.429344367980957, "memory(GiB)": 77.56, "step": 98665, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 1.438176 }, { "epoch": 4.227325307398997, "grad_norm": 6.250676155090332, "learning_rate": 5.7775826082450915e-06, "loss": 2.323969268798828, "memory(GiB)": 77.56, "step": 98670, "token_acc": 0.49375, "train_speed(iter/s)": 1.438181 }, { "epoch": 4.227539522728247, "grad_norm": 5.680970191955566, "learning_rate": 5.774442640285738e-06, "loss": 1.863088035583496, "memory(GiB)": 77.56, "step": 98675, "token_acc": 0.532871972318339, "train_speed(iter/s)": 1.438194 }, { "epoch": 4.227753738057495, "grad_norm": 5.946530818939209, "learning_rate": 5.77130347351385e-06, "loss": 1.9548000335693358, "memory(GiB)": 77.56, "step": 98680, "token_acc": 0.5838150289017341, "train_speed(iter/s)": 1.438184 }, { "epoch": 4.227967953386744, "grad_norm": 6.0132155418396, "learning_rate": 5.768165107986284e-06, "loss": 2.1032190322875977, "memory(GiB)": 77.56, "step": 98685, "token_acc": 0.47692307692307695, "train_speed(iter/s)": 1.438202 }, { "epoch": 4.2281821687159935, "grad_norm": 6.270233631134033, "learning_rate": 5.765027543759904e-06, "loss": 2.336669158935547, "memory(GiB)": 77.56, "step": 98690, "token_acc": 0.46204620462046203, "train_speed(iter/s)": 1.438204 }, { "epoch": 4.228396384045242, "grad_norm": 6.963911533355713, "learning_rate": 5.7618907808915325e-06, "loss": 2.1510234832763673, "memory(GiB)": 77.56, "step": 98695, "token_acc": 0.5376344086021505, "train_speed(iter/s)": 1.438219 }, { "epoch": 4.228610599374491, "grad_norm": 6.034351348876953, "learning_rate": 5.758754819438017e-06, "loss": 2.3063426971435548, "memory(GiB)": 77.56, "step": 98700, "token_acc": 0.5249169435215947, "train_speed(iter/s)": 1.438227 }, { "epoch": 4.22882481470374, "grad_norm": 6.990081787109375, "learning_rate": 5.755619659456163e-06, "loss": 2.3793033599853515, "memory(GiB)": 77.56, "step": 98705, "token_acc": 0.47214076246334313, "train_speed(iter/s)": 1.438243 }, { "epoch": 4.229039030032989, "grad_norm": 5.555932998657227, "learning_rate": 5.752485301002752e-06, "loss": 2.240178871154785, "memory(GiB)": 77.56, "step": 98710, "token_acc": 0.4793103448275862, "train_speed(iter/s)": 1.438229 }, { "epoch": 4.229253245362238, "grad_norm": 6.410317897796631, "learning_rate": 5.74935174413459e-06, "loss": 2.4478342056274416, "memory(GiB)": 77.56, "step": 98715, "token_acc": 0.5328947368421053, "train_speed(iter/s)": 1.438243 }, { "epoch": 4.229467460691487, "grad_norm": 5.335734844207764, "learning_rate": 5.746218988908431e-06, "loss": 2.292744255065918, "memory(GiB)": 77.56, "step": 98720, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 1.438243 }, { "epoch": 4.229681676020736, "grad_norm": 7.612647533416748, "learning_rate": 5.743087035381028e-06, "loss": 2.281413459777832, "memory(GiB)": 77.56, "step": 98725, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.438246 }, { "epoch": 4.229895891349985, "grad_norm": 6.659115314483643, "learning_rate": 5.739955883609105e-06, "loss": 2.3006193161010744, "memory(GiB)": 77.56, "step": 98730, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.438251 }, { "epoch": 4.230110106679234, "grad_norm": 7.927532196044922, "learning_rate": 5.736825533649415e-06, "loss": 2.4834529876708986, "memory(GiB)": 77.56, "step": 98735, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 1.438254 }, { "epoch": 4.2303243220084825, "grad_norm": 5.332328796386719, "learning_rate": 5.733695985558651e-06, "loss": 2.2541717529296874, "memory(GiB)": 77.56, "step": 98740, "token_acc": 0.5382262996941896, "train_speed(iter/s)": 1.43824 }, { "epoch": 4.230538537337732, "grad_norm": 9.012012481689453, "learning_rate": 5.730567239393514e-06, "loss": 2.1209537506103517, "memory(GiB)": 77.56, "step": 98745, "token_acc": 0.5467625899280576, "train_speed(iter/s)": 1.438246 }, { "epoch": 4.230752752666981, "grad_norm": 5.358112812042236, "learning_rate": 5.7274392952106735e-06, "loss": 2.108470344543457, "memory(GiB)": 77.56, "step": 98750, "token_acc": 0.5492957746478874, "train_speed(iter/s)": 1.438253 }, { "epoch": 4.230966967996229, "grad_norm": 6.052133560180664, "learning_rate": 5.724312153066791e-06, "loss": 2.447452926635742, "memory(GiB)": 77.56, "step": 98755, "token_acc": 0.5016722408026756, "train_speed(iter/s)": 1.438258 }, { "epoch": 4.231181183325479, "grad_norm": 7.421567916870117, "learning_rate": 5.721185813018543e-06, "loss": 2.2603517532348634, "memory(GiB)": 77.56, "step": 98760, "token_acc": 0.5229357798165137, "train_speed(iter/s)": 1.438257 }, { "epoch": 4.231395398654728, "grad_norm": 5.925826549530029, "learning_rate": 5.71806027512255e-06, "loss": 2.478693389892578, "memory(GiB)": 77.56, "step": 98765, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 1.438266 }, { "epoch": 4.231609613983976, "grad_norm": 5.559430122375488, "learning_rate": 5.714935539435429e-06, "loss": 2.3873729705810547, "memory(GiB)": 77.56, "step": 98770, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 1.438274 }, { "epoch": 4.2318238293132255, "grad_norm": 5.392225742340088, "learning_rate": 5.711811606013801e-06, "loss": 1.9760448455810546, "memory(GiB)": 77.56, "step": 98775, "token_acc": 0.5381818181818182, "train_speed(iter/s)": 1.438269 }, { "epoch": 4.232038044642475, "grad_norm": 8.049981117248535, "learning_rate": 5.708688474914237e-06, "loss": 2.337434768676758, "memory(GiB)": 77.56, "step": 98780, "token_acc": 0.504424778761062, "train_speed(iter/s)": 1.43827 }, { "epoch": 4.232252259971723, "grad_norm": 5.579767227172852, "learning_rate": 5.705566146193342e-06, "loss": 2.1649669647216796, "memory(GiB)": 77.56, "step": 98785, "token_acc": 0.550561797752809, "train_speed(iter/s)": 1.438289 }, { "epoch": 4.232466475300972, "grad_norm": 6.73727560043335, "learning_rate": 5.702444619907654e-06, "loss": 2.6488134384155275, "memory(GiB)": 77.56, "step": 98790, "token_acc": 0.4542124542124542, "train_speed(iter/s)": 1.438296 }, { "epoch": 4.232680690630222, "grad_norm": 8.497204780578613, "learning_rate": 5.699323896113751e-06, "loss": 2.3096485137939453, "memory(GiB)": 77.56, "step": 98795, "token_acc": 0.5138339920948617, "train_speed(iter/s)": 1.438298 }, { "epoch": 4.23289490595947, "grad_norm": 6.431375026702881, "learning_rate": 5.696203974868147e-06, "loss": 2.2419174194335936, "memory(GiB)": 77.56, "step": 98800, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 1.438297 }, { "epoch": 4.233109121288719, "grad_norm": 6.082571983337402, "learning_rate": 5.6930848562273766e-06, "loss": 2.203379821777344, "memory(GiB)": 77.56, "step": 98805, "token_acc": 0.5638629283489096, "train_speed(iter/s)": 1.438305 }, { "epoch": 4.233323336617969, "grad_norm": 5.070413112640381, "learning_rate": 5.6899665402479315e-06, "loss": 2.2239059448242187, "memory(GiB)": 77.56, "step": 98810, "token_acc": 0.512, "train_speed(iter/s)": 1.438312 }, { "epoch": 4.233537551947217, "grad_norm": 7.9468536376953125, "learning_rate": 5.686849026986296e-06, "loss": 2.5571640014648436, "memory(GiB)": 77.56, "step": 98815, "token_acc": 0.49850746268656715, "train_speed(iter/s)": 1.438307 }, { "epoch": 4.233751767276466, "grad_norm": 7.240723133087158, "learning_rate": 5.683732316498974e-06, "loss": 2.2770036697387694, "memory(GiB)": 77.56, "step": 98820, "token_acc": 0.5231316725978647, "train_speed(iter/s)": 1.438304 }, { "epoch": 4.233965982605715, "grad_norm": 6.013421535491943, "learning_rate": 5.6806164088424154e-06, "loss": 2.408305549621582, "memory(GiB)": 77.56, "step": 98825, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438295 }, { "epoch": 4.234180197934964, "grad_norm": 8.179483413696289, "learning_rate": 5.677501304073063e-06, "loss": 2.2367530822753907, "memory(GiB)": 77.56, "step": 98830, "token_acc": 0.5178571428571429, "train_speed(iter/s)": 1.438265 }, { "epoch": 4.234394413264213, "grad_norm": 5.203120231628418, "learning_rate": 5.67438700224735e-06, "loss": 2.2220144271850586, "memory(GiB)": 77.56, "step": 98835, "token_acc": 0.5289256198347108, "train_speed(iter/s)": 1.438288 }, { "epoch": 4.234608628593462, "grad_norm": 7.3809919357299805, "learning_rate": 5.671273503421692e-06, "loss": 2.2949779510498045, "memory(GiB)": 77.56, "step": 98840, "token_acc": 0.5337837837837838, "train_speed(iter/s)": 1.438313 }, { "epoch": 4.234822843922711, "grad_norm": 5.515556335449219, "learning_rate": 5.668160807652506e-06, "loss": 2.5175912857055662, "memory(GiB)": 77.56, "step": 98845, "token_acc": 0.46107784431137727, "train_speed(iter/s)": 1.438317 }, { "epoch": 4.23503705925196, "grad_norm": 6.509804725646973, "learning_rate": 5.6650489149961685e-06, "loss": 2.190166473388672, "memory(GiB)": 77.56, "step": 98850, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 1.438328 }, { "epoch": 4.235251274581209, "grad_norm": 8.552698135375977, "learning_rate": 5.661937825509067e-06, "loss": 2.0742788314819336, "memory(GiB)": 77.56, "step": 98855, "token_acc": 0.5591836734693878, "train_speed(iter/s)": 1.438326 }, { "epoch": 4.235465489910458, "grad_norm": 5.004457950592041, "learning_rate": 5.658827539247541e-06, "loss": 2.5703901290893554, "memory(GiB)": 77.56, "step": 98860, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.438324 }, { "epoch": 4.235679705239707, "grad_norm": 7.116892337799072, "learning_rate": 5.655718056267962e-06, "loss": 2.416287422180176, "memory(GiB)": 77.56, "step": 98865, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.438343 }, { "epoch": 4.235893920568956, "grad_norm": 7.928796768188477, "learning_rate": 5.652609376626644e-06, "loss": 2.3415254592895507, "memory(GiB)": 77.56, "step": 98870, "token_acc": 0.5033557046979866, "train_speed(iter/s)": 1.438358 }, { "epoch": 4.2361081358982045, "grad_norm": 5.152106285095215, "learning_rate": 5.6495015003799e-06, "loss": 2.1879743576049804, "memory(GiB)": 77.56, "step": 98875, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 1.438379 }, { "epoch": 4.236322351227454, "grad_norm": 5.262955665588379, "learning_rate": 5.6463944275840505e-06, "loss": 1.9860342025756836, "memory(GiB)": 77.56, "step": 98880, "token_acc": 0.5482625482625483, "train_speed(iter/s)": 1.438373 }, { "epoch": 4.236536566556703, "grad_norm": 6.443000316619873, "learning_rate": 5.643288158295373e-06, "loss": 2.4924346923828127, "memory(GiB)": 77.56, "step": 98885, "token_acc": 0.45614035087719296, "train_speed(iter/s)": 1.438373 }, { "epoch": 4.236750781885951, "grad_norm": 5.321374416351318, "learning_rate": 5.640182692570145e-06, "loss": 2.2751941680908203, "memory(GiB)": 77.56, "step": 98890, "token_acc": 0.5103626943005182, "train_speed(iter/s)": 1.438378 }, { "epoch": 4.236964997215201, "grad_norm": 5.902636528015137, "learning_rate": 5.637078030464615e-06, "loss": 2.33965950012207, "memory(GiB)": 77.56, "step": 98895, "token_acc": 0.5, "train_speed(iter/s)": 1.438389 }, { "epoch": 4.23717921254445, "grad_norm": 6.948122978210449, "learning_rate": 5.633974172035023e-06, "loss": 2.2784000396728517, "memory(GiB)": 77.56, "step": 98900, "token_acc": 0.5305466237942122, "train_speed(iter/s)": 1.438372 }, { "epoch": 4.237393427873698, "grad_norm": 5.692466735839844, "learning_rate": 5.630871117337621e-06, "loss": 2.212904357910156, "memory(GiB)": 77.56, "step": 98905, "token_acc": 0.49842271293375395, "train_speed(iter/s)": 1.438363 }, { "epoch": 4.2376076432029475, "grad_norm": 6.55184268951416, "learning_rate": 5.627768866428606e-06, "loss": 2.5515163421630858, "memory(GiB)": 77.56, "step": 98910, "token_acc": 0.5160349854227405, "train_speed(iter/s)": 1.438366 }, { "epoch": 4.237821858532197, "grad_norm": 5.398247241973877, "learning_rate": 5.624667419364182e-06, "loss": 2.1727046966552734, "memory(GiB)": 77.56, "step": 98915, "token_acc": 0.551094890510949, "train_speed(iter/s)": 1.438361 }, { "epoch": 4.238036073861445, "grad_norm": 6.375877380371094, "learning_rate": 5.621566776200538e-06, "loss": 2.218283843994141, "memory(GiB)": 77.56, "step": 98920, "token_acc": 0.501577287066246, "train_speed(iter/s)": 1.438354 }, { "epoch": 4.238250289190694, "grad_norm": 4.74083948135376, "learning_rate": 5.618466936993832e-06, "loss": 2.5549644470214843, "memory(GiB)": 77.56, "step": 98925, "token_acc": 0.5013927576601671, "train_speed(iter/s)": 1.438321 }, { "epoch": 4.238464504519944, "grad_norm": 6.965949058532715, "learning_rate": 5.6153679018002394e-06, "loss": 2.1256568908691404, "memory(GiB)": 77.56, "step": 98930, "token_acc": 0.5222222222222223, "train_speed(iter/s)": 1.438319 }, { "epoch": 4.238678719849192, "grad_norm": 7.280891418457031, "learning_rate": 5.612269670675885e-06, "loss": 2.128206253051758, "memory(GiB)": 77.56, "step": 98935, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 1.43832 }, { "epoch": 4.238892935178441, "grad_norm": 6.390178680419922, "learning_rate": 5.609172243676914e-06, "loss": 2.2373172760009767, "memory(GiB)": 77.56, "step": 98940, "token_acc": 0.52, "train_speed(iter/s)": 1.43834 }, { "epoch": 4.2391071505076905, "grad_norm": 5.835618019104004, "learning_rate": 5.60607562085943e-06, "loss": 2.502004623413086, "memory(GiB)": 77.56, "step": 98945, "token_acc": 0.4811594202898551, "train_speed(iter/s)": 1.438343 }, { "epoch": 4.23932136583694, "grad_norm": 6.4024176597595215, "learning_rate": 5.602979802279534e-06, "loss": 2.2551433563232424, "memory(GiB)": 77.56, "step": 98950, "token_acc": 0.5303514376996805, "train_speed(iter/s)": 1.438356 }, { "epoch": 4.239535581166188, "grad_norm": 5.6541666984558105, "learning_rate": 5.599884787993304e-06, "loss": 2.5244083404541016, "memory(GiB)": 77.56, "step": 98955, "token_acc": 0.45980707395498394, "train_speed(iter/s)": 1.438343 }, { "epoch": 4.239749796495437, "grad_norm": 6.16555118560791, "learning_rate": 5.5967905780567985e-06, "loss": 2.547613334655762, "memory(GiB)": 77.56, "step": 98960, "token_acc": 0.4722222222222222, "train_speed(iter/s)": 1.438342 }, { "epoch": 4.239964011824686, "grad_norm": 5.594049453735352, "learning_rate": 5.593697172526097e-06, "loss": 2.288450241088867, "memory(GiB)": 77.56, "step": 98965, "token_acc": 0.5497076023391813, "train_speed(iter/s)": 1.438352 }, { "epoch": 4.240178227153935, "grad_norm": 8.040581703186035, "learning_rate": 5.590604571457231e-06, "loss": 2.507685661315918, "memory(GiB)": 77.56, "step": 98970, "token_acc": 0.48322147651006714, "train_speed(iter/s)": 1.438341 }, { "epoch": 4.240392442483184, "grad_norm": 6.547095775604248, "learning_rate": 5.587512774906217e-06, "loss": 2.1984237670898437, "memory(GiB)": 77.56, "step": 98975, "token_acc": 0.5352112676056338, "train_speed(iter/s)": 1.43834 }, { "epoch": 4.2406066578124335, "grad_norm": 5.017219543457031, "learning_rate": 5.584421782929078e-06, "loss": 1.9073963165283203, "memory(GiB)": 77.56, "step": 98980, "token_acc": 0.547244094488189, "train_speed(iter/s)": 1.438345 }, { "epoch": 4.240820873141682, "grad_norm": 6.180419445037842, "learning_rate": 5.58133159558179e-06, "loss": 2.2009613037109377, "memory(GiB)": 77.56, "step": 98985, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.438353 }, { "epoch": 4.241035088470931, "grad_norm": 7.331000804901123, "learning_rate": 5.5782422129203534e-06, "loss": 2.3994380950927736, "memory(GiB)": 77.56, "step": 98990, "token_acc": 0.4717607973421927, "train_speed(iter/s)": 1.438374 }, { "epoch": 4.2412493038001795, "grad_norm": 6.821191310882568, "learning_rate": 5.575153635000735e-06, "loss": 2.139596939086914, "memory(GiB)": 77.56, "step": 98995, "token_acc": 0.5096525096525096, "train_speed(iter/s)": 1.438373 }, { "epoch": 4.241463519129429, "grad_norm": 5.0258917808532715, "learning_rate": 5.572065861878883e-06, "loss": 2.1131879806518556, "memory(GiB)": 77.56, "step": 99000, "token_acc": 0.5491525423728814, "train_speed(iter/s)": 1.438371 }, { "epoch": 4.241463519129429, "eval_loss": 2.3487865924835205, "eval_runtime": 14.4108, "eval_samples_per_second": 6.939, "eval_steps_per_second": 6.939, "eval_token_acc": 0.45481049562682213, "step": 99000 }, { "epoch": 4.241677734458678, "grad_norm": 5.596465110778809, "learning_rate": 5.568978893610727e-06, "loss": 2.135300064086914, "memory(GiB)": 77.56, "step": 99005, "token_acc": 0.46707193515704154, "train_speed(iter/s)": 1.438056 }, { "epoch": 4.241891949787927, "grad_norm": 5.467395305633545, "learning_rate": 5.565892730252203e-06, "loss": 2.4273265838623046, "memory(GiB)": 77.56, "step": 99010, "token_acc": 0.5147540983606558, "train_speed(iter/s)": 1.438064 }, { "epoch": 4.242106165117176, "grad_norm": 7.064877986907959, "learning_rate": 5.5628073718592184e-06, "loss": 2.356730842590332, "memory(GiB)": 77.56, "step": 99015, "token_acc": 0.49166666666666664, "train_speed(iter/s)": 1.438065 }, { "epoch": 4.242320380446425, "grad_norm": 5.671635627746582, "learning_rate": 5.55972281848765e-06, "loss": 2.390118217468262, "memory(GiB)": 77.56, "step": 99020, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438073 }, { "epoch": 4.242534595775673, "grad_norm": 4.671056270599365, "learning_rate": 5.556639070193404e-06, "loss": 2.126407241821289, "memory(GiB)": 77.56, "step": 99025, "token_acc": 0.5173745173745173, "train_speed(iter/s)": 1.43806 }, { "epoch": 4.2427488111049225, "grad_norm": 5.497961044311523, "learning_rate": 5.553556127032333e-06, "loss": 2.2226787567138673, "memory(GiB)": 77.56, "step": 99030, "token_acc": 0.519434628975265, "train_speed(iter/s)": 1.438056 }, { "epoch": 4.242963026434172, "grad_norm": 8.50023078918457, "learning_rate": 5.550473989060284e-06, "loss": 2.1681802749633787, "memory(GiB)": 77.56, "step": 99035, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.438061 }, { "epoch": 4.243177241763421, "grad_norm": 6.356548309326172, "learning_rate": 5.5473926563331e-06, "loss": 2.2580083847045898, "memory(GiB)": 77.56, "step": 99040, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 1.438069 }, { "epoch": 4.243391457092669, "grad_norm": 6.910303592681885, "learning_rate": 5.5443121289065814e-06, "loss": 2.2745216369628904, "memory(GiB)": 77.56, "step": 99045, "token_acc": 0.47318611987381703, "train_speed(iter/s)": 1.438052 }, { "epoch": 4.243605672421919, "grad_norm": 8.169788360595703, "learning_rate": 5.541232406836567e-06, "loss": 2.2359338760375977, "memory(GiB)": 77.56, "step": 99050, "token_acc": 0.4728682170542636, "train_speed(iter/s)": 1.438073 }, { "epoch": 4.243819887751167, "grad_norm": 6.21835994720459, "learning_rate": 5.538153490178833e-06, "loss": 2.3249101638793945, "memory(GiB)": 77.56, "step": 99055, "token_acc": 0.49363057324840764, "train_speed(iter/s)": 1.438081 }, { "epoch": 4.244034103080416, "grad_norm": 5.761159896850586, "learning_rate": 5.5350753789891515e-06, "loss": 2.3328433990478517, "memory(GiB)": 77.56, "step": 99060, "token_acc": 0.5325443786982249, "train_speed(iter/s)": 1.438078 }, { "epoch": 4.2442483184096655, "grad_norm": 7.299875259399414, "learning_rate": 5.531998073323297e-06, "loss": 2.1044246673583986, "memory(GiB)": 77.56, "step": 99065, "token_acc": 0.5150375939849624, "train_speed(iter/s)": 1.438076 }, { "epoch": 4.244462533738915, "grad_norm": 7.110866546630859, "learning_rate": 5.528921573237001e-06, "loss": 2.329795074462891, "memory(GiB)": 77.56, "step": 99070, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.438087 }, { "epoch": 4.244676749068163, "grad_norm": 7.9992804527282715, "learning_rate": 5.525845878786018e-06, "loss": 2.4182613372802733, "memory(GiB)": 77.56, "step": 99075, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 1.438082 }, { "epoch": 4.244890964397412, "grad_norm": 7.7661519050598145, "learning_rate": 5.522770990026044e-06, "loss": 2.332794952392578, "memory(GiB)": 77.56, "step": 99080, "token_acc": 0.48788927335640137, "train_speed(iter/s)": 1.438091 }, { "epoch": 4.245105179726661, "grad_norm": 8.123416900634766, "learning_rate": 5.519696907012811e-06, "loss": 2.0453876495361327, "memory(GiB)": 77.56, "step": 99085, "token_acc": 0.5372168284789643, "train_speed(iter/s)": 1.438087 }, { "epoch": 4.24531939505591, "grad_norm": 7.143505096435547, "learning_rate": 5.516623629801987e-06, "loss": 2.131815719604492, "memory(GiB)": 77.56, "step": 99090, "token_acc": 0.5641025641025641, "train_speed(iter/s)": 1.43808 }, { "epoch": 4.245533610385159, "grad_norm": 7.471089839935303, "learning_rate": 5.51355115844926e-06, "loss": 2.3486806869506838, "memory(GiB)": 77.56, "step": 99095, "token_acc": 0.49258160237388726, "train_speed(iter/s)": 1.438084 }, { "epoch": 4.245747825714409, "grad_norm": 5.78510856628418, "learning_rate": 5.510479493010285e-06, "loss": 2.678004837036133, "memory(GiB)": 77.56, "step": 99100, "token_acc": 0.483695652173913, "train_speed(iter/s)": 1.438067 }, { "epoch": 4.245962041043657, "grad_norm": 5.872328281402588, "learning_rate": 5.5074086335407e-06, "loss": 2.4142339706420897, "memory(GiB)": 77.56, "step": 99105, "token_acc": 0.48826291079812206, "train_speed(iter/s)": 1.438059 }, { "epoch": 4.246176256372906, "grad_norm": 5.436133861541748, "learning_rate": 5.504338580096152e-06, "loss": 2.1725513458251955, "memory(GiB)": 77.56, "step": 99110, "token_acc": 0.5076452599388379, "train_speed(iter/s)": 1.438073 }, { "epoch": 4.246390471702155, "grad_norm": 7.735098361968994, "learning_rate": 5.501269332732256e-06, "loss": 2.75048828125, "memory(GiB)": 77.56, "step": 99115, "token_acc": 0.4528301886792453, "train_speed(iter/s)": 1.438072 }, { "epoch": 4.246604687031404, "grad_norm": 5.928342342376709, "learning_rate": 5.498200891504606e-06, "loss": 2.1290210723876952, "memory(GiB)": 77.56, "step": 99120, "token_acc": 0.5490909090909091, "train_speed(iter/s)": 1.438072 }, { "epoch": 4.246818902360653, "grad_norm": 6.75995397567749, "learning_rate": 5.4951332564687926e-06, "loss": 2.1365535736083983, "memory(GiB)": 77.56, "step": 99125, "token_acc": 0.5247524752475248, "train_speed(iter/s)": 1.438076 }, { "epoch": 4.247033117689902, "grad_norm": 10.028285026550293, "learning_rate": 5.492066427680376e-06, "loss": 2.295850944519043, "memory(GiB)": 77.56, "step": 99130, "token_acc": 0.49454545454545457, "train_speed(iter/s)": 1.438083 }, { "epoch": 4.247247333019151, "grad_norm": 7.067670822143555, "learning_rate": 5.489000405194944e-06, "loss": 2.4321945190429686, "memory(GiB)": 77.56, "step": 99135, "token_acc": 0.46779661016949153, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.2474615483484, "grad_norm": 8.564013481140137, "learning_rate": 5.48593518906802e-06, "loss": 2.015664291381836, "memory(GiB)": 77.56, "step": 99140, "token_acc": 0.5782608695652174, "train_speed(iter/s)": 1.438062 }, { "epoch": 4.247675763677648, "grad_norm": 6.376132488250732, "learning_rate": 5.482870779355137e-06, "loss": 2.229803466796875, "memory(GiB)": 77.56, "step": 99145, "token_acc": 0.5241379310344828, "train_speed(iter/s)": 1.438075 }, { "epoch": 4.247889979006898, "grad_norm": 7.828505992889404, "learning_rate": 5.479807176111812e-06, "loss": 2.4596206665039064, "memory(GiB)": 77.56, "step": 99150, "token_acc": 0.4935064935064935, "train_speed(iter/s)": 1.438089 }, { "epoch": 4.248104194336147, "grad_norm": 6.324970245361328, "learning_rate": 5.476744379393534e-06, "loss": 2.4612213134765626, "memory(GiB)": 77.56, "step": 99155, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 1.43809 }, { "epoch": 4.248318409665396, "grad_norm": 6.837876796722412, "learning_rate": 5.4736823892558075e-06, "loss": 1.9744007110595703, "memory(GiB)": 77.56, "step": 99160, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 1.438102 }, { "epoch": 4.2485326249946445, "grad_norm": 7.05295467376709, "learning_rate": 5.470621205754084e-06, "loss": 1.9171564102172851, "memory(GiB)": 77.56, "step": 99165, "token_acc": 0.5826771653543307, "train_speed(iter/s)": 1.438111 }, { "epoch": 4.248746840323894, "grad_norm": 5.768963813781738, "learning_rate": 5.467560828943841e-06, "loss": 2.338684844970703, "memory(GiB)": 77.56, "step": 99170, "token_acc": 0.5, "train_speed(iter/s)": 1.438106 }, { "epoch": 4.248961055653142, "grad_norm": 7.49630880355835, "learning_rate": 5.464501258880505e-06, "loss": 2.3309356689453127, "memory(GiB)": 77.56, "step": 99175, "token_acc": 0.527027027027027, "train_speed(iter/s)": 1.4381 }, { "epoch": 4.249175270982391, "grad_norm": 6.827689170837402, "learning_rate": 5.461442495619507e-06, "loss": 2.3259746551513674, "memory(GiB)": 77.56, "step": 99180, "token_acc": 0.4921875, "train_speed(iter/s)": 1.438103 }, { "epoch": 4.249389486311641, "grad_norm": 6.1214776039123535, "learning_rate": 5.45838453921626e-06, "loss": 2.5952426910400392, "memory(GiB)": 77.56, "step": 99185, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.438123 }, { "epoch": 4.24960370164089, "grad_norm": 6.791317939758301, "learning_rate": 5.455327389726151e-06, "loss": 2.275722694396973, "memory(GiB)": 77.56, "step": 99190, "token_acc": 0.5224913494809689, "train_speed(iter/s)": 1.438134 }, { "epoch": 4.249817916970138, "grad_norm": 5.986358165740967, "learning_rate": 5.452271047204582e-06, "loss": 2.1992698669433595, "memory(GiB)": 77.56, "step": 99195, "token_acc": 0.5631399317406144, "train_speed(iter/s)": 1.438152 }, { "epoch": 4.2500321322993875, "grad_norm": 6.667753219604492, "learning_rate": 5.449215511706907e-06, "loss": 2.2963939666748048, "memory(GiB)": 77.56, "step": 99200, "token_acc": 0.49169435215946844, "train_speed(iter/s)": 1.438165 }, { "epoch": 4.250246347628636, "grad_norm": 5.870570182800293, "learning_rate": 5.4461607832884895e-06, "loss": 2.5898883819580076, "memory(GiB)": 77.56, "step": 99205, "token_acc": 0.4813664596273292, "train_speed(iter/s)": 1.43815 }, { "epoch": 4.250460562957885, "grad_norm": 7.62021017074585, "learning_rate": 5.443106862004666e-06, "loss": 2.0108711242675783, "memory(GiB)": 77.56, "step": 99210, "token_acc": 0.5207667731629393, "train_speed(iter/s)": 1.438173 }, { "epoch": 4.250674778287134, "grad_norm": 5.2923808097839355, "learning_rate": 5.440053747910751e-06, "loss": 2.1698883056640623, "memory(GiB)": 77.56, "step": 99215, "token_acc": 0.5225563909774437, "train_speed(iter/s)": 1.438167 }, { "epoch": 4.250888993616384, "grad_norm": 5.813499927520752, "learning_rate": 5.4370014410620674e-06, "loss": 2.0973180770874023, "memory(GiB)": 77.56, "step": 99220, "token_acc": 0.5283018867924528, "train_speed(iter/s)": 1.438171 }, { "epoch": 4.251103208945632, "grad_norm": 6.895333290100098, "learning_rate": 5.4339499415139105e-06, "loss": 2.4307680130004883, "memory(GiB)": 77.56, "step": 99225, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438184 }, { "epoch": 4.251317424274881, "grad_norm": 6.923884391784668, "learning_rate": 5.4308992493215414e-06, "loss": 2.2147375106811524, "memory(GiB)": 77.56, "step": 99230, "token_acc": 0.5475285171102662, "train_speed(iter/s)": 1.438191 }, { "epoch": 4.25153163960413, "grad_norm": 7.689892768859863, "learning_rate": 5.427849364540255e-06, "loss": 2.25538330078125, "memory(GiB)": 77.56, "step": 99235, "token_acc": 0.511864406779661, "train_speed(iter/s)": 1.438182 }, { "epoch": 4.251745854933379, "grad_norm": 9.414280891418457, "learning_rate": 5.4248002872252904e-06, "loss": 2.115106773376465, "memory(GiB)": 77.56, "step": 99240, "token_acc": 0.5375494071146245, "train_speed(iter/s)": 1.438196 }, { "epoch": 4.251960070262628, "grad_norm": 6.857348918914795, "learning_rate": 5.42175201743188e-06, "loss": 2.075666809082031, "memory(GiB)": 77.56, "step": 99245, "token_acc": 0.5359477124183006, "train_speed(iter/s)": 1.438206 }, { "epoch": 4.252174285591877, "grad_norm": 5.502389430999756, "learning_rate": 5.418704555215243e-06, "loss": 2.500968360900879, "memory(GiB)": 77.56, "step": 99250, "token_acc": 0.48757763975155277, "train_speed(iter/s)": 1.43821 }, { "epoch": 4.252388500921126, "grad_norm": 6.886981010437012, "learning_rate": 5.415657900630605e-06, "loss": 2.1749717712402346, "memory(GiB)": 77.56, "step": 99255, "token_acc": 0.545774647887324, "train_speed(iter/s)": 1.438215 }, { "epoch": 4.252602716250375, "grad_norm": 6.770241737365723, "learning_rate": 5.412612053733146e-06, "loss": 2.2678466796875, "memory(GiB)": 77.56, "step": 99260, "token_acc": 0.49433962264150944, "train_speed(iter/s)": 1.438228 }, { "epoch": 4.252816931579623, "grad_norm": 8.066960334777832, "learning_rate": 5.409567014578043e-06, "loss": 2.125078582763672, "memory(GiB)": 77.56, "step": 99265, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.438237 }, { "epoch": 4.253031146908873, "grad_norm": 6.868201732635498, "learning_rate": 5.406522783220463e-06, "loss": 2.0965816497802736, "memory(GiB)": 77.56, "step": 99270, "token_acc": 0.5132743362831859, "train_speed(iter/s)": 1.43825 }, { "epoch": 4.253245362238122, "grad_norm": 6.25499963760376, "learning_rate": 5.403479359715552e-06, "loss": 2.4377157211303713, "memory(GiB)": 77.56, "step": 99275, "token_acc": 0.48554913294797686, "train_speed(iter/s)": 1.438259 }, { "epoch": 4.253459577567371, "grad_norm": 9.412491798400879, "learning_rate": 5.400436744118448e-06, "loss": 2.1824052810668944, "memory(GiB)": 77.56, "step": 99280, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.438257 }, { "epoch": 4.2536737928966195, "grad_norm": 5.684455394744873, "learning_rate": 5.397394936484274e-06, "loss": 2.156208801269531, "memory(GiB)": 77.56, "step": 99285, "token_acc": 0.5451388888888888, "train_speed(iter/s)": 1.438245 }, { "epoch": 4.253888008225869, "grad_norm": 5.2482476234436035, "learning_rate": 5.394353936868135e-06, "loss": 2.405049514770508, "memory(GiB)": 77.56, "step": 99290, "token_acc": 0.5, "train_speed(iter/s)": 1.438257 }, { "epoch": 4.254102223555117, "grad_norm": 7.334107875823975, "learning_rate": 5.391313745325116e-06, "loss": 2.417220687866211, "memory(GiB)": 77.56, "step": 99295, "token_acc": 0.4619883040935672, "train_speed(iter/s)": 1.438267 }, { "epoch": 4.254316438884366, "grad_norm": 7.12433385848999, "learning_rate": 5.3882743619102786e-06, "loss": 2.387996482849121, "memory(GiB)": 77.56, "step": 99300, "token_acc": 0.5205479452054794, "train_speed(iter/s)": 1.438276 }, { "epoch": 4.254530654213616, "grad_norm": 7.295494556427002, "learning_rate": 5.385235786678716e-06, "loss": 2.5620138168334963, "memory(GiB)": 77.56, "step": 99305, "token_acc": 0.49466192170818507, "train_speed(iter/s)": 1.438286 }, { "epoch": 4.254744869542865, "grad_norm": 6.653997421264648, "learning_rate": 5.3821980196854475e-06, "loss": 2.3662322998046874, "memory(GiB)": 77.56, "step": 99310, "token_acc": 0.4907749077490775, "train_speed(iter/s)": 1.43828 }, { "epoch": 4.254959084872113, "grad_norm": 9.474124908447266, "learning_rate": 5.379161060985533e-06, "loss": 1.9860734939575195, "memory(GiB)": 77.56, "step": 99315, "token_acc": 0.5797665369649806, "train_speed(iter/s)": 1.438289 }, { "epoch": 4.2551733002013625, "grad_norm": 5.496501922607422, "learning_rate": 5.376124910633967e-06, "loss": 1.9740488052368164, "memory(GiB)": 77.56, "step": 99320, "token_acc": 0.5180327868852459, "train_speed(iter/s)": 1.438286 }, { "epoch": 4.255387515530612, "grad_norm": 5.606962203979492, "learning_rate": 5.373089568685758e-06, "loss": 2.6144222259521483, "memory(GiB)": 77.56, "step": 99325, "token_acc": 0.44594594594594594, "train_speed(iter/s)": 1.438289 }, { "epoch": 4.25560173085986, "grad_norm": 6.997704029083252, "learning_rate": 5.370055035195898e-06, "loss": 2.456332969665527, "memory(GiB)": 77.56, "step": 99330, "token_acc": 0.48830409356725146, "train_speed(iter/s)": 1.438308 }, { "epoch": 4.255815946189109, "grad_norm": 7.725266933441162, "learning_rate": 5.367021310219344e-06, "loss": 2.530974769592285, "memory(GiB)": 77.56, "step": 99335, "token_acc": 0.5074626865671642, "train_speed(iter/s)": 1.438306 }, { "epoch": 4.256030161518359, "grad_norm": 10.022805213928223, "learning_rate": 5.363988393811081e-06, "loss": 1.9978378295898438, "memory(GiB)": 77.56, "step": 99340, "token_acc": 0.5540540540540541, "train_speed(iter/s)": 1.438279 }, { "epoch": 4.256244376847607, "grad_norm": 7.094108581542969, "learning_rate": 5.360956286026042e-06, "loss": 2.327950859069824, "memory(GiB)": 77.56, "step": 99345, "token_acc": 0.540268456375839, "train_speed(iter/s)": 1.438294 }, { "epoch": 4.256458592176856, "grad_norm": 4.85783052444458, "learning_rate": 5.357924986919149e-06, "loss": 1.9933506011962892, "memory(GiB)": 77.56, "step": 99350, "token_acc": 0.5275590551181102, "train_speed(iter/s)": 1.438302 }, { "epoch": 4.2566728075061055, "grad_norm": 6.5165534019470215, "learning_rate": 5.354894496545326e-06, "loss": 2.5928401947021484, "memory(GiB)": 77.56, "step": 99355, "token_acc": 0.4441340782122905, "train_speed(iter/s)": 1.438301 }, { "epoch": 4.256887022835354, "grad_norm": 5.5560455322265625, "learning_rate": 5.351864814959462e-06, "loss": 2.4872867584228517, "memory(GiB)": 77.56, "step": 99360, "token_acc": 0.5, "train_speed(iter/s)": 1.438303 }, { "epoch": 4.257101238164603, "grad_norm": 7.415276527404785, "learning_rate": 5.348835942216457e-06, "loss": 2.339769744873047, "memory(GiB)": 77.56, "step": 99365, "token_acc": 0.5576208178438662, "train_speed(iter/s)": 1.438312 }, { "epoch": 4.257315453493852, "grad_norm": 5.808889389038086, "learning_rate": 5.345807878371173e-06, "loss": 2.033720016479492, "memory(GiB)": 77.56, "step": 99370, "token_acc": 0.5274725274725275, "train_speed(iter/s)": 1.438325 }, { "epoch": 4.257529668823101, "grad_norm": 7.288916110992432, "learning_rate": 5.3427806234784606e-06, "loss": 2.2452007293701173, "memory(GiB)": 77.56, "step": 99375, "token_acc": 0.49393939393939396, "train_speed(iter/s)": 1.438321 }, { "epoch": 4.25774388415235, "grad_norm": 7.555299282073975, "learning_rate": 5.339754177593182e-06, "loss": 2.0741466522216796, "memory(GiB)": 77.56, "step": 99380, "token_acc": 0.5421686746987951, "train_speed(iter/s)": 1.438323 }, { "epoch": 4.257958099481599, "grad_norm": 7.6305670738220215, "learning_rate": 5.336728540770148e-06, "loss": 2.0681602478027346, "memory(GiB)": 77.56, "step": 99385, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.438317 }, { "epoch": 4.258172314810848, "grad_norm": 5.377715587615967, "learning_rate": 5.333703713064175e-06, "loss": 2.3758419036865233, "memory(GiB)": 77.56, "step": 99390, "token_acc": 0.5374592833876222, "train_speed(iter/s)": 1.438322 }, { "epoch": 4.258386530140097, "grad_norm": 7.726016998291016, "learning_rate": 5.330679694530049e-06, "loss": 2.2782661437988283, "memory(GiB)": 77.56, "step": 99395, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 1.438324 }, { "epoch": 4.258600745469346, "grad_norm": 6.461470127105713, "learning_rate": 5.327656485222576e-06, "loss": 2.293054389953613, "memory(GiB)": 77.56, "step": 99400, "token_acc": 0.5392857142857143, "train_speed(iter/s)": 1.43833 }, { "epoch": 4.258814960798595, "grad_norm": 5.0221076011657715, "learning_rate": 5.324634085196506e-06, "loss": 2.5675674438476563, "memory(GiB)": 77.56, "step": 99405, "token_acc": 0.4752475247524752, "train_speed(iter/s)": 1.438325 }, { "epoch": 4.259029176127844, "grad_norm": 7.002190589904785, "learning_rate": 5.321612494506606e-06, "loss": 2.286764144897461, "memory(GiB)": 77.56, "step": 99410, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.438327 }, { "epoch": 4.259243391457093, "grad_norm": 7.029500961303711, "learning_rate": 5.318591713207599e-06, "loss": 2.337006759643555, "memory(GiB)": 77.56, "step": 99415, "token_acc": 0.5364963503649635, "train_speed(iter/s)": 1.438332 }, { "epoch": 4.259457606786341, "grad_norm": 5.700656414031982, "learning_rate": 5.315571741354214e-06, "loss": 2.400377655029297, "memory(GiB)": 77.56, "step": 99420, "token_acc": 0.45714285714285713, "train_speed(iter/s)": 1.438333 }, { "epoch": 4.259671822115591, "grad_norm": 5.522071838378906, "learning_rate": 5.312552579001173e-06, "loss": 2.3048778533935548, "memory(GiB)": 77.56, "step": 99425, "token_acc": 0.5310077519379846, "train_speed(iter/s)": 1.438343 }, { "epoch": 4.25988603744484, "grad_norm": 6.354565143585205, "learning_rate": 5.309534226203161e-06, "loss": 2.4640254974365234, "memory(GiB)": 77.56, "step": 99430, "token_acc": 0.5075987841945289, "train_speed(iter/s)": 1.438337 }, { "epoch": 4.260100252774088, "grad_norm": 5.910831451416016, "learning_rate": 5.306516683014862e-06, "loss": 2.3290300369262695, "memory(GiB)": 77.56, "step": 99435, "token_acc": 0.4899135446685879, "train_speed(iter/s)": 1.438323 }, { "epoch": 4.260314468103338, "grad_norm": 6.572525978088379, "learning_rate": 5.303499949490937e-06, "loss": 2.138357925415039, "memory(GiB)": 77.56, "step": 99440, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.438321 }, { "epoch": 4.260528683432587, "grad_norm": 6.1764235496521, "learning_rate": 5.300484025686037e-06, "loss": 2.2882753372192384, "memory(GiB)": 77.56, "step": 99445, "token_acc": 0.5261194029850746, "train_speed(iter/s)": 1.438326 }, { "epoch": 4.260742898761835, "grad_norm": 6.314894199371338, "learning_rate": 5.297468911654796e-06, "loss": 2.247754669189453, "memory(GiB)": 77.56, "step": 99450, "token_acc": 0.5164473684210527, "train_speed(iter/s)": 1.438326 }, { "epoch": 4.2609571140910845, "grad_norm": 6.027066707611084, "learning_rate": 5.294454607451838e-06, "loss": 2.119722366333008, "memory(GiB)": 77.56, "step": 99455, "token_acc": 0.523972602739726, "train_speed(iter/s)": 1.438303 }, { "epoch": 4.261171329420334, "grad_norm": 6.138664245605469, "learning_rate": 5.291441113131779e-06, "loss": 1.921840286254883, "memory(GiB)": 77.56, "step": 99460, "token_acc": 0.575187969924812, "train_speed(iter/s)": 1.438305 }, { "epoch": 4.261385544749582, "grad_norm": 6.930879592895508, "learning_rate": 5.2884284287492034e-06, "loss": 2.1499303817749023, "memory(GiB)": 77.56, "step": 99465, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 1.438311 }, { "epoch": 4.261599760078831, "grad_norm": 5.446767330169678, "learning_rate": 5.285416554358696e-06, "loss": 2.422323799133301, "memory(GiB)": 77.56, "step": 99470, "token_acc": 0.5078125, "train_speed(iter/s)": 1.438318 }, { "epoch": 4.261813975408081, "grad_norm": 7.983272075653076, "learning_rate": 5.282405490014808e-06, "loss": 2.199492645263672, "memory(GiB)": 77.56, "step": 99475, "token_acc": 0.5, "train_speed(iter/s)": 1.438315 }, { "epoch": 4.262028190737329, "grad_norm": 5.806427001953125, "learning_rate": 5.279395235772084e-06, "loss": 2.2700372695922852, "memory(GiB)": 77.56, "step": 99480, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 1.438322 }, { "epoch": 4.262242406066578, "grad_norm": 5.768808364868164, "learning_rate": 5.276385791685079e-06, "loss": 2.4191593170166015, "memory(GiB)": 77.56, "step": 99485, "token_acc": 0.4808259587020649, "train_speed(iter/s)": 1.438317 }, { "epoch": 4.2624566213958275, "grad_norm": 5.224371433258057, "learning_rate": 5.273377157808296e-06, "loss": 2.337211799621582, "memory(GiB)": 77.56, "step": 99490, "token_acc": 0.5056179775280899, "train_speed(iter/s)": 1.438322 }, { "epoch": 4.262670836725076, "grad_norm": 5.661655426025391, "learning_rate": 5.270369334196246e-06, "loss": 2.1746469497680665, "memory(GiB)": 77.56, "step": 99495, "token_acc": 0.5447154471544715, "train_speed(iter/s)": 1.438317 }, { "epoch": 4.262885052054325, "grad_norm": 6.167550086975098, "learning_rate": 5.267362320903413e-06, "loss": 2.273846244812012, "memory(GiB)": 77.56, "step": 99500, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.438327 }, { "epoch": 4.262885052054325, "eval_loss": 2.259369373321533, "eval_runtime": 14.0799, "eval_samples_per_second": 7.102, "eval_steps_per_second": 7.102, "eval_token_acc": 0.4695290858725762, "step": 99500 }, { "epoch": 4.263099267383574, "grad_norm": 5.5677571296691895, "learning_rate": 5.264356117984265e-06, "loss": 2.3016624450683594, "memory(GiB)": 77.56, "step": 99505, "token_acc": 0.49193548387096775, "train_speed(iter/s)": 1.43803 }, { "epoch": 4.263313482712823, "grad_norm": 6.390848159790039, "learning_rate": 5.261350725493286e-06, "loss": 2.1338474273681642, "memory(GiB)": 77.56, "step": 99510, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.43802 }, { "epoch": 4.263527698042072, "grad_norm": 7.957448959350586, "learning_rate": 5.258346143484899e-06, "loss": 2.2904277801513673, "memory(GiB)": 77.56, "step": 99515, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.438023 }, { "epoch": 4.263741913371321, "grad_norm": 6.106991291046143, "learning_rate": 5.2553423720135494e-06, "loss": 2.507122611999512, "memory(GiB)": 77.56, "step": 99520, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.438021 }, { "epoch": 4.26395612870057, "grad_norm": 5.650023460388184, "learning_rate": 5.2523394111336334e-06, "loss": 2.479850006103516, "memory(GiB)": 77.56, "step": 99525, "token_acc": 0.47601476014760147, "train_speed(iter/s)": 1.438014 }, { "epoch": 4.264170344029819, "grad_norm": 5.992207050323486, "learning_rate": 5.249337260899573e-06, "loss": 2.4205209732055666, "memory(GiB)": 77.56, "step": 99530, "token_acc": 0.494949494949495, "train_speed(iter/s)": 1.438026 }, { "epoch": 4.264384559359068, "grad_norm": 6.949053764343262, "learning_rate": 5.2463359213657525e-06, "loss": 2.3510799407958984, "memory(GiB)": 77.56, "step": 99535, "token_acc": 0.506993006993007, "train_speed(iter/s)": 1.438037 }, { "epoch": 4.2645987746883165, "grad_norm": 5.720202445983887, "learning_rate": 5.243335392586524e-06, "loss": 2.0583702087402345, "memory(GiB)": 77.56, "step": 99540, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 1.438042 }, { "epoch": 4.264812990017566, "grad_norm": 6.283769607543945, "learning_rate": 5.240335674616276e-06, "loss": 2.2413457870483398, "memory(GiB)": 77.56, "step": 99545, "token_acc": 0.5437262357414449, "train_speed(iter/s)": 1.438047 }, { "epoch": 4.265027205346815, "grad_norm": 6.344459056854248, "learning_rate": 5.237336767509332e-06, "loss": 2.4375762939453125, "memory(GiB)": 77.56, "step": 99550, "token_acc": 0.4772727272727273, "train_speed(iter/s)": 1.438053 }, { "epoch": 4.265241420676063, "grad_norm": 5.999893665313721, "learning_rate": 5.234338671320021e-06, "loss": 2.261827087402344, "memory(GiB)": 77.56, "step": 99555, "token_acc": 0.5031847133757962, "train_speed(iter/s)": 1.438065 }, { "epoch": 4.265455636005313, "grad_norm": 6.464510440826416, "learning_rate": 5.2313413861026614e-06, "loss": 2.314762496948242, "memory(GiB)": 77.56, "step": 99560, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.438079 }, { "epoch": 4.265669851334562, "grad_norm": 6.643327236175537, "learning_rate": 5.2283449119115366e-06, "loss": 2.312142753601074, "memory(GiB)": 77.56, "step": 99565, "token_acc": 0.5225225225225225, "train_speed(iter/s)": 1.438094 }, { "epoch": 4.26588406666381, "grad_norm": 8.836644172668457, "learning_rate": 5.225349248800954e-06, "loss": 2.2041854858398438, "memory(GiB)": 77.56, "step": 99570, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 1.438101 }, { "epoch": 4.2660982819930595, "grad_norm": 5.8553242683410645, "learning_rate": 5.222354396825174e-06, "loss": 2.2258806228637695, "memory(GiB)": 77.56, "step": 99575, "token_acc": 0.48942598187311176, "train_speed(iter/s)": 1.438115 }, { "epoch": 4.266312497322309, "grad_norm": 5.980251312255859, "learning_rate": 5.2193603560384495e-06, "loss": 2.4428539276123047, "memory(GiB)": 77.56, "step": 99580, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.438121 }, { "epoch": 4.266526712651557, "grad_norm": 6.075314044952393, "learning_rate": 5.216367126495015e-06, "loss": 2.2006053924560547, "memory(GiB)": 77.56, "step": 99585, "token_acc": 0.5625, "train_speed(iter/s)": 1.438121 }, { "epoch": 4.266740927980806, "grad_norm": 5.85908842086792, "learning_rate": 5.213374708249097e-06, "loss": 2.2895736694335938, "memory(GiB)": 77.56, "step": 99590, "token_acc": 0.4984520123839009, "train_speed(iter/s)": 1.43813 }, { "epoch": 4.266955143310056, "grad_norm": 7.5717854499816895, "learning_rate": 5.210383101354915e-06, "loss": 2.5901809692382813, "memory(GiB)": 77.56, "step": 99595, "token_acc": 0.4503105590062112, "train_speed(iter/s)": 1.438129 }, { "epoch": 4.267169358639304, "grad_norm": 7.153294563293457, "learning_rate": 5.207392305866648e-06, "loss": 2.409659194946289, "memory(GiB)": 77.56, "step": 99600, "token_acc": 0.4935897435897436, "train_speed(iter/s)": 1.438133 }, { "epoch": 4.267383573968553, "grad_norm": 8.248741149902344, "learning_rate": 5.204402321838503e-06, "loss": 2.3266618728637694, "memory(GiB)": 77.56, "step": 99605, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 1.438144 }, { "epoch": 4.2675977892978025, "grad_norm": 6.237697124481201, "learning_rate": 5.2014131493246246e-06, "loss": 2.298316764831543, "memory(GiB)": 77.56, "step": 99610, "token_acc": 0.48201438848920863, "train_speed(iter/s)": 1.438138 }, { "epoch": 4.267812004627051, "grad_norm": 6.186761856079102, "learning_rate": 5.198424788379175e-06, "loss": 2.2674753189086916, "memory(GiB)": 77.56, "step": 99615, "token_acc": 0.49842271293375395, "train_speed(iter/s)": 1.43814 }, { "epoch": 4.2680262199563, "grad_norm": 5.699726104736328, "learning_rate": 5.195437239056289e-06, "loss": 2.4532527923583984, "memory(GiB)": 77.56, "step": 99620, "token_acc": 0.46814404432132967, "train_speed(iter/s)": 1.438153 }, { "epoch": 4.268240435285549, "grad_norm": 8.543010711669922, "learning_rate": 5.192450501410079e-06, "loss": 2.053589630126953, "memory(GiB)": 77.56, "step": 99625, "token_acc": 0.5792880258899676, "train_speed(iter/s)": 1.438157 }, { "epoch": 4.268454650614798, "grad_norm": 6.113498210906982, "learning_rate": 5.189464575494668e-06, "loss": 2.100913429260254, "memory(GiB)": 77.56, "step": 99630, "token_acc": 0.5437956204379562, "train_speed(iter/s)": 1.438163 }, { "epoch": 4.268668865944047, "grad_norm": 6.139715671539307, "learning_rate": 5.186479461364141e-06, "loss": 2.4362077713012695, "memory(GiB)": 77.56, "step": 99635, "token_acc": 0.48951048951048953, "train_speed(iter/s)": 1.438176 }, { "epoch": 4.268883081273296, "grad_norm": 6.840907573699951, "learning_rate": 5.183495159072583e-06, "loss": 2.3649318695068358, "memory(GiB)": 77.56, "step": 99640, "token_acc": 0.5551839464882943, "train_speed(iter/s)": 1.438193 }, { "epoch": 4.269097296602545, "grad_norm": 7.1368560791015625, "learning_rate": 5.180511668674043e-06, "loss": 2.3247711181640627, "memory(GiB)": 77.56, "step": 99645, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.438179 }, { "epoch": 4.269311511931794, "grad_norm": 10.226576805114746, "learning_rate": 5.177528990222574e-06, "loss": 2.3273765563964846, "memory(GiB)": 77.56, "step": 99650, "token_acc": 0.5, "train_speed(iter/s)": 1.438191 }, { "epoch": 4.269525727261043, "grad_norm": 6.319547176361084, "learning_rate": 5.174547123772222e-06, "loss": 2.251133346557617, "memory(GiB)": 77.56, "step": 99655, "token_acc": 0.5, "train_speed(iter/s)": 1.438192 }, { "epoch": 4.2697399425902915, "grad_norm": 7.835252285003662, "learning_rate": 5.1715660693769985e-06, "loss": 2.324872589111328, "memory(GiB)": 77.56, "step": 99660, "token_acc": 0.5252100840336135, "train_speed(iter/s)": 1.438204 }, { "epoch": 4.269954157919541, "grad_norm": 5.754541873931885, "learning_rate": 5.168585827090911e-06, "loss": 2.4149904251098633, "memory(GiB)": 77.56, "step": 99665, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 1.438215 }, { "epoch": 4.27016837324879, "grad_norm": 10.338605880737305, "learning_rate": 5.165606396967932e-06, "loss": 2.2170360565185545, "memory(GiB)": 77.56, "step": 99670, "token_acc": 0.5296296296296297, "train_speed(iter/s)": 1.438225 }, { "epoch": 4.270382588578038, "grad_norm": 8.407870292663574, "learning_rate": 5.1626277790620625e-06, "loss": 2.248361015319824, "memory(GiB)": 77.56, "step": 99675, "token_acc": 0.5071428571428571, "train_speed(iter/s)": 1.438236 }, { "epoch": 4.270596803907288, "grad_norm": 7.160726070404053, "learning_rate": 5.159649973427255e-06, "loss": 2.2546363830566407, "memory(GiB)": 77.56, "step": 99680, "token_acc": 0.5387596899224806, "train_speed(iter/s)": 1.438224 }, { "epoch": 4.270811019236537, "grad_norm": 5.566939830780029, "learning_rate": 5.1566729801174385e-06, "loss": 2.351669120788574, "memory(GiB)": 77.56, "step": 99685, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 1.438222 }, { "epoch": 4.271025234565785, "grad_norm": 6.521866798400879, "learning_rate": 5.153696799186569e-06, "loss": 2.3032325744628905, "memory(GiB)": 77.56, "step": 99690, "token_acc": 0.49328859060402686, "train_speed(iter/s)": 1.438222 }, { "epoch": 4.271239449895035, "grad_norm": 5.597241401672363, "learning_rate": 5.1507214306885485e-06, "loss": 2.2157379150390626, "memory(GiB)": 77.56, "step": 99695, "token_acc": 0.5017064846416383, "train_speed(iter/s)": 1.438209 }, { "epoch": 4.271453665224284, "grad_norm": 8.706413269042969, "learning_rate": 5.147746874677284e-06, "loss": 2.3960054397583006, "memory(GiB)": 77.56, "step": 99700, "token_acc": 0.5019011406844106, "train_speed(iter/s)": 1.438216 }, { "epoch": 4.271667880553532, "grad_norm": 6.453300952911377, "learning_rate": 5.144773131206659e-06, "loss": 2.070814514160156, "memory(GiB)": 77.56, "step": 99705, "token_acc": 0.5404411764705882, "train_speed(iter/s)": 1.438211 }, { "epoch": 4.2718820958827814, "grad_norm": 6.497753143310547, "learning_rate": 5.141800200330538e-06, "loss": 2.2009424209594726, "memory(GiB)": 77.56, "step": 99710, "token_acc": 0.5041322314049587, "train_speed(iter/s)": 1.438226 }, { "epoch": 4.272096311212031, "grad_norm": 5.68049430847168, "learning_rate": 5.138828082102792e-06, "loss": 2.321919822692871, "memory(GiB)": 77.56, "step": 99715, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.438223 }, { "epoch": 4.272310526541279, "grad_norm": 5.51497220993042, "learning_rate": 5.135856776577263e-06, "loss": 1.954019546508789, "memory(GiB)": 77.56, "step": 99720, "token_acc": 0.5350553505535055, "train_speed(iter/s)": 1.438235 }, { "epoch": 4.272524741870528, "grad_norm": 5.677099227905273, "learning_rate": 5.1328862838077754e-06, "loss": 2.156208801269531, "memory(GiB)": 77.56, "step": 99725, "token_acc": 0.5297619047619048, "train_speed(iter/s)": 1.438239 }, { "epoch": 4.272738957199778, "grad_norm": 7.19480037689209, "learning_rate": 5.129916603848139e-06, "loss": 2.5902456283569335, "memory(GiB)": 77.56, "step": 99730, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 1.438242 }, { "epoch": 4.272953172529026, "grad_norm": 8.099814414978027, "learning_rate": 5.126947736752141e-06, "loss": 2.340462303161621, "memory(GiB)": 77.56, "step": 99735, "token_acc": 0.5370370370370371, "train_speed(iter/s)": 1.438259 }, { "epoch": 4.273167387858275, "grad_norm": 4.437596321105957, "learning_rate": 5.123979682573599e-06, "loss": 2.2372098922729493, "memory(GiB)": 77.56, "step": 99740, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.438264 }, { "epoch": 4.2733816031875245, "grad_norm": 5.398176193237305, "learning_rate": 5.12101244136624e-06, "loss": 2.361555290222168, "memory(GiB)": 77.56, "step": 99745, "token_acc": 0.5288461538461539, "train_speed(iter/s)": 1.43828 }, { "epoch": 4.273595818516773, "grad_norm": 6.621888637542725, "learning_rate": 5.118046013183858e-06, "loss": 2.6222583770751955, "memory(GiB)": 77.56, "step": 99750, "token_acc": 0.4788732394366197, "train_speed(iter/s)": 1.438284 }, { "epoch": 4.273810033846022, "grad_norm": 6.522944927215576, "learning_rate": 5.115080398080174e-06, "loss": 2.205311584472656, "memory(GiB)": 77.56, "step": 99755, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 1.438284 }, { "epoch": 4.274024249175271, "grad_norm": 5.84501838684082, "learning_rate": 5.112115596108918e-06, "loss": 2.0059930801391603, "memory(GiB)": 77.56, "step": 99760, "token_acc": 0.5326460481099656, "train_speed(iter/s)": 1.438284 }, { "epoch": 4.27423846450452, "grad_norm": 5.14306116104126, "learning_rate": 5.109151607323792e-06, "loss": 1.9309453964233398, "memory(GiB)": 77.56, "step": 99765, "token_acc": 0.588, "train_speed(iter/s)": 1.438288 }, { "epoch": 4.274452679833769, "grad_norm": 6.364627361297607, "learning_rate": 5.1061884317784855e-06, "loss": 2.3918811798095705, "memory(GiB)": 77.56, "step": 99770, "token_acc": 0.49158249158249157, "train_speed(iter/s)": 1.438289 }, { "epoch": 4.274666895163018, "grad_norm": 7.238813877105713, "learning_rate": 5.103226069526701e-06, "loss": 2.0740615844726564, "memory(GiB)": 77.56, "step": 99775, "token_acc": 0.5627705627705628, "train_speed(iter/s)": 1.438284 }, { "epoch": 4.274881110492267, "grad_norm": 6.310179233551025, "learning_rate": 5.100264520622089e-06, "loss": 2.251984405517578, "memory(GiB)": 77.56, "step": 99780, "token_acc": 0.5387205387205387, "train_speed(iter/s)": 1.438312 }, { "epoch": 4.275095325821516, "grad_norm": 5.881538391113281, "learning_rate": 5.097303785118307e-06, "loss": 2.276052474975586, "memory(GiB)": 77.56, "step": 99785, "token_acc": 0.49019607843137253, "train_speed(iter/s)": 1.43831 }, { "epoch": 4.275309541150765, "grad_norm": 6.447880268096924, "learning_rate": 5.09434386306899e-06, "loss": 2.420423889160156, "memory(GiB)": 77.56, "step": 99790, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.438302 }, { "epoch": 4.2755237564800135, "grad_norm": 4.215697765350342, "learning_rate": 5.09138475452775e-06, "loss": 2.231801223754883, "memory(GiB)": 77.56, "step": 99795, "token_acc": 0.5253164556962026, "train_speed(iter/s)": 1.438266 }, { "epoch": 4.275737971809263, "grad_norm": 7.529350757598877, "learning_rate": 5.088426459548207e-06, "loss": 2.0099105834960938, "memory(GiB)": 77.56, "step": 99800, "token_acc": 0.49224806201550386, "train_speed(iter/s)": 1.438277 }, { "epoch": 4.275952187138512, "grad_norm": 11.920438766479492, "learning_rate": 5.085468978183955e-06, "loss": 2.4065725326538088, "memory(GiB)": 77.56, "step": 99805, "token_acc": 0.46808510638297873, "train_speed(iter/s)": 1.438287 }, { "epoch": 4.27616640246776, "grad_norm": 4.699000835418701, "learning_rate": 5.082512310488563e-06, "loss": 1.7878015518188477, "memory(GiB)": 77.56, "step": 99810, "token_acc": 0.5902777777777778, "train_speed(iter/s)": 1.438268 }, { "epoch": 4.27638061779701, "grad_norm": 6.252408027648926, "learning_rate": 5.079556456515599e-06, "loss": 2.4323404312133787, "memory(GiB)": 77.56, "step": 99815, "token_acc": 0.4633333333333333, "train_speed(iter/s)": 1.438267 }, { "epoch": 4.276594833126259, "grad_norm": 4.815014362335205, "learning_rate": 5.0766014163185935e-06, "loss": 2.3686960220336912, "memory(GiB)": 77.56, "step": 99820, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 1.438279 }, { "epoch": 4.276809048455507, "grad_norm": 8.460524559020996, "learning_rate": 5.073647189951109e-06, "loss": 2.3323667526245115, "memory(GiB)": 77.56, "step": 99825, "token_acc": 0.496, "train_speed(iter/s)": 1.438276 }, { "epoch": 4.2770232637847565, "grad_norm": 6.189655303955078, "learning_rate": 5.07069377746664e-06, "loss": 2.2044851303100588, "memory(GiB)": 77.56, "step": 99830, "token_acc": 0.55859375, "train_speed(iter/s)": 1.438289 }, { "epoch": 4.277237479114006, "grad_norm": 7.330394268035889, "learning_rate": 5.067741178918711e-06, "loss": 2.4030878067016603, "memory(GiB)": 77.56, "step": 99835, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.4383 }, { "epoch": 4.277451694443254, "grad_norm": 8.211701393127441, "learning_rate": 5.0647893943607975e-06, "loss": 1.9339447021484375, "memory(GiB)": 77.56, "step": 99840, "token_acc": 0.5652173913043478, "train_speed(iter/s)": 1.438315 }, { "epoch": 4.277665909772503, "grad_norm": 5.901163578033447, "learning_rate": 5.061838423846377e-06, "loss": 2.2432674407958983, "memory(GiB)": 77.56, "step": 99845, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 1.438325 }, { "epoch": 4.277880125101753, "grad_norm": 6.125593662261963, "learning_rate": 5.058888267428913e-06, "loss": 2.3655691146850586, "memory(GiB)": 77.56, "step": 99850, "token_acc": 0.5080385852090032, "train_speed(iter/s)": 1.438347 }, { "epoch": 4.278094340431001, "grad_norm": 7.210329532623291, "learning_rate": 5.055938925161835e-06, "loss": 2.3708343505859375, "memory(GiB)": 77.56, "step": 99855, "token_acc": 0.4660493827160494, "train_speed(iter/s)": 1.438351 }, { "epoch": 4.27830855576025, "grad_norm": 6.790513515472412, "learning_rate": 5.052990397098589e-06, "loss": 2.2558460235595703, "memory(GiB)": 77.56, "step": 99860, "token_acc": 0.47648902821316613, "train_speed(iter/s)": 1.43836 }, { "epoch": 4.2785227710894995, "grad_norm": 6.5188798904418945, "learning_rate": 5.050042683292589e-06, "loss": 2.4967369079589843, "memory(GiB)": 77.56, "step": 99865, "token_acc": 0.49480968858131485, "train_speed(iter/s)": 1.43837 }, { "epoch": 4.278736986418748, "grad_norm": 7.214264869689941, "learning_rate": 5.047095783797234e-06, "loss": 2.3858945846557615, "memory(GiB)": 77.56, "step": 99870, "token_acc": 0.4662379421221865, "train_speed(iter/s)": 1.438385 }, { "epoch": 4.278951201747997, "grad_norm": 7.626358509063721, "learning_rate": 5.044149698665906e-06, "loss": 2.491080474853516, "memory(GiB)": 77.56, "step": 99875, "token_acc": 0.4726027397260274, "train_speed(iter/s)": 1.438398 }, { "epoch": 4.279165417077246, "grad_norm": 7.813310146331787, "learning_rate": 5.041204427951968e-06, "loss": 2.2463138580322264, "memory(GiB)": 77.56, "step": 99880, "token_acc": 0.5541666666666667, "train_speed(iter/s)": 1.438414 }, { "epoch": 4.279379632406495, "grad_norm": 7.381002902984619, "learning_rate": 5.0382599717087974e-06, "loss": 2.4186691284179687, "memory(GiB)": 77.56, "step": 99885, "token_acc": 0.47865853658536583, "train_speed(iter/s)": 1.43842 }, { "epoch": 4.279593847735744, "grad_norm": 6.85996675491333, "learning_rate": 5.035316329989725e-06, "loss": 2.375261688232422, "memory(GiB)": 77.56, "step": 99890, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 1.438439 }, { "epoch": 4.279808063064993, "grad_norm": 6.606855392456055, "learning_rate": 5.03237350284807e-06, "loss": 2.1992351531982424, "memory(GiB)": 77.56, "step": 99895, "token_acc": 0.5296167247386759, "train_speed(iter/s)": 1.438452 }, { "epoch": 4.280022278394242, "grad_norm": 8.225568771362305, "learning_rate": 5.029431490337156e-06, "loss": 2.110148048400879, "memory(GiB)": 77.56, "step": 99900, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.438458 }, { "epoch": 4.280236493723491, "grad_norm": 6.616367816925049, "learning_rate": 5.026490292510283e-06, "loss": 2.4189706802368165, "memory(GiB)": 77.56, "step": 99905, "token_acc": 0.4870848708487085, "train_speed(iter/s)": 1.438453 }, { "epoch": 4.28045070905274, "grad_norm": 5.795795917510986, "learning_rate": 5.023549909420722e-06, "loss": 2.2419610977172852, "memory(GiB)": 77.56, "step": 99910, "token_acc": 0.5198863636363636, "train_speed(iter/s)": 1.438447 }, { "epoch": 4.2806649243819885, "grad_norm": 7.112375736236572, "learning_rate": 5.02061034112174e-06, "loss": 2.3884592056274414, "memory(GiB)": 77.56, "step": 99915, "token_acc": 0.49851632047477745, "train_speed(iter/s)": 1.438437 }, { "epoch": 4.280879139711238, "grad_norm": 6.334680080413818, "learning_rate": 5.0176715876666e-06, "loss": 1.8271564483642577, "memory(GiB)": 77.56, "step": 99920, "token_acc": 0.6336206896551724, "train_speed(iter/s)": 1.438451 }, { "epoch": 4.281093355040487, "grad_norm": 6.141162872314453, "learning_rate": 5.014733649108538e-06, "loss": 2.3352020263671873, "memory(GiB)": 77.56, "step": 99925, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 1.438447 }, { "epoch": 4.281307570369735, "grad_norm": 7.723939418792725, "learning_rate": 5.011796525500778e-06, "loss": 2.5333572387695313, "memory(GiB)": 77.56, "step": 99930, "token_acc": 0.46355685131195334, "train_speed(iter/s)": 1.438454 }, { "epoch": 4.281521785698985, "grad_norm": 5.576572895050049, "learning_rate": 5.008860216896527e-06, "loss": 2.1228477478027346, "memory(GiB)": 77.56, "step": 99935, "token_acc": 0.5338078291814946, "train_speed(iter/s)": 1.438457 }, { "epoch": 4.281736001028234, "grad_norm": 7.131587505340576, "learning_rate": 5.005924723348965e-06, "loss": 1.948977279663086, "memory(GiB)": 77.56, "step": 99940, "token_acc": 0.5807692307692308, "train_speed(iter/s)": 1.438464 }, { "epoch": 4.281950216357482, "grad_norm": 6.809156894683838, "learning_rate": 5.002990044911299e-06, "loss": 2.086215019226074, "memory(GiB)": 77.56, "step": 99945, "token_acc": 0.5305466237942122, "train_speed(iter/s)": 1.438466 }, { "epoch": 4.2821644316867316, "grad_norm": 5.828101634979248, "learning_rate": 5.000056181636676e-06, "loss": 2.0575798034667967, "memory(GiB)": 77.56, "step": 99950, "token_acc": 0.5777777777777777, "train_speed(iter/s)": 1.438452 }, { "epoch": 4.282378647015981, "grad_norm": 9.026351928710938, "learning_rate": 4.9971231335782465e-06, "loss": 2.1061494827270506, "memory(GiB)": 77.56, "step": 99955, "token_acc": 0.5261044176706827, "train_speed(iter/s)": 1.438473 }, { "epoch": 4.282592862345229, "grad_norm": 6.032985210418701, "learning_rate": 4.9941909007891476e-06, "loss": 2.1595590591430662, "memory(GiB)": 77.56, "step": 99960, "token_acc": 0.5440613026819924, "train_speed(iter/s)": 1.438473 }, { "epoch": 4.282807077674478, "grad_norm": 9.163704872131348, "learning_rate": 4.991259483322491e-06, "loss": 2.091373825073242, "memory(GiB)": 77.56, "step": 99965, "token_acc": 0.5257731958762887, "train_speed(iter/s)": 1.438489 }, { "epoch": 4.283021293003728, "grad_norm": 11.740559577941895, "learning_rate": 4.988328881231402e-06, "loss": 2.576679992675781, "memory(GiB)": 77.56, "step": 99970, "token_acc": 0.5202312138728323, "train_speed(iter/s)": 1.438502 }, { "epoch": 4.283235508332976, "grad_norm": 6.126219749450684, "learning_rate": 4.985399094568949e-06, "loss": 2.186813735961914, "memory(GiB)": 77.56, "step": 99975, "token_acc": 0.4968944099378882, "train_speed(iter/s)": 1.438499 }, { "epoch": 4.283449723662225, "grad_norm": 6.322460174560547, "learning_rate": 4.982470123388228e-06, "loss": 2.2531089782714844, "memory(GiB)": 77.56, "step": 99980, "token_acc": 0.5354330708661418, "train_speed(iter/s)": 1.438503 }, { "epoch": 4.283663938991475, "grad_norm": 6.5808820724487305, "learning_rate": 4.97954196774229e-06, "loss": 2.277579116821289, "memory(GiB)": 77.56, "step": 99985, "token_acc": 0.47703180212014135, "train_speed(iter/s)": 1.438522 }, { "epoch": 4.283878154320723, "grad_norm": 7.900747776031494, "learning_rate": 4.976614627684184e-06, "loss": 2.1548343658447267, "memory(GiB)": 77.56, "step": 99990, "token_acc": 0.532520325203252, "train_speed(iter/s)": 1.43854 }, { "epoch": 4.284092369649972, "grad_norm": 5.826950550079346, "learning_rate": 4.973688103266938e-06, "loss": 2.300174522399902, "memory(GiB)": 77.56, "step": 99995, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 1.438535 }, { "epoch": 4.2843065849792215, "grad_norm": 5.82090425491333, "learning_rate": 4.97076239454356e-06, "loss": 2.033364486694336, "memory(GiB)": 77.56, "step": 100000, "token_acc": 0.5442622950819672, "train_speed(iter/s)": 1.438538 }, { "epoch": 4.2843065849792215, "eval_loss": 2.166041612625122, "eval_runtime": 14.5673, "eval_samples_per_second": 6.865, "eval_steps_per_second": 6.865, "eval_token_acc": 0.4884393063583815, "step": 100000 }, { "epoch": 4.28452080030847, "grad_norm": 6.366593837738037, "learning_rate": 4.967837501567069e-06, "loss": 2.4353309631347657, "memory(GiB)": 77.56, "step": 100005, "token_acc": 0.49745158002038736, "train_speed(iter/s)": 1.43823 }, { "epoch": 4.284735015637719, "grad_norm": 8.043924331665039, "learning_rate": 4.964913424390449e-06, "loss": 2.4790821075439453, "memory(GiB)": 77.56, "step": 100010, "token_acc": 0.47115384615384615, "train_speed(iter/s)": 1.438242 }, { "epoch": 4.284949230966968, "grad_norm": 6.464168071746826, "learning_rate": 4.961990163066671e-06, "loss": 2.2506282806396483, "memory(GiB)": 77.56, "step": 100015, "token_acc": 0.5129151291512916, "train_speed(iter/s)": 1.438234 }, { "epoch": 4.285163446296217, "grad_norm": 6.957046031951904, "learning_rate": 4.959067717648685e-06, "loss": 2.345835494995117, "memory(GiB)": 77.56, "step": 100020, "token_acc": 0.4944649446494465, "train_speed(iter/s)": 1.438209 }, { "epoch": 4.285377661625466, "grad_norm": 8.82361125946045, "learning_rate": 4.956146088189434e-06, "loss": 2.1141170501708983, "memory(GiB)": 77.56, "step": 100025, "token_acc": 0.5229357798165137, "train_speed(iter/s)": 1.438228 }, { "epoch": 4.285591876954715, "grad_norm": 4.408382892608643, "learning_rate": 4.953225274741857e-06, "loss": 2.1523441314697265, "memory(GiB)": 77.56, "step": 100030, "token_acc": 0.5242165242165242, "train_speed(iter/s)": 1.43824 }, { "epoch": 4.285806092283964, "grad_norm": 8.105433464050293, "learning_rate": 4.9503052773588635e-06, "loss": 2.459971046447754, "memory(GiB)": 77.56, "step": 100035, "token_acc": 0.5089820359281437, "train_speed(iter/s)": 1.438239 }, { "epoch": 4.286020307613213, "grad_norm": 6.042153835296631, "learning_rate": 4.947386096093337e-06, "loss": 2.1405755996704103, "memory(GiB)": 77.56, "step": 100040, "token_acc": 0.5802469135802469, "train_speed(iter/s)": 1.438237 }, { "epoch": 4.286234522942462, "grad_norm": 5.343184947967529, "learning_rate": 4.944467730998187e-06, "loss": 2.4361061096191405, "memory(GiB)": 77.56, "step": 100045, "token_acc": 0.5462555066079295, "train_speed(iter/s)": 1.438252 }, { "epoch": 4.2864487382717105, "grad_norm": 5.78253698348999, "learning_rate": 4.94155018212627e-06, "loss": 2.562644195556641, "memory(GiB)": 77.56, "step": 100050, "token_acc": 0.47796610169491527, "train_speed(iter/s)": 1.438264 }, { "epoch": 4.28666295360096, "grad_norm": 6.665125846862793, "learning_rate": 4.9386334495304384e-06, "loss": 2.2552669525146483, "memory(GiB)": 77.56, "step": 100055, "token_acc": 0.5592105263157895, "train_speed(iter/s)": 1.438284 }, { "epoch": 4.286877168930209, "grad_norm": 7.766232490539551, "learning_rate": 4.935717533263523e-06, "loss": 2.300341987609863, "memory(GiB)": 77.56, "step": 100060, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 1.438295 }, { "epoch": 4.287091384259457, "grad_norm": 5.08329963684082, "learning_rate": 4.93280243337837e-06, "loss": 2.548065185546875, "memory(GiB)": 77.56, "step": 100065, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 1.438307 }, { "epoch": 4.287305599588707, "grad_norm": 7.803770542144775, "learning_rate": 4.929888149927775e-06, "loss": 2.387367820739746, "memory(GiB)": 77.56, "step": 100070, "token_acc": 0.4913494809688581, "train_speed(iter/s)": 1.438299 }, { "epoch": 4.287519814917956, "grad_norm": 5.919239044189453, "learning_rate": 4.926974682964536e-06, "loss": 2.143289566040039, "memory(GiB)": 77.56, "step": 100075, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438321 }, { "epoch": 4.287734030247204, "grad_norm": 6.287697792053223, "learning_rate": 4.924062032541432e-06, "loss": 2.140834426879883, "memory(GiB)": 77.56, "step": 100080, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.43833 }, { "epoch": 4.2879482455764535, "grad_norm": 5.852848052978516, "learning_rate": 4.921150198711217e-06, "loss": 2.340757369995117, "memory(GiB)": 77.56, "step": 100085, "token_acc": 0.5469387755102041, "train_speed(iter/s)": 1.438334 }, { "epoch": 4.288162460905703, "grad_norm": 6.5657782554626465, "learning_rate": 4.9182391815266685e-06, "loss": 2.3492603302001953, "memory(GiB)": 77.56, "step": 100090, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.438349 }, { "epoch": 4.288376676234951, "grad_norm": 7.167140960693359, "learning_rate": 4.915328981040501e-06, "loss": 2.291988563537598, "memory(GiB)": 77.56, "step": 100095, "token_acc": 0.5, "train_speed(iter/s)": 1.438348 }, { "epoch": 4.2885908915642, "grad_norm": 6.007768630981445, "learning_rate": 4.912419597305446e-06, "loss": 2.480708122253418, "memory(GiB)": 77.56, "step": 100100, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 1.438346 }, { "epoch": 4.28880510689345, "grad_norm": 5.0034565925598145, "learning_rate": 4.909511030374209e-06, "loss": 2.386271858215332, "memory(GiB)": 77.56, "step": 100105, "token_acc": 0.4861111111111111, "train_speed(iter/s)": 1.438368 }, { "epoch": 4.289019322222698, "grad_norm": 7.46071195602417, "learning_rate": 4.906603280299471e-06, "loss": 2.638846588134766, "memory(GiB)": 77.56, "step": 100110, "token_acc": 0.4553846153846154, "train_speed(iter/s)": 1.438382 }, { "epoch": 4.289233537551947, "grad_norm": 8.779317855834961, "learning_rate": 4.903696347133912e-06, "loss": 2.289180946350098, "memory(GiB)": 77.56, "step": 100115, "token_acc": 0.5339805825242718, "train_speed(iter/s)": 1.438385 }, { "epoch": 4.2894477528811965, "grad_norm": 7.734889030456543, "learning_rate": 4.900790230930191e-06, "loss": 2.256235694885254, "memory(GiB)": 77.56, "step": 100120, "token_acc": 0.541958041958042, "train_speed(iter/s)": 1.43839 }, { "epoch": 4.289661968210445, "grad_norm": 6.054869651794434, "learning_rate": 4.897884931740976e-06, "loss": 2.2363828659057616, "memory(GiB)": 77.56, "step": 100125, "token_acc": 0.5494505494505495, "train_speed(iter/s)": 1.438407 }, { "epoch": 4.289876183539694, "grad_norm": 7.103828430175781, "learning_rate": 4.894980449618886e-06, "loss": 2.4280017852783202, "memory(GiB)": 77.56, "step": 100130, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 1.438403 }, { "epoch": 4.290090398868943, "grad_norm": 4.5956711769104, "learning_rate": 4.892076784616534e-06, "loss": 2.622050476074219, "memory(GiB)": 77.56, "step": 100135, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.438415 }, { "epoch": 4.290304614198192, "grad_norm": 6.303820610046387, "learning_rate": 4.889173936786523e-06, "loss": 2.3801748275756838, "memory(GiB)": 77.56, "step": 100140, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 1.438387 }, { "epoch": 4.290518829527441, "grad_norm": 6.977520942687988, "learning_rate": 4.886271906181439e-06, "loss": 2.4256723403930662, "memory(GiB)": 77.56, "step": 100145, "token_acc": 0.4748427672955975, "train_speed(iter/s)": 1.438392 }, { "epoch": 4.29073304485669, "grad_norm": 7.3222575187683105, "learning_rate": 4.883370692853867e-06, "loss": 2.3569026947021485, "memory(GiB)": 77.56, "step": 100150, "token_acc": 0.5034965034965035, "train_speed(iter/s)": 1.438397 }, { "epoch": 4.290947260185939, "grad_norm": 5.459466934204102, "learning_rate": 4.8804702968563596e-06, "loss": 2.3904865264892576, "memory(GiB)": 77.56, "step": 100155, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 1.438407 }, { "epoch": 4.291161475515188, "grad_norm": 5.739074230194092, "learning_rate": 4.877570718241453e-06, "loss": 2.444879913330078, "memory(GiB)": 77.56, "step": 100160, "token_acc": 0.4815950920245399, "train_speed(iter/s)": 1.438404 }, { "epoch": 4.291375690844437, "grad_norm": 4.854170799255371, "learning_rate": 4.8746719570616885e-06, "loss": 1.8629764556884765, "memory(GiB)": 77.56, "step": 100165, "token_acc": 0.5481481481481482, "train_speed(iter/s)": 1.438423 }, { "epoch": 4.2915899061736855, "grad_norm": 5.591575622558594, "learning_rate": 4.871774013369556e-06, "loss": 2.2301511764526367, "memory(GiB)": 77.56, "step": 100170, "token_acc": 0.5173745173745173, "train_speed(iter/s)": 1.438429 }, { "epoch": 4.291804121502935, "grad_norm": 6.782240867614746, "learning_rate": 4.868876887217583e-06, "loss": 2.4777896881103514, "memory(GiB)": 77.56, "step": 100175, "token_acc": 0.4808362369337979, "train_speed(iter/s)": 1.438436 }, { "epoch": 4.292018336832184, "grad_norm": 6.378512859344482, "learning_rate": 4.865980578658241e-06, "loss": 2.316455268859863, "memory(GiB)": 77.56, "step": 100180, "token_acc": 0.468944099378882, "train_speed(iter/s)": 1.438451 }, { "epoch": 4.292232552161432, "grad_norm": 6.03173828125, "learning_rate": 4.8630850877440014e-06, "loss": 2.268305778503418, "memory(GiB)": 77.56, "step": 100185, "token_acc": 0.5370919881305638, "train_speed(iter/s)": 1.438449 }, { "epoch": 4.292446767490682, "grad_norm": 5.537770748138428, "learning_rate": 4.86019041452731e-06, "loss": 1.9870378494262695, "memory(GiB)": 77.56, "step": 100190, "token_acc": 0.5748987854251012, "train_speed(iter/s)": 1.43845 }, { "epoch": 4.292660982819931, "grad_norm": 5.093726634979248, "learning_rate": 4.8572965590606204e-06, "loss": 2.1830625534057617, "memory(GiB)": 77.56, "step": 100195, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.438463 }, { "epoch": 4.292875198149179, "grad_norm": 8.003413200378418, "learning_rate": 4.8544035213963505e-06, "loss": 2.4027179718017577, "memory(GiB)": 77.56, "step": 100200, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.438477 }, { "epoch": 4.2930894134784285, "grad_norm": 7.073882102966309, "learning_rate": 4.851511301586903e-06, "loss": 2.0694869995117187, "memory(GiB)": 77.56, "step": 100205, "token_acc": 0.5134099616858238, "train_speed(iter/s)": 1.438485 }, { "epoch": 4.293303628807678, "grad_norm": 8.690638542175293, "learning_rate": 4.848619899684686e-06, "loss": 2.4516393661499025, "memory(GiB)": 77.56, "step": 100210, "token_acc": 0.503125, "train_speed(iter/s)": 1.438481 }, { "epoch": 4.293517844136926, "grad_norm": 8.586088180541992, "learning_rate": 4.845729315742081e-06, "loss": 2.0516942977905273, "memory(GiB)": 77.56, "step": 100215, "token_acc": 0.5545851528384279, "train_speed(iter/s)": 1.438497 }, { "epoch": 4.293732059466175, "grad_norm": 5.63650369644165, "learning_rate": 4.8428395498114455e-06, "loss": 2.5344547271728515, "memory(GiB)": 77.56, "step": 100220, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438481 }, { "epoch": 4.293946274795425, "grad_norm": 6.443324565887451, "learning_rate": 4.839950601945131e-06, "loss": 2.1202959060668944, "memory(GiB)": 77.56, "step": 100225, "token_acc": 0.5425531914893617, "train_speed(iter/s)": 1.43848 }, { "epoch": 4.294160490124673, "grad_norm": 6.104748725891113, "learning_rate": 4.837062472195469e-06, "loss": 2.277557373046875, "memory(GiB)": 77.56, "step": 100230, "token_acc": 0.5344129554655871, "train_speed(iter/s)": 1.438481 }, { "epoch": 4.294374705453922, "grad_norm": 5.983887195587158, "learning_rate": 4.83417516061479e-06, "loss": 2.1089056015014647, "memory(GiB)": 77.56, "step": 100235, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 1.438491 }, { "epoch": 4.294588920783172, "grad_norm": 6.209619998931885, "learning_rate": 4.831288667255401e-06, "loss": 2.2133438110351564, "memory(GiB)": 77.56, "step": 100240, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.438501 }, { "epoch": 4.29480313611242, "grad_norm": 6.0841851234436035, "learning_rate": 4.82840299216959e-06, "loss": 2.4208400726318358, "memory(GiB)": 77.56, "step": 100245, "token_acc": 0.4966887417218543, "train_speed(iter/s)": 1.438505 }, { "epoch": 4.295017351441669, "grad_norm": 5.9809980392456055, "learning_rate": 4.8255181354096345e-06, "loss": 2.4528121948242188, "memory(GiB)": 77.56, "step": 100250, "token_acc": 0.477124183006536, "train_speed(iter/s)": 1.438512 }, { "epoch": 4.295231566770918, "grad_norm": 5.294595718383789, "learning_rate": 4.822634097027789e-06, "loss": 2.408539581298828, "memory(GiB)": 77.56, "step": 100255, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.438519 }, { "epoch": 4.295445782100167, "grad_norm": 5.184760570526123, "learning_rate": 4.819750877076301e-06, "loss": 2.137157440185547, "memory(GiB)": 77.56, "step": 100260, "token_acc": 0.5254901960784314, "train_speed(iter/s)": 1.438515 }, { "epoch": 4.295659997429416, "grad_norm": 5.924389362335205, "learning_rate": 4.816868475607411e-06, "loss": 2.3872756958007812, "memory(GiB)": 77.56, "step": 100265, "token_acc": 0.4843205574912892, "train_speed(iter/s)": 1.438522 }, { "epoch": 4.295874212758665, "grad_norm": 5.923495292663574, "learning_rate": 4.813986892673339e-06, "loss": 2.3210147857666015, "memory(GiB)": 77.56, "step": 100270, "token_acc": 0.4724137931034483, "train_speed(iter/s)": 1.43852 }, { "epoch": 4.296088428087914, "grad_norm": 6.01570463180542, "learning_rate": 4.811106128326281e-06, "loss": 2.3359352111816407, "memory(GiB)": 77.56, "step": 100275, "token_acc": 0.5047619047619047, "train_speed(iter/s)": 1.438513 }, { "epoch": 4.296302643417163, "grad_norm": 6.0657525062561035, "learning_rate": 4.808226182618431e-06, "loss": 1.992770004272461, "memory(GiB)": 77.56, "step": 100280, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.438514 }, { "epoch": 4.296516858746412, "grad_norm": 4.778995037078857, "learning_rate": 4.8053470556019555e-06, "loss": 2.1875919342041015, "memory(GiB)": 77.56, "step": 100285, "token_acc": 0.5047021943573667, "train_speed(iter/s)": 1.438521 }, { "epoch": 4.296731074075661, "grad_norm": 7.304758071899414, "learning_rate": 4.802468747329003e-06, "loss": 2.5573429107666015, "memory(GiB)": 77.56, "step": 100290, "token_acc": 0.47440273037542663, "train_speed(iter/s)": 1.438547 }, { "epoch": 4.29694528940491, "grad_norm": 10.679129600524902, "learning_rate": 4.799591257851738e-06, "loss": 2.335013198852539, "memory(GiB)": 77.56, "step": 100295, "token_acc": 0.48013245033112584, "train_speed(iter/s)": 1.438551 }, { "epoch": 4.297159504734159, "grad_norm": 6.13103723526001, "learning_rate": 4.796714587222278e-06, "loss": 2.4073814392089843, "memory(GiB)": 77.56, "step": 100300, "token_acc": 0.531986531986532, "train_speed(iter/s)": 1.438565 }, { "epoch": 4.2973737200634075, "grad_norm": 5.2924675941467285, "learning_rate": 4.7938387354927396e-06, "loss": 2.375481605529785, "memory(GiB)": 77.56, "step": 100305, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.438582 }, { "epoch": 4.297587935392657, "grad_norm": 6.4611124992370605, "learning_rate": 4.790963702715218e-06, "loss": 2.6847564697265627, "memory(GiB)": 77.56, "step": 100310, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.438577 }, { "epoch": 4.297802150721906, "grad_norm": 6.762071132659912, "learning_rate": 4.788089488941788e-06, "loss": 2.3069812774658205, "memory(GiB)": 77.56, "step": 100315, "token_acc": 0.5241379310344828, "train_speed(iter/s)": 1.438588 }, { "epoch": 4.298016366051154, "grad_norm": 7.382822036743164, "learning_rate": 4.785216094224543e-06, "loss": 2.3777690887451173, "memory(GiB)": 77.56, "step": 100320, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 1.438596 }, { "epoch": 4.298230581380404, "grad_norm": 6.4481048583984375, "learning_rate": 4.782343518615517e-06, "loss": 2.204959487915039, "memory(GiB)": 77.56, "step": 100325, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438604 }, { "epoch": 4.298444796709653, "grad_norm": 6.603975296020508, "learning_rate": 4.779471762166759e-06, "loss": 2.197288703918457, "memory(GiB)": 77.56, "step": 100330, "token_acc": 0.550185873605948, "train_speed(iter/s)": 1.438593 }, { "epoch": 4.298659012038901, "grad_norm": 7.149195194244385, "learning_rate": 4.776600824930283e-06, "loss": 2.354528045654297, "memory(GiB)": 77.56, "step": 100335, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 1.438593 }, { "epoch": 4.2988732273681505, "grad_norm": 6.460785865783691, "learning_rate": 4.773730706958113e-06, "loss": 2.0070682525634767, "memory(GiB)": 77.56, "step": 100340, "token_acc": 0.5463576158940397, "train_speed(iter/s)": 1.43858 }, { "epoch": 4.2990874426974, "grad_norm": 4.652647495269775, "learning_rate": 4.770861408302235e-06, "loss": 2.4208192825317383, "memory(GiB)": 77.56, "step": 100345, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 1.438566 }, { "epoch": 4.299301658026648, "grad_norm": 6.717465877532959, "learning_rate": 4.767992929014625e-06, "loss": 2.0254501342773437, "memory(GiB)": 77.56, "step": 100350, "token_acc": 0.5609756097560976, "train_speed(iter/s)": 1.43856 }, { "epoch": 4.299515873355897, "grad_norm": 6.3007683753967285, "learning_rate": 4.765125269147264e-06, "loss": 2.3138801574707033, "memory(GiB)": 77.56, "step": 100355, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 1.438565 }, { "epoch": 4.299730088685147, "grad_norm": 5.693936824798584, "learning_rate": 4.762258428752092e-06, "loss": 2.073953628540039, "memory(GiB)": 77.56, "step": 100360, "token_acc": 0.5366795366795367, "train_speed(iter/s)": 1.438572 }, { "epoch": 4.299944304014395, "grad_norm": 6.544804096221924, "learning_rate": 4.759392407881041e-06, "loss": 2.211111068725586, "memory(GiB)": 77.56, "step": 100365, "token_acc": 0.5318471337579618, "train_speed(iter/s)": 1.438561 }, { "epoch": 4.300158519343644, "grad_norm": 6.404198169708252, "learning_rate": 4.756527206586037e-06, "loss": 2.1870901107788088, "memory(GiB)": 77.56, "step": 100370, "token_acc": 0.5134228187919463, "train_speed(iter/s)": 1.438564 }, { "epoch": 4.3003727346728935, "grad_norm": 7.706130504608154, "learning_rate": 4.7536628249189805e-06, "loss": 2.2626474380493162, "memory(GiB)": 77.56, "step": 100375, "token_acc": 0.510548523206751, "train_speed(iter/s)": 1.438566 }, { "epoch": 4.300586950002142, "grad_norm": 6.014760494232178, "learning_rate": 4.750799262931771e-06, "loss": 2.1988319396972655, "memory(GiB)": 77.56, "step": 100380, "token_acc": 0.5226480836236934, "train_speed(iter/s)": 1.438571 }, { "epoch": 4.300801165331391, "grad_norm": 6.171198844909668, "learning_rate": 4.747936520676277e-06, "loss": 2.14705753326416, "memory(GiB)": 77.56, "step": 100385, "token_acc": 0.5104895104895105, "train_speed(iter/s)": 1.438571 }, { "epoch": 4.30101538066064, "grad_norm": 6.147973537445068, "learning_rate": 4.745074598204369e-06, "loss": 2.2761959075927733, "memory(GiB)": 77.56, "step": 100390, "token_acc": 0.5198863636363636, "train_speed(iter/s)": 1.438563 }, { "epoch": 4.301229595989889, "grad_norm": 6.9807515144348145, "learning_rate": 4.742213495567882e-06, "loss": 2.3244550704956053, "memory(GiB)": 77.56, "step": 100395, "token_acc": 0.4889589905362776, "train_speed(iter/s)": 1.438567 }, { "epoch": 4.301443811319138, "grad_norm": 4.6973042488098145, "learning_rate": 4.739353212818659e-06, "loss": 2.414212226867676, "memory(GiB)": 77.56, "step": 100400, "token_acc": 0.5088235294117647, "train_speed(iter/s)": 1.438576 }, { "epoch": 4.301658026648387, "grad_norm": 6.312798976898193, "learning_rate": 4.736493750008497e-06, "loss": 2.2111274719238283, "memory(GiB)": 77.56, "step": 100405, "token_acc": 0.5231788079470199, "train_speed(iter/s)": 1.438588 }, { "epoch": 4.301872241977636, "grad_norm": 6.337025165557861, "learning_rate": 4.7336351071892105e-06, "loss": 2.433379554748535, "memory(GiB)": 77.56, "step": 100410, "token_acc": 0.5163636363636364, "train_speed(iter/s)": 1.438607 }, { "epoch": 4.302086457306885, "grad_norm": 6.201536178588867, "learning_rate": 4.730777284412596e-06, "loss": 2.057038497924805, "memory(GiB)": 77.56, "step": 100415, "token_acc": 0.5830258302583026, "train_speed(iter/s)": 1.438611 }, { "epoch": 4.302300672636134, "grad_norm": 5.896797180175781, "learning_rate": 4.727920281730425e-06, "loss": 2.0697708129882812, "memory(GiB)": 77.56, "step": 100420, "token_acc": 0.5719844357976653, "train_speed(iter/s)": 1.43862 }, { "epoch": 4.3025148879653825, "grad_norm": 5.5664873123168945, "learning_rate": 4.7250640991944375e-06, "loss": 2.4261806488037108, "memory(GiB)": 77.56, "step": 100425, "token_acc": 0.48059701492537316, "train_speed(iter/s)": 1.43862 }, { "epoch": 4.302729103294632, "grad_norm": 5.498995780944824, "learning_rate": 4.722208736856387e-06, "loss": 2.5309362411499023, "memory(GiB)": 77.56, "step": 100430, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.438622 }, { "epoch": 4.302943318623881, "grad_norm": 6.2231526374816895, "learning_rate": 4.719354194767994e-06, "loss": 2.106704330444336, "memory(GiB)": 77.56, "step": 100435, "token_acc": 0.5170068027210885, "train_speed(iter/s)": 1.438638 }, { "epoch": 4.303157533953129, "grad_norm": 7.580305099487305, "learning_rate": 4.716500472980983e-06, "loss": 2.345419692993164, "memory(GiB)": 77.56, "step": 100440, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.438651 }, { "epoch": 4.303371749282379, "grad_norm": 7.637293338775635, "learning_rate": 4.713647571547048e-06, "loss": 2.1894128799438475, "memory(GiB)": 77.56, "step": 100445, "token_acc": 0.5037593984962406, "train_speed(iter/s)": 1.438649 }, { "epoch": 4.303585964611628, "grad_norm": 5.935568809509277, "learning_rate": 4.710795490517861e-06, "loss": 2.150985908508301, "memory(GiB)": 77.56, "step": 100450, "token_acc": 0.5195729537366548, "train_speed(iter/s)": 1.438666 }, { "epoch": 4.303800179940876, "grad_norm": 7.975590705871582, "learning_rate": 4.707944229945105e-06, "loss": 2.370107650756836, "memory(GiB)": 77.56, "step": 100455, "token_acc": 0.524822695035461, "train_speed(iter/s)": 1.438675 }, { "epoch": 4.3040143952701255, "grad_norm": 6.179610252380371, "learning_rate": 4.705093789880416e-06, "loss": 2.187881851196289, "memory(GiB)": 77.56, "step": 100460, "token_acc": 0.5478927203065134, "train_speed(iter/s)": 1.43868 }, { "epoch": 4.304228610599375, "grad_norm": 5.767256259918213, "learning_rate": 4.702244170375453e-06, "loss": 1.9687423706054688, "memory(GiB)": 77.56, "step": 100465, "token_acc": 0.5649122807017544, "train_speed(iter/s)": 1.438679 }, { "epoch": 4.304442825928623, "grad_norm": 6.775595188140869, "learning_rate": 4.699395371481829e-06, "loss": 1.9267642974853516, "memory(GiB)": 77.56, "step": 100470, "token_acc": 0.5648854961832062, "train_speed(iter/s)": 1.438684 }, { "epoch": 4.304657041257872, "grad_norm": 6.519343376159668, "learning_rate": 4.696547393251155e-06, "loss": 2.1873184204101563, "memory(GiB)": 77.56, "step": 100475, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.438678 }, { "epoch": 4.304871256587122, "grad_norm": 6.8834075927734375, "learning_rate": 4.693700235735021e-06, "loss": 2.5468477249145507, "memory(GiB)": 77.56, "step": 100480, "token_acc": 0.47491638795986624, "train_speed(iter/s)": 1.43867 }, { "epoch": 4.30508547191637, "grad_norm": 7.11431884765625, "learning_rate": 4.690853898985004e-06, "loss": 2.3558788299560547, "memory(GiB)": 77.56, "step": 100485, "token_acc": 0.541958041958042, "train_speed(iter/s)": 1.438675 }, { "epoch": 4.305299687245619, "grad_norm": 6.505257606506348, "learning_rate": 4.688008383052672e-06, "loss": 2.3135259628295897, "memory(GiB)": 77.56, "step": 100490, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 1.438689 }, { "epoch": 4.3055139025748685, "grad_norm": 6.5288920402526855, "learning_rate": 4.685163687989569e-06, "loss": 2.2993209838867186, "memory(GiB)": 77.56, "step": 100495, "token_acc": 0.503125, "train_speed(iter/s)": 1.438669 }, { "epoch": 4.305728117904117, "grad_norm": 5.279067039489746, "learning_rate": 4.682319813847247e-06, "loss": 2.2329715728759765, "memory(GiB)": 77.56, "step": 100500, "token_acc": 0.47904191616766467, "train_speed(iter/s)": 1.438674 }, { "epoch": 4.305728117904117, "eval_loss": 2.1699624061584473, "eval_runtime": 15.0777, "eval_samples_per_second": 6.632, "eval_steps_per_second": 6.632, "eval_token_acc": 0.4699300699300699, "step": 100500 }, { "epoch": 4.305942333233366, "grad_norm": 5.510031223297119, "learning_rate": 4.679476760677209e-06, "loss": 2.283426284790039, "memory(GiB)": 77.56, "step": 100505, "token_acc": 0.4893410852713178, "train_speed(iter/s)": 1.438351 }, { "epoch": 4.306156548562615, "grad_norm": 7.931199550628662, "learning_rate": 4.676634528530965e-06, "loss": 2.383808898925781, "memory(GiB)": 77.56, "step": 100510, "token_acc": 0.5145985401459854, "train_speed(iter/s)": 1.438348 }, { "epoch": 4.306370763891864, "grad_norm": 5.487090110778809, "learning_rate": 4.673793117460007e-06, "loss": 2.330915641784668, "memory(GiB)": 77.56, "step": 100515, "token_acc": 0.46706586826347307, "train_speed(iter/s)": 1.43836 }, { "epoch": 4.306584979221113, "grad_norm": 5.396183490753174, "learning_rate": 4.670952527515793e-06, "loss": 1.9868366241455078, "memory(GiB)": 77.56, "step": 100520, "token_acc": 0.5668016194331984, "train_speed(iter/s)": 1.438372 }, { "epoch": 4.306799194550362, "grad_norm": 7.76336145401001, "learning_rate": 4.668112758749804e-06, "loss": 2.415496826171875, "memory(GiB)": 77.56, "step": 100525, "token_acc": 0.49640287769784175, "train_speed(iter/s)": 1.438386 }, { "epoch": 4.307013409879611, "grad_norm": 5.985770225524902, "learning_rate": 4.665273811213478e-06, "loss": 2.116036224365234, "memory(GiB)": 77.56, "step": 100530, "token_acc": 0.5, "train_speed(iter/s)": 1.438396 }, { "epoch": 4.30722762520886, "grad_norm": 6.645129680633545, "learning_rate": 4.662435684958244e-06, "loss": 2.400859260559082, "memory(GiB)": 77.56, "step": 100535, "token_acc": 0.5085910652920962, "train_speed(iter/s)": 1.438401 }, { "epoch": 4.307441840538109, "grad_norm": 9.147539138793945, "learning_rate": 4.659598380035518e-06, "loss": 2.2356204986572266, "memory(GiB)": 77.56, "step": 100540, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.438412 }, { "epoch": 4.307656055867358, "grad_norm": 8.270538330078125, "learning_rate": 4.656761896496703e-06, "loss": 2.107296371459961, "memory(GiB)": 77.56, "step": 100545, "token_acc": 0.5245901639344263, "train_speed(iter/s)": 1.438416 }, { "epoch": 4.307870271196607, "grad_norm": 6.911714553833008, "learning_rate": 4.653926234393169e-06, "loss": 1.792007827758789, "memory(GiB)": 77.56, "step": 100550, "token_acc": 0.5497835497835498, "train_speed(iter/s)": 1.438426 }, { "epoch": 4.308084486525856, "grad_norm": 5.8084940910339355, "learning_rate": 4.651091393776308e-06, "loss": 2.2597923278808594, "memory(GiB)": 77.56, "step": 100555, "token_acc": 0.5229681978798587, "train_speed(iter/s)": 1.438438 }, { "epoch": 4.308298701855104, "grad_norm": 7.472497940063477, "learning_rate": 4.648257374697462e-06, "loss": 2.2256282806396483, "memory(GiB)": 77.56, "step": 100560, "token_acc": 0.5048231511254019, "train_speed(iter/s)": 1.438452 }, { "epoch": 4.308512917184354, "grad_norm": 6.116946697235107, "learning_rate": 4.645424177207985e-06, "loss": 2.4894184112548827, "memory(GiB)": 77.56, "step": 100565, "token_acc": 0.5123456790123457, "train_speed(iter/s)": 1.438463 }, { "epoch": 4.308727132513603, "grad_norm": 10.386627197265625, "learning_rate": 4.6425918013591916e-06, "loss": 2.465712547302246, "memory(GiB)": 77.56, "step": 100570, "token_acc": 0.4756554307116105, "train_speed(iter/s)": 1.43847 }, { "epoch": 4.308941347842851, "grad_norm": 5.519584655761719, "learning_rate": 4.639760247202396e-06, "loss": 2.4339309692382813, "memory(GiB)": 77.56, "step": 100575, "token_acc": 0.5269709543568465, "train_speed(iter/s)": 1.438484 }, { "epoch": 4.309155563172101, "grad_norm": 5.392373561859131, "learning_rate": 4.636929514788891e-06, "loss": 2.471701431274414, "memory(GiB)": 77.56, "step": 100580, "token_acc": 0.53125, "train_speed(iter/s)": 1.43849 }, { "epoch": 4.30936977850135, "grad_norm": 6.107878684997559, "learning_rate": 4.6340996041699665e-06, "loss": 2.0578975677490234, "memory(GiB)": 77.56, "step": 100585, "token_acc": 0.5387323943661971, "train_speed(iter/s)": 1.438492 }, { "epoch": 4.309583993830598, "grad_norm": 6.998330116271973, "learning_rate": 4.631270515396891e-06, "loss": 2.5220548629760744, "memory(GiB)": 77.56, "step": 100590, "token_acc": 0.48880597014925375, "train_speed(iter/s)": 1.438485 }, { "epoch": 4.3097982091598475, "grad_norm": 7.377869606018066, "learning_rate": 4.628442248520904e-06, "loss": 2.4531581878662108, "memory(GiB)": 77.56, "step": 100595, "token_acc": 0.4702194357366771, "train_speed(iter/s)": 1.438482 }, { "epoch": 4.310012424489097, "grad_norm": 6.266183376312256, "learning_rate": 4.625614803593248e-06, "loss": 2.3262874603271486, "memory(GiB)": 77.56, "step": 100600, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.438479 }, { "epoch": 4.310226639818345, "grad_norm": 8.184282302856445, "learning_rate": 4.622788180665133e-06, "loss": 2.115907669067383, "memory(GiB)": 77.56, "step": 100605, "token_acc": 0.527972027972028, "train_speed(iter/s)": 1.43849 }, { "epoch": 4.310440855147594, "grad_norm": 7.2758469581604, "learning_rate": 4.61996237978779e-06, "loss": 2.3972925186157226, "memory(GiB)": 77.56, "step": 100610, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.438486 }, { "epoch": 4.310655070476844, "grad_norm": 5.9413957595825195, "learning_rate": 4.6171374010123945e-06, "loss": 2.192464828491211, "memory(GiB)": 77.56, "step": 100615, "token_acc": 0.5015105740181269, "train_speed(iter/s)": 1.438507 }, { "epoch": 4.310869285806092, "grad_norm": 8.079716682434082, "learning_rate": 4.614313244390133e-06, "loss": 2.2437488555908205, "memory(GiB)": 77.56, "step": 100620, "token_acc": 0.531986531986532, "train_speed(iter/s)": 1.438519 }, { "epoch": 4.311083501135341, "grad_norm": 10.538756370544434, "learning_rate": 4.611489909972161e-06, "loss": 2.578044319152832, "memory(GiB)": 77.56, "step": 100625, "token_acc": 0.4440789473684211, "train_speed(iter/s)": 1.438522 }, { "epoch": 4.3112977164645905, "grad_norm": 8.221933364868164, "learning_rate": 4.6086673978096125e-06, "loss": 2.3713968276977537, "memory(GiB)": 77.56, "step": 100630, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.438533 }, { "epoch": 4.311511931793839, "grad_norm": 5.455938339233398, "learning_rate": 4.605845707953649e-06, "loss": 2.565324401855469, "memory(GiB)": 77.56, "step": 100635, "token_acc": 0.528169014084507, "train_speed(iter/s)": 1.438541 }, { "epoch": 4.311726147123088, "grad_norm": 6.34124755859375, "learning_rate": 4.603024840455367e-06, "loss": 2.448826789855957, "memory(GiB)": 77.56, "step": 100640, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 1.438558 }, { "epoch": 4.311940362452337, "grad_norm": 5.32847785949707, "learning_rate": 4.600204795365881e-06, "loss": 2.1911535263061523, "memory(GiB)": 77.56, "step": 100645, "token_acc": 0.5086705202312138, "train_speed(iter/s)": 1.43856 }, { "epoch": 4.312154577781586, "grad_norm": 5.827273845672607, "learning_rate": 4.597385572736273e-06, "loss": 2.0310623168945314, "memory(GiB)": 77.56, "step": 100650, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 1.438557 }, { "epoch": 4.312368793110835, "grad_norm": 6.4080610275268555, "learning_rate": 4.59456717261762e-06, "loss": 2.2133161544799806, "memory(GiB)": 77.56, "step": 100655, "token_acc": 0.5350877192982456, "train_speed(iter/s)": 1.438562 }, { "epoch": 4.312583008440084, "grad_norm": 4.702672958374023, "learning_rate": 4.591749595060974e-06, "loss": 1.986974334716797, "memory(GiB)": 77.56, "step": 100660, "token_acc": 0.5964285714285714, "train_speed(iter/s)": 1.438555 }, { "epoch": 4.312797223769333, "grad_norm": 6.637474536895752, "learning_rate": 4.588932840117366e-06, "loss": 2.2598472595214845, "memory(GiB)": 77.56, "step": 100665, "token_acc": 0.5318471337579618, "train_speed(iter/s)": 1.438548 }, { "epoch": 4.313011439098582, "grad_norm": 7.995211124420166, "learning_rate": 4.586116907837857e-06, "loss": 2.6341548919677735, "memory(GiB)": 77.56, "step": 100670, "token_acc": 0.48623853211009177, "train_speed(iter/s)": 1.438552 }, { "epoch": 4.313225654427831, "grad_norm": 5.640625476837158, "learning_rate": 4.583301798273437e-06, "loss": 2.3597785949707033, "memory(GiB)": 77.56, "step": 100675, "token_acc": 0.5, "train_speed(iter/s)": 1.438559 }, { "epoch": 4.3134398697570795, "grad_norm": 5.814727783203125, "learning_rate": 4.580487511475112e-06, "loss": 2.283009147644043, "memory(GiB)": 77.56, "step": 100680, "token_acc": 0.5444839857651246, "train_speed(iter/s)": 1.438565 }, { "epoch": 4.313654085086329, "grad_norm": 7.987022876739502, "learning_rate": 4.577674047493857e-06, "loss": 2.1850864410400392, "memory(GiB)": 77.56, "step": 100685, "token_acc": 0.531496062992126, "train_speed(iter/s)": 1.438571 }, { "epoch": 4.313868300415578, "grad_norm": 6.397132396697998, "learning_rate": 4.574861406380654e-06, "loss": 2.5650733947753905, "memory(GiB)": 77.56, "step": 100690, "token_acc": 0.4542857142857143, "train_speed(iter/s)": 1.438585 }, { "epoch": 4.314082515744826, "grad_norm": 4.941383361816406, "learning_rate": 4.572049588186433e-06, "loss": 2.2035526275634765, "memory(GiB)": 77.56, "step": 100695, "token_acc": 0.5666666666666667, "train_speed(iter/s)": 1.4386 }, { "epoch": 4.314296731074076, "grad_norm": 7.093830108642578, "learning_rate": 4.5692385929621604e-06, "loss": 2.2035017013549805, "memory(GiB)": 77.56, "step": 100700, "token_acc": 0.4981949458483754, "train_speed(iter/s)": 1.438623 }, { "epoch": 4.314510946403325, "grad_norm": 5.65403938293457, "learning_rate": 4.566428420758739e-06, "loss": 2.0075403213500977, "memory(GiB)": 77.56, "step": 100705, "token_acc": 0.5738255033557047, "train_speed(iter/s)": 1.43863 }, { "epoch": 4.314725161732573, "grad_norm": 5.659056186676025, "learning_rate": 4.563619071627096e-06, "loss": 2.4824567794799806, "memory(GiB)": 77.56, "step": 100710, "token_acc": 0.45901639344262296, "train_speed(iter/s)": 1.438643 }, { "epoch": 4.3149393770618225, "grad_norm": 6.608780860900879, "learning_rate": 4.560810545618116e-06, "loss": 2.5676361083984376, "memory(GiB)": 77.56, "step": 100715, "token_acc": 0.42424242424242425, "train_speed(iter/s)": 1.438649 }, { "epoch": 4.315153592391072, "grad_norm": 7.0502543449401855, "learning_rate": 4.558002842782677e-06, "loss": 2.3460012435913087, "memory(GiB)": 77.56, "step": 100720, "token_acc": 0.48028673835125446, "train_speed(iter/s)": 1.438654 }, { "epoch": 4.31536780772032, "grad_norm": 5.865872859954834, "learning_rate": 4.5551959631716376e-06, "loss": 2.1148401260375977, "memory(GiB)": 77.56, "step": 100725, "token_acc": 0.47985347985347987, "train_speed(iter/s)": 1.43866 }, { "epoch": 4.315582023049569, "grad_norm": 5.7145538330078125, "learning_rate": 4.552389906835863e-06, "loss": 2.3243362426757814, "memory(GiB)": 77.56, "step": 100730, "token_acc": 0.4694533762057878, "train_speed(iter/s)": 1.438674 }, { "epoch": 4.315796238378819, "grad_norm": 7.404398441314697, "learning_rate": 4.549584673826179e-06, "loss": 2.294243049621582, "memory(GiB)": 77.56, "step": 100735, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 1.438694 }, { "epoch": 4.316010453708067, "grad_norm": 5.528452396392822, "learning_rate": 4.546780264193406e-06, "loss": 2.5592575073242188, "memory(GiB)": 77.56, "step": 100740, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 1.438674 }, { "epoch": 4.316224669037316, "grad_norm": 7.086595058441162, "learning_rate": 4.543976677988343e-06, "loss": 2.309468460083008, "memory(GiB)": 77.56, "step": 100745, "token_acc": 0.5255813953488372, "train_speed(iter/s)": 1.43867 }, { "epoch": 4.3164388843665655, "grad_norm": 6.76436710357666, "learning_rate": 4.541173915261776e-06, "loss": 2.2618900299072267, "memory(GiB)": 77.56, "step": 100750, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 1.438672 }, { "epoch": 4.316653099695814, "grad_norm": 5.549356460571289, "learning_rate": 4.538371976064499e-06, "loss": 2.2458831787109377, "memory(GiB)": 77.56, "step": 100755, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 1.43868 }, { "epoch": 4.316867315025063, "grad_norm": 6.185845851898193, "learning_rate": 4.535570860447258e-06, "loss": 2.2630590438842773, "memory(GiB)": 77.56, "step": 100760, "token_acc": 0.52, "train_speed(iter/s)": 1.43869 }, { "epoch": 4.317081530354312, "grad_norm": 5.358894348144531, "learning_rate": 4.532770568460798e-06, "loss": 2.2653152465820314, "memory(GiB)": 77.56, "step": 100765, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 1.438674 }, { "epoch": 4.317295745683561, "grad_norm": 6.094043731689453, "learning_rate": 4.529971100155855e-06, "loss": 2.092005729675293, "memory(GiB)": 77.56, "step": 100770, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 1.438673 }, { "epoch": 4.31750996101281, "grad_norm": 5.736149787902832, "learning_rate": 4.52717245558314e-06, "loss": 2.1980871200561523, "memory(GiB)": 77.56, "step": 100775, "token_acc": 0.5379061371841155, "train_speed(iter/s)": 1.438664 }, { "epoch": 4.317724176342059, "grad_norm": 5.013285160064697, "learning_rate": 4.524374634793338e-06, "loss": 2.191897392272949, "memory(GiB)": 77.56, "step": 100780, "token_acc": 0.5111940298507462, "train_speed(iter/s)": 1.438662 }, { "epoch": 4.317938391671308, "grad_norm": 5.689235687255859, "learning_rate": 4.521577637837154e-06, "loss": 2.2575002670288087, "memory(GiB)": 77.56, "step": 100785, "token_acc": 0.5149253731343284, "train_speed(iter/s)": 1.438694 }, { "epoch": 4.318152607000557, "grad_norm": 6.8830695152282715, "learning_rate": 4.518781464765259e-06, "loss": 2.3067911148071287, "memory(GiB)": 77.56, "step": 100790, "token_acc": 0.5547945205479452, "train_speed(iter/s)": 1.438686 }, { "epoch": 4.318366822329806, "grad_norm": 6.526504993438721, "learning_rate": 4.515986115628306e-06, "loss": 2.084463882446289, "memory(GiB)": 77.56, "step": 100795, "token_acc": 0.5362903225806451, "train_speed(iter/s)": 1.438692 }, { "epoch": 4.3185810376590545, "grad_norm": 5.212656497955322, "learning_rate": 4.513191590476934e-06, "loss": 2.4213384628295898, "memory(GiB)": 77.56, "step": 100800, "token_acc": 0.5096774193548387, "train_speed(iter/s)": 1.438692 }, { "epoch": 4.318795252988304, "grad_norm": 6.196159362792969, "learning_rate": 4.510397889361761e-06, "loss": 2.316198539733887, "memory(GiB)": 77.56, "step": 100805, "token_acc": 0.484375, "train_speed(iter/s)": 1.438701 }, { "epoch": 4.319009468317553, "grad_norm": 7.193774700164795, "learning_rate": 4.507605012333394e-06, "loss": 2.364968109130859, "memory(GiB)": 77.56, "step": 100810, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.438687 }, { "epoch": 4.319223683646801, "grad_norm": 5.920011520385742, "learning_rate": 4.504812959442451e-06, "loss": 2.2006067276000976, "memory(GiB)": 77.56, "step": 100815, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 1.438692 }, { "epoch": 4.319437898976051, "grad_norm": 6.084973335266113, "learning_rate": 4.502021730739497e-06, "loss": 2.400521087646484, "memory(GiB)": 77.56, "step": 100820, "token_acc": 0.465625, "train_speed(iter/s)": 1.438706 }, { "epoch": 4.3196521143053, "grad_norm": 6.604528427124023, "learning_rate": 4.4992313262751e-06, "loss": 2.133163642883301, "memory(GiB)": 77.56, "step": 100825, "token_acc": 0.5413533834586466, "train_speed(iter/s)": 1.438702 }, { "epoch": 4.319866329634548, "grad_norm": 6.3021955490112305, "learning_rate": 4.496441746099811e-06, "loss": 2.4390275955200194, "memory(GiB)": 77.56, "step": 100830, "token_acc": 0.49709302325581395, "train_speed(iter/s)": 1.43871 }, { "epoch": 4.320080544963798, "grad_norm": 11.538179397583008, "learning_rate": 4.493652990264152e-06, "loss": 2.1963125228881837, "memory(GiB)": 77.56, "step": 100835, "token_acc": 0.5358361774744027, "train_speed(iter/s)": 1.438729 }, { "epoch": 4.320294760293047, "grad_norm": 6.0271220207214355, "learning_rate": 4.4908650588186715e-06, "loss": 2.2506488800048827, "memory(GiB)": 77.56, "step": 100840, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.438735 }, { "epoch": 4.320508975622295, "grad_norm": 6.2990617752075195, "learning_rate": 4.488077951813863e-06, "loss": 2.4979904174804686, "memory(GiB)": 77.56, "step": 100845, "token_acc": 0.4774011299435028, "train_speed(iter/s)": 1.438761 }, { "epoch": 4.3207231909515444, "grad_norm": 6.008528709411621, "learning_rate": 4.4852916693002124e-06, "loss": 2.040965270996094, "memory(GiB)": 77.56, "step": 100850, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.438757 }, { "epoch": 4.320937406280794, "grad_norm": 10.73942756652832, "learning_rate": 4.482506211328191e-06, "loss": 2.5719877243041993, "memory(GiB)": 77.56, "step": 100855, "token_acc": 0.4878048780487805, "train_speed(iter/s)": 1.438757 }, { "epoch": 4.321151621610042, "grad_norm": 6.381577491760254, "learning_rate": 4.479721577948276e-06, "loss": 2.053926467895508, "memory(GiB)": 77.56, "step": 100860, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.438775 }, { "epoch": 4.321365836939291, "grad_norm": 5.6428961753845215, "learning_rate": 4.476937769210909e-06, "loss": 2.0882631301879884, "memory(GiB)": 77.56, "step": 100865, "token_acc": 0.5440613026819924, "train_speed(iter/s)": 1.43879 }, { "epoch": 4.321580052268541, "grad_norm": 6.470463275909424, "learning_rate": 4.474154785166506e-06, "loss": 2.0445018768310548, "memory(GiB)": 77.56, "step": 100870, "token_acc": 0.5030487804878049, "train_speed(iter/s)": 1.438798 }, { "epoch": 4.321794267597789, "grad_norm": 6.739324569702148, "learning_rate": 4.471372625865511e-06, "loss": 2.3265600204467773, "memory(GiB)": 77.56, "step": 100875, "token_acc": 0.5482625482625483, "train_speed(iter/s)": 1.438785 }, { "epoch": 4.322008482927038, "grad_norm": 5.522739410400391, "learning_rate": 4.468591291358304e-06, "loss": 2.0621692657470705, "memory(GiB)": 77.56, "step": 100880, "token_acc": 0.546875, "train_speed(iter/s)": 1.438787 }, { "epoch": 4.3222226982562875, "grad_norm": 5.56320333480835, "learning_rate": 4.465810781695279e-06, "loss": 2.2735784530639647, "memory(GiB)": 77.56, "step": 100885, "token_acc": 0.5173611111111112, "train_speed(iter/s)": 1.438801 }, { "epoch": 4.322436913585536, "grad_norm": 6.924131393432617, "learning_rate": 4.463031096926806e-06, "loss": 2.1173870086669924, "memory(GiB)": 77.56, "step": 100890, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.438804 }, { "epoch": 4.322651128914785, "grad_norm": 9.428475379943848, "learning_rate": 4.460252237103235e-06, "loss": 2.134607696533203, "memory(GiB)": 77.56, "step": 100895, "token_acc": 0.5547169811320755, "train_speed(iter/s)": 1.438816 }, { "epoch": 4.322865344244034, "grad_norm": 5.395373344421387, "learning_rate": 4.457474202274925e-06, "loss": 2.0755924224853515, "memory(GiB)": 77.56, "step": 100900, "token_acc": 0.5684647302904564, "train_speed(iter/s)": 1.438822 }, { "epoch": 4.323079559573283, "grad_norm": 7.781178951263428, "learning_rate": 4.454696992492185e-06, "loss": 2.2073802947998047, "memory(GiB)": 77.56, "step": 100905, "token_acc": 0.5282258064516129, "train_speed(iter/s)": 1.438836 }, { "epoch": 4.323293774902532, "grad_norm": 5.264420032501221, "learning_rate": 4.451920607805343e-06, "loss": 2.2307435989379885, "memory(GiB)": 77.56, "step": 100910, "token_acc": 0.5306859205776173, "train_speed(iter/s)": 1.438828 }, { "epoch": 4.323507990231781, "grad_norm": 5.754848957061768, "learning_rate": 4.44914504826468e-06, "loss": 2.3392467498779297, "memory(GiB)": 77.56, "step": 100915, "token_acc": 0.4942528735632184, "train_speed(iter/s)": 1.438813 }, { "epoch": 4.32372220556103, "grad_norm": 6.3487653732299805, "learning_rate": 4.446370313920489e-06, "loss": 2.3639129638671874, "memory(GiB)": 77.56, "step": 100920, "token_acc": 0.503125, "train_speed(iter/s)": 1.438844 }, { "epoch": 4.323936420890279, "grad_norm": 7.507499694824219, "learning_rate": 4.44359640482302e-06, "loss": 2.2884601593017577, "memory(GiB)": 77.56, "step": 100925, "token_acc": 0.4959349593495935, "train_speed(iter/s)": 1.438819 }, { "epoch": 4.324150636219528, "grad_norm": 5.1254143714904785, "learning_rate": 4.440823321022541e-06, "loss": 2.084254837036133, "memory(GiB)": 77.56, "step": 100930, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.438812 }, { "epoch": 4.3243648515487765, "grad_norm": 6.045876979827881, "learning_rate": 4.438051062569293e-06, "loss": 2.577326202392578, "memory(GiB)": 77.56, "step": 100935, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.438821 }, { "epoch": 4.324579066878026, "grad_norm": 6.5383806228637695, "learning_rate": 4.435279629513489e-06, "loss": 2.339593505859375, "memory(GiB)": 77.56, "step": 100940, "token_acc": 0.43573667711598746, "train_speed(iter/s)": 1.438824 }, { "epoch": 4.324793282207275, "grad_norm": 5.796730041503906, "learning_rate": 4.432509021905334e-06, "loss": 2.5958990097045898, "memory(GiB)": 77.56, "step": 100945, "token_acc": 0.44, "train_speed(iter/s)": 1.438822 }, { "epoch": 4.325007497536523, "grad_norm": 5.288985729217529, "learning_rate": 4.4297392397950334e-06, "loss": 2.131447601318359, "memory(GiB)": 77.56, "step": 100950, "token_acc": 0.550561797752809, "train_speed(iter/s)": 1.438842 }, { "epoch": 4.325221712865773, "grad_norm": 6.5406904220581055, "learning_rate": 4.42697028323274e-06, "loss": 2.53708553314209, "memory(GiB)": 77.56, "step": 100955, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 1.438847 }, { "epoch": 4.325435928195022, "grad_norm": 7.722628116607666, "learning_rate": 4.424202152268642e-06, "loss": 2.323473358154297, "memory(GiB)": 77.56, "step": 100960, "token_acc": 0.4787234042553192, "train_speed(iter/s)": 1.438837 }, { "epoch": 4.32565014352427, "grad_norm": 7.154002666473389, "learning_rate": 4.421434846952871e-06, "loss": 2.1355976104736327, "memory(GiB)": 77.56, "step": 100965, "token_acc": 0.4952978056426332, "train_speed(iter/s)": 1.438837 }, { "epoch": 4.3258643588535195, "grad_norm": 4.168364524841309, "learning_rate": 4.4186683673355714e-06, "loss": 2.0892311096191407, "memory(GiB)": 77.56, "step": 100970, "token_acc": 0.5490196078431373, "train_speed(iter/s)": 1.438834 }, { "epoch": 4.326078574182769, "grad_norm": 5.667438507080078, "learning_rate": 4.415902713466846e-06, "loss": 2.48551025390625, "memory(GiB)": 77.56, "step": 100975, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 1.438838 }, { "epoch": 4.326292789512017, "grad_norm": 5.668957233428955, "learning_rate": 4.413137885396795e-06, "loss": 2.481974220275879, "memory(GiB)": 77.56, "step": 100980, "token_acc": 0.49624060150375937, "train_speed(iter/s)": 1.438847 }, { "epoch": 4.326507004841266, "grad_norm": 5.5176568031311035, "learning_rate": 4.4103738831755285e-06, "loss": 2.5879791259765623, "memory(GiB)": 77.56, "step": 100985, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 1.438841 }, { "epoch": 4.326721220170516, "grad_norm": 5.67827033996582, "learning_rate": 4.407610706853105e-06, "loss": 2.2679656982421874, "memory(GiB)": 77.56, "step": 100990, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 1.43886 }, { "epoch": 4.326935435499764, "grad_norm": 5.687675476074219, "learning_rate": 4.404848356479585e-06, "loss": 2.174545669555664, "memory(GiB)": 77.56, "step": 100995, "token_acc": 0.5147058823529411, "train_speed(iter/s)": 1.438869 }, { "epoch": 4.327149650829013, "grad_norm": 6.245078086853027, "learning_rate": 4.402086832104996e-06, "loss": 2.2083816528320312, "memory(GiB)": 77.56, "step": 101000, "token_acc": 0.53125, "train_speed(iter/s)": 1.438879 }, { "epoch": 4.327149650829013, "eval_loss": 2.0820162296295166, "eval_runtime": 14.3835, "eval_samples_per_second": 6.952, "eval_steps_per_second": 6.952, "eval_token_acc": 0.48262032085561496, "step": 101000 }, { "epoch": 4.3273638661582625, "grad_norm": 8.662749290466309, "learning_rate": 4.399326133779386e-06, "loss": 2.3326534271240233, "memory(GiB)": 77.56, "step": 101005, "token_acc": 0.4814453125, "train_speed(iter/s)": 1.438559 }, { "epoch": 4.327578081487511, "grad_norm": 6.474173069000244, "learning_rate": 4.396566261552765e-06, "loss": 2.2989614486694334, "memory(GiB)": 77.56, "step": 101010, "token_acc": 0.5028901734104047, "train_speed(iter/s)": 1.438565 }, { "epoch": 4.32779229681676, "grad_norm": 6.23170280456543, "learning_rate": 4.393807215475115e-06, "loss": 2.3394250869750977, "memory(GiB)": 77.56, "step": 101015, "token_acc": 0.506896551724138, "train_speed(iter/s)": 1.438564 }, { "epoch": 4.328006512146009, "grad_norm": 7.6872687339782715, "learning_rate": 4.3910489955964365e-06, "loss": 1.8481668472290038, "memory(GiB)": 77.56, "step": 101020, "token_acc": 0.602112676056338, "train_speed(iter/s)": 1.438575 }, { "epoch": 4.328220727475258, "grad_norm": 6.086094856262207, "learning_rate": 4.388291601966688e-06, "loss": 2.4521799087524414, "memory(GiB)": 77.56, "step": 101025, "token_acc": 0.49295774647887325, "train_speed(iter/s)": 1.438588 }, { "epoch": 4.328434942804507, "grad_norm": 8.996162414550781, "learning_rate": 4.385535034635829e-06, "loss": 2.3587310791015623, "memory(GiB)": 77.56, "step": 101030, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.438591 }, { "epoch": 4.328649158133756, "grad_norm": 6.893521785736084, "learning_rate": 4.382779293653788e-06, "loss": 2.069912910461426, "memory(GiB)": 77.56, "step": 101035, "token_acc": 0.5331125827814569, "train_speed(iter/s)": 1.438614 }, { "epoch": 4.328863373463005, "grad_norm": 6.17027473449707, "learning_rate": 4.380024379070491e-06, "loss": 2.389052391052246, "memory(GiB)": 77.56, "step": 101040, "token_acc": 0.48502994011976047, "train_speed(iter/s)": 1.438622 }, { "epoch": 4.329077588792254, "grad_norm": 6.016664981842041, "learning_rate": 4.377270290935848e-06, "loss": 2.3577083587646483, "memory(GiB)": 77.56, "step": 101045, "token_acc": 0.46546546546546547, "train_speed(iter/s)": 1.438631 }, { "epoch": 4.329291804121503, "grad_norm": 8.608914375305176, "learning_rate": 4.374517029299757e-06, "loss": 2.3155344009399412, "memory(GiB)": 77.56, "step": 101050, "token_acc": 0.5061728395061729, "train_speed(iter/s)": 1.438644 }, { "epoch": 4.3295060194507515, "grad_norm": 6.313039779663086, "learning_rate": 4.3717645942120855e-06, "loss": 2.2667036056518555, "memory(GiB)": 77.56, "step": 101055, "token_acc": 0.5322033898305085, "train_speed(iter/s)": 1.438648 }, { "epoch": 4.329720234780001, "grad_norm": 4.852303981781006, "learning_rate": 4.369012985722704e-06, "loss": 1.8334367752075196, "memory(GiB)": 77.56, "step": 101060, "token_acc": 0.6038461538461538, "train_speed(iter/s)": 1.438659 }, { "epoch": 4.32993445010925, "grad_norm": 5.725183486938477, "learning_rate": 4.366262203881461e-06, "loss": 2.0838666915893556, "memory(GiB)": 77.56, "step": 101065, "token_acc": 0.5844155844155844, "train_speed(iter/s)": 1.43868 }, { "epoch": 4.330148665438498, "grad_norm": 5.849481582641602, "learning_rate": 4.363512248738172e-06, "loss": 2.5036239624023438, "memory(GiB)": 77.56, "step": 101070, "token_acc": 0.5157232704402516, "train_speed(iter/s)": 1.438666 }, { "epoch": 4.330362880767748, "grad_norm": 6.329677104949951, "learning_rate": 4.3607631203426704e-06, "loss": 2.133966064453125, "memory(GiB)": 77.56, "step": 101075, "token_acc": 0.5471014492753623, "train_speed(iter/s)": 1.438674 }, { "epoch": 4.330577096096997, "grad_norm": 9.487719535827637, "learning_rate": 4.358014818744765e-06, "loss": 2.397746467590332, "memory(GiB)": 77.56, "step": 101080, "token_acc": 0.49635036496350365, "train_speed(iter/s)": 1.438684 }, { "epoch": 4.330791311426245, "grad_norm": 6.407149314880371, "learning_rate": 4.355267343994241e-06, "loss": 2.0790287017822267, "memory(GiB)": 77.56, "step": 101085, "token_acc": 0.549800796812749, "train_speed(iter/s)": 1.438693 }, { "epoch": 4.3310055267554946, "grad_norm": 5.770221710205078, "learning_rate": 4.352520696140866e-06, "loss": 2.3384258270263674, "memory(GiB)": 77.56, "step": 101090, "token_acc": 0.4965277777777778, "train_speed(iter/s)": 1.438699 }, { "epoch": 4.331219742084744, "grad_norm": 7.723062038421631, "learning_rate": 4.349774875234397e-06, "loss": 2.349536895751953, "memory(GiB)": 77.56, "step": 101095, "token_acc": 0.5048231511254019, "train_speed(iter/s)": 1.438695 }, { "epoch": 4.331433957413992, "grad_norm": 5.800660133361816, "learning_rate": 4.3470298813245715e-06, "loss": 2.1011037826538086, "memory(GiB)": 77.56, "step": 101100, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 1.438695 }, { "epoch": 4.331648172743241, "grad_norm": 6.468334197998047, "learning_rate": 4.344285714461138e-06, "loss": 2.0974365234375, "memory(GiB)": 77.56, "step": 101105, "token_acc": 0.545774647887324, "train_speed(iter/s)": 1.438705 }, { "epoch": 4.331862388072491, "grad_norm": 7.865931510925293, "learning_rate": 4.341542374693791e-06, "loss": 2.2582515716552733, "memory(GiB)": 77.56, "step": 101110, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.438698 }, { "epoch": 4.332076603401739, "grad_norm": 4.625708103179932, "learning_rate": 4.3387998620722405e-06, "loss": 2.579315757751465, "memory(GiB)": 77.56, "step": 101115, "token_acc": 0.493006993006993, "train_speed(iter/s)": 1.438709 }, { "epoch": 4.332290818730988, "grad_norm": 5.9434356689453125, "learning_rate": 4.336058176646163e-06, "loss": 2.4340280532836913, "memory(GiB)": 77.56, "step": 101120, "token_acc": 0.49808429118773945, "train_speed(iter/s)": 1.438704 }, { "epoch": 4.332505034060238, "grad_norm": 4.637039661407471, "learning_rate": 4.3333173184652185e-06, "loss": 2.443637466430664, "memory(GiB)": 77.56, "step": 101125, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 1.4387 }, { "epoch": 4.332719249389486, "grad_norm": 5.87703275680542, "learning_rate": 4.330577287579079e-06, "loss": 2.3531347274780274, "memory(GiB)": 77.56, "step": 101130, "token_acc": 0.5105633802816901, "train_speed(iter/s)": 1.438698 }, { "epoch": 4.332933464718735, "grad_norm": 4.111064434051514, "learning_rate": 4.327838084037372e-06, "loss": 2.38293514251709, "memory(GiB)": 77.56, "step": 101135, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 1.438711 }, { "epoch": 4.3331476800479845, "grad_norm": 4.8027663230896, "learning_rate": 4.325099707889724e-06, "loss": 2.373774528503418, "memory(GiB)": 77.56, "step": 101140, "token_acc": 0.5059880239520959, "train_speed(iter/s)": 1.438719 }, { "epoch": 4.333361895377233, "grad_norm": 6.752740383148193, "learning_rate": 4.322362159185739e-06, "loss": 2.351709175109863, "memory(GiB)": 77.56, "step": 101145, "token_acc": 0.5, "train_speed(iter/s)": 1.438718 }, { "epoch": 4.333576110706482, "grad_norm": 7.650457859039307, "learning_rate": 4.319625437975006e-06, "loss": 2.3520641326904297, "memory(GiB)": 77.56, "step": 101150, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 1.438725 }, { "epoch": 4.333790326035731, "grad_norm": 6.833845138549805, "learning_rate": 4.3168895443071124e-06, "loss": 2.153400993347168, "memory(GiB)": 77.56, "step": 101155, "token_acc": 0.5193548387096775, "train_speed(iter/s)": 1.438744 }, { "epoch": 4.33400454136498, "grad_norm": 6.676098346710205, "learning_rate": 4.314154478231619e-06, "loss": 2.3579586029052733, "memory(GiB)": 77.56, "step": 101160, "token_acc": 0.5202492211838006, "train_speed(iter/s)": 1.438749 }, { "epoch": 4.334218756694229, "grad_norm": 6.239243507385254, "learning_rate": 4.3114202397980755e-06, "loss": 2.4325647354125977, "memory(GiB)": 77.56, "step": 101165, "token_acc": 0.45454545454545453, "train_speed(iter/s)": 1.438739 }, { "epoch": 4.334432972023478, "grad_norm": 5.6411309242248535, "learning_rate": 4.308686829056019e-06, "loss": 2.430034637451172, "memory(GiB)": 77.56, "step": 101170, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 1.438736 }, { "epoch": 4.334647187352727, "grad_norm": 6.036357402801514, "learning_rate": 4.305954246054961e-06, "loss": 2.224743461608887, "memory(GiB)": 77.56, "step": 101175, "token_acc": 0.5119453924914675, "train_speed(iter/s)": 1.438735 }, { "epoch": 4.334861402681976, "grad_norm": 6.004115104675293, "learning_rate": 4.303222490844405e-06, "loss": 2.494205093383789, "memory(GiB)": 77.56, "step": 101180, "token_acc": 0.4515235457063712, "train_speed(iter/s)": 1.438745 }, { "epoch": 4.335075618011225, "grad_norm": 7.252013206481934, "learning_rate": 4.30049156347383e-06, "loss": 2.3625343322753904, "memory(GiB)": 77.56, "step": 101185, "token_acc": 0.49473684210526314, "train_speed(iter/s)": 1.438754 }, { "epoch": 4.3352898333404735, "grad_norm": 10.650192260742188, "learning_rate": 4.297761463992733e-06, "loss": 2.4016536712646483, "memory(GiB)": 77.56, "step": 101190, "token_acc": 0.5319148936170213, "train_speed(iter/s)": 1.438771 }, { "epoch": 4.335504048669723, "grad_norm": 7.285583972930908, "learning_rate": 4.29503219245056e-06, "loss": 2.0923715591430665, "memory(GiB)": 77.56, "step": 101195, "token_acc": 0.5442622950819672, "train_speed(iter/s)": 1.438759 }, { "epoch": 4.335718263998972, "grad_norm": 7.679899215698242, "learning_rate": 4.292303748896748e-06, "loss": 2.3973468780517577, "memory(GiB)": 77.56, "step": 101200, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 1.438773 }, { "epoch": 4.33593247932822, "grad_norm": 4.87914514541626, "learning_rate": 4.289576133380735e-06, "loss": 2.195694160461426, "memory(GiB)": 77.56, "step": 101205, "token_acc": 0.4986376021798365, "train_speed(iter/s)": 1.438754 }, { "epoch": 4.33614669465747, "grad_norm": 7.900181293487549, "learning_rate": 4.286849345951932e-06, "loss": 2.3426822662353515, "memory(GiB)": 77.56, "step": 101210, "token_acc": 0.48366013071895425, "train_speed(iter/s)": 1.438747 }, { "epoch": 4.336360909986719, "grad_norm": 7.688501834869385, "learning_rate": 4.284123386659722e-06, "loss": 2.1559396743774415, "memory(GiB)": 77.56, "step": 101215, "token_acc": 0.5458715596330275, "train_speed(iter/s)": 1.438751 }, { "epoch": 4.336575125315967, "grad_norm": 5.1097002029418945, "learning_rate": 4.281398255553515e-06, "loss": 2.0712360382080077, "memory(GiB)": 77.56, "step": 101220, "token_acc": 0.5292307692307693, "train_speed(iter/s)": 1.438759 }, { "epoch": 4.3367893406452165, "grad_norm": 6.921679973602295, "learning_rate": 4.278673952682655e-06, "loss": 2.407971954345703, "memory(GiB)": 77.56, "step": 101225, "token_acc": 0.49050632911392406, "train_speed(iter/s)": 1.438766 }, { "epoch": 4.337003555974466, "grad_norm": 7.617849349975586, "learning_rate": 4.275950478096513e-06, "loss": 2.2217613220214845, "memory(GiB)": 77.56, "step": 101230, "token_acc": 0.527027027027027, "train_speed(iter/s)": 1.43877 }, { "epoch": 4.337217771303714, "grad_norm": 7.503749847412109, "learning_rate": 4.273227831844423e-06, "loss": 2.209715461730957, "memory(GiB)": 77.56, "step": 101235, "token_acc": 0.4968944099378882, "train_speed(iter/s)": 1.438782 }, { "epoch": 4.337431986632963, "grad_norm": 5.194855213165283, "learning_rate": 4.2705060139757054e-06, "loss": 2.44457893371582, "memory(GiB)": 77.56, "step": 101240, "token_acc": 0.5123287671232877, "train_speed(iter/s)": 1.438781 }, { "epoch": 4.337646201962213, "grad_norm": 5.237576484680176, "learning_rate": 4.267785024539661e-06, "loss": 2.204781913757324, "memory(GiB)": 77.56, "step": 101245, "token_acc": 0.5330882352941176, "train_speed(iter/s)": 1.43879 }, { "epoch": 4.337860417291461, "grad_norm": 6.590160846710205, "learning_rate": 4.2650648635856e-06, "loss": 2.4221874237060548, "memory(GiB)": 77.56, "step": 101250, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 1.438804 }, { "epoch": 4.33807463262071, "grad_norm": 7.082057952880859, "learning_rate": 4.262345531162792e-06, "loss": 2.2764734268188476, "memory(GiB)": 77.56, "step": 101255, "token_acc": 0.49097472924187724, "train_speed(iter/s)": 1.438804 }, { "epoch": 4.3382888479499595, "grad_norm": 8.069806098937988, "learning_rate": 4.259627027320501e-06, "loss": 2.6750423431396486, "memory(GiB)": 77.56, "step": 101260, "token_acc": 0.47527472527472525, "train_speed(iter/s)": 1.438807 }, { "epoch": 4.338503063279208, "grad_norm": 5.843471050262451, "learning_rate": 4.256909352107969e-06, "loss": 2.217774200439453, "memory(GiB)": 77.56, "step": 101265, "token_acc": 0.5335689045936396, "train_speed(iter/s)": 1.438779 }, { "epoch": 4.338717278608457, "grad_norm": 6.1159491539001465, "learning_rate": 4.254192505574433e-06, "loss": 2.247700500488281, "memory(GiB)": 77.56, "step": 101270, "token_acc": 0.5396825396825397, "train_speed(iter/s)": 1.438781 }, { "epoch": 4.338931493937706, "grad_norm": 6.537121295928955, "learning_rate": 4.2514764877691184e-06, "loss": 2.1320571899414062, "memory(GiB)": 77.56, "step": 101275, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.438781 }, { "epoch": 4.339145709266955, "grad_norm": 5.182265758514404, "learning_rate": 4.2487612987412215e-06, "loss": 2.166476821899414, "memory(GiB)": 77.56, "step": 101280, "token_acc": 0.5013404825737265, "train_speed(iter/s)": 1.438781 }, { "epoch": 4.339359924596204, "grad_norm": 6.607207775115967, "learning_rate": 4.246046938539933e-06, "loss": 2.3562170028686524, "memory(GiB)": 77.56, "step": 101285, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.438781 }, { "epoch": 4.339574139925453, "grad_norm": 6.849672317504883, "learning_rate": 4.243333407214422e-06, "loss": 2.2435211181640624, "memory(GiB)": 77.56, "step": 101290, "token_acc": 0.5174825174825175, "train_speed(iter/s)": 1.438782 }, { "epoch": 4.339788355254702, "grad_norm": 5.224052906036377, "learning_rate": 4.240620704813847e-06, "loss": 2.089272117614746, "memory(GiB)": 77.56, "step": 101295, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 1.438787 }, { "epoch": 4.340002570583951, "grad_norm": 6.374125003814697, "learning_rate": 4.23790883138736e-06, "loss": 2.4972208023071287, "memory(GiB)": 77.56, "step": 101300, "token_acc": 0.4880239520958084, "train_speed(iter/s)": 1.438792 }, { "epoch": 4.3402167859132, "grad_norm": 5.530206203460693, "learning_rate": 4.235197786984074e-06, "loss": 2.1429861068725584, "memory(GiB)": 77.56, "step": 101305, "token_acc": 0.5565217391304348, "train_speed(iter/s)": 1.438789 }, { "epoch": 4.3404310012424485, "grad_norm": 8.494129180908203, "learning_rate": 4.232487571653115e-06, "loss": 1.9244573593139649, "memory(GiB)": 77.56, "step": 101310, "token_acc": 0.5615384615384615, "train_speed(iter/s)": 1.438804 }, { "epoch": 4.340645216571698, "grad_norm": 5.509167194366455, "learning_rate": 4.229778185443583e-06, "loss": 2.167266845703125, "memory(GiB)": 77.56, "step": 101315, "token_acc": 0.5387453874538746, "train_speed(iter/s)": 1.438791 }, { "epoch": 4.340859431900947, "grad_norm": 6.201045513153076, "learning_rate": 4.227069628404551e-06, "loss": 2.067679023742676, "memory(GiB)": 77.56, "step": 101320, "token_acc": 0.5020242914979757, "train_speed(iter/s)": 1.438786 }, { "epoch": 4.341073647230195, "grad_norm": 9.918316841125488, "learning_rate": 4.224361900585094e-06, "loss": 1.9724859237670898, "memory(GiB)": 77.56, "step": 101325, "token_acc": 0.5564516129032258, "train_speed(iter/s)": 1.438791 }, { "epoch": 4.341287862559445, "grad_norm": 6.729365348815918, "learning_rate": 4.221655002034253e-06, "loss": 2.2674991607666017, "memory(GiB)": 77.56, "step": 101330, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 1.438787 }, { "epoch": 4.341502077888694, "grad_norm": 6.806403636932373, "learning_rate": 4.218948932801087e-06, "loss": 2.418392372131348, "memory(GiB)": 77.56, "step": 101335, "token_acc": 0.4940239043824701, "train_speed(iter/s)": 1.438794 }, { "epoch": 4.341716293217942, "grad_norm": 6.020773887634277, "learning_rate": 4.216243692934602e-06, "loss": 2.2918224334716797, "memory(GiB)": 77.56, "step": 101340, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 1.438798 }, { "epoch": 4.3419305085471915, "grad_norm": 6.5184221267700195, "learning_rate": 4.213539282483814e-06, "loss": 2.542782020568848, "memory(GiB)": 77.56, "step": 101345, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 1.438802 }, { "epoch": 4.342144723876441, "grad_norm": 7.2213568687438965, "learning_rate": 4.210835701497712e-06, "loss": 2.598005485534668, "memory(GiB)": 77.56, "step": 101350, "token_acc": 0.4755244755244755, "train_speed(iter/s)": 1.438799 }, { "epoch": 4.342358939205689, "grad_norm": 7.181949615478516, "learning_rate": 4.208132950025273e-06, "loss": 2.2167888641357423, "memory(GiB)": 77.56, "step": 101355, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.4388 }, { "epoch": 4.342573154534938, "grad_norm": 7.149319648742676, "learning_rate": 4.205431028115458e-06, "loss": 2.608908462524414, "memory(GiB)": 77.56, "step": 101360, "token_acc": 0.49236641221374045, "train_speed(iter/s)": 1.438818 }, { "epoch": 4.342787369864188, "grad_norm": 6.05332088470459, "learning_rate": 4.202729935817224e-06, "loss": 2.5681474685668944, "memory(GiB)": 77.56, "step": 101365, "token_acc": 0.4732824427480916, "train_speed(iter/s)": 1.438832 }, { "epoch": 4.343001585193436, "grad_norm": 5.106203556060791, "learning_rate": 4.20002967317949e-06, "loss": 2.19384822845459, "memory(GiB)": 77.56, "step": 101370, "token_acc": 0.5227963525835866, "train_speed(iter/s)": 1.438829 }, { "epoch": 4.343215800522685, "grad_norm": 6.929333209991455, "learning_rate": 4.19733024025119e-06, "loss": 2.2576053619384764, "memory(GiB)": 77.56, "step": 101375, "token_acc": 0.4723127035830619, "train_speed(iter/s)": 1.438824 }, { "epoch": 4.3434300158519346, "grad_norm": 5.270244598388672, "learning_rate": 4.19463163708122e-06, "loss": 1.8078964233398438, "memory(GiB)": 77.56, "step": 101380, "token_acc": 0.5830721003134797, "train_speed(iter/s)": 1.43883 }, { "epoch": 4.343644231181183, "grad_norm": 5.989297389984131, "learning_rate": 4.191933863718467e-06, "loss": 2.2562576293945313, "memory(GiB)": 77.56, "step": 101385, "token_acc": 0.5247148288973384, "train_speed(iter/s)": 1.438844 }, { "epoch": 4.343858446510432, "grad_norm": 7.0800065994262695, "learning_rate": 4.189236920211798e-06, "loss": 2.221025848388672, "memory(GiB)": 77.56, "step": 101390, "token_acc": 0.5183823529411765, "train_speed(iter/s)": 1.438855 }, { "epoch": 4.344072661839681, "grad_norm": 6.620867729187012, "learning_rate": 4.1865408066100785e-06, "loss": 2.088029479980469, "memory(GiB)": 77.56, "step": 101395, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 1.438868 }, { "epoch": 4.34428687716893, "grad_norm": 5.606696128845215, "learning_rate": 4.183845522962154e-06, "loss": 2.7019441604614256, "memory(GiB)": 77.56, "step": 101400, "token_acc": 0.4634146341463415, "train_speed(iter/s)": 1.438864 }, { "epoch": 4.344501092498179, "grad_norm": 9.510915756225586, "learning_rate": 4.181151069316846e-06, "loss": 2.212398719787598, "memory(GiB)": 77.56, "step": 101405, "token_acc": 0.5494505494505495, "train_speed(iter/s)": 1.438869 }, { "epoch": 4.344715307827428, "grad_norm": 5.622706413269043, "learning_rate": 4.178457445722967e-06, "loss": 2.5402738571166994, "memory(GiB)": 77.56, "step": 101410, "token_acc": 0.446064139941691, "train_speed(iter/s)": 1.438842 }, { "epoch": 4.344929523156677, "grad_norm": 5.042716979980469, "learning_rate": 4.175764652229308e-06, "loss": 2.2626226425170897, "memory(GiB)": 77.56, "step": 101415, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 1.438852 }, { "epoch": 4.345143738485926, "grad_norm": 5.740492820739746, "learning_rate": 4.173072688884666e-06, "loss": 2.135538864135742, "memory(GiB)": 77.56, "step": 101420, "token_acc": 0.5, "train_speed(iter/s)": 1.43885 }, { "epoch": 4.345357953815175, "grad_norm": 5.865013599395752, "learning_rate": 4.170381555737801e-06, "loss": 2.191912078857422, "memory(GiB)": 77.56, "step": 101425, "token_acc": 0.4659498207885305, "train_speed(iter/s)": 1.438855 }, { "epoch": 4.345572169144424, "grad_norm": 5.078368186950684, "learning_rate": 4.167691252837463e-06, "loss": 2.3704860687255858, "memory(GiB)": 77.56, "step": 101430, "token_acc": 0.4868804664723032, "train_speed(iter/s)": 1.438866 }, { "epoch": 4.345786384473673, "grad_norm": 4.758178234100342, "learning_rate": 4.165001780232397e-06, "loss": 1.9802011489868163, "memory(GiB)": 77.56, "step": 101435, "token_acc": 0.5574912891986062, "train_speed(iter/s)": 1.43888 }, { "epoch": 4.346000599802922, "grad_norm": 7.279247283935547, "learning_rate": 4.1623131379713186e-06, "loss": 2.258874702453613, "memory(GiB)": 77.56, "step": 101440, "token_acc": 0.5, "train_speed(iter/s)": 1.43887 }, { "epoch": 4.3462148151321705, "grad_norm": 6.9657511711120605, "learning_rate": 4.159625326102934e-06, "loss": 2.529164123535156, "memory(GiB)": 77.56, "step": 101445, "token_acc": 0.4652567975830816, "train_speed(iter/s)": 1.438873 }, { "epoch": 4.34642903046142, "grad_norm": 6.005016803741455, "learning_rate": 4.156938344675932e-06, "loss": 2.33060302734375, "memory(GiB)": 77.56, "step": 101450, "token_acc": 0.4945054945054945, "train_speed(iter/s)": 1.438882 }, { "epoch": 4.346643245790669, "grad_norm": 5.210018634796143, "learning_rate": 4.154252193739006e-06, "loss": 2.1816497802734376, "memory(GiB)": 77.56, "step": 101455, "token_acc": 0.5291970802919708, "train_speed(iter/s)": 1.438884 }, { "epoch": 4.346857461119917, "grad_norm": 6.272582054138184, "learning_rate": 4.151566873340806e-06, "loss": 2.108576011657715, "memory(GiB)": 77.56, "step": 101460, "token_acc": 0.5241379310344828, "train_speed(iter/s)": 1.438877 }, { "epoch": 4.347071676449167, "grad_norm": 5.846337795257568, "learning_rate": 4.148882383529984e-06, "loss": 2.2197166442871095, "memory(GiB)": 77.56, "step": 101465, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 1.438862 }, { "epoch": 4.347285891778416, "grad_norm": 5.538933753967285, "learning_rate": 4.146198724355171e-06, "loss": 2.0510019302368163, "memory(GiB)": 77.56, "step": 101470, "token_acc": 0.54, "train_speed(iter/s)": 1.438848 }, { "epoch": 4.347500107107664, "grad_norm": 5.194798469543457, "learning_rate": 4.143515895864969e-06, "loss": 2.1033184051513674, "memory(GiB)": 77.56, "step": 101475, "token_acc": 0.5537974683544303, "train_speed(iter/s)": 1.438858 }, { "epoch": 4.3477143224369135, "grad_norm": 8.422808647155762, "learning_rate": 4.1408338981080096e-06, "loss": 2.1998289108276365, "memory(GiB)": 77.56, "step": 101480, "token_acc": 0.5441696113074205, "train_speed(iter/s)": 1.438859 }, { "epoch": 4.347928537766163, "grad_norm": 5.203928470611572, "learning_rate": 4.138152731132855e-06, "loss": 2.3962709426879885, "memory(GiB)": 77.56, "step": 101485, "token_acc": 0.5393258426966292, "train_speed(iter/s)": 1.438862 }, { "epoch": 4.348142753095411, "grad_norm": 6.964390277862549, "learning_rate": 4.135472394988094e-06, "loss": 2.3716407775878907, "memory(GiB)": 77.56, "step": 101490, "token_acc": 0.5592105263157895, "train_speed(iter/s)": 1.43887 }, { "epoch": 4.34835696842466, "grad_norm": 5.814328670501709, "learning_rate": 4.1327928897222705e-06, "loss": 2.2129642486572267, "memory(GiB)": 77.56, "step": 101495, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.438872 }, { "epoch": 4.34857118375391, "grad_norm": 6.519754409790039, "learning_rate": 4.130114215383929e-06, "loss": 2.336849594116211, "memory(GiB)": 77.56, "step": 101500, "token_acc": 0.5322128851540616, "train_speed(iter/s)": 1.43889 }, { "epoch": 4.34857118375391, "eval_loss": 2.228541851043701, "eval_runtime": 13.7904, "eval_samples_per_second": 7.251, "eval_steps_per_second": 7.251, "eval_token_acc": 0.44966442953020136, "step": 101500 }, { "epoch": 4.348785399083158, "grad_norm": 6.421586036682129, "learning_rate": 4.1274363720215924e-06, "loss": 2.311581039428711, "memory(GiB)": 77.56, "step": 101505, "token_acc": 0.4560229445506692, "train_speed(iter/s)": 1.438596 }, { "epoch": 4.348999614412407, "grad_norm": 6.308603286743164, "learning_rate": 4.124759359683783e-06, "loss": 1.9527198791503906, "memory(GiB)": 77.56, "step": 101510, "token_acc": 0.5738396624472574, "train_speed(iter/s)": 1.438586 }, { "epoch": 4.3492138297416565, "grad_norm": 12.110864639282227, "learning_rate": 4.122083178418995e-06, "loss": 2.024698257446289, "memory(GiB)": 77.56, "step": 101515, "token_acc": 0.5038167938931297, "train_speed(iter/s)": 1.438599 }, { "epoch": 4.349428045070905, "grad_norm": 5.848987579345703, "learning_rate": 4.119407828275696e-06, "loss": 2.2689754486083986, "memory(GiB)": 77.56, "step": 101520, "token_acc": 0.5266903914590747, "train_speed(iter/s)": 1.438603 }, { "epoch": 4.349642260400154, "grad_norm": 6.438787460327148, "learning_rate": 4.116733309302373e-06, "loss": 2.233139991760254, "memory(GiB)": 77.56, "step": 101525, "token_acc": 0.5539033457249071, "train_speed(iter/s)": 1.43861 }, { "epoch": 4.349856475729403, "grad_norm": 5.045784950256348, "learning_rate": 4.114059621547472e-06, "loss": 2.1629079818725585, "memory(GiB)": 77.56, "step": 101530, "token_acc": 0.5317725752508361, "train_speed(iter/s)": 1.43862 }, { "epoch": 4.350070691058653, "grad_norm": 5.2980265617370605, "learning_rate": 4.111386765059416e-06, "loss": 2.513823318481445, "memory(GiB)": 77.56, "step": 101535, "token_acc": 0.49244712990936557, "train_speed(iter/s)": 1.438613 }, { "epoch": 4.350284906387901, "grad_norm": 5.629308700561523, "learning_rate": 4.108714739886638e-06, "loss": 2.1469287872314453, "memory(GiB)": 77.56, "step": 101540, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 1.438624 }, { "epoch": 4.35049912171715, "grad_norm": 5.630993843078613, "learning_rate": 4.106043546077548e-06, "loss": 2.0595304489135744, "memory(GiB)": 77.56, "step": 101545, "token_acc": 0.5429553264604811, "train_speed(iter/s)": 1.438622 }, { "epoch": 4.350713337046399, "grad_norm": 5.7489471435546875, "learning_rate": 4.103373183680531e-06, "loss": 2.268590545654297, "memory(GiB)": 77.56, "step": 101550, "token_acc": 0.52, "train_speed(iter/s)": 1.438623 }, { "epoch": 4.350927552375648, "grad_norm": 5.641597270965576, "learning_rate": 4.100703652743959e-06, "loss": 2.2816795349121093, "memory(GiB)": 77.56, "step": 101555, "token_acc": 0.5214723926380368, "train_speed(iter/s)": 1.438596 }, { "epoch": 4.351141767704897, "grad_norm": 6.621870517730713, "learning_rate": 4.098034953316194e-06, "loss": 2.422886848449707, "memory(GiB)": 77.56, "step": 101560, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 1.438595 }, { "epoch": 4.351355983034146, "grad_norm": 6.43054723739624, "learning_rate": 4.09536708544559e-06, "loss": 2.2933895111083986, "memory(GiB)": 77.56, "step": 101565, "token_acc": 0.511400651465798, "train_speed(iter/s)": 1.438597 }, { "epoch": 4.351570198363395, "grad_norm": 6.566086769104004, "learning_rate": 4.092700049180476e-06, "loss": 2.3913063049316405, "memory(GiB)": 77.56, "step": 101570, "token_acc": 0.46551724137931033, "train_speed(iter/s)": 1.438609 }, { "epoch": 4.351784413692644, "grad_norm": 5.274600028991699, "learning_rate": 4.090033844569163e-06, "loss": 2.3955715179443358, "memory(GiB)": 77.56, "step": 101575, "token_acc": 0.4691358024691358, "train_speed(iter/s)": 1.438611 }, { "epoch": 4.351998629021892, "grad_norm": 6.710582256317139, "learning_rate": 4.087368471659958e-06, "loss": 2.0774911880493163, "memory(GiB)": 77.56, "step": 101580, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 1.438601 }, { "epoch": 4.352212844351142, "grad_norm": 7.038486480712891, "learning_rate": 4.084703930501138e-06, "loss": 2.4165790557861326, "memory(GiB)": 77.56, "step": 101585, "token_acc": 0.45695364238410596, "train_speed(iter/s)": 1.438597 }, { "epoch": 4.352427059680391, "grad_norm": 4.987838268280029, "learning_rate": 4.08204022114097e-06, "loss": 2.3569786071777346, "memory(GiB)": 77.56, "step": 101590, "token_acc": 0.48923076923076925, "train_speed(iter/s)": 1.438615 }, { "epoch": 4.35264127500964, "grad_norm": 5.530329704284668, "learning_rate": 4.079377343627722e-06, "loss": 2.36212272644043, "memory(GiB)": 77.56, "step": 101595, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.438629 }, { "epoch": 4.3528554903388885, "grad_norm": 7.183777809143066, "learning_rate": 4.076715298009637e-06, "loss": 2.0948135375976564, "memory(GiB)": 77.56, "step": 101600, "token_acc": 0.5285714285714286, "train_speed(iter/s)": 1.438643 }, { "epoch": 4.353069705668138, "grad_norm": 6.435431003570557, "learning_rate": 4.0740540843349325e-06, "loss": 2.3647430419921873, "memory(GiB)": 77.56, "step": 101605, "token_acc": 0.5, "train_speed(iter/s)": 1.438664 }, { "epoch": 4.353283920997386, "grad_norm": 6.683962345123291, "learning_rate": 4.071393702651821e-06, "loss": 2.464950942993164, "memory(GiB)": 77.56, "step": 101610, "token_acc": 0.5, "train_speed(iter/s)": 1.43866 }, { "epoch": 4.353498136326635, "grad_norm": 7.889510631561279, "learning_rate": 4.068734153008497e-06, "loss": 2.216324234008789, "memory(GiB)": 77.56, "step": 101615, "token_acc": 0.5441696113074205, "train_speed(iter/s)": 1.438631 }, { "epoch": 4.353712351655885, "grad_norm": 6.581700801849365, "learning_rate": 4.066075435453132e-06, "loss": 2.064121055603027, "memory(GiB)": 77.56, "step": 101620, "token_acc": 0.5521885521885522, "train_speed(iter/s)": 1.438636 }, { "epoch": 4.353926566985134, "grad_norm": 6.234287261962891, "learning_rate": 4.06341755003391e-06, "loss": 2.140406036376953, "memory(GiB)": 77.56, "step": 101625, "token_acc": 0.5369127516778524, "train_speed(iter/s)": 1.438624 }, { "epoch": 4.354140782314382, "grad_norm": 7.736416339874268, "learning_rate": 4.060760496798971e-06, "loss": 2.075102615356445, "memory(GiB)": 77.56, "step": 101630, "token_acc": 0.5727699530516432, "train_speed(iter/s)": 1.438637 }, { "epoch": 4.3543549976436315, "grad_norm": 5.36174201965332, "learning_rate": 4.058104275796449e-06, "loss": 2.3465129852294924, "memory(GiB)": 77.56, "step": 101635, "token_acc": 0.5179153094462541, "train_speed(iter/s)": 1.438633 }, { "epoch": 4.35456921297288, "grad_norm": 5.539736270904541, "learning_rate": 4.0554488870744645e-06, "loss": 2.255598449707031, "memory(GiB)": 77.56, "step": 101640, "token_acc": 0.5353535353535354, "train_speed(iter/s)": 1.438621 }, { "epoch": 4.354783428302129, "grad_norm": 6.58910608291626, "learning_rate": 4.052794330681125e-06, "loss": 2.3917633056640626, "memory(GiB)": 77.56, "step": 101645, "token_acc": 0.5044776119402985, "train_speed(iter/s)": 1.438622 }, { "epoch": 4.354997643631378, "grad_norm": 5.834932327270508, "learning_rate": 4.050140606664505e-06, "loss": 2.3838035583496096, "memory(GiB)": 77.56, "step": 101650, "token_acc": 0.4967948717948718, "train_speed(iter/s)": 1.438621 }, { "epoch": 4.355211858960628, "grad_norm": 5.4448933601379395, "learning_rate": 4.047487715072706e-06, "loss": 2.1299243927001954, "memory(GiB)": 77.56, "step": 101655, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.438628 }, { "epoch": 4.355426074289876, "grad_norm": 6.422670841217041, "learning_rate": 4.044835655953772e-06, "loss": 2.3987396240234373, "memory(GiB)": 77.56, "step": 101660, "token_acc": 0.4803921568627451, "train_speed(iter/s)": 1.438624 }, { "epoch": 4.355640289619125, "grad_norm": 6.349356651306152, "learning_rate": 4.042184429355739e-06, "loss": 2.0001426696777345, "memory(GiB)": 77.56, "step": 101665, "token_acc": 0.5591397849462365, "train_speed(iter/s)": 1.438618 }, { "epoch": 4.355854504948374, "grad_norm": 5.946943759918213, "learning_rate": 4.039534035326659e-06, "loss": 2.1214082717895506, "memory(GiB)": 77.56, "step": 101670, "token_acc": 0.5420875420875421, "train_speed(iter/s)": 1.438596 }, { "epoch": 4.356068720277623, "grad_norm": 7.886771202087402, "learning_rate": 4.0368844739145315e-06, "loss": 2.4991241455078126, "memory(GiB)": 77.56, "step": 101675, "token_acc": 0.4817073170731707, "train_speed(iter/s)": 1.438611 }, { "epoch": 4.356282935606872, "grad_norm": 7.021752834320068, "learning_rate": 4.034235745167353e-06, "loss": 2.363290023803711, "memory(GiB)": 77.56, "step": 101680, "token_acc": 0.47987616099071206, "train_speed(iter/s)": 1.43862 }, { "epoch": 4.356497150936121, "grad_norm": 7.677444934844971, "learning_rate": 4.031587849133123e-06, "loss": 2.320631408691406, "memory(GiB)": 77.56, "step": 101685, "token_acc": 0.5136186770428015, "train_speed(iter/s)": 1.438621 }, { "epoch": 4.35671136626537, "grad_norm": 7.164308071136475, "learning_rate": 4.0289407858597974e-06, "loss": 2.4267263412475586, "memory(GiB)": 77.56, "step": 101690, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438621 }, { "epoch": 4.356925581594619, "grad_norm": 7.085773468017578, "learning_rate": 4.026294555395332e-06, "loss": 2.3393688201904297, "memory(GiB)": 77.56, "step": 101695, "token_acc": 0.5, "train_speed(iter/s)": 1.43862 }, { "epoch": 4.357139796923867, "grad_norm": 7.105220317840576, "learning_rate": 4.023649157787673e-06, "loss": 2.223957061767578, "memory(GiB)": 77.56, "step": 101700, "token_acc": 0.5267605633802817, "train_speed(iter/s)": 1.438626 }, { "epoch": 4.357354012253117, "grad_norm": 5.647937297821045, "learning_rate": 4.021004593084726e-06, "loss": 2.199010467529297, "memory(GiB)": 77.56, "step": 101705, "token_acc": 0.5323741007194245, "train_speed(iter/s)": 1.438619 }, { "epoch": 4.357568227582366, "grad_norm": 6.133718490600586, "learning_rate": 4.0183608613344244e-06, "loss": 2.3756628036499023, "memory(GiB)": 77.56, "step": 101710, "token_acc": 0.4897260273972603, "train_speed(iter/s)": 1.438602 }, { "epoch": 4.357782442911615, "grad_norm": 6.701653480529785, "learning_rate": 4.015717962584648e-06, "loss": 2.4356565475463867, "memory(GiB)": 77.56, "step": 101715, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 1.438594 }, { "epoch": 4.357996658240864, "grad_norm": 6.717548847198486, "learning_rate": 4.013075896883278e-06, "loss": 2.23958740234375, "memory(GiB)": 77.56, "step": 101720, "token_acc": 0.4885245901639344, "train_speed(iter/s)": 1.4386 }, { "epoch": 4.358210873570113, "grad_norm": 4.824134826660156, "learning_rate": 4.0104346642781785e-06, "loss": 2.1966087341308596, "memory(GiB)": 77.56, "step": 101725, "token_acc": 0.5040983606557377, "train_speed(iter/s)": 1.438594 }, { "epoch": 4.358425088899361, "grad_norm": 6.535068035125732, "learning_rate": 4.007794264817194e-06, "loss": 2.3015857696533204, "memory(GiB)": 77.56, "step": 101730, "token_acc": 0.47477744807121663, "train_speed(iter/s)": 1.438593 }, { "epoch": 4.3586393042286105, "grad_norm": 6.709981441497803, "learning_rate": 4.005154698548152e-06, "loss": 2.2980566024780273, "memory(GiB)": 77.56, "step": 101735, "token_acc": 0.5331412103746398, "train_speed(iter/s)": 1.438604 }, { "epoch": 4.35885351955786, "grad_norm": 5.353514671325684, "learning_rate": 4.002515965518883e-06, "loss": 2.2750146865844725, "memory(GiB)": 77.56, "step": 101740, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.438606 }, { "epoch": 4.359067734887109, "grad_norm": 8.38457202911377, "learning_rate": 3.999878065777191e-06, "loss": 2.6203729629516603, "memory(GiB)": 77.56, "step": 101745, "token_acc": 0.44694533762057875, "train_speed(iter/s)": 1.438619 }, { "epoch": 4.359281950216357, "grad_norm": 6.5823493003845215, "learning_rate": 3.9972409993708605e-06, "loss": 2.3757259368896486, "memory(GiB)": 77.56, "step": 101750, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 1.438611 }, { "epoch": 4.359496165545607, "grad_norm": 11.166370391845703, "learning_rate": 3.994604766347665e-06, "loss": 2.050105667114258, "memory(GiB)": 77.56, "step": 101755, "token_acc": 0.5276752767527675, "train_speed(iter/s)": 1.438627 }, { "epoch": 4.359710380874855, "grad_norm": 4.874205112457275, "learning_rate": 3.99196936675536e-06, "loss": 2.4722108840942383, "memory(GiB)": 77.56, "step": 101760, "token_acc": 0.4896755162241888, "train_speed(iter/s)": 1.438639 }, { "epoch": 4.359924596204104, "grad_norm": 5.018775939941406, "learning_rate": 3.989334800641681e-06, "loss": 1.9645587921142578, "memory(GiB)": 77.56, "step": 101765, "token_acc": 0.5522875816993464, "train_speed(iter/s)": 1.438644 }, { "epoch": 4.3601388115333535, "grad_norm": 4.494512557983398, "learning_rate": 3.986701068054371e-06, "loss": 2.1807619094848634, "memory(GiB)": 77.56, "step": 101770, "token_acc": 0.5564516129032258, "train_speed(iter/s)": 1.438631 }, { "epoch": 4.360353026862603, "grad_norm": 6.0012359619140625, "learning_rate": 3.9840681690411384e-06, "loss": 2.3009567260742188, "memory(GiB)": 77.56, "step": 101775, "token_acc": 0.51440329218107, "train_speed(iter/s)": 1.43862 }, { "epoch": 4.360567242191851, "grad_norm": 6.165236473083496, "learning_rate": 3.981436103649672e-06, "loss": 2.0071544647216797, "memory(GiB)": 77.56, "step": 101780, "token_acc": 0.5934959349593496, "train_speed(iter/s)": 1.438632 }, { "epoch": 4.3607814575211, "grad_norm": 5.125621318817139, "learning_rate": 3.978804871927661e-06, "loss": 2.307464599609375, "memory(GiB)": 77.56, "step": 101785, "token_acc": 0.5119453924914675, "train_speed(iter/s)": 1.438634 }, { "epoch": 4.360995672850349, "grad_norm": 6.096005916595459, "learning_rate": 3.976174473922772e-06, "loss": 2.285843276977539, "memory(GiB)": 77.56, "step": 101790, "token_acc": 0.5118110236220472, "train_speed(iter/s)": 1.438644 }, { "epoch": 4.361209888179598, "grad_norm": 5.178644180297852, "learning_rate": 3.973544909682647e-06, "loss": 2.1128358840942383, "memory(GiB)": 77.56, "step": 101795, "token_acc": 0.5150375939849624, "train_speed(iter/s)": 1.438645 }, { "epoch": 4.361424103508847, "grad_norm": 6.157804489135742, "learning_rate": 3.970916179254941e-06, "loss": 2.3324485778808595, "memory(GiB)": 77.56, "step": 101800, "token_acc": 0.5, "train_speed(iter/s)": 1.438658 }, { "epoch": 4.3616383188380965, "grad_norm": 6.665396213531494, "learning_rate": 3.968288282687272e-06, "loss": 2.3250059127807616, "memory(GiB)": 77.56, "step": 101805, "token_acc": 0.5232198142414861, "train_speed(iter/s)": 1.438664 }, { "epoch": 4.361852534167345, "grad_norm": 6.363424301147461, "learning_rate": 3.965661220027233e-06, "loss": 2.289792060852051, "memory(GiB)": 77.56, "step": 101810, "token_acc": 0.5324232081911263, "train_speed(iter/s)": 1.438675 }, { "epoch": 4.362066749496594, "grad_norm": 6.050394058227539, "learning_rate": 3.963034991322423e-06, "loss": 2.2415313720703125, "memory(GiB)": 77.56, "step": 101815, "token_acc": 0.5114503816793893, "train_speed(iter/s)": 1.438702 }, { "epoch": 4.3622809648258425, "grad_norm": 7.389623641967773, "learning_rate": 3.9604095966204295e-06, "loss": 2.287662887573242, "memory(GiB)": 77.56, "step": 101820, "token_acc": 0.5364238410596026, "train_speed(iter/s)": 1.438705 }, { "epoch": 4.362495180155092, "grad_norm": 6.708871841430664, "learning_rate": 3.957785035968792e-06, "loss": 2.300059509277344, "memory(GiB)": 77.56, "step": 101825, "token_acc": 0.5661764705882353, "train_speed(iter/s)": 1.438701 }, { "epoch": 4.362709395484341, "grad_norm": 6.89017915725708, "learning_rate": 3.955161309415078e-06, "loss": 2.5353761672973634, "memory(GiB)": 77.56, "step": 101830, "token_acc": 0.4414715719063545, "train_speed(iter/s)": 1.438716 }, { "epoch": 4.36292361081359, "grad_norm": 5.644868850708008, "learning_rate": 3.9525384170068125e-06, "loss": 2.173910140991211, "memory(GiB)": 77.56, "step": 101835, "token_acc": 0.524904214559387, "train_speed(iter/s)": 1.438723 }, { "epoch": 4.363137826142839, "grad_norm": 7.357850074768066, "learning_rate": 3.949916358791511e-06, "loss": 2.234305572509766, "memory(GiB)": 77.56, "step": 101840, "token_acc": 0.5340136054421769, "train_speed(iter/s)": 1.438726 }, { "epoch": 4.363352041472088, "grad_norm": 6.908636093139648, "learning_rate": 3.94729513481667e-06, "loss": 2.748577880859375, "memory(GiB)": 77.56, "step": 101845, "token_acc": 0.4460431654676259, "train_speed(iter/s)": 1.438723 }, { "epoch": 4.363566256801336, "grad_norm": 7.517205715179443, "learning_rate": 3.944674745129773e-06, "loss": 2.2894689559936525, "memory(GiB)": 77.56, "step": 101850, "token_acc": 0.4823943661971831, "train_speed(iter/s)": 1.438732 }, { "epoch": 4.3637804721305855, "grad_norm": 5.923609256744385, "learning_rate": 3.942055189778305e-06, "loss": 2.2906402587890624, "memory(GiB)": 77.56, "step": 101855, "token_acc": 0.5177514792899408, "train_speed(iter/s)": 1.438738 }, { "epoch": 4.363994687459835, "grad_norm": 6.85636043548584, "learning_rate": 3.93943646880971e-06, "loss": 1.9102947235107421, "memory(GiB)": 77.56, "step": 101860, "token_acc": 0.5371900826446281, "train_speed(iter/s)": 1.438741 }, { "epoch": 4.364208902789084, "grad_norm": 5.837848663330078, "learning_rate": 3.936818582271428e-06, "loss": 1.9517528533935546, "memory(GiB)": 77.56, "step": 101865, "token_acc": 0.535031847133758, "train_speed(iter/s)": 1.438729 }, { "epoch": 4.364423118118332, "grad_norm": 6.050806522369385, "learning_rate": 3.934201530210896e-06, "loss": 2.189637565612793, "memory(GiB)": 77.56, "step": 101870, "token_acc": 0.5250836120401338, "train_speed(iter/s)": 1.438734 }, { "epoch": 4.364637333447582, "grad_norm": 7.866569519042969, "learning_rate": 3.931585312675512e-06, "loss": 2.72440242767334, "memory(GiB)": 77.56, "step": 101875, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.438745 }, { "epoch": 4.36485154877683, "grad_norm": 7.117494583129883, "learning_rate": 3.928969929712673e-06, "loss": 2.551620292663574, "memory(GiB)": 77.56, "step": 101880, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 1.438731 }, { "epoch": 4.365065764106079, "grad_norm": 7.407101154327393, "learning_rate": 3.926355381369762e-06, "loss": 2.266368865966797, "memory(GiB)": 77.56, "step": 101885, "token_acc": 0.47719298245614034, "train_speed(iter/s)": 1.438732 }, { "epoch": 4.3652799794353285, "grad_norm": 8.699575424194336, "learning_rate": 3.923741667694142e-06, "loss": 2.057978057861328, "memory(GiB)": 77.56, "step": 101890, "token_acc": 0.49586776859504134, "train_speed(iter/s)": 1.438738 }, { "epoch": 4.365494194764578, "grad_norm": 5.718353271484375, "learning_rate": 3.921128788733169e-06, "loss": 2.2229318618774414, "memory(GiB)": 77.56, "step": 101895, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 1.438749 }, { "epoch": 4.365708410093826, "grad_norm": 6.512753963470459, "learning_rate": 3.918516744534178e-06, "loss": 2.528089141845703, "memory(GiB)": 77.56, "step": 101900, "token_acc": 0.4919614147909968, "train_speed(iter/s)": 1.438768 }, { "epoch": 4.365922625423075, "grad_norm": 6.803748607635498, "learning_rate": 3.915905535144482e-06, "loss": 2.234787178039551, "memory(GiB)": 77.56, "step": 101905, "token_acc": 0.5070921985815603, "train_speed(iter/s)": 1.438779 }, { "epoch": 4.366136840752324, "grad_norm": 6.593135833740234, "learning_rate": 3.9132951606113745e-06, "loss": 2.3033109664916993, "memory(GiB)": 77.56, "step": 101910, "token_acc": 0.5231788079470199, "train_speed(iter/s)": 1.43879 }, { "epoch": 4.366351056081573, "grad_norm": 8.200020790100098, "learning_rate": 3.910685620982174e-06, "loss": 2.102082061767578, "memory(GiB)": 77.56, "step": 101915, "token_acc": 0.5261194029850746, "train_speed(iter/s)": 1.438796 }, { "epoch": 4.366565271410822, "grad_norm": 6.162394046783447, "learning_rate": 3.908076916304132e-06, "loss": 2.1550668716430663, "memory(GiB)": 77.56, "step": 101920, "token_acc": 0.5176848874598071, "train_speed(iter/s)": 1.4388 }, { "epoch": 4.3667794867400715, "grad_norm": 5.4941511154174805, "learning_rate": 3.9054690466245215e-06, "loss": 2.4001319885253904, "memory(GiB)": 77.56, "step": 101925, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.438814 }, { "epoch": 4.36699370206932, "grad_norm": 7.408539295196533, "learning_rate": 3.902862011990571e-06, "loss": 2.372382926940918, "memory(GiB)": 77.56, "step": 101930, "token_acc": 0.5079872204472844, "train_speed(iter/s)": 1.438816 }, { "epoch": 4.367207917398569, "grad_norm": 5.480215072631836, "learning_rate": 3.900255812449527e-06, "loss": 1.9784221649169922, "memory(GiB)": 77.56, "step": 101935, "token_acc": 0.5536480686695279, "train_speed(iter/s)": 1.438819 }, { "epoch": 4.3674221327278175, "grad_norm": 6.481391906738281, "learning_rate": 3.897650448048579e-06, "loss": 2.184911346435547, "memory(GiB)": 77.56, "step": 101940, "token_acc": 0.5221843003412969, "train_speed(iter/s)": 1.438829 }, { "epoch": 4.367636348057067, "grad_norm": 8.044012069702148, "learning_rate": 3.89504591883495e-06, "loss": 2.596766471862793, "memory(GiB)": 77.56, "step": 101945, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 1.438852 }, { "epoch": 4.367850563386316, "grad_norm": 7.4052300453186035, "learning_rate": 3.892442224855813e-06, "loss": 2.018967056274414, "memory(GiB)": 77.56, "step": 101950, "token_acc": 0.5510204081632653, "train_speed(iter/s)": 1.438858 }, { "epoch": 4.368064778715565, "grad_norm": 5.536005973815918, "learning_rate": 3.889839366158343e-06, "loss": 2.1921310424804688, "memory(GiB)": 77.56, "step": 101955, "token_acc": 0.475, "train_speed(iter/s)": 1.438861 }, { "epoch": 4.368278994044814, "grad_norm": 6.33701753616333, "learning_rate": 3.8872373427896735e-06, "loss": 2.0811885833740233, "memory(GiB)": 77.56, "step": 101960, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 1.43887 }, { "epoch": 4.368493209374063, "grad_norm": 7.837864398956299, "learning_rate": 3.884636154796967e-06, "loss": 2.385394287109375, "memory(GiB)": 77.56, "step": 101965, "token_acc": 0.48484848484848486, "train_speed(iter/s)": 1.438865 }, { "epoch": 4.368707424703311, "grad_norm": 5.585497856140137, "learning_rate": 3.88203580222733e-06, "loss": 2.2544960021972655, "memory(GiB)": 77.56, "step": 101970, "token_acc": 0.5222222222222223, "train_speed(iter/s)": 1.43886 }, { "epoch": 4.368921640032561, "grad_norm": 5.798569679260254, "learning_rate": 3.879436285127886e-06, "loss": 2.688420295715332, "memory(GiB)": 77.56, "step": 101975, "token_acc": 0.43769968051118213, "train_speed(iter/s)": 1.438862 }, { "epoch": 4.36913585536181, "grad_norm": 6.69862174987793, "learning_rate": 3.876837603545713e-06, "loss": 2.441334533691406, "memory(GiB)": 77.56, "step": 101980, "token_acc": 0.48606811145510836, "train_speed(iter/s)": 1.438852 }, { "epoch": 4.369350070691059, "grad_norm": 7.233700752258301, "learning_rate": 3.874239757527897e-06, "loss": 2.4201616287231444, "memory(GiB)": 77.56, "step": 101985, "token_acc": 0.48717948717948717, "train_speed(iter/s)": 1.438855 }, { "epoch": 4.369564286020307, "grad_norm": 5.579944610595703, "learning_rate": 3.8716427471214955e-06, "loss": 2.0034767150878907, "memory(GiB)": 77.56, "step": 101990, "token_acc": 0.5595667870036101, "train_speed(iter/s)": 1.438849 }, { "epoch": 4.369778501349557, "grad_norm": 7.078153610229492, "learning_rate": 3.869046572373552e-06, "loss": 2.261469841003418, "memory(GiB)": 77.56, "step": 101995, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 1.438849 }, { "epoch": 4.369992716678805, "grad_norm": 7.603664875030518, "learning_rate": 3.866451233331108e-06, "loss": 2.156891632080078, "memory(GiB)": 77.56, "step": 102000, "token_acc": 0.5174825174825175, "train_speed(iter/s)": 1.438855 }, { "epoch": 4.369992716678805, "eval_loss": 2.340041160583496, "eval_runtime": 14.1504, "eval_samples_per_second": 7.067, "eval_steps_per_second": 7.067, "eval_token_acc": 0.4856020942408377, "step": 102000 }, { "epoch": 4.370206932008054, "grad_norm": 5.848587512969971, "learning_rate": 3.863856730041182e-06, "loss": 2.3252744674682617, "memory(GiB)": 77.56, "step": 102005, "token_acc": 0.4905123339658444, "train_speed(iter/s)": 1.438554 }, { "epoch": 4.370421147337304, "grad_norm": 5.226724147796631, "learning_rate": 3.861263062550769e-06, "loss": 2.389395523071289, "memory(GiB)": 77.56, "step": 102010, "token_acc": 0.4746666666666667, "train_speed(iter/s)": 1.438556 }, { "epoch": 4.370635362666553, "grad_norm": 8.791924476623535, "learning_rate": 3.858670230906852e-06, "loss": 2.389272117614746, "memory(GiB)": 77.56, "step": 102015, "token_acc": 0.5029585798816568, "train_speed(iter/s)": 1.438563 }, { "epoch": 4.370849577995801, "grad_norm": 5.486352443695068, "learning_rate": 3.856078235156413e-06, "loss": 2.3030513763427733, "memory(GiB)": 77.56, "step": 102020, "token_acc": 0.4563758389261745, "train_speed(iter/s)": 1.438559 }, { "epoch": 4.3710637933250505, "grad_norm": 5.798378944396973, "learning_rate": 3.853487075346396e-06, "loss": 2.3755891799926756, "memory(GiB)": 77.56, "step": 102025, "token_acc": 0.5255474452554745, "train_speed(iter/s)": 1.438557 }, { "epoch": 4.371278008654299, "grad_norm": 6.607595443725586, "learning_rate": 3.850896751523758e-06, "loss": 2.451091766357422, "memory(GiB)": 77.56, "step": 102030, "token_acc": 0.4888888888888889, "train_speed(iter/s)": 1.438574 }, { "epoch": 4.371492223983548, "grad_norm": 5.402934551239014, "learning_rate": 3.8483072637354065e-06, "loss": 2.4583719253540037, "memory(GiB)": 77.56, "step": 102035, "token_acc": 0.48427672955974843, "train_speed(iter/s)": 1.438562 }, { "epoch": 4.371706439312797, "grad_norm": 5.248274803161621, "learning_rate": 3.845718612028271e-06, "loss": 2.4814741134643556, "memory(GiB)": 77.56, "step": 102040, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438561 }, { "epoch": 4.371920654642047, "grad_norm": 6.561869144439697, "learning_rate": 3.843130796449235e-06, "loss": 2.2307239532470704, "memory(GiB)": 77.56, "step": 102045, "token_acc": 0.49390243902439024, "train_speed(iter/s)": 1.438575 }, { "epoch": 4.372134869971295, "grad_norm": 5.527139663696289, "learning_rate": 3.840543817045189e-06, "loss": 2.2846038818359373, "memory(GiB)": 77.56, "step": 102050, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 1.43857 }, { "epoch": 4.372349085300544, "grad_norm": 5.8063812255859375, "learning_rate": 3.8379576738629856e-06, "loss": 2.3370399475097656, "memory(GiB)": 77.56, "step": 102055, "token_acc": 0.5147540983606558, "train_speed(iter/s)": 1.438553 }, { "epoch": 4.3725633006297935, "grad_norm": 6.326207637786865, "learning_rate": 3.8353723669494855e-06, "loss": 2.1681535720825194, "memory(GiB)": 77.56, "step": 102060, "token_acc": 0.5203761755485894, "train_speed(iter/s)": 1.438543 }, { "epoch": 4.372777515959042, "grad_norm": 6.0897417068481445, "learning_rate": 3.8327878963515256e-06, "loss": 2.502007484436035, "memory(GiB)": 77.56, "step": 102065, "token_acc": 0.48333333333333334, "train_speed(iter/s)": 1.438546 }, { "epoch": 4.372991731288291, "grad_norm": 7.496880054473877, "learning_rate": 3.830204262115916e-06, "loss": 2.9834030151367186, "memory(GiB)": 77.56, "step": 102070, "token_acc": 0.4358974358974359, "train_speed(iter/s)": 1.438553 }, { "epoch": 4.37320594661754, "grad_norm": 5.736611843109131, "learning_rate": 3.827621464289471e-06, "loss": 2.283222961425781, "memory(GiB)": 77.56, "step": 102075, "token_acc": 0.5428571428571428, "train_speed(iter/s)": 1.438566 }, { "epoch": 4.373420161946789, "grad_norm": 7.284671306610107, "learning_rate": 3.825039502918965e-06, "loss": 2.4526615142822266, "memory(GiB)": 77.56, "step": 102080, "token_acc": 0.5220125786163522, "train_speed(iter/s)": 1.438559 }, { "epoch": 4.373634377276038, "grad_norm": 5.200911045074463, "learning_rate": 3.822458378051197e-06, "loss": 2.2039566040039062, "memory(GiB)": 77.56, "step": 102085, "token_acc": 0.5105633802816901, "train_speed(iter/s)": 1.438561 }, { "epoch": 4.373848592605287, "grad_norm": 5.566181659698486, "learning_rate": 3.819878089732909e-06, "loss": 2.448895072937012, "memory(GiB)": 77.56, "step": 102090, "token_acc": 0.5016181229773463, "train_speed(iter/s)": 1.438564 }, { "epoch": 4.374062807934536, "grad_norm": 6.476151466369629, "learning_rate": 3.817298638010852e-06, "loss": 2.356591796875, "memory(GiB)": 77.56, "step": 102095, "token_acc": 0.4879032258064516, "train_speed(iter/s)": 1.438573 }, { "epoch": 4.374277023263785, "grad_norm": 5.828040599822998, "learning_rate": 3.814720022931756e-06, "loss": 2.1494466781616213, "memory(GiB)": 77.56, "step": 102100, "token_acc": 0.524822695035461, "train_speed(iter/s)": 1.438574 }, { "epoch": 4.374491238593034, "grad_norm": 6.093513488769531, "learning_rate": 3.812142244542327e-06, "loss": 2.407297134399414, "memory(GiB)": 77.56, "step": 102105, "token_acc": 0.48109965635738833, "train_speed(iter/s)": 1.438578 }, { "epoch": 4.3747054539222825, "grad_norm": 7.090908050537109, "learning_rate": 3.809565302889262e-06, "loss": 2.3766469955444336, "memory(GiB)": 77.56, "step": 102110, "token_acc": 0.5, "train_speed(iter/s)": 1.438595 }, { "epoch": 4.374919669251532, "grad_norm": 6.150954723358154, "learning_rate": 3.8069891980192507e-06, "loss": 2.2138465881347655, "memory(GiB)": 77.56, "step": 102115, "token_acc": 0.5566343042071198, "train_speed(iter/s)": 1.438603 }, { "epoch": 4.375133884580781, "grad_norm": 5.788748264312744, "learning_rate": 3.8044139299789727e-06, "loss": 2.3531898498535155, "memory(GiB)": 77.56, "step": 102120, "token_acc": 0.4879518072289157, "train_speed(iter/s)": 1.438619 }, { "epoch": 4.375348099910029, "grad_norm": 7.598631381988525, "learning_rate": 3.801839498815074e-06, "loss": 2.2558507919311523, "memory(GiB)": 77.56, "step": 102125, "token_acc": 0.5335820895522388, "train_speed(iter/s)": 1.438629 }, { "epoch": 4.375562315239279, "grad_norm": 5.6159467697143555, "learning_rate": 3.799265904574184e-06, "loss": 2.3174352645874023, "memory(GiB)": 77.56, "step": 102130, "token_acc": 0.521594684385382, "train_speed(iter/s)": 1.43865 }, { "epoch": 4.375776530568528, "grad_norm": 6.232875823974609, "learning_rate": 3.7966931473029378e-06, "loss": 2.404511642456055, "memory(GiB)": 77.56, "step": 102135, "token_acc": 0.5055350553505535, "train_speed(iter/s)": 1.43866 }, { "epoch": 4.375990745897776, "grad_norm": 7.546611309051514, "learning_rate": 3.7941212270479254e-06, "loss": 2.143113708496094, "memory(GiB)": 77.56, "step": 102140, "token_acc": 0.5462184873949579, "train_speed(iter/s)": 1.438673 }, { "epoch": 4.3762049612270255, "grad_norm": 7.115991592407227, "learning_rate": 3.79155014385576e-06, "loss": 2.464577293395996, "memory(GiB)": 77.56, "step": 102145, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 1.438675 }, { "epoch": 4.376419176556275, "grad_norm": 7.377319812774658, "learning_rate": 3.7889798977730153e-06, "loss": 2.2669668197631836, "memory(GiB)": 77.56, "step": 102150, "token_acc": 0.5251798561151079, "train_speed(iter/s)": 1.438667 }, { "epoch": 4.376633391885523, "grad_norm": 5.948807716369629, "learning_rate": 3.7864104888462425e-06, "loss": 2.1526660919189453, "memory(GiB)": 77.56, "step": 102155, "token_acc": 0.5144927536231884, "train_speed(iter/s)": 1.438643 }, { "epoch": 4.376847607214772, "grad_norm": 5.095167636871338, "learning_rate": 3.783841917122e-06, "loss": 2.2469051361083983, "memory(GiB)": 77.56, "step": 102160, "token_acc": 0.554858934169279, "train_speed(iter/s)": 1.438652 }, { "epoch": 4.377061822544022, "grad_norm": 5.583045959472656, "learning_rate": 3.781274182646816e-06, "loss": 2.276612663269043, "memory(GiB)": 77.56, "step": 102165, "token_acc": 0.532520325203252, "train_speed(iter/s)": 1.438657 }, { "epoch": 4.37727603787327, "grad_norm": 5.439349174499512, "learning_rate": 3.778707285467198e-06, "loss": 2.2353775024414064, "memory(GiB)": 77.56, "step": 102170, "token_acc": 0.5279503105590062, "train_speed(iter/s)": 1.438663 }, { "epoch": 4.377490253202519, "grad_norm": 5.987916469573975, "learning_rate": 3.776141225629659e-06, "loss": 2.150084686279297, "memory(GiB)": 77.56, "step": 102175, "token_acc": 0.5375, "train_speed(iter/s)": 1.438653 }, { "epoch": 4.3777044685317685, "grad_norm": 6.209805488586426, "learning_rate": 3.77357600318069e-06, "loss": 2.354393196105957, "memory(GiB)": 77.56, "step": 102180, "token_acc": 0.484375, "train_speed(iter/s)": 1.438647 }, { "epoch": 4.377918683861017, "grad_norm": 7.4460649490356445, "learning_rate": 3.7710116181667422e-06, "loss": 2.400659942626953, "memory(GiB)": 77.56, "step": 102185, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 1.438632 }, { "epoch": 4.378132899190266, "grad_norm": 9.189557075500488, "learning_rate": 3.7684480706343005e-06, "loss": 2.434188461303711, "memory(GiB)": 77.56, "step": 102190, "token_acc": 0.4775641025641026, "train_speed(iter/s)": 1.438635 }, { "epoch": 4.378347114519515, "grad_norm": 5.722678184509277, "learning_rate": 3.765885360629784e-06, "loss": 2.1983808517456054, "memory(GiB)": 77.56, "step": 102195, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.438633 }, { "epoch": 4.378561329848764, "grad_norm": 6.487076759338379, "learning_rate": 3.7633234881996216e-06, "loss": 2.265846061706543, "memory(GiB)": 77.56, "step": 102200, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.438646 }, { "epoch": 4.378775545178013, "grad_norm": 6.377604007720947, "learning_rate": 3.760762453390232e-06, "loss": 2.0761249542236326, "memory(GiB)": 77.56, "step": 102205, "token_acc": 0.5091575091575091, "train_speed(iter/s)": 1.43866 }, { "epoch": 4.378989760507262, "grad_norm": 6.834473133087158, "learning_rate": 3.7582022562480112e-06, "loss": 1.9617494583129882, "memory(GiB)": 77.56, "step": 102210, "token_acc": 0.5702479338842975, "train_speed(iter/s)": 1.438667 }, { "epoch": 4.379203975836511, "grad_norm": 6.393004417419434, "learning_rate": 3.7556428968193335e-06, "loss": 1.9575017929077148, "memory(GiB)": 77.56, "step": 102215, "token_acc": 0.5806451612903226, "train_speed(iter/s)": 1.438675 }, { "epoch": 4.37941819116576, "grad_norm": 7.924510478973389, "learning_rate": 3.7530843751505617e-06, "loss": 2.2697858810424805, "memory(GiB)": 77.56, "step": 102220, "token_acc": 0.5330578512396694, "train_speed(iter/s)": 1.43868 }, { "epoch": 4.379632406495009, "grad_norm": 6.417081356048584, "learning_rate": 3.7505266912880423e-06, "loss": 1.9659889221191407, "memory(GiB)": 77.56, "step": 102225, "token_acc": 0.5542168674698795, "train_speed(iter/s)": 1.438684 }, { "epoch": 4.3798466218242575, "grad_norm": 5.71816349029541, "learning_rate": 3.7479698452781264e-06, "loss": 2.333475112915039, "memory(GiB)": 77.56, "step": 102230, "token_acc": 0.5338078291814946, "train_speed(iter/s)": 1.438668 }, { "epoch": 4.380060837153507, "grad_norm": 5.253359317779541, "learning_rate": 3.7454138371671275e-06, "loss": 2.054024314880371, "memory(GiB)": 77.56, "step": 102235, "token_acc": 0.5498392282958199, "train_speed(iter/s)": 1.438679 }, { "epoch": 4.380275052482756, "grad_norm": 5.206414699554443, "learning_rate": 3.742858667001342e-06, "loss": 2.145540237426758, "memory(GiB)": 77.56, "step": 102240, "token_acc": 0.5130718954248366, "train_speed(iter/s)": 1.438676 }, { "epoch": 4.380489267812004, "grad_norm": 7.803227424621582, "learning_rate": 3.7403043348270716e-06, "loss": 2.354961395263672, "memory(GiB)": 77.56, "step": 102245, "token_acc": 0.4647887323943662, "train_speed(iter/s)": 1.438677 }, { "epoch": 4.380703483141254, "grad_norm": 6.29659366607666, "learning_rate": 3.737750840690579e-06, "loss": 2.321546173095703, "memory(GiB)": 77.56, "step": 102250, "token_acc": 0.5518672199170125, "train_speed(iter/s)": 1.43869 }, { "epoch": 4.380917698470503, "grad_norm": 6.761232852935791, "learning_rate": 3.735198184638117e-06, "loss": 2.027303123474121, "memory(GiB)": 77.56, "step": 102255, "token_acc": 0.5261324041811847, "train_speed(iter/s)": 1.438702 }, { "epoch": 4.381131913799751, "grad_norm": 6.1805524826049805, "learning_rate": 3.732646366715942e-06, "loss": 2.292635917663574, "memory(GiB)": 77.56, "step": 102260, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.438691 }, { "epoch": 4.381346129129001, "grad_norm": 7.5055975914001465, "learning_rate": 3.7300953869702894e-06, "loss": 1.9432130813598634, "memory(GiB)": 77.56, "step": 102265, "token_acc": 0.572992700729927, "train_speed(iter/s)": 1.438707 }, { "epoch": 4.38156034445825, "grad_norm": 5.628097057342529, "learning_rate": 3.7275452454473613e-06, "loss": 2.564580535888672, "memory(GiB)": 77.56, "step": 102270, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 1.438709 }, { "epoch": 4.381774559787498, "grad_norm": 6.682338237762451, "learning_rate": 3.72499594219336e-06, "loss": 2.2786733627319338, "memory(GiB)": 77.56, "step": 102275, "token_acc": 0.5155709342560554, "train_speed(iter/s)": 1.438721 }, { "epoch": 4.3819887751167474, "grad_norm": 6.442184925079346, "learning_rate": 3.7224474772544706e-06, "loss": 2.3629243850708006, "memory(GiB)": 77.56, "step": 102280, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 1.438701 }, { "epoch": 4.382202990445997, "grad_norm": 5.647762298583984, "learning_rate": 3.7198998506768444e-06, "loss": 2.2429290771484376, "memory(GiB)": 77.56, "step": 102285, "token_acc": 0.5249169435215947, "train_speed(iter/s)": 1.438715 }, { "epoch": 4.382417205775245, "grad_norm": 5.52142858505249, "learning_rate": 3.717353062506651e-06, "loss": 2.2926504135131838, "memory(GiB)": 77.56, "step": 102290, "token_acc": 0.4810606060606061, "train_speed(iter/s)": 1.438706 }, { "epoch": 4.382631421104494, "grad_norm": 6.8619537353515625, "learning_rate": 3.714807112790031e-06, "loss": 2.184850311279297, "memory(GiB)": 77.56, "step": 102295, "token_acc": 0.5617529880478087, "train_speed(iter/s)": 1.438721 }, { "epoch": 4.382845636433744, "grad_norm": 5.306491851806641, "learning_rate": 3.7122620015730967e-06, "loss": 2.162005615234375, "memory(GiB)": 77.56, "step": 102300, "token_acc": 0.525974025974026, "train_speed(iter/s)": 1.438727 }, { "epoch": 4.383059851762992, "grad_norm": 5.403891086578369, "learning_rate": 3.709717728901957e-06, "loss": 1.9421588897705078, "memory(GiB)": 77.56, "step": 102305, "token_acc": 0.553030303030303, "train_speed(iter/s)": 1.438741 }, { "epoch": 4.383274067092241, "grad_norm": 6.866203308105469, "learning_rate": 3.707174294822702e-06, "loss": 2.7477653503417967, "memory(GiB)": 77.56, "step": 102310, "token_acc": 0.46325878594249204, "train_speed(iter/s)": 1.438757 }, { "epoch": 4.3834882824214905, "grad_norm": 6.528674125671387, "learning_rate": 3.7046316993814058e-06, "loss": 2.1985727310180665, "memory(GiB)": 77.56, "step": 102315, "token_acc": 0.5447470817120622, "train_speed(iter/s)": 1.438736 }, { "epoch": 4.383702497750739, "grad_norm": 5.5168867111206055, "learning_rate": 3.702089942624143e-06, "loss": 2.0589027404785156, "memory(GiB)": 77.56, "step": 102320, "token_acc": 0.5611285266457681, "train_speed(iter/s)": 1.438728 }, { "epoch": 4.383916713079988, "grad_norm": 5.517563343048096, "learning_rate": 3.699549024596949e-06, "loss": 2.136216735839844, "memory(GiB)": 77.56, "step": 102325, "token_acc": 0.5387205387205387, "train_speed(iter/s)": 1.438728 }, { "epoch": 4.384130928409237, "grad_norm": 5.827166557312012, "learning_rate": 3.6970089453458534e-06, "loss": 2.275006103515625, "memory(GiB)": 77.56, "step": 102330, "token_acc": 0.5, "train_speed(iter/s)": 1.438743 }, { "epoch": 4.384345143738486, "grad_norm": 5.796579360961914, "learning_rate": 3.6944697049168807e-06, "loss": 2.2470897674560546, "memory(GiB)": 77.56, "step": 102335, "token_acc": 0.5337837837837838, "train_speed(iter/s)": 1.438738 }, { "epoch": 4.384559359067735, "grad_norm": 7.104578018188477, "learning_rate": 3.6919313033560278e-06, "loss": 2.325518035888672, "memory(GiB)": 77.56, "step": 102340, "token_acc": 0.5147540983606558, "train_speed(iter/s)": 1.438753 }, { "epoch": 4.384773574396984, "grad_norm": 5.3219075202941895, "learning_rate": 3.689393740709274e-06, "loss": 2.276840591430664, "memory(GiB)": 77.56, "step": 102345, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 1.438754 }, { "epoch": 4.384987789726233, "grad_norm": 7.892182350158691, "learning_rate": 3.686857017022605e-06, "loss": 2.577449417114258, "memory(GiB)": 77.56, "step": 102350, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 1.438761 }, { "epoch": 4.385202005055482, "grad_norm": 7.663252830505371, "learning_rate": 3.684321132341961e-06, "loss": 2.270546722412109, "memory(GiB)": 77.56, "step": 102355, "token_acc": 0.5243445692883895, "train_speed(iter/s)": 1.438747 }, { "epoch": 4.385416220384731, "grad_norm": 6.864215850830078, "learning_rate": 3.681786086713296e-06, "loss": 2.064653205871582, "memory(GiB)": 77.56, "step": 102360, "token_acc": 0.5594855305466238, "train_speed(iter/s)": 1.438756 }, { "epoch": 4.3856304357139795, "grad_norm": 5.660896301269531, "learning_rate": 3.679251880182516e-06, "loss": 2.261896324157715, "memory(GiB)": 77.56, "step": 102365, "token_acc": 0.49570200573065903, "train_speed(iter/s)": 1.438753 }, { "epoch": 4.385844651043229, "grad_norm": 6.448981761932373, "learning_rate": 3.6767185127955407e-06, "loss": 2.3489830017089846, "memory(GiB)": 77.56, "step": 102370, "token_acc": 0.5365079365079365, "train_speed(iter/s)": 1.438765 }, { "epoch": 4.386058866372478, "grad_norm": 5.482466220855713, "learning_rate": 3.674185984598266e-06, "loss": 2.160551071166992, "memory(GiB)": 77.56, "step": 102375, "token_acc": 0.5189003436426117, "train_speed(iter/s)": 1.438774 }, { "epoch": 4.386273081701726, "grad_norm": 6.697409629821777, "learning_rate": 3.671654295636573e-06, "loss": 2.5021800994873047, "memory(GiB)": 77.56, "step": 102380, "token_acc": 0.49310344827586206, "train_speed(iter/s)": 1.438769 }, { "epoch": 4.386487297030976, "grad_norm": 6.35969877243042, "learning_rate": 3.669123445956324e-06, "loss": 2.221974182128906, "memory(GiB)": 77.56, "step": 102385, "token_acc": 0.5191082802547771, "train_speed(iter/s)": 1.438766 }, { "epoch": 4.386701512360225, "grad_norm": 7.7098612785339355, "learning_rate": 3.6665934356033604e-06, "loss": 2.6257568359375, "memory(GiB)": 77.56, "step": 102390, "token_acc": 0.4819672131147541, "train_speed(iter/s)": 1.438783 }, { "epoch": 4.386915727689473, "grad_norm": 6.42761754989624, "learning_rate": 3.6640642646235236e-06, "loss": 2.2071720123291017, "memory(GiB)": 77.56, "step": 102395, "token_acc": 0.5340501792114696, "train_speed(iter/s)": 1.438795 }, { "epoch": 4.3871299430187225, "grad_norm": 5.812616348266602, "learning_rate": 3.661535933062621e-06, "loss": 2.221953010559082, "memory(GiB)": 77.56, "step": 102400, "token_acc": 0.4889705882352941, "train_speed(iter/s)": 1.438794 }, { "epoch": 4.387344158347972, "grad_norm": 7.074045181274414, "learning_rate": 3.6590084409664605e-06, "loss": 2.0088850021362306, "memory(GiB)": 77.56, "step": 102405, "token_acc": 0.532608695652174, "train_speed(iter/s)": 1.438804 }, { "epoch": 4.38755837367722, "grad_norm": 5.440163612365723, "learning_rate": 3.65648178838085e-06, "loss": 2.287728691101074, "memory(GiB)": 77.56, "step": 102410, "token_acc": 0.5341246290801187, "train_speed(iter/s)": 1.438801 }, { "epoch": 4.387772589006469, "grad_norm": 5.775484085083008, "learning_rate": 3.653955975351536e-06, "loss": 2.1079334259033202, "memory(GiB)": 77.56, "step": 102415, "token_acc": 0.5373134328358209, "train_speed(iter/s)": 1.438803 }, { "epoch": 4.387986804335719, "grad_norm": 5.511375904083252, "learning_rate": 3.651431001924294e-06, "loss": 2.3807945251464844, "memory(GiB)": 77.56, "step": 102420, "token_acc": 0.5325670498084292, "train_speed(iter/s)": 1.43882 }, { "epoch": 4.388201019664967, "grad_norm": 6.314801216125488, "learning_rate": 3.6489068681448522e-06, "loss": 2.421588134765625, "memory(GiB)": 77.56, "step": 102425, "token_acc": 0.4770992366412214, "train_speed(iter/s)": 1.438836 }, { "epoch": 4.388415234994216, "grad_norm": 5.989437103271484, "learning_rate": 3.646383574058937e-06, "loss": 2.293304443359375, "memory(GiB)": 77.56, "step": 102430, "token_acc": 0.49829351535836175, "train_speed(iter/s)": 1.43883 }, { "epoch": 4.3886294503234655, "grad_norm": 6.0139312744140625, "learning_rate": 3.6438611197122773e-06, "loss": 2.3289308547973633, "memory(GiB)": 77.56, "step": 102435, "token_acc": 0.4886731391585761, "train_speed(iter/s)": 1.438849 }, { "epoch": 4.388843665652714, "grad_norm": 5.903653144836426, "learning_rate": 3.641339505150554e-06, "loss": 2.2890724182128905, "memory(GiB)": 77.56, "step": 102440, "token_acc": 0.4748201438848921, "train_speed(iter/s)": 1.438867 }, { "epoch": 4.389057880981963, "grad_norm": 7.067310810089111, "learning_rate": 3.6388187304194577e-06, "loss": 2.2559886932373048, "memory(GiB)": 77.56, "step": 102445, "token_acc": 0.4868421052631579, "train_speed(iter/s)": 1.438856 }, { "epoch": 4.389272096311212, "grad_norm": 5.744080066680908, "learning_rate": 3.6362987955646468e-06, "loss": 2.105891799926758, "memory(GiB)": 77.56, "step": 102450, "token_acc": 0.53125, "train_speed(iter/s)": 1.438868 }, { "epoch": 4.389486311640461, "grad_norm": 7.300531387329102, "learning_rate": 3.6337797006317785e-06, "loss": 2.1764875411987306, "memory(GiB)": 77.56, "step": 102455, "token_acc": 0.551948051948052, "train_speed(iter/s)": 1.438872 }, { "epoch": 4.38970052696971, "grad_norm": 5.576843738555908, "learning_rate": 3.6312614456664782e-06, "loss": 2.132879638671875, "memory(GiB)": 77.56, "step": 102460, "token_acc": 0.5182481751824818, "train_speed(iter/s)": 1.438879 }, { "epoch": 4.389914742298959, "grad_norm": 8.703091621398926, "learning_rate": 3.6287440307143816e-06, "loss": 1.9866539001464845, "memory(GiB)": 77.56, "step": 102465, "token_acc": 0.5665399239543726, "train_speed(iter/s)": 1.43889 }, { "epoch": 4.390128957628208, "grad_norm": 8.888564109802246, "learning_rate": 3.6262274558210852e-06, "loss": 2.4895147323608398, "memory(GiB)": 77.56, "step": 102470, "token_acc": 0.4968152866242038, "train_speed(iter/s)": 1.438902 }, { "epoch": 4.390343172957457, "grad_norm": 5.700866222381592, "learning_rate": 3.6237117210321802e-06, "loss": 2.2201826095581056, "memory(GiB)": 77.56, "step": 102475, "token_acc": 0.5571955719557196, "train_speed(iter/s)": 1.438914 }, { "epoch": 4.390557388286706, "grad_norm": 8.399176597595215, "learning_rate": 3.621196826393236e-06, "loss": 2.436996269226074, "memory(GiB)": 77.56, "step": 102480, "token_acc": 0.48355263157894735, "train_speed(iter/s)": 1.438921 }, { "epoch": 4.3907716036159545, "grad_norm": 6.0412373542785645, "learning_rate": 3.6186827719498272e-06, "loss": 2.242756462097168, "memory(GiB)": 77.56, "step": 102485, "token_acc": 0.5272108843537415, "train_speed(iter/s)": 1.438926 }, { "epoch": 4.390985818945204, "grad_norm": 5.355155944824219, "learning_rate": 3.6161695577474784e-06, "loss": 2.1781797409057617, "memory(GiB)": 77.56, "step": 102490, "token_acc": 0.5345454545454545, "train_speed(iter/s)": 1.438928 }, { "epoch": 4.391200034274453, "grad_norm": 5.779280662536621, "learning_rate": 3.613657183831737e-06, "loss": 2.3542625427246096, "memory(GiB)": 77.56, "step": 102495, "token_acc": 0.49825783972125437, "train_speed(iter/s)": 1.438938 }, { "epoch": 4.391414249603701, "grad_norm": 8.196554183959961, "learning_rate": 3.61114565024811e-06, "loss": 1.898379135131836, "memory(GiB)": 77.56, "step": 102500, "token_acc": 0.5422535211267606, "train_speed(iter/s)": 1.438942 }, { "epoch": 4.391414249603701, "eval_loss": 2.239689350128174, "eval_runtime": 14.3867, "eval_samples_per_second": 6.951, "eval_steps_per_second": 6.951, "eval_token_acc": 0.4548919949174079, "step": 102500 }, { "epoch": 4.391628464932951, "grad_norm": 5.600312232971191, "learning_rate": 3.608634957042095e-06, "loss": 2.354815673828125, "memory(GiB)": 77.56, "step": 102505, "token_acc": 0.47314814814814815, "train_speed(iter/s)": 1.438618 }, { "epoch": 4.3918426802622, "grad_norm": 5.942442417144775, "learning_rate": 3.6061251042591837e-06, "loss": 2.652938461303711, "memory(GiB)": 77.56, "step": 102510, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 1.438611 }, { "epoch": 4.392056895591448, "grad_norm": 5.3099589347839355, "learning_rate": 3.6036160919448226e-06, "loss": 2.3568065643310545, "memory(GiB)": 77.56, "step": 102515, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.4386 }, { "epoch": 4.3922711109206976, "grad_norm": 8.654638290405273, "learning_rate": 3.6011079201444865e-06, "loss": 2.4184709548950196, "memory(GiB)": 77.56, "step": 102520, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 1.438594 }, { "epoch": 4.392485326249947, "grad_norm": 8.568259239196777, "learning_rate": 3.598600588903611e-06, "loss": 2.0518417358398438, "memory(GiB)": 77.56, "step": 102525, "token_acc": 0.5579710144927537, "train_speed(iter/s)": 1.438581 }, { "epoch": 4.392699541579195, "grad_norm": 7.474752902984619, "learning_rate": 3.5960940982676105e-06, "loss": 2.4256092071533204, "memory(GiB)": 77.56, "step": 102530, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 1.438567 }, { "epoch": 4.392913756908444, "grad_norm": 5.823642253875732, "learning_rate": 3.593588448281893e-06, "loss": 2.354109191894531, "memory(GiB)": 77.56, "step": 102535, "token_acc": 0.5107913669064749, "train_speed(iter/s)": 1.438576 }, { "epoch": 4.393127972237694, "grad_norm": 6.928681373596191, "learning_rate": 3.59108363899186e-06, "loss": 2.4611879348754884, "memory(GiB)": 77.56, "step": 102540, "token_acc": 0.5154320987654321, "train_speed(iter/s)": 1.438585 }, { "epoch": 4.393342187566942, "grad_norm": 6.965407371520996, "learning_rate": 3.5885796704428707e-06, "loss": 2.399798583984375, "memory(GiB)": 77.56, "step": 102545, "token_acc": 0.5177514792899408, "train_speed(iter/s)": 1.438598 }, { "epoch": 4.393556402896191, "grad_norm": 5.064464569091797, "learning_rate": 3.5860765426803056e-06, "loss": 2.3432443618774412, "memory(GiB)": 77.56, "step": 102550, "token_acc": 0.5016949152542373, "train_speed(iter/s)": 1.438605 }, { "epoch": 4.393770618225441, "grad_norm": 5.201739311218262, "learning_rate": 3.5835742557494943e-06, "loss": 2.1250043869018556, "memory(GiB)": 77.56, "step": 102555, "token_acc": 0.5448275862068965, "train_speed(iter/s)": 1.438611 }, { "epoch": 4.393984833554689, "grad_norm": 7.4527788162231445, "learning_rate": 3.58107280969579e-06, "loss": 2.435356330871582, "memory(GiB)": 77.56, "step": 102560, "token_acc": 0.46830985915492956, "train_speed(iter/s)": 1.4386 }, { "epoch": 4.394199048883938, "grad_norm": 7.7264227867126465, "learning_rate": 3.5785722045644898e-06, "loss": 2.2883148193359375, "memory(GiB)": 77.56, "step": 102565, "token_acc": 0.528169014084507, "train_speed(iter/s)": 1.438586 }, { "epoch": 4.3944132642131875, "grad_norm": 7.861260414123535, "learning_rate": 3.576072440400907e-06, "loss": 2.727778434753418, "memory(GiB)": 77.56, "step": 102570, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 1.438589 }, { "epoch": 4.394627479542436, "grad_norm": 5.563624382019043, "learning_rate": 3.5735735172503117e-06, "loss": 2.060395622253418, "memory(GiB)": 77.56, "step": 102575, "token_acc": 0.5494505494505495, "train_speed(iter/s)": 1.438593 }, { "epoch": 4.394841694871685, "grad_norm": 6.749576568603516, "learning_rate": 3.571075435157989e-06, "loss": 2.6023681640625, "memory(GiB)": 77.56, "step": 102580, "token_acc": 0.49834983498349833, "train_speed(iter/s)": 1.438602 }, { "epoch": 4.395055910200934, "grad_norm": 6.05499792098999, "learning_rate": 3.568578194169192e-06, "loss": 2.258266639709473, "memory(GiB)": 77.56, "step": 102585, "token_acc": 0.5401459854014599, "train_speed(iter/s)": 1.438594 }, { "epoch": 4.395270125530183, "grad_norm": 6.144804954528809, "learning_rate": 3.566081794329157e-06, "loss": 2.0408275604248045, "memory(GiB)": 77.56, "step": 102590, "token_acc": 0.4981684981684982, "train_speed(iter/s)": 1.438602 }, { "epoch": 4.395484340859432, "grad_norm": 6.501466274261475, "learning_rate": 3.563586235683103e-06, "loss": 2.487798309326172, "memory(GiB)": 77.56, "step": 102595, "token_acc": 0.4882943143812709, "train_speed(iter/s)": 1.438604 }, { "epoch": 4.395698556188681, "grad_norm": 5.503562927246094, "learning_rate": 3.5610915182762492e-06, "loss": 2.207347297668457, "memory(GiB)": 77.56, "step": 102600, "token_acc": 0.543918918918919, "train_speed(iter/s)": 1.438598 }, { "epoch": 4.39591277151793, "grad_norm": 6.674918174743652, "learning_rate": 3.5585976421537825e-06, "loss": 2.199715232849121, "memory(GiB)": 77.56, "step": 102605, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.438614 }, { "epoch": 4.396126986847179, "grad_norm": 6.238319396972656, "learning_rate": 3.556104607360888e-06, "loss": 2.393204689025879, "memory(GiB)": 77.56, "step": 102610, "token_acc": 0.5439330543933054, "train_speed(iter/s)": 1.438619 }, { "epoch": 4.396341202176428, "grad_norm": 7.066052436828613, "learning_rate": 3.553612413942725e-06, "loss": 2.345548629760742, "memory(GiB)": 77.56, "step": 102615, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 1.4386 }, { "epoch": 4.3965554175056765, "grad_norm": 7.078065872192383, "learning_rate": 3.551121061944446e-06, "loss": 2.510301208496094, "memory(GiB)": 77.56, "step": 102620, "token_acc": 0.5308219178082192, "train_speed(iter/s)": 1.438595 }, { "epoch": 4.396769632834926, "grad_norm": 6.518613338470459, "learning_rate": 3.5486305514111805e-06, "loss": 2.0175621032714846, "memory(GiB)": 77.56, "step": 102625, "token_acc": 0.508695652173913, "train_speed(iter/s)": 1.438608 }, { "epoch": 4.396983848164175, "grad_norm": 8.077980995178223, "learning_rate": 3.5461408823880437e-06, "loss": 2.7288206100463865, "memory(GiB)": 77.56, "step": 102630, "token_acc": 0.4749262536873156, "train_speed(iter/s)": 1.438634 }, { "epoch": 4.397198063493423, "grad_norm": 7.078575611114502, "learning_rate": 3.543652054920138e-06, "loss": 2.1935190200805663, "memory(GiB)": 77.56, "step": 102635, "token_acc": 0.5304659498207885, "train_speed(iter/s)": 1.438636 }, { "epoch": 4.397412278822673, "grad_norm": 6.157913684844971, "learning_rate": 3.5411640690525606e-06, "loss": 2.056357574462891, "memory(GiB)": 77.56, "step": 102640, "token_acc": 0.5543859649122806, "train_speed(iter/s)": 1.438657 }, { "epoch": 4.397626494151922, "grad_norm": 8.185633659362793, "learning_rate": 3.538676924830381e-06, "loss": 2.2879461288452148, "memory(GiB)": 77.56, "step": 102645, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 1.438674 }, { "epoch": 4.39784070948117, "grad_norm": 7.19227409362793, "learning_rate": 3.5361906222986517e-06, "loss": 2.1820980072021485, "memory(GiB)": 77.56, "step": 102650, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 1.438682 }, { "epoch": 4.3980549248104195, "grad_norm": 5.405209064483643, "learning_rate": 3.5337051615024207e-06, "loss": 2.4287492752075197, "memory(GiB)": 77.56, "step": 102655, "token_acc": 0.4986149584487535, "train_speed(iter/s)": 1.438705 }, { "epoch": 4.398269140139669, "grad_norm": 10.108190536499023, "learning_rate": 3.531220542486696e-06, "loss": 2.46765251159668, "memory(GiB)": 77.56, "step": 102660, "token_acc": 0.5056603773584906, "train_speed(iter/s)": 1.43871 }, { "epoch": 4.398483355468917, "grad_norm": 7.863072395324707, "learning_rate": 3.528736765296514e-06, "loss": 2.4931325912475586, "memory(GiB)": 77.56, "step": 102665, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 1.438728 }, { "epoch": 4.398697570798166, "grad_norm": 7.470780372619629, "learning_rate": 3.5262538299768553e-06, "loss": 2.3910215377807615, "memory(GiB)": 77.56, "step": 102670, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 1.438713 }, { "epoch": 4.398911786127416, "grad_norm": 7.713631629943848, "learning_rate": 3.5237717365727065e-06, "loss": 2.2994256973266602, "memory(GiB)": 77.56, "step": 102675, "token_acc": 0.553030303030303, "train_speed(iter/s)": 1.438708 }, { "epoch": 4.399126001456664, "grad_norm": 6.646881103515625, "learning_rate": 3.521290485129031e-06, "loss": 2.212492752075195, "memory(GiB)": 77.56, "step": 102680, "token_acc": 0.5166051660516605, "train_speed(iter/s)": 1.438698 }, { "epoch": 4.399340216785913, "grad_norm": 4.960170269012451, "learning_rate": 3.5188100756907815e-06, "loss": 2.3690673828125, "memory(GiB)": 77.56, "step": 102685, "token_acc": 0.5079787234042553, "train_speed(iter/s)": 1.438704 }, { "epoch": 4.3995544321151625, "grad_norm": 5.805353164672852, "learning_rate": 3.5163305083028785e-06, "loss": 2.434545707702637, "memory(GiB)": 77.56, "step": 102690, "token_acc": 0.526813880126183, "train_speed(iter/s)": 1.438705 }, { "epoch": 4.399768647444411, "grad_norm": 6.253900527954102, "learning_rate": 3.5138517830102635e-06, "loss": 2.1208045959472654, "memory(GiB)": 77.56, "step": 102695, "token_acc": 0.5684210526315789, "train_speed(iter/s)": 1.43871 }, { "epoch": 4.39998286277366, "grad_norm": 7.896162509918213, "learning_rate": 3.5113738998578227e-06, "loss": 2.377148246765137, "memory(GiB)": 77.56, "step": 102700, "token_acc": 0.496551724137931, "train_speed(iter/s)": 1.438703 }, { "epoch": 4.400197078102909, "grad_norm": 5.874502658843994, "learning_rate": 3.5088968588904593e-06, "loss": 2.0496097564697267, "memory(GiB)": 77.56, "step": 102705, "token_acc": 0.5114754098360655, "train_speed(iter/s)": 1.438715 }, { "epoch": 4.400411293432158, "grad_norm": 7.194603443145752, "learning_rate": 3.5064206601530423e-06, "loss": 2.3831321716308596, "memory(GiB)": 77.56, "step": 102710, "token_acc": 0.4963235294117647, "train_speed(iter/s)": 1.438732 }, { "epoch": 4.400625508761407, "grad_norm": 8.089038848876953, "learning_rate": 3.503945303690431e-06, "loss": 2.2116674423217773, "memory(GiB)": 77.56, "step": 102715, "token_acc": 0.5373134328358209, "train_speed(iter/s)": 1.438744 }, { "epoch": 4.400839724090656, "grad_norm": 5.585841655731201, "learning_rate": 3.501470789547462e-06, "loss": 2.2736770629882814, "memory(GiB)": 77.56, "step": 102720, "token_acc": 0.5110410094637224, "train_speed(iter/s)": 1.438748 }, { "epoch": 4.401053939419905, "grad_norm": 5.96068000793457, "learning_rate": 3.4989971177689707e-06, "loss": 2.405515670776367, "memory(GiB)": 77.56, "step": 102725, "token_acc": 0.5115511551155115, "train_speed(iter/s)": 1.43873 }, { "epoch": 4.401268154749154, "grad_norm": 7.212528228759766, "learning_rate": 3.496524288399772e-06, "loss": 2.602706718444824, "memory(GiB)": 77.56, "step": 102730, "token_acc": 0.4377358490566038, "train_speed(iter/s)": 1.438727 }, { "epoch": 4.401482370078403, "grad_norm": 6.397942066192627, "learning_rate": 3.4940523014846572e-06, "loss": 2.353199577331543, "memory(GiB)": 77.56, "step": 102735, "token_acc": 0.5437262357414449, "train_speed(iter/s)": 1.438732 }, { "epoch": 4.4016965854076515, "grad_norm": 6.335452079772949, "learning_rate": 3.491581157068413e-06, "loss": 2.5333284378051757, "memory(GiB)": 77.56, "step": 102740, "token_acc": 0.45307443365695793, "train_speed(iter/s)": 1.438732 }, { "epoch": 4.401910800736901, "grad_norm": 5.486098766326904, "learning_rate": 3.4891108551958042e-06, "loss": 2.1428770065307616, "memory(GiB)": 77.56, "step": 102745, "token_acc": 0.5115384615384615, "train_speed(iter/s)": 1.438739 }, { "epoch": 4.40212501606615, "grad_norm": 8.776297569274902, "learning_rate": 3.4866413959115774e-06, "loss": 2.274058151245117, "memory(GiB)": 77.56, "step": 102750, "token_acc": 0.5376344086021505, "train_speed(iter/s)": 1.438712 }, { "epoch": 4.402339231395398, "grad_norm": 7.713545322418213, "learning_rate": 3.4841727792604805e-06, "loss": 2.3287942886352537, "memory(GiB)": 77.56, "step": 102755, "token_acc": 0.49851632047477745, "train_speed(iter/s)": 1.438731 }, { "epoch": 4.402553446724648, "grad_norm": 8.110536575317383, "learning_rate": 3.481705005287228e-06, "loss": 2.162745475769043, "memory(GiB)": 77.56, "step": 102760, "token_acc": 0.548951048951049, "train_speed(iter/s)": 1.438728 }, { "epoch": 4.402767662053897, "grad_norm": 6.409671306610107, "learning_rate": 3.479238074036528e-06, "loss": 2.3971099853515625, "memory(GiB)": 77.56, "step": 102765, "token_acc": 0.5057471264367817, "train_speed(iter/s)": 1.43873 }, { "epoch": 4.402981877383145, "grad_norm": 8.387430191040039, "learning_rate": 3.476771985553068e-06, "loss": 2.7748668670654295, "memory(GiB)": 77.56, "step": 102770, "token_acc": 0.4749262536873156, "train_speed(iter/s)": 1.43875 }, { "epoch": 4.4031960927123945, "grad_norm": 7.343550205230713, "learning_rate": 3.474306739881522e-06, "loss": 2.3631175994873046, "memory(GiB)": 77.56, "step": 102775, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 1.438751 }, { "epoch": 4.403410308041644, "grad_norm": 6.574812889099121, "learning_rate": 3.47184233706655e-06, "loss": 2.47608642578125, "memory(GiB)": 77.56, "step": 102780, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 1.438751 }, { "epoch": 4.403624523370892, "grad_norm": 4.98598051071167, "learning_rate": 3.469378777152815e-06, "loss": 2.6040485382080076, "memory(GiB)": 77.56, "step": 102785, "token_acc": 0.46111111111111114, "train_speed(iter/s)": 1.438734 }, { "epoch": 4.403838738700141, "grad_norm": 7.161421298980713, "learning_rate": 3.4669160601849272e-06, "loss": 2.1930047988891603, "memory(GiB)": 77.56, "step": 102790, "token_acc": 0.5504587155963303, "train_speed(iter/s)": 1.43876 }, { "epoch": 4.404052954029391, "grad_norm": 7.1895623207092285, "learning_rate": 3.4644541862075054e-06, "loss": 2.4721134185791014, "memory(GiB)": 77.56, "step": 102795, "token_acc": 0.47750865051903113, "train_speed(iter/s)": 1.438757 }, { "epoch": 4.404267169358639, "grad_norm": 7.5333781242370605, "learning_rate": 3.461993155265153e-06, "loss": 2.3281063079833983, "memory(GiB)": 77.56, "step": 102800, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 1.438762 }, { "epoch": 4.404481384687888, "grad_norm": 8.899251937866211, "learning_rate": 3.4595329674024456e-06, "loss": 2.4926557540893555, "memory(GiB)": 77.56, "step": 102805, "token_acc": 0.44516129032258067, "train_speed(iter/s)": 1.438755 }, { "epoch": 4.404695600017138, "grad_norm": 6.462435245513916, "learning_rate": 3.4570736226639588e-06, "loss": 2.4888607025146485, "memory(GiB)": 77.56, "step": 102810, "token_acc": 0.47474747474747475, "train_speed(iter/s)": 1.438761 }, { "epoch": 4.404909815346386, "grad_norm": 6.181706428527832, "learning_rate": 3.454615121094251e-06, "loss": 2.405670166015625, "memory(GiB)": 77.56, "step": 102815, "token_acc": 0.49645390070921985, "train_speed(iter/s)": 1.438755 }, { "epoch": 4.405124030675635, "grad_norm": 6.386962890625, "learning_rate": 3.4521574627378485e-06, "loss": 2.4266725540161134, "memory(GiB)": 77.56, "step": 102820, "token_acc": 0.5103448275862069, "train_speed(iter/s)": 1.438766 }, { "epoch": 4.405338246004884, "grad_norm": 7.187178134918213, "learning_rate": 3.4497006476392813e-06, "loss": 2.229561996459961, "memory(GiB)": 77.56, "step": 102825, "token_acc": 0.4965986394557823, "train_speed(iter/s)": 1.438753 }, { "epoch": 4.405552461334133, "grad_norm": 4.833610534667969, "learning_rate": 3.4472446758430533e-06, "loss": 2.310893249511719, "memory(GiB)": 77.56, "step": 102830, "token_acc": 0.5296052631578947, "train_speed(iter/s)": 1.43875 }, { "epoch": 4.405766676663382, "grad_norm": 5.882937431335449, "learning_rate": 3.4447895473936453e-06, "loss": 2.151133346557617, "memory(GiB)": 77.56, "step": 102835, "token_acc": 0.5286195286195287, "train_speed(iter/s)": 1.438752 }, { "epoch": 4.405980891992631, "grad_norm": 7.544086933135986, "learning_rate": 3.442335262335561e-06, "loss": 2.4032720565795898, "memory(GiB)": 77.56, "step": 102840, "token_acc": 0.5310344827586206, "train_speed(iter/s)": 1.438748 }, { "epoch": 4.40619510732188, "grad_norm": 5.912472248077393, "learning_rate": 3.439881820713248e-06, "loss": 2.4105043411254883, "memory(GiB)": 77.56, "step": 102845, "token_acc": 0.4718498659517426, "train_speed(iter/s)": 1.438752 }, { "epoch": 4.406409322651129, "grad_norm": 6.072861671447754, "learning_rate": 3.4374292225711423e-06, "loss": 2.415452575683594, "memory(GiB)": 77.56, "step": 102850, "token_acc": 0.5098591549295775, "train_speed(iter/s)": 1.438743 }, { "epoch": 4.406623537980378, "grad_norm": 6.437143325805664, "learning_rate": 3.434977467953693e-06, "loss": 2.346902275085449, "memory(GiB)": 77.56, "step": 102855, "token_acc": 0.5, "train_speed(iter/s)": 1.438762 }, { "epoch": 4.406837753309627, "grad_norm": 8.530142784118652, "learning_rate": 3.4325265569053134e-06, "loss": 2.1180702209472657, "memory(GiB)": 77.56, "step": 102860, "token_acc": 0.5441696113074205, "train_speed(iter/s)": 1.438751 }, { "epoch": 4.407051968638876, "grad_norm": 6.524062156677246, "learning_rate": 3.430076489470385e-06, "loss": 2.5981245040893555, "memory(GiB)": 77.56, "step": 102865, "token_acc": 0.4542857142857143, "train_speed(iter/s)": 1.438745 }, { "epoch": 4.407266183968125, "grad_norm": 5.981161594390869, "learning_rate": 3.4276272656933174e-06, "loss": 2.2366401672363283, "memory(GiB)": 77.56, "step": 102870, "token_acc": 0.48863636363636365, "train_speed(iter/s)": 1.438755 }, { "epoch": 4.4074803992973735, "grad_norm": 5.634190559387207, "learning_rate": 3.425178885618474e-06, "loss": 2.318357467651367, "memory(GiB)": 77.56, "step": 102875, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 1.438759 }, { "epoch": 4.407694614626623, "grad_norm": 6.3136162757873535, "learning_rate": 3.422731349290198e-06, "loss": 2.2633522033691404, "memory(GiB)": 77.56, "step": 102880, "token_acc": 0.5413533834586466, "train_speed(iter/s)": 1.438763 }, { "epoch": 4.407908829955872, "grad_norm": 6.478562355041504, "learning_rate": 3.4202846567528424e-06, "loss": 2.2929569244384767, "memory(GiB)": 77.56, "step": 102885, "token_acc": 0.5254237288135594, "train_speed(iter/s)": 1.438769 }, { "epoch": 4.40812304528512, "grad_norm": 4.679701805114746, "learning_rate": 3.4178388080507274e-06, "loss": 2.3439220428466796, "memory(GiB)": 77.56, "step": 102890, "token_acc": 0.4779874213836478, "train_speed(iter/s)": 1.43878 }, { "epoch": 4.40833726061437, "grad_norm": 5.395583629608154, "learning_rate": 3.4153938032281454e-06, "loss": 2.462462615966797, "memory(GiB)": 77.56, "step": 102895, "token_acc": 0.48284960422163586, "train_speed(iter/s)": 1.43879 }, { "epoch": 4.408551475943619, "grad_norm": 7.386928081512451, "learning_rate": 3.412949642329416e-06, "loss": 2.326919364929199, "memory(GiB)": 77.56, "step": 102900, "token_acc": 0.48736462093862815, "train_speed(iter/s)": 1.438804 }, { "epoch": 4.408765691272867, "grad_norm": 10.29039478302002, "learning_rate": 3.4105063253988045e-06, "loss": 2.6700963973999023, "memory(GiB)": 77.56, "step": 102905, "token_acc": 0.4552238805970149, "train_speed(iter/s)": 1.438803 }, { "epoch": 4.4089799066021165, "grad_norm": 6.046471118927002, "learning_rate": 3.408063852480581e-06, "loss": 2.424648094177246, "memory(GiB)": 77.56, "step": 102910, "token_acc": 0.4934210526315789, "train_speed(iter/s)": 1.438803 }, { "epoch": 4.409194121931366, "grad_norm": 6.12212610244751, "learning_rate": 3.4056222236189815e-06, "loss": 2.226657485961914, "memory(GiB)": 77.56, "step": 102915, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 1.438799 }, { "epoch": 4.409408337260614, "grad_norm": 6.161651134490967, "learning_rate": 3.4031814388582384e-06, "loss": 2.162016677856445, "memory(GiB)": 77.56, "step": 102920, "token_acc": 0.5346153846153846, "train_speed(iter/s)": 1.438802 }, { "epoch": 4.409622552589863, "grad_norm": 8.177957534790039, "learning_rate": 3.4007414982425766e-06, "loss": 2.0994197845458986, "memory(GiB)": 77.56, "step": 102925, "token_acc": 0.5490909090909091, "train_speed(iter/s)": 1.438809 }, { "epoch": 4.409836767919113, "grad_norm": 6.875726222991943, "learning_rate": 3.3983024018162004e-06, "loss": 2.3808807373046874, "memory(GiB)": 77.56, "step": 102930, "token_acc": 0.5229007633587787, "train_speed(iter/s)": 1.438812 }, { "epoch": 4.410050983248361, "grad_norm": 5.5511627197265625, "learning_rate": 3.395864149623296e-06, "loss": 1.9341264724731446, "memory(GiB)": 77.56, "step": 102935, "token_acc": 0.5606557377049181, "train_speed(iter/s)": 1.438827 }, { "epoch": 4.41026519857761, "grad_norm": 5.994646072387695, "learning_rate": 3.393426741708028e-06, "loss": 1.996798324584961, "memory(GiB)": 77.56, "step": 102940, "token_acc": 0.5318471337579618, "train_speed(iter/s)": 1.438837 }, { "epoch": 4.4104794139068595, "grad_norm": 5.379752159118652, "learning_rate": 3.390990178114556e-06, "loss": 2.398251533508301, "memory(GiB)": 77.56, "step": 102945, "token_acc": 0.5015873015873016, "train_speed(iter/s)": 1.438839 }, { "epoch": 4.410693629236108, "grad_norm": 6.1705546379089355, "learning_rate": 3.388554458887011e-06, "loss": 2.2360561370849608, "memory(GiB)": 77.56, "step": 102950, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 1.438836 }, { "epoch": 4.410907844565357, "grad_norm": 8.667627334594727, "learning_rate": 3.386119584069536e-06, "loss": 2.3674911499023437, "memory(GiB)": 77.56, "step": 102955, "token_acc": 0.502127659574468, "train_speed(iter/s)": 1.438829 }, { "epoch": 4.411122059894606, "grad_norm": 6.950751304626465, "learning_rate": 3.383685553706234e-06, "loss": 2.710087776184082, "memory(GiB)": 77.56, "step": 102960, "token_acc": 0.4533333333333333, "train_speed(iter/s)": 1.438818 }, { "epoch": 4.411336275223855, "grad_norm": 5.777164936065674, "learning_rate": 3.381252367841192e-06, "loss": 2.175288772583008, "memory(GiB)": 77.56, "step": 102965, "token_acc": 0.5572755417956656, "train_speed(iter/s)": 1.438833 }, { "epoch": 4.411550490553104, "grad_norm": 6.37148904800415, "learning_rate": 3.3788200265185033e-06, "loss": 2.2468969345092775, "memory(GiB)": 77.56, "step": 102970, "token_acc": 0.5583941605839416, "train_speed(iter/s)": 1.438831 }, { "epoch": 4.411764705882353, "grad_norm": 7.066985130310059, "learning_rate": 3.376388529782215e-06, "loss": 2.5912525177001955, "memory(GiB)": 77.56, "step": 102975, "token_acc": 0.46598639455782315, "train_speed(iter/s)": 1.438843 }, { "epoch": 4.411978921211602, "grad_norm": 6.8794331550598145, "learning_rate": 3.3739578776763814e-06, "loss": 2.218465042114258, "memory(GiB)": 77.56, "step": 102980, "token_acc": 0.519298245614035, "train_speed(iter/s)": 1.438834 }, { "epoch": 4.412193136540851, "grad_norm": 7.623315811157227, "learning_rate": 3.3715280702450502e-06, "loss": 2.2979253768920898, "memory(GiB)": 77.56, "step": 102985, "token_acc": 0.5209003215434084, "train_speed(iter/s)": 1.438825 }, { "epoch": 4.4124073518701, "grad_norm": 6.517643928527832, "learning_rate": 3.3690991075322254e-06, "loss": 2.3695747375488283, "memory(GiB)": 77.56, "step": 102990, "token_acc": 0.5095057034220533, "train_speed(iter/s)": 1.438826 }, { "epoch": 4.4126215671993485, "grad_norm": 7.054264545440674, "learning_rate": 3.3666709895819104e-06, "loss": 2.424116325378418, "memory(GiB)": 77.56, "step": 102995, "token_acc": 0.4950166112956811, "train_speed(iter/s)": 1.438823 }, { "epoch": 4.412835782528598, "grad_norm": 6.089029788970947, "learning_rate": 3.364243716438098e-06, "loss": 2.2391313552856444, "memory(GiB)": 77.56, "step": 103000, "token_acc": 0.5394321766561514, "train_speed(iter/s)": 1.438825 }, { "epoch": 4.412835782528598, "eval_loss": 1.950676679611206, "eval_runtime": 14.8525, "eval_samples_per_second": 6.733, "eval_steps_per_second": 6.733, "eval_token_acc": 0.509641873278237, "step": 103000 } ], "logging_steps": 5, "max_steps": 116705, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.717583870903251e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }