{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.809877820156442, "eval_steps": -11601, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025858167948800825, "grad_norm": 242.40017700195312, "learning_rate": 8.605851979345955e-09, "loss": 6.130181312561035, "memory(GiB)": 34.54, "step": 1, "token_acc": 0.22661183727419593, "train_speed(iter/s)": 0.013364 }, { "epoch": 0.0012929083974400413, "grad_norm": 655.3204345703125, "learning_rate": 4.3029259896729774e-08, "loss": 6.322210311889648, "memory(GiB)": 56.04, "step": 5, "token_acc": 0.1488586387434555, "train_speed(iter/s)": 0.023012 }, { "epoch": 0.0025858167948800827, "grad_norm": 484.6578369140625, "learning_rate": 8.605851979345955e-08, "loss": 6.274671173095703, "memory(GiB)": 56.04, "step": 10, "token_acc": 0.1413762226201956, "train_speed(iter/s)": 0.025227 }, { "epoch": 0.003878725192320124, "grad_norm": 247.675048828125, "learning_rate": 1.2908777969018933e-07, "loss": 5.9146270751953125, "memory(GiB)": 56.04, "step": 15, "token_acc": 0.2279082774049217, "train_speed(iter/s)": 0.026453 }, { "epoch": 0.005171633589760165, "grad_norm": 334.42694091796875, "learning_rate": 1.721170395869191e-07, "loss": 6.069369506835938, "memory(GiB)": 56.04, "step": 20, "token_acc": 0.2284955683104916, "train_speed(iter/s)": 0.02697 }, { "epoch": 0.006464541987200207, "grad_norm": 492.56463623046875, "learning_rate": 2.151462994836489e-07, "loss": 6.095654296875, "memory(GiB)": 56.04, "step": 25, "token_acc": 0.24055135615829257, "train_speed(iter/s)": 0.027272 }, { "epoch": 0.007757450384640248, "grad_norm": 182.06201171875, "learning_rate": 2.5817555938037866e-07, "loss": 6.0770011901855465, "memory(GiB)": 76.04, "step": 30, "token_acc": 0.1521892967713401, "train_speed(iter/s)": 0.027249 }, { "epoch": 0.009050358782080289, "grad_norm": 519.2252197265625, "learning_rate": 3.0120481927710845e-07, "loss": 6.1493080139160154, "memory(GiB)": 76.04, "step": 35, "token_acc": 0.15634508680866296, "train_speed(iter/s)": 0.027269 }, { "epoch": 0.01034326717952033, "grad_norm": 714.878173828125, "learning_rate": 3.442340791738382e-07, "loss": 6.561576843261719, "memory(GiB)": 76.04, "step": 40, "token_acc": 0.16204781045060293, "train_speed(iter/s)": 0.027175 }, { "epoch": 0.011636175576960372, "grad_norm": 666.3334350585938, "learning_rate": 3.8726333907056804e-07, "loss": 5.985930252075195, "memory(GiB)": 76.04, "step": 45, "token_acc": 0.1617008883063929, "train_speed(iter/s)": 0.027249 }, { "epoch": 0.012929083974400414, "grad_norm": 841.8603515625, "learning_rate": 4.302925989672978e-07, "loss": 6.3445274353027346, "memory(GiB)": 76.04, "step": 50, "token_acc": 0.2033275151335091, "train_speed(iter/s)": 0.027146 }, { "epoch": 0.014221992371840455, "grad_norm": 956.3386840820312, "learning_rate": 4.733218588640276e-07, "loss": 6.072727966308594, "memory(GiB)": 76.04, "step": 55, "token_acc": 0.18730415801147765, "train_speed(iter/s)": 0.027337 }, { "epoch": 0.015514900769280497, "grad_norm": 145.8020782470703, "learning_rate": 5.163511187607573e-07, "loss": 5.959784317016601, "memory(GiB)": 76.04, "step": 60, "token_acc": 0.21014295439074201, "train_speed(iter/s)": 0.027487 }, { "epoch": 0.016807809166720537, "grad_norm": 1471.006591796875, "learning_rate": 5.593803786574872e-07, "loss": 5.995822906494141, "memory(GiB)": 76.04, "step": 65, "token_acc": 0.16151609777107784, "train_speed(iter/s)": 0.027505 }, { "epoch": 0.018100717564160578, "grad_norm": 127.47267150878906, "learning_rate": 6.024096385542169e-07, "loss": 5.947823333740234, "memory(GiB)": 76.04, "step": 70, "token_acc": 0.17944442339030583, "train_speed(iter/s)": 0.027514 }, { "epoch": 0.01939362596160062, "grad_norm": 103.65353393554688, "learning_rate": 6.454388984509467e-07, "loss": 6.049673843383789, "memory(GiB)": 76.04, "step": 75, "token_acc": 0.1430964467005076, "train_speed(iter/s)": 0.027525 }, { "epoch": 0.02068653435904066, "grad_norm": 186.977294921875, "learning_rate": 6.884681583476764e-07, "loss": 5.822255706787109, "memory(GiB)": 76.04, "step": 80, "token_acc": 0.20191989407480967, "train_speed(iter/s)": 0.027535 }, { "epoch": 0.021979442756480703, "grad_norm": 487.37152099609375, "learning_rate": 7.314974182444062e-07, "loss": 5.861573791503906, "memory(GiB)": 76.04, "step": 85, "token_acc": 0.25108269048585735, "train_speed(iter/s)": 0.027566 }, { "epoch": 0.023272351153920744, "grad_norm": 536.355712890625, "learning_rate": 7.745266781411361e-07, "loss": 5.783546447753906, "memory(GiB)": 76.04, "step": 90, "token_acc": 0.1870295076687734, "train_speed(iter/s)": 0.027608 }, { "epoch": 0.024565259551360786, "grad_norm": 285.6499938964844, "learning_rate": 8.175559380378658e-07, "loss": 5.746212768554687, "memory(GiB)": 76.04, "step": 95, "token_acc": 0.18162528216704288, "train_speed(iter/s)": 0.02758 }, { "epoch": 0.025858167948800827, "grad_norm": 1952.98583984375, "learning_rate": 8.605851979345956e-07, "loss": 5.601922607421875, "memory(GiB)": 76.04, "step": 100, "token_acc": 0.2339803356501102, "train_speed(iter/s)": 0.027603 }, { "epoch": 0.02715107634624087, "grad_norm": 93.57474517822266, "learning_rate": 9.036144578313254e-07, "loss": 5.615843200683594, "memory(GiB)": 76.04, "step": 105, "token_acc": 0.2615709535364429, "train_speed(iter/s)": 0.027644 }, { "epoch": 0.02844398474368091, "grad_norm": 229.4741973876953, "learning_rate": 9.466437177280551e-07, "loss": 5.597761917114258, "memory(GiB)": 76.04, "step": 110, "token_acc": 0.15636001564666974, "train_speed(iter/s)": 0.027645 }, { "epoch": 0.029736893141120952, "grad_norm": 36.493988037109375, "learning_rate": 9.896729776247848e-07, "loss": 5.327813720703125, "memory(GiB)": 76.04, "step": 115, "token_acc": 0.1776396590866798, "train_speed(iter/s)": 0.027731 }, { "epoch": 0.031029801538560994, "grad_norm": 35.07899475097656, "learning_rate": 1.0327022375215146e-06, "loss": 5.352128982543945, "memory(GiB)": 76.04, "step": 120, "token_acc": 0.24768539325842698, "train_speed(iter/s)": 0.027714 }, { "epoch": 0.03232270993600103, "grad_norm": 144.920166015625, "learning_rate": 1.0757314974182445e-06, "loss": 5.0349891662597654, "memory(GiB)": 76.04, "step": 125, "token_acc": 0.2503892788768347, "train_speed(iter/s)": 0.027712 }, { "epoch": 0.03361561833344107, "grad_norm": 38.186100006103516, "learning_rate": 1.1187607573149743e-06, "loss": 5.087076950073242, "memory(GiB)": 76.04, "step": 130, "token_acc": 0.24298036336942558, "train_speed(iter/s)": 0.027764 }, { "epoch": 0.034908526730881115, "grad_norm": 130.58258056640625, "learning_rate": 1.161790017211704e-06, "loss": 4.898267364501953, "memory(GiB)": 76.04, "step": 135, "token_acc": 0.27113337507827173, "train_speed(iter/s)": 0.027766 }, { "epoch": 0.036201435128321156, "grad_norm": 26.92106056213379, "learning_rate": 1.2048192771084338e-06, "loss": 4.7400367736816404, "memory(GiB)": 76.04, "step": 140, "token_acc": 0.20421513969901067, "train_speed(iter/s)": 0.027799 }, { "epoch": 0.0374943435257612, "grad_norm": 208.29981994628906, "learning_rate": 1.2478485370051637e-06, "loss": 4.690925598144531, "memory(GiB)": 76.04, "step": 145, "token_acc": 0.22307749241358893, "train_speed(iter/s)": 0.027793 }, { "epoch": 0.03878725192320124, "grad_norm": 364.9195251464844, "learning_rate": 1.2908777969018935e-06, "loss": 4.929594421386719, "memory(GiB)": 76.04, "step": 150, "token_acc": 0.23747012178927798, "train_speed(iter/s)": 0.027767 }, { "epoch": 0.04008016032064128, "grad_norm": 361.99853515625, "learning_rate": 1.3339070567986231e-06, "loss": 4.708561706542969, "memory(GiB)": 76.04, "step": 155, "token_acc": 0.2766303463977883, "train_speed(iter/s)": 0.027779 }, { "epoch": 0.04137306871808132, "grad_norm": 714.6271362304688, "learning_rate": 1.3769363166953528e-06, "loss": 4.614714050292969, "memory(GiB)": 76.04, "step": 160, "token_acc": 0.2798645816540384, "train_speed(iter/s)": 0.027778 }, { "epoch": 0.042665977115521364, "grad_norm": 38.3203010559082, "learning_rate": 1.4199655765920828e-06, "loss": 4.407340621948242, "memory(GiB)": 76.04, "step": 165, "token_acc": 0.26736621196222454, "train_speed(iter/s)": 0.027815 }, { "epoch": 0.043958885512961406, "grad_norm": 142.34910583496094, "learning_rate": 1.4629948364888125e-06, "loss": 4.265171813964844, "memory(GiB)": 76.04, "step": 170, "token_acc": 0.2892184533278596, "train_speed(iter/s)": 0.027794 }, { "epoch": 0.04525179391040145, "grad_norm": 16.595224380493164, "learning_rate": 1.5060240963855425e-06, "loss": 4.142366409301758, "memory(GiB)": 76.04, "step": 175, "token_acc": 0.28310791772330235, "train_speed(iter/s)": 0.027783 }, { "epoch": 0.04654470230784149, "grad_norm": 24.273094177246094, "learning_rate": 1.5490533562822722e-06, "loss": 4.011473083496094, "memory(GiB)": 76.04, "step": 180, "token_acc": 0.31321029626032054, "train_speed(iter/s)": 0.027749 }, { "epoch": 0.04783761070528153, "grad_norm": 12.379109382629395, "learning_rate": 1.5920826161790018e-06, "loss": 3.945102310180664, "memory(GiB)": 76.04, "step": 185, "token_acc": 0.3472663749960656, "train_speed(iter/s)": 0.027756 }, { "epoch": 0.04913051910272157, "grad_norm": 11.814841270446777, "learning_rate": 1.6351118760757316e-06, "loss": 3.744676971435547, "memory(GiB)": 76.04, "step": 190, "token_acc": 0.3549518681677272, "train_speed(iter/s)": 0.027799 }, { "epoch": 0.05042342750016161, "grad_norm": 13.708587646484375, "learning_rate": 1.6781411359724615e-06, "loss": 3.6159019470214844, "memory(GiB)": 76.04, "step": 195, "token_acc": 0.34429772852314705, "train_speed(iter/s)": 0.027816 }, { "epoch": 0.051716335897601655, "grad_norm": 10.363809585571289, "learning_rate": 1.7211703958691911e-06, "loss": 3.508245086669922, "memory(GiB)": 76.04, "step": 200, "token_acc": 0.34663152792923785, "train_speed(iter/s)": 0.027818 }, { "epoch": 0.053009244295041696, "grad_norm": 10.083678245544434, "learning_rate": 1.764199655765921e-06, "loss": 3.42333869934082, "memory(GiB)": 76.04, "step": 205, "token_acc": 0.4063367473915957, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.05430215269248174, "grad_norm": 8.798538208007812, "learning_rate": 1.8072289156626508e-06, "loss": 3.2132949829101562, "memory(GiB)": 76.04, "step": 210, "token_acc": 0.40024092757114893, "train_speed(iter/s)": 0.027761 }, { "epoch": 0.05559506108992178, "grad_norm": 9.79233455657959, "learning_rate": 1.8502581755593804e-06, "loss": 3.2455322265625, "memory(GiB)": 76.04, "step": 215, "token_acc": 0.37759151099023586, "train_speed(iter/s)": 0.027773 }, { "epoch": 0.05688796948736182, "grad_norm": 7.508085250854492, "learning_rate": 1.8932874354561103e-06, "loss": 3.0217824935913087, "memory(GiB)": 76.04, "step": 220, "token_acc": 0.41623787623514413, "train_speed(iter/s)": 0.027752 }, { "epoch": 0.05818087788480186, "grad_norm": 11.832063674926758, "learning_rate": 1.93631669535284e-06, "loss": 3.122202682495117, "memory(GiB)": 76.04, "step": 225, "token_acc": 0.4242820412254325, "train_speed(iter/s)": 0.027731 }, { "epoch": 0.059473786282241904, "grad_norm": 6.528841018676758, "learning_rate": 1.9793459552495696e-06, "loss": 2.9143745422363283, "memory(GiB)": 76.04, "step": 230, "token_acc": 0.4459988808058198, "train_speed(iter/s)": 0.027736 }, { "epoch": 0.060766694679681946, "grad_norm": 13.579035758972168, "learning_rate": 2.0223752151463e-06, "loss": 2.980012130737305, "memory(GiB)": 76.04, "step": 235, "token_acc": 0.4274643521388717, "train_speed(iter/s)": 0.027736 }, { "epoch": 0.06205960307712199, "grad_norm": 10.801968574523926, "learning_rate": 2.0654044750430293e-06, "loss": 2.9146419525146485, "memory(GiB)": 76.04, "step": 240, "token_acc": 0.4537141861882783, "train_speed(iter/s)": 0.027738 }, { "epoch": 0.06335251147456203, "grad_norm": 6.302434921264648, "learning_rate": 2.1084337349397595e-06, "loss": 2.8420055389404295, "memory(GiB)": 76.04, "step": 245, "token_acc": 0.4608826083524118, "train_speed(iter/s)": 0.027761 }, { "epoch": 0.06464541987200206, "grad_norm": 6.119657516479492, "learning_rate": 2.151462994836489e-06, "loss": 2.787134552001953, "memory(GiB)": 76.04, "step": 250, "token_acc": 0.44650911754500944, "train_speed(iter/s)": 0.027728 }, { "epoch": 0.06593832826944211, "grad_norm": 6.240930080413818, "learning_rate": 2.194492254733219e-06, "loss": 2.7430130004882813, "memory(GiB)": 76.04, "step": 255, "token_acc": 0.4696727853152434, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.06723123666688215, "grad_norm": 6.333411693572998, "learning_rate": 2.2375215146299486e-06, "loss": 2.6061046600341795, "memory(GiB)": 76.04, "step": 260, "token_acc": 0.48562300319488816, "train_speed(iter/s)": 0.027762 }, { "epoch": 0.0685241450643222, "grad_norm": 7.289592742919922, "learning_rate": 2.2805507745266785e-06, "loss": 2.5609256744384767, "memory(GiB)": 76.04, "step": 265, "token_acc": 0.4682853243301487, "train_speed(iter/s)": 0.027763 }, { "epoch": 0.06981705346176223, "grad_norm": 5.113116264343262, "learning_rate": 2.323580034423408e-06, "loss": 2.5773744583129883, "memory(GiB)": 76.04, "step": 270, "token_acc": 0.4764347547290798, "train_speed(iter/s)": 0.02777 }, { "epoch": 0.07110996185920228, "grad_norm": 4.905300140380859, "learning_rate": 2.3666092943201378e-06, "loss": 2.493597221374512, "memory(GiB)": 76.04, "step": 275, "token_acc": 0.5032743942370661, "train_speed(iter/s)": 0.027774 }, { "epoch": 0.07240287025664231, "grad_norm": 5.233267307281494, "learning_rate": 2.4096385542168676e-06, "loss": 2.4493991851806642, "memory(GiB)": 76.04, "step": 280, "token_acc": 0.5405880959631848, "train_speed(iter/s)": 0.027795 }, { "epoch": 0.07369577865408236, "grad_norm": 8.618249893188477, "learning_rate": 2.4526678141135975e-06, "loss": 2.423325538635254, "memory(GiB)": 76.04, "step": 285, "token_acc": 0.530728862973761, "train_speed(iter/s)": 0.027795 }, { "epoch": 0.0749886870515224, "grad_norm": 5.426891803741455, "learning_rate": 2.4956970740103273e-06, "loss": 2.376586151123047, "memory(GiB)": 76.04, "step": 290, "token_acc": 0.5145979170223663, "train_speed(iter/s)": 0.027806 }, { "epoch": 0.07628159544896244, "grad_norm": 7.925861358642578, "learning_rate": 2.538726333907057e-06, "loss": 2.348302459716797, "memory(GiB)": 76.04, "step": 295, "token_acc": 0.5290513911109377, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.07757450384640248, "grad_norm": 6.098197937011719, "learning_rate": 2.581755593803787e-06, "loss": 2.325778579711914, "memory(GiB)": 76.04, "step": 300, "token_acc": 0.526869597895528, "train_speed(iter/s)": 0.027824 }, { "epoch": 0.07886741224384253, "grad_norm": 4.564492225646973, "learning_rate": 2.6247848537005164e-06, "loss": 2.283577728271484, "memory(GiB)": 76.04, "step": 305, "token_acc": 0.5214347367411528, "train_speed(iter/s)": 0.02782 }, { "epoch": 0.08016032064128256, "grad_norm": 5.382349491119385, "learning_rate": 2.6678141135972463e-06, "loss": 2.260573959350586, "memory(GiB)": 76.04, "step": 310, "token_acc": 0.5193088205746433, "train_speed(iter/s)": 0.027827 }, { "epoch": 0.08145322903872261, "grad_norm": 4.09184455871582, "learning_rate": 2.710843373493976e-06, "loss": 2.202308464050293, "memory(GiB)": 76.04, "step": 315, "token_acc": 0.5255345169316409, "train_speed(iter/s)": 0.027838 }, { "epoch": 0.08274613743616265, "grad_norm": 6.044122695922852, "learning_rate": 2.7538726333907055e-06, "loss": 2.0868404388427733, "memory(GiB)": 76.04, "step": 320, "token_acc": 0.5558229225038731, "train_speed(iter/s)": 0.027868 }, { "epoch": 0.0840390458336027, "grad_norm": 6.152168273925781, "learning_rate": 2.796901893287436e-06, "loss": 2.019194412231445, "memory(GiB)": 76.04, "step": 325, "token_acc": 0.5501780112094188, "train_speed(iter/s)": 0.027887 }, { "epoch": 0.08533195423104273, "grad_norm": 7.0422773361206055, "learning_rate": 2.8399311531841657e-06, "loss": 2.0607986450195312, "memory(GiB)": 76.04, "step": 330, "token_acc": 0.5108016425638279, "train_speed(iter/s)": 0.027881 }, { "epoch": 0.08662486262848278, "grad_norm": 5.497431755065918, "learning_rate": 2.882960413080895e-06, "loss": 2.0316564559936525, "memory(GiB)": 76.04, "step": 335, "token_acc": 0.5838930163447251, "train_speed(iter/s)": 0.027889 }, { "epoch": 0.08791777102592281, "grad_norm": 5.31265115737915, "learning_rate": 2.925989672977625e-06, "loss": 2.0162227630615233, "memory(GiB)": 76.04, "step": 340, "token_acc": 0.5688795253246195, "train_speed(iter/s)": 0.027898 }, { "epoch": 0.08921067942336286, "grad_norm": 3.3592262268066406, "learning_rate": 2.9690189328743548e-06, "loss": 1.976116943359375, "memory(GiB)": 76.04, "step": 345, "token_acc": 0.5811347794931926, "train_speed(iter/s)": 0.027902 }, { "epoch": 0.0905035878208029, "grad_norm": 4.368274211883545, "learning_rate": 3.012048192771085e-06, "loss": 1.9872814178466798, "memory(GiB)": 76.04, "step": 350, "token_acc": 0.5612575668814684, "train_speed(iter/s)": 0.027903 }, { "epoch": 0.09179649621824294, "grad_norm": 4.863376140594482, "learning_rate": 3.0550774526678145e-06, "loss": 1.9638809204101562, "memory(GiB)": 76.04, "step": 355, "token_acc": 0.611854751336805, "train_speed(iter/s)": 0.027901 }, { "epoch": 0.09308940461568298, "grad_norm": 4.721250057220459, "learning_rate": 3.0981067125645443e-06, "loss": 1.9370807647705077, "memory(GiB)": 76.04, "step": 360, "token_acc": 0.584777590187093, "train_speed(iter/s)": 0.027911 }, { "epoch": 0.09438231301312303, "grad_norm": 3.189765691757202, "learning_rate": 3.1411359724612737e-06, "loss": 1.9602073669433593, "memory(GiB)": 76.04, "step": 365, "token_acc": 0.6110412738319715, "train_speed(iter/s)": 0.027898 }, { "epoch": 0.09567522141056306, "grad_norm": 4.200387001037598, "learning_rate": 3.1841652323580036e-06, "loss": 1.8618885040283204, "memory(GiB)": 76.04, "step": 370, "token_acc": 0.6040174341481903, "train_speed(iter/s)": 0.027895 }, { "epoch": 0.09696812980800311, "grad_norm": 3.2454254627227783, "learning_rate": 3.2271944922547334e-06, "loss": 1.821019744873047, "memory(GiB)": 76.04, "step": 375, "token_acc": 0.6053048476893134, "train_speed(iter/s)": 0.027882 }, { "epoch": 0.09826103820544314, "grad_norm": 3.34602427482605, "learning_rate": 3.2702237521514633e-06, "loss": 1.814716911315918, "memory(GiB)": 76.04, "step": 380, "token_acc": 0.6051584430617856, "train_speed(iter/s)": 0.027888 }, { "epoch": 0.09955394660288319, "grad_norm": 3.311286211013794, "learning_rate": 3.313253012048193e-06, "loss": 1.8220832824707032, "memory(GiB)": 76.04, "step": 385, "token_acc": 0.5944087085601187, "train_speed(iter/s)": 0.027892 }, { "epoch": 0.10084685500032323, "grad_norm": 5.586461544036865, "learning_rate": 3.356282271944923e-06, "loss": 1.7891130447387695, "memory(GiB)": 76.04, "step": 390, "token_acc": 0.618925967321305, "train_speed(iter/s)": 0.027898 }, { "epoch": 0.10213976339776328, "grad_norm": 6.616051197052002, "learning_rate": 3.3993115318416524e-06, "loss": 1.7373517990112304, "memory(GiB)": 76.04, "step": 395, "token_acc": 0.6165356711003628, "train_speed(iter/s)": 0.027877 }, { "epoch": 0.10343267179520331, "grad_norm": 2.835207223892212, "learning_rate": 3.4423407917383822e-06, "loss": 1.6772958755493164, "memory(GiB)": 76.04, "step": 400, "token_acc": 0.6225554448697111, "train_speed(iter/s)": 0.027881 }, { "epoch": 0.10472558019264334, "grad_norm": 2.6146092414855957, "learning_rate": 3.485370051635112e-06, "loss": 1.7310756683349608, "memory(GiB)": 76.04, "step": 405, "token_acc": 0.615843204488778, "train_speed(iter/s)": 0.027878 }, { "epoch": 0.10601848859008339, "grad_norm": 4.303338527679443, "learning_rate": 3.528399311531842e-06, "loss": 1.6970853805541992, "memory(GiB)": 76.04, "step": 410, "token_acc": 0.6082253291152159, "train_speed(iter/s)": 0.027889 }, { "epoch": 0.10731139698752343, "grad_norm": 3.152858257293701, "learning_rate": 3.5714285714285718e-06, "loss": 1.6324186325073242, "memory(GiB)": 76.04, "step": 415, "token_acc": 0.6986581950424278, "train_speed(iter/s)": 0.027905 }, { "epoch": 0.10860430538496348, "grad_norm": 2.5081074237823486, "learning_rate": 3.6144578313253016e-06, "loss": 1.6505191802978516, "memory(GiB)": 76.04, "step": 420, "token_acc": 0.6553758610362383, "train_speed(iter/s)": 0.027916 }, { "epoch": 0.10989721378240351, "grad_norm": 3.3457190990448, "learning_rate": 3.657487091222031e-06, "loss": 1.6839021682739257, "memory(GiB)": 76.04, "step": 425, "token_acc": 0.6694164051234202, "train_speed(iter/s)": 0.027902 }, { "epoch": 0.11119012217984356, "grad_norm": 2.359487295150757, "learning_rate": 3.700516351118761e-06, "loss": 1.6301973342895508, "memory(GiB)": 76.04, "step": 430, "token_acc": 0.6513765837858436, "train_speed(iter/s)": 0.027902 }, { "epoch": 0.1124830305772836, "grad_norm": 2.1152243614196777, "learning_rate": 3.743545611015491e-06, "loss": 1.5873135566711425, "memory(GiB)": 76.04, "step": 435, "token_acc": 0.6180145649674205, "train_speed(iter/s)": 0.027907 }, { "epoch": 0.11377593897472364, "grad_norm": 2.4160220623016357, "learning_rate": 3.7865748709122206e-06, "loss": 1.5810693740844726, "memory(GiB)": 76.04, "step": 440, "token_acc": 0.6297903669547023, "train_speed(iter/s)": 0.027905 }, { "epoch": 0.11506884737216368, "grad_norm": 2.329163074493408, "learning_rate": 3.8296041308089504e-06, "loss": 1.5028837203979493, "memory(GiB)": 76.04, "step": 445, "token_acc": 0.6969645118236885, "train_speed(iter/s)": 0.027912 }, { "epoch": 0.11636175576960373, "grad_norm": 2.004277467727661, "learning_rate": 3.87263339070568e-06, "loss": 1.5502431869506836, "memory(GiB)": 76.04, "step": 450, "token_acc": 0.6654662441179295, "train_speed(iter/s)": 0.027911 }, { "epoch": 0.11765466416704376, "grad_norm": 291.7929382324219, "learning_rate": 3.91566265060241e-06, "loss": 1.6129791259765625, "memory(GiB)": 76.04, "step": 455, "token_acc": 0.623754295532646, "train_speed(iter/s)": 0.027925 }, { "epoch": 0.11894757256448381, "grad_norm": 5.003607749938965, "learning_rate": 3.958691910499139e-06, "loss": 1.523465919494629, "memory(GiB)": 76.04, "step": 460, "token_acc": 0.6120657218111125, "train_speed(iter/s)": 0.027923 }, { "epoch": 0.12024048096192384, "grad_norm": 2.6622467041015625, "learning_rate": 4.00172117039587e-06, "loss": 1.5748241424560547, "memory(GiB)": 76.04, "step": 465, "token_acc": 0.6213646902947996, "train_speed(iter/s)": 0.027931 }, { "epoch": 0.12153338935936389, "grad_norm": 2.4443795680999756, "learning_rate": 4.0447504302926e-06, "loss": 1.526081657409668, "memory(GiB)": 76.04, "step": 470, "token_acc": 0.6476268348713157, "train_speed(iter/s)": 0.027937 }, { "epoch": 0.12282629775680393, "grad_norm": 2.6567704677581787, "learning_rate": 4.087779690189329e-06, "loss": 1.5151639938354493, "memory(GiB)": 76.04, "step": 475, "token_acc": 0.6848871707273217, "train_speed(iter/s)": 0.027943 }, { "epoch": 0.12411920615424397, "grad_norm": 2.540998935699463, "learning_rate": 4.1308089500860585e-06, "loss": 1.4633543014526367, "memory(GiB)": 76.04, "step": 480, "token_acc": 0.6596202575584019, "train_speed(iter/s)": 0.027936 }, { "epoch": 0.12541211455168402, "grad_norm": 2.337562084197998, "learning_rate": 4.173838209982788e-06, "loss": 1.4880233764648438, "memory(GiB)": 76.04, "step": 485, "token_acc": 0.6452993555369705, "train_speed(iter/s)": 0.027931 }, { "epoch": 0.12670502294912406, "grad_norm": 2.5540599822998047, "learning_rate": 4.216867469879519e-06, "loss": 1.4692201614379883, "memory(GiB)": 76.04, "step": 490, "token_acc": 0.6396389676264359, "train_speed(iter/s)": 0.027945 }, { "epoch": 0.1279979313465641, "grad_norm": 2.3116421699523926, "learning_rate": 4.259896729776248e-06, "loss": 1.4556035995483398, "memory(GiB)": 76.04, "step": 495, "token_acc": 0.6138832517607735, "train_speed(iter/s)": 0.02794 }, { "epoch": 0.12929083974400413, "grad_norm": 1.984755039215088, "learning_rate": 4.302925989672978e-06, "loss": 1.4775214195251465, "memory(GiB)": 76.04, "step": 500, "token_acc": 0.6642163033079905, "train_speed(iter/s)": 0.027946 }, { "epoch": 0.1305837481414442, "grad_norm": 2.973404884338379, "learning_rate": 4.345955249569708e-06, "loss": 1.421070671081543, "memory(GiB)": 76.04, "step": 505, "token_acc": 0.6482611781405252, "train_speed(iter/s)": 0.027935 }, { "epoch": 0.13187665653888422, "grad_norm": 2.4486446380615234, "learning_rate": 4.388984509466438e-06, "loss": 1.4313584327697755, "memory(GiB)": 76.04, "step": 510, "token_acc": 0.6798098365476511, "train_speed(iter/s)": 0.027937 }, { "epoch": 0.13316956493632426, "grad_norm": 2.326204776763916, "learning_rate": 4.4320137693631674e-06, "loss": 1.4402247428894044, "memory(GiB)": 76.04, "step": 515, "token_acc": 0.7008269899445541, "train_speed(iter/s)": 0.02794 }, { "epoch": 0.1344624733337643, "grad_norm": 1.8890414237976074, "learning_rate": 4.475043029259897e-06, "loss": 1.3767863273620606, "memory(GiB)": 76.04, "step": 520, "token_acc": 0.6819113223176527, "train_speed(iter/s)": 0.027946 }, { "epoch": 0.13575538173120436, "grad_norm": 2.8504834175109863, "learning_rate": 4.518072289156627e-06, "loss": 1.407078170776367, "memory(GiB)": 76.04, "step": 525, "token_acc": 0.683960224816256, "train_speed(iter/s)": 0.027961 }, { "epoch": 0.1370482901286444, "grad_norm": 2.0273118019104004, "learning_rate": 4.561101549053357e-06, "loss": 1.336355972290039, "memory(GiB)": 76.04, "step": 530, "token_acc": 0.6482288828337874, "train_speed(iter/s)": 0.027971 }, { "epoch": 0.13834119852608442, "grad_norm": 1.9206314086914062, "learning_rate": 4.604130808950086e-06, "loss": 1.3975639343261719, "memory(GiB)": 76.04, "step": 535, "token_acc": 0.7144246703653001, "train_speed(iter/s)": 0.027969 }, { "epoch": 0.13963410692352446, "grad_norm": 2.952894687652588, "learning_rate": 4.647160068846816e-06, "loss": 1.372206974029541, "memory(GiB)": 76.04, "step": 540, "token_acc": 0.6865656633371321, "train_speed(iter/s)": 0.027968 }, { "epoch": 0.14092701532096452, "grad_norm": 1.6340878009796143, "learning_rate": 4.6901893287435465e-06, "loss": 1.373509120941162, "memory(GiB)": 76.04, "step": 545, "token_acc": 0.6471132494448557, "train_speed(iter/s)": 0.027965 }, { "epoch": 0.14221992371840456, "grad_norm": 1.9484697580337524, "learning_rate": 4.7332185886402755e-06, "loss": 1.3788504600524902, "memory(GiB)": 76.04, "step": 550, "token_acc": 0.6455056445137216, "train_speed(iter/s)": 0.02796 }, { "epoch": 0.1435128321158446, "grad_norm": 2.1868441104888916, "learning_rate": 4.776247848537005e-06, "loss": 1.3279705047607422, "memory(GiB)": 76.04, "step": 555, "token_acc": 0.6940514224859273, "train_speed(iter/s)": 0.027951 }, { "epoch": 0.14480574051328463, "grad_norm": 1.7936123609542847, "learning_rate": 4.819277108433735e-06, "loss": 1.2733698844909669, "memory(GiB)": 76.04, "step": 560, "token_acc": 0.6824586324720697, "train_speed(iter/s)": 0.027957 }, { "epoch": 0.1460986489107247, "grad_norm": 2.035456418991089, "learning_rate": 4.862306368330465e-06, "loss": 1.3023791313171387, "memory(GiB)": 76.04, "step": 565, "token_acc": 0.732261012611954, "train_speed(iter/s)": 0.027962 }, { "epoch": 0.14739155730816472, "grad_norm": 1.8656188249588013, "learning_rate": 4.905335628227195e-06, "loss": 1.2920080184936524, "memory(GiB)": 76.04, "step": 570, "token_acc": 0.7507948232120492, "train_speed(iter/s)": 0.027955 }, { "epoch": 0.14868446570560476, "grad_norm": 2.3736183643341064, "learning_rate": 4.948364888123925e-06, "loss": 1.3393060684204101, "memory(GiB)": 76.04, "step": 575, "token_acc": 0.6658280922431866, "train_speed(iter/s)": 0.027956 }, { "epoch": 0.1499773741030448, "grad_norm": 1.932016372680664, "learning_rate": 4.991394148020655e-06, "loss": 1.2938769340515137, "memory(GiB)": 76.04, "step": 580, "token_acc": 0.6553120323915148, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.15127028250048485, "grad_norm": 2.1315174102783203, "learning_rate": 4.999998374576611e-06, "loss": 1.3226532936096191, "memory(GiB)": 76.04, "step": 585, "token_acc": 0.6744726857329363, "train_speed(iter/s)": 0.027952 }, { "epoch": 0.1525631908979249, "grad_norm": 1.8392802476882935, "learning_rate": 4.999991771297712e-06, "loss": 1.298147964477539, "memory(GiB)": 76.04, "step": 590, "token_acc": 0.6859605911330049, "train_speed(iter/s)": 0.027946 }, { "epoch": 0.15385609929536492, "grad_norm": 1.761626124382019, "learning_rate": 4.999980088587748e-06, "loss": 1.3261050224304198, "memory(GiB)": 76.04, "step": 595, "token_acc": 0.7036491873658387, "train_speed(iter/s)": 0.027948 }, { "epoch": 0.15514900769280496, "grad_norm": 1.6543495655059814, "learning_rate": 4.999963326470457e-06, "loss": 1.2572940826416015, "memory(GiB)": 76.04, "step": 600, "token_acc": 0.6750695088044486, "train_speed(iter/s)": 0.027957 }, { "epoch": 0.156441916090245, "grad_norm": 1.7115892171859741, "learning_rate": 4.999941484979894e-06, "loss": 1.279404354095459, "memory(GiB)": 76.04, "step": 605, "token_acc": 0.6865163550249149, "train_speed(iter/s)": 0.027961 }, { "epoch": 0.15773482448768505, "grad_norm": 1.595937967300415, "learning_rate": 4.999914564160437e-06, "loss": 1.239435577392578, "memory(GiB)": 76.04, "step": 610, "token_acc": 0.6807362476469357, "train_speed(iter/s)": 0.02795 }, { "epoch": 0.1590277328851251, "grad_norm": 1.494962215423584, "learning_rate": 4.9998825640667835e-06, "loss": 1.249193286895752, "memory(GiB)": 76.04, "step": 615, "token_acc": 0.7230293299917766, "train_speed(iter/s)": 0.027954 }, { "epoch": 0.16032064128256512, "grad_norm": 1.5208592414855957, "learning_rate": 4.99984548476395e-06, "loss": 1.259343719482422, "memory(GiB)": 76.04, "step": 620, "token_acc": 0.7032384586844755, "train_speed(iter/s)": 0.027964 }, { "epoch": 0.16161354968000516, "grad_norm": 1.6536712646484375, "learning_rate": 4.999803326327274e-06, "loss": 1.2767888069152833, "memory(GiB)": 76.04, "step": 625, "token_acc": 0.6942289403874068, "train_speed(iter/s)": 0.027964 }, { "epoch": 0.16290645807744522, "grad_norm": 1.6706756353378296, "learning_rate": 4.9997560888424115e-06, "loss": 1.2389703750610352, "memory(GiB)": 76.04, "step": 630, "token_acc": 0.7009845373278569, "train_speed(iter/s)": 0.027965 }, { "epoch": 0.16419936647488526, "grad_norm": 1.7092621326446533, "learning_rate": 4.999703772405339e-06, "loss": 1.1957599639892578, "memory(GiB)": 76.04, "step": 635, "token_acc": 0.7072115946546149, "train_speed(iter/s)": 0.027969 }, { "epoch": 0.1654922748723253, "grad_norm": 1.8869761228561401, "learning_rate": 4.999646377122352e-06, "loss": 1.1767961502075195, "memory(GiB)": 76.04, "step": 640, "token_acc": 0.7185826888756692, "train_speed(iter/s)": 0.027965 }, { "epoch": 0.16678518326976532, "grad_norm": 1.5505192279815674, "learning_rate": 4.9995839031100636e-06, "loss": 1.2072343826293945, "memory(GiB)": 76.04, "step": 645, "token_acc": 0.6714076782449726, "train_speed(iter/s)": 0.027966 }, { "epoch": 0.1680780916672054, "grad_norm": 1.6234943866729736, "learning_rate": 4.9995163504954105e-06, "loss": 1.1328813552856445, "memory(GiB)": 76.04, "step": 650, "token_acc": 0.718151112416477, "train_speed(iter/s)": 0.02796 }, { "epoch": 0.16937100006464542, "grad_norm": 1.6178089380264282, "learning_rate": 4.999443719415641e-06, "loss": 1.2163790702819823, "memory(GiB)": 76.04, "step": 655, "token_acc": 0.7040156056713294, "train_speed(iter/s)": 0.027964 }, { "epoch": 0.17066390846208546, "grad_norm": 1.6068527698516846, "learning_rate": 4.999366010018328e-06, "loss": 1.254256248474121, "memory(GiB)": 76.04, "step": 660, "token_acc": 0.6559546915269338, "train_speed(iter/s)": 0.02796 }, { "epoch": 0.1719568168595255, "grad_norm": 1.6556439399719238, "learning_rate": 4.999283222461359e-06, "loss": 1.1994304656982422, "memory(GiB)": 76.04, "step": 665, "token_acc": 0.7349007266163743, "train_speed(iter/s)": 0.027956 }, { "epoch": 0.17324972525696555, "grad_norm": 1.3648124933242798, "learning_rate": 4.999195356912941e-06, "loss": 1.1895877838134765, "memory(GiB)": 76.04, "step": 670, "token_acc": 0.7510124364534566, "train_speed(iter/s)": 0.027953 }, { "epoch": 0.1745426336544056, "grad_norm": 1.3924592733383179, "learning_rate": 4.999102413551594e-06, "loss": 1.1863578796386718, "memory(GiB)": 76.04, "step": 675, "token_acc": 0.7059563448020718, "train_speed(iter/s)": 0.02796 }, { "epoch": 0.17583554205184562, "grad_norm": 1.3376160860061646, "learning_rate": 4.9990043925661625e-06, "loss": 1.2073113441467285, "memory(GiB)": 76.04, "step": 680, "token_acc": 0.7219528395881767, "train_speed(iter/s)": 0.027965 }, { "epoch": 0.17712845044928566, "grad_norm": 1.3140792846679688, "learning_rate": 4.998901294155801e-06, "loss": 1.1344953536987306, "memory(GiB)": 76.04, "step": 685, "token_acc": 0.7513243683781581, "train_speed(iter/s)": 0.027967 }, { "epoch": 0.17842135884672572, "grad_norm": 1.3123114109039307, "learning_rate": 4.9987931185299836e-06, "loss": 1.1784892082214355, "memory(GiB)": 76.04, "step": 690, "token_acc": 0.6674288089794221, "train_speed(iter/s)": 0.027966 }, { "epoch": 0.17971426724416575, "grad_norm": 2.659935235977173, "learning_rate": 4.998679865908499e-06, "loss": 1.157151985168457, "memory(GiB)": 76.04, "step": 695, "token_acc": 0.7257713001430225, "train_speed(iter/s)": 0.027976 }, { "epoch": 0.1810071756416058, "grad_norm": 1.3218928575515747, "learning_rate": 4.998561536521452e-06, "loss": 1.1583141326904296, "memory(GiB)": 76.04, "step": 700, "token_acc": 0.7759477598403773, "train_speed(iter/s)": 0.027968 }, { "epoch": 0.18230008403904582, "grad_norm": 1.296757698059082, "learning_rate": 4.998438130609261e-06, "loss": 1.1259532928466798, "memory(GiB)": 76.04, "step": 705, "token_acc": 0.727386377384361, "train_speed(iter/s)": 0.02797 }, { "epoch": 0.18359299243648589, "grad_norm": 1.4752744436264038, "learning_rate": 4.9983096484226605e-06, "loss": 1.1163427352905273, "memory(GiB)": 76.04, "step": 710, "token_acc": 0.7253694101134547, "train_speed(iter/s)": 0.027972 }, { "epoch": 0.18488590083392592, "grad_norm": 1.3070727586746216, "learning_rate": 4.998176090222697e-06, "loss": 1.1096561431884766, "memory(GiB)": 76.04, "step": 715, "token_acc": 0.7192952446117004, "train_speed(iter/s)": 0.027971 }, { "epoch": 0.18617880923136595, "grad_norm": 1.4229007959365845, "learning_rate": 4.998037456280732e-06, "loss": 1.1451845169067383, "memory(GiB)": 76.04, "step": 720, "token_acc": 0.7217227852239294, "train_speed(iter/s)": 0.027973 }, { "epoch": 0.187471717628806, "grad_norm": 1.2608271837234497, "learning_rate": 4.9978937468784376e-06, "loss": 1.106486701965332, "memory(GiB)": 76.04, "step": 725, "token_acc": 0.7184099215637961, "train_speed(iter/s)": 0.027972 }, { "epoch": 0.18876462602624605, "grad_norm": 1.3104456663131714, "learning_rate": 4.9977449623078015e-06, "loss": 1.1219176292419433, "memory(GiB)": 76.04, "step": 730, "token_acc": 0.7516463274234401, "train_speed(iter/s)": 0.027972 }, { "epoch": 0.1900575344236861, "grad_norm": 1.353380799293518, "learning_rate": 4.9975911028711195e-06, "loss": 1.1417633056640626, "memory(GiB)": 76.04, "step": 735, "token_acc": 0.739439049637699, "train_speed(iter/s)": 0.027974 }, { "epoch": 0.19135044282112612, "grad_norm": 1.4334852695465088, "learning_rate": 4.997432168881002e-06, "loss": 1.1226820945739746, "memory(GiB)": 76.04, "step": 740, "token_acc": 0.7192217376719222, "train_speed(iter/s)": 0.027973 }, { "epoch": 0.19264335121856616, "grad_norm": 1.3817933797836304, "learning_rate": 4.997268160660366e-06, "loss": 1.1217589378356934, "memory(GiB)": 76.04, "step": 745, "token_acc": 0.731275833562965, "train_speed(iter/s)": 0.027968 }, { "epoch": 0.19393625961600622, "grad_norm": 1.2183865308761597, "learning_rate": 4.99709907854244e-06, "loss": 1.1149643898010253, "memory(GiB)": 76.04, "step": 750, "token_acc": 0.7301776086267048, "train_speed(iter/s)": 0.027963 }, { "epoch": 0.19522916801344625, "grad_norm": 1.2150975465774536, "learning_rate": 4.9969249228707625e-06, "loss": 1.0912652969360352, "memory(GiB)": 76.04, "step": 755, "token_acc": 0.7373849358328294, "train_speed(iter/s)": 0.027958 }, { "epoch": 0.1965220764108863, "grad_norm": 1.3223531246185303, "learning_rate": 4.996745693999179e-06, "loss": 1.1588199615478516, "memory(GiB)": 76.04, "step": 760, "token_acc": 0.7059101248980643, "train_speed(iter/s)": 0.027957 }, { "epoch": 0.19781498480832632, "grad_norm": 1.419268250465393, "learning_rate": 4.996561392291842e-06, "loss": 1.1392223358154296, "memory(GiB)": 76.04, "step": 765, "token_acc": 0.6890176058642581, "train_speed(iter/s)": 0.027957 }, { "epoch": 0.19910789320576638, "grad_norm": 1.4065704345703125, "learning_rate": 4.996372018123213e-06, "loss": 1.0843055725097657, "memory(GiB)": 76.04, "step": 770, "token_acc": 0.7391510740140037, "train_speed(iter/s)": 0.027954 }, { "epoch": 0.20040080160320642, "grad_norm": 1.3634155988693237, "learning_rate": 4.996177571878058e-06, "loss": 1.1053363800048828, "memory(GiB)": 76.04, "step": 775, "token_acc": 0.7354631733725437, "train_speed(iter/s)": 0.027955 }, { "epoch": 0.20169371000064645, "grad_norm": 1.2733179330825806, "learning_rate": 4.995978053951449e-06, "loss": 1.120106315612793, "memory(GiB)": 76.04, "step": 780, "token_acc": 0.7424737177445047, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.2029866183980865, "grad_norm": 1.224840521812439, "learning_rate": 4.995773464748763e-06, "loss": 1.117567253112793, "memory(GiB)": 76.04, "step": 785, "token_acc": 0.7321232876712329, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.20427952679552655, "grad_norm": 1.5229790210723877, "learning_rate": 4.995563804685679e-06, "loss": 1.0795328140258789, "memory(GiB)": 76.04, "step": 790, "token_acc": 0.7580057607590647, "train_speed(iter/s)": 0.027946 }, { "epoch": 0.20557243519296658, "grad_norm": 1.4150093793869019, "learning_rate": 4.9953490741881796e-06, "loss": 1.1043382644653321, "memory(GiB)": 76.04, "step": 795, "token_acc": 0.700471466457126, "train_speed(iter/s)": 0.027945 }, { "epoch": 0.20686534359040662, "grad_norm": 1.4656771421432495, "learning_rate": 4.9951292736925515e-06, "loss": 1.0530550956726075, "memory(GiB)": 76.04, "step": 800, "token_acc": 0.7616183012073713, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.20815825198784665, "grad_norm": 1.2729520797729492, "learning_rate": 4.994904403645378e-06, "loss": 1.0495079040527344, "memory(GiB)": 76.04, "step": 805, "token_acc": 0.7524990313831849, "train_speed(iter/s)": 0.027946 }, { "epoch": 0.2094511603852867, "grad_norm": 1.4444711208343506, "learning_rate": 4.9946744645035496e-06, "loss": 1.0937026977539062, "memory(GiB)": 76.04, "step": 810, "token_acc": 0.6928733031674208, "train_speed(iter/s)": 0.027942 }, { "epoch": 0.21074406878272675, "grad_norm": 1.390000820159912, "learning_rate": 4.994439456734248e-06, "loss": 1.092994499206543, "memory(GiB)": 76.04, "step": 815, "token_acc": 0.7247861835309672, "train_speed(iter/s)": 0.027945 }, { "epoch": 0.21203697718016679, "grad_norm": 1.1274871826171875, "learning_rate": 4.994199380814958e-06, "loss": 1.064702320098877, "memory(GiB)": 76.04, "step": 820, "token_acc": 0.7177709296353364, "train_speed(iter/s)": 0.027947 }, { "epoch": 0.21332988557760682, "grad_norm": 1.1373642683029175, "learning_rate": 4.9939542372334625e-06, "loss": 1.0526296615600585, "memory(GiB)": 76.04, "step": 825, "token_acc": 0.7360295948493579, "train_speed(iter/s)": 0.027945 }, { "epoch": 0.21462279397504685, "grad_norm": 1.2715613842010498, "learning_rate": 4.993704026487837e-06, "loss": 1.0497617721557617, "memory(GiB)": 76.04, "step": 830, "token_acc": 0.7389768524290404, "train_speed(iter/s)": 0.027948 }, { "epoch": 0.21591570237248692, "grad_norm": 1.3336645364761353, "learning_rate": 4.993448749086455e-06, "loss": 1.0744933128356933, "memory(GiB)": 76.04, "step": 835, "token_acc": 0.7657203842049093, "train_speed(iter/s)": 0.027954 }, { "epoch": 0.21720861076992695, "grad_norm": 1.3586411476135254, "learning_rate": 4.9931884055479855e-06, "loss": 1.0742916107177733, "memory(GiB)": 76.04, "step": 840, "token_acc": 0.7213896713615023, "train_speed(iter/s)": 0.027951 }, { "epoch": 0.218501519167367, "grad_norm": 1.2536128759384155, "learning_rate": 4.992922996401386e-06, "loss": 1.0365938186645507, "memory(GiB)": 76.04, "step": 845, "token_acc": 0.7336075791573173, "train_speed(iter/s)": 0.027954 }, { "epoch": 0.21979442756480702, "grad_norm": 1.0951530933380127, "learning_rate": 4.992652522185912e-06, "loss": 1.0514394760131835, "memory(GiB)": 76.04, "step": 850, "token_acc": 0.7264080844107594, "train_speed(iter/s)": 0.027953 }, { "epoch": 0.22108733596224708, "grad_norm": 1.302139401435852, "learning_rate": 4.992376983451106e-06, "loss": 1.0614931106567382, "memory(GiB)": 76.04, "step": 855, "token_acc": 0.7223553005403599, "train_speed(iter/s)": 0.02795 }, { "epoch": 0.22238024435968712, "grad_norm": 1.2117213010787964, "learning_rate": 4.992096380756802e-06, "loss": 1.0211700439453124, "memory(GiB)": 76.04, "step": 860, "token_acc": 0.7403405370006548, "train_speed(iter/s)": 0.027946 }, { "epoch": 0.22367315275712715, "grad_norm": 1.1893075704574585, "learning_rate": 4.9918107146731234e-06, "loss": 1.02754545211792, "memory(GiB)": 76.04, "step": 865, "token_acc": 0.7416530944625407, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.2249660611545672, "grad_norm": 1.3030766248703003, "learning_rate": 4.991519985780479e-06, "loss": 1.0705657958984376, "memory(GiB)": 76.04, "step": 870, "token_acc": 0.7212401717504043, "train_speed(iter/s)": 0.027952 }, { "epoch": 0.22625896955200725, "grad_norm": 1.3120644092559814, "learning_rate": 4.991224194669567e-06, "loss": 1.0438270568847656, "memory(GiB)": 76.04, "step": 875, "token_acc": 0.7455082274151154, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.22755187794944728, "grad_norm": 1.2804142236709595, "learning_rate": 4.99092334194137e-06, "loss": 1.0537703514099122, "memory(GiB)": 76.04, "step": 880, "token_acc": 0.7769719655320311, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.22884478634688732, "grad_norm": 1.141849160194397, "learning_rate": 4.990617428207153e-06, "loss": 1.0109355926513672, "memory(GiB)": 76.04, "step": 885, "token_acc": 0.7504947627552252, "train_speed(iter/s)": 0.027943 }, { "epoch": 0.23013769474432735, "grad_norm": 1.3178867101669312, "learning_rate": 4.990306454088467e-06, "loss": 1.0286626815795898, "memory(GiB)": 76.04, "step": 890, "token_acc": 0.7264516717602518, "train_speed(iter/s)": 0.027938 }, { "epoch": 0.23143060314176742, "grad_norm": 1.5300428867340088, "learning_rate": 4.98999042021714e-06, "loss": 1.0479674339294434, "memory(GiB)": 76.04, "step": 895, "token_acc": 0.7131689263189698, "train_speed(iter/s)": 0.027942 }, { "epoch": 0.23272351153920745, "grad_norm": 1.482164740562439, "learning_rate": 4.989669327235285e-06, "loss": 1.0493934631347657, "memory(GiB)": 76.04, "step": 900, "token_acc": 0.7489839605373375, "train_speed(iter/s)": 0.027946 }, { "epoch": 0.23401641993664749, "grad_norm": 1.259787678718567, "learning_rate": 4.989343175795291e-06, "loss": 1.0503623962402344, "memory(GiB)": 76.04, "step": 905, "token_acc": 0.7800720499077409, "train_speed(iter/s)": 0.027945 }, { "epoch": 0.23530932833408752, "grad_norm": 1.2325483560562134, "learning_rate": 4.9890119665598265e-06, "loss": 1.0217859268188476, "memory(GiB)": 76.04, "step": 910, "token_acc": 0.7489180775947156, "train_speed(iter/s)": 0.027956 }, { "epoch": 0.23660223673152758, "grad_norm": 1.2175496816635132, "learning_rate": 4.988675700201836e-06, "loss": 1.0210546493530273, "memory(GiB)": 76.04, "step": 915, "token_acc": 0.7292449540928989, "train_speed(iter/s)": 0.02796 }, { "epoch": 0.23789514512896762, "grad_norm": 1.282808542251587, "learning_rate": 4.988334377404537e-06, "loss": 1.0249996185302734, "memory(GiB)": 76.04, "step": 920, "token_acc": 0.7644511799090712, "train_speed(iter/s)": 0.027959 }, { "epoch": 0.23918805352640765, "grad_norm": 1.388667106628418, "learning_rate": 4.9879879988614226e-06, "loss": 1.0185768127441406, "memory(GiB)": 76.04, "step": 925, "token_acc": 0.7619047619047619, "train_speed(iter/s)": 0.027952 }, { "epoch": 0.24048096192384769, "grad_norm": 1.1671382188796997, "learning_rate": 4.987636565276258e-06, "loss": 0.9928812026977539, "memory(GiB)": 76.04, "step": 930, "token_acc": 0.7683724235963042, "train_speed(iter/s)": 0.027942 }, { "epoch": 0.24177387032128775, "grad_norm": 1.122075080871582, "learning_rate": 4.987280077363077e-06, "loss": 0.993900203704834, "memory(GiB)": 76.04, "step": 935, "token_acc": 0.7723164046901659, "train_speed(iter/s)": 0.027941 }, { "epoch": 0.24306677871872778, "grad_norm": 1.2567425966262817, "learning_rate": 4.986918535846187e-06, "loss": 1.0273015975952149, "memory(GiB)": 76.04, "step": 940, "token_acc": 0.7539662184442898, "train_speed(iter/s)": 0.027938 }, { "epoch": 0.24435968711616782, "grad_norm": 1.1676665544509888, "learning_rate": 4.986551941460158e-06, "loss": 0.9982240676879883, "memory(GiB)": 76.04, "step": 945, "token_acc": 0.7614623913694936, "train_speed(iter/s)": 0.027934 }, { "epoch": 0.24565259551360785, "grad_norm": 1.2946789264678955, "learning_rate": 4.98618029494983e-06, "loss": 1.0252174377441405, "memory(GiB)": 76.04, "step": 950, "token_acc": 0.7381703470031545, "train_speed(iter/s)": 0.027937 }, { "epoch": 0.24694550391104791, "grad_norm": 1.6065422296524048, "learning_rate": 4.985803597070306e-06, "loss": 1.0155667304992675, "memory(GiB)": 76.04, "step": 955, "token_acc": 0.736665709087427, "train_speed(iter/s)": 0.027941 }, { "epoch": 0.24823841230848795, "grad_norm": 1.2003506422042847, "learning_rate": 4.985421848586954e-06, "loss": 1.020925521850586, "memory(GiB)": 76.04, "step": 960, "token_acc": 0.7488078360613482, "train_speed(iter/s)": 0.027942 }, { "epoch": 0.24953132070592798, "grad_norm": 1.5374075174331665, "learning_rate": 4.985035050275402e-06, "loss": 1.013150405883789, "memory(GiB)": 76.04, "step": 965, "token_acc": 0.7596559355296673, "train_speed(iter/s)": 0.027937 }, { "epoch": 0.25082422910336805, "grad_norm": 1.2153679132461548, "learning_rate": 4.984643202921538e-06, "loss": 1.0238693237304688, "memory(GiB)": 76.04, "step": 970, "token_acc": 0.7379732289822666, "train_speed(iter/s)": 0.027935 }, { "epoch": 0.25211713750080805, "grad_norm": 1.153549075126648, "learning_rate": 4.984246307321511e-06, "loss": 0.9940820693969726, "memory(GiB)": 76.04, "step": 975, "token_acc": 0.7361269479285443, "train_speed(iter/s)": 0.02794 }, { "epoch": 0.2534100458982481, "grad_norm": 1.8333048820495605, "learning_rate": 4.983844364281723e-06, "loss": 0.9937544822692871, "memory(GiB)": 76.04, "step": 980, "token_acc": 0.739548529052237, "train_speed(iter/s)": 0.027942 }, { "epoch": 0.2547029542956882, "grad_norm": 2.401205062866211, "learning_rate": 4.983437374618835e-06, "loss": 1.039668083190918, "memory(GiB)": 76.04, "step": 985, "token_acc": 0.7770313636558127, "train_speed(iter/s)": 0.027946 }, { "epoch": 0.2559958626931282, "grad_norm": 4.683147430419922, "learning_rate": 4.983025339159759e-06, "loss": 1.0015725135803222, "memory(GiB)": 76.04, "step": 990, "token_acc": 0.7626828398144845, "train_speed(iter/s)": 0.027952 }, { "epoch": 0.25728877109056825, "grad_norm": 1.1371662616729736, "learning_rate": 4.982608258741662e-06, "loss": 1.0269445419311523, "memory(GiB)": 76.04, "step": 995, "token_acc": 0.7389006342494715, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.25858167948800825, "grad_norm": 1.1645772457122803, "learning_rate": 4.982186134211957e-06, "loss": 1.0103599548339843, "memory(GiB)": 76.04, "step": 1000, "token_acc": 0.7301087055814378, "train_speed(iter/s)": 0.027949 }, { "epoch": 0.2598745878854483, "grad_norm": 1.1663211584091187, "learning_rate": 4.98175896642831e-06, "loss": 0.9422775268554687, "memory(GiB)": 76.04, "step": 1005, "token_acc": 0.7944890929965557, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.2611674962828884, "grad_norm": 1.0399482250213623, "learning_rate": 4.981326756258629e-06, "loss": 0.9761096000671386, "memory(GiB)": 76.04, "step": 1010, "token_acc": 0.7143371038011112, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.2624604046803284, "grad_norm": 1.1718828678131104, "learning_rate": 4.9808895045810715e-06, "loss": 0.9784045219421387, "memory(GiB)": 76.04, "step": 1015, "token_acc": 0.7504920027194332, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.26375331307776845, "grad_norm": 1.5339823961257935, "learning_rate": 4.980447212284035e-06, "loss": 0.9676334381103515, "memory(GiB)": 76.04, "step": 1020, "token_acc": 0.759060549655097, "train_speed(iter/s)": 0.027814 }, { "epoch": 0.26504622147520845, "grad_norm": 1.1942572593688965, "learning_rate": 4.979999880266162e-06, "loss": 0.9654909133911133, "memory(GiB)": 76.04, "step": 1025, "token_acc": 0.7506817855604995, "train_speed(iter/s)": 0.027813 }, { "epoch": 0.2663391298726485, "grad_norm": 1.2203813791275024, "learning_rate": 4.979547509436329e-06, "loss": 1.0167608261108398, "memory(GiB)": 76.04, "step": 1030, "token_acc": 0.7280177187153931, "train_speed(iter/s)": 0.027814 }, { "epoch": 0.2676320382700886, "grad_norm": 1.1107381582260132, "learning_rate": 4.979090100713657e-06, "loss": 0.944733715057373, "memory(GiB)": 76.04, "step": 1035, "token_acc": 0.8061226833245297, "train_speed(iter/s)": 0.027814 }, { "epoch": 0.2689249466675286, "grad_norm": 1.2653183937072754, "learning_rate": 4.978627655027497e-06, "loss": 0.9633764266967774, "memory(GiB)": 76.04, "step": 1040, "token_acc": 0.7571960586546668, "train_speed(iter/s)": 0.027816 }, { "epoch": 0.27021785506496865, "grad_norm": 1.3168116807937622, "learning_rate": 4.978160173317439e-06, "loss": 1.0249298095703125, "memory(GiB)": 76.04, "step": 1045, "token_acc": 0.7487644982349975, "train_speed(iter/s)": 0.027818 }, { "epoch": 0.2715107634624087, "grad_norm": 1.2121893167495728, "learning_rate": 4.9776876565332995e-06, "loss": 0.9906470298767089, "memory(GiB)": 76.04, "step": 1050, "token_acc": 0.7474919305591904, "train_speed(iter/s)": 0.027814 }, { "epoch": 0.2728036718598487, "grad_norm": 1.1107871532440186, "learning_rate": 4.97721010563513e-06, "loss": 0.9752838134765625, "memory(GiB)": 76.04, "step": 1055, "token_acc": 0.7437383839277812, "train_speed(iter/s)": 0.027814 }, { "epoch": 0.2740965802572888, "grad_norm": 1.0446956157684326, "learning_rate": 4.976727521593209e-06, "loss": 0.9726054191589355, "memory(GiB)": 76.04, "step": 1060, "token_acc": 0.7167705088265836, "train_speed(iter/s)": 0.027817 }, { "epoch": 0.2753894886547288, "grad_norm": 1.2050210237503052, "learning_rate": 4.9762399053880395e-06, "loss": 1.0308534622192382, "memory(GiB)": 76.04, "step": 1065, "token_acc": 0.7568904593639576, "train_speed(iter/s)": 0.027811 }, { "epoch": 0.27668239705216885, "grad_norm": 1.5667829513549805, "learning_rate": 4.97574725801035e-06, "loss": 0.9559214591979981, "memory(GiB)": 76.04, "step": 1070, "token_acc": 0.7598421892050701, "train_speed(iter/s)": 0.027811 }, { "epoch": 0.2779753054496089, "grad_norm": 1.1865519285202026, "learning_rate": 4.975249580461092e-06, "loss": 1.010297679901123, "memory(GiB)": 76.04, "step": 1075, "token_acc": 0.744272567064813, "train_speed(iter/s)": 0.02781 }, { "epoch": 0.2792682138470489, "grad_norm": 1.058516263961792, "learning_rate": 4.974746873751435e-06, "loss": 1.015725040435791, "memory(GiB)": 76.04, "step": 1080, "token_acc": 0.7451629446594247, "train_speed(iter/s)": 0.027809 }, { "epoch": 0.280561122244489, "grad_norm": 1.0704551935195923, "learning_rate": 4.9742391389027695e-06, "loss": 0.9716552734375, "memory(GiB)": 76.04, "step": 1085, "token_acc": 0.7283868278449354, "train_speed(iter/s)": 0.027812 }, { "epoch": 0.28185403064192904, "grad_norm": 1.0352953672409058, "learning_rate": 4.973726376946699e-06, "loss": 0.9752028465270997, "memory(GiB)": 76.04, "step": 1090, "token_acc": 0.7493878800244848, "train_speed(iter/s)": 0.027812 }, { "epoch": 0.28314693903936905, "grad_norm": 1.1525194644927979, "learning_rate": 4.973208588925045e-06, "loss": 0.9712867736816406, "memory(GiB)": 76.04, "step": 1095, "token_acc": 0.7517407605784682, "train_speed(iter/s)": 0.027815 }, { "epoch": 0.2844398474368091, "grad_norm": 1.0644663572311401, "learning_rate": 4.972685775889836e-06, "loss": 0.9582048416137695, "memory(GiB)": 76.04, "step": 1100, "token_acc": 0.7543896103896104, "train_speed(iter/s)": 0.027814 }, { "epoch": 0.2857327558342491, "grad_norm": 1.143939733505249, "learning_rate": 4.9721579389033125e-06, "loss": 0.9977324485778809, "memory(GiB)": 76.04, "step": 1105, "token_acc": 0.7675656607767053, "train_speed(iter/s)": 0.027815 }, { "epoch": 0.2870256642316892, "grad_norm": 1.3790191411972046, "learning_rate": 4.971625079037925e-06, "loss": 0.9518113136291504, "memory(GiB)": 76.04, "step": 1110, "token_acc": 0.7680853988179778, "train_speed(iter/s)": 0.027815 }, { "epoch": 0.28831857262912924, "grad_norm": 1.1732274293899536, "learning_rate": 4.971087197376325e-06, "loss": 0.9738147735595704, "memory(GiB)": 76.04, "step": 1115, "token_acc": 0.7463278436450451, "train_speed(iter/s)": 0.027817 }, { "epoch": 0.28961148102656925, "grad_norm": 1.0615239143371582, "learning_rate": 4.970544295011369e-06, "loss": 0.9704501152038574, "memory(GiB)": 76.04, "step": 1120, "token_acc": 0.72969752000454, "train_speed(iter/s)": 0.027813 }, { "epoch": 0.2909043894240093, "grad_norm": 3877.243896484375, "learning_rate": 4.969996373046117e-06, "loss": 1.222921085357666, "memory(GiB)": 76.04, "step": 1125, "token_acc": 0.7576140282702677, "train_speed(iter/s)": 0.027814 }, { "epoch": 0.2921972978214494, "grad_norm": 358.6942138671875, "learning_rate": 4.969443432593823e-06, "loss": 4.470377349853516, "memory(GiB)": 76.04, "step": 1130, "token_acc": 0.40514805901433504, "train_speed(iter/s)": 0.027816 }, { "epoch": 0.2934902062188894, "grad_norm": 646.41748046875, "learning_rate": 4.968885474777941e-06, "loss": 4.353573226928711, "memory(GiB)": 76.04, "step": 1135, "token_acc": 0.404987128989238, "train_speed(iter/s)": 0.027819 }, { "epoch": 0.29478311461632944, "grad_norm": 133.66444396972656, "learning_rate": 4.968322500732118e-06, "loss": 3.6896575927734374, "memory(GiB)": 76.04, "step": 1140, "token_acc": 0.4133568570076165, "train_speed(iter/s)": 0.027822 }, { "epoch": 0.29607602301376945, "grad_norm": 371.82977294921875, "learning_rate": 4.967754511600192e-06, "loss": 3.5502925872802735, "memory(GiB)": 76.04, "step": 1145, "token_acc": 0.5075539022168236, "train_speed(iter/s)": 0.027827 }, { "epoch": 0.2973689314112095, "grad_norm": 716.2843627929688, "learning_rate": 4.967181508536193e-06, "loss": 2.8075958251953126, "memory(GiB)": 76.04, "step": 1150, "token_acc": 0.47253147435230464, "train_speed(iter/s)": 0.027825 }, { "epoch": 0.2986618398086496, "grad_norm": 235.48390197753906, "learning_rate": 4.9666034927043346e-06, "loss": 2.0499923706054686, "memory(GiB)": 76.04, "step": 1155, "token_acc": 0.6685902289996606, "train_speed(iter/s)": 0.027823 }, { "epoch": 0.2999547482060896, "grad_norm": 4.845156192779541, "learning_rate": 4.96602046527902e-06, "loss": 1.2279346466064454, "memory(GiB)": 76.04, "step": 1160, "token_acc": 0.7027854164818593, "train_speed(iter/s)": 0.027823 }, { "epoch": 0.30124765660352965, "grad_norm": 1.512497901916504, "learning_rate": 4.96543242744483e-06, "loss": 1.0421756744384765, "memory(GiB)": 76.04, "step": 1165, "token_acc": 0.7120159370292989, "train_speed(iter/s)": 0.027825 }, { "epoch": 0.3025405650009697, "grad_norm": 1.465827465057373, "learning_rate": 4.964839380396529e-06, "loss": 0.953398609161377, "memory(GiB)": 76.04, "step": 1170, "token_acc": 0.7572973559518211, "train_speed(iter/s)": 0.027828 }, { "epoch": 0.3038334733984097, "grad_norm": 1.2102559804916382, "learning_rate": 4.964241325339056e-06, "loss": 0.9315123558044434, "memory(GiB)": 76.04, "step": 1175, "token_acc": 0.7300858454781394, "train_speed(iter/s)": 0.027824 }, { "epoch": 0.3051263817958498, "grad_norm": 2.6368777751922607, "learning_rate": 4.963638263487528e-06, "loss": 0.9297597885131836, "memory(GiB)": 76.04, "step": 1180, "token_acc": 0.7474219317356572, "train_speed(iter/s)": 0.027825 }, { "epoch": 0.3064192901932898, "grad_norm": 353.7561950683594, "learning_rate": 4.963030196067233e-06, "loss": 0.942746639251709, "memory(GiB)": 76.04, "step": 1185, "token_acc": 0.7225467822911913, "train_speed(iter/s)": 0.027827 }, { "epoch": 0.30771219859072985, "grad_norm": 1.3002510070800781, "learning_rate": 4.96241712431363e-06, "loss": 0.9472110748291016, "memory(GiB)": 76.04, "step": 1190, "token_acc": 0.7242681047765793, "train_speed(iter/s)": 0.027824 }, { "epoch": 0.3090051069881699, "grad_norm": 1.1316903829574585, "learning_rate": 4.9617990494723444e-06, "loss": 0.9454745292663574, "memory(GiB)": 76.04, "step": 1195, "token_acc": 0.7497220256747195, "train_speed(iter/s)": 0.027828 }, { "epoch": 0.3102980153856099, "grad_norm": 1.108446478843689, "learning_rate": 4.961175972799169e-06, "loss": 0.9554048538208008, "memory(GiB)": 76.04, "step": 1200, "token_acc": 0.7410781445883828, "train_speed(iter/s)": 0.02783 }, { "epoch": 0.31159092378305, "grad_norm": 1.25348961353302, "learning_rate": 4.960547895560058e-06, "loss": 0.9723408699035645, "memory(GiB)": 76.04, "step": 1205, "token_acc": 0.7924053665548635, "train_speed(iter/s)": 0.027837 }, { "epoch": 0.31288383218049, "grad_norm": 2.172182559967041, "learning_rate": 4.959914819031125e-06, "loss": 0.9340225219726562, "memory(GiB)": 76.04, "step": 1210, "token_acc": 0.7698709945900957, "train_speed(iter/s)": 0.027838 }, { "epoch": 0.31417674057793005, "grad_norm": 9.525141716003418, "learning_rate": 4.959276744498642e-06, "loss": 0.9623056411743164, "memory(GiB)": 76.04, "step": 1215, "token_acc": 0.7322046531438002, "train_speed(iter/s)": 0.027841 }, { "epoch": 0.3154696489753701, "grad_norm": 1.1645443439483643, "learning_rate": 4.9586336732590344e-06, "loss": 0.9339606285095214, "memory(GiB)": 76.04, "step": 1220, "token_acc": 0.7669276434655424, "train_speed(iter/s)": 0.02784 }, { "epoch": 0.3167625573728101, "grad_norm": 1.1521093845367432, "learning_rate": 4.957985606618882e-06, "loss": 0.9695714950561524, "memory(GiB)": 76.04, "step": 1225, "token_acc": 0.7714450456843785, "train_speed(iter/s)": 0.027843 }, { "epoch": 0.3180554657702502, "grad_norm": 1.1521477699279785, "learning_rate": 4.957332545894914e-06, "loss": 0.9398648262023925, "memory(GiB)": 76.04, "step": 1230, "token_acc": 0.8035982876316093, "train_speed(iter/s)": 0.027843 }, { "epoch": 0.31934837416769024, "grad_norm": 1.1556190252304077, "learning_rate": 4.956674492414003e-06, "loss": 0.9573787689208985, "memory(GiB)": 76.04, "step": 1235, "token_acc": 0.7518860016764459, "train_speed(iter/s)": 0.027841 }, { "epoch": 0.32064128256513025, "grad_norm": 1.3817846775054932, "learning_rate": 4.95601144751317e-06, "loss": 0.9417957305908203, "memory(GiB)": 76.04, "step": 1240, "token_acc": 0.7512520868113522, "train_speed(iter/s)": 0.027839 }, { "epoch": 0.3219341909625703, "grad_norm": 1.157069206237793, "learning_rate": 4.955343412539576e-06, "loss": 0.9470592498779297, "memory(GiB)": 76.04, "step": 1245, "token_acc": 0.7514258079578428, "train_speed(iter/s)": 0.027833 }, { "epoch": 0.3232270993600103, "grad_norm": 1.0551875829696655, "learning_rate": 4.954670388850521e-06, "loss": 0.9208686828613282, "memory(GiB)": 76.04, "step": 1250, "token_acc": 0.7600368787193027, "train_speed(iter/s)": 0.027834 }, { "epoch": 0.3245200077574504, "grad_norm": 1.0360503196716309, "learning_rate": 4.953992377813438e-06, "loss": 0.9444967269897461, "memory(GiB)": 76.04, "step": 1255, "token_acc": 0.8037869164814226, "train_speed(iter/s)": 0.027836 }, { "epoch": 0.32581291615489044, "grad_norm": 0.9959269762039185, "learning_rate": 4.953309380805897e-06, "loss": 0.9297657012939453, "memory(GiB)": 76.04, "step": 1260, "token_acc": 0.7694388999778221, "train_speed(iter/s)": 0.027837 }, { "epoch": 0.32710582455233045, "grad_norm": 1.429611086845398, "learning_rate": 4.952621399215598e-06, "loss": 0.9631167411804199, "memory(GiB)": 76.04, "step": 1265, "token_acc": 0.7283613171938045, "train_speed(iter/s)": 0.027842 }, { "epoch": 0.3283987329497705, "grad_norm": 1.3571066856384277, "learning_rate": 4.951928434440367e-06, "loss": 0.9502096176147461, "memory(GiB)": 76.04, "step": 1270, "token_acc": 0.7302693616497888, "train_speed(iter/s)": 0.027844 }, { "epoch": 0.3296916413472106, "grad_norm": 3.4126250743865967, "learning_rate": 4.951230487888154e-06, "loss": 0.9481155395507812, "memory(GiB)": 76.04, "step": 1275, "token_acc": 0.7534746180384426, "train_speed(iter/s)": 0.027842 }, { "epoch": 0.3309845497446506, "grad_norm": 1.0404834747314453, "learning_rate": 4.950527560977035e-06, "loss": 0.945584487915039, "memory(GiB)": 76.04, "step": 1280, "token_acc": 0.7738585496866607, "train_speed(iter/s)": 0.027844 }, { "epoch": 0.33227745814209064, "grad_norm": 1.1452839374542236, "learning_rate": 4.9498196551352e-06, "loss": 0.9602731704711914, "memory(GiB)": 76.04, "step": 1285, "token_acc": 0.722290316932236, "train_speed(iter/s)": 0.027851 }, { "epoch": 0.33357036653953065, "grad_norm": 1.2483798265457153, "learning_rate": 4.949106771800958e-06, "loss": 0.9069469451904297, "memory(GiB)": 76.04, "step": 1290, "token_acc": 0.7798567304608147, "train_speed(iter/s)": 0.027851 }, { "epoch": 0.3348632749369707, "grad_norm": 1.1929678916931152, "learning_rate": 4.94838891242273e-06, "loss": 0.924495792388916, "memory(GiB)": 76.04, "step": 1295, "token_acc": 0.7753519103705832, "train_speed(iter/s)": 0.027853 }, { "epoch": 0.3361561833344108, "grad_norm": 1.1313072443008423, "learning_rate": 4.947666078459049e-06, "loss": 0.9245437622070313, "memory(GiB)": 76.04, "step": 1300, "token_acc": 0.7716767637913902, "train_speed(iter/s)": 0.027845 }, { "epoch": 0.3374490917318508, "grad_norm": 1.1400386095046997, "learning_rate": 4.946938271378552e-06, "loss": 0.9137565612792968, "memory(GiB)": 76.04, "step": 1305, "token_acc": 0.7587449115602147, "train_speed(iter/s)": 0.027844 }, { "epoch": 0.33874200012929084, "grad_norm": 1.0596169233322144, "learning_rate": 4.946205492659984e-06, "loss": 0.8961214065551758, "memory(GiB)": 76.04, "step": 1310, "token_acc": 0.7542817732480172, "train_speed(iter/s)": 0.027844 }, { "epoch": 0.3400349085267309, "grad_norm": 0.9843769073486328, "learning_rate": 4.945467743792188e-06, "loss": 0.9182037353515625, "memory(GiB)": 76.04, "step": 1315, "token_acc": 0.766585993622988, "train_speed(iter/s)": 0.027846 }, { "epoch": 0.3413278169241709, "grad_norm": 1.1926190853118896, "learning_rate": 4.9447250262741085e-06, "loss": 0.9283374786376953, "memory(GiB)": 76.04, "step": 1320, "token_acc": 0.7409656847859095, "train_speed(iter/s)": 0.027843 }, { "epoch": 0.342620725321611, "grad_norm": 1.0026227235794067, "learning_rate": 4.943977341614782e-06, "loss": 0.9378311157226562, "memory(GiB)": 76.04, "step": 1325, "token_acc": 0.7585428321089169, "train_speed(iter/s)": 0.027841 }, { "epoch": 0.343913633719051, "grad_norm": 1.1508121490478516, "learning_rate": 4.943224691333339e-06, "loss": 0.9445396423339844, "memory(GiB)": 76.04, "step": 1330, "token_acc": 0.7383880704599721, "train_speed(iter/s)": 0.027834 }, { "epoch": 0.34520654211649104, "grad_norm": 1.5133693218231201, "learning_rate": 4.942467076958999e-06, "loss": 0.8884575843811036, "memory(GiB)": 76.04, "step": 1335, "token_acc": 0.7681140292991949, "train_speed(iter/s)": 0.027831 }, { "epoch": 0.3464994505139311, "grad_norm": 1.0745313167572021, "learning_rate": 4.941704500031066e-06, "loss": 0.8931808471679688, "memory(GiB)": 76.04, "step": 1340, "token_acc": 0.7731155696658266, "train_speed(iter/s)": 0.027831 }, { "epoch": 0.3477923589113711, "grad_norm": 1.1880706548690796, "learning_rate": 4.940936962098929e-06, "loss": 0.9454404830932617, "memory(GiB)": 76.04, "step": 1345, "token_acc": 0.7694478894923662, "train_speed(iter/s)": 0.027836 }, { "epoch": 0.3490852673088112, "grad_norm": 1.056232213973999, "learning_rate": 4.9401644647220545e-06, "loss": 0.8956671714782715, "memory(GiB)": 76.04, "step": 1350, "token_acc": 0.7636090870124304, "train_speed(iter/s)": 0.027838 }, { "epoch": 0.35037817570625124, "grad_norm": 0.9796701073646545, "learning_rate": 4.939387009469988e-06, "loss": 0.9031806945800781, "memory(GiB)": 76.04, "step": 1355, "token_acc": 0.7830841262649146, "train_speed(iter/s)": 0.027838 }, { "epoch": 0.35167108410369124, "grad_norm": 1.0584688186645508, "learning_rate": 4.938604597922346e-06, "loss": 0.9216032981872558, "memory(GiB)": 76.04, "step": 1360, "token_acc": 0.7680820851083322, "train_speed(iter/s)": 0.027842 }, { "epoch": 0.3529639925011313, "grad_norm": 1.00162672996521, "learning_rate": 4.937817231668815e-06, "loss": 0.8896630287170411, "memory(GiB)": 76.04, "step": 1365, "token_acc": 0.7780082987551867, "train_speed(iter/s)": 0.027839 }, { "epoch": 0.3542569008985713, "grad_norm": 2.0998218059539795, "learning_rate": 4.937024912309152e-06, "loss": 0.9393485069274903, "memory(GiB)": 76.04, "step": 1370, "token_acc": 0.7547226992625518, "train_speed(iter/s)": 0.027835 }, { "epoch": 0.3555498092960114, "grad_norm": 1.2376868724822998, "learning_rate": 4.936227641453172e-06, "loss": 0.9312064170837402, "memory(GiB)": 76.04, "step": 1375, "token_acc": 0.7210518525827618, "train_speed(iter/s)": 0.027834 }, { "epoch": 0.35684271769345144, "grad_norm": 1.0578678846359253, "learning_rate": 4.935425420720754e-06, "loss": 0.9209253311157226, "memory(GiB)": 76.04, "step": 1380, "token_acc": 0.7725167678058128, "train_speed(iter/s)": 0.027834 }, { "epoch": 0.35813562609089145, "grad_norm": 1.360901951789856, "learning_rate": 4.934618251741835e-06, "loss": 0.9340425491333008, "memory(GiB)": 76.04, "step": 1385, "token_acc": 0.779114302812687, "train_speed(iter/s)": 0.027829 }, { "epoch": 0.3594285344883315, "grad_norm": 1.5933281183242798, "learning_rate": 4.933806136156402e-06, "loss": 0.8858348846435546, "memory(GiB)": 76.04, "step": 1390, "token_acc": 0.7908785127852725, "train_speed(iter/s)": 0.027822 }, { "epoch": 0.36072144288577157, "grad_norm": 0.9765493273735046, "learning_rate": 4.932989075614496e-06, "loss": 0.9056285858154297, "memory(GiB)": 76.04, "step": 1395, "token_acc": 0.7752316896727017, "train_speed(iter/s)": 0.027825 }, { "epoch": 0.3620143512832116, "grad_norm": 1.0190749168395996, "learning_rate": 4.932167071776203e-06, "loss": 0.8850472450256348, "memory(GiB)": 76.04, "step": 1400, "token_acc": 0.7488913755232293, "train_speed(iter/s)": 0.027823 }, { "epoch": 0.36330725968065164, "grad_norm": 0.9817783832550049, "learning_rate": 4.931340126311652e-06, "loss": 0.8637564659118653, "memory(GiB)": 76.04, "step": 1405, "token_acc": 0.8092789765596534, "train_speed(iter/s)": 0.02782 }, { "epoch": 0.36460016807809165, "grad_norm": 1.116895318031311, "learning_rate": 4.930508240901015e-06, "loss": 0.9004463195800781, "memory(GiB)": 76.04, "step": 1410, "token_acc": 0.7735781849843669, "train_speed(iter/s)": 0.027815 }, { "epoch": 0.3658930764755317, "grad_norm": 1.0126279592514038, "learning_rate": 4.9296714172345e-06, "loss": 0.9231013298034668, "memory(GiB)": 76.04, "step": 1415, "token_acc": 0.7658435279228997, "train_speed(iter/s)": 0.027811 }, { "epoch": 0.36718598487297177, "grad_norm": 0.9945583939552307, "learning_rate": 4.928829657012346e-06, "loss": 0.8893575668334961, "memory(GiB)": 76.04, "step": 1420, "token_acc": 0.7532214137636681, "train_speed(iter/s)": 0.027807 }, { "epoch": 0.3684788932704118, "grad_norm": 1.384425163269043, "learning_rate": 4.927982961944825e-06, "loss": 0.9314968109130859, "memory(GiB)": 76.04, "step": 1425, "token_acc": 0.7419209649833468, "train_speed(iter/s)": 0.027811 }, { "epoch": 0.36977180166785184, "grad_norm": 0.8520684242248535, "learning_rate": 4.9271313337522346e-06, "loss": 0.9494674682617188, "memory(GiB)": 76.04, "step": 1430, "token_acc": 0.782346893817007, "train_speed(iter/s)": 0.027809 }, { "epoch": 0.37106471006529185, "grad_norm": 0.9675486087799072, "learning_rate": 4.926274774164893e-06, "loss": 0.9049705505371094, "memory(GiB)": 76.04, "step": 1435, "token_acc": 0.7790923317683881, "train_speed(iter/s)": 0.02781 }, { "epoch": 0.3723576184627319, "grad_norm": 0.9582749009132385, "learning_rate": 4.925413284923143e-06, "loss": 0.8903584480285645, "memory(GiB)": 76.04, "step": 1440, "token_acc": 0.7866145377848655, "train_speed(iter/s)": 0.027812 }, { "epoch": 0.37365052686017197, "grad_norm": 1.1724884510040283, "learning_rate": 4.924546867777339e-06, "loss": 0.9676746368408203, "memory(GiB)": 76.04, "step": 1445, "token_acc": 0.7439509954058193, "train_speed(iter/s)": 0.027815 }, { "epoch": 0.374943435257612, "grad_norm": 1.0407586097717285, "learning_rate": 4.92367552448785e-06, "loss": 0.9126652717590332, "memory(GiB)": 76.04, "step": 1450, "token_acc": 0.781545586561482, "train_speed(iter/s)": 0.027815 }, { "epoch": 0.37623634365505204, "grad_norm": 1.0518836975097656, "learning_rate": 4.922799256825052e-06, "loss": 0.8564701080322266, "memory(GiB)": 76.04, "step": 1455, "token_acc": 0.7408266266539354, "train_speed(iter/s)": 0.027818 }, { "epoch": 0.3775292520524921, "grad_norm": 0.9088625907897949, "learning_rate": 4.921918066569328e-06, "loss": 0.8742757797241211, "memory(GiB)": 76.04, "step": 1460, "token_acc": 0.7589349964020149, "train_speed(iter/s)": 0.027815 }, { "epoch": 0.3788221604499321, "grad_norm": 1.0474976301193237, "learning_rate": 4.921031955511061e-06, "loss": 0.8954677581787109, "memory(GiB)": 76.04, "step": 1465, "token_acc": 0.7686122547832405, "train_speed(iter/s)": 0.027811 }, { "epoch": 0.3801150688473722, "grad_norm": 1.012302279472351, "learning_rate": 4.920140925450634e-06, "loss": 0.9504472732543945, "memory(GiB)": 76.04, "step": 1470, "token_acc": 0.7751241428233625, "train_speed(iter/s)": 0.027809 }, { "epoch": 0.3814079772448122, "grad_norm": 0.9360347390174866, "learning_rate": 4.919244978198424e-06, "loss": 0.8807231903076171, "memory(GiB)": 76.04, "step": 1475, "token_acc": 0.7640212437379938, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.38270088564225224, "grad_norm": 1.006102442741394, "learning_rate": 4.918344115574797e-06, "loss": 0.9025184631347656, "memory(GiB)": 76.04, "step": 1480, "token_acc": 0.7411331796417805, "train_speed(iter/s)": 0.027809 }, { "epoch": 0.3839937940396923, "grad_norm": 0.8951787948608398, "learning_rate": 4.917438339410105e-06, "loss": 0.8877702713012695, "memory(GiB)": 76.04, "step": 1485, "token_acc": 0.7563314788673918, "train_speed(iter/s)": 0.027806 }, { "epoch": 0.3852867024371323, "grad_norm": 1.0641748905181885, "learning_rate": 4.916527651544689e-06, "loss": 0.8795459747314454, "memory(GiB)": 76.04, "step": 1490, "token_acc": 0.8010365029292474, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.3865796108345724, "grad_norm": 1.0358333587646484, "learning_rate": 4.915612053828862e-06, "loss": 0.8692068099975586, "memory(GiB)": 76.04, "step": 1495, "token_acc": 0.8206166847085937, "train_speed(iter/s)": 0.027807 }, { "epoch": 0.38787251923201244, "grad_norm": 1.0726227760314941, "learning_rate": 4.914691548122919e-06, "loss": 0.8898172378540039, "memory(GiB)": 76.04, "step": 1500, "token_acc": 0.7631806836126535, "train_speed(iter/s)": 0.027806 }, { "epoch": 0.38916542762945244, "grad_norm": 1.0023351907730103, "learning_rate": 4.9137661362971225e-06, "loss": 0.9159588813781738, "memory(GiB)": 76.04, "step": 1505, "token_acc": 0.7451686323194703, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.3904583360268925, "grad_norm": 1.1598299741744995, "learning_rate": 4.912835820231705e-06, "loss": 0.8731332778930664, "memory(GiB)": 76.04, "step": 1510, "token_acc": 0.7731112837444164, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.3917512444243325, "grad_norm": 1.2678931951522827, "learning_rate": 4.9119006018168645e-06, "loss": 0.8393604278564453, "memory(GiB)": 76.04, "step": 1515, "token_acc": 0.7939447383891828, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.3930441528217726, "grad_norm": 1.062391996383667, "learning_rate": 4.910960482952757e-06, "loss": 0.9229723930358886, "memory(GiB)": 76.04, "step": 1520, "token_acc": 0.7733812949640287, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.39433706121921264, "grad_norm": 0.9789305925369263, "learning_rate": 4.910015465549497e-06, "loss": 0.9235004425048828, "memory(GiB)": 76.04, "step": 1525, "token_acc": 0.7600789189591631, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.39562996961665264, "grad_norm": 1.0256842374801636, "learning_rate": 4.909065551527151e-06, "loss": 0.8544706344604492, "memory(GiB)": 76.04, "step": 1530, "token_acc": 0.7887576797255246, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.3969228780140927, "grad_norm": 1.0858112573623657, "learning_rate": 4.908110742815735e-06, "loss": 0.8899390220642089, "memory(GiB)": 76.04, "step": 1535, "token_acc": 0.7800430187973441, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.39821578641153277, "grad_norm": 0.9626816511154175, "learning_rate": 4.907151041355208e-06, "loss": 0.8749662399291992, "memory(GiB)": 76.04, "step": 1540, "token_acc": 0.8031964754405699, "train_speed(iter/s)": 0.027798 }, { "epoch": 0.3995086948089728, "grad_norm": 0.9466427564620972, "learning_rate": 4.9061864490954725e-06, "loss": 0.8291332244873046, "memory(GiB)": 76.04, "step": 1545, "token_acc": 0.76341123125218, "train_speed(iter/s)": 0.027798 }, { "epoch": 0.40080160320641284, "grad_norm": 1.047394037246704, "learning_rate": 4.905216967996367e-06, "loss": 0.8456403732299804, "memory(GiB)": 76.04, "step": 1550, "token_acc": 0.7513071152534667, "train_speed(iter/s)": 0.0278 }, { "epoch": 0.40209451160385284, "grad_norm": 0.9296531677246094, "learning_rate": 4.904242600027662e-06, "loss": 0.8476978302001953, "memory(GiB)": 76.04, "step": 1555, "token_acc": 0.8000686931135154, "train_speed(iter/s)": 0.027798 }, { "epoch": 0.4033874200012929, "grad_norm": 1.176629900932312, "learning_rate": 4.903263347169058e-06, "loss": 0.9175498962402344, "memory(GiB)": 76.04, "step": 1560, "token_acc": 0.8039329091960671, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.40468032839873297, "grad_norm": 1.2354000806808472, "learning_rate": 4.902279211410182e-06, "loss": 0.9165899276733398, "memory(GiB)": 76.04, "step": 1565, "token_acc": 0.7679446219382322, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.405973236796173, "grad_norm": 2.35729718208313, "learning_rate": 4.901290194750579e-06, "loss": 0.8489980697631836, "memory(GiB)": 76.04, "step": 1570, "token_acc": 0.8143410397840629, "train_speed(iter/s)": 0.0278 }, { "epoch": 0.40726614519361304, "grad_norm": 0.986346960067749, "learning_rate": 4.900296299199714e-06, "loss": 0.87310791015625, "memory(GiB)": 76.04, "step": 1575, "token_acc": 0.7906714736367734, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.4085590535910531, "grad_norm": 0.9651816487312317, "learning_rate": 4.899297526776962e-06, "loss": 0.8573389053344727, "memory(GiB)": 76.04, "step": 1580, "token_acc": 0.7923456022732318, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.4098519619884931, "grad_norm": 1.0119379758834839, "learning_rate": 4.898293879511608e-06, "loss": 0.8485713958740234, "memory(GiB)": 76.04, "step": 1585, "token_acc": 0.7772183472677722, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.41114487038593317, "grad_norm": 0.925304651260376, "learning_rate": 4.897285359442841e-06, "loss": 0.891656494140625, "memory(GiB)": 76.04, "step": 1590, "token_acc": 0.7572081654822794, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.4124377787833732, "grad_norm": 1.0190355777740479, "learning_rate": 4.896271968619752e-06, "loss": 0.8359519004821777, "memory(GiB)": 76.04, "step": 1595, "token_acc": 0.7742064125059818, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.41373068718081324, "grad_norm": 4.286404132843018, "learning_rate": 4.895253709101327e-06, "loss": 0.865880012512207, "memory(GiB)": 76.04, "step": 1600, "token_acc": 0.7423144213946513, "train_speed(iter/s)": 0.027806 }, { "epoch": 0.4150235955782533, "grad_norm": 1.1072417497634888, "learning_rate": 4.894230582956444e-06, "loss": 0.8957183837890625, "memory(GiB)": 76.04, "step": 1605, "token_acc": 0.7621177149451818, "train_speed(iter/s)": 0.027807 }, { "epoch": 0.4163165039756933, "grad_norm": 1.0325218439102173, "learning_rate": 4.89320259226387e-06, "loss": 0.8576887130737305, "memory(GiB)": 76.04, "step": 1610, "token_acc": 0.7561050328227571, "train_speed(iter/s)": 0.027809 }, { "epoch": 0.41760941237313337, "grad_norm": 1.0209314823150635, "learning_rate": 4.8921697391122555e-06, "loss": 0.8784740447998047, "memory(GiB)": 76.04, "step": 1615, "token_acc": 0.7576126674786845, "train_speed(iter/s)": 0.02781 }, { "epoch": 0.4189023207705734, "grad_norm": 1.2361773252487183, "learning_rate": 4.891132025600128e-06, "loss": 0.8804727554321289, "memory(GiB)": 76.04, "step": 1620, "token_acc": 0.7516129032258064, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.42019522916801344, "grad_norm": 1.0383259057998657, "learning_rate": 4.890089453835894e-06, "loss": 0.8933810234069824, "memory(GiB)": 76.04, "step": 1625, "token_acc": 0.7678587433898001, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.4214881375654535, "grad_norm": 1.1485199928283691, "learning_rate": 4.889042025937829e-06, "loss": 0.8679392814636231, "memory(GiB)": 76.04, "step": 1630, "token_acc": 0.7426187419768935, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.4227810459628935, "grad_norm": 0.9472554922103882, "learning_rate": 4.887989744034074e-06, "loss": 0.8719472885131836, "memory(GiB)": 76.04, "step": 1635, "token_acc": 0.7547612635142934, "train_speed(iter/s)": 0.027806 }, { "epoch": 0.42407395436033357, "grad_norm": 1.0658771991729736, "learning_rate": 4.886932610262634e-06, "loss": 0.8944738388061524, "memory(GiB)": 76.04, "step": 1640, "token_acc": 0.8233047873087183, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.42536686275777363, "grad_norm": 0.9620645642280579, "learning_rate": 4.885870626771371e-06, "loss": 0.8650775909423828, "memory(GiB)": 76.04, "step": 1645, "token_acc": 0.7568502864923129, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.42665977115521364, "grad_norm": 1.0661425590515137, "learning_rate": 4.884803795718001e-06, "loss": 0.9001960754394531, "memory(GiB)": 76.04, "step": 1650, "token_acc": 0.7570867129358642, "train_speed(iter/s)": 0.027807 }, { "epoch": 0.4279526795526537, "grad_norm": 1.1007188558578491, "learning_rate": 4.88373211927009e-06, "loss": 0.8699914932250976, "memory(GiB)": 76.04, "step": 1655, "token_acc": 0.7583125246418715, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.4292455879500937, "grad_norm": 1.0380903482437134, "learning_rate": 4.882655599605045e-06, "loss": 0.8764565467834473, "memory(GiB)": 76.04, "step": 1660, "token_acc": 0.7656633221850613, "train_speed(iter/s)": 0.027808 }, { "epoch": 0.4305384963475338, "grad_norm": 1.033884882926941, "learning_rate": 4.88157423891012e-06, "loss": 0.8574346542358399, "memory(GiB)": 76.04, "step": 1665, "token_acc": 0.7794178559325226, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.43183140474497383, "grad_norm": 0.911655068397522, "learning_rate": 4.8804880393823986e-06, "loss": 0.8541059494018555, "memory(GiB)": 76.04, "step": 1670, "token_acc": 0.7727084040907204, "train_speed(iter/s)": 0.027807 }, { "epoch": 0.43312431314241384, "grad_norm": 0.9688575863838196, "learning_rate": 4.8793970032287985e-06, "loss": 0.8185391426086426, "memory(GiB)": 76.04, "step": 1675, "token_acc": 0.8178748580649017, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.4344172215398539, "grad_norm": 0.9555157423019409, "learning_rate": 4.878301132666066e-06, "loss": 0.8625661849975585, "memory(GiB)": 76.04, "step": 1680, "token_acc": 0.7851330293761182, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.43571012993729397, "grad_norm": 1.0205975770950317, "learning_rate": 4.877200429920765e-06, "loss": 0.8751688003540039, "memory(GiB)": 76.04, "step": 1685, "token_acc": 0.772077701884416, "train_speed(iter/s)": 0.027807 }, { "epoch": 0.437003038334734, "grad_norm": 1.0219794511795044, "learning_rate": 4.876094897229283e-06, "loss": 0.810630989074707, "memory(GiB)": 76.04, "step": 1690, "token_acc": 0.8176644891911913, "train_speed(iter/s)": 0.027806 }, { "epoch": 0.43829594673217404, "grad_norm": 1.1282055377960205, "learning_rate": 4.874984536837817e-06, "loss": 0.8619385719299316, "memory(GiB)": 76.04, "step": 1695, "token_acc": 0.7675262655205348, "train_speed(iter/s)": 0.027807 }, { "epoch": 0.43958885512961404, "grad_norm": 0.9397704601287842, "learning_rate": 4.873869351002374e-06, "loss": 0.820007610321045, "memory(GiB)": 76.04, "step": 1700, "token_acc": 0.7972864541542053, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.4408817635270541, "grad_norm": 1.0283387899398804, "learning_rate": 4.872749341988765e-06, "loss": 0.8253473281860352, "memory(GiB)": 76.04, "step": 1705, "token_acc": 0.7752377949445584, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.44217467192449417, "grad_norm": 0.9191579818725586, "learning_rate": 4.871624512072603e-06, "loss": 0.8367796897888183, "memory(GiB)": 76.04, "step": 1710, "token_acc": 0.7903411821239789, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.4434675803219342, "grad_norm": 1.2455042600631714, "learning_rate": 4.870494863539291e-06, "loss": 0.8392200469970703, "memory(GiB)": 76.04, "step": 1715, "token_acc": 0.7550399545694236, "train_speed(iter/s)": 0.027798 }, { "epoch": 0.44476048871937424, "grad_norm": 1.0765002965927124, "learning_rate": 4.8693603986840274e-06, "loss": 0.8334452629089355, "memory(GiB)": 76.04, "step": 1720, "token_acc": 0.7612031220255092, "train_speed(iter/s)": 0.027797 }, { "epoch": 0.4460533971168143, "grad_norm": 1.0502086877822876, "learning_rate": 4.868221119811793e-06, "loss": 0.8496732711791992, "memory(GiB)": 76.04, "step": 1725, "token_acc": 0.7960901439044258, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.4473463055142543, "grad_norm": 0.883604884147644, "learning_rate": 4.867077029237352e-06, "loss": 0.817476749420166, "memory(GiB)": 76.04, "step": 1730, "token_acc": 0.746853904492041, "train_speed(iter/s)": 0.027799 }, { "epoch": 0.44863921391169437, "grad_norm": 1.0631402730941772, "learning_rate": 4.865928129285242e-06, "loss": 0.8631902694702148, "memory(GiB)": 76.04, "step": 1735, "token_acc": 0.7656874459231181, "train_speed(iter/s)": 0.0278 }, { "epoch": 0.4499321223091344, "grad_norm": 1.0118037462234497, "learning_rate": 4.864774422289776e-06, "loss": 0.8337348937988281, "memory(GiB)": 76.04, "step": 1740, "token_acc": 0.7787095835959087, "train_speed(iter/s)": 0.027798 }, { "epoch": 0.45122503070657444, "grad_norm": 1.099346399307251, "learning_rate": 4.863615910595031e-06, "loss": 0.8567562103271484, "memory(GiB)": 76.04, "step": 1745, "token_acc": 0.8049837122611412, "train_speed(iter/s)": 0.027797 }, { "epoch": 0.4525179391040145, "grad_norm": 0.9110936522483826, "learning_rate": 4.8624525965548456e-06, "loss": 0.858333683013916, "memory(GiB)": 76.04, "step": 1750, "token_acc": 0.7628092095319663, "train_speed(iter/s)": 0.0278 }, { "epoch": 0.4538108475014545, "grad_norm": 1.1097652912139893, "learning_rate": 4.861284482532819e-06, "loss": 0.8601787567138672, "memory(GiB)": 76.04, "step": 1755, "token_acc": 0.7758728179551122, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.45510375589889457, "grad_norm": 0.9955366253852844, "learning_rate": 4.860111570902298e-06, "loss": 0.8417009353637696, "memory(GiB)": 76.04, "step": 1760, "token_acc": 0.7899390978219729, "train_speed(iter/s)": 0.027798 }, { "epoch": 0.45639666429633463, "grad_norm": 0.9379090666770935, "learning_rate": 4.858933864046384e-06, "loss": 0.8158811569213867, "memory(GiB)": 76.04, "step": 1765, "token_acc": 0.7703970866307165, "train_speed(iter/s)": 0.027797 }, { "epoch": 0.45768957269377464, "grad_norm": 0.9244673252105713, "learning_rate": 4.857751364357913e-06, "loss": 0.8572831153869629, "memory(GiB)": 76.04, "step": 1770, "token_acc": 0.7679937895087058, "train_speed(iter/s)": 0.027796 }, { "epoch": 0.4589824810912147, "grad_norm": 0.8768739104270935, "learning_rate": 4.856564074239467e-06, "loss": 0.8114492416381835, "memory(GiB)": 76.04, "step": 1775, "token_acc": 0.7533039647577092, "train_speed(iter/s)": 0.027797 }, { "epoch": 0.4602753894886547, "grad_norm": 1.0087144374847412, "learning_rate": 4.855371996103354e-06, "loss": 0.8448333740234375, "memory(GiB)": 76.04, "step": 1780, "token_acc": 0.7925396227993142, "train_speed(iter/s)": 0.027798 }, { "epoch": 0.46156829788609477, "grad_norm": 0.9475561380386353, "learning_rate": 4.854175132371615e-06, "loss": 0.8426584243774414, "memory(GiB)": 76.04, "step": 1785, "token_acc": 0.7877760352646972, "train_speed(iter/s)": 0.027799 }, { "epoch": 0.46286120628353483, "grad_norm": 0.8809593915939331, "learning_rate": 4.852973485476014e-06, "loss": 0.8447649002075195, "memory(GiB)": 76.04, "step": 1790, "token_acc": 0.7644977511244377, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.46415411468097484, "grad_norm": 0.9489724636077881, "learning_rate": 4.85176705785803e-06, "loss": 0.8333120346069336, "memory(GiB)": 76.04, "step": 1795, "token_acc": 0.8094422805290417, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.4654470230784149, "grad_norm": 1.0435246229171753, "learning_rate": 4.850555851968858e-06, "loss": 0.8334157943725586, "memory(GiB)": 76.04, "step": 1800, "token_acc": 0.7686591887926546, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.46673993147585496, "grad_norm": 1.1650222539901733, "learning_rate": 4.849339870269401e-06, "loss": 0.9079343795776367, "memory(GiB)": 76.04, "step": 1805, "token_acc": 0.7502884738664045, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.46803283987329497, "grad_norm": 0.9376285076141357, "learning_rate": 4.848119115230264e-06, "loss": 0.8245293617248535, "memory(GiB)": 76.04, "step": 1810, "token_acc": 0.7958735551228404, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.46932574827073503, "grad_norm": 0.9769212603569031, "learning_rate": 4.8468935893317545e-06, "loss": 0.8638315200805664, "memory(GiB)": 76.04, "step": 1815, "token_acc": 0.7886973180076629, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.47061865666817504, "grad_norm": 0.9659924507141113, "learning_rate": 4.8456632950638675e-06, "loss": 0.8185907363891601, "memory(GiB)": 76.04, "step": 1820, "token_acc": 0.7737749169435216, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.4719115650656151, "grad_norm": 0.9423291683197021, "learning_rate": 4.844428234926291e-06, "loss": 0.8167947769165039, "memory(GiB)": 76.04, "step": 1825, "token_acc": 0.7969283276450512, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.47320447346305516, "grad_norm": 0.9784870147705078, "learning_rate": 4.843188411428394e-06, "loss": 0.838237190246582, "memory(GiB)": 76.04, "step": 1830, "token_acc": 0.8140107775211701, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.47449738186049517, "grad_norm": 2.1008307933807373, "learning_rate": 4.841943827089223e-06, "loss": 0.8713891983032227, "memory(GiB)": 76.04, "step": 1835, "token_acc": 0.7783555923255723, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.47579029025793523, "grad_norm": 1.0217597484588623, "learning_rate": 4.840694484437499e-06, "loss": 0.8342850685119629, "memory(GiB)": 76.04, "step": 1840, "token_acc": 0.7716500553709856, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.47708319865537524, "grad_norm": 0.935716986656189, "learning_rate": 4.8394403860116115e-06, "loss": 0.8083118438720703, "memory(GiB)": 76.04, "step": 1845, "token_acc": 0.7871954487364472, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.4783761070528153, "grad_norm": 0.906399667263031, "learning_rate": 4.83818153435961e-06, "loss": 0.8438366889953614, "memory(GiB)": 76.04, "step": 1850, "token_acc": 0.7733610953372453, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.47966901545025536, "grad_norm": 1.9505832195281982, "learning_rate": 4.836917932039204e-06, "loss": 0.8615127563476562, "memory(GiB)": 76.04, "step": 1855, "token_acc": 0.7813404825737266, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.48096192384769537, "grad_norm": 1.1248425245285034, "learning_rate": 4.835649581617753e-06, "loss": 0.8535722732543946, "memory(GiB)": 76.04, "step": 1860, "token_acc": 0.7716059271125351, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.48225483224513543, "grad_norm": 0.9488275647163391, "learning_rate": 4.834376485672266e-06, "loss": 0.8235734939575196, "memory(GiB)": 76.04, "step": 1865, "token_acc": 0.772141609970498, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.4835477406425755, "grad_norm": 0.9314141273498535, "learning_rate": 4.833098646789393e-06, "loss": 0.825401496887207, "memory(GiB)": 76.04, "step": 1870, "token_acc": 0.802578972013111, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.4848406490400155, "grad_norm": 1.1258958578109741, "learning_rate": 4.831816067565419e-06, "loss": 0.8634084701538086, "memory(GiB)": 76.04, "step": 1875, "token_acc": 0.7933092156789617, "train_speed(iter/s)": 0.027807 }, { "epoch": 0.48613355743745557, "grad_norm": 0.8910898566246033, "learning_rate": 4.830528750606263e-06, "loss": 0.8147882461547852, "memory(GiB)": 76.04, "step": 1880, "token_acc": 0.812186275932105, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.4874264658348956, "grad_norm": 1.1478573083877563, "learning_rate": 4.829236698527469e-06, "loss": 0.8461570739746094, "memory(GiB)": 76.04, "step": 1885, "token_acc": 0.7892949615858058, "train_speed(iter/s)": 0.027802 }, { "epoch": 0.48871937423233563, "grad_norm": 0.9229076504707336, "learning_rate": 4.827939913954199e-06, "loss": 0.8387362480163574, "memory(GiB)": 76.04, "step": 1890, "token_acc": 0.8044181034482759, "train_speed(iter/s)": 0.027803 }, { "epoch": 0.4900122826297757, "grad_norm": 1.209421992301941, "learning_rate": 4.826638399521235e-06, "loss": 0.8628839492797852, "memory(GiB)": 76.04, "step": 1895, "token_acc": 0.7861487236403996, "train_speed(iter/s)": 0.027804 }, { "epoch": 0.4913051910272157, "grad_norm": 0.9494127631187439, "learning_rate": 4.825332157872966e-06, "loss": 0.8163295745849609, "memory(GiB)": 76.04, "step": 1900, "token_acc": 0.7902574714203331, "train_speed(iter/s)": 0.027806 }, { "epoch": 0.49259809942465577, "grad_norm": 1.0148690938949585, "learning_rate": 4.824021191663387e-06, "loss": 0.8092700004577636, "memory(GiB)": 76.04, "step": 1905, "token_acc": 0.7959501969388564, "train_speed(iter/s)": 0.027805 }, { "epoch": 0.49389100782209583, "grad_norm": 1.064133644104004, "learning_rate": 4.822705503556092e-06, "loss": 0.8303569793701172, "memory(GiB)": 76.04, "step": 1910, "token_acc": 0.8055975400010423, "train_speed(iter/s)": 0.027802 }, { "epoch": 0.49518391621953584, "grad_norm": 1.0159554481506348, "learning_rate": 4.821385096224268e-06, "loss": 0.8641040802001954, "memory(GiB)": 76.04, "step": 1915, "token_acc": 0.7566073149698169, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.4964768246169759, "grad_norm": 1.0026555061340332, "learning_rate": 4.820059972350691e-06, "loss": 0.8560010910034179, "memory(GiB)": 76.04, "step": 1920, "token_acc": 0.7960721112402084, "train_speed(iter/s)": 0.027801 }, { "epoch": 0.4977697330144159, "grad_norm": 0.8519271016120911, "learning_rate": 4.81873013462772e-06, "loss": 0.8016116142272949, "memory(GiB)": 76.04, "step": 1925, "token_acc": 0.7795004600129485, "train_speed(iter/s)": 0.027796 }, { "epoch": 0.49906264141185597, "grad_norm": 0.9784479141235352, "learning_rate": 4.8173955857572926e-06, "loss": 0.8383674621582031, "memory(GiB)": 76.04, "step": 1930, "token_acc": 0.7987399059366403, "train_speed(iter/s)": 0.027796 }, { "epoch": 0.500355549809296, "grad_norm": 0.958125650882721, "learning_rate": 4.816056328450916e-06, "loss": 0.8211706161499024, "memory(GiB)": 76.04, "step": 1935, "token_acc": 0.8177427201334353, "train_speed(iter/s)": 0.02779 }, { "epoch": 0.5016484582067361, "grad_norm": 1.0574296712875366, "learning_rate": 4.814712365429665e-06, "loss": 0.8358111381530762, "memory(GiB)": 76.04, "step": 1940, "token_acc": 0.773138165533256, "train_speed(iter/s)": 0.02779 }, { "epoch": 0.5029413666041761, "grad_norm": 1.152383804321289, "learning_rate": 4.813363699424176e-06, "loss": 0.8466585159301758, "memory(GiB)": 76.04, "step": 1945, "token_acc": 0.7601995890813033, "train_speed(iter/s)": 0.027789 }, { "epoch": 0.5042342750016161, "grad_norm": 0.9282750487327576, "learning_rate": 4.812010333174642e-06, "loss": 0.821980094909668, "memory(GiB)": 76.04, "step": 1950, "token_acc": 0.7666437886067261, "train_speed(iter/s)": 0.027792 }, { "epoch": 0.5055271833990562, "grad_norm": 1.0676088333129883, "learning_rate": 4.8106522694308026e-06, "loss": 0.8337220191955567, "memory(GiB)": 76.04, "step": 1955, "token_acc": 0.7935955447267664, "train_speed(iter/s)": 0.027792 }, { "epoch": 0.5068200917964962, "grad_norm": 1.028906226158142, "learning_rate": 4.809289510951943e-06, "loss": 0.8194513320922852, "memory(GiB)": 76.04, "step": 1960, "token_acc": 0.7874173098125689, "train_speed(iter/s)": 0.027794 }, { "epoch": 0.5081130001939362, "grad_norm": 1.000504732131958, "learning_rate": 4.807922060506889e-06, "loss": 0.8190900802612304, "memory(GiB)": 76.04, "step": 1965, "token_acc": 0.7874103263615237, "train_speed(iter/s)": 0.027795 }, { "epoch": 0.5094059085913764, "grad_norm": 0.9075422883033752, "learning_rate": 4.806549920873996e-06, "loss": 0.797203254699707, "memory(GiB)": 76.04, "step": 1970, "token_acc": 0.7732008028290165, "train_speed(iter/s)": 0.027794 }, { "epoch": 0.5106988169888164, "grad_norm": 1.4181991815567017, "learning_rate": 4.8051730948411505e-06, "loss": 0.795828914642334, "memory(GiB)": 76.04, "step": 1975, "token_acc": 0.8003244957409934, "train_speed(iter/s)": 0.027794 }, { "epoch": 0.5119917253862564, "grad_norm": 1.0753087997436523, "learning_rate": 4.803791585205757e-06, "loss": 0.8330776214599609, "memory(GiB)": 76.04, "step": 1980, "token_acc": 0.7414700390426073, "train_speed(iter/s)": 0.027795 }, { "epoch": 0.5132846337836964, "grad_norm": 1.0679435729980469, "learning_rate": 4.802405394774739e-06, "loss": 0.8332581520080566, "memory(GiB)": 76.04, "step": 1985, "token_acc": 0.7553387146214366, "train_speed(iter/s)": 0.027794 }, { "epoch": 0.5145775421811365, "grad_norm": 0.8826218843460083, "learning_rate": 4.801014526364531e-06, "loss": 0.7712962627410889, "memory(GiB)": 76.04, "step": 1990, "token_acc": 0.7524300269352383, "train_speed(iter/s)": 0.027795 }, { "epoch": 0.5158704505785765, "grad_norm": 3.6322293281555176, "learning_rate": 4.799618982801066e-06, "loss": 0.8304604530334473, "memory(GiB)": 76.04, "step": 1995, "token_acc": 0.8159029172079839, "train_speed(iter/s)": 0.027797 }, { "epoch": 0.5171633589760165, "grad_norm": 1.0873634815216064, "learning_rate": 4.798218766919784e-06, "loss": 0.8011078834533691, "memory(GiB)": 76.04, "step": 2000, "token_acc": 0.7605788670946689, "train_speed(iter/s)": 0.027796 }, { "epoch": 0.5184562673734566, "grad_norm": 0.9646498560905457, "learning_rate": 4.796813881565614e-06, "loss": 0.7656961441040039, "memory(GiB)": 76.04, "step": 2005, "token_acc": 0.8116391078933645, "train_speed(iter/s)": 0.027724 }, { "epoch": 0.5197491757708966, "grad_norm": 0.9246786832809448, "learning_rate": 4.795404329592971e-06, "loss": 0.7999061107635498, "memory(GiB)": 76.04, "step": 2010, "token_acc": 0.8214931011826544, "train_speed(iter/s)": 0.027723 }, { "epoch": 0.5210420841683366, "grad_norm": 0.9651414155960083, "learning_rate": 4.793990113865754e-06, "loss": 0.8470598220825195, "memory(GiB)": 76.04, "step": 2015, "token_acc": 0.7925045299862995, "train_speed(iter/s)": 0.027723 }, { "epoch": 0.5223349925657768, "grad_norm": 0.9942532777786255, "learning_rate": 4.792571237257338e-06, "loss": 0.8307376861572265, "memory(GiB)": 76.04, "step": 2020, "token_acc": 0.760911584985659, "train_speed(iter/s)": 0.027724 }, { "epoch": 0.5236279009632168, "grad_norm": 0.9892558455467224, "learning_rate": 4.7911477026505656e-06, "loss": 0.8515020370483398, "memory(GiB)": 76.04, "step": 2025, "token_acc": 0.7540479906359735, "train_speed(iter/s)": 0.027722 }, { "epoch": 0.5249208093606568, "grad_norm": 1.0707569122314453, "learning_rate": 4.789719512937745e-06, "loss": 0.8141921997070313, "memory(GiB)": 76.04, "step": 2030, "token_acc": 0.7763675366464069, "train_speed(iter/s)": 0.027721 }, { "epoch": 0.5262137177580969, "grad_norm": 0.9917581677436829, "learning_rate": 4.788286671020642e-06, "loss": 0.8206811904907226, "memory(GiB)": 76.04, "step": 2035, "token_acc": 0.7850752688172044, "train_speed(iter/s)": 0.027723 }, { "epoch": 0.5275066261555369, "grad_norm": 0.9246799945831299, "learning_rate": 4.786849179810475e-06, "loss": 0.7965336799621582, "memory(GiB)": 76.04, "step": 2040, "token_acc": 0.7653000594177065, "train_speed(iter/s)": 0.027725 }, { "epoch": 0.5287995345529769, "grad_norm": 1.071942925453186, "learning_rate": 4.78540704222791e-06, "loss": 0.8213727951049805, "memory(GiB)": 76.04, "step": 2045, "token_acc": 0.8214273371349576, "train_speed(iter/s)": 0.027722 }, { "epoch": 0.5300924429504169, "grad_norm": 1.2201160192489624, "learning_rate": 4.783960261203051e-06, "loss": 0.8097395896911621, "memory(GiB)": 76.04, "step": 2050, "token_acc": 0.7943978387601308, "train_speed(iter/s)": 0.02772 }, { "epoch": 0.531385351347857, "grad_norm": 0.9751284122467041, "learning_rate": 4.782508839675436e-06, "loss": 0.8254419326782226, "memory(GiB)": 76.04, "step": 2055, "token_acc": 0.7764489832482308, "train_speed(iter/s)": 0.027722 }, { "epoch": 0.532678259745297, "grad_norm": 1.0070680379867554, "learning_rate": 4.7810527805940344e-06, "loss": 0.8492563247680665, "memory(GiB)": 76.04, "step": 2060, "token_acc": 0.7705357535270074, "train_speed(iter/s)": 0.027723 }, { "epoch": 0.533971168142737, "grad_norm": 0.8822097182273865, "learning_rate": 4.779592086917238e-06, "loss": 0.7865631580352783, "memory(GiB)": 76.04, "step": 2065, "token_acc": 0.799327011318446, "train_speed(iter/s)": 0.027724 }, { "epoch": 0.5352640765401772, "grad_norm": 1.0193886756896973, "learning_rate": 4.77812676161285e-06, "loss": 0.8170513153076172, "memory(GiB)": 76.04, "step": 2070, "token_acc": 0.7710854546297584, "train_speed(iter/s)": 0.027726 }, { "epoch": 0.5365569849376172, "grad_norm": 0.9742515683174133, "learning_rate": 4.776656807658091e-06, "loss": 0.844205379486084, "memory(GiB)": 76.04, "step": 2075, "token_acc": 0.7571799189841154, "train_speed(iter/s)": 0.027726 }, { "epoch": 0.5378498933350572, "grad_norm": 1.2338061332702637, "learning_rate": 4.775182228039582e-06, "loss": 0.8240803718566895, "memory(GiB)": 76.04, "step": 2080, "token_acc": 0.7739548334963637, "train_speed(iter/s)": 0.027728 }, { "epoch": 0.5391428017324973, "grad_norm": 1.135621428489685, "learning_rate": 4.773703025753343e-06, "loss": 0.7704273700714112, "memory(GiB)": 76.04, "step": 2085, "token_acc": 0.8158826332629859, "train_speed(iter/s)": 0.02773 }, { "epoch": 0.5404357101299373, "grad_norm": 0.9862043261528015, "learning_rate": 4.772219203804785e-06, "loss": 0.8293350219726563, "memory(GiB)": 76.04, "step": 2090, "token_acc": 0.7778981581798483, "train_speed(iter/s)": 0.027731 }, { "epoch": 0.5417286185273773, "grad_norm": 1.0542078018188477, "learning_rate": 4.770730765208708e-06, "loss": 0.8214458465576172, "memory(GiB)": 76.04, "step": 2095, "token_acc": 0.8010532239909953, "train_speed(iter/s)": 0.027732 }, { "epoch": 0.5430215269248174, "grad_norm": 1.3685965538024902, "learning_rate": 4.76923771298929e-06, "loss": 0.7963518142700196, "memory(GiB)": 76.04, "step": 2100, "token_acc": 0.7876639186707104, "train_speed(iter/s)": 0.027734 }, { "epoch": 0.5443144353222574, "grad_norm": 0.9173294901847839, "learning_rate": 4.767740050180083e-06, "loss": 0.797146987915039, "memory(GiB)": 76.04, "step": 2105, "token_acc": 0.8026390843061946, "train_speed(iter/s)": 0.027727 }, { "epoch": 0.5456073437196974, "grad_norm": 1.0344001054763794, "learning_rate": 4.766237779824008e-06, "loss": 0.8145599365234375, "memory(GiB)": 76.04, "step": 2110, "token_acc": 0.8000528162372204, "train_speed(iter/s)": 0.027726 }, { "epoch": 0.5469002521171376, "grad_norm": 0.9387233257293701, "learning_rate": 4.764730904973345e-06, "loss": 0.8474384307861328, "memory(GiB)": 76.04, "step": 2115, "token_acc": 0.7702894841608372, "train_speed(iter/s)": 0.027726 }, { "epoch": 0.5481931605145776, "grad_norm": 0.8692566156387329, "learning_rate": 4.7632194286897315e-06, "loss": 0.8177039146423339, "memory(GiB)": 76.04, "step": 2120, "token_acc": 0.8068763457940626, "train_speed(iter/s)": 0.027729 }, { "epoch": 0.5494860689120176, "grad_norm": 1.0659557580947876, "learning_rate": 4.761703354044155e-06, "loss": 0.7883958339691162, "memory(GiB)": 76.04, "step": 2125, "token_acc": 0.800734618916437, "train_speed(iter/s)": 0.027731 }, { "epoch": 0.5507789773094576, "grad_norm": 0.9900258779525757, "learning_rate": 4.760182684116942e-06, "loss": 0.8056777954101563, "memory(GiB)": 76.04, "step": 2130, "token_acc": 0.7733108386141059, "train_speed(iter/s)": 0.027731 }, { "epoch": 0.5520718857068977, "grad_norm": 1.03944993019104, "learning_rate": 4.7586574219977585e-06, "loss": 0.8212559700012207, "memory(GiB)": 76.04, "step": 2135, "token_acc": 0.7548755884330868, "train_speed(iter/s)": 0.02773 }, { "epoch": 0.5533647941043377, "grad_norm": 0.9362234473228455, "learning_rate": 4.7571275707856e-06, "loss": 0.798857307434082, "memory(GiB)": 76.04, "step": 2140, "token_acc": 0.8130052348563085, "train_speed(iter/s)": 0.027732 }, { "epoch": 0.5546577025017777, "grad_norm": 1.0358259677886963, "learning_rate": 4.755593133588788e-06, "loss": 0.8120311737060547, "memory(GiB)": 76.04, "step": 2145, "token_acc": 0.8000494239026349, "train_speed(iter/s)": 0.027731 }, { "epoch": 0.5559506108992178, "grad_norm": 1.1722190380096436, "learning_rate": 4.754054113524959e-06, "loss": 0.8086760520935059, "memory(GiB)": 76.04, "step": 2150, "token_acc": 0.8190579981609508, "train_speed(iter/s)": 0.027731 }, { "epoch": 0.5572435192966578, "grad_norm": 0.9975719451904297, "learning_rate": 4.752510513721061e-06, "loss": 0.8197290420532226, "memory(GiB)": 76.04, "step": 2155, "token_acc": 0.7630993323892373, "train_speed(iter/s)": 0.027732 }, { "epoch": 0.5585364276940978, "grad_norm": 1.0064895153045654, "learning_rate": 4.750962337313347e-06, "loss": 0.8426996231079101, "memory(GiB)": 76.04, "step": 2160, "token_acc": 0.7553154809791978, "train_speed(iter/s)": 0.02773 }, { "epoch": 0.559829336091538, "grad_norm": 1.056726336479187, "learning_rate": 4.749409587447372e-06, "loss": 0.8352632522583008, "memory(GiB)": 76.04, "step": 2165, "token_acc": 0.8019056825243389, "train_speed(iter/s)": 0.027734 }, { "epoch": 0.561122244488978, "grad_norm": 0.9361665844917297, "learning_rate": 4.747852267277981e-06, "loss": 0.765074634552002, "memory(GiB)": 76.04, "step": 2170, "token_acc": 0.7859190721313611, "train_speed(iter/s)": 0.027733 }, { "epoch": 0.562415152886418, "grad_norm": 1.1270101070404053, "learning_rate": 4.746290379969301e-06, "loss": 0.8160411834716796, "memory(GiB)": 76.04, "step": 2175, "token_acc": 0.7946054543900145, "train_speed(iter/s)": 0.027735 }, { "epoch": 0.5637080612838581, "grad_norm": 0.957750678062439, "learning_rate": 4.744723928694745e-06, "loss": 0.8085262298583984, "memory(GiB)": 76.04, "step": 2180, "token_acc": 0.7642607683352736, "train_speed(iter/s)": 0.027735 }, { "epoch": 0.5650009696812981, "grad_norm": 1.0245423316955566, "learning_rate": 4.743152916636995e-06, "loss": 0.793109130859375, "memory(GiB)": 76.04, "step": 2185, "token_acc": 0.7618901098901099, "train_speed(iter/s)": 0.027736 }, { "epoch": 0.5662938780787381, "grad_norm": 1.0268268585205078, "learning_rate": 4.7415773469880015e-06, "loss": 0.8279844284057617, "memory(GiB)": 76.04, "step": 2190, "token_acc": 0.7590428234859334, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.5675867864761782, "grad_norm": 0.9654160737991333, "learning_rate": 4.739997222948972e-06, "loss": 0.8115758895874023, "memory(GiB)": 76.04, "step": 2195, "token_acc": 0.824989417785816, "train_speed(iter/s)": 0.027738 }, { "epoch": 0.5688796948736182, "grad_norm": 0.9180038571357727, "learning_rate": 4.738412547730371e-06, "loss": 0.7820042133331299, "memory(GiB)": 76.04, "step": 2200, "token_acc": 0.7811721577290032, "train_speed(iter/s)": 0.027738 }, { "epoch": 0.5701726032710582, "grad_norm": 0.9447706341743469, "learning_rate": 4.736823324551909e-06, "loss": 0.8502116203308105, "memory(GiB)": 76.04, "step": 2205, "token_acc": 0.7345110180295028, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.5714655116684982, "grad_norm": 1.0418199300765991, "learning_rate": 4.7352295566425355e-06, "loss": 0.7954240322113038, "memory(GiB)": 76.04, "step": 2210, "token_acc": 0.7976113712187053, "train_speed(iter/s)": 0.027736 }, { "epoch": 0.5727584200659384, "grad_norm": 2.2441470623016357, "learning_rate": 4.733631247240435e-06, "loss": 0.8036426544189453, "memory(GiB)": 76.04, "step": 2215, "token_acc": 0.7925195951601857, "train_speed(iter/s)": 0.027736 }, { "epoch": 0.5740513284633784, "grad_norm": 0.8851604461669922, "learning_rate": 4.732028399593018e-06, "loss": 0.8041337013244629, "memory(GiB)": 76.04, "step": 2220, "token_acc": 0.7804418779814211, "train_speed(iter/s)": 0.027734 }, { "epoch": 0.5753442368608184, "grad_norm": 0.897997260093689, "learning_rate": 4.730421016956919e-06, "loss": 0.7801138877868652, "memory(GiB)": 76.04, "step": 2225, "token_acc": 0.8051513959889894, "train_speed(iter/s)": 0.027732 }, { "epoch": 0.5766371452582585, "grad_norm": 3.450253486633301, "learning_rate": 4.728809102597984e-06, "loss": 0.795560646057129, "memory(GiB)": 76.04, "step": 2230, "token_acc": 0.777429320351994, "train_speed(iter/s)": 0.027732 }, { "epoch": 0.5779300536556985, "grad_norm": 1.5096064805984497, "learning_rate": 4.727192659791265e-06, "loss": 0.800804615020752, "memory(GiB)": 76.04, "step": 2235, "token_acc": 0.7972484309406044, "train_speed(iter/s)": 0.027733 }, { "epoch": 0.5792229620531385, "grad_norm": 1.0118114948272705, "learning_rate": 4.72557169182102e-06, "loss": 0.7758650302886962, "memory(GiB)": 76.04, "step": 2240, "token_acc": 0.7874528625299966, "train_speed(iter/s)": 0.027734 }, { "epoch": 0.5805158704505786, "grad_norm": 1.16028892993927, "learning_rate": 4.723946201980695e-06, "loss": 0.8420794486999512, "memory(GiB)": 76.04, "step": 2245, "token_acc": 0.7777456885881674, "train_speed(iter/s)": 0.027735 }, { "epoch": 0.5818087788480186, "grad_norm": 1.1023540496826172, "learning_rate": 4.7223161935729274e-06, "loss": 0.801850700378418, "memory(GiB)": 76.04, "step": 2250, "token_acc": 0.7952162077736624, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.5831016872454586, "grad_norm": 0.8935644626617432, "learning_rate": 4.7206816699095345e-06, "loss": 0.7811629295349121, "memory(GiB)": 76.04, "step": 2255, "token_acc": 0.789712556732224, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.5843945956428988, "grad_norm": 1.0098074674606323, "learning_rate": 4.719042634311507e-06, "loss": 0.8304760932922364, "memory(GiB)": 76.04, "step": 2260, "token_acc": 0.7755578712853498, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.5856875040403388, "grad_norm": 1.1288141012191772, "learning_rate": 4.717399090109003e-06, "loss": 0.8142587661743164, "memory(GiB)": 76.04, "step": 2265, "token_acc": 0.7781233799896319, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.5869804124377788, "grad_norm": 1.0086054801940918, "learning_rate": 4.715751040641341e-06, "loss": 0.8228842735290527, "memory(GiB)": 76.04, "step": 2270, "token_acc": 0.7793262574988463, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.5882733208352188, "grad_norm": 5.436489105224609, "learning_rate": 4.714098489256994e-06, "loss": 0.7786747932434082, "memory(GiB)": 76.04, "step": 2275, "token_acc": 0.8480059038774945, "train_speed(iter/s)": 0.027741 }, { "epoch": 0.5895662292326589, "grad_norm": 0.8497810363769531, "learning_rate": 4.712441439313583e-06, "loss": 0.7513184070587158, "memory(GiB)": 76.04, "step": 2280, "token_acc": 0.804937625403472, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.5908591376300989, "grad_norm": 1.5754011869430542, "learning_rate": 4.710779894177864e-06, "loss": 0.8058387756347656, "memory(GiB)": 76.04, "step": 2285, "token_acc": 0.7810834813499112, "train_speed(iter/s)": 0.027738 }, { "epoch": 0.5921520460275389, "grad_norm": 1.010524868965149, "learning_rate": 4.709113857225732e-06, "loss": 0.8032638549804687, "memory(GiB)": 76.04, "step": 2290, "token_acc": 0.8110142754505982, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.593444954424979, "grad_norm": 0.877875804901123, "learning_rate": 4.707443331842206e-06, "loss": 0.809267234802246, "memory(GiB)": 76.04, "step": 2295, "token_acc": 0.7685890635548269, "train_speed(iter/s)": 0.027741 }, { "epoch": 0.594737862822419, "grad_norm": 1.047855257987976, "learning_rate": 4.705768321421425e-06, "loss": 0.7906962394714355, "memory(GiB)": 76.04, "step": 2300, "token_acc": 0.7821157343031341, "train_speed(iter/s)": 0.027738 }, { "epoch": 0.596030771219859, "grad_norm": 1.188430905342102, "learning_rate": 4.704088829366638e-06, "loss": 0.8145524978637695, "memory(GiB)": 76.04, "step": 2305, "token_acc": 0.7796888204006561, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.5973236796172992, "grad_norm": 1.0411370992660522, "learning_rate": 4.702404859090204e-06, "loss": 0.7802029609680176, "memory(GiB)": 76.04, "step": 2310, "token_acc": 0.7938298768784233, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.5986165880147392, "grad_norm": 0.9956724643707275, "learning_rate": 4.700716414013577e-06, "loss": 0.7613677978515625, "memory(GiB)": 76.04, "step": 2315, "token_acc": 0.8293824550807791, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.5999094964121792, "grad_norm": 1.021669626235962, "learning_rate": 4.6990234975673065e-06, "loss": 0.7912391662597656, "memory(GiB)": 76.04, "step": 2320, "token_acc": 0.7770263788968825, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.6012024048096193, "grad_norm": 2.0476624965667725, "learning_rate": 4.697326113191024e-06, "loss": 0.8161981582641602, "memory(GiB)": 76.04, "step": 2325, "token_acc": 0.7861008259755056, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.6024953132070593, "grad_norm": 2.5752296447753906, "learning_rate": 4.695624264333438e-06, "loss": 0.7860607624053955, "memory(GiB)": 76.04, "step": 2330, "token_acc": 0.7906607543657962, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.6037882216044993, "grad_norm": 1.1529428958892822, "learning_rate": 4.6939179544523315e-06, "loss": 0.8076473236083984, "memory(GiB)": 76.04, "step": 2335, "token_acc": 0.7956367704642924, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.6050811300019394, "grad_norm": 0.9944195747375488, "learning_rate": 4.692207187014548e-06, "loss": 0.8114787101745605, "memory(GiB)": 76.04, "step": 2340, "token_acc": 0.8053776627151746, "train_speed(iter/s)": 0.027736 }, { "epoch": 0.6063740383993794, "grad_norm": 0.9465590715408325, "learning_rate": 4.690491965495989e-06, "loss": 0.7890607357025147, "memory(GiB)": 76.04, "step": 2345, "token_acc": 0.7868282075178626, "train_speed(iter/s)": 0.027736 }, { "epoch": 0.6076669467968194, "grad_norm": 1.0112555027008057, "learning_rate": 4.688772293381608e-06, "loss": 0.7973843574523926, "memory(GiB)": 76.04, "step": 2350, "token_acc": 0.7798850081524071, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.6089598551942594, "grad_norm": 1.1251353025436401, "learning_rate": 4.6870481741653965e-06, "loss": 0.8469139099121094, "memory(GiB)": 76.04, "step": 2355, "token_acc": 0.7770078088638361, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.6102527635916996, "grad_norm": 0.9500820636749268, "learning_rate": 4.685319611350384e-06, "loss": 0.8143545150756836, "memory(GiB)": 76.04, "step": 2360, "token_acc": 0.8021919497701536, "train_speed(iter/s)": 0.027741 }, { "epoch": 0.6115456719891396, "grad_norm": 1.0462709665298462, "learning_rate": 4.683586608448629e-06, "loss": 0.7490966320037842, "memory(GiB)": 76.04, "step": 2365, "token_acc": 0.8057272352698805, "train_speed(iter/s)": 0.027738 }, { "epoch": 0.6128385803865796, "grad_norm": 0.982092022895813, "learning_rate": 4.681849168981211e-06, "loss": 0.8468921661376954, "memory(GiB)": 76.04, "step": 2370, "token_acc": 0.7924534664148908, "train_speed(iter/s)": 0.02774 }, { "epoch": 0.6141314887840197, "grad_norm": 1.270372748374939, "learning_rate": 4.680107296478223e-06, "loss": 0.799936580657959, "memory(GiB)": 76.04, "step": 2375, "token_acc": 0.8000295322824763, "train_speed(iter/s)": 0.027741 }, { "epoch": 0.6154243971814597, "grad_norm": 1.3359791040420532, "learning_rate": 4.678360994478763e-06, "loss": 0.8011417388916016, "memory(GiB)": 76.04, "step": 2380, "token_acc": 0.7963584606708382, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.6167173055788997, "grad_norm": 1.0611239671707153, "learning_rate": 4.676610266530935e-06, "loss": 0.800925350189209, "memory(GiB)": 76.04, "step": 2385, "token_acc": 0.7784312845148835, "train_speed(iter/s)": 0.027735 }, { "epoch": 0.6180102139763398, "grad_norm": 0.9599133729934692, "learning_rate": 4.6748551161918285e-06, "loss": 0.7691280364990234, "memory(GiB)": 76.04, "step": 2390, "token_acc": 0.8164638974875819, "train_speed(iter/s)": 0.027734 }, { "epoch": 0.6193031223737798, "grad_norm": 1.0434238910675049, "learning_rate": 4.673095547027522e-06, "loss": 0.7575326442718506, "memory(GiB)": 76.04, "step": 2395, "token_acc": 0.8145789878142496, "train_speed(iter/s)": 0.027734 }, { "epoch": 0.6205960307712198, "grad_norm": 1.002805233001709, "learning_rate": 4.671331562613072e-06, "loss": 0.7855173110961914, "memory(GiB)": 76.04, "step": 2400, "token_acc": 0.8110472959950661, "train_speed(iter/s)": 0.027734 }, { "epoch": 0.62188893916866, "grad_norm": 0.8859378099441528, "learning_rate": 4.669563166532504e-06, "loss": 0.807244873046875, "memory(GiB)": 76.04, "step": 2405, "token_acc": 0.7864419894252676, "train_speed(iter/s)": 0.027733 }, { "epoch": 0.6231818475661, "grad_norm": 2.113131046295166, "learning_rate": 4.667790362378809e-06, "loss": 0.794129753112793, "memory(GiB)": 76.04, "step": 2410, "token_acc": 0.7970005356186395, "train_speed(iter/s)": 0.027733 }, { "epoch": 0.62447475596354, "grad_norm": 1.0956636667251587, "learning_rate": 4.6660131537539335e-06, "loss": 0.8120314598083496, "memory(GiB)": 76.04, "step": 2415, "token_acc": 0.7850858214337227, "train_speed(iter/s)": 0.027734 }, { "epoch": 0.62576766436098, "grad_norm": 2.5566296577453613, "learning_rate": 4.664231544268774e-06, "loss": 0.7688230037689209, "memory(GiB)": 76.04, "step": 2420, "token_acc": 0.7974286336892569, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.6270605727584201, "grad_norm": 0.8976960182189941, "learning_rate": 4.662445537543164e-06, "loss": 0.8087752342224122, "memory(GiB)": 76.04, "step": 2425, "token_acc": 0.7868685635201693, "train_speed(iter/s)": 0.027735 }, { "epoch": 0.6283534811558601, "grad_norm": 1.0024232864379883, "learning_rate": 4.660655137205878e-06, "loss": 0.7957705020904541, "memory(GiB)": 76.04, "step": 2430, "token_acc": 0.7706113070005151, "train_speed(iter/s)": 0.027736 }, { "epoch": 0.6296463895533001, "grad_norm": 1.0616440773010254, "learning_rate": 4.658860346894613e-06, "loss": 0.7973846912384033, "memory(GiB)": 76.04, "step": 2435, "token_acc": 0.8036959869553402, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.6309392979507402, "grad_norm": 1.0026406049728394, "learning_rate": 4.6570611702559854e-06, "loss": 0.8205162048339844, "memory(GiB)": 76.04, "step": 2440, "token_acc": 0.7975911152823401, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.6322322063481802, "grad_norm": 0.9040783047676086, "learning_rate": 4.655257610945526e-06, "loss": 0.8040790557861328, "memory(GiB)": 76.04, "step": 2445, "token_acc": 0.8114757319709177, "train_speed(iter/s)": 0.027735 }, { "epoch": 0.6335251147456202, "grad_norm": 1.0662907361984253, "learning_rate": 4.653449672627669e-06, "loss": 0.7849656105041504, "memory(GiB)": 76.04, "step": 2450, "token_acc": 0.8061563270726617, "train_speed(iter/s)": 0.027735 }, { "epoch": 0.6348180231430604, "grad_norm": 1.0695264339447021, "learning_rate": 4.6516373589757445e-06, "loss": 0.7940691947937012, "memory(GiB)": 76.04, "step": 2455, "token_acc": 0.7807667525773195, "train_speed(iter/s)": 0.027737 }, { "epoch": 0.6361109315405004, "grad_norm": 1.1556239128112793, "learning_rate": 4.649820673671976e-06, "loss": 0.7685293197631836, "memory(GiB)": 76.04, "step": 2460, "token_acc": 0.7840851495184997, "train_speed(iter/s)": 0.027739 }, { "epoch": 0.6374038399379404, "grad_norm": 2.466895580291748, "learning_rate": 4.647999620407463e-06, "loss": 0.7619011878967286, "memory(GiB)": 76.04, "step": 2465, "token_acc": 0.7804016362960208, "train_speed(iter/s)": 0.02774 }, { "epoch": 0.6386967483353805, "grad_norm": 1.1291913986206055, "learning_rate": 4.646174202882186e-06, "loss": 0.8165172576904297, "memory(GiB)": 76.04, "step": 2470, "token_acc": 0.7608570606844981, "train_speed(iter/s)": 0.027742 }, { "epoch": 0.6399896567328205, "grad_norm": 1.1947365999221802, "learning_rate": 4.64434442480499e-06, "loss": 0.7749819755554199, "memory(GiB)": 76.04, "step": 2475, "token_acc": 0.7708522212148685, "train_speed(iter/s)": 0.027742 }, { "epoch": 0.6412825651302605, "grad_norm": 1.0024884939193726, "learning_rate": 4.64251028989358e-06, "loss": 0.766645097732544, "memory(GiB)": 76.04, "step": 2480, "token_acc": 0.7914130613587761, "train_speed(iter/s)": 0.027743 }, { "epoch": 0.6425754735277006, "grad_norm": 0.9784958362579346, "learning_rate": 4.640671801874512e-06, "loss": 0.8136966705322266, "memory(GiB)": 76.04, "step": 2485, "token_acc": 0.7942760819377771, "train_speed(iter/s)": 0.027746 }, { "epoch": 0.6438683819251406, "grad_norm": 0.8597215414047241, "learning_rate": 4.638828964483188e-06, "loss": 0.775879955291748, "memory(GiB)": 76.04, "step": 2490, "token_acc": 0.7876452918897741, "train_speed(iter/s)": 0.027745 }, { "epoch": 0.6451612903225806, "grad_norm": 1.1758781671524048, "learning_rate": 4.636981781463848e-06, "loss": 0.8091221809387207, "memory(GiB)": 76.04, "step": 2495, "token_acc": 0.8069754035357417, "train_speed(iter/s)": 0.027745 }, { "epoch": 0.6464541987200206, "grad_norm": 0.9592023491859436, "learning_rate": 4.635130256569558e-06, "loss": 0.7946199417114258, "memory(GiB)": 76.04, "step": 2500, "token_acc": 0.7830649234049717, "train_speed(iter/s)": 0.027746 }, { "epoch": 0.6477471071174608, "grad_norm": 1.495296835899353, "learning_rate": 4.633274393562208e-06, "loss": 0.7667324542999268, "memory(GiB)": 76.04, "step": 2505, "token_acc": 0.8036371800628649, "train_speed(iter/s)": 0.027748 }, { "epoch": 0.6490400155149008, "grad_norm": 1.0845485925674438, "learning_rate": 4.631414196212502e-06, "loss": 0.774350357055664, "memory(GiB)": 76.04, "step": 2510, "token_acc": 0.7877581120943953, "train_speed(iter/s)": 0.02775 }, { "epoch": 0.6503329239123408, "grad_norm": 0.9458225965499878, "learning_rate": 4.629549668299949e-06, "loss": 0.7802841186523437, "memory(GiB)": 76.04, "step": 2515, "token_acc": 0.7762283711761699, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6516258323097809, "grad_norm": 1.0014280080795288, "learning_rate": 4.62768081361286e-06, "loss": 0.7994625568389893, "memory(GiB)": 76.04, "step": 2520, "token_acc": 0.8127975163849603, "train_speed(iter/s)": 0.027749 }, { "epoch": 0.6529187407072209, "grad_norm": 1.5184024572372437, "learning_rate": 4.6258076359483335e-06, "loss": 0.7841564655303955, "memory(GiB)": 76.04, "step": 2525, "token_acc": 0.8111151834205178, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6542116491046609, "grad_norm": 1.1411337852478027, "learning_rate": 4.623930139112252e-06, "loss": 0.7719697952270508, "memory(GiB)": 76.04, "step": 2530, "token_acc": 0.7725351785631357, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.655504557502101, "grad_norm": 1.3554903268814087, "learning_rate": 4.622048326919277e-06, "loss": 0.7868958950042725, "memory(GiB)": 76.04, "step": 2535, "token_acc": 0.7877291008718654, "train_speed(iter/s)": 0.027749 }, { "epoch": 0.656797465899541, "grad_norm": 1.3750821352005005, "learning_rate": 4.620162203192833e-06, "loss": 0.7791455268859864, "memory(GiB)": 76.04, "step": 2540, "token_acc": 0.7791341738940311, "train_speed(iter/s)": 0.02775 }, { "epoch": 0.658090374296981, "grad_norm": 1.1238117218017578, "learning_rate": 4.618271771765108e-06, "loss": 0.7734639644622803, "memory(GiB)": 76.04, "step": 2545, "token_acc": 0.7830758898589657, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.6593832826944211, "grad_norm": 1.0922011137008667, "learning_rate": 4.616377036477039e-06, "loss": 0.769841194152832, "memory(GiB)": 76.04, "step": 2550, "token_acc": 0.7772533671002647, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.6606761910918612, "grad_norm": 1.0475714206695557, "learning_rate": 4.614478001178312e-06, "loss": 0.7945080280303956, "memory(GiB)": 76.04, "step": 2555, "token_acc": 0.7906106546310226, "train_speed(iter/s)": 0.027752 }, { "epoch": 0.6619690994893012, "grad_norm": 1.1444096565246582, "learning_rate": 4.612574669727346e-06, "loss": 0.7711798667907714, "memory(GiB)": 76.04, "step": 2560, "token_acc": 0.7995795091578054, "train_speed(iter/s)": 0.027752 }, { "epoch": 0.6632620078867413, "grad_norm": 1.4287755489349365, "learning_rate": 4.6106670459912915e-06, "loss": 0.794065284729004, "memory(GiB)": 76.04, "step": 2565, "token_acc": 0.7696101905947706, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6645549162841813, "grad_norm": 1.3806992769241333, "learning_rate": 4.608755133846017e-06, "loss": 0.8211702346801758, "memory(GiB)": 76.04, "step": 2570, "token_acc": 0.80044866626941, "train_speed(iter/s)": 0.027752 }, { "epoch": 0.6658478246816213, "grad_norm": 0.9568463563919067, "learning_rate": 4.6068389371761055e-06, "loss": 0.7481316566467285, "memory(GiB)": 76.04, "step": 2575, "token_acc": 0.8280512901693842, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6671407330790613, "grad_norm": 1.2518895864486694, "learning_rate": 4.604918459874846e-06, "loss": 0.7877891540527344, "memory(GiB)": 76.04, "step": 2580, "token_acc": 0.8081138790035587, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6684336414765014, "grad_norm": 1.919282078742981, "learning_rate": 4.602993705844225e-06, "loss": 0.7748439311981201, "memory(GiB)": 76.04, "step": 2585, "token_acc": 0.8042981252857796, "train_speed(iter/s)": 0.02775 }, { "epoch": 0.6697265498739414, "grad_norm": 1.1794474124908447, "learning_rate": 4.601064678994916e-06, "loss": 0.7562169075012207, "memory(GiB)": 76.04, "step": 2590, "token_acc": 0.7905587888470463, "train_speed(iter/s)": 0.027747 }, { "epoch": 0.6710194582713814, "grad_norm": 0.9287105798721313, "learning_rate": 4.599131383246277e-06, "loss": 0.7767970085144043, "memory(GiB)": 76.04, "step": 2595, "token_acc": 0.7871586083297619, "train_speed(iter/s)": 0.027748 }, { "epoch": 0.6723123666688215, "grad_norm": 1.4129362106323242, "learning_rate": 4.5971938225263366e-06, "loss": 0.7788604736328125, "memory(GiB)": 76.04, "step": 2600, "token_acc": 0.810065880876619, "train_speed(iter/s)": 0.027748 }, { "epoch": 0.6736052750662616, "grad_norm": 1.1094108819961548, "learning_rate": 4.59525200077179e-06, "loss": 0.7465203285217286, "memory(GiB)": 76.04, "step": 2605, "token_acc": 0.8041896446078431, "train_speed(iter/s)": 0.027748 }, { "epoch": 0.6748981834637016, "grad_norm": 1.05765962600708, "learning_rate": 4.593305921927992e-06, "loss": 0.7598991394042969, "memory(GiB)": 76.04, "step": 2610, "token_acc": 0.8296476919196166, "train_speed(iter/s)": 0.027747 }, { "epoch": 0.6761910918611417, "grad_norm": 1.0570799112319946, "learning_rate": 4.591355589948943e-06, "loss": 0.7356798648834229, "memory(GiB)": 76.04, "step": 2615, "token_acc": 0.7747376064426695, "train_speed(iter/s)": 0.027745 }, { "epoch": 0.6774840002585817, "grad_norm": 1.726942777633667, "learning_rate": 4.589401008797288e-06, "loss": 0.7580029487609863, "memory(GiB)": 76.04, "step": 2620, "token_acc": 0.7843151506341535, "train_speed(iter/s)": 0.027745 }, { "epoch": 0.6787769086560217, "grad_norm": 1.046608805656433, "learning_rate": 4.587442182444303e-06, "loss": 0.7981472969055176, "memory(GiB)": 76.04, "step": 2625, "token_acc": 0.8134403515732291, "train_speed(iter/s)": 0.027744 }, { "epoch": 0.6800698170534618, "grad_norm": 1.075890302658081, "learning_rate": 4.585479114869892e-06, "loss": 0.7996755599975586, "memory(GiB)": 76.04, "step": 2630, "token_acc": 0.7618249365712214, "train_speed(iter/s)": 0.027747 }, { "epoch": 0.6813627254509018, "grad_norm": 1.182303786277771, "learning_rate": 4.583511810062573e-06, "loss": 0.7393967628479003, "memory(GiB)": 76.04, "step": 2635, "token_acc": 0.7840963855421687, "train_speed(iter/s)": 0.027749 }, { "epoch": 0.6826556338483418, "grad_norm": 0.9905603528022766, "learning_rate": 4.581540272019476e-06, "loss": 0.7551537036895752, "memory(GiB)": 76.04, "step": 2640, "token_acc": 0.804885036888475, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6839485422457818, "grad_norm": 0.9618648290634155, "learning_rate": 4.579564504746331e-06, "loss": 0.7748908996582031, "memory(GiB)": 76.04, "step": 2645, "token_acc": 0.8088857158547971, "train_speed(iter/s)": 0.02775 }, { "epoch": 0.685241450643222, "grad_norm": 1.2999211549758911, "learning_rate": 4.577584512257459e-06, "loss": 0.7771445274353027, "memory(GiB)": 76.04, "step": 2650, "token_acc": 0.8316016931592813, "train_speed(iter/s)": 0.027747 }, { "epoch": 0.686534359040662, "grad_norm": 0.9438580274581909, "learning_rate": 4.57560029857577e-06, "loss": 0.7551321983337402, "memory(GiB)": 76.04, "step": 2655, "token_acc": 0.7968830005120328, "train_speed(iter/s)": 0.027747 }, { "epoch": 0.687827267438102, "grad_norm": 1.2633525133132935, "learning_rate": 4.573611867732746e-06, "loss": 0.750664758682251, "memory(GiB)": 76.04, "step": 2660, "token_acc": 0.7704320666319625, "train_speed(iter/s)": 0.027748 }, { "epoch": 0.6891201758355421, "grad_norm": 1.7194573879241943, "learning_rate": 4.571619223768439e-06, "loss": 0.7772263526916504, "memory(GiB)": 76.04, "step": 2665, "token_acc": 0.7634119583104773, "train_speed(iter/s)": 0.02775 }, { "epoch": 0.6904130842329821, "grad_norm": 1.2169469594955444, "learning_rate": 4.569622370731463e-06, "loss": 0.7446264743804931, "memory(GiB)": 76.04, "step": 2670, "token_acc": 0.7990216722278014, "train_speed(iter/s)": 0.027749 }, { "epoch": 0.6917059926304221, "grad_norm": 1.146213173866272, "learning_rate": 4.56762131267898e-06, "loss": 0.7797055244445801, "memory(GiB)": 76.04, "step": 2675, "token_acc": 0.7709560205488034, "train_speed(iter/s)": 0.02775 }, { "epoch": 0.6929989010278622, "grad_norm": 6.729126930236816, "learning_rate": 4.565616053676701e-06, "loss": 0.7762058258056641, "memory(GiB)": 76.04, "step": 2680, "token_acc": 0.8343838296022604, "train_speed(iter/s)": 0.02775 }, { "epoch": 0.6942918094253022, "grad_norm": 1.7651880979537964, "learning_rate": 4.563606597798866e-06, "loss": 0.8064382553100586, "memory(GiB)": 76.04, "step": 2685, "token_acc": 0.7710679099225898, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6955847178227422, "grad_norm": 1.7482510805130005, "learning_rate": 4.561592949128249e-06, "loss": 0.7633975505828857, "memory(GiB)": 76.04, "step": 2690, "token_acc": 0.7979380661789789, "train_speed(iter/s)": 0.027752 }, { "epoch": 0.6968776262201823, "grad_norm": 1.2659438848495483, "learning_rate": 4.5595751117561365e-06, "loss": 0.7893208503723145, "memory(GiB)": 76.04, "step": 2695, "token_acc": 0.8003590821509897, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6981705346176224, "grad_norm": 1.2541935443878174, "learning_rate": 4.5575530897823296e-06, "loss": 0.7760859489440918, "memory(GiB)": 76.04, "step": 2700, "token_acc": 0.7648711490021314, "train_speed(iter/s)": 0.027751 }, { "epoch": 0.6994634430150624, "grad_norm": 1.4929347038269043, "learning_rate": 4.55552688731513e-06, "loss": 0.7721807479858398, "memory(GiB)": 76.04, "step": 2705, "token_acc": 0.7744538013073435, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7007563514125025, "grad_norm": 1.3372719287872314, "learning_rate": 4.553496508471333e-06, "loss": 0.7598706245422363, "memory(GiB)": 76.04, "step": 2710, "token_acc": 0.7882575476596692, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7020492598099425, "grad_norm": 1.0163182020187378, "learning_rate": 4.551461957376221e-06, "loss": 0.7641387939453125, "memory(GiB)": 76.04, "step": 2715, "token_acc": 0.8151145642243085, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7033421682073825, "grad_norm": 2.0491156578063965, "learning_rate": 4.5494232381635526e-06, "loss": 0.7833964347839355, "memory(GiB)": 76.04, "step": 2720, "token_acc": 0.795193260654113, "train_speed(iter/s)": 0.027756 }, { "epoch": 0.7046350766048225, "grad_norm": 1.0847963094711304, "learning_rate": 4.547380354975554e-06, "loss": 0.774288558959961, "memory(GiB)": 76.04, "step": 2725, "token_acc": 0.7972633104565412, "train_speed(iter/s)": 0.027757 }, { "epoch": 0.7059279850022626, "grad_norm": 0.9379494190216064, "learning_rate": 4.545333311962912e-06, "loss": 0.7845103740692139, "memory(GiB)": 76.04, "step": 2730, "token_acc": 0.7804776566530748, "train_speed(iter/s)": 0.027756 }, { "epoch": 0.7072208933997026, "grad_norm": 0.9910460114479065, "learning_rate": 4.543282113284767e-06, "loss": 0.7749279022216797, "memory(GiB)": 76.04, "step": 2735, "token_acc": 0.7755603122639134, "train_speed(iter/s)": 0.027758 }, { "epoch": 0.7085138017971426, "grad_norm": 0.8512127995491028, "learning_rate": 4.541226763108702e-06, "loss": 0.750948715209961, "memory(GiB)": 76.04, "step": 2740, "token_acc": 0.804368820418487, "train_speed(iter/s)": 0.027757 }, { "epoch": 0.7098067101945827, "grad_norm": 2.1939456462860107, "learning_rate": 4.5391672656107335e-06, "loss": 0.7639683723449707, "memory(GiB)": 76.04, "step": 2745, "token_acc": 0.8181778169014085, "train_speed(iter/s)": 0.027758 }, { "epoch": 0.7110996185920228, "grad_norm": 1.079122543334961, "learning_rate": 4.537103624975306e-06, "loss": 0.7661020278930664, "memory(GiB)": 76.04, "step": 2750, "token_acc": 0.7944695989650712, "train_speed(iter/s)": 0.027758 }, { "epoch": 0.7123925269894628, "grad_norm": 1.3096694946289062, "learning_rate": 4.53503584539528e-06, "loss": 0.7214805603027343, "memory(GiB)": 76.04, "step": 2755, "token_acc": 0.7952853160179271, "train_speed(iter/s)": 0.027758 }, { "epoch": 0.7136854353869029, "grad_norm": 1.1697825193405151, "learning_rate": 4.532963931071929e-06, "loss": 0.7563837051391602, "memory(GiB)": 76.04, "step": 2760, "token_acc": 0.7784021071115013, "train_speed(iter/s)": 0.027758 }, { "epoch": 0.7149783437843429, "grad_norm": 0.9587258100509644, "learning_rate": 4.530887886214925e-06, "loss": 0.7307098388671875, "memory(GiB)": 76.04, "step": 2765, "token_acc": 0.8118209311876937, "train_speed(iter/s)": 0.027757 }, { "epoch": 0.7162712521817829, "grad_norm": 1.2170313596725464, "learning_rate": 4.528807715042333e-06, "loss": 0.7652310371398926, "memory(GiB)": 76.04, "step": 2770, "token_acc": 0.8206977655821247, "train_speed(iter/s)": 0.027758 }, { "epoch": 0.717564160579223, "grad_norm": 1.1587222814559937, "learning_rate": 4.526723421780598e-06, "loss": 0.757373857498169, "memory(GiB)": 76.04, "step": 2775, "token_acc": 0.8355521801286633, "train_speed(iter/s)": 0.027758 }, { "epoch": 0.718857068976663, "grad_norm": 1.151134967803955, "learning_rate": 4.524635010664547e-06, "loss": 0.7718755722045898, "memory(GiB)": 76.04, "step": 2780, "token_acc": 0.8152306441780126, "train_speed(iter/s)": 0.027757 }, { "epoch": 0.720149977374103, "grad_norm": 1.1560102701187134, "learning_rate": 4.522542485937369e-06, "loss": 0.7426802635192871, "memory(GiB)": 76.04, "step": 2785, "token_acc": 0.806016436656846, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7214428857715431, "grad_norm": 0.993427038192749, "learning_rate": 4.520445851850612e-06, "loss": 0.7491902828216552, "memory(GiB)": 76.04, "step": 2790, "token_acc": 0.8148384523334663, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7227357941689831, "grad_norm": 0.9622454047203064, "learning_rate": 4.518345112664173e-06, "loss": 0.731049919128418, "memory(GiB)": 76.04, "step": 2795, "token_acc": 0.8307215380677455, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7240287025664232, "grad_norm": 1.0693981647491455, "learning_rate": 4.516240272646291e-06, "loss": 0.7997897148132325, "memory(GiB)": 76.04, "step": 2800, "token_acc": 0.7474579404695877, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7253216109638632, "grad_norm": 0.9485954642295837, "learning_rate": 4.514131336073534e-06, "loss": 0.76673583984375, "memory(GiB)": 76.04, "step": 2805, "token_acc": 0.7821131082858396, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7266145193613033, "grad_norm": 1.123063564300537, "learning_rate": 4.512018307230798e-06, "loss": 0.7704802036285401, "memory(GiB)": 76.04, "step": 2810, "token_acc": 0.7895082445644244, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7279074277587433, "grad_norm": 1.4126653671264648, "learning_rate": 4.509901190411289e-06, "loss": 0.7815113544464112, "memory(GiB)": 76.04, "step": 2815, "token_acc": 0.8011522700531505, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7292003361561833, "grad_norm": 1.1078338623046875, "learning_rate": 4.5077799899165206e-06, "loss": 0.7516324996948243, "memory(GiB)": 76.04, "step": 2820, "token_acc": 0.7875029811590747, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7304932445536234, "grad_norm": 1.1581236124038696, "learning_rate": 4.505654710056305e-06, "loss": 0.7554468154907227, "memory(GiB)": 76.04, "step": 2825, "token_acc": 0.7982930298719773, "train_speed(iter/s)": 0.027752 }, { "epoch": 0.7317861529510634, "grad_norm": 0.9877261519432068, "learning_rate": 4.50352535514874e-06, "loss": 0.7270550727844238, "memory(GiB)": 76.04, "step": 2830, "token_acc": 0.8090806830964311, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7330790613485034, "grad_norm": 1.0771080255508423, "learning_rate": 4.501391929520206e-06, "loss": 0.7520308494567871, "memory(GiB)": 76.04, "step": 2835, "token_acc": 0.7689856611789697, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7343719697459435, "grad_norm": 1.3513661623001099, "learning_rate": 4.499254437505351e-06, "loss": 0.7171365737915039, "memory(GiB)": 76.04, "step": 2840, "token_acc": 0.813343427029162, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7356648781433835, "grad_norm": 1.1246927976608276, "learning_rate": 4.497112883447088e-06, "loss": 0.7306987762451171, "memory(GiB)": 76.04, "step": 2845, "token_acc": 0.8194618966664203, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7369577865408236, "grad_norm": 1.2061104774475098, "learning_rate": 4.494967271696581e-06, "loss": 0.787189531326294, "memory(GiB)": 76.04, "step": 2850, "token_acc": 0.7943105778422388, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7382506949382637, "grad_norm": 1.228200078010559, "learning_rate": 4.492817606613239e-06, "loss": 0.736682653427124, "memory(GiB)": 76.04, "step": 2855, "token_acc": 0.8220771643206185, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7395436033357037, "grad_norm": 1.1733845472335815, "learning_rate": 4.4906638925647075e-06, "loss": 0.7503646850585938, "memory(GiB)": 76.04, "step": 2860, "token_acc": 0.7979779479101798, "train_speed(iter/s)": 0.027756 }, { "epoch": 0.7408365117331437, "grad_norm": 1.2325780391693115, "learning_rate": 4.488506133926857e-06, "loss": 0.7381996154785156, "memory(GiB)": 76.04, "step": 2865, "token_acc": 0.7863309352517985, "train_speed(iter/s)": 0.027757 }, { "epoch": 0.7421294201305837, "grad_norm": 1.1675026416778564, "learning_rate": 4.486344335083775e-06, "loss": 0.7488877296447753, "memory(GiB)": 76.04, "step": 2870, "token_acc": 0.797289709130386, "train_speed(iter/s)": 0.027757 }, { "epoch": 0.7434223285280238, "grad_norm": 1.6887255907058716, "learning_rate": 4.484178500427762e-06, "loss": 0.7432705402374268, "memory(GiB)": 76.04, "step": 2875, "token_acc": 0.805889321374175, "train_speed(iter/s)": 0.027756 }, { "epoch": 0.7447152369254638, "grad_norm": 1.2886244058609009, "learning_rate": 4.482008634359316e-06, "loss": 0.7218676567077636, "memory(GiB)": 76.04, "step": 2880, "token_acc": 0.8163206292290787, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7460081453229038, "grad_norm": 1.7008750438690186, "learning_rate": 4.4798347412871226e-06, "loss": 0.7312119960784912, "memory(GiB)": 76.04, "step": 2885, "token_acc": 0.8356855218094915, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7473010537203439, "grad_norm": 1.5202350616455078, "learning_rate": 4.477656825628054e-06, "loss": 0.7271114349365234, "memory(GiB)": 76.04, "step": 2890, "token_acc": 0.8097763430943048, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.748593962117784, "grad_norm": 1.6034634113311768, "learning_rate": 4.475474891807153e-06, "loss": 0.6789961814880371, "memory(GiB)": 76.04, "step": 2895, "token_acc": 0.78770261615017, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.749886870515224, "grad_norm": 1.1834633350372314, "learning_rate": 4.473288944257627e-06, "loss": 0.712617301940918, "memory(GiB)": 76.04, "step": 2900, "token_acc": 0.7983367123174314, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7511797789126641, "grad_norm": 2.4413537979125977, "learning_rate": 4.471098987420841e-06, "loss": 0.7433537483215332, "memory(GiB)": 76.04, "step": 2905, "token_acc": 0.8024606971975393, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7524726873101041, "grad_norm": 1.2915472984313965, "learning_rate": 4.468905025746301e-06, "loss": 0.7077127456665039, "memory(GiB)": 76.04, "step": 2910, "token_acc": 0.8141985793699815, "train_speed(iter/s)": 0.027756 }, { "epoch": 0.7537655957075441, "grad_norm": 1.2215969562530518, "learning_rate": 4.466707063691653e-06, "loss": 0.7059410095214844, "memory(GiB)": 76.04, "step": 2915, "token_acc": 0.7902067464635474, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7550585041049842, "grad_norm": 1.0937923192977905, "learning_rate": 4.464505105722672e-06, "loss": 0.7048573017120361, "memory(GiB)": 76.04, "step": 2920, "token_acc": 0.7998023436397007, "train_speed(iter/s)": 0.027752 }, { "epoch": 0.7563514125024242, "grad_norm": 1.2312453985214233, "learning_rate": 4.4622991563132475e-06, "loss": 0.6955265045166016, "memory(GiB)": 76.04, "step": 2925, "token_acc": 0.808813281410125, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7576443208998642, "grad_norm": 1.7371655702590942, "learning_rate": 4.460089219945383e-06, "loss": 0.6832226276397705, "memory(GiB)": 76.04, "step": 2930, "token_acc": 0.8051185818094706, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7589372292973043, "grad_norm": 1.2064937353134155, "learning_rate": 4.457875301109181e-06, "loss": 0.6924856662750244, "memory(GiB)": 76.04, "step": 2935, "token_acc": 0.8090518665345227, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7602301376947443, "grad_norm": 1.9841270446777344, "learning_rate": 4.455657404302836e-06, "loss": 0.6858362197875977, "memory(GiB)": 76.04, "step": 2940, "token_acc": 0.8241852487135506, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7615230460921844, "grad_norm": 1.5064440965652466, "learning_rate": 4.4534355340326226e-06, "loss": 0.6784512519836425, "memory(GiB)": 76.04, "step": 2945, "token_acc": 0.8192387024189012, "train_speed(iter/s)": 0.027752 }, { "epoch": 0.7628159544896244, "grad_norm": 1.823947548866272, "learning_rate": 4.451209694812893e-06, "loss": 0.6957567214965821, "memory(GiB)": 76.04, "step": 2950, "token_acc": 0.8207297541953903, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7641088628870645, "grad_norm": 1.7657442092895508, "learning_rate": 4.448979891166059e-06, "loss": 0.7199502944946289, "memory(GiB)": 76.04, "step": 2955, "token_acc": 0.8217210270645385, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7654017712845045, "grad_norm": 1.6712024211883545, "learning_rate": 4.44674612762259e-06, "loss": 0.700252914428711, "memory(GiB)": 76.04, "step": 2960, "token_acc": 0.8259719184364637, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7666946796819445, "grad_norm": 1.6742796897888184, "learning_rate": 4.444508408720999e-06, "loss": 0.7040081977844238, "memory(GiB)": 76.04, "step": 2965, "token_acc": 0.8206386483928634, "train_speed(iter/s)": 0.027753 }, { "epoch": 0.7679875880793846, "grad_norm": 1.0268195867538452, "learning_rate": 4.442266739007838e-06, "loss": 0.725772476196289, "memory(GiB)": 76.04, "step": 2970, "token_acc": 0.7764441447516296, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7692804964768246, "grad_norm": 1.271381139755249, "learning_rate": 4.440021123037683e-06, "loss": 0.7173772335052491, "memory(GiB)": 76.04, "step": 2975, "token_acc": 0.8247627142654766, "train_speed(iter/s)": 0.027754 }, { "epoch": 0.7705734048742646, "grad_norm": 1.742287039756775, "learning_rate": 4.437771565373131e-06, "loss": 0.6777096748352051, "memory(GiB)": 76.04, "step": 2980, "token_acc": 0.8065741348588578, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7718663132717047, "grad_norm": 1.113531470298767, "learning_rate": 4.4355180705847854e-06, "loss": 0.6992631912231445, "memory(GiB)": 76.04, "step": 2985, "token_acc": 0.8095548168203159, "train_speed(iter/s)": 0.027755 }, { "epoch": 0.7731592216691447, "grad_norm": 1.5363075733184814, "learning_rate": 4.43326064325125e-06, "loss": 0.6818428993225097, "memory(GiB)": 76.04, "step": 2990, "token_acc": 0.8093805660003958, "train_speed(iter/s)": 0.027756 }, { "epoch": 0.7744521300665848, "grad_norm": 1.2914507389068604, "learning_rate": 4.43099928795912e-06, "loss": 0.6791769027709961, "memory(GiB)": 76.04, "step": 2995, "token_acc": 0.7912520619379556, "train_speed(iter/s)": 0.027756 }, { "epoch": 0.7757450384640249, "grad_norm": 1.2839219570159912, "learning_rate": 4.428734009302968e-06, "loss": 0.6807722091674805, "memory(GiB)": 76.04, "step": 3000, "token_acc": 0.8037732367729139, "train_speed(iter/s)": 0.027757 }, { "epoch": 0.7770379468614649, "grad_norm": 1.6240931749343872, "learning_rate": 4.42646481188534e-06, "loss": 0.6738556861877442, "memory(GiB)": 76.04, "step": 3005, "token_acc": 0.8336527405136067, "train_speed(iter/s)": 0.02771 }, { "epoch": 0.7783308552589049, "grad_norm": 2.119504690170288, "learning_rate": 4.424191700316745e-06, "loss": 0.7054489135742188, "memory(GiB)": 76.04, "step": 3010, "token_acc": 0.8144756176741961, "train_speed(iter/s)": 0.02771 }, { "epoch": 0.7796237636563449, "grad_norm": 1.3605269193649292, "learning_rate": 4.421914679215643e-06, "loss": 0.6763367652893066, "memory(GiB)": 76.04, "step": 3015, "token_acc": 0.832611100866679, "train_speed(iter/s)": 0.02771 }, { "epoch": 0.780916672053785, "grad_norm": 1.5182582139968872, "learning_rate": 4.419633753208438e-06, "loss": 0.6742976188659668, "memory(GiB)": 76.04, "step": 3020, "token_acc": 0.8088533082175653, "train_speed(iter/s)": 0.027711 }, { "epoch": 0.782209580451225, "grad_norm": 1.180389404296875, "learning_rate": 4.417348926929467e-06, "loss": 0.6577554702758789, "memory(GiB)": 76.04, "step": 3025, "token_acc": 0.7916683734076106, "train_speed(iter/s)": 0.027712 }, { "epoch": 0.783502488848665, "grad_norm": 1.0676547288894653, "learning_rate": 4.4150602050209935e-06, "loss": 0.6725570678710937, "memory(GiB)": 76.04, "step": 3030, "token_acc": 0.8131301520575388, "train_speed(iter/s)": 0.027712 }, { "epoch": 0.7847953972461051, "grad_norm": 1.066395878791809, "learning_rate": 4.412767592133195e-06, "loss": 0.6555842399597168, "memory(GiB)": 76.04, "step": 3035, "token_acc": 0.8660530809527944, "train_speed(iter/s)": 0.027711 }, { "epoch": 0.7860883056435451, "grad_norm": 5.532017230987549, "learning_rate": 4.410471092924154e-06, "loss": 0.6637729167938232, "memory(GiB)": 76.04, "step": 3040, "token_acc": 0.814479006834984, "train_speed(iter/s)": 0.02771 }, { "epoch": 0.7873812140409852, "grad_norm": 1.9172098636627197, "learning_rate": 4.408170712059848e-06, "loss": 0.706690502166748, "memory(GiB)": 76.04, "step": 3045, "token_acc": 0.7951164898437917, "train_speed(iter/s)": 0.027708 }, { "epoch": 0.7886741224384253, "grad_norm": 2.5375490188598633, "learning_rate": 4.405866454214145e-06, "loss": 0.6923388481140137, "memory(GiB)": 76.04, "step": 3050, "token_acc": 0.7856790394210209, "train_speed(iter/s)": 0.027708 }, { "epoch": 0.7899670308358653, "grad_norm": 1.3066571950912476, "learning_rate": 4.403558324068787e-06, "loss": 0.6584675788879395, "memory(GiB)": 76.04, "step": 3055, "token_acc": 0.8082852648138438, "train_speed(iter/s)": 0.027709 }, { "epoch": 0.7912599392333053, "grad_norm": 1.8484247922897339, "learning_rate": 4.401246326313386e-06, "loss": 0.6835250854492188, "memory(GiB)": 76.04, "step": 3060, "token_acc": 0.8032010726107177, "train_speed(iter/s)": 0.02771 }, { "epoch": 0.7925528476307454, "grad_norm": 1.7470216751098633, "learning_rate": 4.398930465645409e-06, "loss": 0.6875529289245605, "memory(GiB)": 76.04, "step": 3065, "token_acc": 0.8029278650053081, "train_speed(iter/s)": 0.02771 }, { "epoch": 0.7938457560281854, "grad_norm": 1.1840174198150635, "learning_rate": 4.396610746770173e-06, "loss": 0.6479888916015625, "memory(GiB)": 76.04, "step": 3070, "token_acc": 0.8107086371176175, "train_speed(iter/s)": 0.02771 }, { "epoch": 0.7951386644256254, "grad_norm": 1.2682684659957886, "learning_rate": 4.394287174400838e-06, "loss": 0.6412975788116455, "memory(GiB)": 76.04, "step": 3075, "token_acc": 0.829871190130624, "train_speed(iter/s)": 0.027708 }, { "epoch": 0.7964315728230655, "grad_norm": 1.5862990617752075, "learning_rate": 4.3919597532583845e-06, "loss": 0.680488395690918, "memory(GiB)": 76.04, "step": 3080, "token_acc": 0.82756076566791, "train_speed(iter/s)": 0.027709 }, { "epoch": 0.7977244812205055, "grad_norm": 1.4510713815689087, "learning_rate": 4.389628488071622e-06, "loss": 0.644444751739502, "memory(GiB)": 76.04, "step": 3085, "token_acc": 0.800807537012113, "train_speed(iter/s)": 0.027708 }, { "epoch": 0.7990173896179456, "grad_norm": 1.2568798065185547, "learning_rate": 4.387293383577165e-06, "loss": 0.6682034015655518, "memory(GiB)": 76.04, "step": 3090, "token_acc": 0.8065676636686886, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8003102980153856, "grad_norm": 1.0545753240585327, "learning_rate": 4.38495444451943e-06, "loss": 0.6688919544219971, "memory(GiB)": 76.04, "step": 3095, "token_acc": 0.7727210465036641, "train_speed(iter/s)": 0.027708 }, { "epoch": 0.8016032064128257, "grad_norm": 1.586976170539856, "learning_rate": 4.382611675650626e-06, "loss": 0.6349334716796875, "memory(GiB)": 76.04, "step": 3100, "token_acc": 0.806030889924001, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8028961148102657, "grad_norm": 2.7589170932769775, "learning_rate": 4.380265081730739e-06, "loss": 0.6485045433044434, "memory(GiB)": 76.04, "step": 3105, "token_acc": 0.8114932360204947, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8041890232077057, "grad_norm": 1.262620210647583, "learning_rate": 4.377914667527532e-06, "loss": 0.6574973106384278, "memory(GiB)": 76.04, "step": 3110, "token_acc": 0.8018425922280404, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8054819316051458, "grad_norm": 1.670192003250122, "learning_rate": 4.375560437816527e-06, "loss": 0.6576096534729003, "memory(GiB)": 76.04, "step": 3115, "token_acc": 0.8066886816886817, "train_speed(iter/s)": 0.027706 }, { "epoch": 0.8067748400025858, "grad_norm": 1.9839909076690674, "learning_rate": 4.373202397380998e-06, "loss": 0.6304091930389404, "memory(GiB)": 76.04, "step": 3120, "token_acc": 0.8234421364985163, "train_speed(iter/s)": 0.027704 }, { "epoch": 0.8080677484000258, "grad_norm": 1.1080540418624878, "learning_rate": 4.370840551011963e-06, "loss": 0.6576041221618653, "memory(GiB)": 76.04, "step": 3125, "token_acc": 0.8217494089834515, "train_speed(iter/s)": 0.027706 }, { "epoch": 0.8093606567974659, "grad_norm": 1.1593878269195557, "learning_rate": 4.3684749035081705e-06, "loss": 0.6419290542602539, "memory(GiB)": 76.04, "step": 3130, "token_acc": 0.7999515151515152, "train_speed(iter/s)": 0.027706 }, { "epoch": 0.810653565194906, "grad_norm": 1.1493967771530151, "learning_rate": 4.366105459676097e-06, "loss": 0.646766471862793, "memory(GiB)": 76.04, "step": 3135, "token_acc": 0.8102760440126118, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.811946473592346, "grad_norm": 1.3651187419891357, "learning_rate": 4.3637322243299255e-06, "loss": 0.6666352272033691, "memory(GiB)": 76.04, "step": 3140, "token_acc": 0.8152125937913786, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8132393819897861, "grad_norm": 1.128293514251709, "learning_rate": 4.361355202291548e-06, "loss": 0.6353740692138672, "memory(GiB)": 76.04, "step": 3145, "token_acc": 0.8045256453234998, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8145322903872261, "grad_norm": 1.6029019355773926, "learning_rate": 4.358974398390548e-06, "loss": 0.6691800117492676, "memory(GiB)": 76.04, "step": 3150, "token_acc": 0.8306377243385117, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8158251987846661, "grad_norm": 1.3377734422683716, "learning_rate": 4.356589817464193e-06, "loss": 0.6470844745635986, "memory(GiB)": 76.04, "step": 3155, "token_acc": 0.8250958558747833, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8171181071821062, "grad_norm": 19.943740844726562, "learning_rate": 4.354201464357424e-06, "loss": 0.6441401481628418, "memory(GiB)": 76.04, "step": 3160, "token_acc": 0.8164092901323066, "train_speed(iter/s)": 0.027705 }, { "epoch": 0.8184110155795462, "grad_norm": 2.096036672592163, "learning_rate": 4.3518093439228484e-06, "loss": 0.6595673561096191, "memory(GiB)": 76.04, "step": 3165, "token_acc": 0.8180080986396105, "train_speed(iter/s)": 0.027705 }, { "epoch": 0.8197039239769862, "grad_norm": 1.3042539358139038, "learning_rate": 4.349413461020725e-06, "loss": 0.6536635398864746, "memory(GiB)": 76.04, "step": 3170, "token_acc": 0.7721032106415942, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8209968323744262, "grad_norm": 1.1923153400421143, "learning_rate": 4.347013820518959e-06, "loss": 0.6662230491638184, "memory(GiB)": 76.04, "step": 3175, "token_acc": 0.7864515044545302, "train_speed(iter/s)": 0.027706 }, { "epoch": 0.8222897407718663, "grad_norm": 0.9587339162826538, "learning_rate": 4.344610427293091e-06, "loss": 0.637930154800415, "memory(GiB)": 76.04, "step": 3180, "token_acc": 0.8349569816879248, "train_speed(iter/s)": 0.027703 }, { "epoch": 0.8235826491693063, "grad_norm": 1.4377241134643555, "learning_rate": 4.342203286226284e-06, "loss": 0.6546686172485352, "memory(GiB)": 76.04, "step": 3185, "token_acc": 0.8348570317058938, "train_speed(iter/s)": 0.027704 }, { "epoch": 0.8248755575667464, "grad_norm": 1.3020581007003784, "learning_rate": 4.339792402209318e-06, "loss": 0.6620816707611084, "memory(GiB)": 76.04, "step": 3190, "token_acc": 0.8184861571423789, "train_speed(iter/s)": 0.027705 }, { "epoch": 0.8261684659641865, "grad_norm": 1.6828721761703491, "learning_rate": 4.337377780140575e-06, "loss": 0.6277073860168457, "memory(GiB)": 76.04, "step": 3195, "token_acc": 0.8200602270094973, "train_speed(iter/s)": 0.027704 }, { "epoch": 0.8274613743616265, "grad_norm": 1.6351348161697388, "learning_rate": 4.334959424926036e-06, "loss": 0.6136197566986084, "memory(GiB)": 76.04, "step": 3200, "token_acc": 0.805330584597261, "train_speed(iter/s)": 0.027705 }, { "epoch": 0.8287542827590665, "grad_norm": 1.1907836198806763, "learning_rate": 4.3325373414792625e-06, "loss": 0.647891902923584, "memory(GiB)": 76.04, "step": 3205, "token_acc": 0.8155163083583411, "train_speed(iter/s)": 0.027705 }, { "epoch": 0.8300471911565066, "grad_norm": 1.5931166410446167, "learning_rate": 4.330111534721394e-06, "loss": 0.6463868141174316, "memory(GiB)": 76.04, "step": 3210, "token_acc": 0.8210717829970228, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8313400995539466, "grad_norm": 1.8650577068328857, "learning_rate": 4.327682009581134e-06, "loss": 0.6347787380218506, "memory(GiB)": 76.04, "step": 3215, "token_acc": 0.8082674179217684, "train_speed(iter/s)": 0.027707 }, { "epoch": 0.8326330079513866, "grad_norm": 2.3283803462982178, "learning_rate": 4.325248770994741e-06, "loss": 0.6708244800567627, "memory(GiB)": 76.04, "step": 3220, "token_acc": 0.7994605743296843, "train_speed(iter/s)": 0.027706 }, { "epoch": 0.8339259163488267, "grad_norm": 1.4204248189926147, "learning_rate": 4.322811823906018e-06, "loss": 0.6237285137176514, "memory(GiB)": 76.04, "step": 3225, "token_acc": 0.8479689603294006, "train_speed(iter/s)": 0.027704 }, { "epoch": 0.8352188247462667, "grad_norm": 2.9352431297302246, "learning_rate": 4.3203711732663035e-06, "loss": 0.6458423614501954, "memory(GiB)": 76.04, "step": 3230, "token_acc": 0.8232549095858448, "train_speed(iter/s)": 0.027703 }, { "epoch": 0.8365117331437067, "grad_norm": 1.1581929922103882, "learning_rate": 4.31792682403446e-06, "loss": 0.6388854026794434, "memory(GiB)": 76.04, "step": 3235, "token_acc": 0.7934440464560947, "train_speed(iter/s)": 0.027701 }, { "epoch": 0.8378046415411468, "grad_norm": 1.1175485849380493, "learning_rate": 4.315478781176867e-06, "loss": 0.6145687103271484, "memory(GiB)": 76.04, "step": 3240, "token_acc": 0.8398847580708817, "train_speed(iter/s)": 0.027701 }, { "epoch": 0.8390975499385869, "grad_norm": 1.2444353103637695, "learning_rate": 4.313027049667405e-06, "loss": 0.6328566074371338, "memory(GiB)": 76.04, "step": 3245, "token_acc": 0.8066215947504474, "train_speed(iter/s)": 0.027699 }, { "epoch": 0.8403904583360269, "grad_norm": 1.141342043876648, "learning_rate": 4.310571634487451e-06, "loss": 0.629487419128418, "memory(GiB)": 76.04, "step": 3250, "token_acc": 0.8341686379856461, "train_speed(iter/s)": 0.0277 }, { "epoch": 0.8416833667334669, "grad_norm": 1.3321287631988525, "learning_rate": 4.3081125406258655e-06, "loss": 0.6453184604644775, "memory(GiB)": 76.04, "step": 3255, "token_acc": 0.7997035782341732, "train_speed(iter/s)": 0.0277 }, { "epoch": 0.842976275130907, "grad_norm": 1.0039650201797485, "learning_rate": 4.305649773078987e-06, "loss": 0.666410255432129, "memory(GiB)": 76.04, "step": 3260, "token_acc": 0.8168428282519937, "train_speed(iter/s)": 0.027699 }, { "epoch": 0.844269183528347, "grad_norm": 1.2001808881759644, "learning_rate": 4.303183336850612e-06, "loss": 0.660033893585205, "memory(GiB)": 76.04, "step": 3265, "token_acc": 0.8161585530947095, "train_speed(iter/s)": 0.027699 }, { "epoch": 0.845562091925787, "grad_norm": 1.2713844776153564, "learning_rate": 4.300713236951996e-06, "loss": 0.6356592655181885, "memory(GiB)": 76.04, "step": 3270, "token_acc": 0.8038586795618277, "train_speed(iter/s)": 0.027701 }, { "epoch": 0.8468550003232271, "grad_norm": 1.3221766948699951, "learning_rate": 4.298239478401836e-06, "loss": 0.6444936275482178, "memory(GiB)": 76.04, "step": 3275, "token_acc": 0.8171707402848603, "train_speed(iter/s)": 0.027701 }, { "epoch": 0.8481479087206671, "grad_norm": 1.2477511167526245, "learning_rate": 4.295762066226262e-06, "loss": 0.611814022064209, "memory(GiB)": 76.04, "step": 3280, "token_acc": 0.8180022127390548, "train_speed(iter/s)": 0.027701 }, { "epoch": 0.8494408171181071, "grad_norm": 1.2124427556991577, "learning_rate": 4.293281005458831e-06, "loss": 0.6272024631500244, "memory(GiB)": 76.04, "step": 3285, "token_acc": 0.8499176225558768, "train_speed(iter/s)": 0.027699 }, { "epoch": 0.8507337255155473, "grad_norm": 4.538881778717041, "learning_rate": 4.290796301140506e-06, "loss": 0.6252808094024658, "memory(GiB)": 76.04, "step": 3290, "token_acc": 0.8332118523213436, "train_speed(iter/s)": 0.027698 }, { "epoch": 0.8520266339129873, "grad_norm": 1.4854230880737305, "learning_rate": 4.288307958319662e-06, "loss": 0.6353150367736816, "memory(GiB)": 76.04, "step": 3295, "token_acc": 0.7681834998150203, "train_speed(iter/s)": 0.027696 }, { "epoch": 0.8533195423104273, "grad_norm": 1.1235853433609009, "learning_rate": 4.285815982052058e-06, "loss": 0.6190371036529541, "memory(GiB)": 76.04, "step": 3300, "token_acc": 0.8698982508288556, "train_speed(iter/s)": 0.027695 }, { "epoch": 0.8546124507078674, "grad_norm": 1.0786458253860474, "learning_rate": 4.283320377400842e-06, "loss": 0.6302780151367188, "memory(GiB)": 76.04, "step": 3305, "token_acc": 0.792910447761194, "train_speed(iter/s)": 0.027693 }, { "epoch": 0.8559053591053074, "grad_norm": 1.0524226427078247, "learning_rate": 4.280821149436531e-06, "loss": 0.629145622253418, "memory(GiB)": 76.04, "step": 3310, "token_acc": 0.8330289590399165, "train_speed(iter/s)": 0.027692 }, { "epoch": 0.8571982675027474, "grad_norm": 1.4898467063903809, "learning_rate": 4.278318303237003e-06, "loss": 0.6266490459442139, "memory(GiB)": 76.04, "step": 3315, "token_acc": 0.8104506584124652, "train_speed(iter/s)": 0.027692 }, { "epoch": 0.8584911759001874, "grad_norm": 1.1593666076660156, "learning_rate": 4.275811843887491e-06, "loss": 0.6542300224304199, "memory(GiB)": 76.04, "step": 3320, "token_acc": 0.8437677735485847, "train_speed(iter/s)": 0.027692 }, { "epoch": 0.8597840842976275, "grad_norm": 1.2023606300354004, "learning_rate": 4.273301776480564e-06, "loss": 0.6109468936920166, "memory(GiB)": 76.04, "step": 3325, "token_acc": 0.8550740689464211, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8610769926950675, "grad_norm": 1.4408375024795532, "learning_rate": 4.270788106116125e-06, "loss": 0.6247062683105469, "memory(GiB)": 76.04, "step": 3330, "token_acc": 0.8023508574188873, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8623699010925076, "grad_norm": 1.197199821472168, "learning_rate": 4.268270837901399e-06, "loss": 0.638817024230957, "memory(GiB)": 76.04, "step": 3335, "token_acc": 0.8134350688210652, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8636628094899477, "grad_norm": 1.215605616569519, "learning_rate": 4.265749976950917e-06, "loss": 0.6219228744506836, "memory(GiB)": 76.04, "step": 3340, "token_acc": 0.830684302174799, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8649557178873877, "grad_norm": 1.1016297340393066, "learning_rate": 4.263225528386512e-06, "loss": 0.6183833122253418, "memory(GiB)": 76.04, "step": 3345, "token_acc": 0.8504693786320966, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8662486262848277, "grad_norm": 1.1827194690704346, "learning_rate": 4.260697497337306e-06, "loss": 0.6260892868041992, "memory(GiB)": 76.04, "step": 3350, "token_acc": 0.8206773446545735, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8675415346822678, "grad_norm": 1.1964573860168457, "learning_rate": 4.2581658889397e-06, "loss": 0.6217505931854248, "memory(GiB)": 76.04, "step": 3355, "token_acc": 0.8112748538011696, "train_speed(iter/s)": 0.027691 }, { "epoch": 0.8688344430797078, "grad_norm": 1.9450868368148804, "learning_rate": 4.2556307083373635e-06, "loss": 0.6057548522949219, "memory(GiB)": 76.04, "step": 3360, "token_acc": 0.8432432432432433, "train_speed(iter/s)": 0.027691 }, { "epoch": 0.8701273514771478, "grad_norm": 1.085252285003662, "learning_rate": 4.253091960681222e-06, "loss": 0.650747537612915, "memory(GiB)": 76.04, "step": 3365, "token_acc": 0.8127441586201813, "train_speed(iter/s)": 0.027692 }, { "epoch": 0.8714202598745879, "grad_norm": 1.4419254064559937, "learning_rate": 4.250549651129451e-06, "loss": 0.6490330696105957, "memory(GiB)": 76.04, "step": 3370, "token_acc": 0.817296827466319, "train_speed(iter/s)": 0.027693 }, { "epoch": 0.8727131682720279, "grad_norm": 0.9393129348754883, "learning_rate": 4.248003784847462e-06, "loss": 0.5855797290802002, "memory(GiB)": 76.04, "step": 3375, "token_acc": 0.8437131244263799, "train_speed(iter/s)": 0.027693 }, { "epoch": 0.874006076669468, "grad_norm": 1.4661402702331543, "learning_rate": 4.245454367007893e-06, "loss": 0.6375166416168213, "memory(GiB)": 76.04, "step": 3380, "token_acc": 0.8220987966001851, "train_speed(iter/s)": 0.027691 }, { "epoch": 0.8752989850669081, "grad_norm": 1.0783532857894897, "learning_rate": 4.242901402790597e-06, "loss": 0.5942583084106445, "memory(GiB)": 76.04, "step": 3385, "token_acc": 0.8271346924848588, "train_speed(iter/s)": 0.027691 }, { "epoch": 0.8765918934643481, "grad_norm": 1.002106785774231, "learning_rate": 4.240344897382633e-06, "loss": 0.6190349578857421, "memory(GiB)": 76.04, "step": 3390, "token_acc": 0.8104156272786583, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8778848018617881, "grad_norm": 1.0779801607131958, "learning_rate": 4.237784855978258e-06, "loss": 0.6126032829284668, "memory(GiB)": 76.04, "step": 3395, "token_acc": 0.8469879143753689, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8791777102592281, "grad_norm": 1.1293411254882812, "learning_rate": 4.2352212837789086e-06, "loss": 0.6498593330383301, "memory(GiB)": 76.04, "step": 3400, "token_acc": 0.8174972974987885, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8804706186566682, "grad_norm": 1.9169739484786987, "learning_rate": 4.232654185993197e-06, "loss": 0.6312263965606689, "memory(GiB)": 76.04, "step": 3405, "token_acc": 0.8185292511864264, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8817635270541082, "grad_norm": 2.342643976211548, "learning_rate": 4.2300835678369005e-06, "loss": 0.5902108192443848, "memory(GiB)": 76.04, "step": 3410, "token_acc": 0.8093889113719142, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8830564354515482, "grad_norm": 1.2619035243988037, "learning_rate": 4.227509434532945e-06, "loss": 0.6150105953216553, "memory(GiB)": 76.04, "step": 3415, "token_acc": 0.8166153846153846, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8843493438489883, "grad_norm": 1.1575284004211426, "learning_rate": 4.224931791311403e-06, "loss": 0.6235898017883301, "memory(GiB)": 76.04, "step": 3420, "token_acc": 0.8300420709195501, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8856422522464283, "grad_norm": 1.627013921737671, "learning_rate": 4.2223506434094754e-06, "loss": 0.601617431640625, "memory(GiB)": 76.04, "step": 3425, "token_acc": 0.8208349821923229, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8869351606438683, "grad_norm": 1.0518633127212524, "learning_rate": 4.219765996071483e-06, "loss": 0.6408526420593261, "memory(GiB)": 76.04, "step": 3430, "token_acc": 0.8019846954820224, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8882280690413085, "grad_norm": 0.9839140176773071, "learning_rate": 4.217177854548862e-06, "loss": 0.6014208793640137, "memory(GiB)": 76.04, "step": 3435, "token_acc": 0.8176200504021818, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8895209774387485, "grad_norm": 1.1011220216751099, "learning_rate": 4.21458622410014e-06, "loss": 0.6313972473144531, "memory(GiB)": 76.04, "step": 3440, "token_acc": 0.8165993852079553, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.8908138858361885, "grad_norm": 1.156149983406067, "learning_rate": 4.211991109990941e-06, "loss": 0.6519000053405761, "memory(GiB)": 76.04, "step": 3445, "token_acc": 0.7982918203025058, "train_speed(iter/s)": 0.027691 }, { "epoch": 0.8921067942336286, "grad_norm": 1.144892692565918, "learning_rate": 4.2093925174939606e-06, "loss": 0.6042433738708496, "memory(GiB)": 76.04, "step": 3450, "token_acc": 0.8215976553693545, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8933997026310686, "grad_norm": 1.2235312461853027, "learning_rate": 4.206790451888968e-06, "loss": 0.6446715354919433, "memory(GiB)": 76.04, "step": 3455, "token_acc": 0.8082164853885467, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.8946926110285086, "grad_norm": 1.1150991916656494, "learning_rate": 4.204184918462783e-06, "loss": 0.628176212310791, "memory(GiB)": 76.04, "step": 3460, "token_acc": 0.803219563687544, "train_speed(iter/s)": 0.027688 }, { "epoch": 0.8959855194259486, "grad_norm": 1.0735828876495361, "learning_rate": 4.201575922509277e-06, "loss": 0.6142620086669922, "memory(GiB)": 76.04, "step": 3465, "token_acc": 0.8073942988329826, "train_speed(iter/s)": 0.027688 }, { "epoch": 0.8972784278233887, "grad_norm": 0.9149400591850281, "learning_rate": 4.198963469329351e-06, "loss": 0.5981680870056152, "memory(GiB)": 76.04, "step": 3470, "token_acc": 0.8314239727324371, "train_speed(iter/s)": 0.027688 }, { "epoch": 0.8985713362208287, "grad_norm": 0.9329715371131897, "learning_rate": 4.196347564230933e-06, "loss": 0.6357330322265625, "memory(GiB)": 76.04, "step": 3475, "token_acc": 0.815760798500632, "train_speed(iter/s)": 0.027685 }, { "epoch": 0.8998642446182687, "grad_norm": 2.6730282306671143, "learning_rate": 4.193728212528965e-06, "loss": 0.6184768676757812, "memory(GiB)": 76.04, "step": 3480, "token_acc": 0.8195593938666986, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9011571530157089, "grad_norm": 1.3464089632034302, "learning_rate": 4.191105419545391e-06, "loss": 0.6040889263153076, "memory(GiB)": 76.04, "step": 3485, "token_acc": 0.8135844450257215, "train_speed(iter/s)": 0.027687 }, { "epoch": 0.9024500614131489, "grad_norm": 1.3222928047180176, "learning_rate": 4.188479190609146e-06, "loss": 0.6070952415466309, "memory(GiB)": 76.04, "step": 3490, "token_acc": 0.8631930567568373, "train_speed(iter/s)": 0.027685 }, { "epoch": 0.9037429698105889, "grad_norm": 1.5099354982376099, "learning_rate": 4.185849531056149e-06, "loss": 0.6029548645019531, "memory(GiB)": 76.04, "step": 3495, "token_acc": 0.8064048588584444, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.905035878208029, "grad_norm": 0.861173689365387, "learning_rate": 4.1832164462292865e-06, "loss": 0.6235533714294433, "memory(GiB)": 76.04, "step": 3500, "token_acc": 0.8324725253388218, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.906328786605469, "grad_norm": 1.607367992401123, "learning_rate": 4.1805799414784044e-06, "loss": 0.6227012634277344, "memory(GiB)": 76.04, "step": 3505, "token_acc": 0.7834105927606273, "train_speed(iter/s)": 0.027685 }, { "epoch": 0.907621695002909, "grad_norm": 1.4191631078720093, "learning_rate": 4.177940022160299e-06, "loss": 0.6287036895751953, "memory(GiB)": 76.04, "step": 3510, "token_acc": 0.8294466536361799, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9089146034003491, "grad_norm": 1.4229148626327515, "learning_rate": 4.175296693638703e-06, "loss": 0.6371709823608398, "memory(GiB)": 76.04, "step": 3515, "token_acc": 0.7917865974784124, "train_speed(iter/s)": 0.027687 }, { "epoch": 0.9102075117977891, "grad_norm": 1.202496886253357, "learning_rate": 4.172649961284276e-06, "loss": 0.6231961250305176, "memory(GiB)": 76.04, "step": 3520, "token_acc": 0.7966687617850409, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9115004201952291, "grad_norm": 7.718347549438477, "learning_rate": 4.169999830474594e-06, "loss": 0.6057699203491211, "memory(GiB)": 76.04, "step": 3525, "token_acc": 0.8327813797285322, "train_speed(iter/s)": 0.027687 }, { "epoch": 0.9127933285926693, "grad_norm": 2.148632526397705, "learning_rate": 4.167346306594136e-06, "loss": 0.6129049777984619, "memory(GiB)": 76.04, "step": 3530, "token_acc": 0.8131457736835553, "train_speed(iter/s)": 0.027687 }, { "epoch": 0.9140862369901093, "grad_norm": 3.352047920227051, "learning_rate": 4.1646893950342785e-06, "loss": 0.6119277000427246, "memory(GiB)": 76.04, "step": 3535, "token_acc": 0.8336779068938476, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9153791453875493, "grad_norm": 1.2060611248016357, "learning_rate": 4.1620291011932765e-06, "loss": 0.6040964126586914, "memory(GiB)": 76.04, "step": 3540, "token_acc": 0.8219741053244108, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9166720537849893, "grad_norm": 1.5047026872634888, "learning_rate": 4.159365430476262e-06, "loss": 0.6265472412109375, "memory(GiB)": 76.04, "step": 3545, "token_acc": 0.8295702534832969, "train_speed(iter/s)": 0.027688 }, { "epoch": 0.9179649621824294, "grad_norm": 1.3251553773880005, "learning_rate": 4.156698388295222e-06, "loss": 0.6167987823486328, "memory(GiB)": 76.04, "step": 3550, "token_acc": 0.8122264371170119, "train_speed(iter/s)": 0.027688 }, { "epoch": 0.9192578705798694, "grad_norm": 0.998990535736084, "learning_rate": 4.154027980069002e-06, "loss": 0.5864760398864746, "memory(GiB)": 76.04, "step": 3555, "token_acc": 0.8300649626616478, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9205507789773094, "grad_norm": 1.6433701515197754, "learning_rate": 4.151354211223278e-06, "loss": 0.5933123588562011, "memory(GiB)": 76.04, "step": 3560, "token_acc": 0.8285720878715156, "train_speed(iter/s)": 0.027685 }, { "epoch": 0.9218436873747495, "grad_norm": 1.4250352382659912, "learning_rate": 4.148677087190559e-06, "loss": 0.6165533065795898, "memory(GiB)": 76.04, "step": 3565, "token_acc": 0.8124648441894476, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9231365957721895, "grad_norm": 1.9832043647766113, "learning_rate": 4.145996613410169e-06, "loss": 0.601347017288208, "memory(GiB)": 76.04, "step": 3570, "token_acc": 0.8363061287980919, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9244295041696295, "grad_norm": 1.1738619804382324, "learning_rate": 4.143312795328239e-06, "loss": 0.5805646419525147, "memory(GiB)": 76.04, "step": 3575, "token_acc": 0.7983795574945466, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.9257224125670697, "grad_norm": 1.162148356437683, "learning_rate": 4.1406256383976945e-06, "loss": 0.6304599285125733, "memory(GiB)": 76.04, "step": 3580, "token_acc": 0.7998939233217154, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.9270153209645097, "grad_norm": 1.555344820022583, "learning_rate": 4.1379351480782445e-06, "loss": 0.6200345039367676, "memory(GiB)": 76.04, "step": 3585, "token_acc": 0.8321557607386592, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.9283082293619497, "grad_norm": 1.4574919939041138, "learning_rate": 4.135241329836372e-06, "loss": 0.6034027099609375, "memory(GiB)": 76.04, "step": 3590, "token_acc": 0.8022632918173296, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.9296011377593898, "grad_norm": 1.1182024478912354, "learning_rate": 4.132544189145321e-06, "loss": 0.6192724227905273, "memory(GiB)": 76.04, "step": 3595, "token_acc": 0.8323038628192126, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9308940461568298, "grad_norm": 1.55838143825531, "learning_rate": 4.129843731485084e-06, "loss": 0.6345338821411133, "memory(GiB)": 76.04, "step": 3600, "token_acc": 0.7966842932685436, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9321869545542698, "grad_norm": 5.046814918518066, "learning_rate": 4.127139962342395e-06, "loss": 0.5721908569335937, "memory(GiB)": 76.04, "step": 3605, "token_acc": 0.8486438258386867, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.9334798629517099, "grad_norm": 1.2368015050888062, "learning_rate": 4.124432887210715e-06, "loss": 0.6120264053344726, "memory(GiB)": 76.04, "step": 3610, "token_acc": 0.7992660550458716, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9347727713491499, "grad_norm": 0.935610294342041, "learning_rate": 4.121722511590225e-06, "loss": 0.5871891975402832, "memory(GiB)": 76.04, "step": 3615, "token_acc": 0.814580080217997, "train_speed(iter/s)": 0.027681 }, { "epoch": 0.9360656797465899, "grad_norm": 1.0046820640563965, "learning_rate": 4.119008840987807e-06, "loss": 0.6071587562561035, "memory(GiB)": 76.04, "step": 3620, "token_acc": 0.836279004654744, "train_speed(iter/s)": 0.027681 }, { "epoch": 0.93735858814403, "grad_norm": 1.2669122219085693, "learning_rate": 4.116291880917042e-06, "loss": 0.6148792266845703, "memory(GiB)": 76.04, "step": 3625, "token_acc": 0.8264746964650264, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.9386514965414701, "grad_norm": 1.177242636680603, "learning_rate": 4.113571636898191e-06, "loss": 0.6176681518554688, "memory(GiB)": 76.04, "step": 3630, "token_acc": 0.8233010616902857, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9399444049389101, "grad_norm": 1.3925117254257202, "learning_rate": 4.110848114458191e-06, "loss": 0.5971219062805175, "memory(GiB)": 76.04, "step": 3635, "token_acc": 0.8236563174186287, "train_speed(iter/s)": 0.027685 }, { "epoch": 0.9412373133363501, "grad_norm": 1.1656811237335205, "learning_rate": 4.108121319130638e-06, "loss": 0.6168715476989746, "memory(GiB)": 76.04, "step": 3640, "token_acc": 0.8150085866048964, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9425302217337902, "grad_norm": 1.0381889343261719, "learning_rate": 4.105391256455776e-06, "loss": 0.6066938400268554, "memory(GiB)": 76.04, "step": 3645, "token_acc": 0.8139741020502543, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9438231301312302, "grad_norm": 1.0766489505767822, "learning_rate": 4.1026579319804894e-06, "loss": 0.60537691116333, "memory(GiB)": 76.04, "step": 3650, "token_acc": 0.8072617246596067, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9451160385286702, "grad_norm": 1.0981967449188232, "learning_rate": 4.099921351258292e-06, "loss": 0.6052407264709473, "memory(GiB)": 76.04, "step": 3655, "token_acc": 0.7947088678415858, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.9464089469261103, "grad_norm": 1.5217511653900146, "learning_rate": 4.097181519849309e-06, "loss": 0.5945847034454346, "memory(GiB)": 76.04, "step": 3660, "token_acc": 0.8231042745613357, "train_speed(iter/s)": 0.027681 }, { "epoch": 0.9477018553235503, "grad_norm": 1.4334150552749634, "learning_rate": 4.094438443320274e-06, "loss": 0.6149433135986329, "memory(GiB)": 76.04, "step": 3665, "token_acc": 0.8234104473930844, "train_speed(iter/s)": 0.02768 }, { "epoch": 0.9489947637209903, "grad_norm": 1.6359714269638062, "learning_rate": 4.091692127244511e-06, "loss": 0.6281001567840576, "memory(GiB)": 76.04, "step": 3670, "token_acc": 0.8364228557642044, "train_speed(iter/s)": 0.027681 }, { "epoch": 0.9502876721184305, "grad_norm": 1.234947681427002, "learning_rate": 4.088942577201931e-06, "loss": 0.5957602977752685, "memory(GiB)": 76.04, "step": 3675, "token_acc": 0.841389663306884, "train_speed(iter/s)": 0.027681 }, { "epoch": 0.9515805805158705, "grad_norm": 1.0987111330032349, "learning_rate": 4.086189798779008e-06, "loss": 0.6053364753723145, "memory(GiB)": 76.04, "step": 3680, "token_acc": 0.8306053185547966, "train_speed(iter/s)": 0.027681 }, { "epoch": 0.9528734889133105, "grad_norm": 1.273268222808838, "learning_rate": 4.083433797568783e-06, "loss": 0.6212119579315185, "memory(GiB)": 76.04, "step": 3685, "token_acc": 0.8268924889543446, "train_speed(iter/s)": 0.027681 }, { "epoch": 0.9541663973107505, "grad_norm": 1.1013809442520142, "learning_rate": 4.0806745791708406e-06, "loss": 0.6078325271606445, "memory(GiB)": 76.04, "step": 3690, "token_acc": 0.8134150493701056, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.9554593057081906, "grad_norm": 1.449779987335205, "learning_rate": 4.0779121491913035e-06, "loss": 0.6228477478027343, "memory(GiB)": 76.04, "step": 3695, "token_acc": 0.7949391768744967, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.9567522141056306, "grad_norm": 1.177632451057434, "learning_rate": 4.075146513242818e-06, "loss": 0.6086900711059571, "memory(GiB)": 76.04, "step": 3700, "token_acc": 0.8241206030150754, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.9580451225030706, "grad_norm": 1.7194976806640625, "learning_rate": 4.072377676944545e-06, "loss": 0.6042545318603516, "memory(GiB)": 76.04, "step": 3705, "token_acc": 0.8258380709664772, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.9593380309005107, "grad_norm": 1.1962262392044067, "learning_rate": 4.069605645922152e-06, "loss": 0.5851446151733398, "memory(GiB)": 76.04, "step": 3710, "token_acc": 0.8344977304124729, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.9606309392979507, "grad_norm": 1.1156598329544067, "learning_rate": 4.066830425807789e-06, "loss": 0.5880330085754395, "memory(GiB)": 76.04, "step": 3715, "token_acc": 0.8415927377759439, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.9619238476953907, "grad_norm": 2.626760244369507, "learning_rate": 4.0640520222400945e-06, "loss": 0.6129249095916748, "memory(GiB)": 76.04, "step": 3720, "token_acc": 0.8348955352032055, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.9632167560928309, "grad_norm": 1.0791536569595337, "learning_rate": 4.0612704408641675e-06, "loss": 0.6016806125640869, "memory(GiB)": 76.04, "step": 3725, "token_acc": 0.8637572233842663, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9645096644902709, "grad_norm": 1.5433270931243896, "learning_rate": 4.058485687331569e-06, "loss": 0.6169820785522461, "memory(GiB)": 76.04, "step": 3730, "token_acc": 0.8325906172146391, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9658025728877109, "grad_norm": 1.101804256439209, "learning_rate": 4.055697767300302e-06, "loss": 0.5675209999084473, "memory(GiB)": 76.04, "step": 3735, "token_acc": 0.8326423357664233, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.967095481285151, "grad_norm": 1.0612066984176636, "learning_rate": 4.0529066864348046e-06, "loss": 0.5953152656555176, "memory(GiB)": 76.04, "step": 3740, "token_acc": 0.8497927240323893, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.968388389682591, "grad_norm": 1.5735872983932495, "learning_rate": 4.050112450405937e-06, "loss": 0.6017944812774658, "memory(GiB)": 76.04, "step": 3745, "token_acc": 0.8264524103831892, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.969681298080031, "grad_norm": 1.157410979270935, "learning_rate": 4.047315064890968e-06, "loss": 0.5977309226989747, "memory(GiB)": 76.04, "step": 3750, "token_acc": 0.815944055944056, "train_speed(iter/s)": 0.027682 }, { "epoch": 0.9709742064774711, "grad_norm": 1.57929265499115, "learning_rate": 4.044514535573569e-06, "loss": 0.589405632019043, "memory(GiB)": 76.04, "step": 3755, "token_acc": 0.8179965511835711, "train_speed(iter/s)": 0.027684 }, { "epoch": 0.9722671148749111, "grad_norm": 1.016497015953064, "learning_rate": 4.041710868143796e-06, "loss": 0.589882230758667, "memory(GiB)": 76.04, "step": 3760, "token_acc": 0.8092984293193717, "train_speed(iter/s)": 0.027683 }, { "epoch": 0.9735600232723511, "grad_norm": 1.266408085823059, "learning_rate": 4.038904068298083e-06, "loss": 0.5920291423797608, "memory(GiB)": 76.04, "step": 3765, "token_acc": 0.823118662159758, "train_speed(iter/s)": 0.027685 }, { "epoch": 0.9748529316697911, "grad_norm": 1.1780680418014526, "learning_rate": 4.036094141739225e-06, "loss": 0.6140639305114746, "memory(GiB)": 76.04, "step": 3770, "token_acc": 0.7946187371681734, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9761458400672313, "grad_norm": 1.0114781856536865, "learning_rate": 4.0332810941763745e-06, "loss": 0.5897872924804688, "memory(GiB)": 76.04, "step": 3775, "token_acc": 0.8349316002363314, "train_speed(iter/s)": 0.027685 }, { "epoch": 0.9774387484646713, "grad_norm": 0.9569077491760254, "learning_rate": 4.030464931325021e-06, "loss": 0.6214170455932617, "memory(GiB)": 76.04, "step": 3780, "token_acc": 0.8027143591975626, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9787316568621113, "grad_norm": 1.5879004001617432, "learning_rate": 4.027645658906986e-06, "loss": 0.6039529800415039, "memory(GiB)": 76.04, "step": 3785, "token_acc": 0.815490288962577, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9800245652595514, "grad_norm": 1.1456199884414673, "learning_rate": 4.02482328265041e-06, "loss": 0.5806800842285156, "memory(GiB)": 76.04, "step": 3790, "token_acc": 0.8437443809112813, "train_speed(iter/s)": 0.027686 }, { "epoch": 0.9813174736569914, "grad_norm": 0.8533855676651001, "learning_rate": 4.0219978082897355e-06, "loss": 0.593365478515625, "memory(GiB)": 76.04, "step": 3795, "token_acc": 0.8269325803035651, "train_speed(iter/s)": 0.027687 }, { "epoch": 0.9826103820544314, "grad_norm": 1.1695212125778198, "learning_rate": 4.019169241565704e-06, "loss": 0.6025032043457031, "memory(GiB)": 76.04, "step": 3800, "token_acc": 0.8349052595802532, "train_speed(iter/s)": 0.027688 }, { "epoch": 0.9839032904518715, "grad_norm": 1.4339202642440796, "learning_rate": 4.0163375882253366e-06, "loss": 0.6019165992736817, "memory(GiB)": 76.04, "step": 3805, "token_acc": 0.8244785353007565, "train_speed(iter/s)": 0.027688 }, { "epoch": 0.9851961988493115, "grad_norm": 1.4449810981750488, "learning_rate": 4.013502854021929e-06, "loss": 0.606717872619629, "memory(GiB)": 76.04, "step": 3810, "token_acc": 0.8192271272038598, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.9864891072467515, "grad_norm": 1.1717056035995483, "learning_rate": 4.010665044715034e-06, "loss": 0.609653091430664, "memory(GiB)": 76.04, "step": 3815, "token_acc": 0.8458090195926885, "train_speed(iter/s)": 0.027688 }, { "epoch": 0.9877820156441917, "grad_norm": 4.467617988586426, "learning_rate": 4.007824166070455e-06, "loss": 0.6024861335754395, "memory(GiB)": 76.04, "step": 3820, "token_acc": 0.8408324188107141, "train_speed(iter/s)": 0.027687 }, { "epoch": 0.9890749240416317, "grad_norm": 1.2757420539855957, "learning_rate": 4.004980223860228e-06, "loss": 0.6156288146972656, "memory(GiB)": 76.04, "step": 3825, "token_acc": 0.826805096743747, "train_speed(iter/s)": 0.027687 }, { "epoch": 0.9903678324390717, "grad_norm": 1.178419828414917, "learning_rate": 4.002133223862615e-06, "loss": 0.5892780303955079, "memory(GiB)": 76.04, "step": 3830, "token_acc": 0.809322033898305, "train_speed(iter/s)": 0.027687 }, { "epoch": 0.9916607408365117, "grad_norm": 1.8065563440322876, "learning_rate": 3.999283171862093e-06, "loss": 0.6025252342224121, "memory(GiB)": 76.04, "step": 3835, "token_acc": 0.804472722092968, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.9929536492339518, "grad_norm": 1.030576229095459, "learning_rate": 3.996430073649338e-06, "loss": 0.5885412216186523, "memory(GiB)": 76.04, "step": 3840, "token_acc": 0.8501988939555641, "train_speed(iter/s)": 0.02769 }, { "epoch": 0.9942465576313918, "grad_norm": 1.0532441139221191, "learning_rate": 3.993573935021213e-06, "loss": 0.6129741191864013, "memory(GiB)": 76.04, "step": 3845, "token_acc": 0.8317618076792389, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.9955394660288318, "grad_norm": 1.620641827583313, "learning_rate": 3.990714761780763e-06, "loss": 0.583595085144043, "memory(GiB)": 76.04, "step": 3850, "token_acc": 0.8565638488261922, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.9968323744262719, "grad_norm": 1.6248869895935059, "learning_rate": 3.987852559737196e-06, "loss": 0.6013848304748535, "memory(GiB)": 76.04, "step": 3855, "token_acc": 0.7915984036967024, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.9981252828237119, "grad_norm": 1.3243690729141235, "learning_rate": 3.984987334705875e-06, "loss": 0.5860433101654052, "memory(GiB)": 76.04, "step": 3860, "token_acc": 0.834250554154246, "train_speed(iter/s)": 0.027689 }, { "epoch": 0.9994181912211519, "grad_norm": 1.1566941738128662, "learning_rate": 3.9821190925083025e-06, "loss": 0.5727869033813476, "memory(GiB)": 76.04, "step": 3865, "token_acc": 0.8439064677644144, "train_speed(iter/s)": 0.027688 }, { "epoch": 1.000517163358976, "grad_norm": 2.703934669494629, "learning_rate": 3.979247838972114e-06, "loss": 0.604734230041504, "memory(GiB)": 76.04, "step": 3870, "token_acc": 0.8727201521763456, "train_speed(iter/s)": 0.027694 }, { "epoch": 1.001810071756416, "grad_norm": 1.0446311235427856, "learning_rate": 3.976373579931063e-06, "loss": 0.5894432544708252, "memory(GiB)": 76.04, "step": 3875, "token_acc": 0.7687516615396301, "train_speed(iter/s)": 0.027693 }, { "epoch": 1.003102980153856, "grad_norm": 1.3841272592544556, "learning_rate": 3.97349632122501e-06, "loss": 0.5918097019195556, "memory(GiB)": 76.04, "step": 3880, "token_acc": 0.8106176985459612, "train_speed(iter/s)": 0.027694 }, { "epoch": 1.0043958885512962, "grad_norm": 1.3004447221755981, "learning_rate": 3.970616068699906e-06, "loss": 0.5655845642089844, "memory(GiB)": 76.04, "step": 3885, "token_acc": 0.8302924727239388, "train_speed(iter/s)": 0.027694 }, { "epoch": 1.0056887969487363, "grad_norm": 1.2096521854400635, "learning_rate": 3.96773282820779e-06, "loss": 0.5523721218109131, "memory(GiB)": 76.04, "step": 3890, "token_acc": 0.8321216960121024, "train_speed(iter/s)": 0.027693 }, { "epoch": 1.0069817053461763, "grad_norm": 0.9291108846664429, "learning_rate": 3.9648466056067705e-06, "loss": 0.5470512390136719, "memory(GiB)": 76.04, "step": 3895, "token_acc": 0.8546372106154715, "train_speed(iter/s)": 0.027694 }, { "epoch": 1.0082746137436163, "grad_norm": 1.7991231679916382, "learning_rate": 3.961957406761012e-06, "loss": 0.5519303321838379, "memory(GiB)": 76.04, "step": 3900, "token_acc": 0.8346437931856088, "train_speed(iter/s)": 0.027695 }, { "epoch": 1.0095675221410563, "grad_norm": 0.9921211004257202, "learning_rate": 3.9590652375407305e-06, "loss": 0.5495428562164306, "memory(GiB)": 76.04, "step": 3905, "token_acc": 0.8289614561027837, "train_speed(iter/s)": 0.027694 }, { "epoch": 1.0108604305384963, "grad_norm": 0.9098636507987976, "learning_rate": 3.956170103822174e-06, "loss": 0.5806960105895996, "memory(GiB)": 76.04, "step": 3910, "token_acc": 0.8316886778453777, "train_speed(iter/s)": 0.027694 }, { "epoch": 1.0121533389359363, "grad_norm": 1.403108835220337, "learning_rate": 3.953272011487615e-06, "loss": 0.5835510730743408, "memory(GiB)": 76.04, "step": 3915, "token_acc": 0.8022167487684729, "train_speed(iter/s)": 0.027695 }, { "epoch": 1.0134462473333765, "grad_norm": 0.8766242861747742, "learning_rate": 3.950370966425336e-06, "loss": 0.5739788055419922, "memory(GiB)": 76.04, "step": 3920, "token_acc": 0.8167596743207391, "train_speed(iter/s)": 0.027695 }, { "epoch": 1.0147391557308165, "grad_norm": 1.0786027908325195, "learning_rate": 3.947466974529622e-06, "loss": 0.57960524559021, "memory(GiB)": 76.04, "step": 3925, "token_acc": 0.806325589127634, "train_speed(iter/s)": 0.027695 }, { "epoch": 1.0160320641282565, "grad_norm": 1.3315753936767578, "learning_rate": 3.9445600417007416e-06, "loss": 0.5844710826873779, "memory(GiB)": 76.04, "step": 3930, "token_acc": 0.8209227957971676, "train_speed(iter/s)": 0.027695 }, { "epoch": 1.0173249725256965, "grad_norm": 1.4551076889038086, "learning_rate": 3.941650173844939e-06, "loss": 0.5371768951416016, "memory(GiB)": 76.04, "step": 3935, "token_acc": 0.8264791248046439, "train_speed(iter/s)": 0.027695 }, { "epoch": 1.0186178809231365, "grad_norm": 1.0872153043746948, "learning_rate": 3.938737376874425e-06, "loss": 0.5733814239501953, "memory(GiB)": 76.04, "step": 3940, "token_acc": 0.8316657328103738, "train_speed(iter/s)": 0.027697 }, { "epoch": 1.0199107893205765, "grad_norm": 0.9862046241760254, "learning_rate": 3.935821656707359e-06, "loss": 0.5849019050598144, "memory(GiB)": 76.04, "step": 3945, "token_acc": 0.8450649147505084, "train_speed(iter/s)": 0.027697 }, { "epoch": 1.0212036977180168, "grad_norm": 1.2747979164123535, "learning_rate": 3.93290301926784e-06, "loss": 0.5715857982635498, "memory(GiB)": 76.04, "step": 3950, "token_acc": 0.8034803940358005, "train_speed(iter/s)": 0.027699 }, { "epoch": 1.0224966061154568, "grad_norm": 0.8877357840538025, "learning_rate": 3.929981470485897e-06, "loss": 0.5560395240783691, "memory(GiB)": 76.04, "step": 3955, "token_acc": 0.8330184222957014, "train_speed(iter/s)": 0.027699 }, { "epoch": 1.0237895145128968, "grad_norm": 3.0129168033599854, "learning_rate": 3.927057016297466e-06, "loss": 0.5378780364990234, "memory(GiB)": 76.04, "step": 3960, "token_acc": 0.8276883389862896, "train_speed(iter/s)": 0.027697 }, { "epoch": 1.0250824229103368, "grad_norm": 1.279038429260254, "learning_rate": 3.924129662644398e-06, "loss": 0.5460095405578613, "memory(GiB)": 76.04, "step": 3965, "token_acc": 0.8445347544377564, "train_speed(iter/s)": 0.027698 }, { "epoch": 1.0263753313077768, "grad_norm": 1.0856847763061523, "learning_rate": 3.921199415474426e-06, "loss": 0.5677762985229492, "memory(GiB)": 76.04, "step": 3970, "token_acc": 0.8344854941069809, "train_speed(iter/s)": 0.027698 }, { "epoch": 1.0276682397052168, "grad_norm": 1.0214816331863403, "learning_rate": 3.918266280741166e-06, "loss": 0.5525214195251464, "memory(GiB)": 76.04, "step": 3975, "token_acc": 0.8022044260742272, "train_speed(iter/s)": 0.027699 }, { "epoch": 1.0289611481026568, "grad_norm": 2.20583176612854, "learning_rate": 3.915330264404098e-06, "loss": 0.5635844230651855, "memory(GiB)": 76.04, "step": 3980, "token_acc": 0.8163771712158809, "train_speed(iter/s)": 0.0277 }, { "epoch": 1.030254056500097, "grad_norm": 1.1798099279403687, "learning_rate": 3.912391372428561e-06, "loss": 0.563462209701538, "memory(GiB)": 76.04, "step": 3985, "token_acc": 0.8396355353075171, "train_speed(iter/s)": 0.027699 }, { "epoch": 1.031546964897537, "grad_norm": 0.9924677014350891, "learning_rate": 3.9094496107857336e-06, "loss": 0.5675541400909424, "memory(GiB)": 76.04, "step": 3990, "token_acc": 0.8223220012828736, "train_speed(iter/s)": 0.027699 }, { "epoch": 1.032839873294977, "grad_norm": 1.259884238243103, "learning_rate": 3.906504985452626e-06, "loss": 0.5578344345092774, "memory(GiB)": 76.04, "step": 3995, "token_acc": 0.8310387984981227, "train_speed(iter/s)": 0.027698 }, { "epoch": 1.034132781692417, "grad_norm": 1.3022695779800415, "learning_rate": 3.903557502412065e-06, "loss": 0.5636180877685547, "memory(GiB)": 76.04, "step": 4000, "token_acc": 0.8399380474257637, "train_speed(iter/s)": 0.027698 }, { "epoch": 1.035425690089857, "grad_norm": 1.0616282224655151, "learning_rate": 3.900607167652687e-06, "loss": 0.5414395809173584, "memory(GiB)": 76.04, "step": 4005, "token_acc": 0.8558815464765561, "train_speed(iter/s)": 0.027663 }, { "epoch": 1.036718598487297, "grad_norm": 1.1525382995605469, "learning_rate": 3.897653987168919e-06, "loss": 0.5726981163024902, "memory(GiB)": 76.04, "step": 4010, "token_acc": 0.8494477021682804, "train_speed(iter/s)": 0.027663 }, { "epoch": 1.0380115068847373, "grad_norm": 1.1002613306045532, "learning_rate": 3.894697966960972e-06, "loss": 0.5688316345214843, "memory(GiB)": 76.04, "step": 4015, "token_acc": 0.8253162219554981, "train_speed(iter/s)": 0.027664 }, { "epoch": 1.0393044152821773, "grad_norm": 0.9993764758110046, "learning_rate": 3.891739113034826e-06, "loss": 0.5663973331451416, "memory(GiB)": 76.04, "step": 4020, "token_acc": 0.847761685319289, "train_speed(iter/s)": 0.027665 }, { "epoch": 1.0405973236796173, "grad_norm": 1.1270966529846191, "learning_rate": 3.888777431402219e-06, "loss": 0.5679460525512695, "memory(GiB)": 76.04, "step": 4025, "token_acc": 0.8138078016016533, "train_speed(iter/s)": 0.027665 }, { "epoch": 1.0418902320770573, "grad_norm": 1.4055637121200562, "learning_rate": 3.885812928080634e-06, "loss": 0.5653609275817871, "memory(GiB)": 76.04, "step": 4030, "token_acc": 0.8330082979618371, "train_speed(iter/s)": 0.027665 }, { "epoch": 1.0431831404744973, "grad_norm": 1.0064263343811035, "learning_rate": 3.8828456090932855e-06, "loss": 0.5649868011474609, "memory(GiB)": 76.04, "step": 4035, "token_acc": 0.8298977309044423, "train_speed(iter/s)": 0.027665 }, { "epoch": 1.0444760488719373, "grad_norm": 1.3120592832565308, "learning_rate": 3.879875480469112e-06, "loss": 0.558688735961914, "memory(GiB)": 76.04, "step": 4040, "token_acc": 0.8430891302155129, "train_speed(iter/s)": 0.027664 }, { "epoch": 1.0457689572693774, "grad_norm": 1.1473355293273926, "learning_rate": 3.876902548242758e-06, "loss": 0.5573469161987304, "memory(GiB)": 76.04, "step": 4045, "token_acc": 0.8069763883930848, "train_speed(iter/s)": 0.027665 }, { "epoch": 1.0470618656668176, "grad_norm": 1.2132424116134644, "learning_rate": 3.873926818454565e-06, "loss": 0.6102540016174316, "memory(GiB)": 76.04, "step": 4050, "token_acc": 0.8502197995428169, "train_speed(iter/s)": 0.027665 }, { "epoch": 1.0483547740642576, "grad_norm": 6.2116618156433105, "learning_rate": 3.87094829715056e-06, "loss": 0.548386812210083, "memory(GiB)": 76.04, "step": 4055, "token_acc": 0.8210007451137732, "train_speed(iter/s)": 0.027665 }, { "epoch": 1.0496476824616976, "grad_norm": 1.0829912424087524, "learning_rate": 3.867966990382438e-06, "loss": 0.5702716827392578, "memory(GiB)": 76.04, "step": 4060, "token_acc": 0.7936434822662367, "train_speed(iter/s)": 0.027665 }, { "epoch": 1.0509405908591376, "grad_norm": 1.3445277214050293, "learning_rate": 3.864982904207557e-06, "loss": 0.5754476547241211, "memory(GiB)": 76.04, "step": 4065, "token_acc": 0.824864653316809, "train_speed(iter/s)": 0.027663 }, { "epoch": 1.0522334992565776, "grad_norm": 4.769752025604248, "learning_rate": 3.861996044688922e-06, "loss": 0.5743865013122559, "memory(GiB)": 76.04, "step": 4070, "token_acc": 0.8401046687784052, "train_speed(iter/s)": 0.027664 }, { "epoch": 1.0535264076540176, "grad_norm": 3.4262094497680664, "learning_rate": 3.8590064178951695e-06, "loss": 0.5537999153137207, "memory(GiB)": 76.04, "step": 4075, "token_acc": 0.852727935517842, "train_speed(iter/s)": 0.027663 }, { "epoch": 1.0548193160514578, "grad_norm": 1.150668978691101, "learning_rate": 3.856014029900563e-06, "loss": 0.541869068145752, "memory(GiB)": 76.04, "step": 4080, "token_acc": 0.8340331114524663, "train_speed(iter/s)": 0.027661 }, { "epoch": 1.0561122244488979, "grad_norm": 1.7679377794265747, "learning_rate": 3.853018886784973e-06, "loss": 0.5608885765075684, "memory(GiB)": 76.04, "step": 4085, "token_acc": 0.8438836612489307, "train_speed(iter/s)": 0.027661 }, { "epoch": 1.0574051328463379, "grad_norm": 3.0141847133636475, "learning_rate": 3.850020994633869e-06, "loss": 0.5597274303436279, "memory(GiB)": 76.04, "step": 4090, "token_acc": 0.8566967231141412, "train_speed(iter/s)": 0.02766 }, { "epoch": 1.0586980412437779, "grad_norm": 1.5420576333999634, "learning_rate": 3.8470203595383034e-06, "loss": 0.5814280986785889, "memory(GiB)": 76.04, "step": 4095, "token_acc": 0.8152106326752682, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.0599909496412179, "grad_norm": 3.600728988647461, "learning_rate": 3.8440169875949075e-06, "loss": 0.55950927734375, "memory(GiB)": 76.04, "step": 4100, "token_acc": 0.8275422378068218, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.0612838580386579, "grad_norm": 1.3479124307632446, "learning_rate": 3.841010884905868e-06, "loss": 0.5699577331542969, "memory(GiB)": 76.04, "step": 4105, "token_acc": 0.8080579942442898, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.062576766436098, "grad_norm": 0.989827036857605, "learning_rate": 3.838002057578921e-06, "loss": 0.5578522682189941, "memory(GiB)": 76.04, "step": 4110, "token_acc": 0.8228656838896867, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.0638696748335381, "grad_norm": 1.1100108623504639, "learning_rate": 3.834990511727341e-06, "loss": 0.5745999813079834, "memory(GiB)": 76.04, "step": 4115, "token_acc": 0.8142663088493522, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.0651625832309781, "grad_norm": 0.9445801973342896, "learning_rate": 3.831976253469921e-06, "loss": 0.5575265884399414, "memory(GiB)": 76.04, "step": 4120, "token_acc": 0.813193334855056, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.0664554916284181, "grad_norm": 1.1623046398162842, "learning_rate": 3.828959288930971e-06, "loss": 0.5857258796691894, "memory(GiB)": 76.04, "step": 4125, "token_acc": 0.8250850433446725, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.0677484000258581, "grad_norm": 1.0290391445159912, "learning_rate": 3.825939624240294e-06, "loss": 0.5558303833007813, "memory(GiB)": 76.04, "step": 4130, "token_acc": 0.8497815003641661, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.0690413084232981, "grad_norm": 1.3947314023971558, "learning_rate": 3.822917265533184e-06, "loss": 0.5638031959533691, "memory(GiB)": 76.04, "step": 4135, "token_acc": 0.8299896824486989, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.0703342168207381, "grad_norm": 1.2008622884750366, "learning_rate": 3.819892218950403e-06, "loss": 0.5699079513549805, "memory(GiB)": 76.04, "step": 4140, "token_acc": 0.8410833741230217, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.0716271252181784, "grad_norm": 1.117075800895691, "learning_rate": 3.816864490638181e-06, "loss": 0.546845531463623, "memory(GiB)": 76.04, "step": 4145, "token_acc": 0.8247305985692294, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.0729200336156184, "grad_norm": 1.193162202835083, "learning_rate": 3.8138340867481914e-06, "loss": 0.540710735321045, "memory(GiB)": 76.04, "step": 4150, "token_acc": 0.8468981429794202, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.0742129420130584, "grad_norm": 1.0668905973434448, "learning_rate": 3.810801013437546e-06, "loss": 0.5506375312805176, "memory(GiB)": 76.04, "step": 4155, "token_acc": 0.8278915329275042, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.0755058504104984, "grad_norm": 1.1097651720046997, "learning_rate": 3.807765276868779e-06, "loss": 0.5460940361022949, "memory(GiB)": 76.04, "step": 4160, "token_acc": 0.8142139418044798, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.0767987588079384, "grad_norm": 1.4875537157058716, "learning_rate": 3.8047268832098376e-06, "loss": 0.5787097454071045, "memory(GiB)": 76.04, "step": 4165, "token_acc": 0.8459223372238127, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.0780916672053784, "grad_norm": 1.1992076635360718, "learning_rate": 3.801685838634066e-06, "loss": 0.5527867794036865, "memory(GiB)": 76.04, "step": 4170, "token_acc": 0.8586858373272209, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.0793845756028186, "grad_norm": 1.046899676322937, "learning_rate": 3.7986421493201952e-06, "loss": 0.5584450721740722, "memory(GiB)": 76.04, "step": 4175, "token_acc": 0.813550135501355, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.0806774840002586, "grad_norm": 1.144406795501709, "learning_rate": 3.7955958214523297e-06, "loss": 0.5506217002868652, "memory(GiB)": 76.04, "step": 4180, "token_acc": 0.8326120340639397, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.0819703923976987, "grad_norm": 1.1309776306152344, "learning_rate": 3.7925468612199344e-06, "loss": 0.5434449195861817, "memory(GiB)": 76.04, "step": 4185, "token_acc": 0.8455437400857764, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.0832633007951387, "grad_norm": 1.772002100944519, "learning_rate": 3.7894952748178238e-06, "loss": 0.5281466484069824, "memory(GiB)": 76.04, "step": 4190, "token_acc": 0.8599103788530303, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.0845562091925787, "grad_norm": 1.2275793552398682, "learning_rate": 3.786441068446146e-06, "loss": 0.5290435791015625, "memory(GiB)": 76.04, "step": 4195, "token_acc": 0.8503279666070364, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.0858491175900187, "grad_norm": 1.002516746520996, "learning_rate": 3.7833842483103754e-06, "loss": 0.553908109664917, "memory(GiB)": 76.04, "step": 4200, "token_acc": 0.83946196437169, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.087142025987459, "grad_norm": 0.9520712494850159, "learning_rate": 3.7803248206212943e-06, "loss": 0.5496163368225098, "memory(GiB)": 76.04, "step": 4205, "token_acc": 0.8473314975085234, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.088434934384899, "grad_norm": 1.1912646293640137, "learning_rate": 3.7772627915949844e-06, "loss": 0.5416050910949707, "memory(GiB)": 76.04, "step": 4210, "token_acc": 0.862577306575792, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.089727842782339, "grad_norm": 0.875792920589447, "learning_rate": 3.7741981674528116e-06, "loss": 0.5520293235778808, "memory(GiB)": 76.04, "step": 4215, "token_acc": 0.8360961569212728, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.091020751179779, "grad_norm": 1.6698824167251587, "learning_rate": 3.7711309544214163e-06, "loss": 0.5539298534393311, "memory(GiB)": 76.04, "step": 4220, "token_acc": 0.8197564955441194, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.092313659577219, "grad_norm": 1.9617213010787964, "learning_rate": 3.768061158732697e-06, "loss": 0.543891191482544, "memory(GiB)": 76.04, "step": 4225, "token_acc": 0.8655219780219781, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.093606567974659, "grad_norm": 1.0107512474060059, "learning_rate": 3.764988786623801e-06, "loss": 0.5563596725463867, "memory(GiB)": 76.04, "step": 4230, "token_acc": 0.8210617141917989, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.094899476372099, "grad_norm": 1.1788944005966187, "learning_rate": 3.76191384433711e-06, "loss": 0.5319845676422119, "memory(GiB)": 76.04, "step": 4235, "token_acc": 0.8385356134816536, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.0961923847695392, "grad_norm": 1.3050668239593506, "learning_rate": 3.7588363381202264e-06, "loss": 0.5554252624511719, "memory(GiB)": 76.04, "step": 4240, "token_acc": 0.8417724746315843, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.0974852931669792, "grad_norm": 1.0011587142944336, "learning_rate": 3.7557562742259635e-06, "loss": 0.5328820705413818, "memory(GiB)": 76.04, "step": 4245, "token_acc": 0.8045835662381219, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.0987782015644192, "grad_norm": 0.953105092048645, "learning_rate": 3.752673658912331e-06, "loss": 0.5456388473510743, "memory(GiB)": 76.04, "step": 4250, "token_acc": 0.8526051825020897, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1000711099618592, "grad_norm": 1.271561622619629, "learning_rate": 3.7495884984425235e-06, "loss": 0.5330571174621582, "memory(GiB)": 76.04, "step": 4255, "token_acc": 0.8340923877683799, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1013640183592992, "grad_norm": 1.6461479663848877, "learning_rate": 3.746500799084904e-06, "loss": 0.5901468753814697, "memory(GiB)": 76.04, "step": 4260, "token_acc": 0.8404071670932793, "train_speed(iter/s)": 0.02766 }, { "epoch": 1.1026569267567392, "grad_norm": 1.140838384628296, "learning_rate": 3.7434105671129962e-06, "loss": 0.5382442474365234, "memory(GiB)": 76.04, "step": 4265, "token_acc": 0.8247861227962376, "train_speed(iter/s)": 0.02766 }, { "epoch": 1.1039498351541792, "grad_norm": 1.1467549800872803, "learning_rate": 3.7403178088054676e-06, "loss": 0.5643450260162354, "memory(GiB)": 76.04, "step": 4270, "token_acc": 0.8297176451105407, "train_speed(iter/s)": 0.02766 }, { "epoch": 1.1052427435516194, "grad_norm": 1.270693302154541, "learning_rate": 3.737222530446122e-06, "loss": 0.5628186225891113, "memory(GiB)": 76.04, "step": 4275, "token_acc": 0.801499403646277, "train_speed(iter/s)": 0.02766 }, { "epoch": 1.1065356519490595, "grad_norm": 1.3963029384613037, "learning_rate": 3.7341247383238793e-06, "loss": 0.5608326911926269, "memory(GiB)": 76.04, "step": 4280, "token_acc": 0.8412363787523383, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1078285603464995, "grad_norm": 1.8522865772247314, "learning_rate": 3.731024438732771e-06, "loss": 0.5282313346862793, "memory(GiB)": 76.04, "step": 4285, "token_acc": 0.8429379193156183, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1091214687439395, "grad_norm": 1.071329951286316, "learning_rate": 3.7279216379719194e-06, "loss": 0.5438883781433106, "memory(GiB)": 76.04, "step": 4290, "token_acc": 0.8636459342232703, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1104143771413795, "grad_norm": 1.016403317451477, "learning_rate": 3.7248163423455307e-06, "loss": 0.5469881057739258, "memory(GiB)": 76.04, "step": 4295, "token_acc": 0.8709039687639005, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1117072855388195, "grad_norm": 1.8317582607269287, "learning_rate": 3.721708558162881e-06, "loss": 0.5621847152709961, "memory(GiB)": 76.04, "step": 4300, "token_acc": 0.8440362706347361, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1130001939362597, "grad_norm": 0.949685275554657, "learning_rate": 3.7185982917382986e-06, "loss": 0.5375046730041504, "memory(GiB)": 76.04, "step": 4305, "token_acc": 0.8541533400347254, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1142931023336997, "grad_norm": 1.1784414052963257, "learning_rate": 3.7154855493911596e-06, "loss": 0.5650627136230468, "memory(GiB)": 76.04, "step": 4310, "token_acc": 0.837442021839962, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1155860107311397, "grad_norm": 1.3990085124969482, "learning_rate": 3.7123703374458685e-06, "loss": 0.5586078643798829, "memory(GiB)": 76.04, "step": 4315, "token_acc": 0.8209936463113308, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1168789191285797, "grad_norm": 1.3244526386260986, "learning_rate": 3.709252662231849e-06, "loss": 0.5645613670349121, "memory(GiB)": 76.04, "step": 4320, "token_acc": 0.8429337789112655, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1181718275260197, "grad_norm": 0.8955655097961426, "learning_rate": 3.706132530083527e-06, "loss": 0.5438594818115234, "memory(GiB)": 76.04, "step": 4325, "token_acc": 0.8423168980373384, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1194647359234597, "grad_norm": 1.5076687335968018, "learning_rate": 3.703009947340322e-06, "loss": 0.5539616584777832, "memory(GiB)": 76.04, "step": 4330, "token_acc": 0.831388096935139, "train_speed(iter/s)": 0.02766 }, { "epoch": 1.1207576443209, "grad_norm": 1.3437058925628662, "learning_rate": 3.6998849203466324e-06, "loss": 0.5941734313964844, "memory(GiB)": 76.04, "step": 4335, "token_acc": 0.8199184374329255, "train_speed(iter/s)": 0.027661 }, { "epoch": 1.12205055271834, "grad_norm": 1.2628297805786133, "learning_rate": 3.6967574554518237e-06, "loss": 0.5422052383422852, "memory(GiB)": 76.04, "step": 4340, "token_acc": 0.8556654985226414, "train_speed(iter/s)": 0.02766 }, { "epoch": 1.12334346111578, "grad_norm": 1.053979516029358, "learning_rate": 3.6936275590102133e-06, "loss": 0.5253170967102051, "memory(GiB)": 76.04, "step": 4345, "token_acc": 0.839918890776566, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.12463636951322, "grad_norm": 1.1738144159317017, "learning_rate": 3.6904952373810586e-06, "loss": 0.5661196231842041, "memory(GiB)": 76.04, "step": 4350, "token_acc": 0.8402832743178504, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.12592927791066, "grad_norm": 1.5563236474990845, "learning_rate": 3.6873604969285466e-06, "loss": 0.5621729850769043, "memory(GiB)": 76.04, "step": 4355, "token_acc": 0.8411411300726107, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1272221863081, "grad_norm": 0.9622613787651062, "learning_rate": 3.6842233440217757e-06, "loss": 0.554353904724121, "memory(GiB)": 76.04, "step": 4360, "token_acc": 0.845444059976932, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.12851509470554, "grad_norm": 1.0929608345031738, "learning_rate": 3.68108378503475e-06, "loss": 0.5726329803466796, "memory(GiB)": 76.04, "step": 4365, "token_acc": 0.8360975096088032, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1298080031029802, "grad_norm": 1.5161771774291992, "learning_rate": 3.677941826346358e-06, "loss": 0.5386641502380372, "memory(GiB)": 76.04, "step": 4370, "token_acc": 0.8529266398361929, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1311009115004202, "grad_norm": 1.0926306247711182, "learning_rate": 3.674797474340367e-06, "loss": 0.567785120010376, "memory(GiB)": 76.04, "step": 4375, "token_acc": 0.8242666666666667, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1323938198978603, "grad_norm": 1.2848386764526367, "learning_rate": 3.6716507354054044e-06, "loss": 0.5367423534393311, "memory(GiB)": 76.04, "step": 4380, "token_acc": 0.8228389830508475, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1336867282953003, "grad_norm": 1.2967411279678345, "learning_rate": 3.6685016159349483e-06, "loss": 0.5374815940856934, "memory(GiB)": 76.04, "step": 4385, "token_acc": 0.8508662193411426, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1349796366927403, "grad_norm": 0.8757297396659851, "learning_rate": 3.665350122327316e-06, "loss": 0.5277114391326905, "memory(GiB)": 76.04, "step": 4390, "token_acc": 0.8365678150894025, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1362725450901803, "grad_norm": 1.1371816396713257, "learning_rate": 3.662196260985646e-06, "loss": 0.5421219825744629, "memory(GiB)": 76.04, "step": 4395, "token_acc": 0.8501142154278686, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1375654534876203, "grad_norm": 1.695295810699463, "learning_rate": 3.6590400383178866e-06, "loss": 0.5642148971557617, "memory(GiB)": 76.04, "step": 4400, "token_acc": 0.8535144713526285, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1388583618850605, "grad_norm": 1.0313785076141357, "learning_rate": 3.6558814607367854e-06, "loss": 0.5805719375610352, "memory(GiB)": 76.04, "step": 4405, "token_acc": 0.8027235587834771, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1401512702825005, "grad_norm": 4.604849815368652, "learning_rate": 3.6527205346598754e-06, "loss": 0.5551558017730713, "memory(GiB)": 76.04, "step": 4410, "token_acc": 0.8510540083089706, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1414441786799405, "grad_norm": 0.8938732147216797, "learning_rate": 3.649557266509458e-06, "loss": 0.5434865951538086, "memory(GiB)": 76.04, "step": 4415, "token_acc": 0.8330908429571702, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1427370870773805, "grad_norm": 1.1276806592941284, "learning_rate": 3.646391662712598e-06, "loss": 0.5468146324157714, "memory(GiB)": 76.04, "step": 4420, "token_acc": 0.8307084785133566, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1440299954748205, "grad_norm": 0.9285451173782349, "learning_rate": 3.6432237297011016e-06, "loss": 0.5583270072937012, "memory(GiB)": 76.04, "step": 4425, "token_acc": 0.8463666452600899, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1453229038722608, "grad_norm": 1.3203446865081787, "learning_rate": 3.640053473911509e-06, "loss": 0.546565055847168, "memory(GiB)": 76.04, "step": 4430, "token_acc": 0.8101812275602667, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1466158122697008, "grad_norm": 1.0341274738311768, "learning_rate": 3.6368809017850796e-06, "loss": 0.5599943161010742, "memory(GiB)": 76.04, "step": 4435, "token_acc": 0.8432740304620504, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1479087206671408, "grad_norm": 1.6448158025741577, "learning_rate": 3.6337060197677803e-06, "loss": 0.5772030830383301, "memory(GiB)": 76.04, "step": 4440, "token_acc": 0.8330186134340437, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1492016290645808, "grad_norm": 1.2664222717285156, "learning_rate": 3.6305288343102686e-06, "loss": 0.5556002616882324, "memory(GiB)": 76.04, "step": 4445, "token_acc": 0.8380842848927955, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1504945374620208, "grad_norm": 1.121952772140503, "learning_rate": 3.6273493518678843e-06, "loss": 0.5274020671844483, "memory(GiB)": 76.04, "step": 4450, "token_acc": 0.8465437496040044, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1517874458594608, "grad_norm": 2.5075345039367676, "learning_rate": 3.624167578900633e-06, "loss": 0.5526081085205078, "memory(GiB)": 76.04, "step": 4455, "token_acc": 0.8559080459770115, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1530803542569008, "grad_norm": 1.5982297658920288, "learning_rate": 3.6209835218731753e-06, "loss": 0.5586674213409424, "memory(GiB)": 76.04, "step": 4460, "token_acc": 0.8431690299347288, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.154373262654341, "grad_norm": 0.9923174977302551, "learning_rate": 3.6177971872548116e-06, "loss": 0.5380115032196044, "memory(GiB)": 76.04, "step": 4465, "token_acc": 0.8524468348607622, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.155666171051781, "grad_norm": 1.332160234451294, "learning_rate": 3.6146085815194694e-06, "loss": 0.5499836444854737, "memory(GiB)": 76.04, "step": 4470, "token_acc": 0.8119310724416783, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.156959079449221, "grad_norm": 1.0057822465896606, "learning_rate": 3.6114177111456916e-06, "loss": 0.5390474319458007, "memory(GiB)": 76.04, "step": 4475, "token_acc": 0.8468805191604715, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.158251987846661, "grad_norm": 1.1006532907485962, "learning_rate": 3.608224582616622e-06, "loss": 0.5385686874389648, "memory(GiB)": 76.04, "step": 4480, "token_acc": 0.8392769471100201, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.159544896244101, "grad_norm": 3.348762035369873, "learning_rate": 3.6050292024199916e-06, "loss": 0.5231637001037598, "memory(GiB)": 76.04, "step": 4485, "token_acc": 0.8456874336819766, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.160837804641541, "grad_norm": 1.867474913597107, "learning_rate": 3.601831577048109e-06, "loss": 0.5361900329589844, "memory(GiB)": 76.04, "step": 4490, "token_acc": 0.848513334725994, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.162130713038981, "grad_norm": 1.6171462535858154, "learning_rate": 3.598631712997841e-06, "loss": 0.5521645545959473, "memory(GiB)": 76.04, "step": 4495, "token_acc": 0.8328944218338521, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1634236214364213, "grad_norm": 1.5987505912780762, "learning_rate": 3.5954296167706054e-06, "loss": 0.5655074119567871, "memory(GiB)": 76.04, "step": 4500, "token_acc": 0.8466187172830574, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1647165298338613, "grad_norm": 0.9721993803977966, "learning_rate": 3.5922252948723547e-06, "loss": 0.5404928684234619, "memory(GiB)": 76.04, "step": 4505, "token_acc": 0.8404960207292245, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.1660094382313013, "grad_norm": 1.1576437950134277, "learning_rate": 3.5890187538135616e-06, "loss": 0.5581830024719239, "memory(GiB)": 76.04, "step": 4510, "token_acc": 0.8301479321887485, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.1673023466287413, "grad_norm": 1.2888849973678589, "learning_rate": 3.5858100001092117e-06, "loss": 0.5397047996520996, "memory(GiB)": 76.04, "step": 4515, "token_acc": 0.8300223392372746, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.1685952550261813, "grad_norm": 1.0602697134017944, "learning_rate": 3.5825990402787815e-06, "loss": 0.5373691558837891, "memory(GiB)": 76.04, "step": 4520, "token_acc": 0.8421267268185757, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1698881634236213, "grad_norm": 1.1042633056640625, "learning_rate": 3.579385880846232e-06, "loss": 0.5380208015441894, "memory(GiB)": 76.04, "step": 4525, "token_acc": 0.8591904314733356, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1711810718210616, "grad_norm": 1.2820953130722046, "learning_rate": 3.576170528339996e-06, "loss": 0.5534794807434082, "memory(GiB)": 76.04, "step": 4530, "token_acc": 0.8538792049463793, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1724739802185016, "grad_norm": 1.0828604698181152, "learning_rate": 3.5729529892929577e-06, "loss": 0.5525107383728027, "memory(GiB)": 76.04, "step": 4535, "token_acc": 0.8491177281499862, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.1737668886159416, "grad_norm": 1.0604639053344727, "learning_rate": 3.569733270242446e-06, "loss": 0.5319010734558105, "memory(GiB)": 76.04, "step": 4540, "token_acc": 0.8434462262398613, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1750597970133816, "grad_norm": 1.1371124982833862, "learning_rate": 3.5665113777302184e-06, "loss": 0.5360076904296875, "memory(GiB)": 76.04, "step": 4545, "token_acc": 0.8431543594888123, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1763527054108216, "grad_norm": 1.3616864681243896, "learning_rate": 3.56328731830245e-06, "loss": 0.5336400508880615, "memory(GiB)": 76.04, "step": 4550, "token_acc": 0.8413301476636246, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1776456138082616, "grad_norm": 1.6478408575057983, "learning_rate": 3.5600610985097158e-06, "loss": 0.5487207412719727, "memory(GiB)": 76.04, "step": 4555, "token_acc": 0.8626700118843259, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1789385222057018, "grad_norm": 1.0522022247314453, "learning_rate": 3.5568327249069835e-06, "loss": 0.5672080993652344, "memory(GiB)": 76.04, "step": 4560, "token_acc": 0.8288274920616079, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1802314306031418, "grad_norm": 1.0040467977523804, "learning_rate": 3.553602204053593e-06, "loss": 0.5410587787628174, "memory(GiB)": 76.04, "step": 4565, "token_acc": 0.8245939675174014, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.1815243390005818, "grad_norm": 1.081748366355896, "learning_rate": 3.550369542513252e-06, "loss": 0.5334537982940674, "memory(GiB)": 76.04, "step": 4570, "token_acc": 0.8730332603067118, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1828172473980219, "grad_norm": 2.2112412452697754, "learning_rate": 3.5471347468540124e-06, "loss": 0.5522329330444335, "memory(GiB)": 76.04, "step": 4575, "token_acc": 0.8477234082750803, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1841101557954619, "grad_norm": 0.9928748607635498, "learning_rate": 3.5438978236482656e-06, "loss": 0.5604439735412597, "memory(GiB)": 76.04, "step": 4580, "token_acc": 0.7849382585192644, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1854030641929019, "grad_norm": 1.1116896867752075, "learning_rate": 3.540658779472723e-06, "loss": 0.5413738250732422, "memory(GiB)": 76.04, "step": 4585, "token_acc": 0.8287749204588575, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1866959725903419, "grad_norm": 1.1231666803359985, "learning_rate": 3.5374176209084087e-06, "loss": 0.5632248401641846, "memory(GiB)": 76.04, "step": 4590, "token_acc": 0.8709531013615733, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.187988880987782, "grad_norm": 1.3912053108215332, "learning_rate": 3.5341743545406403e-06, "loss": 0.5327963829040527, "memory(GiB)": 76.04, "step": 4595, "token_acc": 0.834390750074118, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.189281789385222, "grad_norm": 0.8718860149383545, "learning_rate": 3.530928986959019e-06, "loss": 0.5319995880126953, "memory(GiB)": 76.04, "step": 4600, "token_acc": 0.8448853130778072, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1905746977826621, "grad_norm": 1.055916666984558, "learning_rate": 3.5276815247574148e-06, "loss": 0.5589988708496094, "memory(GiB)": 76.04, "step": 4605, "token_acc": 0.8575937187283504, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1918676061801021, "grad_norm": 0.9413480758666992, "learning_rate": 3.5244319745339524e-06, "loss": 0.5528499126434326, "memory(GiB)": 76.04, "step": 4610, "token_acc": 0.8506312722563937, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.1931605145775421, "grad_norm": 1.0575560331344604, "learning_rate": 3.5211803428910015e-06, "loss": 0.513238525390625, "memory(GiB)": 76.04, "step": 4615, "token_acc": 0.8514719699342311, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1944534229749821, "grad_norm": 1.3240203857421875, "learning_rate": 3.5179266364351584e-06, "loss": 0.522664737701416, "memory(GiB)": 76.04, "step": 4620, "token_acc": 0.87151792998951, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.1957463313724221, "grad_norm": 1.1790480613708496, "learning_rate": 3.5146708617772362e-06, "loss": 0.5358052253723145, "memory(GiB)": 76.04, "step": 4625, "token_acc": 0.835999462293319, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.1970392397698624, "grad_norm": 0.9325648546218872, "learning_rate": 3.51141302553225e-06, "loss": 0.5524285316467286, "memory(GiB)": 76.04, "step": 4630, "token_acc": 0.805278226398473, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.1983321481673024, "grad_norm": 2.2591135501861572, "learning_rate": 3.508153134319404e-06, "loss": 0.5479226112365723, "memory(GiB)": 76.04, "step": 4635, "token_acc": 0.8172845227062094, "train_speed(iter/s)": 0.027654 }, { "epoch": 1.1996250565647424, "grad_norm": 0.9014770984649658, "learning_rate": 3.5048911947620774e-06, "loss": 0.5491894245147705, "memory(GiB)": 76.04, "step": 4640, "token_acc": 0.8255224825839139, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2009179649621824, "grad_norm": 0.9961432814598083, "learning_rate": 3.5016272134878084e-06, "loss": 0.5200064182281494, "memory(GiB)": 76.04, "step": 4645, "token_acc": 0.8566537085189094, "train_speed(iter/s)": 0.027654 }, { "epoch": 1.2022108733596224, "grad_norm": 0.9640424847602844, "learning_rate": 3.4983611971282882e-06, "loss": 0.5232643604278564, "memory(GiB)": 76.04, "step": 4650, "token_acc": 0.8389979490184588, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2035037817570626, "grad_norm": 14.804332733154297, "learning_rate": 3.49509315231934e-06, "loss": 0.5426907062530517, "memory(GiB)": 76.04, "step": 4655, "token_acc": 0.8573487661061368, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2047966901545026, "grad_norm": 1.2045928239822388, "learning_rate": 3.4918230857009083e-06, "loss": 0.5525260448455811, "memory(GiB)": 76.04, "step": 4660, "token_acc": 0.8139168327847573, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2060895985519426, "grad_norm": 6.34891414642334, "learning_rate": 3.488551003917048e-06, "loss": 0.5549496650695801, "memory(GiB)": 76.04, "step": 4665, "token_acc": 0.8387813757424794, "train_speed(iter/s)": 0.027654 }, { "epoch": 1.2073825069493827, "grad_norm": 2.3981356620788574, "learning_rate": 3.4852769136159047e-06, "loss": 0.536187744140625, "memory(GiB)": 76.04, "step": 4670, "token_acc": 0.8524989411266413, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2086754153468227, "grad_norm": 1.0487481355667114, "learning_rate": 3.482000821449707e-06, "loss": 0.5361638069152832, "memory(GiB)": 76.04, "step": 4675, "token_acc": 0.8482098061573546, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2099683237442627, "grad_norm": 1.1350358724594116, "learning_rate": 3.4787227340747514e-06, "loss": 0.5472620010375977, "memory(GiB)": 76.04, "step": 4680, "token_acc": 0.8297029702970297, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2112612321417027, "grad_norm": 1.4464890956878662, "learning_rate": 3.4754426581513866e-06, "loss": 0.5401841163635254, "memory(GiB)": 76.04, "step": 4685, "token_acc": 0.8511260213910848, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.212554140539143, "grad_norm": 1.922498106956482, "learning_rate": 3.4721606003440023e-06, "loss": 0.5158808708190918, "memory(GiB)": 76.04, "step": 4690, "token_acc": 0.8568056902683479, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.213847048936583, "grad_norm": 0.9439219236373901, "learning_rate": 3.4688765673210155e-06, "loss": 0.5801658630371094, "memory(GiB)": 76.04, "step": 4695, "token_acc": 0.8500309427215843, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.215139957334023, "grad_norm": 1.2153284549713135, "learning_rate": 3.465590565754856e-06, "loss": 0.5326606273651123, "memory(GiB)": 76.04, "step": 4700, "token_acc": 0.8437681640787179, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.216432865731463, "grad_norm": 1.3219401836395264, "learning_rate": 3.462302602321953e-06, "loss": 0.5341041088104248, "memory(GiB)": 76.04, "step": 4705, "token_acc": 0.8587982960469481, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.217725774128903, "grad_norm": 0.9187490940093994, "learning_rate": 3.4590126837027216e-06, "loss": 0.5361604690551758, "memory(GiB)": 76.04, "step": 4710, "token_acc": 0.8289933797317942, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.219018682526343, "grad_norm": 1.189947247505188, "learning_rate": 3.4557208165815503e-06, "loss": 0.5369776725769043, "memory(GiB)": 76.04, "step": 4715, "token_acc": 0.784606727522821, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.220311590923783, "grad_norm": 0.8782621026039124, "learning_rate": 3.4524270076467846e-06, "loss": 0.5394928455352783, "memory(GiB)": 76.04, "step": 4720, "token_acc": 0.8381009137862535, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2216044993212232, "grad_norm": 1.1077133417129517, "learning_rate": 3.449131263590718e-06, "loss": 0.5199668884277344, "memory(GiB)": 76.04, "step": 4725, "token_acc": 0.8437195256220705, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2228974077186632, "grad_norm": 2.1963064670562744, "learning_rate": 3.445833591109574e-06, "loss": 0.533887529373169, "memory(GiB)": 76.04, "step": 4730, "token_acc": 0.8215962441314554, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2241903161161032, "grad_norm": 1.0866763591766357, "learning_rate": 3.4425339969034955e-06, "loss": 0.5230364322662353, "memory(GiB)": 76.04, "step": 4735, "token_acc": 0.8641819515774027, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2254832245135432, "grad_norm": 1.1297239065170288, "learning_rate": 3.439232487676527e-06, "loss": 0.5545130729675293, "memory(GiB)": 76.04, "step": 4740, "token_acc": 0.8013548084891723, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2267761329109832, "grad_norm": 1.2349060773849487, "learning_rate": 3.435929070136609e-06, "loss": 0.5242255210876465, "memory(GiB)": 76.04, "step": 4745, "token_acc": 0.8695360580716427, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2280690413084232, "grad_norm": 0.9875677227973938, "learning_rate": 3.4326237509955533e-06, "loss": 0.5407393932342529, "memory(GiB)": 76.04, "step": 4750, "token_acc": 0.8353607552258935, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2293619497058634, "grad_norm": 1.4724373817443848, "learning_rate": 3.4293165369690406e-06, "loss": 0.5200931549072265, "memory(GiB)": 76.04, "step": 4755, "token_acc": 0.8424033399891088, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2306548581033034, "grad_norm": 0.8519977331161499, "learning_rate": 3.4260074347765975e-06, "loss": 0.5357259750366211, "memory(GiB)": 76.04, "step": 4760, "token_acc": 0.8267593859249126, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2319477665007434, "grad_norm": 0.9893440008163452, "learning_rate": 3.42269645114159e-06, "loss": 0.5508286952972412, "memory(GiB)": 76.04, "step": 4765, "token_acc": 0.8041002277904328, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2332406748981835, "grad_norm": 1.4743452072143555, "learning_rate": 3.419383592791205e-06, "loss": 0.5639371871948242, "memory(GiB)": 76.04, "step": 4770, "token_acc": 0.8497330282227308, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2345335832956235, "grad_norm": 1.2142781019210815, "learning_rate": 3.4160688664564382e-06, "loss": 0.5326876640319824, "memory(GiB)": 76.04, "step": 4775, "token_acc": 0.8382480707313333, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2358264916930635, "grad_norm": 8.9053955078125, "learning_rate": 3.4127522788720836e-06, "loss": 0.5383922100067139, "memory(GiB)": 76.04, "step": 4780, "token_acc": 0.8079113088728835, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2371194000905037, "grad_norm": 1.0299571752548218, "learning_rate": 3.4094338367767117e-06, "loss": 0.5383823394775391, "memory(GiB)": 76.04, "step": 4785, "token_acc": 0.8180522825669974, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2384123084879437, "grad_norm": 2.690765619277954, "learning_rate": 3.4061135469126654e-06, "loss": 0.5509030818939209, "memory(GiB)": 76.04, "step": 4790, "token_acc": 0.8323850658249927, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2397052168853837, "grad_norm": 1.1407406330108643, "learning_rate": 3.40279141602604e-06, "loss": 0.5402188777923584, "memory(GiB)": 76.04, "step": 4795, "token_acc": 0.8609082248332804, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2409981252828237, "grad_norm": 0.979908287525177, "learning_rate": 3.3994674508666715e-06, "loss": 0.5451946258544922, "memory(GiB)": 76.04, "step": 4800, "token_acc": 0.8271758253130498, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2422910336802637, "grad_norm": 1.4777721166610718, "learning_rate": 3.3961416581881236e-06, "loss": 0.566465187072754, "memory(GiB)": 76.04, "step": 4805, "token_acc": 0.8578219364893824, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2435839420777037, "grad_norm": 1.3482059240341187, "learning_rate": 3.3928140447476722e-06, "loss": 0.5285268783569336, "memory(GiB)": 76.04, "step": 4810, "token_acc": 0.8426216288863005, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2448768504751437, "grad_norm": 1.7287280559539795, "learning_rate": 3.3894846173062917e-06, "loss": 0.5343065738677979, "memory(GiB)": 76.04, "step": 4815, "token_acc": 0.8432345137847502, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.246169758872584, "grad_norm": 2.6334922313690186, "learning_rate": 3.386153382628644e-06, "loss": 0.5239715576171875, "memory(GiB)": 76.04, "step": 4820, "token_acc": 0.8302545572652349, "train_speed(iter/s)": 0.027659 }, { "epoch": 1.247462667270024, "grad_norm": 1.018120527267456, "learning_rate": 3.3828203474830623e-06, "loss": 0.5379975318908692, "memory(GiB)": 76.04, "step": 4825, "token_acc": 0.8100699300699301, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.248755575667464, "grad_norm": 1.9307032823562622, "learning_rate": 3.3794855186415374e-06, "loss": 0.5401200771331787, "memory(GiB)": 76.04, "step": 4830, "token_acc": 0.8570304677442426, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.250048484064904, "grad_norm": 0.8735835552215576, "learning_rate": 3.3761489028797063e-06, "loss": 0.5682656288146972, "memory(GiB)": 76.04, "step": 4835, "token_acc": 0.8328065512535019, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.251341392462344, "grad_norm": 0.9113880395889282, "learning_rate": 3.372810506976833e-06, "loss": 0.519595718383789, "memory(GiB)": 76.04, "step": 4840, "token_acc": 0.8493732447427906, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2526343008597842, "grad_norm": 1.123384952545166, "learning_rate": 3.369470337715802e-06, "loss": 0.5394314765930176, "memory(GiB)": 76.04, "step": 4845, "token_acc": 0.8533837894922116, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.253927209257224, "grad_norm": 2.7480571269989014, "learning_rate": 3.3661284018830986e-06, "loss": 0.5219066619873047, "memory(GiB)": 76.04, "step": 4850, "token_acc": 0.8398635428686099, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2552201176546642, "grad_norm": 1.892354130744934, "learning_rate": 3.3627847062687996e-06, "loss": 0.5399574756622314, "memory(GiB)": 76.04, "step": 4855, "token_acc": 0.8358763125833962, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2565130260521042, "grad_norm": 1.2802281379699707, "learning_rate": 3.359439257666554e-06, "loss": 0.5371671676635742, "memory(GiB)": 76.04, "step": 4860, "token_acc": 0.8427890861844954, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2578059344495442, "grad_norm": 0.9374321699142456, "learning_rate": 3.356092062873576e-06, "loss": 0.5454726219177246, "memory(GiB)": 76.04, "step": 4865, "token_acc": 0.8636550683553564, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2590988428469843, "grad_norm": 0.8962191939353943, "learning_rate": 3.3527431286906248e-06, "loss": 0.5191185951232911, "memory(GiB)": 76.04, "step": 4870, "token_acc": 0.8448458652748329, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2603917512444243, "grad_norm": 1.248224139213562, "learning_rate": 3.3493924619219964e-06, "loss": 0.5477604866027832, "memory(GiB)": 76.04, "step": 4875, "token_acc": 0.8294104944936299, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2616846596418645, "grad_norm": 0.9002890586853027, "learning_rate": 3.3460400693755047e-06, "loss": 0.5323681831359863, "memory(GiB)": 76.04, "step": 4880, "token_acc": 0.8577657555815738, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2629775680393043, "grad_norm": 1.059833288192749, "learning_rate": 3.3426859578624705e-06, "loss": 0.5649502754211426, "memory(GiB)": 76.04, "step": 4885, "token_acc": 0.7916213275299239, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2642704764367445, "grad_norm": 1.3469425439834595, "learning_rate": 3.339330134197708e-06, "loss": 0.5313740730285644, "memory(GiB)": 76.04, "step": 4890, "token_acc": 0.8403429238296153, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2655633848341845, "grad_norm": 1.0573686361312866, "learning_rate": 3.3359726051995097e-06, "loss": 0.5383338451385498, "memory(GiB)": 76.04, "step": 4895, "token_acc": 0.8299968790405279, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2668562932316245, "grad_norm": 1.5502042770385742, "learning_rate": 3.332613377689632e-06, "loss": 0.5520769119262695, "memory(GiB)": 76.04, "step": 4900, "token_acc": 0.8267734765697351, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2681492016290645, "grad_norm": 1.3487634658813477, "learning_rate": 3.3292524584932846e-06, "loss": 0.5057527542114257, "memory(GiB)": 76.04, "step": 4905, "token_acc": 0.8129584979223311, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2694421100265045, "grad_norm": 1.016811490058899, "learning_rate": 3.325889854439112e-06, "loss": 0.5458771228790283, "memory(GiB)": 76.04, "step": 4910, "token_acc": 0.8184902798291486, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2707350184239448, "grad_norm": 1.8867390155792236, "learning_rate": 3.322525572359183e-06, "loss": 0.5553860664367676, "memory(GiB)": 76.04, "step": 4915, "token_acc": 0.8386983751587614, "train_speed(iter/s)": 0.027654 }, { "epoch": 1.2720279268213848, "grad_norm": 1.2835884094238281, "learning_rate": 3.3191596190889762e-06, "loss": 0.5246952056884766, "memory(GiB)": 76.04, "step": 4920, "token_acc": 0.8337373292199207, "train_speed(iter/s)": 0.027654 }, { "epoch": 1.2733208352188248, "grad_norm": 1.0106348991394043, "learning_rate": 3.3157920014673646e-06, "loss": 0.5335243225097657, "memory(GiB)": 76.04, "step": 4925, "token_acc": 0.8471834913552705, "train_speed(iter/s)": 0.027654 }, { "epoch": 1.2746137436162648, "grad_norm": 1.2083282470703125, "learning_rate": 3.3124227263366036e-06, "loss": 0.557880973815918, "memory(GiB)": 76.04, "step": 4930, "token_acc": 0.8243945635852616, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2759066520137048, "grad_norm": 1.0607614517211914, "learning_rate": 3.3090518005423157e-06, "loss": 0.5547267436981201, "memory(GiB)": 76.04, "step": 4935, "token_acc": 0.8197203446674578, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2771995604111448, "grad_norm": 1.0059987306594849, "learning_rate": 3.305679230933478e-06, "loss": 0.5478557586669922, "memory(GiB)": 76.04, "step": 4940, "token_acc": 0.7879940655076654, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2784924688085848, "grad_norm": 0.9902651906013489, "learning_rate": 3.3023050243624066e-06, "loss": 0.5528521537780762, "memory(GiB)": 76.04, "step": 4945, "token_acc": 0.832831287809007, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.279785377206025, "grad_norm": 1.2956874370574951, "learning_rate": 3.298929187684744e-06, "loss": 0.5243937969207764, "memory(GiB)": 76.04, "step": 4950, "token_acc": 0.8375224024129038, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.281078285603465, "grad_norm": 0.8811606764793396, "learning_rate": 3.2955517277594453e-06, "loss": 0.5211551666259766, "memory(GiB)": 76.04, "step": 4955, "token_acc": 0.8267743146826887, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.282371194000905, "grad_norm": 0.9188509583473206, "learning_rate": 3.292172651448761e-06, "loss": 0.5098612785339356, "memory(GiB)": 76.04, "step": 4960, "token_acc": 0.8693684341651787, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.283664102398345, "grad_norm": 0.9744674563407898, "learning_rate": 3.2887919656182304e-06, "loss": 0.5251672744750977, "memory(GiB)": 76.04, "step": 4965, "token_acc": 0.8267029592406476, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.284957010795785, "grad_norm": 0.9584162831306458, "learning_rate": 3.2854096771366584e-06, "loss": 0.5332806587219239, "memory(GiB)": 76.04, "step": 4970, "token_acc": 0.8322600222529418, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2862499191932253, "grad_norm": 0.8617119789123535, "learning_rate": 3.28202579287611e-06, "loss": 0.5289664745330811, "memory(GiB)": 76.04, "step": 4975, "token_acc": 0.829112426035503, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.287542827590665, "grad_norm": 1.0189121961593628, "learning_rate": 3.278640319711889e-06, "loss": 0.5311687469482422, "memory(GiB)": 76.04, "step": 4980, "token_acc": 0.8320498040119898, "train_speed(iter/s)": 0.027658 }, { "epoch": 1.2888357359881053, "grad_norm": 0.9284194707870483, "learning_rate": 3.275253264522529e-06, "loss": 0.5279128074645996, "memory(GiB)": 76.04, "step": 4985, "token_acc": 0.8646108400841427, "train_speed(iter/s)": 0.027656 }, { "epoch": 1.2901286443855453, "grad_norm": 1.1427520513534546, "learning_rate": 3.2718646341897796e-06, "loss": 0.5510351181030273, "memory(GiB)": 76.04, "step": 4990, "token_acc": 0.8475239880886732, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2914215527829853, "grad_norm": 0.9549011588096619, "learning_rate": 3.268474435598587e-06, "loss": 0.5165416240692139, "memory(GiB)": 76.04, "step": 4995, "token_acc": 0.856951293364478, "train_speed(iter/s)": 0.027655 }, { "epoch": 1.2927144611804253, "grad_norm": 1.217895746231079, "learning_rate": 3.265082675637087e-06, "loss": 0.535146427154541, "memory(GiB)": 76.04, "step": 5000, "token_acc": 0.8132745913451641, "train_speed(iter/s)": 0.027657 }, { "epoch": 1.2940073695778653, "grad_norm": 0.9800029993057251, "learning_rate": 3.2616893611965865e-06, "loss": 0.5271368503570557, "memory(GiB)": 76.04, "step": 5005, "token_acc": 0.8222583265637693, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.2953002779753056, "grad_norm": 0.9374005198478699, "learning_rate": 3.258294499171552e-06, "loss": 0.5365757942199707, "memory(GiB)": 76.04, "step": 5010, "token_acc": 0.8411453966124434, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.2965931863727456, "grad_norm": 1.7360183000564575, "learning_rate": 3.254898096459591e-06, "loss": 0.5575047492980957, "memory(GiB)": 76.04, "step": 5015, "token_acc": 0.8461703497103625, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.2978860947701856, "grad_norm": 1.3810652494430542, "learning_rate": 3.251500159961446e-06, "loss": 0.5436039924621582, "memory(GiB)": 76.04, "step": 5020, "token_acc": 0.8282478766907833, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.2991790031676256, "grad_norm": 1.2262240648269653, "learning_rate": 3.2481006965809713e-06, "loss": 0.5245812892913818, "memory(GiB)": 76.04, "step": 5025, "token_acc": 0.8353918706490007, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3004719115650656, "grad_norm": 1.2243238687515259, "learning_rate": 3.2446997132251267e-06, "loss": 0.5234585762023926, "memory(GiB)": 76.04, "step": 5030, "token_acc": 0.8347611572101368, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3017648199625056, "grad_norm": 0.8756423592567444, "learning_rate": 3.241297216803959e-06, "loss": 0.5213943004608155, "memory(GiB)": 76.04, "step": 5035, "token_acc": 0.8430144773070433, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3030577283599456, "grad_norm": 1.070388913154602, "learning_rate": 3.2378932142305896e-06, "loss": 0.5314732551574707, "memory(GiB)": 76.04, "step": 5040, "token_acc": 0.8426463389048185, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3043506367573858, "grad_norm": 1.6048212051391602, "learning_rate": 3.2344877124211986e-06, "loss": 0.5154043674468994, "memory(GiB)": 76.04, "step": 5045, "token_acc": 0.836419641239355, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3056435451548258, "grad_norm": 0.8205667734146118, "learning_rate": 3.2310807182950157e-06, "loss": 0.5318900585174561, "memory(GiB)": 76.04, "step": 5050, "token_acc": 0.839140860160196, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.3069364535522658, "grad_norm": 2.0000104904174805, "learning_rate": 3.2276722387742986e-06, "loss": 0.5485349178314209, "memory(GiB)": 76.04, "step": 5055, "token_acc": 0.832, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.3082293619497058, "grad_norm": 1.5011872053146362, "learning_rate": 3.2242622807843256e-06, "loss": 0.5459944725036621, "memory(GiB)": 76.04, "step": 5060, "token_acc": 0.8583078032077852, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.3095222703471459, "grad_norm": 1.206041932106018, "learning_rate": 3.2208508512533777e-06, "loss": 0.5489155769348144, "memory(GiB)": 76.04, "step": 5065, "token_acc": 0.8435306288332225, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.310815178744586, "grad_norm": 0.9339408874511719, "learning_rate": 3.2174379571127255e-06, "loss": 0.5105900764465332, "memory(GiB)": 76.04, "step": 5070, "token_acc": 0.8589012405348799, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.3121080871420259, "grad_norm": 0.9643262624740601, "learning_rate": 3.214023605296618e-06, "loss": 0.5285213947296142, "memory(GiB)": 76.04, "step": 5075, "token_acc": 0.8527266411948593, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.313400995539466, "grad_norm": 0.9289806485176086, "learning_rate": 3.2106078027422617e-06, "loss": 0.546751070022583, "memory(GiB)": 76.04, "step": 5080, "token_acc": 0.8559926386013342, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.314693903936906, "grad_norm": 0.9898268580436707, "learning_rate": 3.2071905563898147e-06, "loss": 0.5333544731140136, "memory(GiB)": 76.04, "step": 5085, "token_acc": 0.8701183055590892, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.3159868123343461, "grad_norm": 1.0481353998184204, "learning_rate": 3.2037718731823654e-06, "loss": 0.5345610618591309, "memory(GiB)": 76.04, "step": 5090, "token_acc": 0.8575830948712304, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.3172797207317861, "grad_norm": 1.0228970050811768, "learning_rate": 3.200351760065924e-06, "loss": 0.5261023998260498, "memory(GiB)": 76.04, "step": 5095, "token_acc": 0.8451571927596062, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.3185726291292261, "grad_norm": 1.2880408763885498, "learning_rate": 3.196930223989404e-06, "loss": 0.5189993858337403, "memory(GiB)": 76.04, "step": 5100, "token_acc": 0.8433385103653184, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3198655375266664, "grad_norm": 1.5179208517074585, "learning_rate": 3.193507271904612e-06, "loss": 0.5425951957702637, "memory(GiB)": 76.04, "step": 5105, "token_acc": 0.8408305921052631, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3211584459241061, "grad_norm": 1.4803640842437744, "learning_rate": 3.1900829107662296e-06, "loss": 0.5434229373931885, "memory(GiB)": 76.04, "step": 5110, "token_acc": 0.8403665573028624, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3224513543215464, "grad_norm": 2.6232545375823975, "learning_rate": 3.186657147531802e-06, "loss": 0.5110975742340088, "memory(GiB)": 76.04, "step": 5115, "token_acc": 0.8574821852731591, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3237442627189864, "grad_norm": 0.9406218528747559, "learning_rate": 3.1832299891617245e-06, "loss": 0.5422788143157959, "memory(GiB)": 76.04, "step": 5120, "token_acc": 0.8556760308854937, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3250371711164264, "grad_norm": 1.6263813972473145, "learning_rate": 3.179801442619225e-06, "loss": 0.5206321716308594, "memory(GiB)": 76.04, "step": 5125, "token_acc": 0.8325710236423371, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3263300795138664, "grad_norm": 1.0195496082305908, "learning_rate": 3.176371514870354e-06, "loss": 0.5497357368469238, "memory(GiB)": 76.04, "step": 5130, "token_acc": 0.8564178043952697, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3276229879113064, "grad_norm": 1.1096144914627075, "learning_rate": 3.172940212883965e-06, "loss": 0.5373088836669921, "memory(GiB)": 76.04, "step": 5135, "token_acc": 0.8881567463780764, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3289158963087466, "grad_norm": 1.392109751701355, "learning_rate": 3.1695075436317073e-06, "loss": 0.5438241004943848, "memory(GiB)": 76.04, "step": 5140, "token_acc": 0.8368131622479545, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.3302088047061866, "grad_norm": 1.4390590190887451, "learning_rate": 3.166073514088006e-06, "loss": 0.5391247272491455, "memory(GiB)": 76.04, "step": 5145, "token_acc": 0.8375243285325029, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.3315017131036266, "grad_norm": 1.352954387664795, "learning_rate": 3.1626381312300516e-06, "loss": 0.5338696479797364, "memory(GiB)": 76.04, "step": 5150, "token_acc": 0.847240778978906, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.3327946215010666, "grad_norm": 3.8461802005767822, "learning_rate": 3.1592014020377815e-06, "loss": 0.5344533920288086, "memory(GiB)": 76.04, "step": 5155, "token_acc": 0.8609710100434191, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.3340875298985067, "grad_norm": 3.0712478160858154, "learning_rate": 3.1557633334938712e-06, "loss": 0.5250087261199952, "memory(GiB)": 76.04, "step": 5160, "token_acc": 0.8473618090452262, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.3353804382959467, "grad_norm": 0.8653470873832703, "learning_rate": 3.1523239325837174e-06, "loss": 0.5317577362060547, "memory(GiB)": 76.04, "step": 5165, "token_acc": 0.8672264497507216, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.3366733466933867, "grad_norm": 1.0058352947235107, "learning_rate": 3.1488832062954213e-06, "loss": 0.5124196529388427, "memory(GiB)": 76.04, "step": 5170, "token_acc": 0.8276955161626695, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.337966255090827, "grad_norm": 1.0088118314743042, "learning_rate": 3.145441161619779e-06, "loss": 0.5366281509399414, "memory(GiB)": 76.04, "step": 5175, "token_acc": 0.8506988094357761, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.339259163488267, "grad_norm": 0.955906331539154, "learning_rate": 3.1419978055502666e-06, "loss": 0.5448675155639648, "memory(GiB)": 76.04, "step": 5180, "token_acc": 0.79640928536363, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.340552071885707, "grad_norm": 1.171993374824524, "learning_rate": 3.138553145083022e-06, "loss": 0.5282750129699707, "memory(GiB)": 76.04, "step": 5185, "token_acc": 0.8676538311665308, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.341844980283147, "grad_norm": 1.1022040843963623, "learning_rate": 3.135107187216834e-06, "loss": 0.534688663482666, "memory(GiB)": 76.04, "step": 5190, "token_acc": 0.8357040716489802, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.343137888680587, "grad_norm": 2.8540091514587402, "learning_rate": 3.1316599389531282e-06, "loss": 0.5261801719665528, "memory(GiB)": 76.04, "step": 5195, "token_acc": 0.8275160272718022, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.3444307970780272, "grad_norm": 1.3484967947006226, "learning_rate": 3.128211407295951e-06, "loss": 0.5323428630828857, "memory(GiB)": 76.04, "step": 5200, "token_acc": 0.8412020736880043, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.345723705475467, "grad_norm": 1.8615549802780151, "learning_rate": 3.1247615992519587e-06, "loss": 0.5560379981994629, "memory(GiB)": 76.04, "step": 5205, "token_acc": 0.8468941382327209, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3470166138729072, "grad_norm": 2.5111136436462402, "learning_rate": 3.1213105218303972e-06, "loss": 0.534544563293457, "memory(GiB)": 76.04, "step": 5210, "token_acc": 0.8330609679446889, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3483095222703472, "grad_norm": 0.9236442446708679, "learning_rate": 3.1178581820430957e-06, "loss": 0.5287897109985351, "memory(GiB)": 76.04, "step": 5215, "token_acc": 0.8456866092341895, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3496024306677872, "grad_norm": 1.2912098169326782, "learning_rate": 3.1144045869044437e-06, "loss": 0.5496071815490723, "memory(GiB)": 76.04, "step": 5220, "token_acc": 0.8341737438075018, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3508953390652272, "grad_norm": 0.8818367123603821, "learning_rate": 3.1109497434313857e-06, "loss": 0.5452832698822021, "memory(GiB)": 76.04, "step": 5225, "token_acc": 0.8284331373254931, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3521882474626672, "grad_norm": 1.3674169778823853, "learning_rate": 3.1074936586433994e-06, "loss": 0.537296199798584, "memory(GiB)": 76.04, "step": 5230, "token_acc": 0.8586094734702175, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.3534811558601074, "grad_norm": 1.324703335762024, "learning_rate": 3.1040363395624854e-06, "loss": 0.49640579223632814, "memory(GiB)": 76.04, "step": 5235, "token_acc": 0.8524991832734401, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3547740642575474, "grad_norm": 0.9162821173667908, "learning_rate": 3.1005777932131535e-06, "loss": 0.5111923217773438, "memory(GiB)": 76.04, "step": 5240, "token_acc": 0.8507638072855465, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.3560669726549874, "grad_norm": 1.0688142776489258, "learning_rate": 3.097118026622405e-06, "loss": 0.5468463897705078, "memory(GiB)": 76.04, "step": 5245, "token_acc": 0.832568012476174, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.3573598810524274, "grad_norm": 1.0131880044937134, "learning_rate": 3.093657046819722e-06, "loss": 0.4972386360168457, "memory(GiB)": 76.04, "step": 5250, "token_acc": 0.8424430280275911, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3586527894498674, "grad_norm": 1.027860164642334, "learning_rate": 3.0901948608370503e-06, "loss": 0.5250637054443359, "memory(GiB)": 76.04, "step": 5255, "token_acc": 0.8416179528424026, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3599456978473075, "grad_norm": 0.9048755764961243, "learning_rate": 3.086731475708788e-06, "loss": 0.5370029449462891, "memory(GiB)": 76.04, "step": 5260, "token_acc": 0.8346947027901335, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.3612386062447475, "grad_norm": 2.5965473651885986, "learning_rate": 3.0832668984717675e-06, "loss": 0.5500319480895997, "memory(GiB)": 76.04, "step": 5265, "token_acc": 0.8267131242740999, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.3625315146421877, "grad_norm": 1.969552993774414, "learning_rate": 3.079801136165246e-06, "loss": 0.5336560726165771, "memory(GiB)": 76.04, "step": 5270, "token_acc": 0.8175245806824755, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.3638244230396277, "grad_norm": 1.069265604019165, "learning_rate": 3.0763341958308853e-06, "loss": 0.5203034400939941, "memory(GiB)": 76.04, "step": 5275, "token_acc": 0.8421536276680172, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.3651173314370677, "grad_norm": 1.2012121677398682, "learning_rate": 3.072866084512743e-06, "loss": 0.5232099533081055, "memory(GiB)": 76.04, "step": 5280, "token_acc": 0.8650010324179228, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.3664102398345077, "grad_norm": 1.19133722782135, "learning_rate": 3.069396809257256e-06, "loss": 0.5404583930969238, "memory(GiB)": 76.04, "step": 5285, "token_acc": 0.8128638853481241, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.3677031482319477, "grad_norm": 1.1554311513900757, "learning_rate": 3.065926377113224e-06, "loss": 0.5264840126037598, "memory(GiB)": 76.04, "step": 5290, "token_acc": 0.8585351063368996, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.3689960566293877, "grad_norm": 1.4480676651000977, "learning_rate": 3.0624547951318e-06, "loss": 0.5401974678039551, "memory(GiB)": 76.04, "step": 5295, "token_acc": 0.8407013111993263, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.3702889650268277, "grad_norm": 0.975131094455719, "learning_rate": 3.0589820703664707e-06, "loss": 0.5349632263183594, "memory(GiB)": 76.04, "step": 5300, "token_acc": 0.8640092475203222, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.371581873424268, "grad_norm": 1.336971640586853, "learning_rate": 3.0555082098730464e-06, "loss": 0.5260316371917725, "memory(GiB)": 76.04, "step": 5305, "token_acc": 0.8318684124147488, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.372874781821708, "grad_norm": 1.4911202192306519, "learning_rate": 3.0520332207096433e-06, "loss": 0.5175662994384765, "memory(GiB)": 76.04, "step": 5310, "token_acc": 0.8419973789441849, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.374167690219148, "grad_norm": 1.1500071287155151, "learning_rate": 3.0485571099366724e-06, "loss": 0.5503662586212158, "memory(GiB)": 76.04, "step": 5315, "token_acc": 0.800382509562739, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.375460598616588, "grad_norm": 0.9023737907409668, "learning_rate": 3.0450798846168227e-06, "loss": 0.5276325225830079, "memory(GiB)": 76.04, "step": 5320, "token_acc": 0.8494656224308771, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.376753507014028, "grad_norm": 1.091489315032959, "learning_rate": 3.0416015518150494e-06, "loss": 0.5327792167663574, "memory(GiB)": 76.04, "step": 5325, "token_acc": 0.8433721260289526, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.3780464154114682, "grad_norm": 1.429714322090149, "learning_rate": 3.0381221185985543e-06, "loss": 0.5325508117675781, "memory(GiB)": 76.04, "step": 5330, "token_acc": 0.8607167276676185, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.379339323808908, "grad_norm": 0.9919387698173523, "learning_rate": 3.034641592036779e-06, "loss": 0.5155058860778808, "memory(GiB)": 76.04, "step": 5335, "token_acc": 0.8486006657625447, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3806322322063482, "grad_norm": 2.1436212062835693, "learning_rate": 3.031159979201383e-06, "loss": 0.5232511043548584, "memory(GiB)": 76.04, "step": 5340, "token_acc": 0.8501789414202298, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.3819251406037882, "grad_norm": 1.056612253189087, "learning_rate": 3.027677287166235e-06, "loss": 0.5240641117095948, "memory(GiB)": 76.04, "step": 5345, "token_acc": 0.8404334212261042, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3832180490012282, "grad_norm": 1.0267983675003052, "learning_rate": 3.0241935230073977e-06, "loss": 0.5429930210113525, "memory(GiB)": 76.04, "step": 5350, "token_acc": 0.8286713286713286, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3845109573986683, "grad_norm": 1.0295692682266235, "learning_rate": 3.020708693803108e-06, "loss": 0.5250686645507813, "memory(GiB)": 76.04, "step": 5355, "token_acc": 0.8244736210071252, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3858038657961083, "grad_norm": 0.999599277973175, "learning_rate": 3.0172228066337704e-06, "loss": 0.5352205276489258, "memory(GiB)": 76.04, "step": 5360, "token_acc": 0.84011528503737, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.3870967741935485, "grad_norm": 1.8036699295043945, "learning_rate": 3.013735868581937e-06, "loss": 0.5204336166381835, "memory(GiB)": 76.04, "step": 5365, "token_acc": 0.8598814043234085, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.3883896825909885, "grad_norm": 1.0070078372955322, "learning_rate": 3.0102478867322967e-06, "loss": 0.5483356952667237, "memory(GiB)": 76.04, "step": 5370, "token_acc": 0.824980503222099, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3896825909884285, "grad_norm": 1.2133702039718628, "learning_rate": 3.0067588681716563e-06, "loss": 0.5264020919799804, "memory(GiB)": 76.04, "step": 5375, "token_acc": 0.8479607640681466, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.3909754993858685, "grad_norm": 1.567028522491455, "learning_rate": 3.0032688199889328e-06, "loss": 0.5459973335266113, "memory(GiB)": 76.04, "step": 5380, "token_acc": 0.8320722155847604, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3922684077833085, "grad_norm": 0.9477059245109558, "learning_rate": 2.9997777492751313e-06, "loss": 0.522393798828125, "memory(GiB)": 76.04, "step": 5385, "token_acc": 0.8648985404058384, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3935613161807485, "grad_norm": 0.9237920045852661, "learning_rate": 2.9962856631233388e-06, "loss": 0.5231618404388427, "memory(GiB)": 76.04, "step": 5390, "token_acc": 0.8635757044267358, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.3948542245781885, "grad_norm": 1.1687318086624146, "learning_rate": 2.9927925686287006e-06, "loss": 0.5056675434112549, "memory(GiB)": 76.04, "step": 5395, "token_acc": 0.8491524700055506, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.3961471329756288, "grad_norm": 1.1914643049240112, "learning_rate": 2.9892984728884155e-06, "loss": 0.5470870018005372, "memory(GiB)": 76.04, "step": 5400, "token_acc": 0.8393457238872505, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.3974400413730688, "grad_norm": 0.9498441219329834, "learning_rate": 2.9858033830017127e-06, "loss": 0.5178772926330566, "memory(GiB)": 76.04, "step": 5405, "token_acc": 0.8383829302646169, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.3987329497705088, "grad_norm": 1.407654881477356, "learning_rate": 2.982307306069842e-06, "loss": 0.5494901180267334, "memory(GiB)": 76.04, "step": 5410, "token_acc": 0.8369012373794883, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.4000258581679488, "grad_norm": 1.148308515548706, "learning_rate": 2.9788102491960606e-06, "loss": 0.5415051460266114, "memory(GiB)": 76.04, "step": 5415, "token_acc": 0.8146666105050335, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.4013187665653888, "grad_norm": 1.0424379110336304, "learning_rate": 2.975312219485616e-06, "loss": 0.5347636699676513, "memory(GiB)": 76.04, "step": 5420, "token_acc": 0.8373481740260795, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.402611674962829, "grad_norm": 1.1721105575561523, "learning_rate": 2.971813224045732e-06, "loss": 0.5278305053710938, "memory(GiB)": 76.04, "step": 5425, "token_acc": 0.8396724598930482, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.4039045833602688, "grad_norm": 1.179748296737671, "learning_rate": 2.9683132699855933e-06, "loss": 0.5224045276641845, "memory(GiB)": 76.04, "step": 5430, "token_acc": 0.8568342151675485, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.405197491757709, "grad_norm": 2.302943706512451, "learning_rate": 2.9648123644163344e-06, "loss": 0.51423659324646, "memory(GiB)": 76.04, "step": 5435, "token_acc": 0.8518541896796591, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.406490400155149, "grad_norm": 1.695887804031372, "learning_rate": 2.961310514451021e-06, "loss": 0.5096250534057617, "memory(GiB)": 76.04, "step": 5440, "token_acc": 0.8589175232620451, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.407783308552589, "grad_norm": 1.0010126829147339, "learning_rate": 2.9578077272046407e-06, "loss": 0.5219532012939453, "memory(GiB)": 76.04, "step": 5445, "token_acc": 0.8385249390550633, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.409076216950029, "grad_norm": 1.0724811553955078, "learning_rate": 2.954304009794082e-06, "loss": 0.5457123279571533, "memory(GiB)": 76.04, "step": 5450, "token_acc": 0.8312751004016065, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.410369125347469, "grad_norm": 1.078969120979309, "learning_rate": 2.9507993693381245e-06, "loss": 0.4943378925323486, "memory(GiB)": 76.04, "step": 5455, "token_acc": 0.8496458467482292, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.4116620337449093, "grad_norm": 0.9450307488441467, "learning_rate": 2.9472938129574248e-06, "loss": 0.5415146827697754, "memory(GiB)": 76.04, "step": 5460, "token_acc": 0.8225661328054705, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.4129549421423493, "grad_norm": 1.5226620435714722, "learning_rate": 2.9437873477744973e-06, "loss": 0.5119266033172607, "memory(GiB)": 76.04, "step": 5465, "token_acc": 0.8384682058151446, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.4142478505397893, "grad_norm": 1.004164457321167, "learning_rate": 2.9402799809137066e-06, "loss": 0.5116465091705322, "memory(GiB)": 76.04, "step": 5470, "token_acc": 0.8437827370559665, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.4155407589372293, "grad_norm": 0.9910258054733276, "learning_rate": 2.936771719501246e-06, "loss": 0.5433405876159668, "memory(GiB)": 76.04, "step": 5475, "token_acc": 0.844466902475998, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.4168336673346693, "grad_norm": 0.9276953935623169, "learning_rate": 2.9332625706651287e-06, "loss": 0.5179524898529053, "memory(GiB)": 76.04, "step": 5480, "token_acc": 0.8560186436098352, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.4181265757321093, "grad_norm": 0.9490028619766235, "learning_rate": 2.929752541535169e-06, "loss": 0.5286359786987305, "memory(GiB)": 76.04, "step": 5485, "token_acc": 0.8138392178714351, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.4194194841295493, "grad_norm": 0.9604682326316833, "learning_rate": 2.9262416392429727e-06, "loss": 0.5103157043457032, "memory(GiB)": 76.04, "step": 5490, "token_acc": 0.8366208149493901, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.4207123925269896, "grad_norm": 1.2919334173202515, "learning_rate": 2.922729870921916e-06, "loss": 0.5384269714355469, "memory(GiB)": 76.04, "step": 5495, "token_acc": 0.8337518834756403, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.4220053009244296, "grad_norm": 8.227481842041016, "learning_rate": 2.919217243707137e-06, "loss": 0.5168218612670898, "memory(GiB)": 76.04, "step": 5500, "token_acc": 0.848703986059682, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4232982093218696, "grad_norm": 1.9908632040023804, "learning_rate": 2.915703764735518e-06, "loss": 0.5363755226135254, "memory(GiB)": 76.04, "step": 5505, "token_acc": 0.8440745986779982, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4245911177193096, "grad_norm": 4.587954521179199, "learning_rate": 2.9121894411456727e-06, "loss": 0.5621316432952881, "memory(GiB)": 76.04, "step": 5510, "token_acc": 0.831285065455517, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4258840261167496, "grad_norm": 1.0272570848464966, "learning_rate": 2.90867428007793e-06, "loss": 0.5223082542419434, "memory(GiB)": 76.04, "step": 5515, "token_acc": 0.8410167818361303, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4271769345141896, "grad_norm": 1.0505872964859009, "learning_rate": 2.90515828867432e-06, "loss": 0.5363224983215332, "memory(GiB)": 76.04, "step": 5520, "token_acc": 0.8203096575979302, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4284698429116296, "grad_norm": 1.0748521089553833, "learning_rate": 2.9016414740785625e-06, "loss": 0.5091330051422119, "memory(GiB)": 76.04, "step": 5525, "token_acc": 0.8322848205813095, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4297627513090698, "grad_norm": 1.0750956535339355, "learning_rate": 2.8981238434360467e-06, "loss": 0.5427698135375977, "memory(GiB)": 76.04, "step": 5530, "token_acc": 0.8349636803874092, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4310556597065098, "grad_norm": 2.610778331756592, "learning_rate": 2.894605403893821e-06, "loss": 0.4974540710449219, "memory(GiB)": 76.04, "step": 5535, "token_acc": 0.8535060294774452, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4323485681039498, "grad_norm": 1.1820318698883057, "learning_rate": 2.8910861626005774e-06, "loss": 0.5238434314727783, "memory(GiB)": 76.04, "step": 5540, "token_acc": 0.8497934516523867, "train_speed(iter/s)": 0.027632 }, { "epoch": 1.4336414765013898, "grad_norm": 3.4977128505706787, "learning_rate": 2.887566126706638e-06, "loss": 0.5260235786437988, "memory(GiB)": 76.04, "step": 5545, "token_acc": 0.8721027400272683, "train_speed(iter/s)": 0.027632 }, { "epoch": 1.4349343848988299, "grad_norm": 1.0233287811279297, "learning_rate": 2.884045303363936e-06, "loss": 0.5392961978912354, "memory(GiB)": 76.04, "step": 5550, "token_acc": 0.8287411925544221, "train_speed(iter/s)": 0.027632 }, { "epoch": 1.43622729329627, "grad_norm": 2.892608642578125, "learning_rate": 2.8805236997260083e-06, "loss": 0.5215497016906738, "memory(GiB)": 76.04, "step": 5555, "token_acc": 0.8498609823911029, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.4375202016937099, "grad_norm": 1.1967157125473022, "learning_rate": 2.877001322947975e-06, "loss": 0.5007841110229492, "memory(GiB)": 76.04, "step": 5560, "token_acc": 0.861061495279408, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.43881311009115, "grad_norm": 0.955826461315155, "learning_rate": 2.8734781801865295e-06, "loss": 0.5322293281555176, "memory(GiB)": 76.04, "step": 5565, "token_acc": 0.8267916342651332, "train_speed(iter/s)": 0.027632 }, { "epoch": 1.44010601848859, "grad_norm": 0.9877298474311829, "learning_rate": 2.8699542785999174e-06, "loss": 0.5368639469146729, "memory(GiB)": 76.04, "step": 5570, "token_acc": 0.8380945422663841, "train_speed(iter/s)": 0.027632 }, { "epoch": 1.44139892688603, "grad_norm": 1.6299902200698853, "learning_rate": 2.866429625347929e-06, "loss": 0.5405995368957519, "memory(GiB)": 76.04, "step": 5575, "token_acc": 0.8694860906491031, "train_speed(iter/s)": 0.027633 }, { "epoch": 1.4426918352834701, "grad_norm": 1.1087404489517212, "learning_rate": 2.8629042275918816e-06, "loss": 0.5386042118072509, "memory(GiB)": 76.04, "step": 5580, "token_acc": 0.8234553974314475, "train_speed(iter/s)": 0.027633 }, { "epoch": 1.4439847436809101, "grad_norm": 1.3870139122009277, "learning_rate": 2.8593780924946035e-06, "loss": 0.5439047813415527, "memory(GiB)": 76.04, "step": 5585, "token_acc": 0.8323816375162987, "train_speed(iter/s)": 0.027633 }, { "epoch": 1.4452776520783503, "grad_norm": 1.3395311832427979, "learning_rate": 2.8558512272204236e-06, "loss": 0.5457947254180908, "memory(GiB)": 76.04, "step": 5590, "token_acc": 0.8384256861729674, "train_speed(iter/s)": 0.027634 }, { "epoch": 1.4465705604757904, "grad_norm": 1.6546262502670288, "learning_rate": 2.852323638935153e-06, "loss": 0.5411076545715332, "memory(GiB)": 76.04, "step": 5595, "token_acc": 0.7911689027752251, "train_speed(iter/s)": 0.027635 }, { "epoch": 1.4478634688732304, "grad_norm": 1.460876226425171, "learning_rate": 2.8487953348060717e-06, "loss": 0.5316921710968018, "memory(GiB)": 76.04, "step": 5600, "token_acc": 0.841174282138871, "train_speed(iter/s)": 0.027635 }, { "epoch": 1.4491563772706704, "grad_norm": 1.1629871129989624, "learning_rate": 2.845266322001914e-06, "loss": 0.5173054695129394, "memory(GiB)": 76.04, "step": 5605, "token_acc": 0.8505799971707455, "train_speed(iter/s)": 0.027635 }, { "epoch": 1.4504492856681104, "grad_norm": 1.3548821210861206, "learning_rate": 2.841736607692855e-06, "loss": 0.5308181762695312, "memory(GiB)": 76.04, "step": 5610, "token_acc": 0.8199830736297108, "train_speed(iter/s)": 0.027636 }, { "epoch": 1.4517421940655504, "grad_norm": 1.4260450601577759, "learning_rate": 2.8382061990504937e-06, "loss": 0.5264840126037598, "memory(GiB)": 76.04, "step": 5615, "token_acc": 0.8558112625353561, "train_speed(iter/s)": 0.027636 }, { "epoch": 1.4530351024629904, "grad_norm": 1.11582612991333, "learning_rate": 2.8346751032478416e-06, "loss": 0.5299251556396485, "memory(GiB)": 76.04, "step": 5620, "token_acc": 0.8517095224639729, "train_speed(iter/s)": 0.027635 }, { "epoch": 1.4543280108604306, "grad_norm": 1.011853814125061, "learning_rate": 2.831143327459304e-06, "loss": 0.5147687911987304, "memory(GiB)": 76.04, "step": 5625, "token_acc": 0.8489732511286956, "train_speed(iter/s)": 0.027635 }, { "epoch": 1.4556209192578706, "grad_norm": 1.401329755783081, "learning_rate": 2.8276108788606716e-06, "loss": 0.5251947402954101, "memory(GiB)": 76.04, "step": 5630, "token_acc": 0.8668347467338987, "train_speed(iter/s)": 0.027636 }, { "epoch": 1.4569138276553106, "grad_norm": 1.0562440156936646, "learning_rate": 2.8240777646290973e-06, "loss": 0.5131159782409668, "memory(GiB)": 76.04, "step": 5635, "token_acc": 0.8574029383123757, "train_speed(iter/s)": 0.027636 }, { "epoch": 1.4582067360527506, "grad_norm": 1.0854851007461548, "learning_rate": 2.82054399194309e-06, "loss": 0.5298294067382813, "memory(GiB)": 76.04, "step": 5640, "token_acc": 0.8360400339911246, "train_speed(iter/s)": 0.027637 }, { "epoch": 1.4594996444501906, "grad_norm": 1.0677274465560913, "learning_rate": 2.817009567982495e-06, "loss": 0.5486864566802978, "memory(GiB)": 76.04, "step": 5645, "token_acc": 0.8450833930215901, "train_speed(iter/s)": 0.027637 }, { "epoch": 1.4607925528476309, "grad_norm": 1.0018072128295898, "learning_rate": 2.81347449992848e-06, "loss": 0.5392383098602295, "memory(GiB)": 76.04, "step": 5650, "token_acc": 0.8489272284892723, "train_speed(iter/s)": 0.027638 }, { "epoch": 1.4620854612450707, "grad_norm": 1.0016893148422241, "learning_rate": 2.8099387949635244e-06, "loss": 0.5180238723754883, "memory(GiB)": 76.04, "step": 5655, "token_acc": 0.8308455244235061, "train_speed(iter/s)": 0.027638 }, { "epoch": 1.463378369642511, "grad_norm": 0.9236847162246704, "learning_rate": 2.8064024602713978e-06, "loss": 0.5212345600128174, "memory(GiB)": 76.04, "step": 5660, "token_acc": 0.8309616213885296, "train_speed(iter/s)": 0.027638 }, { "epoch": 1.464671278039951, "grad_norm": 1.1901302337646484, "learning_rate": 2.802865503037153e-06, "loss": 0.5204244613647461, "memory(GiB)": 76.04, "step": 5665, "token_acc": 0.8307556954991665, "train_speed(iter/s)": 0.027638 }, { "epoch": 1.465964186437391, "grad_norm": 1.3907729387283325, "learning_rate": 2.799327930447105e-06, "loss": 0.5336479187011719, "memory(GiB)": 76.04, "step": 5670, "token_acc": 0.8514618825974964, "train_speed(iter/s)": 0.027638 }, { "epoch": 1.467257094834831, "grad_norm": 1.4721412658691406, "learning_rate": 2.79578974968882e-06, "loss": 0.5241554737091064, "memory(GiB)": 76.04, "step": 5675, "token_acc": 0.8389457435252415, "train_speed(iter/s)": 0.027639 }, { "epoch": 1.468550003232271, "grad_norm": 1.0565476417541504, "learning_rate": 2.792250967951099e-06, "loss": 0.5248475074768066, "memory(GiB)": 76.04, "step": 5680, "token_acc": 0.8303552659239016, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.4698429116297111, "grad_norm": 1.4000303745269775, "learning_rate": 2.788711592423966e-06, "loss": 0.5044834613800049, "memory(GiB)": 76.04, "step": 5685, "token_acc": 0.8474695172874561, "train_speed(iter/s)": 0.027639 }, { "epoch": 1.4711358200271512, "grad_norm": 1.4237065315246582, "learning_rate": 2.785171630298649e-06, "loss": 0.527522611618042, "memory(GiB)": 76.04, "step": 5690, "token_acc": 0.8151931330472103, "train_speed(iter/s)": 0.027638 }, { "epoch": 1.4724287284245912, "grad_norm": 1.0766621828079224, "learning_rate": 2.7816310887675697e-06, "loss": 0.5117476940155029, "memory(GiB)": 76.04, "step": 5695, "token_acc": 0.8338809784592918, "train_speed(iter/s)": 0.027639 }, { "epoch": 1.4737216368220312, "grad_norm": 1.4094700813293457, "learning_rate": 2.7780899750243275e-06, "loss": 0.5268692970275879, "memory(GiB)": 76.04, "step": 5700, "token_acc": 0.8330804888327012, "train_speed(iter/s)": 0.027639 }, { "epoch": 1.4750145452194712, "grad_norm": 1.1132041215896606, "learning_rate": 2.7745482962636815e-06, "loss": 0.4945709228515625, "memory(GiB)": 76.04, "step": 5705, "token_acc": 0.860883552163992, "train_speed(iter/s)": 0.027639 }, { "epoch": 1.4763074536169112, "grad_norm": 1.0947760343551636, "learning_rate": 2.7710060596815425e-06, "loss": 0.5298891067504883, "memory(GiB)": 76.04, "step": 5710, "token_acc": 0.8435150568998808, "train_speed(iter/s)": 0.027639 }, { "epoch": 1.4776003620143512, "grad_norm": 0.9912996292114258, "learning_rate": 2.767463272474951e-06, "loss": 0.48708510398864746, "memory(GiB)": 76.04, "step": 5715, "token_acc": 0.8264887063655031, "train_speed(iter/s)": 0.027639 }, { "epoch": 1.4788932704117914, "grad_norm": 0.8707374930381775, "learning_rate": 2.763919941842069e-06, "loss": 0.5079801559448243, "memory(GiB)": 76.04, "step": 5720, "token_acc": 0.8312231452305929, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.4801861788092314, "grad_norm": 1.5692007541656494, "learning_rate": 2.760376074982161e-06, "loss": 0.5193423748016357, "memory(GiB)": 76.04, "step": 5725, "token_acc": 0.8429825267734923, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.4814790872066714, "grad_norm": 1.1025525331497192, "learning_rate": 2.756831679095583e-06, "loss": 0.5138895034790039, "memory(GiB)": 76.04, "step": 5730, "token_acc": 0.8342220895013012, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.4827719956041114, "grad_norm": 1.114014983177185, "learning_rate": 2.7532867613837632e-06, "loss": 0.5035554885864257, "memory(GiB)": 76.04, "step": 5735, "token_acc": 0.8443557981664217, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.4840649040015514, "grad_norm": 0.9390487670898438, "learning_rate": 2.7497413290491927e-06, "loss": 0.5343178749084473, "memory(GiB)": 76.04, "step": 5740, "token_acc": 0.8542599136238712, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.4853578123989915, "grad_norm": 0.9856504797935486, "learning_rate": 2.746195389295406e-06, "loss": 0.5330347537994384, "memory(GiB)": 76.04, "step": 5745, "token_acc": 0.8207178164624973, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.4866507207964315, "grad_norm": 1.0896226167678833, "learning_rate": 2.7426489493269693e-06, "loss": 0.538813591003418, "memory(GiB)": 76.04, "step": 5750, "token_acc": 0.8037091060637633, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.4879436291938717, "grad_norm": 1.0768758058547974, "learning_rate": 2.739102016349465e-06, "loss": 0.5243756294250488, "memory(GiB)": 76.04, "step": 5755, "token_acc": 0.8149051903817803, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.4892365375913117, "grad_norm": 1.063176155090332, "learning_rate": 2.7355545975694777e-06, "loss": 0.5046000480651855, "memory(GiB)": 76.04, "step": 5760, "token_acc": 0.8376587897828166, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.4905294459887517, "grad_norm": 1.0583608150482178, "learning_rate": 2.73200670019458e-06, "loss": 0.5038406372070312, "memory(GiB)": 76.04, "step": 5765, "token_acc": 0.835580538569638, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.4918223543861917, "grad_norm": 1.7861460447311401, "learning_rate": 2.7284583314333136e-06, "loss": 0.5076050758361816, "memory(GiB)": 76.04, "step": 5770, "token_acc": 0.8467831009250311, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.4931152627836317, "grad_norm": 6.293168067932129, "learning_rate": 2.7249094984951817e-06, "loss": 0.5296279430389405, "memory(GiB)": 76.04, "step": 5775, "token_acc": 0.8256261520112762, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.494408171181072, "grad_norm": 1.0335216522216797, "learning_rate": 2.7213602085906284e-06, "loss": 0.5116629600524902, "memory(GiB)": 76.04, "step": 5780, "token_acc": 0.8385129247749056, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.4957010795785117, "grad_norm": 1.0121421813964844, "learning_rate": 2.7178104689310268e-06, "loss": 0.49023923873901365, "memory(GiB)": 76.04, "step": 5785, "token_acc": 0.8403039150163565, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.496993987975952, "grad_norm": 2.905419111251831, "learning_rate": 2.714260286728663e-06, "loss": 0.5063573837280273, "memory(GiB)": 76.04, "step": 5790, "token_acc": 0.8344622697563874, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.498286896373392, "grad_norm": 1.3976821899414062, "learning_rate": 2.7107096691967242e-06, "loss": 0.5138403892517089, "memory(GiB)": 76.04, "step": 5795, "token_acc": 0.8758511480601742, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.499579804770832, "grad_norm": 1.0694046020507812, "learning_rate": 2.70715862354928e-06, "loss": 0.5170317649841308, "memory(GiB)": 76.04, "step": 5800, "token_acc": 0.8628498120412913, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.500872713168272, "grad_norm": 1.158471941947937, "learning_rate": 2.703607157001273e-06, "loss": 0.5261846542358398, "memory(GiB)": 76.04, "step": 5805, "token_acc": 0.8195343894257913, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.502165621565712, "grad_norm": 0.9272586107254028, "learning_rate": 2.7000552767684962e-06, "loss": 0.5037094116210937, "memory(GiB)": 76.04, "step": 5810, "token_acc": 0.8422422339722406, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.5034585299631522, "grad_norm": 1.1119998693466187, "learning_rate": 2.696502990067586e-06, "loss": 0.5135734558105469, "memory(GiB)": 76.04, "step": 5815, "token_acc": 0.8626723760072827, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.504751438360592, "grad_norm": 1.0358091592788696, "learning_rate": 2.6929503041160054e-06, "loss": 0.5373703956604003, "memory(GiB)": 76.04, "step": 5820, "token_acc": 0.8301167050647732, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.5060443467580322, "grad_norm": 1.039354920387268, "learning_rate": 2.6893972261320265e-06, "loss": 0.5479695320129394, "memory(GiB)": 76.04, "step": 5825, "token_acc": 0.8479183638468465, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5073372551554722, "grad_norm": 1.3886367082595825, "learning_rate": 2.6858437633347197e-06, "loss": 0.49077515602111815, "memory(GiB)": 76.04, "step": 5830, "token_acc": 0.8483184202406665, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.5086301635529122, "grad_norm": 1.0695741176605225, "learning_rate": 2.6822899229439354e-06, "loss": 0.5208306789398194, "memory(GiB)": 76.04, "step": 5835, "token_acc": 0.8321777497636307, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5099230719503525, "grad_norm": 1.3519343137741089, "learning_rate": 2.678735712180294e-06, "loss": 0.5065782070159912, "memory(GiB)": 76.04, "step": 5840, "token_acc": 0.865064039408867, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5112159803477923, "grad_norm": 1.1784507036209106, "learning_rate": 2.6751811382651656e-06, "loss": 0.5237961769104004, "memory(GiB)": 76.04, "step": 5845, "token_acc": 0.8759957417128593, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5125088887452325, "grad_norm": 1.2745177745819092, "learning_rate": 2.6716262084206596e-06, "loss": 0.5225517272949218, "memory(GiB)": 76.04, "step": 5850, "token_acc": 0.8607184154574956, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5138017971426725, "grad_norm": 1.0925981998443604, "learning_rate": 2.6680709298696075e-06, "loss": 0.5313197135925293, "memory(GiB)": 76.04, "step": 5855, "token_acc": 0.8273417489937798, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5150947055401125, "grad_norm": 1.1206554174423218, "learning_rate": 2.66451530983555e-06, "loss": 0.5206215858459473, "memory(GiB)": 76.04, "step": 5860, "token_acc": 0.8360609797107947, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5163876139375525, "grad_norm": 1.0352177619934082, "learning_rate": 2.6609593555427233e-06, "loss": 0.5028391361236573, "memory(GiB)": 76.04, "step": 5865, "token_acc": 0.8328511593764844, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5176805223349925, "grad_norm": 1.2911217212677002, "learning_rate": 2.6574030742160397e-06, "loss": 0.510726022720337, "memory(GiB)": 76.04, "step": 5870, "token_acc": 0.8490013110202822, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5189734307324327, "grad_norm": 1.0364927053451538, "learning_rate": 2.6538464730810774e-06, "loss": 0.5217413902282715, "memory(GiB)": 76.04, "step": 5875, "token_acc": 0.8393885789449812, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5202663391298725, "grad_norm": 1.238673210144043, "learning_rate": 2.6502895593640643e-06, "loss": 0.5099982738494873, "memory(GiB)": 76.04, "step": 5880, "token_acc": 0.8526344031928095, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5215592475273128, "grad_norm": 0.8736827969551086, "learning_rate": 2.646732340291864e-06, "loss": 0.5140372753143311, "memory(GiB)": 76.04, "step": 5885, "token_acc": 0.8392264114084782, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.5228521559247528, "grad_norm": 1.5109645128250122, "learning_rate": 2.6431748230919583e-06, "loss": 0.5010466575622559, "memory(GiB)": 76.04, "step": 5890, "token_acc": 0.8540109197816044, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.5241450643221928, "grad_norm": 1.3737353086471558, "learning_rate": 2.639617014992438e-06, "loss": 0.5450526237487793, "memory(GiB)": 76.04, "step": 5895, "token_acc": 0.8286816981515336, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.5254379727196328, "grad_norm": 1.299264669418335, "learning_rate": 2.6360589232219826e-06, "loss": 0.5287326812744141, "memory(GiB)": 76.04, "step": 5900, "token_acc": 0.8438716156839771, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.5267308811170728, "grad_norm": 1.595326542854309, "learning_rate": 2.632500555009849e-06, "loss": 0.5352768898010254, "memory(GiB)": 76.04, "step": 5905, "token_acc": 0.8498021897138651, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.528023789514513, "grad_norm": 0.9794163107872009, "learning_rate": 2.6289419175858557e-06, "loss": 0.5425346374511719, "memory(GiB)": 76.04, "step": 5910, "token_acc": 0.8340483277884784, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.5293166979119528, "grad_norm": 1.0976084470748901, "learning_rate": 2.625383018180367e-06, "loss": 0.521512794494629, "memory(GiB)": 76.04, "step": 5915, "token_acc": 0.8385939188146319, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.530609606309393, "grad_norm": 0.8977828621864319, "learning_rate": 2.6218238640242804e-06, "loss": 0.5215116500854492, "memory(GiB)": 76.04, "step": 5920, "token_acc": 0.867619533775736, "train_speed(iter/s)": 0.027639 }, { "epoch": 1.531902514706833, "grad_norm": 0.9642647504806519, "learning_rate": 2.6182644623490123e-06, "loss": 0.5066309928894043, "memory(GiB)": 76.04, "step": 5925, "token_acc": 0.8547594142259414, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.533195423104273, "grad_norm": 1.0187339782714844, "learning_rate": 2.6147048203864785e-06, "loss": 0.5130214691162109, "memory(GiB)": 76.04, "step": 5930, "token_acc": 0.8448008040935673, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.5344883315017133, "grad_norm": 1.0723613500595093, "learning_rate": 2.6111449453690867e-06, "loss": 0.5088356971740723, "memory(GiB)": 76.04, "step": 5935, "token_acc": 0.8498759764540372, "train_speed(iter/s)": 0.02764 }, { "epoch": 1.535781239899153, "grad_norm": 0.9338003993034363, "learning_rate": 2.607584844529717e-06, "loss": 0.5098363399505615, "memory(GiB)": 76.04, "step": 5940, "token_acc": 0.8526082509376065, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.5370741482965933, "grad_norm": 0.8586558103561401, "learning_rate": 2.604024525101707e-06, "loss": 0.5505722045898438, "memory(GiB)": 76.04, "step": 5945, "token_acc": 0.8450296382094433, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.538367056694033, "grad_norm": 1.0932189226150513, "learning_rate": 2.6004639943188397e-06, "loss": 0.51469407081604, "memory(GiB)": 76.04, "step": 5950, "token_acc": 0.840957878166293, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.5396599650914733, "grad_norm": 1.4429826736450195, "learning_rate": 2.5969032594153267e-06, "loss": 0.5273025512695313, "memory(GiB)": 76.04, "step": 5955, "token_acc": 0.8560200279459711, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5409528734889133, "grad_norm": 1.6431050300598145, "learning_rate": 2.5933423276257957e-06, "loss": 0.5339940071105957, "memory(GiB)": 76.04, "step": 5960, "token_acc": 0.8337239801328199, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.5422457818863533, "grad_norm": 1.0177254676818848, "learning_rate": 2.5897812061852728e-06, "loss": 0.523937177658081, "memory(GiB)": 76.04, "step": 5965, "token_acc": 0.8334849487264308, "train_speed(iter/s)": 0.027641 }, { "epoch": 1.5435386902837935, "grad_norm": 1.0822439193725586, "learning_rate": 2.58621990232917e-06, "loss": 0.5013058662414551, "memory(GiB)": 76.04, "step": 5970, "token_acc": 0.8428224266620379, "train_speed(iter/s)": 0.027642 }, { "epoch": 1.5448315986812333, "grad_norm": 1.0421521663665771, "learning_rate": 2.5826584232932707e-06, "loss": 0.5094140052795411, "memory(GiB)": 76.04, "step": 5975, "token_acc": 0.828722488626583, "train_speed(iter/s)": 0.027643 }, { "epoch": 1.5461245070786735, "grad_norm": 1.2964197397232056, "learning_rate": 2.5790967763137136e-06, "loss": 0.5127614498138428, "memory(GiB)": 76.04, "step": 5980, "token_acc": 0.842072213500785, "train_speed(iter/s)": 0.027643 }, { "epoch": 1.5474174154761136, "grad_norm": 0.8814839124679565, "learning_rate": 2.575534968626978e-06, "loss": 0.5174202919006348, "memory(GiB)": 76.04, "step": 5985, "token_acc": 0.8393448656606551, "train_speed(iter/s)": 0.027644 }, { "epoch": 1.5487103238735536, "grad_norm": 1.192781686782837, "learning_rate": 2.5719730074698718e-06, "loss": 0.5092106342315674, "memory(GiB)": 76.04, "step": 5990, "token_acc": 0.8536305586357206, "train_speed(iter/s)": 0.027644 }, { "epoch": 1.5500032322709936, "grad_norm": 1.0610737800598145, "learning_rate": 2.5684109000795114e-06, "loss": 0.4976038932800293, "memory(GiB)": 76.04, "step": 5995, "token_acc": 0.8407422307150759, "train_speed(iter/s)": 0.027645 }, { "epoch": 1.5512961406684336, "grad_norm": 0.9621560573577881, "learning_rate": 2.564848653693313e-06, "loss": 0.5234485626220703, "memory(GiB)": 76.04, "step": 6000, "token_acc": 0.8350293049512783, "train_speed(iter/s)": 0.027645 }, { "epoch": 1.5525890490658738, "grad_norm": 0.9867120385169983, "learning_rate": 2.5612862755489754e-06, "loss": 0.5299267292022705, "memory(GiB)": 76.04, "step": 6005, "token_acc": 0.8546105977748444, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.5538819574633136, "grad_norm": 1.0978645086288452, "learning_rate": 2.5577237728844624e-06, "loss": 0.5120854854583741, "memory(GiB)": 76.04, "step": 6010, "token_acc": 0.826845756196704, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5551748658607538, "grad_norm": 1.5117709636688232, "learning_rate": 2.554161152937994e-06, "loss": 0.49729576110839846, "memory(GiB)": 76.04, "step": 6015, "token_acc": 0.8334134348774447, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5564677742581938, "grad_norm": 0.8881126642227173, "learning_rate": 2.5505984229480257e-06, "loss": 0.5338102340698242, "memory(GiB)": 76.04, "step": 6020, "token_acc": 0.8419421487603306, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5577606826556338, "grad_norm": 1.0090521574020386, "learning_rate": 2.547035590153239e-06, "loss": 0.5258452892303467, "memory(GiB)": 76.04, "step": 6025, "token_acc": 0.8578943772631004, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5590535910530738, "grad_norm": 0.9812294244766235, "learning_rate": 2.5434726617925214e-06, "loss": 0.5136911392211914, "memory(GiB)": 76.04, "step": 6030, "token_acc": 0.827103274559194, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5603464994505138, "grad_norm": 0.9446991682052612, "learning_rate": 2.5399096451049586e-06, "loss": 0.5100172996520996, "memory(GiB)": 76.04, "step": 6035, "token_acc": 0.8508031581813231, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.561639407847954, "grad_norm": 1.0973703861236572, "learning_rate": 2.536346547329812e-06, "loss": 0.5151572704315186, "memory(GiB)": 76.04, "step": 6040, "token_acc": 0.8401378579003181, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5629323162453939, "grad_norm": 1.1219217777252197, "learning_rate": 2.5327833757065102e-06, "loss": 0.5503729343414306, "memory(GiB)": 76.04, "step": 6045, "token_acc": 0.8555493103895543, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.564225224642834, "grad_norm": 0.9930498600006104, "learning_rate": 2.5292201374746306e-06, "loss": 0.5092242240905762, "memory(GiB)": 76.04, "step": 6050, "token_acc": 0.8612712103502479, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.565518133040274, "grad_norm": 1.1532442569732666, "learning_rate": 2.525656839873885e-06, "loss": 0.509462833404541, "memory(GiB)": 76.04, "step": 6055, "token_acc": 0.8444057905958927, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.566811041437714, "grad_norm": 0.9126574397087097, "learning_rate": 2.522093490144109e-06, "loss": 0.5357399940490722, "memory(GiB)": 76.04, "step": 6060, "token_acc": 0.8304055410560128, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5681039498351543, "grad_norm": 4.396268844604492, "learning_rate": 2.5185300955252406e-06, "loss": 0.5380908489227295, "memory(GiB)": 76.04, "step": 6065, "token_acc": 0.8541631222566266, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5693968582325941, "grad_norm": 1.2452019453048706, "learning_rate": 2.514966663257311e-06, "loss": 0.5378365516662598, "memory(GiB)": 76.04, "step": 6070, "token_acc": 0.8556809966075302, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5706897666300343, "grad_norm": 0.9279251098632812, "learning_rate": 2.511403200580428e-06, "loss": 0.5115952014923095, "memory(GiB)": 76.04, "step": 6075, "token_acc": 0.8463237893248498, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5719826750274744, "grad_norm": 2.242185592651367, "learning_rate": 2.50783971473476e-06, "loss": 0.5192525386810303, "memory(GiB)": 76.04, "step": 6080, "token_acc": 0.8700274811911519, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5732755834249144, "grad_norm": 2.6423373222351074, "learning_rate": 2.5042762129605235e-06, "loss": 0.5067386150360107, "memory(GiB)": 76.04, "step": 6085, "token_acc": 0.8241304899720742, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5745684918223544, "grad_norm": 0.9732599854469299, "learning_rate": 2.500712702497967e-06, "loss": 0.4948467254638672, "memory(GiB)": 76.04, "step": 6090, "token_acc": 0.8798655462184874, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5758614002197944, "grad_norm": 1.5327101945877075, "learning_rate": 2.497149190587356e-06, "loss": 0.5227193355560302, "memory(GiB)": 76.04, "step": 6095, "token_acc": 0.8608474068152293, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5771543086172346, "grad_norm": 1.2196455001831055, "learning_rate": 2.4935856844689605e-06, "loss": 0.519383716583252, "memory(GiB)": 76.04, "step": 6100, "token_acc": 0.8335468679663424, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5784472170146744, "grad_norm": 1.0245038270950317, "learning_rate": 2.4900221913830368e-06, "loss": 0.5222830772399902, "memory(GiB)": 76.04, "step": 6105, "token_acc": 0.8626334519572953, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5797401254121146, "grad_norm": 1.114092469215393, "learning_rate": 2.486458718569817e-06, "loss": 0.5028997898101807, "memory(GiB)": 76.04, "step": 6110, "token_acc": 0.8522080471050049, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5810330338095546, "grad_norm": 1.0639675855636597, "learning_rate": 2.4828952732694887e-06, "loss": 0.5147637367248535, "memory(GiB)": 76.04, "step": 6115, "token_acc": 0.863227909435292, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5823259422069946, "grad_norm": 0.8807838559150696, "learning_rate": 2.479331862722188e-06, "loss": 0.5280374526977539, "memory(GiB)": 76.04, "step": 6120, "token_acc": 0.8349608197709464, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5836188506044346, "grad_norm": 1.1169236898422241, "learning_rate": 2.4757684941679767e-06, "loss": 0.5291852474212646, "memory(GiB)": 76.04, "step": 6125, "token_acc": 0.83143130614048, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.5849117590018746, "grad_norm": 1.1640690565109253, "learning_rate": 2.4722051748468336e-06, "loss": 0.54544095993042, "memory(GiB)": 76.04, "step": 6130, "token_acc": 0.8492234388601274, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5862046673993149, "grad_norm": 1.0770084857940674, "learning_rate": 2.4686419119986337e-06, "loss": 0.5241689205169677, "memory(GiB)": 76.04, "step": 6135, "token_acc": 0.8025563166443048, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.5874975757967547, "grad_norm": 0.8977876901626587, "learning_rate": 2.4650787128631433e-06, "loss": 0.47954139709472654, "memory(GiB)": 76.04, "step": 6140, "token_acc": 0.8597583511016347, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5887904841941949, "grad_norm": 5.924169540405273, "learning_rate": 2.461515584679995e-06, "loss": 0.5163521766662598, "memory(GiB)": 76.04, "step": 6145, "token_acc": 0.8557573765102326, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.590083392591635, "grad_norm": 1.4680997133255005, "learning_rate": 2.457952534688678e-06, "loss": 0.5192079544067383, "memory(GiB)": 76.04, "step": 6150, "token_acc": 0.8392393432144142, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.591376300989075, "grad_norm": 1.3616418838500977, "learning_rate": 2.4543895701285214e-06, "loss": 0.521982479095459, "memory(GiB)": 76.04, "step": 6155, "token_acc": 0.8226835625056169, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5926692093865151, "grad_norm": 0.9679275751113892, "learning_rate": 2.450826698238685e-06, "loss": 0.5229485034942627, "memory(GiB)": 76.04, "step": 6160, "token_acc": 0.8353462704120866, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.593962117783955, "grad_norm": 1.0366556644439697, "learning_rate": 2.447263926258136e-06, "loss": 0.518170976638794, "memory(GiB)": 76.04, "step": 6165, "token_acc": 0.8254823304680038, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.5952550261813951, "grad_norm": 0.9990222454071045, "learning_rate": 2.4437012614256394e-06, "loss": 0.5325229167938232, "memory(GiB)": 76.04, "step": 6170, "token_acc": 0.8040033620770165, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.596547934578835, "grad_norm": 1.1222083568572998, "learning_rate": 2.4401387109797446e-06, "loss": 0.5065582275390625, "memory(GiB)": 76.04, "step": 6175, "token_acc": 0.8548062202884538, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5978408429762752, "grad_norm": 1.0990639925003052, "learning_rate": 2.4365762821587656e-06, "loss": 0.5230794906616211, "memory(GiB)": 76.04, "step": 6180, "token_acc": 0.8195025958800871, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.5991337513737152, "grad_norm": 1.0618468523025513, "learning_rate": 2.4330139822007726e-06, "loss": 0.5022711753845215, "memory(GiB)": 76.04, "step": 6185, "token_acc": 0.8767551452202347, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.6004266597711552, "grad_norm": 1.059678077697754, "learning_rate": 2.4294518183435715e-06, "loss": 0.5181986808776855, "memory(GiB)": 76.04, "step": 6190, "token_acc": 0.8563752841496177, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.6017195681685954, "grad_norm": 0.8919258117675781, "learning_rate": 2.4258897978246925e-06, "loss": 0.49803409576416013, "memory(GiB)": 76.04, "step": 6195, "token_acc": 0.8634100953710165, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.6030124765660352, "grad_norm": 1.0404894351959229, "learning_rate": 2.4223279278813736e-06, "loss": 0.5113819122314454, "memory(GiB)": 76.04, "step": 6200, "token_acc": 0.828187138284458, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.6043053849634754, "grad_norm": 1.3208954334259033, "learning_rate": 2.418766215750549e-06, "loss": 0.5281610012054443, "memory(GiB)": 76.04, "step": 6205, "token_acc": 0.835780681665095, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.6055982933609154, "grad_norm": 1.3880317211151123, "learning_rate": 2.4152046686688305e-06, "loss": 0.5289054870605469, "memory(GiB)": 76.04, "step": 6210, "token_acc": 0.8374174516442627, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.6068912017583554, "grad_norm": 1.5898443460464478, "learning_rate": 2.4116432938724953e-06, "loss": 0.5337974548339843, "memory(GiB)": 76.04, "step": 6215, "token_acc": 0.8592785422089996, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.6081841101557954, "grad_norm": 1.666271686553955, "learning_rate": 2.4080820985974707e-06, "loss": 0.5134634017944336, "memory(GiB)": 76.04, "step": 6220, "token_acc": 0.8779267140307283, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.6094770185532354, "grad_norm": 0.9248247742652893, "learning_rate": 2.4045210900793167e-06, "loss": 0.5277139186859131, "memory(GiB)": 76.04, "step": 6225, "token_acc": 0.8437402643642352, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.6107699269506757, "grad_norm": 1.5387322902679443, "learning_rate": 2.4009602755532188e-06, "loss": 0.5056108474731446, "memory(GiB)": 76.04, "step": 6230, "token_acc": 0.8501114918148692, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.6120628353481155, "grad_norm": 1.0052211284637451, "learning_rate": 2.3973996622539646e-06, "loss": 0.5336996078491211, "memory(GiB)": 76.04, "step": 6235, "token_acc": 0.8721888153938665, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.6133557437455557, "grad_norm": 1.027434229850769, "learning_rate": 2.393839257415933e-06, "loss": 0.49329376220703125, "memory(GiB)": 76.04, "step": 6240, "token_acc": 0.8676368108218897, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.6146486521429957, "grad_norm": 1.1870218515396118, "learning_rate": 2.3902790682730806e-06, "loss": 0.5256915092468262, "memory(GiB)": 76.04, "step": 6245, "token_acc": 0.8572780020181635, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.6159415605404357, "grad_norm": 1.0639426708221436, "learning_rate": 2.3867191020589264e-06, "loss": 0.5284603118896485, "memory(GiB)": 76.04, "step": 6250, "token_acc": 0.8446619622126109, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.6172344689378757, "grad_norm": 1.0155668258666992, "learning_rate": 2.3831593660065345e-06, "loss": 0.5121121406555176, "memory(GiB)": 76.04, "step": 6255, "token_acc": 0.8595757910736493, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.6185273773353157, "grad_norm": 1.0661635398864746, "learning_rate": 2.3795998673485025e-06, "loss": 0.5248492240905762, "memory(GiB)": 76.04, "step": 6260, "token_acc": 0.8348161428909712, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.619820285732756, "grad_norm": 1.0434828996658325, "learning_rate": 2.376040613316944e-06, "loss": 0.5192477226257324, "memory(GiB)": 76.04, "step": 6265, "token_acc": 0.8534787948847626, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.6211131941301957, "grad_norm": 0.9453509449958801, "learning_rate": 2.37248161114348e-06, "loss": 0.5361949920654296, "memory(GiB)": 76.04, "step": 6270, "token_acc": 0.8283499021225963, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.622406102527636, "grad_norm": 1.1677709817886353, "learning_rate": 2.3689228680592138e-06, "loss": 0.52266845703125, "memory(GiB)": 76.04, "step": 6275, "token_acc": 0.8238867321306235, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.623699010925076, "grad_norm": 1.974316954612732, "learning_rate": 2.3653643912947276e-06, "loss": 0.5168787479400635, "memory(GiB)": 76.04, "step": 6280, "token_acc": 0.8214940319191738, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.624991919322516, "grad_norm": 1.1270877122879028, "learning_rate": 2.3618061880800586e-06, "loss": 0.48665618896484375, "memory(GiB)": 76.04, "step": 6285, "token_acc": 0.8658167398627041, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.6262848277199562, "grad_norm": 1.4701924324035645, "learning_rate": 2.3582482656446897e-06, "loss": 0.5326834678649902, "memory(GiB)": 76.04, "step": 6290, "token_acc": 0.841709722874589, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.627577736117396, "grad_norm": 0.9950865507125854, "learning_rate": 2.3546906312175347e-06, "loss": 0.597511100769043, "memory(GiB)": 76.04, "step": 6295, "token_acc": 0.7920300141959035, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6288706445148362, "grad_norm": 1.3875091075897217, "learning_rate": 2.35113329202692e-06, "loss": 0.5079882621765137, "memory(GiB)": 76.04, "step": 6300, "token_acc": 0.8363686840644087, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6301635529122762, "grad_norm": 1.17184317111969, "learning_rate": 2.3475762553005727e-06, "loss": 0.5145916938781738, "memory(GiB)": 76.04, "step": 6305, "token_acc": 0.841187863137508, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6314564613097162, "grad_norm": 1.8713252544403076, "learning_rate": 2.344019528265607e-06, "loss": 0.5273695468902588, "memory(GiB)": 76.04, "step": 6310, "token_acc": 0.8691662296801258, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6327493697071562, "grad_norm": 1.0383869409561157, "learning_rate": 2.3404631181485053e-06, "loss": 0.5135766983032226, "memory(GiB)": 76.04, "step": 6315, "token_acc": 0.8497678608551036, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6340422781045962, "grad_norm": 0.8791532516479492, "learning_rate": 2.3369070321751085e-06, "loss": 0.5190924167633056, "memory(GiB)": 76.04, "step": 6320, "token_acc": 0.8457278865303347, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6353351865020365, "grad_norm": 1.2614482641220093, "learning_rate": 2.3333512775705975e-06, "loss": 0.5101301193237304, "memory(GiB)": 76.04, "step": 6325, "token_acc": 0.8530480522450639, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6366280948994762, "grad_norm": 1.5320963859558105, "learning_rate": 2.3297958615594786e-06, "loss": 0.4884361743927002, "memory(GiB)": 76.04, "step": 6330, "token_acc": 0.8361637380975754, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6379210032969165, "grad_norm": 1.157752275466919, "learning_rate": 2.326240791365575e-06, "loss": 0.4981177806854248, "memory(GiB)": 76.04, "step": 6335, "token_acc": 0.8416252072968491, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6392139116943565, "grad_norm": 1.2564195394515991, "learning_rate": 2.3226860742120017e-06, "loss": 0.538153886795044, "memory(GiB)": 76.04, "step": 6340, "token_acc": 0.8415269756303705, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6405068200917965, "grad_norm": 0.9583210945129395, "learning_rate": 2.319131717321159e-06, "loss": 0.4883336067199707, "memory(GiB)": 76.04, "step": 6345, "token_acc": 0.8325664381632079, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6417997284892365, "grad_norm": 1.2164779901504517, "learning_rate": 2.3155777279147156e-06, "loss": 0.5153134346008301, "memory(GiB)": 76.04, "step": 6350, "token_acc": 0.8470916505687915, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6430926368866765, "grad_norm": 1.1382580995559692, "learning_rate": 2.312024113213592e-06, "loss": 0.5252164840698242, "memory(GiB)": 76.04, "step": 6355, "token_acc": 0.8316279498525073, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6443855452841167, "grad_norm": 0.8894354104995728, "learning_rate": 2.3084708804379497e-06, "loss": 0.5195868015289307, "memory(GiB)": 76.04, "step": 6360, "token_acc": 0.8409683261916332, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6456784536815565, "grad_norm": 1.683902382850647, "learning_rate": 2.3049180368071724e-06, "loss": 0.5006110191345214, "memory(GiB)": 76.04, "step": 6365, "token_acc": 0.8291754756871036, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6469713620789967, "grad_norm": 1.7893975973129272, "learning_rate": 2.301365589539853e-06, "loss": 0.49852724075317384, "memory(GiB)": 76.04, "step": 6370, "token_acc": 0.8365970585845454, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.6482642704764368, "grad_norm": 0.9394116997718811, "learning_rate": 2.2978135458537793e-06, "loss": 0.5331932067871094, "memory(GiB)": 76.04, "step": 6375, "token_acc": 0.8245893719806763, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6495571788738768, "grad_norm": 1.047746181488037, "learning_rate": 2.2942619129659205e-06, "loss": 0.5376855850219726, "memory(GiB)": 76.04, "step": 6380, "token_acc": 0.8324474924989285, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.650850087271317, "grad_norm": 0.8962509632110596, "learning_rate": 2.2907106980924104e-06, "loss": 0.4863112449645996, "memory(GiB)": 76.04, "step": 6385, "token_acc": 0.8503153721391241, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6521429956687568, "grad_norm": 1.389729380607605, "learning_rate": 2.2871599084485325e-06, "loss": 0.5152921676635742, "memory(GiB)": 76.04, "step": 6390, "token_acc": 0.8414910086935811, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.653435904066197, "grad_norm": 0.9997110366821289, "learning_rate": 2.2836095512487063e-06, "loss": 0.5211985588073731, "memory(GiB)": 76.04, "step": 6395, "token_acc": 0.857375318849503, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6547288124636368, "grad_norm": 1.0818804502487183, "learning_rate": 2.280059633706475e-06, "loss": 0.5084996223449707, "memory(GiB)": 76.04, "step": 6400, "token_acc": 0.8613592233009709, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.656021720861077, "grad_norm": 0.9981624484062195, "learning_rate": 2.276510163034486e-06, "loss": 0.5429449081420898, "memory(GiB)": 76.04, "step": 6405, "token_acc": 0.8164522088613749, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.657314629258517, "grad_norm": 0.9858782887458801, "learning_rate": 2.2729611464444797e-06, "loss": 0.5149686813354493, "memory(GiB)": 76.04, "step": 6410, "token_acc": 0.8206152336907014, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.658607537655957, "grad_norm": 0.9807013273239136, "learning_rate": 2.2694125911472743e-06, "loss": 0.5264925479888916, "memory(GiB)": 76.04, "step": 6415, "token_acc": 0.8599959754502465, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6599004460533973, "grad_norm": 1.4319406747817993, "learning_rate": 2.265864504352749e-06, "loss": 0.5101997375488281, "memory(GiB)": 76.04, "step": 6420, "token_acc": 0.8523102555710927, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.661193354450837, "grad_norm": 1.1590079069137573, "learning_rate": 2.2623168932698347e-06, "loss": 0.4951170444488525, "memory(GiB)": 76.04, "step": 6425, "token_acc": 0.852101487651052, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6624862628482773, "grad_norm": 0.9497671127319336, "learning_rate": 2.258769765106492e-06, "loss": 0.5196887969970703, "memory(GiB)": 76.04, "step": 6430, "token_acc": 0.8118867658795361, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6637791712457173, "grad_norm": 1.0885018110275269, "learning_rate": 2.255223127069702e-06, "loss": 0.5309447765350341, "memory(GiB)": 76.04, "step": 6435, "token_acc": 0.8558985773734636, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6650720796431573, "grad_norm": 1.0342893600463867, "learning_rate": 2.251676986365449e-06, "loss": 0.49361910820007326, "memory(GiB)": 76.04, "step": 6440, "token_acc": 0.8603108210435222, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.6663649880405973, "grad_norm": 1.072338342666626, "learning_rate": 2.2481313501987103e-06, "loss": 0.5142477035522461, "memory(GiB)": 76.04, "step": 6445, "token_acc": 0.8642217409120178, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6676578964380373, "grad_norm": 1.2066073417663574, "learning_rate": 2.2445862257734317e-06, "loss": 0.5130002975463868, "memory(GiB)": 76.04, "step": 6450, "token_acc": 0.8323050805349675, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6689508048354775, "grad_norm": 1.054640769958496, "learning_rate": 2.2410416202925262e-06, "loss": 0.5043740749359131, "memory(GiB)": 76.04, "step": 6455, "token_acc": 0.8686844613918017, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.6702437132329173, "grad_norm": 2.2036914825439453, "learning_rate": 2.237497540957848e-06, "loss": 0.5211320877075195, "memory(GiB)": 76.04, "step": 6460, "token_acc": 0.8593575418994414, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.6715366216303575, "grad_norm": 1.2252447605133057, "learning_rate": 2.2339539949701817e-06, "loss": 0.5284463882446289, "memory(GiB)": 76.04, "step": 6465, "token_acc": 0.8268600408188509, "train_speed(iter/s)": 0.027631 }, { "epoch": 1.6728295300277976, "grad_norm": 1.165474772453308, "learning_rate": 2.230410989529233e-06, "loss": 0.5352771759033204, "memory(GiB)": 76.04, "step": 6470, "token_acc": 0.8386907812843231, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6741224384252376, "grad_norm": 0.8712031841278076, "learning_rate": 2.226868531833605e-06, "loss": 0.5065167903900146, "memory(GiB)": 76.04, "step": 6475, "token_acc": 0.8473345970687503, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6754153468226776, "grad_norm": 1.0779868364334106, "learning_rate": 2.2233266290807886e-06, "loss": 0.5394890785217286, "memory(GiB)": 76.04, "step": 6480, "token_acc": 0.8533062727144003, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6767082552201176, "grad_norm": 1.0535589456558228, "learning_rate": 2.2197852884671487e-06, "loss": 0.5131864547729492, "memory(GiB)": 76.04, "step": 6485, "token_acc": 0.8391084472747705, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6780011636175578, "grad_norm": 0.8883035778999329, "learning_rate": 2.2162445171879067e-06, "loss": 0.5062174320220947, "memory(GiB)": 76.04, "step": 6490, "token_acc": 0.853890824622532, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6792940720149976, "grad_norm": 0.983790934085846, "learning_rate": 2.212704322437129e-06, "loss": 0.500247859954834, "memory(GiB)": 76.04, "step": 6495, "token_acc": 0.860179981923213, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.6805869804124378, "grad_norm": 3.652728319168091, "learning_rate": 2.2091647114077083e-06, "loss": 0.5243520736694336, "memory(GiB)": 76.04, "step": 6500, "token_acc": 0.8531156542628818, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6818798888098778, "grad_norm": 1.112502098083496, "learning_rate": 2.2056256912913508e-06, "loss": 0.5279044151306153, "memory(GiB)": 76.04, "step": 6505, "token_acc": 0.8433751743375174, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6831727972073178, "grad_norm": 1.060985803604126, "learning_rate": 2.2020872692785666e-06, "loss": 0.5015209197998047, "memory(GiB)": 76.04, "step": 6510, "token_acc": 0.8586926542245105, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.684465705604758, "grad_norm": 1.1359707117080688, "learning_rate": 2.1985494525586458e-06, "loss": 0.4859332084655762, "memory(GiB)": 76.04, "step": 6515, "token_acc": 0.844207331995497, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6857586140021978, "grad_norm": 1.0586234331130981, "learning_rate": 2.1950122483196513e-06, "loss": 0.5136495590209961, "memory(GiB)": 76.04, "step": 6520, "token_acc": 0.8458574181117534, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.687051522399638, "grad_norm": 2.7810006141662598, "learning_rate": 2.191475663748401e-06, "loss": 0.5169890403747559, "memory(GiB)": 76.04, "step": 6525, "token_acc": 0.8014549325762953, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.688344430797078, "grad_norm": 1.60906982421875, "learning_rate": 2.1879397060304518e-06, "loss": 0.5097242832183838, "memory(GiB)": 76.04, "step": 6530, "token_acc": 0.8751048951048951, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.689637339194518, "grad_norm": 1.4863407611846924, "learning_rate": 2.1844043823500912e-06, "loss": 0.5065485954284668, "memory(GiB)": 76.04, "step": 6535, "token_acc": 0.8393135069196147, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.690930247591958, "grad_norm": 1.1122289896011353, "learning_rate": 2.1808696998903147e-06, "loss": 0.4878704071044922, "memory(GiB)": 76.04, "step": 6540, "token_acc": 0.8160763559053693, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.692223155989398, "grad_norm": 2.09844970703125, "learning_rate": 2.177335665832816e-06, "loss": 0.5010098457336426, "memory(GiB)": 76.04, "step": 6545, "token_acc": 0.844059639520619, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6935160643868383, "grad_norm": 1.2505282163619995, "learning_rate": 2.1738022873579724e-06, "loss": 0.5115324020385742, "memory(GiB)": 76.04, "step": 6550, "token_acc": 0.8518385971190838, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6948089727842781, "grad_norm": 35.658302307128906, "learning_rate": 2.1702695716448276e-06, "loss": 0.5169626235961914, "memory(GiB)": 76.04, "step": 6555, "token_acc": 0.8498464176012572, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6961018811817183, "grad_norm": 1.1642698049545288, "learning_rate": 2.166737525871081e-06, "loss": 0.5165857315063477, "memory(GiB)": 76.04, "step": 6560, "token_acc": 0.8605155555555556, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.6973947895791583, "grad_norm": 1.4594037532806396, "learning_rate": 2.1632061572130687e-06, "loss": 0.48950824737548826, "memory(GiB)": 76.04, "step": 6565, "token_acc": 0.832944099378882, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6986876979765984, "grad_norm": 1.3144365549087524, "learning_rate": 2.1596754728457508e-06, "loss": 0.5155162811279297, "memory(GiB)": 76.04, "step": 6570, "token_acc": 0.8293847917462743, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.6999806063740384, "grad_norm": 1.3902095556259155, "learning_rate": 2.1561454799426997e-06, "loss": 0.5293027877807617, "memory(GiB)": 76.04, "step": 6575, "token_acc": 0.8462088378535365, "train_speed(iter/s)": 0.02763 }, { "epoch": 1.7012735147714784, "grad_norm": 1.1059478521347046, "learning_rate": 2.1526161856760806e-06, "loss": 0.5223227500915527, "memory(GiB)": 76.04, "step": 6580, "token_acc": 0.8502826247235193, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.7025664231689186, "grad_norm": 1.1676130294799805, "learning_rate": 2.1490875972166394e-06, "loss": 0.5052920341491699, "memory(GiB)": 76.04, "step": 6585, "token_acc": 0.8628404326533489, "train_speed(iter/s)": 0.027629 }, { "epoch": 1.7038593315663584, "grad_norm": 1.3875575065612793, "learning_rate": 2.1455597217336895e-06, "loss": 0.5150994777679443, "memory(GiB)": 76.04, "step": 6590, "token_acc": 0.8165382212039158, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.7051522399637986, "grad_norm": 1.5124651193618774, "learning_rate": 2.1420325663950923e-06, "loss": 0.4880176544189453, "memory(GiB)": 76.04, "step": 6595, "token_acc": 0.8497203061189471, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.7064451483612386, "grad_norm": 1.1382073163986206, "learning_rate": 2.138506138367252e-06, "loss": 0.496349573135376, "memory(GiB)": 76.04, "step": 6600, "token_acc": 0.8560916156924068, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.7077380567586786, "grad_norm": 1.3284364938735962, "learning_rate": 2.134980444815089e-06, "loss": 0.5333932876586914, "memory(GiB)": 76.04, "step": 6605, "token_acc": 0.8525549959102846, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.7090309651561189, "grad_norm": 0.9929280281066895, "learning_rate": 2.1314554929020335e-06, "loss": 0.49078850746154784, "memory(GiB)": 76.04, "step": 6610, "token_acc": 0.8561151079136691, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.7103238735535586, "grad_norm": 1.0720187425613403, "learning_rate": 2.1279312897900097e-06, "loss": 0.5510223388671875, "memory(GiB)": 76.04, "step": 6615, "token_acc": 0.8111014442317731, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.7116167819509989, "grad_norm": 0.9900431036949158, "learning_rate": 2.124407842639421e-06, "loss": 0.535820198059082, "memory(GiB)": 76.04, "step": 6620, "token_acc": 0.8021880945909214, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.7129096903484387, "grad_norm": 1.2437286376953125, "learning_rate": 2.120885158609132e-06, "loss": 0.5138998985290527, "memory(GiB)": 76.04, "step": 6625, "token_acc": 0.8522361238259926, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.7142025987458789, "grad_norm": 1.634427547454834, "learning_rate": 2.1173632448564603e-06, "loss": 0.4958186149597168, "memory(GiB)": 76.04, "step": 6630, "token_acc": 0.8392738961898459, "train_speed(iter/s)": 0.027627 }, { "epoch": 1.7154955071433189, "grad_norm": 0.9783524870872498, "learning_rate": 2.113842108537155e-06, "loss": 0.51722412109375, "memory(GiB)": 76.04, "step": 6635, "token_acc": 0.8480223559759243, "train_speed(iter/s)": 0.027628 }, { "epoch": 1.716788415540759, "grad_norm": 0.919309139251709, "learning_rate": 2.110321756805388e-06, "loss": 0.4969566822052002, "memory(GiB)": 76.04, "step": 6640, "token_acc": 0.8367924528301887, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.7180813239381991, "grad_norm": 1.3210502862930298, "learning_rate": 2.1068021968137367e-06, "loss": 0.509549903869629, "memory(GiB)": 76.04, "step": 6645, "token_acc": 0.8161617605030008, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.719374232335639, "grad_norm": 1.611215591430664, "learning_rate": 2.103283435713169e-06, "loss": 0.49874000549316405, "memory(GiB)": 76.04, "step": 6650, "token_acc": 0.8636546184738956, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.7206671407330791, "grad_norm": 1.5470666885375977, "learning_rate": 2.0997654806530314e-06, "loss": 0.5100409984588623, "memory(GiB)": 76.04, "step": 6655, "token_acc": 0.8550865800865801, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.7219600491305191, "grad_norm": 1.1041886806488037, "learning_rate": 2.0962483387810293e-06, "loss": 0.5100605964660645, "memory(GiB)": 76.04, "step": 6660, "token_acc": 0.8544809228039042, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.7232529575279592, "grad_norm": 1.3654582500457764, "learning_rate": 2.092732017243221e-06, "loss": 0.5010916709899902, "memory(GiB)": 76.04, "step": 6665, "token_acc": 0.8365678065576336, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.7245458659253992, "grad_norm": 1.7149609327316284, "learning_rate": 2.0892165231839935e-06, "loss": 0.5101409912109375, "memory(GiB)": 76.04, "step": 6670, "token_acc": 0.866217041193058, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.7258387743228392, "grad_norm": 1.0402779579162598, "learning_rate": 2.085701863746054e-06, "loss": 0.5074934005737305, "memory(GiB)": 76.04, "step": 6675, "token_acc": 0.8241552132337115, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.7271316827202794, "grad_norm": 1.1275187730789185, "learning_rate": 2.082188046070414e-06, "loss": 0.48826584815979, "memory(GiB)": 76.04, "step": 6680, "token_acc": 0.8426791277258567, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.7284245911177192, "grad_norm": 0.9774495363235474, "learning_rate": 2.0786750772963758e-06, "loss": 0.49518957138061526, "memory(GiB)": 76.04, "step": 6685, "token_acc": 0.8610470275066548, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.7297174995151594, "grad_norm": 0.9467169642448425, "learning_rate": 2.0751629645615155e-06, "loss": 0.5169444561004639, "memory(GiB)": 76.04, "step": 6690, "token_acc": 0.8478371242891958, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.7310104079125994, "grad_norm": 1.2938302755355835, "learning_rate": 2.071651715001671e-06, "loss": 0.549882173538208, "memory(GiB)": 76.04, "step": 6695, "token_acc": 0.8068968578022369, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.7323033163100394, "grad_norm": 2.241086721420288, "learning_rate": 2.068141335750925e-06, "loss": 0.49896945953369143, "memory(GiB)": 76.04, "step": 6700, "token_acc": 0.845947499520981, "train_speed(iter/s)": 0.027626 }, { "epoch": 1.7335962247074794, "grad_norm": 2.3480138778686523, "learning_rate": 2.0646318339415917e-06, "loss": 0.5186596393585206, "memory(GiB)": 76.04, "step": 6705, "token_acc": 0.8365800865800865, "train_speed(iter/s)": 0.027625 }, { "epoch": 1.7348891331049194, "grad_norm": 1.1465567350387573, "learning_rate": 2.0611232167042062e-06, "loss": 0.504915428161621, "memory(GiB)": 76.04, "step": 6710, "token_acc": 0.8566171520890364, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.7361820415023597, "grad_norm": 0.9098281860351562, "learning_rate": 2.0576154911675024e-06, "loss": 0.49738254547119143, "memory(GiB)": 76.04, "step": 6715, "token_acc": 0.8353113246970331, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.7374749498997994, "grad_norm": 2.8491294384002686, "learning_rate": 2.0541086644584033e-06, "loss": 0.48783140182495116, "memory(GiB)": 76.04, "step": 6720, "token_acc": 0.8488560619708161, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.7387678582972397, "grad_norm": 1.6546626091003418, "learning_rate": 2.0506027437020067e-06, "loss": 0.5130843162536621, "memory(GiB)": 76.04, "step": 6725, "token_acc": 0.8279507603186097, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.7400607666946797, "grad_norm": 1.783795952796936, "learning_rate": 2.047097736021569e-06, "loss": 0.5069493293762207, "memory(GiB)": 76.04, "step": 6730, "token_acc": 0.8449707155589509, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.7413536750921197, "grad_norm": 1.0760575532913208, "learning_rate": 2.043593648538492e-06, "loss": 0.5043985366821289, "memory(GiB)": 76.04, "step": 6735, "token_acc": 0.8538088715625329, "train_speed(iter/s)": 0.027624 }, { "epoch": 1.74264658348956, "grad_norm": 0.9393293261528015, "learning_rate": 2.0400904883723074e-06, "loss": 0.5335483551025391, "memory(GiB)": 76.04, "step": 6740, "token_acc": 0.8334933205343572, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.7439394918869997, "grad_norm": 1.1046473979949951, "learning_rate": 2.036588262640661e-06, "loss": 0.5038503170013428, "memory(GiB)": 76.04, "step": 6745, "token_acc": 0.8518955250280055, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.74523240028444, "grad_norm": 3.058380603790283, "learning_rate": 2.0330869784593054e-06, "loss": 0.5195840835571289, "memory(GiB)": 76.04, "step": 6750, "token_acc": 0.8470919324577861, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.74652530868188, "grad_norm": 0.9729762673377991, "learning_rate": 2.029586642942074e-06, "loss": 0.5047917366027832, "memory(GiB)": 76.04, "step": 6755, "token_acc": 0.8124232148999951, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.74781821707932, "grad_norm": 1.6101481914520264, "learning_rate": 2.026087263200876e-06, "loss": 0.5221758365631104, "memory(GiB)": 76.04, "step": 6760, "token_acc": 0.8239374739860333, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.74911112547676, "grad_norm": 1.5293364524841309, "learning_rate": 2.0225888463456787e-06, "loss": 0.5044497489929199, "memory(GiB)": 76.04, "step": 6765, "token_acc": 0.8550766191978082, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.7504040338742, "grad_norm": 0.9731065034866333, "learning_rate": 2.019091399484491e-06, "loss": 0.499710750579834, "memory(GiB)": 76.04, "step": 6770, "token_acc": 0.8571820068120425, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.7516969422716402, "grad_norm": 1.806395173072815, "learning_rate": 2.0155949297233542e-06, "loss": 0.5355013847351074, "memory(GiB)": 76.04, "step": 6775, "token_acc": 0.813873528994754, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.75298985066908, "grad_norm": 1.0448716878890991, "learning_rate": 2.012099444166322e-06, "loss": 0.5205565929412842, "memory(GiB)": 76.04, "step": 6780, "token_acc": 0.8436368468258978, "train_speed(iter/s)": 0.027623 }, { "epoch": 1.7542827590665202, "grad_norm": 1.2543259859085083, "learning_rate": 2.008604949915448e-06, "loss": 0.5098013877868652, "memory(GiB)": 76.04, "step": 6785, "token_acc": 0.8451558833389206, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.7555756674639602, "grad_norm": 0.9221981167793274, "learning_rate": 2.005111454070773e-06, "loss": 0.5172914505004883, "memory(GiB)": 76.04, "step": 6790, "token_acc": 0.8517071704916801, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.7568685758614002, "grad_norm": 4.873886585235596, "learning_rate": 2.0016189637303087e-06, "loss": 0.5167638778686523, "memory(GiB)": 76.04, "step": 6795, "token_acc": 0.842072213500785, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.7581614842588402, "grad_norm": 0.9154864549636841, "learning_rate": 1.9981274859900253e-06, "loss": 0.49820308685302733, "memory(GiB)": 76.04, "step": 6800, "token_acc": 0.8464762230585016, "train_speed(iter/s)": 0.027622 }, { "epoch": 1.7594543926562802, "grad_norm": 1.0603070259094238, "learning_rate": 1.9946370279438337e-06, "loss": 0.5082100868225098, "memory(GiB)": 76.04, "step": 6805, "token_acc": 0.8356841646066598, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.7607473010537205, "grad_norm": 1.0266915559768677, "learning_rate": 1.9911475966835735e-06, "loss": 0.5149668216705322, "memory(GiB)": 76.04, "step": 6810, "token_acc": 0.8340411379451494, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.7620402094511602, "grad_norm": 1.4958763122558594, "learning_rate": 1.987659199298997e-06, "loss": 0.515727186203003, "memory(GiB)": 76.04, "step": 6815, "token_acc": 0.8732497922127741, "train_speed(iter/s)": 0.027621 }, { "epoch": 1.7633331178486005, "grad_norm": 1.098486304283142, "learning_rate": 1.984171842877759e-06, "loss": 0.49164752960205077, "memory(GiB)": 76.04, "step": 6820, "token_acc": 0.850276862943795, "train_speed(iter/s)": 0.02762 }, { "epoch": 1.7646260262460405, "grad_norm": 0.9649628400802612, "learning_rate": 1.9806855345053964e-06, "loss": 0.4989636898040771, "memory(GiB)": 76.04, "step": 6825, "token_acc": 0.8244157210490719, "train_speed(iter/s)": 0.02762 }, { "epoch": 1.7659189346434805, "grad_norm": 1.1349451541900635, "learning_rate": 1.977200281265319e-06, "loss": 0.5093589782714844, "memory(GiB)": 76.04, "step": 6830, "token_acc": 0.8507517284266745, "train_speed(iter/s)": 0.027619 }, { "epoch": 1.7672118430409207, "grad_norm": 0.9703465700149536, "learning_rate": 1.9737160902387896e-06, "loss": 0.5363808631896972, "memory(GiB)": 76.04, "step": 6835, "token_acc": 0.8378486587481649, "train_speed(iter/s)": 0.027619 }, { "epoch": 1.7685047514383605, "grad_norm": 1.3537312746047974, "learning_rate": 1.9702329685049167e-06, "loss": 0.4682920455932617, "memory(GiB)": 76.04, "step": 6840, "token_acc": 0.8536826495304004, "train_speed(iter/s)": 0.027618 }, { "epoch": 1.7697976598358007, "grad_norm": 1.2528867721557617, "learning_rate": 1.9667509231406332e-06, "loss": 0.5215599060058593, "memory(GiB)": 76.04, "step": 6845, "token_acc": 0.8464350200378737, "train_speed(iter/s)": 0.027618 }, { "epoch": 1.7710905682332405, "grad_norm": 1.063242793083191, "learning_rate": 1.963269961220687e-06, "loss": 0.5151140689849854, "memory(GiB)": 76.04, "step": 6850, "token_acc": 0.8156099097207642, "train_speed(iter/s)": 0.027617 }, { "epoch": 1.7723834766306807, "grad_norm": 1.098589539527893, "learning_rate": 1.9597900898176212e-06, "loss": 0.5092347145080567, "memory(GiB)": 76.04, "step": 6855, "token_acc": 0.8469693605683837, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7736763850281207, "grad_norm": 1.204325556755066, "learning_rate": 1.9563113160017692e-06, "loss": 0.5028075218200684, "memory(GiB)": 76.04, "step": 6860, "token_acc": 0.8575784400511459, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7749692934255608, "grad_norm": 1.1051974296569824, "learning_rate": 1.952833646841229e-06, "loss": 0.5096775531768799, "memory(GiB)": 76.04, "step": 6865, "token_acc": 0.844632645043, "train_speed(iter/s)": 0.027615 }, { "epoch": 1.776262201823001, "grad_norm": 1.019508957862854, "learning_rate": 1.949357089401858e-06, "loss": 0.5253026962280274, "memory(GiB)": 76.04, "step": 6870, "token_acc": 0.8215040397762585, "train_speed(iter/s)": 0.027615 }, { "epoch": 1.7775551102204408, "grad_norm": 1.104564905166626, "learning_rate": 1.9458816507472508e-06, "loss": 0.5215746879577636, "memory(GiB)": 76.04, "step": 6875, "token_acc": 0.8430459464254035, "train_speed(iter/s)": 0.027615 }, { "epoch": 1.778848018617881, "grad_norm": 1.5065202713012695, "learning_rate": 1.942407337938731e-06, "loss": 0.5007893562316894, "memory(GiB)": 76.04, "step": 6880, "token_acc": 0.877871314353399, "train_speed(iter/s)": 0.027615 }, { "epoch": 1.780140927015321, "grad_norm": 1.023169994354248, "learning_rate": 1.9389341580353376e-06, "loss": 0.5197202682495117, "memory(GiB)": 76.04, "step": 6885, "token_acc": 0.8612209229744913, "train_speed(iter/s)": 0.027615 }, { "epoch": 1.781433835412761, "grad_norm": 1.0600008964538574, "learning_rate": 1.9354621180938025e-06, "loss": 0.5054890155792237, "memory(GiB)": 76.04, "step": 6890, "token_acc": 0.8306423761008461, "train_speed(iter/s)": 0.027615 }, { "epoch": 1.782726743810201, "grad_norm": 1.1297129392623901, "learning_rate": 1.931991225168544e-06, "loss": 0.5017886161804199, "memory(GiB)": 76.04, "step": 6895, "token_acc": 0.8641059027777778, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.784019652207641, "grad_norm": 1.0830426216125488, "learning_rate": 1.92852148631165e-06, "loss": 0.5002258777618408, "memory(GiB)": 76.04, "step": 6900, "token_acc": 0.8582578976537965, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7853125606050813, "grad_norm": 1.4555113315582275, "learning_rate": 1.9250529085728656e-06, "loss": 0.5128755569458008, "memory(GiB)": 76.04, "step": 6905, "token_acc": 0.861249647125247, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.786605469002521, "grad_norm": 1.1025890111923218, "learning_rate": 1.9215854989995726e-06, "loss": 0.5137574195861816, "memory(GiB)": 76.04, "step": 6910, "token_acc": 0.8645792423863758, "train_speed(iter/s)": 0.027617 }, { "epoch": 1.7878983773999613, "grad_norm": 0.8277395367622375, "learning_rate": 1.9181192646367815e-06, "loss": 0.4998950958251953, "memory(GiB)": 76.04, "step": 6915, "token_acc": 0.86408374778284, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7891912857974013, "grad_norm": 4.725937843322754, "learning_rate": 1.914654212527114e-06, "loss": 0.48327035903930665, "memory(GiB)": 76.04, "step": 6920, "token_acc": 0.8672415229525952, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7904841941948413, "grad_norm": 1.076759934425354, "learning_rate": 1.9111903497107924e-06, "loss": 0.5146621704101563, "memory(GiB)": 76.04, "step": 6925, "token_acc": 0.8483241482097674, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7917771025922813, "grad_norm": 1.0654821395874023, "learning_rate": 1.90772768322562e-06, "loss": 0.5301095962524414, "memory(GiB)": 76.04, "step": 6930, "token_acc": 0.8365392073218025, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7930700109897213, "grad_norm": 0.9456325769424438, "learning_rate": 1.9042662201069705e-06, "loss": 0.4947515487670898, "memory(GiB)": 76.04, "step": 6935, "token_acc": 0.853165902597834, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7943629193871615, "grad_norm": 0.9174202680587769, "learning_rate": 1.9008059673877728e-06, "loss": 0.5024736404418946, "memory(GiB)": 76.04, "step": 6940, "token_acc": 0.826625231817954, "train_speed(iter/s)": 0.027615 }, { "epoch": 1.7956558277846013, "grad_norm": 1.0330584049224854, "learning_rate": 1.8973469320984939e-06, "loss": 0.5240283012390137, "memory(GiB)": 76.04, "step": 6945, "token_acc": 0.8250224483687518, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7969487361820415, "grad_norm": 1.1314250230789185, "learning_rate": 1.893889121267132e-06, "loss": 0.5122389793395996, "memory(GiB)": 76.04, "step": 6950, "token_acc": 0.8462841506319767, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7982416445794815, "grad_norm": 0.9879010319709778, "learning_rate": 1.8904325419191941e-06, "loss": 0.5107357025146484, "memory(GiB)": 76.04, "step": 6955, "token_acc": 0.8432230939274413, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.7995345529769216, "grad_norm": 0.9637373089790344, "learning_rate": 1.886977201077685e-06, "loss": 0.5289700508117676, "memory(GiB)": 76.04, "step": 6960, "token_acc": 0.8506177606177606, "train_speed(iter/s)": 0.027616 }, { "epoch": 1.8008274613743618, "grad_norm": 0.8645473122596741, "learning_rate": 1.8835231057630955e-06, "loss": 0.5153064727783203, "memory(GiB)": 76.04, "step": 6965, "token_acc": 0.8533177661023545, "train_speed(iter/s)": 0.027617 }, { "epoch": 1.8021203697718016, "grad_norm": 0.9590107202529907, "learning_rate": 1.8800702629933828e-06, "loss": 0.4972332000732422, "memory(GiB)": 76.04, "step": 6970, "token_acc": 0.8284552126624812, "train_speed(iter/s)": 0.027618 }, { "epoch": 1.8034132781692418, "grad_norm": 0.996835470199585, "learning_rate": 1.8766186797839625e-06, "loss": 0.48930206298828127, "memory(GiB)": 76.04, "step": 6975, "token_acc": 0.8765837634913186, "train_speed(iter/s)": 0.027617 }, { "epoch": 1.8047061865666818, "grad_norm": 1.4224867820739746, "learning_rate": 1.8731683631476885e-06, "loss": 0.5020298480987548, "memory(GiB)": 76.04, "step": 6980, "token_acc": 0.8626479614204297, "train_speed(iter/s)": 0.027617 }, { "epoch": 1.8059990949641218, "grad_norm": 1.8809725046157837, "learning_rate": 1.8697193200948415e-06, "loss": 0.5353089332580566, "memory(GiB)": 76.04, "step": 6985, "token_acc": 0.8270812946250589, "train_speed(iter/s)": 0.027618 }, { "epoch": 1.8072920033615618, "grad_norm": 0.9029675126075745, "learning_rate": 1.866271557633115e-06, "loss": 0.4966177463531494, "memory(GiB)": 76.04, "step": 6990, "token_acc": 0.8467085471597947, "train_speed(iter/s)": 0.027617 }, { "epoch": 1.8085849117590018, "grad_norm": 1.1452031135559082, "learning_rate": 1.862825082767602e-06, "loss": 0.5193626403808593, "memory(GiB)": 76.04, "step": 6995, "token_acc": 0.8338509316770186, "train_speed(iter/s)": 0.027617 }, { "epoch": 1.809877820156442, "grad_norm": 1.0218499898910522, "learning_rate": 1.8593799025007772e-06, "loss": 0.4930767059326172, "memory(GiB)": 76.04, "step": 7000, "token_acc": 0.8444614310877729, "train_speed(iter/s)": 0.027616 } ], "logging_steps": 5, "max_steps": 11601, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.806726131996636e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }