Simple-VL-8B / trainer_state.json
Gábor Stefanik
mirror files from https://www.modelscope.cn/models/swift/Simple-VL-8B
71d367a
raw
history blame
409 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.809877820156442,
"eval_steps": -11601,
"global_step": 7000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00025858167948800825,
"grad_norm": 242.40017700195312,
"learning_rate": 8.605851979345955e-09,
"loss": 6.130181312561035,
"memory(GiB)": 34.54,
"step": 1,
"token_acc": 0.22661183727419593,
"train_speed(iter/s)": 0.013364
},
{
"epoch": 0.0012929083974400413,
"grad_norm": 655.3204345703125,
"learning_rate": 4.3029259896729774e-08,
"loss": 6.322210311889648,
"memory(GiB)": 56.04,
"step": 5,
"token_acc": 0.1488586387434555,
"train_speed(iter/s)": 0.023012
},
{
"epoch": 0.0025858167948800827,
"grad_norm": 484.6578369140625,
"learning_rate": 8.605851979345955e-08,
"loss": 6.274671173095703,
"memory(GiB)": 56.04,
"step": 10,
"token_acc": 0.1413762226201956,
"train_speed(iter/s)": 0.025227
},
{
"epoch": 0.003878725192320124,
"grad_norm": 247.675048828125,
"learning_rate": 1.2908777969018933e-07,
"loss": 5.9146270751953125,
"memory(GiB)": 56.04,
"step": 15,
"token_acc": 0.2279082774049217,
"train_speed(iter/s)": 0.026453
},
{
"epoch": 0.005171633589760165,
"grad_norm": 334.42694091796875,
"learning_rate": 1.721170395869191e-07,
"loss": 6.069369506835938,
"memory(GiB)": 56.04,
"step": 20,
"token_acc": 0.2284955683104916,
"train_speed(iter/s)": 0.02697
},
{
"epoch": 0.006464541987200207,
"grad_norm": 492.56463623046875,
"learning_rate": 2.151462994836489e-07,
"loss": 6.095654296875,
"memory(GiB)": 56.04,
"step": 25,
"token_acc": 0.24055135615829257,
"train_speed(iter/s)": 0.027272
},
{
"epoch": 0.007757450384640248,
"grad_norm": 182.06201171875,
"learning_rate": 2.5817555938037866e-07,
"loss": 6.0770011901855465,
"memory(GiB)": 76.04,
"step": 30,
"token_acc": 0.1521892967713401,
"train_speed(iter/s)": 0.027249
},
{
"epoch": 0.009050358782080289,
"grad_norm": 519.2252197265625,
"learning_rate": 3.0120481927710845e-07,
"loss": 6.1493080139160154,
"memory(GiB)": 76.04,
"step": 35,
"token_acc": 0.15634508680866296,
"train_speed(iter/s)": 0.027269
},
{
"epoch": 0.01034326717952033,
"grad_norm": 714.878173828125,
"learning_rate": 3.442340791738382e-07,
"loss": 6.561576843261719,
"memory(GiB)": 76.04,
"step": 40,
"token_acc": 0.16204781045060293,
"train_speed(iter/s)": 0.027175
},
{
"epoch": 0.011636175576960372,
"grad_norm": 666.3334350585938,
"learning_rate": 3.8726333907056804e-07,
"loss": 5.985930252075195,
"memory(GiB)": 76.04,
"step": 45,
"token_acc": 0.1617008883063929,
"train_speed(iter/s)": 0.027249
},
{
"epoch": 0.012929083974400414,
"grad_norm": 841.8603515625,
"learning_rate": 4.302925989672978e-07,
"loss": 6.3445274353027346,
"memory(GiB)": 76.04,
"step": 50,
"token_acc": 0.2033275151335091,
"train_speed(iter/s)": 0.027146
},
{
"epoch": 0.014221992371840455,
"grad_norm": 956.3386840820312,
"learning_rate": 4.733218588640276e-07,
"loss": 6.072727966308594,
"memory(GiB)": 76.04,
"step": 55,
"token_acc": 0.18730415801147765,
"train_speed(iter/s)": 0.027337
},
{
"epoch": 0.015514900769280497,
"grad_norm": 145.8020782470703,
"learning_rate": 5.163511187607573e-07,
"loss": 5.959784317016601,
"memory(GiB)": 76.04,
"step": 60,
"token_acc": 0.21014295439074201,
"train_speed(iter/s)": 0.027487
},
{
"epoch": 0.016807809166720537,
"grad_norm": 1471.006591796875,
"learning_rate": 5.593803786574872e-07,
"loss": 5.995822906494141,
"memory(GiB)": 76.04,
"step": 65,
"token_acc": 0.16151609777107784,
"train_speed(iter/s)": 0.027505
},
{
"epoch": 0.018100717564160578,
"grad_norm": 127.47267150878906,
"learning_rate": 6.024096385542169e-07,
"loss": 5.947823333740234,
"memory(GiB)": 76.04,
"step": 70,
"token_acc": 0.17944442339030583,
"train_speed(iter/s)": 0.027514
},
{
"epoch": 0.01939362596160062,
"grad_norm": 103.65353393554688,
"learning_rate": 6.454388984509467e-07,
"loss": 6.049673843383789,
"memory(GiB)": 76.04,
"step": 75,
"token_acc": 0.1430964467005076,
"train_speed(iter/s)": 0.027525
},
{
"epoch": 0.02068653435904066,
"grad_norm": 186.977294921875,
"learning_rate": 6.884681583476764e-07,
"loss": 5.822255706787109,
"memory(GiB)": 76.04,
"step": 80,
"token_acc": 0.20191989407480967,
"train_speed(iter/s)": 0.027535
},
{
"epoch": 0.021979442756480703,
"grad_norm": 487.37152099609375,
"learning_rate": 7.314974182444062e-07,
"loss": 5.861573791503906,
"memory(GiB)": 76.04,
"step": 85,
"token_acc": 0.25108269048585735,
"train_speed(iter/s)": 0.027566
},
{
"epoch": 0.023272351153920744,
"grad_norm": 536.355712890625,
"learning_rate": 7.745266781411361e-07,
"loss": 5.783546447753906,
"memory(GiB)": 76.04,
"step": 90,
"token_acc": 0.1870295076687734,
"train_speed(iter/s)": 0.027608
},
{
"epoch": 0.024565259551360786,
"grad_norm": 285.6499938964844,
"learning_rate": 8.175559380378658e-07,
"loss": 5.746212768554687,
"memory(GiB)": 76.04,
"step": 95,
"token_acc": 0.18162528216704288,
"train_speed(iter/s)": 0.02758
},
{
"epoch": 0.025858167948800827,
"grad_norm": 1952.98583984375,
"learning_rate": 8.605851979345956e-07,
"loss": 5.601922607421875,
"memory(GiB)": 76.04,
"step": 100,
"token_acc": 0.2339803356501102,
"train_speed(iter/s)": 0.027603
},
{
"epoch": 0.02715107634624087,
"grad_norm": 93.57474517822266,
"learning_rate": 9.036144578313254e-07,
"loss": 5.615843200683594,
"memory(GiB)": 76.04,
"step": 105,
"token_acc": 0.2615709535364429,
"train_speed(iter/s)": 0.027644
},
{
"epoch": 0.02844398474368091,
"grad_norm": 229.4741973876953,
"learning_rate": 9.466437177280551e-07,
"loss": 5.597761917114258,
"memory(GiB)": 76.04,
"step": 110,
"token_acc": 0.15636001564666974,
"train_speed(iter/s)": 0.027645
},
{
"epoch": 0.029736893141120952,
"grad_norm": 36.493988037109375,
"learning_rate": 9.896729776247848e-07,
"loss": 5.327813720703125,
"memory(GiB)": 76.04,
"step": 115,
"token_acc": 0.1776396590866798,
"train_speed(iter/s)": 0.027731
},
{
"epoch": 0.031029801538560994,
"grad_norm": 35.07899475097656,
"learning_rate": 1.0327022375215146e-06,
"loss": 5.352128982543945,
"memory(GiB)": 76.04,
"step": 120,
"token_acc": 0.24768539325842698,
"train_speed(iter/s)": 0.027714
},
{
"epoch": 0.03232270993600103,
"grad_norm": 144.920166015625,
"learning_rate": 1.0757314974182445e-06,
"loss": 5.0349891662597654,
"memory(GiB)": 76.04,
"step": 125,
"token_acc": 0.2503892788768347,
"train_speed(iter/s)": 0.027712
},
{
"epoch": 0.03361561833344107,
"grad_norm": 38.186100006103516,
"learning_rate": 1.1187607573149743e-06,
"loss": 5.087076950073242,
"memory(GiB)": 76.04,
"step": 130,
"token_acc": 0.24298036336942558,
"train_speed(iter/s)": 0.027764
},
{
"epoch": 0.034908526730881115,
"grad_norm": 130.58258056640625,
"learning_rate": 1.161790017211704e-06,
"loss": 4.898267364501953,
"memory(GiB)": 76.04,
"step": 135,
"token_acc": 0.27113337507827173,
"train_speed(iter/s)": 0.027766
},
{
"epoch": 0.036201435128321156,
"grad_norm": 26.92106056213379,
"learning_rate": 1.2048192771084338e-06,
"loss": 4.7400367736816404,
"memory(GiB)": 76.04,
"step": 140,
"token_acc": 0.20421513969901067,
"train_speed(iter/s)": 0.027799
},
{
"epoch": 0.0374943435257612,
"grad_norm": 208.29981994628906,
"learning_rate": 1.2478485370051637e-06,
"loss": 4.690925598144531,
"memory(GiB)": 76.04,
"step": 145,
"token_acc": 0.22307749241358893,
"train_speed(iter/s)": 0.027793
},
{
"epoch": 0.03878725192320124,
"grad_norm": 364.9195251464844,
"learning_rate": 1.2908777969018935e-06,
"loss": 4.929594421386719,
"memory(GiB)": 76.04,
"step": 150,
"token_acc": 0.23747012178927798,
"train_speed(iter/s)": 0.027767
},
{
"epoch": 0.04008016032064128,
"grad_norm": 361.99853515625,
"learning_rate": 1.3339070567986231e-06,
"loss": 4.708561706542969,
"memory(GiB)": 76.04,
"step": 155,
"token_acc": 0.2766303463977883,
"train_speed(iter/s)": 0.027779
},
{
"epoch": 0.04137306871808132,
"grad_norm": 714.6271362304688,
"learning_rate": 1.3769363166953528e-06,
"loss": 4.614714050292969,
"memory(GiB)": 76.04,
"step": 160,
"token_acc": 0.2798645816540384,
"train_speed(iter/s)": 0.027778
},
{
"epoch": 0.042665977115521364,
"grad_norm": 38.3203010559082,
"learning_rate": 1.4199655765920828e-06,
"loss": 4.407340621948242,
"memory(GiB)": 76.04,
"step": 165,
"token_acc": 0.26736621196222454,
"train_speed(iter/s)": 0.027815
},
{
"epoch": 0.043958885512961406,
"grad_norm": 142.34910583496094,
"learning_rate": 1.4629948364888125e-06,
"loss": 4.265171813964844,
"memory(GiB)": 76.04,
"step": 170,
"token_acc": 0.2892184533278596,
"train_speed(iter/s)": 0.027794
},
{
"epoch": 0.04525179391040145,
"grad_norm": 16.595224380493164,
"learning_rate": 1.5060240963855425e-06,
"loss": 4.142366409301758,
"memory(GiB)": 76.04,
"step": 175,
"token_acc": 0.28310791772330235,
"train_speed(iter/s)": 0.027783
},
{
"epoch": 0.04654470230784149,
"grad_norm": 24.273094177246094,
"learning_rate": 1.5490533562822722e-06,
"loss": 4.011473083496094,
"memory(GiB)": 76.04,
"step": 180,
"token_acc": 0.31321029626032054,
"train_speed(iter/s)": 0.027749
},
{
"epoch": 0.04783761070528153,
"grad_norm": 12.379109382629395,
"learning_rate": 1.5920826161790018e-06,
"loss": 3.945102310180664,
"memory(GiB)": 76.04,
"step": 185,
"token_acc": 0.3472663749960656,
"train_speed(iter/s)": 0.027756
},
{
"epoch": 0.04913051910272157,
"grad_norm": 11.814841270446777,
"learning_rate": 1.6351118760757316e-06,
"loss": 3.744676971435547,
"memory(GiB)": 76.04,
"step": 190,
"token_acc": 0.3549518681677272,
"train_speed(iter/s)": 0.027799
},
{
"epoch": 0.05042342750016161,
"grad_norm": 13.708587646484375,
"learning_rate": 1.6781411359724615e-06,
"loss": 3.6159019470214844,
"memory(GiB)": 76.04,
"step": 195,
"token_acc": 0.34429772852314705,
"train_speed(iter/s)": 0.027816
},
{
"epoch": 0.051716335897601655,
"grad_norm": 10.363809585571289,
"learning_rate": 1.7211703958691911e-06,
"loss": 3.508245086669922,
"memory(GiB)": 76.04,
"step": 200,
"token_acc": 0.34663152792923785,
"train_speed(iter/s)": 0.027818
},
{
"epoch": 0.053009244295041696,
"grad_norm": 10.083678245544434,
"learning_rate": 1.764199655765921e-06,
"loss": 3.42333869934082,
"memory(GiB)": 76.04,
"step": 205,
"token_acc": 0.4063367473915957,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.05430215269248174,
"grad_norm": 8.798538208007812,
"learning_rate": 1.8072289156626508e-06,
"loss": 3.2132949829101562,
"memory(GiB)": 76.04,
"step": 210,
"token_acc": 0.40024092757114893,
"train_speed(iter/s)": 0.027761
},
{
"epoch": 0.05559506108992178,
"grad_norm": 9.79233455657959,
"learning_rate": 1.8502581755593804e-06,
"loss": 3.2455322265625,
"memory(GiB)": 76.04,
"step": 215,
"token_acc": 0.37759151099023586,
"train_speed(iter/s)": 0.027773
},
{
"epoch": 0.05688796948736182,
"grad_norm": 7.508085250854492,
"learning_rate": 1.8932874354561103e-06,
"loss": 3.0217824935913087,
"memory(GiB)": 76.04,
"step": 220,
"token_acc": 0.41623787623514413,
"train_speed(iter/s)": 0.027752
},
{
"epoch": 0.05818087788480186,
"grad_norm": 11.832063674926758,
"learning_rate": 1.93631669535284e-06,
"loss": 3.122202682495117,
"memory(GiB)": 76.04,
"step": 225,
"token_acc": 0.4242820412254325,
"train_speed(iter/s)": 0.027731
},
{
"epoch": 0.059473786282241904,
"grad_norm": 6.528841018676758,
"learning_rate": 1.9793459552495696e-06,
"loss": 2.9143745422363283,
"memory(GiB)": 76.04,
"step": 230,
"token_acc": 0.4459988808058198,
"train_speed(iter/s)": 0.027736
},
{
"epoch": 0.060766694679681946,
"grad_norm": 13.579035758972168,
"learning_rate": 2.0223752151463e-06,
"loss": 2.980012130737305,
"memory(GiB)": 76.04,
"step": 235,
"token_acc": 0.4274643521388717,
"train_speed(iter/s)": 0.027736
},
{
"epoch": 0.06205960307712199,
"grad_norm": 10.801968574523926,
"learning_rate": 2.0654044750430293e-06,
"loss": 2.9146419525146485,
"memory(GiB)": 76.04,
"step": 240,
"token_acc": 0.4537141861882783,
"train_speed(iter/s)": 0.027738
},
{
"epoch": 0.06335251147456203,
"grad_norm": 6.302434921264648,
"learning_rate": 2.1084337349397595e-06,
"loss": 2.8420055389404295,
"memory(GiB)": 76.04,
"step": 245,
"token_acc": 0.4608826083524118,
"train_speed(iter/s)": 0.027761
},
{
"epoch": 0.06464541987200206,
"grad_norm": 6.119657516479492,
"learning_rate": 2.151462994836489e-06,
"loss": 2.787134552001953,
"memory(GiB)": 76.04,
"step": 250,
"token_acc": 0.44650911754500944,
"train_speed(iter/s)": 0.027728
},
{
"epoch": 0.06593832826944211,
"grad_norm": 6.240930080413818,
"learning_rate": 2.194492254733219e-06,
"loss": 2.7430130004882813,
"memory(GiB)": 76.04,
"step": 255,
"token_acc": 0.4696727853152434,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.06723123666688215,
"grad_norm": 6.333411693572998,
"learning_rate": 2.2375215146299486e-06,
"loss": 2.6061046600341795,
"memory(GiB)": 76.04,
"step": 260,
"token_acc": 0.48562300319488816,
"train_speed(iter/s)": 0.027762
},
{
"epoch": 0.0685241450643222,
"grad_norm": 7.289592742919922,
"learning_rate": 2.2805507745266785e-06,
"loss": 2.5609256744384767,
"memory(GiB)": 76.04,
"step": 265,
"token_acc": 0.4682853243301487,
"train_speed(iter/s)": 0.027763
},
{
"epoch": 0.06981705346176223,
"grad_norm": 5.113116264343262,
"learning_rate": 2.323580034423408e-06,
"loss": 2.5773744583129883,
"memory(GiB)": 76.04,
"step": 270,
"token_acc": 0.4764347547290798,
"train_speed(iter/s)": 0.02777
},
{
"epoch": 0.07110996185920228,
"grad_norm": 4.905300140380859,
"learning_rate": 2.3666092943201378e-06,
"loss": 2.493597221374512,
"memory(GiB)": 76.04,
"step": 275,
"token_acc": 0.5032743942370661,
"train_speed(iter/s)": 0.027774
},
{
"epoch": 0.07240287025664231,
"grad_norm": 5.233267307281494,
"learning_rate": 2.4096385542168676e-06,
"loss": 2.4493991851806642,
"memory(GiB)": 76.04,
"step": 280,
"token_acc": 0.5405880959631848,
"train_speed(iter/s)": 0.027795
},
{
"epoch": 0.07369577865408236,
"grad_norm": 8.618249893188477,
"learning_rate": 2.4526678141135975e-06,
"loss": 2.423325538635254,
"memory(GiB)": 76.04,
"step": 285,
"token_acc": 0.530728862973761,
"train_speed(iter/s)": 0.027795
},
{
"epoch": 0.0749886870515224,
"grad_norm": 5.426891803741455,
"learning_rate": 2.4956970740103273e-06,
"loss": 2.376586151123047,
"memory(GiB)": 76.04,
"step": 290,
"token_acc": 0.5145979170223663,
"train_speed(iter/s)": 0.027806
},
{
"epoch": 0.07628159544896244,
"grad_norm": 7.925861358642578,
"learning_rate": 2.538726333907057e-06,
"loss": 2.348302459716797,
"memory(GiB)": 76.04,
"step": 295,
"token_acc": 0.5290513911109377,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.07757450384640248,
"grad_norm": 6.098197937011719,
"learning_rate": 2.581755593803787e-06,
"loss": 2.325778579711914,
"memory(GiB)": 76.04,
"step": 300,
"token_acc": 0.526869597895528,
"train_speed(iter/s)": 0.027824
},
{
"epoch": 0.07886741224384253,
"grad_norm": 4.564492225646973,
"learning_rate": 2.6247848537005164e-06,
"loss": 2.283577728271484,
"memory(GiB)": 76.04,
"step": 305,
"token_acc": 0.5214347367411528,
"train_speed(iter/s)": 0.02782
},
{
"epoch": 0.08016032064128256,
"grad_norm": 5.382349491119385,
"learning_rate": 2.6678141135972463e-06,
"loss": 2.260573959350586,
"memory(GiB)": 76.04,
"step": 310,
"token_acc": 0.5193088205746433,
"train_speed(iter/s)": 0.027827
},
{
"epoch": 0.08145322903872261,
"grad_norm": 4.09184455871582,
"learning_rate": 2.710843373493976e-06,
"loss": 2.202308464050293,
"memory(GiB)": 76.04,
"step": 315,
"token_acc": 0.5255345169316409,
"train_speed(iter/s)": 0.027838
},
{
"epoch": 0.08274613743616265,
"grad_norm": 6.044122695922852,
"learning_rate": 2.7538726333907055e-06,
"loss": 2.0868404388427733,
"memory(GiB)": 76.04,
"step": 320,
"token_acc": 0.5558229225038731,
"train_speed(iter/s)": 0.027868
},
{
"epoch": 0.0840390458336027,
"grad_norm": 6.152168273925781,
"learning_rate": 2.796901893287436e-06,
"loss": 2.019194412231445,
"memory(GiB)": 76.04,
"step": 325,
"token_acc": 0.5501780112094188,
"train_speed(iter/s)": 0.027887
},
{
"epoch": 0.08533195423104273,
"grad_norm": 7.0422773361206055,
"learning_rate": 2.8399311531841657e-06,
"loss": 2.0607986450195312,
"memory(GiB)": 76.04,
"step": 330,
"token_acc": 0.5108016425638279,
"train_speed(iter/s)": 0.027881
},
{
"epoch": 0.08662486262848278,
"grad_norm": 5.497431755065918,
"learning_rate": 2.882960413080895e-06,
"loss": 2.0316564559936525,
"memory(GiB)": 76.04,
"step": 335,
"token_acc": 0.5838930163447251,
"train_speed(iter/s)": 0.027889
},
{
"epoch": 0.08791777102592281,
"grad_norm": 5.31265115737915,
"learning_rate": 2.925989672977625e-06,
"loss": 2.0162227630615233,
"memory(GiB)": 76.04,
"step": 340,
"token_acc": 0.5688795253246195,
"train_speed(iter/s)": 0.027898
},
{
"epoch": 0.08921067942336286,
"grad_norm": 3.3592262268066406,
"learning_rate": 2.9690189328743548e-06,
"loss": 1.976116943359375,
"memory(GiB)": 76.04,
"step": 345,
"token_acc": 0.5811347794931926,
"train_speed(iter/s)": 0.027902
},
{
"epoch": 0.0905035878208029,
"grad_norm": 4.368274211883545,
"learning_rate": 3.012048192771085e-06,
"loss": 1.9872814178466798,
"memory(GiB)": 76.04,
"step": 350,
"token_acc": 0.5612575668814684,
"train_speed(iter/s)": 0.027903
},
{
"epoch": 0.09179649621824294,
"grad_norm": 4.863376140594482,
"learning_rate": 3.0550774526678145e-06,
"loss": 1.9638809204101562,
"memory(GiB)": 76.04,
"step": 355,
"token_acc": 0.611854751336805,
"train_speed(iter/s)": 0.027901
},
{
"epoch": 0.09308940461568298,
"grad_norm": 4.721250057220459,
"learning_rate": 3.0981067125645443e-06,
"loss": 1.9370807647705077,
"memory(GiB)": 76.04,
"step": 360,
"token_acc": 0.584777590187093,
"train_speed(iter/s)": 0.027911
},
{
"epoch": 0.09438231301312303,
"grad_norm": 3.189765691757202,
"learning_rate": 3.1411359724612737e-06,
"loss": 1.9602073669433593,
"memory(GiB)": 76.04,
"step": 365,
"token_acc": 0.6110412738319715,
"train_speed(iter/s)": 0.027898
},
{
"epoch": 0.09567522141056306,
"grad_norm": 4.200387001037598,
"learning_rate": 3.1841652323580036e-06,
"loss": 1.8618885040283204,
"memory(GiB)": 76.04,
"step": 370,
"token_acc": 0.6040174341481903,
"train_speed(iter/s)": 0.027895
},
{
"epoch": 0.09696812980800311,
"grad_norm": 3.2454254627227783,
"learning_rate": 3.2271944922547334e-06,
"loss": 1.821019744873047,
"memory(GiB)": 76.04,
"step": 375,
"token_acc": 0.6053048476893134,
"train_speed(iter/s)": 0.027882
},
{
"epoch": 0.09826103820544314,
"grad_norm": 3.34602427482605,
"learning_rate": 3.2702237521514633e-06,
"loss": 1.814716911315918,
"memory(GiB)": 76.04,
"step": 380,
"token_acc": 0.6051584430617856,
"train_speed(iter/s)": 0.027888
},
{
"epoch": 0.09955394660288319,
"grad_norm": 3.311286211013794,
"learning_rate": 3.313253012048193e-06,
"loss": 1.8220832824707032,
"memory(GiB)": 76.04,
"step": 385,
"token_acc": 0.5944087085601187,
"train_speed(iter/s)": 0.027892
},
{
"epoch": 0.10084685500032323,
"grad_norm": 5.586461544036865,
"learning_rate": 3.356282271944923e-06,
"loss": 1.7891130447387695,
"memory(GiB)": 76.04,
"step": 390,
"token_acc": 0.618925967321305,
"train_speed(iter/s)": 0.027898
},
{
"epoch": 0.10213976339776328,
"grad_norm": 6.616051197052002,
"learning_rate": 3.3993115318416524e-06,
"loss": 1.7373517990112304,
"memory(GiB)": 76.04,
"step": 395,
"token_acc": 0.6165356711003628,
"train_speed(iter/s)": 0.027877
},
{
"epoch": 0.10343267179520331,
"grad_norm": 2.835207223892212,
"learning_rate": 3.4423407917383822e-06,
"loss": 1.6772958755493164,
"memory(GiB)": 76.04,
"step": 400,
"token_acc": 0.6225554448697111,
"train_speed(iter/s)": 0.027881
},
{
"epoch": 0.10472558019264334,
"grad_norm": 2.6146092414855957,
"learning_rate": 3.485370051635112e-06,
"loss": 1.7310756683349608,
"memory(GiB)": 76.04,
"step": 405,
"token_acc": 0.615843204488778,
"train_speed(iter/s)": 0.027878
},
{
"epoch": 0.10601848859008339,
"grad_norm": 4.303338527679443,
"learning_rate": 3.528399311531842e-06,
"loss": 1.6970853805541992,
"memory(GiB)": 76.04,
"step": 410,
"token_acc": 0.6082253291152159,
"train_speed(iter/s)": 0.027889
},
{
"epoch": 0.10731139698752343,
"grad_norm": 3.152858257293701,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.6324186325073242,
"memory(GiB)": 76.04,
"step": 415,
"token_acc": 0.6986581950424278,
"train_speed(iter/s)": 0.027905
},
{
"epoch": 0.10860430538496348,
"grad_norm": 2.5081074237823486,
"learning_rate": 3.6144578313253016e-06,
"loss": 1.6505191802978516,
"memory(GiB)": 76.04,
"step": 420,
"token_acc": 0.6553758610362383,
"train_speed(iter/s)": 0.027916
},
{
"epoch": 0.10989721378240351,
"grad_norm": 3.3457190990448,
"learning_rate": 3.657487091222031e-06,
"loss": 1.6839021682739257,
"memory(GiB)": 76.04,
"step": 425,
"token_acc": 0.6694164051234202,
"train_speed(iter/s)": 0.027902
},
{
"epoch": 0.11119012217984356,
"grad_norm": 2.359487295150757,
"learning_rate": 3.700516351118761e-06,
"loss": 1.6301973342895508,
"memory(GiB)": 76.04,
"step": 430,
"token_acc": 0.6513765837858436,
"train_speed(iter/s)": 0.027902
},
{
"epoch": 0.1124830305772836,
"grad_norm": 2.1152243614196777,
"learning_rate": 3.743545611015491e-06,
"loss": 1.5873135566711425,
"memory(GiB)": 76.04,
"step": 435,
"token_acc": 0.6180145649674205,
"train_speed(iter/s)": 0.027907
},
{
"epoch": 0.11377593897472364,
"grad_norm": 2.4160220623016357,
"learning_rate": 3.7865748709122206e-06,
"loss": 1.5810693740844726,
"memory(GiB)": 76.04,
"step": 440,
"token_acc": 0.6297903669547023,
"train_speed(iter/s)": 0.027905
},
{
"epoch": 0.11506884737216368,
"grad_norm": 2.329163074493408,
"learning_rate": 3.8296041308089504e-06,
"loss": 1.5028837203979493,
"memory(GiB)": 76.04,
"step": 445,
"token_acc": 0.6969645118236885,
"train_speed(iter/s)": 0.027912
},
{
"epoch": 0.11636175576960373,
"grad_norm": 2.004277467727661,
"learning_rate": 3.87263339070568e-06,
"loss": 1.5502431869506836,
"memory(GiB)": 76.04,
"step": 450,
"token_acc": 0.6654662441179295,
"train_speed(iter/s)": 0.027911
},
{
"epoch": 0.11765466416704376,
"grad_norm": 291.7929382324219,
"learning_rate": 3.91566265060241e-06,
"loss": 1.6129791259765625,
"memory(GiB)": 76.04,
"step": 455,
"token_acc": 0.623754295532646,
"train_speed(iter/s)": 0.027925
},
{
"epoch": 0.11894757256448381,
"grad_norm": 5.003607749938965,
"learning_rate": 3.958691910499139e-06,
"loss": 1.523465919494629,
"memory(GiB)": 76.04,
"step": 460,
"token_acc": 0.6120657218111125,
"train_speed(iter/s)": 0.027923
},
{
"epoch": 0.12024048096192384,
"grad_norm": 2.6622467041015625,
"learning_rate": 4.00172117039587e-06,
"loss": 1.5748241424560547,
"memory(GiB)": 76.04,
"step": 465,
"token_acc": 0.6213646902947996,
"train_speed(iter/s)": 0.027931
},
{
"epoch": 0.12153338935936389,
"grad_norm": 2.4443795680999756,
"learning_rate": 4.0447504302926e-06,
"loss": 1.526081657409668,
"memory(GiB)": 76.04,
"step": 470,
"token_acc": 0.6476268348713157,
"train_speed(iter/s)": 0.027937
},
{
"epoch": 0.12282629775680393,
"grad_norm": 2.6567704677581787,
"learning_rate": 4.087779690189329e-06,
"loss": 1.5151639938354493,
"memory(GiB)": 76.04,
"step": 475,
"token_acc": 0.6848871707273217,
"train_speed(iter/s)": 0.027943
},
{
"epoch": 0.12411920615424397,
"grad_norm": 2.540998935699463,
"learning_rate": 4.1308089500860585e-06,
"loss": 1.4633543014526367,
"memory(GiB)": 76.04,
"step": 480,
"token_acc": 0.6596202575584019,
"train_speed(iter/s)": 0.027936
},
{
"epoch": 0.12541211455168402,
"grad_norm": 2.337562084197998,
"learning_rate": 4.173838209982788e-06,
"loss": 1.4880233764648438,
"memory(GiB)": 76.04,
"step": 485,
"token_acc": 0.6452993555369705,
"train_speed(iter/s)": 0.027931
},
{
"epoch": 0.12670502294912406,
"grad_norm": 2.5540599822998047,
"learning_rate": 4.216867469879519e-06,
"loss": 1.4692201614379883,
"memory(GiB)": 76.04,
"step": 490,
"token_acc": 0.6396389676264359,
"train_speed(iter/s)": 0.027945
},
{
"epoch": 0.1279979313465641,
"grad_norm": 2.3116421699523926,
"learning_rate": 4.259896729776248e-06,
"loss": 1.4556035995483398,
"memory(GiB)": 76.04,
"step": 495,
"token_acc": 0.6138832517607735,
"train_speed(iter/s)": 0.02794
},
{
"epoch": 0.12929083974400413,
"grad_norm": 1.984755039215088,
"learning_rate": 4.302925989672978e-06,
"loss": 1.4775214195251465,
"memory(GiB)": 76.04,
"step": 500,
"token_acc": 0.6642163033079905,
"train_speed(iter/s)": 0.027946
},
{
"epoch": 0.1305837481414442,
"grad_norm": 2.973404884338379,
"learning_rate": 4.345955249569708e-06,
"loss": 1.421070671081543,
"memory(GiB)": 76.04,
"step": 505,
"token_acc": 0.6482611781405252,
"train_speed(iter/s)": 0.027935
},
{
"epoch": 0.13187665653888422,
"grad_norm": 2.4486446380615234,
"learning_rate": 4.388984509466438e-06,
"loss": 1.4313584327697755,
"memory(GiB)": 76.04,
"step": 510,
"token_acc": 0.6798098365476511,
"train_speed(iter/s)": 0.027937
},
{
"epoch": 0.13316956493632426,
"grad_norm": 2.326204776763916,
"learning_rate": 4.4320137693631674e-06,
"loss": 1.4402247428894044,
"memory(GiB)": 76.04,
"step": 515,
"token_acc": 0.7008269899445541,
"train_speed(iter/s)": 0.02794
},
{
"epoch": 0.1344624733337643,
"grad_norm": 1.8890414237976074,
"learning_rate": 4.475043029259897e-06,
"loss": 1.3767863273620606,
"memory(GiB)": 76.04,
"step": 520,
"token_acc": 0.6819113223176527,
"train_speed(iter/s)": 0.027946
},
{
"epoch": 0.13575538173120436,
"grad_norm": 2.8504834175109863,
"learning_rate": 4.518072289156627e-06,
"loss": 1.407078170776367,
"memory(GiB)": 76.04,
"step": 525,
"token_acc": 0.683960224816256,
"train_speed(iter/s)": 0.027961
},
{
"epoch": 0.1370482901286444,
"grad_norm": 2.0273118019104004,
"learning_rate": 4.561101549053357e-06,
"loss": 1.336355972290039,
"memory(GiB)": 76.04,
"step": 530,
"token_acc": 0.6482288828337874,
"train_speed(iter/s)": 0.027971
},
{
"epoch": 0.13834119852608442,
"grad_norm": 1.9206314086914062,
"learning_rate": 4.604130808950086e-06,
"loss": 1.3975639343261719,
"memory(GiB)": 76.04,
"step": 535,
"token_acc": 0.7144246703653001,
"train_speed(iter/s)": 0.027969
},
{
"epoch": 0.13963410692352446,
"grad_norm": 2.952894687652588,
"learning_rate": 4.647160068846816e-06,
"loss": 1.372206974029541,
"memory(GiB)": 76.04,
"step": 540,
"token_acc": 0.6865656633371321,
"train_speed(iter/s)": 0.027968
},
{
"epoch": 0.14092701532096452,
"grad_norm": 1.6340878009796143,
"learning_rate": 4.6901893287435465e-06,
"loss": 1.373509120941162,
"memory(GiB)": 76.04,
"step": 545,
"token_acc": 0.6471132494448557,
"train_speed(iter/s)": 0.027965
},
{
"epoch": 0.14221992371840456,
"grad_norm": 1.9484697580337524,
"learning_rate": 4.7332185886402755e-06,
"loss": 1.3788504600524902,
"memory(GiB)": 76.04,
"step": 550,
"token_acc": 0.6455056445137216,
"train_speed(iter/s)": 0.02796
},
{
"epoch": 0.1435128321158446,
"grad_norm": 2.1868441104888916,
"learning_rate": 4.776247848537005e-06,
"loss": 1.3279705047607422,
"memory(GiB)": 76.04,
"step": 555,
"token_acc": 0.6940514224859273,
"train_speed(iter/s)": 0.027951
},
{
"epoch": 0.14480574051328463,
"grad_norm": 1.7936123609542847,
"learning_rate": 4.819277108433735e-06,
"loss": 1.2733698844909669,
"memory(GiB)": 76.04,
"step": 560,
"token_acc": 0.6824586324720697,
"train_speed(iter/s)": 0.027957
},
{
"epoch": 0.1460986489107247,
"grad_norm": 2.035456418991089,
"learning_rate": 4.862306368330465e-06,
"loss": 1.3023791313171387,
"memory(GiB)": 76.04,
"step": 565,
"token_acc": 0.732261012611954,
"train_speed(iter/s)": 0.027962
},
{
"epoch": 0.14739155730816472,
"grad_norm": 1.8656188249588013,
"learning_rate": 4.905335628227195e-06,
"loss": 1.2920080184936524,
"memory(GiB)": 76.04,
"step": 570,
"token_acc": 0.7507948232120492,
"train_speed(iter/s)": 0.027955
},
{
"epoch": 0.14868446570560476,
"grad_norm": 2.3736183643341064,
"learning_rate": 4.948364888123925e-06,
"loss": 1.3393060684204101,
"memory(GiB)": 76.04,
"step": 575,
"token_acc": 0.6658280922431866,
"train_speed(iter/s)": 0.027956
},
{
"epoch": 0.1499773741030448,
"grad_norm": 1.932016372680664,
"learning_rate": 4.991394148020655e-06,
"loss": 1.2938769340515137,
"memory(GiB)": 76.04,
"step": 580,
"token_acc": 0.6553120323915148,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.15127028250048485,
"grad_norm": 2.1315174102783203,
"learning_rate": 4.999998374576611e-06,
"loss": 1.3226532936096191,
"memory(GiB)": 76.04,
"step": 585,
"token_acc": 0.6744726857329363,
"train_speed(iter/s)": 0.027952
},
{
"epoch": 0.1525631908979249,
"grad_norm": 1.8392802476882935,
"learning_rate": 4.999991771297712e-06,
"loss": 1.298147964477539,
"memory(GiB)": 76.04,
"step": 590,
"token_acc": 0.6859605911330049,
"train_speed(iter/s)": 0.027946
},
{
"epoch": 0.15385609929536492,
"grad_norm": 1.761626124382019,
"learning_rate": 4.999980088587748e-06,
"loss": 1.3261050224304198,
"memory(GiB)": 76.04,
"step": 595,
"token_acc": 0.7036491873658387,
"train_speed(iter/s)": 0.027948
},
{
"epoch": 0.15514900769280496,
"grad_norm": 1.6543495655059814,
"learning_rate": 4.999963326470457e-06,
"loss": 1.2572940826416015,
"memory(GiB)": 76.04,
"step": 600,
"token_acc": 0.6750695088044486,
"train_speed(iter/s)": 0.027957
},
{
"epoch": 0.156441916090245,
"grad_norm": 1.7115892171859741,
"learning_rate": 4.999941484979894e-06,
"loss": 1.279404354095459,
"memory(GiB)": 76.04,
"step": 605,
"token_acc": 0.6865163550249149,
"train_speed(iter/s)": 0.027961
},
{
"epoch": 0.15773482448768505,
"grad_norm": 1.595937967300415,
"learning_rate": 4.999914564160437e-06,
"loss": 1.239435577392578,
"memory(GiB)": 76.04,
"step": 610,
"token_acc": 0.6807362476469357,
"train_speed(iter/s)": 0.02795
},
{
"epoch": 0.1590277328851251,
"grad_norm": 1.494962215423584,
"learning_rate": 4.9998825640667835e-06,
"loss": 1.249193286895752,
"memory(GiB)": 76.04,
"step": 615,
"token_acc": 0.7230293299917766,
"train_speed(iter/s)": 0.027954
},
{
"epoch": 0.16032064128256512,
"grad_norm": 1.5208592414855957,
"learning_rate": 4.99984548476395e-06,
"loss": 1.259343719482422,
"memory(GiB)": 76.04,
"step": 620,
"token_acc": 0.7032384586844755,
"train_speed(iter/s)": 0.027964
},
{
"epoch": 0.16161354968000516,
"grad_norm": 1.6536712646484375,
"learning_rate": 4.999803326327274e-06,
"loss": 1.2767888069152833,
"memory(GiB)": 76.04,
"step": 625,
"token_acc": 0.6942289403874068,
"train_speed(iter/s)": 0.027964
},
{
"epoch": 0.16290645807744522,
"grad_norm": 1.6706756353378296,
"learning_rate": 4.9997560888424115e-06,
"loss": 1.2389703750610352,
"memory(GiB)": 76.04,
"step": 630,
"token_acc": 0.7009845373278569,
"train_speed(iter/s)": 0.027965
},
{
"epoch": 0.16419936647488526,
"grad_norm": 1.7092621326446533,
"learning_rate": 4.999703772405339e-06,
"loss": 1.1957599639892578,
"memory(GiB)": 76.04,
"step": 635,
"token_acc": 0.7072115946546149,
"train_speed(iter/s)": 0.027969
},
{
"epoch": 0.1654922748723253,
"grad_norm": 1.8869761228561401,
"learning_rate": 4.999646377122352e-06,
"loss": 1.1767961502075195,
"memory(GiB)": 76.04,
"step": 640,
"token_acc": 0.7185826888756692,
"train_speed(iter/s)": 0.027965
},
{
"epoch": 0.16678518326976532,
"grad_norm": 1.5505192279815674,
"learning_rate": 4.9995839031100636e-06,
"loss": 1.2072343826293945,
"memory(GiB)": 76.04,
"step": 645,
"token_acc": 0.6714076782449726,
"train_speed(iter/s)": 0.027966
},
{
"epoch": 0.1680780916672054,
"grad_norm": 1.6234943866729736,
"learning_rate": 4.9995163504954105e-06,
"loss": 1.1328813552856445,
"memory(GiB)": 76.04,
"step": 650,
"token_acc": 0.718151112416477,
"train_speed(iter/s)": 0.02796
},
{
"epoch": 0.16937100006464542,
"grad_norm": 1.6178089380264282,
"learning_rate": 4.999443719415641e-06,
"loss": 1.2163790702819823,
"memory(GiB)": 76.04,
"step": 655,
"token_acc": 0.7040156056713294,
"train_speed(iter/s)": 0.027964
},
{
"epoch": 0.17066390846208546,
"grad_norm": 1.6068527698516846,
"learning_rate": 4.999366010018328e-06,
"loss": 1.254256248474121,
"memory(GiB)": 76.04,
"step": 660,
"token_acc": 0.6559546915269338,
"train_speed(iter/s)": 0.02796
},
{
"epoch": 0.1719568168595255,
"grad_norm": 1.6556439399719238,
"learning_rate": 4.999283222461359e-06,
"loss": 1.1994304656982422,
"memory(GiB)": 76.04,
"step": 665,
"token_acc": 0.7349007266163743,
"train_speed(iter/s)": 0.027956
},
{
"epoch": 0.17324972525696555,
"grad_norm": 1.3648124933242798,
"learning_rate": 4.999195356912941e-06,
"loss": 1.1895877838134765,
"memory(GiB)": 76.04,
"step": 670,
"token_acc": 0.7510124364534566,
"train_speed(iter/s)": 0.027953
},
{
"epoch": 0.1745426336544056,
"grad_norm": 1.3924592733383179,
"learning_rate": 4.999102413551594e-06,
"loss": 1.1863578796386718,
"memory(GiB)": 76.04,
"step": 675,
"token_acc": 0.7059563448020718,
"train_speed(iter/s)": 0.02796
},
{
"epoch": 0.17583554205184562,
"grad_norm": 1.3376160860061646,
"learning_rate": 4.9990043925661625e-06,
"loss": 1.2073113441467285,
"memory(GiB)": 76.04,
"step": 680,
"token_acc": 0.7219528395881767,
"train_speed(iter/s)": 0.027965
},
{
"epoch": 0.17712845044928566,
"grad_norm": 1.3140792846679688,
"learning_rate": 4.998901294155801e-06,
"loss": 1.1344953536987306,
"memory(GiB)": 76.04,
"step": 685,
"token_acc": 0.7513243683781581,
"train_speed(iter/s)": 0.027967
},
{
"epoch": 0.17842135884672572,
"grad_norm": 1.3123114109039307,
"learning_rate": 4.9987931185299836e-06,
"loss": 1.1784892082214355,
"memory(GiB)": 76.04,
"step": 690,
"token_acc": 0.6674288089794221,
"train_speed(iter/s)": 0.027966
},
{
"epoch": 0.17971426724416575,
"grad_norm": 2.659935235977173,
"learning_rate": 4.998679865908499e-06,
"loss": 1.157151985168457,
"memory(GiB)": 76.04,
"step": 695,
"token_acc": 0.7257713001430225,
"train_speed(iter/s)": 0.027976
},
{
"epoch": 0.1810071756416058,
"grad_norm": 1.3218928575515747,
"learning_rate": 4.998561536521452e-06,
"loss": 1.1583141326904296,
"memory(GiB)": 76.04,
"step": 700,
"token_acc": 0.7759477598403773,
"train_speed(iter/s)": 0.027968
},
{
"epoch": 0.18230008403904582,
"grad_norm": 1.296757698059082,
"learning_rate": 4.998438130609261e-06,
"loss": 1.1259532928466798,
"memory(GiB)": 76.04,
"step": 705,
"token_acc": 0.727386377384361,
"train_speed(iter/s)": 0.02797
},
{
"epoch": 0.18359299243648589,
"grad_norm": 1.4752744436264038,
"learning_rate": 4.9983096484226605e-06,
"loss": 1.1163427352905273,
"memory(GiB)": 76.04,
"step": 710,
"token_acc": 0.7253694101134547,
"train_speed(iter/s)": 0.027972
},
{
"epoch": 0.18488590083392592,
"grad_norm": 1.3070727586746216,
"learning_rate": 4.998176090222697e-06,
"loss": 1.1096561431884766,
"memory(GiB)": 76.04,
"step": 715,
"token_acc": 0.7192952446117004,
"train_speed(iter/s)": 0.027971
},
{
"epoch": 0.18617880923136595,
"grad_norm": 1.4229007959365845,
"learning_rate": 4.998037456280732e-06,
"loss": 1.1451845169067383,
"memory(GiB)": 76.04,
"step": 720,
"token_acc": 0.7217227852239294,
"train_speed(iter/s)": 0.027973
},
{
"epoch": 0.187471717628806,
"grad_norm": 1.2608271837234497,
"learning_rate": 4.9978937468784376e-06,
"loss": 1.106486701965332,
"memory(GiB)": 76.04,
"step": 725,
"token_acc": 0.7184099215637961,
"train_speed(iter/s)": 0.027972
},
{
"epoch": 0.18876462602624605,
"grad_norm": 1.3104456663131714,
"learning_rate": 4.9977449623078015e-06,
"loss": 1.1219176292419433,
"memory(GiB)": 76.04,
"step": 730,
"token_acc": 0.7516463274234401,
"train_speed(iter/s)": 0.027972
},
{
"epoch": 0.1900575344236861,
"grad_norm": 1.353380799293518,
"learning_rate": 4.9975911028711195e-06,
"loss": 1.1417633056640626,
"memory(GiB)": 76.04,
"step": 735,
"token_acc": 0.739439049637699,
"train_speed(iter/s)": 0.027974
},
{
"epoch": 0.19135044282112612,
"grad_norm": 1.4334852695465088,
"learning_rate": 4.997432168881002e-06,
"loss": 1.1226820945739746,
"memory(GiB)": 76.04,
"step": 740,
"token_acc": 0.7192217376719222,
"train_speed(iter/s)": 0.027973
},
{
"epoch": 0.19264335121856616,
"grad_norm": 1.3817933797836304,
"learning_rate": 4.997268160660366e-06,
"loss": 1.1217589378356934,
"memory(GiB)": 76.04,
"step": 745,
"token_acc": 0.731275833562965,
"train_speed(iter/s)": 0.027968
},
{
"epoch": 0.19393625961600622,
"grad_norm": 1.2183865308761597,
"learning_rate": 4.99709907854244e-06,
"loss": 1.1149643898010253,
"memory(GiB)": 76.04,
"step": 750,
"token_acc": 0.7301776086267048,
"train_speed(iter/s)": 0.027963
},
{
"epoch": 0.19522916801344625,
"grad_norm": 1.2150975465774536,
"learning_rate": 4.9969249228707625e-06,
"loss": 1.0912652969360352,
"memory(GiB)": 76.04,
"step": 755,
"token_acc": 0.7373849358328294,
"train_speed(iter/s)": 0.027958
},
{
"epoch": 0.1965220764108863,
"grad_norm": 1.3223531246185303,
"learning_rate": 4.996745693999179e-06,
"loss": 1.1588199615478516,
"memory(GiB)": 76.04,
"step": 760,
"token_acc": 0.7059101248980643,
"train_speed(iter/s)": 0.027957
},
{
"epoch": 0.19781498480832632,
"grad_norm": 1.419268250465393,
"learning_rate": 4.996561392291842e-06,
"loss": 1.1392223358154296,
"memory(GiB)": 76.04,
"step": 765,
"token_acc": 0.6890176058642581,
"train_speed(iter/s)": 0.027957
},
{
"epoch": 0.19910789320576638,
"grad_norm": 1.4065704345703125,
"learning_rate": 4.996372018123213e-06,
"loss": 1.0843055725097657,
"memory(GiB)": 76.04,
"step": 770,
"token_acc": 0.7391510740140037,
"train_speed(iter/s)": 0.027954
},
{
"epoch": 0.20040080160320642,
"grad_norm": 1.3634155988693237,
"learning_rate": 4.996177571878058e-06,
"loss": 1.1053363800048828,
"memory(GiB)": 76.04,
"step": 775,
"token_acc": 0.7354631733725437,
"train_speed(iter/s)": 0.027955
},
{
"epoch": 0.20169371000064645,
"grad_norm": 1.2733179330825806,
"learning_rate": 4.995978053951449e-06,
"loss": 1.120106315612793,
"memory(GiB)": 76.04,
"step": 780,
"token_acc": 0.7424737177445047,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.2029866183980865,
"grad_norm": 1.224840521812439,
"learning_rate": 4.995773464748763e-06,
"loss": 1.117567253112793,
"memory(GiB)": 76.04,
"step": 785,
"token_acc": 0.7321232876712329,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.20427952679552655,
"grad_norm": 1.5229790210723877,
"learning_rate": 4.995563804685679e-06,
"loss": 1.0795328140258789,
"memory(GiB)": 76.04,
"step": 790,
"token_acc": 0.7580057607590647,
"train_speed(iter/s)": 0.027946
},
{
"epoch": 0.20557243519296658,
"grad_norm": 1.4150093793869019,
"learning_rate": 4.9953490741881796e-06,
"loss": 1.1043382644653321,
"memory(GiB)": 76.04,
"step": 795,
"token_acc": 0.700471466457126,
"train_speed(iter/s)": 0.027945
},
{
"epoch": 0.20686534359040662,
"grad_norm": 1.4656771421432495,
"learning_rate": 4.9951292736925515e-06,
"loss": 1.0530550956726075,
"memory(GiB)": 76.04,
"step": 800,
"token_acc": 0.7616183012073713,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.20815825198784665,
"grad_norm": 1.2729520797729492,
"learning_rate": 4.994904403645378e-06,
"loss": 1.0495079040527344,
"memory(GiB)": 76.04,
"step": 805,
"token_acc": 0.7524990313831849,
"train_speed(iter/s)": 0.027946
},
{
"epoch": 0.2094511603852867,
"grad_norm": 1.4444711208343506,
"learning_rate": 4.9946744645035496e-06,
"loss": 1.0937026977539062,
"memory(GiB)": 76.04,
"step": 810,
"token_acc": 0.6928733031674208,
"train_speed(iter/s)": 0.027942
},
{
"epoch": 0.21074406878272675,
"grad_norm": 1.390000820159912,
"learning_rate": 4.994439456734248e-06,
"loss": 1.092994499206543,
"memory(GiB)": 76.04,
"step": 815,
"token_acc": 0.7247861835309672,
"train_speed(iter/s)": 0.027945
},
{
"epoch": 0.21203697718016679,
"grad_norm": 1.1274871826171875,
"learning_rate": 4.994199380814958e-06,
"loss": 1.064702320098877,
"memory(GiB)": 76.04,
"step": 820,
"token_acc": 0.7177709296353364,
"train_speed(iter/s)": 0.027947
},
{
"epoch": 0.21332988557760682,
"grad_norm": 1.1373642683029175,
"learning_rate": 4.9939542372334625e-06,
"loss": 1.0526296615600585,
"memory(GiB)": 76.04,
"step": 825,
"token_acc": 0.7360295948493579,
"train_speed(iter/s)": 0.027945
},
{
"epoch": 0.21462279397504685,
"grad_norm": 1.2715613842010498,
"learning_rate": 4.993704026487837e-06,
"loss": 1.0497617721557617,
"memory(GiB)": 76.04,
"step": 830,
"token_acc": 0.7389768524290404,
"train_speed(iter/s)": 0.027948
},
{
"epoch": 0.21591570237248692,
"grad_norm": 1.3336645364761353,
"learning_rate": 4.993448749086455e-06,
"loss": 1.0744933128356933,
"memory(GiB)": 76.04,
"step": 835,
"token_acc": 0.7657203842049093,
"train_speed(iter/s)": 0.027954
},
{
"epoch": 0.21720861076992695,
"grad_norm": 1.3586411476135254,
"learning_rate": 4.9931884055479855e-06,
"loss": 1.0742916107177733,
"memory(GiB)": 76.04,
"step": 840,
"token_acc": 0.7213896713615023,
"train_speed(iter/s)": 0.027951
},
{
"epoch": 0.218501519167367,
"grad_norm": 1.2536128759384155,
"learning_rate": 4.992922996401386e-06,
"loss": 1.0365938186645507,
"memory(GiB)": 76.04,
"step": 845,
"token_acc": 0.7336075791573173,
"train_speed(iter/s)": 0.027954
},
{
"epoch": 0.21979442756480702,
"grad_norm": 1.0951530933380127,
"learning_rate": 4.992652522185912e-06,
"loss": 1.0514394760131835,
"memory(GiB)": 76.04,
"step": 850,
"token_acc": 0.7264080844107594,
"train_speed(iter/s)": 0.027953
},
{
"epoch": 0.22108733596224708,
"grad_norm": 1.302139401435852,
"learning_rate": 4.992376983451106e-06,
"loss": 1.0614931106567382,
"memory(GiB)": 76.04,
"step": 855,
"token_acc": 0.7223553005403599,
"train_speed(iter/s)": 0.02795
},
{
"epoch": 0.22238024435968712,
"grad_norm": 1.2117213010787964,
"learning_rate": 4.992096380756802e-06,
"loss": 1.0211700439453124,
"memory(GiB)": 76.04,
"step": 860,
"token_acc": 0.7403405370006548,
"train_speed(iter/s)": 0.027946
},
{
"epoch": 0.22367315275712715,
"grad_norm": 1.1893075704574585,
"learning_rate": 4.9918107146731234e-06,
"loss": 1.02754545211792,
"memory(GiB)": 76.04,
"step": 865,
"token_acc": 0.7416530944625407,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.2249660611545672,
"grad_norm": 1.3030766248703003,
"learning_rate": 4.991519985780479e-06,
"loss": 1.0705657958984376,
"memory(GiB)": 76.04,
"step": 870,
"token_acc": 0.7212401717504043,
"train_speed(iter/s)": 0.027952
},
{
"epoch": 0.22625896955200725,
"grad_norm": 1.3120644092559814,
"learning_rate": 4.991224194669567e-06,
"loss": 1.0438270568847656,
"memory(GiB)": 76.04,
"step": 875,
"token_acc": 0.7455082274151154,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.22755187794944728,
"grad_norm": 1.2804142236709595,
"learning_rate": 4.99092334194137e-06,
"loss": 1.0537703514099122,
"memory(GiB)": 76.04,
"step": 880,
"token_acc": 0.7769719655320311,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.22884478634688732,
"grad_norm": 1.141849160194397,
"learning_rate": 4.990617428207153e-06,
"loss": 1.0109355926513672,
"memory(GiB)": 76.04,
"step": 885,
"token_acc": 0.7504947627552252,
"train_speed(iter/s)": 0.027943
},
{
"epoch": 0.23013769474432735,
"grad_norm": 1.3178867101669312,
"learning_rate": 4.990306454088467e-06,
"loss": 1.0286626815795898,
"memory(GiB)": 76.04,
"step": 890,
"token_acc": 0.7264516717602518,
"train_speed(iter/s)": 0.027938
},
{
"epoch": 0.23143060314176742,
"grad_norm": 1.5300428867340088,
"learning_rate": 4.98999042021714e-06,
"loss": 1.0479674339294434,
"memory(GiB)": 76.04,
"step": 895,
"token_acc": 0.7131689263189698,
"train_speed(iter/s)": 0.027942
},
{
"epoch": 0.23272351153920745,
"grad_norm": 1.482164740562439,
"learning_rate": 4.989669327235285e-06,
"loss": 1.0493934631347657,
"memory(GiB)": 76.04,
"step": 900,
"token_acc": 0.7489839605373375,
"train_speed(iter/s)": 0.027946
},
{
"epoch": 0.23401641993664749,
"grad_norm": 1.259787678718567,
"learning_rate": 4.989343175795291e-06,
"loss": 1.0503623962402344,
"memory(GiB)": 76.04,
"step": 905,
"token_acc": 0.7800720499077409,
"train_speed(iter/s)": 0.027945
},
{
"epoch": 0.23530932833408752,
"grad_norm": 1.2325483560562134,
"learning_rate": 4.9890119665598265e-06,
"loss": 1.0217859268188476,
"memory(GiB)": 76.04,
"step": 910,
"token_acc": 0.7489180775947156,
"train_speed(iter/s)": 0.027956
},
{
"epoch": 0.23660223673152758,
"grad_norm": 1.2175496816635132,
"learning_rate": 4.988675700201836e-06,
"loss": 1.0210546493530273,
"memory(GiB)": 76.04,
"step": 915,
"token_acc": 0.7292449540928989,
"train_speed(iter/s)": 0.02796
},
{
"epoch": 0.23789514512896762,
"grad_norm": 1.282808542251587,
"learning_rate": 4.988334377404537e-06,
"loss": 1.0249996185302734,
"memory(GiB)": 76.04,
"step": 920,
"token_acc": 0.7644511799090712,
"train_speed(iter/s)": 0.027959
},
{
"epoch": 0.23918805352640765,
"grad_norm": 1.388667106628418,
"learning_rate": 4.9879879988614226e-06,
"loss": 1.0185768127441406,
"memory(GiB)": 76.04,
"step": 925,
"token_acc": 0.7619047619047619,
"train_speed(iter/s)": 0.027952
},
{
"epoch": 0.24048096192384769,
"grad_norm": 1.1671382188796997,
"learning_rate": 4.987636565276258e-06,
"loss": 0.9928812026977539,
"memory(GiB)": 76.04,
"step": 930,
"token_acc": 0.7683724235963042,
"train_speed(iter/s)": 0.027942
},
{
"epoch": 0.24177387032128775,
"grad_norm": 1.122075080871582,
"learning_rate": 4.987280077363077e-06,
"loss": 0.993900203704834,
"memory(GiB)": 76.04,
"step": 935,
"token_acc": 0.7723164046901659,
"train_speed(iter/s)": 0.027941
},
{
"epoch": 0.24306677871872778,
"grad_norm": 1.2567425966262817,
"learning_rate": 4.986918535846187e-06,
"loss": 1.0273015975952149,
"memory(GiB)": 76.04,
"step": 940,
"token_acc": 0.7539662184442898,
"train_speed(iter/s)": 0.027938
},
{
"epoch": 0.24435968711616782,
"grad_norm": 1.1676665544509888,
"learning_rate": 4.986551941460158e-06,
"loss": 0.9982240676879883,
"memory(GiB)": 76.04,
"step": 945,
"token_acc": 0.7614623913694936,
"train_speed(iter/s)": 0.027934
},
{
"epoch": 0.24565259551360785,
"grad_norm": 1.2946789264678955,
"learning_rate": 4.98618029494983e-06,
"loss": 1.0252174377441405,
"memory(GiB)": 76.04,
"step": 950,
"token_acc": 0.7381703470031545,
"train_speed(iter/s)": 0.027937
},
{
"epoch": 0.24694550391104791,
"grad_norm": 1.6065422296524048,
"learning_rate": 4.985803597070306e-06,
"loss": 1.0155667304992675,
"memory(GiB)": 76.04,
"step": 955,
"token_acc": 0.736665709087427,
"train_speed(iter/s)": 0.027941
},
{
"epoch": 0.24823841230848795,
"grad_norm": 1.2003506422042847,
"learning_rate": 4.985421848586954e-06,
"loss": 1.020925521850586,
"memory(GiB)": 76.04,
"step": 960,
"token_acc": 0.7488078360613482,
"train_speed(iter/s)": 0.027942
},
{
"epoch": 0.24953132070592798,
"grad_norm": 1.5374075174331665,
"learning_rate": 4.985035050275402e-06,
"loss": 1.013150405883789,
"memory(GiB)": 76.04,
"step": 965,
"token_acc": 0.7596559355296673,
"train_speed(iter/s)": 0.027937
},
{
"epoch": 0.25082422910336805,
"grad_norm": 1.2153679132461548,
"learning_rate": 4.984643202921538e-06,
"loss": 1.0238693237304688,
"memory(GiB)": 76.04,
"step": 970,
"token_acc": 0.7379732289822666,
"train_speed(iter/s)": 0.027935
},
{
"epoch": 0.25211713750080805,
"grad_norm": 1.153549075126648,
"learning_rate": 4.984246307321511e-06,
"loss": 0.9940820693969726,
"memory(GiB)": 76.04,
"step": 975,
"token_acc": 0.7361269479285443,
"train_speed(iter/s)": 0.02794
},
{
"epoch": 0.2534100458982481,
"grad_norm": 1.8333048820495605,
"learning_rate": 4.983844364281723e-06,
"loss": 0.9937544822692871,
"memory(GiB)": 76.04,
"step": 980,
"token_acc": 0.739548529052237,
"train_speed(iter/s)": 0.027942
},
{
"epoch": 0.2547029542956882,
"grad_norm": 2.401205062866211,
"learning_rate": 4.983437374618835e-06,
"loss": 1.039668083190918,
"memory(GiB)": 76.04,
"step": 985,
"token_acc": 0.7770313636558127,
"train_speed(iter/s)": 0.027946
},
{
"epoch": 0.2559958626931282,
"grad_norm": 4.683147430419922,
"learning_rate": 4.983025339159759e-06,
"loss": 1.0015725135803222,
"memory(GiB)": 76.04,
"step": 990,
"token_acc": 0.7626828398144845,
"train_speed(iter/s)": 0.027952
},
{
"epoch": 0.25728877109056825,
"grad_norm": 1.1371662616729736,
"learning_rate": 4.982608258741662e-06,
"loss": 1.0269445419311523,
"memory(GiB)": 76.04,
"step": 995,
"token_acc": 0.7389006342494715,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.25858167948800825,
"grad_norm": 1.1645772457122803,
"learning_rate": 4.982186134211957e-06,
"loss": 1.0103599548339843,
"memory(GiB)": 76.04,
"step": 1000,
"token_acc": 0.7301087055814378,
"train_speed(iter/s)": 0.027949
},
{
"epoch": 0.2598745878854483,
"grad_norm": 1.1663211584091187,
"learning_rate": 4.98175896642831e-06,
"loss": 0.9422775268554687,
"memory(GiB)": 76.04,
"step": 1005,
"token_acc": 0.7944890929965557,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.2611674962828884,
"grad_norm": 1.0399482250213623,
"learning_rate": 4.981326756258629e-06,
"loss": 0.9761096000671386,
"memory(GiB)": 76.04,
"step": 1010,
"token_acc": 0.7143371038011112,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.2624604046803284,
"grad_norm": 1.1718828678131104,
"learning_rate": 4.9808895045810715e-06,
"loss": 0.9784045219421387,
"memory(GiB)": 76.04,
"step": 1015,
"token_acc": 0.7504920027194332,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.26375331307776845,
"grad_norm": 1.5339823961257935,
"learning_rate": 4.980447212284035e-06,
"loss": 0.9676334381103515,
"memory(GiB)": 76.04,
"step": 1020,
"token_acc": 0.759060549655097,
"train_speed(iter/s)": 0.027814
},
{
"epoch": 0.26504622147520845,
"grad_norm": 1.1942572593688965,
"learning_rate": 4.979999880266162e-06,
"loss": 0.9654909133911133,
"memory(GiB)": 76.04,
"step": 1025,
"token_acc": 0.7506817855604995,
"train_speed(iter/s)": 0.027813
},
{
"epoch": 0.2663391298726485,
"grad_norm": 1.2203813791275024,
"learning_rate": 4.979547509436329e-06,
"loss": 1.0167608261108398,
"memory(GiB)": 76.04,
"step": 1030,
"token_acc": 0.7280177187153931,
"train_speed(iter/s)": 0.027814
},
{
"epoch": 0.2676320382700886,
"grad_norm": 1.1107381582260132,
"learning_rate": 4.979090100713657e-06,
"loss": 0.944733715057373,
"memory(GiB)": 76.04,
"step": 1035,
"token_acc": 0.8061226833245297,
"train_speed(iter/s)": 0.027814
},
{
"epoch": 0.2689249466675286,
"grad_norm": 1.2653183937072754,
"learning_rate": 4.978627655027497e-06,
"loss": 0.9633764266967774,
"memory(GiB)": 76.04,
"step": 1040,
"token_acc": 0.7571960586546668,
"train_speed(iter/s)": 0.027816
},
{
"epoch": 0.27021785506496865,
"grad_norm": 1.3168116807937622,
"learning_rate": 4.978160173317439e-06,
"loss": 1.0249298095703125,
"memory(GiB)": 76.04,
"step": 1045,
"token_acc": 0.7487644982349975,
"train_speed(iter/s)": 0.027818
},
{
"epoch": 0.2715107634624087,
"grad_norm": 1.2121893167495728,
"learning_rate": 4.9776876565332995e-06,
"loss": 0.9906470298767089,
"memory(GiB)": 76.04,
"step": 1050,
"token_acc": 0.7474919305591904,
"train_speed(iter/s)": 0.027814
},
{
"epoch": 0.2728036718598487,
"grad_norm": 1.1107871532440186,
"learning_rate": 4.97721010563513e-06,
"loss": 0.9752838134765625,
"memory(GiB)": 76.04,
"step": 1055,
"token_acc": 0.7437383839277812,
"train_speed(iter/s)": 0.027814
},
{
"epoch": 0.2740965802572888,
"grad_norm": 1.0446956157684326,
"learning_rate": 4.976727521593209e-06,
"loss": 0.9726054191589355,
"memory(GiB)": 76.04,
"step": 1060,
"token_acc": 0.7167705088265836,
"train_speed(iter/s)": 0.027817
},
{
"epoch": 0.2753894886547288,
"grad_norm": 1.2050210237503052,
"learning_rate": 4.9762399053880395e-06,
"loss": 1.0308534622192382,
"memory(GiB)": 76.04,
"step": 1065,
"token_acc": 0.7568904593639576,
"train_speed(iter/s)": 0.027811
},
{
"epoch": 0.27668239705216885,
"grad_norm": 1.5667829513549805,
"learning_rate": 4.97574725801035e-06,
"loss": 0.9559214591979981,
"memory(GiB)": 76.04,
"step": 1070,
"token_acc": 0.7598421892050701,
"train_speed(iter/s)": 0.027811
},
{
"epoch": 0.2779753054496089,
"grad_norm": 1.1865519285202026,
"learning_rate": 4.975249580461092e-06,
"loss": 1.010297679901123,
"memory(GiB)": 76.04,
"step": 1075,
"token_acc": 0.744272567064813,
"train_speed(iter/s)": 0.02781
},
{
"epoch": 0.2792682138470489,
"grad_norm": 1.058516263961792,
"learning_rate": 4.974746873751435e-06,
"loss": 1.015725040435791,
"memory(GiB)": 76.04,
"step": 1080,
"token_acc": 0.7451629446594247,
"train_speed(iter/s)": 0.027809
},
{
"epoch": 0.280561122244489,
"grad_norm": 1.0704551935195923,
"learning_rate": 4.9742391389027695e-06,
"loss": 0.9716552734375,
"memory(GiB)": 76.04,
"step": 1085,
"token_acc": 0.7283868278449354,
"train_speed(iter/s)": 0.027812
},
{
"epoch": 0.28185403064192904,
"grad_norm": 1.0352953672409058,
"learning_rate": 4.973726376946699e-06,
"loss": 0.9752028465270997,
"memory(GiB)": 76.04,
"step": 1090,
"token_acc": 0.7493878800244848,
"train_speed(iter/s)": 0.027812
},
{
"epoch": 0.28314693903936905,
"grad_norm": 1.1525194644927979,
"learning_rate": 4.973208588925045e-06,
"loss": 0.9712867736816406,
"memory(GiB)": 76.04,
"step": 1095,
"token_acc": 0.7517407605784682,
"train_speed(iter/s)": 0.027815
},
{
"epoch": 0.2844398474368091,
"grad_norm": 1.0644663572311401,
"learning_rate": 4.972685775889836e-06,
"loss": 0.9582048416137695,
"memory(GiB)": 76.04,
"step": 1100,
"token_acc": 0.7543896103896104,
"train_speed(iter/s)": 0.027814
},
{
"epoch": 0.2857327558342491,
"grad_norm": 1.143939733505249,
"learning_rate": 4.9721579389033125e-06,
"loss": 0.9977324485778809,
"memory(GiB)": 76.04,
"step": 1105,
"token_acc": 0.7675656607767053,
"train_speed(iter/s)": 0.027815
},
{
"epoch": 0.2870256642316892,
"grad_norm": 1.3790191411972046,
"learning_rate": 4.971625079037925e-06,
"loss": 0.9518113136291504,
"memory(GiB)": 76.04,
"step": 1110,
"token_acc": 0.7680853988179778,
"train_speed(iter/s)": 0.027815
},
{
"epoch": 0.28831857262912924,
"grad_norm": 1.1732274293899536,
"learning_rate": 4.971087197376325e-06,
"loss": 0.9738147735595704,
"memory(GiB)": 76.04,
"step": 1115,
"token_acc": 0.7463278436450451,
"train_speed(iter/s)": 0.027817
},
{
"epoch": 0.28961148102656925,
"grad_norm": 1.0615239143371582,
"learning_rate": 4.970544295011369e-06,
"loss": 0.9704501152038574,
"memory(GiB)": 76.04,
"step": 1120,
"token_acc": 0.72969752000454,
"train_speed(iter/s)": 0.027813
},
{
"epoch": 0.2909043894240093,
"grad_norm": 3877.243896484375,
"learning_rate": 4.969996373046117e-06,
"loss": 1.222921085357666,
"memory(GiB)": 76.04,
"step": 1125,
"token_acc": 0.7576140282702677,
"train_speed(iter/s)": 0.027814
},
{
"epoch": 0.2921972978214494,
"grad_norm": 358.6942138671875,
"learning_rate": 4.969443432593823e-06,
"loss": 4.470377349853516,
"memory(GiB)": 76.04,
"step": 1130,
"token_acc": 0.40514805901433504,
"train_speed(iter/s)": 0.027816
},
{
"epoch": 0.2934902062188894,
"grad_norm": 646.41748046875,
"learning_rate": 4.968885474777941e-06,
"loss": 4.353573226928711,
"memory(GiB)": 76.04,
"step": 1135,
"token_acc": 0.404987128989238,
"train_speed(iter/s)": 0.027819
},
{
"epoch": 0.29478311461632944,
"grad_norm": 133.66444396972656,
"learning_rate": 4.968322500732118e-06,
"loss": 3.6896575927734374,
"memory(GiB)": 76.04,
"step": 1140,
"token_acc": 0.4133568570076165,
"train_speed(iter/s)": 0.027822
},
{
"epoch": 0.29607602301376945,
"grad_norm": 371.82977294921875,
"learning_rate": 4.967754511600192e-06,
"loss": 3.5502925872802735,
"memory(GiB)": 76.04,
"step": 1145,
"token_acc": 0.5075539022168236,
"train_speed(iter/s)": 0.027827
},
{
"epoch": 0.2973689314112095,
"grad_norm": 716.2843627929688,
"learning_rate": 4.967181508536193e-06,
"loss": 2.8075958251953126,
"memory(GiB)": 76.04,
"step": 1150,
"token_acc": 0.47253147435230464,
"train_speed(iter/s)": 0.027825
},
{
"epoch": 0.2986618398086496,
"grad_norm": 235.48390197753906,
"learning_rate": 4.9666034927043346e-06,
"loss": 2.0499923706054686,
"memory(GiB)": 76.04,
"step": 1155,
"token_acc": 0.6685902289996606,
"train_speed(iter/s)": 0.027823
},
{
"epoch": 0.2999547482060896,
"grad_norm": 4.845156192779541,
"learning_rate": 4.96602046527902e-06,
"loss": 1.2279346466064454,
"memory(GiB)": 76.04,
"step": 1160,
"token_acc": 0.7027854164818593,
"train_speed(iter/s)": 0.027823
},
{
"epoch": 0.30124765660352965,
"grad_norm": 1.512497901916504,
"learning_rate": 4.96543242744483e-06,
"loss": 1.0421756744384765,
"memory(GiB)": 76.04,
"step": 1165,
"token_acc": 0.7120159370292989,
"train_speed(iter/s)": 0.027825
},
{
"epoch": 0.3025405650009697,
"grad_norm": 1.465827465057373,
"learning_rate": 4.964839380396529e-06,
"loss": 0.953398609161377,
"memory(GiB)": 76.04,
"step": 1170,
"token_acc": 0.7572973559518211,
"train_speed(iter/s)": 0.027828
},
{
"epoch": 0.3038334733984097,
"grad_norm": 1.2102559804916382,
"learning_rate": 4.964241325339056e-06,
"loss": 0.9315123558044434,
"memory(GiB)": 76.04,
"step": 1175,
"token_acc": 0.7300858454781394,
"train_speed(iter/s)": 0.027824
},
{
"epoch": 0.3051263817958498,
"grad_norm": 2.6368777751922607,
"learning_rate": 4.963638263487528e-06,
"loss": 0.9297597885131836,
"memory(GiB)": 76.04,
"step": 1180,
"token_acc": 0.7474219317356572,
"train_speed(iter/s)": 0.027825
},
{
"epoch": 0.3064192901932898,
"grad_norm": 353.7561950683594,
"learning_rate": 4.963030196067233e-06,
"loss": 0.942746639251709,
"memory(GiB)": 76.04,
"step": 1185,
"token_acc": 0.7225467822911913,
"train_speed(iter/s)": 0.027827
},
{
"epoch": 0.30771219859072985,
"grad_norm": 1.3002510070800781,
"learning_rate": 4.96241712431363e-06,
"loss": 0.9472110748291016,
"memory(GiB)": 76.04,
"step": 1190,
"token_acc": 0.7242681047765793,
"train_speed(iter/s)": 0.027824
},
{
"epoch": 0.3090051069881699,
"grad_norm": 1.1316903829574585,
"learning_rate": 4.9617990494723444e-06,
"loss": 0.9454745292663574,
"memory(GiB)": 76.04,
"step": 1195,
"token_acc": 0.7497220256747195,
"train_speed(iter/s)": 0.027828
},
{
"epoch": 0.3102980153856099,
"grad_norm": 1.108446478843689,
"learning_rate": 4.961175972799169e-06,
"loss": 0.9554048538208008,
"memory(GiB)": 76.04,
"step": 1200,
"token_acc": 0.7410781445883828,
"train_speed(iter/s)": 0.02783
},
{
"epoch": 0.31159092378305,
"grad_norm": 1.25348961353302,
"learning_rate": 4.960547895560058e-06,
"loss": 0.9723408699035645,
"memory(GiB)": 76.04,
"step": 1205,
"token_acc": 0.7924053665548635,
"train_speed(iter/s)": 0.027837
},
{
"epoch": 0.31288383218049,
"grad_norm": 2.172182559967041,
"learning_rate": 4.959914819031125e-06,
"loss": 0.9340225219726562,
"memory(GiB)": 76.04,
"step": 1210,
"token_acc": 0.7698709945900957,
"train_speed(iter/s)": 0.027838
},
{
"epoch": 0.31417674057793005,
"grad_norm": 9.525141716003418,
"learning_rate": 4.959276744498642e-06,
"loss": 0.9623056411743164,
"memory(GiB)": 76.04,
"step": 1215,
"token_acc": 0.7322046531438002,
"train_speed(iter/s)": 0.027841
},
{
"epoch": 0.3154696489753701,
"grad_norm": 1.1645443439483643,
"learning_rate": 4.9586336732590344e-06,
"loss": 0.9339606285095214,
"memory(GiB)": 76.04,
"step": 1220,
"token_acc": 0.7669276434655424,
"train_speed(iter/s)": 0.02784
},
{
"epoch": 0.3167625573728101,
"grad_norm": 1.1521093845367432,
"learning_rate": 4.957985606618882e-06,
"loss": 0.9695714950561524,
"memory(GiB)": 76.04,
"step": 1225,
"token_acc": 0.7714450456843785,
"train_speed(iter/s)": 0.027843
},
{
"epoch": 0.3180554657702502,
"grad_norm": 1.1521477699279785,
"learning_rate": 4.957332545894914e-06,
"loss": 0.9398648262023925,
"memory(GiB)": 76.04,
"step": 1230,
"token_acc": 0.8035982876316093,
"train_speed(iter/s)": 0.027843
},
{
"epoch": 0.31934837416769024,
"grad_norm": 1.1556190252304077,
"learning_rate": 4.956674492414003e-06,
"loss": 0.9573787689208985,
"memory(GiB)": 76.04,
"step": 1235,
"token_acc": 0.7518860016764459,
"train_speed(iter/s)": 0.027841
},
{
"epoch": 0.32064128256513025,
"grad_norm": 1.3817846775054932,
"learning_rate": 4.95601144751317e-06,
"loss": 0.9417957305908203,
"memory(GiB)": 76.04,
"step": 1240,
"token_acc": 0.7512520868113522,
"train_speed(iter/s)": 0.027839
},
{
"epoch": 0.3219341909625703,
"grad_norm": 1.157069206237793,
"learning_rate": 4.955343412539576e-06,
"loss": 0.9470592498779297,
"memory(GiB)": 76.04,
"step": 1245,
"token_acc": 0.7514258079578428,
"train_speed(iter/s)": 0.027833
},
{
"epoch": 0.3232270993600103,
"grad_norm": 1.0551875829696655,
"learning_rate": 4.954670388850521e-06,
"loss": 0.9208686828613282,
"memory(GiB)": 76.04,
"step": 1250,
"token_acc": 0.7600368787193027,
"train_speed(iter/s)": 0.027834
},
{
"epoch": 0.3245200077574504,
"grad_norm": 1.0360503196716309,
"learning_rate": 4.953992377813438e-06,
"loss": 0.9444967269897461,
"memory(GiB)": 76.04,
"step": 1255,
"token_acc": 0.8037869164814226,
"train_speed(iter/s)": 0.027836
},
{
"epoch": 0.32581291615489044,
"grad_norm": 0.9959269762039185,
"learning_rate": 4.953309380805897e-06,
"loss": 0.9297657012939453,
"memory(GiB)": 76.04,
"step": 1260,
"token_acc": 0.7694388999778221,
"train_speed(iter/s)": 0.027837
},
{
"epoch": 0.32710582455233045,
"grad_norm": 1.429611086845398,
"learning_rate": 4.952621399215598e-06,
"loss": 0.9631167411804199,
"memory(GiB)": 76.04,
"step": 1265,
"token_acc": 0.7283613171938045,
"train_speed(iter/s)": 0.027842
},
{
"epoch": 0.3283987329497705,
"grad_norm": 1.3571066856384277,
"learning_rate": 4.951928434440367e-06,
"loss": 0.9502096176147461,
"memory(GiB)": 76.04,
"step": 1270,
"token_acc": 0.7302693616497888,
"train_speed(iter/s)": 0.027844
},
{
"epoch": 0.3296916413472106,
"grad_norm": 3.4126250743865967,
"learning_rate": 4.951230487888154e-06,
"loss": 0.9481155395507812,
"memory(GiB)": 76.04,
"step": 1275,
"token_acc": 0.7534746180384426,
"train_speed(iter/s)": 0.027842
},
{
"epoch": 0.3309845497446506,
"grad_norm": 1.0404834747314453,
"learning_rate": 4.950527560977035e-06,
"loss": 0.945584487915039,
"memory(GiB)": 76.04,
"step": 1280,
"token_acc": 0.7738585496866607,
"train_speed(iter/s)": 0.027844
},
{
"epoch": 0.33227745814209064,
"grad_norm": 1.1452839374542236,
"learning_rate": 4.9498196551352e-06,
"loss": 0.9602731704711914,
"memory(GiB)": 76.04,
"step": 1285,
"token_acc": 0.722290316932236,
"train_speed(iter/s)": 0.027851
},
{
"epoch": 0.33357036653953065,
"grad_norm": 1.2483798265457153,
"learning_rate": 4.949106771800958e-06,
"loss": 0.9069469451904297,
"memory(GiB)": 76.04,
"step": 1290,
"token_acc": 0.7798567304608147,
"train_speed(iter/s)": 0.027851
},
{
"epoch": 0.3348632749369707,
"grad_norm": 1.1929678916931152,
"learning_rate": 4.94838891242273e-06,
"loss": 0.924495792388916,
"memory(GiB)": 76.04,
"step": 1295,
"token_acc": 0.7753519103705832,
"train_speed(iter/s)": 0.027853
},
{
"epoch": 0.3361561833344108,
"grad_norm": 1.1313072443008423,
"learning_rate": 4.947666078459049e-06,
"loss": 0.9245437622070313,
"memory(GiB)": 76.04,
"step": 1300,
"token_acc": 0.7716767637913902,
"train_speed(iter/s)": 0.027845
},
{
"epoch": 0.3374490917318508,
"grad_norm": 1.1400386095046997,
"learning_rate": 4.946938271378552e-06,
"loss": 0.9137565612792968,
"memory(GiB)": 76.04,
"step": 1305,
"token_acc": 0.7587449115602147,
"train_speed(iter/s)": 0.027844
},
{
"epoch": 0.33874200012929084,
"grad_norm": 1.0596169233322144,
"learning_rate": 4.946205492659984e-06,
"loss": 0.8961214065551758,
"memory(GiB)": 76.04,
"step": 1310,
"token_acc": 0.7542817732480172,
"train_speed(iter/s)": 0.027844
},
{
"epoch": 0.3400349085267309,
"grad_norm": 0.9843769073486328,
"learning_rate": 4.945467743792188e-06,
"loss": 0.9182037353515625,
"memory(GiB)": 76.04,
"step": 1315,
"token_acc": 0.766585993622988,
"train_speed(iter/s)": 0.027846
},
{
"epoch": 0.3413278169241709,
"grad_norm": 1.1926190853118896,
"learning_rate": 4.9447250262741085e-06,
"loss": 0.9283374786376953,
"memory(GiB)": 76.04,
"step": 1320,
"token_acc": 0.7409656847859095,
"train_speed(iter/s)": 0.027843
},
{
"epoch": 0.342620725321611,
"grad_norm": 1.0026227235794067,
"learning_rate": 4.943977341614782e-06,
"loss": 0.9378311157226562,
"memory(GiB)": 76.04,
"step": 1325,
"token_acc": 0.7585428321089169,
"train_speed(iter/s)": 0.027841
},
{
"epoch": 0.343913633719051,
"grad_norm": 1.1508121490478516,
"learning_rate": 4.943224691333339e-06,
"loss": 0.9445396423339844,
"memory(GiB)": 76.04,
"step": 1330,
"token_acc": 0.7383880704599721,
"train_speed(iter/s)": 0.027834
},
{
"epoch": 0.34520654211649104,
"grad_norm": 1.5133693218231201,
"learning_rate": 4.942467076958999e-06,
"loss": 0.8884575843811036,
"memory(GiB)": 76.04,
"step": 1335,
"token_acc": 0.7681140292991949,
"train_speed(iter/s)": 0.027831
},
{
"epoch": 0.3464994505139311,
"grad_norm": 1.0745313167572021,
"learning_rate": 4.941704500031066e-06,
"loss": 0.8931808471679688,
"memory(GiB)": 76.04,
"step": 1340,
"token_acc": 0.7731155696658266,
"train_speed(iter/s)": 0.027831
},
{
"epoch": 0.3477923589113711,
"grad_norm": 1.1880706548690796,
"learning_rate": 4.940936962098929e-06,
"loss": 0.9454404830932617,
"memory(GiB)": 76.04,
"step": 1345,
"token_acc": 0.7694478894923662,
"train_speed(iter/s)": 0.027836
},
{
"epoch": 0.3490852673088112,
"grad_norm": 1.056232213973999,
"learning_rate": 4.9401644647220545e-06,
"loss": 0.8956671714782715,
"memory(GiB)": 76.04,
"step": 1350,
"token_acc": 0.7636090870124304,
"train_speed(iter/s)": 0.027838
},
{
"epoch": 0.35037817570625124,
"grad_norm": 0.9796701073646545,
"learning_rate": 4.939387009469988e-06,
"loss": 0.9031806945800781,
"memory(GiB)": 76.04,
"step": 1355,
"token_acc": 0.7830841262649146,
"train_speed(iter/s)": 0.027838
},
{
"epoch": 0.35167108410369124,
"grad_norm": 1.0584688186645508,
"learning_rate": 4.938604597922346e-06,
"loss": 0.9216032981872558,
"memory(GiB)": 76.04,
"step": 1360,
"token_acc": 0.7680820851083322,
"train_speed(iter/s)": 0.027842
},
{
"epoch": 0.3529639925011313,
"grad_norm": 1.00162672996521,
"learning_rate": 4.937817231668815e-06,
"loss": 0.8896630287170411,
"memory(GiB)": 76.04,
"step": 1365,
"token_acc": 0.7780082987551867,
"train_speed(iter/s)": 0.027839
},
{
"epoch": 0.3542569008985713,
"grad_norm": 2.0998218059539795,
"learning_rate": 4.937024912309152e-06,
"loss": 0.9393485069274903,
"memory(GiB)": 76.04,
"step": 1370,
"token_acc": 0.7547226992625518,
"train_speed(iter/s)": 0.027835
},
{
"epoch": 0.3555498092960114,
"grad_norm": 1.2376868724822998,
"learning_rate": 4.936227641453172e-06,
"loss": 0.9312064170837402,
"memory(GiB)": 76.04,
"step": 1375,
"token_acc": 0.7210518525827618,
"train_speed(iter/s)": 0.027834
},
{
"epoch": 0.35684271769345144,
"grad_norm": 1.0578678846359253,
"learning_rate": 4.935425420720754e-06,
"loss": 0.9209253311157226,
"memory(GiB)": 76.04,
"step": 1380,
"token_acc": 0.7725167678058128,
"train_speed(iter/s)": 0.027834
},
{
"epoch": 0.35813562609089145,
"grad_norm": 1.360901951789856,
"learning_rate": 4.934618251741835e-06,
"loss": 0.9340425491333008,
"memory(GiB)": 76.04,
"step": 1385,
"token_acc": 0.779114302812687,
"train_speed(iter/s)": 0.027829
},
{
"epoch": 0.3594285344883315,
"grad_norm": 1.5933281183242798,
"learning_rate": 4.933806136156402e-06,
"loss": 0.8858348846435546,
"memory(GiB)": 76.04,
"step": 1390,
"token_acc": 0.7908785127852725,
"train_speed(iter/s)": 0.027822
},
{
"epoch": 0.36072144288577157,
"grad_norm": 0.9765493273735046,
"learning_rate": 4.932989075614496e-06,
"loss": 0.9056285858154297,
"memory(GiB)": 76.04,
"step": 1395,
"token_acc": 0.7752316896727017,
"train_speed(iter/s)": 0.027825
},
{
"epoch": 0.3620143512832116,
"grad_norm": 1.0190749168395996,
"learning_rate": 4.932167071776203e-06,
"loss": 0.8850472450256348,
"memory(GiB)": 76.04,
"step": 1400,
"token_acc": 0.7488913755232293,
"train_speed(iter/s)": 0.027823
},
{
"epoch": 0.36330725968065164,
"grad_norm": 0.9817783832550049,
"learning_rate": 4.931340126311652e-06,
"loss": 0.8637564659118653,
"memory(GiB)": 76.04,
"step": 1405,
"token_acc": 0.8092789765596534,
"train_speed(iter/s)": 0.02782
},
{
"epoch": 0.36460016807809165,
"grad_norm": 1.116895318031311,
"learning_rate": 4.930508240901015e-06,
"loss": 0.9004463195800781,
"memory(GiB)": 76.04,
"step": 1410,
"token_acc": 0.7735781849843669,
"train_speed(iter/s)": 0.027815
},
{
"epoch": 0.3658930764755317,
"grad_norm": 1.0126279592514038,
"learning_rate": 4.9296714172345e-06,
"loss": 0.9231013298034668,
"memory(GiB)": 76.04,
"step": 1415,
"token_acc": 0.7658435279228997,
"train_speed(iter/s)": 0.027811
},
{
"epoch": 0.36718598487297177,
"grad_norm": 0.9945583939552307,
"learning_rate": 4.928829657012346e-06,
"loss": 0.8893575668334961,
"memory(GiB)": 76.04,
"step": 1420,
"token_acc": 0.7532214137636681,
"train_speed(iter/s)": 0.027807
},
{
"epoch": 0.3684788932704118,
"grad_norm": 1.384425163269043,
"learning_rate": 4.927982961944825e-06,
"loss": 0.9314968109130859,
"memory(GiB)": 76.04,
"step": 1425,
"token_acc": 0.7419209649833468,
"train_speed(iter/s)": 0.027811
},
{
"epoch": 0.36977180166785184,
"grad_norm": 0.8520684242248535,
"learning_rate": 4.9271313337522346e-06,
"loss": 0.9494674682617188,
"memory(GiB)": 76.04,
"step": 1430,
"token_acc": 0.782346893817007,
"train_speed(iter/s)": 0.027809
},
{
"epoch": 0.37106471006529185,
"grad_norm": 0.9675486087799072,
"learning_rate": 4.926274774164893e-06,
"loss": 0.9049705505371094,
"memory(GiB)": 76.04,
"step": 1435,
"token_acc": 0.7790923317683881,
"train_speed(iter/s)": 0.02781
},
{
"epoch": 0.3723576184627319,
"grad_norm": 0.9582749009132385,
"learning_rate": 4.925413284923143e-06,
"loss": 0.8903584480285645,
"memory(GiB)": 76.04,
"step": 1440,
"token_acc": 0.7866145377848655,
"train_speed(iter/s)": 0.027812
},
{
"epoch": 0.37365052686017197,
"grad_norm": 1.1724884510040283,
"learning_rate": 4.924546867777339e-06,
"loss": 0.9676746368408203,
"memory(GiB)": 76.04,
"step": 1445,
"token_acc": 0.7439509954058193,
"train_speed(iter/s)": 0.027815
},
{
"epoch": 0.374943435257612,
"grad_norm": 1.0407586097717285,
"learning_rate": 4.92367552448785e-06,
"loss": 0.9126652717590332,
"memory(GiB)": 76.04,
"step": 1450,
"token_acc": 0.781545586561482,
"train_speed(iter/s)": 0.027815
},
{
"epoch": 0.37623634365505204,
"grad_norm": 1.0518836975097656,
"learning_rate": 4.922799256825052e-06,
"loss": 0.8564701080322266,
"memory(GiB)": 76.04,
"step": 1455,
"token_acc": 0.7408266266539354,
"train_speed(iter/s)": 0.027818
},
{
"epoch": 0.3775292520524921,
"grad_norm": 0.9088625907897949,
"learning_rate": 4.921918066569328e-06,
"loss": 0.8742757797241211,
"memory(GiB)": 76.04,
"step": 1460,
"token_acc": 0.7589349964020149,
"train_speed(iter/s)": 0.027815
},
{
"epoch": 0.3788221604499321,
"grad_norm": 1.0474976301193237,
"learning_rate": 4.921031955511061e-06,
"loss": 0.8954677581787109,
"memory(GiB)": 76.04,
"step": 1465,
"token_acc": 0.7686122547832405,
"train_speed(iter/s)": 0.027811
},
{
"epoch": 0.3801150688473722,
"grad_norm": 1.012302279472351,
"learning_rate": 4.920140925450634e-06,
"loss": 0.9504472732543945,
"memory(GiB)": 76.04,
"step": 1470,
"token_acc": 0.7751241428233625,
"train_speed(iter/s)": 0.027809
},
{
"epoch": 0.3814079772448122,
"grad_norm": 0.9360347390174866,
"learning_rate": 4.919244978198424e-06,
"loss": 0.8807231903076171,
"memory(GiB)": 76.04,
"step": 1475,
"token_acc": 0.7640212437379938,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.38270088564225224,
"grad_norm": 1.006102442741394,
"learning_rate": 4.918344115574797e-06,
"loss": 0.9025184631347656,
"memory(GiB)": 76.04,
"step": 1480,
"token_acc": 0.7411331796417805,
"train_speed(iter/s)": 0.027809
},
{
"epoch": 0.3839937940396923,
"grad_norm": 0.8951787948608398,
"learning_rate": 4.917438339410105e-06,
"loss": 0.8877702713012695,
"memory(GiB)": 76.04,
"step": 1485,
"token_acc": 0.7563314788673918,
"train_speed(iter/s)": 0.027806
},
{
"epoch": 0.3852867024371323,
"grad_norm": 1.0641748905181885,
"learning_rate": 4.916527651544689e-06,
"loss": 0.8795459747314454,
"memory(GiB)": 76.04,
"step": 1490,
"token_acc": 0.8010365029292474,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.3865796108345724,
"grad_norm": 1.0358333587646484,
"learning_rate": 4.915612053828862e-06,
"loss": 0.8692068099975586,
"memory(GiB)": 76.04,
"step": 1495,
"token_acc": 0.8206166847085937,
"train_speed(iter/s)": 0.027807
},
{
"epoch": 0.38787251923201244,
"grad_norm": 1.0726227760314941,
"learning_rate": 4.914691548122919e-06,
"loss": 0.8898172378540039,
"memory(GiB)": 76.04,
"step": 1500,
"token_acc": 0.7631806836126535,
"train_speed(iter/s)": 0.027806
},
{
"epoch": 0.38916542762945244,
"grad_norm": 1.0023351907730103,
"learning_rate": 4.9137661362971225e-06,
"loss": 0.9159588813781738,
"memory(GiB)": 76.04,
"step": 1505,
"token_acc": 0.7451686323194703,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.3904583360268925,
"grad_norm": 1.1598299741744995,
"learning_rate": 4.912835820231705e-06,
"loss": 0.8731332778930664,
"memory(GiB)": 76.04,
"step": 1510,
"token_acc": 0.7731112837444164,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.3917512444243325,
"grad_norm": 1.2678931951522827,
"learning_rate": 4.9119006018168645e-06,
"loss": 0.8393604278564453,
"memory(GiB)": 76.04,
"step": 1515,
"token_acc": 0.7939447383891828,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.3930441528217726,
"grad_norm": 1.062391996383667,
"learning_rate": 4.910960482952757e-06,
"loss": 0.9229723930358886,
"memory(GiB)": 76.04,
"step": 1520,
"token_acc": 0.7733812949640287,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.39433706121921264,
"grad_norm": 0.9789305925369263,
"learning_rate": 4.910015465549497e-06,
"loss": 0.9235004425048828,
"memory(GiB)": 76.04,
"step": 1525,
"token_acc": 0.7600789189591631,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.39562996961665264,
"grad_norm": 1.0256842374801636,
"learning_rate": 4.909065551527151e-06,
"loss": 0.8544706344604492,
"memory(GiB)": 76.04,
"step": 1530,
"token_acc": 0.7887576797255246,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.3969228780140927,
"grad_norm": 1.0858112573623657,
"learning_rate": 4.908110742815735e-06,
"loss": 0.8899390220642089,
"memory(GiB)": 76.04,
"step": 1535,
"token_acc": 0.7800430187973441,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.39821578641153277,
"grad_norm": 0.9626816511154175,
"learning_rate": 4.907151041355208e-06,
"loss": 0.8749662399291992,
"memory(GiB)": 76.04,
"step": 1540,
"token_acc": 0.8031964754405699,
"train_speed(iter/s)": 0.027798
},
{
"epoch": 0.3995086948089728,
"grad_norm": 0.9466427564620972,
"learning_rate": 4.9061864490954725e-06,
"loss": 0.8291332244873046,
"memory(GiB)": 76.04,
"step": 1545,
"token_acc": 0.76341123125218,
"train_speed(iter/s)": 0.027798
},
{
"epoch": 0.40080160320641284,
"grad_norm": 1.047394037246704,
"learning_rate": 4.905216967996367e-06,
"loss": 0.8456403732299804,
"memory(GiB)": 76.04,
"step": 1550,
"token_acc": 0.7513071152534667,
"train_speed(iter/s)": 0.0278
},
{
"epoch": 0.40209451160385284,
"grad_norm": 0.9296531677246094,
"learning_rate": 4.904242600027662e-06,
"loss": 0.8476978302001953,
"memory(GiB)": 76.04,
"step": 1555,
"token_acc": 0.8000686931135154,
"train_speed(iter/s)": 0.027798
},
{
"epoch": 0.4033874200012929,
"grad_norm": 1.176629900932312,
"learning_rate": 4.903263347169058e-06,
"loss": 0.9175498962402344,
"memory(GiB)": 76.04,
"step": 1560,
"token_acc": 0.8039329091960671,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.40468032839873297,
"grad_norm": 1.2354000806808472,
"learning_rate": 4.902279211410182e-06,
"loss": 0.9165899276733398,
"memory(GiB)": 76.04,
"step": 1565,
"token_acc": 0.7679446219382322,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.405973236796173,
"grad_norm": 2.35729718208313,
"learning_rate": 4.901290194750579e-06,
"loss": 0.8489980697631836,
"memory(GiB)": 76.04,
"step": 1570,
"token_acc": 0.8143410397840629,
"train_speed(iter/s)": 0.0278
},
{
"epoch": 0.40726614519361304,
"grad_norm": 0.986346960067749,
"learning_rate": 4.900296299199714e-06,
"loss": 0.87310791015625,
"memory(GiB)": 76.04,
"step": 1575,
"token_acc": 0.7906714736367734,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.4085590535910531,
"grad_norm": 0.9651816487312317,
"learning_rate": 4.899297526776962e-06,
"loss": 0.8573389053344727,
"memory(GiB)": 76.04,
"step": 1580,
"token_acc": 0.7923456022732318,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.4098519619884931,
"grad_norm": 1.0119379758834839,
"learning_rate": 4.898293879511608e-06,
"loss": 0.8485713958740234,
"memory(GiB)": 76.04,
"step": 1585,
"token_acc": 0.7772183472677722,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.41114487038593317,
"grad_norm": 0.925304651260376,
"learning_rate": 4.897285359442841e-06,
"loss": 0.891656494140625,
"memory(GiB)": 76.04,
"step": 1590,
"token_acc": 0.7572081654822794,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.4124377787833732,
"grad_norm": 1.0190355777740479,
"learning_rate": 4.896271968619752e-06,
"loss": 0.8359519004821777,
"memory(GiB)": 76.04,
"step": 1595,
"token_acc": 0.7742064125059818,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.41373068718081324,
"grad_norm": 4.286404132843018,
"learning_rate": 4.895253709101327e-06,
"loss": 0.865880012512207,
"memory(GiB)": 76.04,
"step": 1600,
"token_acc": 0.7423144213946513,
"train_speed(iter/s)": 0.027806
},
{
"epoch": 0.4150235955782533,
"grad_norm": 1.1072417497634888,
"learning_rate": 4.894230582956444e-06,
"loss": 0.8957183837890625,
"memory(GiB)": 76.04,
"step": 1605,
"token_acc": 0.7621177149451818,
"train_speed(iter/s)": 0.027807
},
{
"epoch": 0.4163165039756933,
"grad_norm": 1.0325218439102173,
"learning_rate": 4.89320259226387e-06,
"loss": 0.8576887130737305,
"memory(GiB)": 76.04,
"step": 1610,
"token_acc": 0.7561050328227571,
"train_speed(iter/s)": 0.027809
},
{
"epoch": 0.41760941237313337,
"grad_norm": 1.0209314823150635,
"learning_rate": 4.8921697391122555e-06,
"loss": 0.8784740447998047,
"memory(GiB)": 76.04,
"step": 1615,
"token_acc": 0.7576126674786845,
"train_speed(iter/s)": 0.02781
},
{
"epoch": 0.4189023207705734,
"grad_norm": 1.2361773252487183,
"learning_rate": 4.891132025600128e-06,
"loss": 0.8804727554321289,
"memory(GiB)": 76.04,
"step": 1620,
"token_acc": 0.7516129032258064,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.42019522916801344,
"grad_norm": 1.0383259057998657,
"learning_rate": 4.890089453835894e-06,
"loss": 0.8933810234069824,
"memory(GiB)": 76.04,
"step": 1625,
"token_acc": 0.7678587433898001,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.4214881375654535,
"grad_norm": 1.1485199928283691,
"learning_rate": 4.889042025937829e-06,
"loss": 0.8679392814636231,
"memory(GiB)": 76.04,
"step": 1630,
"token_acc": 0.7426187419768935,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.4227810459628935,
"grad_norm": 0.9472554922103882,
"learning_rate": 4.887989744034074e-06,
"loss": 0.8719472885131836,
"memory(GiB)": 76.04,
"step": 1635,
"token_acc": 0.7547612635142934,
"train_speed(iter/s)": 0.027806
},
{
"epoch": 0.42407395436033357,
"grad_norm": 1.0658771991729736,
"learning_rate": 4.886932610262634e-06,
"loss": 0.8944738388061524,
"memory(GiB)": 76.04,
"step": 1640,
"token_acc": 0.8233047873087183,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.42536686275777363,
"grad_norm": 0.9620645642280579,
"learning_rate": 4.885870626771371e-06,
"loss": 0.8650775909423828,
"memory(GiB)": 76.04,
"step": 1645,
"token_acc": 0.7568502864923129,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.42665977115521364,
"grad_norm": 1.0661425590515137,
"learning_rate": 4.884803795718001e-06,
"loss": 0.9001960754394531,
"memory(GiB)": 76.04,
"step": 1650,
"token_acc": 0.7570867129358642,
"train_speed(iter/s)": 0.027807
},
{
"epoch": 0.4279526795526537,
"grad_norm": 1.1007188558578491,
"learning_rate": 4.88373211927009e-06,
"loss": 0.8699914932250976,
"memory(GiB)": 76.04,
"step": 1655,
"token_acc": 0.7583125246418715,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.4292455879500937,
"grad_norm": 1.0380903482437134,
"learning_rate": 4.882655599605045e-06,
"loss": 0.8764565467834473,
"memory(GiB)": 76.04,
"step": 1660,
"token_acc": 0.7656633221850613,
"train_speed(iter/s)": 0.027808
},
{
"epoch": 0.4305384963475338,
"grad_norm": 1.033884882926941,
"learning_rate": 4.88157423891012e-06,
"loss": 0.8574346542358399,
"memory(GiB)": 76.04,
"step": 1665,
"token_acc": 0.7794178559325226,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.43183140474497383,
"grad_norm": 0.911655068397522,
"learning_rate": 4.8804880393823986e-06,
"loss": 0.8541059494018555,
"memory(GiB)": 76.04,
"step": 1670,
"token_acc": 0.7727084040907204,
"train_speed(iter/s)": 0.027807
},
{
"epoch": 0.43312431314241384,
"grad_norm": 0.9688575863838196,
"learning_rate": 4.8793970032287985e-06,
"loss": 0.8185391426086426,
"memory(GiB)": 76.04,
"step": 1675,
"token_acc": 0.8178748580649017,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.4344172215398539,
"grad_norm": 0.9555157423019409,
"learning_rate": 4.878301132666066e-06,
"loss": 0.8625661849975585,
"memory(GiB)": 76.04,
"step": 1680,
"token_acc": 0.7851330293761182,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.43571012993729397,
"grad_norm": 1.0205975770950317,
"learning_rate": 4.877200429920765e-06,
"loss": 0.8751688003540039,
"memory(GiB)": 76.04,
"step": 1685,
"token_acc": 0.772077701884416,
"train_speed(iter/s)": 0.027807
},
{
"epoch": 0.437003038334734,
"grad_norm": 1.0219794511795044,
"learning_rate": 4.876094897229283e-06,
"loss": 0.810630989074707,
"memory(GiB)": 76.04,
"step": 1690,
"token_acc": 0.8176644891911913,
"train_speed(iter/s)": 0.027806
},
{
"epoch": 0.43829594673217404,
"grad_norm": 1.1282055377960205,
"learning_rate": 4.874984536837817e-06,
"loss": 0.8619385719299316,
"memory(GiB)": 76.04,
"step": 1695,
"token_acc": 0.7675262655205348,
"train_speed(iter/s)": 0.027807
},
{
"epoch": 0.43958885512961404,
"grad_norm": 0.9397704601287842,
"learning_rate": 4.873869351002374e-06,
"loss": 0.820007610321045,
"memory(GiB)": 76.04,
"step": 1700,
"token_acc": 0.7972864541542053,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.4408817635270541,
"grad_norm": 1.0283387899398804,
"learning_rate": 4.872749341988765e-06,
"loss": 0.8253473281860352,
"memory(GiB)": 76.04,
"step": 1705,
"token_acc": 0.7752377949445584,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.44217467192449417,
"grad_norm": 0.9191579818725586,
"learning_rate": 4.871624512072603e-06,
"loss": 0.8367796897888183,
"memory(GiB)": 76.04,
"step": 1710,
"token_acc": 0.7903411821239789,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.4434675803219342,
"grad_norm": 1.2455042600631714,
"learning_rate": 4.870494863539291e-06,
"loss": 0.8392200469970703,
"memory(GiB)": 76.04,
"step": 1715,
"token_acc": 0.7550399545694236,
"train_speed(iter/s)": 0.027798
},
{
"epoch": 0.44476048871937424,
"grad_norm": 1.0765002965927124,
"learning_rate": 4.8693603986840274e-06,
"loss": 0.8334452629089355,
"memory(GiB)": 76.04,
"step": 1720,
"token_acc": 0.7612031220255092,
"train_speed(iter/s)": 0.027797
},
{
"epoch": 0.4460533971168143,
"grad_norm": 1.0502086877822876,
"learning_rate": 4.868221119811793e-06,
"loss": 0.8496732711791992,
"memory(GiB)": 76.04,
"step": 1725,
"token_acc": 0.7960901439044258,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.4473463055142543,
"grad_norm": 0.883604884147644,
"learning_rate": 4.867077029237352e-06,
"loss": 0.817476749420166,
"memory(GiB)": 76.04,
"step": 1730,
"token_acc": 0.746853904492041,
"train_speed(iter/s)": 0.027799
},
{
"epoch": 0.44863921391169437,
"grad_norm": 1.0631402730941772,
"learning_rate": 4.865928129285242e-06,
"loss": 0.8631902694702148,
"memory(GiB)": 76.04,
"step": 1735,
"token_acc": 0.7656874459231181,
"train_speed(iter/s)": 0.0278
},
{
"epoch": 0.4499321223091344,
"grad_norm": 1.0118037462234497,
"learning_rate": 4.864774422289776e-06,
"loss": 0.8337348937988281,
"memory(GiB)": 76.04,
"step": 1740,
"token_acc": 0.7787095835959087,
"train_speed(iter/s)": 0.027798
},
{
"epoch": 0.45122503070657444,
"grad_norm": 1.099346399307251,
"learning_rate": 4.863615910595031e-06,
"loss": 0.8567562103271484,
"memory(GiB)": 76.04,
"step": 1745,
"token_acc": 0.8049837122611412,
"train_speed(iter/s)": 0.027797
},
{
"epoch": 0.4525179391040145,
"grad_norm": 0.9110936522483826,
"learning_rate": 4.8624525965548456e-06,
"loss": 0.858333683013916,
"memory(GiB)": 76.04,
"step": 1750,
"token_acc": 0.7628092095319663,
"train_speed(iter/s)": 0.0278
},
{
"epoch": 0.4538108475014545,
"grad_norm": 1.1097652912139893,
"learning_rate": 4.861284482532819e-06,
"loss": 0.8601787567138672,
"memory(GiB)": 76.04,
"step": 1755,
"token_acc": 0.7758728179551122,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.45510375589889457,
"grad_norm": 0.9955366253852844,
"learning_rate": 4.860111570902298e-06,
"loss": 0.8417009353637696,
"memory(GiB)": 76.04,
"step": 1760,
"token_acc": 0.7899390978219729,
"train_speed(iter/s)": 0.027798
},
{
"epoch": 0.45639666429633463,
"grad_norm": 0.9379090666770935,
"learning_rate": 4.858933864046384e-06,
"loss": 0.8158811569213867,
"memory(GiB)": 76.04,
"step": 1765,
"token_acc": 0.7703970866307165,
"train_speed(iter/s)": 0.027797
},
{
"epoch": 0.45768957269377464,
"grad_norm": 0.9244673252105713,
"learning_rate": 4.857751364357913e-06,
"loss": 0.8572831153869629,
"memory(GiB)": 76.04,
"step": 1770,
"token_acc": 0.7679937895087058,
"train_speed(iter/s)": 0.027796
},
{
"epoch": 0.4589824810912147,
"grad_norm": 0.8768739104270935,
"learning_rate": 4.856564074239467e-06,
"loss": 0.8114492416381835,
"memory(GiB)": 76.04,
"step": 1775,
"token_acc": 0.7533039647577092,
"train_speed(iter/s)": 0.027797
},
{
"epoch": 0.4602753894886547,
"grad_norm": 1.0087144374847412,
"learning_rate": 4.855371996103354e-06,
"loss": 0.8448333740234375,
"memory(GiB)": 76.04,
"step": 1780,
"token_acc": 0.7925396227993142,
"train_speed(iter/s)": 0.027798
},
{
"epoch": 0.46156829788609477,
"grad_norm": 0.9475561380386353,
"learning_rate": 4.854175132371615e-06,
"loss": 0.8426584243774414,
"memory(GiB)": 76.04,
"step": 1785,
"token_acc": 0.7877760352646972,
"train_speed(iter/s)": 0.027799
},
{
"epoch": 0.46286120628353483,
"grad_norm": 0.8809593915939331,
"learning_rate": 4.852973485476014e-06,
"loss": 0.8447649002075195,
"memory(GiB)": 76.04,
"step": 1790,
"token_acc": 0.7644977511244377,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.46415411468097484,
"grad_norm": 0.9489724636077881,
"learning_rate": 4.85176705785803e-06,
"loss": 0.8333120346069336,
"memory(GiB)": 76.04,
"step": 1795,
"token_acc": 0.8094422805290417,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.4654470230784149,
"grad_norm": 1.0435246229171753,
"learning_rate": 4.850555851968858e-06,
"loss": 0.8334157943725586,
"memory(GiB)": 76.04,
"step": 1800,
"token_acc": 0.7686591887926546,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.46673993147585496,
"grad_norm": 1.1650222539901733,
"learning_rate": 4.849339870269401e-06,
"loss": 0.9079343795776367,
"memory(GiB)": 76.04,
"step": 1805,
"token_acc": 0.7502884738664045,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.46803283987329497,
"grad_norm": 0.9376285076141357,
"learning_rate": 4.848119115230264e-06,
"loss": 0.8245293617248535,
"memory(GiB)": 76.04,
"step": 1810,
"token_acc": 0.7958735551228404,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.46932574827073503,
"grad_norm": 0.9769212603569031,
"learning_rate": 4.8468935893317545e-06,
"loss": 0.8638315200805664,
"memory(GiB)": 76.04,
"step": 1815,
"token_acc": 0.7886973180076629,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.47061865666817504,
"grad_norm": 0.9659924507141113,
"learning_rate": 4.8456632950638675e-06,
"loss": 0.8185907363891601,
"memory(GiB)": 76.04,
"step": 1820,
"token_acc": 0.7737749169435216,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.4719115650656151,
"grad_norm": 0.9423291683197021,
"learning_rate": 4.844428234926291e-06,
"loss": 0.8167947769165039,
"memory(GiB)": 76.04,
"step": 1825,
"token_acc": 0.7969283276450512,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.47320447346305516,
"grad_norm": 0.9784870147705078,
"learning_rate": 4.843188411428394e-06,
"loss": 0.838237190246582,
"memory(GiB)": 76.04,
"step": 1830,
"token_acc": 0.8140107775211701,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.47449738186049517,
"grad_norm": 2.1008307933807373,
"learning_rate": 4.841943827089223e-06,
"loss": 0.8713891983032227,
"memory(GiB)": 76.04,
"step": 1835,
"token_acc": 0.7783555923255723,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.47579029025793523,
"grad_norm": 1.0217597484588623,
"learning_rate": 4.840694484437499e-06,
"loss": 0.8342850685119629,
"memory(GiB)": 76.04,
"step": 1840,
"token_acc": 0.7716500553709856,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.47708319865537524,
"grad_norm": 0.935716986656189,
"learning_rate": 4.8394403860116115e-06,
"loss": 0.8083118438720703,
"memory(GiB)": 76.04,
"step": 1845,
"token_acc": 0.7871954487364472,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.4783761070528153,
"grad_norm": 0.906399667263031,
"learning_rate": 4.83818153435961e-06,
"loss": 0.8438366889953614,
"memory(GiB)": 76.04,
"step": 1850,
"token_acc": 0.7733610953372453,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.47966901545025536,
"grad_norm": 1.9505832195281982,
"learning_rate": 4.836917932039204e-06,
"loss": 0.8615127563476562,
"memory(GiB)": 76.04,
"step": 1855,
"token_acc": 0.7813404825737266,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.48096192384769537,
"grad_norm": 1.1248425245285034,
"learning_rate": 4.835649581617753e-06,
"loss": 0.8535722732543946,
"memory(GiB)": 76.04,
"step": 1860,
"token_acc": 0.7716059271125351,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.48225483224513543,
"grad_norm": 0.9488275647163391,
"learning_rate": 4.834376485672266e-06,
"loss": 0.8235734939575196,
"memory(GiB)": 76.04,
"step": 1865,
"token_acc": 0.772141609970498,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.4835477406425755,
"grad_norm": 0.9314141273498535,
"learning_rate": 4.833098646789393e-06,
"loss": 0.825401496887207,
"memory(GiB)": 76.04,
"step": 1870,
"token_acc": 0.802578972013111,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.4848406490400155,
"grad_norm": 1.1258958578109741,
"learning_rate": 4.831816067565419e-06,
"loss": 0.8634084701538086,
"memory(GiB)": 76.04,
"step": 1875,
"token_acc": 0.7933092156789617,
"train_speed(iter/s)": 0.027807
},
{
"epoch": 0.48613355743745557,
"grad_norm": 0.8910898566246033,
"learning_rate": 4.830528750606263e-06,
"loss": 0.8147882461547852,
"memory(GiB)": 76.04,
"step": 1880,
"token_acc": 0.812186275932105,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.4874264658348956,
"grad_norm": 1.1478573083877563,
"learning_rate": 4.829236698527469e-06,
"loss": 0.8461570739746094,
"memory(GiB)": 76.04,
"step": 1885,
"token_acc": 0.7892949615858058,
"train_speed(iter/s)": 0.027802
},
{
"epoch": 0.48871937423233563,
"grad_norm": 0.9229076504707336,
"learning_rate": 4.827939913954199e-06,
"loss": 0.8387362480163574,
"memory(GiB)": 76.04,
"step": 1890,
"token_acc": 0.8044181034482759,
"train_speed(iter/s)": 0.027803
},
{
"epoch": 0.4900122826297757,
"grad_norm": 1.209421992301941,
"learning_rate": 4.826638399521235e-06,
"loss": 0.8628839492797852,
"memory(GiB)": 76.04,
"step": 1895,
"token_acc": 0.7861487236403996,
"train_speed(iter/s)": 0.027804
},
{
"epoch": 0.4913051910272157,
"grad_norm": 0.9494127631187439,
"learning_rate": 4.825332157872966e-06,
"loss": 0.8163295745849609,
"memory(GiB)": 76.04,
"step": 1900,
"token_acc": 0.7902574714203331,
"train_speed(iter/s)": 0.027806
},
{
"epoch": 0.49259809942465577,
"grad_norm": 1.0148690938949585,
"learning_rate": 4.824021191663387e-06,
"loss": 0.8092700004577636,
"memory(GiB)": 76.04,
"step": 1905,
"token_acc": 0.7959501969388564,
"train_speed(iter/s)": 0.027805
},
{
"epoch": 0.49389100782209583,
"grad_norm": 1.064133644104004,
"learning_rate": 4.822705503556092e-06,
"loss": 0.8303569793701172,
"memory(GiB)": 76.04,
"step": 1910,
"token_acc": 0.8055975400010423,
"train_speed(iter/s)": 0.027802
},
{
"epoch": 0.49518391621953584,
"grad_norm": 1.0159554481506348,
"learning_rate": 4.821385096224268e-06,
"loss": 0.8641040802001954,
"memory(GiB)": 76.04,
"step": 1915,
"token_acc": 0.7566073149698169,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.4964768246169759,
"grad_norm": 1.0026555061340332,
"learning_rate": 4.820059972350691e-06,
"loss": 0.8560010910034179,
"memory(GiB)": 76.04,
"step": 1920,
"token_acc": 0.7960721112402084,
"train_speed(iter/s)": 0.027801
},
{
"epoch": 0.4977697330144159,
"grad_norm": 0.8519271016120911,
"learning_rate": 4.81873013462772e-06,
"loss": 0.8016116142272949,
"memory(GiB)": 76.04,
"step": 1925,
"token_acc": 0.7795004600129485,
"train_speed(iter/s)": 0.027796
},
{
"epoch": 0.49906264141185597,
"grad_norm": 0.9784479141235352,
"learning_rate": 4.8173955857572926e-06,
"loss": 0.8383674621582031,
"memory(GiB)": 76.04,
"step": 1930,
"token_acc": 0.7987399059366403,
"train_speed(iter/s)": 0.027796
},
{
"epoch": 0.500355549809296,
"grad_norm": 0.958125650882721,
"learning_rate": 4.816056328450916e-06,
"loss": 0.8211706161499024,
"memory(GiB)": 76.04,
"step": 1935,
"token_acc": 0.8177427201334353,
"train_speed(iter/s)": 0.02779
},
{
"epoch": 0.5016484582067361,
"grad_norm": 1.0574296712875366,
"learning_rate": 4.814712365429665e-06,
"loss": 0.8358111381530762,
"memory(GiB)": 76.04,
"step": 1940,
"token_acc": 0.773138165533256,
"train_speed(iter/s)": 0.02779
},
{
"epoch": 0.5029413666041761,
"grad_norm": 1.152383804321289,
"learning_rate": 4.813363699424176e-06,
"loss": 0.8466585159301758,
"memory(GiB)": 76.04,
"step": 1945,
"token_acc": 0.7601995890813033,
"train_speed(iter/s)": 0.027789
},
{
"epoch": 0.5042342750016161,
"grad_norm": 0.9282750487327576,
"learning_rate": 4.812010333174642e-06,
"loss": 0.821980094909668,
"memory(GiB)": 76.04,
"step": 1950,
"token_acc": 0.7666437886067261,
"train_speed(iter/s)": 0.027792
},
{
"epoch": 0.5055271833990562,
"grad_norm": 1.0676088333129883,
"learning_rate": 4.8106522694308026e-06,
"loss": 0.8337220191955567,
"memory(GiB)": 76.04,
"step": 1955,
"token_acc": 0.7935955447267664,
"train_speed(iter/s)": 0.027792
},
{
"epoch": 0.5068200917964962,
"grad_norm": 1.028906226158142,
"learning_rate": 4.809289510951943e-06,
"loss": 0.8194513320922852,
"memory(GiB)": 76.04,
"step": 1960,
"token_acc": 0.7874173098125689,
"train_speed(iter/s)": 0.027794
},
{
"epoch": 0.5081130001939362,
"grad_norm": 1.000504732131958,
"learning_rate": 4.807922060506889e-06,
"loss": 0.8190900802612304,
"memory(GiB)": 76.04,
"step": 1965,
"token_acc": 0.7874103263615237,
"train_speed(iter/s)": 0.027795
},
{
"epoch": 0.5094059085913764,
"grad_norm": 0.9075422883033752,
"learning_rate": 4.806549920873996e-06,
"loss": 0.797203254699707,
"memory(GiB)": 76.04,
"step": 1970,
"token_acc": 0.7732008028290165,
"train_speed(iter/s)": 0.027794
},
{
"epoch": 0.5106988169888164,
"grad_norm": 1.4181991815567017,
"learning_rate": 4.8051730948411505e-06,
"loss": 0.795828914642334,
"memory(GiB)": 76.04,
"step": 1975,
"token_acc": 0.8003244957409934,
"train_speed(iter/s)": 0.027794
},
{
"epoch": 0.5119917253862564,
"grad_norm": 1.0753087997436523,
"learning_rate": 4.803791585205757e-06,
"loss": 0.8330776214599609,
"memory(GiB)": 76.04,
"step": 1980,
"token_acc": 0.7414700390426073,
"train_speed(iter/s)": 0.027795
},
{
"epoch": 0.5132846337836964,
"grad_norm": 1.0679435729980469,
"learning_rate": 4.802405394774739e-06,
"loss": 0.8332581520080566,
"memory(GiB)": 76.04,
"step": 1985,
"token_acc": 0.7553387146214366,
"train_speed(iter/s)": 0.027794
},
{
"epoch": 0.5145775421811365,
"grad_norm": 0.8826218843460083,
"learning_rate": 4.801014526364531e-06,
"loss": 0.7712962627410889,
"memory(GiB)": 76.04,
"step": 1990,
"token_acc": 0.7524300269352383,
"train_speed(iter/s)": 0.027795
},
{
"epoch": 0.5158704505785765,
"grad_norm": 3.6322293281555176,
"learning_rate": 4.799618982801066e-06,
"loss": 0.8304604530334473,
"memory(GiB)": 76.04,
"step": 1995,
"token_acc": 0.8159029172079839,
"train_speed(iter/s)": 0.027797
},
{
"epoch": 0.5171633589760165,
"grad_norm": 1.0873634815216064,
"learning_rate": 4.798218766919784e-06,
"loss": 0.8011078834533691,
"memory(GiB)": 76.04,
"step": 2000,
"token_acc": 0.7605788670946689,
"train_speed(iter/s)": 0.027796
},
{
"epoch": 0.5184562673734566,
"grad_norm": 0.9646498560905457,
"learning_rate": 4.796813881565614e-06,
"loss": 0.7656961441040039,
"memory(GiB)": 76.04,
"step": 2005,
"token_acc": 0.8116391078933645,
"train_speed(iter/s)": 0.027724
},
{
"epoch": 0.5197491757708966,
"grad_norm": 0.9246786832809448,
"learning_rate": 4.795404329592971e-06,
"loss": 0.7999061107635498,
"memory(GiB)": 76.04,
"step": 2010,
"token_acc": 0.8214931011826544,
"train_speed(iter/s)": 0.027723
},
{
"epoch": 0.5210420841683366,
"grad_norm": 0.9651414155960083,
"learning_rate": 4.793990113865754e-06,
"loss": 0.8470598220825195,
"memory(GiB)": 76.04,
"step": 2015,
"token_acc": 0.7925045299862995,
"train_speed(iter/s)": 0.027723
},
{
"epoch": 0.5223349925657768,
"grad_norm": 0.9942532777786255,
"learning_rate": 4.792571237257338e-06,
"loss": 0.8307376861572265,
"memory(GiB)": 76.04,
"step": 2020,
"token_acc": 0.760911584985659,
"train_speed(iter/s)": 0.027724
},
{
"epoch": 0.5236279009632168,
"grad_norm": 0.9892558455467224,
"learning_rate": 4.7911477026505656e-06,
"loss": 0.8515020370483398,
"memory(GiB)": 76.04,
"step": 2025,
"token_acc": 0.7540479906359735,
"train_speed(iter/s)": 0.027722
},
{
"epoch": 0.5249208093606568,
"grad_norm": 1.0707569122314453,
"learning_rate": 4.789719512937745e-06,
"loss": 0.8141921997070313,
"memory(GiB)": 76.04,
"step": 2030,
"token_acc": 0.7763675366464069,
"train_speed(iter/s)": 0.027721
},
{
"epoch": 0.5262137177580969,
"grad_norm": 0.9917581677436829,
"learning_rate": 4.788286671020642e-06,
"loss": 0.8206811904907226,
"memory(GiB)": 76.04,
"step": 2035,
"token_acc": 0.7850752688172044,
"train_speed(iter/s)": 0.027723
},
{
"epoch": 0.5275066261555369,
"grad_norm": 0.9246799945831299,
"learning_rate": 4.786849179810475e-06,
"loss": 0.7965336799621582,
"memory(GiB)": 76.04,
"step": 2040,
"token_acc": 0.7653000594177065,
"train_speed(iter/s)": 0.027725
},
{
"epoch": 0.5287995345529769,
"grad_norm": 1.071942925453186,
"learning_rate": 4.78540704222791e-06,
"loss": 0.8213727951049805,
"memory(GiB)": 76.04,
"step": 2045,
"token_acc": 0.8214273371349576,
"train_speed(iter/s)": 0.027722
},
{
"epoch": 0.5300924429504169,
"grad_norm": 1.2201160192489624,
"learning_rate": 4.783960261203051e-06,
"loss": 0.8097395896911621,
"memory(GiB)": 76.04,
"step": 2050,
"token_acc": 0.7943978387601308,
"train_speed(iter/s)": 0.02772
},
{
"epoch": 0.531385351347857,
"grad_norm": 0.9751284122467041,
"learning_rate": 4.782508839675436e-06,
"loss": 0.8254419326782226,
"memory(GiB)": 76.04,
"step": 2055,
"token_acc": 0.7764489832482308,
"train_speed(iter/s)": 0.027722
},
{
"epoch": 0.532678259745297,
"grad_norm": 1.0070680379867554,
"learning_rate": 4.7810527805940344e-06,
"loss": 0.8492563247680665,
"memory(GiB)": 76.04,
"step": 2060,
"token_acc": 0.7705357535270074,
"train_speed(iter/s)": 0.027723
},
{
"epoch": 0.533971168142737,
"grad_norm": 0.8822097182273865,
"learning_rate": 4.779592086917238e-06,
"loss": 0.7865631580352783,
"memory(GiB)": 76.04,
"step": 2065,
"token_acc": 0.799327011318446,
"train_speed(iter/s)": 0.027724
},
{
"epoch": 0.5352640765401772,
"grad_norm": 1.0193886756896973,
"learning_rate": 4.77812676161285e-06,
"loss": 0.8170513153076172,
"memory(GiB)": 76.04,
"step": 2070,
"token_acc": 0.7710854546297584,
"train_speed(iter/s)": 0.027726
},
{
"epoch": 0.5365569849376172,
"grad_norm": 0.9742515683174133,
"learning_rate": 4.776656807658091e-06,
"loss": 0.844205379486084,
"memory(GiB)": 76.04,
"step": 2075,
"token_acc": 0.7571799189841154,
"train_speed(iter/s)": 0.027726
},
{
"epoch": 0.5378498933350572,
"grad_norm": 1.2338061332702637,
"learning_rate": 4.775182228039582e-06,
"loss": 0.8240803718566895,
"memory(GiB)": 76.04,
"step": 2080,
"token_acc": 0.7739548334963637,
"train_speed(iter/s)": 0.027728
},
{
"epoch": 0.5391428017324973,
"grad_norm": 1.135621428489685,
"learning_rate": 4.773703025753343e-06,
"loss": 0.7704273700714112,
"memory(GiB)": 76.04,
"step": 2085,
"token_acc": 0.8158826332629859,
"train_speed(iter/s)": 0.02773
},
{
"epoch": 0.5404357101299373,
"grad_norm": 0.9862043261528015,
"learning_rate": 4.772219203804785e-06,
"loss": 0.8293350219726563,
"memory(GiB)": 76.04,
"step": 2090,
"token_acc": 0.7778981581798483,
"train_speed(iter/s)": 0.027731
},
{
"epoch": 0.5417286185273773,
"grad_norm": 1.0542078018188477,
"learning_rate": 4.770730765208708e-06,
"loss": 0.8214458465576172,
"memory(GiB)": 76.04,
"step": 2095,
"token_acc": 0.8010532239909953,
"train_speed(iter/s)": 0.027732
},
{
"epoch": 0.5430215269248174,
"grad_norm": 1.3685965538024902,
"learning_rate": 4.76923771298929e-06,
"loss": 0.7963518142700196,
"memory(GiB)": 76.04,
"step": 2100,
"token_acc": 0.7876639186707104,
"train_speed(iter/s)": 0.027734
},
{
"epoch": 0.5443144353222574,
"grad_norm": 0.9173294901847839,
"learning_rate": 4.767740050180083e-06,
"loss": 0.797146987915039,
"memory(GiB)": 76.04,
"step": 2105,
"token_acc": 0.8026390843061946,
"train_speed(iter/s)": 0.027727
},
{
"epoch": 0.5456073437196974,
"grad_norm": 1.0344001054763794,
"learning_rate": 4.766237779824008e-06,
"loss": 0.8145599365234375,
"memory(GiB)": 76.04,
"step": 2110,
"token_acc": 0.8000528162372204,
"train_speed(iter/s)": 0.027726
},
{
"epoch": 0.5469002521171376,
"grad_norm": 0.9387233257293701,
"learning_rate": 4.764730904973345e-06,
"loss": 0.8474384307861328,
"memory(GiB)": 76.04,
"step": 2115,
"token_acc": 0.7702894841608372,
"train_speed(iter/s)": 0.027726
},
{
"epoch": 0.5481931605145776,
"grad_norm": 0.8692566156387329,
"learning_rate": 4.7632194286897315e-06,
"loss": 0.8177039146423339,
"memory(GiB)": 76.04,
"step": 2120,
"token_acc": 0.8068763457940626,
"train_speed(iter/s)": 0.027729
},
{
"epoch": 0.5494860689120176,
"grad_norm": 1.0659557580947876,
"learning_rate": 4.761703354044155e-06,
"loss": 0.7883958339691162,
"memory(GiB)": 76.04,
"step": 2125,
"token_acc": 0.800734618916437,
"train_speed(iter/s)": 0.027731
},
{
"epoch": 0.5507789773094576,
"grad_norm": 0.9900258779525757,
"learning_rate": 4.760182684116942e-06,
"loss": 0.8056777954101563,
"memory(GiB)": 76.04,
"step": 2130,
"token_acc": 0.7733108386141059,
"train_speed(iter/s)": 0.027731
},
{
"epoch": 0.5520718857068977,
"grad_norm": 1.03944993019104,
"learning_rate": 4.7586574219977585e-06,
"loss": 0.8212559700012207,
"memory(GiB)": 76.04,
"step": 2135,
"token_acc": 0.7548755884330868,
"train_speed(iter/s)": 0.02773
},
{
"epoch": 0.5533647941043377,
"grad_norm": 0.9362234473228455,
"learning_rate": 4.7571275707856e-06,
"loss": 0.798857307434082,
"memory(GiB)": 76.04,
"step": 2140,
"token_acc": 0.8130052348563085,
"train_speed(iter/s)": 0.027732
},
{
"epoch": 0.5546577025017777,
"grad_norm": 1.0358259677886963,
"learning_rate": 4.755593133588788e-06,
"loss": 0.8120311737060547,
"memory(GiB)": 76.04,
"step": 2145,
"token_acc": 0.8000494239026349,
"train_speed(iter/s)": 0.027731
},
{
"epoch": 0.5559506108992178,
"grad_norm": 1.1722190380096436,
"learning_rate": 4.754054113524959e-06,
"loss": 0.8086760520935059,
"memory(GiB)": 76.04,
"step": 2150,
"token_acc": 0.8190579981609508,
"train_speed(iter/s)": 0.027731
},
{
"epoch": 0.5572435192966578,
"grad_norm": 0.9975719451904297,
"learning_rate": 4.752510513721061e-06,
"loss": 0.8197290420532226,
"memory(GiB)": 76.04,
"step": 2155,
"token_acc": 0.7630993323892373,
"train_speed(iter/s)": 0.027732
},
{
"epoch": 0.5585364276940978,
"grad_norm": 1.0064895153045654,
"learning_rate": 4.750962337313347e-06,
"loss": 0.8426996231079101,
"memory(GiB)": 76.04,
"step": 2160,
"token_acc": 0.7553154809791978,
"train_speed(iter/s)": 0.02773
},
{
"epoch": 0.559829336091538,
"grad_norm": 1.056726336479187,
"learning_rate": 4.749409587447372e-06,
"loss": 0.8352632522583008,
"memory(GiB)": 76.04,
"step": 2165,
"token_acc": 0.8019056825243389,
"train_speed(iter/s)": 0.027734
},
{
"epoch": 0.561122244488978,
"grad_norm": 0.9361665844917297,
"learning_rate": 4.747852267277981e-06,
"loss": 0.765074634552002,
"memory(GiB)": 76.04,
"step": 2170,
"token_acc": 0.7859190721313611,
"train_speed(iter/s)": 0.027733
},
{
"epoch": 0.562415152886418,
"grad_norm": 1.1270101070404053,
"learning_rate": 4.746290379969301e-06,
"loss": 0.8160411834716796,
"memory(GiB)": 76.04,
"step": 2175,
"token_acc": 0.7946054543900145,
"train_speed(iter/s)": 0.027735
},
{
"epoch": 0.5637080612838581,
"grad_norm": 0.957750678062439,
"learning_rate": 4.744723928694745e-06,
"loss": 0.8085262298583984,
"memory(GiB)": 76.04,
"step": 2180,
"token_acc": 0.7642607683352736,
"train_speed(iter/s)": 0.027735
},
{
"epoch": 0.5650009696812981,
"grad_norm": 1.0245423316955566,
"learning_rate": 4.743152916636995e-06,
"loss": 0.793109130859375,
"memory(GiB)": 76.04,
"step": 2185,
"token_acc": 0.7618901098901099,
"train_speed(iter/s)": 0.027736
},
{
"epoch": 0.5662938780787381,
"grad_norm": 1.0268268585205078,
"learning_rate": 4.7415773469880015e-06,
"loss": 0.8279844284057617,
"memory(GiB)": 76.04,
"step": 2190,
"token_acc": 0.7590428234859334,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.5675867864761782,
"grad_norm": 0.9654160737991333,
"learning_rate": 4.739997222948972e-06,
"loss": 0.8115758895874023,
"memory(GiB)": 76.04,
"step": 2195,
"token_acc": 0.824989417785816,
"train_speed(iter/s)": 0.027738
},
{
"epoch": 0.5688796948736182,
"grad_norm": 0.9180038571357727,
"learning_rate": 4.738412547730371e-06,
"loss": 0.7820042133331299,
"memory(GiB)": 76.04,
"step": 2200,
"token_acc": 0.7811721577290032,
"train_speed(iter/s)": 0.027738
},
{
"epoch": 0.5701726032710582,
"grad_norm": 0.9447706341743469,
"learning_rate": 4.736823324551909e-06,
"loss": 0.8502116203308105,
"memory(GiB)": 76.04,
"step": 2205,
"token_acc": 0.7345110180295028,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.5714655116684982,
"grad_norm": 1.0418199300765991,
"learning_rate": 4.7352295566425355e-06,
"loss": 0.7954240322113038,
"memory(GiB)": 76.04,
"step": 2210,
"token_acc": 0.7976113712187053,
"train_speed(iter/s)": 0.027736
},
{
"epoch": 0.5727584200659384,
"grad_norm": 2.2441470623016357,
"learning_rate": 4.733631247240435e-06,
"loss": 0.8036426544189453,
"memory(GiB)": 76.04,
"step": 2215,
"token_acc": 0.7925195951601857,
"train_speed(iter/s)": 0.027736
},
{
"epoch": 0.5740513284633784,
"grad_norm": 0.8851604461669922,
"learning_rate": 4.732028399593018e-06,
"loss": 0.8041337013244629,
"memory(GiB)": 76.04,
"step": 2220,
"token_acc": 0.7804418779814211,
"train_speed(iter/s)": 0.027734
},
{
"epoch": 0.5753442368608184,
"grad_norm": 0.897997260093689,
"learning_rate": 4.730421016956919e-06,
"loss": 0.7801138877868652,
"memory(GiB)": 76.04,
"step": 2225,
"token_acc": 0.8051513959889894,
"train_speed(iter/s)": 0.027732
},
{
"epoch": 0.5766371452582585,
"grad_norm": 3.450253486633301,
"learning_rate": 4.728809102597984e-06,
"loss": 0.795560646057129,
"memory(GiB)": 76.04,
"step": 2230,
"token_acc": 0.777429320351994,
"train_speed(iter/s)": 0.027732
},
{
"epoch": 0.5779300536556985,
"grad_norm": 1.5096064805984497,
"learning_rate": 4.727192659791265e-06,
"loss": 0.800804615020752,
"memory(GiB)": 76.04,
"step": 2235,
"token_acc": 0.7972484309406044,
"train_speed(iter/s)": 0.027733
},
{
"epoch": 0.5792229620531385,
"grad_norm": 1.0118114948272705,
"learning_rate": 4.72557169182102e-06,
"loss": 0.7758650302886962,
"memory(GiB)": 76.04,
"step": 2240,
"token_acc": 0.7874528625299966,
"train_speed(iter/s)": 0.027734
},
{
"epoch": 0.5805158704505786,
"grad_norm": 1.16028892993927,
"learning_rate": 4.723946201980695e-06,
"loss": 0.8420794486999512,
"memory(GiB)": 76.04,
"step": 2245,
"token_acc": 0.7777456885881674,
"train_speed(iter/s)": 0.027735
},
{
"epoch": 0.5818087788480186,
"grad_norm": 1.1023540496826172,
"learning_rate": 4.7223161935729274e-06,
"loss": 0.801850700378418,
"memory(GiB)": 76.04,
"step": 2250,
"token_acc": 0.7952162077736624,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.5831016872454586,
"grad_norm": 0.8935644626617432,
"learning_rate": 4.7206816699095345e-06,
"loss": 0.7811629295349121,
"memory(GiB)": 76.04,
"step": 2255,
"token_acc": 0.789712556732224,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.5843945956428988,
"grad_norm": 1.0098074674606323,
"learning_rate": 4.719042634311507e-06,
"loss": 0.8304760932922364,
"memory(GiB)": 76.04,
"step": 2260,
"token_acc": 0.7755578712853498,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.5856875040403388,
"grad_norm": 1.1288141012191772,
"learning_rate": 4.717399090109003e-06,
"loss": 0.8142587661743164,
"memory(GiB)": 76.04,
"step": 2265,
"token_acc": 0.7781233799896319,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.5869804124377788,
"grad_norm": 1.0086054801940918,
"learning_rate": 4.715751040641341e-06,
"loss": 0.8228842735290527,
"memory(GiB)": 76.04,
"step": 2270,
"token_acc": 0.7793262574988463,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.5882733208352188,
"grad_norm": 5.436489105224609,
"learning_rate": 4.714098489256994e-06,
"loss": 0.7786747932434082,
"memory(GiB)": 76.04,
"step": 2275,
"token_acc": 0.8480059038774945,
"train_speed(iter/s)": 0.027741
},
{
"epoch": 0.5895662292326589,
"grad_norm": 0.8497810363769531,
"learning_rate": 4.712441439313583e-06,
"loss": 0.7513184070587158,
"memory(GiB)": 76.04,
"step": 2280,
"token_acc": 0.804937625403472,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.5908591376300989,
"grad_norm": 1.5754011869430542,
"learning_rate": 4.710779894177864e-06,
"loss": 0.8058387756347656,
"memory(GiB)": 76.04,
"step": 2285,
"token_acc": 0.7810834813499112,
"train_speed(iter/s)": 0.027738
},
{
"epoch": 0.5921520460275389,
"grad_norm": 1.010524868965149,
"learning_rate": 4.709113857225732e-06,
"loss": 0.8032638549804687,
"memory(GiB)": 76.04,
"step": 2290,
"token_acc": 0.8110142754505982,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.593444954424979,
"grad_norm": 0.877875804901123,
"learning_rate": 4.707443331842206e-06,
"loss": 0.809267234802246,
"memory(GiB)": 76.04,
"step": 2295,
"token_acc": 0.7685890635548269,
"train_speed(iter/s)": 0.027741
},
{
"epoch": 0.594737862822419,
"grad_norm": 1.047855257987976,
"learning_rate": 4.705768321421425e-06,
"loss": 0.7906962394714355,
"memory(GiB)": 76.04,
"step": 2300,
"token_acc": 0.7821157343031341,
"train_speed(iter/s)": 0.027738
},
{
"epoch": 0.596030771219859,
"grad_norm": 1.188430905342102,
"learning_rate": 4.704088829366638e-06,
"loss": 0.8145524978637695,
"memory(GiB)": 76.04,
"step": 2305,
"token_acc": 0.7796888204006561,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.5973236796172992,
"grad_norm": 1.0411370992660522,
"learning_rate": 4.702404859090204e-06,
"loss": 0.7802029609680176,
"memory(GiB)": 76.04,
"step": 2310,
"token_acc": 0.7938298768784233,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.5986165880147392,
"grad_norm": 0.9956724643707275,
"learning_rate": 4.700716414013577e-06,
"loss": 0.7613677978515625,
"memory(GiB)": 76.04,
"step": 2315,
"token_acc": 0.8293824550807791,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.5999094964121792,
"grad_norm": 1.021669626235962,
"learning_rate": 4.6990234975673065e-06,
"loss": 0.7912391662597656,
"memory(GiB)": 76.04,
"step": 2320,
"token_acc": 0.7770263788968825,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.6012024048096193,
"grad_norm": 2.0476624965667725,
"learning_rate": 4.697326113191024e-06,
"loss": 0.8161981582641602,
"memory(GiB)": 76.04,
"step": 2325,
"token_acc": 0.7861008259755056,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.6024953132070593,
"grad_norm": 2.5752296447753906,
"learning_rate": 4.695624264333438e-06,
"loss": 0.7860607624053955,
"memory(GiB)": 76.04,
"step": 2330,
"token_acc": 0.7906607543657962,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.6037882216044993,
"grad_norm": 1.1529428958892822,
"learning_rate": 4.6939179544523315e-06,
"loss": 0.8076473236083984,
"memory(GiB)": 76.04,
"step": 2335,
"token_acc": 0.7956367704642924,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.6050811300019394,
"grad_norm": 0.9944195747375488,
"learning_rate": 4.692207187014548e-06,
"loss": 0.8114787101745605,
"memory(GiB)": 76.04,
"step": 2340,
"token_acc": 0.8053776627151746,
"train_speed(iter/s)": 0.027736
},
{
"epoch": 0.6063740383993794,
"grad_norm": 0.9465590715408325,
"learning_rate": 4.690491965495989e-06,
"loss": 0.7890607357025147,
"memory(GiB)": 76.04,
"step": 2345,
"token_acc": 0.7868282075178626,
"train_speed(iter/s)": 0.027736
},
{
"epoch": 0.6076669467968194,
"grad_norm": 1.0112555027008057,
"learning_rate": 4.688772293381608e-06,
"loss": 0.7973843574523926,
"memory(GiB)": 76.04,
"step": 2350,
"token_acc": 0.7798850081524071,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.6089598551942594,
"grad_norm": 1.1251353025436401,
"learning_rate": 4.6870481741653965e-06,
"loss": 0.8469139099121094,
"memory(GiB)": 76.04,
"step": 2355,
"token_acc": 0.7770078088638361,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.6102527635916996,
"grad_norm": 0.9500820636749268,
"learning_rate": 4.685319611350384e-06,
"loss": 0.8143545150756836,
"memory(GiB)": 76.04,
"step": 2360,
"token_acc": 0.8021919497701536,
"train_speed(iter/s)": 0.027741
},
{
"epoch": 0.6115456719891396,
"grad_norm": 1.0462709665298462,
"learning_rate": 4.683586608448629e-06,
"loss": 0.7490966320037842,
"memory(GiB)": 76.04,
"step": 2365,
"token_acc": 0.8057272352698805,
"train_speed(iter/s)": 0.027738
},
{
"epoch": 0.6128385803865796,
"grad_norm": 0.982092022895813,
"learning_rate": 4.681849168981211e-06,
"loss": 0.8468921661376954,
"memory(GiB)": 76.04,
"step": 2370,
"token_acc": 0.7924534664148908,
"train_speed(iter/s)": 0.02774
},
{
"epoch": 0.6141314887840197,
"grad_norm": 1.270372748374939,
"learning_rate": 4.680107296478223e-06,
"loss": 0.799936580657959,
"memory(GiB)": 76.04,
"step": 2375,
"token_acc": 0.8000295322824763,
"train_speed(iter/s)": 0.027741
},
{
"epoch": 0.6154243971814597,
"grad_norm": 1.3359791040420532,
"learning_rate": 4.678360994478763e-06,
"loss": 0.8011417388916016,
"memory(GiB)": 76.04,
"step": 2380,
"token_acc": 0.7963584606708382,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.6167173055788997,
"grad_norm": 1.0611239671707153,
"learning_rate": 4.676610266530935e-06,
"loss": 0.800925350189209,
"memory(GiB)": 76.04,
"step": 2385,
"token_acc": 0.7784312845148835,
"train_speed(iter/s)": 0.027735
},
{
"epoch": 0.6180102139763398,
"grad_norm": 0.9599133729934692,
"learning_rate": 4.6748551161918285e-06,
"loss": 0.7691280364990234,
"memory(GiB)": 76.04,
"step": 2390,
"token_acc": 0.8164638974875819,
"train_speed(iter/s)": 0.027734
},
{
"epoch": 0.6193031223737798,
"grad_norm": 1.0434238910675049,
"learning_rate": 4.673095547027522e-06,
"loss": 0.7575326442718506,
"memory(GiB)": 76.04,
"step": 2395,
"token_acc": 0.8145789878142496,
"train_speed(iter/s)": 0.027734
},
{
"epoch": 0.6205960307712198,
"grad_norm": 1.002805233001709,
"learning_rate": 4.671331562613072e-06,
"loss": 0.7855173110961914,
"memory(GiB)": 76.04,
"step": 2400,
"token_acc": 0.8110472959950661,
"train_speed(iter/s)": 0.027734
},
{
"epoch": 0.62188893916866,
"grad_norm": 0.8859378099441528,
"learning_rate": 4.669563166532504e-06,
"loss": 0.807244873046875,
"memory(GiB)": 76.04,
"step": 2405,
"token_acc": 0.7864419894252676,
"train_speed(iter/s)": 0.027733
},
{
"epoch": 0.6231818475661,
"grad_norm": 2.113131046295166,
"learning_rate": 4.667790362378809e-06,
"loss": 0.794129753112793,
"memory(GiB)": 76.04,
"step": 2410,
"token_acc": 0.7970005356186395,
"train_speed(iter/s)": 0.027733
},
{
"epoch": 0.62447475596354,
"grad_norm": 1.0956636667251587,
"learning_rate": 4.6660131537539335e-06,
"loss": 0.8120314598083496,
"memory(GiB)": 76.04,
"step": 2415,
"token_acc": 0.7850858214337227,
"train_speed(iter/s)": 0.027734
},
{
"epoch": 0.62576766436098,
"grad_norm": 2.5566296577453613,
"learning_rate": 4.664231544268774e-06,
"loss": 0.7688230037689209,
"memory(GiB)": 76.04,
"step": 2420,
"token_acc": 0.7974286336892569,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.6270605727584201,
"grad_norm": 0.8976960182189941,
"learning_rate": 4.662445537543164e-06,
"loss": 0.8087752342224122,
"memory(GiB)": 76.04,
"step": 2425,
"token_acc": 0.7868685635201693,
"train_speed(iter/s)": 0.027735
},
{
"epoch": 0.6283534811558601,
"grad_norm": 1.0024232864379883,
"learning_rate": 4.660655137205878e-06,
"loss": 0.7957705020904541,
"memory(GiB)": 76.04,
"step": 2430,
"token_acc": 0.7706113070005151,
"train_speed(iter/s)": 0.027736
},
{
"epoch": 0.6296463895533001,
"grad_norm": 1.0616440773010254,
"learning_rate": 4.658860346894613e-06,
"loss": 0.7973846912384033,
"memory(GiB)": 76.04,
"step": 2435,
"token_acc": 0.8036959869553402,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.6309392979507402,
"grad_norm": 1.0026406049728394,
"learning_rate": 4.6570611702559854e-06,
"loss": 0.8205162048339844,
"memory(GiB)": 76.04,
"step": 2440,
"token_acc": 0.7975911152823401,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.6322322063481802,
"grad_norm": 0.9040783047676086,
"learning_rate": 4.655257610945526e-06,
"loss": 0.8040790557861328,
"memory(GiB)": 76.04,
"step": 2445,
"token_acc": 0.8114757319709177,
"train_speed(iter/s)": 0.027735
},
{
"epoch": 0.6335251147456202,
"grad_norm": 1.0662907361984253,
"learning_rate": 4.653449672627669e-06,
"loss": 0.7849656105041504,
"memory(GiB)": 76.04,
"step": 2450,
"token_acc": 0.8061563270726617,
"train_speed(iter/s)": 0.027735
},
{
"epoch": 0.6348180231430604,
"grad_norm": 1.0695264339447021,
"learning_rate": 4.6516373589757445e-06,
"loss": 0.7940691947937012,
"memory(GiB)": 76.04,
"step": 2455,
"token_acc": 0.7807667525773195,
"train_speed(iter/s)": 0.027737
},
{
"epoch": 0.6361109315405004,
"grad_norm": 1.1556239128112793,
"learning_rate": 4.649820673671976e-06,
"loss": 0.7685293197631836,
"memory(GiB)": 76.04,
"step": 2460,
"token_acc": 0.7840851495184997,
"train_speed(iter/s)": 0.027739
},
{
"epoch": 0.6374038399379404,
"grad_norm": 2.466895580291748,
"learning_rate": 4.647999620407463e-06,
"loss": 0.7619011878967286,
"memory(GiB)": 76.04,
"step": 2465,
"token_acc": 0.7804016362960208,
"train_speed(iter/s)": 0.02774
},
{
"epoch": 0.6386967483353805,
"grad_norm": 1.1291913986206055,
"learning_rate": 4.646174202882186e-06,
"loss": 0.8165172576904297,
"memory(GiB)": 76.04,
"step": 2470,
"token_acc": 0.7608570606844981,
"train_speed(iter/s)": 0.027742
},
{
"epoch": 0.6399896567328205,
"grad_norm": 1.1947365999221802,
"learning_rate": 4.64434442480499e-06,
"loss": 0.7749819755554199,
"memory(GiB)": 76.04,
"step": 2475,
"token_acc": 0.7708522212148685,
"train_speed(iter/s)": 0.027742
},
{
"epoch": 0.6412825651302605,
"grad_norm": 1.0024884939193726,
"learning_rate": 4.64251028989358e-06,
"loss": 0.766645097732544,
"memory(GiB)": 76.04,
"step": 2480,
"token_acc": 0.7914130613587761,
"train_speed(iter/s)": 0.027743
},
{
"epoch": 0.6425754735277006,
"grad_norm": 0.9784958362579346,
"learning_rate": 4.640671801874512e-06,
"loss": 0.8136966705322266,
"memory(GiB)": 76.04,
"step": 2485,
"token_acc": 0.7942760819377771,
"train_speed(iter/s)": 0.027746
},
{
"epoch": 0.6438683819251406,
"grad_norm": 0.8597215414047241,
"learning_rate": 4.638828964483188e-06,
"loss": 0.775879955291748,
"memory(GiB)": 76.04,
"step": 2490,
"token_acc": 0.7876452918897741,
"train_speed(iter/s)": 0.027745
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.1758781671524048,
"learning_rate": 4.636981781463848e-06,
"loss": 0.8091221809387207,
"memory(GiB)": 76.04,
"step": 2495,
"token_acc": 0.8069754035357417,
"train_speed(iter/s)": 0.027745
},
{
"epoch": 0.6464541987200206,
"grad_norm": 0.9592023491859436,
"learning_rate": 4.635130256569558e-06,
"loss": 0.7946199417114258,
"memory(GiB)": 76.04,
"step": 2500,
"token_acc": 0.7830649234049717,
"train_speed(iter/s)": 0.027746
},
{
"epoch": 0.6477471071174608,
"grad_norm": 1.495296835899353,
"learning_rate": 4.633274393562208e-06,
"loss": 0.7667324542999268,
"memory(GiB)": 76.04,
"step": 2505,
"token_acc": 0.8036371800628649,
"train_speed(iter/s)": 0.027748
},
{
"epoch": 0.6490400155149008,
"grad_norm": 1.0845485925674438,
"learning_rate": 4.631414196212502e-06,
"loss": 0.774350357055664,
"memory(GiB)": 76.04,
"step": 2510,
"token_acc": 0.7877581120943953,
"train_speed(iter/s)": 0.02775
},
{
"epoch": 0.6503329239123408,
"grad_norm": 0.9458225965499878,
"learning_rate": 4.629549668299949e-06,
"loss": 0.7802841186523437,
"memory(GiB)": 76.04,
"step": 2515,
"token_acc": 0.7762283711761699,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6516258323097809,
"grad_norm": 1.0014280080795288,
"learning_rate": 4.62768081361286e-06,
"loss": 0.7994625568389893,
"memory(GiB)": 76.04,
"step": 2520,
"token_acc": 0.8127975163849603,
"train_speed(iter/s)": 0.027749
},
{
"epoch": 0.6529187407072209,
"grad_norm": 1.5184024572372437,
"learning_rate": 4.6258076359483335e-06,
"loss": 0.7841564655303955,
"memory(GiB)": 76.04,
"step": 2525,
"token_acc": 0.8111151834205178,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6542116491046609,
"grad_norm": 1.1411337852478027,
"learning_rate": 4.623930139112252e-06,
"loss": 0.7719697952270508,
"memory(GiB)": 76.04,
"step": 2530,
"token_acc": 0.7725351785631357,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.655504557502101,
"grad_norm": 1.3554903268814087,
"learning_rate": 4.622048326919277e-06,
"loss": 0.7868958950042725,
"memory(GiB)": 76.04,
"step": 2535,
"token_acc": 0.7877291008718654,
"train_speed(iter/s)": 0.027749
},
{
"epoch": 0.656797465899541,
"grad_norm": 1.3750821352005005,
"learning_rate": 4.620162203192833e-06,
"loss": 0.7791455268859864,
"memory(GiB)": 76.04,
"step": 2540,
"token_acc": 0.7791341738940311,
"train_speed(iter/s)": 0.02775
},
{
"epoch": 0.658090374296981,
"grad_norm": 1.1238117218017578,
"learning_rate": 4.618271771765108e-06,
"loss": 0.7734639644622803,
"memory(GiB)": 76.04,
"step": 2545,
"token_acc": 0.7830758898589657,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.6593832826944211,
"grad_norm": 1.0922011137008667,
"learning_rate": 4.616377036477039e-06,
"loss": 0.769841194152832,
"memory(GiB)": 76.04,
"step": 2550,
"token_acc": 0.7772533671002647,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.6606761910918612,
"grad_norm": 1.0475714206695557,
"learning_rate": 4.614478001178312e-06,
"loss": 0.7945080280303956,
"memory(GiB)": 76.04,
"step": 2555,
"token_acc": 0.7906106546310226,
"train_speed(iter/s)": 0.027752
},
{
"epoch": 0.6619690994893012,
"grad_norm": 1.1444096565246582,
"learning_rate": 4.612574669727346e-06,
"loss": 0.7711798667907714,
"memory(GiB)": 76.04,
"step": 2560,
"token_acc": 0.7995795091578054,
"train_speed(iter/s)": 0.027752
},
{
"epoch": 0.6632620078867413,
"grad_norm": 1.4287755489349365,
"learning_rate": 4.6106670459912915e-06,
"loss": 0.794065284729004,
"memory(GiB)": 76.04,
"step": 2565,
"token_acc": 0.7696101905947706,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6645549162841813,
"grad_norm": 1.3806992769241333,
"learning_rate": 4.608755133846017e-06,
"loss": 0.8211702346801758,
"memory(GiB)": 76.04,
"step": 2570,
"token_acc": 0.80044866626941,
"train_speed(iter/s)": 0.027752
},
{
"epoch": 0.6658478246816213,
"grad_norm": 0.9568463563919067,
"learning_rate": 4.6068389371761055e-06,
"loss": 0.7481316566467285,
"memory(GiB)": 76.04,
"step": 2575,
"token_acc": 0.8280512901693842,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6671407330790613,
"grad_norm": 1.2518895864486694,
"learning_rate": 4.604918459874846e-06,
"loss": 0.7877891540527344,
"memory(GiB)": 76.04,
"step": 2580,
"token_acc": 0.8081138790035587,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6684336414765014,
"grad_norm": 1.919282078742981,
"learning_rate": 4.602993705844225e-06,
"loss": 0.7748439311981201,
"memory(GiB)": 76.04,
"step": 2585,
"token_acc": 0.8042981252857796,
"train_speed(iter/s)": 0.02775
},
{
"epoch": 0.6697265498739414,
"grad_norm": 1.1794474124908447,
"learning_rate": 4.601064678994916e-06,
"loss": 0.7562169075012207,
"memory(GiB)": 76.04,
"step": 2590,
"token_acc": 0.7905587888470463,
"train_speed(iter/s)": 0.027747
},
{
"epoch": 0.6710194582713814,
"grad_norm": 0.9287105798721313,
"learning_rate": 4.599131383246277e-06,
"loss": 0.7767970085144043,
"memory(GiB)": 76.04,
"step": 2595,
"token_acc": 0.7871586083297619,
"train_speed(iter/s)": 0.027748
},
{
"epoch": 0.6723123666688215,
"grad_norm": 1.4129362106323242,
"learning_rate": 4.5971938225263366e-06,
"loss": 0.7788604736328125,
"memory(GiB)": 76.04,
"step": 2600,
"token_acc": 0.810065880876619,
"train_speed(iter/s)": 0.027748
},
{
"epoch": 0.6736052750662616,
"grad_norm": 1.1094108819961548,
"learning_rate": 4.59525200077179e-06,
"loss": 0.7465203285217286,
"memory(GiB)": 76.04,
"step": 2605,
"token_acc": 0.8041896446078431,
"train_speed(iter/s)": 0.027748
},
{
"epoch": 0.6748981834637016,
"grad_norm": 1.05765962600708,
"learning_rate": 4.593305921927992e-06,
"loss": 0.7598991394042969,
"memory(GiB)": 76.04,
"step": 2610,
"token_acc": 0.8296476919196166,
"train_speed(iter/s)": 0.027747
},
{
"epoch": 0.6761910918611417,
"grad_norm": 1.0570799112319946,
"learning_rate": 4.591355589948943e-06,
"loss": 0.7356798648834229,
"memory(GiB)": 76.04,
"step": 2615,
"token_acc": 0.7747376064426695,
"train_speed(iter/s)": 0.027745
},
{
"epoch": 0.6774840002585817,
"grad_norm": 1.726942777633667,
"learning_rate": 4.589401008797288e-06,
"loss": 0.7580029487609863,
"memory(GiB)": 76.04,
"step": 2620,
"token_acc": 0.7843151506341535,
"train_speed(iter/s)": 0.027745
},
{
"epoch": 0.6787769086560217,
"grad_norm": 1.046608805656433,
"learning_rate": 4.587442182444303e-06,
"loss": 0.7981472969055176,
"memory(GiB)": 76.04,
"step": 2625,
"token_acc": 0.8134403515732291,
"train_speed(iter/s)": 0.027744
},
{
"epoch": 0.6800698170534618,
"grad_norm": 1.075890302658081,
"learning_rate": 4.585479114869892e-06,
"loss": 0.7996755599975586,
"memory(GiB)": 76.04,
"step": 2630,
"token_acc": 0.7618249365712214,
"train_speed(iter/s)": 0.027747
},
{
"epoch": 0.6813627254509018,
"grad_norm": 1.182303786277771,
"learning_rate": 4.583511810062573e-06,
"loss": 0.7393967628479003,
"memory(GiB)": 76.04,
"step": 2635,
"token_acc": 0.7840963855421687,
"train_speed(iter/s)": 0.027749
},
{
"epoch": 0.6826556338483418,
"grad_norm": 0.9905603528022766,
"learning_rate": 4.581540272019476e-06,
"loss": 0.7551537036895752,
"memory(GiB)": 76.04,
"step": 2640,
"token_acc": 0.804885036888475,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6839485422457818,
"grad_norm": 0.9618648290634155,
"learning_rate": 4.579564504746331e-06,
"loss": 0.7748908996582031,
"memory(GiB)": 76.04,
"step": 2645,
"token_acc": 0.8088857158547971,
"train_speed(iter/s)": 0.02775
},
{
"epoch": 0.685241450643222,
"grad_norm": 1.2999211549758911,
"learning_rate": 4.577584512257459e-06,
"loss": 0.7771445274353027,
"memory(GiB)": 76.04,
"step": 2650,
"token_acc": 0.8316016931592813,
"train_speed(iter/s)": 0.027747
},
{
"epoch": 0.686534359040662,
"grad_norm": 0.9438580274581909,
"learning_rate": 4.57560029857577e-06,
"loss": 0.7551321983337402,
"memory(GiB)": 76.04,
"step": 2655,
"token_acc": 0.7968830005120328,
"train_speed(iter/s)": 0.027747
},
{
"epoch": 0.687827267438102,
"grad_norm": 1.2633525133132935,
"learning_rate": 4.573611867732746e-06,
"loss": 0.750664758682251,
"memory(GiB)": 76.04,
"step": 2660,
"token_acc": 0.7704320666319625,
"train_speed(iter/s)": 0.027748
},
{
"epoch": 0.6891201758355421,
"grad_norm": 1.7194573879241943,
"learning_rate": 4.571619223768439e-06,
"loss": 0.7772263526916504,
"memory(GiB)": 76.04,
"step": 2665,
"token_acc": 0.7634119583104773,
"train_speed(iter/s)": 0.02775
},
{
"epoch": 0.6904130842329821,
"grad_norm": 1.2169469594955444,
"learning_rate": 4.569622370731463e-06,
"loss": 0.7446264743804931,
"memory(GiB)": 76.04,
"step": 2670,
"token_acc": 0.7990216722278014,
"train_speed(iter/s)": 0.027749
},
{
"epoch": 0.6917059926304221,
"grad_norm": 1.146213173866272,
"learning_rate": 4.56762131267898e-06,
"loss": 0.7797055244445801,
"memory(GiB)": 76.04,
"step": 2675,
"token_acc": 0.7709560205488034,
"train_speed(iter/s)": 0.02775
},
{
"epoch": 0.6929989010278622,
"grad_norm": 6.729126930236816,
"learning_rate": 4.565616053676701e-06,
"loss": 0.7762058258056641,
"memory(GiB)": 76.04,
"step": 2680,
"token_acc": 0.8343838296022604,
"train_speed(iter/s)": 0.02775
},
{
"epoch": 0.6942918094253022,
"grad_norm": 1.7651880979537964,
"learning_rate": 4.563606597798866e-06,
"loss": 0.8064382553100586,
"memory(GiB)": 76.04,
"step": 2685,
"token_acc": 0.7710679099225898,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6955847178227422,
"grad_norm": 1.7482510805130005,
"learning_rate": 4.561592949128249e-06,
"loss": 0.7633975505828857,
"memory(GiB)": 76.04,
"step": 2690,
"token_acc": 0.7979380661789789,
"train_speed(iter/s)": 0.027752
},
{
"epoch": 0.6968776262201823,
"grad_norm": 1.2659438848495483,
"learning_rate": 4.5595751117561365e-06,
"loss": 0.7893208503723145,
"memory(GiB)": 76.04,
"step": 2695,
"token_acc": 0.8003590821509897,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6981705346176224,
"grad_norm": 1.2541935443878174,
"learning_rate": 4.5575530897823296e-06,
"loss": 0.7760859489440918,
"memory(GiB)": 76.04,
"step": 2700,
"token_acc": 0.7648711490021314,
"train_speed(iter/s)": 0.027751
},
{
"epoch": 0.6994634430150624,
"grad_norm": 1.4929347038269043,
"learning_rate": 4.55552688731513e-06,
"loss": 0.7721807479858398,
"memory(GiB)": 76.04,
"step": 2705,
"token_acc": 0.7744538013073435,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7007563514125025,
"grad_norm": 1.3372719287872314,
"learning_rate": 4.553496508471333e-06,
"loss": 0.7598706245422363,
"memory(GiB)": 76.04,
"step": 2710,
"token_acc": 0.7882575476596692,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7020492598099425,
"grad_norm": 1.0163182020187378,
"learning_rate": 4.551461957376221e-06,
"loss": 0.7641387939453125,
"memory(GiB)": 76.04,
"step": 2715,
"token_acc": 0.8151145642243085,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7033421682073825,
"grad_norm": 2.0491156578063965,
"learning_rate": 4.5494232381635526e-06,
"loss": 0.7833964347839355,
"memory(GiB)": 76.04,
"step": 2720,
"token_acc": 0.795193260654113,
"train_speed(iter/s)": 0.027756
},
{
"epoch": 0.7046350766048225,
"grad_norm": 1.0847963094711304,
"learning_rate": 4.547380354975554e-06,
"loss": 0.774288558959961,
"memory(GiB)": 76.04,
"step": 2725,
"token_acc": 0.7972633104565412,
"train_speed(iter/s)": 0.027757
},
{
"epoch": 0.7059279850022626,
"grad_norm": 0.9379494190216064,
"learning_rate": 4.545333311962912e-06,
"loss": 0.7845103740692139,
"memory(GiB)": 76.04,
"step": 2730,
"token_acc": 0.7804776566530748,
"train_speed(iter/s)": 0.027756
},
{
"epoch": 0.7072208933997026,
"grad_norm": 0.9910460114479065,
"learning_rate": 4.543282113284767e-06,
"loss": 0.7749279022216797,
"memory(GiB)": 76.04,
"step": 2735,
"token_acc": 0.7755603122639134,
"train_speed(iter/s)": 0.027758
},
{
"epoch": 0.7085138017971426,
"grad_norm": 0.8512127995491028,
"learning_rate": 4.541226763108702e-06,
"loss": 0.750948715209961,
"memory(GiB)": 76.04,
"step": 2740,
"token_acc": 0.804368820418487,
"train_speed(iter/s)": 0.027757
},
{
"epoch": 0.7098067101945827,
"grad_norm": 2.1939456462860107,
"learning_rate": 4.5391672656107335e-06,
"loss": 0.7639683723449707,
"memory(GiB)": 76.04,
"step": 2745,
"token_acc": 0.8181778169014085,
"train_speed(iter/s)": 0.027758
},
{
"epoch": 0.7110996185920228,
"grad_norm": 1.079122543334961,
"learning_rate": 4.537103624975306e-06,
"loss": 0.7661020278930664,
"memory(GiB)": 76.04,
"step": 2750,
"token_acc": 0.7944695989650712,
"train_speed(iter/s)": 0.027758
},
{
"epoch": 0.7123925269894628,
"grad_norm": 1.3096694946289062,
"learning_rate": 4.53503584539528e-06,
"loss": 0.7214805603027343,
"memory(GiB)": 76.04,
"step": 2755,
"token_acc": 0.7952853160179271,
"train_speed(iter/s)": 0.027758
},
{
"epoch": 0.7136854353869029,
"grad_norm": 1.1697825193405151,
"learning_rate": 4.532963931071929e-06,
"loss": 0.7563837051391602,
"memory(GiB)": 76.04,
"step": 2760,
"token_acc": 0.7784021071115013,
"train_speed(iter/s)": 0.027758
},
{
"epoch": 0.7149783437843429,
"grad_norm": 0.9587258100509644,
"learning_rate": 4.530887886214925e-06,
"loss": 0.7307098388671875,
"memory(GiB)": 76.04,
"step": 2765,
"token_acc": 0.8118209311876937,
"train_speed(iter/s)": 0.027757
},
{
"epoch": 0.7162712521817829,
"grad_norm": 1.2170313596725464,
"learning_rate": 4.528807715042333e-06,
"loss": 0.7652310371398926,
"memory(GiB)": 76.04,
"step": 2770,
"token_acc": 0.8206977655821247,
"train_speed(iter/s)": 0.027758
},
{
"epoch": 0.717564160579223,
"grad_norm": 1.1587222814559937,
"learning_rate": 4.526723421780598e-06,
"loss": 0.757373857498169,
"memory(GiB)": 76.04,
"step": 2775,
"token_acc": 0.8355521801286633,
"train_speed(iter/s)": 0.027758
},
{
"epoch": 0.718857068976663,
"grad_norm": 1.151134967803955,
"learning_rate": 4.524635010664547e-06,
"loss": 0.7718755722045898,
"memory(GiB)": 76.04,
"step": 2780,
"token_acc": 0.8152306441780126,
"train_speed(iter/s)": 0.027757
},
{
"epoch": 0.720149977374103,
"grad_norm": 1.1560102701187134,
"learning_rate": 4.522542485937369e-06,
"loss": 0.7426802635192871,
"memory(GiB)": 76.04,
"step": 2785,
"token_acc": 0.806016436656846,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7214428857715431,
"grad_norm": 0.993427038192749,
"learning_rate": 4.520445851850612e-06,
"loss": 0.7491902828216552,
"memory(GiB)": 76.04,
"step": 2790,
"token_acc": 0.8148384523334663,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7227357941689831,
"grad_norm": 0.9622454047203064,
"learning_rate": 4.518345112664173e-06,
"loss": 0.731049919128418,
"memory(GiB)": 76.04,
"step": 2795,
"token_acc": 0.8307215380677455,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7240287025664232,
"grad_norm": 1.0693981647491455,
"learning_rate": 4.516240272646291e-06,
"loss": 0.7997897148132325,
"memory(GiB)": 76.04,
"step": 2800,
"token_acc": 0.7474579404695877,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7253216109638632,
"grad_norm": 0.9485954642295837,
"learning_rate": 4.514131336073534e-06,
"loss": 0.76673583984375,
"memory(GiB)": 76.04,
"step": 2805,
"token_acc": 0.7821131082858396,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7266145193613033,
"grad_norm": 1.123063564300537,
"learning_rate": 4.512018307230798e-06,
"loss": 0.7704802036285401,
"memory(GiB)": 76.04,
"step": 2810,
"token_acc": 0.7895082445644244,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7279074277587433,
"grad_norm": 1.4126653671264648,
"learning_rate": 4.509901190411289e-06,
"loss": 0.7815113544464112,
"memory(GiB)": 76.04,
"step": 2815,
"token_acc": 0.8011522700531505,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7292003361561833,
"grad_norm": 1.1078338623046875,
"learning_rate": 4.5077799899165206e-06,
"loss": 0.7516324996948243,
"memory(GiB)": 76.04,
"step": 2820,
"token_acc": 0.7875029811590747,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7304932445536234,
"grad_norm": 1.1581236124038696,
"learning_rate": 4.505654710056305e-06,
"loss": 0.7554468154907227,
"memory(GiB)": 76.04,
"step": 2825,
"token_acc": 0.7982930298719773,
"train_speed(iter/s)": 0.027752
},
{
"epoch": 0.7317861529510634,
"grad_norm": 0.9877261519432068,
"learning_rate": 4.50352535514874e-06,
"loss": 0.7270550727844238,
"memory(GiB)": 76.04,
"step": 2830,
"token_acc": 0.8090806830964311,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7330790613485034,
"grad_norm": 1.0771080255508423,
"learning_rate": 4.501391929520206e-06,
"loss": 0.7520308494567871,
"memory(GiB)": 76.04,
"step": 2835,
"token_acc": 0.7689856611789697,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7343719697459435,
"grad_norm": 1.3513661623001099,
"learning_rate": 4.499254437505351e-06,
"loss": 0.7171365737915039,
"memory(GiB)": 76.04,
"step": 2840,
"token_acc": 0.813343427029162,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7356648781433835,
"grad_norm": 1.1246927976608276,
"learning_rate": 4.497112883447088e-06,
"loss": 0.7306987762451171,
"memory(GiB)": 76.04,
"step": 2845,
"token_acc": 0.8194618966664203,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7369577865408236,
"grad_norm": 1.2061104774475098,
"learning_rate": 4.494967271696581e-06,
"loss": 0.787189531326294,
"memory(GiB)": 76.04,
"step": 2850,
"token_acc": 0.7943105778422388,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7382506949382637,
"grad_norm": 1.228200078010559,
"learning_rate": 4.492817606613239e-06,
"loss": 0.736682653427124,
"memory(GiB)": 76.04,
"step": 2855,
"token_acc": 0.8220771643206185,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7395436033357037,
"grad_norm": 1.1733845472335815,
"learning_rate": 4.4906638925647075e-06,
"loss": 0.7503646850585938,
"memory(GiB)": 76.04,
"step": 2860,
"token_acc": 0.7979779479101798,
"train_speed(iter/s)": 0.027756
},
{
"epoch": 0.7408365117331437,
"grad_norm": 1.2325780391693115,
"learning_rate": 4.488506133926857e-06,
"loss": 0.7381996154785156,
"memory(GiB)": 76.04,
"step": 2865,
"token_acc": 0.7863309352517985,
"train_speed(iter/s)": 0.027757
},
{
"epoch": 0.7421294201305837,
"grad_norm": 1.1675026416778564,
"learning_rate": 4.486344335083775e-06,
"loss": 0.7488877296447753,
"memory(GiB)": 76.04,
"step": 2870,
"token_acc": 0.797289709130386,
"train_speed(iter/s)": 0.027757
},
{
"epoch": 0.7434223285280238,
"grad_norm": 1.6887255907058716,
"learning_rate": 4.484178500427762e-06,
"loss": 0.7432705402374268,
"memory(GiB)": 76.04,
"step": 2875,
"token_acc": 0.805889321374175,
"train_speed(iter/s)": 0.027756
},
{
"epoch": 0.7447152369254638,
"grad_norm": 1.2886244058609009,
"learning_rate": 4.482008634359316e-06,
"loss": 0.7218676567077636,
"memory(GiB)": 76.04,
"step": 2880,
"token_acc": 0.8163206292290787,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7460081453229038,
"grad_norm": 1.7008750438690186,
"learning_rate": 4.4798347412871226e-06,
"loss": 0.7312119960784912,
"memory(GiB)": 76.04,
"step": 2885,
"token_acc": 0.8356855218094915,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7473010537203439,
"grad_norm": 1.5202350616455078,
"learning_rate": 4.477656825628054e-06,
"loss": 0.7271114349365234,
"memory(GiB)": 76.04,
"step": 2890,
"token_acc": 0.8097763430943048,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.748593962117784,
"grad_norm": 1.6034634113311768,
"learning_rate": 4.475474891807153e-06,
"loss": 0.6789961814880371,
"memory(GiB)": 76.04,
"step": 2895,
"token_acc": 0.78770261615017,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.749886870515224,
"grad_norm": 1.1834633350372314,
"learning_rate": 4.473288944257627e-06,
"loss": 0.712617301940918,
"memory(GiB)": 76.04,
"step": 2900,
"token_acc": 0.7983367123174314,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7511797789126641,
"grad_norm": 2.4413537979125977,
"learning_rate": 4.471098987420841e-06,
"loss": 0.7433537483215332,
"memory(GiB)": 76.04,
"step": 2905,
"token_acc": 0.8024606971975393,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7524726873101041,
"grad_norm": 1.2915472984313965,
"learning_rate": 4.468905025746301e-06,
"loss": 0.7077127456665039,
"memory(GiB)": 76.04,
"step": 2910,
"token_acc": 0.8141985793699815,
"train_speed(iter/s)": 0.027756
},
{
"epoch": 0.7537655957075441,
"grad_norm": 1.2215969562530518,
"learning_rate": 4.466707063691653e-06,
"loss": 0.7059410095214844,
"memory(GiB)": 76.04,
"step": 2915,
"token_acc": 0.7902067464635474,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7550585041049842,
"grad_norm": 1.0937923192977905,
"learning_rate": 4.464505105722672e-06,
"loss": 0.7048573017120361,
"memory(GiB)": 76.04,
"step": 2920,
"token_acc": 0.7998023436397007,
"train_speed(iter/s)": 0.027752
},
{
"epoch": 0.7563514125024242,
"grad_norm": 1.2312453985214233,
"learning_rate": 4.4622991563132475e-06,
"loss": 0.6955265045166016,
"memory(GiB)": 76.04,
"step": 2925,
"token_acc": 0.808813281410125,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7576443208998642,
"grad_norm": 1.7371655702590942,
"learning_rate": 4.460089219945383e-06,
"loss": 0.6832226276397705,
"memory(GiB)": 76.04,
"step": 2930,
"token_acc": 0.8051185818094706,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7589372292973043,
"grad_norm": 1.2064937353134155,
"learning_rate": 4.457875301109181e-06,
"loss": 0.6924856662750244,
"memory(GiB)": 76.04,
"step": 2935,
"token_acc": 0.8090518665345227,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7602301376947443,
"grad_norm": 1.9841270446777344,
"learning_rate": 4.455657404302836e-06,
"loss": 0.6858362197875977,
"memory(GiB)": 76.04,
"step": 2940,
"token_acc": 0.8241852487135506,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7615230460921844,
"grad_norm": 1.5064440965652466,
"learning_rate": 4.4534355340326226e-06,
"loss": 0.6784512519836425,
"memory(GiB)": 76.04,
"step": 2945,
"token_acc": 0.8192387024189012,
"train_speed(iter/s)": 0.027752
},
{
"epoch": 0.7628159544896244,
"grad_norm": 1.823947548866272,
"learning_rate": 4.451209694812893e-06,
"loss": 0.6957567214965821,
"memory(GiB)": 76.04,
"step": 2950,
"token_acc": 0.8207297541953903,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7641088628870645,
"grad_norm": 1.7657442092895508,
"learning_rate": 4.448979891166059e-06,
"loss": 0.7199502944946289,
"memory(GiB)": 76.04,
"step": 2955,
"token_acc": 0.8217210270645385,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7654017712845045,
"grad_norm": 1.6712024211883545,
"learning_rate": 4.44674612762259e-06,
"loss": 0.700252914428711,
"memory(GiB)": 76.04,
"step": 2960,
"token_acc": 0.8259719184364637,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7666946796819445,
"grad_norm": 1.6742796897888184,
"learning_rate": 4.444508408720999e-06,
"loss": 0.7040081977844238,
"memory(GiB)": 76.04,
"step": 2965,
"token_acc": 0.8206386483928634,
"train_speed(iter/s)": 0.027753
},
{
"epoch": 0.7679875880793846,
"grad_norm": 1.0268195867538452,
"learning_rate": 4.442266739007838e-06,
"loss": 0.725772476196289,
"memory(GiB)": 76.04,
"step": 2970,
"token_acc": 0.7764441447516296,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7692804964768246,
"grad_norm": 1.271381139755249,
"learning_rate": 4.440021123037683e-06,
"loss": 0.7173772335052491,
"memory(GiB)": 76.04,
"step": 2975,
"token_acc": 0.8247627142654766,
"train_speed(iter/s)": 0.027754
},
{
"epoch": 0.7705734048742646,
"grad_norm": 1.742287039756775,
"learning_rate": 4.437771565373131e-06,
"loss": 0.6777096748352051,
"memory(GiB)": 76.04,
"step": 2980,
"token_acc": 0.8065741348588578,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7718663132717047,
"grad_norm": 1.113531470298767,
"learning_rate": 4.4355180705847854e-06,
"loss": 0.6992631912231445,
"memory(GiB)": 76.04,
"step": 2985,
"token_acc": 0.8095548168203159,
"train_speed(iter/s)": 0.027755
},
{
"epoch": 0.7731592216691447,
"grad_norm": 1.5363075733184814,
"learning_rate": 4.43326064325125e-06,
"loss": 0.6818428993225097,
"memory(GiB)": 76.04,
"step": 2990,
"token_acc": 0.8093805660003958,
"train_speed(iter/s)": 0.027756
},
{
"epoch": 0.7744521300665848,
"grad_norm": 1.2914507389068604,
"learning_rate": 4.43099928795912e-06,
"loss": 0.6791769027709961,
"memory(GiB)": 76.04,
"step": 2995,
"token_acc": 0.7912520619379556,
"train_speed(iter/s)": 0.027756
},
{
"epoch": 0.7757450384640249,
"grad_norm": 1.2839219570159912,
"learning_rate": 4.428734009302968e-06,
"loss": 0.6807722091674805,
"memory(GiB)": 76.04,
"step": 3000,
"token_acc": 0.8037732367729139,
"train_speed(iter/s)": 0.027757
},
{
"epoch": 0.7770379468614649,
"grad_norm": 1.6240931749343872,
"learning_rate": 4.42646481188534e-06,
"loss": 0.6738556861877442,
"memory(GiB)": 76.04,
"step": 3005,
"token_acc": 0.8336527405136067,
"train_speed(iter/s)": 0.02771
},
{
"epoch": 0.7783308552589049,
"grad_norm": 2.119504690170288,
"learning_rate": 4.424191700316745e-06,
"loss": 0.7054489135742188,
"memory(GiB)": 76.04,
"step": 3010,
"token_acc": 0.8144756176741961,
"train_speed(iter/s)": 0.02771
},
{
"epoch": 0.7796237636563449,
"grad_norm": 1.3605269193649292,
"learning_rate": 4.421914679215643e-06,
"loss": 0.6763367652893066,
"memory(GiB)": 76.04,
"step": 3015,
"token_acc": 0.832611100866679,
"train_speed(iter/s)": 0.02771
},
{
"epoch": 0.780916672053785,
"grad_norm": 1.5182582139968872,
"learning_rate": 4.419633753208438e-06,
"loss": 0.6742976188659668,
"memory(GiB)": 76.04,
"step": 3020,
"token_acc": 0.8088533082175653,
"train_speed(iter/s)": 0.027711
},
{
"epoch": 0.782209580451225,
"grad_norm": 1.180389404296875,
"learning_rate": 4.417348926929467e-06,
"loss": 0.6577554702758789,
"memory(GiB)": 76.04,
"step": 3025,
"token_acc": 0.7916683734076106,
"train_speed(iter/s)": 0.027712
},
{
"epoch": 0.783502488848665,
"grad_norm": 1.0676547288894653,
"learning_rate": 4.4150602050209935e-06,
"loss": 0.6725570678710937,
"memory(GiB)": 76.04,
"step": 3030,
"token_acc": 0.8131301520575388,
"train_speed(iter/s)": 0.027712
},
{
"epoch": 0.7847953972461051,
"grad_norm": 1.066395878791809,
"learning_rate": 4.412767592133195e-06,
"loss": 0.6555842399597168,
"memory(GiB)": 76.04,
"step": 3035,
"token_acc": 0.8660530809527944,
"train_speed(iter/s)": 0.027711
},
{
"epoch": 0.7860883056435451,
"grad_norm": 5.532017230987549,
"learning_rate": 4.410471092924154e-06,
"loss": 0.6637729167938232,
"memory(GiB)": 76.04,
"step": 3040,
"token_acc": 0.814479006834984,
"train_speed(iter/s)": 0.02771
},
{
"epoch": 0.7873812140409852,
"grad_norm": 1.9172098636627197,
"learning_rate": 4.408170712059848e-06,
"loss": 0.706690502166748,
"memory(GiB)": 76.04,
"step": 3045,
"token_acc": 0.7951164898437917,
"train_speed(iter/s)": 0.027708
},
{
"epoch": 0.7886741224384253,
"grad_norm": 2.5375490188598633,
"learning_rate": 4.405866454214145e-06,
"loss": 0.6923388481140137,
"memory(GiB)": 76.04,
"step": 3050,
"token_acc": 0.7856790394210209,
"train_speed(iter/s)": 0.027708
},
{
"epoch": 0.7899670308358653,
"grad_norm": 1.3066571950912476,
"learning_rate": 4.403558324068787e-06,
"loss": 0.6584675788879395,
"memory(GiB)": 76.04,
"step": 3055,
"token_acc": 0.8082852648138438,
"train_speed(iter/s)": 0.027709
},
{
"epoch": 0.7912599392333053,
"grad_norm": 1.8484247922897339,
"learning_rate": 4.401246326313386e-06,
"loss": 0.6835250854492188,
"memory(GiB)": 76.04,
"step": 3060,
"token_acc": 0.8032010726107177,
"train_speed(iter/s)": 0.02771
},
{
"epoch": 0.7925528476307454,
"grad_norm": 1.7470216751098633,
"learning_rate": 4.398930465645409e-06,
"loss": 0.6875529289245605,
"memory(GiB)": 76.04,
"step": 3065,
"token_acc": 0.8029278650053081,
"train_speed(iter/s)": 0.02771
},
{
"epoch": 0.7938457560281854,
"grad_norm": 1.1840174198150635,
"learning_rate": 4.396610746770173e-06,
"loss": 0.6479888916015625,
"memory(GiB)": 76.04,
"step": 3070,
"token_acc": 0.8107086371176175,
"train_speed(iter/s)": 0.02771
},
{
"epoch": 0.7951386644256254,
"grad_norm": 1.2682684659957886,
"learning_rate": 4.394287174400838e-06,
"loss": 0.6412975788116455,
"memory(GiB)": 76.04,
"step": 3075,
"token_acc": 0.829871190130624,
"train_speed(iter/s)": 0.027708
},
{
"epoch": 0.7964315728230655,
"grad_norm": 1.5862990617752075,
"learning_rate": 4.3919597532583845e-06,
"loss": 0.680488395690918,
"memory(GiB)": 76.04,
"step": 3080,
"token_acc": 0.82756076566791,
"train_speed(iter/s)": 0.027709
},
{
"epoch": 0.7977244812205055,
"grad_norm": 1.4510713815689087,
"learning_rate": 4.389628488071622e-06,
"loss": 0.644444751739502,
"memory(GiB)": 76.04,
"step": 3085,
"token_acc": 0.800807537012113,
"train_speed(iter/s)": 0.027708
},
{
"epoch": 0.7990173896179456,
"grad_norm": 1.2568798065185547,
"learning_rate": 4.387293383577165e-06,
"loss": 0.6682034015655518,
"memory(GiB)": 76.04,
"step": 3090,
"token_acc": 0.8065676636686886,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8003102980153856,
"grad_norm": 1.0545753240585327,
"learning_rate": 4.38495444451943e-06,
"loss": 0.6688919544219971,
"memory(GiB)": 76.04,
"step": 3095,
"token_acc": 0.7727210465036641,
"train_speed(iter/s)": 0.027708
},
{
"epoch": 0.8016032064128257,
"grad_norm": 1.586976170539856,
"learning_rate": 4.382611675650626e-06,
"loss": 0.6349334716796875,
"memory(GiB)": 76.04,
"step": 3100,
"token_acc": 0.806030889924001,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8028961148102657,
"grad_norm": 2.7589170932769775,
"learning_rate": 4.380265081730739e-06,
"loss": 0.6485045433044434,
"memory(GiB)": 76.04,
"step": 3105,
"token_acc": 0.8114932360204947,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8041890232077057,
"grad_norm": 1.262620210647583,
"learning_rate": 4.377914667527532e-06,
"loss": 0.6574973106384278,
"memory(GiB)": 76.04,
"step": 3110,
"token_acc": 0.8018425922280404,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8054819316051458,
"grad_norm": 1.670192003250122,
"learning_rate": 4.375560437816527e-06,
"loss": 0.6576096534729003,
"memory(GiB)": 76.04,
"step": 3115,
"token_acc": 0.8066886816886817,
"train_speed(iter/s)": 0.027706
},
{
"epoch": 0.8067748400025858,
"grad_norm": 1.9839909076690674,
"learning_rate": 4.373202397380998e-06,
"loss": 0.6304091930389404,
"memory(GiB)": 76.04,
"step": 3120,
"token_acc": 0.8234421364985163,
"train_speed(iter/s)": 0.027704
},
{
"epoch": 0.8080677484000258,
"grad_norm": 1.1080540418624878,
"learning_rate": 4.370840551011963e-06,
"loss": 0.6576041221618653,
"memory(GiB)": 76.04,
"step": 3125,
"token_acc": 0.8217494089834515,
"train_speed(iter/s)": 0.027706
},
{
"epoch": 0.8093606567974659,
"grad_norm": 1.1593878269195557,
"learning_rate": 4.3684749035081705e-06,
"loss": 0.6419290542602539,
"memory(GiB)": 76.04,
"step": 3130,
"token_acc": 0.7999515151515152,
"train_speed(iter/s)": 0.027706
},
{
"epoch": 0.810653565194906,
"grad_norm": 1.1493967771530151,
"learning_rate": 4.366105459676097e-06,
"loss": 0.646766471862793,
"memory(GiB)": 76.04,
"step": 3135,
"token_acc": 0.8102760440126118,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.811946473592346,
"grad_norm": 1.3651187419891357,
"learning_rate": 4.3637322243299255e-06,
"loss": 0.6666352272033691,
"memory(GiB)": 76.04,
"step": 3140,
"token_acc": 0.8152125937913786,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8132393819897861,
"grad_norm": 1.128293514251709,
"learning_rate": 4.361355202291548e-06,
"loss": 0.6353740692138672,
"memory(GiB)": 76.04,
"step": 3145,
"token_acc": 0.8045256453234998,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8145322903872261,
"grad_norm": 1.6029019355773926,
"learning_rate": 4.358974398390548e-06,
"loss": 0.6691800117492676,
"memory(GiB)": 76.04,
"step": 3150,
"token_acc": 0.8306377243385117,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8158251987846661,
"grad_norm": 1.3377734422683716,
"learning_rate": 4.356589817464193e-06,
"loss": 0.6470844745635986,
"memory(GiB)": 76.04,
"step": 3155,
"token_acc": 0.8250958558747833,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8171181071821062,
"grad_norm": 19.943740844726562,
"learning_rate": 4.354201464357424e-06,
"loss": 0.6441401481628418,
"memory(GiB)": 76.04,
"step": 3160,
"token_acc": 0.8164092901323066,
"train_speed(iter/s)": 0.027705
},
{
"epoch": 0.8184110155795462,
"grad_norm": 2.096036672592163,
"learning_rate": 4.3518093439228484e-06,
"loss": 0.6595673561096191,
"memory(GiB)": 76.04,
"step": 3165,
"token_acc": 0.8180080986396105,
"train_speed(iter/s)": 0.027705
},
{
"epoch": 0.8197039239769862,
"grad_norm": 1.3042539358139038,
"learning_rate": 4.349413461020725e-06,
"loss": 0.6536635398864746,
"memory(GiB)": 76.04,
"step": 3170,
"token_acc": 0.7721032106415942,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8209968323744262,
"grad_norm": 1.1923153400421143,
"learning_rate": 4.347013820518959e-06,
"loss": 0.6662230491638184,
"memory(GiB)": 76.04,
"step": 3175,
"token_acc": 0.7864515044545302,
"train_speed(iter/s)": 0.027706
},
{
"epoch": 0.8222897407718663,
"grad_norm": 0.9587339162826538,
"learning_rate": 4.344610427293091e-06,
"loss": 0.637930154800415,
"memory(GiB)": 76.04,
"step": 3180,
"token_acc": 0.8349569816879248,
"train_speed(iter/s)": 0.027703
},
{
"epoch": 0.8235826491693063,
"grad_norm": 1.4377241134643555,
"learning_rate": 4.342203286226284e-06,
"loss": 0.6546686172485352,
"memory(GiB)": 76.04,
"step": 3185,
"token_acc": 0.8348570317058938,
"train_speed(iter/s)": 0.027704
},
{
"epoch": 0.8248755575667464,
"grad_norm": 1.3020581007003784,
"learning_rate": 4.339792402209318e-06,
"loss": 0.6620816707611084,
"memory(GiB)": 76.04,
"step": 3190,
"token_acc": 0.8184861571423789,
"train_speed(iter/s)": 0.027705
},
{
"epoch": 0.8261684659641865,
"grad_norm": 1.6828721761703491,
"learning_rate": 4.337377780140575e-06,
"loss": 0.6277073860168457,
"memory(GiB)": 76.04,
"step": 3195,
"token_acc": 0.8200602270094973,
"train_speed(iter/s)": 0.027704
},
{
"epoch": 0.8274613743616265,
"grad_norm": 1.6351348161697388,
"learning_rate": 4.334959424926036e-06,
"loss": 0.6136197566986084,
"memory(GiB)": 76.04,
"step": 3200,
"token_acc": 0.805330584597261,
"train_speed(iter/s)": 0.027705
},
{
"epoch": 0.8287542827590665,
"grad_norm": 1.1907836198806763,
"learning_rate": 4.3325373414792625e-06,
"loss": 0.647891902923584,
"memory(GiB)": 76.04,
"step": 3205,
"token_acc": 0.8155163083583411,
"train_speed(iter/s)": 0.027705
},
{
"epoch": 0.8300471911565066,
"grad_norm": 1.5931166410446167,
"learning_rate": 4.330111534721394e-06,
"loss": 0.6463868141174316,
"memory(GiB)": 76.04,
"step": 3210,
"token_acc": 0.8210717829970228,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8313400995539466,
"grad_norm": 1.8650577068328857,
"learning_rate": 4.327682009581134e-06,
"loss": 0.6347787380218506,
"memory(GiB)": 76.04,
"step": 3215,
"token_acc": 0.8082674179217684,
"train_speed(iter/s)": 0.027707
},
{
"epoch": 0.8326330079513866,
"grad_norm": 2.3283803462982178,
"learning_rate": 4.325248770994741e-06,
"loss": 0.6708244800567627,
"memory(GiB)": 76.04,
"step": 3220,
"token_acc": 0.7994605743296843,
"train_speed(iter/s)": 0.027706
},
{
"epoch": 0.8339259163488267,
"grad_norm": 1.4204248189926147,
"learning_rate": 4.322811823906018e-06,
"loss": 0.6237285137176514,
"memory(GiB)": 76.04,
"step": 3225,
"token_acc": 0.8479689603294006,
"train_speed(iter/s)": 0.027704
},
{
"epoch": 0.8352188247462667,
"grad_norm": 2.9352431297302246,
"learning_rate": 4.3203711732663035e-06,
"loss": 0.6458423614501954,
"memory(GiB)": 76.04,
"step": 3230,
"token_acc": 0.8232549095858448,
"train_speed(iter/s)": 0.027703
},
{
"epoch": 0.8365117331437067,
"grad_norm": 1.1581929922103882,
"learning_rate": 4.31792682403446e-06,
"loss": 0.6388854026794434,
"memory(GiB)": 76.04,
"step": 3235,
"token_acc": 0.7934440464560947,
"train_speed(iter/s)": 0.027701
},
{
"epoch": 0.8378046415411468,
"grad_norm": 1.1175485849380493,
"learning_rate": 4.315478781176867e-06,
"loss": 0.6145687103271484,
"memory(GiB)": 76.04,
"step": 3240,
"token_acc": 0.8398847580708817,
"train_speed(iter/s)": 0.027701
},
{
"epoch": 0.8390975499385869,
"grad_norm": 1.2444353103637695,
"learning_rate": 4.313027049667405e-06,
"loss": 0.6328566074371338,
"memory(GiB)": 76.04,
"step": 3245,
"token_acc": 0.8066215947504474,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 0.8403904583360269,
"grad_norm": 1.141342043876648,
"learning_rate": 4.310571634487451e-06,
"loss": 0.629487419128418,
"memory(GiB)": 76.04,
"step": 3250,
"token_acc": 0.8341686379856461,
"train_speed(iter/s)": 0.0277
},
{
"epoch": 0.8416833667334669,
"grad_norm": 1.3321287631988525,
"learning_rate": 4.3081125406258655e-06,
"loss": 0.6453184604644775,
"memory(GiB)": 76.04,
"step": 3255,
"token_acc": 0.7997035782341732,
"train_speed(iter/s)": 0.0277
},
{
"epoch": 0.842976275130907,
"grad_norm": 1.0039650201797485,
"learning_rate": 4.305649773078987e-06,
"loss": 0.666410255432129,
"memory(GiB)": 76.04,
"step": 3260,
"token_acc": 0.8168428282519937,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 0.844269183528347,
"grad_norm": 1.2001808881759644,
"learning_rate": 4.303183336850612e-06,
"loss": 0.660033893585205,
"memory(GiB)": 76.04,
"step": 3265,
"token_acc": 0.8161585530947095,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 0.845562091925787,
"grad_norm": 1.2713844776153564,
"learning_rate": 4.300713236951996e-06,
"loss": 0.6356592655181885,
"memory(GiB)": 76.04,
"step": 3270,
"token_acc": 0.8038586795618277,
"train_speed(iter/s)": 0.027701
},
{
"epoch": 0.8468550003232271,
"grad_norm": 1.3221766948699951,
"learning_rate": 4.298239478401836e-06,
"loss": 0.6444936275482178,
"memory(GiB)": 76.04,
"step": 3275,
"token_acc": 0.8171707402848603,
"train_speed(iter/s)": 0.027701
},
{
"epoch": 0.8481479087206671,
"grad_norm": 1.2477511167526245,
"learning_rate": 4.295762066226262e-06,
"loss": 0.611814022064209,
"memory(GiB)": 76.04,
"step": 3280,
"token_acc": 0.8180022127390548,
"train_speed(iter/s)": 0.027701
},
{
"epoch": 0.8494408171181071,
"grad_norm": 1.2124427556991577,
"learning_rate": 4.293281005458831e-06,
"loss": 0.6272024631500244,
"memory(GiB)": 76.04,
"step": 3285,
"token_acc": 0.8499176225558768,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 0.8507337255155473,
"grad_norm": 4.538881778717041,
"learning_rate": 4.290796301140506e-06,
"loss": 0.6252808094024658,
"memory(GiB)": 76.04,
"step": 3290,
"token_acc": 0.8332118523213436,
"train_speed(iter/s)": 0.027698
},
{
"epoch": 0.8520266339129873,
"grad_norm": 1.4854230880737305,
"learning_rate": 4.288307958319662e-06,
"loss": 0.6353150367736816,
"memory(GiB)": 76.04,
"step": 3295,
"token_acc": 0.7681834998150203,
"train_speed(iter/s)": 0.027696
},
{
"epoch": 0.8533195423104273,
"grad_norm": 1.1235853433609009,
"learning_rate": 4.285815982052058e-06,
"loss": 0.6190371036529541,
"memory(GiB)": 76.04,
"step": 3300,
"token_acc": 0.8698982508288556,
"train_speed(iter/s)": 0.027695
},
{
"epoch": 0.8546124507078674,
"grad_norm": 1.0786458253860474,
"learning_rate": 4.283320377400842e-06,
"loss": 0.6302780151367188,
"memory(GiB)": 76.04,
"step": 3305,
"token_acc": 0.792910447761194,
"train_speed(iter/s)": 0.027693
},
{
"epoch": 0.8559053591053074,
"grad_norm": 1.0524226427078247,
"learning_rate": 4.280821149436531e-06,
"loss": 0.629145622253418,
"memory(GiB)": 76.04,
"step": 3310,
"token_acc": 0.8330289590399165,
"train_speed(iter/s)": 0.027692
},
{
"epoch": 0.8571982675027474,
"grad_norm": 1.4898467063903809,
"learning_rate": 4.278318303237003e-06,
"loss": 0.6266490459442139,
"memory(GiB)": 76.04,
"step": 3315,
"token_acc": 0.8104506584124652,
"train_speed(iter/s)": 0.027692
},
{
"epoch": 0.8584911759001874,
"grad_norm": 1.1593666076660156,
"learning_rate": 4.275811843887491e-06,
"loss": 0.6542300224304199,
"memory(GiB)": 76.04,
"step": 3320,
"token_acc": 0.8437677735485847,
"train_speed(iter/s)": 0.027692
},
{
"epoch": 0.8597840842976275,
"grad_norm": 1.2023606300354004,
"learning_rate": 4.273301776480564e-06,
"loss": 0.6109468936920166,
"memory(GiB)": 76.04,
"step": 3325,
"token_acc": 0.8550740689464211,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8610769926950675,
"grad_norm": 1.4408375024795532,
"learning_rate": 4.270788106116125e-06,
"loss": 0.6247062683105469,
"memory(GiB)": 76.04,
"step": 3330,
"token_acc": 0.8023508574188873,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8623699010925076,
"grad_norm": 1.197199821472168,
"learning_rate": 4.268270837901399e-06,
"loss": 0.638817024230957,
"memory(GiB)": 76.04,
"step": 3335,
"token_acc": 0.8134350688210652,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8636628094899477,
"grad_norm": 1.215605616569519,
"learning_rate": 4.265749976950917e-06,
"loss": 0.6219228744506836,
"memory(GiB)": 76.04,
"step": 3340,
"token_acc": 0.830684302174799,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8649557178873877,
"grad_norm": 1.1016297340393066,
"learning_rate": 4.263225528386512e-06,
"loss": 0.6183833122253418,
"memory(GiB)": 76.04,
"step": 3345,
"token_acc": 0.8504693786320966,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8662486262848277,
"grad_norm": 1.1827194690704346,
"learning_rate": 4.260697497337306e-06,
"loss": 0.6260892868041992,
"memory(GiB)": 76.04,
"step": 3350,
"token_acc": 0.8206773446545735,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8675415346822678,
"grad_norm": 1.1964573860168457,
"learning_rate": 4.2581658889397e-06,
"loss": 0.6217505931854248,
"memory(GiB)": 76.04,
"step": 3355,
"token_acc": 0.8112748538011696,
"train_speed(iter/s)": 0.027691
},
{
"epoch": 0.8688344430797078,
"grad_norm": 1.9450868368148804,
"learning_rate": 4.2556307083373635e-06,
"loss": 0.6057548522949219,
"memory(GiB)": 76.04,
"step": 3360,
"token_acc": 0.8432432432432433,
"train_speed(iter/s)": 0.027691
},
{
"epoch": 0.8701273514771478,
"grad_norm": 1.085252285003662,
"learning_rate": 4.253091960681222e-06,
"loss": 0.650747537612915,
"memory(GiB)": 76.04,
"step": 3365,
"token_acc": 0.8127441586201813,
"train_speed(iter/s)": 0.027692
},
{
"epoch": 0.8714202598745879,
"grad_norm": 1.4419254064559937,
"learning_rate": 4.250549651129451e-06,
"loss": 0.6490330696105957,
"memory(GiB)": 76.04,
"step": 3370,
"token_acc": 0.817296827466319,
"train_speed(iter/s)": 0.027693
},
{
"epoch": 0.8727131682720279,
"grad_norm": 0.9393129348754883,
"learning_rate": 4.248003784847462e-06,
"loss": 0.5855797290802002,
"memory(GiB)": 76.04,
"step": 3375,
"token_acc": 0.8437131244263799,
"train_speed(iter/s)": 0.027693
},
{
"epoch": 0.874006076669468,
"grad_norm": 1.4661402702331543,
"learning_rate": 4.245454367007893e-06,
"loss": 0.6375166416168213,
"memory(GiB)": 76.04,
"step": 3380,
"token_acc": 0.8220987966001851,
"train_speed(iter/s)": 0.027691
},
{
"epoch": 0.8752989850669081,
"grad_norm": 1.0783532857894897,
"learning_rate": 4.242901402790597e-06,
"loss": 0.5942583084106445,
"memory(GiB)": 76.04,
"step": 3385,
"token_acc": 0.8271346924848588,
"train_speed(iter/s)": 0.027691
},
{
"epoch": 0.8765918934643481,
"grad_norm": 1.002106785774231,
"learning_rate": 4.240344897382633e-06,
"loss": 0.6190349578857421,
"memory(GiB)": 76.04,
"step": 3390,
"token_acc": 0.8104156272786583,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8778848018617881,
"grad_norm": 1.0779801607131958,
"learning_rate": 4.237784855978258e-06,
"loss": 0.6126032829284668,
"memory(GiB)": 76.04,
"step": 3395,
"token_acc": 0.8469879143753689,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8791777102592281,
"grad_norm": 1.1293411254882812,
"learning_rate": 4.2352212837789086e-06,
"loss": 0.6498593330383301,
"memory(GiB)": 76.04,
"step": 3400,
"token_acc": 0.8174972974987885,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8804706186566682,
"grad_norm": 1.9169739484786987,
"learning_rate": 4.232654185993197e-06,
"loss": 0.6312263965606689,
"memory(GiB)": 76.04,
"step": 3405,
"token_acc": 0.8185292511864264,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8817635270541082,
"grad_norm": 2.342643976211548,
"learning_rate": 4.2300835678369005e-06,
"loss": 0.5902108192443848,
"memory(GiB)": 76.04,
"step": 3410,
"token_acc": 0.8093889113719142,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8830564354515482,
"grad_norm": 1.2619035243988037,
"learning_rate": 4.227509434532945e-06,
"loss": 0.6150105953216553,
"memory(GiB)": 76.04,
"step": 3415,
"token_acc": 0.8166153846153846,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8843493438489883,
"grad_norm": 1.1575284004211426,
"learning_rate": 4.224931791311403e-06,
"loss": 0.6235898017883301,
"memory(GiB)": 76.04,
"step": 3420,
"token_acc": 0.8300420709195501,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8856422522464283,
"grad_norm": 1.627013921737671,
"learning_rate": 4.2223506434094754e-06,
"loss": 0.601617431640625,
"memory(GiB)": 76.04,
"step": 3425,
"token_acc": 0.8208349821923229,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8869351606438683,
"grad_norm": 1.0518633127212524,
"learning_rate": 4.219765996071483e-06,
"loss": 0.6408526420593261,
"memory(GiB)": 76.04,
"step": 3430,
"token_acc": 0.8019846954820224,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8882280690413085,
"grad_norm": 0.9839140176773071,
"learning_rate": 4.217177854548862e-06,
"loss": 0.6014208793640137,
"memory(GiB)": 76.04,
"step": 3435,
"token_acc": 0.8176200504021818,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8895209774387485,
"grad_norm": 1.1011220216751099,
"learning_rate": 4.21458622410014e-06,
"loss": 0.6313972473144531,
"memory(GiB)": 76.04,
"step": 3440,
"token_acc": 0.8165993852079553,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.8908138858361885,
"grad_norm": 1.156149983406067,
"learning_rate": 4.211991109990941e-06,
"loss": 0.6519000053405761,
"memory(GiB)": 76.04,
"step": 3445,
"token_acc": 0.7982918203025058,
"train_speed(iter/s)": 0.027691
},
{
"epoch": 0.8921067942336286,
"grad_norm": 1.144892692565918,
"learning_rate": 4.2093925174939606e-06,
"loss": 0.6042433738708496,
"memory(GiB)": 76.04,
"step": 3450,
"token_acc": 0.8215976553693545,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8933997026310686,
"grad_norm": 1.2235312461853027,
"learning_rate": 4.206790451888968e-06,
"loss": 0.6446715354919433,
"memory(GiB)": 76.04,
"step": 3455,
"token_acc": 0.8082164853885467,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.8946926110285086,
"grad_norm": 1.1150991916656494,
"learning_rate": 4.204184918462783e-06,
"loss": 0.628176212310791,
"memory(GiB)": 76.04,
"step": 3460,
"token_acc": 0.803219563687544,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 0.8959855194259486,
"grad_norm": 1.0735828876495361,
"learning_rate": 4.201575922509277e-06,
"loss": 0.6142620086669922,
"memory(GiB)": 76.04,
"step": 3465,
"token_acc": 0.8073942988329826,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 0.8972784278233887,
"grad_norm": 0.9149400591850281,
"learning_rate": 4.198963469329351e-06,
"loss": 0.5981680870056152,
"memory(GiB)": 76.04,
"step": 3470,
"token_acc": 0.8314239727324371,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 0.8985713362208287,
"grad_norm": 0.9329715371131897,
"learning_rate": 4.196347564230933e-06,
"loss": 0.6357330322265625,
"memory(GiB)": 76.04,
"step": 3475,
"token_acc": 0.815760798500632,
"train_speed(iter/s)": 0.027685
},
{
"epoch": 0.8998642446182687,
"grad_norm": 2.6730282306671143,
"learning_rate": 4.193728212528965e-06,
"loss": 0.6184768676757812,
"memory(GiB)": 76.04,
"step": 3480,
"token_acc": 0.8195593938666986,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9011571530157089,
"grad_norm": 1.3464089632034302,
"learning_rate": 4.191105419545391e-06,
"loss": 0.6040889263153076,
"memory(GiB)": 76.04,
"step": 3485,
"token_acc": 0.8135844450257215,
"train_speed(iter/s)": 0.027687
},
{
"epoch": 0.9024500614131489,
"grad_norm": 1.3222928047180176,
"learning_rate": 4.188479190609146e-06,
"loss": 0.6070952415466309,
"memory(GiB)": 76.04,
"step": 3490,
"token_acc": 0.8631930567568373,
"train_speed(iter/s)": 0.027685
},
{
"epoch": 0.9037429698105889,
"grad_norm": 1.5099354982376099,
"learning_rate": 4.185849531056149e-06,
"loss": 0.6029548645019531,
"memory(GiB)": 76.04,
"step": 3495,
"token_acc": 0.8064048588584444,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.905035878208029,
"grad_norm": 0.861173689365387,
"learning_rate": 4.1832164462292865e-06,
"loss": 0.6235533714294433,
"memory(GiB)": 76.04,
"step": 3500,
"token_acc": 0.8324725253388218,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.906328786605469,
"grad_norm": 1.607367992401123,
"learning_rate": 4.1805799414784044e-06,
"loss": 0.6227012634277344,
"memory(GiB)": 76.04,
"step": 3505,
"token_acc": 0.7834105927606273,
"train_speed(iter/s)": 0.027685
},
{
"epoch": 0.907621695002909,
"grad_norm": 1.4191631078720093,
"learning_rate": 4.177940022160299e-06,
"loss": 0.6287036895751953,
"memory(GiB)": 76.04,
"step": 3510,
"token_acc": 0.8294466536361799,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9089146034003491,
"grad_norm": 1.4229148626327515,
"learning_rate": 4.175296693638703e-06,
"loss": 0.6371709823608398,
"memory(GiB)": 76.04,
"step": 3515,
"token_acc": 0.7917865974784124,
"train_speed(iter/s)": 0.027687
},
{
"epoch": 0.9102075117977891,
"grad_norm": 1.202496886253357,
"learning_rate": 4.172649961284276e-06,
"loss": 0.6231961250305176,
"memory(GiB)": 76.04,
"step": 3520,
"token_acc": 0.7966687617850409,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9115004201952291,
"grad_norm": 7.718347549438477,
"learning_rate": 4.169999830474594e-06,
"loss": 0.6057699203491211,
"memory(GiB)": 76.04,
"step": 3525,
"token_acc": 0.8327813797285322,
"train_speed(iter/s)": 0.027687
},
{
"epoch": 0.9127933285926693,
"grad_norm": 2.148632526397705,
"learning_rate": 4.167346306594136e-06,
"loss": 0.6129049777984619,
"memory(GiB)": 76.04,
"step": 3530,
"token_acc": 0.8131457736835553,
"train_speed(iter/s)": 0.027687
},
{
"epoch": 0.9140862369901093,
"grad_norm": 3.352047920227051,
"learning_rate": 4.1646893950342785e-06,
"loss": 0.6119277000427246,
"memory(GiB)": 76.04,
"step": 3535,
"token_acc": 0.8336779068938476,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9153791453875493,
"grad_norm": 1.2060611248016357,
"learning_rate": 4.1620291011932765e-06,
"loss": 0.6040964126586914,
"memory(GiB)": 76.04,
"step": 3540,
"token_acc": 0.8219741053244108,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9166720537849893,
"grad_norm": 1.5047026872634888,
"learning_rate": 4.159365430476262e-06,
"loss": 0.6265472412109375,
"memory(GiB)": 76.04,
"step": 3545,
"token_acc": 0.8295702534832969,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 0.9179649621824294,
"grad_norm": 1.3251553773880005,
"learning_rate": 4.156698388295222e-06,
"loss": 0.6167987823486328,
"memory(GiB)": 76.04,
"step": 3550,
"token_acc": 0.8122264371170119,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 0.9192578705798694,
"grad_norm": 0.998990535736084,
"learning_rate": 4.154027980069002e-06,
"loss": 0.5864760398864746,
"memory(GiB)": 76.04,
"step": 3555,
"token_acc": 0.8300649626616478,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9205507789773094,
"grad_norm": 1.6433701515197754,
"learning_rate": 4.151354211223278e-06,
"loss": 0.5933123588562011,
"memory(GiB)": 76.04,
"step": 3560,
"token_acc": 0.8285720878715156,
"train_speed(iter/s)": 0.027685
},
{
"epoch": 0.9218436873747495,
"grad_norm": 1.4250352382659912,
"learning_rate": 4.148677087190559e-06,
"loss": 0.6165533065795898,
"memory(GiB)": 76.04,
"step": 3565,
"token_acc": 0.8124648441894476,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9231365957721895,
"grad_norm": 1.9832043647766113,
"learning_rate": 4.145996613410169e-06,
"loss": 0.601347017288208,
"memory(GiB)": 76.04,
"step": 3570,
"token_acc": 0.8363061287980919,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9244295041696295,
"grad_norm": 1.1738619804382324,
"learning_rate": 4.143312795328239e-06,
"loss": 0.5805646419525147,
"memory(GiB)": 76.04,
"step": 3575,
"token_acc": 0.7983795574945466,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.9257224125670697,
"grad_norm": 1.162148356437683,
"learning_rate": 4.1406256383976945e-06,
"loss": 0.6304599285125733,
"memory(GiB)": 76.04,
"step": 3580,
"token_acc": 0.7998939233217154,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.9270153209645097,
"grad_norm": 1.555344820022583,
"learning_rate": 4.1379351480782445e-06,
"loss": 0.6200345039367676,
"memory(GiB)": 76.04,
"step": 3585,
"token_acc": 0.8321557607386592,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.9283082293619497,
"grad_norm": 1.4574919939041138,
"learning_rate": 4.135241329836372e-06,
"loss": 0.6034027099609375,
"memory(GiB)": 76.04,
"step": 3590,
"token_acc": 0.8022632918173296,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.9296011377593898,
"grad_norm": 1.1182024478912354,
"learning_rate": 4.132544189145321e-06,
"loss": 0.6192724227905273,
"memory(GiB)": 76.04,
"step": 3595,
"token_acc": 0.8323038628192126,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9308940461568298,
"grad_norm": 1.55838143825531,
"learning_rate": 4.129843731485084e-06,
"loss": 0.6345338821411133,
"memory(GiB)": 76.04,
"step": 3600,
"token_acc": 0.7966842932685436,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9321869545542698,
"grad_norm": 5.046814918518066,
"learning_rate": 4.127139962342395e-06,
"loss": 0.5721908569335937,
"memory(GiB)": 76.04,
"step": 3605,
"token_acc": 0.8486438258386867,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.9334798629517099,
"grad_norm": 1.2368015050888062,
"learning_rate": 4.124432887210715e-06,
"loss": 0.6120264053344726,
"memory(GiB)": 76.04,
"step": 3610,
"token_acc": 0.7992660550458716,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9347727713491499,
"grad_norm": 0.935610294342041,
"learning_rate": 4.121722511590225e-06,
"loss": 0.5871891975402832,
"memory(GiB)": 76.04,
"step": 3615,
"token_acc": 0.814580080217997,
"train_speed(iter/s)": 0.027681
},
{
"epoch": 0.9360656797465899,
"grad_norm": 1.0046820640563965,
"learning_rate": 4.119008840987807e-06,
"loss": 0.6071587562561035,
"memory(GiB)": 76.04,
"step": 3620,
"token_acc": 0.836279004654744,
"train_speed(iter/s)": 0.027681
},
{
"epoch": 0.93735858814403,
"grad_norm": 1.2669122219085693,
"learning_rate": 4.116291880917042e-06,
"loss": 0.6148792266845703,
"memory(GiB)": 76.04,
"step": 3625,
"token_acc": 0.8264746964650264,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.9386514965414701,
"grad_norm": 1.177242636680603,
"learning_rate": 4.113571636898191e-06,
"loss": 0.6176681518554688,
"memory(GiB)": 76.04,
"step": 3630,
"token_acc": 0.8233010616902857,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9399444049389101,
"grad_norm": 1.3925117254257202,
"learning_rate": 4.110848114458191e-06,
"loss": 0.5971219062805175,
"memory(GiB)": 76.04,
"step": 3635,
"token_acc": 0.8236563174186287,
"train_speed(iter/s)": 0.027685
},
{
"epoch": 0.9412373133363501,
"grad_norm": 1.1656811237335205,
"learning_rate": 4.108121319130638e-06,
"loss": 0.6168715476989746,
"memory(GiB)": 76.04,
"step": 3640,
"token_acc": 0.8150085866048964,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9425302217337902,
"grad_norm": 1.0381889343261719,
"learning_rate": 4.105391256455776e-06,
"loss": 0.6066938400268554,
"memory(GiB)": 76.04,
"step": 3645,
"token_acc": 0.8139741020502543,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9438231301312302,
"grad_norm": 1.0766489505767822,
"learning_rate": 4.1026579319804894e-06,
"loss": 0.60537691116333,
"memory(GiB)": 76.04,
"step": 3650,
"token_acc": 0.8072617246596067,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9451160385286702,
"grad_norm": 1.0981967449188232,
"learning_rate": 4.099921351258292e-06,
"loss": 0.6052407264709473,
"memory(GiB)": 76.04,
"step": 3655,
"token_acc": 0.7947088678415858,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.9464089469261103,
"grad_norm": 1.5217511653900146,
"learning_rate": 4.097181519849309e-06,
"loss": 0.5945847034454346,
"memory(GiB)": 76.04,
"step": 3660,
"token_acc": 0.8231042745613357,
"train_speed(iter/s)": 0.027681
},
{
"epoch": 0.9477018553235503,
"grad_norm": 1.4334150552749634,
"learning_rate": 4.094438443320274e-06,
"loss": 0.6149433135986329,
"memory(GiB)": 76.04,
"step": 3665,
"token_acc": 0.8234104473930844,
"train_speed(iter/s)": 0.02768
},
{
"epoch": 0.9489947637209903,
"grad_norm": 1.6359714269638062,
"learning_rate": 4.091692127244511e-06,
"loss": 0.6281001567840576,
"memory(GiB)": 76.04,
"step": 3670,
"token_acc": 0.8364228557642044,
"train_speed(iter/s)": 0.027681
},
{
"epoch": 0.9502876721184305,
"grad_norm": 1.234947681427002,
"learning_rate": 4.088942577201931e-06,
"loss": 0.5957602977752685,
"memory(GiB)": 76.04,
"step": 3675,
"token_acc": 0.841389663306884,
"train_speed(iter/s)": 0.027681
},
{
"epoch": 0.9515805805158705,
"grad_norm": 1.0987111330032349,
"learning_rate": 4.086189798779008e-06,
"loss": 0.6053364753723145,
"memory(GiB)": 76.04,
"step": 3680,
"token_acc": 0.8306053185547966,
"train_speed(iter/s)": 0.027681
},
{
"epoch": 0.9528734889133105,
"grad_norm": 1.273268222808838,
"learning_rate": 4.083433797568783e-06,
"loss": 0.6212119579315185,
"memory(GiB)": 76.04,
"step": 3685,
"token_acc": 0.8268924889543446,
"train_speed(iter/s)": 0.027681
},
{
"epoch": 0.9541663973107505,
"grad_norm": 1.1013809442520142,
"learning_rate": 4.0806745791708406e-06,
"loss": 0.6078325271606445,
"memory(GiB)": 76.04,
"step": 3690,
"token_acc": 0.8134150493701056,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.9554593057081906,
"grad_norm": 1.449779987335205,
"learning_rate": 4.0779121491913035e-06,
"loss": 0.6228477478027343,
"memory(GiB)": 76.04,
"step": 3695,
"token_acc": 0.7949391768744967,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.9567522141056306,
"grad_norm": 1.177632451057434,
"learning_rate": 4.075146513242818e-06,
"loss": 0.6086900711059571,
"memory(GiB)": 76.04,
"step": 3700,
"token_acc": 0.8241206030150754,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.9580451225030706,
"grad_norm": 1.7194976806640625,
"learning_rate": 4.072377676944545e-06,
"loss": 0.6042545318603516,
"memory(GiB)": 76.04,
"step": 3705,
"token_acc": 0.8258380709664772,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.9593380309005107,
"grad_norm": 1.1962262392044067,
"learning_rate": 4.069605645922152e-06,
"loss": 0.5851446151733398,
"memory(GiB)": 76.04,
"step": 3710,
"token_acc": 0.8344977304124729,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.9606309392979507,
"grad_norm": 1.1156598329544067,
"learning_rate": 4.066830425807789e-06,
"loss": 0.5880330085754395,
"memory(GiB)": 76.04,
"step": 3715,
"token_acc": 0.8415927377759439,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.9619238476953907,
"grad_norm": 2.626760244369507,
"learning_rate": 4.0640520222400945e-06,
"loss": 0.6129249095916748,
"memory(GiB)": 76.04,
"step": 3720,
"token_acc": 0.8348955352032055,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.9632167560928309,
"grad_norm": 1.0791536569595337,
"learning_rate": 4.0612704408641675e-06,
"loss": 0.6016806125640869,
"memory(GiB)": 76.04,
"step": 3725,
"token_acc": 0.8637572233842663,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9645096644902709,
"grad_norm": 1.5433270931243896,
"learning_rate": 4.058485687331569e-06,
"loss": 0.6169820785522461,
"memory(GiB)": 76.04,
"step": 3730,
"token_acc": 0.8325906172146391,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9658025728877109,
"grad_norm": 1.101804256439209,
"learning_rate": 4.055697767300302e-06,
"loss": 0.5675209999084473,
"memory(GiB)": 76.04,
"step": 3735,
"token_acc": 0.8326423357664233,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.967095481285151,
"grad_norm": 1.0612066984176636,
"learning_rate": 4.0529066864348046e-06,
"loss": 0.5953152656555176,
"memory(GiB)": 76.04,
"step": 3740,
"token_acc": 0.8497927240323893,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.968388389682591,
"grad_norm": 1.5735872983932495,
"learning_rate": 4.050112450405937e-06,
"loss": 0.6017944812774658,
"memory(GiB)": 76.04,
"step": 3745,
"token_acc": 0.8264524103831892,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.969681298080031,
"grad_norm": 1.157410979270935,
"learning_rate": 4.047315064890968e-06,
"loss": 0.5977309226989747,
"memory(GiB)": 76.04,
"step": 3750,
"token_acc": 0.815944055944056,
"train_speed(iter/s)": 0.027682
},
{
"epoch": 0.9709742064774711,
"grad_norm": 1.57929265499115,
"learning_rate": 4.044514535573569e-06,
"loss": 0.589405632019043,
"memory(GiB)": 76.04,
"step": 3755,
"token_acc": 0.8179965511835711,
"train_speed(iter/s)": 0.027684
},
{
"epoch": 0.9722671148749111,
"grad_norm": 1.016497015953064,
"learning_rate": 4.041710868143796e-06,
"loss": 0.589882230758667,
"memory(GiB)": 76.04,
"step": 3760,
"token_acc": 0.8092984293193717,
"train_speed(iter/s)": 0.027683
},
{
"epoch": 0.9735600232723511,
"grad_norm": 1.266408085823059,
"learning_rate": 4.038904068298083e-06,
"loss": 0.5920291423797608,
"memory(GiB)": 76.04,
"step": 3765,
"token_acc": 0.823118662159758,
"train_speed(iter/s)": 0.027685
},
{
"epoch": 0.9748529316697911,
"grad_norm": 1.1780680418014526,
"learning_rate": 4.036094141739225e-06,
"loss": 0.6140639305114746,
"memory(GiB)": 76.04,
"step": 3770,
"token_acc": 0.7946187371681734,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9761458400672313,
"grad_norm": 1.0114781856536865,
"learning_rate": 4.0332810941763745e-06,
"loss": 0.5897872924804688,
"memory(GiB)": 76.04,
"step": 3775,
"token_acc": 0.8349316002363314,
"train_speed(iter/s)": 0.027685
},
{
"epoch": 0.9774387484646713,
"grad_norm": 0.9569077491760254,
"learning_rate": 4.030464931325021e-06,
"loss": 0.6214170455932617,
"memory(GiB)": 76.04,
"step": 3780,
"token_acc": 0.8027143591975626,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9787316568621113,
"grad_norm": 1.5879004001617432,
"learning_rate": 4.027645658906986e-06,
"loss": 0.6039529800415039,
"memory(GiB)": 76.04,
"step": 3785,
"token_acc": 0.815490288962577,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9800245652595514,
"grad_norm": 1.1456199884414673,
"learning_rate": 4.02482328265041e-06,
"loss": 0.5806800842285156,
"memory(GiB)": 76.04,
"step": 3790,
"token_acc": 0.8437443809112813,
"train_speed(iter/s)": 0.027686
},
{
"epoch": 0.9813174736569914,
"grad_norm": 0.8533855676651001,
"learning_rate": 4.0219978082897355e-06,
"loss": 0.593365478515625,
"memory(GiB)": 76.04,
"step": 3795,
"token_acc": 0.8269325803035651,
"train_speed(iter/s)": 0.027687
},
{
"epoch": 0.9826103820544314,
"grad_norm": 1.1695212125778198,
"learning_rate": 4.019169241565704e-06,
"loss": 0.6025032043457031,
"memory(GiB)": 76.04,
"step": 3800,
"token_acc": 0.8349052595802532,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 0.9839032904518715,
"grad_norm": 1.4339202642440796,
"learning_rate": 4.0163375882253366e-06,
"loss": 0.6019165992736817,
"memory(GiB)": 76.04,
"step": 3805,
"token_acc": 0.8244785353007565,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 0.9851961988493115,
"grad_norm": 1.4449810981750488,
"learning_rate": 4.013502854021929e-06,
"loss": 0.606717872619629,
"memory(GiB)": 76.04,
"step": 3810,
"token_acc": 0.8192271272038598,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.9864891072467515,
"grad_norm": 1.1717056035995483,
"learning_rate": 4.010665044715034e-06,
"loss": 0.609653091430664,
"memory(GiB)": 76.04,
"step": 3815,
"token_acc": 0.8458090195926885,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 0.9877820156441917,
"grad_norm": 4.467617988586426,
"learning_rate": 4.007824166070455e-06,
"loss": 0.6024861335754395,
"memory(GiB)": 76.04,
"step": 3820,
"token_acc": 0.8408324188107141,
"train_speed(iter/s)": 0.027687
},
{
"epoch": 0.9890749240416317,
"grad_norm": 1.2757420539855957,
"learning_rate": 4.004980223860228e-06,
"loss": 0.6156288146972656,
"memory(GiB)": 76.04,
"step": 3825,
"token_acc": 0.826805096743747,
"train_speed(iter/s)": 0.027687
},
{
"epoch": 0.9903678324390717,
"grad_norm": 1.178419828414917,
"learning_rate": 4.002133223862615e-06,
"loss": 0.5892780303955079,
"memory(GiB)": 76.04,
"step": 3830,
"token_acc": 0.809322033898305,
"train_speed(iter/s)": 0.027687
},
{
"epoch": 0.9916607408365117,
"grad_norm": 1.8065563440322876,
"learning_rate": 3.999283171862093e-06,
"loss": 0.6025252342224121,
"memory(GiB)": 76.04,
"step": 3835,
"token_acc": 0.804472722092968,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.9929536492339518,
"grad_norm": 1.030576229095459,
"learning_rate": 3.996430073649338e-06,
"loss": 0.5885412216186523,
"memory(GiB)": 76.04,
"step": 3840,
"token_acc": 0.8501988939555641,
"train_speed(iter/s)": 0.02769
},
{
"epoch": 0.9942465576313918,
"grad_norm": 1.0532441139221191,
"learning_rate": 3.993573935021213e-06,
"loss": 0.6129741191864013,
"memory(GiB)": 76.04,
"step": 3845,
"token_acc": 0.8317618076792389,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.9955394660288318,
"grad_norm": 1.620641827583313,
"learning_rate": 3.990714761780763e-06,
"loss": 0.583595085144043,
"memory(GiB)": 76.04,
"step": 3850,
"token_acc": 0.8565638488261922,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.9968323744262719,
"grad_norm": 1.6248869895935059,
"learning_rate": 3.987852559737196e-06,
"loss": 0.6013848304748535,
"memory(GiB)": 76.04,
"step": 3855,
"token_acc": 0.7915984036967024,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.9981252828237119,
"grad_norm": 1.3243690729141235,
"learning_rate": 3.984987334705875e-06,
"loss": 0.5860433101654052,
"memory(GiB)": 76.04,
"step": 3860,
"token_acc": 0.834250554154246,
"train_speed(iter/s)": 0.027689
},
{
"epoch": 0.9994181912211519,
"grad_norm": 1.1566941738128662,
"learning_rate": 3.9821190925083025e-06,
"loss": 0.5727869033813476,
"memory(GiB)": 76.04,
"step": 3865,
"token_acc": 0.8439064677644144,
"train_speed(iter/s)": 0.027688
},
{
"epoch": 1.000517163358976,
"grad_norm": 2.703934669494629,
"learning_rate": 3.979247838972114e-06,
"loss": 0.604734230041504,
"memory(GiB)": 76.04,
"step": 3870,
"token_acc": 0.8727201521763456,
"train_speed(iter/s)": 0.027694
},
{
"epoch": 1.001810071756416,
"grad_norm": 1.0446311235427856,
"learning_rate": 3.976373579931063e-06,
"loss": 0.5894432544708252,
"memory(GiB)": 76.04,
"step": 3875,
"token_acc": 0.7687516615396301,
"train_speed(iter/s)": 0.027693
},
{
"epoch": 1.003102980153856,
"grad_norm": 1.3841272592544556,
"learning_rate": 3.97349632122501e-06,
"loss": 0.5918097019195556,
"memory(GiB)": 76.04,
"step": 3880,
"token_acc": 0.8106176985459612,
"train_speed(iter/s)": 0.027694
},
{
"epoch": 1.0043958885512962,
"grad_norm": 1.3004447221755981,
"learning_rate": 3.970616068699906e-06,
"loss": 0.5655845642089844,
"memory(GiB)": 76.04,
"step": 3885,
"token_acc": 0.8302924727239388,
"train_speed(iter/s)": 0.027694
},
{
"epoch": 1.0056887969487363,
"grad_norm": 1.2096521854400635,
"learning_rate": 3.96773282820779e-06,
"loss": 0.5523721218109131,
"memory(GiB)": 76.04,
"step": 3890,
"token_acc": 0.8321216960121024,
"train_speed(iter/s)": 0.027693
},
{
"epoch": 1.0069817053461763,
"grad_norm": 0.9291108846664429,
"learning_rate": 3.9648466056067705e-06,
"loss": 0.5470512390136719,
"memory(GiB)": 76.04,
"step": 3895,
"token_acc": 0.8546372106154715,
"train_speed(iter/s)": 0.027694
},
{
"epoch": 1.0082746137436163,
"grad_norm": 1.7991231679916382,
"learning_rate": 3.961957406761012e-06,
"loss": 0.5519303321838379,
"memory(GiB)": 76.04,
"step": 3900,
"token_acc": 0.8346437931856088,
"train_speed(iter/s)": 0.027695
},
{
"epoch": 1.0095675221410563,
"grad_norm": 0.9921211004257202,
"learning_rate": 3.9590652375407305e-06,
"loss": 0.5495428562164306,
"memory(GiB)": 76.04,
"step": 3905,
"token_acc": 0.8289614561027837,
"train_speed(iter/s)": 0.027694
},
{
"epoch": 1.0108604305384963,
"grad_norm": 0.9098636507987976,
"learning_rate": 3.956170103822174e-06,
"loss": 0.5806960105895996,
"memory(GiB)": 76.04,
"step": 3910,
"token_acc": 0.8316886778453777,
"train_speed(iter/s)": 0.027694
},
{
"epoch": 1.0121533389359363,
"grad_norm": 1.403108835220337,
"learning_rate": 3.953272011487615e-06,
"loss": 0.5835510730743408,
"memory(GiB)": 76.04,
"step": 3915,
"token_acc": 0.8022167487684729,
"train_speed(iter/s)": 0.027695
},
{
"epoch": 1.0134462473333765,
"grad_norm": 0.8766242861747742,
"learning_rate": 3.950370966425336e-06,
"loss": 0.5739788055419922,
"memory(GiB)": 76.04,
"step": 3920,
"token_acc": 0.8167596743207391,
"train_speed(iter/s)": 0.027695
},
{
"epoch": 1.0147391557308165,
"grad_norm": 1.0786027908325195,
"learning_rate": 3.947466974529622e-06,
"loss": 0.57960524559021,
"memory(GiB)": 76.04,
"step": 3925,
"token_acc": 0.806325589127634,
"train_speed(iter/s)": 0.027695
},
{
"epoch": 1.0160320641282565,
"grad_norm": 1.3315753936767578,
"learning_rate": 3.9445600417007416e-06,
"loss": 0.5844710826873779,
"memory(GiB)": 76.04,
"step": 3930,
"token_acc": 0.8209227957971676,
"train_speed(iter/s)": 0.027695
},
{
"epoch": 1.0173249725256965,
"grad_norm": 1.4551076889038086,
"learning_rate": 3.941650173844939e-06,
"loss": 0.5371768951416016,
"memory(GiB)": 76.04,
"step": 3935,
"token_acc": 0.8264791248046439,
"train_speed(iter/s)": 0.027695
},
{
"epoch": 1.0186178809231365,
"grad_norm": 1.0872153043746948,
"learning_rate": 3.938737376874425e-06,
"loss": 0.5733814239501953,
"memory(GiB)": 76.04,
"step": 3940,
"token_acc": 0.8316657328103738,
"train_speed(iter/s)": 0.027697
},
{
"epoch": 1.0199107893205765,
"grad_norm": 0.9862046241760254,
"learning_rate": 3.935821656707359e-06,
"loss": 0.5849019050598144,
"memory(GiB)": 76.04,
"step": 3945,
"token_acc": 0.8450649147505084,
"train_speed(iter/s)": 0.027697
},
{
"epoch": 1.0212036977180168,
"grad_norm": 1.2747979164123535,
"learning_rate": 3.93290301926784e-06,
"loss": 0.5715857982635498,
"memory(GiB)": 76.04,
"step": 3950,
"token_acc": 0.8034803940358005,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 1.0224966061154568,
"grad_norm": 0.8877357840538025,
"learning_rate": 3.929981470485897e-06,
"loss": 0.5560395240783691,
"memory(GiB)": 76.04,
"step": 3955,
"token_acc": 0.8330184222957014,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 1.0237895145128968,
"grad_norm": 3.0129168033599854,
"learning_rate": 3.927057016297466e-06,
"loss": 0.5378780364990234,
"memory(GiB)": 76.04,
"step": 3960,
"token_acc": 0.8276883389862896,
"train_speed(iter/s)": 0.027697
},
{
"epoch": 1.0250824229103368,
"grad_norm": 1.279038429260254,
"learning_rate": 3.924129662644398e-06,
"loss": 0.5460095405578613,
"memory(GiB)": 76.04,
"step": 3965,
"token_acc": 0.8445347544377564,
"train_speed(iter/s)": 0.027698
},
{
"epoch": 1.0263753313077768,
"grad_norm": 1.0856847763061523,
"learning_rate": 3.921199415474426e-06,
"loss": 0.5677762985229492,
"memory(GiB)": 76.04,
"step": 3970,
"token_acc": 0.8344854941069809,
"train_speed(iter/s)": 0.027698
},
{
"epoch": 1.0276682397052168,
"grad_norm": 1.0214816331863403,
"learning_rate": 3.918266280741166e-06,
"loss": 0.5525214195251464,
"memory(GiB)": 76.04,
"step": 3975,
"token_acc": 0.8022044260742272,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 1.0289611481026568,
"grad_norm": 2.20583176612854,
"learning_rate": 3.915330264404098e-06,
"loss": 0.5635844230651855,
"memory(GiB)": 76.04,
"step": 3980,
"token_acc": 0.8163771712158809,
"train_speed(iter/s)": 0.0277
},
{
"epoch": 1.030254056500097,
"grad_norm": 1.1798099279403687,
"learning_rate": 3.912391372428561e-06,
"loss": 0.563462209701538,
"memory(GiB)": 76.04,
"step": 3985,
"token_acc": 0.8396355353075171,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 1.031546964897537,
"grad_norm": 0.9924677014350891,
"learning_rate": 3.9094496107857336e-06,
"loss": 0.5675541400909424,
"memory(GiB)": 76.04,
"step": 3990,
"token_acc": 0.8223220012828736,
"train_speed(iter/s)": 0.027699
},
{
"epoch": 1.032839873294977,
"grad_norm": 1.259884238243103,
"learning_rate": 3.906504985452626e-06,
"loss": 0.5578344345092774,
"memory(GiB)": 76.04,
"step": 3995,
"token_acc": 0.8310387984981227,
"train_speed(iter/s)": 0.027698
},
{
"epoch": 1.034132781692417,
"grad_norm": 1.3022695779800415,
"learning_rate": 3.903557502412065e-06,
"loss": 0.5636180877685547,
"memory(GiB)": 76.04,
"step": 4000,
"token_acc": 0.8399380474257637,
"train_speed(iter/s)": 0.027698
},
{
"epoch": 1.035425690089857,
"grad_norm": 1.0616282224655151,
"learning_rate": 3.900607167652687e-06,
"loss": 0.5414395809173584,
"memory(GiB)": 76.04,
"step": 4005,
"token_acc": 0.8558815464765561,
"train_speed(iter/s)": 0.027663
},
{
"epoch": 1.036718598487297,
"grad_norm": 1.1525382995605469,
"learning_rate": 3.897653987168919e-06,
"loss": 0.5726981163024902,
"memory(GiB)": 76.04,
"step": 4010,
"token_acc": 0.8494477021682804,
"train_speed(iter/s)": 0.027663
},
{
"epoch": 1.0380115068847373,
"grad_norm": 1.1002613306045532,
"learning_rate": 3.894697966960972e-06,
"loss": 0.5688316345214843,
"memory(GiB)": 76.04,
"step": 4015,
"token_acc": 0.8253162219554981,
"train_speed(iter/s)": 0.027664
},
{
"epoch": 1.0393044152821773,
"grad_norm": 0.9993764758110046,
"learning_rate": 3.891739113034826e-06,
"loss": 0.5663973331451416,
"memory(GiB)": 76.04,
"step": 4020,
"token_acc": 0.847761685319289,
"train_speed(iter/s)": 0.027665
},
{
"epoch": 1.0405973236796173,
"grad_norm": 1.1270966529846191,
"learning_rate": 3.888777431402219e-06,
"loss": 0.5679460525512695,
"memory(GiB)": 76.04,
"step": 4025,
"token_acc": 0.8138078016016533,
"train_speed(iter/s)": 0.027665
},
{
"epoch": 1.0418902320770573,
"grad_norm": 1.4055637121200562,
"learning_rate": 3.885812928080634e-06,
"loss": 0.5653609275817871,
"memory(GiB)": 76.04,
"step": 4030,
"token_acc": 0.8330082979618371,
"train_speed(iter/s)": 0.027665
},
{
"epoch": 1.0431831404744973,
"grad_norm": 1.0064263343811035,
"learning_rate": 3.8828456090932855e-06,
"loss": 0.5649868011474609,
"memory(GiB)": 76.04,
"step": 4035,
"token_acc": 0.8298977309044423,
"train_speed(iter/s)": 0.027665
},
{
"epoch": 1.0444760488719373,
"grad_norm": 1.3120592832565308,
"learning_rate": 3.879875480469112e-06,
"loss": 0.558688735961914,
"memory(GiB)": 76.04,
"step": 4040,
"token_acc": 0.8430891302155129,
"train_speed(iter/s)": 0.027664
},
{
"epoch": 1.0457689572693774,
"grad_norm": 1.1473355293273926,
"learning_rate": 3.876902548242758e-06,
"loss": 0.5573469161987304,
"memory(GiB)": 76.04,
"step": 4045,
"token_acc": 0.8069763883930848,
"train_speed(iter/s)": 0.027665
},
{
"epoch": 1.0470618656668176,
"grad_norm": 1.2132424116134644,
"learning_rate": 3.873926818454565e-06,
"loss": 0.6102540016174316,
"memory(GiB)": 76.04,
"step": 4050,
"token_acc": 0.8502197995428169,
"train_speed(iter/s)": 0.027665
},
{
"epoch": 1.0483547740642576,
"grad_norm": 6.2116618156433105,
"learning_rate": 3.87094829715056e-06,
"loss": 0.548386812210083,
"memory(GiB)": 76.04,
"step": 4055,
"token_acc": 0.8210007451137732,
"train_speed(iter/s)": 0.027665
},
{
"epoch": 1.0496476824616976,
"grad_norm": 1.0829912424087524,
"learning_rate": 3.867966990382438e-06,
"loss": 0.5702716827392578,
"memory(GiB)": 76.04,
"step": 4060,
"token_acc": 0.7936434822662367,
"train_speed(iter/s)": 0.027665
},
{
"epoch": 1.0509405908591376,
"grad_norm": 1.3445277214050293,
"learning_rate": 3.864982904207557e-06,
"loss": 0.5754476547241211,
"memory(GiB)": 76.04,
"step": 4065,
"token_acc": 0.824864653316809,
"train_speed(iter/s)": 0.027663
},
{
"epoch": 1.0522334992565776,
"grad_norm": 4.769752025604248,
"learning_rate": 3.861996044688922e-06,
"loss": 0.5743865013122559,
"memory(GiB)": 76.04,
"step": 4070,
"token_acc": 0.8401046687784052,
"train_speed(iter/s)": 0.027664
},
{
"epoch": 1.0535264076540176,
"grad_norm": 3.4262094497680664,
"learning_rate": 3.8590064178951695e-06,
"loss": 0.5537999153137207,
"memory(GiB)": 76.04,
"step": 4075,
"token_acc": 0.852727935517842,
"train_speed(iter/s)": 0.027663
},
{
"epoch": 1.0548193160514578,
"grad_norm": 1.150668978691101,
"learning_rate": 3.856014029900563e-06,
"loss": 0.541869068145752,
"memory(GiB)": 76.04,
"step": 4080,
"token_acc": 0.8340331114524663,
"train_speed(iter/s)": 0.027661
},
{
"epoch": 1.0561122244488979,
"grad_norm": 1.7679377794265747,
"learning_rate": 3.853018886784973e-06,
"loss": 0.5608885765075684,
"memory(GiB)": 76.04,
"step": 4085,
"token_acc": 0.8438836612489307,
"train_speed(iter/s)": 0.027661
},
{
"epoch": 1.0574051328463379,
"grad_norm": 3.0141847133636475,
"learning_rate": 3.850020994633869e-06,
"loss": 0.5597274303436279,
"memory(GiB)": 76.04,
"step": 4090,
"token_acc": 0.8566967231141412,
"train_speed(iter/s)": 0.02766
},
{
"epoch": 1.0586980412437779,
"grad_norm": 1.5420576333999634,
"learning_rate": 3.8470203595383034e-06,
"loss": 0.5814280986785889,
"memory(GiB)": 76.04,
"step": 4095,
"token_acc": 0.8152106326752682,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.0599909496412179,
"grad_norm": 3.600728988647461,
"learning_rate": 3.8440169875949075e-06,
"loss": 0.55950927734375,
"memory(GiB)": 76.04,
"step": 4100,
"token_acc": 0.8275422378068218,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.0612838580386579,
"grad_norm": 1.3479124307632446,
"learning_rate": 3.841010884905868e-06,
"loss": 0.5699577331542969,
"memory(GiB)": 76.04,
"step": 4105,
"token_acc": 0.8080579942442898,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.062576766436098,
"grad_norm": 0.989827036857605,
"learning_rate": 3.838002057578921e-06,
"loss": 0.5578522682189941,
"memory(GiB)": 76.04,
"step": 4110,
"token_acc": 0.8228656838896867,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.0638696748335381,
"grad_norm": 1.1100108623504639,
"learning_rate": 3.834990511727341e-06,
"loss": 0.5745999813079834,
"memory(GiB)": 76.04,
"step": 4115,
"token_acc": 0.8142663088493522,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.0651625832309781,
"grad_norm": 0.9445801973342896,
"learning_rate": 3.831976253469921e-06,
"loss": 0.5575265884399414,
"memory(GiB)": 76.04,
"step": 4120,
"token_acc": 0.813193334855056,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.0664554916284181,
"grad_norm": 1.1623046398162842,
"learning_rate": 3.828959288930971e-06,
"loss": 0.5857258796691894,
"memory(GiB)": 76.04,
"step": 4125,
"token_acc": 0.8250850433446725,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.0677484000258581,
"grad_norm": 1.0290391445159912,
"learning_rate": 3.825939624240294e-06,
"loss": 0.5558303833007813,
"memory(GiB)": 76.04,
"step": 4130,
"token_acc": 0.8497815003641661,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.0690413084232981,
"grad_norm": 1.3947314023971558,
"learning_rate": 3.822917265533184e-06,
"loss": 0.5638031959533691,
"memory(GiB)": 76.04,
"step": 4135,
"token_acc": 0.8299896824486989,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.0703342168207381,
"grad_norm": 1.2008622884750366,
"learning_rate": 3.819892218950403e-06,
"loss": 0.5699079513549805,
"memory(GiB)": 76.04,
"step": 4140,
"token_acc": 0.8410833741230217,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.0716271252181784,
"grad_norm": 1.117075800895691,
"learning_rate": 3.816864490638181e-06,
"loss": 0.546845531463623,
"memory(GiB)": 76.04,
"step": 4145,
"token_acc": 0.8247305985692294,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.0729200336156184,
"grad_norm": 1.193162202835083,
"learning_rate": 3.8138340867481914e-06,
"loss": 0.540710735321045,
"memory(GiB)": 76.04,
"step": 4150,
"token_acc": 0.8468981429794202,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.0742129420130584,
"grad_norm": 1.0668905973434448,
"learning_rate": 3.810801013437546e-06,
"loss": 0.5506375312805176,
"memory(GiB)": 76.04,
"step": 4155,
"token_acc": 0.8278915329275042,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.0755058504104984,
"grad_norm": 1.1097651720046997,
"learning_rate": 3.807765276868779e-06,
"loss": 0.5460940361022949,
"memory(GiB)": 76.04,
"step": 4160,
"token_acc": 0.8142139418044798,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.0767987588079384,
"grad_norm": 1.4875537157058716,
"learning_rate": 3.8047268832098376e-06,
"loss": 0.5787097454071045,
"memory(GiB)": 76.04,
"step": 4165,
"token_acc": 0.8459223372238127,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.0780916672053784,
"grad_norm": 1.1992076635360718,
"learning_rate": 3.801685838634066e-06,
"loss": 0.5527867794036865,
"memory(GiB)": 76.04,
"step": 4170,
"token_acc": 0.8586858373272209,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.0793845756028186,
"grad_norm": 1.046899676322937,
"learning_rate": 3.7986421493201952e-06,
"loss": 0.5584450721740722,
"memory(GiB)": 76.04,
"step": 4175,
"token_acc": 0.813550135501355,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.0806774840002586,
"grad_norm": 1.144406795501709,
"learning_rate": 3.7955958214523297e-06,
"loss": 0.5506217002868652,
"memory(GiB)": 76.04,
"step": 4180,
"token_acc": 0.8326120340639397,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.0819703923976987,
"grad_norm": 1.1309776306152344,
"learning_rate": 3.7925468612199344e-06,
"loss": 0.5434449195861817,
"memory(GiB)": 76.04,
"step": 4185,
"token_acc": 0.8455437400857764,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.0832633007951387,
"grad_norm": 1.772002100944519,
"learning_rate": 3.7894952748178238e-06,
"loss": 0.5281466484069824,
"memory(GiB)": 76.04,
"step": 4190,
"token_acc": 0.8599103788530303,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.0845562091925787,
"grad_norm": 1.2275793552398682,
"learning_rate": 3.786441068446146e-06,
"loss": 0.5290435791015625,
"memory(GiB)": 76.04,
"step": 4195,
"token_acc": 0.8503279666070364,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.0858491175900187,
"grad_norm": 1.002516746520996,
"learning_rate": 3.7833842483103754e-06,
"loss": 0.553908109664917,
"memory(GiB)": 76.04,
"step": 4200,
"token_acc": 0.83946196437169,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.087142025987459,
"grad_norm": 0.9520712494850159,
"learning_rate": 3.7803248206212943e-06,
"loss": 0.5496163368225098,
"memory(GiB)": 76.04,
"step": 4205,
"token_acc": 0.8473314975085234,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.088434934384899,
"grad_norm": 1.1912646293640137,
"learning_rate": 3.7772627915949844e-06,
"loss": 0.5416050910949707,
"memory(GiB)": 76.04,
"step": 4210,
"token_acc": 0.862577306575792,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.089727842782339,
"grad_norm": 0.875792920589447,
"learning_rate": 3.7741981674528116e-06,
"loss": 0.5520293235778808,
"memory(GiB)": 76.04,
"step": 4215,
"token_acc": 0.8360961569212728,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.091020751179779,
"grad_norm": 1.6698824167251587,
"learning_rate": 3.7711309544214163e-06,
"loss": 0.5539298534393311,
"memory(GiB)": 76.04,
"step": 4220,
"token_acc": 0.8197564955441194,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.092313659577219,
"grad_norm": 1.9617213010787964,
"learning_rate": 3.768061158732697e-06,
"loss": 0.543891191482544,
"memory(GiB)": 76.04,
"step": 4225,
"token_acc": 0.8655219780219781,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.093606567974659,
"grad_norm": 1.0107512474060059,
"learning_rate": 3.764988786623801e-06,
"loss": 0.5563596725463867,
"memory(GiB)": 76.04,
"step": 4230,
"token_acc": 0.8210617141917989,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.094899476372099,
"grad_norm": 1.1788944005966187,
"learning_rate": 3.76191384433711e-06,
"loss": 0.5319845676422119,
"memory(GiB)": 76.04,
"step": 4235,
"token_acc": 0.8385356134816536,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.0961923847695392,
"grad_norm": 1.3050668239593506,
"learning_rate": 3.7588363381202264e-06,
"loss": 0.5554252624511719,
"memory(GiB)": 76.04,
"step": 4240,
"token_acc": 0.8417724746315843,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.0974852931669792,
"grad_norm": 1.0011587142944336,
"learning_rate": 3.7557562742259635e-06,
"loss": 0.5328820705413818,
"memory(GiB)": 76.04,
"step": 4245,
"token_acc": 0.8045835662381219,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.0987782015644192,
"grad_norm": 0.953105092048645,
"learning_rate": 3.752673658912331e-06,
"loss": 0.5456388473510743,
"memory(GiB)": 76.04,
"step": 4250,
"token_acc": 0.8526051825020897,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1000711099618592,
"grad_norm": 1.271561622619629,
"learning_rate": 3.7495884984425235e-06,
"loss": 0.5330571174621582,
"memory(GiB)": 76.04,
"step": 4255,
"token_acc": 0.8340923877683799,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1013640183592992,
"grad_norm": 1.6461479663848877,
"learning_rate": 3.746500799084904e-06,
"loss": 0.5901468753814697,
"memory(GiB)": 76.04,
"step": 4260,
"token_acc": 0.8404071670932793,
"train_speed(iter/s)": 0.02766
},
{
"epoch": 1.1026569267567392,
"grad_norm": 1.140838384628296,
"learning_rate": 3.7434105671129962e-06,
"loss": 0.5382442474365234,
"memory(GiB)": 76.04,
"step": 4265,
"token_acc": 0.8247861227962376,
"train_speed(iter/s)": 0.02766
},
{
"epoch": 1.1039498351541792,
"grad_norm": 1.1467549800872803,
"learning_rate": 3.7403178088054676e-06,
"loss": 0.5643450260162354,
"memory(GiB)": 76.04,
"step": 4270,
"token_acc": 0.8297176451105407,
"train_speed(iter/s)": 0.02766
},
{
"epoch": 1.1052427435516194,
"grad_norm": 1.270693302154541,
"learning_rate": 3.737222530446122e-06,
"loss": 0.5628186225891113,
"memory(GiB)": 76.04,
"step": 4275,
"token_acc": 0.801499403646277,
"train_speed(iter/s)": 0.02766
},
{
"epoch": 1.1065356519490595,
"grad_norm": 1.3963029384613037,
"learning_rate": 3.7341247383238793e-06,
"loss": 0.5608326911926269,
"memory(GiB)": 76.04,
"step": 4280,
"token_acc": 0.8412363787523383,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1078285603464995,
"grad_norm": 1.8522865772247314,
"learning_rate": 3.731024438732771e-06,
"loss": 0.5282313346862793,
"memory(GiB)": 76.04,
"step": 4285,
"token_acc": 0.8429379193156183,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1091214687439395,
"grad_norm": 1.071329951286316,
"learning_rate": 3.7279216379719194e-06,
"loss": 0.5438883781433106,
"memory(GiB)": 76.04,
"step": 4290,
"token_acc": 0.8636459342232703,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1104143771413795,
"grad_norm": 1.016403317451477,
"learning_rate": 3.7248163423455307e-06,
"loss": 0.5469881057739258,
"memory(GiB)": 76.04,
"step": 4295,
"token_acc": 0.8709039687639005,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1117072855388195,
"grad_norm": 1.8317582607269287,
"learning_rate": 3.721708558162881e-06,
"loss": 0.5621847152709961,
"memory(GiB)": 76.04,
"step": 4300,
"token_acc": 0.8440362706347361,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1130001939362597,
"grad_norm": 0.949685275554657,
"learning_rate": 3.7185982917382986e-06,
"loss": 0.5375046730041504,
"memory(GiB)": 76.04,
"step": 4305,
"token_acc": 0.8541533400347254,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1142931023336997,
"grad_norm": 1.1784414052963257,
"learning_rate": 3.7154855493911596e-06,
"loss": 0.5650627136230468,
"memory(GiB)": 76.04,
"step": 4310,
"token_acc": 0.837442021839962,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1155860107311397,
"grad_norm": 1.3990085124969482,
"learning_rate": 3.7123703374458685e-06,
"loss": 0.5586078643798829,
"memory(GiB)": 76.04,
"step": 4315,
"token_acc": 0.8209936463113308,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1168789191285797,
"grad_norm": 1.3244526386260986,
"learning_rate": 3.709252662231849e-06,
"loss": 0.5645613670349121,
"memory(GiB)": 76.04,
"step": 4320,
"token_acc": 0.8429337789112655,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1181718275260197,
"grad_norm": 0.8955655097961426,
"learning_rate": 3.706132530083527e-06,
"loss": 0.5438594818115234,
"memory(GiB)": 76.04,
"step": 4325,
"token_acc": 0.8423168980373384,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1194647359234597,
"grad_norm": 1.5076687335968018,
"learning_rate": 3.703009947340322e-06,
"loss": 0.5539616584777832,
"memory(GiB)": 76.04,
"step": 4330,
"token_acc": 0.831388096935139,
"train_speed(iter/s)": 0.02766
},
{
"epoch": 1.1207576443209,
"grad_norm": 1.3437058925628662,
"learning_rate": 3.6998849203466324e-06,
"loss": 0.5941734313964844,
"memory(GiB)": 76.04,
"step": 4335,
"token_acc": 0.8199184374329255,
"train_speed(iter/s)": 0.027661
},
{
"epoch": 1.12205055271834,
"grad_norm": 1.2628297805786133,
"learning_rate": 3.6967574554518237e-06,
"loss": 0.5422052383422852,
"memory(GiB)": 76.04,
"step": 4340,
"token_acc": 0.8556654985226414,
"train_speed(iter/s)": 0.02766
},
{
"epoch": 1.12334346111578,
"grad_norm": 1.053979516029358,
"learning_rate": 3.6936275590102133e-06,
"loss": 0.5253170967102051,
"memory(GiB)": 76.04,
"step": 4345,
"token_acc": 0.839918890776566,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.12463636951322,
"grad_norm": 1.1738144159317017,
"learning_rate": 3.6904952373810586e-06,
"loss": 0.5661196231842041,
"memory(GiB)": 76.04,
"step": 4350,
"token_acc": 0.8402832743178504,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.12592927791066,
"grad_norm": 1.5563236474990845,
"learning_rate": 3.6873604969285466e-06,
"loss": 0.5621729850769043,
"memory(GiB)": 76.04,
"step": 4355,
"token_acc": 0.8411411300726107,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1272221863081,
"grad_norm": 0.9622613787651062,
"learning_rate": 3.6842233440217757e-06,
"loss": 0.554353904724121,
"memory(GiB)": 76.04,
"step": 4360,
"token_acc": 0.845444059976932,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.12851509470554,
"grad_norm": 1.0929608345031738,
"learning_rate": 3.68108378503475e-06,
"loss": 0.5726329803466796,
"memory(GiB)": 76.04,
"step": 4365,
"token_acc": 0.8360975096088032,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1298080031029802,
"grad_norm": 1.5161771774291992,
"learning_rate": 3.677941826346358e-06,
"loss": 0.5386641502380372,
"memory(GiB)": 76.04,
"step": 4370,
"token_acc": 0.8529266398361929,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1311009115004202,
"grad_norm": 1.0926306247711182,
"learning_rate": 3.674797474340367e-06,
"loss": 0.567785120010376,
"memory(GiB)": 76.04,
"step": 4375,
"token_acc": 0.8242666666666667,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1323938198978603,
"grad_norm": 1.2848386764526367,
"learning_rate": 3.6716507354054044e-06,
"loss": 0.5367423534393311,
"memory(GiB)": 76.04,
"step": 4380,
"token_acc": 0.8228389830508475,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1336867282953003,
"grad_norm": 1.2967411279678345,
"learning_rate": 3.6685016159349483e-06,
"loss": 0.5374815940856934,
"memory(GiB)": 76.04,
"step": 4385,
"token_acc": 0.8508662193411426,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1349796366927403,
"grad_norm": 0.8757297396659851,
"learning_rate": 3.665350122327316e-06,
"loss": 0.5277114391326905,
"memory(GiB)": 76.04,
"step": 4390,
"token_acc": 0.8365678150894025,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1362725450901803,
"grad_norm": 1.1371816396713257,
"learning_rate": 3.662196260985646e-06,
"loss": 0.5421219825744629,
"memory(GiB)": 76.04,
"step": 4395,
"token_acc": 0.8501142154278686,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1375654534876203,
"grad_norm": 1.695295810699463,
"learning_rate": 3.6590400383178866e-06,
"loss": 0.5642148971557617,
"memory(GiB)": 76.04,
"step": 4400,
"token_acc": 0.8535144713526285,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1388583618850605,
"grad_norm": 1.0313785076141357,
"learning_rate": 3.6558814607367854e-06,
"loss": 0.5805719375610352,
"memory(GiB)": 76.04,
"step": 4405,
"token_acc": 0.8027235587834771,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1401512702825005,
"grad_norm": 4.604849815368652,
"learning_rate": 3.6527205346598754e-06,
"loss": 0.5551558017730713,
"memory(GiB)": 76.04,
"step": 4410,
"token_acc": 0.8510540083089706,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1414441786799405,
"grad_norm": 0.8938732147216797,
"learning_rate": 3.649557266509458e-06,
"loss": 0.5434865951538086,
"memory(GiB)": 76.04,
"step": 4415,
"token_acc": 0.8330908429571702,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1427370870773805,
"grad_norm": 1.1276806592941284,
"learning_rate": 3.646391662712598e-06,
"loss": 0.5468146324157714,
"memory(GiB)": 76.04,
"step": 4420,
"token_acc": 0.8307084785133566,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1440299954748205,
"grad_norm": 0.9285451173782349,
"learning_rate": 3.6432237297011016e-06,
"loss": 0.5583270072937012,
"memory(GiB)": 76.04,
"step": 4425,
"token_acc": 0.8463666452600899,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1453229038722608,
"grad_norm": 1.3203446865081787,
"learning_rate": 3.640053473911509e-06,
"loss": 0.546565055847168,
"memory(GiB)": 76.04,
"step": 4430,
"token_acc": 0.8101812275602667,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1466158122697008,
"grad_norm": 1.0341274738311768,
"learning_rate": 3.6368809017850796e-06,
"loss": 0.5599943161010742,
"memory(GiB)": 76.04,
"step": 4435,
"token_acc": 0.8432740304620504,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1479087206671408,
"grad_norm": 1.6448158025741577,
"learning_rate": 3.6337060197677803e-06,
"loss": 0.5772030830383301,
"memory(GiB)": 76.04,
"step": 4440,
"token_acc": 0.8330186134340437,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1492016290645808,
"grad_norm": 1.2664222717285156,
"learning_rate": 3.6305288343102686e-06,
"loss": 0.5556002616882324,
"memory(GiB)": 76.04,
"step": 4445,
"token_acc": 0.8380842848927955,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1504945374620208,
"grad_norm": 1.121952772140503,
"learning_rate": 3.6273493518678843e-06,
"loss": 0.5274020671844483,
"memory(GiB)": 76.04,
"step": 4450,
"token_acc": 0.8465437496040044,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1517874458594608,
"grad_norm": 2.5075345039367676,
"learning_rate": 3.624167578900633e-06,
"loss": 0.5526081085205078,
"memory(GiB)": 76.04,
"step": 4455,
"token_acc": 0.8559080459770115,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1530803542569008,
"grad_norm": 1.5982297658920288,
"learning_rate": 3.6209835218731753e-06,
"loss": 0.5586674213409424,
"memory(GiB)": 76.04,
"step": 4460,
"token_acc": 0.8431690299347288,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.154373262654341,
"grad_norm": 0.9923174977302551,
"learning_rate": 3.6177971872548116e-06,
"loss": 0.5380115032196044,
"memory(GiB)": 76.04,
"step": 4465,
"token_acc": 0.8524468348607622,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.155666171051781,
"grad_norm": 1.332160234451294,
"learning_rate": 3.6146085815194694e-06,
"loss": 0.5499836444854737,
"memory(GiB)": 76.04,
"step": 4470,
"token_acc": 0.8119310724416783,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.156959079449221,
"grad_norm": 1.0057822465896606,
"learning_rate": 3.6114177111456916e-06,
"loss": 0.5390474319458007,
"memory(GiB)": 76.04,
"step": 4475,
"token_acc": 0.8468805191604715,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.158251987846661,
"grad_norm": 1.1006532907485962,
"learning_rate": 3.608224582616622e-06,
"loss": 0.5385686874389648,
"memory(GiB)": 76.04,
"step": 4480,
"token_acc": 0.8392769471100201,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.159544896244101,
"grad_norm": 3.348762035369873,
"learning_rate": 3.6050292024199916e-06,
"loss": 0.5231637001037598,
"memory(GiB)": 76.04,
"step": 4485,
"token_acc": 0.8456874336819766,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.160837804641541,
"grad_norm": 1.867474913597107,
"learning_rate": 3.601831577048109e-06,
"loss": 0.5361900329589844,
"memory(GiB)": 76.04,
"step": 4490,
"token_acc": 0.848513334725994,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.162130713038981,
"grad_norm": 1.6171462535858154,
"learning_rate": 3.598631712997841e-06,
"loss": 0.5521645545959473,
"memory(GiB)": 76.04,
"step": 4495,
"token_acc": 0.8328944218338521,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1634236214364213,
"grad_norm": 1.5987505912780762,
"learning_rate": 3.5954296167706054e-06,
"loss": 0.5655074119567871,
"memory(GiB)": 76.04,
"step": 4500,
"token_acc": 0.8466187172830574,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1647165298338613,
"grad_norm": 0.9721993803977966,
"learning_rate": 3.5922252948723547e-06,
"loss": 0.5404928684234619,
"memory(GiB)": 76.04,
"step": 4505,
"token_acc": 0.8404960207292245,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.1660094382313013,
"grad_norm": 1.1576437950134277,
"learning_rate": 3.5890187538135616e-06,
"loss": 0.5581830024719239,
"memory(GiB)": 76.04,
"step": 4510,
"token_acc": 0.8301479321887485,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.1673023466287413,
"grad_norm": 1.2888849973678589,
"learning_rate": 3.5858100001092117e-06,
"loss": 0.5397047996520996,
"memory(GiB)": 76.04,
"step": 4515,
"token_acc": 0.8300223392372746,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.1685952550261813,
"grad_norm": 1.0602697134017944,
"learning_rate": 3.5825990402787815e-06,
"loss": 0.5373691558837891,
"memory(GiB)": 76.04,
"step": 4520,
"token_acc": 0.8421267268185757,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1698881634236213,
"grad_norm": 1.1042633056640625,
"learning_rate": 3.579385880846232e-06,
"loss": 0.5380208015441894,
"memory(GiB)": 76.04,
"step": 4525,
"token_acc": 0.8591904314733356,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1711810718210616,
"grad_norm": 1.2820953130722046,
"learning_rate": 3.576170528339996e-06,
"loss": 0.5534794807434082,
"memory(GiB)": 76.04,
"step": 4530,
"token_acc": 0.8538792049463793,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1724739802185016,
"grad_norm": 1.0828604698181152,
"learning_rate": 3.5729529892929577e-06,
"loss": 0.5525107383728027,
"memory(GiB)": 76.04,
"step": 4535,
"token_acc": 0.8491177281499862,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.1737668886159416,
"grad_norm": 1.0604639053344727,
"learning_rate": 3.569733270242446e-06,
"loss": 0.5319010734558105,
"memory(GiB)": 76.04,
"step": 4540,
"token_acc": 0.8434462262398613,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1750597970133816,
"grad_norm": 1.1371124982833862,
"learning_rate": 3.5665113777302184e-06,
"loss": 0.5360076904296875,
"memory(GiB)": 76.04,
"step": 4545,
"token_acc": 0.8431543594888123,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1763527054108216,
"grad_norm": 1.3616864681243896,
"learning_rate": 3.56328731830245e-06,
"loss": 0.5336400508880615,
"memory(GiB)": 76.04,
"step": 4550,
"token_acc": 0.8413301476636246,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1776456138082616,
"grad_norm": 1.6478408575057983,
"learning_rate": 3.5600610985097158e-06,
"loss": 0.5487207412719727,
"memory(GiB)": 76.04,
"step": 4555,
"token_acc": 0.8626700118843259,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1789385222057018,
"grad_norm": 1.0522022247314453,
"learning_rate": 3.5568327249069835e-06,
"loss": 0.5672080993652344,
"memory(GiB)": 76.04,
"step": 4560,
"token_acc": 0.8288274920616079,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1802314306031418,
"grad_norm": 1.0040467977523804,
"learning_rate": 3.553602204053593e-06,
"loss": 0.5410587787628174,
"memory(GiB)": 76.04,
"step": 4565,
"token_acc": 0.8245939675174014,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.1815243390005818,
"grad_norm": 1.081748366355896,
"learning_rate": 3.550369542513252e-06,
"loss": 0.5334537982940674,
"memory(GiB)": 76.04,
"step": 4570,
"token_acc": 0.8730332603067118,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1828172473980219,
"grad_norm": 2.2112412452697754,
"learning_rate": 3.5471347468540124e-06,
"loss": 0.5522329330444335,
"memory(GiB)": 76.04,
"step": 4575,
"token_acc": 0.8477234082750803,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1841101557954619,
"grad_norm": 0.9928748607635498,
"learning_rate": 3.5438978236482656e-06,
"loss": 0.5604439735412597,
"memory(GiB)": 76.04,
"step": 4580,
"token_acc": 0.7849382585192644,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1854030641929019,
"grad_norm": 1.1116896867752075,
"learning_rate": 3.540658779472723e-06,
"loss": 0.5413738250732422,
"memory(GiB)": 76.04,
"step": 4585,
"token_acc": 0.8287749204588575,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1866959725903419,
"grad_norm": 1.1231666803359985,
"learning_rate": 3.5374176209084087e-06,
"loss": 0.5632248401641846,
"memory(GiB)": 76.04,
"step": 4590,
"token_acc": 0.8709531013615733,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.187988880987782,
"grad_norm": 1.3912053108215332,
"learning_rate": 3.5341743545406403e-06,
"loss": 0.5327963829040527,
"memory(GiB)": 76.04,
"step": 4595,
"token_acc": 0.834390750074118,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.189281789385222,
"grad_norm": 0.8718860149383545,
"learning_rate": 3.530928986959019e-06,
"loss": 0.5319995880126953,
"memory(GiB)": 76.04,
"step": 4600,
"token_acc": 0.8448853130778072,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1905746977826621,
"grad_norm": 1.055916666984558,
"learning_rate": 3.5276815247574148e-06,
"loss": 0.5589988708496094,
"memory(GiB)": 76.04,
"step": 4605,
"token_acc": 0.8575937187283504,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1918676061801021,
"grad_norm": 0.9413480758666992,
"learning_rate": 3.5244319745339524e-06,
"loss": 0.5528499126434326,
"memory(GiB)": 76.04,
"step": 4610,
"token_acc": 0.8506312722563937,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.1931605145775421,
"grad_norm": 1.0575560331344604,
"learning_rate": 3.5211803428910015e-06,
"loss": 0.513238525390625,
"memory(GiB)": 76.04,
"step": 4615,
"token_acc": 0.8514719699342311,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1944534229749821,
"grad_norm": 1.3240203857421875,
"learning_rate": 3.5179266364351584e-06,
"loss": 0.522664737701416,
"memory(GiB)": 76.04,
"step": 4620,
"token_acc": 0.87151792998951,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.1957463313724221,
"grad_norm": 1.1790480613708496,
"learning_rate": 3.5146708617772362e-06,
"loss": 0.5358052253723145,
"memory(GiB)": 76.04,
"step": 4625,
"token_acc": 0.835999462293319,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.1970392397698624,
"grad_norm": 0.9325648546218872,
"learning_rate": 3.51141302553225e-06,
"loss": 0.5524285316467286,
"memory(GiB)": 76.04,
"step": 4630,
"token_acc": 0.805278226398473,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.1983321481673024,
"grad_norm": 2.2591135501861572,
"learning_rate": 3.508153134319404e-06,
"loss": 0.5479226112365723,
"memory(GiB)": 76.04,
"step": 4635,
"token_acc": 0.8172845227062094,
"train_speed(iter/s)": 0.027654
},
{
"epoch": 1.1996250565647424,
"grad_norm": 0.9014770984649658,
"learning_rate": 3.5048911947620774e-06,
"loss": 0.5491894245147705,
"memory(GiB)": 76.04,
"step": 4640,
"token_acc": 0.8255224825839139,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2009179649621824,
"grad_norm": 0.9961432814598083,
"learning_rate": 3.5016272134878084e-06,
"loss": 0.5200064182281494,
"memory(GiB)": 76.04,
"step": 4645,
"token_acc": 0.8566537085189094,
"train_speed(iter/s)": 0.027654
},
{
"epoch": 1.2022108733596224,
"grad_norm": 0.9640424847602844,
"learning_rate": 3.4983611971282882e-06,
"loss": 0.5232643604278564,
"memory(GiB)": 76.04,
"step": 4650,
"token_acc": 0.8389979490184588,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2035037817570626,
"grad_norm": 14.804332733154297,
"learning_rate": 3.49509315231934e-06,
"loss": 0.5426907062530517,
"memory(GiB)": 76.04,
"step": 4655,
"token_acc": 0.8573487661061368,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2047966901545026,
"grad_norm": 1.2045928239822388,
"learning_rate": 3.4918230857009083e-06,
"loss": 0.5525260448455811,
"memory(GiB)": 76.04,
"step": 4660,
"token_acc": 0.8139168327847573,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2060895985519426,
"grad_norm": 6.34891414642334,
"learning_rate": 3.488551003917048e-06,
"loss": 0.5549496650695801,
"memory(GiB)": 76.04,
"step": 4665,
"token_acc": 0.8387813757424794,
"train_speed(iter/s)": 0.027654
},
{
"epoch": 1.2073825069493827,
"grad_norm": 2.3981356620788574,
"learning_rate": 3.4852769136159047e-06,
"loss": 0.536187744140625,
"memory(GiB)": 76.04,
"step": 4670,
"token_acc": 0.8524989411266413,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2086754153468227,
"grad_norm": 1.0487481355667114,
"learning_rate": 3.482000821449707e-06,
"loss": 0.5361638069152832,
"memory(GiB)": 76.04,
"step": 4675,
"token_acc": 0.8482098061573546,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2099683237442627,
"grad_norm": 1.1350358724594116,
"learning_rate": 3.4787227340747514e-06,
"loss": 0.5472620010375977,
"memory(GiB)": 76.04,
"step": 4680,
"token_acc": 0.8297029702970297,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2112612321417027,
"grad_norm": 1.4464890956878662,
"learning_rate": 3.4754426581513866e-06,
"loss": 0.5401841163635254,
"memory(GiB)": 76.04,
"step": 4685,
"token_acc": 0.8511260213910848,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.212554140539143,
"grad_norm": 1.922498106956482,
"learning_rate": 3.4721606003440023e-06,
"loss": 0.5158808708190918,
"memory(GiB)": 76.04,
"step": 4690,
"token_acc": 0.8568056902683479,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.213847048936583,
"grad_norm": 0.9439219236373901,
"learning_rate": 3.4688765673210155e-06,
"loss": 0.5801658630371094,
"memory(GiB)": 76.04,
"step": 4695,
"token_acc": 0.8500309427215843,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.215139957334023,
"grad_norm": 1.2153284549713135,
"learning_rate": 3.465590565754856e-06,
"loss": 0.5326606273651123,
"memory(GiB)": 76.04,
"step": 4700,
"token_acc": 0.8437681640787179,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.216432865731463,
"grad_norm": 1.3219401836395264,
"learning_rate": 3.462302602321953e-06,
"loss": 0.5341041088104248,
"memory(GiB)": 76.04,
"step": 4705,
"token_acc": 0.8587982960469481,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.217725774128903,
"grad_norm": 0.9187490940093994,
"learning_rate": 3.4590126837027216e-06,
"loss": 0.5361604690551758,
"memory(GiB)": 76.04,
"step": 4710,
"token_acc": 0.8289933797317942,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.219018682526343,
"grad_norm": 1.189947247505188,
"learning_rate": 3.4557208165815503e-06,
"loss": 0.5369776725769043,
"memory(GiB)": 76.04,
"step": 4715,
"token_acc": 0.784606727522821,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.220311590923783,
"grad_norm": 0.8782621026039124,
"learning_rate": 3.4524270076467846e-06,
"loss": 0.5394928455352783,
"memory(GiB)": 76.04,
"step": 4720,
"token_acc": 0.8381009137862535,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2216044993212232,
"grad_norm": 1.1077133417129517,
"learning_rate": 3.449131263590718e-06,
"loss": 0.5199668884277344,
"memory(GiB)": 76.04,
"step": 4725,
"token_acc": 0.8437195256220705,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2228974077186632,
"grad_norm": 2.1963064670562744,
"learning_rate": 3.445833591109574e-06,
"loss": 0.533887529373169,
"memory(GiB)": 76.04,
"step": 4730,
"token_acc": 0.8215962441314554,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2241903161161032,
"grad_norm": 1.0866763591766357,
"learning_rate": 3.4425339969034955e-06,
"loss": 0.5230364322662353,
"memory(GiB)": 76.04,
"step": 4735,
"token_acc": 0.8641819515774027,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2254832245135432,
"grad_norm": 1.1297239065170288,
"learning_rate": 3.439232487676527e-06,
"loss": 0.5545130729675293,
"memory(GiB)": 76.04,
"step": 4740,
"token_acc": 0.8013548084891723,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2267761329109832,
"grad_norm": 1.2349060773849487,
"learning_rate": 3.435929070136609e-06,
"loss": 0.5242255210876465,
"memory(GiB)": 76.04,
"step": 4745,
"token_acc": 0.8695360580716427,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2280690413084232,
"grad_norm": 0.9875677227973938,
"learning_rate": 3.4326237509955533e-06,
"loss": 0.5407393932342529,
"memory(GiB)": 76.04,
"step": 4750,
"token_acc": 0.8353607552258935,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2293619497058634,
"grad_norm": 1.4724373817443848,
"learning_rate": 3.4293165369690406e-06,
"loss": 0.5200931549072265,
"memory(GiB)": 76.04,
"step": 4755,
"token_acc": 0.8424033399891088,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2306548581033034,
"grad_norm": 0.8519977331161499,
"learning_rate": 3.4260074347765975e-06,
"loss": 0.5357259750366211,
"memory(GiB)": 76.04,
"step": 4760,
"token_acc": 0.8267593859249126,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2319477665007434,
"grad_norm": 0.9893440008163452,
"learning_rate": 3.42269645114159e-06,
"loss": 0.5508286952972412,
"memory(GiB)": 76.04,
"step": 4765,
"token_acc": 0.8041002277904328,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2332406748981835,
"grad_norm": 1.4743452072143555,
"learning_rate": 3.419383592791205e-06,
"loss": 0.5639371871948242,
"memory(GiB)": 76.04,
"step": 4770,
"token_acc": 0.8497330282227308,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2345335832956235,
"grad_norm": 1.2142781019210815,
"learning_rate": 3.4160688664564382e-06,
"loss": 0.5326876640319824,
"memory(GiB)": 76.04,
"step": 4775,
"token_acc": 0.8382480707313333,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2358264916930635,
"grad_norm": 8.9053955078125,
"learning_rate": 3.4127522788720836e-06,
"loss": 0.5383922100067139,
"memory(GiB)": 76.04,
"step": 4780,
"token_acc": 0.8079113088728835,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2371194000905037,
"grad_norm": 1.0299571752548218,
"learning_rate": 3.4094338367767117e-06,
"loss": 0.5383823394775391,
"memory(GiB)": 76.04,
"step": 4785,
"token_acc": 0.8180522825669974,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2384123084879437,
"grad_norm": 2.690765619277954,
"learning_rate": 3.4061135469126654e-06,
"loss": 0.5509030818939209,
"memory(GiB)": 76.04,
"step": 4790,
"token_acc": 0.8323850658249927,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2397052168853837,
"grad_norm": 1.1407406330108643,
"learning_rate": 3.40279141602604e-06,
"loss": 0.5402188777923584,
"memory(GiB)": 76.04,
"step": 4795,
"token_acc": 0.8609082248332804,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2409981252828237,
"grad_norm": 0.979908287525177,
"learning_rate": 3.3994674508666715e-06,
"loss": 0.5451946258544922,
"memory(GiB)": 76.04,
"step": 4800,
"token_acc": 0.8271758253130498,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2422910336802637,
"grad_norm": 1.4777721166610718,
"learning_rate": 3.3961416581881236e-06,
"loss": 0.566465187072754,
"memory(GiB)": 76.04,
"step": 4805,
"token_acc": 0.8578219364893824,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2435839420777037,
"grad_norm": 1.3482059240341187,
"learning_rate": 3.3928140447476722e-06,
"loss": 0.5285268783569336,
"memory(GiB)": 76.04,
"step": 4810,
"token_acc": 0.8426216288863005,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2448768504751437,
"grad_norm": 1.7287280559539795,
"learning_rate": 3.3894846173062917e-06,
"loss": 0.5343065738677979,
"memory(GiB)": 76.04,
"step": 4815,
"token_acc": 0.8432345137847502,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.246169758872584,
"grad_norm": 2.6334922313690186,
"learning_rate": 3.386153382628644e-06,
"loss": 0.5239715576171875,
"memory(GiB)": 76.04,
"step": 4820,
"token_acc": 0.8302545572652349,
"train_speed(iter/s)": 0.027659
},
{
"epoch": 1.247462667270024,
"grad_norm": 1.018120527267456,
"learning_rate": 3.3828203474830623e-06,
"loss": 0.5379975318908692,
"memory(GiB)": 76.04,
"step": 4825,
"token_acc": 0.8100699300699301,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.248755575667464,
"grad_norm": 1.9307032823562622,
"learning_rate": 3.3794855186415374e-06,
"loss": 0.5401200771331787,
"memory(GiB)": 76.04,
"step": 4830,
"token_acc": 0.8570304677442426,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.250048484064904,
"grad_norm": 0.8735835552215576,
"learning_rate": 3.3761489028797063e-06,
"loss": 0.5682656288146972,
"memory(GiB)": 76.04,
"step": 4835,
"token_acc": 0.8328065512535019,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.251341392462344,
"grad_norm": 0.9113880395889282,
"learning_rate": 3.372810506976833e-06,
"loss": 0.519595718383789,
"memory(GiB)": 76.04,
"step": 4840,
"token_acc": 0.8493732447427906,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2526343008597842,
"grad_norm": 1.123384952545166,
"learning_rate": 3.369470337715802e-06,
"loss": 0.5394314765930176,
"memory(GiB)": 76.04,
"step": 4845,
"token_acc": 0.8533837894922116,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.253927209257224,
"grad_norm": 2.7480571269989014,
"learning_rate": 3.3661284018830986e-06,
"loss": 0.5219066619873047,
"memory(GiB)": 76.04,
"step": 4850,
"token_acc": 0.8398635428686099,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2552201176546642,
"grad_norm": 1.892354130744934,
"learning_rate": 3.3627847062687996e-06,
"loss": 0.5399574756622314,
"memory(GiB)": 76.04,
"step": 4855,
"token_acc": 0.8358763125833962,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2565130260521042,
"grad_norm": 1.2802281379699707,
"learning_rate": 3.359439257666554e-06,
"loss": 0.5371671676635742,
"memory(GiB)": 76.04,
"step": 4860,
"token_acc": 0.8427890861844954,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2578059344495442,
"grad_norm": 0.9374321699142456,
"learning_rate": 3.356092062873576e-06,
"loss": 0.5454726219177246,
"memory(GiB)": 76.04,
"step": 4865,
"token_acc": 0.8636550683553564,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2590988428469843,
"grad_norm": 0.8962191939353943,
"learning_rate": 3.3527431286906248e-06,
"loss": 0.5191185951232911,
"memory(GiB)": 76.04,
"step": 4870,
"token_acc": 0.8448458652748329,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2603917512444243,
"grad_norm": 1.248224139213562,
"learning_rate": 3.3493924619219964e-06,
"loss": 0.5477604866027832,
"memory(GiB)": 76.04,
"step": 4875,
"token_acc": 0.8294104944936299,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2616846596418645,
"grad_norm": 0.9002890586853027,
"learning_rate": 3.3460400693755047e-06,
"loss": 0.5323681831359863,
"memory(GiB)": 76.04,
"step": 4880,
"token_acc": 0.8577657555815738,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2629775680393043,
"grad_norm": 1.059833288192749,
"learning_rate": 3.3426859578624705e-06,
"loss": 0.5649502754211426,
"memory(GiB)": 76.04,
"step": 4885,
"token_acc": 0.7916213275299239,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2642704764367445,
"grad_norm": 1.3469425439834595,
"learning_rate": 3.339330134197708e-06,
"loss": 0.5313740730285644,
"memory(GiB)": 76.04,
"step": 4890,
"token_acc": 0.8403429238296153,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2655633848341845,
"grad_norm": 1.0573686361312866,
"learning_rate": 3.3359726051995097e-06,
"loss": 0.5383338451385498,
"memory(GiB)": 76.04,
"step": 4895,
"token_acc": 0.8299968790405279,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2668562932316245,
"grad_norm": 1.5502042770385742,
"learning_rate": 3.332613377689632e-06,
"loss": 0.5520769119262695,
"memory(GiB)": 76.04,
"step": 4900,
"token_acc": 0.8267734765697351,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2681492016290645,
"grad_norm": 1.3487634658813477,
"learning_rate": 3.3292524584932846e-06,
"loss": 0.5057527542114257,
"memory(GiB)": 76.04,
"step": 4905,
"token_acc": 0.8129584979223311,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2694421100265045,
"grad_norm": 1.016811490058899,
"learning_rate": 3.325889854439112e-06,
"loss": 0.5458771228790283,
"memory(GiB)": 76.04,
"step": 4910,
"token_acc": 0.8184902798291486,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2707350184239448,
"grad_norm": 1.8867390155792236,
"learning_rate": 3.322525572359183e-06,
"loss": 0.5553860664367676,
"memory(GiB)": 76.04,
"step": 4915,
"token_acc": 0.8386983751587614,
"train_speed(iter/s)": 0.027654
},
{
"epoch": 1.2720279268213848,
"grad_norm": 1.2835884094238281,
"learning_rate": 3.3191596190889762e-06,
"loss": 0.5246952056884766,
"memory(GiB)": 76.04,
"step": 4920,
"token_acc": 0.8337373292199207,
"train_speed(iter/s)": 0.027654
},
{
"epoch": 1.2733208352188248,
"grad_norm": 1.0106348991394043,
"learning_rate": 3.3157920014673646e-06,
"loss": 0.5335243225097657,
"memory(GiB)": 76.04,
"step": 4925,
"token_acc": 0.8471834913552705,
"train_speed(iter/s)": 0.027654
},
{
"epoch": 1.2746137436162648,
"grad_norm": 1.2083282470703125,
"learning_rate": 3.3124227263366036e-06,
"loss": 0.557880973815918,
"memory(GiB)": 76.04,
"step": 4930,
"token_acc": 0.8243945635852616,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2759066520137048,
"grad_norm": 1.0607614517211914,
"learning_rate": 3.3090518005423157e-06,
"loss": 0.5547267436981201,
"memory(GiB)": 76.04,
"step": 4935,
"token_acc": 0.8197203446674578,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2771995604111448,
"grad_norm": 1.0059987306594849,
"learning_rate": 3.305679230933478e-06,
"loss": 0.5478557586669922,
"memory(GiB)": 76.04,
"step": 4940,
"token_acc": 0.7879940655076654,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2784924688085848,
"grad_norm": 0.9902651906013489,
"learning_rate": 3.3023050243624066e-06,
"loss": 0.5528521537780762,
"memory(GiB)": 76.04,
"step": 4945,
"token_acc": 0.832831287809007,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.279785377206025,
"grad_norm": 1.2956874370574951,
"learning_rate": 3.298929187684744e-06,
"loss": 0.5243937969207764,
"memory(GiB)": 76.04,
"step": 4950,
"token_acc": 0.8375224024129038,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.281078285603465,
"grad_norm": 0.8811606764793396,
"learning_rate": 3.2955517277594453e-06,
"loss": 0.5211551666259766,
"memory(GiB)": 76.04,
"step": 4955,
"token_acc": 0.8267743146826887,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.282371194000905,
"grad_norm": 0.9188509583473206,
"learning_rate": 3.292172651448761e-06,
"loss": 0.5098612785339356,
"memory(GiB)": 76.04,
"step": 4960,
"token_acc": 0.8693684341651787,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.283664102398345,
"grad_norm": 0.9744674563407898,
"learning_rate": 3.2887919656182304e-06,
"loss": 0.5251672744750977,
"memory(GiB)": 76.04,
"step": 4965,
"token_acc": 0.8267029592406476,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.284957010795785,
"grad_norm": 0.9584162831306458,
"learning_rate": 3.2854096771366584e-06,
"loss": 0.5332806587219239,
"memory(GiB)": 76.04,
"step": 4970,
"token_acc": 0.8322600222529418,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2862499191932253,
"grad_norm": 0.8617119789123535,
"learning_rate": 3.28202579287611e-06,
"loss": 0.5289664745330811,
"memory(GiB)": 76.04,
"step": 4975,
"token_acc": 0.829112426035503,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.287542827590665,
"grad_norm": 1.0189121961593628,
"learning_rate": 3.278640319711889e-06,
"loss": 0.5311687469482422,
"memory(GiB)": 76.04,
"step": 4980,
"token_acc": 0.8320498040119898,
"train_speed(iter/s)": 0.027658
},
{
"epoch": 1.2888357359881053,
"grad_norm": 0.9284194707870483,
"learning_rate": 3.275253264522529e-06,
"loss": 0.5279128074645996,
"memory(GiB)": 76.04,
"step": 4985,
"token_acc": 0.8646108400841427,
"train_speed(iter/s)": 0.027656
},
{
"epoch": 1.2901286443855453,
"grad_norm": 1.1427520513534546,
"learning_rate": 3.2718646341897796e-06,
"loss": 0.5510351181030273,
"memory(GiB)": 76.04,
"step": 4990,
"token_acc": 0.8475239880886732,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2914215527829853,
"grad_norm": 0.9549011588096619,
"learning_rate": 3.268474435598587e-06,
"loss": 0.5165416240692139,
"memory(GiB)": 76.04,
"step": 4995,
"token_acc": 0.856951293364478,
"train_speed(iter/s)": 0.027655
},
{
"epoch": 1.2927144611804253,
"grad_norm": 1.217895746231079,
"learning_rate": 3.265082675637087e-06,
"loss": 0.535146427154541,
"memory(GiB)": 76.04,
"step": 5000,
"token_acc": 0.8132745913451641,
"train_speed(iter/s)": 0.027657
},
{
"epoch": 1.2940073695778653,
"grad_norm": 0.9800029993057251,
"learning_rate": 3.2616893611965865e-06,
"loss": 0.5271368503570557,
"memory(GiB)": 76.04,
"step": 5005,
"token_acc": 0.8222583265637693,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.2953002779753056,
"grad_norm": 0.9374005198478699,
"learning_rate": 3.258294499171552e-06,
"loss": 0.5365757942199707,
"memory(GiB)": 76.04,
"step": 5010,
"token_acc": 0.8411453966124434,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.2965931863727456,
"grad_norm": 1.7360183000564575,
"learning_rate": 3.254898096459591e-06,
"loss": 0.5575047492980957,
"memory(GiB)": 76.04,
"step": 5015,
"token_acc": 0.8461703497103625,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.2978860947701856,
"grad_norm": 1.3810652494430542,
"learning_rate": 3.251500159961446e-06,
"loss": 0.5436039924621582,
"memory(GiB)": 76.04,
"step": 5020,
"token_acc": 0.8282478766907833,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.2991790031676256,
"grad_norm": 1.2262240648269653,
"learning_rate": 3.2481006965809713e-06,
"loss": 0.5245812892913818,
"memory(GiB)": 76.04,
"step": 5025,
"token_acc": 0.8353918706490007,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3004719115650656,
"grad_norm": 1.2243238687515259,
"learning_rate": 3.2446997132251267e-06,
"loss": 0.5234585762023926,
"memory(GiB)": 76.04,
"step": 5030,
"token_acc": 0.8347611572101368,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3017648199625056,
"grad_norm": 0.8756423592567444,
"learning_rate": 3.241297216803959e-06,
"loss": 0.5213943004608155,
"memory(GiB)": 76.04,
"step": 5035,
"token_acc": 0.8430144773070433,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3030577283599456,
"grad_norm": 1.070388913154602,
"learning_rate": 3.2378932142305896e-06,
"loss": 0.5314732551574707,
"memory(GiB)": 76.04,
"step": 5040,
"token_acc": 0.8426463389048185,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3043506367573858,
"grad_norm": 1.6048212051391602,
"learning_rate": 3.2344877124211986e-06,
"loss": 0.5154043674468994,
"memory(GiB)": 76.04,
"step": 5045,
"token_acc": 0.836419641239355,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3056435451548258,
"grad_norm": 0.8205667734146118,
"learning_rate": 3.2310807182950157e-06,
"loss": 0.5318900585174561,
"memory(GiB)": 76.04,
"step": 5050,
"token_acc": 0.839140860160196,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.3069364535522658,
"grad_norm": 2.0000104904174805,
"learning_rate": 3.2276722387742986e-06,
"loss": 0.5485349178314209,
"memory(GiB)": 76.04,
"step": 5055,
"token_acc": 0.832,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.3082293619497058,
"grad_norm": 1.5011872053146362,
"learning_rate": 3.2242622807843256e-06,
"loss": 0.5459944725036621,
"memory(GiB)": 76.04,
"step": 5060,
"token_acc": 0.8583078032077852,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.3095222703471459,
"grad_norm": 1.206041932106018,
"learning_rate": 3.2208508512533777e-06,
"loss": 0.5489155769348144,
"memory(GiB)": 76.04,
"step": 5065,
"token_acc": 0.8435306288332225,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.310815178744586,
"grad_norm": 0.9339408874511719,
"learning_rate": 3.2174379571127255e-06,
"loss": 0.5105900764465332,
"memory(GiB)": 76.04,
"step": 5070,
"token_acc": 0.8589012405348799,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.3121080871420259,
"grad_norm": 0.9643262624740601,
"learning_rate": 3.214023605296618e-06,
"loss": 0.5285213947296142,
"memory(GiB)": 76.04,
"step": 5075,
"token_acc": 0.8527266411948593,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.313400995539466,
"grad_norm": 0.9289806485176086,
"learning_rate": 3.2106078027422617e-06,
"loss": 0.546751070022583,
"memory(GiB)": 76.04,
"step": 5080,
"token_acc": 0.8559926386013342,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.314693903936906,
"grad_norm": 0.9898268580436707,
"learning_rate": 3.2071905563898147e-06,
"loss": 0.5333544731140136,
"memory(GiB)": 76.04,
"step": 5085,
"token_acc": 0.8701183055590892,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.3159868123343461,
"grad_norm": 1.0481353998184204,
"learning_rate": 3.2037718731823654e-06,
"loss": 0.5345610618591309,
"memory(GiB)": 76.04,
"step": 5090,
"token_acc": 0.8575830948712304,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.3172797207317861,
"grad_norm": 1.0228970050811768,
"learning_rate": 3.200351760065924e-06,
"loss": 0.5261023998260498,
"memory(GiB)": 76.04,
"step": 5095,
"token_acc": 0.8451571927596062,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.3185726291292261,
"grad_norm": 1.2880408763885498,
"learning_rate": 3.196930223989404e-06,
"loss": 0.5189993858337403,
"memory(GiB)": 76.04,
"step": 5100,
"token_acc": 0.8433385103653184,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3198655375266664,
"grad_norm": 1.5179208517074585,
"learning_rate": 3.193507271904612e-06,
"loss": 0.5425951957702637,
"memory(GiB)": 76.04,
"step": 5105,
"token_acc": 0.8408305921052631,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3211584459241061,
"grad_norm": 1.4803640842437744,
"learning_rate": 3.1900829107662296e-06,
"loss": 0.5434229373931885,
"memory(GiB)": 76.04,
"step": 5110,
"token_acc": 0.8403665573028624,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3224513543215464,
"grad_norm": 2.6232545375823975,
"learning_rate": 3.186657147531802e-06,
"loss": 0.5110975742340088,
"memory(GiB)": 76.04,
"step": 5115,
"token_acc": 0.8574821852731591,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3237442627189864,
"grad_norm": 0.9406218528747559,
"learning_rate": 3.1832299891617245e-06,
"loss": 0.5422788143157959,
"memory(GiB)": 76.04,
"step": 5120,
"token_acc": 0.8556760308854937,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3250371711164264,
"grad_norm": 1.6263813972473145,
"learning_rate": 3.179801442619225e-06,
"loss": 0.5206321716308594,
"memory(GiB)": 76.04,
"step": 5125,
"token_acc": 0.8325710236423371,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3263300795138664,
"grad_norm": 1.0195496082305908,
"learning_rate": 3.176371514870354e-06,
"loss": 0.5497357368469238,
"memory(GiB)": 76.04,
"step": 5130,
"token_acc": 0.8564178043952697,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3276229879113064,
"grad_norm": 1.1096144914627075,
"learning_rate": 3.172940212883965e-06,
"loss": 0.5373088836669921,
"memory(GiB)": 76.04,
"step": 5135,
"token_acc": 0.8881567463780764,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3289158963087466,
"grad_norm": 1.392109751701355,
"learning_rate": 3.1695075436317073e-06,
"loss": 0.5438241004943848,
"memory(GiB)": 76.04,
"step": 5140,
"token_acc": 0.8368131622479545,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.3302088047061866,
"grad_norm": 1.4390590190887451,
"learning_rate": 3.166073514088006e-06,
"loss": 0.5391247272491455,
"memory(GiB)": 76.04,
"step": 5145,
"token_acc": 0.8375243285325029,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.3315017131036266,
"grad_norm": 1.352954387664795,
"learning_rate": 3.1626381312300516e-06,
"loss": 0.5338696479797364,
"memory(GiB)": 76.04,
"step": 5150,
"token_acc": 0.847240778978906,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.3327946215010666,
"grad_norm": 3.8461802005767822,
"learning_rate": 3.1592014020377815e-06,
"loss": 0.5344533920288086,
"memory(GiB)": 76.04,
"step": 5155,
"token_acc": 0.8609710100434191,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.3340875298985067,
"grad_norm": 3.0712478160858154,
"learning_rate": 3.1557633334938712e-06,
"loss": 0.5250087261199952,
"memory(GiB)": 76.04,
"step": 5160,
"token_acc": 0.8473618090452262,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.3353804382959467,
"grad_norm": 0.8653470873832703,
"learning_rate": 3.1523239325837174e-06,
"loss": 0.5317577362060547,
"memory(GiB)": 76.04,
"step": 5165,
"token_acc": 0.8672264497507216,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.3366733466933867,
"grad_norm": 1.0058352947235107,
"learning_rate": 3.1488832062954213e-06,
"loss": 0.5124196529388427,
"memory(GiB)": 76.04,
"step": 5170,
"token_acc": 0.8276955161626695,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.337966255090827,
"grad_norm": 1.0088118314743042,
"learning_rate": 3.145441161619779e-06,
"loss": 0.5366281509399414,
"memory(GiB)": 76.04,
"step": 5175,
"token_acc": 0.8506988094357761,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.339259163488267,
"grad_norm": 0.955906331539154,
"learning_rate": 3.1419978055502666e-06,
"loss": 0.5448675155639648,
"memory(GiB)": 76.04,
"step": 5180,
"token_acc": 0.79640928536363,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.340552071885707,
"grad_norm": 1.171993374824524,
"learning_rate": 3.138553145083022e-06,
"loss": 0.5282750129699707,
"memory(GiB)": 76.04,
"step": 5185,
"token_acc": 0.8676538311665308,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.341844980283147,
"grad_norm": 1.1022040843963623,
"learning_rate": 3.135107187216834e-06,
"loss": 0.534688663482666,
"memory(GiB)": 76.04,
"step": 5190,
"token_acc": 0.8357040716489802,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.343137888680587,
"grad_norm": 2.8540091514587402,
"learning_rate": 3.1316599389531282e-06,
"loss": 0.5261801719665528,
"memory(GiB)": 76.04,
"step": 5195,
"token_acc": 0.8275160272718022,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.3444307970780272,
"grad_norm": 1.3484967947006226,
"learning_rate": 3.128211407295951e-06,
"loss": 0.5323428630828857,
"memory(GiB)": 76.04,
"step": 5200,
"token_acc": 0.8412020736880043,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.345723705475467,
"grad_norm": 1.8615549802780151,
"learning_rate": 3.1247615992519587e-06,
"loss": 0.5560379981994629,
"memory(GiB)": 76.04,
"step": 5205,
"token_acc": 0.8468941382327209,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3470166138729072,
"grad_norm": 2.5111136436462402,
"learning_rate": 3.1213105218303972e-06,
"loss": 0.534544563293457,
"memory(GiB)": 76.04,
"step": 5210,
"token_acc": 0.8330609679446889,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3483095222703472,
"grad_norm": 0.9236442446708679,
"learning_rate": 3.1178581820430957e-06,
"loss": 0.5287897109985351,
"memory(GiB)": 76.04,
"step": 5215,
"token_acc": 0.8456866092341895,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3496024306677872,
"grad_norm": 1.2912098169326782,
"learning_rate": 3.1144045869044437e-06,
"loss": 0.5496071815490723,
"memory(GiB)": 76.04,
"step": 5220,
"token_acc": 0.8341737438075018,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3508953390652272,
"grad_norm": 0.8818367123603821,
"learning_rate": 3.1109497434313857e-06,
"loss": 0.5452832698822021,
"memory(GiB)": 76.04,
"step": 5225,
"token_acc": 0.8284331373254931,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3521882474626672,
"grad_norm": 1.3674169778823853,
"learning_rate": 3.1074936586433994e-06,
"loss": 0.537296199798584,
"memory(GiB)": 76.04,
"step": 5230,
"token_acc": 0.8586094734702175,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.3534811558601074,
"grad_norm": 1.324703335762024,
"learning_rate": 3.1040363395624854e-06,
"loss": 0.49640579223632814,
"memory(GiB)": 76.04,
"step": 5235,
"token_acc": 0.8524991832734401,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3547740642575474,
"grad_norm": 0.9162821173667908,
"learning_rate": 3.1005777932131535e-06,
"loss": 0.5111923217773438,
"memory(GiB)": 76.04,
"step": 5240,
"token_acc": 0.8507638072855465,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.3560669726549874,
"grad_norm": 1.0688142776489258,
"learning_rate": 3.097118026622405e-06,
"loss": 0.5468463897705078,
"memory(GiB)": 76.04,
"step": 5245,
"token_acc": 0.832568012476174,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.3573598810524274,
"grad_norm": 1.0131880044937134,
"learning_rate": 3.093657046819722e-06,
"loss": 0.4972386360168457,
"memory(GiB)": 76.04,
"step": 5250,
"token_acc": 0.8424430280275911,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3586527894498674,
"grad_norm": 1.027860164642334,
"learning_rate": 3.0901948608370503e-06,
"loss": 0.5250637054443359,
"memory(GiB)": 76.04,
"step": 5255,
"token_acc": 0.8416179528424026,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3599456978473075,
"grad_norm": 0.9048755764961243,
"learning_rate": 3.086731475708788e-06,
"loss": 0.5370029449462891,
"memory(GiB)": 76.04,
"step": 5260,
"token_acc": 0.8346947027901335,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.3612386062447475,
"grad_norm": 2.5965473651885986,
"learning_rate": 3.0832668984717675e-06,
"loss": 0.5500319480895997,
"memory(GiB)": 76.04,
"step": 5265,
"token_acc": 0.8267131242740999,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.3625315146421877,
"grad_norm": 1.969552993774414,
"learning_rate": 3.079801136165246e-06,
"loss": 0.5336560726165771,
"memory(GiB)": 76.04,
"step": 5270,
"token_acc": 0.8175245806824755,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.3638244230396277,
"grad_norm": 1.069265604019165,
"learning_rate": 3.0763341958308853e-06,
"loss": 0.5203034400939941,
"memory(GiB)": 76.04,
"step": 5275,
"token_acc": 0.8421536276680172,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.3651173314370677,
"grad_norm": 1.2012121677398682,
"learning_rate": 3.072866084512743e-06,
"loss": 0.5232099533081055,
"memory(GiB)": 76.04,
"step": 5280,
"token_acc": 0.8650010324179228,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.3664102398345077,
"grad_norm": 1.19133722782135,
"learning_rate": 3.069396809257256e-06,
"loss": 0.5404583930969238,
"memory(GiB)": 76.04,
"step": 5285,
"token_acc": 0.8128638853481241,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.3677031482319477,
"grad_norm": 1.1554311513900757,
"learning_rate": 3.065926377113224e-06,
"loss": 0.5264840126037598,
"memory(GiB)": 76.04,
"step": 5290,
"token_acc": 0.8585351063368996,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.3689960566293877,
"grad_norm": 1.4480676651000977,
"learning_rate": 3.0624547951318e-06,
"loss": 0.5401974678039551,
"memory(GiB)": 76.04,
"step": 5295,
"token_acc": 0.8407013111993263,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.3702889650268277,
"grad_norm": 0.975131094455719,
"learning_rate": 3.0589820703664707e-06,
"loss": 0.5349632263183594,
"memory(GiB)": 76.04,
"step": 5300,
"token_acc": 0.8640092475203222,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.371581873424268,
"grad_norm": 1.336971640586853,
"learning_rate": 3.0555082098730464e-06,
"loss": 0.5260316371917725,
"memory(GiB)": 76.04,
"step": 5305,
"token_acc": 0.8318684124147488,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.372874781821708,
"grad_norm": 1.4911202192306519,
"learning_rate": 3.0520332207096433e-06,
"loss": 0.5175662994384765,
"memory(GiB)": 76.04,
"step": 5310,
"token_acc": 0.8419973789441849,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.374167690219148,
"grad_norm": 1.1500071287155151,
"learning_rate": 3.0485571099366724e-06,
"loss": 0.5503662586212158,
"memory(GiB)": 76.04,
"step": 5315,
"token_acc": 0.800382509562739,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.375460598616588,
"grad_norm": 0.9023737907409668,
"learning_rate": 3.0450798846168227e-06,
"loss": 0.5276325225830079,
"memory(GiB)": 76.04,
"step": 5320,
"token_acc": 0.8494656224308771,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.376753507014028,
"grad_norm": 1.091489315032959,
"learning_rate": 3.0416015518150494e-06,
"loss": 0.5327792167663574,
"memory(GiB)": 76.04,
"step": 5325,
"token_acc": 0.8433721260289526,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.3780464154114682,
"grad_norm": 1.429714322090149,
"learning_rate": 3.0381221185985543e-06,
"loss": 0.5325508117675781,
"memory(GiB)": 76.04,
"step": 5330,
"token_acc": 0.8607167276676185,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.379339323808908,
"grad_norm": 0.9919387698173523,
"learning_rate": 3.034641592036779e-06,
"loss": 0.5155058860778808,
"memory(GiB)": 76.04,
"step": 5335,
"token_acc": 0.8486006657625447,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3806322322063482,
"grad_norm": 2.1436212062835693,
"learning_rate": 3.031159979201383e-06,
"loss": 0.5232511043548584,
"memory(GiB)": 76.04,
"step": 5340,
"token_acc": 0.8501789414202298,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.3819251406037882,
"grad_norm": 1.056612253189087,
"learning_rate": 3.027677287166235e-06,
"loss": 0.5240641117095948,
"memory(GiB)": 76.04,
"step": 5345,
"token_acc": 0.8404334212261042,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3832180490012282,
"grad_norm": 1.0267983675003052,
"learning_rate": 3.0241935230073977e-06,
"loss": 0.5429930210113525,
"memory(GiB)": 76.04,
"step": 5350,
"token_acc": 0.8286713286713286,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3845109573986683,
"grad_norm": 1.0295692682266235,
"learning_rate": 3.020708693803108e-06,
"loss": 0.5250686645507813,
"memory(GiB)": 76.04,
"step": 5355,
"token_acc": 0.8244736210071252,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3858038657961083,
"grad_norm": 0.999599277973175,
"learning_rate": 3.0172228066337704e-06,
"loss": 0.5352205276489258,
"memory(GiB)": 76.04,
"step": 5360,
"token_acc": 0.84011528503737,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.3870967741935485,
"grad_norm": 1.8036699295043945,
"learning_rate": 3.013735868581937e-06,
"loss": 0.5204336166381835,
"memory(GiB)": 76.04,
"step": 5365,
"token_acc": 0.8598814043234085,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.3883896825909885,
"grad_norm": 1.0070078372955322,
"learning_rate": 3.0102478867322967e-06,
"loss": 0.5483356952667237,
"memory(GiB)": 76.04,
"step": 5370,
"token_acc": 0.824980503222099,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3896825909884285,
"grad_norm": 1.2133702039718628,
"learning_rate": 3.0067588681716563e-06,
"loss": 0.5264020919799804,
"memory(GiB)": 76.04,
"step": 5375,
"token_acc": 0.8479607640681466,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.3909754993858685,
"grad_norm": 1.567028522491455,
"learning_rate": 3.0032688199889328e-06,
"loss": 0.5459973335266113,
"memory(GiB)": 76.04,
"step": 5380,
"token_acc": 0.8320722155847604,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3922684077833085,
"grad_norm": 0.9477059245109558,
"learning_rate": 2.9997777492751313e-06,
"loss": 0.522393798828125,
"memory(GiB)": 76.04,
"step": 5385,
"token_acc": 0.8648985404058384,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3935613161807485,
"grad_norm": 0.9237920045852661,
"learning_rate": 2.9962856631233388e-06,
"loss": 0.5231618404388427,
"memory(GiB)": 76.04,
"step": 5390,
"token_acc": 0.8635757044267358,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.3948542245781885,
"grad_norm": 1.1687318086624146,
"learning_rate": 2.9927925686287006e-06,
"loss": 0.5056675434112549,
"memory(GiB)": 76.04,
"step": 5395,
"token_acc": 0.8491524700055506,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.3961471329756288,
"grad_norm": 1.1914643049240112,
"learning_rate": 2.9892984728884155e-06,
"loss": 0.5470870018005372,
"memory(GiB)": 76.04,
"step": 5400,
"token_acc": 0.8393457238872505,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.3974400413730688,
"grad_norm": 0.9498441219329834,
"learning_rate": 2.9858033830017127e-06,
"loss": 0.5178772926330566,
"memory(GiB)": 76.04,
"step": 5405,
"token_acc": 0.8383829302646169,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.3987329497705088,
"grad_norm": 1.407654881477356,
"learning_rate": 2.982307306069842e-06,
"loss": 0.5494901180267334,
"memory(GiB)": 76.04,
"step": 5410,
"token_acc": 0.8369012373794883,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.4000258581679488,
"grad_norm": 1.148308515548706,
"learning_rate": 2.9788102491960606e-06,
"loss": 0.5415051460266114,
"memory(GiB)": 76.04,
"step": 5415,
"token_acc": 0.8146666105050335,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.4013187665653888,
"grad_norm": 1.0424379110336304,
"learning_rate": 2.975312219485616e-06,
"loss": 0.5347636699676513,
"memory(GiB)": 76.04,
"step": 5420,
"token_acc": 0.8373481740260795,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.402611674962829,
"grad_norm": 1.1721105575561523,
"learning_rate": 2.971813224045732e-06,
"loss": 0.5278305053710938,
"memory(GiB)": 76.04,
"step": 5425,
"token_acc": 0.8396724598930482,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.4039045833602688,
"grad_norm": 1.179748296737671,
"learning_rate": 2.9683132699855933e-06,
"loss": 0.5224045276641845,
"memory(GiB)": 76.04,
"step": 5430,
"token_acc": 0.8568342151675485,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.405197491757709,
"grad_norm": 2.302943706512451,
"learning_rate": 2.9648123644163344e-06,
"loss": 0.51423659324646,
"memory(GiB)": 76.04,
"step": 5435,
"token_acc": 0.8518541896796591,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.406490400155149,
"grad_norm": 1.695887804031372,
"learning_rate": 2.961310514451021e-06,
"loss": 0.5096250534057617,
"memory(GiB)": 76.04,
"step": 5440,
"token_acc": 0.8589175232620451,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.407783308552589,
"grad_norm": 1.0010126829147339,
"learning_rate": 2.9578077272046407e-06,
"loss": 0.5219532012939453,
"memory(GiB)": 76.04,
"step": 5445,
"token_acc": 0.8385249390550633,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.409076216950029,
"grad_norm": 1.0724811553955078,
"learning_rate": 2.954304009794082e-06,
"loss": 0.5457123279571533,
"memory(GiB)": 76.04,
"step": 5450,
"token_acc": 0.8312751004016065,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.410369125347469,
"grad_norm": 1.078969120979309,
"learning_rate": 2.9507993693381245e-06,
"loss": 0.4943378925323486,
"memory(GiB)": 76.04,
"step": 5455,
"token_acc": 0.8496458467482292,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.4116620337449093,
"grad_norm": 0.9450307488441467,
"learning_rate": 2.9472938129574248e-06,
"loss": 0.5415146827697754,
"memory(GiB)": 76.04,
"step": 5460,
"token_acc": 0.8225661328054705,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.4129549421423493,
"grad_norm": 1.5226620435714722,
"learning_rate": 2.9437873477744973e-06,
"loss": 0.5119266033172607,
"memory(GiB)": 76.04,
"step": 5465,
"token_acc": 0.8384682058151446,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.4142478505397893,
"grad_norm": 1.004164457321167,
"learning_rate": 2.9402799809137066e-06,
"loss": 0.5116465091705322,
"memory(GiB)": 76.04,
"step": 5470,
"token_acc": 0.8437827370559665,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.4155407589372293,
"grad_norm": 0.9910258054733276,
"learning_rate": 2.936771719501246e-06,
"loss": 0.5433405876159668,
"memory(GiB)": 76.04,
"step": 5475,
"token_acc": 0.844466902475998,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.4168336673346693,
"grad_norm": 0.9276953935623169,
"learning_rate": 2.9332625706651287e-06,
"loss": 0.5179524898529053,
"memory(GiB)": 76.04,
"step": 5480,
"token_acc": 0.8560186436098352,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.4181265757321093,
"grad_norm": 0.9490028619766235,
"learning_rate": 2.929752541535169e-06,
"loss": 0.5286359786987305,
"memory(GiB)": 76.04,
"step": 5485,
"token_acc": 0.8138392178714351,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.4194194841295493,
"grad_norm": 0.9604682326316833,
"learning_rate": 2.9262416392429727e-06,
"loss": 0.5103157043457032,
"memory(GiB)": 76.04,
"step": 5490,
"token_acc": 0.8366208149493901,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.4207123925269896,
"grad_norm": 1.2919334173202515,
"learning_rate": 2.922729870921916e-06,
"loss": 0.5384269714355469,
"memory(GiB)": 76.04,
"step": 5495,
"token_acc": 0.8337518834756403,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.4220053009244296,
"grad_norm": 8.227481842041016,
"learning_rate": 2.919217243707137e-06,
"loss": 0.5168218612670898,
"memory(GiB)": 76.04,
"step": 5500,
"token_acc": 0.848703986059682,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4232982093218696,
"grad_norm": 1.9908632040023804,
"learning_rate": 2.915703764735518e-06,
"loss": 0.5363755226135254,
"memory(GiB)": 76.04,
"step": 5505,
"token_acc": 0.8440745986779982,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4245911177193096,
"grad_norm": 4.587954521179199,
"learning_rate": 2.9121894411456727e-06,
"loss": 0.5621316432952881,
"memory(GiB)": 76.04,
"step": 5510,
"token_acc": 0.831285065455517,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4258840261167496,
"grad_norm": 1.0272570848464966,
"learning_rate": 2.90867428007793e-06,
"loss": 0.5223082542419434,
"memory(GiB)": 76.04,
"step": 5515,
"token_acc": 0.8410167818361303,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4271769345141896,
"grad_norm": 1.0505872964859009,
"learning_rate": 2.90515828867432e-06,
"loss": 0.5363224983215332,
"memory(GiB)": 76.04,
"step": 5520,
"token_acc": 0.8203096575979302,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4284698429116296,
"grad_norm": 1.0748521089553833,
"learning_rate": 2.9016414740785625e-06,
"loss": 0.5091330051422119,
"memory(GiB)": 76.04,
"step": 5525,
"token_acc": 0.8322848205813095,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4297627513090698,
"grad_norm": 1.0750956535339355,
"learning_rate": 2.8981238434360467e-06,
"loss": 0.5427698135375977,
"memory(GiB)": 76.04,
"step": 5530,
"token_acc": 0.8349636803874092,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4310556597065098,
"grad_norm": 2.610778331756592,
"learning_rate": 2.894605403893821e-06,
"loss": 0.4974540710449219,
"memory(GiB)": 76.04,
"step": 5535,
"token_acc": 0.8535060294774452,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4323485681039498,
"grad_norm": 1.1820318698883057,
"learning_rate": 2.8910861626005774e-06,
"loss": 0.5238434314727783,
"memory(GiB)": 76.04,
"step": 5540,
"token_acc": 0.8497934516523867,
"train_speed(iter/s)": 0.027632
},
{
"epoch": 1.4336414765013898,
"grad_norm": 3.4977128505706787,
"learning_rate": 2.887566126706638e-06,
"loss": 0.5260235786437988,
"memory(GiB)": 76.04,
"step": 5545,
"token_acc": 0.8721027400272683,
"train_speed(iter/s)": 0.027632
},
{
"epoch": 1.4349343848988299,
"grad_norm": 1.0233287811279297,
"learning_rate": 2.884045303363936e-06,
"loss": 0.5392961978912354,
"memory(GiB)": 76.04,
"step": 5550,
"token_acc": 0.8287411925544221,
"train_speed(iter/s)": 0.027632
},
{
"epoch": 1.43622729329627,
"grad_norm": 2.892608642578125,
"learning_rate": 2.8805236997260083e-06,
"loss": 0.5215497016906738,
"memory(GiB)": 76.04,
"step": 5555,
"token_acc": 0.8498609823911029,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.4375202016937099,
"grad_norm": 1.1967157125473022,
"learning_rate": 2.877001322947975e-06,
"loss": 0.5007841110229492,
"memory(GiB)": 76.04,
"step": 5560,
"token_acc": 0.861061495279408,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.43881311009115,
"grad_norm": 0.955826461315155,
"learning_rate": 2.8734781801865295e-06,
"loss": 0.5322293281555176,
"memory(GiB)": 76.04,
"step": 5565,
"token_acc": 0.8267916342651332,
"train_speed(iter/s)": 0.027632
},
{
"epoch": 1.44010601848859,
"grad_norm": 0.9877298474311829,
"learning_rate": 2.8699542785999174e-06,
"loss": 0.5368639469146729,
"memory(GiB)": 76.04,
"step": 5570,
"token_acc": 0.8380945422663841,
"train_speed(iter/s)": 0.027632
},
{
"epoch": 1.44139892688603,
"grad_norm": 1.6299902200698853,
"learning_rate": 2.866429625347929e-06,
"loss": 0.5405995368957519,
"memory(GiB)": 76.04,
"step": 5575,
"token_acc": 0.8694860906491031,
"train_speed(iter/s)": 0.027633
},
{
"epoch": 1.4426918352834701,
"grad_norm": 1.1087404489517212,
"learning_rate": 2.8629042275918816e-06,
"loss": 0.5386042118072509,
"memory(GiB)": 76.04,
"step": 5580,
"token_acc": 0.8234553974314475,
"train_speed(iter/s)": 0.027633
},
{
"epoch": 1.4439847436809101,
"grad_norm": 1.3870139122009277,
"learning_rate": 2.8593780924946035e-06,
"loss": 0.5439047813415527,
"memory(GiB)": 76.04,
"step": 5585,
"token_acc": 0.8323816375162987,
"train_speed(iter/s)": 0.027633
},
{
"epoch": 1.4452776520783503,
"grad_norm": 1.3395311832427979,
"learning_rate": 2.8558512272204236e-06,
"loss": 0.5457947254180908,
"memory(GiB)": 76.04,
"step": 5590,
"token_acc": 0.8384256861729674,
"train_speed(iter/s)": 0.027634
},
{
"epoch": 1.4465705604757904,
"grad_norm": 1.6546262502670288,
"learning_rate": 2.852323638935153e-06,
"loss": 0.5411076545715332,
"memory(GiB)": 76.04,
"step": 5595,
"token_acc": 0.7911689027752251,
"train_speed(iter/s)": 0.027635
},
{
"epoch": 1.4478634688732304,
"grad_norm": 1.460876226425171,
"learning_rate": 2.8487953348060717e-06,
"loss": 0.5316921710968018,
"memory(GiB)": 76.04,
"step": 5600,
"token_acc": 0.841174282138871,
"train_speed(iter/s)": 0.027635
},
{
"epoch": 1.4491563772706704,
"grad_norm": 1.1629871129989624,
"learning_rate": 2.845266322001914e-06,
"loss": 0.5173054695129394,
"memory(GiB)": 76.04,
"step": 5605,
"token_acc": 0.8505799971707455,
"train_speed(iter/s)": 0.027635
},
{
"epoch": 1.4504492856681104,
"grad_norm": 1.3548821210861206,
"learning_rate": 2.841736607692855e-06,
"loss": 0.5308181762695312,
"memory(GiB)": 76.04,
"step": 5610,
"token_acc": 0.8199830736297108,
"train_speed(iter/s)": 0.027636
},
{
"epoch": 1.4517421940655504,
"grad_norm": 1.4260450601577759,
"learning_rate": 2.8382061990504937e-06,
"loss": 0.5264840126037598,
"memory(GiB)": 76.04,
"step": 5615,
"token_acc": 0.8558112625353561,
"train_speed(iter/s)": 0.027636
},
{
"epoch": 1.4530351024629904,
"grad_norm": 1.11582612991333,
"learning_rate": 2.8346751032478416e-06,
"loss": 0.5299251556396485,
"memory(GiB)": 76.04,
"step": 5620,
"token_acc": 0.8517095224639729,
"train_speed(iter/s)": 0.027635
},
{
"epoch": 1.4543280108604306,
"grad_norm": 1.011853814125061,
"learning_rate": 2.831143327459304e-06,
"loss": 0.5147687911987304,
"memory(GiB)": 76.04,
"step": 5625,
"token_acc": 0.8489732511286956,
"train_speed(iter/s)": 0.027635
},
{
"epoch": 1.4556209192578706,
"grad_norm": 1.401329755783081,
"learning_rate": 2.8276108788606716e-06,
"loss": 0.5251947402954101,
"memory(GiB)": 76.04,
"step": 5630,
"token_acc": 0.8668347467338987,
"train_speed(iter/s)": 0.027636
},
{
"epoch": 1.4569138276553106,
"grad_norm": 1.0562440156936646,
"learning_rate": 2.8240777646290973e-06,
"loss": 0.5131159782409668,
"memory(GiB)": 76.04,
"step": 5635,
"token_acc": 0.8574029383123757,
"train_speed(iter/s)": 0.027636
},
{
"epoch": 1.4582067360527506,
"grad_norm": 1.0854851007461548,
"learning_rate": 2.82054399194309e-06,
"loss": 0.5298294067382813,
"memory(GiB)": 76.04,
"step": 5640,
"token_acc": 0.8360400339911246,
"train_speed(iter/s)": 0.027637
},
{
"epoch": 1.4594996444501906,
"grad_norm": 1.0677274465560913,
"learning_rate": 2.817009567982495e-06,
"loss": 0.5486864566802978,
"memory(GiB)": 76.04,
"step": 5645,
"token_acc": 0.8450833930215901,
"train_speed(iter/s)": 0.027637
},
{
"epoch": 1.4607925528476309,
"grad_norm": 1.0018072128295898,
"learning_rate": 2.81347449992848e-06,
"loss": 0.5392383098602295,
"memory(GiB)": 76.04,
"step": 5650,
"token_acc": 0.8489272284892723,
"train_speed(iter/s)": 0.027638
},
{
"epoch": 1.4620854612450707,
"grad_norm": 1.0016893148422241,
"learning_rate": 2.8099387949635244e-06,
"loss": 0.5180238723754883,
"memory(GiB)": 76.04,
"step": 5655,
"token_acc": 0.8308455244235061,
"train_speed(iter/s)": 0.027638
},
{
"epoch": 1.463378369642511,
"grad_norm": 0.9236847162246704,
"learning_rate": 2.8064024602713978e-06,
"loss": 0.5212345600128174,
"memory(GiB)": 76.04,
"step": 5660,
"token_acc": 0.8309616213885296,
"train_speed(iter/s)": 0.027638
},
{
"epoch": 1.464671278039951,
"grad_norm": 1.1901302337646484,
"learning_rate": 2.802865503037153e-06,
"loss": 0.5204244613647461,
"memory(GiB)": 76.04,
"step": 5665,
"token_acc": 0.8307556954991665,
"train_speed(iter/s)": 0.027638
},
{
"epoch": 1.465964186437391,
"grad_norm": 1.3907729387283325,
"learning_rate": 2.799327930447105e-06,
"loss": 0.5336479187011719,
"memory(GiB)": 76.04,
"step": 5670,
"token_acc": 0.8514618825974964,
"train_speed(iter/s)": 0.027638
},
{
"epoch": 1.467257094834831,
"grad_norm": 1.4721412658691406,
"learning_rate": 2.79578974968882e-06,
"loss": 0.5241554737091064,
"memory(GiB)": 76.04,
"step": 5675,
"token_acc": 0.8389457435252415,
"train_speed(iter/s)": 0.027639
},
{
"epoch": 1.468550003232271,
"grad_norm": 1.0565476417541504,
"learning_rate": 2.792250967951099e-06,
"loss": 0.5248475074768066,
"memory(GiB)": 76.04,
"step": 5680,
"token_acc": 0.8303552659239016,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.4698429116297111,
"grad_norm": 1.4000303745269775,
"learning_rate": 2.788711592423966e-06,
"loss": 0.5044834613800049,
"memory(GiB)": 76.04,
"step": 5685,
"token_acc": 0.8474695172874561,
"train_speed(iter/s)": 0.027639
},
{
"epoch": 1.4711358200271512,
"grad_norm": 1.4237065315246582,
"learning_rate": 2.785171630298649e-06,
"loss": 0.527522611618042,
"memory(GiB)": 76.04,
"step": 5690,
"token_acc": 0.8151931330472103,
"train_speed(iter/s)": 0.027638
},
{
"epoch": 1.4724287284245912,
"grad_norm": 1.0766621828079224,
"learning_rate": 2.7816310887675697e-06,
"loss": 0.5117476940155029,
"memory(GiB)": 76.04,
"step": 5695,
"token_acc": 0.8338809784592918,
"train_speed(iter/s)": 0.027639
},
{
"epoch": 1.4737216368220312,
"grad_norm": 1.4094700813293457,
"learning_rate": 2.7780899750243275e-06,
"loss": 0.5268692970275879,
"memory(GiB)": 76.04,
"step": 5700,
"token_acc": 0.8330804888327012,
"train_speed(iter/s)": 0.027639
},
{
"epoch": 1.4750145452194712,
"grad_norm": 1.1132041215896606,
"learning_rate": 2.7745482962636815e-06,
"loss": 0.4945709228515625,
"memory(GiB)": 76.04,
"step": 5705,
"token_acc": 0.860883552163992,
"train_speed(iter/s)": 0.027639
},
{
"epoch": 1.4763074536169112,
"grad_norm": 1.0947760343551636,
"learning_rate": 2.7710060596815425e-06,
"loss": 0.5298891067504883,
"memory(GiB)": 76.04,
"step": 5710,
"token_acc": 0.8435150568998808,
"train_speed(iter/s)": 0.027639
},
{
"epoch": 1.4776003620143512,
"grad_norm": 0.9912996292114258,
"learning_rate": 2.767463272474951e-06,
"loss": 0.48708510398864746,
"memory(GiB)": 76.04,
"step": 5715,
"token_acc": 0.8264887063655031,
"train_speed(iter/s)": 0.027639
},
{
"epoch": 1.4788932704117914,
"grad_norm": 0.8707374930381775,
"learning_rate": 2.763919941842069e-06,
"loss": 0.5079801559448243,
"memory(GiB)": 76.04,
"step": 5720,
"token_acc": 0.8312231452305929,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.4801861788092314,
"grad_norm": 1.5692007541656494,
"learning_rate": 2.760376074982161e-06,
"loss": 0.5193423748016357,
"memory(GiB)": 76.04,
"step": 5725,
"token_acc": 0.8429825267734923,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.4814790872066714,
"grad_norm": 1.1025525331497192,
"learning_rate": 2.756831679095583e-06,
"loss": 0.5138895034790039,
"memory(GiB)": 76.04,
"step": 5730,
"token_acc": 0.8342220895013012,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.4827719956041114,
"grad_norm": 1.114014983177185,
"learning_rate": 2.7532867613837632e-06,
"loss": 0.5035554885864257,
"memory(GiB)": 76.04,
"step": 5735,
"token_acc": 0.8443557981664217,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.4840649040015514,
"grad_norm": 0.9390487670898438,
"learning_rate": 2.7497413290491927e-06,
"loss": 0.5343178749084473,
"memory(GiB)": 76.04,
"step": 5740,
"token_acc": 0.8542599136238712,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.4853578123989915,
"grad_norm": 0.9856504797935486,
"learning_rate": 2.746195389295406e-06,
"loss": 0.5330347537994384,
"memory(GiB)": 76.04,
"step": 5745,
"token_acc": 0.8207178164624973,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.4866507207964315,
"grad_norm": 1.0896226167678833,
"learning_rate": 2.7426489493269693e-06,
"loss": 0.538813591003418,
"memory(GiB)": 76.04,
"step": 5750,
"token_acc": 0.8037091060637633,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.4879436291938717,
"grad_norm": 1.0768758058547974,
"learning_rate": 2.739102016349465e-06,
"loss": 0.5243756294250488,
"memory(GiB)": 76.04,
"step": 5755,
"token_acc": 0.8149051903817803,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.4892365375913117,
"grad_norm": 1.063176155090332,
"learning_rate": 2.7355545975694777e-06,
"loss": 0.5046000480651855,
"memory(GiB)": 76.04,
"step": 5760,
"token_acc": 0.8376587897828166,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.4905294459887517,
"grad_norm": 1.0583608150482178,
"learning_rate": 2.73200670019458e-06,
"loss": 0.5038406372070312,
"memory(GiB)": 76.04,
"step": 5765,
"token_acc": 0.835580538569638,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.4918223543861917,
"grad_norm": 1.7861460447311401,
"learning_rate": 2.7284583314333136e-06,
"loss": 0.5076050758361816,
"memory(GiB)": 76.04,
"step": 5770,
"token_acc": 0.8467831009250311,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.4931152627836317,
"grad_norm": 6.293168067932129,
"learning_rate": 2.7249094984951817e-06,
"loss": 0.5296279430389405,
"memory(GiB)": 76.04,
"step": 5775,
"token_acc": 0.8256261520112762,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.494408171181072,
"grad_norm": 1.0335216522216797,
"learning_rate": 2.7213602085906284e-06,
"loss": 0.5116629600524902,
"memory(GiB)": 76.04,
"step": 5780,
"token_acc": 0.8385129247749056,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.4957010795785117,
"grad_norm": 1.0121421813964844,
"learning_rate": 2.7178104689310268e-06,
"loss": 0.49023923873901365,
"memory(GiB)": 76.04,
"step": 5785,
"token_acc": 0.8403039150163565,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.496993987975952,
"grad_norm": 2.905419111251831,
"learning_rate": 2.714260286728663e-06,
"loss": 0.5063573837280273,
"memory(GiB)": 76.04,
"step": 5790,
"token_acc": 0.8344622697563874,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.498286896373392,
"grad_norm": 1.3976821899414062,
"learning_rate": 2.7107096691967242e-06,
"loss": 0.5138403892517089,
"memory(GiB)": 76.04,
"step": 5795,
"token_acc": 0.8758511480601742,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.499579804770832,
"grad_norm": 1.0694046020507812,
"learning_rate": 2.70715862354928e-06,
"loss": 0.5170317649841308,
"memory(GiB)": 76.04,
"step": 5800,
"token_acc": 0.8628498120412913,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.500872713168272,
"grad_norm": 1.158471941947937,
"learning_rate": 2.703607157001273e-06,
"loss": 0.5261846542358398,
"memory(GiB)": 76.04,
"step": 5805,
"token_acc": 0.8195343894257913,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.502165621565712,
"grad_norm": 0.9272586107254028,
"learning_rate": 2.7000552767684962e-06,
"loss": 0.5037094116210937,
"memory(GiB)": 76.04,
"step": 5810,
"token_acc": 0.8422422339722406,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.5034585299631522,
"grad_norm": 1.1119998693466187,
"learning_rate": 2.696502990067586e-06,
"loss": 0.5135734558105469,
"memory(GiB)": 76.04,
"step": 5815,
"token_acc": 0.8626723760072827,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.504751438360592,
"grad_norm": 1.0358091592788696,
"learning_rate": 2.6929503041160054e-06,
"loss": 0.5373703956604003,
"memory(GiB)": 76.04,
"step": 5820,
"token_acc": 0.8301167050647732,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.5060443467580322,
"grad_norm": 1.039354920387268,
"learning_rate": 2.6893972261320265e-06,
"loss": 0.5479695320129394,
"memory(GiB)": 76.04,
"step": 5825,
"token_acc": 0.8479183638468465,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5073372551554722,
"grad_norm": 1.3886367082595825,
"learning_rate": 2.6858437633347197e-06,
"loss": 0.49077515602111815,
"memory(GiB)": 76.04,
"step": 5830,
"token_acc": 0.8483184202406665,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.5086301635529122,
"grad_norm": 1.0695741176605225,
"learning_rate": 2.6822899229439354e-06,
"loss": 0.5208306789398194,
"memory(GiB)": 76.04,
"step": 5835,
"token_acc": 0.8321777497636307,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5099230719503525,
"grad_norm": 1.3519343137741089,
"learning_rate": 2.678735712180294e-06,
"loss": 0.5065782070159912,
"memory(GiB)": 76.04,
"step": 5840,
"token_acc": 0.865064039408867,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5112159803477923,
"grad_norm": 1.1784507036209106,
"learning_rate": 2.6751811382651656e-06,
"loss": 0.5237961769104004,
"memory(GiB)": 76.04,
"step": 5845,
"token_acc": 0.8759957417128593,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5125088887452325,
"grad_norm": 1.2745177745819092,
"learning_rate": 2.6716262084206596e-06,
"loss": 0.5225517272949218,
"memory(GiB)": 76.04,
"step": 5850,
"token_acc": 0.8607184154574956,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5138017971426725,
"grad_norm": 1.0925981998443604,
"learning_rate": 2.6680709298696075e-06,
"loss": 0.5313197135925293,
"memory(GiB)": 76.04,
"step": 5855,
"token_acc": 0.8273417489937798,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5150947055401125,
"grad_norm": 1.1206554174423218,
"learning_rate": 2.66451530983555e-06,
"loss": 0.5206215858459473,
"memory(GiB)": 76.04,
"step": 5860,
"token_acc": 0.8360609797107947,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5163876139375525,
"grad_norm": 1.0352177619934082,
"learning_rate": 2.6609593555427233e-06,
"loss": 0.5028391361236573,
"memory(GiB)": 76.04,
"step": 5865,
"token_acc": 0.8328511593764844,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5176805223349925,
"grad_norm": 1.2911217212677002,
"learning_rate": 2.6574030742160397e-06,
"loss": 0.510726022720337,
"memory(GiB)": 76.04,
"step": 5870,
"token_acc": 0.8490013110202822,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5189734307324327,
"grad_norm": 1.0364927053451538,
"learning_rate": 2.6538464730810774e-06,
"loss": 0.5217413902282715,
"memory(GiB)": 76.04,
"step": 5875,
"token_acc": 0.8393885789449812,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5202663391298725,
"grad_norm": 1.238673210144043,
"learning_rate": 2.6502895593640643e-06,
"loss": 0.5099982738494873,
"memory(GiB)": 76.04,
"step": 5880,
"token_acc": 0.8526344031928095,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5215592475273128,
"grad_norm": 0.8736827969551086,
"learning_rate": 2.646732340291864e-06,
"loss": 0.5140372753143311,
"memory(GiB)": 76.04,
"step": 5885,
"token_acc": 0.8392264114084782,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.5228521559247528,
"grad_norm": 1.5109645128250122,
"learning_rate": 2.6431748230919583e-06,
"loss": 0.5010466575622559,
"memory(GiB)": 76.04,
"step": 5890,
"token_acc": 0.8540109197816044,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.5241450643221928,
"grad_norm": 1.3737353086471558,
"learning_rate": 2.639617014992438e-06,
"loss": 0.5450526237487793,
"memory(GiB)": 76.04,
"step": 5895,
"token_acc": 0.8286816981515336,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.5254379727196328,
"grad_norm": 1.299264669418335,
"learning_rate": 2.6360589232219826e-06,
"loss": 0.5287326812744141,
"memory(GiB)": 76.04,
"step": 5900,
"token_acc": 0.8438716156839771,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.5267308811170728,
"grad_norm": 1.595326542854309,
"learning_rate": 2.632500555009849e-06,
"loss": 0.5352768898010254,
"memory(GiB)": 76.04,
"step": 5905,
"token_acc": 0.8498021897138651,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.528023789514513,
"grad_norm": 0.9794163107872009,
"learning_rate": 2.6289419175858557e-06,
"loss": 0.5425346374511719,
"memory(GiB)": 76.04,
"step": 5910,
"token_acc": 0.8340483277884784,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.5293166979119528,
"grad_norm": 1.0976084470748901,
"learning_rate": 2.625383018180367e-06,
"loss": 0.521512794494629,
"memory(GiB)": 76.04,
"step": 5915,
"token_acc": 0.8385939188146319,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.530609606309393,
"grad_norm": 0.8977828621864319,
"learning_rate": 2.6218238640242804e-06,
"loss": 0.5215116500854492,
"memory(GiB)": 76.04,
"step": 5920,
"token_acc": 0.867619533775736,
"train_speed(iter/s)": 0.027639
},
{
"epoch": 1.531902514706833,
"grad_norm": 0.9642647504806519,
"learning_rate": 2.6182644623490123e-06,
"loss": 0.5066309928894043,
"memory(GiB)": 76.04,
"step": 5925,
"token_acc": 0.8547594142259414,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.533195423104273,
"grad_norm": 1.0187339782714844,
"learning_rate": 2.6147048203864785e-06,
"loss": 0.5130214691162109,
"memory(GiB)": 76.04,
"step": 5930,
"token_acc": 0.8448008040935673,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.5344883315017133,
"grad_norm": 1.0723613500595093,
"learning_rate": 2.6111449453690867e-06,
"loss": 0.5088356971740723,
"memory(GiB)": 76.04,
"step": 5935,
"token_acc": 0.8498759764540372,
"train_speed(iter/s)": 0.02764
},
{
"epoch": 1.535781239899153,
"grad_norm": 0.9338003993034363,
"learning_rate": 2.607584844529717e-06,
"loss": 0.5098363399505615,
"memory(GiB)": 76.04,
"step": 5940,
"token_acc": 0.8526082509376065,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.5370741482965933,
"grad_norm": 0.8586558103561401,
"learning_rate": 2.604024525101707e-06,
"loss": 0.5505722045898438,
"memory(GiB)": 76.04,
"step": 5945,
"token_acc": 0.8450296382094433,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.538367056694033,
"grad_norm": 1.0932189226150513,
"learning_rate": 2.6004639943188397e-06,
"loss": 0.51469407081604,
"memory(GiB)": 76.04,
"step": 5950,
"token_acc": 0.840957878166293,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.5396599650914733,
"grad_norm": 1.4429826736450195,
"learning_rate": 2.5969032594153267e-06,
"loss": 0.5273025512695313,
"memory(GiB)": 76.04,
"step": 5955,
"token_acc": 0.8560200279459711,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5409528734889133,
"grad_norm": 1.6431050300598145,
"learning_rate": 2.5933423276257957e-06,
"loss": 0.5339940071105957,
"memory(GiB)": 76.04,
"step": 5960,
"token_acc": 0.8337239801328199,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.5422457818863533,
"grad_norm": 1.0177254676818848,
"learning_rate": 2.5897812061852728e-06,
"loss": 0.523937177658081,
"memory(GiB)": 76.04,
"step": 5965,
"token_acc": 0.8334849487264308,
"train_speed(iter/s)": 0.027641
},
{
"epoch": 1.5435386902837935,
"grad_norm": 1.0822439193725586,
"learning_rate": 2.58621990232917e-06,
"loss": 0.5013058662414551,
"memory(GiB)": 76.04,
"step": 5970,
"token_acc": 0.8428224266620379,
"train_speed(iter/s)": 0.027642
},
{
"epoch": 1.5448315986812333,
"grad_norm": 1.0421521663665771,
"learning_rate": 2.5826584232932707e-06,
"loss": 0.5094140052795411,
"memory(GiB)": 76.04,
"step": 5975,
"token_acc": 0.828722488626583,
"train_speed(iter/s)": 0.027643
},
{
"epoch": 1.5461245070786735,
"grad_norm": 1.2964197397232056,
"learning_rate": 2.5790967763137136e-06,
"loss": 0.5127614498138428,
"memory(GiB)": 76.04,
"step": 5980,
"token_acc": 0.842072213500785,
"train_speed(iter/s)": 0.027643
},
{
"epoch": 1.5474174154761136,
"grad_norm": 0.8814839124679565,
"learning_rate": 2.575534968626978e-06,
"loss": 0.5174202919006348,
"memory(GiB)": 76.04,
"step": 5985,
"token_acc": 0.8393448656606551,
"train_speed(iter/s)": 0.027644
},
{
"epoch": 1.5487103238735536,
"grad_norm": 1.192781686782837,
"learning_rate": 2.5719730074698718e-06,
"loss": 0.5092106342315674,
"memory(GiB)": 76.04,
"step": 5990,
"token_acc": 0.8536305586357206,
"train_speed(iter/s)": 0.027644
},
{
"epoch": 1.5500032322709936,
"grad_norm": 1.0610737800598145,
"learning_rate": 2.5684109000795114e-06,
"loss": 0.4976038932800293,
"memory(GiB)": 76.04,
"step": 5995,
"token_acc": 0.8407422307150759,
"train_speed(iter/s)": 0.027645
},
{
"epoch": 1.5512961406684336,
"grad_norm": 0.9621560573577881,
"learning_rate": 2.564848653693313e-06,
"loss": 0.5234485626220703,
"memory(GiB)": 76.04,
"step": 6000,
"token_acc": 0.8350293049512783,
"train_speed(iter/s)": 0.027645
},
{
"epoch": 1.5525890490658738,
"grad_norm": 0.9867120385169983,
"learning_rate": 2.5612862755489754e-06,
"loss": 0.5299267292022705,
"memory(GiB)": 76.04,
"step": 6005,
"token_acc": 0.8546105977748444,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.5538819574633136,
"grad_norm": 1.0978645086288452,
"learning_rate": 2.5577237728844624e-06,
"loss": 0.5120854854583741,
"memory(GiB)": 76.04,
"step": 6010,
"token_acc": 0.826845756196704,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5551748658607538,
"grad_norm": 1.5117709636688232,
"learning_rate": 2.554161152937994e-06,
"loss": 0.49729576110839846,
"memory(GiB)": 76.04,
"step": 6015,
"token_acc": 0.8334134348774447,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5564677742581938,
"grad_norm": 0.8881126642227173,
"learning_rate": 2.5505984229480257e-06,
"loss": 0.5338102340698242,
"memory(GiB)": 76.04,
"step": 6020,
"token_acc": 0.8419421487603306,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5577606826556338,
"grad_norm": 1.0090521574020386,
"learning_rate": 2.547035590153239e-06,
"loss": 0.5258452892303467,
"memory(GiB)": 76.04,
"step": 6025,
"token_acc": 0.8578943772631004,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5590535910530738,
"grad_norm": 0.9812294244766235,
"learning_rate": 2.5434726617925214e-06,
"loss": 0.5136911392211914,
"memory(GiB)": 76.04,
"step": 6030,
"token_acc": 0.827103274559194,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5603464994505138,
"grad_norm": 0.9446991682052612,
"learning_rate": 2.5399096451049586e-06,
"loss": 0.5100172996520996,
"memory(GiB)": 76.04,
"step": 6035,
"token_acc": 0.8508031581813231,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.561639407847954,
"grad_norm": 1.0973703861236572,
"learning_rate": 2.536346547329812e-06,
"loss": 0.5151572704315186,
"memory(GiB)": 76.04,
"step": 6040,
"token_acc": 0.8401378579003181,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5629323162453939,
"grad_norm": 1.1219217777252197,
"learning_rate": 2.5327833757065102e-06,
"loss": 0.5503729343414306,
"memory(GiB)": 76.04,
"step": 6045,
"token_acc": 0.8555493103895543,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.564225224642834,
"grad_norm": 0.9930498600006104,
"learning_rate": 2.5292201374746306e-06,
"loss": 0.5092242240905762,
"memory(GiB)": 76.04,
"step": 6050,
"token_acc": 0.8612712103502479,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.565518133040274,
"grad_norm": 1.1532442569732666,
"learning_rate": 2.525656839873885e-06,
"loss": 0.509462833404541,
"memory(GiB)": 76.04,
"step": 6055,
"token_acc": 0.8444057905958927,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.566811041437714,
"grad_norm": 0.9126574397087097,
"learning_rate": 2.522093490144109e-06,
"loss": 0.5357399940490722,
"memory(GiB)": 76.04,
"step": 6060,
"token_acc": 0.8304055410560128,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5681039498351543,
"grad_norm": 4.396268844604492,
"learning_rate": 2.5185300955252406e-06,
"loss": 0.5380908489227295,
"memory(GiB)": 76.04,
"step": 6065,
"token_acc": 0.8541631222566266,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5693968582325941,
"grad_norm": 1.2452019453048706,
"learning_rate": 2.514966663257311e-06,
"loss": 0.5378365516662598,
"memory(GiB)": 76.04,
"step": 6070,
"token_acc": 0.8556809966075302,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5706897666300343,
"grad_norm": 0.9279251098632812,
"learning_rate": 2.511403200580428e-06,
"loss": 0.5115952014923095,
"memory(GiB)": 76.04,
"step": 6075,
"token_acc": 0.8463237893248498,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5719826750274744,
"grad_norm": 2.242185592651367,
"learning_rate": 2.50783971473476e-06,
"loss": 0.5192525386810303,
"memory(GiB)": 76.04,
"step": 6080,
"token_acc": 0.8700274811911519,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5732755834249144,
"grad_norm": 2.6423373222351074,
"learning_rate": 2.5042762129605235e-06,
"loss": 0.5067386150360107,
"memory(GiB)": 76.04,
"step": 6085,
"token_acc": 0.8241304899720742,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5745684918223544,
"grad_norm": 0.9732599854469299,
"learning_rate": 2.500712702497967e-06,
"loss": 0.4948467254638672,
"memory(GiB)": 76.04,
"step": 6090,
"token_acc": 0.8798655462184874,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5758614002197944,
"grad_norm": 1.5327101945877075,
"learning_rate": 2.497149190587356e-06,
"loss": 0.5227193355560302,
"memory(GiB)": 76.04,
"step": 6095,
"token_acc": 0.8608474068152293,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5771543086172346,
"grad_norm": 1.2196455001831055,
"learning_rate": 2.4935856844689605e-06,
"loss": 0.519383716583252,
"memory(GiB)": 76.04,
"step": 6100,
"token_acc": 0.8335468679663424,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5784472170146744,
"grad_norm": 1.0245038270950317,
"learning_rate": 2.4900221913830368e-06,
"loss": 0.5222830772399902,
"memory(GiB)": 76.04,
"step": 6105,
"token_acc": 0.8626334519572953,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5797401254121146,
"grad_norm": 1.114092469215393,
"learning_rate": 2.486458718569817e-06,
"loss": 0.5028997898101807,
"memory(GiB)": 76.04,
"step": 6110,
"token_acc": 0.8522080471050049,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5810330338095546,
"grad_norm": 1.0639675855636597,
"learning_rate": 2.4828952732694887e-06,
"loss": 0.5147637367248535,
"memory(GiB)": 76.04,
"step": 6115,
"token_acc": 0.863227909435292,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5823259422069946,
"grad_norm": 0.8807838559150696,
"learning_rate": 2.479331862722188e-06,
"loss": 0.5280374526977539,
"memory(GiB)": 76.04,
"step": 6120,
"token_acc": 0.8349608197709464,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5836188506044346,
"grad_norm": 1.1169236898422241,
"learning_rate": 2.4757684941679767e-06,
"loss": 0.5291852474212646,
"memory(GiB)": 76.04,
"step": 6125,
"token_acc": 0.83143130614048,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.5849117590018746,
"grad_norm": 1.1640690565109253,
"learning_rate": 2.4722051748468336e-06,
"loss": 0.54544095993042,
"memory(GiB)": 76.04,
"step": 6130,
"token_acc": 0.8492234388601274,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5862046673993149,
"grad_norm": 1.0770084857940674,
"learning_rate": 2.4686419119986337e-06,
"loss": 0.5241689205169677,
"memory(GiB)": 76.04,
"step": 6135,
"token_acc": 0.8025563166443048,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.5874975757967547,
"grad_norm": 0.8977876901626587,
"learning_rate": 2.4650787128631433e-06,
"loss": 0.47954139709472654,
"memory(GiB)": 76.04,
"step": 6140,
"token_acc": 0.8597583511016347,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5887904841941949,
"grad_norm": 5.924169540405273,
"learning_rate": 2.461515584679995e-06,
"loss": 0.5163521766662598,
"memory(GiB)": 76.04,
"step": 6145,
"token_acc": 0.8557573765102326,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.590083392591635,
"grad_norm": 1.4680997133255005,
"learning_rate": 2.457952534688678e-06,
"loss": 0.5192079544067383,
"memory(GiB)": 76.04,
"step": 6150,
"token_acc": 0.8392393432144142,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.591376300989075,
"grad_norm": 1.3616418838500977,
"learning_rate": 2.4543895701285214e-06,
"loss": 0.521982479095459,
"memory(GiB)": 76.04,
"step": 6155,
"token_acc": 0.8226835625056169,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5926692093865151,
"grad_norm": 0.9679275751113892,
"learning_rate": 2.450826698238685e-06,
"loss": 0.5229485034942627,
"memory(GiB)": 76.04,
"step": 6160,
"token_acc": 0.8353462704120866,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.593962117783955,
"grad_norm": 1.0366556644439697,
"learning_rate": 2.447263926258136e-06,
"loss": 0.518170976638794,
"memory(GiB)": 76.04,
"step": 6165,
"token_acc": 0.8254823304680038,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.5952550261813951,
"grad_norm": 0.9990222454071045,
"learning_rate": 2.4437012614256394e-06,
"loss": 0.5325229167938232,
"memory(GiB)": 76.04,
"step": 6170,
"token_acc": 0.8040033620770165,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.596547934578835,
"grad_norm": 1.1222083568572998,
"learning_rate": 2.4401387109797446e-06,
"loss": 0.5065582275390625,
"memory(GiB)": 76.04,
"step": 6175,
"token_acc": 0.8548062202884538,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5978408429762752,
"grad_norm": 1.0990639925003052,
"learning_rate": 2.4365762821587656e-06,
"loss": 0.5230794906616211,
"memory(GiB)": 76.04,
"step": 6180,
"token_acc": 0.8195025958800871,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.5991337513737152,
"grad_norm": 1.0618468523025513,
"learning_rate": 2.4330139822007726e-06,
"loss": 0.5022711753845215,
"memory(GiB)": 76.04,
"step": 6185,
"token_acc": 0.8767551452202347,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.6004266597711552,
"grad_norm": 1.059678077697754,
"learning_rate": 2.4294518183435715e-06,
"loss": 0.5181986808776855,
"memory(GiB)": 76.04,
"step": 6190,
"token_acc": 0.8563752841496177,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.6017195681685954,
"grad_norm": 0.8919258117675781,
"learning_rate": 2.4258897978246925e-06,
"loss": 0.49803409576416013,
"memory(GiB)": 76.04,
"step": 6195,
"token_acc": 0.8634100953710165,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.6030124765660352,
"grad_norm": 1.0404894351959229,
"learning_rate": 2.4223279278813736e-06,
"loss": 0.5113819122314454,
"memory(GiB)": 76.04,
"step": 6200,
"token_acc": 0.828187138284458,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.6043053849634754,
"grad_norm": 1.3208954334259033,
"learning_rate": 2.418766215750549e-06,
"loss": 0.5281610012054443,
"memory(GiB)": 76.04,
"step": 6205,
"token_acc": 0.835780681665095,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.6055982933609154,
"grad_norm": 1.3880317211151123,
"learning_rate": 2.4152046686688305e-06,
"loss": 0.5289054870605469,
"memory(GiB)": 76.04,
"step": 6210,
"token_acc": 0.8374174516442627,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.6068912017583554,
"grad_norm": 1.5898443460464478,
"learning_rate": 2.4116432938724953e-06,
"loss": 0.5337974548339843,
"memory(GiB)": 76.04,
"step": 6215,
"token_acc": 0.8592785422089996,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.6081841101557954,
"grad_norm": 1.666271686553955,
"learning_rate": 2.4080820985974707e-06,
"loss": 0.5134634017944336,
"memory(GiB)": 76.04,
"step": 6220,
"token_acc": 0.8779267140307283,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.6094770185532354,
"grad_norm": 0.9248247742652893,
"learning_rate": 2.4045210900793167e-06,
"loss": 0.5277139186859131,
"memory(GiB)": 76.04,
"step": 6225,
"token_acc": 0.8437402643642352,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.6107699269506757,
"grad_norm": 1.5387322902679443,
"learning_rate": 2.4009602755532188e-06,
"loss": 0.5056108474731446,
"memory(GiB)": 76.04,
"step": 6230,
"token_acc": 0.8501114918148692,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.6120628353481155,
"grad_norm": 1.0052211284637451,
"learning_rate": 2.3973996622539646e-06,
"loss": 0.5336996078491211,
"memory(GiB)": 76.04,
"step": 6235,
"token_acc": 0.8721888153938665,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.6133557437455557,
"grad_norm": 1.027434229850769,
"learning_rate": 2.393839257415933e-06,
"loss": 0.49329376220703125,
"memory(GiB)": 76.04,
"step": 6240,
"token_acc": 0.8676368108218897,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.6146486521429957,
"grad_norm": 1.1870218515396118,
"learning_rate": 2.3902790682730806e-06,
"loss": 0.5256915092468262,
"memory(GiB)": 76.04,
"step": 6245,
"token_acc": 0.8572780020181635,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.6159415605404357,
"grad_norm": 1.0639426708221436,
"learning_rate": 2.3867191020589264e-06,
"loss": 0.5284603118896485,
"memory(GiB)": 76.04,
"step": 6250,
"token_acc": 0.8446619622126109,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.6172344689378757,
"grad_norm": 1.0155668258666992,
"learning_rate": 2.3831593660065345e-06,
"loss": 0.5121121406555176,
"memory(GiB)": 76.04,
"step": 6255,
"token_acc": 0.8595757910736493,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.6185273773353157,
"grad_norm": 1.0661635398864746,
"learning_rate": 2.3795998673485025e-06,
"loss": 0.5248492240905762,
"memory(GiB)": 76.04,
"step": 6260,
"token_acc": 0.8348161428909712,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.619820285732756,
"grad_norm": 1.0434828996658325,
"learning_rate": 2.376040613316944e-06,
"loss": 0.5192477226257324,
"memory(GiB)": 76.04,
"step": 6265,
"token_acc": 0.8534787948847626,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.6211131941301957,
"grad_norm": 0.9453509449958801,
"learning_rate": 2.37248161114348e-06,
"loss": 0.5361949920654296,
"memory(GiB)": 76.04,
"step": 6270,
"token_acc": 0.8283499021225963,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.622406102527636,
"grad_norm": 1.1677709817886353,
"learning_rate": 2.3689228680592138e-06,
"loss": 0.52266845703125,
"memory(GiB)": 76.04,
"step": 6275,
"token_acc": 0.8238867321306235,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.623699010925076,
"grad_norm": 1.974316954612732,
"learning_rate": 2.3653643912947276e-06,
"loss": 0.5168787479400635,
"memory(GiB)": 76.04,
"step": 6280,
"token_acc": 0.8214940319191738,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.624991919322516,
"grad_norm": 1.1270877122879028,
"learning_rate": 2.3618061880800586e-06,
"loss": 0.48665618896484375,
"memory(GiB)": 76.04,
"step": 6285,
"token_acc": 0.8658167398627041,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.6262848277199562,
"grad_norm": 1.4701924324035645,
"learning_rate": 2.3582482656446897e-06,
"loss": 0.5326834678649902,
"memory(GiB)": 76.04,
"step": 6290,
"token_acc": 0.841709722874589,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.627577736117396,
"grad_norm": 0.9950865507125854,
"learning_rate": 2.3546906312175347e-06,
"loss": 0.597511100769043,
"memory(GiB)": 76.04,
"step": 6295,
"token_acc": 0.7920300141959035,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6288706445148362,
"grad_norm": 1.3875091075897217,
"learning_rate": 2.35113329202692e-06,
"loss": 0.5079882621765137,
"memory(GiB)": 76.04,
"step": 6300,
"token_acc": 0.8363686840644087,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6301635529122762,
"grad_norm": 1.17184317111969,
"learning_rate": 2.3475762553005727e-06,
"loss": 0.5145916938781738,
"memory(GiB)": 76.04,
"step": 6305,
"token_acc": 0.841187863137508,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6314564613097162,
"grad_norm": 1.8713252544403076,
"learning_rate": 2.344019528265607e-06,
"loss": 0.5273695468902588,
"memory(GiB)": 76.04,
"step": 6310,
"token_acc": 0.8691662296801258,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6327493697071562,
"grad_norm": 1.0383869409561157,
"learning_rate": 2.3404631181485053e-06,
"loss": 0.5135766983032226,
"memory(GiB)": 76.04,
"step": 6315,
"token_acc": 0.8497678608551036,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6340422781045962,
"grad_norm": 0.8791532516479492,
"learning_rate": 2.3369070321751085e-06,
"loss": 0.5190924167633056,
"memory(GiB)": 76.04,
"step": 6320,
"token_acc": 0.8457278865303347,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6353351865020365,
"grad_norm": 1.2614482641220093,
"learning_rate": 2.3333512775705975e-06,
"loss": 0.5101301193237304,
"memory(GiB)": 76.04,
"step": 6325,
"token_acc": 0.8530480522450639,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6366280948994762,
"grad_norm": 1.5320963859558105,
"learning_rate": 2.3297958615594786e-06,
"loss": 0.4884361743927002,
"memory(GiB)": 76.04,
"step": 6330,
"token_acc": 0.8361637380975754,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6379210032969165,
"grad_norm": 1.157752275466919,
"learning_rate": 2.326240791365575e-06,
"loss": 0.4981177806854248,
"memory(GiB)": 76.04,
"step": 6335,
"token_acc": 0.8416252072968491,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6392139116943565,
"grad_norm": 1.2564195394515991,
"learning_rate": 2.3226860742120017e-06,
"loss": 0.538153886795044,
"memory(GiB)": 76.04,
"step": 6340,
"token_acc": 0.8415269756303705,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6405068200917965,
"grad_norm": 0.9583210945129395,
"learning_rate": 2.319131717321159e-06,
"loss": 0.4883336067199707,
"memory(GiB)": 76.04,
"step": 6345,
"token_acc": 0.8325664381632079,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6417997284892365,
"grad_norm": 1.2164779901504517,
"learning_rate": 2.3155777279147156e-06,
"loss": 0.5153134346008301,
"memory(GiB)": 76.04,
"step": 6350,
"token_acc": 0.8470916505687915,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6430926368866765,
"grad_norm": 1.1382580995559692,
"learning_rate": 2.312024113213592e-06,
"loss": 0.5252164840698242,
"memory(GiB)": 76.04,
"step": 6355,
"token_acc": 0.8316279498525073,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6443855452841167,
"grad_norm": 0.8894354104995728,
"learning_rate": 2.3084708804379497e-06,
"loss": 0.5195868015289307,
"memory(GiB)": 76.04,
"step": 6360,
"token_acc": 0.8409683261916332,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6456784536815565,
"grad_norm": 1.683902382850647,
"learning_rate": 2.3049180368071724e-06,
"loss": 0.5006110191345214,
"memory(GiB)": 76.04,
"step": 6365,
"token_acc": 0.8291754756871036,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6469713620789967,
"grad_norm": 1.7893975973129272,
"learning_rate": 2.301365589539853e-06,
"loss": 0.49852724075317384,
"memory(GiB)": 76.04,
"step": 6370,
"token_acc": 0.8365970585845454,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.6482642704764368,
"grad_norm": 0.9394116997718811,
"learning_rate": 2.2978135458537793e-06,
"loss": 0.5331932067871094,
"memory(GiB)": 76.04,
"step": 6375,
"token_acc": 0.8245893719806763,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6495571788738768,
"grad_norm": 1.047746181488037,
"learning_rate": 2.2942619129659205e-06,
"loss": 0.5376855850219726,
"memory(GiB)": 76.04,
"step": 6380,
"token_acc": 0.8324474924989285,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.650850087271317,
"grad_norm": 0.8962509632110596,
"learning_rate": 2.2907106980924104e-06,
"loss": 0.4863112449645996,
"memory(GiB)": 76.04,
"step": 6385,
"token_acc": 0.8503153721391241,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6521429956687568,
"grad_norm": 1.389729380607605,
"learning_rate": 2.2871599084485325e-06,
"loss": 0.5152921676635742,
"memory(GiB)": 76.04,
"step": 6390,
"token_acc": 0.8414910086935811,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.653435904066197,
"grad_norm": 0.9997110366821289,
"learning_rate": 2.2836095512487063e-06,
"loss": 0.5211985588073731,
"memory(GiB)": 76.04,
"step": 6395,
"token_acc": 0.857375318849503,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6547288124636368,
"grad_norm": 1.0818804502487183,
"learning_rate": 2.280059633706475e-06,
"loss": 0.5084996223449707,
"memory(GiB)": 76.04,
"step": 6400,
"token_acc": 0.8613592233009709,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.656021720861077,
"grad_norm": 0.9981624484062195,
"learning_rate": 2.276510163034486e-06,
"loss": 0.5429449081420898,
"memory(GiB)": 76.04,
"step": 6405,
"token_acc": 0.8164522088613749,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.657314629258517,
"grad_norm": 0.9858782887458801,
"learning_rate": 2.2729611464444797e-06,
"loss": 0.5149686813354493,
"memory(GiB)": 76.04,
"step": 6410,
"token_acc": 0.8206152336907014,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.658607537655957,
"grad_norm": 0.9807013273239136,
"learning_rate": 2.2694125911472743e-06,
"loss": 0.5264925479888916,
"memory(GiB)": 76.04,
"step": 6415,
"token_acc": 0.8599959754502465,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6599004460533973,
"grad_norm": 1.4319406747817993,
"learning_rate": 2.265864504352749e-06,
"loss": 0.5101997375488281,
"memory(GiB)": 76.04,
"step": 6420,
"token_acc": 0.8523102555710927,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.661193354450837,
"grad_norm": 1.1590079069137573,
"learning_rate": 2.2623168932698347e-06,
"loss": 0.4951170444488525,
"memory(GiB)": 76.04,
"step": 6425,
"token_acc": 0.852101487651052,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6624862628482773,
"grad_norm": 0.9497671127319336,
"learning_rate": 2.258769765106492e-06,
"loss": 0.5196887969970703,
"memory(GiB)": 76.04,
"step": 6430,
"token_acc": 0.8118867658795361,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6637791712457173,
"grad_norm": 1.0885018110275269,
"learning_rate": 2.255223127069702e-06,
"loss": 0.5309447765350341,
"memory(GiB)": 76.04,
"step": 6435,
"token_acc": 0.8558985773734636,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6650720796431573,
"grad_norm": 1.0342893600463867,
"learning_rate": 2.251676986365449e-06,
"loss": 0.49361910820007326,
"memory(GiB)": 76.04,
"step": 6440,
"token_acc": 0.8603108210435222,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.6663649880405973,
"grad_norm": 1.072338342666626,
"learning_rate": 2.2481313501987103e-06,
"loss": 0.5142477035522461,
"memory(GiB)": 76.04,
"step": 6445,
"token_acc": 0.8642217409120178,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6676578964380373,
"grad_norm": 1.2066073417663574,
"learning_rate": 2.2445862257734317e-06,
"loss": 0.5130002975463868,
"memory(GiB)": 76.04,
"step": 6450,
"token_acc": 0.8323050805349675,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6689508048354775,
"grad_norm": 1.054640769958496,
"learning_rate": 2.2410416202925262e-06,
"loss": 0.5043740749359131,
"memory(GiB)": 76.04,
"step": 6455,
"token_acc": 0.8686844613918017,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.6702437132329173,
"grad_norm": 2.2036914825439453,
"learning_rate": 2.237497540957848e-06,
"loss": 0.5211320877075195,
"memory(GiB)": 76.04,
"step": 6460,
"token_acc": 0.8593575418994414,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.6715366216303575,
"grad_norm": 1.2252447605133057,
"learning_rate": 2.2339539949701817e-06,
"loss": 0.5284463882446289,
"memory(GiB)": 76.04,
"step": 6465,
"token_acc": 0.8268600408188509,
"train_speed(iter/s)": 0.027631
},
{
"epoch": 1.6728295300277976,
"grad_norm": 1.165474772453308,
"learning_rate": 2.230410989529233e-06,
"loss": 0.5352771759033204,
"memory(GiB)": 76.04,
"step": 6470,
"token_acc": 0.8386907812843231,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6741224384252376,
"grad_norm": 0.8712031841278076,
"learning_rate": 2.226868531833605e-06,
"loss": 0.5065167903900146,
"memory(GiB)": 76.04,
"step": 6475,
"token_acc": 0.8473345970687503,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6754153468226776,
"grad_norm": 1.0779868364334106,
"learning_rate": 2.2233266290807886e-06,
"loss": 0.5394890785217286,
"memory(GiB)": 76.04,
"step": 6480,
"token_acc": 0.8533062727144003,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6767082552201176,
"grad_norm": 1.0535589456558228,
"learning_rate": 2.2197852884671487e-06,
"loss": 0.5131864547729492,
"memory(GiB)": 76.04,
"step": 6485,
"token_acc": 0.8391084472747705,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6780011636175578,
"grad_norm": 0.8883035778999329,
"learning_rate": 2.2162445171879067e-06,
"loss": 0.5062174320220947,
"memory(GiB)": 76.04,
"step": 6490,
"token_acc": 0.853890824622532,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6792940720149976,
"grad_norm": 0.983790934085846,
"learning_rate": 2.212704322437129e-06,
"loss": 0.500247859954834,
"memory(GiB)": 76.04,
"step": 6495,
"token_acc": 0.860179981923213,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.6805869804124378,
"grad_norm": 3.652728319168091,
"learning_rate": 2.2091647114077083e-06,
"loss": 0.5243520736694336,
"memory(GiB)": 76.04,
"step": 6500,
"token_acc": 0.8531156542628818,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6818798888098778,
"grad_norm": 1.112502098083496,
"learning_rate": 2.2056256912913508e-06,
"loss": 0.5279044151306153,
"memory(GiB)": 76.04,
"step": 6505,
"token_acc": 0.8433751743375174,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6831727972073178,
"grad_norm": 1.060985803604126,
"learning_rate": 2.2020872692785666e-06,
"loss": 0.5015209197998047,
"memory(GiB)": 76.04,
"step": 6510,
"token_acc": 0.8586926542245105,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.684465705604758,
"grad_norm": 1.1359707117080688,
"learning_rate": 2.1985494525586458e-06,
"loss": 0.4859332084655762,
"memory(GiB)": 76.04,
"step": 6515,
"token_acc": 0.844207331995497,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6857586140021978,
"grad_norm": 1.0586234331130981,
"learning_rate": 2.1950122483196513e-06,
"loss": 0.5136495590209961,
"memory(GiB)": 76.04,
"step": 6520,
"token_acc": 0.8458574181117534,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.687051522399638,
"grad_norm": 2.7810006141662598,
"learning_rate": 2.191475663748401e-06,
"loss": 0.5169890403747559,
"memory(GiB)": 76.04,
"step": 6525,
"token_acc": 0.8014549325762953,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.688344430797078,
"grad_norm": 1.60906982421875,
"learning_rate": 2.1879397060304518e-06,
"loss": 0.5097242832183838,
"memory(GiB)": 76.04,
"step": 6530,
"token_acc": 0.8751048951048951,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.689637339194518,
"grad_norm": 1.4863407611846924,
"learning_rate": 2.1844043823500912e-06,
"loss": 0.5065485954284668,
"memory(GiB)": 76.04,
"step": 6535,
"token_acc": 0.8393135069196147,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.690930247591958,
"grad_norm": 1.1122289896011353,
"learning_rate": 2.1808696998903147e-06,
"loss": 0.4878704071044922,
"memory(GiB)": 76.04,
"step": 6540,
"token_acc": 0.8160763559053693,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.692223155989398,
"grad_norm": 2.09844970703125,
"learning_rate": 2.177335665832816e-06,
"loss": 0.5010098457336426,
"memory(GiB)": 76.04,
"step": 6545,
"token_acc": 0.844059639520619,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6935160643868383,
"grad_norm": 1.2505282163619995,
"learning_rate": 2.1738022873579724e-06,
"loss": 0.5115324020385742,
"memory(GiB)": 76.04,
"step": 6550,
"token_acc": 0.8518385971190838,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6948089727842781,
"grad_norm": 35.658302307128906,
"learning_rate": 2.1702695716448276e-06,
"loss": 0.5169626235961914,
"memory(GiB)": 76.04,
"step": 6555,
"token_acc": 0.8498464176012572,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6961018811817183,
"grad_norm": 1.1642698049545288,
"learning_rate": 2.166737525871081e-06,
"loss": 0.5165857315063477,
"memory(GiB)": 76.04,
"step": 6560,
"token_acc": 0.8605155555555556,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.6973947895791583,
"grad_norm": 1.4594037532806396,
"learning_rate": 2.1632061572130687e-06,
"loss": 0.48950824737548826,
"memory(GiB)": 76.04,
"step": 6565,
"token_acc": 0.832944099378882,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6986876979765984,
"grad_norm": 1.3144365549087524,
"learning_rate": 2.1596754728457508e-06,
"loss": 0.5155162811279297,
"memory(GiB)": 76.04,
"step": 6570,
"token_acc": 0.8293847917462743,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.6999806063740384,
"grad_norm": 1.3902095556259155,
"learning_rate": 2.1561454799426997e-06,
"loss": 0.5293027877807617,
"memory(GiB)": 76.04,
"step": 6575,
"token_acc": 0.8462088378535365,
"train_speed(iter/s)": 0.02763
},
{
"epoch": 1.7012735147714784,
"grad_norm": 1.1059478521347046,
"learning_rate": 2.1526161856760806e-06,
"loss": 0.5223227500915527,
"memory(GiB)": 76.04,
"step": 6580,
"token_acc": 0.8502826247235193,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.7025664231689186,
"grad_norm": 1.1676130294799805,
"learning_rate": 2.1490875972166394e-06,
"loss": 0.5052920341491699,
"memory(GiB)": 76.04,
"step": 6585,
"token_acc": 0.8628404326533489,
"train_speed(iter/s)": 0.027629
},
{
"epoch": 1.7038593315663584,
"grad_norm": 1.3875575065612793,
"learning_rate": 2.1455597217336895e-06,
"loss": 0.5150994777679443,
"memory(GiB)": 76.04,
"step": 6590,
"token_acc": 0.8165382212039158,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.7051522399637986,
"grad_norm": 1.5124651193618774,
"learning_rate": 2.1420325663950923e-06,
"loss": 0.4880176544189453,
"memory(GiB)": 76.04,
"step": 6595,
"token_acc": 0.8497203061189471,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.7064451483612386,
"grad_norm": 1.1382073163986206,
"learning_rate": 2.138506138367252e-06,
"loss": 0.496349573135376,
"memory(GiB)": 76.04,
"step": 6600,
"token_acc": 0.8560916156924068,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.7077380567586786,
"grad_norm": 1.3284364938735962,
"learning_rate": 2.134980444815089e-06,
"loss": 0.5333932876586914,
"memory(GiB)": 76.04,
"step": 6605,
"token_acc": 0.8525549959102846,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.7090309651561189,
"grad_norm": 0.9929280281066895,
"learning_rate": 2.1314554929020335e-06,
"loss": 0.49078850746154784,
"memory(GiB)": 76.04,
"step": 6610,
"token_acc": 0.8561151079136691,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.7103238735535586,
"grad_norm": 1.0720187425613403,
"learning_rate": 2.1279312897900097e-06,
"loss": 0.5510223388671875,
"memory(GiB)": 76.04,
"step": 6615,
"token_acc": 0.8111014442317731,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.7116167819509989,
"grad_norm": 0.9900431036949158,
"learning_rate": 2.124407842639421e-06,
"loss": 0.535820198059082,
"memory(GiB)": 76.04,
"step": 6620,
"token_acc": 0.8021880945909214,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.7129096903484387,
"grad_norm": 1.2437286376953125,
"learning_rate": 2.120885158609132e-06,
"loss": 0.5138998985290527,
"memory(GiB)": 76.04,
"step": 6625,
"token_acc": 0.8522361238259926,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.7142025987458789,
"grad_norm": 1.634427547454834,
"learning_rate": 2.1173632448564603e-06,
"loss": 0.4958186149597168,
"memory(GiB)": 76.04,
"step": 6630,
"token_acc": 0.8392738961898459,
"train_speed(iter/s)": 0.027627
},
{
"epoch": 1.7154955071433189,
"grad_norm": 0.9783524870872498,
"learning_rate": 2.113842108537155e-06,
"loss": 0.51722412109375,
"memory(GiB)": 76.04,
"step": 6635,
"token_acc": 0.8480223559759243,
"train_speed(iter/s)": 0.027628
},
{
"epoch": 1.716788415540759,
"grad_norm": 0.919309139251709,
"learning_rate": 2.110321756805388e-06,
"loss": 0.4969566822052002,
"memory(GiB)": 76.04,
"step": 6640,
"token_acc": 0.8367924528301887,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.7180813239381991,
"grad_norm": 1.3210502862930298,
"learning_rate": 2.1068021968137367e-06,
"loss": 0.509549903869629,
"memory(GiB)": 76.04,
"step": 6645,
"token_acc": 0.8161617605030008,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.719374232335639,
"grad_norm": 1.611215591430664,
"learning_rate": 2.103283435713169e-06,
"loss": 0.49874000549316405,
"memory(GiB)": 76.04,
"step": 6650,
"token_acc": 0.8636546184738956,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.7206671407330791,
"grad_norm": 1.5470666885375977,
"learning_rate": 2.0997654806530314e-06,
"loss": 0.5100409984588623,
"memory(GiB)": 76.04,
"step": 6655,
"token_acc": 0.8550865800865801,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.7219600491305191,
"grad_norm": 1.1041886806488037,
"learning_rate": 2.0962483387810293e-06,
"loss": 0.5100605964660645,
"memory(GiB)": 76.04,
"step": 6660,
"token_acc": 0.8544809228039042,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.7232529575279592,
"grad_norm": 1.3654582500457764,
"learning_rate": 2.092732017243221e-06,
"loss": 0.5010916709899902,
"memory(GiB)": 76.04,
"step": 6665,
"token_acc": 0.8365678065576336,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.7245458659253992,
"grad_norm": 1.7149609327316284,
"learning_rate": 2.0892165231839935e-06,
"loss": 0.5101409912109375,
"memory(GiB)": 76.04,
"step": 6670,
"token_acc": 0.866217041193058,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.7258387743228392,
"grad_norm": 1.0402779579162598,
"learning_rate": 2.085701863746054e-06,
"loss": 0.5074934005737305,
"memory(GiB)": 76.04,
"step": 6675,
"token_acc": 0.8241552132337115,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.7271316827202794,
"grad_norm": 1.1275187730789185,
"learning_rate": 2.082188046070414e-06,
"loss": 0.48826584815979,
"memory(GiB)": 76.04,
"step": 6680,
"token_acc": 0.8426791277258567,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.7284245911177192,
"grad_norm": 0.9774495363235474,
"learning_rate": 2.0786750772963758e-06,
"loss": 0.49518957138061526,
"memory(GiB)": 76.04,
"step": 6685,
"token_acc": 0.8610470275066548,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.7297174995151594,
"grad_norm": 0.9467169642448425,
"learning_rate": 2.0751629645615155e-06,
"loss": 0.5169444561004639,
"memory(GiB)": 76.04,
"step": 6690,
"token_acc": 0.8478371242891958,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.7310104079125994,
"grad_norm": 1.2938302755355835,
"learning_rate": 2.071651715001671e-06,
"loss": 0.549882173538208,
"memory(GiB)": 76.04,
"step": 6695,
"token_acc": 0.8068968578022369,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.7323033163100394,
"grad_norm": 2.241086721420288,
"learning_rate": 2.068141335750925e-06,
"loss": 0.49896945953369143,
"memory(GiB)": 76.04,
"step": 6700,
"token_acc": 0.845947499520981,
"train_speed(iter/s)": 0.027626
},
{
"epoch": 1.7335962247074794,
"grad_norm": 2.3480138778686523,
"learning_rate": 2.0646318339415917e-06,
"loss": 0.5186596393585206,
"memory(GiB)": 76.04,
"step": 6705,
"token_acc": 0.8365800865800865,
"train_speed(iter/s)": 0.027625
},
{
"epoch": 1.7348891331049194,
"grad_norm": 1.1465567350387573,
"learning_rate": 2.0611232167042062e-06,
"loss": 0.504915428161621,
"memory(GiB)": 76.04,
"step": 6710,
"token_acc": 0.8566171520890364,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.7361820415023597,
"grad_norm": 0.9098281860351562,
"learning_rate": 2.0576154911675024e-06,
"loss": 0.49738254547119143,
"memory(GiB)": 76.04,
"step": 6715,
"token_acc": 0.8353113246970331,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.7374749498997994,
"grad_norm": 2.8491294384002686,
"learning_rate": 2.0541086644584033e-06,
"loss": 0.48783140182495116,
"memory(GiB)": 76.04,
"step": 6720,
"token_acc": 0.8488560619708161,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.7387678582972397,
"grad_norm": 1.6546626091003418,
"learning_rate": 2.0506027437020067e-06,
"loss": 0.5130843162536621,
"memory(GiB)": 76.04,
"step": 6725,
"token_acc": 0.8279507603186097,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.7400607666946797,
"grad_norm": 1.783795952796936,
"learning_rate": 2.047097736021569e-06,
"loss": 0.5069493293762207,
"memory(GiB)": 76.04,
"step": 6730,
"token_acc": 0.8449707155589509,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.7413536750921197,
"grad_norm": 1.0760575532913208,
"learning_rate": 2.043593648538492e-06,
"loss": 0.5043985366821289,
"memory(GiB)": 76.04,
"step": 6735,
"token_acc": 0.8538088715625329,
"train_speed(iter/s)": 0.027624
},
{
"epoch": 1.74264658348956,
"grad_norm": 0.9393293261528015,
"learning_rate": 2.0400904883723074e-06,
"loss": 0.5335483551025391,
"memory(GiB)": 76.04,
"step": 6740,
"token_acc": 0.8334933205343572,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.7439394918869997,
"grad_norm": 1.1046473979949951,
"learning_rate": 2.036588262640661e-06,
"loss": 0.5038503170013428,
"memory(GiB)": 76.04,
"step": 6745,
"token_acc": 0.8518955250280055,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.74523240028444,
"grad_norm": 3.058380603790283,
"learning_rate": 2.0330869784593054e-06,
"loss": 0.5195840835571289,
"memory(GiB)": 76.04,
"step": 6750,
"token_acc": 0.8470919324577861,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.74652530868188,
"grad_norm": 0.9729762673377991,
"learning_rate": 2.029586642942074e-06,
"loss": 0.5047917366027832,
"memory(GiB)": 76.04,
"step": 6755,
"token_acc": 0.8124232148999951,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.74781821707932,
"grad_norm": 1.6101481914520264,
"learning_rate": 2.026087263200876e-06,
"loss": 0.5221758365631104,
"memory(GiB)": 76.04,
"step": 6760,
"token_acc": 0.8239374739860333,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.74911112547676,
"grad_norm": 1.5293364524841309,
"learning_rate": 2.0225888463456787e-06,
"loss": 0.5044497489929199,
"memory(GiB)": 76.04,
"step": 6765,
"token_acc": 0.8550766191978082,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.7504040338742,
"grad_norm": 0.9731065034866333,
"learning_rate": 2.019091399484491e-06,
"loss": 0.499710750579834,
"memory(GiB)": 76.04,
"step": 6770,
"token_acc": 0.8571820068120425,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.7516969422716402,
"grad_norm": 1.806395173072815,
"learning_rate": 2.0155949297233542e-06,
"loss": 0.5355013847351074,
"memory(GiB)": 76.04,
"step": 6775,
"token_acc": 0.813873528994754,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.75298985066908,
"grad_norm": 1.0448716878890991,
"learning_rate": 2.012099444166322e-06,
"loss": 0.5205565929412842,
"memory(GiB)": 76.04,
"step": 6780,
"token_acc": 0.8436368468258978,
"train_speed(iter/s)": 0.027623
},
{
"epoch": 1.7542827590665202,
"grad_norm": 1.2543259859085083,
"learning_rate": 2.008604949915448e-06,
"loss": 0.5098013877868652,
"memory(GiB)": 76.04,
"step": 6785,
"token_acc": 0.8451558833389206,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.7555756674639602,
"grad_norm": 0.9221981167793274,
"learning_rate": 2.005111454070773e-06,
"loss": 0.5172914505004883,
"memory(GiB)": 76.04,
"step": 6790,
"token_acc": 0.8517071704916801,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.7568685758614002,
"grad_norm": 4.873886585235596,
"learning_rate": 2.0016189637303087e-06,
"loss": 0.5167638778686523,
"memory(GiB)": 76.04,
"step": 6795,
"token_acc": 0.842072213500785,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.7581614842588402,
"grad_norm": 0.9154864549636841,
"learning_rate": 1.9981274859900253e-06,
"loss": 0.49820308685302733,
"memory(GiB)": 76.04,
"step": 6800,
"token_acc": 0.8464762230585016,
"train_speed(iter/s)": 0.027622
},
{
"epoch": 1.7594543926562802,
"grad_norm": 1.0603070259094238,
"learning_rate": 1.9946370279438337e-06,
"loss": 0.5082100868225098,
"memory(GiB)": 76.04,
"step": 6805,
"token_acc": 0.8356841646066598,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.7607473010537205,
"grad_norm": 1.0266915559768677,
"learning_rate": 1.9911475966835735e-06,
"loss": 0.5149668216705322,
"memory(GiB)": 76.04,
"step": 6810,
"token_acc": 0.8340411379451494,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.7620402094511602,
"grad_norm": 1.4958763122558594,
"learning_rate": 1.987659199298997e-06,
"loss": 0.515727186203003,
"memory(GiB)": 76.04,
"step": 6815,
"token_acc": 0.8732497922127741,
"train_speed(iter/s)": 0.027621
},
{
"epoch": 1.7633331178486005,
"grad_norm": 1.098486304283142,
"learning_rate": 1.984171842877759e-06,
"loss": 0.49164752960205077,
"memory(GiB)": 76.04,
"step": 6820,
"token_acc": 0.850276862943795,
"train_speed(iter/s)": 0.02762
},
{
"epoch": 1.7646260262460405,
"grad_norm": 0.9649628400802612,
"learning_rate": 1.9806855345053964e-06,
"loss": 0.4989636898040771,
"memory(GiB)": 76.04,
"step": 6825,
"token_acc": 0.8244157210490719,
"train_speed(iter/s)": 0.02762
},
{
"epoch": 1.7659189346434805,
"grad_norm": 1.1349451541900635,
"learning_rate": 1.977200281265319e-06,
"loss": 0.5093589782714844,
"memory(GiB)": 76.04,
"step": 6830,
"token_acc": 0.8507517284266745,
"train_speed(iter/s)": 0.027619
},
{
"epoch": 1.7672118430409207,
"grad_norm": 0.9703465700149536,
"learning_rate": 1.9737160902387896e-06,
"loss": 0.5363808631896972,
"memory(GiB)": 76.04,
"step": 6835,
"token_acc": 0.8378486587481649,
"train_speed(iter/s)": 0.027619
},
{
"epoch": 1.7685047514383605,
"grad_norm": 1.3537312746047974,
"learning_rate": 1.9702329685049167e-06,
"loss": 0.4682920455932617,
"memory(GiB)": 76.04,
"step": 6840,
"token_acc": 0.8536826495304004,
"train_speed(iter/s)": 0.027618
},
{
"epoch": 1.7697976598358007,
"grad_norm": 1.2528867721557617,
"learning_rate": 1.9667509231406332e-06,
"loss": 0.5215599060058593,
"memory(GiB)": 76.04,
"step": 6845,
"token_acc": 0.8464350200378737,
"train_speed(iter/s)": 0.027618
},
{
"epoch": 1.7710905682332405,
"grad_norm": 1.063242793083191,
"learning_rate": 1.963269961220687e-06,
"loss": 0.5151140689849854,
"memory(GiB)": 76.04,
"step": 6850,
"token_acc": 0.8156099097207642,
"train_speed(iter/s)": 0.027617
},
{
"epoch": 1.7723834766306807,
"grad_norm": 1.098589539527893,
"learning_rate": 1.9597900898176212e-06,
"loss": 0.5092347145080567,
"memory(GiB)": 76.04,
"step": 6855,
"token_acc": 0.8469693605683837,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7736763850281207,
"grad_norm": 1.204325556755066,
"learning_rate": 1.9563113160017692e-06,
"loss": 0.5028075218200684,
"memory(GiB)": 76.04,
"step": 6860,
"token_acc": 0.8575784400511459,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7749692934255608,
"grad_norm": 1.1051974296569824,
"learning_rate": 1.952833646841229e-06,
"loss": 0.5096775531768799,
"memory(GiB)": 76.04,
"step": 6865,
"token_acc": 0.844632645043,
"train_speed(iter/s)": 0.027615
},
{
"epoch": 1.776262201823001,
"grad_norm": 1.019508957862854,
"learning_rate": 1.949357089401858e-06,
"loss": 0.5253026962280274,
"memory(GiB)": 76.04,
"step": 6870,
"token_acc": 0.8215040397762585,
"train_speed(iter/s)": 0.027615
},
{
"epoch": 1.7775551102204408,
"grad_norm": 1.104564905166626,
"learning_rate": 1.9458816507472508e-06,
"loss": 0.5215746879577636,
"memory(GiB)": 76.04,
"step": 6875,
"token_acc": 0.8430459464254035,
"train_speed(iter/s)": 0.027615
},
{
"epoch": 1.778848018617881,
"grad_norm": 1.5065202713012695,
"learning_rate": 1.942407337938731e-06,
"loss": 0.5007893562316894,
"memory(GiB)": 76.04,
"step": 6880,
"token_acc": 0.877871314353399,
"train_speed(iter/s)": 0.027615
},
{
"epoch": 1.780140927015321,
"grad_norm": 1.023169994354248,
"learning_rate": 1.9389341580353376e-06,
"loss": 0.5197202682495117,
"memory(GiB)": 76.04,
"step": 6885,
"token_acc": 0.8612209229744913,
"train_speed(iter/s)": 0.027615
},
{
"epoch": 1.781433835412761,
"grad_norm": 1.0600008964538574,
"learning_rate": 1.9354621180938025e-06,
"loss": 0.5054890155792237,
"memory(GiB)": 76.04,
"step": 6890,
"token_acc": 0.8306423761008461,
"train_speed(iter/s)": 0.027615
},
{
"epoch": 1.782726743810201,
"grad_norm": 1.1297129392623901,
"learning_rate": 1.931991225168544e-06,
"loss": 0.5017886161804199,
"memory(GiB)": 76.04,
"step": 6895,
"token_acc": 0.8641059027777778,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.784019652207641,
"grad_norm": 1.0830426216125488,
"learning_rate": 1.92852148631165e-06,
"loss": 0.5002258777618408,
"memory(GiB)": 76.04,
"step": 6900,
"token_acc": 0.8582578976537965,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7853125606050813,
"grad_norm": 1.4555113315582275,
"learning_rate": 1.9250529085728656e-06,
"loss": 0.5128755569458008,
"memory(GiB)": 76.04,
"step": 6905,
"token_acc": 0.861249647125247,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.786605469002521,
"grad_norm": 1.1025890111923218,
"learning_rate": 1.9215854989995726e-06,
"loss": 0.5137574195861816,
"memory(GiB)": 76.04,
"step": 6910,
"token_acc": 0.8645792423863758,
"train_speed(iter/s)": 0.027617
},
{
"epoch": 1.7878983773999613,
"grad_norm": 0.8277395367622375,
"learning_rate": 1.9181192646367815e-06,
"loss": 0.4998950958251953,
"memory(GiB)": 76.04,
"step": 6915,
"token_acc": 0.86408374778284,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7891912857974013,
"grad_norm": 4.725937843322754,
"learning_rate": 1.914654212527114e-06,
"loss": 0.48327035903930665,
"memory(GiB)": 76.04,
"step": 6920,
"token_acc": 0.8672415229525952,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7904841941948413,
"grad_norm": 1.076759934425354,
"learning_rate": 1.9111903497107924e-06,
"loss": 0.5146621704101563,
"memory(GiB)": 76.04,
"step": 6925,
"token_acc": 0.8483241482097674,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7917771025922813,
"grad_norm": 1.0654821395874023,
"learning_rate": 1.90772768322562e-06,
"loss": 0.5301095962524414,
"memory(GiB)": 76.04,
"step": 6930,
"token_acc": 0.8365392073218025,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7930700109897213,
"grad_norm": 0.9456325769424438,
"learning_rate": 1.9042662201069705e-06,
"loss": 0.4947515487670898,
"memory(GiB)": 76.04,
"step": 6935,
"token_acc": 0.853165902597834,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7943629193871615,
"grad_norm": 0.9174202680587769,
"learning_rate": 1.9008059673877728e-06,
"loss": 0.5024736404418946,
"memory(GiB)": 76.04,
"step": 6940,
"token_acc": 0.826625231817954,
"train_speed(iter/s)": 0.027615
},
{
"epoch": 1.7956558277846013,
"grad_norm": 1.0330584049224854,
"learning_rate": 1.8973469320984939e-06,
"loss": 0.5240283012390137,
"memory(GiB)": 76.04,
"step": 6945,
"token_acc": 0.8250224483687518,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7969487361820415,
"grad_norm": 1.1314250230789185,
"learning_rate": 1.893889121267132e-06,
"loss": 0.5122389793395996,
"memory(GiB)": 76.04,
"step": 6950,
"token_acc": 0.8462841506319767,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7982416445794815,
"grad_norm": 0.9879010319709778,
"learning_rate": 1.8904325419191941e-06,
"loss": 0.5107357025146484,
"memory(GiB)": 76.04,
"step": 6955,
"token_acc": 0.8432230939274413,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.7995345529769216,
"grad_norm": 0.9637373089790344,
"learning_rate": 1.886977201077685e-06,
"loss": 0.5289700508117676,
"memory(GiB)": 76.04,
"step": 6960,
"token_acc": 0.8506177606177606,
"train_speed(iter/s)": 0.027616
},
{
"epoch": 1.8008274613743618,
"grad_norm": 0.8645473122596741,
"learning_rate": 1.8835231057630955e-06,
"loss": 0.5153064727783203,
"memory(GiB)": 76.04,
"step": 6965,
"token_acc": 0.8533177661023545,
"train_speed(iter/s)": 0.027617
},
{
"epoch": 1.8021203697718016,
"grad_norm": 0.9590107202529907,
"learning_rate": 1.8800702629933828e-06,
"loss": 0.4972332000732422,
"memory(GiB)": 76.04,
"step": 6970,
"token_acc": 0.8284552126624812,
"train_speed(iter/s)": 0.027618
},
{
"epoch": 1.8034132781692418,
"grad_norm": 0.996835470199585,
"learning_rate": 1.8766186797839625e-06,
"loss": 0.48930206298828127,
"memory(GiB)": 76.04,
"step": 6975,
"token_acc": 0.8765837634913186,
"train_speed(iter/s)": 0.027617
},
{
"epoch": 1.8047061865666818,
"grad_norm": 1.4224867820739746,
"learning_rate": 1.8731683631476885e-06,
"loss": 0.5020298480987548,
"memory(GiB)": 76.04,
"step": 6980,
"token_acc": 0.8626479614204297,
"train_speed(iter/s)": 0.027617
},
{
"epoch": 1.8059990949641218,
"grad_norm": 1.8809725046157837,
"learning_rate": 1.8697193200948415e-06,
"loss": 0.5353089332580566,
"memory(GiB)": 76.04,
"step": 6985,
"token_acc": 0.8270812946250589,
"train_speed(iter/s)": 0.027618
},
{
"epoch": 1.8072920033615618,
"grad_norm": 0.9029675126075745,
"learning_rate": 1.866271557633115e-06,
"loss": 0.4966177463531494,
"memory(GiB)": 76.04,
"step": 6990,
"token_acc": 0.8467085471597947,
"train_speed(iter/s)": 0.027617
},
{
"epoch": 1.8085849117590018,
"grad_norm": 1.1452031135559082,
"learning_rate": 1.862825082767602e-06,
"loss": 0.5193626403808593,
"memory(GiB)": 76.04,
"step": 6995,
"token_acc": 0.8338509316770186,
"train_speed(iter/s)": 0.027617
},
{
"epoch": 1.809877820156442,
"grad_norm": 1.0218499898910522,
"learning_rate": 1.8593799025007772e-06,
"loss": 0.4930767059326172,
"memory(GiB)": 76.04,
"step": 7000,
"token_acc": 0.8444614310877729,
"train_speed(iter/s)": 0.027616
}
],
"logging_steps": 5,
"max_steps": 11601,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.806726131996636e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}