{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2562,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00117096018735363,
"grad_norm": 4.268844127655029,
"learning_rate": 1.5503875968992249e-07,
"loss": 0.6057141423225403,
"memory(GiB)": 127.52,
"step": 1,
"token_acc": 0.8403535163595337,
"train_speed(iter/s)": 0.023362
},
{
"epoch": 0.00585480093676815,
"grad_norm": 4.00634765625,
"learning_rate": 7.751937984496125e-07,
"loss": 0.5714304447174072,
"memory(GiB)": 127.52,
"step": 5,
"token_acc": 0.8414377559265664,
"train_speed(iter/s)": 0.058165
},
{
"epoch": 0.0117096018735363,
"grad_norm": 3.714452028274536,
"learning_rate": 1.550387596899225e-06,
"loss": 0.5679570198059082,
"memory(GiB)": 127.52,
"step": 10,
"token_acc": 0.8314816958517272,
"train_speed(iter/s)": 0.072408
},
{
"epoch": 0.01756440281030445,
"grad_norm": 2.5877742767333984,
"learning_rate": 2.3255813953488376e-06,
"loss": 0.5387242317199707,
"memory(GiB)": 127.52,
"step": 15,
"token_acc": 0.8362197181678389,
"train_speed(iter/s)": 0.080114
},
{
"epoch": 0.0234192037470726,
"grad_norm": 1.0245263576507568,
"learning_rate": 3.10077519379845e-06,
"loss": 0.4778164863586426,
"memory(GiB)": 127.52,
"step": 20,
"token_acc": 0.8409289456094262,
"train_speed(iter/s)": 0.083991
},
{
"epoch": 0.02927400468384075,
"grad_norm": 1.0654064416885376,
"learning_rate": 3.875968992248063e-06,
"loss": 0.45667400360107424,
"memory(GiB)": 127.52,
"step": 25,
"token_acc": 0.8518281248542258,
"train_speed(iter/s)": 0.086554
},
{
"epoch": 0.0351288056206089,
"grad_norm": 0.5691505670547485,
"learning_rate": 4.651162790697675e-06,
"loss": 0.44004316329956056,
"memory(GiB)": 127.52,
"step": 30,
"token_acc": 0.8520554823322664,
"train_speed(iter/s)": 0.088726
},
{
"epoch": 0.040983606557377046,
"grad_norm": 0.5251653790473938,
"learning_rate": 5.4263565891472865e-06,
"loss": 0.43890109062194826,
"memory(GiB)": 127.52,
"step": 35,
"token_acc": 0.8498815333197345,
"train_speed(iter/s)": 0.090273
},
{
"epoch": 0.0468384074941452,
"grad_norm": 0.4052143096923828,
"learning_rate": 6.2015503875969e-06,
"loss": 0.41214742660522463,
"memory(GiB)": 127.52,
"step": 40,
"token_acc": 0.8529612170691973,
"train_speed(iter/s)": 0.091488
},
{
"epoch": 0.05269320843091335,
"grad_norm": 0.3396666944026947,
"learning_rate": 6.976744186046513e-06,
"loss": 0.423629093170166,
"memory(GiB)": 127.52,
"step": 45,
"token_acc": 0.845838888731289,
"train_speed(iter/s)": 0.092386
},
{
"epoch": 0.0585480093676815,
"grad_norm": 0.3074694573879242,
"learning_rate": 7.751937984496126e-06,
"loss": 0.41414508819580076,
"memory(GiB)": 127.52,
"step": 50,
"token_acc": 0.8609199657045593,
"train_speed(iter/s)": 0.0932
},
{
"epoch": 0.06440281030444965,
"grad_norm": 0.31701743602752686,
"learning_rate": 8.527131782945736e-06,
"loss": 0.4058098793029785,
"memory(GiB)": 127.52,
"step": 55,
"token_acc": 0.8598621225118498,
"train_speed(iter/s)": 0.093922
},
{
"epoch": 0.0702576112412178,
"grad_norm": 0.29167020320892334,
"learning_rate": 9.30232558139535e-06,
"loss": 0.42685737609863283,
"memory(GiB)": 127.52,
"step": 60,
"token_acc": 0.8583035383662712,
"train_speed(iter/s)": 0.09448
},
{
"epoch": 0.07611241217798595,
"grad_norm": 0.2796083092689514,
"learning_rate": 1.0077519379844963e-05,
"loss": 0.4080663681030273,
"memory(GiB)": 127.52,
"step": 65,
"token_acc": 0.86975264356343,
"train_speed(iter/s)": 0.095012
},
{
"epoch": 0.08196721311475409,
"grad_norm": 0.32925185561180115,
"learning_rate": 1.0852713178294573e-05,
"loss": 0.4072235584259033,
"memory(GiB)": 127.52,
"step": 70,
"token_acc": 0.8576062541566801,
"train_speed(iter/s)": 0.095364
},
{
"epoch": 0.08782201405152225,
"grad_norm": 0.29692476987838745,
"learning_rate": 1.1627906976744187e-05,
"loss": 0.4062563419342041,
"memory(GiB)": 127.52,
"step": 75,
"token_acc": 0.8451938495195714,
"train_speed(iter/s)": 0.0958
},
{
"epoch": 0.0936768149882904,
"grad_norm": 0.32430365681648254,
"learning_rate": 1.24031007751938e-05,
"loss": 0.4052871227264404,
"memory(GiB)": 127.52,
"step": 80,
"token_acc": 0.8536078219242759,
"train_speed(iter/s)": 0.096071
},
{
"epoch": 0.09953161592505855,
"grad_norm": 0.2918962240219116,
"learning_rate": 1.3178294573643412e-05,
"loss": 0.39542815685272215,
"memory(GiB)": 127.52,
"step": 85,
"token_acc": 0.8758206774505389,
"train_speed(iter/s)": 0.096477
},
{
"epoch": 0.1053864168618267,
"grad_norm": 0.30198103189468384,
"learning_rate": 1.3953488372093025e-05,
"loss": 0.4015383243560791,
"memory(GiB)": 127.52,
"step": 90,
"token_acc": 0.8578144099246164,
"train_speed(iter/s)": 0.096833
},
{
"epoch": 0.11124121779859485,
"grad_norm": 0.32643797993659973,
"learning_rate": 1.4728682170542636e-05,
"loss": 0.401915454864502,
"memory(GiB)": 127.52,
"step": 95,
"token_acc": 0.8631059302340187,
"train_speed(iter/s)": 0.097122
},
{
"epoch": 0.117096018735363,
"grad_norm": 0.3097076416015625,
"learning_rate": 1.550387596899225e-05,
"loss": 0.4027417182922363,
"memory(GiB)": 127.52,
"step": 100,
"token_acc": 0.8636492034198335,
"train_speed(iter/s)": 0.097418
},
{
"epoch": 0.12295081967213115,
"grad_norm": 0.28134772181510925,
"learning_rate": 1.6279069767441862e-05,
"loss": 0.39868090152740476,
"memory(GiB)": 127.52,
"step": 105,
"token_acc": 0.8655913809126278,
"train_speed(iter/s)": 0.0977
},
{
"epoch": 0.1288056206088993,
"grad_norm": 0.2668236196041107,
"learning_rate": 1.7054263565891473e-05,
"loss": 0.38587536811828616,
"memory(GiB)": 127.52,
"step": 110,
"token_acc": 0.864132983946116,
"train_speed(iter/s)": 0.09794
},
{
"epoch": 0.13466042154566746,
"grad_norm": 0.3235706686973572,
"learning_rate": 1.7829457364341087e-05,
"loss": 0.40470218658447266,
"memory(GiB)": 127.52,
"step": 115,
"token_acc": 0.8720630828529737,
"train_speed(iter/s)": 0.098141
},
{
"epoch": 0.1405152224824356,
"grad_norm": 0.2895485460758209,
"learning_rate": 1.86046511627907e-05,
"loss": 0.39603259563446047,
"memory(GiB)": 127.52,
"step": 120,
"token_acc": 0.8639678736880146,
"train_speed(iter/s)": 0.09831
},
{
"epoch": 0.14637002341920374,
"grad_norm": 0.3098626434803009,
"learning_rate": 1.937984496124031e-05,
"loss": 0.4097726821899414,
"memory(GiB)": 127.52,
"step": 125,
"token_acc": 0.8581558732162836,
"train_speed(iter/s)": 0.098474
},
{
"epoch": 0.1522248243559719,
"grad_norm": 0.35938969254493713,
"learning_rate": 1.9999991663467044e-05,
"loss": 0.4081538200378418,
"memory(GiB)": 127.52,
"step": 130,
"token_acc": 0.8630013632327376,
"train_speed(iter/s)": 0.098587
},
{
"epoch": 0.15807962529274006,
"grad_norm": 0.3397412896156311,
"learning_rate": 1.9999699886272926e-05,
"loss": 0.40991506576538084,
"memory(GiB)": 127.52,
"step": 135,
"token_acc": 0.8502879675585575,
"train_speed(iter/s)": 0.098664
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.32449835538864136,
"learning_rate": 1.9998991296330317e-05,
"loss": 0.40630359649658204,
"memory(GiB)": 127.52,
"step": 140,
"token_acc": 0.8630894085796805,
"train_speed(iter/s)": 0.098721
},
{
"epoch": 0.16978922716627634,
"grad_norm": 0.32687216997146606,
"learning_rate": 1.9997865923175027e-05,
"loss": 0.396761417388916,
"memory(GiB)": 127.52,
"step": 145,
"token_acc": 0.8691767868585987,
"train_speed(iter/s)": 0.09874
},
{
"epoch": 0.1756440281030445,
"grad_norm": 0.32365313172340393,
"learning_rate": 1.999632381371545e-05,
"loss": 0.40283679962158203,
"memory(GiB)": 127.52,
"step": 150,
"token_acc": 0.8533993606842608,
"train_speed(iter/s)": 0.09881
},
{
"epoch": 0.18149882903981265,
"grad_norm": 0.3086594343185425,
"learning_rate": 1.999436503223061e-05,
"loss": 0.4014937400817871,
"memory(GiB)": 127.52,
"step": 155,
"token_acc": 0.8624249503342012,
"train_speed(iter/s)": 0.098838
},
{
"epoch": 0.1873536299765808,
"grad_norm": 0.32935866713523865,
"learning_rate": 1.9991989660367463e-05,
"loss": 0.4079470634460449,
"memory(GiB)": 127.52,
"step": 160,
"token_acc": 0.8470391967320465,
"train_speed(iter/s)": 0.098876
},
{
"epoch": 0.19320843091334894,
"grad_norm": 0.27776622772216797,
"learning_rate": 1.998919779713751e-05,
"loss": 0.4115422248840332,
"memory(GiB)": 127.52,
"step": 165,
"token_acc": 0.8531520964716057,
"train_speed(iter/s)": 0.098931
},
{
"epoch": 0.1990632318501171,
"grad_norm": 0.28459489345550537,
"learning_rate": 1.998598955891266e-05,
"loss": 0.4005699634552002,
"memory(GiB)": 127.52,
"step": 170,
"token_acc": 0.867363933744935,
"train_speed(iter/s)": 0.099005
},
{
"epoch": 0.20491803278688525,
"grad_norm": 0.3174498379230499,
"learning_rate": 1.9982365079420382e-05,
"loss": 0.38856048583984376,
"memory(GiB)": 127.52,
"step": 175,
"token_acc": 0.8610733940638768,
"train_speed(iter/s)": 0.099065
},
{
"epoch": 0.2107728337236534,
"grad_norm": 0.30468112230300903,
"learning_rate": 1.9978324509738147e-05,
"loss": 0.392287540435791,
"memory(GiB)": 127.52,
"step": 180,
"token_acc": 0.8659642567171478,
"train_speed(iter/s)": 0.099132
},
{
"epoch": 0.21662763466042154,
"grad_norm": 0.31203576922416687,
"learning_rate": 1.9973868018287093e-05,
"loss": 0.3912659168243408,
"memory(GiB)": 127.52,
"step": 185,
"token_acc": 0.8592000200480526,
"train_speed(iter/s)": 0.099222
},
{
"epoch": 0.2224824355971897,
"grad_norm": 0.2872975766658783,
"learning_rate": 1.9968995790825048e-05,
"loss": 0.3968376159667969,
"memory(GiB)": 127.52,
"step": 190,
"token_acc": 0.8492010693857249,
"train_speed(iter/s)": 0.099228
},
{
"epoch": 0.22833723653395785,
"grad_norm": 0.3107975721359253,
"learning_rate": 1.9963708030438754e-05,
"loss": 0.39564805030822753,
"memory(GiB)": 127.52,
"step": 195,
"token_acc": 0.8623048224402377,
"train_speed(iter/s)": 0.099285
},
{
"epoch": 0.234192037470726,
"grad_norm": 0.33172452449798584,
"learning_rate": 1.995800495753542e-05,
"loss": 0.3955163240432739,
"memory(GiB)": 127.52,
"step": 200,
"token_acc": 0.8543361827625122,
"train_speed(iter/s)": 0.099318
},
{
"epoch": 0.24004683840749413,
"grad_norm": 0.4809193015098572,
"learning_rate": 1.9951886809833537e-05,
"loss": 0.40662593841552735,
"memory(GiB)": 127.52,
"step": 205,
"token_acc": 0.8530674732086181,
"train_speed(iter/s)": 0.099376
},
{
"epoch": 0.2459016393442623,
"grad_norm": 0.3544229567050934,
"learning_rate": 1.9945353842352943e-05,
"loss": 0.4021385669708252,
"memory(GiB)": 127.52,
"step": 210,
"token_acc": 0.8561705450570045,
"train_speed(iter/s)": 0.099425
},
{
"epoch": 0.25175644028103045,
"grad_norm": 0.336126446723938,
"learning_rate": 1.9938406327404233e-05,
"loss": 0.3979261159896851,
"memory(GiB)": 127.52,
"step": 215,
"token_acc": 0.8645368893679286,
"train_speed(iter/s)": 0.099503
},
{
"epoch": 0.2576112412177986,
"grad_norm": 0.33789604902267456,
"learning_rate": 1.9931044554577373e-05,
"loss": 0.3947408676147461,
"memory(GiB)": 127.52,
"step": 220,
"token_acc": 0.8581383757515342,
"train_speed(iter/s)": 0.099556
},
{
"epoch": 0.26346604215456676,
"grad_norm": 0.3256719708442688,
"learning_rate": 1.992326883072965e-05,
"loss": 0.39812633991241453,
"memory(GiB)": 127.52,
"step": 225,
"token_acc": 0.8538002738372856,
"train_speed(iter/s)": 0.099561
},
{
"epoch": 0.2693208430913349,
"grad_norm": 0.29769811034202576,
"learning_rate": 1.991507947997287e-05,
"loss": 0.40686187744140623,
"memory(GiB)": 127.52,
"step": 230,
"token_acc": 0.8601537153116829,
"train_speed(iter/s)": 0.099609
},
{
"epoch": 0.275175644028103,
"grad_norm": 0.30855706334114075,
"learning_rate": 1.9906476843659866e-05,
"loss": 0.40198640823364257,
"memory(GiB)": 127.52,
"step": 235,
"token_acc": 0.8681018040834193,
"train_speed(iter/s)": 0.099643
},
{
"epoch": 0.2810304449648712,
"grad_norm": 0.38956841826438904,
"learning_rate": 1.989746128037024e-05,
"loss": 0.3874382972717285,
"memory(GiB)": 127.52,
"step": 240,
"token_acc": 0.8601923167422234,
"train_speed(iter/s)": 0.099684
},
{
"epoch": 0.28688524590163933,
"grad_norm": 0.317061185836792,
"learning_rate": 1.988803316589545e-05,
"loss": 0.396057653427124,
"memory(GiB)": 127.52,
"step": 245,
"token_acc": 0.8594824803587602,
"train_speed(iter/s)": 0.099766
},
{
"epoch": 0.2927400468384075,
"grad_norm": 0.31615447998046875,
"learning_rate": 1.987819289322311e-05,
"loss": 0.39992465972900393,
"memory(GiB)": 127.52,
"step": 250,
"token_acc": 0.858279346005983,
"train_speed(iter/s)": 0.099854
},
{
"epoch": 0.29859484777517564,
"grad_norm": 0.32358142733573914,
"learning_rate": 1.9867940872520646e-05,
"loss": 0.40424213409423826,
"memory(GiB)": 127.52,
"step": 255,
"token_acc": 0.8581384084126314,
"train_speed(iter/s)": 0.09983
},
{
"epoch": 0.3044496487119438,
"grad_norm": 0.289928138256073,
"learning_rate": 1.9857277531118173e-05,
"loss": 0.3975801706314087,
"memory(GiB)": 127.52,
"step": 260,
"token_acc": 0.8720583892069197,
"train_speed(iter/s)": 0.099856
},
{
"epoch": 0.31030444964871196,
"grad_norm": 0.2990163266658783,
"learning_rate": 1.9846203313490697e-05,
"loss": 0.38855001926422117,
"memory(GiB)": 127.52,
"step": 265,
"token_acc": 0.8751479791620219,
"train_speed(iter/s)": 0.099904
},
{
"epoch": 0.3161592505854801,
"grad_norm": 0.3375948369503021,
"learning_rate": 1.983471868123958e-05,
"loss": 0.3869392156600952,
"memory(GiB)": 127.52,
"step": 270,
"token_acc": 0.8583391727600954,
"train_speed(iter/s)": 0.099986
},
{
"epoch": 0.32201405152224827,
"grad_norm": 0.31450051069259644,
"learning_rate": 1.98228241130733e-05,
"loss": 0.4127011775970459,
"memory(GiB)": 127.52,
"step": 275,
"token_acc": 0.8624973560772896,
"train_speed(iter/s)": 0.100024
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.30610159039497375,
"learning_rate": 1.98105201047875e-05,
"loss": 0.38500449657440183,
"memory(GiB)": 127.52,
"step": 280,
"token_acc": 0.8676562826677817,
"train_speed(iter/s)": 0.1001
},
{
"epoch": 0.3337236533957845,
"grad_norm": 0.29564493894577026,
"learning_rate": 1.9797807169244326e-05,
"loss": 0.39098482131958007,
"memory(GiB)": 127.52,
"step": 285,
"token_acc": 0.8600835808177637,
"train_speed(iter/s)": 0.100123
},
{
"epoch": 0.3395784543325527,
"grad_norm": 0.2966287136077881,
"learning_rate": 1.9784685836351045e-05,
"loss": 0.40611705780029295,
"memory(GiB)": 127.52,
"step": 290,
"token_acc": 0.85560257646949,
"train_speed(iter/s)": 0.100118
},
{
"epoch": 0.34543325526932084,
"grad_norm": 0.3238191604614258,
"learning_rate": 1.9771156653037944e-05,
"loss": 0.3969024419784546,
"memory(GiB)": 127.52,
"step": 295,
"token_acc": 0.8581954258818798,
"train_speed(iter/s)": 0.100158
},
{
"epoch": 0.351288056206089,
"grad_norm": 0.27766069769859314,
"learning_rate": 1.975722018323556e-05,
"loss": 0.38973977565765383,
"memory(GiB)": 127.52,
"step": 300,
"token_acc": 0.8660634024604128,
"train_speed(iter/s)": 0.100143
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.30145326256752014,
"learning_rate": 1.974287700785116e-05,
"loss": 0.3852071285247803,
"memory(GiB)": 127.52,
"step": 305,
"token_acc": 0.8624855074734434,
"train_speed(iter/s)": 0.100192
},
{
"epoch": 0.3629976580796253,
"grad_norm": 0.3129558563232422,
"learning_rate": 1.9728127724744516e-05,
"loss": 0.3764306306838989,
"memory(GiB)": 127.52,
"step": 310,
"token_acc": 0.8621159494397087,
"train_speed(iter/s)": 0.100188
},
{
"epoch": 0.36885245901639346,
"grad_norm": 0.28354689478874207,
"learning_rate": 1.9712972948703006e-05,
"loss": 0.4006787300109863,
"memory(GiB)": 127.52,
"step": 315,
"token_acc": 0.8685145789802604,
"train_speed(iter/s)": 0.100213
},
{
"epoch": 0.3747072599531616,
"grad_norm": 0.32204070687294006,
"learning_rate": 1.9697413311415967e-05,
"loss": 0.3947436332702637,
"memory(GiB)": 127.52,
"step": 320,
"token_acc": 0.840712523808037,
"train_speed(iter/s)": 0.100233
},
{
"epoch": 0.3805620608899297,
"grad_norm": 0.2838529944419861,
"learning_rate": 1.9681449461448386e-05,
"loss": 0.3909641981124878,
"memory(GiB)": 127.52,
"step": 325,
"token_acc": 0.8644274332135604,
"train_speed(iter/s)": 0.10024
},
{
"epoch": 0.3864168618266979,
"grad_norm": 0.2927788197994232,
"learning_rate": 1.9665082064213856e-05,
"loss": 0.3943678140640259,
"memory(GiB)": 127.52,
"step": 330,
"token_acc": 0.8593159978638758,
"train_speed(iter/s)": 0.100274
},
{
"epoch": 0.39227166276346603,
"grad_norm": 0.28758853673934937,
"learning_rate": 1.9648311801946823e-05,
"loss": 0.39302983283996584,
"memory(GiB)": 127.52,
"step": 335,
"token_acc": 0.8576617952773522,
"train_speed(iter/s)": 0.10031
},
{
"epoch": 0.3981264637002342,
"grad_norm": 0.32002732157707214,
"learning_rate": 1.9631139373674188e-05,
"loss": 0.3899127721786499,
"memory(GiB)": 127.52,
"step": 340,
"token_acc": 0.859130068814327,
"train_speed(iter/s)": 0.100326
},
{
"epoch": 0.40398126463700235,
"grad_norm": 0.29767319560050964,
"learning_rate": 1.9613565495186126e-05,
"loss": 0.38013973236083987,
"memory(GiB)": 127.52,
"step": 345,
"token_acc": 0.8582271352459535,
"train_speed(iter/s)": 0.100383
},
{
"epoch": 0.4098360655737705,
"grad_norm": 0.30334916710853577,
"learning_rate": 1.9595590899006288e-05,
"loss": 0.3990506649017334,
"memory(GiB)": 127.52,
"step": 350,
"token_acc": 0.8646594498490017,
"train_speed(iter/s)": 0.100383
},
{
"epoch": 0.41569086651053866,
"grad_norm": 0.27606984972953796,
"learning_rate": 1.957721633436124e-05,
"loss": 0.39636931419372556,
"memory(GiB)": 127.52,
"step": 355,
"token_acc": 0.8610379971059329,
"train_speed(iter/s)": 0.100374
},
{
"epoch": 0.4215456674473068,
"grad_norm": 0.2963041067123413,
"learning_rate": 1.9558442567149244e-05,
"loss": 0.3938555955886841,
"memory(GiB)": 127.52,
"step": 360,
"token_acc": 0.8734277076877441,
"train_speed(iter/s)": 0.100411
},
{
"epoch": 0.4274004683840749,
"grad_norm": 0.3044081926345825,
"learning_rate": 1.953927037990834e-05,
"loss": 0.4011641502380371,
"memory(GiB)": 127.52,
"step": 365,
"token_acc": 0.8496909477706446,
"train_speed(iter/s)": 0.100429
},
{
"epoch": 0.4332552693208431,
"grad_norm": 0.3151879906654358,
"learning_rate": 1.9519700571783718e-05,
"loss": 0.40146493911743164,
"memory(GiB)": 127.52,
"step": 370,
"token_acc": 0.8655695668198701,
"train_speed(iter/s)": 0.100402
},
{
"epoch": 0.43911007025761123,
"grad_norm": 0.30802202224731445,
"learning_rate": 1.9499733958494405e-05,
"loss": 0.3972816467285156,
"memory(GiB)": 127.52,
"step": 375,
"token_acc": 0.8541561335505496,
"train_speed(iter/s)": 0.100424
},
{
"epoch": 0.4449648711943794,
"grad_norm": 0.2896055579185486,
"learning_rate": 1.947937137229928e-05,
"loss": 0.39000208377838136,
"memory(GiB)": 127.52,
"step": 380,
"token_acc": 0.8715701816495711,
"train_speed(iter/s)": 0.10044
},
{
"epoch": 0.45081967213114754,
"grad_norm": 0.3016491234302521,
"learning_rate": 1.9458613661962366e-05,
"loss": 0.3910162687301636,
"memory(GiB)": 127.52,
"step": 385,
"token_acc": 0.8739838931744026,
"train_speed(iter/s)": 0.100469
},
{
"epoch": 0.4566744730679157,
"grad_norm": 0.29643046855926514,
"learning_rate": 1.943746169271746e-05,
"loss": 0.39229693412780764,
"memory(GiB)": 127.52,
"step": 390,
"token_acc": 0.8722126097825781,
"train_speed(iter/s)": 0.100449
},
{
"epoch": 0.46252927400468385,
"grad_norm": 0.27366167306900024,
"learning_rate": 1.941591634623206e-05,
"loss": 0.39676542282104493,
"memory(GiB)": 127.52,
"step": 395,
"token_acc": 0.8644101402067695,
"train_speed(iter/s)": 0.100471
},
{
"epoch": 0.468384074941452,
"grad_norm": 0.2772040069103241,
"learning_rate": 1.9393978520570638e-05,
"loss": 0.38228650093078614,
"memory(GiB)": 127.52,
"step": 400,
"token_acc": 0.8660634050880627,
"train_speed(iter/s)": 0.100525
},
{
"epoch": 0.47423887587822017,
"grad_norm": 0.27195385098457336,
"learning_rate": 1.9371649130157166e-05,
"loss": 0.3779789209365845,
"memory(GiB)": 127.52,
"step": 405,
"token_acc": 0.8644070452060074,
"train_speed(iter/s)": 0.100537
},
{
"epoch": 0.48009367681498827,
"grad_norm": 0.3120705783367157,
"learning_rate": 1.9348929105737044e-05,
"loss": 0.3843944549560547,
"memory(GiB)": 127.52,
"step": 410,
"token_acc": 0.8640640315662635,
"train_speed(iter/s)": 0.100541
},
{
"epoch": 0.4859484777517564,
"grad_norm": 0.30002740025520325,
"learning_rate": 1.932581939433827e-05,
"loss": 0.3987558841705322,
"memory(GiB)": 127.52,
"step": 415,
"token_acc": 0.8650914968394279,
"train_speed(iter/s)": 0.100546
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.2787948250770569,
"learning_rate": 1.9302320959231997e-05,
"loss": 0.3887160778045654,
"memory(GiB)": 127.52,
"step": 420,
"token_acc": 0.8633874480548741,
"train_speed(iter/s)": 0.100566
},
{
"epoch": 0.49765807962529274,
"grad_norm": 0.30231156945228577,
"learning_rate": 1.927843477989234e-05,
"loss": 0.38535680770874026,
"memory(GiB)": 127.52,
"step": 425,
"token_acc": 0.8781958006354674,
"train_speed(iter/s)": 0.100582
},
{
"epoch": 0.5035128805620609,
"grad_norm": 0.43067944049835205,
"learning_rate": 1.9254161851955587e-05,
"loss": 0.3992464065551758,
"memory(GiB)": 127.52,
"step": 430,
"token_acc": 0.8681868917427511,
"train_speed(iter/s)": 0.100614
},
{
"epoch": 0.509367681498829,
"grad_norm": 0.31797730922698975,
"learning_rate": 1.9229503187178694e-05,
"loss": 0.3914906978607178,
"memory(GiB)": 127.52,
"step": 435,
"token_acc": 0.8623976908030916,
"train_speed(iter/s)": 0.100629
},
{
"epoch": 0.5152224824355972,
"grad_norm": 0.3029649555683136,
"learning_rate": 1.920445981339708e-05,
"loss": 0.3909397840499878,
"memory(GiB)": 127.52,
"step": 440,
"token_acc": 0.8603624171988666,
"train_speed(iter/s)": 0.10065
},
{
"epoch": 0.5210772833723654,
"grad_norm": 0.30808401107788086,
"learning_rate": 1.9179032774481822e-05,
"loss": 0.38848447799682617,
"memory(GiB)": 127.52,
"step": 445,
"token_acc": 0.8688334300638422,
"train_speed(iter/s)": 0.10068
},
{
"epoch": 0.5269320843091335,
"grad_norm": 0.30352672934532166,
"learning_rate": 1.9153223130296125e-05,
"loss": 0.38553576469421386,
"memory(GiB)": 127.52,
"step": 450,
"token_acc": 0.871061226654355,
"train_speed(iter/s)": 0.100707
},
{
"epoch": 0.5327868852459017,
"grad_norm": 0.30111393332481384,
"learning_rate": 1.9127031956651153e-05,
"loss": 0.38896827697753905,
"memory(GiB)": 127.52,
"step": 455,
"token_acc": 0.868666861524493,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 0.5386416861826698,
"grad_norm": 0.3043946325778961,
"learning_rate": 1.9100460345261175e-05,
"loss": 0.4031389236450195,
"memory(GiB)": 127.52,
"step": 460,
"token_acc": 0.8602805306930444,
"train_speed(iter/s)": 0.10069
},
{
"epoch": 0.544496487119438,
"grad_norm": 0.3046748638153076,
"learning_rate": 1.9073509403698062e-05,
"loss": 0.3981820821762085,
"memory(GiB)": 127.52,
"step": 465,
"token_acc": 0.8679260633787171,
"train_speed(iter/s)": 0.100702
},
{
"epoch": 0.550351288056206,
"grad_norm": 0.31403180956840515,
"learning_rate": 1.9046180255345142e-05,
"loss": 0.3932758569717407,
"memory(GiB)": 127.52,
"step": 470,
"token_acc": 0.8679127068807225,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 0.5562060889929742,
"grad_norm": 0.29715070128440857,
"learning_rate": 1.9018474039350342e-05,
"loss": 0.3857383966445923,
"memory(GiB)": 127.52,
"step": 475,
"token_acc": 0.8670612150699786,
"train_speed(iter/s)": 0.100729
},
{
"epoch": 0.5620608899297423,
"grad_norm": 0.3304217755794525,
"learning_rate": 1.899039191057872e-05,
"loss": 0.3876671075820923,
"memory(GiB)": 127.52,
"step": 480,
"token_acc": 0.8610883356974732,
"train_speed(iter/s)": 0.100734
},
{
"epoch": 0.5679156908665105,
"grad_norm": 0.28700098395347595,
"learning_rate": 1.8961935039564338e-05,
"loss": 0.3859807252883911,
"memory(GiB)": 127.52,
"step": 485,
"token_acc": 0.861040389753261,
"train_speed(iter/s)": 0.100756
},
{
"epoch": 0.5737704918032787,
"grad_norm": 0.30889761447906494,
"learning_rate": 1.8933104612461454e-05,
"loss": 0.3886594772338867,
"memory(GiB)": 127.52,
"step": 490,
"token_acc": 0.862199389425299,
"train_speed(iter/s)": 0.100764
},
{
"epoch": 0.5796252927400468,
"grad_norm": 0.30002301931381226,
"learning_rate": 1.8903901830995093e-05,
"loss": 0.3925405740737915,
"memory(GiB)": 127.52,
"step": 495,
"token_acc": 0.8591249033461787,
"train_speed(iter/s)": 0.10076
},
{
"epoch": 0.585480093676815,
"grad_norm": 0.28031232953071594,
"learning_rate": 1.8874327912410945e-05,
"loss": 0.40421361923217775,
"memory(GiB)": 127.52,
"step": 500,
"token_acc": 0.8617515420490447,
"train_speed(iter/s)": 0.100788
},
{
"epoch": 0.5913348946135831,
"grad_norm": 0.27785587310791016,
"learning_rate": 1.884438408942463e-05,
"loss": 0.39117045402526857,
"memory(GiB)": 127.52,
"step": 505,
"token_acc": 0.8509159982582465,
"train_speed(iter/s)": 0.100792
},
{
"epoch": 0.5971896955503513,
"grad_norm": 0.26203179359436035,
"learning_rate": 1.881407161017033e-05,
"loss": 0.3850869655609131,
"memory(GiB)": 127.52,
"step": 510,
"token_acc": 0.871426780341023,
"train_speed(iter/s)": 0.100813
},
{
"epoch": 0.6030444964871194,
"grad_norm": 0.2775160074234009,
"learning_rate": 1.8783391738148738e-05,
"loss": 0.38030352592468264,
"memory(GiB)": 127.52,
"step": 515,
"token_acc": 0.865779336694748,
"train_speed(iter/s)": 0.100836
},
{
"epoch": 0.6088992974238876,
"grad_norm": 0.283777117729187,
"learning_rate": 1.875234575217441e-05,
"loss": 0.38051447868347166,
"memory(GiB)": 127.52,
"step": 520,
"token_acc": 0.8643710911880905,
"train_speed(iter/s)": 0.100855
},
{
"epoch": 0.6147540983606558,
"grad_norm": 0.2693696618080139,
"learning_rate": 1.8720934946322466e-05,
"loss": 0.3941120862960815,
"memory(GiB)": 127.52,
"step": 525,
"token_acc": 0.8575597963261037,
"train_speed(iter/s)": 0.10087
},
{
"epoch": 0.6206088992974239,
"grad_norm": 0.2502153515815735,
"learning_rate": 1.8689160629874622e-05,
"loss": 0.36350240707397463,
"memory(GiB)": 127.52,
"step": 530,
"token_acc": 0.8788319745551232,
"train_speed(iter/s)": 0.100858
},
{
"epoch": 0.6264637002341921,
"grad_norm": 0.2630903124809265,
"learning_rate": 1.865702412726465e-05,
"loss": 0.3757188081741333,
"memory(GiB)": 127.52,
"step": 535,
"token_acc": 0.8759880681391801,
"train_speed(iter/s)": 0.100883
},
{
"epoch": 0.6323185011709602,
"grad_norm": 0.2726694941520691,
"learning_rate": 1.8624526778023142e-05,
"loss": 0.3769080638885498,
"memory(GiB)": 127.52,
"step": 540,
"token_acc": 0.8733085553248108,
"train_speed(iter/s)": 0.100896
},
{
"epoch": 0.6381733021077284,
"grad_norm": 0.2886805832386017,
"learning_rate": 1.85916699367217e-05,
"loss": 0.3801791429519653,
"memory(GiB)": 127.52,
"step": 545,
"token_acc": 0.8658838767809878,
"train_speed(iter/s)": 0.100897
},
{
"epoch": 0.6440281030444965,
"grad_norm": 0.28697773814201355,
"learning_rate": 1.855845497291646e-05,
"loss": 0.3925698041915894,
"memory(GiB)": 127.52,
"step": 550,
"token_acc": 0.8631926701668678,
"train_speed(iter/s)": 0.100906
},
{
"epoch": 0.6498829039812647,
"grad_norm": 0.26602187752723694,
"learning_rate": 1.8524883271091004e-05,
"loss": 0.38099260330200196,
"memory(GiB)": 127.52,
"step": 555,
"token_acc": 0.8710958004218123,
"train_speed(iter/s)": 0.100905
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.2533867359161377,
"learning_rate": 1.8490956230598668e-05,
"loss": 0.3997593879699707,
"memory(GiB)": 127.52,
"step": 560,
"token_acc": 0.8649844205573561,
"train_speed(iter/s)": 0.100903
},
{
"epoch": 0.6615925058548009,
"grad_norm": 0.287895530462265,
"learning_rate": 1.8456675265604183e-05,
"loss": 0.3792722702026367,
"memory(GiB)": 127.52,
"step": 565,
"token_acc": 0.8638586429067867,
"train_speed(iter/s)": 0.100923
},
{
"epoch": 0.667447306791569,
"grad_norm": 0.30773329734802246,
"learning_rate": 1.842204180502476e-05,
"loss": 0.3829328536987305,
"memory(GiB)": 127.52,
"step": 570,
"token_acc": 0.8727389815600163,
"train_speed(iter/s)": 0.100938
},
{
"epoch": 0.6733021077283372,
"grad_norm": 0.30301594734191895,
"learning_rate": 1.8387057292470517e-05,
"loss": 0.39844498634338377,
"memory(GiB)": 127.52,
"step": 575,
"token_acc": 0.8632732480308832,
"train_speed(iter/s)": 0.100939
},
{
"epoch": 0.6791569086651054,
"grad_norm": 0.27384889125823975,
"learning_rate": 1.8351723186184295e-05,
"loss": 0.3866116523742676,
"memory(GiB)": 127.52,
"step": 580,
"token_acc": 0.8537265892945595,
"train_speed(iter/s)": 0.100945
},
{
"epoch": 0.6850117096018735,
"grad_norm": 0.300459086894989,
"learning_rate": 1.8316040958980896e-05,
"loss": 0.3856982707977295,
"memory(GiB)": 127.52,
"step": 585,
"token_acc": 0.8774584957729205,
"train_speed(iter/s)": 0.100955
},
{
"epoch": 0.6908665105386417,
"grad_norm": 0.32351046800613403,
"learning_rate": 1.828001209818567e-05,
"loss": 0.403375244140625,
"memory(GiB)": 127.52,
"step": 590,
"token_acc": 0.8606907256499806,
"train_speed(iter/s)": 0.100969
},
{
"epoch": 0.6967213114754098,
"grad_norm": 0.3171491324901581,
"learning_rate": 1.8243638105572547e-05,
"loss": 0.3851677656173706,
"memory(GiB)": 127.52,
"step": 595,
"token_acc": 0.8713710233181722,
"train_speed(iter/s)": 0.100978
},
{
"epoch": 0.702576112412178,
"grad_norm": 0.3137357532978058,
"learning_rate": 1.82069204973014e-05,
"loss": 0.3799635648727417,
"memory(GiB)": 127.52,
"step": 600,
"token_acc": 0.8784900280426953,
"train_speed(iter/s)": 0.101006
},
{
"epoch": 0.7084309133489461,
"grad_norm": 0.28434112668037415,
"learning_rate": 1.816986080385489e-05,
"loss": 0.40052270889282227,
"memory(GiB)": 127.52,
"step": 605,
"token_acc": 0.8462195284773476,
"train_speed(iter/s)": 0.101006
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.30604925751686096,
"learning_rate": 1.813246056997465e-05,
"loss": 0.3835596084594727,
"memory(GiB)": 127.52,
"step": 610,
"token_acc": 0.8614169593452318,
"train_speed(iter/s)": 0.101011
},
{
"epoch": 0.7201405152224825,
"grad_norm": 0.3114904463291168,
"learning_rate": 1.809472135459688e-05,
"loss": 0.38530282974243163,
"memory(GiB)": 127.52,
"step": 615,
"token_acc": 0.8642289288270977,
"train_speed(iter/s)": 0.101016
},
{
"epoch": 0.7259953161592506,
"grad_norm": 0.29733744263648987,
"learning_rate": 1.8056644730787412e-05,
"loss": 0.39410853385925293,
"memory(GiB)": 127.52,
"step": 620,
"token_acc": 0.8700788764122717,
"train_speed(iter/s)": 0.101043
},
{
"epoch": 0.7318501170960188,
"grad_norm": 0.28432950377464294,
"learning_rate": 1.8018232285676092e-05,
"loss": 0.3745533227920532,
"memory(GiB)": 127.52,
"step": 625,
"token_acc": 0.8656255611667859,
"train_speed(iter/s)": 0.101068
},
{
"epoch": 0.7377049180327869,
"grad_norm": 0.2615796625614166,
"learning_rate": 1.797948562039066e-05,
"loss": 0.3919194459915161,
"memory(GiB)": 127.52,
"step": 630,
"token_acc": 0.8600643002591344,
"train_speed(iter/s)": 0.101046
},
{
"epoch": 0.7435597189695551,
"grad_norm": 0.27267464995384216,
"learning_rate": 1.7940406349989987e-05,
"loss": 0.388127875328064,
"memory(GiB)": 127.52,
"step": 635,
"token_acc": 0.8630637748223948,
"train_speed(iter/s)": 0.10107
},
{
"epoch": 0.7494145199063232,
"grad_norm": 0.274472177028656,
"learning_rate": 1.7900996103396772e-05,
"loss": 0.38143386840820315,
"memory(GiB)": 127.52,
"step": 640,
"token_acc": 0.8701312848988129,
"train_speed(iter/s)": 0.101069
},
{
"epoch": 0.7552693208430913,
"grad_norm": 0.27030906081199646,
"learning_rate": 1.7861256523329634e-05,
"loss": 0.3786378145217896,
"memory(GiB)": 127.52,
"step": 645,
"token_acc": 0.8602489884842826,
"train_speed(iter/s)": 0.101063
},
{
"epoch": 0.7611241217798594,
"grad_norm": 0.2663189172744751,
"learning_rate": 1.7821189266234647e-05,
"loss": 0.38404848575592043,
"memory(GiB)": 127.52,
"step": 650,
"token_acc": 0.8616431608743905,
"train_speed(iter/s)": 0.10106
},
{
"epoch": 0.7669789227166276,
"grad_norm": 0.26061564683914185,
"learning_rate": 1.7780796002216285e-05,
"loss": 0.3781083822250366,
"memory(GiB)": 127.52,
"step": 655,
"token_acc": 0.8578937981658266,
"train_speed(iter/s)": 0.101068
},
{
"epoch": 0.7728337236533958,
"grad_norm": 0.2600330412387848,
"learning_rate": 1.7740078414967817e-05,
"loss": 0.3852128505706787,
"memory(GiB)": 127.52,
"step": 660,
"token_acc": 0.872952104972653,
"train_speed(iter/s)": 0.101073
},
{
"epoch": 0.7786885245901639,
"grad_norm": 0.27133384346961975,
"learning_rate": 1.7699038201701132e-05,
"loss": 0.37737174034118653,
"memory(GiB)": 127.52,
"step": 665,
"token_acc": 0.8593767976691324,
"train_speed(iter/s)": 0.101088
},
{
"epoch": 0.7845433255269321,
"grad_norm": 0.270047664642334,
"learning_rate": 1.7657677073075968e-05,
"loss": 0.38488593101501467,
"memory(GiB)": 127.52,
"step": 670,
"token_acc": 0.8627122177041754,
"train_speed(iter/s)": 0.101091
},
{
"epoch": 0.7903981264637002,
"grad_norm": 0.29772108793258667,
"learning_rate": 1.761599675312864e-05,
"loss": 0.3877399444580078,
"memory(GiB)": 127.52,
"step": 675,
"token_acc": 0.8765810968128602,
"train_speed(iter/s)": 0.101091
},
{
"epoch": 0.7962529274004684,
"grad_norm": 0.30914777517318726,
"learning_rate": 1.7573998979200163e-05,
"loss": 0.38101863861083984,
"memory(GiB)": 127.52,
"step": 680,
"token_acc": 0.8670370510587819,
"train_speed(iter/s)": 0.101106
},
{
"epoch": 0.8021077283372365,
"grad_norm": 0.24654199182987213,
"learning_rate": 1.753168550186383e-05,
"loss": 0.3897979259490967,
"memory(GiB)": 127.52,
"step": 685,
"token_acc": 0.8695668499228697,
"train_speed(iter/s)": 0.101113
},
{
"epoch": 0.8079625292740047,
"grad_norm": 0.268245667219162,
"learning_rate": 1.7489058084852247e-05,
"loss": 0.3852191686630249,
"memory(GiB)": 127.52,
"step": 690,
"token_acc": 0.8590092968475919,
"train_speed(iter/s)": 0.101108
},
{
"epoch": 0.8138173302107728,
"grad_norm": 0.2539999186992645,
"learning_rate": 1.744611850498383e-05,
"loss": 0.38076086044311525,
"memory(GiB)": 127.52,
"step": 695,
"token_acc": 0.8692958838741554,
"train_speed(iter/s)": 0.101093
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.30060875415802,
"learning_rate": 1.7402868552088724e-05,
"loss": 0.37528285980224607,
"memory(GiB)": 127.52,
"step": 700,
"token_acc": 0.863746098668577,
"train_speed(iter/s)": 0.101099
},
{
"epoch": 0.8255269320843092,
"grad_norm": 0.2880835235118866,
"learning_rate": 1.73593100289342e-05,
"loss": 0.3839045286178589,
"memory(GiB)": 127.52,
"step": 705,
"token_acc": 0.8606477737869129,
"train_speed(iter/s)": 0.101117
},
{
"epoch": 0.8313817330210773,
"grad_norm": 0.27465176582336426,
"learning_rate": 1.7315444751149533e-05,
"loss": 0.38219666481018066,
"memory(GiB)": 127.52,
"step": 710,
"token_acc": 0.866171235481518,
"train_speed(iter/s)": 0.101137
},
{
"epoch": 0.8372365339578455,
"grad_norm": 0.2839786410331726,
"learning_rate": 1.727127454715029e-05,
"loss": 0.3815479755401611,
"memory(GiB)": 127.52,
"step": 715,
"token_acc": 0.8742821134330966,
"train_speed(iter/s)": 0.101149
},
{
"epoch": 0.8430913348946136,
"grad_norm": 0.31399768590927124,
"learning_rate": 1.722680125806214e-05,
"loss": 0.38201520442962644,
"memory(GiB)": 127.52,
"step": 720,
"token_acc": 0.8587188600974719,
"train_speed(iter/s)": 0.101155
},
{
"epoch": 0.8489461358313818,
"grad_norm": 0.3099398910999298,
"learning_rate": 1.71820267376441e-05,
"loss": 0.386704421043396,
"memory(GiB)": 127.52,
"step": 725,
"token_acc": 0.8638798635493387,
"train_speed(iter/s)": 0.101166
},
{
"epoch": 0.8548009367681498,
"grad_norm": 0.2707797884941101,
"learning_rate": 1.7136952852211274e-05,
"loss": 0.3908542156219482,
"memory(GiB)": 127.52,
"step": 730,
"token_acc": 0.8531080479659894,
"train_speed(iter/s)": 0.10118
},
{
"epoch": 0.860655737704918,
"grad_norm": 0.24912209808826447,
"learning_rate": 1.7091581480557057e-05,
"loss": 0.3775820732116699,
"memory(GiB)": 127.52,
"step": 735,
"token_acc": 0.8631545113262953,
"train_speed(iter/s)": 0.101187
},
{
"epoch": 0.8665105386416861,
"grad_norm": 0.2668187916278839,
"learning_rate": 1.7045914513874815e-05,
"loss": 0.39071335792541506,
"memory(GiB)": 127.52,
"step": 740,
"token_acc": 0.863421279036421,
"train_speed(iter/s)": 0.101213
},
{
"epoch": 0.8723653395784543,
"grad_norm": 0.24733468890190125,
"learning_rate": 1.699995385567907e-05,
"loss": 0.39272005558013917,
"memory(GiB)": 127.52,
"step": 745,
"token_acc": 0.8545664531712299,
"train_speed(iter/s)": 0.101244
},
{
"epoch": 0.8782201405152225,
"grad_norm": 0.2632930278778076,
"learning_rate": 1.695370142172614e-05,
"loss": 0.3845970630645752,
"memory(GiB)": 127.52,
"step": 750,
"token_acc": 0.8612419217474074,
"train_speed(iter/s)": 0.101242
},
{
"epoch": 0.8840749414519906,
"grad_norm": 0.26514074206352234,
"learning_rate": 1.690715913993429e-05,
"loss": 0.38790068626403806,
"memory(GiB)": 127.52,
"step": 755,
"token_acc": 0.8648871034856036,
"train_speed(iter/s)": 0.101244
},
{
"epoch": 0.8899297423887588,
"grad_norm": 0.26957836747169495,
"learning_rate": 1.6860328950303392e-05,
"loss": 0.36716523170471194,
"memory(GiB)": 127.52,
"step": 760,
"token_acc": 0.8711639836976192,
"train_speed(iter/s)": 0.101257
},
{
"epoch": 0.8957845433255269,
"grad_norm": 0.2675636410713196,
"learning_rate": 1.6813212804834033e-05,
"loss": 0.38340959548950193,
"memory(GiB)": 127.52,
"step": 765,
"token_acc": 0.8579816582165225,
"train_speed(iter/s)": 0.101264
},
{
"epoch": 0.9016393442622951,
"grad_norm": 0.26134225726127625,
"learning_rate": 1.676581266744615e-05,
"loss": 0.3752238988876343,
"memory(GiB)": 127.52,
"step": 770,
"token_acc": 0.8638096187142661,
"train_speed(iter/s)": 0.101274
},
{
"epoch": 0.9074941451990632,
"grad_norm": 0.2766994535923004,
"learning_rate": 1.6718130513897207e-05,
"loss": 0.37386231422424315,
"memory(GiB)": 127.52,
"step": 775,
"token_acc": 0.8692816207520612,
"train_speed(iter/s)": 0.10128
},
{
"epoch": 0.9133489461358314,
"grad_norm": 0.2736496329307556,
"learning_rate": 1.667016833169979e-05,
"loss": 0.3910179138183594,
"memory(GiB)": 127.52,
"step": 780,
"token_acc": 0.8679116603442695,
"train_speed(iter/s)": 0.101285
},
{
"epoch": 0.9192037470725996,
"grad_norm": 0.25334644317626953,
"learning_rate": 1.6621928120038806e-05,
"loss": 0.3837088346481323,
"memory(GiB)": 127.52,
"step": 785,
"token_acc": 0.8568342264714894,
"train_speed(iter/s)": 0.101285
},
{
"epoch": 0.9250585480093677,
"grad_norm": 0.2526282072067261,
"learning_rate": 1.657341188968811e-05,
"loss": 0.3741894721984863,
"memory(GiB)": 127.52,
"step": 790,
"token_acc": 0.8600209680781232,
"train_speed(iter/s)": 0.101298
},
{
"epoch": 0.9309133489461359,
"grad_norm": 0.2629476487636566,
"learning_rate": 1.6524621662926733e-05,
"loss": 0.3736875057220459,
"memory(GiB)": 127.52,
"step": 795,
"token_acc": 0.8765449927636102,
"train_speed(iter/s)": 0.101311
},
{
"epoch": 0.936768149882904,
"grad_norm": 0.26536864042282104,
"learning_rate": 1.6475559473454558e-05,
"loss": 0.3841824769973755,
"memory(GiB)": 127.52,
"step": 800,
"token_acc": 0.8732290436835891,
"train_speed(iter/s)": 0.101317
},
{
"epoch": 0.9426229508196722,
"grad_norm": 0.9267993569374084,
"learning_rate": 1.6426227366307563e-05,
"loss": 0.3876027584075928,
"memory(GiB)": 127.52,
"step": 805,
"token_acc": 0.873662949476559,
"train_speed(iter/s)": 0.10131
},
{
"epoch": 0.9484777517564403,
"grad_norm": 0.31513214111328125,
"learning_rate": 1.6376627397772576e-05,
"loss": 0.38577656745910643,
"memory(GiB)": 127.52,
"step": 810,
"token_acc": 0.8582883611177872,
"train_speed(iter/s)": 0.101308
},
{
"epoch": 0.9543325526932084,
"grad_norm": 0.43881845474243164,
"learning_rate": 1.6326761635301572e-05,
"loss": 0.3793084383010864,
"memory(GiB)": 127.52,
"step": 815,
"token_acc": 0.8658072630089608,
"train_speed(iter/s)": 0.101317
},
{
"epoch": 0.9601873536299765,
"grad_norm": 0.2627616822719574,
"learning_rate": 1.6276632157425475e-05,
"loss": 0.3868673801422119,
"memory(GiB)": 127.52,
"step": 820,
"token_acc": 0.8609059346385673,
"train_speed(iter/s)": 0.101319
},
{
"epoch": 0.9660421545667447,
"grad_norm": 0.28743499517440796,
"learning_rate": 1.6226241053667536e-05,
"loss": 0.39165661334991453,
"memory(GiB)": 127.52,
"step": 825,
"token_acc": 0.8566733687511922,
"train_speed(iter/s)": 0.101328
},
{
"epoch": 0.9718969555035128,
"grad_norm": 0.2647303640842438,
"learning_rate": 1.617559042445625e-05,
"loss": 0.3914238929748535,
"memory(GiB)": 127.52,
"step": 830,
"token_acc": 0.8686447332876824,
"train_speed(iter/s)": 0.101331
},
{
"epoch": 0.977751756440281,
"grad_norm": 0.28214219212532043,
"learning_rate": 1.6124682381037767e-05,
"loss": 0.3775761127471924,
"memory(GiB)": 127.52,
"step": 835,
"token_acc": 0.8658163872414246,
"train_speed(iter/s)": 0.101335
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.2978610694408417,
"learning_rate": 1.607351904538792e-05,
"loss": 0.39282917976379395,
"memory(GiB)": 127.52,
"step": 840,
"token_acc": 0.866700342369647,
"train_speed(iter/s)": 0.101325
},
{
"epoch": 0.9894613583138173,
"grad_norm": 0.2674395740032196,
"learning_rate": 1.6022102550123775e-05,
"loss": 0.3796736240386963,
"memory(GiB)": 127.52,
"step": 845,
"token_acc": 0.8609281823639752,
"train_speed(iter/s)": 0.101326
},
{
"epoch": 0.9953161592505855,
"grad_norm": 0.2766255736351013,
"learning_rate": 1.597043503841471e-05,
"loss": 0.3800792217254639,
"memory(GiB)": 127.52,
"step": 850,
"token_acc": 0.8745568192822268,
"train_speed(iter/s)": 0.101325
},
{
"epoch": 1.0011709601873535,
"grad_norm": 0.36053553223609924,
"learning_rate": 1.5918518663893124e-05,
"loss": 0.3734774589538574,
"memory(GiB)": 127.52,
"step": 855,
"token_acc": 0.8709147849019284,
"train_speed(iter/s)": 0.100996
},
{
"epoch": 1.0070257611241218,
"grad_norm": 0.29508745670318604,
"learning_rate": 1.5866355590564637e-05,
"loss": 0.3578346252441406,
"memory(GiB)": 127.52,
"step": 860,
"token_acc": 0.8851065028386151,
"train_speed(iter/s)": 0.100988
},
{
"epoch": 1.0128805620608898,
"grad_norm": 0.30008167028427124,
"learning_rate": 1.5813947992717894e-05,
"loss": 0.34525480270385744,
"memory(GiB)": 127.52,
"step": 865,
"token_acc": 0.8753548176879359,
"train_speed(iter/s)": 0.10098
},
{
"epoch": 1.018735362997658,
"grad_norm": 0.2938152253627777,
"learning_rate": 1.5761298054833947e-05,
"loss": 0.3546164035797119,
"memory(GiB)": 127.52,
"step": 870,
"token_acc": 0.8762193571592467,
"train_speed(iter/s)": 0.100965
},
{
"epoch": 1.0245901639344261,
"grad_norm": 0.27178069949150085,
"learning_rate": 1.5708407971495195e-05,
"loss": 0.3612537384033203,
"memory(GiB)": 127.52,
"step": 875,
"token_acc": 0.8722169198754557,
"train_speed(iter/s)": 0.100976
},
{
"epoch": 1.0304449648711944,
"grad_norm": 0.2759335935115814,
"learning_rate": 1.565527994729389e-05,
"loss": 0.3513669967651367,
"memory(GiB)": 127.52,
"step": 880,
"token_acc": 0.8818436745370559,
"train_speed(iter/s)": 0.100984
},
{
"epoch": 1.0362997658079625,
"grad_norm": 0.2735261917114258,
"learning_rate": 1.5601916196740283e-05,
"loss": 0.3473806858062744,
"memory(GiB)": 127.52,
"step": 885,
"token_acc": 0.8784491835740441,
"train_speed(iter/s)": 0.100979
},
{
"epoch": 1.0421545667447307,
"grad_norm": 0.28892189264297485,
"learning_rate": 1.5548318944170276e-05,
"loss": 0.3433929443359375,
"memory(GiB)": 127.52,
"step": 890,
"token_acc": 0.8839334112478968,
"train_speed(iter/s)": 0.100971
},
{
"epoch": 1.0480093676814988,
"grad_norm": 0.2602222263813019,
"learning_rate": 1.5494490423652732e-05,
"loss": 0.3427423000335693,
"memory(GiB)": 127.52,
"step": 895,
"token_acc": 0.876471048390882,
"train_speed(iter/s)": 0.100951
},
{
"epoch": 1.053864168618267,
"grad_norm": 0.2913144528865814,
"learning_rate": 1.544043287889635e-05,
"loss": 0.3336780071258545,
"memory(GiB)": 127.52,
"step": 900,
"token_acc": 0.8869567959634185,
"train_speed(iter/s)": 0.10095
},
{
"epoch": 1.059718969555035,
"grad_norm": 0.2634846270084381,
"learning_rate": 1.538614856315614e-05,
"loss": 0.3489675998687744,
"memory(GiB)": 127.52,
"step": 905,
"token_acc": 0.8832413903915163,
"train_speed(iter/s)": 0.100958
},
{
"epoch": 1.0655737704918034,
"grad_norm": 0.2699672281742096,
"learning_rate": 1.5331639739139477e-05,
"loss": 0.3432894229888916,
"memory(GiB)": 127.52,
"step": 910,
"token_acc": 0.8669136816431162,
"train_speed(iter/s)": 0.100951
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.2946908175945282,
"learning_rate": 1.5276908678911837e-05,
"loss": 0.3399630546569824,
"memory(GiB)": 127.52,
"step": 915,
"token_acc": 0.8821736748390632,
"train_speed(iter/s)": 0.100953
},
{
"epoch": 1.0772833723653397,
"grad_norm": 0.31119436025619507,
"learning_rate": 1.5221957663802043e-05,
"loss": 0.3506146430969238,
"memory(GiB)": 127.52,
"step": 920,
"token_acc": 0.8818868935608091,
"train_speed(iter/s)": 0.100935
},
{
"epoch": 1.0831381733021077,
"grad_norm": 0.27400681376457214,
"learning_rate": 1.5166788984307204e-05,
"loss": 0.35775036811828614,
"memory(GiB)": 127.52,
"step": 925,
"token_acc": 0.8750959445346218,
"train_speed(iter/s)": 0.100931
},
{
"epoch": 1.088992974238876,
"grad_norm": 0.3916493058204651,
"learning_rate": 1.5111404939997227e-05,
"loss": 0.3546015739440918,
"memory(GiB)": 127.52,
"step": 930,
"token_acc": 0.8738711676022755,
"train_speed(iter/s)": 0.100933
},
{
"epoch": 1.094847775175644,
"grad_norm": 0.3681865930557251,
"learning_rate": 1.5055807839418966e-05,
"loss": 0.33371834754943847,
"memory(GiB)": 127.52,
"step": 935,
"token_acc": 0.8814006570111667,
"train_speed(iter/s)": 0.100931
},
{
"epoch": 1.100702576112412,
"grad_norm": 0.27416518330574036,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.3561122417449951,
"memory(GiB)": 127.52,
"step": 940,
"token_acc": 0.8838524966358717,
"train_speed(iter/s)": 0.100932
},
{
"epoch": 1.1065573770491803,
"grad_norm": 0.2653830349445343,
"learning_rate": 1.494398374795204e-05,
"loss": 0.3430471897125244,
"memory(GiB)": 127.52,
"step": 945,
"token_acc": 0.8739330062998951,
"train_speed(iter/s)": 0.100924
},
{
"epoch": 1.1124121779859484,
"grad_norm": 0.29074740409851074,
"learning_rate": 1.4887761418173947e-05,
"loss": 0.36190090179443357,
"memory(GiB)": 127.52,
"step": 950,
"token_acc": 0.8833006769910948,
"train_speed(iter/s)": 0.100919
},
{
"epoch": 1.1182669789227166,
"grad_norm": 0.2751435339450836,
"learning_rate": 1.4831335354154444e-05,
"loss": 0.34648761749267576,
"memory(GiB)": 127.52,
"step": 955,
"token_acc": 0.8776634838921327,
"train_speed(iter/s)": 0.100926
},
{
"epoch": 1.1241217798594847,
"grad_norm": 0.2628922164440155,
"learning_rate": 1.4774707907874392e-05,
"loss": 0.34562859535217283,
"memory(GiB)": 127.52,
"step": 960,
"token_acc": 0.8836736799002247,
"train_speed(iter/s)": 0.100911
},
{
"epoch": 1.129976580796253,
"grad_norm": 0.2639271020889282,
"learning_rate": 1.4717881439708786e-05,
"loss": 0.34596388339996337,
"memory(GiB)": 127.52,
"step": 965,
"token_acc": 0.8673695686030214,
"train_speed(iter/s)": 0.100909
},
{
"epoch": 1.135831381733021,
"grad_norm": 0.28422874212265015,
"learning_rate": 1.4660858318328348e-05,
"loss": 0.3498117446899414,
"memory(GiB)": 127.52,
"step": 970,
"token_acc": 0.866499586445358,
"train_speed(iter/s)": 0.100888
},
{
"epoch": 1.1416861826697893,
"grad_norm": 0.2625197470188141,
"learning_rate": 1.4603640920600813e-05,
"loss": 0.35533895492553713,
"memory(GiB)": 127.52,
"step": 975,
"token_acc": 0.8624783775908141,
"train_speed(iter/s)": 0.100863
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.2902534008026123,
"learning_rate": 1.4546231631491827e-05,
"loss": 0.35151519775390627,
"memory(GiB)": 127.52,
"step": 980,
"token_acc": 0.871260222085633,
"train_speed(iter/s)": 0.100833
},
{
"epoch": 1.1533957845433256,
"grad_norm": 0.2525332570075989,
"learning_rate": 1.4488632843965573e-05,
"loss": 0.3441092729568481,
"memory(GiB)": 127.52,
"step": 985,
"token_acc": 0.8626160602258469,
"train_speed(iter/s)": 0.100824
},
{
"epoch": 1.1592505854800936,
"grad_norm": 0.26731306314468384,
"learning_rate": 1.4430846958884995e-05,
"loss": 0.3539264678955078,
"memory(GiB)": 127.52,
"step": 990,
"token_acc": 0.8706765643432645,
"train_speed(iter/s)": 0.100815
},
{
"epoch": 1.165105386416862,
"grad_norm": 0.2605798542499542,
"learning_rate": 1.4372876384911741e-05,
"loss": 0.35328848361968995,
"memory(GiB)": 127.52,
"step": 995,
"token_acc": 0.8729384617783252,
"train_speed(iter/s)": 0.100809
},
{
"epoch": 1.17096018735363,
"grad_norm": 0.2707096338272095,
"learning_rate": 1.4314723538405752e-05,
"loss": 0.36124861240386963,
"memory(GiB)": 127.52,
"step": 1000,
"token_acc": 0.8623729975690332,
"train_speed(iter/s)": 0.100795
},
{
"epoch": 1.1768149882903982,
"grad_norm": 0.26851606369018555,
"learning_rate": 1.4256390843324556e-05,
"loss": 0.35548346042633056,
"memory(GiB)": 127.52,
"step": 1005,
"token_acc": 0.868687436031853,
"train_speed(iter/s)": 0.100786
},
{
"epoch": 1.1826697892271663,
"grad_norm": 0.27084365487098694,
"learning_rate": 1.4197880731122221e-05,
"loss": 0.351657772064209,
"memory(GiB)": 127.52,
"step": 1010,
"token_acc": 0.8682709314201729,
"train_speed(iter/s)": 0.100787
},
{
"epoch": 1.1885245901639343,
"grad_norm": 0.27497202157974243,
"learning_rate": 1.4139195640648008e-05,
"loss": 0.355600380897522,
"memory(GiB)": 127.52,
"step": 1015,
"token_acc": 0.8803992028496556,
"train_speed(iter/s)": 0.10078
},
{
"epoch": 1.1943793911007026,
"grad_norm": 0.2708893418312073,
"learning_rate": 1.4080338018044712e-05,
"loss": 0.3596624851226807,
"memory(GiB)": 127.52,
"step": 1020,
"token_acc": 0.8694279635903098,
"train_speed(iter/s)": 0.100784
},
{
"epoch": 1.2002341920374708,
"grad_norm": 0.32129156589508057,
"learning_rate": 1.4021310316646708e-05,
"loss": 0.3490485668182373,
"memory(GiB)": 127.52,
"step": 1025,
"token_acc": 0.8754893595663521,
"train_speed(iter/s)": 0.100766
},
{
"epoch": 1.2060889929742389,
"grad_norm": 0.25467485189437866,
"learning_rate": 1.3962114996877685e-05,
"loss": 0.3347738981246948,
"memory(GiB)": 127.52,
"step": 1030,
"token_acc": 0.8824631079656678,
"train_speed(iter/s)": 0.100756
},
{
"epoch": 1.211943793911007,
"grad_norm": 0.2674933671951294,
"learning_rate": 1.390275452614808e-05,
"loss": 0.338185977935791,
"memory(GiB)": 127.52,
"step": 1035,
"token_acc": 0.8744415325096718,
"train_speed(iter/s)": 0.100755
},
{
"epoch": 1.2177985948477752,
"grad_norm": 0.2707443833351135,
"learning_rate": 1.3843231378752252e-05,
"loss": 0.3448366165161133,
"memory(GiB)": 127.52,
"step": 1040,
"token_acc": 0.8736029828057016,
"train_speed(iter/s)": 0.100747
},
{
"epoch": 1.2236533957845432,
"grad_norm": 0.24581725895404816,
"learning_rate": 1.3783548035765327e-05,
"loss": 0.34962687492370603,
"memory(GiB)": 127.52,
"step": 1045,
"token_acc": 0.8796080775037746,
"train_speed(iter/s)": 0.100757
},
{
"epoch": 1.2295081967213115,
"grad_norm": 0.2379993051290512,
"learning_rate": 1.3723706984939783e-05,
"loss": 0.33640050888061523,
"memory(GiB)": 127.52,
"step": 1050,
"token_acc": 0.8721236366123021,
"train_speed(iter/s)": 0.100739
},
{
"epoch": 1.2353629976580796,
"grad_norm": 0.26605796813964844,
"learning_rate": 1.366371072060177e-05,
"loss": 0.3490384340286255,
"memory(GiB)": 127.52,
"step": 1055,
"token_acc": 0.862867230488973,
"train_speed(iter/s)": 0.100731
},
{
"epoch": 1.2412177985948478,
"grad_norm": 0.25522705912590027,
"learning_rate": 1.3603561743547125e-05,
"loss": 0.34296507835388185,
"memory(GiB)": 127.52,
"step": 1060,
"token_acc": 0.8687898169167538,
"train_speed(iter/s)": 0.100739
},
{
"epoch": 1.2470725995316159,
"grad_norm": 0.2729935348033905,
"learning_rate": 1.3543262560937135e-05,
"loss": 0.34846017360687254,
"memory(GiB)": 127.52,
"step": 1065,
"token_acc": 0.8741769020279135,
"train_speed(iter/s)": 0.100744
},
{
"epoch": 1.2529274004683841,
"grad_norm": 0.2534308433532715,
"learning_rate": 1.3482815686194033e-05,
"loss": 0.33311474323272705,
"memory(GiB)": 127.52,
"step": 1070,
"token_acc": 0.8795399856245507,
"train_speed(iter/s)": 0.100751
},
{
"epoch": 1.2587822014051522,
"grad_norm": 0.2755572497844696,
"learning_rate": 1.3422223638896235e-05,
"loss": 0.3432854413986206,
"memory(GiB)": 127.52,
"step": 1075,
"token_acc": 0.8758250682788096,
"train_speed(iter/s)": 0.100756
},
{
"epoch": 1.2646370023419204,
"grad_norm": 0.2861506938934326,
"learning_rate": 1.3361488944673315e-05,
"loss": 0.3542114496231079,
"memory(GiB)": 127.52,
"step": 1080,
"token_acc": 0.8687981877806241,
"train_speed(iter/s)": 0.100759
},
{
"epoch": 1.2704918032786885,
"grad_norm": 0.3049258589744568,
"learning_rate": 1.3300614135100736e-05,
"loss": 0.3503614664077759,
"memory(GiB)": 127.52,
"step": 1085,
"token_acc": 0.875489517451949,
"train_speed(iter/s)": 0.100754
},
{
"epoch": 1.2763466042154565,
"grad_norm": 0.25668370723724365,
"learning_rate": 1.3239601747594319e-05,
"loss": 0.3487658739089966,
"memory(GiB)": 127.52,
"step": 1090,
"token_acc": 0.8770075135561131,
"train_speed(iter/s)": 0.100751
},
{
"epoch": 1.2822014051522248,
"grad_norm": 0.2401314228773117,
"learning_rate": 1.3178454325304472e-05,
"loss": 0.3507190465927124,
"memory(GiB)": 127.52,
"step": 1095,
"token_acc": 0.8644839657435769,
"train_speed(iter/s)": 0.100741
},
{
"epoch": 1.288056206088993,
"grad_norm": 0.2501038908958435,
"learning_rate": 1.3117174417010213e-05,
"loss": 0.3356021404266357,
"memory(GiB)": 127.52,
"step": 1100,
"token_acc": 0.8694938440492476,
"train_speed(iter/s)": 0.100738
},
{
"epoch": 1.2939110070257611,
"grad_norm": 0.25629186630249023,
"learning_rate": 1.3055764577012892e-05,
"loss": 0.3668931007385254,
"memory(GiB)": 127.52,
"step": 1105,
"token_acc": 0.8810234328372201,
"train_speed(iter/s)": 0.100745
},
{
"epoch": 1.2997658079625292,
"grad_norm": 0.2689758539199829,
"learning_rate": 1.2994227365029752e-05,
"loss": 0.34679102897644043,
"memory(GiB)": 127.52,
"step": 1110,
"token_acc": 0.8783292769097903,
"train_speed(iter/s)": 0.100746
},
{
"epoch": 1.3056206088992974,
"grad_norm": 0.2619406282901764,
"learning_rate": 1.2932565346087218e-05,
"loss": 0.35414924621582033,
"memory(GiB)": 127.52,
"step": 1115,
"token_acc": 0.8748901150285233,
"train_speed(iter/s)": 0.100729
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.3210083842277527,
"learning_rate": 1.2870781090413991e-05,
"loss": 0.35202646255493164,
"memory(GiB)": 127.52,
"step": 1120,
"token_acc": 0.8757856423662141,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 1.3173302107728337,
"grad_norm": 0.27284613251686096,
"learning_rate": 1.2808877173333896e-05,
"loss": 0.3467656850814819,
"memory(GiB)": 127.52,
"step": 1125,
"token_acc": 0.883265632074048,
"train_speed(iter/s)": 0.100724
},
{
"epoch": 1.3231850117096018,
"grad_norm": 0.2710505425930023,
"learning_rate": 1.2746856175158556e-05,
"loss": 0.35611112117767335,
"memory(GiB)": 127.52,
"step": 1130,
"token_acc": 0.8756308252586658,
"train_speed(iter/s)": 0.100737
},
{
"epoch": 1.32903981264637,
"grad_norm": 0.26133865118026733,
"learning_rate": 1.2684720681079825e-05,
"loss": 0.3506006240844727,
"memory(GiB)": 127.52,
"step": 1135,
"token_acc": 0.8604187872166245,
"train_speed(iter/s)": 0.100742
},
{
"epoch": 1.334894613583138,
"grad_norm": 0.27019548416137695,
"learning_rate": 1.2622473281062042e-05,
"loss": 0.35390684604644773,
"memory(GiB)": 127.52,
"step": 1140,
"token_acc": 0.8757172258949731,
"train_speed(iter/s)": 0.100736
},
{
"epoch": 1.3407494145199064,
"grad_norm": 0.26330387592315674,
"learning_rate": 1.256011656973406e-05,
"loss": 0.36088995933532714,
"memory(GiB)": 127.52,
"step": 1145,
"token_acc": 0.8777154145240186,
"train_speed(iter/s)": 0.100733
},
{
"epoch": 1.3466042154566744,
"grad_norm": 0.24824829399585724,
"learning_rate": 1.2497653146281113e-05,
"loss": 0.3501885175704956,
"memory(GiB)": 127.52,
"step": 1150,
"token_acc": 0.8752751123830188,
"train_speed(iter/s)": 0.100712
},
{
"epoch": 1.3524590163934427,
"grad_norm": 0.2536720037460327,
"learning_rate": 1.2435085614336459e-05,
"loss": 0.3565546989440918,
"memory(GiB)": 127.52,
"step": 1155,
"token_acc": 0.8831354083065811,
"train_speed(iter/s)": 0.100705
},
{
"epoch": 1.3583138173302107,
"grad_norm": 0.24884596467018127,
"learning_rate": 1.2372416581872857e-05,
"loss": 0.34425859451293944,
"memory(GiB)": 127.52,
"step": 1160,
"token_acc": 0.8804687524440259,
"train_speed(iter/s)": 0.100705
},
{
"epoch": 1.364168618266979,
"grad_norm": 0.2567351162433624,
"learning_rate": 1.2309648661093878e-05,
"loss": 0.3500640630722046,
"memory(GiB)": 127.52,
"step": 1165,
"token_acc": 0.8808626074837297,
"train_speed(iter/s)": 0.100708
},
{
"epoch": 1.370023419203747,
"grad_norm": 0.27127236127853394,
"learning_rate": 1.2246784468324993e-05,
"loss": 0.35610170364379884,
"memory(GiB)": 127.52,
"step": 1170,
"token_acc": 0.8642630631304163,
"train_speed(iter/s)": 0.100707
},
{
"epoch": 1.3758782201405153,
"grad_norm": 0.25630801916122437,
"learning_rate": 1.218382662390454e-05,
"loss": 0.3440692901611328,
"memory(GiB)": 127.52,
"step": 1175,
"token_acc": 0.863847903863763,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 1.3817330210772834,
"grad_norm": 0.2579875886440277,
"learning_rate": 1.2120777752074492e-05,
"loss": 0.35255093574523927,
"memory(GiB)": 127.52,
"step": 1180,
"token_acc": 0.8730105052212985,
"train_speed(iter/s)": 0.100715
},
{
"epoch": 1.3875878220140514,
"grad_norm": 0.2638234496116638,
"learning_rate": 1.2057640480871084e-05,
"loss": 0.3546736717224121,
"memory(GiB)": 127.52,
"step": 1185,
"token_acc": 0.8738721335992023,
"train_speed(iter/s)": 0.100725
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.25871458649635315,
"learning_rate": 1.1994417442015243e-05,
"loss": 0.35408906936645507,
"memory(GiB)": 127.52,
"step": 1190,
"token_acc": 0.8796952149117578,
"train_speed(iter/s)": 0.100732
},
{
"epoch": 1.399297423887588,
"grad_norm": 0.2632989287376404,
"learning_rate": 1.193111127080292e-05,
"loss": 0.3432591676712036,
"memory(GiB)": 127.52,
"step": 1195,
"token_acc": 0.8828218086199104,
"train_speed(iter/s)": 0.10074
},
{
"epoch": 1.405152224824356,
"grad_norm": 0.24726183712482452,
"learning_rate": 1.186772460599523e-05,
"loss": 0.34243590831756593,
"memory(GiB)": 127.52,
"step": 1200,
"token_acc": 0.8815012144480138,
"train_speed(iter/s)": 0.100741
},
{
"epoch": 1.411007025761124,
"grad_norm": 0.3329097032546997,
"learning_rate": 1.1804260089708464e-05,
"loss": 0.3537503480911255,
"memory(GiB)": 127.52,
"step": 1205,
"token_acc": 0.8658939159898351,
"train_speed(iter/s)": 0.100735
},
{
"epoch": 1.4168618266978923,
"grad_norm": 0.25181666016578674,
"learning_rate": 1.1740720367303958e-05,
"loss": 0.347446870803833,
"memory(GiB)": 127.52,
"step": 1210,
"token_acc": 0.8740943022953225,
"train_speed(iter/s)": 0.10074
},
{
"epoch": 1.4227166276346606,
"grad_norm": 0.2532757818698883,
"learning_rate": 1.1677108087277835e-05,
"loss": 0.3539264678955078,
"memory(GiB)": 127.52,
"step": 1215,
"token_acc": 0.8749382353125137,
"train_speed(iter/s)": 0.100743
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.2551215887069702,
"learning_rate": 1.1613425901150595e-05,
"loss": 0.35313239097595217,
"memory(GiB)": 127.52,
"step": 1220,
"token_acc": 0.8776082867215627,
"train_speed(iter/s)": 0.100745
},
{
"epoch": 1.4344262295081966,
"grad_norm": 0.2713333070278168,
"learning_rate": 1.15496764633566e-05,
"loss": 0.3634988307952881,
"memory(GiB)": 127.52,
"step": 1225,
"token_acc": 0.8660714848651069,
"train_speed(iter/s)": 0.10073
},
{
"epoch": 1.440281030444965,
"grad_norm": 0.26022830605506897,
"learning_rate": 1.1485862431133445e-05,
"loss": 0.3524580478668213,
"memory(GiB)": 127.52,
"step": 1230,
"token_acc": 0.8803166548004755,
"train_speed(iter/s)": 0.100717
},
{
"epoch": 1.446135831381733,
"grad_norm": 0.25171470642089844,
"learning_rate": 1.1421986464411169e-05,
"loss": 0.3533075571060181,
"memory(GiB)": 127.52,
"step": 1235,
"token_acc": 0.8648047662981438,
"train_speed(iter/s)": 0.100703
},
{
"epoch": 1.4519906323185012,
"grad_norm": 0.2464302033185959,
"learning_rate": 1.1358051225701404e-05,
"loss": 0.3423281192779541,
"memory(GiB)": 127.52,
"step": 1240,
"token_acc": 0.8691011183611862,
"train_speed(iter/s)": 0.100701
},
{
"epoch": 1.4578454332552693,
"grad_norm": 0.25466638803482056,
"learning_rate": 1.1294059379986384e-05,
"loss": 0.35201549530029297,
"memory(GiB)": 127.52,
"step": 1245,
"token_acc": 0.8681012341038652,
"train_speed(iter/s)": 0.100689
},
{
"epoch": 1.4637002341920375,
"grad_norm": 0.2576982080936432,
"learning_rate": 1.1230013594607874e-05,
"loss": 0.3531355857849121,
"memory(GiB)": 127.52,
"step": 1250,
"token_acc": 0.873457880243676,
"train_speed(iter/s)": 0.100693
},
{
"epoch": 1.4695550351288056,
"grad_norm": 0.25660985708236694,
"learning_rate": 1.1165916539155968e-05,
"loss": 0.35094761848449707,
"memory(GiB)": 127.52,
"step": 1255,
"token_acc": 0.8773934266901257,
"train_speed(iter/s)": 0.100701
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.24054618179798126,
"learning_rate": 1.1101770885357843e-05,
"loss": 0.34633212089538573,
"memory(GiB)": 127.52,
"step": 1260,
"token_acc": 0.8775079994840057,
"train_speed(iter/s)": 0.100711
},
{
"epoch": 1.481264637002342,
"grad_norm": 0.2445182204246521,
"learning_rate": 1.1037579306966365e-05,
"loss": 0.34541456699371337,
"memory(GiB)": 127.52,
"step": 1265,
"token_acc": 0.8862320037137543,
"train_speed(iter/s)": 0.100709
},
{
"epoch": 1.4871194379391102,
"grad_norm": 0.2729050815105438,
"learning_rate": 1.0973344479648652e-05,
"loss": 0.3409654855728149,
"memory(GiB)": 127.52,
"step": 1270,
"token_acc": 0.8771963474914158,
"train_speed(iter/s)": 0.100709
},
{
"epoch": 1.4929742388758782,
"grad_norm": 0.24874503910541534,
"learning_rate": 1.0909069080874556e-05,
"loss": 0.3430008411407471,
"memory(GiB)": 127.52,
"step": 1275,
"token_acc": 0.8704117168634027,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 1.4988290398126463,
"grad_norm": 0.2715279459953308,
"learning_rate": 1.0844755789805042e-05,
"loss": 0.35068159103393554,
"memory(GiB)": 127.52,
"step": 1280,
"token_acc": 0.8675264981305526,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 1.5046838407494145,
"grad_norm": 0.23995369672775269,
"learning_rate": 1.0780407287180526e-05,
"loss": 0.35523912906646726,
"memory(GiB)": 127.52,
"step": 1285,
"token_acc": 0.8685361997709505,
"train_speed(iter/s)": 0.100706
},
{
"epoch": 1.5105386416861828,
"grad_norm": 0.26195716857910156,
"learning_rate": 1.0716026255209124e-05,
"loss": 0.349694561958313,
"memory(GiB)": 127.52,
"step": 1290,
"token_acc": 0.8676919971870162,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 1.5163934426229508,
"grad_norm": 0.24379870295524597,
"learning_rate": 1.0651615377454872e-05,
"loss": 0.3513511657714844,
"memory(GiB)": 127.52,
"step": 1295,
"token_acc": 0.8762717457922776,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 1.5222482435597189,
"grad_norm": 0.2554638683795929,
"learning_rate": 1.0587177338725834e-05,
"loss": 0.3569997787475586,
"memory(GiB)": 127.52,
"step": 1300,
"token_acc": 0.8766220533416101,
"train_speed(iter/s)": 0.100716
},
{
"epoch": 1.5281030444964872,
"grad_norm": 0.2548043727874756,
"learning_rate": 1.0522714824962228e-05,
"loss": 0.3422648191452026,
"memory(GiB)": 127.52,
"step": 1305,
"token_acc": 0.8870603034829783,
"train_speed(iter/s)": 0.100709
},
{
"epoch": 1.5339578454332554,
"grad_norm": 0.24967636168003082,
"learning_rate": 1.0458230523124443e-05,
"loss": 0.3560429573059082,
"memory(GiB)": 127.52,
"step": 1310,
"token_acc": 0.8787232780765522,
"train_speed(iter/s)": 0.100701
},
{
"epoch": 1.5398126463700235,
"grad_norm": 0.2598780691623688,
"learning_rate": 1.0393727121081057e-05,
"loss": 0.3518627166748047,
"memory(GiB)": 127.52,
"step": 1315,
"token_acc": 0.8750810752945474,
"train_speed(iter/s)": 0.10071
},
{
"epoch": 1.5456674473067915,
"grad_norm": 0.23825575411319733,
"learning_rate": 1.0329207307496785e-05,
"loss": 0.3401672840118408,
"memory(GiB)": 127.52,
"step": 1320,
"token_acc": 0.8770460187011242,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 1.5515222482435598,
"grad_norm": 0.2550235986709595,
"learning_rate": 1.0264673771720429e-05,
"loss": 0.350058913230896,
"memory(GiB)": 127.52,
"step": 1325,
"token_acc": 0.881267240867612,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 1.5573770491803278,
"grad_norm": 0.269613653421402,
"learning_rate": 1.0200129203672754e-05,
"loss": 0.3502191543579102,
"memory(GiB)": 127.52,
"step": 1330,
"token_acc": 0.8661129276756743,
"train_speed(iter/s)": 0.10071
},
{
"epoch": 1.5632318501170959,
"grad_norm": 0.24150115251541138,
"learning_rate": 1.0135576293734381e-05,
"loss": 0.34059958457946776,
"memory(GiB)": 127.52,
"step": 1335,
"token_acc": 0.8847581210563216,
"train_speed(iter/s)": 0.100716
},
{
"epoch": 1.5690866510538641,
"grad_norm": 0.2703973650932312,
"learning_rate": 1.007101773263365e-05,
"loss": 0.35358033180236814,
"memory(GiB)": 127.52,
"step": 1340,
"token_acc": 0.8770655404348506,
"train_speed(iter/s)": 0.100721
},
{
"epoch": 1.5749414519906324,
"grad_norm": 0.23871327936649323,
"learning_rate": 1.0006456211334445e-05,
"loss": 0.3467454671859741,
"memory(GiB)": 127.52,
"step": 1345,
"token_acc": 0.8759395313396612,
"train_speed(iter/s)": 0.100731
},
{
"epoch": 1.5807962529274004,
"grad_norm": 0.25692564249038696,
"learning_rate": 9.941894420924044e-06,
"loss": 0.3450988054275513,
"memory(GiB)": 127.52,
"step": 1350,
"token_acc": 0.8868195745646664,
"train_speed(iter/s)": 0.100727
},
{
"epoch": 1.5866510538641685,
"grad_norm": 0.2428205907344818,
"learning_rate": 9.87733505250094e-06,
"loss": 0.3494907855987549,
"memory(GiB)": 127.52,
"step": 1355,
"token_acc": 0.8756121235576668,
"train_speed(iter/s)": 0.100725
},
{
"epoch": 1.5925058548009368,
"grad_norm": 0.24155238270759583,
"learning_rate": 9.812780797062678e-06,
"loss": 0.3456254005432129,
"memory(GiB)": 127.52,
"step": 1360,
"token_acc": 0.8809245943605768,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 1.598360655737705,
"grad_norm": 0.464139848947525,
"learning_rate": 9.748234345393672e-06,
"loss": 0.34203310012817384,
"memory(GiB)": 127.52,
"step": 1365,
"token_acc": 0.8774237555421359,
"train_speed(iter/s)": 0.100715
},
{
"epoch": 1.604215456674473,
"grad_norm": 0.2672084867954254,
"learning_rate": 9.68369838795306e-06,
"loss": 0.350542688369751,
"memory(GiB)": 127.52,
"step": 1370,
"token_acc": 0.8734205080790737,
"train_speed(iter/s)": 0.100705
},
{
"epoch": 1.6100702576112411,
"grad_norm": 0.2600000500679016,
"learning_rate": 9.61917561476255e-06,
"loss": 0.3421807050704956,
"memory(GiB)": 127.52,
"step": 1375,
"token_acc": 0.8668853013058622,
"train_speed(iter/s)": 0.100709
},
{
"epoch": 1.6159250585480094,
"grad_norm": 0.2540619373321533,
"learning_rate": 9.554668715294305e-06,
"loss": 0.3543410778045654,
"memory(GiB)": 127.52,
"step": 1380,
"token_acc": 0.8761743728864414,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 1.6217798594847777,
"grad_norm": 0.2585217356681824,
"learning_rate": 9.490180378358826e-06,
"loss": 0.35744295120239256,
"memory(GiB)": 127.52,
"step": 1385,
"token_acc": 0.8715506016593595,
"train_speed(iter/s)": 0.100718
},
{
"epoch": 1.6276346604215457,
"grad_norm": 0.26017606258392334,
"learning_rate": 9.425713291992878e-06,
"loss": 0.34558424949645994,
"memory(GiB)": 127.52,
"step": 1390,
"token_acc": 0.8794015410099387,
"train_speed(iter/s)": 0.100719
},
{
"epoch": 1.6334894613583137,
"grad_norm": 0.25051021575927734,
"learning_rate": 9.361270143347452e-06,
"loss": 0.35907368659973143,
"memory(GiB)": 127.52,
"step": 1395,
"token_acc": 0.8715231746371632,
"train_speed(iter/s)": 0.100723
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.24877934157848358,
"learning_rate": 9.296853618575753e-06,
"loss": 0.34605088233947756,
"memory(GiB)": 127.52,
"step": 1400,
"token_acc": 0.8828522126980963,
"train_speed(iter/s)": 0.100731
},
{
"epoch": 1.6451990632318503,
"grad_norm": 0.23893095552921295,
"learning_rate": 9.232466402721241e-06,
"loss": 0.3570685625076294,
"memory(GiB)": 127.52,
"step": 1405,
"token_acc": 0.8760022299616647,
"train_speed(iter/s)": 0.10074
},
{
"epoch": 1.651053864168618,
"grad_norm": 0.24638938903808594,
"learning_rate": 9.1681111796057e-06,
"loss": 0.3466794967651367,
"memory(GiB)": 127.52,
"step": 1410,
"token_acc": 0.8773031091974165,
"train_speed(iter/s)": 0.100749
},
{
"epoch": 1.6569086651053864,
"grad_norm": 0.256526380777359,
"learning_rate": 9.103790631717375e-06,
"loss": 0.3623323917388916,
"memory(GiB)": 127.52,
"step": 1415,
"token_acc": 0.8679865616745452,
"train_speed(iter/s)": 0.100748
},
{
"epoch": 1.6627634660421546,
"grad_norm": 0.25238198041915894,
"learning_rate": 9.039507440099164e-06,
"loss": 0.3467939138412476,
"memory(GiB)": 127.52,
"step": 1420,
"token_acc": 0.8828419526341228,
"train_speed(iter/s)": 0.100746
},
{
"epoch": 1.6686182669789227,
"grad_norm": 0.23841890692710876,
"learning_rate": 8.975264284236866e-06,
"loss": 0.34966843128204345,
"memory(GiB)": 127.52,
"step": 1425,
"token_acc": 0.8775815971188294,
"train_speed(iter/s)": 0.100755
},
{
"epoch": 1.6744730679156907,
"grad_norm": 0.26001548767089844,
"learning_rate": 8.911063841947476e-06,
"loss": 0.35109724998474123,
"memory(GiB)": 127.52,
"step": 1430,
"token_acc": 0.8745225380796411,
"train_speed(iter/s)": 0.100754
},
{
"epoch": 1.680327868852459,
"grad_norm": 0.2468952238559723,
"learning_rate": 8.846908789267589e-06,
"loss": 0.35158143043518064,
"memory(GiB)": 127.52,
"step": 1435,
"token_acc": 0.8772585276576946,
"train_speed(iter/s)": 0.100766
},
{
"epoch": 1.6861826697892273,
"grad_norm": 0.24095061421394348,
"learning_rate": 8.78280180034184e-06,
"loss": 0.3411277770996094,
"memory(GiB)": 127.52,
"step": 1440,
"token_acc": 0.8712463039204312,
"train_speed(iter/s)": 0.10077
},
{
"epoch": 1.6920374707259953,
"grad_norm": 0.25439053773880005,
"learning_rate": 8.718745547311458e-06,
"loss": 0.3543074131011963,
"memory(GiB)": 127.52,
"step": 1445,
"token_acc": 0.871980767417743,
"train_speed(iter/s)": 0.100763
},
{
"epoch": 1.6978922716627634,
"grad_norm": 1.5297069549560547,
"learning_rate": 8.654742700202849e-06,
"loss": 0.3533529043197632,
"memory(GiB)": 127.52,
"step": 1450,
"token_acc": 0.8742467882207196,
"train_speed(iter/s)": 0.100766
},
{
"epoch": 1.7037470725995316,
"grad_norm": 0.25103631615638733,
"learning_rate": 8.590795926816348e-06,
"loss": 0.3418538570404053,
"memory(GiB)": 127.52,
"step": 1455,
"token_acc": 0.8745452901882429,
"train_speed(iter/s)": 0.100768
},
{
"epoch": 1.7096018735362999,
"grad_norm": 0.3538268208503723,
"learning_rate": 8.526907892614986e-06,
"loss": 0.34701027870178225,
"memory(GiB)": 127.52,
"step": 1460,
"token_acc": 0.8781468525993731,
"train_speed(iter/s)": 0.100762
},
{
"epoch": 1.715456674473068,
"grad_norm": 0.2575690448284149,
"learning_rate": 8.463081260613391e-06,
"loss": 0.3492567539215088,
"memory(GiB)": 127.52,
"step": 1465,
"token_acc": 0.8833869870635476,
"train_speed(iter/s)": 0.10076
},
{
"epoch": 1.721311475409836,
"grad_norm": 0.25249573588371277,
"learning_rate": 8.399318691266806e-06,
"loss": 0.35265603065490725,
"memory(GiB)": 127.52,
"step": 1470,
"token_acc": 0.8733317460118548,
"train_speed(iter/s)": 0.10076
},
{
"epoch": 1.7271662763466042,
"grad_norm": 0.26620882749557495,
"learning_rate": 8.335622842360168e-06,
"loss": 0.3444960594177246,
"memory(GiB)": 127.52,
"step": 1475,
"token_acc": 0.8786412367096045,
"train_speed(iter/s)": 0.100754
},
{
"epoch": 1.7330210772833725,
"grad_norm": 0.25925421714782715,
"learning_rate": 8.271996368897345e-06,
"loss": 0.35317885875701904,
"memory(GiB)": 127.52,
"step": 1480,
"token_acc": 0.8806173955625871,
"train_speed(iter/s)": 0.10074
},
{
"epoch": 1.7388758782201406,
"grad_norm": 0.24599948525428772,
"learning_rate": 8.208441922990454e-06,
"loss": 0.34299373626708984,
"memory(GiB)": 127.52,
"step": 1485,
"token_acc": 0.8748146671484283,
"train_speed(iter/s)": 0.100736
},
{
"epoch": 1.7447306791569086,
"grad_norm": 0.2374086081981659,
"learning_rate": 8.144962153749331e-06,
"loss": 0.3454796314239502,
"memory(GiB)": 127.52,
"step": 1490,
"token_acc": 0.8697578355578018,
"train_speed(iter/s)": 0.100735
},
{
"epoch": 1.7505854800936769,
"grad_norm": 0.2567986845970154,
"learning_rate": 8.081559707171094e-06,
"loss": 0.35629470348358155,
"memory(GiB)": 127.52,
"step": 1495,
"token_acc": 0.8722708482627621,
"train_speed(iter/s)": 0.100742
},
{
"epoch": 1.756440281030445,
"grad_norm": 0.2612420320510864,
"learning_rate": 8.01823722602986e-06,
"loss": 0.34243695735931395,
"memory(GiB)": 127.52,
"step": 1500,
"token_acc": 0.8835913661147516,
"train_speed(iter/s)": 0.100749
},
{
"epoch": 1.762295081967213,
"grad_norm": 0.25001969933509827,
"learning_rate": 7.954997349766576e-06,
"loss": 0.3504654407501221,
"memory(GiB)": 127.52,
"step": 1505,
"token_acc": 0.8767294491512118,
"train_speed(iter/s)": 0.100752
},
{
"epoch": 1.7681498829039812,
"grad_norm": 0.24179641902446747,
"learning_rate": 7.891842714379027e-06,
"loss": 0.3378228425979614,
"memory(GiB)": 127.52,
"step": 1510,
"token_acc": 0.8821447808495446,
"train_speed(iter/s)": 0.10075
},
{
"epoch": 1.7740046838407495,
"grad_norm": 0.2632296085357666,
"learning_rate": 7.828775952311921e-06,
"loss": 0.34106738567352296,
"memory(GiB)": 127.52,
"step": 1515,
"token_acc": 0.872465283102722,
"train_speed(iter/s)": 0.100743
},
{
"epoch": 1.7798594847775175,
"grad_norm": 0.2476883977651596,
"learning_rate": 7.765799692347201e-06,
"loss": 0.34442992210388185,
"memory(GiB)": 127.52,
"step": 1520,
"token_acc": 0.8729373501693029,
"train_speed(iter/s)": 0.100743
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.2630121111869812,
"learning_rate": 7.702916559494444e-06,
"loss": 0.3511634588241577,
"memory(GiB)": 127.52,
"step": 1525,
"token_acc": 0.8770366431554089,
"train_speed(iter/s)": 0.100748
},
{
"epoch": 1.7915690866510539,
"grad_norm": 0.24981631338596344,
"learning_rate": 7.64012917488146e-06,
"loss": 0.33224847316741946,
"memory(GiB)": 127.52,
"step": 1530,
"token_acc": 0.8896432981333869,
"train_speed(iter/s)": 0.100745
},
{
"epoch": 1.7974238875878221,
"grad_norm": 0.25589603185653687,
"learning_rate": 7.577440155645028e-06,
"loss": 0.3430049896240234,
"memory(GiB)": 127.52,
"step": 1535,
"token_acc": 0.8784328165618647,
"train_speed(iter/s)": 0.100733
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.24135656654834747,
"learning_rate": 7.514852114821811e-06,
"loss": 0.35404491424560547,
"memory(GiB)": 127.52,
"step": 1540,
"token_acc": 0.8758359005184462,
"train_speed(iter/s)": 0.100732
},
{
"epoch": 1.8091334894613582,
"grad_norm": 0.3086133599281311,
"learning_rate": 7.452367661239433e-06,
"loss": 0.3292539596557617,
"memory(GiB)": 127.52,
"step": 1545,
"token_acc": 0.8771563599039064,
"train_speed(iter/s)": 0.10073
},
{
"epoch": 1.8149882903981265,
"grad_norm": 0.26186585426330566,
"learning_rate": 7.389989399407741e-06,
"loss": 0.3564730644226074,
"memory(GiB)": 127.52,
"step": 1550,
"token_acc": 0.8767442953125245,
"train_speed(iter/s)": 0.100726
},
{
"epoch": 1.8208430913348947,
"grad_norm": 0.2449086308479309,
"learning_rate": 7.3277199294102485e-06,
"loss": 0.3377220630645752,
"memory(GiB)": 127.52,
"step": 1555,
"token_acc": 0.8731188520277088,
"train_speed(iter/s)": 0.100729
},
{
"epoch": 1.8266978922716628,
"grad_norm": 0.2617018222808838,
"learning_rate": 7.265561846795741e-06,
"loss": 0.35269980430603026,
"memory(GiB)": 127.52,
"step": 1560,
"token_acc": 0.8755254872982656,
"train_speed(iter/s)": 0.100718
},
{
"epoch": 1.8325526932084308,
"grad_norm": 0.2533339262008667,
"learning_rate": 7.203517742470101e-06,
"loss": 0.3477527856826782,
"memory(GiB)": 127.52,
"step": 1565,
"token_acc": 0.8841913617578873,
"train_speed(iter/s)": 0.100718
},
{
"epoch": 1.838407494145199,
"grad_norm": 0.24031810462474823,
"learning_rate": 7.141590202588312e-06,
"loss": 0.35293850898742674,
"memory(GiB)": 127.52,
"step": 1570,
"token_acc": 0.8790135675181339,
"train_speed(iter/s)": 0.100724
},
{
"epoch": 1.8442622950819674,
"grad_norm": 0.2540515661239624,
"learning_rate": 7.079781808446648e-06,
"loss": 0.35478663444519043,
"memory(GiB)": 127.52,
"step": 1575,
"token_acc": 0.8638225043564849,
"train_speed(iter/s)": 0.100721
},
{
"epoch": 1.8501170960187352,
"grad_norm": 0.24163876473903656,
"learning_rate": 7.018095136375089e-06,
"loss": 0.33953070640563965,
"memory(GiB)": 127.52,
"step": 1580,
"token_acc": 0.8760248415939393,
"train_speed(iter/s)": 0.100721
},
{
"epoch": 1.8559718969555035,
"grad_norm": 0.24985362589359283,
"learning_rate": 6.956532757629945e-06,
"loss": 0.34739911556243896,
"memory(GiB)": 127.52,
"step": 1585,
"token_acc": 0.8751094324520373,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 1.8618266978922717,
"grad_norm": 0.24738718569278717,
"learning_rate": 6.89509723828665e-06,
"loss": 0.35140252113342285,
"memory(GiB)": 127.52,
"step": 1590,
"token_acc": 0.8747874666018945,
"train_speed(iter/s)": 0.100721
},
{
"epoch": 1.8676814988290398,
"grad_norm": 0.2528833746910095,
"learning_rate": 6.833791139132824e-06,
"loss": 0.3366274356842041,
"memory(GiB)": 127.52,
"step": 1595,
"token_acc": 0.877359708131215,
"train_speed(iter/s)": 0.100705
},
{
"epoch": 1.8735362997658078,
"grad_norm": 0.22930973768234253,
"learning_rate": 6.772617015561529e-06,
"loss": 0.34548795223236084,
"memory(GiB)": 127.52,
"step": 1600,
"token_acc": 0.8674766998186026,
"train_speed(iter/s)": 0.100705
},
{
"epoch": 1.879391100702576,
"grad_norm": 0.23658259212970734,
"learning_rate": 6.7115774174647475e-06,
"loss": 0.3390948295593262,
"memory(GiB)": 127.52,
"step": 1605,
"token_acc": 0.883574050014699,
"train_speed(iter/s)": 0.100706
},
{
"epoch": 1.8852459016393444,
"grad_norm": 0.25393053889274597,
"learning_rate": 6.6506748891271045e-06,
"loss": 0.3500185012817383,
"memory(GiB)": 127.52,
"step": 1610,
"token_acc": 0.8819961495087196,
"train_speed(iter/s)": 0.100708
},
{
"epoch": 1.8911007025761124,
"grad_norm": 0.23870056867599487,
"learning_rate": 6.5899119691198025e-06,
"loss": 0.343201732635498,
"memory(GiB)": 127.52,
"step": 1615,
"token_acc": 0.8769540112004077,
"train_speed(iter/s)": 0.100712
},
{
"epoch": 1.8969555035128804,
"grad_norm": 0.23795676231384277,
"learning_rate": 6.529291190194829e-06,
"loss": 0.3476824998855591,
"memory(GiB)": 127.52,
"step": 1620,
"token_acc": 0.8771016372387611,
"train_speed(iter/s)": 0.100717
},
{
"epoch": 1.9028103044496487,
"grad_norm": 0.23620595037937164,
"learning_rate": 6.468815079179364e-06,
"loss": 0.3438570022583008,
"memory(GiB)": 127.52,
"step": 1625,
"token_acc": 0.8808678958099098,
"train_speed(iter/s)": 0.100717
},
{
"epoch": 1.908665105386417,
"grad_norm": 0.27084144949913025,
"learning_rate": 6.408486156870466e-06,
"loss": 0.3575857162475586,
"memory(GiB)": 127.52,
"step": 1630,
"token_acc": 0.8567800504203767,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 1.914519906323185,
"grad_norm": 0.24774354696273804,
"learning_rate": 6.348306937929991e-06,
"loss": 0.3539011001586914,
"memory(GiB)": 127.52,
"step": 1635,
"token_acc": 0.8722537158121981,
"train_speed(iter/s)": 0.100726
},
{
"epoch": 1.920374707259953,
"grad_norm": 0.23919358849525452,
"learning_rate": 6.288279930779789e-06,
"loss": 0.33454456329345705,
"memory(GiB)": 127.52,
"step": 1640,
"token_acc": 0.8859452149573859,
"train_speed(iter/s)": 0.100729
},
{
"epoch": 1.9262295081967213,
"grad_norm": 0.2600441575050354,
"learning_rate": 6.228407637497131e-06,
"loss": 0.34556894302368163,
"memory(GiB)": 127.52,
"step": 1645,
"token_acc": 0.8641004272904045,
"train_speed(iter/s)": 0.100727
},
{
"epoch": 1.9320843091334896,
"grad_norm": 0.2533404231071472,
"learning_rate": 6.1686925537104306e-06,
"loss": 0.3354111433029175,
"memory(GiB)": 127.52,
"step": 1650,
"token_acc": 0.8690573840794189,
"train_speed(iter/s)": 0.100726
},
{
"epoch": 1.9379391100702577,
"grad_norm": 0.24305778741836548,
"learning_rate": 6.109137168495205e-06,
"loss": 0.342392110824585,
"memory(GiB)": 127.52,
"step": 1655,
"token_acc": 0.8907634917938944,
"train_speed(iter/s)": 0.100732
},
{
"epoch": 1.9437939110070257,
"grad_norm": 0.23065665364265442,
"learning_rate": 6.049743964270336e-06,
"loss": 0.35349397659301757,
"memory(GiB)": 127.52,
"step": 1660,
"token_acc": 0.8749648996911172,
"train_speed(iter/s)": 0.100731
},
{
"epoch": 1.949648711943794,
"grad_norm": 0.26187312602996826,
"learning_rate": 5.990515416694591e-06,
"loss": 0.3514526844024658,
"memory(GiB)": 127.52,
"step": 1665,
"token_acc": 0.8773919272455463,
"train_speed(iter/s)": 0.100729
},
{
"epoch": 1.955503512880562,
"grad_norm": 0.2436314970254898,
"learning_rate": 5.931453994563434e-06,
"loss": 0.34615340232849123,
"memory(GiB)": 127.52,
"step": 1670,
"token_acc": 0.8825784399814935,
"train_speed(iter/s)": 0.100722
},
{
"epoch": 1.96135831381733,
"grad_norm": 1.0637788772583008,
"learning_rate": 5.872562159706116e-06,
"loss": 0.34925112724304197,
"memory(GiB)": 127.52,
"step": 1675,
"token_acc": 0.8725762818496382,
"train_speed(iter/s)": 0.100718
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.2608899176120758,
"learning_rate": 5.8138423668830605e-06,
"loss": 0.34130330085754396,
"memory(GiB)": 127.52,
"step": 1680,
"token_acc": 0.876563876375788,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 1.9730679156908666,
"grad_norm": 0.24455122649669647,
"learning_rate": 5.755297063683551e-06,
"loss": 0.3456611633300781,
"memory(GiB)": 127.52,
"step": 1685,
"token_acc": 0.8803155448934612,
"train_speed(iter/s)": 0.100717
},
{
"epoch": 1.9789227166276346,
"grad_norm": 0.23744545876979828,
"learning_rate": 5.696928690423693e-06,
"loss": 0.3404732942581177,
"memory(GiB)": 127.52,
"step": 1690,
"token_acc": 0.873919857146425,
"train_speed(iter/s)": 0.100721
},
{
"epoch": 1.9847775175644027,
"grad_norm": 0.2499692440032959,
"learning_rate": 5.638739680044718e-06,
"loss": 0.3554127931594849,
"memory(GiB)": 127.52,
"step": 1695,
"token_acc": 0.8678405344492528,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 1.990632318501171,
"grad_norm": 0.23933644592761993,
"learning_rate": 5.580732458011544e-06,
"loss": 0.34451732635498045,
"memory(GiB)": 127.52,
"step": 1700,
"token_acc": 0.8813060735041081,
"train_speed(iter/s)": 0.100721
},
{
"epoch": 1.9964871194379392,
"grad_norm": 0.2454347014427185,
"learning_rate": 5.522909442211708e-06,
"loss": 0.3448106527328491,
"memory(GiB)": 127.52,
"step": 1705,
"token_acc": 0.8718723798596708,
"train_speed(iter/s)": 0.100717
},
{
"epoch": 2.002341920374707,
"grad_norm": 0.30603164434432983,
"learning_rate": 5.465273042854551e-06,
"loss": 0.3320322036743164,
"memory(GiB)": 127.52,
"step": 1710,
"token_acc": 0.8845191075650899,
"train_speed(iter/s)": 0.10054
},
{
"epoch": 2.0081967213114753,
"grad_norm": 0.26624929904937744,
"learning_rate": 5.407825662370778e-06,
"loss": 0.3192149639129639,
"memory(GiB)": 127.52,
"step": 1715,
"token_acc": 0.8862581577460744,
"train_speed(iter/s)": 0.100533
},
{
"epoch": 2.0140515222482436,
"grad_norm": 0.28559088706970215,
"learning_rate": 5.350569695312313e-06,
"loss": 0.3315494775772095,
"memory(GiB)": 127.52,
"step": 1720,
"token_acc": 0.8817901407312053,
"train_speed(iter/s)": 0.100527
},
{
"epoch": 2.019906323185012,
"grad_norm": 0.24132603406906128,
"learning_rate": 5.293507528252474e-06,
"loss": 0.3354511737823486,
"memory(GiB)": 127.52,
"step": 1725,
"token_acc": 0.8808201997328972,
"train_speed(iter/s)": 0.100523
},
{
"epoch": 2.0257611241217797,
"grad_norm": 0.25403663516044617,
"learning_rate": 5.236641539686518e-06,
"loss": 0.3226620197296143,
"memory(GiB)": 127.52,
"step": 1730,
"token_acc": 0.8806968959125817,
"train_speed(iter/s)": 0.10053
},
{
"epoch": 2.031615925058548,
"grad_norm": 0.24015206098556519,
"learning_rate": 5.179974099932472e-06,
"loss": 0.3161166667938232,
"memory(GiB)": 127.52,
"step": 1735,
"token_acc": 0.8794680331257753,
"train_speed(iter/s)": 0.100526
},
{
"epoch": 2.037470725995316,
"grad_norm": 0.2842601537704468,
"learning_rate": 5.12350757103236e-06,
"loss": 0.31528186798095703,
"memory(GiB)": 127.52,
"step": 1740,
"token_acc": 0.8833886035950154,
"train_speed(iter/s)": 0.10053
},
{
"epoch": 2.0433255269320845,
"grad_norm": 0.23931631445884705,
"learning_rate": 5.067244306653736e-06,
"loss": 0.32300970554351804,
"memory(GiB)": 127.52,
"step": 1745,
"token_acc": 0.8907401132070736,
"train_speed(iter/s)": 0.100533
},
{
"epoch": 2.0491803278688523,
"grad_norm": 0.25491324067115784,
"learning_rate": 5.0111866519915575e-06,
"loss": 0.31856546401977537,
"memory(GiB)": 127.52,
"step": 1750,
"token_acc": 0.8788062223735568,
"train_speed(iter/s)": 0.100534
},
{
"epoch": 2.0550351288056206,
"grad_norm": 0.2541966140270233,
"learning_rate": 4.95533694367047e-06,
"loss": 0.31543042659759524,
"memory(GiB)": 127.52,
"step": 1755,
"token_acc": 0.8854616459729288,
"train_speed(iter/s)": 0.100541
},
{
"epoch": 2.060889929742389,
"grad_norm": 0.250337690114975,
"learning_rate": 4.899697509647379e-06,
"loss": 0.32208833694458006,
"memory(GiB)": 127.52,
"step": 1760,
"token_acc": 0.8763743304143462,
"train_speed(iter/s)": 0.100545
},
{
"epoch": 2.066744730679157,
"grad_norm": 0.23674513399600983,
"learning_rate": 4.844270669114424e-06,
"loss": 0.32359483242034914,
"memory(GiB)": 127.52,
"step": 1765,
"token_acc": 0.8885440198244088,
"train_speed(iter/s)": 0.100551
},
{
"epoch": 2.072599531615925,
"grad_norm": 0.2509515881538391,
"learning_rate": 4.789058732402319e-06,
"loss": 0.3145972728729248,
"memory(GiB)": 127.52,
"step": 1770,
"token_acc": 0.8812067213755373,
"train_speed(iter/s)": 0.100554
},
{
"epoch": 2.078454332552693,
"grad_norm": 0.27846959233283997,
"learning_rate": 4.734064000884044e-06,
"loss": 0.3361539840698242,
"memory(GiB)": 127.52,
"step": 1775,
"token_acc": 0.8687031468980935,
"train_speed(iter/s)": 0.100561
},
{
"epoch": 2.0843091334894615,
"grad_norm": 0.2520703971385956,
"learning_rate": 4.679288766878908e-06,
"loss": 0.3277717590332031,
"memory(GiB)": 127.52,
"step": 1780,
"token_acc": 0.8835239754091976,
"train_speed(iter/s)": 0.100561
},
{
"epoch": 2.0901639344262297,
"grad_norm": 0.26310279965400696,
"learning_rate": 4.624735313557019e-06,
"loss": 0.32394185066223147,
"memory(GiB)": 127.52,
"step": 1785,
"token_acc": 0.8875730035291546,
"train_speed(iter/s)": 0.100566
},
{
"epoch": 2.0960187353629975,
"grad_norm": 0.2666696310043335,
"learning_rate": 4.570405914844105e-06,
"loss": 0.31819107532501223,
"memory(GiB)": 127.52,
"step": 1790,
"token_acc": 0.8859368071299645,
"train_speed(iter/s)": 0.100562
},
{
"epoch": 2.101873536299766,
"grad_norm": 0.25196680426597595,
"learning_rate": 4.516302835326723e-06,
"loss": 0.322560453414917,
"memory(GiB)": 127.52,
"step": 1795,
"token_acc": 0.8921213689835521,
"train_speed(iter/s)": 0.100564
},
{
"epoch": 2.107728337236534,
"grad_norm": 0.24787664413452148,
"learning_rate": 4.462428330157886e-06,
"loss": 0.3134110927581787,
"memory(GiB)": 127.52,
"step": 1800,
"token_acc": 0.8915973959679097,
"train_speed(iter/s)": 0.100565
},
{
"epoch": 2.113583138173302,
"grad_norm": 0.23812943696975708,
"learning_rate": 4.4087846449630475e-06,
"loss": 0.31724915504455564,
"memory(GiB)": 127.52,
"step": 1805,
"token_acc": 0.8883239519028294,
"train_speed(iter/s)": 0.100568
},
{
"epoch": 2.11943793911007,
"grad_norm": 0.2460552453994751,
"learning_rate": 4.355374015746493e-06,
"loss": 0.31520168781280516,
"memory(GiB)": 127.52,
"step": 1810,
"token_acc": 0.8825987185966718,
"train_speed(iter/s)": 0.100568
},
{
"epoch": 2.1252927400468384,
"grad_norm": 0.2627100646495819,
"learning_rate": 4.302198668798159e-06,
"loss": 0.3187079906463623,
"memory(GiB)": 127.52,
"step": 1815,
"token_acc": 0.8795669142641319,
"train_speed(iter/s)": 0.100574
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.23737181723117828,
"learning_rate": 4.249260820600813e-06,
"loss": 0.30634393692016604,
"memory(GiB)": 127.52,
"step": 1820,
"token_acc": 0.8882761935077175,
"train_speed(iter/s)": 0.100574
},
{
"epoch": 2.1370023419203745,
"grad_norm": 0.44100987911224365,
"learning_rate": 4.1965626777376766e-06,
"loss": 0.3143752574920654,
"memory(GiB)": 127.52,
"step": 1825,
"token_acc": 0.8907455736843094,
"train_speed(iter/s)": 0.100576
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.243091881275177,
"learning_rate": 4.144106436800453e-06,
"loss": 0.32144436836242674,
"memory(GiB)": 127.52,
"step": 1830,
"token_acc": 0.8904153173473116,
"train_speed(iter/s)": 0.100586
},
{
"epoch": 2.148711943793911,
"grad_norm": 0.22646024823188782,
"learning_rate": 4.091894284297758e-06,
"loss": 0.3123732089996338,
"memory(GiB)": 127.52,
"step": 1835,
"token_acc": 0.8785402692433979,
"train_speed(iter/s)": 0.100589
},
{
"epoch": 2.1545667447306793,
"grad_norm": 0.2700958549976349,
"learning_rate": 4.039928396563983e-06,
"loss": 0.33238074779510496,
"memory(GiB)": 127.52,
"step": 1840,
"token_acc": 0.8842443529070076,
"train_speed(iter/s)": 0.10059
},
{
"epoch": 2.160421545667447,
"grad_norm": 0.2499818056821823,
"learning_rate": 3.9882109396685845e-06,
"loss": 0.30622167587280275,
"memory(GiB)": 127.52,
"step": 1845,
"token_acc": 0.8795685480484824,
"train_speed(iter/s)": 0.100591
},
{
"epoch": 2.1662763466042154,
"grad_norm": 0.22730578482151031,
"learning_rate": 3.936744069325797e-06,
"loss": 0.3057937860488892,
"memory(GiB)": 127.52,
"step": 1850,
"token_acc": 0.8902019848511362,
"train_speed(iter/s)": 0.100589
},
{
"epoch": 2.1721311475409837,
"grad_norm": 0.23967498540878296,
"learning_rate": 3.885529930804768e-06,
"loss": 0.3023227214813232,
"memory(GiB)": 127.52,
"step": 1855,
"token_acc": 0.8807274179657759,
"train_speed(iter/s)": 0.100589
},
{
"epoch": 2.177985948477752,
"grad_norm": 0.2622321844100952,
"learning_rate": 3.834570658840152e-06,
"loss": 0.32261273860931394,
"memory(GiB)": 127.52,
"step": 1860,
"token_acc": 0.8792452360659205,
"train_speed(iter/s)": 0.100591
},
{
"epoch": 2.1838407494145198,
"grad_norm": 0.23954476416110992,
"learning_rate": 3.7838683775431106e-06,
"loss": 0.31424174308776853,
"memory(GiB)": 127.52,
"step": 1865,
"token_acc": 0.8843662495044312,
"train_speed(iter/s)": 0.100597
},
{
"epoch": 2.189695550351288,
"grad_norm": 0.23363274335861206,
"learning_rate": 3.733425200312797e-06,
"loss": 0.316208815574646,
"memory(GiB)": 127.52,
"step": 1870,
"token_acc": 0.876293130342547,
"train_speed(iter/s)": 0.100602
},
{
"epoch": 2.1955503512880563,
"grad_norm": 0.24841627478599548,
"learning_rate": 3.683243229748249e-06,
"loss": 0.3097521781921387,
"memory(GiB)": 127.52,
"step": 1875,
"token_acc": 0.8804246009543149,
"train_speed(iter/s)": 0.100606
},
{
"epoch": 2.201405152224824,
"grad_norm": 0.25356635451316833,
"learning_rate": 3.633324557560747e-06,
"loss": 0.31675851345062256,
"memory(GiB)": 127.52,
"step": 1880,
"token_acc": 0.8871838137645497,
"train_speed(iter/s)": 0.10061
},
{
"epoch": 2.2072599531615924,
"grad_norm": 0.2366763949394226,
"learning_rate": 3.5836712644866277e-06,
"loss": 0.30890917778015137,
"memory(GiB)": 127.52,
"step": 1885,
"token_acc": 0.8819356314491541,
"train_speed(iter/s)": 0.100613
},
{
"epoch": 2.2131147540983607,
"grad_norm": 0.24897019565105438,
"learning_rate": 3.5342854202005696e-06,
"loss": 0.31049222946166993,
"memory(GiB)": 127.52,
"step": 1890,
"token_acc": 0.8878919948532936,
"train_speed(iter/s)": 0.100619
},
{
"epoch": 2.218969555035129,
"grad_norm": 0.239404559135437,
"learning_rate": 3.485169083229293e-06,
"loss": 0.31925191879272463,
"memory(GiB)": 127.52,
"step": 1895,
"token_acc": 0.8928798404593369,
"train_speed(iter/s)": 0.100627
},
{
"epoch": 2.2248243559718968,
"grad_norm": 0.2341826856136322,
"learning_rate": 3.4363243008657842e-06,
"loss": 0.31410508155822753,
"memory(GiB)": 127.52,
"step": 1900,
"token_acc": 0.8741590609526956,
"train_speed(iter/s)": 0.100624
},
{
"epoch": 2.230679156908665,
"grad_norm": 0.24927052855491638,
"learning_rate": 3.3877531090839478e-06,
"loss": 0.3199175834655762,
"memory(GiB)": 127.52,
"step": 1905,
"token_acc": 0.8767657620459692,
"train_speed(iter/s)": 0.100628
},
{
"epoch": 2.2365339578454333,
"grad_norm": 0.2401537299156189,
"learning_rate": 3.3394575324537327e-06,
"loss": 0.3235038757324219,
"memory(GiB)": 127.52,
"step": 1910,
"token_acc": 0.8763058505839384,
"train_speed(iter/s)": 0.100623
},
{
"epoch": 2.2423887587822016,
"grad_norm": 0.23076413571834564,
"learning_rate": 3.2914395840567605e-06,
"loss": 0.31050064563751223,
"memory(GiB)": 127.52,
"step": 1915,
"token_acc": 0.8874926079243052,
"train_speed(iter/s)": 0.100622
},
{
"epoch": 2.2482435597189694,
"grad_norm": 0.2379971295595169,
"learning_rate": 3.2437012654024057e-06,
"loss": 0.3159012317657471,
"memory(GiB)": 127.52,
"step": 1920,
"token_acc": 0.8895969009656411,
"train_speed(iter/s)": 0.100622
},
{
"epoch": 2.2540983606557377,
"grad_norm": 0.23007337749004364,
"learning_rate": 3.1962445663443643e-06,
"loss": 0.31895716190338136,
"memory(GiB)": 127.52,
"step": 1925,
"token_acc": 0.8823520222942871,
"train_speed(iter/s)": 0.100616
},
{
"epoch": 2.259953161592506,
"grad_norm": 0.2437550276517868,
"learning_rate": 3.1490714649977196e-06,
"loss": 0.3226035118103027,
"memory(GiB)": 127.52,
"step": 1930,
"token_acc": 0.8907227393284292,
"train_speed(iter/s)": 0.100614
},
{
"epoch": 2.265807962529274,
"grad_norm": 0.2513379454612732,
"learning_rate": 3.102183927656488e-06,
"loss": 0.31055560111999514,
"memory(GiB)": 127.52,
"step": 1935,
"token_acc": 0.8758090614886731,
"train_speed(iter/s)": 0.100617
},
{
"epoch": 2.271662763466042,
"grad_norm": 0.23778940737247467,
"learning_rate": 3.0555839087116547e-06,
"loss": 0.32387375831604004,
"memory(GiB)": 127.52,
"step": 1940,
"token_acc": 0.887034375,
"train_speed(iter/s)": 0.10062
},
{
"epoch": 2.2775175644028103,
"grad_norm": 0.26385143399238586,
"learning_rate": 3.009273350569705e-06,
"loss": 0.32143163681030273,
"memory(GiB)": 127.52,
"step": 1945,
"token_acc": 0.8916146423189599,
"train_speed(iter/s)": 0.100632
},
{
"epoch": 2.2833723653395785,
"grad_norm": 0.23078720271587372,
"learning_rate": 2.963254183571682e-06,
"loss": 0.31597721576690674,
"memory(GiB)": 127.52,
"step": 1950,
"token_acc": 0.8873806150822559,
"train_speed(iter/s)": 0.10063
},
{
"epoch": 2.289227166276347,
"grad_norm": 0.23988991975784302,
"learning_rate": 2.9175283259126943e-06,
"loss": 0.31755337715148924,
"memory(GiB)": 127.52,
"step": 1955,
"token_acc": 0.8924940331886264,
"train_speed(iter/s)": 0.100631
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.23374050855636597,
"learning_rate": 2.872097683561986e-06,
"loss": 0.3156282424926758,
"memory(GiB)": 127.52,
"step": 1960,
"token_acc": 0.8946095897383691,
"train_speed(iter/s)": 0.100632
},
{
"epoch": 2.300936768149883,
"grad_norm": 0.22969146072864532,
"learning_rate": 2.8269641501834834e-06,
"loss": 0.32587299346923826,
"memory(GiB)": 127.52,
"step": 1965,
"token_acc": 0.8774885813450646,
"train_speed(iter/s)": 0.100637
},
{
"epoch": 2.306791569086651,
"grad_norm": 0.23242172598838806,
"learning_rate": 2.782129607056848e-06,
"loss": 0.31759541034698485,
"memory(GiB)": 127.52,
"step": 1970,
"token_acc": 0.8783747102265459,
"train_speed(iter/s)": 0.10064
},
{
"epoch": 2.312646370023419,
"grad_norm": 0.22935490310192108,
"learning_rate": 2.7375959229990856e-06,
"loss": 0.307840371131897,
"memory(GiB)": 127.52,
"step": 1975,
"token_acc": 0.8862128010598808,
"train_speed(iter/s)": 0.100639
},
{
"epoch": 2.3185011709601873,
"grad_norm": 0.2637212574481964,
"learning_rate": 2.6933649542866326e-06,
"loss": 0.3114126682281494,
"memory(GiB)": 127.52,
"step": 1980,
"token_acc": 0.8820059272541622,
"train_speed(iter/s)": 0.100646
},
{
"epoch": 2.3243559718969555,
"grad_norm": 0.22703419625759125,
"learning_rate": 2.649438544577977e-06,
"loss": 0.30065155029296875,
"memory(GiB)": 127.52,
"step": 1985,
"token_acc": 0.8849238586641156,
"train_speed(iter/s)": 0.100647
},
{
"epoch": 2.330210772833724,
"grad_norm": 0.22714027762413025,
"learning_rate": 2.6058185248368317e-06,
"loss": 0.3135934352874756,
"memory(GiB)": 127.52,
"step": 1990,
"token_acc": 0.8923622270535968,
"train_speed(iter/s)": 0.100647
},
{
"epoch": 2.3360655737704916,
"grad_norm": 0.23052531480789185,
"learning_rate": 2.562506713255789e-06,
"loss": 0.3088988304138184,
"memory(GiB)": 127.52,
"step": 1995,
"token_acc": 0.8901272198016593,
"train_speed(iter/s)": 0.100652
},
{
"epoch": 2.34192037470726,
"grad_norm": 0.2511214017868042,
"learning_rate": 2.519504915180555e-06,
"loss": 0.3128695487976074,
"memory(GiB)": 127.52,
"step": 2000,
"token_acc": 0.8865565346454385,
"train_speed(iter/s)": 0.100653
},
{
"epoch": 2.347775175644028,
"grad_norm": 0.23098479211330414,
"learning_rate": 2.4768149230346917e-06,
"loss": 0.3291048526763916,
"memory(GiB)": 127.52,
"step": 2005,
"token_acc": 0.8865806253889527,
"train_speed(iter/s)": 0.100648
},
{
"epoch": 2.3536299765807964,
"grad_norm": 0.2332172840833664,
"learning_rate": 2.4344385162448924e-06,
"loss": 0.31312854290008546,
"memory(GiB)": 127.52,
"step": 2010,
"token_acc": 0.8905434652297092,
"train_speed(iter/s)": 0.100649
},
{
"epoch": 2.3594847775175642,
"grad_norm": 0.229131281375885,
"learning_rate": 2.392377461166826e-06,
"loss": 0.3113706588745117,
"memory(GiB)": 127.52,
"step": 2015,
"token_acc": 0.889476325707392,
"train_speed(iter/s)": 0.100651
},
{
"epoch": 2.3653395784543325,
"grad_norm": 0.24932575225830078,
"learning_rate": 2.350633511011511e-06,
"loss": 0.3204165458679199,
"memory(GiB)": 127.52,
"step": 2020,
"token_acc": 0.8841538567415554,
"train_speed(iter/s)": 0.100647
},
{
"epoch": 2.371194379391101,
"grad_norm": 0.23387765884399414,
"learning_rate": 2.309208405772221e-06,
"loss": 0.32724220752716066,
"memory(GiB)": 127.52,
"step": 2025,
"token_acc": 0.8882853658229917,
"train_speed(iter/s)": 0.100652
},
{
"epoch": 2.3770491803278686,
"grad_norm": 0.24220742285251617,
"learning_rate": 2.2681038721519768e-06,
"loss": 0.33083477020263674,
"memory(GiB)": 127.52,
"step": 2030,
"token_acc": 0.8838624553173172,
"train_speed(iter/s)": 0.100651
},
{
"epoch": 2.382903981264637,
"grad_norm": 0.2579573690891266,
"learning_rate": 2.227321623491563e-06,
"loss": 0.3199321746826172,
"memory(GiB)": 127.52,
"step": 2035,
"token_acc": 0.8799424487730837,
"train_speed(iter/s)": 0.100653
},
{
"epoch": 2.388758782201405,
"grad_norm": 0.22851942479610443,
"learning_rate": 2.186863359698108e-06,
"loss": 0.3142981052398682,
"memory(GiB)": 127.52,
"step": 2040,
"token_acc": 0.9041223969400765,
"train_speed(iter/s)": 0.100653
},
{
"epoch": 2.3946135831381734,
"grad_norm": 0.24671818315982819,
"learning_rate": 2.1467307671742377e-06,
"loss": 0.31820495128631593,
"memory(GiB)": 127.52,
"step": 2045,
"token_acc": 0.8822625886964798,
"train_speed(iter/s)": 0.100657
},
{
"epoch": 2.4004683840749417,
"grad_norm": 0.2494201809167862,
"learning_rate": 2.106925518747779e-06,
"loss": 0.31292271614074707,
"memory(GiB)": 127.52,
"step": 2050,
"token_acc": 0.8868852561536922,
"train_speed(iter/s)": 0.100659
},
{
"epoch": 2.4063231850117095,
"grad_norm": 0.25766271352767944,
"learning_rate": 2.06744927360202e-06,
"loss": 0.315954852104187,
"memory(GiB)": 127.52,
"step": 2055,
"token_acc": 0.8844018739071213,
"train_speed(iter/s)": 0.100653
},
{
"epoch": 2.4121779859484778,
"grad_norm": 0.23304541409015656,
"learning_rate": 2.0283036772065712e-06,
"loss": 0.31738996505737305,
"memory(GiB)": 127.52,
"step": 2060,
"token_acc": 0.8888605233133514,
"train_speed(iter/s)": 0.100656
},
{
"epoch": 2.418032786885246,
"grad_norm": 0.23033016920089722,
"learning_rate": 1.9894903612487683e-06,
"loss": 0.32506499290466306,
"memory(GiB)": 127.52,
"step": 2065,
"token_acc": 0.8765848323481849,
"train_speed(iter/s)": 0.100657
},
{
"epoch": 2.423887587822014,
"grad_norm": 0.2522413730621338,
"learning_rate": 1.9510109435656457e-06,
"loss": 0.3240881681442261,
"memory(GiB)": 127.52,
"step": 2070,
"token_acc": 0.8874444430454654,
"train_speed(iter/s)": 0.10066
},
{
"epoch": 2.429742388758782,
"grad_norm": 0.23793016374111176,
"learning_rate": 1.9128670280765283e-06,
"loss": 0.326206374168396,
"memory(GiB)": 127.52,
"step": 2075,
"token_acc": 0.8811696876529852,
"train_speed(iter/s)": 0.100656
},
{
"epoch": 2.4355971896955504,
"grad_norm": 0.2260826826095581,
"learning_rate": 1.8750602047161603e-06,
"loss": 0.3155853748321533,
"memory(GiB)": 127.52,
"step": 2080,
"token_acc": 0.8918628516614084,
"train_speed(iter/s)": 0.100657
},
{
"epoch": 2.4414519906323187,
"grad_norm": 0.22915047407150269,
"learning_rate": 1.8375920493684264e-06,
"loss": 0.32075018882751466,
"memory(GiB)": 127.52,
"step": 2085,
"token_acc": 0.8806146127312637,
"train_speed(iter/s)": 0.100664
},
{
"epoch": 2.4473067915690865,
"grad_norm": 0.23555633425712585,
"learning_rate": 1.8004641238006815e-06,
"loss": 0.3198583126068115,
"memory(GiB)": 127.52,
"step": 2090,
"token_acc": 0.8878798889856471,
"train_speed(iter/s)": 0.100663
},
{
"epoch": 2.4531615925058547,
"grad_norm": 0.23224787414073944,
"learning_rate": 1.7636779755986443e-06,
"loss": 0.32527942657470704,
"memory(GiB)": 127.52,
"step": 2095,
"token_acc": 0.8808102158192161,
"train_speed(iter/s)": 0.100659
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.2313682585954666,
"learning_rate": 1.7272351381018792e-06,
"loss": 0.3221132278442383,
"memory(GiB)": 127.52,
"step": 2100,
"token_acc": 0.8723955898759107,
"train_speed(iter/s)": 0.10066
},
{
"epoch": 2.4648711943793913,
"grad_norm": 0.23031777143478394,
"learning_rate": 1.6911371303399048e-06,
"loss": 0.3093102931976318,
"memory(GiB)": 127.52,
"step": 2105,
"token_acc": 0.887525459211663,
"train_speed(iter/s)": 0.100655
},
{
"epoch": 2.470725995316159,
"grad_norm": 0.23843398690223694,
"learning_rate": 1.6553854569688632e-06,
"loss": 0.3248276710510254,
"memory(GiB)": 127.52,
"step": 2110,
"token_acc": 0.882843537798315,
"train_speed(iter/s)": 0.100654
},
{
"epoch": 2.4765807962529274,
"grad_norm": 0.23203721642494202,
"learning_rate": 1.619981608208796e-06,
"loss": 0.32454729080200195,
"memory(GiB)": 127.52,
"step": 2115,
"token_acc": 0.869970732560573,
"train_speed(iter/s)": 0.100657
},
{
"epoch": 2.4824355971896956,
"grad_norm": 0.23711416125297546,
"learning_rate": 1.584927059781548e-06,
"loss": 0.3233715295791626,
"memory(GiB)": 127.52,
"step": 2120,
"token_acc": 0.8797791727772037,
"train_speed(iter/s)": 0.100658
},
{
"epoch": 2.4882903981264635,
"grad_norm": 0.23975679278373718,
"learning_rate": 1.5502232728492362e-06,
"loss": 0.31569533348083495,
"memory(GiB)": 127.52,
"step": 2125,
"token_acc": 0.8874189972049156,
"train_speed(iter/s)": 0.100661
},
{
"epoch": 2.4941451990632317,
"grad_norm": 0.23424658179283142,
"learning_rate": 1.5158716939533524e-06,
"loss": 0.32528119087219237,
"memory(GiB)": 127.52,
"step": 2130,
"token_acc": 0.8848355062483098,
"train_speed(iter/s)": 0.100663
},
{
"epoch": 2.5,
"grad_norm": 0.2467930018901825,
"learning_rate": 1.4818737549544725e-06,
"loss": 0.3232418060302734,
"memory(GiB)": 127.52,
"step": 2135,
"token_acc": 0.8760404837079283,
"train_speed(iter/s)": 0.100669
},
{
"epoch": 2.5058548009367683,
"grad_norm": 0.23344840109348297,
"learning_rate": 1.448230872972568e-06,
"loss": 0.3205883979797363,
"memory(GiB)": 127.52,
"step": 2140,
"token_acc": 0.8896608528350288,
"train_speed(iter/s)": 0.100665
},
{
"epoch": 2.5117096018735365,
"grad_norm": 0.2276953160762787,
"learning_rate": 1.4149444503279297e-06,
"loss": 0.32780184745788576,
"memory(GiB)": 127.52,
"step": 2145,
"token_acc": 0.8763619018928553,
"train_speed(iter/s)": 0.100666
},
{
"epoch": 2.5175644028103044,
"grad_norm": 0.23720286786556244,
"learning_rate": 1.382015874482735e-06,
"loss": 0.3210983037948608,
"memory(GiB)": 127.52,
"step": 2150,
"token_acc": 0.8830952351167766,
"train_speed(iter/s)": 0.100669
},
{
"epoch": 2.5234192037470726,
"grad_norm": 0.2429177612066269,
"learning_rate": 1.3494465179831895e-06,
"loss": 0.31808924674987793,
"memory(GiB)": 127.52,
"step": 2155,
"token_acc": 0.8801182829610709,
"train_speed(iter/s)": 0.100671
},
{
"epoch": 2.529274004683841,
"grad_norm": 0.2192358821630478,
"learning_rate": 1.3172377384023393e-06,
"loss": 0.3137265682220459,
"memory(GiB)": 127.52,
"step": 2160,
"token_acc": 0.8851310631053786,
"train_speed(iter/s)": 0.100675
},
{
"epoch": 2.5351288056206087,
"grad_norm": 0.22843384742736816,
"learning_rate": 1.2853908782834722e-06,
"loss": 0.31639652252197265,
"memory(GiB)": 127.52,
"step": 2165,
"token_acc": 0.8930099545248551,
"train_speed(iter/s)": 0.100673
},
{
"epoch": 2.540983606557377,
"grad_norm": 0.23414385318756104,
"learning_rate": 1.2539072650841523e-06,
"loss": 0.32384276390075684,
"memory(GiB)": 127.52,
"step": 2170,
"token_acc": 0.8826712369541582,
"train_speed(iter/s)": 0.100679
},
{
"epoch": 2.5468384074941453,
"grad_norm": 0.2386016696691513,
"learning_rate": 1.2227882111209011e-06,
"loss": 0.3276023864746094,
"memory(GiB)": 127.52,
"step": 2175,
"token_acc": 0.876178791079083,
"train_speed(iter/s)": 0.10068
},
{
"epoch": 2.552693208430913,
"grad_norm": 0.23498761653900146,
"learning_rate": 1.1920350135144898e-06,
"loss": 0.3207254409790039,
"memory(GiB)": 127.52,
"step": 2180,
"token_acc": 0.8885690220875708,
"train_speed(iter/s)": 0.100681
},
{
"epoch": 2.5585480093676813,
"grad_norm": 0.23011547327041626,
"learning_rate": 1.1616489541358678e-06,
"loss": 0.3184302806854248,
"memory(GiB)": 127.52,
"step": 2185,
"token_acc": 0.8778273150286384,
"train_speed(iter/s)": 0.100682
},
{
"epoch": 2.5644028103044496,
"grad_norm": 0.22844338417053223,
"learning_rate": 1.1316312995527424e-06,
"loss": 0.3216708183288574,
"memory(GiB)": 127.52,
"step": 2190,
"token_acc": 0.8842230056468974,
"train_speed(iter/s)": 0.100685
},
{
"epoch": 2.570257611241218,
"grad_norm": 0.23386669158935547,
"learning_rate": 1.1019833009767744e-06,
"loss": 0.3198892831802368,
"memory(GiB)": 127.52,
"step": 2195,
"token_acc": 0.881730841074942,
"train_speed(iter/s)": 0.100684
},
{
"epoch": 2.576112412177986,
"grad_norm": 0.23416638374328613,
"learning_rate": 1.072706194211426e-06,
"loss": 0.32181246280670167,
"memory(GiB)": 127.52,
"step": 2200,
"token_acc": 0.8872248114887651,
"train_speed(iter/s)": 0.100687
},
{
"epoch": 2.581967213114754,
"grad_norm": 0.232351616024971,
"learning_rate": 1.0438011996004581e-06,
"loss": 0.32013840675354005,
"memory(GiB)": 127.52,
"step": 2205,
"token_acc": 0.8815920274367514,
"train_speed(iter/s)": 0.100688
},
{
"epoch": 2.5878220140515222,
"grad_norm": 0.24018974602222443,
"learning_rate": 1.0152695219770558e-06,
"loss": 0.3074916124343872,
"memory(GiB)": 127.52,
"step": 2210,
"token_acc": 0.8911461159004883,
"train_speed(iter/s)": 0.100686
},
{
"epoch": 2.5936768149882905,
"grad_norm": 0.2339586764574051,
"learning_rate": 9.871123506136037e-07,
"loss": 0.3152151107788086,
"memory(GiB)": 127.52,
"step": 2215,
"token_acc": 0.8945800996908322,
"train_speed(iter/s)": 0.100689
},
{
"epoch": 2.5995316159250583,
"grad_norm": 0.23918944597244263,
"learning_rate": 9.593308591721274e-07,
"loss": 0.3115771532058716,
"memory(GiB)": 127.52,
"step": 2220,
"token_acc": 0.8863534338516209,
"train_speed(iter/s)": 0.100692
},
{
"epoch": 2.6053864168618266,
"grad_norm": 0.228268101811409,
"learning_rate": 9.319262056553602e-07,
"loss": 0.3226304531097412,
"memory(GiB)": 127.52,
"step": 2225,
"token_acc": 0.8902835788085294,
"train_speed(iter/s)": 0.10069
},
{
"epoch": 2.611241217798595,
"grad_norm": 0.23581595718860626,
"learning_rate": 9.048995323584764e-07,
"loss": 0.3258847713470459,
"memory(GiB)": 127.52,
"step": 2230,
"token_acc": 0.8929581827894788,
"train_speed(iter/s)": 0.10069
},
{
"epoch": 2.617096018735363,
"grad_norm": 0.4460615813732147,
"learning_rate": 8.78251965821485e-07,
"loss": 0.3083215236663818,
"memory(GiB)": 127.52,
"step": 2235,
"token_acc": 0.8851051496528254,
"train_speed(iter/s)": 0.10069
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.23269429802894592,
"learning_rate": 8.519846167822665e-07,
"loss": 0.31586997509002684,
"memory(GiB)": 127.52,
"step": 2240,
"token_acc": 0.8981023709170914,
"train_speed(iter/s)": 0.100691
},
{
"epoch": 2.628805620608899,
"grad_norm": 0.608095645904541,
"learning_rate": 8.260985801302734e-07,
"loss": 0.30504627227783204,
"memory(GiB)": 127.52,
"step": 2245,
"token_acc": 0.8836382464618571,
"train_speed(iter/s)": 0.100692
},
{
"epoch": 2.6346604215456675,
"grad_norm": 0.22992344200611115,
"learning_rate": 8.005949348608977e-07,
"loss": 0.31817898750305174,
"memory(GiB)": 127.52,
"step": 2250,
"token_acc": 0.8803807403423412,
"train_speed(iter/s)": 0.100694
},
{
"epoch": 2.6405152224824358,
"grad_norm": 0.2216484099626541,
"learning_rate": 7.754747440304911e-07,
"loss": 0.3218961000442505,
"memory(GiB)": 127.52,
"step": 2255,
"token_acc": 0.8802025202800865,
"train_speed(iter/s)": 0.1007
},
{
"epoch": 2.6463700234192036,
"grad_norm": 0.22643844783306122,
"learning_rate": 7.507390547120541e-07,
"loss": 0.31406736373901367,
"memory(GiB)": 127.52,
"step": 2260,
"token_acc": 0.8841787048704839,
"train_speed(iter/s)": 0.100704
},
{
"epoch": 2.652224824355972,
"grad_norm": 0.22945396602153778,
"learning_rate": 7.263888979515954e-07,
"loss": 0.32517061233520506,
"memory(GiB)": 127.52,
"step": 2265,
"token_acc": 0.8788511831616095,
"train_speed(iter/s)": 0.10071
},
{
"epoch": 2.65807962529274,
"grad_norm": 0.22719787061214447,
"learning_rate": 7.024252887251548e-07,
"loss": 0.31670680046081545,
"memory(GiB)": 127.52,
"step": 2270,
"token_acc": 0.8838603030141137,
"train_speed(iter/s)": 0.100707
},
{
"epoch": 2.663934426229508,
"grad_norm": 0.2364586889743805,
"learning_rate": 6.788492258964896e-07,
"loss": 0.3206209659576416,
"memory(GiB)": 127.52,
"step": 2275,
"token_acc": 0.8808837716472833,
"train_speed(iter/s)": 0.100707
},
{
"epoch": 2.669789227166276,
"grad_norm": 0.23205353319644928,
"learning_rate": 6.556616921754489e-07,
"loss": 0.3177974224090576,
"memory(GiB)": 127.52,
"step": 2280,
"token_acc": 0.8846845210507196,
"train_speed(iter/s)": 0.100709
},
{
"epoch": 2.6756440281030445,
"grad_norm": 0.23928001523017883,
"learning_rate": 6.328636540770028e-07,
"loss": 0.3218786001205444,
"memory(GiB)": 127.52,
"step": 2285,
"token_acc": 0.8839321457165733,
"train_speed(iter/s)": 0.10071
},
{
"epoch": 2.6814988290398127,
"grad_norm": 0.22948609292507172,
"learning_rate": 6.10456061880963e-07,
"loss": 0.32559771537780763,
"memory(GiB)": 127.52,
"step": 2290,
"token_acc": 0.888954265344254,
"train_speed(iter/s)": 0.10071
},
{
"epoch": 2.687353629976581,
"grad_norm": 0.22480416297912598,
"learning_rate": 5.884398495923727e-07,
"loss": 0.31432313919067384,
"memory(GiB)": 127.52,
"step": 2295,
"token_acc": 0.8786473253733409,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 2.693208430913349,
"grad_norm": 0.49891427159309387,
"learning_rate": 5.668159349025649e-07,
"loss": 0.33366761207580564,
"memory(GiB)": 127.52,
"step": 2300,
"token_acc": 0.8706380208333333,
"train_speed(iter/s)": 0.100713
},
{
"epoch": 2.699063231850117,
"grad_norm": 0.23788191378116608,
"learning_rate": 5.455852191509214e-07,
"loss": 0.326168417930603,
"memory(GiB)": 127.52,
"step": 2305,
"token_acc": 0.8757156059468948,
"train_speed(iter/s)": 0.100714
},
{
"epoch": 2.7049180327868854,
"grad_norm": 0.23934431374073029,
"learning_rate": 5.247485872873026e-07,
"loss": 0.3131624460220337,
"memory(GiB)": 127.52,
"step": 2310,
"token_acc": 0.8873159330925727,
"train_speed(iter/s)": 0.100715
},
{
"epoch": 2.710772833723653,
"grad_norm": 0.22434021532535553,
"learning_rate": 5.043069078351526e-07,
"loss": 0.3083023548126221,
"memory(GiB)": 127.52,
"step": 2315,
"token_acc": 0.8900379146919432,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 2.7166276346604215,
"grad_norm": 0.2241913378238678,
"learning_rate": 4.842610328552999e-07,
"loss": 0.31645286083221436,
"memory(GiB)": 127.52,
"step": 2320,
"token_acc": 0.8860757524370778,
"train_speed(iter/s)": 0.100719
},
{
"epoch": 2.7224824355971897,
"grad_norm": 0.22683191299438477,
"learning_rate": 4.6461179791044806e-07,
"loss": 0.3162517547607422,
"memory(GiB)": 127.52,
"step": 2325,
"token_acc": 0.8806341851421645,
"train_speed(iter/s)": 0.100722
},
{
"epoch": 2.728337236533958,
"grad_norm": 0.22332416474819183,
"learning_rate": 4.453600220303378e-07,
"loss": 0.3006160736083984,
"memory(GiB)": 127.52,
"step": 2330,
"token_acc": 0.8811269139759368,
"train_speed(iter/s)": 0.100726
},
{
"epoch": 2.7341920374707263,
"grad_norm": 0.2320730835199356,
"learning_rate": 4.2650650767761535e-07,
"loss": 0.3053130149841309,
"memory(GiB)": 127.52,
"step": 2335,
"token_acc": 0.8909103410770822,
"train_speed(iter/s)": 0.100726
},
{
"epoch": 2.740046838407494,
"grad_norm": 0.2575525939464569,
"learning_rate": 4.0805204071437953e-07,
"loss": 0.32894713878631593,
"memory(GiB)": 127.52,
"step": 2340,
"token_acc": 0.880288983757294,
"train_speed(iter/s)": 0.100724
},
{
"epoch": 2.7459016393442623,
"grad_norm": 0.2190413624048233,
"learning_rate": 3.899973903694243e-07,
"loss": 0.32172608375549316,
"memory(GiB)": 127.52,
"step": 2345,
"token_acc": 0.8842697990204148,
"train_speed(iter/s)": 0.100724
},
{
"epoch": 2.7517564402810306,
"grad_norm": 0.22509151697158813,
"learning_rate": 3.72343309206179e-07,
"loss": 0.31258511543273926,
"memory(GiB)": 127.52,
"step": 2350,
"token_acc": 0.8854250593299245,
"train_speed(iter/s)": 0.100723
},
{
"epoch": 2.7576112412177984,
"grad_norm": 0.22671233117580414,
"learning_rate": 3.55090533091339e-07,
"loss": 0.3143455028533936,
"memory(GiB)": 127.52,
"step": 2355,
"token_acc": 0.896848520654861,
"train_speed(iter/s)": 0.10072
},
{
"epoch": 2.7634660421545667,
"grad_norm": 0.21764405071735382,
"learning_rate": 3.382397811641858e-07,
"loss": 0.3072871208190918,
"memory(GiB)": 127.52,
"step": 2360,
"token_acc": 0.8893455142073456,
"train_speed(iter/s)": 0.100725
},
{
"epoch": 2.769320843091335,
"grad_norm": 0.22008980810642242,
"learning_rate": 3.217917558066241e-07,
"loss": 0.31331815719604494,
"memory(GiB)": 127.52,
"step": 2365,
"token_acc": 0.8801702516246458,
"train_speed(iter/s)": 0.100727
},
{
"epoch": 2.775175644028103,
"grad_norm": 0.2225882112979889,
"learning_rate": 3.057471426138958e-07,
"loss": 0.3275087833404541,
"memory(GiB)": 127.52,
"step": 2370,
"token_acc": 0.8743533027834035,
"train_speed(iter/s)": 0.100726
},
{
"epoch": 2.781030444964871,
"grad_norm": 0.22171831130981445,
"learning_rate": 2.901066103660033e-07,
"loss": 0.3129570484161377,
"memory(GiB)": 127.52,
"step": 2375,
"token_acc": 0.8872727501597082,
"train_speed(iter/s)": 0.100728
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.2355940192937851,
"learning_rate": 2.7487081099983435e-07,
"loss": 0.32728214263916017,
"memory(GiB)": 127.52,
"step": 2380,
"token_acc": 0.882063511039243,
"train_speed(iter/s)": 0.100731
},
{
"epoch": 2.7927400468384076,
"grad_norm": 0.21898697316646576,
"learning_rate": 2.6004037958199167e-07,
"loss": 0.31028578281402586,
"memory(GiB)": 127.52,
"step": 2385,
"token_acc": 0.8959504867399893,
"train_speed(iter/s)": 0.100732
},
{
"epoch": 2.798594847775176,
"grad_norm": 0.22940264642238617,
"learning_rate": 2.4561593428231165e-07,
"loss": 0.3168987274169922,
"memory(GiB)": 127.52,
"step": 2390,
"token_acc": 0.9043824201593208,
"train_speed(iter/s)": 0.100729
},
{
"epoch": 2.8044496487119437,
"grad_norm": 0.22128568589687347,
"learning_rate": 2.3159807634811182e-07,
"loss": 0.30646657943725586,
"memory(GiB)": 127.52,
"step": 2395,
"token_acc": 0.890519620223563,
"train_speed(iter/s)": 0.10073
},
{
"epoch": 2.810304449648712,
"grad_norm": 0.23035509884357452,
"learning_rate": 2.1798739007911517e-07,
"loss": 0.321412467956543,
"memory(GiB)": 127.52,
"step": 2400,
"token_acc": 0.8813866834368367,
"train_speed(iter/s)": 0.100729
},
{
"epoch": 2.8161592505854802,
"grad_norm": 0.22361230850219727,
"learning_rate": 2.0478444280310206e-07,
"loss": 0.314456582069397,
"memory(GiB)": 127.52,
"step": 2405,
"token_acc": 0.8847936237191627,
"train_speed(iter/s)": 0.100733
},
{
"epoch": 2.822014051522248,
"grad_norm": 0.248680979013443,
"learning_rate": 1.919897848522656e-07,
"loss": 0.31545486450195315,
"memory(GiB)": 127.52,
"step": 2410,
"token_acc": 0.8842675175238047,
"train_speed(iter/s)": 0.100732
},
{
"epoch": 2.8278688524590163,
"grad_norm": 0.2220403105020523,
"learning_rate": 1.796039495402646e-07,
"loss": 0.3194711923599243,
"memory(GiB)": 127.52,
"step": 2415,
"token_acc": 0.889650254732648,
"train_speed(iter/s)": 0.100731
},
{
"epoch": 2.8337236533957846,
"grad_norm": 0.23251083493232727,
"learning_rate": 1.6762745313999795e-07,
"loss": 0.32554826736450193,
"memory(GiB)": 127.52,
"step": 2420,
"token_acc": 0.8688351785435834,
"train_speed(iter/s)": 0.100728
},
{
"epoch": 2.839578454332553,
"grad_norm": 0.2339450716972351,
"learning_rate": 1.5606079486208846e-07,
"loss": 0.3137704372406006,
"memory(GiB)": 127.52,
"step": 2425,
"token_acc": 0.8856111133651886,
"train_speed(iter/s)": 0.100732
},
{
"epoch": 2.845433255269321,
"grad_norm": 0.22966544330120087,
"learning_rate": 1.449044568340663e-07,
"loss": 0.32210094928741456,
"memory(GiB)": 127.52,
"step": 2430,
"token_acc": 0.8884470889772489,
"train_speed(iter/s)": 0.100732
},
{
"epoch": 2.851288056206089,
"grad_norm": 0.24191494286060333,
"learning_rate": 1.3415890408027932e-07,
"loss": 0.31206402778625486,
"memory(GiB)": 127.52,
"step": 2435,
"token_acc": 0.8830502196115786,
"train_speed(iter/s)": 0.100731
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.23956511914730072,
"learning_rate": 1.2382458450250657e-07,
"loss": 0.32455346584320066,
"memory(GiB)": 127.52,
"step": 2440,
"token_acc": 0.8758227950966726,
"train_speed(iter/s)": 0.100735
},
{
"epoch": 2.8629976580796255,
"grad_norm": 0.22552776336669922,
"learning_rate": 1.1390192886129304e-07,
"loss": 0.3120935678482056,
"memory(GiB)": 127.52,
"step": 2445,
"token_acc": 0.897060631760815,
"train_speed(iter/s)": 0.100735
},
{
"epoch": 2.8688524590163933,
"grad_norm": 0.2666381001472473,
"learning_rate": 1.0439135075798634e-07,
"loss": 0.3291801452636719,
"memory(GiB)": 127.52,
"step": 2450,
"token_acc": 0.8820067150139295,
"train_speed(iter/s)": 0.100741
},
{
"epoch": 2.8747072599531616,
"grad_norm": 0.22115741670131683,
"learning_rate": 9.529324661750494e-08,
"loss": 0.32175321578979493,
"memory(GiB)": 127.52,
"step": 2455,
"token_acc": 0.8775227487104135,
"train_speed(iter/s)": 0.100739
},
{
"epoch": 2.88056206088993,
"grad_norm": 0.22983959317207336,
"learning_rate": 8.6607995671808e-08,
"loss": 0.31844320297241213,
"memory(GiB)": 127.52,
"step": 2460,
"token_acc": 0.8813101879265747,
"train_speed(iter/s)": 0.10074
},
{
"epoch": 2.8864168618266977,
"grad_norm": 0.23733210563659668,
"learning_rate": 7.833595994409248e-08,
"loss": 0.3080190658569336,
"memory(GiB)": 127.52,
"step": 2465,
"token_acc": 0.88289333750391,
"train_speed(iter/s)": 0.100738
},
{
"epoch": 2.892271662763466,
"grad_norm": 0.24082650244235992,
"learning_rate": 7.047748423370193e-08,
"loss": 0.3234051465988159,
"memory(GiB)": 127.52,
"step": 2470,
"token_acc": 0.8791906373996674,
"train_speed(iter/s)": 0.100744
},
{
"epoch": 2.898126463700234,
"grad_norm": 0.24151204526424408,
"learning_rate": 6.303289610175233e-08,
"loss": 0.31094648838043215,
"memory(GiB)": 127.52,
"step": 2475,
"token_acc": 0.8864608150470219,
"train_speed(iter/s)": 0.100743
},
{
"epoch": 2.9039812646370025,
"grad_norm": 0.23166167736053467,
"learning_rate": 5.6002505857480906e-08,
"loss": 0.3175530910491943,
"memory(GiB)": 127.52,
"step": 2480,
"token_acc": 0.8859342832291451,
"train_speed(iter/s)": 0.100739
},
{
"epoch": 2.9098360655737707,
"grad_norm": 0.22753314673900604,
"learning_rate": 4.938660654530969e-08,
"loss": 0.3289816379547119,
"memory(GiB)": 127.52,
"step": 2485,
"token_acc": 0.8799638876393262,
"train_speed(iter/s)": 0.100739
},
{
"epoch": 2.9156908665105385,
"grad_norm": 0.22824768722057343,
"learning_rate": 4.318547393263317e-08,
"loss": 0.33161611557006837,
"memory(GiB)": 127.52,
"step": 2490,
"token_acc": 0.8840203211591419,
"train_speed(iter/s)": 0.100737
},
{
"epoch": 2.921545667447307,
"grad_norm": 0.2232208400964737,
"learning_rate": 3.739936649832188e-08,
"loss": 0.31346931457519533,
"memory(GiB)": 127.52,
"step": 2495,
"token_acc": 0.8866209251707488,
"train_speed(iter/s)": 0.100742
},
{
"epoch": 2.927400468384075,
"grad_norm": 0.22846031188964844,
"learning_rate": 3.2028525421946563e-08,
"loss": 0.31502933502197267,
"memory(GiB)": 127.52,
"step": 2500,
"token_acc": 0.8958872772065662,
"train_speed(iter/s)": 0.100746
},
{
"epoch": 2.933255269320843,
"grad_norm": 0.22012905776500702,
"learning_rate": 2.70731745737296e-08,
"loss": 0.317963695526123,
"memory(GiB)": 127.52,
"step": 2505,
"token_acc": 0.8870393801646438,
"train_speed(iter/s)": 0.100749
},
{
"epoch": 2.939110070257611,
"grad_norm": 0.22778548300266266,
"learning_rate": 2.2533520505211294e-08,
"loss": 0.3122371196746826,
"memory(GiB)": 127.52,
"step": 2510,
"token_acc": 0.888907967032967,
"train_speed(iter/s)": 0.100751
},
{
"epoch": 2.9449648711943794,
"grad_norm": 0.22804217040538788,
"learning_rate": 1.8409752440639027e-08,
"loss": 0.3041959524154663,
"memory(GiB)": 127.52,
"step": 2515,
"token_acc": 0.8861121607989981,
"train_speed(iter/s)": 0.100754
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.2233329713344574,
"learning_rate": 1.470204226908134e-08,
"loss": 0.32151806354522705,
"memory(GiB)": 127.52,
"step": 2520,
"token_acc": 0.8879425846286458,
"train_speed(iter/s)": 0.100749
},
{
"epoch": 2.9566744730679155,
"grad_norm": 0.24781863391399384,
"learning_rate": 1.1410544537263645e-08,
"loss": 0.32978765964508056,
"memory(GiB)": 127.52,
"step": 2525,
"token_acc": 0.8869459116971757,
"train_speed(iter/s)": 0.100749
},
{
"epoch": 2.962529274004684,
"grad_norm": 0.22210603952407837,
"learning_rate": 8.535396443124511e-09,
"loss": 0.30834412574768066,
"memory(GiB)": 127.52,
"step": 2530,
"token_acc": 0.8843790902885199,
"train_speed(iter/s)": 0.100751
},
{
"epoch": 2.968384074941452,
"grad_norm": 0.22260542213916779,
"learning_rate": 6.076717830098e-09,
"loss": 0.31018791198730467,
"memory(GiB)": 127.52,
"step": 2535,
"token_acc": 0.8947010997127103,
"train_speed(iter/s)": 0.10075
},
{
"epoch": 2.9742388758782203,
"grad_norm": 0.24026013910770416,
"learning_rate": 4.034611182121007e-09,
"loss": 0.3117814064025879,
"memory(GiB)": 127.52,
"step": 2540,
"token_acc": 0.8939134081534292,
"train_speed(iter/s)": 0.100749
},
{
"epoch": 2.980093676814988,
"grad_norm": 0.22812722623348236,
"learning_rate": 2.40916161935445e-09,
"loss": 0.31728358268737794,
"memory(GiB)": 127.52,
"step": 2545,
"token_acc": 0.883892058363205,
"train_speed(iter/s)": 0.10075
},
{
"epoch": 2.9859484777517564,
"grad_norm": 0.2219596952199936,
"learning_rate": 1.2004368946427758e-09,
"loss": 0.31175081729888915,
"memory(GiB)": 127.52,
"step": 2550,
"token_acc": 0.8867498701584854,
"train_speed(iter/s)": 0.100752
},
{
"epoch": 2.9918032786885247,
"grad_norm": 0.22541016340255737,
"learning_rate": 4.084873906851083e-10,
"loss": 0.31843390464782717,
"memory(GiB)": 127.52,
"step": 2555,
"token_acc": 0.893655570084918,
"train_speed(iter/s)": 0.10075
},
{
"epoch": 2.9976580796252925,
"grad_norm": 0.22078001499176025,
"learning_rate": 3.334611793692766e-11,
"loss": 0.31821532249450685,
"memory(GiB)": 127.52,
"step": 2560,
"token_acc": 0.8979642133800124,
"train_speed(iter/s)": 0.100751
}
],
"logging_steps": 5,
"max_steps": 2562,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1575512474484736.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}