{
"best_metric": 0.19586918,
"best_model_checkpoint": "/share/project/gsai/kch/output/v9-20250120-041149/checkpoint-614",
"epoch": 4.996548223350254,
"eval_steps": 500,
"global_step": 1535,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003248730964467005,
"grad_norm": 9.773202050760368,
"learning_rate": 1.2987012987012988e-06,
"loss": 1.5496090650558472,
"memory(GiB)": 35.94,
"step": 1,
"token_acc": 0.7444444444444445,
"train_speed(iter/s)": 0.013018
},
{
"epoch": 0.016243654822335026,
"grad_norm": 9.248677372850217,
"learning_rate": 6.493506493506493e-06,
"loss": 1.707068681716919,
"memory(GiB)": 36.33,
"step": 5,
"token_acc": 0.7226890756302521,
"train_speed(iter/s)": 0.016033
},
{
"epoch": 0.03248730964467005,
"grad_norm": 7.883848099402922,
"learning_rate": 1.2987012987012986e-05,
"loss": 1.7282501220703126,
"memory(GiB)": 36.33,
"step": 10,
"token_acc": 0.6761363636363636,
"train_speed(iter/s)": 0.016046
},
{
"epoch": 0.048730964467005075,
"grad_norm": 6.0889640814527155,
"learning_rate": 1.9480519480519483e-05,
"loss": 1.2976716995239257,
"memory(GiB)": 36.33,
"step": 15,
"token_acc": 0.7266355140186916,
"train_speed(iter/s)": 0.015999
},
{
"epoch": 0.0649746192893401,
"grad_norm": 2.6158303198283113,
"learning_rate": 2.5974025974025972e-05,
"loss": 0.7637146949768067,
"memory(GiB)": 36.33,
"step": 20,
"token_acc": 0.8190709046454768,
"train_speed(iter/s)": 0.016127
},
{
"epoch": 0.08121827411167512,
"grad_norm": 1.1712343980644169,
"learning_rate": 3.246753246753247e-05,
"loss": 0.5213486194610596,
"memory(GiB)": 36.91,
"step": 25,
"token_acc": 0.8802992518703242,
"train_speed(iter/s)": 0.016159
},
{
"epoch": 0.09746192893401015,
"grad_norm": 1.4361934956753106,
"learning_rate": 3.8961038961038966e-05,
"loss": 0.4833333969116211,
"memory(GiB)": 36.91,
"step": 30,
"token_acc": 0.8929440389294404,
"train_speed(iter/s)": 0.01608
},
{
"epoch": 0.11370558375634518,
"grad_norm": 1.1662861682771686,
"learning_rate": 4.545454545454546e-05,
"loss": 0.4054920196533203,
"memory(GiB)": 36.91,
"step": 35,
"token_acc": 0.8860103626943006,
"train_speed(iter/s)": 0.016078
},
{
"epoch": 0.1299492385786802,
"grad_norm": 1.0429297515235254,
"learning_rate": 5.1948051948051944e-05,
"loss": 0.43406662940979,
"memory(GiB)": 36.91,
"step": 40,
"token_acc": 0.8708333333333333,
"train_speed(iter/s)": 0.016008
},
{
"epoch": 0.14619289340101524,
"grad_norm": 1.2238750692730618,
"learning_rate": 5.844155844155844e-05,
"loss": 0.36366307735443115,
"memory(GiB)": 36.91,
"step": 45,
"token_acc": 0.9007832898172323,
"train_speed(iter/s)": 0.01607
},
{
"epoch": 0.16243654822335024,
"grad_norm": 1.2558032464123954,
"learning_rate": 6.493506493506494e-05,
"loss": 0.327667236328125,
"memory(GiB)": 36.91,
"step": 50,
"token_acc": 0.9095890410958904,
"train_speed(iter/s)": 0.016095
},
{
"epoch": 0.17868020304568527,
"grad_norm": 1.1346516950379935,
"learning_rate": 7.142857142857143e-05,
"loss": 0.2869602680206299,
"memory(GiB)": 36.91,
"step": 55,
"token_acc": 0.9400428265524625,
"train_speed(iter/s)": 0.016146
},
{
"epoch": 0.1949238578680203,
"grad_norm": 1.062592286052222,
"learning_rate": 7.792207792207793e-05,
"loss": 0.32817542552948,
"memory(GiB)": 36.91,
"step": 60,
"token_acc": 0.9162162162162162,
"train_speed(iter/s)": 0.016208
},
{
"epoch": 0.21116751269035533,
"grad_norm": 1.0747418170911354,
"learning_rate": 8.441558441558442e-05,
"loss": 0.3106029987335205,
"memory(GiB)": 36.91,
"step": 65,
"token_acc": 0.8882235528942116,
"train_speed(iter/s)": 0.016166
},
{
"epoch": 0.22741116751269036,
"grad_norm": 1.3626948899821127,
"learning_rate": 9.090909090909092e-05,
"loss": 0.2963001251220703,
"memory(GiB)": 36.91,
"step": 70,
"token_acc": 0.9046511627906977,
"train_speed(iter/s)": 0.016127
},
{
"epoch": 0.2436548223350254,
"grad_norm": 1.767990529674908,
"learning_rate": 9.74025974025974e-05,
"loss": 0.30068559646606446,
"memory(GiB)": 36.91,
"step": 75,
"token_acc": 0.9203539823008849,
"train_speed(iter/s)": 0.016153
},
{
"epoch": 0.2598984771573604,
"grad_norm": 1.1682901865357622,
"learning_rate": 9.99989553622803e-05,
"loss": 0.2741088390350342,
"memory(GiB)": 36.91,
"step": 80,
"token_acc": 0.9041394335511983,
"train_speed(iter/s)": 0.016134
},
{
"epoch": 0.27614213197969545,
"grad_norm": 1.3278362200249414,
"learning_rate": 9.999257162318026e-05,
"loss": 0.25543942451477053,
"memory(GiB)": 36.91,
"step": 85,
"token_acc": 0.9399538106235565,
"train_speed(iter/s)": 0.016119
},
{
"epoch": 0.2923857868020305,
"grad_norm": 1.1803595161351554,
"learning_rate": 9.998038523933224e-05,
"loss": 0.3038362503051758,
"memory(GiB)": 36.91,
"step": 90,
"token_acc": 0.9416058394160584,
"train_speed(iter/s)": 0.016104
},
{
"epoch": 0.3086294416243655,
"grad_norm": 1.1025992286590631,
"learning_rate": 9.996239762521151e-05,
"loss": 0.24188714027404784,
"memory(GiB)": 36.91,
"step": 95,
"token_acc": 0.9402298850574713,
"train_speed(iter/s)": 0.016109
},
{
"epoch": 0.3248730964467005,
"grad_norm": 1.7473005302414135,
"learning_rate": 9.993861086864293e-05,
"loss": 0.2190408945083618,
"memory(GiB)": 36.91,
"step": 100,
"token_acc": 0.9553349875930521,
"train_speed(iter/s)": 0.016079
},
{
"epoch": 0.3411167512690355,
"grad_norm": 0.9780470952963239,
"learning_rate": 9.990902773055866e-05,
"loss": 0.22316210269927977,
"memory(GiB)": 36.91,
"step": 105,
"token_acc": 0.9384236453201971,
"train_speed(iter/s)": 0.016094
},
{
"epoch": 0.35736040609137054,
"grad_norm": 1.2071939622104944,
"learning_rate": 9.987365164467767e-05,
"loss": 0.1844509482383728,
"memory(GiB)": 36.91,
"step": 110,
"token_acc": 0.9557291666666666,
"train_speed(iter/s)": 0.016096
},
{
"epoch": 0.37360406091370557,
"grad_norm": 1.3488873859555934,
"learning_rate": 9.983248671710714e-05,
"loss": 0.24020743370056152,
"memory(GiB)": 36.91,
"step": 115,
"token_acc": 0.91792656587473,
"train_speed(iter/s)": 0.016103
},
{
"epoch": 0.3898477157360406,
"grad_norm": 1.3346849143090171,
"learning_rate": 9.978553772586596e-05,
"loss": 0.17928496599197388,
"memory(GiB)": 36.91,
"step": 120,
"token_acc": 0.9523809523809523,
"train_speed(iter/s)": 0.016107
},
{
"epoch": 0.40609137055837563,
"grad_norm": 1.5370257812561328,
"learning_rate": 9.973281012033007e-05,
"loss": 0.22673540115356444,
"memory(GiB)": 36.91,
"step": 125,
"token_acc": 0.9307692307692308,
"train_speed(iter/s)": 0.016132
},
{
"epoch": 0.42233502538071066,
"grad_norm": 1.564641958549246,
"learning_rate": 9.967431002060002e-05,
"loss": 0.2366321563720703,
"memory(GiB)": 36.91,
"step": 130,
"token_acc": 0.9309576837416481,
"train_speed(iter/s)": 0.016147
},
{
"epoch": 0.4385786802030457,
"grad_norm": 2.562291519667129,
"learning_rate": 9.961004421679047e-05,
"loss": 0.1997455835342407,
"memory(GiB)": 36.91,
"step": 135,
"token_acc": 0.9694656488549618,
"train_speed(iter/s)": 0.01615
},
{
"epoch": 0.4548223350253807,
"grad_norm": 1.3505627897575587,
"learning_rate": 9.954002016824227e-05,
"loss": 0.23050181865692138,
"memory(GiB)": 36.91,
"step": 140,
"token_acc": 0.9395248380129589,
"train_speed(iter/s)": 0.016177
},
{
"epoch": 0.47106598984771575,
"grad_norm": 1.1439093152874722,
"learning_rate": 9.946424600265646e-05,
"loss": 0.2069091796875,
"memory(GiB)": 36.91,
"step": 145,
"token_acc": 0.9485294117647058,
"train_speed(iter/s)": 0.016153
},
{
"epoch": 0.4873096446700508,
"grad_norm": 1.3223308004820944,
"learning_rate": 9.938273051515098e-05,
"loss": 0.21799993515014648,
"memory(GiB)": 36.91,
"step": 150,
"token_acc": 0.9325581395348838,
"train_speed(iter/s)": 0.016154
},
{
"epoch": 0.5035532994923858,
"grad_norm": 1.2523275744092777,
"learning_rate": 9.929548316723982e-05,
"loss": 0.25325832366943357,
"memory(GiB)": 36.91,
"step": 155,
"token_acc": 0.9368421052631579,
"train_speed(iter/s)": 0.016145
},
{
"epoch": 0.5197969543147208,
"grad_norm": 0.9022910796931503,
"learning_rate": 9.920251408573483e-05,
"loss": 0.2051997184753418,
"memory(GiB)": 36.91,
"step": 160,
"token_acc": 0.9321266968325792,
"train_speed(iter/s)": 0.016138
},
{
"epoch": 0.5360406091370559,
"grad_norm": 1.3630797879167007,
"learning_rate": 9.910383406157018e-05,
"loss": 0.19534312486648558,
"memory(GiB)": 36.91,
"step": 165,
"token_acc": 0.9489795918367347,
"train_speed(iter/s)": 0.016146
},
{
"epoch": 0.5522842639593909,
"grad_norm": 1.2845653777954962,
"learning_rate": 9.899945454855006e-05,
"loss": 0.25403494834899903,
"memory(GiB)": 36.91,
"step": 170,
"token_acc": 0.9025974025974026,
"train_speed(iter/s)": 0.01615
},
{
"epoch": 0.5685279187817259,
"grad_norm": 1.2637865638643238,
"learning_rate": 9.888938766201907e-05,
"loss": 0.21994171142578126,
"memory(GiB)": 36.91,
"step": 175,
"token_acc": 0.9292452830188679,
"train_speed(iter/s)": 0.016148
},
{
"epoch": 0.584771573604061,
"grad_norm": 1.3035045872952578,
"learning_rate": 9.877364617745604e-05,
"loss": 0.21233229637145995,
"memory(GiB)": 36.91,
"step": 180,
"token_acc": 0.936046511627907,
"train_speed(iter/s)": 0.016163
},
{
"epoch": 0.601015228426396,
"grad_norm": 1.0837997073678936,
"learning_rate": 9.865224352899119e-05,
"loss": 0.20809760093688964,
"memory(GiB)": 36.91,
"step": 185,
"token_acc": 0.9612403100775194,
"train_speed(iter/s)": 0.016158
},
{
"epoch": 0.617258883248731,
"grad_norm": 1.6131697829206757,
"learning_rate": 9.852519380784686e-05,
"loss": 0.16450556516647338,
"memory(GiB)": 36.91,
"step": 190,
"token_acc": 0.9518716577540107,
"train_speed(iter/s)": 0.01615
},
{
"epoch": 0.6335025380710659,
"grad_norm": 1.0897399385105642,
"learning_rate": 9.839251176070184e-05,
"loss": 0.21039419174194335,
"memory(GiB)": 36.91,
"step": 195,
"token_acc": 0.943089430894309,
"train_speed(iter/s)": 0.016128
},
{
"epoch": 0.649746192893401,
"grad_norm": 1.0509670789538326,
"learning_rate": 9.825421278797983e-05,
"loss": 0.2035764217376709,
"memory(GiB)": 36.91,
"step": 200,
"token_acc": 0.9397260273972603,
"train_speed(iter/s)": 0.016132
},
{
"epoch": 0.665989847715736,
"grad_norm": 1.2329373260124112,
"learning_rate": 9.811031294206184e-05,
"loss": 0.21548199653625488,
"memory(GiB)": 36.91,
"step": 205,
"token_acc": 0.9368191721132898,
"train_speed(iter/s)": 0.01613
},
{
"epoch": 0.682233502538071,
"grad_norm": 0.8421449582235737,
"learning_rate": 9.796082892542302e-05,
"loss": 0.166330087184906,
"memory(GiB)": 36.91,
"step": 210,
"token_acc": 0.9555555555555556,
"train_speed(iter/s)": 0.016131
},
{
"epoch": 0.698477157360406,
"grad_norm": 3.430879327858774,
"learning_rate": 9.780577808869398e-05,
"loss": 0.2193552017211914,
"memory(GiB)": 36.91,
"step": 215,
"token_acc": 0.952020202020202,
"train_speed(iter/s)": 0.016136
},
{
"epoch": 0.7147208121827411,
"grad_norm": 1.5093266746905538,
"learning_rate": 9.764517842864696e-05,
"loss": 0.21606364250183105,
"memory(GiB)": 36.91,
"step": 220,
"token_acc": 0.9311926605504587,
"train_speed(iter/s)": 0.016141
},
{
"epoch": 0.7309644670050761,
"grad_norm": 1.3437389442959786,
"learning_rate": 9.747904858610681e-05,
"loss": 0.18983598947525024,
"memory(GiB)": 36.91,
"step": 225,
"token_acc": 0.9449035812672176,
"train_speed(iter/s)": 0.016146
},
{
"epoch": 0.7472081218274111,
"grad_norm": 0.9560028124850986,
"learning_rate": 9.730740784378753e-05,
"loss": 0.15862367153167725,
"memory(GiB)": 36.91,
"step": 230,
"token_acc": 0.9284009546539379,
"train_speed(iter/s)": 0.016153
},
{
"epoch": 0.7634517766497462,
"grad_norm": 0.9944797001481037,
"learning_rate": 9.713027612405395e-05,
"loss": 0.2057633638381958,
"memory(GiB)": 36.91,
"step": 235,
"token_acc": 0.9560975609756097,
"train_speed(iter/s)": 0.016148
},
{
"epoch": 0.7796954314720812,
"grad_norm": 1.3080304212648073,
"learning_rate": 9.694767398660942e-05,
"loss": 0.20023531913757325,
"memory(GiB)": 36.91,
"step": 240,
"token_acc": 0.930835734870317,
"train_speed(iter/s)": 0.016152
},
{
"epoch": 0.7959390862944162,
"grad_norm": 1.0766984239588557,
"learning_rate": 9.67596226261095e-05,
"loss": 0.17447829246520996,
"memory(GiB)": 36.91,
"step": 245,
"token_acc": 0.9543269230769231,
"train_speed(iter/s)": 0.016152
},
{
"epoch": 0.8121827411167513,
"grad_norm": 1.507367869013474,
"learning_rate": 9.656614386970173e-05,
"loss": 0.1656266212463379,
"memory(GiB)": 36.91,
"step": 250,
"token_acc": 0.9447368421052632,
"train_speed(iter/s)": 0.016157
},
{
"epoch": 0.8284263959390863,
"grad_norm": 0.9746901508793566,
"learning_rate": 9.636726017449236e-05,
"loss": 0.1971142530441284,
"memory(GiB)": 36.91,
"step": 255,
"token_acc": 0.9336384439359268,
"train_speed(iter/s)": 0.016144
},
{
"epoch": 0.8446700507614213,
"grad_norm": 1.2090195353569724,
"learning_rate": 9.616299462493952e-05,
"loss": 0.13225051164627075,
"memory(GiB)": 36.91,
"step": 260,
"token_acc": 0.9502369668246445,
"train_speed(iter/s)": 0.016132
},
{
"epoch": 0.8609137055837564,
"grad_norm": 2.0461505378854024,
"learning_rate": 9.595337093017404e-05,
"loss": 0.15409984588623046,
"memory(GiB)": 36.91,
"step": 265,
"token_acc": 0.9537444933920705,
"train_speed(iter/s)": 0.016136
},
{
"epoch": 0.8771573604060914,
"grad_norm": 2.0732984340431178,
"learning_rate": 9.57384134212473e-05,
"loss": 0.21368227005004883,
"memory(GiB)": 36.91,
"step": 270,
"token_acc": 0.9419642857142857,
"train_speed(iter/s)": 0.016136
},
{
"epoch": 0.8934010152284264,
"grad_norm": 0.7925657032904146,
"learning_rate": 9.551814704830734e-05,
"loss": 0.1758435010910034,
"memory(GiB)": 36.91,
"step": 275,
"token_acc": 0.948509485094851,
"train_speed(iter/s)": 0.016143
},
{
"epoch": 0.9096446700507614,
"grad_norm": 1.493944081608633,
"learning_rate": 9.529259737770269e-05,
"loss": 0.1807725191116333,
"memory(GiB)": 36.91,
"step": 280,
"token_acc": 0.9431524547803618,
"train_speed(iter/s)": 0.016126
},
{
"epoch": 0.9258883248730965,
"grad_norm": 1.6848601658017734,
"learning_rate": 9.506179058901503e-05,
"loss": 0.20769875049591063,
"memory(GiB)": 36.91,
"step": 285,
"token_acc": 0.9391304347826087,
"train_speed(iter/s)": 0.016132
},
{
"epoch": 0.9421319796954315,
"grad_norm": 1.3210817601987923,
"learning_rate": 9.482575347202047e-05,
"loss": 0.162405526638031,
"memory(GiB)": 36.91,
"step": 290,
"token_acc": 0.9507042253521126,
"train_speed(iter/s)": 0.016136
},
{
"epoch": 0.9583756345177665,
"grad_norm": 1.3496077516635223,
"learning_rate": 9.458451342358002e-05,
"loss": 0.19487454891204833,
"memory(GiB)": 36.91,
"step": 295,
"token_acc": 0.9321608040201005,
"train_speed(iter/s)": 0.016132
},
{
"epoch": 0.9746192893401016,
"grad_norm": 0.990005748680569,
"learning_rate": 9.433809844445969e-05,
"loss": 0.18303027153015136,
"memory(GiB)": 36.91,
"step": 300,
"token_acc": 0.9073170731707317,
"train_speed(iter/s)": 0.016134
},
{
"epoch": 0.9908629441624366,
"grad_norm": 1.2295422719869937,
"learning_rate": 9.40865371360804e-05,
"loss": 0.17322018146514892,
"memory(GiB)": 36.91,
"step": 305,
"token_acc": 0.9247311827956989,
"train_speed(iter/s)": 0.016141
},
{
"epoch": 0.9973604060913706,
"eval_loss": 0.1993405520915985,
"eval_runtime": 62.0419,
"eval_samples_per_second": 3.191,
"eval_steps_per_second": 0.806,
"eval_token_acc": 0.9332079021636877,
"step": 307
},
{
"epoch": 1.0095431472081218,
"grad_norm": 0.6300057786945967,
"learning_rate": 9.382985869719825e-05,
"loss": 0.18641979694366456,
"memory(GiB)": 36.91,
"step": 310,
"token_acc": 0.9554234769687965,
"train_speed(iter/s)": 0.016081
},
{
"epoch": 1.0257868020304568,
"grad_norm": 0.6546315853574257,
"learning_rate": 9.35680929205154e-05,
"loss": 0.09114786386489868,
"memory(GiB)": 36.91,
"step": 315,
"token_acc": 0.972972972972973,
"train_speed(iter/s)": 0.016095
},
{
"epoch": 1.0420304568527918,
"grad_norm": 1.0908662736650971,
"learning_rate": 9.330127018922194e-05,
"loss": 0.10798045396804809,
"memory(GiB)": 36.91,
"step": 320,
"token_acc": 0.9705093833780161,
"train_speed(iter/s)": 0.016104
},
{
"epoch": 1.0582741116751269,
"grad_norm": 1.3297407747084764,
"learning_rate": 9.302942147346945e-05,
"loss": 0.1425997495651245,
"memory(GiB)": 36.91,
"step": 325,
"token_acc": 0.9705014749262537,
"train_speed(iter/s)": 0.016103
},
{
"epoch": 1.074517766497462,
"grad_norm": 0.9696985174488663,
"learning_rate": 9.275257832677623e-05,
"loss": 0.09851968884468079,
"memory(GiB)": 36.91,
"step": 330,
"token_acc": 0.9644549763033176,
"train_speed(iter/s)": 0.016115
},
{
"epoch": 1.090761421319797,
"grad_norm": 0.9656710998245678,
"learning_rate": 9.247077288236488e-05,
"loss": 0.11144424676895141,
"memory(GiB)": 36.91,
"step": 335,
"token_acc": 0.972568578553616,
"train_speed(iter/s)": 0.016119
},
{
"epoch": 1.107005076142132,
"grad_norm": 2.119365217816497,
"learning_rate": 9.21840378494325e-05,
"loss": 0.11279252767562867,
"memory(GiB)": 36.91,
"step": 340,
"token_acc": 0.9637462235649547,
"train_speed(iter/s)": 0.016124
},
{
"epoch": 1.123248730964467,
"grad_norm": 1.0607496749665157,
"learning_rate": 9.189240650935433e-05,
"loss": 0.15501840114593507,
"memory(GiB)": 36.91,
"step": 345,
"token_acc": 0.9662337662337662,
"train_speed(iter/s)": 0.016118
},
{
"epoch": 1.139492385786802,
"grad_norm": 1.1350038539205582,
"learning_rate": 9.159591271182058e-05,
"loss": 0.12092633247375488,
"memory(GiB)": 36.91,
"step": 350,
"token_acc": 0.9680232558139535,
"train_speed(iter/s)": 0.016126
},
{
"epoch": 1.155736040609137,
"grad_norm": 0.6471881138956326,
"learning_rate": 9.129459087090763e-05,
"loss": 0.09021483659744263,
"memory(GiB)": 36.91,
"step": 355,
"token_acc": 0.9718670076726342,
"train_speed(iter/s)": 0.016119
},
{
"epoch": 1.171979695431472,
"grad_norm": 0.5557368721254966,
"learning_rate": 9.098847596108351e-05,
"loss": 0.09125213623046875,
"memory(GiB)": 36.91,
"step": 360,
"token_acc": 0.9772727272727273,
"train_speed(iter/s)": 0.01612
},
{
"epoch": 1.188223350253807,
"grad_norm": 0.8767747521686889,
"learning_rate": 9.067760351314838e-05,
"loss": 0.10847616195678711,
"memory(GiB)": 36.91,
"step": 365,
"token_acc": 0.9425587467362925,
"train_speed(iter/s)": 0.016116
},
{
"epoch": 1.2044670050761421,
"grad_norm": 0.7043233347928591,
"learning_rate": 9.036200961011059e-05,
"loss": 0.14046638011932372,
"memory(GiB)": 36.91,
"step": 370,
"token_acc": 0.9632034632034632,
"train_speed(iter/s)": 0.016126
},
{
"epoch": 1.2207106598984772,
"grad_norm": 1.0689456764149206,
"learning_rate": 9.004173088299837e-05,
"loss": 0.13291985988616944,
"memory(GiB)": 36.91,
"step": 375,
"token_acc": 0.9565217391304348,
"train_speed(iter/s)": 0.016139
},
{
"epoch": 1.2369543147208122,
"grad_norm": 1.3657829465422844,
"learning_rate": 8.97168045066082e-05,
"loss": 0.11737120151519775,
"memory(GiB)": 36.91,
"step": 380,
"token_acc": 0.973421926910299,
"train_speed(iter/s)": 0.01615
},
{
"epoch": 1.2531979695431472,
"grad_norm": 0.991725434659403,
"learning_rate": 8.938726819518977e-05,
"loss": 0.1285269021987915,
"memory(GiB)": 36.91,
"step": 385,
"token_acc": 0.97,
"train_speed(iter/s)": 0.016149
},
{
"epoch": 1.2694416243654822,
"grad_norm": 0.7615458350738632,
"learning_rate": 8.905316019806868e-05,
"loss": 0.08999634981155395,
"memory(GiB)": 36.91,
"step": 390,
"token_acc": 0.9392405063291139,
"train_speed(iter/s)": 0.016141
},
{
"epoch": 1.2856852791878173,
"grad_norm": 1.0176469569030087,
"learning_rate": 8.871451929520663e-05,
"loss": 0.12240591049194335,
"memory(GiB)": 36.91,
"step": 395,
"token_acc": 0.9611872146118722,
"train_speed(iter/s)": 0.016137
},
{
"epoch": 1.3019289340101523,
"grad_norm": 1.5999057477034428,
"learning_rate": 8.837138479270036e-05,
"loss": 0.1078599214553833,
"memory(GiB)": 36.91,
"step": 400,
"token_acc": 0.9562982005141388,
"train_speed(iter/s)": 0.016137
},
{
"epoch": 1.3181725888324873,
"grad_norm": 1.8517636831594235,
"learning_rate": 8.802379651821938e-05,
"loss": 0.14071439504623412,
"memory(GiB)": 36.91,
"step": 405,
"token_acc": 0.9592875318066157,
"train_speed(iter/s)": 0.016131
},
{
"epoch": 1.3344162436548224,
"grad_norm": 1.333329930877741,
"learning_rate": 8.767179481638303e-05,
"loss": 0.13171937465667724,
"memory(GiB)": 36.91,
"step": 410,
"token_acc": 0.9744897959183674,
"train_speed(iter/s)": 0.016135
},
{
"epoch": 1.3506598984771574,
"grad_norm": 1.1709434640964491,
"learning_rate": 8.731542054407793e-05,
"loss": 0.10031242370605468,
"memory(GiB)": 36.91,
"step": 415,
"token_acc": 0.9507829977628636,
"train_speed(iter/s)": 0.016142
},
{
"epoch": 1.3669035532994924,
"grad_norm": 0.8550588073511182,
"learning_rate": 8.695471506571542e-05,
"loss": 0.09321081638336182,
"memory(GiB)": 36.91,
"step": 420,
"token_acc": 0.9667519181585678,
"train_speed(iter/s)": 0.016135
},
{
"epoch": 1.3831472081218275,
"grad_norm": 0.8651388677420173,
"learning_rate": 8.658972024843062e-05,
"loss": 0.11361520290374756,
"memory(GiB)": 36.91,
"step": 425,
"token_acc": 0.9243243243243243,
"train_speed(iter/s)": 0.016131
},
{
"epoch": 1.3993908629441625,
"grad_norm": 1.1539120381770573,
"learning_rate": 8.622047845722275e-05,
"loss": 0.11814072132110595,
"memory(GiB)": 36.91,
"step": 430,
"token_acc": 0.9747368421052631,
"train_speed(iter/s)": 0.016135
},
{
"epoch": 1.4156345177664975,
"grad_norm": 0.8277592112279485,
"learning_rate": 8.584703255003795e-05,
"loss": 0.11146994829177856,
"memory(GiB)": 36.91,
"step": 435,
"token_acc": 0.9720101781170484,
"train_speed(iter/s)": 0.016134
},
{
"epoch": 1.4318781725888325,
"grad_norm": 0.613271329664299,
"learning_rate": 8.546942587279465e-05,
"loss": 0.09394789338111878,
"memory(GiB)": 36.91,
"step": 440,
"token_acc": 0.9636803874092009,
"train_speed(iter/s)": 0.016134
},
{
"epoch": 1.4481218274111676,
"grad_norm": 1.0271786482031176,
"learning_rate": 8.508770225435244e-05,
"loss": 0.09493039846420288,
"memory(GiB)": 36.91,
"step": 445,
"token_acc": 0.9743589743589743,
"train_speed(iter/s)": 0.016139
},
{
"epoch": 1.4643654822335026,
"grad_norm": 1.0170609694346187,
"learning_rate": 8.470190600142486e-05,
"loss": 0.0872123122215271,
"memory(GiB)": 36.91,
"step": 450,
"token_acc": 0.9763779527559056,
"train_speed(iter/s)": 0.016139
},
{
"epoch": 1.4806091370558376,
"grad_norm": 1.6562131424643847,
"learning_rate": 8.43120818934367e-05,
"loss": 0.12921547889709473,
"memory(GiB)": 36.91,
"step": 455,
"token_acc": 0.9691516709511568,
"train_speed(iter/s)": 0.016142
},
{
"epoch": 1.4968527918781727,
"grad_norm": 1.9551348110028592,
"learning_rate": 8.39182751773264e-05,
"loss": 0.10002539157867432,
"memory(GiB)": 36.91,
"step": 460,
"token_acc": 0.9665924276169265,
"train_speed(iter/s)": 0.016147
},
{
"epoch": 1.5130964467005077,
"grad_norm": 1.376875063389563,
"learning_rate": 8.352053156229438e-05,
"loss": 0.0880006194114685,
"memory(GiB)": 36.91,
"step": 465,
"token_acc": 0.958128078817734,
"train_speed(iter/s)": 0.016149
},
{
"epoch": 1.5293401015228425,
"grad_norm": 1.688502126127077,
"learning_rate": 8.31188972144974e-05,
"loss": 0.08950616717338562,
"memory(GiB)": 36.91,
"step": 470,
"token_acc": 0.96996996996997,
"train_speed(iter/s)": 0.016152
},
{
"epoch": 1.5455837563451778,
"grad_norm": 1.3189009566745062,
"learning_rate": 8.27134187516901e-05,
"loss": 0.08834458589553833,
"memory(GiB)": 36.91,
"step": 475,
"token_acc": 0.9663865546218487,
"train_speed(iter/s)": 0.016152
},
{
"epoch": 1.5618274111675126,
"grad_norm": 1.4946742975658185,
"learning_rate": 8.23041432378141e-05,
"loss": 0.14390041828155517,
"memory(GiB)": 36.91,
"step": 480,
"token_acc": 0.9621212121212122,
"train_speed(iter/s)": 0.016158
},
{
"epoch": 1.5780710659898478,
"grad_norm": 1.3879821905262077,
"learning_rate": 8.18911181775353e-05,
"loss": 0.1267578125,
"memory(GiB)": 36.91,
"step": 485,
"token_acc": 0.9685230024213075,
"train_speed(iter/s)": 0.016166
},
{
"epoch": 1.5943147208121826,
"grad_norm": 1.0017173842059925,
"learning_rate": 8.147439151072994e-05,
"loss": 0.11637402772903442,
"memory(GiB)": 36.91,
"step": 490,
"token_acc": 0.945031712473573,
"train_speed(iter/s)": 0.016169
},
{
"epoch": 1.6105583756345179,
"grad_norm": 3.517464157304767,
"learning_rate": 8.105401160692023e-05,
"loss": 0.11228004693984986,
"memory(GiB)": 36.91,
"step": 495,
"token_acc": 0.9544513457556936,
"train_speed(iter/s)": 0.016174
},
{
"epoch": 1.6268020304568527,
"grad_norm": 1.123471909128111,
"learning_rate": 8.063002725966015e-05,
"loss": 0.1422884702682495,
"memory(GiB)": 36.91,
"step": 500,
"token_acc": 0.9502487562189055,
"train_speed(iter/s)": 0.016173
},
{
"epoch": 1.643045685279188,
"grad_norm": 0.6219224292611003,
"learning_rate": 8.020248768087188e-05,
"loss": 0.09764043688774109,
"memory(GiB)": 36.91,
"step": 505,
"token_acc": 0.9696312364425163,
"train_speed(iter/s)": 0.016171
},
{
"epoch": 1.6592893401015227,
"grad_norm": 0.6599500918289528,
"learning_rate": 7.977144249513391e-05,
"loss": 0.11226143836975097,
"memory(GiB)": 36.91,
"step": 510,
"token_acc": 0.9662447257383966,
"train_speed(iter/s)": 0.016171
},
{
"epoch": 1.675532994923858,
"grad_norm": 1.1327299497198065,
"learning_rate": 7.93369417339209e-05,
"loss": 0.15791513919830322,
"memory(GiB)": 36.91,
"step": 515,
"token_acc": 0.9535962877030162,
"train_speed(iter/s)": 0.016172
},
{
"epoch": 1.6917766497461928,
"grad_norm": 1.246895523664307,
"learning_rate": 7.88990358297967e-05,
"loss": 0.1254945158958435,
"memory(GiB)": 36.91,
"step": 520,
"token_acc": 0.9494252873563218,
"train_speed(iter/s)": 0.016169
},
{
"epoch": 1.708020304568528,
"grad_norm": 0.7907689981367572,
"learning_rate": 7.84577756105606e-05,
"loss": 0.11963515281677246,
"memory(GiB)": 36.91,
"step": 525,
"token_acc": 0.9851116625310173,
"train_speed(iter/s)": 0.016171
},
{
"epoch": 1.7242639593908629,
"grad_norm": 0.9327837359999639,
"learning_rate": 7.801321229334764e-05,
"loss": 0.0870942771434784,
"memory(GiB)": 36.91,
"step": 530,
"token_acc": 0.9618320610687023,
"train_speed(iter/s)": 0.01617
},
{
"epoch": 1.740507614213198,
"grad_norm": 1.0881384151057631,
"learning_rate": 7.756539747868394e-05,
"loss": 0.08531727194786072,
"memory(GiB)": 36.91,
"step": 535,
"token_acc": 0.9748603351955307,
"train_speed(iter/s)": 0.016168
},
{
"epoch": 1.756751269035533,
"grad_norm": 0.7767069783252919,
"learning_rate": 7.71143831444974e-05,
"loss": 0.11042824983596802,
"memory(GiB)": 36.91,
"step": 540,
"token_acc": 0.957983193277311,
"train_speed(iter/s)": 0.016168
},
{
"epoch": 1.7729949238578682,
"grad_norm": 1.4499212106775468,
"learning_rate": 7.666022164008457e-05,
"loss": 0.11432676315307617,
"memory(GiB)": 36.91,
"step": 545,
"token_acc": 0.975,
"train_speed(iter/s)": 0.016171
},
{
"epoch": 1.789238578680203,
"grad_norm": 1.0081688781849556,
"learning_rate": 7.620296568003449e-05,
"loss": 0.12327454090118409,
"memory(GiB)": 36.91,
"step": 550,
"token_acc": 0.9525,
"train_speed(iter/s)": 0.016174
},
{
"epoch": 1.8054822335025382,
"grad_norm": 0.9935491377578084,
"learning_rate": 7.57426683381101e-05,
"loss": 0.09574033617973328,
"memory(GiB)": 36.91,
"step": 555,
"token_acc": 0.9694117647058823,
"train_speed(iter/s)": 0.016174
},
{
"epoch": 1.821725888324873,
"grad_norm": 1.0191162814710237,
"learning_rate": 7.527938304108795e-05,
"loss": 0.10299128293991089,
"memory(GiB)": 36.91,
"step": 560,
"token_acc": 0.9694793536804309,
"train_speed(iter/s)": 0.016176
},
{
"epoch": 1.8379695431472083,
"grad_norm": 1.322632268427317,
"learning_rate": 7.481316356255698e-05,
"loss": 0.12594590187072754,
"memory(GiB)": 36.91,
"step": 565,
"token_acc": 0.946257197696737,
"train_speed(iter/s)": 0.016169
},
{
"epoch": 1.854213197969543,
"grad_norm": 1.2990436559927216,
"learning_rate": 7.434406401667695e-05,
"loss": 0.10811959505081177,
"memory(GiB)": 36.91,
"step": 570,
"token_acc": 0.9556650246305419,
"train_speed(iter/s)": 0.016173
},
{
"epoch": 1.8704568527918781,
"grad_norm": 1.141255912127714,
"learning_rate": 7.387213885189746e-05,
"loss": 0.10128064155578613,
"memory(GiB)": 36.91,
"step": 575,
"token_acc": 0.9654255319148937,
"train_speed(iter/s)": 0.016178
},
{
"epoch": 1.8867005076142132,
"grad_norm": 1.6575287534795722,
"learning_rate": 7.339744284463808e-05,
"loss": 0.09879794716835022,
"memory(GiB)": 36.91,
"step": 580,
"token_acc": 0.9805555555555555,
"train_speed(iter/s)": 0.016182
},
{
"epoch": 1.9029441624365482,
"grad_norm": 1.1141293923635756,
"learning_rate": 7.292003109293048e-05,
"loss": 0.0816422462463379,
"memory(GiB)": 36.91,
"step": 585,
"token_acc": 0.961038961038961,
"train_speed(iter/s)": 0.016187
},
{
"epoch": 1.9191878172588832,
"grad_norm": 0.9384463374768481,
"learning_rate": 7.243995901002312e-05,
"loss": 0.10118494033813477,
"memory(GiB)": 36.91,
"step": 590,
"token_acc": 0.978021978021978,
"train_speed(iter/s)": 0.016179
},
{
"epoch": 1.9354314720812182,
"grad_norm": 1.2458643327317989,
"learning_rate": 7.19572823179495e-05,
"loss": 0.13551709651947022,
"memory(GiB)": 36.91,
"step": 595,
"token_acc": 0.96,
"train_speed(iter/s)": 0.016178
},
{
"epoch": 1.9516751269035533,
"grad_norm": 1.2473685164472739,
"learning_rate": 7.147205704106046e-05,
"loss": 0.12769120931625366,
"memory(GiB)": 36.91,
"step": 600,
"token_acc": 0.9561586638830898,
"train_speed(iter/s)": 0.016179
},
{
"epoch": 1.9679187817258883,
"grad_norm": 0.7203387342947396,
"learning_rate": 7.098433949952146e-05,
"loss": 0.09962844252586364,
"memory(GiB)": 36.91,
"step": 605,
"token_acc": 0.9623115577889447,
"train_speed(iter/s)": 0.016178
},
{
"epoch": 1.9841624365482233,
"grad_norm": 0.9094364008463653,
"learning_rate": 7.049418630277542e-05,
"loss": 0.10799739360809327,
"memory(GiB)": 36.91,
"step": 610,
"token_acc": 0.9705159705159705,
"train_speed(iter/s)": 0.016178
},
{
"epoch": 1.9971573604060913,
"eval_loss": 0.19586917757987976,
"eval_runtime": 62.6829,
"eval_samples_per_second": 3.159,
"eval_steps_per_second": 0.798,
"eval_token_acc": 0.9416745061147695,
"step": 614
},
{
"epoch": 2.0028426395939087,
"grad_norm": 11.787067733742486,
"learning_rate": 7.000165434297214e-05,
"loss": 0.12140052318572998,
"memory(GiB)": 36.91,
"step": 615,
"token_acc": 0.951048951048951,
"train_speed(iter/s)": 0.016146
},
{
"epoch": 2.0190862944162435,
"grad_norm": 0.9036939767517369,
"learning_rate": 6.950680078836474e-05,
"loss": 0.0476156622171402,
"memory(GiB)": 36.91,
"step": 620,
"token_acc": 0.9901477832512315,
"train_speed(iter/s)": 0.016145
},
{
"epoch": 2.035329949238579,
"grad_norm": 0.8045933316745676,
"learning_rate": 6.900968307667423e-05,
"loss": 0.0368287205696106,
"memory(GiB)": 36.91,
"step": 625,
"token_acc": 0.9932584269662922,
"train_speed(iter/s)": 0.016144
},
{
"epoch": 2.0515736040609136,
"grad_norm": 0.9084110351960255,
"learning_rate": 6.851035890842259e-05,
"loss": 0.03829330801963806,
"memory(GiB)": 36.91,
"step": 630,
"token_acc": 0.9928741092636579,
"train_speed(iter/s)": 0.016139
},
{
"epoch": 2.067817258883249,
"grad_norm": 0.6115130889160721,
"learning_rate": 6.800888624023553e-05,
"loss": 0.04897831082344055,
"memory(GiB)": 36.91,
"step": 635,
"token_acc": 0.995,
"train_speed(iter/s)": 0.016139
},
{
"epoch": 2.0840609137055837,
"grad_norm": 0.2929609590178906,
"learning_rate": 6.750532327811547e-05,
"loss": 0.027808183431625368,
"memory(GiB)": 36.91,
"step": 640,
"token_acc": 0.9877750611246944,
"train_speed(iter/s)": 0.016139
},
{
"epoch": 2.100304568527919,
"grad_norm": 1.6659772014622232,
"learning_rate": 6.699972847068553e-05,
"loss": 0.04012786149978638,
"memory(GiB)": 36.91,
"step": 645,
"token_acc": 0.9892183288409704,
"train_speed(iter/s)": 0.016136
},
{
"epoch": 2.1165482233502537,
"grad_norm": 1.6942318499082378,
"learning_rate": 6.649216050240539e-05,
"loss": 0.03581180572509766,
"memory(GiB)": 36.91,
"step": 650,
"token_acc": 0.9848866498740554,
"train_speed(iter/s)": 0.016138
},
{
"epoch": 2.132791878172589,
"grad_norm": 1.7750332328595628,
"learning_rate": 6.598267828675979e-05,
"loss": 0.038441383838653566,
"memory(GiB)": 36.91,
"step": 655,
"token_acc": 0.9860724233983287,
"train_speed(iter/s)": 0.016136
},
{
"epoch": 2.149035532994924,
"grad_norm": 0.948452800180108,
"learning_rate": 6.547134095942044e-05,
"loss": 0.03809022605419159,
"memory(GiB)": 36.91,
"step": 660,
"token_acc": 0.9917355371900827,
"train_speed(iter/s)": 0.016135
},
{
"epoch": 2.165279187817259,
"grad_norm": 1.185267349759789,
"learning_rate": 6.495820787138209e-05,
"loss": 0.033171114325523374,
"memory(GiB)": 36.91,
"step": 665,
"token_acc": 0.9947916666666666,
"train_speed(iter/s)": 0.016132
},
{
"epoch": 2.181522842639594,
"grad_norm": 1.1780464513130944,
"learning_rate": 6.44433385820737e-05,
"loss": 0.03416465222835541,
"memory(GiB)": 36.91,
"step": 670,
"token_acc": 0.9948051948051948,
"train_speed(iter/s)": 0.01614
},
{
"epoch": 2.197766497461929,
"grad_norm": 0.5862751780031482,
"learning_rate": 6.392679285244538e-05,
"loss": 0.043843358755111694,
"memory(GiB)": 36.91,
"step": 675,
"token_acc": 0.9854014598540146,
"train_speed(iter/s)": 0.016137
},
{
"epoch": 2.214010152284264,
"grad_norm": 0.7314774852745054,
"learning_rate": 6.340863063803188e-05,
"loss": 0.03051617741584778,
"memory(GiB)": 36.91,
"step": 680,
"token_acc": 0.9970326409495549,
"train_speed(iter/s)": 0.016136
},
{
"epoch": 2.230253807106599,
"grad_norm": 1.4305053109603272,
"learning_rate": 6.288891208199353e-05,
"loss": 0.03859332203865051,
"memory(GiB)": 36.91,
"step": 685,
"token_acc": 0.9813829787234043,
"train_speed(iter/s)": 0.016138
},
{
"epoch": 2.246497461928934,
"grad_norm": 1.2676862153868658,
"learning_rate": 6.23676975081355e-05,
"loss": 0.03608715534210205,
"memory(GiB)": 36.91,
"step": 690,
"token_acc": 0.9923076923076923,
"train_speed(iter/s)": 0.016143
},
{
"epoch": 2.262741116751269,
"grad_norm": 0.717797595223322,
"learning_rate": 6.184504741390596e-05,
"loss": 0.024200823903083802,
"memory(GiB)": 36.91,
"step": 695,
"token_acc": 0.9932885906040269,
"train_speed(iter/s)": 0.016142
},
{
"epoch": 2.278984771573604,
"grad_norm": 1.2738346733999926,
"learning_rate": 6.132102246337407e-05,
"loss": 0.04924860596656799,
"memory(GiB)": 36.91,
"step": 700,
"token_acc": 0.989769820971867,
"train_speed(iter/s)": 0.016144
},
{
"epoch": 2.2952284263959393,
"grad_norm": 0.9709229547354659,
"learning_rate": 6.079568348018882e-05,
"loss": 0.04101951122283935,
"memory(GiB)": 36.91,
"step": 705,
"token_acc": 0.9838709677419355,
"train_speed(iter/s)": 0.016145
},
{
"epoch": 2.311472081218274,
"grad_norm": 0.34074159031019935,
"learning_rate": 6.02690914405191e-05,
"loss": 0.012625060975551605,
"memory(GiB)": 36.91,
"step": 710,
"token_acc": 0.9893162393162394,
"train_speed(iter/s)": 0.016143
},
{
"epoch": 2.3277157360406093,
"grad_norm": 1.405033686903226,
"learning_rate": 5.974130746597628e-05,
"loss": 0.023314157128334047,
"memory(GiB)": 36.91,
"step": 715,
"token_acc": 0.9845261121856866,
"train_speed(iter/s)": 0.016146
},
{
"epoch": 2.343959390862944,
"grad_norm": 0.393622080479984,
"learning_rate": 5.921239281651976e-05,
"loss": 0.03884749114513397,
"memory(GiB)": 36.91,
"step": 720,
"token_acc": 0.9844961240310077,
"train_speed(iter/s)": 0.016147
},
{
"epoch": 2.360203045685279,
"grad_norm": 0.8205162732404321,
"learning_rate": 5.868240888334653e-05,
"loss": 0.0408410519361496,
"memory(GiB)": 36.91,
"step": 725,
"token_acc": 0.9696969696969697,
"train_speed(iter/s)": 0.016147
},
{
"epoch": 2.376446700507614,
"grad_norm": 0.9254262259522679,
"learning_rate": 5.815141718176549e-05,
"loss": 0.03491292595863342,
"memory(GiB)": 36.91,
"step": 730,
"token_acc": 0.9818731117824774,
"train_speed(iter/s)": 0.016148
},
{
"epoch": 2.3926903553299494,
"grad_norm": 0.4613013276623316,
"learning_rate": 5.761947934405736e-05,
"loss": 0.041343241930007935,
"memory(GiB)": 36.91,
"step": 735,
"token_acc": 0.9923076923076923,
"train_speed(iter/s)": 0.01615
},
{
"epoch": 2.4089340101522843,
"grad_norm": 0.5995425123829327,
"learning_rate": 5.708665711232103e-05,
"loss": 0.026265931129455567,
"memory(GiB)": 36.91,
"step": 740,
"token_acc": 0.980225988700565,
"train_speed(iter/s)": 0.016147
},
{
"epoch": 2.425177664974619,
"grad_norm": 0.8947399880614664,
"learning_rate": 5.655301233130711e-05,
"loss": 0.026338309049606323,
"memory(GiB)": 36.91,
"step": 745,
"token_acc": 0.9891304347826086,
"train_speed(iter/s)": 0.01615
},
{
"epoch": 2.4414213197969543,
"grad_norm": 0.6528954286261448,
"learning_rate": 5.6018606941239615e-05,
"loss": 0.031349584460258484,
"memory(GiB)": 36.91,
"step": 750,
"token_acc": 0.9825870646766169,
"train_speed(iter/s)": 0.016153
},
{
"epoch": 2.4576649746192896,
"grad_norm": 0.9124965491201447,
"learning_rate": 5.548350297062659e-05,
"loss": 0.04390305280685425,
"memory(GiB)": 36.91,
"step": 755,
"token_acc": 0.9971181556195965,
"train_speed(iter/s)": 0.016158
},
{
"epoch": 2.4739086294416244,
"grad_norm": 1.2758793187917294,
"learning_rate": 5.494776252906036e-05,
"loss": 0.03932673335075378,
"memory(GiB)": 36.91,
"step": 760,
"token_acc": 0.9852941176470589,
"train_speed(iter/s)": 0.016155
},
{
"epoch": 2.490152284263959,
"grad_norm": 1.6183527750946778,
"learning_rate": 5.44114478000086e-05,
"loss": 0.040107494592666625,
"memory(GiB)": 36.91,
"step": 765,
"token_acc": 0.980722891566265,
"train_speed(iter/s)": 0.01616
},
{
"epoch": 2.5063959390862944,
"grad_norm": 0.8155608212943981,
"learning_rate": 5.387462103359655e-05,
"loss": 0.034613233804702756,
"memory(GiB)": 36.91,
"step": 770,
"token_acc": 0.9809885931558935,
"train_speed(iter/s)": 0.016158
},
{
"epoch": 2.5226395939086297,
"grad_norm": 0.72914335142115,
"learning_rate": 5.333734453938174e-05,
"loss": 0.03472020030021668,
"memory(GiB)": 36.91,
"step": 775,
"token_acc": 0.980722891566265,
"train_speed(iter/s)": 0.016157
},
{
"epoch": 2.5388832487309645,
"grad_norm": 0.715640193227215,
"learning_rate": 5.279968067912161e-05,
"loss": 0.03267112672328949,
"memory(GiB)": 36.91,
"step": 780,
"token_acc": 0.9949109414758269,
"train_speed(iter/s)": 0.016159
},
{
"epoch": 2.5551269035532993,
"grad_norm": 0.5201766196940287,
"learning_rate": 5.226169185953532e-05,
"loss": 0.06324458122253418,
"memory(GiB)": 36.91,
"step": 785,
"token_acc": 0.9822784810126582,
"train_speed(iter/s)": 0.016157
},
{
"epoch": 2.5713705583756346,
"grad_norm": 0.716527670309396,
"learning_rate": 5.1723440525060026e-05,
"loss": 0.036973622441291806,
"memory(GiB)": 36.91,
"step": 790,
"token_acc": 0.9828009828009828,
"train_speed(iter/s)": 0.016157
},
{
"epoch": 2.58761421319797,
"grad_norm": 0.9508048665101771,
"learning_rate": 5.118498915060307e-05,
"loss": 0.04134515523910522,
"memory(GiB)": 36.91,
"step": 795,
"token_acc": 0.9832402234636871,
"train_speed(iter/s)": 0.016159
},
{
"epoch": 2.6038578680203046,
"grad_norm": 0.1695737988935869,
"learning_rate": 5.064640023429043e-05,
"loss": 0.0396234929561615,
"memory(GiB)": 36.91,
"step": 800,
"token_acc": 0.9937888198757764,
"train_speed(iter/s)": 0.01616
},
{
"epoch": 2.6201015228426394,
"grad_norm": 1.353410357397197,
"learning_rate": 5.0107736290212603e-05,
"loss": 0.032366597652435304,
"memory(GiB)": 36.91,
"step": 805,
"token_acc": 0.9853658536585366,
"train_speed(iter/s)": 0.016161
},
{
"epoch": 2.6363451776649747,
"grad_norm": 0.9287301884362714,
"learning_rate": 4.956905984116858e-05,
"loss": 0.02025129795074463,
"memory(GiB)": 36.91,
"step": 810,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016156
},
{
"epoch": 2.65258883248731,
"grad_norm": 0.6605215469870417,
"learning_rate": 4.903043341140879e-05,
"loss": 0.027498137950897217,
"memory(GiB)": 36.91,
"step": 815,
"token_acc": 0.9890590809628009,
"train_speed(iter/s)": 0.016158
},
{
"epoch": 2.6688324873096447,
"grad_norm": 1.284202747583917,
"learning_rate": 4.84919195193779e-05,
"loss": 0.04052730202674866,
"memory(GiB)": 36.91,
"step": 820,
"token_acc": 0.9691714836223507,
"train_speed(iter/s)": 0.016161
},
{
"epoch": 2.6850761421319795,
"grad_norm": 1.054572423840406,
"learning_rate": 4.7953580670458345e-05,
"loss": 0.029700332880020143,
"memory(GiB)": 36.91,
"step": 825,
"token_acc": 0.9903381642512077,
"train_speed(iter/s)": 0.016161
},
{
"epoch": 2.701319796954315,
"grad_norm": 1.515148160249309,
"learning_rate": 4.7415479349715275e-05,
"loss": 0.03995212614536285,
"memory(GiB)": 36.91,
"step": 830,
"token_acc": 0.9887005649717514,
"train_speed(iter/s)": 0.016163
},
{
"epoch": 2.7175634517766496,
"grad_norm": 0.7966857436927859,
"learning_rate": 4.687767801464388e-05,
"loss": 0.029492130875587462,
"memory(GiB)": 36.91,
"step": 835,
"token_acc": 0.9946091644204852,
"train_speed(iter/s)": 0.016162
},
{
"epoch": 2.733807106598985,
"grad_norm": 0.6747809015160623,
"learning_rate": 4.634023908791999e-05,
"loss": 0.028040975332260132,
"memory(GiB)": 36.91,
"step": 840,
"token_acc": 0.9950372208436724,
"train_speed(iter/s)": 0.016165
},
{
"epoch": 2.7500507614213197,
"grad_norm": 0.7236373548114289,
"learning_rate": 4.5803224950154656e-05,
"loss": 0.022182533144950868,
"memory(GiB)": 36.91,
"step": 845,
"token_acc": 0.9973753280839895,
"train_speed(iter/s)": 0.016167
},
{
"epoch": 2.766294416243655,
"grad_norm": 0.8702609694851884,
"learning_rate": 4.5266697932653616e-05,
"loss": 0.03542717695236206,
"memory(GiB)": 36.91,
"step": 850,
"token_acc": 0.9930394431554525,
"train_speed(iter/s)": 0.016168
},
{
"epoch": 2.7825380710659897,
"grad_norm": 0.2339976820774803,
"learning_rate": 4.473072031018248e-05,
"loss": 0.017447268962860106,
"memory(GiB)": 36.91,
"step": 855,
"token_acc": 0.9897172236503856,
"train_speed(iter/s)": 0.016172
},
{
"epoch": 2.798781725888325,
"grad_norm": 1.7564108472908913,
"learning_rate": 4.4195354293738484e-05,
"loss": 0.040924933552742,
"memory(GiB)": 36.91,
"step": 860,
"token_acc": 0.9693396226415094,
"train_speed(iter/s)": 0.016172
},
{
"epoch": 2.8150253807106598,
"grad_norm": 1.749637468786309,
"learning_rate": 4.366066202332974e-05,
"loss": 0.0398847758769989,
"memory(GiB)": 36.91,
"step": 865,
"token_acc": 0.9884726224783862,
"train_speed(iter/s)": 0.016173
},
{
"epoch": 2.831269035532995,
"grad_norm": 1.6657986428559317,
"learning_rate": 4.312670556076244e-05,
"loss": 0.027478563785552978,
"memory(GiB)": 36.91,
"step": 870,
"token_acc": 0.9953379953379954,
"train_speed(iter/s)": 0.016178
},
{
"epoch": 2.84751269035533,
"grad_norm": 0.8830417040757416,
"learning_rate": 4.259354688243757e-05,
"loss": 0.05422350764274597,
"memory(GiB)": 36.91,
"step": 875,
"token_acc": 0.9813953488372092,
"train_speed(iter/s)": 0.016176
},
{
"epoch": 2.863756345177665,
"grad_norm": 1.4037166255295264,
"learning_rate": 4.206124787215714e-05,
"loss": 0.03585241138935089,
"memory(GiB)": 36.91,
"step": 880,
"token_acc": 0.9929577464788732,
"train_speed(iter/s)": 0.016178
},
{
"epoch": 2.88,
"grad_norm": 0.40929439648007787,
"learning_rate": 4.1529870313941386e-05,
"loss": 0.037713998556137086,
"memory(GiB)": 36.91,
"step": 885,
"token_acc": 0.9755555555555555,
"train_speed(iter/s)": 0.016182
},
{
"epoch": 2.896243654822335,
"grad_norm": 0.5649136450093045,
"learning_rate": 4.099947588485744e-05,
"loss": 0.02235218584537506,
"memory(GiB)": 36.91,
"step": 890,
"token_acc": 0.9738562091503268,
"train_speed(iter/s)": 0.016179
},
{
"epoch": 2.91248730964467,
"grad_norm": 0.9411441260021843,
"learning_rate": 4.047012614786055e-05,
"loss": 0.03756971955299378,
"memory(GiB)": 36.91,
"step": 895,
"token_acc": 0.9953596287703016,
"train_speed(iter/s)": 0.016182
},
{
"epoch": 2.928730964467005,
"grad_norm": 0.493632814272918,
"learning_rate": 3.994188254464838e-05,
"loss": 0.03068949580192566,
"memory(GiB)": 36.91,
"step": 900,
"token_acc": 0.9681372549019608,
"train_speed(iter/s)": 0.016183
},
{
"epoch": 2.94497461928934,
"grad_norm": 0.9098057371042104,
"learning_rate": 3.941480638852948e-05,
"loss": 0.060313427448272706,
"memory(GiB)": 36.91,
"step": 905,
"token_acc": 0.9809976247030879,
"train_speed(iter/s)": 0.016186
},
{
"epoch": 2.9612182741116753,
"grad_norm": 0.7111307711774197,
"learning_rate": 3.888895885730666e-05,
"loss": 0.017010049521923067,
"memory(GiB)": 36.91,
"step": 910,
"token_acc": 0.9949748743718593,
"train_speed(iter/s)": 0.016184
},
{
"epoch": 2.97746192893401,
"grad_norm": 1.1085076966021257,
"learning_rate": 3.836440098617611e-05,
"loss": 0.0352476716041565,
"memory(GiB)": 36.91,
"step": 915,
"token_acc": 0.9971264367816092,
"train_speed(iter/s)": 0.016185
},
{
"epoch": 2.9937055837563453,
"grad_norm": 1.0414881730973389,
"learning_rate": 3.784119366064293e-05,
"loss": 0.036097651720046996,
"memory(GiB)": 36.91,
"step": 920,
"token_acc": 0.9859484777517564,
"train_speed(iter/s)": 0.016183
},
{
"epoch": 2.996954314720812,
"eval_loss": 0.2438431978225708,
"eval_runtime": 61.9093,
"eval_samples_per_second": 3.198,
"eval_steps_per_second": 0.808,
"eval_token_acc": 0.9426152398871119,
"step": 921
},
{
"epoch": 3.0123857868020303,
"grad_norm": 0.40292122284066784,
"learning_rate": 3.731939760945423e-05,
"loss": 0.02739437222480774,
"memory(GiB)": 36.91,
"step": 925,
"token_acc": 0.9686609686609686,
"train_speed(iter/s)": 0.016163
},
{
"epoch": 3.0286294416243655,
"grad_norm": 2.9493043319197345,
"learning_rate": 3.6799073397550324e-05,
"loss": 0.023541851341724394,
"memory(GiB)": 36.91,
"step": 930,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016168
},
{
"epoch": 3.0448730964467003,
"grad_norm": 0.17930096671859505,
"learning_rate": 3.628028141903493e-05,
"loss": 0.011585032194852829,
"memory(GiB)": 36.91,
"step": 935,
"token_acc": 0.9955849889624724,
"train_speed(iter/s)": 0.016168
},
{
"epoch": 3.0611167512690356,
"grad_norm": 0.32421421634457975,
"learning_rate": 3.576308189016521e-05,
"loss": 0.01218060329556465,
"memory(GiB)": 36.91,
"step": 940,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016169
},
{
"epoch": 3.0773604060913704,
"grad_norm": 0.6594419595560748,
"learning_rate": 3.5247534842362486e-05,
"loss": 0.02207506597042084,
"memory(GiB)": 36.91,
"step": 945,
"token_acc": 0.988558352402746,
"train_speed(iter/s)": 0.016162
},
{
"epoch": 3.0936040609137057,
"grad_norm": 0.2767332960437252,
"learning_rate": 3.473370011524435e-05,
"loss": 0.007218687236309052,
"memory(GiB)": 36.91,
"step": 950,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016165
},
{
"epoch": 3.1098477157360405,
"grad_norm": 0.35071543831944074,
"learning_rate": 3.422163734967913e-05,
"loss": 0.01153595745563507,
"memory(GiB)": 36.91,
"step": 955,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016173
},
{
"epoch": 3.1260913705583757,
"grad_norm": 0.09053944993100493,
"learning_rate": 3.371140598086332e-05,
"loss": 0.0028192587196826935,
"memory(GiB)": 36.91,
"step": 960,
"token_acc": 0.9975247524752475,
"train_speed(iter/s)": 0.016172
},
{
"epoch": 3.1423350253807105,
"grad_norm": 0.2428779518534084,
"learning_rate": 3.3203065231422904e-05,
"loss": 0.0033150166273117065,
"memory(GiB)": 36.91,
"step": 965,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016172
},
{
"epoch": 3.1585786802030458,
"grad_norm": 0.3634314044068558,
"learning_rate": 3.269667410453944e-05,
"loss": 0.006601892411708832,
"memory(GiB)": 36.91,
"step": 970,
"token_acc": 0.9974160206718347,
"train_speed(iter/s)": 0.016171
},
{
"epoch": 3.1748223350253806,
"grad_norm": 0.09528591509222967,
"learning_rate": 3.2192291377101544e-05,
"loss": 0.006571587175130844,
"memory(GiB)": 36.91,
"step": 975,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016173
},
{
"epoch": 3.191065989847716,
"grad_norm": 1.3857004471442305,
"learning_rate": 3.1689975592882603e-05,
"loss": 0.010420820116996765,
"memory(GiB)": 36.91,
"step": 980,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016176
},
{
"epoch": 3.2073096446700506,
"grad_norm": 0.6960737288379213,
"learning_rate": 3.11897850557456e-05,
"loss": 0.013220900297164917,
"memory(GiB)": 36.91,
"step": 985,
"token_acc": 0.9951807228915662,
"train_speed(iter/s)": 0.016176
},
{
"epoch": 3.223553299492386,
"grad_norm": 0.9453732221306024,
"learning_rate": 3.0691777822875846e-05,
"loss": 0.01793895959854126,
"memory(GiB)": 36.91,
"step": 990,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016176
},
{
"epoch": 3.2397969543147207,
"grad_norm": 0.7409992990444315,
"learning_rate": 3.019601169804216e-05,
"loss": 0.019229742884635925,
"memory(GiB)": 36.91,
"step": 995,
"token_acc": 0.9945054945054945,
"train_speed(iter/s)": 0.016174
},
{
"epoch": 3.256040609137056,
"grad_norm": 0.5679417621370911,
"learning_rate": 2.9702544224887684e-05,
"loss": 0.024555668234825134,
"memory(GiB)": 36.91,
"step": 1000,
"token_acc": 0.9953161592505855,
"train_speed(iter/s)": 0.016175
},
{
"epoch": 3.2722842639593908,
"grad_norm": 0.08818412948467023,
"learning_rate": 2.9211432680250717e-05,
"loss": 0.009600495547056198,
"memory(GiB)": 36.91,
"step": 1005,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016176
},
{
"epoch": 3.288527918781726,
"grad_norm": 0.597788232010352,
"learning_rate": 2.872273406751664e-05,
"loss": 0.015477313101291657,
"memory(GiB)": 36.91,
"step": 1010,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016178
},
{
"epoch": 3.304771573604061,
"grad_norm": 0.5815875303347526,
"learning_rate": 2.823650511000142e-05,
"loss": 0.007314224541187286,
"memory(GiB)": 36.91,
"step": 1015,
"token_acc": 0.9928741092636579,
"train_speed(iter/s)": 0.016175
},
{
"epoch": 3.321015228426396,
"grad_norm": 0.06303638116527722,
"learning_rate": 2.7752802244367875e-05,
"loss": 0.0048162821680307385,
"memory(GiB)": 36.91,
"step": 1020,
"token_acc": 0.9976359338061466,
"train_speed(iter/s)": 0.016175
},
{
"epoch": 3.337258883248731,
"grad_norm": 1.530822467857818,
"learning_rate": 2.7271681614074973e-05,
"loss": 0.011756302416324615,
"memory(GiB)": 36.91,
"step": 1025,
"token_acc": 0.9976744186046511,
"train_speed(iter/s)": 0.016173
},
{
"epoch": 3.353502538071066,
"grad_norm": 0.03790601751186608,
"learning_rate": 2.679319906286122e-05,
"loss": 0.008612405508756638,
"memory(GiB)": 36.91,
"step": 1030,
"token_acc": 0.9927184466019418,
"train_speed(iter/s)": 0.016176
},
{
"epoch": 3.369746192893401,
"grad_norm": 0.21401768725028367,
"learning_rate": 2.6317410128262954e-05,
"loss": 0.006316320598125457,
"memory(GiB)": 36.91,
"step": 1035,
"token_acc": 0.9950124688279302,
"train_speed(iter/s)": 0.016179
},
{
"epoch": 3.385989847715736,
"grad_norm": 0.19540220508166592,
"learning_rate": 2.5844370035168073e-05,
"loss": 0.004939628392457962,
"memory(GiB)": 36.91,
"step": 1040,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016181
},
{
"epoch": 3.402233502538071,
"grad_norm": 0.8965894055639708,
"learning_rate": 2.537413368940601e-05,
"loss": 0.016151268780231477,
"memory(GiB)": 36.91,
"step": 1045,
"token_acc": 0.9898785425101214,
"train_speed(iter/s)": 0.016182
},
{
"epoch": 3.4184771573604062,
"grad_norm": 0.21427146738429803,
"learning_rate": 2.4906755671374903e-05,
"loss": 0.010773959755897521,
"memory(GiB)": 36.91,
"step": 1050,
"token_acc": 0.9977827050997783,
"train_speed(iter/s)": 0.016182
},
{
"epoch": 3.434720812182741,
"grad_norm": 0.09286838269357345,
"learning_rate": 2.4442290229706344e-05,
"loss": 0.004091666638851165,
"memory(GiB)": 36.91,
"step": 1055,
"token_acc": 0.9954233409610984,
"train_speed(iter/s)": 0.016183
},
{
"epoch": 3.4509644670050763,
"grad_norm": 0.13489614133107514,
"learning_rate": 2.3980791274968837e-05,
"loss": 0.018990179896354674,
"memory(GiB)": 36.91,
"step": 1060,
"token_acc": 0.9945054945054945,
"train_speed(iter/s)": 0.016184
},
{
"epoch": 3.467208121827411,
"grad_norm": 0.1825955700626613,
"learning_rate": 2.3522312373410276e-05,
"loss": 0.011526491492986679,
"memory(GiB)": 36.91,
"step": 1065,
"token_acc": 0.997275204359673,
"train_speed(iter/s)": 0.016188
},
{
"epoch": 3.4834517766497464,
"grad_norm": 0.2440094791459664,
"learning_rate": 2.3066906740740623e-05,
"loss": 0.019795812666416168,
"memory(GiB)": 36.91,
"step": 1070,
"token_acc": 0.9896373056994818,
"train_speed(iter/s)": 0.016187
},
{
"epoch": 3.499695431472081,
"grad_norm": 0.4913730237430669,
"learning_rate": 2.2614627235955026e-05,
"loss": 0.007270602881908417,
"memory(GiB)": 36.91,
"step": 1075,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016189
},
{
"epoch": 3.5159390862944164,
"grad_norm": 0.6922284750457558,
"learning_rate": 2.2165526355198605e-05,
"loss": 0.0127563938498497,
"memory(GiB)": 36.91,
"step": 1080,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016191
},
{
"epoch": 3.5321827411167512,
"grad_norm": 0.6450602563278425,
"learning_rate": 2.171965622567308e-05,
"loss": 0.007853203266859055,
"memory(GiB)": 36.91,
"step": 1085,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016193
},
{
"epoch": 3.548426395939086,
"grad_norm": 0.3234875973475892,
"learning_rate": 2.127706859958647e-05,
"loss": 0.008352670073509216,
"memory(GiB)": 36.91,
"step": 1090,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016193
},
{
"epoch": 3.5646700507614213,
"grad_norm": 0.09371017997182811,
"learning_rate": 2.0837814848146166e-05,
"loss": 0.001982194371521473,
"memory(GiB)": 36.91,
"step": 1095,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016191
},
{
"epoch": 3.5809137055837565,
"grad_norm": 0.8724610494447905,
"learning_rate": 2.0401945955596206e-05,
"loss": 0.0030656153336167335,
"memory(GiB)": 36.91,
"step": 1100,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016188
},
{
"epoch": 3.5971573604060914,
"grad_norm": 0.5650605008223917,
"learning_rate": 1.9969512513299664e-05,
"loss": 0.00554112084209919,
"memory(GiB)": 36.91,
"step": 1105,
"token_acc": 1.0,
"train_speed(iter/s)": 0.01619
},
{
"epoch": 3.613401015228426,
"grad_norm": 0.39939968413297244,
"learning_rate": 1.9540564713866387e-05,
"loss": 0.006034587323665619,
"memory(GiB)": 36.91,
"step": 1110,
"token_acc": 0.9948586118251928,
"train_speed(iter/s)": 0.016191
},
{
"epoch": 3.6296446700507614,
"grad_norm": 0.1065247660653177,
"learning_rate": 1.9115152345327152e-05,
"loss": 0.005482121184468269,
"memory(GiB)": 36.91,
"step": 1115,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016191
},
{
"epoch": 3.6458883248730967,
"grad_norm": 0.8174090560458377,
"learning_rate": 1.8693324785354822e-05,
"loss": 0.011324305832386018,
"memory(GiB)": 36.91,
"step": 1120,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016193
},
{
"epoch": 3.6621319796954315,
"grad_norm": 0.17850770204119407,
"learning_rate": 1.8275130995532974e-05,
"loss": 0.0144767165184021,
"memory(GiB)": 36.91,
"step": 1125,
"token_acc": 0.9978586723768736,
"train_speed(iter/s)": 0.016195
},
{
"epoch": 3.6783756345177663,
"grad_norm": 0.33877743892749795,
"learning_rate": 1.7860619515673033e-05,
"loss": 0.01116895154118538,
"memory(GiB)": 36.91,
"step": 1130,
"token_acc": 0.9953271028037384,
"train_speed(iter/s)": 0.016195
},
{
"epoch": 3.6946192893401015,
"grad_norm": 0.5168488777536275,
"learning_rate": 1.744983845818019e-05,
"loss": 0.0068625412881374356,
"memory(GiB)": 36.91,
"step": 1135,
"token_acc": 0.9978213507625272,
"train_speed(iter/s)": 0.0162
},
{
"epoch": 3.710862944162437,
"grad_norm": 0.7346145409084535,
"learning_rate": 1.7042835502468934e-05,
"loss": 0.002322973683476448,
"memory(GiB)": 36.91,
"step": 1140,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016197
},
{
"epoch": 3.7271065989847716,
"grad_norm": 0.6646625028373466,
"learning_rate": 1.6639657889429018e-05,
"loss": 0.018248292803764343,
"memory(GiB)": 36.91,
"step": 1145,
"token_acc": 0.9840182648401826,
"train_speed(iter/s)": 0.016195
},
{
"epoch": 3.7433502538071064,
"grad_norm": 0.8354437881107281,
"learning_rate": 1.624035241594213e-05,
"loss": 0.006459401547908783,
"memory(GiB)": 36.91,
"step": 1150,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016193
},
{
"epoch": 3.7595939086294416,
"grad_norm": 0.2958093671449778,
"learning_rate": 1.5844965429450132e-05,
"loss": 0.008441635966300964,
"memory(GiB)": 36.91,
"step": 1155,
"token_acc": 0.9834368530020704,
"train_speed(iter/s)": 0.016192
},
{
"epoch": 3.775837563451777,
"grad_norm": 0.4306627690474224,
"learning_rate": 1.545354282257562e-05,
"loss": 0.015231077373027802,
"memory(GiB)": 36.91,
"step": 1160,
"token_acc": 0.9976851851851852,
"train_speed(iter/s)": 0.016196
},
{
"epoch": 3.7920812182741117,
"grad_norm": 0.0801666210860899,
"learning_rate": 1.5066130027795044e-05,
"loss": 0.02225690186023712,
"memory(GiB)": 36.91,
"step": 1165,
"token_acc": 0.9886363636363636,
"train_speed(iter/s)": 0.0162
},
{
"epoch": 3.8083248730964465,
"grad_norm": 1.390297822775598,
"learning_rate": 1.4682772012165436e-05,
"loss": 0.011767344176769256,
"memory(GiB)": 36.91,
"step": 1170,
"token_acc": 0.9953810623556582,
"train_speed(iter/s)": 0.0162
},
{
"epoch": 3.8245685279187818,
"grad_norm": 0.576269037629794,
"learning_rate": 1.4303513272105057e-05,
"loss": 0.01135575920343399,
"memory(GiB)": 36.91,
"step": 1175,
"token_acc": 0.9976744186046511,
"train_speed(iter/s)": 0.016199
},
{
"epoch": 3.840812182741117,
"grad_norm": 0.6175307257021349,
"learning_rate": 1.3928397828228628e-05,
"loss": 0.00802643597126007,
"memory(GiB)": 36.91,
"step": 1180,
"token_acc": 0.9950738916256158,
"train_speed(iter/s)": 0.016201
},
{
"epoch": 3.857055837563452,
"grad_norm": 0.13098006216818975,
"learning_rate": 1.3557469220237962e-05,
"loss": 0.011502113938331605,
"memory(GiB)": 36.91,
"step": 1185,
"token_acc": 0.9935344827586207,
"train_speed(iter/s)": 0.016204
},
{
"epoch": 3.8732994923857866,
"grad_norm": 0.3987654668677921,
"learning_rate": 1.3190770501868243e-05,
"loss": 0.011363585293293,
"memory(GiB)": 36.91,
"step": 1190,
"token_acc": 0.9974160206718347,
"train_speed(iter/s)": 0.016203
},
{
"epoch": 3.889543147208122,
"grad_norm": 0.14976124575026759,
"learning_rate": 1.2828344235890726e-05,
"loss": 0.01089974120259285,
"memory(GiB)": 36.91,
"step": 1195,
"token_acc": 0.9933481152993349,
"train_speed(iter/s)": 0.016203
},
{
"epoch": 3.9057868020304567,
"grad_norm": 1.5199866835408566,
"learning_rate": 1.247023248917259e-05,
"loss": 0.009822697192430497,
"memory(GiB)": 36.91,
"step": 1200,
"token_acc": 0.9929742388758782,
"train_speed(iter/s)": 0.016204
},
{
"epoch": 3.922030456852792,
"grad_norm": 1.6580131250235997,
"learning_rate": 1.2116476827794104e-05,
"loss": 0.024014970660209654,
"memory(GiB)": 36.91,
"step": 1205,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016204
},
{
"epoch": 3.9382741116751268,
"grad_norm": 1.157754837023021,
"learning_rate": 1.1767118312224151e-05,
"loss": 0.007532584667205811,
"memory(GiB)": 36.91,
"step": 1210,
"token_acc": 0.9972375690607734,
"train_speed(iter/s)": 0.016207
},
{
"epoch": 3.954517766497462,
"grad_norm": 0.6972765226059477,
"learning_rate": 1.142219749255427e-05,
"loss": 0.004430451989173889,
"memory(GiB)": 36.91,
"step": 1215,
"token_acc": 0.9972677595628415,
"train_speed(iter/s)": 0.016207
},
{
"epoch": 3.970761421319797,
"grad_norm": 2.2979580480692188,
"learning_rate": 1.1081754403791999e-05,
"loss": 0.015141716599464417,
"memory(GiB)": 36.91,
"step": 1220,
"token_acc": 0.9954337899543378,
"train_speed(iter/s)": 0.016206
},
{
"epoch": 3.987005076142132,
"grad_norm": 0.2965970510784761,
"learning_rate": 1.0745828561214056e-05,
"loss": 0.021216361224651335,
"memory(GiB)": 36.91,
"step": 1225,
"token_acc": 0.9954337899543378,
"train_speed(iter/s)": 0.016206
},
{
"epoch": 3.996751269035533,
"eval_loss": 0.29802748560905457,
"eval_runtime": 62.08,
"eval_samples_per_second": 3.189,
"eval_steps_per_second": 0.805,
"eval_token_acc": 0.9388523047977423,
"step": 1228
},
{
"epoch": 4.0056852791878175,
"grad_norm": 0.7419564842144963,
"learning_rate": 1.041445895577977e-05,
"loss": 0.009254975616931916,
"memory(GiB)": 36.91,
"step": 1230,
"token_acc": 0.9668174962292609,
"train_speed(iter/s)": 0.016191
},
{
"epoch": 4.021928934010153,
"grad_norm": 0.1343462548929871,
"learning_rate": 1.008768404960535e-05,
"loss": 0.002759779617190361,
"memory(GiB)": 36.91,
"step": 1235,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016192
},
{
"epoch": 4.038172588832487,
"grad_norm": 0.08229350773537837,
"learning_rate": 9.765541771499659e-06,
"loss": 0.0012123636901378632,
"memory(GiB)": 36.91,
"step": 1240,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016192
},
{
"epoch": 4.054416243654822,
"grad_norm": 0.08190000464747839,
"learning_rate": 9.448069512561775e-06,
"loss": 0.0066297553479671475,
"memory(GiB)": 36.91,
"step": 1245,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016195
},
{
"epoch": 4.070659898477158,
"grad_norm": 0.12397302242146173,
"learning_rate": 9.135304121840976e-06,
"loss": 0.0012923330999910832,
"memory(GiB)": 36.91,
"step": 1250,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016192
},
{
"epoch": 4.086903553299492,
"grad_norm": 0.057048418793994596,
"learning_rate": 8.827281902059698e-06,
"loss": 0.0007107659243047237,
"memory(GiB)": 36.91,
"step": 1255,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016191
},
{
"epoch": 4.103147208121827,
"grad_norm": 0.16324844745357645,
"learning_rate": 8.524038605399886e-06,
"loss": 0.0021383626386523246,
"memory(GiB)": 36.91,
"step": 1260,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016189
},
{
"epoch": 4.1193908629441625,
"grad_norm": 0.06874787839714207,
"learning_rate": 8.225609429353187e-06,
"loss": 0.0028022559359669684,
"memory(GiB)": 36.91,
"step": 1265,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016188
},
{
"epoch": 4.135634517766498,
"grad_norm": 0.2526140368602798,
"learning_rate": 7.932029012635623e-06,
"loss": 0.003260459750890732,
"memory(GiB)": 36.91,
"step": 1270,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016187
},
{
"epoch": 4.151878172588832,
"grad_norm": 0.14918347721067196,
"learning_rate": 7.643331431167017e-06,
"loss": 0.004188637435436249,
"memory(GiB)": 36.91,
"step": 1275,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016188
},
{
"epoch": 4.168121827411167,
"grad_norm": 0.46928271799249704,
"learning_rate": 7.35955019411585e-06,
"loss": 0.011932872980833054,
"memory(GiB)": 36.91,
"step": 1280,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016191
},
{
"epoch": 4.184365482233503,
"grad_norm": 0.07080459315091195,
"learning_rate": 7.080718240009826e-06,
"loss": 0.004019932448863983,
"memory(GiB)": 36.91,
"step": 1285,
"token_acc": 1.0,
"train_speed(iter/s)": 0.01619
},
{
"epoch": 4.200609137055838,
"grad_norm": 0.7271340874397169,
"learning_rate": 6.806867932912653e-06,
"loss": 0.0061328854411840435,
"memory(GiB)": 36.91,
"step": 1290,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016194
},
{
"epoch": 4.216852791878172,
"grad_norm": 0.1265328539578886,
"learning_rate": 6.53803105866761e-06,
"loss": 0.006417517364025116,
"memory(GiB)": 36.91,
"step": 1295,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016196
},
{
"epoch": 4.233096446700507,
"grad_norm": 0.057288978819073196,
"learning_rate": 6.274238821208128e-06,
"loss": 0.003987757861614228,
"memory(GiB)": 36.91,
"step": 1300,
"token_acc": 0.9975062344139651,
"train_speed(iter/s)": 0.016195
},
{
"epoch": 4.249340101522843,
"grad_norm": 0.1481683428098521,
"learning_rate": 6.015521838935905e-06,
"loss": 0.0010721445083618163,
"memory(GiB)": 36.91,
"step": 1305,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016194
},
{
"epoch": 4.265583756345178,
"grad_norm": 0.10590383120253814,
"learning_rate": 5.7619101411671095e-06,
"loss": 0.002213609591126442,
"memory(GiB)": 36.91,
"step": 1310,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016195
},
{
"epoch": 4.281827411167512,
"grad_norm": 0.04714189372424805,
"learning_rate": 5.513433164646814e-06,
"loss": 0.0011348580941557885,
"memory(GiB)": 36.91,
"step": 1315,
"token_acc": 0.9976689976689976,
"train_speed(iter/s)": 0.016199
},
{
"epoch": 4.298071065989848,
"grad_norm": 0.476391282204877,
"learning_rate": 5.270119750132258e-06,
"loss": 0.004196888953447342,
"memory(GiB)": 36.91,
"step": 1320,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016201
},
{
"epoch": 4.314314720812183,
"grad_norm": 0.35042552841819846,
"learning_rate": 5.031998139045352e-06,
"loss": 0.0034095611423254012,
"memory(GiB)": 36.91,
"step": 1325,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016203
},
{
"epoch": 4.330558375634518,
"grad_norm": 0.05524764971116243,
"learning_rate": 4.799095970194628e-06,
"loss": 0.0037711452692747115,
"memory(GiB)": 36.91,
"step": 1330,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016203
},
{
"epoch": 4.346802030456852,
"grad_norm": 0.5445980593755461,
"learning_rate": 4.571440276567257e-06,
"loss": 0.0024499524384737014,
"memory(GiB)": 36.91,
"step": 1335,
"token_acc": 0.997624703087886,
"train_speed(iter/s)": 0.016206
},
{
"epoch": 4.363045685279188,
"grad_norm": 0.10598886435572437,
"learning_rate": 4.349057482191299e-06,
"loss": 0.004410183429718018,
"memory(GiB)": 36.91,
"step": 1340,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016206
},
{
"epoch": 4.379289340101523,
"grad_norm": 0.04699969388550453,
"learning_rate": 4.1319733990686446e-06,
"loss": 0.0011100947856903076,
"memory(GiB)": 36.91,
"step": 1345,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016205
},
{
"epoch": 4.395532994923858,
"grad_norm": 0.017045928815902597,
"learning_rate": 3.920213224179042e-06,
"loss": 0.00034863052424043416,
"memory(GiB)": 36.91,
"step": 1350,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016206
},
{
"epoch": 4.4117766497461925,
"grad_norm": 0.7161935581935048,
"learning_rate": 3.7138015365554833e-06,
"loss": 0.0035605177283287047,
"memory(GiB)": 36.91,
"step": 1355,
"token_acc": 0.9977220956719818,
"train_speed(iter/s)": 0.016207
},
{
"epoch": 4.428020304568528,
"grad_norm": 0.06887525802872778,
"learning_rate": 3.512762294431271e-06,
"loss": 0.006134101003408432,
"memory(GiB)": 36.91,
"step": 1360,
"token_acc": 0.9975186104218362,
"train_speed(iter/s)": 0.016208
},
{
"epoch": 4.444263959390863,
"grad_norm": 0.041826315852571724,
"learning_rate": 3.3171188324592427e-06,
"loss": 0.0012344198301434516,
"memory(GiB)": 36.91,
"step": 1365,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016208
},
{
"epoch": 4.460507614213198,
"grad_norm": 0.07787992465189252,
"learning_rate": 3.126893859003249e-06,
"loss": 0.0013754777610301971,
"memory(GiB)": 36.91,
"step": 1370,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016208
},
{
"epoch": 4.476751269035533,
"grad_norm": 0.9611581457799497,
"learning_rate": 2.9421094535024507e-06,
"loss": 0.004121043905615807,
"memory(GiB)": 36.91,
"step": 1375,
"token_acc": 0.9933920704845814,
"train_speed(iter/s)": 0.016206
},
{
"epoch": 4.492994923857868,
"grad_norm": 0.11072593270596472,
"learning_rate": 2.762787063908523e-06,
"loss": 0.0024029091000556946,
"memory(GiB)": 36.91,
"step": 1380,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016205
},
{
"epoch": 4.509238578680203,
"grad_norm": 0.02340550565254115,
"learning_rate": 2.5889475041961765e-06,
"loss": 0.001028289459645748,
"memory(GiB)": 36.91,
"step": 1385,
"token_acc": 1.0,
"train_speed(iter/s)": 0.01621
},
{
"epoch": 4.525482233502538,
"grad_norm": 0.08895116218405089,
"learning_rate": 2.4206109519473163e-06,
"loss": 0.0021161407232284544,
"memory(GiB)": 36.91,
"step": 1390,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016211
},
{
"epoch": 4.541725888324873,
"grad_norm": 0.24076601170504602,
"learning_rate": 2.2577969460089997e-06,
"loss": 0.0007429494522511959,
"memory(GiB)": 36.91,
"step": 1395,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.557969543147208,
"grad_norm": 0.19664829308024404,
"learning_rate": 2.100524384225555e-06,
"loss": 0.0008249727077782154,
"memory(GiB)": 36.91,
"step": 1400,
"token_acc": 1.0,
"train_speed(iter/s)": 0.01621
},
{
"epoch": 4.574213197969543,
"grad_norm": 0.06599531052332817,
"learning_rate": 1.948811521245131e-06,
"loss": 0.000786225963383913,
"memory(GiB)": 36.91,
"step": 1405,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016209
},
{
"epoch": 4.5904568527918785,
"grad_norm": 0.10702737644857346,
"learning_rate": 1.8026759664008465e-06,
"loss": 0.003063713386654854,
"memory(GiB)": 36.91,
"step": 1410,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016207
},
{
"epoch": 4.606700507614213,
"grad_norm": 0.41678449867799244,
"learning_rate": 1.6621346816668992e-06,
"loss": 0.00532943345606327,
"memory(GiB)": 36.91,
"step": 1415,
"token_acc": 0.9937629937629938,
"train_speed(iter/s)": 0.016207
},
{
"epoch": 4.622944162436548,
"grad_norm": 0.029982460463042173,
"learning_rate": 1.5272039796897786e-06,
"loss": 0.0017097776755690575,
"memory(GiB)": 36.91,
"step": 1420,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016209
},
{
"epoch": 4.639187817258883,
"grad_norm": 0.03591858354249925,
"learning_rate": 1.397899521894841e-06,
"loss": 0.0013645312748849392,
"memory(GiB)": 36.91,
"step": 1425,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.655431472081219,
"grad_norm": 0.04773799774300644,
"learning_rate": 1.2742363166685034e-06,
"loss": 0.0009639391675591469,
"memory(GiB)": 36.91,
"step": 1430,
"token_acc": 1.0,
"train_speed(iter/s)": 0.01621
},
{
"epoch": 4.671675126903553,
"grad_norm": 0.129000803673704,
"learning_rate": 1.15622871761622e-06,
"loss": 0.0005136763211339712,
"memory(GiB)": 36.91,
"step": 1435,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.687918781725888,
"grad_norm": 0.029179325549530243,
"learning_rate": 1.0438904218964319e-06,
"loss": 0.0004105303902179003,
"memory(GiB)": 36.91,
"step": 1440,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016213
},
{
"epoch": 4.7041624365482235,
"grad_norm": 0.04897256940654327,
"learning_rate": 9.372344686307655e-07,
"loss": 0.0009922079741954803,
"memory(GiB)": 36.91,
"step": 1445,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.720406091370558,
"grad_norm": 0.0393178010532892,
"learning_rate": 8.362732373905723e-07,
"loss": 0.0008288329467177391,
"memory(GiB)": 36.91,
"step": 1450,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016214
},
{
"epoch": 4.736649746192893,
"grad_norm": 0.08771738931354985,
"learning_rate": 7.410184467600001e-07,
"loss": 0.0005111692938953638,
"memory(GiB)": 36.91,
"step": 1455,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016214
},
{
"epoch": 4.752893401015228,
"grad_norm": 0.04916799951696976,
"learning_rate": 6.514811529758747e-07,
"loss": 0.007441927492618561,
"memory(GiB)": 36.91,
"step": 1460,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016214
},
{
"epoch": 4.769137055837564,
"grad_norm": 0.44716598217302617,
"learning_rate": 5.676717486443439e-07,
"loss": 0.0024275451898574827,
"memory(GiB)": 36.91,
"step": 1465,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016214
},
{
"epoch": 4.785380710659899,
"grad_norm": 0.12117859136787597,
"learning_rate": 4.895999615346314e-07,
"loss": 0.001637093722820282,
"memory(GiB)": 36.91,
"step": 1470,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016214
},
{
"epoch": 4.801624365482233,
"grad_norm": 0.01706819131966345,
"learning_rate": 4.1727485344994486e-07,
"loss": 0.0003483247943222523,
"memory(GiB)": 36.91,
"step": 1475,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.8178680203045685,
"grad_norm": 0.04859669108953238,
"learning_rate": 3.507048191756401e-07,
"loss": 0.0021356761455535887,
"memory(GiB)": 36.91,
"step": 1480,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.834111675126904,
"grad_norm": 0.03682429514387162,
"learning_rate": 2.8989758550487245e-07,
"loss": 0.0021858945488929748,
"memory(GiB)": 36.91,
"step": 1485,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016213
},
{
"epoch": 4.850355329949238,
"grad_norm": 0.06507640939116277,
"learning_rate": 2.3486021034170857e-07,
"loss": 0.002923069894313812,
"memory(GiB)": 36.91,
"step": 1490,
"token_acc": 0.9977064220183486,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.866598984771573,
"grad_norm": 0.04259804746440851,
"learning_rate": 1.8559908188195418e-07,
"loss": 0.0019719479605555534,
"memory(GiB)": 36.91,
"step": 1495,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.882842639593909,
"grad_norm": 0.25393381486977334,
"learning_rate": 1.4211991787164147e-07,
"loss": 0.0011512625962495804,
"memory(GiB)": 36.91,
"step": 1500,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016215
},
{
"epoch": 4.899086294416244,
"grad_norm": 0.21720000107148496,
"learning_rate": 1.044277649433989e-07,
"loss": 0.003379678726196289,
"memory(GiB)": 36.91,
"step": 1505,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016213
},
{
"epoch": 4.915329949238579,
"grad_norm": 0.6636335728606932,
"learning_rate": 7.252699803065311e-08,
"loss": 0.014554958045482635,
"memory(GiB)": 36.91,
"step": 1510,
"token_acc": 0.9886792452830189,
"train_speed(iter/s)": 0.016211
},
{
"epoch": 4.9315736040609135,
"grad_norm": 0.042674818413491626,
"learning_rate": 4.6421319859862864e-08,
"loss": 0.0024311095476150513,
"memory(GiB)": 36.91,
"step": 1515,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016212
},
{
"epoch": 4.947817258883249,
"grad_norm": 0.07981897617268605,
"learning_rate": 2.6113760520735108e-08,
"loss": 0.0024462098255753515,
"memory(GiB)": 36.91,
"step": 1520,
"token_acc": 1.0,
"train_speed(iter/s)": 0.01621
},
{
"epoch": 4.964060913705584,
"grad_norm": 0.01695528976036472,
"learning_rate": 1.1606677114500697e-08,
"loss": 0.011407441645860671,
"memory(GiB)": 36.91,
"step": 1525,
"token_acc": 1.0,
"train_speed(iter/s)": 0.01621
},
{
"epoch": 4.980304568527918,
"grad_norm": 0.05383783400729952,
"learning_rate": 2.901753480361036e-09,
"loss": 0.005226198583841324,
"memory(GiB)": 36.91,
"step": 1530,
"token_acc": 0.9956521739130435,
"train_speed(iter/s)": 0.01621
},
{
"epoch": 4.996548223350254,
"grad_norm": 0.9774296313594534,
"learning_rate": 0.0,
"loss": 0.003532126545906067,
"memory(GiB)": 36.91,
"step": 1535,
"token_acc": 1.0,
"train_speed(iter/s)": 0.016209
},
{
"epoch": 4.996548223350254,
"eval_loss": 0.31882038712501526,
"eval_runtime": 62.2556,
"eval_samples_per_second": 3.18,
"eval_steps_per_second": 0.803,
"eval_token_acc": 0.9397930385700847,
"step": 1535
},
{
"epoch": 4.996548223350254,
"eval_loss": 0.31882038712501526,
"eval_runtime": 62.6813,
"eval_samples_per_second": 3.159,
"eval_steps_per_second": 0.798,
"eval_token_acc": 0.9397930385700847,
"step": 1535
}
],
"logging_steps": 5,
"max_steps": 1535,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.2119246482890555e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}