{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 91.78527607361963,
"eval_steps": 2,
"global_step": 92,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.7852760736196319,
"grad_norm": 2.828015891236112,
"learning_rate": 5.000000000000001e-07,
"loss": 0.7344894409179688,
"memory(GiB)": 33.35,
"step": 1,
"token_acc": 0.8210270474011879,
"train_speed(iter/s)": 0.01508
},
{
"epoch": 1.7852760736196318,
"grad_norm": 5.560922634626607,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.4670829772949219,
"memory(GiB)": 39.33,
"step": 2,
"token_acc": 0.8254568857425018,
"train_speed(iter/s)": 0.015066
},
{
"epoch": 2.785276073619632,
"grad_norm": 5.459222412898219,
"learning_rate": 1.5e-06,
"loss": 1.452088713645935,
"memory(GiB)": 39.33,
"step": 3,
"token_acc": 0.8195757791207489,
"train_speed(iter/s)": 0.014624
},
{
"epoch": 3.785276073619632,
"grad_norm": 5.463168612488766,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.4775556325912476,
"memory(GiB)": 39.33,
"step": 4,
"token_acc": 0.823338222352596,
"train_speed(iter/s)": 0.014761
},
{
"epoch": 4.785276073619632,
"grad_norm": 5.608488489325384,
"learning_rate": 2.5e-06,
"loss": 1.4884722232818604,
"memory(GiB)": 39.33,
"step": 5,
"token_acc": 0.8191892680081656,
"train_speed(iter/s)": 0.014538
},
{
"epoch": 5.785276073619632,
"grad_norm": 5.119511687917419,
"learning_rate": 3e-06,
"loss": 1.4214377403259277,
"memory(GiB)": 39.33,
"step": 6,
"token_acc": 0.8322302158273381,
"train_speed(iter/s)": 0.014634
},
{
"epoch": 6.785276073619632,
"grad_norm": 5.315421901389787,
"learning_rate": 3.5e-06,
"loss": 1.4595433473587036,
"memory(GiB)": 39.33,
"step": 7,
"token_acc": 0.8237858651099242,
"train_speed(iter/s)": 0.014497
},
{
"epoch": 7.785276073619632,
"grad_norm": 4.5538985334897495,
"learning_rate": 4.000000000000001e-06,
"loss": 1.4617054462432861,
"memory(GiB)": 39.33,
"step": 8,
"token_acc": 0.8285934756595355,
"train_speed(iter/s)": 0.014575
},
{
"epoch": 8.785276073619633,
"grad_norm": 4.50876794473483,
"learning_rate": 4.5e-06,
"loss": 1.3948967456817627,
"memory(GiB)": 39.33,
"step": 9,
"token_acc": 0.8302743891270103,
"train_speed(iter/s)": 0.01447
},
{
"epoch": 9.785276073619633,
"grad_norm": 3.214342485444024,
"learning_rate": 5e-06,
"loss": 1.2821123600006104,
"memory(GiB)": 39.33,
"step": 10,
"token_acc": 0.8313403718154694,
"train_speed(iter/s)": 0.014524
},
{
"epoch": 10.785276073619633,
"grad_norm": 2.8792276792731872,
"learning_rate": 4.999658262481173e-06,
"loss": 1.275630235671997,
"memory(GiB)": 39.33,
"step": 11,
"token_acc": 0.8285017000556812,
"train_speed(iter/s)": 0.014445
},
{
"epoch": 11.785276073619633,
"grad_norm": 3.0281873560029338,
"learning_rate": 4.998633143352315e-06,
"loss": 1.2036690711975098,
"memory(GiB)": 39.33,
"step": 12,
"token_acc": 0.8379109677897624,
"train_speed(iter/s)": 0.014493
},
{
"epoch": 12.785276073619633,
"grad_norm": 3.0371512203857205,
"learning_rate": 4.9969249228707625e-06,
"loss": 1.2084441184997559,
"memory(GiB)": 39.33,
"step": 13,
"token_acc": 0.8384220321813085,
"train_speed(iter/s)": 0.014427
},
{
"epoch": 13.785276073619633,
"grad_norm": 2.6387426090200363,
"learning_rate": 4.994534068046936e-06,
"loss": 1.1901323795318604,
"memory(GiB)": 39.33,
"step": 14,
"token_acc": 0.8441223334680967,
"train_speed(iter/s)": 0.014476
},
{
"epoch": 14.785276073619633,
"grad_norm": 2.3160115440855873,
"learning_rate": 4.991461232516675e-06,
"loss": 1.141261100769043,
"memory(GiB)": 39.33,
"step": 15,
"token_acc": 0.8443658849034292,
"train_speed(iter/s)": 0.014426
},
{
"epoch": 15.785276073619633,
"grad_norm": 2.083813179452864,
"learning_rate": 4.987707256362529e-06,
"loss": 1.084287166595459,
"memory(GiB)": 39.33,
"step": 16,
"token_acc": 0.8477599213633511,
"train_speed(iter/s)": 0.014468
},
{
"epoch": 16.78527607361963,
"grad_norm": 1.9692176113541133,
"learning_rate": 4.983273165884096e-06,
"loss": 1.0419270992279053,
"memory(GiB)": 39.33,
"step": 17,
"token_acc": 0.8630328525162231,
"train_speed(iter/s)": 0.014414
},
{
"epoch": 17.78527607361963,
"grad_norm": 1.7151316178388507,
"learning_rate": 4.978160173317439e-06,
"loss": 0.9549746513366699,
"memory(GiB)": 39.33,
"step": 18,
"token_acc": 0.8722866869211904,
"train_speed(iter/s)": 0.01445
},
{
"epoch": 18.78527607361963,
"grad_norm": 1.549137117164761,
"learning_rate": 4.972369676503672e-06,
"loss": 0.9502382278442383,
"memory(GiB)": 39.33,
"step": 19,
"token_acc": 0.8675967359831535,
"train_speed(iter/s)": 0.014405
},
{
"epoch": 19.78527607361963,
"grad_norm": 1.1953412837491808,
"learning_rate": 4.965903258506806e-06,
"loss": 0.893640398979187,
"memory(GiB)": 39.33,
"step": 20,
"token_acc": 0.870425555791075,
"train_speed(iter/s)": 0.014441
},
{
"epoch": 20.78527607361963,
"grad_norm": 1.2537339963447596,
"learning_rate": 4.9587626871809564e-06,
"loss": 0.860186755657196,
"memory(GiB)": 39.33,
"step": 21,
"token_acc": 0.8714236257078563,
"train_speed(iter/s)": 0.014406
},
{
"epoch": 21.78527607361963,
"grad_norm": 1.1827204340282138,
"learning_rate": 4.950949914687024e-06,
"loss": 0.8294661641120911,
"memory(GiB)": 39.34,
"step": 22,
"token_acc": 0.8780051945745555,
"train_speed(iter/s)": 0.014437
},
{
"epoch": 22.78527607361963,
"grad_norm": 1.074496322059685,
"learning_rate": 4.942467076958999e-06,
"loss": 0.826668381690979,
"memory(GiB)": 39.34,
"step": 23,
"token_acc": 0.8780265115649939,
"train_speed(iter/s)": 0.014396
},
{
"epoch": 23.78527607361963,
"grad_norm": 0.9237880545379076,
"learning_rate": 4.933316493120015e-06,
"loss": 0.8319023847579956,
"memory(GiB)": 39.34,
"step": 24,
"token_acc": 0.8776226135060379,
"train_speed(iter/s)": 0.014423
},
{
"epoch": 24.78527607361963,
"grad_norm": 0.8450828330230173,
"learning_rate": 4.923500664848327e-06,
"loss": 0.7946324348449707,
"memory(GiB)": 39.34,
"step": 25,
"token_acc": 0.8774449657163479,
"train_speed(iter/s)": 0.01434
},
{
"epoch": 25.78527607361963,
"grad_norm": 0.8084371406940016,
"learning_rate": 4.913022275693372e-06,
"loss": 0.7982379198074341,
"memory(GiB)": 39.34,
"step": 26,
"token_acc": 0.8835678461967121,
"train_speed(iter/s)": 0.014197
},
{
"epoch": 26.78527607361963,
"grad_norm": 0.8064722550105239,
"learning_rate": 4.901884190342121e-06,
"loss": 0.7379294633865356,
"memory(GiB)": 39.34,
"step": 27,
"token_acc": 0.8820249380108911,
"train_speed(iter/s)": 0.014015
},
{
"epoch": 27.78527607361963,
"grad_norm": 0.7505443142537965,
"learning_rate": 4.890089453835894e-06,
"loss": 0.8055274486541748,
"memory(GiB)": 39.34,
"step": 28,
"token_acc": 0.8885918287235961,
"train_speed(iter/s)": 0.013867
},
{
"epoch": 28.78527607361963,
"grad_norm": 0.6119739730959108,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.7476144433021545,
"memory(GiB)": 39.34,
"step": 29,
"token_acc": 0.8924232543535493,
"train_speed(iter/s)": 0.013849
},
{
"epoch": 29.78527607361963,
"grad_norm": 0.7681036255940338,
"learning_rate": 4.864543104251587e-06,
"loss": 0.7009764909744263,
"memory(GiB)": 39.34,
"step": 30,
"token_acc": 0.8919646242859415,
"train_speed(iter/s)": 0.013827
},
{
"epoch": 30.78527607361963,
"grad_norm": 0.7104586578934021,
"learning_rate": 4.850798475290403e-06,
"loss": 0.6857994198799133,
"memory(GiB)": 39.34,
"step": 31,
"token_acc": 0.8883618146266964,
"train_speed(iter/s)": 0.013775
},
{
"epoch": 31.78527607361963,
"grad_norm": 0.6784594125219264,
"learning_rate": 4.836411161498653e-06,
"loss": 0.6878491044044495,
"memory(GiB)": 39.34,
"step": 32,
"token_acc": 0.8874739614039225,
"train_speed(iter/s)": 0.013774
},
{
"epoch": 32.785276073619634,
"grad_norm": 0.6524486859004722,
"learning_rate": 4.821385096224268e-06,
"loss": 0.6495063900947571,
"memory(GiB)": 39.34,
"step": 33,
"token_acc": 0.8972710558886793,
"train_speed(iter/s)": 0.013745
},
{
"epoch": 33.785276073619634,
"grad_norm": 0.6437780661180932,
"learning_rate": 4.8057243874434625e-06,
"loss": 0.6780753135681152,
"memory(GiB)": 39.34,
"step": 34,
"token_acc": 0.8965800092610968,
"train_speed(iter/s)": 0.013776
},
{
"epoch": 34.785276073619634,
"grad_norm": 0.6012280443982766,
"learning_rate": 4.789433316637644e-06,
"loss": 0.6937360763549805,
"memory(GiB)": 39.34,
"step": 35,
"token_acc": 0.8985454504099806,
"train_speed(iter/s)": 0.01377
},
{
"epoch": 35.785276073619634,
"grad_norm": 0.602538629390956,
"learning_rate": 4.772516337622907e-06,
"loss": 0.6754523515701294,
"memory(GiB)": 39.34,
"step": 36,
"token_acc": 0.8950226959644091,
"train_speed(iter/s)": 0.013804
},
{
"epoch": 36.785276073619634,
"grad_norm": 0.5846158635967554,
"learning_rate": 4.754978075332398e-06,
"loss": 0.6398173570632935,
"memory(GiB)": 39.34,
"step": 37,
"token_acc": 0.898102863215351,
"train_speed(iter/s)": 0.013765
},
{
"epoch": 37.785276073619634,
"grad_norm": 0.44752405527595507,
"learning_rate": 4.736823324551909e-06,
"loss": 0.6178634166717529,
"memory(GiB)": 39.34,
"step": 38,
"token_acc": 0.8983179005164953,
"train_speed(iter/s)": 0.013757
},
{
"epoch": 38.785276073619634,
"grad_norm": 0.5759115585585194,
"learning_rate": 4.71805704860903e-06,
"loss": 0.6596415638923645,
"memory(GiB)": 39.34,
"step": 39,
"token_acc": 0.9043605764785142,
"train_speed(iter/s)": 0.013748
},
{
"epoch": 39.785276073619634,
"grad_norm": 0.5251913319753604,
"learning_rate": 4.698684378016223e-06,
"loss": 0.6197866797447205,
"memory(GiB)": 39.34,
"step": 40,
"token_acc": 0.9019051278555227,
"train_speed(iter/s)": 0.013779
},
{
"epoch": 40.785276073619634,
"grad_norm": 0.5556487937376285,
"learning_rate": 4.678710609068193e-06,
"loss": 0.6387439370155334,
"memory(GiB)": 39.34,
"step": 41,
"token_acc": 0.8996385106639354,
"train_speed(iter/s)": 0.013774
},
{
"epoch": 41.785276073619634,
"grad_norm": 0.5595033551514544,
"learning_rate": 4.658141202393935e-06,
"loss": 0.5881360769271851,
"memory(GiB)": 39.34,
"step": 42,
"token_acc": 0.902147025625015,
"train_speed(iter/s)": 0.013803
},
{
"epoch": 42.785276073619634,
"grad_norm": 0.592912316638698,
"learning_rate": 4.636981781463848e-06,
"loss": 0.6058595776557922,
"memory(GiB)": 39.34,
"step": 43,
"token_acc": 0.8997946249700757,
"train_speed(iter/s)": 0.013801
},
{
"epoch": 43.785276073619634,
"grad_norm": 0.5474462324998715,
"learning_rate": 4.615238131052339e-06,
"loss": 0.5743303894996643,
"memory(GiB)": 39.34,
"step": 44,
"token_acc": 0.9094863701578192,
"train_speed(iter/s)": 0.013828
},
{
"epoch": 44.785276073619634,
"grad_norm": 0.6003410732076991,
"learning_rate": 4.592916195656322e-06,
"loss": 0.612324059009552,
"memory(GiB)": 39.34,
"step": 45,
"token_acc": 0.9116981475374416,
"train_speed(iter/s)": 0.013823
},
{
"epoch": 45.785276073619634,
"grad_norm": 0.5803467871688905,
"learning_rate": 4.570022077870051e-06,
"loss": 0.540604829788208,
"memory(GiB)": 39.34,
"step": 46,
"token_acc": 0.9066389293058044,
"train_speed(iter/s)": 0.013844
},
{
"epoch": 46.785276073619634,
"grad_norm": 0.31746472746367554,
"learning_rate": 4.546562036716732e-06,
"loss": 0.5575750470161438,
"memory(GiB)": 39.34,
"step": 47,
"token_acc": 0.911935911474843,
"train_speed(iter/s)": 0.013831
},
{
"epoch": 47.785276073619634,
"grad_norm": 0.547728341912002,
"learning_rate": 4.522542485937369e-06,
"loss": 0.5444112420082092,
"memory(GiB)": 39.34,
"step": 48,
"token_acc": 0.9140458085414321,
"train_speed(iter/s)": 0.013854
},
{
"epoch": 48.785276073619634,
"grad_norm": 0.5196821422871499,
"learning_rate": 4.497969992237312e-06,
"loss": 0.5895761847496033,
"memory(GiB)": 39.34,
"step": 49,
"token_acc": 0.9174603967193878,
"train_speed(iter/s)": 0.01385
},
{
"epoch": 49.785276073619634,
"grad_norm": 0.5165933268106758,
"learning_rate": 4.472851273490985e-06,
"loss": 0.5617235898971558,
"memory(GiB)": 39.34,
"step": 50,
"token_acc": 0.9105760761054671,
"train_speed(iter/s)": 0.013873
},
{
"epoch": 50.785276073619634,
"grad_norm": 0.5491429792142556,
"learning_rate": 4.4471931969052816e-06,
"loss": 0.5617798566818237,
"memory(GiB)": 39.34,
"step": 51,
"token_acc": 0.914954423407124,
"train_speed(iter/s)": 0.013865
},
{
"epoch": 51.785276073619634,
"grad_norm": 0.47279597803824047,
"learning_rate": 4.421002777142148e-06,
"loss": 0.5274189710617065,
"memory(GiB)": 39.34,
"step": 52,
"token_acc": 0.9109212369606242,
"train_speed(iter/s)": 0.013856
},
{
"epoch": 52.785276073619634,
"grad_norm": 0.5331682466669002,
"learning_rate": 4.394287174400838e-06,
"loss": 0.5500538945198059,
"memory(GiB)": 39.34,
"step": 53,
"token_acc": 0.9179731638418079,
"train_speed(iter/s)": 0.013816
},
{
"epoch": 53.785276073619634,
"grad_norm": 0.5015412639019045,
"learning_rate": 4.3670536924603855e-06,
"loss": 0.5379254817962646,
"memory(GiB)": 39.34,
"step": 54,
"token_acc": 0.91340303696508,
"train_speed(iter/s)": 0.013806
},
{
"epoch": 54.785276073619634,
"grad_norm": 0.5012274856872988,
"learning_rate": 4.33930977668283e-06,
"loss": 0.5354421138763428,
"memory(GiB)": 39.34,
"step": 55,
"token_acc": 0.9172959682882159,
"train_speed(iter/s)": 0.013802
},
{
"epoch": 55.785276073619634,
"grad_norm": 0.2803188316237382,
"learning_rate": 4.311063011977723e-06,
"loss": 0.5273925065994263,
"memory(GiB)": 39.34,
"step": 56,
"token_acc": 0.9197245144878079,
"train_speed(iter/s)": 0.013823
},
{
"epoch": 56.785276073619634,
"grad_norm": 0.5225252350052656,
"learning_rate": 4.282321120728493e-06,
"loss": 0.49844077229499817,
"memory(GiB)": 39.34,
"step": 57,
"token_acc": 0.9147932040501116,
"train_speed(iter/s)": 0.01382
},
{
"epoch": 57.785276073619634,
"grad_norm": 0.5016830591757713,
"learning_rate": 4.253091960681222e-06,
"loss": 0.514026939868927,
"memory(GiB)": 39.34,
"step": 58,
"token_acc": 0.914426587542329,
"train_speed(iter/s)": 0.01384
},
{
"epoch": 58.785276073619634,
"grad_norm": 0.5162114438500172,
"learning_rate": 4.2233835227964145e-06,
"loss": 0.5164660215377808,
"memory(GiB)": 39.34,
"step": 59,
"token_acc": 0.9154462464242161,
"train_speed(iter/s)": 0.013838
},
{
"epoch": 59.785276073619634,
"grad_norm": 0.5003508758002871,
"learning_rate": 4.1932039290643534e-06,
"loss": 0.5167316198348999,
"memory(GiB)": 39.34,
"step": 60,
"token_acc": 0.9177114825166848,
"train_speed(iter/s)": 0.013857
},
{
"epoch": 60.785276073619634,
"grad_norm": 0.5370771821941825,
"learning_rate": 4.162561430284621e-06,
"loss": 0.4968717098236084,
"memory(GiB)": 39.34,
"step": 61,
"token_acc": 0.9180080667900795,
"train_speed(iter/s)": 0.013854
},
{
"epoch": 61.785276073619634,
"grad_norm": 0.4703998846973088,
"learning_rate": 4.1314644038104215e-06,
"loss": 0.48470860719680786,
"memory(GiB)": 39.34,
"step": 62,
"token_acc": 0.9172036041531638,
"train_speed(iter/s)": 0.01387
},
{
"epoch": 62.785276073619634,
"grad_norm": 0.5732521467141994,
"learning_rate": 4.099921351258292e-06,
"loss": 0.4761297106742859,
"memory(GiB)": 39.34,
"step": 63,
"token_acc": 0.9202728996822739,
"train_speed(iter/s)": 0.013866
},
{
"epoch": 63.785276073619634,
"grad_norm": 0.566487845334896,
"learning_rate": 4.067940896183843e-06,
"loss": 0.4775853157043457,
"memory(GiB)": 39.34,
"step": 64,
"token_acc": 0.9162934345333743,
"train_speed(iter/s)": 0.013883
},
{
"epoch": 64.78527607361963,
"grad_norm": 0.31449277119414887,
"learning_rate": 4.0355317817241705e-06,
"loss": 0.4875542223453522,
"memory(GiB)": 39.34,
"step": 65,
"token_acc": 0.9208928240311515,
"train_speed(iter/s)": 0.013874
},
{
"epoch": 65.78527607361963,
"grad_norm": 0.7863716510843305,
"learning_rate": 4.002702868207563e-06,
"loss": 0.4927229881286621,
"memory(GiB)": 39.34,
"step": 66,
"token_acc": 0.9251339373531896,
"train_speed(iter/s)": 0.01389
},
{
"epoch": 66.78527607361963,
"grad_norm": 0.6581549893568186,
"learning_rate": 3.969463130731183e-06,
"loss": 0.4890143871307373,
"memory(GiB)": 39.34,
"step": 67,
"token_acc": 0.9183355219960604,
"train_speed(iter/s)": 0.013878
},
{
"epoch": 67.78527607361963,
"grad_norm": 0.5900053583989073,
"learning_rate": 3.935821656707359e-06,
"loss": 0.48031631112098694,
"memory(GiB)": 39.34,
"step": 68,
"token_acc": 0.9213672888828442,
"train_speed(iter/s)": 0.01389
},
{
"epoch": 68.78527607361963,
"grad_norm": 0.6132529596085236,
"learning_rate": 3.901787643379183e-06,
"loss": 0.45974451303482056,
"memory(GiB)": 39.34,
"step": 69,
"token_acc": 0.9189587008089533,
"train_speed(iter/s)": 0.013882
},
{
"epoch": 69.78527607361963,
"grad_norm": 0.7256621524580472,
"learning_rate": 3.8673703953060685e-06,
"loss": 0.45002666115760803,
"memory(GiB)": 39.34,
"step": 70,
"token_acc": 0.9235120083536373,
"train_speed(iter/s)": 0.013876
},
{
"epoch": 70.78527607361963,
"grad_norm": 0.6572360898351522,
"learning_rate": 3.832579321819985e-06,
"loss": 0.44984108209609985,
"memory(GiB)": 39.34,
"step": 71,
"token_acc": 0.9278641983998498,
"train_speed(iter/s)": 0.013872
},
{
"epoch": 71.78527607361963,
"grad_norm": 0.6788420593989611,
"learning_rate": 3.797423934453038e-06,
"loss": 0.46496278047561646,
"memory(GiB)": 39.34,
"step": 72,
"token_acc": 0.9247173628466024,
"train_speed(iter/s)": 0.013887
},
{
"epoch": 72.78527607361963,
"grad_norm": 0.6073327829268869,
"learning_rate": 3.76191384433711e-06,
"loss": 0.43901190161705017,
"memory(GiB)": 39.34,
"step": 73,
"token_acc": 0.9269318854378426,
"train_speed(iter/s)": 0.013872
},
{
"epoch": 73.78527607361963,
"grad_norm": 0.4952804216866532,
"learning_rate": 3.726058759576271e-06,
"loss": 0.4391399025917053,
"memory(GiB)": 39.34,
"step": 74,
"token_acc": 0.9251885591145068,
"train_speed(iter/s)": 0.01388
},
{
"epoch": 74.78527607361963,
"grad_norm": 0.7195321191248989,
"learning_rate": 3.6898684825926845e-06,
"loss": 0.440918892621994,
"memory(GiB)": 39.34,
"step": 75,
"token_acc": 0.9259756638718266,
"train_speed(iter/s)": 0.013868
},
{
"epoch": 75.78527607361963,
"grad_norm": 0.5356189516143581,
"learning_rate": 3.65335290744672e-06,
"loss": 0.44449836015701294,
"memory(GiB)": 39.34,
"step": 76,
"token_acc": 0.9235405737247461,
"train_speed(iter/s)": 0.013861
},
{
"epoch": 76.78527607361963,
"grad_norm": 0.7790146643095753,
"learning_rate": 3.616522017132017e-06,
"loss": 0.4214305877685547,
"memory(GiB)": 39.34,
"step": 77,
"token_acc": 0.935737818195279,
"train_speed(iter/s)": 0.013851
},
{
"epoch": 77.78527607361963,
"grad_norm": 0.640555014306015,
"learning_rate": 3.579385880846232e-06,
"loss": 0.41491076350212097,
"memory(GiB)": 39.34,
"step": 78,
"token_acc": 0.9254948760620128,
"train_speed(iter/s)": 0.013865
},
{
"epoch": 78.78527607361963,
"grad_norm": 0.7225957987793631,
"learning_rate": 3.5419546512382264e-06,
"loss": 0.4449855089187622,
"memory(GiB)": 39.34,
"step": 79,
"token_acc": 0.92669304165697,
"train_speed(iter/s)": 0.013861
},
{
"epoch": 79.78527607361963,
"grad_norm": 0.71186832956713,
"learning_rate": 3.5042385616324243e-06,
"loss": 0.4115943908691406,
"memory(GiB)": 39.34,
"step": 80,
"token_acc": 0.9300436681222708,
"train_speed(iter/s)": 0.013871
},
{
"epoch": 80.78527607361963,
"grad_norm": 0.5344801517675843,
"learning_rate": 3.466247923231131e-06,
"loss": 0.4183962047100067,
"memory(GiB)": 39.34,
"step": 81,
"token_acc": 0.9238792981795946,
"train_speed(iter/s)": 0.013858
},
{
"epoch": 81.78527607361963,
"grad_norm": 0.7110303531981544,
"learning_rate": 3.427993122295552e-06,
"loss": 0.43044573068618774,
"memory(GiB)": 39.34,
"step": 82,
"token_acc": 0.9315566699353302,
"train_speed(iter/s)": 0.013848
},
{
"epoch": 82.78527607361963,
"grad_norm": 0.6054164665729541,
"learning_rate": 3.3894846173062917e-06,
"loss": 0.391621857881546,
"memory(GiB)": 39.34,
"step": 83,
"token_acc": 0.9380456974067156,
"train_speed(iter/s)": 0.013842
},
{
"epoch": 83.78527607361963,
"grad_norm": 0.659034943496206,
"learning_rate": 3.350732936104108e-06,
"loss": 0.40062740445137024,
"memory(GiB)": 39.34,
"step": 84,
"token_acc": 0.9351094518920754,
"train_speed(iter/s)": 0.013855
},
{
"epoch": 84.78527607361963,
"grad_norm": 0.6757156967804352,
"learning_rate": 3.3117486730117092e-06,
"loss": 0.42082643508911133,
"memory(GiB)": 39.34,
"step": 85,
"token_acc": 0.9334060495173756,
"train_speed(iter/s)": 0.01385
},
{
"epoch": 85.78527607361963,
"grad_norm": 0.6462108938520096,
"learning_rate": 3.272542485937369e-06,
"loss": 0.4044734239578247,
"memory(GiB)": 39.34,
"step": 86,
"token_acc": 0.937454412837345,
"train_speed(iter/s)": 0.013858
},
{
"epoch": 86.78527607361963,
"grad_norm": 0.6658216806148752,
"learning_rate": 3.2331250934611623e-06,
"loss": 0.4085301458835602,
"memory(GiB)": 39.34,
"step": 87,
"token_acc": 0.9344952439398588,
"train_speed(iter/s)": 0.013849
},
{
"epoch": 87.78527607361963,
"grad_norm": 0.5859038575973,
"learning_rate": 3.193507271904612e-06,
"loss": 0.4107922613620758,
"memory(GiB)": 39.34,
"step": 88,
"token_acc": 0.9370524843896911,
"train_speed(iter/s)": 0.013851
},
{
"epoch": 88.78527607361963,
"grad_norm": 0.6618470555633296,
"learning_rate": 3.15369985238455e-06,
"loss": 0.374958336353302,
"memory(GiB)": 39.34,
"step": 89,
"token_acc": 0.9345464787352592,
"train_speed(iter/s)": 0.013831
},
{
"epoch": 89.78527607361963,
"grad_norm": 0.8076141889252606,
"learning_rate": 3.1137137178519983e-06,
"loss": 0.40027111768722534,
"memory(GiB)": 39.34,
"step": 90,
"token_acc": 0.9367903505358848,
"train_speed(iter/s)": 0.013844
},
{
"epoch": 90.78527607361963,
"grad_norm": 0.591801688114359,
"learning_rate": 3.073559800116879e-06,
"loss": 0.3753508925437927,
"memory(GiB)": 39.34,
"step": 91,
"token_acc": 0.9328512619350389,
"train_speed(iter/s)": 0.013839
},
{
"epoch": 91.78527607361963,
"grad_norm": 0.7666418774168722,
"learning_rate": 3.0332490768593676e-06,
"loss": 0.39489710330963135,
"memory(GiB)": 39.34,
"step": 92,
"token_acc": 0.9387719407833094,
"train_speed(iter/s)": 0.013851
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 200,
"save_steps": 2,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 63696844357632.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}