{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4233,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007559650366170565,
"grad_norm": 32.25,
"learning_rate": 7.311320754716981e-07,
"loss": 0.6915,
"step": 32
},
{
"epoch": 0.01511930073234113,
"grad_norm": 14.5625,
"learning_rate": 1.4858490566037737e-06,
"loss": 0.6792,
"step": 64
},
{
"epoch": 0.022678951098511695,
"grad_norm": 16.875,
"learning_rate": 2.2405660377358494e-06,
"loss": 0.5993,
"step": 96
},
{
"epoch": 0.03023860146468226,
"grad_norm": 11.5625,
"learning_rate": 2.995283018867925e-06,
"loss": 0.5179,
"step": 128
},
{
"epoch": 0.03779825183085282,
"grad_norm": 14.5625,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.5111,
"step": 160
},
{
"epoch": 0.04535790219702339,
"grad_norm": 9.8125,
"learning_rate": 4.504716981132076e-06,
"loss": 0.5109,
"step": 192
},
{
"epoch": 0.05291755256319395,
"grad_norm": 7.34375,
"learning_rate": 5.259433962264151e-06,
"loss": 0.5106,
"step": 224
},
{
"epoch": 0.06047720292936452,
"grad_norm": 6.1875,
"learning_rate": 6.014150943396226e-06,
"loss": 0.4514,
"step": 256
},
{
"epoch": 0.06803685329553508,
"grad_norm": 7.875,
"learning_rate": 6.768867924528303e-06,
"loss": 0.4862,
"step": 288
},
{
"epoch": 0.07559650366170564,
"grad_norm": 6.03125,
"learning_rate": 7.523584905660378e-06,
"loss": 0.4711,
"step": 320
},
{
"epoch": 0.08315615402787621,
"grad_norm": 6.1875,
"learning_rate": 8.278301886792453e-06,
"loss": 0.4783,
"step": 352
},
{
"epoch": 0.09071580439404678,
"grad_norm": 7.78125,
"learning_rate": 9.03301886792453e-06,
"loss": 0.4457,
"step": 384
},
{
"epoch": 0.09827545476021735,
"grad_norm": 6.59375,
"learning_rate": 9.787735849056604e-06,
"loss": 0.5125,
"step": 416
},
{
"epoch": 0.1058351051263879,
"grad_norm": 13.5625,
"learning_rate": 9.99910037719311e-06,
"loss": 0.4171,
"step": 448
},
{
"epoch": 0.11339475549255847,
"grad_norm": 3.921875,
"learning_rate": 9.994856381944038e-06,
"loss": 0.4538,
"step": 480
},
{
"epoch": 0.12095440585872903,
"grad_norm": 6.15625,
"learning_rate": 9.987133217483066e-06,
"loss": 0.4629,
"step": 512
},
{
"epoch": 0.1285140562248996,
"grad_norm": 6.28125,
"learning_rate": 9.975936263383488e-06,
"loss": 0.4744,
"step": 544
},
{
"epoch": 0.13607370659107015,
"grad_norm": 7.125,
"learning_rate": 9.96127331888816e-06,
"loss": 0.4292,
"step": 576
},
{
"epoch": 0.14363335695724072,
"grad_norm": 5.15625,
"learning_rate": 9.943154597476943e-06,
"loss": 0.4558,
"step": 608
},
{
"epoch": 0.1511930073234113,
"grad_norm": 10.6875,
"learning_rate": 9.921592719752486e-06,
"loss": 0.448,
"step": 640
},
{
"epoch": 0.15875265768958186,
"grad_norm": 8.125,
"learning_rate": 9.896602704649348e-06,
"loss": 0.4117,
"step": 672
},
{
"epoch": 0.16631230805575242,
"grad_norm": 4.84375,
"learning_rate": 9.868201958972548e-06,
"loss": 0.4303,
"step": 704
},
{
"epoch": 0.173871958421923,
"grad_norm": 5.0625,
"learning_rate": 9.836410265272857e-06,
"loss": 0.4402,
"step": 736
},
{
"epoch": 0.18143160878809356,
"grad_norm": 8.875,
"learning_rate": 9.801249768067246e-06,
"loss": 0.4242,
"step": 768
},
{
"epoch": 0.18899125915426412,
"grad_norm": 5.625,
"learning_rate": 9.762744958414113e-06,
"loss": 0.4771,
"step": 800
},
{
"epoch": 0.1965509095204347,
"grad_norm": 14.875,
"learning_rate": 9.720922656854032e-06,
"loss": 0.4497,
"step": 832
},
{
"epoch": 0.20411055988660523,
"grad_norm": 6.3125,
"learning_rate": 9.675811994727897e-06,
"loss": 0.4141,
"step": 864
},
{
"epoch": 0.2116702102527758,
"grad_norm": 6.15625,
"learning_rate": 9.627444393885463e-06,
"loss": 0.432,
"step": 896
},
{
"epoch": 0.21922986061894637,
"grad_norm": 6.28125,
"learning_rate": 9.575853544798453e-06,
"loss": 0.4253,
"step": 928
},
{
"epoch": 0.22678951098511693,
"grad_norm": 7.4375,
"learning_rate": 9.521075383093452e-06,
"loss": 0.4334,
"step": 960
},
{
"epoch": 0.2343491613512875,
"grad_norm": 9.0625,
"learning_rate": 9.463148064520913e-06,
"loss": 0.4595,
"step": 992
},
{
"epoch": 0.24190881171745807,
"grad_norm": 6.71875,
"learning_rate": 9.402111938377776e-06,
"loss": 0.4401,
"step": 1024
},
{
"epoch": 0.24946846208362863,
"grad_norm": 9.0625,
"learning_rate": 9.338009519402132e-06,
"loss": 0.4216,
"step": 1056
},
{
"epoch": 0.2570281124497992,
"grad_norm": 7.59375,
"learning_rate": 9.270885458159576e-06,
"loss": 0.4391,
"step": 1088
},
{
"epoch": 0.26458776281596974,
"grad_norm": 7.875,
"learning_rate": 9.200786509941827e-06,
"loss": 0.4116,
"step": 1120
},
{
"epoch": 0.2721474131821403,
"grad_norm": 11.5625,
"learning_rate": 9.127761502199325e-06,
"loss": 0.4004,
"step": 1152
},
{
"epoch": 0.2797070635483109,
"grad_norm": 7.8125,
"learning_rate": 9.051861300530438e-06,
"loss": 0.4261,
"step": 1184
},
{
"epoch": 0.28726671391448144,
"grad_norm": 6.53125,
"learning_rate": 8.973138773251015e-06,
"loss": 0.4075,
"step": 1216
},
{
"epoch": 0.294826364280652,
"grad_norm": 6.78125,
"learning_rate": 8.891648754568943e-06,
"loss": 0.4398,
"step": 1248
},
{
"epoch": 0.3023860146468226,
"grad_norm": 7.40625,
"learning_rate": 8.807448006389343e-06,
"loss": 0.4517,
"step": 1280
},
{
"epoch": 0.30994566501299314,
"grad_norm": 4.3125,
"learning_rate": 8.720595178777063e-06,
"loss": 0.4254,
"step": 1312
},
{
"epoch": 0.3175053153791637,
"grad_norm": 7.25,
"learning_rate": 8.631150769103934e-06,
"loss": 0.441,
"step": 1344
},
{
"epoch": 0.3250649657453343,
"grad_norm": 6.65625,
"learning_rate": 8.539177079909315e-06,
"loss": 0.4337,
"step": 1376
},
{
"epoch": 0.33262461611150484,
"grad_norm": 5.8125,
"learning_rate": 8.444738175503222e-06,
"loss": 0.4537,
"step": 1408
},
{
"epoch": 0.3401842664776754,
"grad_norm": 5.5625,
"learning_rate": 8.347899837342315e-06,
"loss": 0.4071,
"step": 1440
},
{
"epoch": 0.347743916843846,
"grad_norm": 7.875,
"learning_rate": 8.2487295182098e-06,
"loss": 0.4612,
"step": 1472
},
{
"epoch": 0.35530356721001655,
"grad_norm": 5.1875,
"learning_rate": 8.147296295231158e-06,
"loss": 0.4296,
"step": 1504
},
{
"epoch": 0.3628632175761871,
"grad_norm": 8.625,
"learning_rate": 8.04367082175845e-06,
"loss": 0.4491,
"step": 1536
},
{
"epoch": 0.3704228679423577,
"grad_norm": 5.25,
"learning_rate": 7.937925278156698e-06,
"loss": 0.4132,
"step": 1568
},
{
"epoch": 0.37798251830852825,
"grad_norm": 6.8125,
"learning_rate": 7.830133321526615e-06,
"loss": 0.4068,
"step": 1600
},
{
"epoch": 0.3855421686746988,
"grad_norm": 4.375,
"learning_rate": 7.720370034398741e-06,
"loss": 0.4499,
"step": 1632
},
{
"epoch": 0.3931018190408694,
"grad_norm": 6.96875,
"learning_rate": 7.608711872434648e-06,
"loss": 0.4256,
"step": 1664
},
{
"epoch": 0.40066146940703995,
"grad_norm": 9.875,
"learning_rate": 7.495236611171741e-06,
"loss": 0.428,
"step": 1696
},
{
"epoch": 0.40822111977321046,
"grad_norm": 5.90625,
"learning_rate": 7.3800232918486715e-06,
"loss": 0.4146,
"step": 1728
},
{
"epoch": 0.41578077013938103,
"grad_norm": 6.4375,
"learning_rate": 7.263152166349122e-06,
"loss": 0.4476,
"step": 1760
},
{
"epoch": 0.4233404205055516,
"grad_norm": 7.3125,
"learning_rate": 7.144704641302337e-06,
"loss": 0.4387,
"step": 1792
},
{
"epoch": 0.43090007087172216,
"grad_norm": 6.84375,
"learning_rate": 7.024763221379289e-06,
"loss": 0.4276,
"step": 1824
},
{
"epoch": 0.43845972123789273,
"grad_norm": 7.25,
"learning_rate": 6.903411451824033e-06,
"loss": 0.4482,
"step": 1856
},
{
"epoch": 0.4460193716040633,
"grad_norm": 6.9375,
"learning_rate": 6.780733860260216e-06,
"loss": 0.4187,
"step": 1888
},
{
"epoch": 0.45357902197023386,
"grad_norm": 6.09375,
"learning_rate": 6.6568158978133455e-06,
"loss": 0.402,
"step": 1920
},
{
"epoch": 0.46113867233640443,
"grad_norm": 5.34375,
"learning_rate": 6.531743879589754e-06,
"loss": 0.4157,
"step": 1952
},
{
"epoch": 0.468698322702575,
"grad_norm": 5.8125,
"learning_rate": 6.405604924553797e-06,
"loss": 0.4771,
"step": 1984
},
{
"epoch": 0.47625797306874557,
"grad_norm": 6.53125,
"learning_rate": 6.278486894845084e-06,
"loss": 0.4408,
"step": 2016
},
{
"epoch": 0.48381762343491613,
"grad_norm": 5.96875,
"learning_rate": 6.150478334578085e-06,
"loss": 0.4434,
"step": 2048
},
{
"epoch": 0.4913772738010867,
"grad_norm": 8.3125,
"learning_rate": 6.021668408166688e-06,
"loss": 0.4214,
"step": 2080
},
{
"epoch": 0.49893692416725727,
"grad_norm": 4.28125,
"learning_rate": 5.892146838216687e-06,
"loss": 0.4164,
"step": 2112
},
{
"epoch": 0.5064965745334278,
"grad_norm": 4.25,
"learning_rate": 5.762003843029466e-06,
"loss": 0.426,
"step": 2144
},
{
"epoch": 0.5140562248995983,
"grad_norm": 6.96875,
"learning_rate": 5.631330073760413e-06,
"loss": 0.4205,
"step": 2176
},
{
"epoch": 0.5216158752657689,
"grad_norm": 6.6875,
"learning_rate": 5.500216551275807e-06,
"loss": 0.4429,
"step": 2208
},
{
"epoch": 0.5291755256319395,
"grad_norm": 7.15625,
"learning_rate": 5.368754602752213e-06,
"loss": 0.431,
"step": 2240
},
{
"epoch": 0.53673517599811,
"grad_norm": 5.28125,
"learning_rate": 5.237035798062489e-06,
"loss": 0.4224,
"step": 2272
},
{
"epoch": 0.5442948263642806,
"grad_norm": 8.125,
"learning_rate": 5.105151885992754e-06,
"loss": 0.4194,
"step": 2304
},
{
"epoch": 0.5518544767304512,
"grad_norm": 5.5,
"learning_rate": 4.9731947303347485e-06,
"loss": 0.434,
"step": 2336
},
{
"epoch": 0.5594141270966217,
"grad_norm": 6.09375,
"learning_rate": 4.841256245898055e-06,
"loss": 0.4308,
"step": 2368
},
{
"epoch": 0.5669737774627923,
"grad_norm": 4.3125,
"learning_rate": 4.709428334486816e-06,
"loss": 0.3907,
"step": 2400
},
{
"epoch": 0.5745334278289629,
"grad_norm": 6.96875,
"learning_rate": 4.577802820885482e-06,
"loss": 0.4226,
"step": 2432
},
{
"epoch": 0.5820930781951335,
"grad_norm": 5.75,
"learning_rate": 4.446471388898236e-06,
"loss": 0.4216,
"step": 2464
},
{
"epoch": 0.589652728561304,
"grad_norm": 6.46875,
"learning_rate": 4.315525517486586e-06,
"loss": 0.4632,
"step": 2496
},
{
"epoch": 0.5972123789274746,
"grad_norm": 3.46875,
"learning_rate": 4.185056417049674e-06,
"loss": 0.4304,
"step": 2528
},
{
"epoch": 0.6047720292936452,
"grad_norm": 8.25,
"learning_rate": 4.055154965891625e-06,
"loss": 0.451,
"step": 2560
},
{
"epoch": 0.6123316796598157,
"grad_norm": 6.78125,
"learning_rate": 3.925911646920235e-06,
"loss": 0.3851,
"step": 2592
},
{
"epoch": 0.6198913300259863,
"grad_norm": 6.84375,
"learning_rate": 3.797416484621057e-06,
"loss": 0.4486,
"step": 2624
},
{
"epoch": 0.6274509803921569,
"grad_norm": 5.28125,
"learning_rate": 3.669758982350821e-06,
"loss": 0.4258,
"step": 2656
},
{
"epoch": 0.6350106307583274,
"grad_norm": 6.53125,
"learning_rate": 3.5430280599938204e-06,
"loss": 0.4303,
"step": 2688
},
{
"epoch": 0.642570281124498,
"grad_norm": 12.1875,
"learning_rate": 3.4173119920247454e-06,
"loss": 0.4466,
"step": 2720
},
{
"epoch": 0.6501299314906686,
"grad_norm": 5.1875,
"learning_rate": 3.2926983460210564e-06,
"loss": 0.4131,
"step": 2752
},
{
"epoch": 0.6576895818568391,
"grad_norm": 4.6875,
"learning_rate": 3.1692739216677483e-06,
"loss": 0.4672,
"step": 2784
},
{
"epoch": 0.6652492322230097,
"grad_norm": 5.5625,
"learning_rate": 3.0471246902970032e-06,
"loss": 0.4291,
"step": 2816
},
{
"epoch": 0.6728088825891803,
"grad_norm": 4.71875,
"learning_rate": 2.926335735004817e-06,
"loss": 0.4264,
"step": 2848
},
{
"epoch": 0.6803685329553508,
"grad_norm": 6.0625,
"learning_rate": 2.8069911913863414e-06,
"loss": 0.422,
"step": 2880
},
{
"epoch": 0.6879281833215214,
"grad_norm": 6.9375,
"learning_rate": 2.689174188931202e-06,
"loss": 0.4005,
"step": 2912
},
{
"epoch": 0.695487833687692,
"grad_norm": 6.125,
"learning_rate": 2.5729667931196103e-06,
"loss": 0.4137,
"step": 2944
},
{
"epoch": 0.7030474840538625,
"grad_norm": 4.0625,
"learning_rate": 2.4584499482596274e-06,
"loss": 0.4145,
"step": 2976
},
{
"epoch": 0.7106071344200331,
"grad_norm": 6.1875,
"learning_rate": 2.3457034211053703e-06,
"loss": 0.4601,
"step": 3008
},
{
"epoch": 0.7181667847862037,
"grad_norm": 9.0,
"learning_rate": 2.234805745295457e-06,
"loss": 0.4238,
"step": 3040
},
{
"epoch": 0.7257264351523742,
"grad_norm": 15.625,
"learning_rate": 2.125834166650354e-06,
"loss": 0.4579,
"step": 3072
},
{
"epoch": 0.7332860855185448,
"grad_norm": 4.8125,
"learning_rate": 2.018864589366778e-06,
"loss": 0.4183,
"step": 3104
},
{
"epoch": 0.7408457358847154,
"grad_norm": 6.1875,
"learning_rate": 1.9139715231466014e-06,
"loss": 0.4387,
"step": 3136
},
{
"epoch": 0.7484053862508859,
"grad_norm": 7.03125,
"learning_rate": 1.811228031297077e-06,
"loss": 0.4367,
"step": 3168
},
{
"epoch": 0.7559650366170565,
"grad_norm": 5.65625,
"learning_rate": 1.7107056798385763e-06,
"loss": 0.451,
"step": 3200
},
{
"epoch": 0.7635246869832271,
"grad_norm": 6.65625,
"learning_rate": 1.6124744876552373e-06,
"loss": 0.4101,
"step": 3232
},
{
"epoch": 0.7710843373493976,
"grad_norm": 6.8125,
"learning_rate": 1.5166028777232884e-06,
"loss": 0.3734,
"step": 3264
},
{
"epoch": 0.7786439877155682,
"grad_norm": 5.34375,
"learning_rate": 1.4231576294510013e-06,
"loss": 0.4194,
"step": 3296
},
{
"epoch": 0.7862036380817388,
"grad_norm": 7.8125,
"learning_rate": 1.3322038321634567e-06,
"loss": 0.465,
"step": 3328
},
{
"epoch": 0.7937632884479093,
"grad_norm": 5.75,
"learning_rate": 1.2438048397645558e-06,
"loss": 0.4751,
"step": 3360
},
{
"epoch": 0.8013229388140799,
"grad_norm": 8.75,
"learning_rate": 1.1580222266078367e-06,
"loss": 0.401,
"step": 3392
},
{
"epoch": 0.8088825891802505,
"grad_norm": 8.375,
"learning_rate": 1.0749157446068242e-06,
"loss": 0.4418,
"step": 3424
},
{
"epoch": 0.8164422395464209,
"grad_norm": 7.375,
"learning_rate": 9.945432816148175e-07,
"loss": 0.4405,
"step": 3456
},
{
"epoch": 0.8240018899125915,
"grad_norm": 4.84375,
"learning_rate": 9.169608211030783e-07,
"loss": 0.4298,
"step": 3488
},
{
"epoch": 0.8315615402787621,
"grad_norm": 5.78125,
"learning_rate": 8.422224031655313e-07,
"loss": 0.4156,
"step": 3520
},
{
"epoch": 0.8391211906449326,
"grad_norm": 21.25,
"learning_rate": 7.703800868771e-07,
"loss": 0.4467,
"step": 3552
},
{
"epoch": 0.8466808410111032,
"grad_norm": 14.3125,
"learning_rate": 7.014839140319485e-07,
"loss": 0.4443,
"step": 3584
},
{
"epoch": 0.8542404913772738,
"grad_norm": 6.59375,
"learning_rate": 6.355818742868447e-07,
"loss": 0.4381,
"step": 3616
},
{
"epoch": 0.8618001417434443,
"grad_norm": 5.46875,
"learning_rate": 5.727198717339511e-07,
"loss": 0.405,
"step": 3648
},
{
"epoch": 0.8693597921096149,
"grad_norm": 4.3125,
"learning_rate": 5.129416929263031e-07,
"loss": 0.4161,
"step": 3680
},
{
"epoch": 0.8769194424757855,
"grad_norm": 6.84375,
"learning_rate": 4.5628897637827354e-07,
"loss": 0.4294,
"step": 3712
},
{
"epoch": 0.884479092841956,
"grad_norm": 6.0,
"learning_rate": 4.028011835622492e-07,
"loss": 0.4084,
"step": 3744
},
{
"epoch": 0.8920387432081266,
"grad_norm": 5.1875,
"learning_rate": 3.525155714217227e-07,
"loss": 0.3902,
"step": 3776
},
{
"epoch": 0.8995983935742972,
"grad_norm": 5.8125,
"learning_rate": 3.054671664199543e-07,
"loss": 0.423,
"step": 3808
},
{
"epoch": 0.9071580439404677,
"grad_norm": 6.75,
"learning_rate": 2.616887401422796e-07,
"loss": 0.4108,
"step": 3840
},
{
"epoch": 0.9147176943066383,
"grad_norm": 5.9375,
"learning_rate": 2.212107864690438e-07,
"loss": 0.4546,
"step": 3872
},
{
"epoch": 0.9222773446728089,
"grad_norm": 8.0625,
"learning_rate": 1.8406150033507764e-07,
"loss": 0.4434,
"step": 3904
},
{
"epoch": 0.9298369950389794,
"grad_norm": 10.4375,
"learning_rate": 1.502667580905054e-07,
"loss": 0.4149,
"step": 3936
},
{
"epoch": 0.93739664540515,
"grad_norm": 6.0625,
"learning_rate": 1.1985009947656278e-07,
"loss": 0.4504,
"step": 3968
},
{
"epoch": 0.9449562957713206,
"grad_norm": 5.625,
"learning_rate": 9.283271122898174e-08,
"loss": 0.437,
"step": 4000
},
{
"epoch": 0.9525159461374911,
"grad_norm": 6.90625,
"learning_rate": 6.923341232035863e-08,
"loss": 0.4205,
"step": 4032
},
{
"epoch": 0.9600755965036617,
"grad_norm": 9.625,
"learning_rate": 4.9068640851792636e-08,
"loss": 0.4221,
"step": 4064
},
{
"epoch": 0.9676352468698323,
"grad_norm": 5.09375,
"learning_rate": 3.235244260292147e-08,
"loss": 0.4172,
"step": 4096
},
{
"epoch": 0.9751948972360028,
"grad_norm": 6.0,
"learning_rate": 1.909646124832576e-08,
"loss": 0.415,
"step": 4128
},
{
"epoch": 0.9827545476021734,
"grad_norm": 7.34375,
"learning_rate": 9.30993024712279e-09,
"loss": 0.4298,
"step": 4160
},
{
"epoch": 0.990314197968344,
"grad_norm": 5.25,
"learning_rate": 2.999666411398483e-09,
"loss": 0.3673,
"step": 4192
},
{
"epoch": 0.9978738483345145,
"grad_norm": 5.28125,
"learning_rate": 1.7006515795336963e-10,
"loss": 0.422,
"step": 4224
},
{
"epoch": 1.0,
"step": 4233,
"total_flos": 7.498056185637274e+16,
"train_loss": 0.4399125433520113,
"train_runtime": 1541.2359,
"train_samples_per_second": 10.985,
"train_steps_per_second": 2.746
}
],
"logging_steps": 32,
"max_steps": 4233,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.498056185637274e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}