{
"best_global_step": 90,
"best_metric": 255.55859375,
"best_model_checkpoint": "/home/notebook/code/group/eason/ms-swift/qwen2.5_72b_swift_allen/v1-20251030-150430/checkpoint-90",
"epoch": 2.040920716112532,
"eval_steps": 10,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020460358056265986,
"grad_norm": 8.740862846374512,
"learning_rate": 1.25e-06,
"loss": 1.323974609375,
"step": 1,
"token_acc": 0.6749949473048041
},
{
"epoch": 0.04092071611253197,
"grad_norm": 8.77861499786377,
"learning_rate": 2.5e-06,
"loss": 1.3330078125,
"step": 2,
"token_acc": 0.673973689521971
},
{
"epoch": 0.061381074168797956,
"grad_norm": 6.631825923919678,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.296875,
"step": 3,
"token_acc": 0.6779223915870493
},
{
"epoch": 0.08184143222506395,
"grad_norm": 3.3304812908172607,
"learning_rate": 5e-06,
"loss": 1.2734375,
"step": 4,
"token_acc": 0.6801284548421133
},
{
"epoch": 0.10230179028132992,
"grad_norm": 2.9929847717285156,
"learning_rate": 6.25e-06,
"loss": 1.2674560546875,
"step": 5,
"token_acc": 0.6774330054044837
},
{
"epoch": 0.12276214833759591,
"grad_norm": 3.2737808227539062,
"learning_rate": 7.500000000000001e-06,
"loss": 1.24853515625,
"step": 6,
"token_acc": 0.6828065892133894
},
{
"epoch": 0.1432225063938619,
"grad_norm": 3.864370107650757,
"learning_rate": 8.750000000000001e-06,
"loss": 1.2008056640625,
"step": 7,
"token_acc": 0.6894752982753333
},
{
"epoch": 0.1636828644501279,
"grad_norm": 3.059185743331909,
"learning_rate": 1e-05,
"loss": 1.16259765625,
"step": 8,
"token_acc": 0.695852402365021
},
{
"epoch": 0.18414322250639387,
"grad_norm": 2.160026788711548,
"learning_rate": 9.99872299773906e-06,
"loss": 1.1551513671875,
"step": 9,
"token_acc": 0.696011262965415
},
{
"epoch": 0.20460358056265984,
"grad_norm": 1.7391501665115356,
"learning_rate": 9.994892643250147e-06,
"loss": 1.102783203125,
"step": 10,
"token_acc": 0.7084828844476136
},
{
"epoch": 0.20460358056265984,
"eval_loss": 318.328125,
"eval_runtime": 174.7551,
"eval_samples_per_second": 0.086,
"eval_steps_per_second": 0.006,
"eval_token_acc": 0.714127394915891,
"step": 10
},
{
"epoch": 0.22506393861892582,
"grad_norm": 1.512490153312683,
"learning_rate": 9.9885108930818e-06,
"loss": 1.059814453125,
"step": 11,
"token_acc": 0.7156657958041264
},
{
"epoch": 0.24552429667519182,
"grad_norm": 1.2666767835617065,
"learning_rate": 9.979581007037776e-06,
"loss": 1.0445556640625,
"step": 12,
"token_acc": 0.7163779567590186
},
{
"epoch": 0.2659846547314578,
"grad_norm": 1.035067081451416,
"learning_rate": 9.968107546511942e-06,
"loss": 1.0322265625,
"step": 13,
"token_acc": 0.7191054482580511
},
{
"epoch": 0.2864450127877238,
"grad_norm": 1.0283350944519043,
"learning_rate": 9.95409637215831e-06,
"loss": 1.0177001953125,
"step": 14,
"token_acc": 0.7223777681726882
},
{
"epoch": 0.3069053708439898,
"grad_norm": 0.8945289254188538,
"learning_rate": 9.937554640897414e-06,
"loss": 0.9921875,
"step": 15,
"token_acc": 0.7268454397965844
},
{
"epoch": 0.3273657289002558,
"grad_norm": 0.9290580749511719,
"learning_rate": 9.918490802260538e-06,
"loss": 0.9925537109375,
"step": 16,
"token_acc": 0.7261628646104965
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.9589850902557373,
"learning_rate": 9.896914594073703e-06,
"loss": 0.9844970703125,
"step": 17,
"token_acc": 0.7282743852241678
},
{
"epoch": 0.36828644501278773,
"grad_norm": 0.8926294445991516,
"learning_rate": 9.87283703748356e-06,
"loss": 0.955322265625,
"step": 18,
"token_acc": 0.7333408249225458
},
{
"epoch": 0.3887468030690537,
"grad_norm": 0.7655003666877747,
"learning_rate": 9.846270431327793e-06,
"loss": 0.97998046875,
"step": 19,
"token_acc": 0.7289209223794986
},
{
"epoch": 0.4092071611253197,
"grad_norm": 0.8207703828811646,
"learning_rate": 9.817228345852853e-06,
"loss": 0.9736328125,
"step": 20,
"token_acc": 0.7293907637079884
},
{
"epoch": 0.4092071611253197,
"eval_loss": 282.9296875,
"eval_runtime": 184.9183,
"eval_samples_per_second": 0.081,
"eval_steps_per_second": 0.005,
"eval_token_acc": 0.7367637851149977,
"step": 20
},
{
"epoch": 0.4296675191815857,
"grad_norm": 0.899739682674408,
"learning_rate": 9.785725615782262e-06,
"loss": 0.951171875,
"step": 21,
"token_acc": 0.7350280185329514
},
{
"epoch": 0.45012787723785164,
"grad_norm": 0.7847328186035156,
"learning_rate": 9.751778332739033e-06,
"loss": 0.955322265625,
"step": 22,
"token_acc": 0.7339763941093482
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.8828109502792358,
"learning_rate": 9.715403837026046e-06,
"loss": 0.928466796875,
"step": 23,
"token_acc": 0.7396126780013036
},
{
"epoch": 0.49104859335038364,
"grad_norm": 0.9227666258811951,
"learning_rate": 9.676620708768608e-06,
"loss": 0.940673828125,
"step": 24,
"token_acc": 0.7378745153330983
},
{
"epoch": 0.5115089514066496,
"grad_norm": 0.6232196092605591,
"learning_rate": 9.635448758423703e-06,
"loss": 0.9249267578125,
"step": 25,
"token_acc": 0.7404127180940572
},
{
"epoch": 0.5319693094629157,
"grad_norm": 0.7773280143737793,
"learning_rate": 9.591909016660806e-06,
"loss": 0.9281005859375,
"step": 26,
"token_acc": 0.7391337224366917
},
{
"epoch": 0.5524296675191815,
"grad_norm": 0.63581383228302,
"learning_rate": 9.546023723619387e-06,
"loss": 0.9176025390625,
"step": 27,
"token_acc": 0.7424947635543382
},
{
"epoch": 0.5728900255754475,
"grad_norm": 0.7504338622093201,
"learning_rate": 9.497816317548625e-06,
"loss": 0.9302978515625,
"step": 28,
"token_acc": 0.7391818850545303
},
{
"epoch": 0.5933503836317136,
"grad_norm": 0.5946778655052185,
"learning_rate": 9.447311422835141e-06,
"loss": 0.9202880859375,
"step": 29,
"token_acc": 0.7407723631931363
},
{
"epoch": 0.6138107416879796,
"grad_norm": 0.787707507610321,
"learning_rate": 9.39453483742483e-06,
"loss": 0.911865234375,
"step": 30,
"token_acc": 0.7428029623155381
},
{
"epoch": 0.6138107416879796,
"eval_loss": 271.0625,
"eval_runtime": 183.1288,
"eval_samples_per_second": 0.082,
"eval_steps_per_second": 0.005,
"eval_token_acc": 0.7445882205618399,
"step": 30
},
{
"epoch": 0.6342710997442456,
"grad_norm": 0.607540488243103,
"learning_rate": 9.33951351964525e-06,
"loss": 0.9012451171875,
"step": 31,
"token_acc": 0.745131512392514
},
{
"epoch": 0.6547314578005116,
"grad_norm": 0.7581419348716736,
"learning_rate": 9.28227557443528e-06,
"loss": 0.921875,
"step": 32,
"token_acc": 0.739635471164067
},
{
"epoch": 0.6751918158567775,
"grad_norm": 0.5728419423103333,
"learning_rate": 9.222850238989104e-06,
"loss": 0.91064453125,
"step": 33,
"token_acc": 0.7417856423282375
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.7030539512634277,
"learning_rate": 9.161267867821802e-06,
"loss": 0.90576171875,
"step": 34,
"token_acc": 0.7440411485273096
},
{
"epoch": 0.7161125319693095,
"grad_norm": 0.7187775373458862,
"learning_rate": 9.097559917264268e-06,
"loss": 0.915283203125,
"step": 35,
"token_acc": 0.7416095001092865
},
{
"epoch": 0.7365728900255755,
"grad_norm": 1.2131768465042114,
"learning_rate": 9.031758929395259e-06,
"loss": 0.88720703125,
"step": 36,
"token_acc": 0.7491847812640715
},
{
"epoch": 0.7570332480818415,
"grad_norm": 0.6976324319839478,
"learning_rate": 8.963898515418885e-06,
"loss": 0.910400390625,
"step": 37,
"token_acc": 0.742646390581947
},
{
"epoch": 0.7774936061381074,
"grad_norm": 0.6271430253982544,
"learning_rate": 8.89401333849598e-06,
"loss": 0.8946533203125,
"step": 38,
"token_acc": 0.7461849707519417
},
{
"epoch": 0.7979539641943734,
"grad_norm": 0.9629178643226624,
"learning_rate": 8.82213909603812e-06,
"loss": 0.891357421875,
"step": 39,
"token_acc": 0.7467142339485449
},
{
"epoch": 0.8184143222506394,
"grad_norm": 0.5696749091148376,
"learning_rate": 8.748312501473351e-06,
"loss": 0.889404296875,
"step": 40,
"token_acc": 0.7480125858101083
},
{
"epoch": 0.8184143222506394,
"eval_loss": 264.984375,
"eval_runtime": 182.0545,
"eval_samples_per_second": 0.082,
"eval_steps_per_second": 0.005,
"eval_token_acc": 0.7485849647284718,
"step": 40
},
{
"epoch": 0.8388746803069054,
"grad_norm": 0.6953923106193542,
"learning_rate": 8.672571265492944e-06,
"loss": 0.904296875,
"step": 41,
"token_acc": 0.7444278902019779
},
{
"epoch": 0.8593350383631714,
"grad_norm": 0.6765785217285156,
"learning_rate": 8.594954076788736e-06,
"loss": 0.891845703125,
"step": 42,
"token_acc": 0.7476815443943472
},
{
"epoch": 0.8797953964194374,
"grad_norm": 0.7245502471923828,
"learning_rate": 8.515500582290914e-06,
"loss": 0.890625,
"step": 43,
"token_acc": 0.7461978035518999
},
{
"epoch": 0.9002557544757033,
"grad_norm": 0.5907047986984253,
"learning_rate": 8.434251366916323e-06,
"loss": 0.9033203125,
"step": 44,
"token_acc": 0.7438690767483638
},
{
"epoch": 0.9207161125319693,
"grad_norm": 0.6903477311134338,
"learning_rate": 8.351247932837655e-06,
"loss": 0.894775390625,
"step": 45,
"token_acc": 0.745345279252677
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.6156295537948608,
"learning_rate": 8.266532678284103e-06,
"loss": 0.8800048828125,
"step": 46,
"token_acc": 0.7496958725344752
},
{
"epoch": 0.9616368286445013,
"grad_norm": 0.6671141982078552,
"learning_rate": 8.18014887588431e-06,
"loss": 0.883544921875,
"step": 47,
"token_acc": 0.7479351354819822
},
{
"epoch": 0.9820971867007673,
"grad_norm": 0.5917587280273438,
"learning_rate": 8.092140650562665e-06,
"loss": 0.882080078125,
"step": 48,
"token_acc": 0.7484929482805501
},
{
"epoch": 1.0,
"grad_norm": 0.930376410484314,
"learning_rate": 8.002552957000254e-06,
"loss": 0.8798828125,
"step": 49,
"token_acc": 0.7485103827283421
},
{
"epoch": 1.020460358056266,
"grad_norm": 0.7340478301048279,
"learning_rate": 7.911431556671967e-06,
"loss": 0.8404541015625,
"step": 50,
"token_acc": 0.7561591178820095
},
{
"epoch": 1.020460358056266,
"eval_loss": 261.390625,
"eval_runtime": 186.3503,
"eval_samples_per_second": 0.08,
"eval_steps_per_second": 0.005,
"eval_token_acc": 0.7507617815252328,
"step": 50
},
{
"epoch": 1.040920716112532,
"grad_norm": 0.6937265396118164,
"learning_rate": 7.818822994471504e-06,
"loss": 0.8221435546875,
"step": 51,
"token_acc": 0.7600091509973993
},
{
"epoch": 1.061381074168798,
"grad_norm": 0.6445659399032593,
"learning_rate": 7.72477457493619e-06,
"loss": 0.798095703125,
"step": 52,
"token_acc": 0.7666009535619558
},
{
"epoch": 1.081841432225064,
"grad_norm": 0.6034528613090515,
"learning_rate": 7.629334338083774e-06,
"loss": 0.8121337890625,
"step": 53,
"token_acc": 0.761754260042804
},
{
"epoch": 1.10230179028133,
"grad_norm": 0.7658072710037231,
"learning_rate": 7.532551034873558e-06,
"loss": 0.8314208984375,
"step": 54,
"token_acc": 0.7583218819656938
},
{
"epoch": 1.1227621483375958,
"grad_norm": 0.5790229439735413,
"learning_rate": 7.43447410230435e-06,
"loss": 0.81494140625,
"step": 55,
"token_acc": 0.7614948252002275
},
{
"epoch": 1.143222506393862,
"grad_norm": 0.6969874501228333,
"learning_rate": 7.335153638162005e-06,
"loss": 0.80810546875,
"step": 56,
"token_acc": 0.7627752172619252
},
{
"epoch": 1.1636828644501278,
"grad_norm": 0.6890274286270142,
"learning_rate": 7.234640375429427e-06,
"loss": 0.7890625,
"step": 57,
"token_acc": 0.7680120601871605
},
{
"epoch": 1.184143222506394,
"grad_norm": 0.8471683859825134,
"learning_rate": 7.132985656372126e-06,
"loss": 0.7908935546875,
"step": 58,
"token_acc": 0.7681585540637447
},
{
"epoch": 1.2046035805626598,
"grad_norm": 0.5586804747581482,
"learning_rate": 7.030241406312528e-06,
"loss": 0.7999267578125,
"step": 59,
"token_acc": 0.7652726739906083
},
{
"epoch": 1.2250639386189257,
"grad_norm": 0.6501573324203491,
"learning_rate": 6.926460107106483e-06,
"loss": 0.8023681640625,
"step": 60,
"token_acc": 0.7641969833563484
},
{
"epoch": 1.2250639386189257,
"eval_loss": 260.03515625,
"eval_runtime": 174.7319,
"eval_samples_per_second": 0.086,
"eval_steps_per_second": 0.006,
"eval_token_acc": 0.7517990566431523,
"step": 60
},
{
"epoch": 1.2455242966751918,
"grad_norm": 0.67079758644104,
"learning_rate": 6.8216947703354815e-06,
"loss": 0.80908203125,
"step": 61,
"token_acc": 0.762754462206252
},
{
"epoch": 1.265984654731458,
"grad_norm": 0.8003351092338562,
"learning_rate": 6.715998910228296e-06,
"loss": 0.822021484375,
"step": 62,
"token_acc": 0.7596542081982427
},
{
"epoch": 1.2864450127877238,
"grad_norm": 0.5755249857902527,
"learning_rate": 6.609426516325859e-06,
"loss": 0.7877197265625,
"step": 63,
"token_acc": 0.7665187875244992
},
{
"epoch": 1.3069053708439897,
"grad_norm": 0.5514203310012817,
"learning_rate": 6.502032025903356e-06,
"loss": 0.79248046875,
"step": 64,
"token_acc": 0.7668143054654905
},
{
"epoch": 1.3273657289002558,
"grad_norm": 0.5459880828857422,
"learning_rate": 6.393870296163616e-06,
"loss": 0.794677734375,
"step": 65,
"token_acc": 0.7654924274646578
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.7557441592216492,
"learning_rate": 6.284996576216014e-06,
"loss": 0.8095703125,
"step": 66,
"token_acc": 0.7632387915441781
},
{
"epoch": 1.3682864450127878,
"grad_norm": 0.5115758776664734,
"learning_rate": 6.175466478855161e-06,
"loss": 0.787109375,
"step": 67,
"token_acc": 0.7676507146997723
},
{
"epoch": 1.3887468030690537,
"grad_norm": 0.6818933486938477,
"learning_rate": 6.065335952153846e-06,
"loss": 0.7919921875,
"step": 68,
"token_acc": 0.7656047815638671
},
{
"epoch": 1.4092071611253196,
"grad_norm": 0.5157542824745178,
"learning_rate": 5.954661250884704e-06,
"loss": 0.7918701171875,
"step": 69,
"token_acc": 0.7667713340544047
},
{
"epoch": 1.4296675191815857,
"grad_norm": 0.5024055242538452,
"learning_rate": 5.843498907785236e-06,
"loss": 0.7869873046875,
"step": 70,
"token_acc": 0.767566587893184
},
{
"epoch": 1.4296675191815857,
"eval_loss": 258.23828125,
"eval_runtime": 190.6896,
"eval_samples_per_second": 0.079,
"eval_steps_per_second": 0.005,
"eval_token_acc": 0.7528071127436657,
"step": 70
},
{
"epoch": 1.4501278772378516,
"grad_norm": 0.541907548904419,
"learning_rate": 5.731905704680834e-06,
"loss": 0.799072265625,
"step": 71,
"token_acc": 0.7636709487617457
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.5258617997169495,
"learning_rate": 5.6199386434805615e-06,
"loss": 0.7833251953125,
"step": 72,
"token_acc": 0.7686619982839542
},
{
"epoch": 1.4910485933503836,
"grad_norm": 0.7432481646537781,
"learning_rate": 5.507654917060541e-06,
"loss": 0.8057861328125,
"step": 73,
"token_acc": 0.7638059078138052
},
{
"epoch": 1.5115089514066495,
"grad_norm": 0.5650312900543213,
"learning_rate": 5.395111880049775e-06,
"loss": 0.7869873046875,
"step": 74,
"token_acc": 0.7681778820079445
},
{
"epoch": 1.5319693094629157,
"grad_norm": 1.214871883392334,
"learning_rate": 5.28236701953335e-06,
"loss": 0.8092041015625,
"step": 75,
"token_acc": 0.7620805749476309
},
{
"epoch": 1.5524296675191815,
"grad_norm": 0.5642526745796204,
"learning_rate": 5.169477925687981e-06,
"loss": 0.776611328125,
"step": 76,
"token_acc": 0.7701183258952242
},
{
"epoch": 1.5728900255754477,
"grad_norm": 0.4626403748989105,
"learning_rate": 5.0565022623649e-06,
"loss": 0.8040771484375,
"step": 77,
"token_acc": 0.7638591574550989
},
{
"epoch": 1.5933503836317136,
"grad_norm": 0.5212917923927307,
"learning_rate": 4.943497737635103e-06,
"loss": 0.807861328125,
"step": 78,
"token_acc": 0.7629239401114802
},
{
"epoch": 1.6138107416879794,
"grad_norm": 0.5400001406669617,
"learning_rate": 4.830522074312019e-06,
"loss": 0.797119140625,
"step": 79,
"token_acc": 0.7648081896745939
},
{
"epoch": 1.6342710997442456,
"grad_norm": 0.5016899108886719,
"learning_rate": 4.717632980466652e-06,
"loss": 0.7860107421875,
"step": 80,
"token_acc": 0.7684027888411594
},
{
"epoch": 1.6342710997442456,
"eval_loss": 256.85546875,
"eval_runtime": 186.06,
"eval_samples_per_second": 0.081,
"eval_steps_per_second": 0.005,
"eval_token_acc": 0.754155361689694,
"step": 80
},
{
"epoch": 1.6547314578005117,
"grad_norm": 0.5642063617706299,
"learning_rate": 4.6048881199502265e-06,
"loss": 0.7830810546875,
"step": 81,
"token_acc": 0.7678564181548047
},
{
"epoch": 1.6751918158567776,
"grad_norm": 0.4916647970676422,
"learning_rate": 4.49234508293946e-06,
"loss": 0.779052734375,
"step": 82,
"token_acc": 0.77010977208347
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.6295871138572693,
"learning_rate": 4.38006135651944e-06,
"loss": 0.8001708984375,
"step": 83,
"token_acc": 0.7653163568544067
},
{
"epoch": 1.7161125319693094,
"grad_norm": 0.4934154450893402,
"learning_rate": 4.268094295319167e-06,
"loss": 0.794677734375,
"step": 84,
"token_acc": 0.7658125406456339
},
{
"epoch": 1.7365728900255755,
"grad_norm": 0.4807905852794647,
"learning_rate": 4.1565010922147644e-06,
"loss": 0.8067626953125,
"step": 85,
"token_acc": 0.7628501378393466
},
{
"epoch": 1.7570332480818416,
"grad_norm": 0.4914467930793762,
"learning_rate": 4.045338749115299e-06,
"loss": 0.7962646484375,
"step": 86,
"token_acc": 0.7652487208210966
},
{
"epoch": 1.7774936061381075,
"grad_norm": 0.46617603302001953,
"learning_rate": 3.934664047846157e-06,
"loss": 0.78271484375,
"step": 87,
"token_acc": 0.7677687034999344
},
{
"epoch": 1.7979539641943734,
"grad_norm": 0.535650908946991,
"learning_rate": 3.8245335211448404e-06,
"loss": 0.7938232421875,
"step": 88,
"token_acc": 0.766091786543743
},
{
"epoch": 1.8184143222506393,
"grad_norm": 0.47340837121009827,
"learning_rate": 3.715003423783986e-06,
"loss": 0.8013916015625,
"step": 89,
"token_acc": 0.7640529262026424
},
{
"epoch": 1.8388746803069054,
"grad_norm": 0.6464864611625671,
"learning_rate": 3.6061297038363853e-06,
"loss": 0.80810546875,
"step": 90,
"token_acc": 0.7632898394951272
},
{
"epoch": 1.8388746803069054,
"eval_loss": 255.55859375,
"eval_runtime": 180.9095,
"eval_samples_per_second": 0.083,
"eval_steps_per_second": 0.006,
"eval_token_acc": 0.7548754017614894,
"step": 90
},
{
"epoch": 1.8593350383631715,
"grad_norm": 0.6239431500434875,
"learning_rate": 3.497967974096647e-06,
"loss": 0.800537109375,
"step": 91,
"token_acc": 0.763830056360141
},
{
"epoch": 1.8797953964194374,
"grad_norm": 0.45245441794395447,
"learning_rate": 3.3905734836741415e-06,
"loss": 0.8072509765625,
"step": 92,
"token_acc": 0.7629841640196129
},
{
"epoch": 1.9002557544757033,
"grad_norm": 0.5149667263031006,
"learning_rate": 3.2840010897717045e-06,
"loss": 0.7896728515625,
"step": 93,
"token_acc": 0.7672215188664161
},
{
"epoch": 1.9207161125319692,
"grad_norm": 0.4640462398529053,
"learning_rate": 3.178305229664519e-06,
"loss": 0.802978515625,
"step": 94,
"token_acc": 0.763835117063376
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.45892465114593506,
"learning_rate": 3.073539892893519e-06,
"loss": 0.7943115234375,
"step": 95,
"token_acc": 0.7657574351900455
},
{
"epoch": 1.9616368286445014,
"grad_norm": 0.47301074862480164,
"learning_rate": 2.969758593687475e-06,
"loss": 0.7738037109375,
"step": 96,
"token_acc": 0.7718963916631811
},
{
"epoch": 1.9820971867007673,
"grad_norm": 0.551275372505188,
"learning_rate": 2.8670143436278757e-06,
"loss": 0.7822265625,
"step": 97,
"token_acc": 0.7684924397079685
},
{
"epoch": 2.0,
"grad_norm": 0.4965466260910034,
"learning_rate": 2.765359624570574e-06,
"loss": 0.76708984375,
"step": 98,
"token_acc": 0.7723498958825589
},
{
"epoch": 2.020460358056266,
"grad_norm": 0.6020316481590271,
"learning_rate": 2.664846361837997e-06,
"loss": 0.7359619140625,
"step": 99,
"token_acc": 0.779019540556512
},
{
"epoch": 2.040920716112532,
"grad_norm": 0.5324369668960571,
"learning_rate": 2.565525897695651e-06,
"loss": 0.7454833984375,
"step": 100,
"token_acc": 0.7767212030542502
},
{
"epoch": 2.040920716112532,
"eval_loss": 256.80859375,
"eval_runtime": 176.504,
"eval_samples_per_second": 0.085,
"eval_steps_per_second": 0.006,
"eval_token_acc": 0.754800267145302,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 147,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.812412281074483e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}