{ "best_global_step": 90, "best_metric": 255.55859375, "best_model_checkpoint": "/home/notebook/code/group/eason/ms-swift/qwen2.5_72b_swift_allen/v1-20251030-150430/checkpoint-90", "epoch": 2.040920716112532, "eval_steps": 10, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020460358056265986, "grad_norm": 8.740862846374512, "learning_rate": 1.25e-06, "loss": 1.323974609375, "step": 1, "token_acc": 0.6749949473048041 }, { "epoch": 0.04092071611253197, "grad_norm": 8.77861499786377, "learning_rate": 2.5e-06, "loss": 1.3330078125, "step": 2, "token_acc": 0.673973689521971 }, { "epoch": 0.061381074168797956, "grad_norm": 6.631825923919678, "learning_rate": 3.7500000000000005e-06, "loss": 1.296875, "step": 3, "token_acc": 0.6779223915870493 }, { "epoch": 0.08184143222506395, "grad_norm": 3.3304812908172607, "learning_rate": 5e-06, "loss": 1.2734375, "step": 4, "token_acc": 0.6801284548421133 }, { "epoch": 0.10230179028132992, "grad_norm": 2.9929847717285156, "learning_rate": 6.25e-06, "loss": 1.2674560546875, "step": 5, "token_acc": 0.6774330054044837 }, { "epoch": 0.12276214833759591, "grad_norm": 3.2737808227539062, "learning_rate": 7.500000000000001e-06, "loss": 1.24853515625, "step": 6, "token_acc": 0.6828065892133894 }, { "epoch": 0.1432225063938619, "grad_norm": 3.864370107650757, "learning_rate": 8.750000000000001e-06, "loss": 1.2008056640625, "step": 7, "token_acc": 0.6894752982753333 }, { "epoch": 0.1636828644501279, "grad_norm": 3.059185743331909, "learning_rate": 1e-05, "loss": 1.16259765625, "step": 8, "token_acc": 0.695852402365021 }, { "epoch": 0.18414322250639387, "grad_norm": 2.160026788711548, "learning_rate": 9.99872299773906e-06, "loss": 1.1551513671875, "step": 9, "token_acc": 0.696011262965415 }, { "epoch": 0.20460358056265984, "grad_norm": 1.7391501665115356, "learning_rate": 9.994892643250147e-06, "loss": 1.102783203125, "step": 10, "token_acc": 0.7084828844476136 }, { "epoch": 0.20460358056265984, "eval_loss": 318.328125, "eval_runtime": 174.7551, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.006, "eval_token_acc": 0.714127394915891, "step": 10 }, { "epoch": 0.22506393861892582, "grad_norm": 1.512490153312683, "learning_rate": 9.9885108930818e-06, "loss": 1.059814453125, "step": 11, "token_acc": 0.7156657958041264 }, { "epoch": 0.24552429667519182, "grad_norm": 1.2666767835617065, "learning_rate": 9.979581007037776e-06, "loss": 1.0445556640625, "step": 12, "token_acc": 0.7163779567590186 }, { "epoch": 0.2659846547314578, "grad_norm": 1.035067081451416, "learning_rate": 9.968107546511942e-06, "loss": 1.0322265625, "step": 13, "token_acc": 0.7191054482580511 }, { "epoch": 0.2864450127877238, "grad_norm": 1.0283350944519043, "learning_rate": 9.95409637215831e-06, "loss": 1.0177001953125, "step": 14, "token_acc": 0.7223777681726882 }, { "epoch": 0.3069053708439898, "grad_norm": 0.8945289254188538, "learning_rate": 9.937554640897414e-06, "loss": 0.9921875, "step": 15, "token_acc": 0.7268454397965844 }, { "epoch": 0.3273657289002558, "grad_norm": 0.9290580749511719, "learning_rate": 9.918490802260538e-06, "loss": 0.9925537109375, "step": 16, "token_acc": 0.7261628646104965 }, { "epoch": 0.34782608695652173, "grad_norm": 0.9589850902557373, "learning_rate": 9.896914594073703e-06, "loss": 0.9844970703125, "step": 17, "token_acc": 0.7282743852241678 }, { "epoch": 0.36828644501278773, "grad_norm": 0.8926294445991516, "learning_rate": 9.87283703748356e-06, "loss": 0.955322265625, "step": 18, "token_acc": 0.7333408249225458 }, { "epoch": 0.3887468030690537, "grad_norm": 0.7655003666877747, "learning_rate": 9.846270431327793e-06, "loss": 0.97998046875, "step": 19, "token_acc": 0.7289209223794986 }, { "epoch": 0.4092071611253197, "grad_norm": 0.8207703828811646, "learning_rate": 9.817228345852853e-06, "loss": 0.9736328125, "step": 20, "token_acc": 0.7293907637079884 }, { "epoch": 0.4092071611253197, "eval_loss": 282.9296875, "eval_runtime": 184.9183, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.005, "eval_token_acc": 0.7367637851149977, "step": 20 }, { "epoch": 0.4296675191815857, "grad_norm": 0.899739682674408, "learning_rate": 9.785725615782262e-06, "loss": 0.951171875, "step": 21, "token_acc": 0.7350280185329514 }, { "epoch": 0.45012787723785164, "grad_norm": 0.7847328186035156, "learning_rate": 9.751778332739033e-06, "loss": 0.955322265625, "step": 22, "token_acc": 0.7339763941093482 }, { "epoch": 0.47058823529411764, "grad_norm": 0.8828109502792358, "learning_rate": 9.715403837026046e-06, "loss": 0.928466796875, "step": 23, "token_acc": 0.7396126780013036 }, { "epoch": 0.49104859335038364, "grad_norm": 0.9227666258811951, "learning_rate": 9.676620708768608e-06, "loss": 0.940673828125, "step": 24, "token_acc": 0.7378745153330983 }, { "epoch": 0.5115089514066496, "grad_norm": 0.6232196092605591, "learning_rate": 9.635448758423703e-06, "loss": 0.9249267578125, "step": 25, "token_acc": 0.7404127180940572 }, { "epoch": 0.5319693094629157, "grad_norm": 0.7773280143737793, "learning_rate": 9.591909016660806e-06, "loss": 0.9281005859375, "step": 26, "token_acc": 0.7391337224366917 }, { "epoch": 0.5524296675191815, "grad_norm": 0.63581383228302, "learning_rate": 9.546023723619387e-06, "loss": 0.9176025390625, "step": 27, "token_acc": 0.7424947635543382 }, { "epoch": 0.5728900255754475, "grad_norm": 0.7504338622093201, "learning_rate": 9.497816317548625e-06, "loss": 0.9302978515625, "step": 28, "token_acc": 0.7391818850545303 }, { "epoch": 0.5933503836317136, "grad_norm": 0.5946778655052185, "learning_rate": 9.447311422835141e-06, "loss": 0.9202880859375, "step": 29, "token_acc": 0.7407723631931363 }, { "epoch": 0.6138107416879796, "grad_norm": 0.787707507610321, "learning_rate": 9.39453483742483e-06, "loss": 0.911865234375, "step": 30, "token_acc": 0.7428029623155381 }, { "epoch": 0.6138107416879796, "eval_loss": 271.0625, "eval_runtime": 183.1288, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.005, "eval_token_acc": 0.7445882205618399, "step": 30 }, { "epoch": 0.6342710997442456, "grad_norm": 0.607540488243103, "learning_rate": 9.33951351964525e-06, "loss": 0.9012451171875, "step": 31, "token_acc": 0.745131512392514 }, { "epoch": 0.6547314578005116, "grad_norm": 0.7581419348716736, "learning_rate": 9.28227557443528e-06, "loss": 0.921875, "step": 32, "token_acc": 0.739635471164067 }, { "epoch": 0.6751918158567775, "grad_norm": 0.5728419423103333, "learning_rate": 9.222850238989104e-06, "loss": 0.91064453125, "step": 33, "token_acc": 0.7417856423282375 }, { "epoch": 0.6956521739130435, "grad_norm": 0.7030539512634277, "learning_rate": 9.161267867821802e-06, "loss": 0.90576171875, "step": 34, "token_acc": 0.7440411485273096 }, { "epoch": 0.7161125319693095, "grad_norm": 0.7187775373458862, "learning_rate": 9.097559917264268e-06, "loss": 0.915283203125, "step": 35, "token_acc": 0.7416095001092865 }, { "epoch": 0.7365728900255755, "grad_norm": 1.2131768465042114, "learning_rate": 9.031758929395259e-06, "loss": 0.88720703125, "step": 36, "token_acc": 0.7491847812640715 }, { "epoch": 0.7570332480818415, "grad_norm": 0.6976324319839478, "learning_rate": 8.963898515418885e-06, "loss": 0.910400390625, "step": 37, "token_acc": 0.742646390581947 }, { "epoch": 0.7774936061381074, "grad_norm": 0.6271430253982544, "learning_rate": 8.89401333849598e-06, "loss": 0.8946533203125, "step": 38, "token_acc": 0.7461849707519417 }, { "epoch": 0.7979539641943734, "grad_norm": 0.9629178643226624, "learning_rate": 8.82213909603812e-06, "loss": 0.891357421875, "step": 39, "token_acc": 0.7467142339485449 }, { "epoch": 0.8184143222506394, "grad_norm": 0.5696749091148376, "learning_rate": 8.748312501473351e-06, "loss": 0.889404296875, "step": 40, "token_acc": 0.7480125858101083 }, { "epoch": 0.8184143222506394, "eval_loss": 264.984375, "eval_runtime": 182.0545, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.005, "eval_token_acc": 0.7485849647284718, "step": 40 }, { "epoch": 0.8388746803069054, "grad_norm": 0.6953923106193542, "learning_rate": 8.672571265492944e-06, "loss": 0.904296875, "step": 41, "token_acc": 0.7444278902019779 }, { "epoch": 0.8593350383631714, "grad_norm": 0.6765785217285156, "learning_rate": 8.594954076788736e-06, "loss": 0.891845703125, "step": 42, "token_acc": 0.7476815443943472 }, { "epoch": 0.8797953964194374, "grad_norm": 0.7245502471923828, "learning_rate": 8.515500582290914e-06, "loss": 0.890625, "step": 43, "token_acc": 0.7461978035518999 }, { "epoch": 0.9002557544757033, "grad_norm": 0.5907047986984253, "learning_rate": 8.434251366916323e-06, "loss": 0.9033203125, "step": 44, "token_acc": 0.7438690767483638 }, { "epoch": 0.9207161125319693, "grad_norm": 0.6903477311134338, "learning_rate": 8.351247932837655e-06, "loss": 0.894775390625, "step": 45, "token_acc": 0.745345279252677 }, { "epoch": 0.9411764705882353, "grad_norm": 0.6156295537948608, "learning_rate": 8.266532678284103e-06, "loss": 0.8800048828125, "step": 46, "token_acc": 0.7496958725344752 }, { "epoch": 0.9616368286445013, "grad_norm": 0.6671141982078552, "learning_rate": 8.18014887588431e-06, "loss": 0.883544921875, "step": 47, "token_acc": 0.7479351354819822 }, { "epoch": 0.9820971867007673, "grad_norm": 0.5917587280273438, "learning_rate": 8.092140650562665e-06, "loss": 0.882080078125, "step": 48, "token_acc": 0.7484929482805501 }, { "epoch": 1.0, "grad_norm": 0.930376410484314, "learning_rate": 8.002552957000254e-06, "loss": 0.8798828125, "step": 49, "token_acc": 0.7485103827283421 }, { "epoch": 1.020460358056266, "grad_norm": 0.7340478301048279, "learning_rate": 7.911431556671967e-06, "loss": 0.8404541015625, "step": 50, "token_acc": 0.7561591178820095 }, { "epoch": 1.020460358056266, "eval_loss": 261.390625, "eval_runtime": 186.3503, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.005, "eval_token_acc": 0.7507617815252328, "step": 50 }, { "epoch": 1.040920716112532, "grad_norm": 0.6937265396118164, "learning_rate": 7.818822994471504e-06, "loss": 0.8221435546875, "step": 51, "token_acc": 0.7600091509973993 }, { "epoch": 1.061381074168798, "grad_norm": 0.6445659399032593, "learning_rate": 7.72477457493619e-06, "loss": 0.798095703125, "step": 52, "token_acc": 0.7666009535619558 }, { "epoch": 1.081841432225064, "grad_norm": 0.6034528613090515, "learning_rate": 7.629334338083774e-06, "loss": 0.8121337890625, "step": 53, "token_acc": 0.761754260042804 }, { "epoch": 1.10230179028133, "grad_norm": 0.7658072710037231, "learning_rate": 7.532551034873558e-06, "loss": 0.8314208984375, "step": 54, "token_acc": 0.7583218819656938 }, { "epoch": 1.1227621483375958, "grad_norm": 0.5790229439735413, "learning_rate": 7.43447410230435e-06, "loss": 0.81494140625, "step": 55, "token_acc": 0.7614948252002275 }, { "epoch": 1.143222506393862, "grad_norm": 0.6969874501228333, "learning_rate": 7.335153638162005e-06, "loss": 0.80810546875, "step": 56, "token_acc": 0.7627752172619252 }, { "epoch": 1.1636828644501278, "grad_norm": 0.6890274286270142, "learning_rate": 7.234640375429427e-06, "loss": 0.7890625, "step": 57, "token_acc": 0.7680120601871605 }, { "epoch": 1.184143222506394, "grad_norm": 0.8471683859825134, "learning_rate": 7.132985656372126e-06, "loss": 0.7908935546875, "step": 58, "token_acc": 0.7681585540637447 }, { "epoch": 1.2046035805626598, "grad_norm": 0.5586804747581482, "learning_rate": 7.030241406312528e-06, "loss": 0.7999267578125, "step": 59, "token_acc": 0.7652726739906083 }, { "epoch": 1.2250639386189257, "grad_norm": 0.6501573324203491, "learning_rate": 6.926460107106483e-06, "loss": 0.8023681640625, "step": 60, "token_acc": 0.7641969833563484 }, { "epoch": 1.2250639386189257, "eval_loss": 260.03515625, "eval_runtime": 174.7319, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.006, "eval_token_acc": 0.7517990566431523, "step": 60 }, { "epoch": 1.2455242966751918, "grad_norm": 0.67079758644104, "learning_rate": 6.8216947703354815e-06, "loss": 0.80908203125, "step": 61, "token_acc": 0.762754462206252 }, { "epoch": 1.265984654731458, "grad_norm": 0.8003351092338562, "learning_rate": 6.715998910228296e-06, "loss": 0.822021484375, "step": 62, "token_acc": 0.7596542081982427 }, { "epoch": 1.2864450127877238, "grad_norm": 0.5755249857902527, "learning_rate": 6.609426516325859e-06, "loss": 0.7877197265625, "step": 63, "token_acc": 0.7665187875244992 }, { "epoch": 1.3069053708439897, "grad_norm": 0.5514203310012817, "learning_rate": 6.502032025903356e-06, "loss": 0.79248046875, "step": 64, "token_acc": 0.7668143054654905 }, { "epoch": 1.3273657289002558, "grad_norm": 0.5459880828857422, "learning_rate": 6.393870296163616e-06, "loss": 0.794677734375, "step": 65, "token_acc": 0.7654924274646578 }, { "epoch": 1.3478260869565217, "grad_norm": 0.7557441592216492, "learning_rate": 6.284996576216014e-06, "loss": 0.8095703125, "step": 66, "token_acc": 0.7632387915441781 }, { "epoch": 1.3682864450127878, "grad_norm": 0.5115758776664734, "learning_rate": 6.175466478855161e-06, "loss": 0.787109375, "step": 67, "token_acc": 0.7676507146997723 }, { "epoch": 1.3887468030690537, "grad_norm": 0.6818933486938477, "learning_rate": 6.065335952153846e-06, "loss": 0.7919921875, "step": 68, "token_acc": 0.7656047815638671 }, { "epoch": 1.4092071611253196, "grad_norm": 0.5157542824745178, "learning_rate": 5.954661250884704e-06, "loss": 0.7918701171875, "step": 69, "token_acc": 0.7667713340544047 }, { "epoch": 1.4296675191815857, "grad_norm": 0.5024055242538452, "learning_rate": 5.843498907785236e-06, "loss": 0.7869873046875, "step": 70, "token_acc": 0.767566587893184 }, { "epoch": 1.4296675191815857, "eval_loss": 258.23828125, "eval_runtime": 190.6896, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.005, "eval_token_acc": 0.7528071127436657, "step": 70 }, { "epoch": 1.4501278772378516, "grad_norm": 0.541907548904419, "learning_rate": 5.731905704680834e-06, "loss": 0.799072265625, "step": 71, "token_acc": 0.7636709487617457 }, { "epoch": 1.4705882352941178, "grad_norm": 0.5258617997169495, "learning_rate": 5.6199386434805615e-06, "loss": 0.7833251953125, "step": 72, "token_acc": 0.7686619982839542 }, { "epoch": 1.4910485933503836, "grad_norm": 0.7432481646537781, "learning_rate": 5.507654917060541e-06, "loss": 0.8057861328125, "step": 73, "token_acc": 0.7638059078138052 }, { "epoch": 1.5115089514066495, "grad_norm": 0.5650312900543213, "learning_rate": 5.395111880049775e-06, "loss": 0.7869873046875, "step": 74, "token_acc": 0.7681778820079445 }, { "epoch": 1.5319693094629157, "grad_norm": 1.214871883392334, "learning_rate": 5.28236701953335e-06, "loss": 0.8092041015625, "step": 75, "token_acc": 0.7620805749476309 }, { "epoch": 1.5524296675191815, "grad_norm": 0.5642526745796204, "learning_rate": 5.169477925687981e-06, "loss": 0.776611328125, "step": 76, "token_acc": 0.7701183258952242 }, { "epoch": 1.5728900255754477, "grad_norm": 0.4626403748989105, "learning_rate": 5.0565022623649e-06, "loss": 0.8040771484375, "step": 77, "token_acc": 0.7638591574550989 }, { "epoch": 1.5933503836317136, "grad_norm": 0.5212917923927307, "learning_rate": 4.943497737635103e-06, "loss": 0.807861328125, "step": 78, "token_acc": 0.7629239401114802 }, { "epoch": 1.6138107416879794, "grad_norm": 0.5400001406669617, "learning_rate": 4.830522074312019e-06, "loss": 0.797119140625, "step": 79, "token_acc": 0.7648081896745939 }, { "epoch": 1.6342710997442456, "grad_norm": 0.5016899108886719, "learning_rate": 4.717632980466652e-06, "loss": 0.7860107421875, "step": 80, "token_acc": 0.7684027888411594 }, { "epoch": 1.6342710997442456, "eval_loss": 256.85546875, "eval_runtime": 186.06, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.005, "eval_token_acc": 0.754155361689694, "step": 80 }, { "epoch": 1.6547314578005117, "grad_norm": 0.5642063617706299, "learning_rate": 4.6048881199502265e-06, "loss": 0.7830810546875, "step": 81, "token_acc": 0.7678564181548047 }, { "epoch": 1.6751918158567776, "grad_norm": 0.4916647970676422, "learning_rate": 4.49234508293946e-06, "loss": 0.779052734375, "step": 82, "token_acc": 0.77010977208347 }, { "epoch": 1.6956521739130435, "grad_norm": 0.6295871138572693, "learning_rate": 4.38006135651944e-06, "loss": 0.8001708984375, "step": 83, "token_acc": 0.7653163568544067 }, { "epoch": 1.7161125319693094, "grad_norm": 0.4934154450893402, "learning_rate": 4.268094295319167e-06, "loss": 0.794677734375, "step": 84, "token_acc": 0.7658125406456339 }, { "epoch": 1.7365728900255755, "grad_norm": 0.4807905852794647, "learning_rate": 4.1565010922147644e-06, "loss": 0.8067626953125, "step": 85, "token_acc": 0.7628501378393466 }, { "epoch": 1.7570332480818416, "grad_norm": 0.4914467930793762, "learning_rate": 4.045338749115299e-06, "loss": 0.7962646484375, "step": 86, "token_acc": 0.7652487208210966 }, { "epoch": 1.7774936061381075, "grad_norm": 0.46617603302001953, "learning_rate": 3.934664047846157e-06, "loss": 0.78271484375, "step": 87, "token_acc": 0.7677687034999344 }, { "epoch": 1.7979539641943734, "grad_norm": 0.535650908946991, "learning_rate": 3.8245335211448404e-06, "loss": 0.7938232421875, "step": 88, "token_acc": 0.766091786543743 }, { "epoch": 1.8184143222506393, "grad_norm": 0.47340837121009827, "learning_rate": 3.715003423783986e-06, "loss": 0.8013916015625, "step": 89, "token_acc": 0.7640529262026424 }, { "epoch": 1.8388746803069054, "grad_norm": 0.6464864611625671, "learning_rate": 3.6061297038363853e-06, "loss": 0.80810546875, "step": 90, "token_acc": 0.7632898394951272 }, { "epoch": 1.8388746803069054, "eval_loss": 255.55859375, "eval_runtime": 180.9095, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.006, "eval_token_acc": 0.7548754017614894, "step": 90 }, { "epoch": 1.8593350383631715, "grad_norm": 0.6239431500434875, "learning_rate": 3.497967974096647e-06, "loss": 0.800537109375, "step": 91, "token_acc": 0.763830056360141 }, { "epoch": 1.8797953964194374, "grad_norm": 0.45245441794395447, "learning_rate": 3.3905734836741415e-06, "loss": 0.8072509765625, "step": 92, "token_acc": 0.7629841640196129 }, { "epoch": 1.9002557544757033, "grad_norm": 0.5149667263031006, "learning_rate": 3.2840010897717045e-06, "loss": 0.7896728515625, "step": 93, "token_acc": 0.7672215188664161 }, { "epoch": 1.9207161125319692, "grad_norm": 0.4640462398529053, "learning_rate": 3.178305229664519e-06, "loss": 0.802978515625, "step": 94, "token_acc": 0.763835117063376 }, { "epoch": 1.9411764705882353, "grad_norm": 0.45892465114593506, "learning_rate": 3.073539892893519e-06, "loss": 0.7943115234375, "step": 95, "token_acc": 0.7657574351900455 }, { "epoch": 1.9616368286445014, "grad_norm": 0.47301074862480164, "learning_rate": 2.969758593687475e-06, "loss": 0.7738037109375, "step": 96, "token_acc": 0.7718963916631811 }, { "epoch": 1.9820971867007673, "grad_norm": 0.551275372505188, "learning_rate": 2.8670143436278757e-06, "loss": 0.7822265625, "step": 97, "token_acc": 0.7684924397079685 }, { "epoch": 2.0, "grad_norm": 0.4965466260910034, "learning_rate": 2.765359624570574e-06, "loss": 0.76708984375, "step": 98, "token_acc": 0.7723498958825589 }, { "epoch": 2.020460358056266, "grad_norm": 0.6020316481590271, "learning_rate": 2.664846361837997e-06, "loss": 0.7359619140625, "step": 99, "token_acc": 0.779019540556512 }, { "epoch": 2.040920716112532, "grad_norm": 0.5324369668960571, "learning_rate": 2.565525897695651e-06, "loss": 0.7454833984375, "step": 100, "token_acc": 0.7767212030542502 }, { "epoch": 2.040920716112532, "eval_loss": 256.80859375, "eval_runtime": 176.504, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.006, "eval_token_acc": 0.754800267145302, "step": 100 } ], "logging_steps": 1, "max_steps": 147, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.812412281074483e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }