{
"best_global_step": 1275,
"best_metric": 0.00177309,
"best_model_checkpoint": "/ext_hdd2/nhkoh/gelab-env/checkpoint/gui_exp/sft_448/v0-20260221_074940/checkpoint-1275",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1275,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007847753580537571,
"grad_norm": 72.18675231933594,
"learning_rate": 1.5625e-07,
"loss": 2.0382871627807617,
"memory(GiB)": 68.03,
"step": 1,
"token_acc": 0.5797872340425532,
"train_speed(iter/s)": 0.024381
},
{
"epoch": 0.007847753580537572,
"grad_norm": 36.29729461669922,
"learning_rate": 1.5625e-06,
"loss": 1.7542770173814561,
"memory(GiB)": 78.33,
"step": 10,
"token_acc": 0.5938697318007663,
"train_speed(iter/s)": 0.075875
},
{
"epoch": 0.015695507161075144,
"grad_norm": 13.778337478637695,
"learning_rate": 3.125e-06,
"loss": 0.7909364223480224,
"memory(GiB)": 78.33,
"step": 20,
"token_acc": 0.7760314341846758,
"train_speed(iter/s)": 0.087711
},
{
"epoch": 0.023543260741612712,
"grad_norm": 27.69485855102539,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.3084306240081787,
"memory(GiB)": 78.33,
"step": 30,
"token_acc": 0.911275415896488,
"train_speed(iter/s)": 0.09229
},
{
"epoch": 0.03139101432215029,
"grad_norm": 4.941364765167236,
"learning_rate": 6.25e-06,
"loss": 0.16059274673461915,
"memory(GiB)": 78.33,
"step": 40,
"token_acc": 0.944043321299639,
"train_speed(iter/s)": 0.094919
},
{
"epoch": 0.039238767902687856,
"grad_norm": 7.940003395080566,
"learning_rate": 7.8125e-06,
"loss": 0.1214432954788208,
"memory(GiB)": 78.33,
"step": 50,
"token_acc": 0.9553853086976115,
"train_speed(iter/s)": 0.096567
},
{
"epoch": 0.047086521483225424,
"grad_norm": 4.930227279663086,
"learning_rate": 9.375000000000001e-06,
"loss": 0.12964333295822145,
"memory(GiB)": 78.33,
"step": 60,
"token_acc": 0.9566682715454983,
"train_speed(iter/s)": 0.09759
},
{
"epoch": 0.054934275063763,
"grad_norm": 11.7340726852417,
"learning_rate": 9.999394317256736e-06,
"loss": 0.10636246204376221,
"memory(GiB)": 78.33,
"step": 70,
"token_acc": 0.9639681796911558,
"train_speed(iter/s)": 0.098428
},
{
"epoch": 0.06278202864430057,
"grad_norm": 3.2650105953216553,
"learning_rate": 9.995693454107632e-06,
"loss": 0.10599330663681031,
"memory(GiB)": 78.33,
"step": 80,
"token_acc": 0.9595461272816971,
"train_speed(iter/s)": 0.099074
},
{
"epoch": 0.07062978222483814,
"grad_norm": 3.0711424350738525,
"learning_rate": 9.988630705723449e-06,
"loss": 0.08438605070114136,
"memory(GiB)": 78.33,
"step": 90,
"token_acc": 0.9621403331650682,
"train_speed(iter/s)": 0.09951
},
{
"epoch": 0.07847753580537571,
"grad_norm": 4.047796726226807,
"learning_rate": 9.978210825027824e-06,
"loss": 0.07347342371940613,
"memory(GiB)": 78.33,
"step": 100,
"token_acc": 0.9733570159857904,
"train_speed(iter/s)": 0.099919
},
{
"epoch": 0.08632528938591329,
"grad_norm": 2.480022668838501,
"learning_rate": 9.964440824148982e-06,
"loss": 0.05940539240837097,
"memory(GiB)": 78.33,
"step": 110,
"token_acc": 0.9747619047619047,
"train_speed(iter/s)": 0.100252
},
{
"epoch": 0.09417304296645085,
"grad_norm": 7.3106865882873535,
"learning_rate": 9.94732996970087e-06,
"loss": 0.07703952789306641,
"memory(GiB)": 78.33,
"step": 120,
"token_acc": 0.9689497716894977,
"train_speed(iter/s)": 0.100517
},
{
"epoch": 0.10202079654698842,
"grad_norm": 3.5625627040863037,
"learning_rate": 9.926889776547134e-06,
"loss": 0.06253595352172851,
"memory(GiB)": 78.33,
"step": 130,
"token_acc": 0.9812667261373773,
"train_speed(iter/s)": 0.100722
},
{
"epoch": 0.109868550127526,
"grad_norm": 4.747700214385986,
"learning_rate": 9.903134000052106e-06,
"loss": 0.0579115629196167,
"memory(GiB)": 78.33,
"step": 140,
"token_acc": 0.9774236387782205,
"train_speed(iter/s)": 0.100934
},
{
"epoch": 0.11771630370806356,
"grad_norm": 2.209345579147339,
"learning_rate": 9.87607862682405e-06,
"loss": 0.05663343667984009,
"memory(GiB)": 78.33,
"step": 150,
"token_acc": 0.9818758495695514,
"train_speed(iter/s)": 0.101108
},
{
"epoch": 0.12556405728860115,
"grad_norm": 1.7758883237838745,
"learning_rate": 9.845741863956859e-06,
"loss": 0.05171079039573669,
"memory(GiB)": 78.33,
"step": 160,
"token_acc": 0.9778481012658228,
"train_speed(iter/s)": 0.101248
},
{
"epoch": 0.1334118108691387,
"grad_norm": 1.93281090259552,
"learning_rate": 9.812144126777474e-06,
"loss": 0.05034952163696289,
"memory(GiB)": 78.33,
"step": 170,
"token_acc": 0.982707509881423,
"train_speed(iter/s)": 0.101395
},
{
"epoch": 0.14125956444967627,
"grad_norm": 2.119216203689575,
"learning_rate": 9.77530802510725e-06,
"loss": 0.0457323968410492,
"memory(GiB)": 78.33,
"step": 180,
"token_acc": 0.9846005774783445,
"train_speed(iter/s)": 0.101512
},
{
"epoch": 0.14910731803021385,
"grad_norm": 3.984466314315796,
"learning_rate": 9.735258348046538e-06,
"loss": 0.049006104469299316,
"memory(GiB)": 78.33,
"step": 190,
"token_acc": 0.9822242479489517,
"train_speed(iter/s)": 0.101601
},
{
"epoch": 0.15695507161075142,
"grad_norm": 3.68289852142334,
"learning_rate": 9.692022047292672e-06,
"loss": 0.05341410040855408,
"memory(GiB)": 78.33,
"step": 200,
"token_acc": 0.9789666209419295,
"train_speed(iter/s)": 0.101709
},
{
"epoch": 0.164802825191289,
"grad_norm": 5.514003753662109,
"learning_rate": 9.645628219002667e-06,
"loss": 0.05571324825286865,
"memory(GiB)": 78.33,
"step": 210,
"token_acc": 0.9723809523809523,
"train_speed(iter/s)": 0.101789
},
{
"epoch": 0.17265057877182657,
"grad_norm": 2.3219399452209473,
"learning_rate": 9.596108084212752e-06,
"loss": 0.04433055818080902,
"memory(GiB)": 78.33,
"step": 220,
"token_acc": 0.984073359073359,
"train_speed(iter/s)": 0.101862
},
{
"epoch": 0.18049833235236415,
"grad_norm": 1.942451000213623,
"learning_rate": 9.543494967827972e-06,
"loss": 0.04440748691558838,
"memory(GiB)": 78.33,
"step": 230,
"token_acc": 0.9850483729111698,
"train_speed(iter/s)": 0.101945
},
{
"epoch": 0.1883460859329017,
"grad_norm": 1.6777188777923584,
"learning_rate": 9.48782427619597e-06,
"loss": 0.040869510173797606,
"memory(GiB)": 78.33,
"step": 240,
"token_acc": 0.9838155958803335,
"train_speed(iter/s)": 0.102011
},
{
"epoch": 0.19619383951343927,
"grad_norm": 1.4591469764709473,
"learning_rate": 9.429133473280043e-06,
"loss": 0.03583506345748901,
"memory(GiB)": 78.33,
"step": 250,
"token_acc": 0.9880838894184938,
"train_speed(iter/s)": 0.102067
},
{
"epoch": 0.20404159309397685,
"grad_norm": 1.7732504606246948,
"learning_rate": 9.367462055447528e-06,
"loss": 0.0392861932516098,
"memory(GiB)": 78.33,
"step": 260,
"token_acc": 0.9775227164036346,
"train_speed(iter/s)": 0.102128
},
{
"epoch": 0.21188934667451442,
"grad_norm": 1.3268229961395264,
"learning_rate": 9.302851524890452e-06,
"loss": 0.11516731977462769,
"memory(GiB)": 78.33,
"step": 270,
"token_acc": 0.9702209414024976,
"train_speed(iter/s)": 0.102181
},
{
"epoch": 0.219737100255052,
"grad_norm": 1.4181880950927734,
"learning_rate": 9.235345361696354e-06,
"loss": 0.029826369881629945,
"memory(GiB)": 78.33,
"step": 280,
"token_acc": 0.9828947368421053,
"train_speed(iter/s)": 0.102219
},
{
"epoch": 0.22758485383558957,
"grad_norm": 1.2937140464782715,
"learning_rate": 9.164988994588077e-06,
"loss": 0.032772365212440493,
"memory(GiB)": 78.33,
"step": 290,
"token_acc": 0.9853313100657562,
"train_speed(iter/s)": 0.102275
},
{
"epoch": 0.23543260741612712,
"grad_norm": 2.1428334712982178,
"learning_rate": 9.091829770352194e-06,
"loss": 0.026177412271499632,
"memory(GiB)": 78.33,
"step": 300,
"token_acc": 0.985909090909091,
"train_speed(iter/s)": 0.10232
},
{
"epoch": 0.2432803609966647,
"grad_norm": 1.6647447347640991,
"learning_rate": 9.015916921976684e-06,
"loss": 0.030026063323020935,
"memory(GiB)": 78.33,
"step": 310,
"token_acc": 0.9893975903614458,
"train_speed(iter/s)": 0.102353
},
{
"epoch": 0.2511281145772023,
"grad_norm": 1.532943606376648,
"learning_rate": 8.93730153551926e-06,
"loss": 0.028212451934814455,
"memory(GiB)": 78.33,
"step": 320,
"token_acc": 0.9867881548974943,
"train_speed(iter/s)": 0.102401
},
{
"epoch": 0.2589758681577399,
"grad_norm": 1.246038794517517,
"learning_rate": 8.856036515728666e-06,
"loss": 0.031563830375671384,
"memory(GiB)": 78.33,
"step": 330,
"token_acc": 0.9879013494648674,
"train_speed(iter/s)": 0.10244
},
{
"epoch": 0.2668236217382774,
"grad_norm": 1.293062448501587,
"learning_rate": 8.772176550442063e-06,
"loss": 0.027055150270462035,
"memory(GiB)": 78.33,
"step": 340,
"token_acc": 0.9899343544857768,
"train_speed(iter/s)": 0.102468
},
{
"epoch": 0.27467137531881497,
"grad_norm": 1.6155800819396973,
"learning_rate": 8.68577807378251e-06,
"loss": 0.02880167365074158,
"memory(GiB)": 78.33,
"step": 350,
"token_acc": 0.9876488751654169,
"train_speed(iter/s)": 0.102506
},
{
"epoch": 0.28251912889935255,
"grad_norm": 1.7026891708374023,
"learning_rate": 8.596899228181216e-06,
"loss": 0.023001885414123534,
"memory(GiB)": 78.33,
"step": 360,
"token_acc": 0.9908088235294118,
"train_speed(iter/s)": 0.102532
},
{
"epoch": 0.2903668824798901,
"grad_norm": 1.3324004411697388,
"learning_rate": 8.505599825250217e-06,
"loss": 0.026848217844963072,
"memory(GiB)": 78.33,
"step": 370,
"token_acc": 0.9889553612517257,
"train_speed(iter/s)": 0.102551
},
{
"epoch": 0.2982146360604277,
"grad_norm": 1.1974210739135742,
"learning_rate": 8.411941305531757e-06,
"loss": 0.026788771152496338,
"memory(GiB)": 78.33,
"step": 380,
"token_acc": 0.9888746255883611,
"train_speed(iter/s)": 0.102581
},
{
"epoch": 0.30606238964096527,
"grad_norm": 0.8245720863342285,
"learning_rate": 8.315986697151453e-06,
"loss": 0.025021129846572877,
"memory(GiB)": 78.33,
"step": 390,
"token_acc": 0.9893911439114391,
"train_speed(iter/s)": 0.102605
},
{
"epoch": 0.31391014322150285,
"grad_norm": 1.4364780187606812,
"learning_rate": 8.217800573403105e-06,
"loss": 0.02195422351360321,
"memory(GiB)": 78.33,
"step": 400,
"token_acc": 0.9930297397769516,
"train_speed(iter/s)": 0.102624
},
{
"epoch": 0.3217578968020404,
"grad_norm": 1.7430766820907593,
"learning_rate": 8.117449009293668e-06,
"loss": 0.027543401718139647,
"memory(GiB)": 78.33,
"step": 410,
"token_acc": 0.990978800180424,
"train_speed(iter/s)": 0.102653
},
{
"epoch": 0.329605650382578,
"grad_norm": 1.2770256996154785,
"learning_rate": 8.014999537077633e-06,
"loss": 0.02160567492246628,
"memory(GiB)": 78.33,
"step": 420,
"token_acc": 0.9921478060046189,
"train_speed(iter/s)": 0.102677
},
{
"epoch": 0.3374534039631156,
"grad_norm": 1.3733662366867065,
"learning_rate": 7.910521100810743e-06,
"loss": 0.026095324754714967,
"memory(GiB)": 78.33,
"step": 430,
"token_acc": 0.9870490286771508,
"train_speed(iter/s)": 0.102699
},
{
"epoch": 0.34530115754365315,
"grad_norm": 0.7466018199920654,
"learning_rate": 7.804084009953638e-06,
"loss": 0.023969930410385133,
"memory(GiB)": 78.33,
"step": 440,
"token_acc": 0.9892673821745217,
"train_speed(iter/s)": 0.102719
},
{
"epoch": 0.3531489111241907,
"grad_norm": 1.8762829303741455,
"learning_rate": 7.695759892056627e-06,
"loss": 0.026105433702468872,
"memory(GiB)": 78.33,
"step": 450,
"token_acc": 0.9867986798679867,
"train_speed(iter/s)": 0.102733
},
{
"epoch": 0.3609966647047283,
"grad_norm": 1.4278916120529175,
"learning_rate": 7.585621644557453e-06,
"loss": 0.02523442208766937,
"memory(GiB)": 78.33,
"step": 460,
"token_acc": 0.9880179730404394,
"train_speed(iter/s)": 0.102756
},
{
"epoch": 0.3688444182852658,
"grad_norm": 1.1567350625991821,
"learning_rate": 7.473743385724478e-06,
"loss": 0.018508574366569518,
"memory(GiB)": 78.33,
"step": 470,
"token_acc": 0.9911154985192497,
"train_speed(iter/s)": 0.102775
},
{
"epoch": 0.3766921718658034,
"grad_norm": 1.2503105401992798,
"learning_rate": 7.3602004047783e-06,
"loss": 0.0277862012386322,
"memory(GiB)": 78.33,
"step": 480,
"token_acc": 0.9869375907111756,
"train_speed(iter/s)": 0.102794
},
{
"epoch": 0.38453992544634097,
"grad_norm": 0.9864979982376099,
"learning_rate": 7.245069111225365e-06,
"loss": 0.020091001689434052,
"memory(GiB)": 78.33,
"step": 490,
"token_acc": 0.9874360167519777,
"train_speed(iter/s)": 0.102814
},
{
"epoch": 0.39238767902687854,
"grad_norm": 0.7295334339141846,
"learning_rate": 7.128426983437685e-06,
"loss": 0.018309633433818816,
"memory(GiB)": 78.33,
"step": 500,
"token_acc": 0.9927813163481953,
"train_speed(iter/s)": 0.102832
},
{
"epoch": 0.39238767902687854,
"eval_loss": 0.01768402010202408,
"eval_runtime": 17.8731,
"eval_samples_per_second": 17.233,
"eval_steps_per_second": 2.909,
"eval_token_acc": 0.9905556236967987,
"step": 500
},
{
"epoch": 0.4002354326074161,
"grad_norm": 1.2380131483078003,
"learning_rate": 7.010352516513246e-06,
"loss": 0.021393966674804688,
"memory(GiB)": 78.33,
"step": 510,
"token_acc": 0.9890754602468137,
"train_speed(iter/s)": 0.1017
},
{
"epoch": 0.4080831861879537,
"grad_norm": 0.7806908488273621,
"learning_rate": 6.890925169452215e-06,
"loss": 0.01623480170965195,
"memory(GiB)": 78.33,
"step": 520,
"token_acc": 0.9933014354066986,
"train_speed(iter/s)": 0.101735
},
{
"epoch": 0.41593093976849127,
"grad_norm": 0.7129095792770386,
"learning_rate": 6.770225311684469e-06,
"loss": 0.018100659549236297,
"memory(GiB)": 78.33,
"step": 530,
"token_acc": 0.9937106918238994,
"train_speed(iter/s)": 0.10177
},
{
"epoch": 0.42377869334902885,
"grad_norm": 0.9304487705230713,
"learning_rate": 6.648334168984452e-06,
"loss": 0.014826363325119019,
"memory(GiB)": 78.33,
"step": 540,
"token_acc": 0.9924026590693258,
"train_speed(iter/s)": 0.101801
},
{
"epoch": 0.4316264469295664,
"grad_norm": 0.5562326908111572,
"learning_rate": 6.525333768809755e-06,
"loss": 0.017968928813934325,
"memory(GiB)": 78.33,
"step": 550,
"token_acc": 0.9962894248608535,
"train_speed(iter/s)": 0.101834
},
{
"epoch": 0.439474200510104,
"grad_norm": 0.6488431096076965,
"learning_rate": 6.4013068851001815e-06,
"loss": 0.01475011110305786,
"memory(GiB)": 78.33,
"step": 560,
"token_acc": 0.9914529914529915,
"train_speed(iter/s)": 0.101862
},
{
"epoch": 0.44732195409064157,
"grad_norm": 0.7067273259162903,
"learning_rate": 6.276336982574479e-06,
"loss": 0.0170462965965271,
"memory(GiB)": 78.33,
"step": 570,
"token_acc": 0.9939707149009475,
"train_speed(iter/s)": 0.101891
},
{
"epoch": 0.45516970767117915,
"grad_norm": 0.9754965305328369,
"learning_rate": 6.150508160562201e-06,
"loss": 0.012932208180427552,
"memory(GiB)": 78.33,
"step": 580,
"token_acc": 0.9925044091710759,
"train_speed(iter/s)": 0.101923
},
{
"epoch": 0.4630174612517167,
"grad_norm": 1.3289886713027954,
"learning_rate": 6.023905096408493e-06,
"loss": 0.015326529741287231,
"memory(GiB)": 78.33,
"step": 590,
"token_acc": 0.9921052631578947,
"train_speed(iter/s)": 0.101953
},
{
"epoch": 0.47086521483225424,
"grad_norm": 1.0586456060409546,
"learning_rate": 5.896612988489917e-06,
"loss": 0.014036232233047485,
"memory(GiB)": 78.33,
"step": 600,
"token_acc": 0.9930523390458545,
"train_speed(iter/s)": 0.101977
},
{
"epoch": 0.4787129684127918,
"grad_norm": 0.5072459578514099,
"learning_rate": 5.768717498879635e-06,
"loss": 0.01249450072646141,
"memory(GiB)": 78.33,
"step": 610,
"token_acc": 0.9956875898418783,
"train_speed(iter/s)": 0.102005
},
{
"epoch": 0.4865607219933294,
"grad_norm": 0.4904365539550781,
"learning_rate": 5.640304695700543e-06,
"loss": 0.009184502065181732,
"memory(GiB)": 78.33,
"step": 620,
"token_acc": 0.9970631424375918,
"train_speed(iter/s)": 0.10203
},
{
"epoch": 0.49440847557386697,
"grad_norm": 0.4811933636665344,
"learning_rate": 5.511460995205152e-06,
"loss": 0.009154336154460907,
"memory(GiB)": 78.33,
"step": 630,
"token_acc": 0.9952277657266811,
"train_speed(iter/s)": 0.102052
},
{
"epoch": 0.5022562291544046,
"grad_norm": 0.6274723410606384,
"learning_rate": 5.3822731036211975e-06,
"loss": 0.016800814867019655,
"memory(GiB)": 78.33,
"step": 640,
"token_acc": 0.9949977262391997,
"train_speed(iter/s)": 0.102079
},
{
"epoch": 0.5101039827349422,
"grad_norm": 0.4992922842502594,
"learning_rate": 5.252827958802104e-06,
"loss": 0.0129698246717453,
"memory(GiB)": 78.33,
"step": 650,
"token_acc": 0.9954934655250113,
"train_speed(iter/s)": 0.102099
},
{
"epoch": 0.5179517363154797,
"grad_norm": 0.20970365405082703,
"learning_rate": 5.123212671721576e-06,
"loss": 0.011136610805988312,
"memory(GiB)": 78.33,
"step": 660,
"token_acc": 0.9967289719626168,
"train_speed(iter/s)": 0.10212
},
{
"epoch": 0.5257994898960172,
"grad_norm": 0.6177439093589783,
"learning_rate": 4.99351446785169e-06,
"loss": 0.008509316295385361,
"memory(GiB)": 78.33,
"step": 670,
"token_acc": 0.9966634890371783,
"train_speed(iter/s)": 0.102147
},
{
"epoch": 0.5336472434765548,
"grad_norm": 1.3992984294891357,
"learning_rate": 4.863820628463925e-06,
"loss": 0.008021638542413712,
"memory(GiB)": 78.33,
"step": 680,
"token_acc": 0.9985155863433943,
"train_speed(iter/s)": 0.102167
},
{
"epoch": 0.5414949970570924,
"grad_norm": 0.5666137933731079,
"learning_rate": 4.734218431892659e-06,
"loss": 0.010254481434822082,
"memory(GiB)": 78.33,
"step": 690,
"token_acc": 0.9966918714555766,
"train_speed(iter/s)": 0.102186
},
{
"epoch": 0.5493427506376299,
"grad_norm": 0.8575289845466614,
"learning_rate": 4.604795094800618e-06,
"loss": 0.006985708326101303,
"memory(GiB)": 78.33,
"step": 700,
"token_acc": 0.9972093023255814,
"train_speed(iter/s)": 0.102208
},
{
"epoch": 0.5571905042181675,
"grad_norm": 0.5324018001556396,
"learning_rate": 4.475637713485853e-06,
"loss": 0.00994066745042801,
"memory(GiB)": 78.33,
"step": 710,
"token_acc": 0.995475113122172,
"train_speed(iter/s)": 0.102224
},
{
"epoch": 0.5650382577987051,
"grad_norm": 0.1626451015472412,
"learning_rate": 4.3468332052697e-06,
"loss": 0.007179060578346252,
"memory(GiB)": 78.33,
"step": 720,
"token_acc": 0.9971195391262602,
"train_speed(iter/s)": 0.10224
},
{
"epoch": 0.5728860113792427,
"grad_norm": 0.282648503780365,
"learning_rate": 4.218468250005189e-06,
"loss": 0.009923791885375977,
"memory(GiB)": 78.33,
"step": 730,
"token_acc": 0.9982158786797503,
"train_speed(iter/s)": 0.102261
},
{
"epoch": 0.5807337649597802,
"grad_norm": 1.230008840560913,
"learning_rate": 4.090629231745257e-06,
"loss": 0.010334306955337524,
"memory(GiB)": 78.33,
"step": 740,
"token_acc": 0.9966903073286052,
"train_speed(iter/s)": 0.102277
},
{
"epoch": 0.5885815185403178,
"grad_norm": 0.9169402122497559,
"learning_rate": 3.963402180610028e-06,
"loss": 0.007900170236825942,
"memory(GiB)": 78.33,
"step": 750,
"token_acc": 0.9972413793103448,
"train_speed(iter/s)": 0.102291
},
{
"epoch": 0.5964292721208554,
"grad_norm": 0.9331321716308594,
"learning_rate": 3.836872714892268e-06,
"loss": 0.0052720453590154644,
"memory(GiB)": 78.33,
"step": 760,
"token_acc": 0.9972489683631361,
"train_speed(iter/s)": 0.102309
},
{
"epoch": 0.604277025701393,
"grad_norm": 0.2847830057144165,
"learning_rate": 3.7111259834399776e-06,
"loss": 0.005255531892180443,
"memory(GiB)": 78.33,
"step": 770,
"token_acc": 0.9972776769509982,
"train_speed(iter/s)": 0.102325
},
{
"epoch": 0.6121247792819305,
"grad_norm": 0.5080392360687256,
"learning_rate": 3.5862466083549176e-06,
"loss": 0.003240898996591568,
"memory(GiB)": 78.33,
"step": 780,
"token_acc": 0.9979939819458375,
"train_speed(iter/s)": 0.102338
},
{
"epoch": 0.6199725328624681,
"grad_norm": 0.9272496104240417,
"learning_rate": 3.4623186280455938e-06,
"loss": 0.004520921036601067,
"memory(GiB)": 78.33,
"step": 790,
"token_acc": 0.9975049900199601,
"train_speed(iter/s)": 0.102355
},
{
"epoch": 0.6278202864430057,
"grad_norm": 0.6222965121269226,
"learning_rate": 3.339425440673049e-06,
"loss": 0.007227100431919098,
"memory(GiB)": 78.33,
"step": 800,
"token_acc": 0.9971014492753624,
"train_speed(iter/s)": 0.102372
},
{
"epoch": 0.6356680400235433,
"grad_norm": 1.0359742641448975,
"learning_rate": 3.2176497480275196e-06,
"loss": 0.0054885722696781155,
"memory(GiB)": 78.33,
"step": 810,
"token_acc": 0.9980601357904947,
"train_speed(iter/s)": 0.102383
},
{
"epoch": 0.6435157936040808,
"grad_norm": 0.13294534385204315,
"learning_rate": 3.0970734998737095e-06,
"loss": 0.005127144977450371,
"memory(GiB)": 78.33,
"step": 820,
"token_acc": 0.9975868725868726,
"train_speed(iter/s)": 0.1024
},
{
"epoch": 0.6513635471846184,
"grad_norm": 0.3067134618759155,
"learning_rate": 2.9777778388021508e-06,
"loss": 0.0035617969930171966,
"memory(GiB)": 78.33,
"step": 830,
"token_acc": 0.9981176470588236,
"train_speed(iter/s)": 0.102414
},
{
"epoch": 0.659211300765156,
"grad_norm": 0.9650315046310425,
"learning_rate": 2.859843045623753e-06,
"loss": 0.004638446867465973,
"memory(GiB)": 78.33,
"step": 840,
"token_acc": 0.9977127172918573,
"train_speed(iter/s)": 0.102425
},
{
"epoch": 0.6670590543456936,
"grad_norm": 0.6016029119491577,
"learning_rate": 2.743348485344307e-06,
"loss": 0.004326858744025231,
"memory(GiB)": 78.33,
"step": 850,
"token_acc": 0.9991146525011066,
"train_speed(iter/s)": 0.102441
},
{
"epoch": 0.6749068079262311,
"grad_norm": 0.0693819597363472,
"learning_rate": 2.6283725537552573e-06,
"loss": 0.004721887409687042,
"memory(GiB)": 78.33,
"step": 860,
"token_acc": 0.9976065102920058,
"train_speed(iter/s)": 0.102453
},
{
"epoch": 0.6827545615067687,
"grad_norm": 0.8225326538085938,
"learning_rate": 2.514992624676748e-06,
"loss": 0.0044202588498592375,
"memory(GiB)": 78.33,
"step": 870,
"token_acc": 0.9981176470588236,
"train_speed(iter/s)": 0.102466
},
{
"epoch": 0.6906023150873063,
"grad_norm": 0.1978958398103714,
"learning_rate": 2.403284997888381e-06,
"loss": 0.003972284868359566,
"memory(GiB)": 78.33,
"step": 880,
"token_acc": 0.9976915974145891,
"train_speed(iter/s)": 0.102478
},
{
"epoch": 0.6984500686678439,
"grad_norm": 0.12321511656045914,
"learning_rate": 2.2933248477827814e-06,
"loss": 0.006783504784107208,
"memory(GiB)": 78.33,
"step": 890,
"token_acc": 0.9984984984984985,
"train_speed(iter/s)": 0.102492
},
{
"epoch": 0.7062978222483814,
"grad_norm": 0.38810572028160095,
"learning_rate": 2.1851861727764815e-06,
"loss": 0.004711529612541199,
"memory(GiB)": 78.33,
"step": 900,
"token_acc": 0.9980544747081712,
"train_speed(iter/s)": 0.102507
},
{
"epoch": 0.714145575828919,
"grad_norm": 0.3537726402282715,
"learning_rate": 2.0789417455121964e-06,
"loss": 0.0040462717413902284,
"memory(GiB)": 78.33,
"step": 910,
"token_acc": 0.998610467809171,
"train_speed(iter/s)": 0.102515
},
{
"epoch": 0.7219933294094566,
"grad_norm": 0.5852633118629456,
"learning_rate": 1.9746630638859853e-06,
"loss": 0.002691943012177944,
"memory(GiB)": 78.33,
"step": 920,
"token_acc": 0.9984901862103673,
"train_speed(iter/s)": 0.102527
},
{
"epoch": 0.7298410829899941,
"grad_norm": 0.09465645998716354,
"learning_rate": 1.8724203029322684e-06,
"loss": 0.0028355952352285387,
"memory(GiB)": 78.33,
"step": 930,
"token_acc": 0.9995393827729157,
"train_speed(iter/s)": 0.102539
},
{
"epoch": 0.7376888365705316,
"grad_norm": 0.6381473541259766,
"learning_rate": 1.772282267599068e-06,
"loss": 0.0024913540109992027,
"memory(GiB)": 78.33,
"step": 940,
"token_acc": 0.9990740740740741,
"train_speed(iter/s)": 0.102548
},
{
"epoch": 0.7455365901510692,
"grad_norm": 0.6442692279815674,
"learning_rate": 1.6743163464452605e-06,
"loss": 0.0036306858062744142,
"memory(GiB)": 78.33,
"step": 950,
"token_acc": 0.9976213130352045,
"train_speed(iter/s)": 0.10256
},
{
"epoch": 0.7533843437316068,
"grad_norm": 0.5344854593276978,
"learning_rate": 1.5785884662909917e-06,
"loss": 0.0027463218197226525,
"memory(GiB)": 78.33,
"step": 960,
"token_acc": 0.9976819656930923,
"train_speed(iter/s)": 0.102572
},
{
"epoch": 0.7612320973121444,
"grad_norm": 1.05326247215271,
"learning_rate": 1.4851630478517942e-06,
"loss": 0.0029366277158260345,
"memory(GiB)": 78.33,
"step": 970,
"token_acc": 0.9985869053226566,
"train_speed(iter/s)": 0.102583
},
{
"epoch": 0.7690798508926819,
"grad_norm": 0.2340899109840393,
"learning_rate": 1.394102962386223e-06,
"loss": 0.0015608785673975945,
"memory(GiB)": 78.33,
"step": 980,
"token_acc": 0.999054820415879,
"train_speed(iter/s)": 0.102594
},
{
"epoch": 0.7769276044732195,
"grad_norm": 0.08281790465116501,
"learning_rate": 1.3054694893862341e-06,
"loss": 0.0035311192274093627,
"memory(GiB)": 78.33,
"step": 990,
"token_acc": 0.9981438515081207,
"train_speed(iter/s)": 0.102606
},
{
"epoch": 0.7847753580537571,
"grad_norm": 0.4216911494731903,
"learning_rate": 1.219322275338738e-06,
"loss": 0.003600326552987099,
"memory(GiB)": 78.33,
"step": 1000,
"token_acc": 1.0,
"train_speed(iter/s)": 0.102614
},
{
"epoch": 0.7847753580537571,
"eval_loss": 0.002305834786966443,
"eval_runtime": 17.6788,
"eval_samples_per_second": 17.422,
"eval_steps_per_second": 2.941,
"eval_token_acc": 0.9990187660983687,
"step": 1000
},
{
"epoch": 0.7926231116342947,
"grad_norm": 0.233176589012146,
"learning_rate": 1.1357192935860955e-06,
"loss": 0.001780262403190136,
"memory(GiB)": 78.33,
"step": 1010,
"token_acc": 0.9993978321959053,
"train_speed(iter/s)": 0.102038
},
{
"epoch": 0.8004708652148322,
"grad_norm": 0.4482191801071167,
"learning_rate": 1.0547168053125733e-06,
"loss": 0.004453697055578232,
"memory(GiB)": 78.33,
"step": 1020,
"token_acc": 0.999061473486626,
"train_speed(iter/s)": 0.102041
},
{
"epoch": 0.8083186187953698,
"grad_norm": 0.04691644757986069,
"learning_rate": 9.763693216830055e-07,
"loss": 0.0016242723912000656,
"memory(GiB)": 78.33,
"step": 1030,
"token_acc": 0.9991503823279524,
"train_speed(iter/s)": 0.102056
},
{
"epoch": 0.8161663723759074,
"grad_norm": 0.31022143363952637,
"learning_rate": 9.007295671591393e-07,
"loss": 0.003097619116306305,
"memory(GiB)": 78.33,
"step": 1040,
"token_acc": 0.999526066350711,
"train_speed(iter/s)": 0.10207
},
{
"epoch": 0.824014125956445,
"grad_norm": 0.44393110275268555,
"learning_rate": 8.278484440183549e-07,
"loss": 0.002246275171637535,
"memory(GiB)": 78.33,
"step": 1050,
"token_acc": 0.9991091314031181,
"train_speed(iter/s)": 0.102085
},
{
"epoch": 0.8318618795369825,
"grad_norm": 0.268406480550766,
"learning_rate": 7.577749980986443e-07,
"loss": 0.003959977626800537,
"memory(GiB)": 78.33,
"step": 1060,
"token_acc": 0.99800796812749,
"train_speed(iter/s)": 0.1021
},
{
"epoch": 0.8397096331175201,
"grad_norm": 0.8081741333007812,
"learning_rate": 6.905563857928838e-07,
"loss": 0.004642174392938614,
"memory(GiB)": 78.33,
"step": 1070,
"token_acc": 0.9976303317535545,
"train_speed(iter/s)": 0.102113
},
{
"epoch": 0.8475573866980577,
"grad_norm": 0.09950771182775497,
"learning_rate": 6.262378423146254e-07,
"loss": 0.0011267985217273235,
"memory(GiB)": 78.33,
"step": 1080,
"token_acc": 0.9995366079703429,
"train_speed(iter/s)": 0.102128
},
{
"epoch": 0.8554051402785953,
"grad_norm": 0.5595096945762634,
"learning_rate": 5.648626512567546e-07,
"loss": 0.001190672628581524,
"memory(GiB)": 78.33,
"step": 1090,
"token_acc": 0.9995657837603127,
"train_speed(iter/s)": 0.102142
},
{
"epoch": 0.8632528938591328,
"grad_norm": 0.3294273018836975,
"learning_rate": 5.064721154635155e-07,
"loss": 0.0030788829550147056,
"memory(GiB)": 78.33,
"step": 1100,
"token_acc": 0.9986708019494904,
"train_speed(iter/s)": 0.102152
},
{
"epoch": 0.8711006474396704,
"grad_norm": 1.3336243629455566,
"learning_rate": 4.511055292354799e-07,
"loss": 0.0034807972609996797,
"memory(GiB)": 78.33,
"step": 1110,
"token_acc": 0.9985250737463127,
"train_speed(iter/s)": 0.102168
},
{
"epoch": 0.878948401020208,
"grad_norm": 0.5120218992233276,
"learning_rate": 3.988001518861878e-07,
"loss": 0.003415053337812424,
"memory(GiB)": 78.33,
"step": 1120,
"token_acc": 0.9981299672744273,
"train_speed(iter/s)": 0.10218
},
{
"epoch": 0.8867961546007456,
"grad_norm": 0.2708381116390228,
"learning_rate": 3.495911826682441e-07,
"loss": 0.002040334790945053,
"memory(GiB)": 78.33,
"step": 1130,
"token_acc": 0.9990627928772259,
"train_speed(iter/s)": 0.102191
},
{
"epoch": 0.8946439081812831,
"grad_norm": 0.2763141989707947,
"learning_rate": 3.0351173708574657e-07,
"loss": 0.002172568999230862,
"memory(GiB)": 78.33,
"step": 1140,
"token_acc": 0.9994972347913524,
"train_speed(iter/s)": 0.102204
},
{
"epoch": 0.9024916617618207,
"grad_norm": 0.08879227936267853,
"learning_rate": 2.605928246089834e-07,
"loss": 0.0020642828196287153,
"memory(GiB)": 78.33,
"step": 1150,
"token_acc": 0.9995069033530573,
"train_speed(iter/s)": 0.102216
},
{
"epoch": 0.9103394153423583,
"grad_norm": 0.4619658291339874,
"learning_rate": 2.2086332780640928e-07,
"loss": 0.001702458970248699,
"memory(GiB)": 78.33,
"step": 1160,
"token_acc": 0.9995285242809995,
"train_speed(iter/s)": 0.102227
},
{
"epoch": 0.9181871689228959,
"grad_norm": 0.6599162220954895,
"learning_rate": 1.8434998290792373e-07,
"loss": 0.0055834796279668805,
"memory(GiB)": 78.33,
"step": 1170,
"token_acc": 0.9982070820259973,
"train_speed(iter/s)": 0.10224
},
{
"epoch": 0.9260349225034334,
"grad_norm": 0.07474014908075333,
"learning_rate": 1.510773618125494e-07,
"loss": 0.0019306868314743042,
"memory(GiB)": 78.33,
"step": 1180,
"token_acc": 0.9995256166982922,
"train_speed(iter/s)": 0.102253
},
{
"epoch": 0.9338826760839709,
"grad_norm": 0.12219434976577759,
"learning_rate": 1.2106785555260568e-07,
"loss": 0.0020278608426451683,
"memory(GiB)": 78.33,
"step": 1190,
"token_acc": 0.9995555555555555,
"train_speed(iter/s)": 0.102262
},
{
"epoch": 0.9417304296645085,
"grad_norm": 0.14843851327896118,
"learning_rate": 9.434165922551641e-08,
"loss": 0.0019979637116193773,
"memory(GiB)": 78.33,
"step": 1200,
"token_acc": 0.9995711835334476,
"train_speed(iter/s)": 0.102275
},
{
"epoch": 0.9495781832450461,
"grad_norm": 0.4590687155723572,
"learning_rate": 7.091675840338485e-08,
"loss": 0.0013441312126815318,
"memory(GiB)": 78.33,
"step": 1210,
"token_acc": 0.9995460735360872,
"train_speed(iter/s)": 0.102286
},
{
"epoch": 0.9574259368255836,
"grad_norm": 0.3629854619503021,
"learning_rate": 5.0808917029481205e-08,
"loss": 0.002632497064769268,
"memory(GiB)": 78.33,
"step": 1220,
"token_acc": 0.9995487364620939,
"train_speed(iter/s)": 0.102295
},
{
"epoch": 0.9652736904061212,
"grad_norm": 0.20997942984104156,
"learning_rate": 3.4031666809793974e-08,
"loss": 0.0015788381919264794,
"memory(GiB)": 78.33,
"step": 1230,
"token_acc": 1.0,
"train_speed(iter/s)": 0.102307
},
{
"epoch": 0.9731214439866588,
"grad_norm": 0.4643399715423584,
"learning_rate": 2.0596298106774214e-08,
"loss": 0.0017853409051895141,
"memory(GiB)": 78.33,
"step": 1240,
"token_acc": 0.9991235758106923,
"train_speed(iter/s)": 0.102317
},
{
"epoch": 0.9809691975671964,
"grad_norm": 0.1480502039194107,
"learning_rate": 1.051185234141494e-08,
"loss": 0.0018517106771469116,
"memory(GiB)": 78.33,
"step": 1250,
"token_acc": 0.9990913221263062,
"train_speed(iter/s)": 0.102326
},
{
"epoch": 0.9888169511477339,
"grad_norm": 0.6869596838951111,
"learning_rate": 3.7851159087665124e-09,
"loss": 0.0024400349706411363,
"memory(GiB)": 78.33,
"step": 1260,
"token_acc": 0.9990494296577946,
"train_speed(iter/s)": 0.102338
},
{
"epoch": 0.9966647047282715,
"grad_norm": 0.9563629031181335,
"learning_rate": 4.2061561098261093e-10,
"loss": 0.0032432712614536285,
"memory(GiB)": 78.33,
"step": 1270,
"token_acc": 0.9995429616087751,
"train_speed(iter/s)": 0.102349
},
{
"epoch": 1.0,
"eval_loss": 0.0017730883555486798,
"eval_runtime": 18.1533,
"eval_samples_per_second": 16.967,
"eval_steps_per_second": 2.864,
"eval_token_acc": 0.9992640745737765,
"step": 1275
}
],
"logging_steps": 10,
"max_steps": 1275,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.34975330481655e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}