{ "best_metric": 0.65642351, "best_model_checkpoint": "/m2v_intern/zhangzhicheng03/code/face-llm/ms-swift/Emo-CFG_bs-512_data-ATTR_OPEN_EMO_MIC_500k_CAP_78k_RATIONALE_120k_scratch_3B_lr-2e-5/v2-20250515-154834/checkpoint-2704", "epoch": 1.9994455738310848, "eval_steps": 50, "global_step": 2704, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007392348918868971, "grad_norm": 7.145988573209412, "learning_rate": 1.9999993250737395e-05, "loss": 1.997387409210205, "memory(GiB)": 30.3, "step": 1, "token_acc": 0.5373563218390804, "train_speed(iter/s)": 0.022542 }, { "epoch": 0.0036961744594344852, "grad_norm": 4.16117609244522, "learning_rate": 1.9999831268890388e-05, "loss": 1.7683343887329102, "memory(GiB)": 30.88, "step": 5, "token_acc": 0.5186522262334536, "train_speed(iter/s)": 0.047276 }, { "epoch": 0.0073923489188689705, "grad_norm": 1.9968396412286222, "learning_rate": 1.999932508125559e-05, "loss": 1.4955159187316895, "memory(GiB)": 40.49, "step": 10, "token_acc": 0.5502183406113537, "train_speed(iter/s)": 0.057642 }, { "epoch": 0.011088523378303456, "grad_norm": 1.6222058869585758, "learning_rate": 1.9998481454177528e-05, "loss": 1.4060004234313965, "memory(GiB)": 40.49, "step": 15, "token_acc": 0.6005361930294906, "train_speed(iter/s)": 0.059714 }, { "epoch": 0.014784697837737941, "grad_norm": 1.873832697605311, "learning_rate": 1.9997300416125426e-05, "loss": 1.3838209152221679, "memory(GiB)": 40.49, "step": 20, "token_acc": 0.607051282051282, "train_speed(iter/s)": 0.060073 }, { "epoch": 0.018480872297172428, "grad_norm": 1.659686579754198, "learning_rate": 1.9995782006954852e-05, "loss": 1.3265121459960938, "memory(GiB)": 40.49, "step": 25, "token_acc": 0.5963340122199593, "train_speed(iter/s)": 0.062445 }, { "epoch": 0.02217704675660691, "grad_norm": 1.8215744972131866, "learning_rate": 1.9993926277906387e-05, "loss": 1.3122464179992677, "memory(GiB)": 54.96, "step": 30, "token_acc": 0.5690406976744186, "train_speed(iter/s)": 0.06221 }, { "epoch": 0.0258732212160414, "grad_norm": 1.5674638552693054, "learning_rate": 1.9991733291603873e-05, "loss": 1.3101771354675293, "memory(GiB)": 54.96, "step": 35, "token_acc": 0.5990697674418605, "train_speed(iter/s)": 0.061909 }, { "epoch": 0.029569395675475882, "grad_norm": 1.8121545652568625, "learning_rate": 1.998920312205231e-05, "loss": 1.2577611923217773, "memory(GiB)": 54.96, "step": 40, "token_acc": 0.6322274881516587, "train_speed(iter/s)": 0.062755 }, { "epoch": 0.03326557013491037, "grad_norm": 1.7654732822991834, "learning_rate": 1.9986335854635364e-05, "loss": 1.2739611625671388, "memory(GiB)": 54.96, "step": 45, "token_acc": 0.5852668213457076, "train_speed(iter/s)": 0.062974 }, { "epoch": 0.036961744594344856, "grad_norm": 1.3381865017125096, "learning_rate": 1.9983131586112474e-05, "loss": 1.2759986877441407, "memory(GiB)": 54.96, "step": 50, "token_acc": 0.6090799517878666, "train_speed(iter/s)": 0.0626 }, { "epoch": 0.036961744594344856, "eval_loss": 0.8683156967163086, "eval_runtime": 85.8388, "eval_samples_per_second": 81.49, "eval_steps_per_second": 0.641, "eval_token_acc": 0.5936057826607904, "step": 50 }, { "epoch": 0.040657919053779336, "grad_norm": 1.7881784961595093, "learning_rate": 1.9979590424615597e-05, "loss": 1.2275705337524414, "memory(GiB)": 74.93, "step": 55, "token_acc": 0.5959502991256328, "train_speed(iter/s)": 0.055536 }, { "epoch": 0.04435409351321382, "grad_norm": 1.523952154354778, "learning_rate": 1.997571248964556e-05, "loss": 1.2908334732055664, "memory(GiB)": 74.93, "step": 60, "token_acc": 0.6143884892086331, "train_speed(iter/s)": 0.056437 }, { "epoch": 0.04805026797264831, "grad_norm": 1.629094517207129, "learning_rate": 1.9971497912068014e-05, "loss": 1.262259292602539, "memory(GiB)": 74.93, "step": 65, "token_acc": 0.6196943972835314, "train_speed(iter/s)": 0.05732 }, { "epoch": 0.0517464424320828, "grad_norm": 1.4002823703096854, "learning_rate": 1.9966946834109026e-05, "loss": 1.2578742980957032, "memory(GiB)": 74.93, "step": 70, "token_acc": 0.5597722960151803, "train_speed(iter/s)": 0.057646 }, { "epoch": 0.05544261689151728, "grad_norm": 1.3922622186923228, "learning_rate": 1.9962059409350286e-05, "loss": 1.2903871536254883, "memory(GiB)": 74.93, "step": 75, "token_acc": 0.5871787786681404, "train_speed(iter/s)": 0.058082 }, { "epoch": 0.059138791350951764, "grad_norm": 1.5357768582962403, "learning_rate": 1.9956835802723916e-05, "loss": 1.2582176208496094, "memory(GiB)": 74.93, "step": 80, "token_acc": 0.5863981319322825, "train_speed(iter/s)": 0.05893 }, { "epoch": 0.06283496581038625, "grad_norm": 1.5430442318289654, "learning_rate": 1.9951276190506903e-05, "loss": 1.2459497451782227, "memory(GiB)": 74.93, "step": 85, "token_acc": 0.5826538176426983, "train_speed(iter/s)": 0.059014 }, { "epoch": 0.06653114026982074, "grad_norm": 1.5964710010210896, "learning_rate": 1.9945380760315153e-05, "loss": 1.2252405166625977, "memory(GiB)": 74.93, "step": 90, "token_acc": 0.6162060301507538, "train_speed(iter/s)": 0.059178 }, { "epoch": 0.07022731472925522, "grad_norm": 1.5295447471837036, "learning_rate": 1.9939149711097164e-05, "loss": 1.235156536102295, "memory(GiB)": 74.93, "step": 95, "token_acc": 0.6264632848527847, "train_speed(iter/s)": 0.059963 }, { "epoch": 0.07392348918868971, "grad_norm": 1.3740769902051975, "learning_rate": 1.9932583253127302e-05, "loss": 1.2441673278808594, "memory(GiB)": 74.93, "step": 100, "token_acc": 0.6724137931034483, "train_speed(iter/s)": 0.060048 }, { "epoch": 0.07392348918868971, "eval_loss": 0.8143442273139954, "eval_runtime": 82.7498, "eval_samples_per_second": 84.532, "eval_steps_per_second": 0.665, "eval_token_acc": 0.6014724453258288, "step": 100 }, { "epoch": 0.07761966364812418, "grad_norm": 1.4813264557157313, "learning_rate": 1.992568160799872e-05, "loss": 1.2064315795898437, "memory(GiB)": 74.93, "step": 105, "token_acc": 0.6042131350681537, "train_speed(iter/s)": 0.056686 }, { "epoch": 0.08131583810755867, "grad_norm": 1.4543500652550134, "learning_rate": 1.9918445008615862e-05, "loss": 1.2109683990478515, "memory(GiB)": 74.93, "step": 110, "token_acc": 0.5906810035842294, "train_speed(iter/s)": 0.056861 }, { "epoch": 0.08501201256699316, "grad_norm": 1.4682139055625663, "learning_rate": 1.9910873699186618e-05, "loss": 1.2368173599243164, "memory(GiB)": 74.93, "step": 115, "token_acc": 0.5186114596403179, "train_speed(iter/s)": 0.057469 }, { "epoch": 0.08870818702642765, "grad_norm": 1.3995801521088191, "learning_rate": 1.990296793521408e-05, "loss": 1.2045980453491212, "memory(GiB)": 74.93, "step": 120, "token_acc": 0.5862884160756501, "train_speed(iter/s)": 0.057707 }, { "epoch": 0.09240436148586213, "grad_norm": 1.3021560572956736, "learning_rate": 1.989472798348791e-05, "loss": 1.2566261291503906, "memory(GiB)": 74.93, "step": 125, "token_acc": 0.5944492254733219, "train_speed(iter/s)": 0.057808 }, { "epoch": 0.09610053594529662, "grad_norm": 1.4712133484369752, "learning_rate": 1.9886154122075344e-05, "loss": 1.192431640625, "memory(GiB)": 74.93, "step": 130, "token_acc": 0.5911908646003262, "train_speed(iter/s)": 0.058231 }, { "epoch": 0.0997967104047311, "grad_norm": 1.3130768997154976, "learning_rate": 1.9877246640311818e-05, "loss": 1.2078176498413087, "memory(GiB)": 74.93, "step": 135, "token_acc": 0.6265611990008326, "train_speed(iter/s)": 0.058455 }, { "epoch": 0.1034928848641656, "grad_norm": 1.3930149495863555, "learning_rate": 1.9868005838791185e-05, "loss": 1.2078091621398925, "memory(GiB)": 74.93, "step": 140, "token_acc": 0.583790628957366, "train_speed(iter/s)": 0.05851 }, { "epoch": 0.10718905932360008, "grad_norm": 1.3209379748185142, "learning_rate": 1.9858432029355584e-05, "loss": 1.2318389892578125, "memory(GiB)": 74.93, "step": 145, "token_acc": 0.5777182235834609, "train_speed(iter/s)": 0.058766 }, { "epoch": 0.11088523378303455, "grad_norm": 1.5171021117309256, "learning_rate": 1.9848525535084916e-05, "loss": 1.2017921447753905, "memory(GiB)": 74.93, "step": 150, "token_acc": 0.6249167221852099, "train_speed(iter/s)": 0.059012 }, { "epoch": 0.11088523378303455, "eval_loss": 0.8016136884689331, "eval_runtime": 89.0739, "eval_samples_per_second": 78.53, "eval_steps_per_second": 0.617, "eval_token_acc": 0.6041989394145771, "step": 150 }, { "epoch": 0.11458140824246904, "grad_norm": 1.3505800383266968, "learning_rate": 1.983828669028593e-05, "loss": 1.1807826042175293, "memory(GiB)": 74.93, "step": 155, "token_acc": 0.6214278069142674, "train_speed(iter/s)": 0.056798 }, { "epoch": 0.11827758270190353, "grad_norm": 1.3315385057884146, "learning_rate": 1.9827715840480962e-05, "loss": 1.1823822021484376, "memory(GiB)": 74.93, "step": 160, "token_acc": 0.6380498145204028, "train_speed(iter/s)": 0.056862 }, { "epoch": 0.12197375716133801, "grad_norm": 1.5704533949100516, "learning_rate": 1.9816813342396245e-05, "loss": 1.1738862991333008, "memory(GiB)": 74.93, "step": 165, "token_acc": 0.6022625781482585, "train_speed(iter/s)": 0.057054 }, { "epoch": 0.1256699316207725, "grad_norm": 1.379238427568713, "learning_rate": 1.980557956394991e-05, "loss": 1.1857439041137696, "memory(GiB)": 74.93, "step": 170, "token_acc": 0.620583717357911, "train_speed(iter/s)": 0.05738 }, { "epoch": 0.129366106080207, "grad_norm": 1.2583564805641094, "learning_rate": 1.9794014884239532e-05, "loss": 1.2060420989990235, "memory(GiB)": 74.93, "step": 175, "token_acc": 0.6253842775581906, "train_speed(iter/s)": 0.057484 }, { "epoch": 0.13306228053964148, "grad_norm": 1.3557017685070272, "learning_rate": 1.9782119693529358e-05, "loss": 1.2089680671691894, "memory(GiB)": 74.93, "step": 180, "token_acc": 0.6479481641468683, "train_speed(iter/s)": 0.057624 }, { "epoch": 0.13675845499907596, "grad_norm": 1.29022693500859, "learning_rate": 1.9769894393237135e-05, "loss": 1.1686654090881348, "memory(GiB)": 74.93, "step": 185, "token_acc": 0.6546961325966851, "train_speed(iter/s)": 0.057936 }, { "epoch": 0.14045462945851045, "grad_norm": 1.3838281227163587, "learning_rate": 1.975733939592056e-05, "loss": 1.2134584426879882, "memory(GiB)": 74.93, "step": 190, "token_acc": 0.6140988372093024, "train_speed(iter/s)": 0.058017 }, { "epoch": 0.14415080391794494, "grad_norm": 1.4298593013480252, "learning_rate": 1.974445512526336e-05, "loss": 1.1823249816894532, "memory(GiB)": 74.93, "step": 195, "token_acc": 0.5794074793589121, "train_speed(iter/s)": 0.058072 }, { "epoch": 0.14784697837737942, "grad_norm": 1.319725742779011, "learning_rate": 1.9731242016060985e-05, "loss": 1.237997055053711, "memory(GiB)": 74.93, "step": 200, "token_acc": 0.605226480836237, "train_speed(iter/s)": 0.058321 }, { "epoch": 0.14784697837737942, "eval_loss": 0.7794498801231384, "eval_runtime": 92.8737, "eval_samples_per_second": 75.317, "eval_steps_per_second": 0.592, "eval_token_acc": 0.6082148043319165, "step": 200 }, { "epoch": 0.1515431528368139, "grad_norm": 1.2737166952101895, "learning_rate": 1.9717700514205963e-05, "loss": 1.1960806846618652, "memory(GiB)": 74.93, "step": 205, "token_acc": 0.6102411135968183, "train_speed(iter/s)": 0.056695 }, { "epoch": 0.15523932729624837, "grad_norm": 1.392555342325795, "learning_rate": 1.9703831076672807e-05, "loss": 1.1904547691345215, "memory(GiB)": 74.93, "step": 210, "token_acc": 0.6277756868648852, "train_speed(iter/s)": 0.056817 }, { "epoch": 0.15893550175568286, "grad_norm": 1.31415101390755, "learning_rate": 1.9689634171502642e-05, "loss": 1.1859335899353027, "memory(GiB)": 74.93, "step": 215, "token_acc": 0.6238479262672811, "train_speed(iter/s)": 0.056878 }, { "epoch": 0.16263167621511734, "grad_norm": 1.309289118136438, "learning_rate": 1.967511027778738e-05, "loss": 1.1907655715942382, "memory(GiB)": 74.93, "step": 220, "token_acc": 0.6154311649016642, "train_speed(iter/s)": 0.05712 }, { "epoch": 0.16632785067455183, "grad_norm": 1.263699514290592, "learning_rate": 1.966025988565356e-05, "loss": 1.1906933784484863, "memory(GiB)": 74.93, "step": 225, "token_acc": 0.6170634920634921, "train_speed(iter/s)": 0.057279 }, { "epoch": 0.17002402513398632, "grad_norm": 1.2660283851596872, "learning_rate": 1.9645083496245815e-05, "loss": 1.2014826774597167, "memory(GiB)": 74.93, "step": 230, "token_acc": 0.5935228023793787, "train_speed(iter/s)": 0.057316 }, { "epoch": 0.1737201995934208, "grad_norm": 1.2430321644094078, "learning_rate": 1.962958162170994e-05, "loss": 1.189725971221924, "memory(GiB)": 74.93, "step": 235, "token_acc": 0.7096774193548387, "train_speed(iter/s)": 0.057486 }, { "epoch": 0.1774163740528553, "grad_norm": 1.3183460023053968, "learning_rate": 1.961375478517564e-05, "loss": 1.1756509780883788, "memory(GiB)": 74.93, "step": 240, "token_acc": 0.6048, "train_speed(iter/s)": 0.057691 }, { "epoch": 0.18111254851228978, "grad_norm": 1.256458444055883, "learning_rate": 1.9597603520738853e-05, "loss": 1.1867225646972657, "memory(GiB)": 74.93, "step": 245, "token_acc": 0.6193625977149729, "train_speed(iter/s)": 0.057716 }, { "epoch": 0.18480872297172427, "grad_norm": 1.2002856679979195, "learning_rate": 1.9581128373443733e-05, "loss": 1.1792646408081056, "memory(GiB)": 74.93, "step": 250, "token_acc": 0.6049046321525886, "train_speed(iter/s)": 0.057828 }, { "epoch": 0.18480872297172427, "eval_loss": 0.7706022262573242, "eval_runtime": 87.903, "eval_samples_per_second": 79.576, "eval_steps_per_second": 0.626, "eval_token_acc": 0.6092432997735232, "step": 250 }, { "epoch": 0.18850489743115875, "grad_norm": 1.2231576564807567, "learning_rate": 1.9564329899264252e-05, "loss": 1.1703492164611817, "memory(GiB)": 74.93, "step": 255, "token_acc": 0.6188424362408291, "train_speed(iter/s)": 0.056504 }, { "epoch": 0.19220107189059324, "grad_norm": 1.1802132132030512, "learning_rate": 1.954720866508546e-05, "loss": 1.17109956741333, "memory(GiB)": 74.93, "step": 260, "token_acc": 0.6199639206253759, "train_speed(iter/s)": 0.056699 }, { "epoch": 0.19589724635002773, "grad_norm": 1.3103486430673341, "learning_rate": 1.9529765248684308e-05, "loss": 1.1841205596923827, "memory(GiB)": 74.93, "step": 265, "token_acc": 0.5825649622799665, "train_speed(iter/s)": 0.056764 }, { "epoch": 0.1995934208094622, "grad_norm": 1.3602279829039154, "learning_rate": 1.951200023871021e-05, "loss": 1.1760824203491211, "memory(GiB)": 74.93, "step": 270, "token_acc": 0.6165368484122229, "train_speed(iter/s)": 0.056855 }, { "epoch": 0.2032895952688967, "grad_norm": 1.2205622648558432, "learning_rate": 1.949391423466513e-05, "loss": 1.1814783096313477, "memory(GiB)": 74.93, "step": 275, "token_acc": 0.6155863619333084, "train_speed(iter/s)": 0.057043 }, { "epoch": 0.2069857697283312, "grad_norm": 1.2638744947898215, "learning_rate": 1.9475507846883377e-05, "loss": 1.1977863311767578, "memory(GiB)": 74.93, "step": 280, "token_acc": 0.6115591397849462, "train_speed(iter/s)": 0.057131 }, { "epoch": 0.21068194418776567, "grad_norm": 1.208174719234681, "learning_rate": 1.9456781696510996e-05, "loss": 1.1798893928527832, "memory(GiB)": 74.93, "step": 285, "token_acc": 0.6450809464508095, "train_speed(iter/s)": 0.057208 }, { "epoch": 0.21437811864720016, "grad_norm": 1.29989102329814, "learning_rate": 1.943773641548481e-05, "loss": 1.1305645942687987, "memory(GiB)": 74.93, "step": 290, "token_acc": 0.6185250219490781, "train_speed(iter/s)": 0.057373 }, { "epoch": 0.21807429310663462, "grad_norm": 1.2869046963327413, "learning_rate": 1.9418372646511104e-05, "loss": 1.1689376831054688, "memory(GiB)": 74.93, "step": 295, "token_acc": 0.639083030472463, "train_speed(iter/s)": 0.057472 }, { "epoch": 0.2217704675660691, "grad_norm": 1.3262776988217382, "learning_rate": 1.939869104304392e-05, "loss": 1.1520153045654298, "memory(GiB)": 74.93, "step": 300, "token_acc": 0.6394881170018282, "train_speed(iter/s)": 0.05753 }, { "epoch": 0.2217704675660691, "eval_loss": 0.7643480896949768, "eval_runtime": 83.8503, "eval_samples_per_second": 83.422, "eval_steps_per_second": 0.656, "eval_token_acc": 0.6108027805160715, "step": 300 }, { "epoch": 0.2254666420255036, "grad_norm": 1.2789640840585799, "learning_rate": 1.937869226926302e-05, "loss": 1.1876554489135742, "memory(GiB)": 74.93, "step": 305, "token_acc": 0.6270755222281735, "train_speed(iter/s)": 0.056408 }, { "epoch": 0.22916281648493808, "grad_norm": 1.3387432374923267, "learning_rate": 1.9358377000051457e-05, "loss": 1.152684211730957, "memory(GiB)": 74.93, "step": 310, "token_acc": 0.5908354547558435, "train_speed(iter/s)": 0.056571 }, { "epoch": 0.23285899094437257, "grad_norm": 1.3158521752921148, "learning_rate": 1.9337745920972817e-05, "loss": 1.1474998474121094, "memory(GiB)": 74.93, "step": 315, "token_acc": 0.6453079785035138, "train_speed(iter/s)": 0.056681 }, { "epoch": 0.23655516540380706, "grad_norm": 1.3574666032311622, "learning_rate": 1.9316799728248074e-05, "loss": 1.1646709442138672, "memory(GiB)": 74.93, "step": 320, "token_acc": 0.6396255850234009, "train_speed(iter/s)": 0.056747 }, { "epoch": 0.24025133986324154, "grad_norm": 1.5220421984558397, "learning_rate": 1.9295539128732096e-05, "loss": 1.1289070129394532, "memory(GiB)": 74.93, "step": 325, "token_acc": 0.6495638789122627, "train_speed(iter/s)": 0.056887 }, { "epoch": 0.24394751432267603, "grad_norm": 1.2325001012228407, "learning_rate": 1.927396483988979e-05, "loss": 1.1668661117553711, "memory(GiB)": 74.93, "step": 330, "token_acc": 0.6125099390405513, "train_speed(iter/s)": 0.05701 }, { "epoch": 0.24764368878211052, "grad_norm": 1.3455071899618125, "learning_rate": 1.92520775897719e-05, "loss": 1.160017967224121, "memory(GiB)": 74.93, "step": 335, "token_acc": 0.6224098234842671, "train_speed(iter/s)": 0.057069 }, { "epoch": 0.251339863241545, "grad_norm": 1.1193101615271859, "learning_rate": 1.922987811699042e-05, "loss": 1.164522933959961, "memory(GiB)": 74.93, "step": 340, "token_acc": 0.6142303969022265, "train_speed(iter/s)": 0.057185 }, { "epoch": 0.2550360377009795, "grad_norm": 1.184835291510033, "learning_rate": 1.9207367170693688e-05, "loss": 1.1658490180969239, "memory(GiB)": 74.93, "step": 345, "token_acc": 0.6181616832779624, "train_speed(iter/s)": 0.057315 }, { "epoch": 0.258732212160414, "grad_norm": 1.2033091005460579, "learning_rate": 1.918454551054109e-05, "loss": 1.174658966064453, "memory(GiB)": 74.93, "step": 350, "token_acc": 0.6646234676007006, "train_speed(iter/s)": 0.057368 }, { "epoch": 0.258732212160414, "eval_loss": 0.7548633813858032, "eval_runtime": 84.0949, "eval_samples_per_second": 83.18, "eval_steps_per_second": 0.654, "eval_token_acc": 0.6124072795776128, "step": 350 }, { "epoch": 0.26242838661984846, "grad_norm": 1.1971162317002557, "learning_rate": 1.916141390667744e-05, "loss": 1.1562774658203125, "memory(GiB)": 74.93, "step": 355, "token_acc": 0.6173011120615911, "train_speed(iter/s)": 0.056434 }, { "epoch": 0.26612456107928295, "grad_norm": 1.1301068933758331, "learning_rate": 1.9137973139706973e-05, "loss": 1.2061149597167968, "memory(GiB)": 74.93, "step": 360, "token_acc": 0.5783767946088485, "train_speed(iter/s)": 0.056501 }, { "epoch": 0.26982073553871744, "grad_norm": 1.2885970736252064, "learning_rate": 1.9114224000667014e-05, "loss": 1.1453168869018555, "memory(GiB)": 74.93, "step": 365, "token_acc": 0.6045895851721095, "train_speed(iter/s)": 0.056637 }, { "epoch": 0.2735169099981519, "grad_norm": 1.2008587437465796, "learning_rate": 1.9090167291001278e-05, "loss": 1.151451015472412, "memory(GiB)": 74.93, "step": 370, "token_acc": 0.6464088397790055, "train_speed(iter/s)": 0.056724 }, { "epoch": 0.2772130844575864, "grad_norm": 1.2574733188940939, "learning_rate": 1.9065803822532825e-05, "loss": 1.143141269683838, "memory(GiB)": 74.93, "step": 375, "token_acc": 0.6279554937413073, "train_speed(iter/s)": 0.056779 }, { "epoch": 0.2809092589170209, "grad_norm": 1.2230232638304774, "learning_rate": 1.9041134417436674e-05, "loss": 1.1681084632873535, "memory(GiB)": 74.93, "step": 380, "token_acc": 0.6278735632183908, "train_speed(iter/s)": 0.0569 }, { "epoch": 0.2846054333764554, "grad_norm": 1.308574420114396, "learning_rate": 1.9016159908212044e-05, "loss": 1.1313629150390625, "memory(GiB)": 74.93, "step": 385, "token_acc": 0.6380670611439843, "train_speed(iter/s)": 0.056973 }, { "epoch": 0.2883016078358899, "grad_norm": 1.1949255351547317, "learning_rate": 1.899088113765426e-05, "loss": 1.1681228637695313, "memory(GiB)": 74.93, "step": 390, "token_acc": 0.6130952380952381, "train_speed(iter/s)": 0.057013 }, { "epoch": 0.29199778229532436, "grad_norm": 1.1669478994026365, "learning_rate": 1.896529895882633e-05, "loss": 1.1387041091918946, "memory(GiB)": 74.93, "step": 395, "token_acc": 0.6152671755725191, "train_speed(iter/s)": 0.05713 }, { "epoch": 0.29569395675475885, "grad_norm": 1.197646998798649, "learning_rate": 1.8939414235030137e-05, "loss": 1.1374378204345703, "memory(GiB)": 74.93, "step": 400, "token_acc": 0.6037667511771098, "train_speed(iter/s)": 0.057204 }, { "epoch": 0.29569395675475885, "eval_loss": 0.7541109323501587, "eval_runtime": 86.4511, "eval_samples_per_second": 80.913, "eval_steps_per_second": 0.636, "eval_token_acc": 0.613960988740803, "step": 400 }, { "epoch": 0.29939013121419333, "grad_norm": 1.3110127948907355, "learning_rate": 1.8913227839777305e-05, "loss": 1.1630861282348632, "memory(GiB)": 74.93, "step": 405, "token_acc": 0.6250439264378588, "train_speed(iter/s)": 0.056415 }, { "epoch": 0.3030863056736278, "grad_norm": 1.2066723455387354, "learning_rate": 1.8886740656759755e-05, "loss": 1.1657712936401368, "memory(GiB)": 74.93, "step": 410, "token_acc": 0.6286093594424162, "train_speed(iter/s)": 0.056469 }, { "epoch": 0.3067824801330623, "grad_norm": 1.214334257623402, "learning_rate": 1.8859953579819833e-05, "loss": 1.129319953918457, "memory(GiB)": 74.93, "step": 415, "token_acc": 0.5934997644842205, "train_speed(iter/s)": 0.056572 }, { "epoch": 0.31047865459249674, "grad_norm": 1.260998092054749, "learning_rate": 1.883286751292018e-05, "loss": 1.125650119781494, "memory(GiB)": 74.93, "step": 420, "token_acc": 0.6005237125400058, "train_speed(iter/s)": 0.056666 }, { "epoch": 0.3141748290519312, "grad_norm": 1.1445762169453673, "learning_rate": 1.880548337011323e-05, "loss": 1.1848130226135254, "memory(GiB)": 74.93, "step": 425, "token_acc": 0.5819639278557114, "train_speed(iter/s)": 0.05671 }, { "epoch": 0.3178710035113657, "grad_norm": 1.2219231261580983, "learning_rate": 1.8777802075510338e-05, "loss": 1.1647357940673828, "memory(GiB)": 74.93, "step": 430, "token_acc": 0.6077451592754528, "train_speed(iter/s)": 0.056776 }, { "epoch": 0.3215671779708002, "grad_norm": 1.1956915969147472, "learning_rate": 1.8749824563250615e-05, "loss": 1.1394176483154297, "memory(GiB)": 74.93, "step": 435, "token_acc": 0.6606451612903226, "train_speed(iter/s)": 0.056853 }, { "epoch": 0.3252633524302347, "grad_norm": 1.3354423066052745, "learning_rate": 1.8721551777469397e-05, "loss": 1.152536964416504, "memory(GiB)": 74.93, "step": 440, "token_acc": 0.5991432068543452, "train_speed(iter/s)": 0.056906 }, { "epoch": 0.3289595268896692, "grad_norm": 1.2562915522841382, "learning_rate": 1.869298467226639e-05, "loss": 1.1220308303833009, "memory(GiB)": 74.93, "step": 445, "token_acc": 0.6066666666666667, "train_speed(iter/s)": 0.056963 }, { "epoch": 0.33265570134910366, "grad_norm": 1.359582068477731, "learning_rate": 1.8664124211673468e-05, "loss": 1.1504764556884766, "memory(GiB)": 74.93, "step": 450, "token_acc": 0.5973016235993597, "train_speed(iter/s)": 0.057049 }, { "epoch": 0.33265570134910366, "eval_loss": 0.7460736632347107, "eval_runtime": 88.8045, "eval_samples_per_second": 78.769, "eval_steps_per_second": 0.619, "eval_token_acc": 0.6144388755116506, "step": 450 }, { "epoch": 0.33635187580853815, "grad_norm": 1.218975457419414, "learning_rate": 1.863497136962213e-05, "loss": 1.1313959121704102, "memory(GiB)": 74.93, "step": 455, "token_acc": 0.6262968874700718, "train_speed(iter/s)": 0.056354 }, { "epoch": 0.34004805026797263, "grad_norm": 1.4342194151464063, "learning_rate": 1.8605527129910663e-05, "loss": 1.1549379348754882, "memory(GiB)": 74.93, "step": 460, "token_acc": 0.6472244569589702, "train_speed(iter/s)": 0.056414 }, { "epoch": 0.3437442247274071, "grad_norm": 1.440358796861357, "learning_rate": 1.857579248617091e-05, "loss": 1.129042625427246, "memory(GiB)": 74.93, "step": 465, "token_acc": 0.6356026785714286, "train_speed(iter/s)": 0.05648 }, { "epoch": 0.3474403991868416, "grad_norm": 1.2091541968931232, "learning_rate": 1.854576844183476e-05, "loss": 1.1230792999267578, "memory(GiB)": 74.93, "step": 470, "token_acc": 0.6001645413410119, "train_speed(iter/s)": 0.056566 }, { "epoch": 0.3511365736462761, "grad_norm": 1.212497545728028, "learning_rate": 1.8515456010100274e-05, "loss": 1.1627266883850098, "memory(GiB)": 74.93, "step": 475, "token_acc": 0.6375609756097561, "train_speed(iter/s)": 0.056633 }, { "epoch": 0.3548327481057106, "grad_norm": 1.257170599310577, "learning_rate": 1.8484856213897496e-05, "loss": 1.1552623748779296, "memory(GiB)": 74.93, "step": 480, "token_acc": 0.6367495451788963, "train_speed(iter/s)": 0.056696 }, { "epoch": 0.35852892256514507, "grad_norm": 1.3061990827470522, "learning_rate": 1.8453970085853953e-05, "loss": 1.1611719131469727, "memory(GiB)": 74.93, "step": 485, "token_acc": 0.5953002610966057, "train_speed(iter/s)": 0.056777 }, { "epoch": 0.36222509702457956, "grad_norm": 1.2132042758068045, "learning_rate": 1.842279866825976e-05, "loss": 1.1605472564697266, "memory(GiB)": 74.93, "step": 490, "token_acc": 0.6365507776761208, "train_speed(iter/s)": 0.056851 }, { "epoch": 0.36592127148401404, "grad_norm": 1.2900699412115835, "learning_rate": 1.8391343013032505e-05, "loss": 1.1752688407897949, "memory(GiB)": 74.93, "step": 495, "token_acc": 0.6413404114134041, "train_speed(iter/s)": 0.056898 }, { "epoch": 0.36961744594344853, "grad_norm": 1.115509938975403, "learning_rate": 1.8359604181681703e-05, "loss": 1.1677565574645996, "memory(GiB)": 74.93, "step": 500, "token_acc": 0.635439360929557, "train_speed(iter/s)": 0.056967 }, { "epoch": 0.36961744594344853, "eval_loss": 0.7416162490844727, "eval_runtime": 87.8438, "eval_samples_per_second": 79.63, "eval_steps_per_second": 0.626, "eval_token_acc": 0.615732863603728, "step": 500 }, { "epoch": 0.373313620402883, "grad_norm": 1.1864096727600855, "learning_rate": 1.8327583245273004e-05, "loss": 1.120311164855957, "memory(GiB)": 74.93, "step": 505, "token_acc": 0.6247582205029013, "train_speed(iter/s)": 0.056337 }, { "epoch": 0.3770097948623175, "grad_norm": 1.1558943975448084, "learning_rate": 1.8295281284392036e-05, "loss": 1.167508888244629, "memory(GiB)": 74.93, "step": 510, "token_acc": 0.5796680497925312, "train_speed(iter/s)": 0.056408 }, { "epoch": 0.380705969321752, "grad_norm": 1.2905670874481943, "learning_rate": 1.8262699389107933e-05, "loss": 1.15736083984375, "memory(GiB)": 74.93, "step": 515, "token_acc": 0.6157240272763739, "train_speed(iter/s)": 0.056454 }, { "epoch": 0.3844021437811865, "grad_norm": 1.2748847596057926, "learning_rate": 1.8229838658936566e-05, "loss": 1.1492805480957031, "memory(GiB)": 74.93, "step": 520, "token_acc": 0.6105889724310777, "train_speed(iter/s)": 0.056519 }, { "epoch": 0.38809831824062097, "grad_norm": 1.1876718707954161, "learning_rate": 1.819670020280343e-05, "loss": 1.1467121124267579, "memory(GiB)": 74.93, "step": 525, "token_acc": 0.6113826815642458, "train_speed(iter/s)": 0.056588 }, { "epoch": 0.39179449270005545, "grad_norm": 1.2841584592867252, "learning_rate": 1.816328513900622e-05, "loss": 1.1653972625732423, "memory(GiB)": 74.93, "step": 530, "token_acc": 0.6273197444478248, "train_speed(iter/s)": 0.056639 }, { "epoch": 0.39549066715948994, "grad_norm": 1.243754331563731, "learning_rate": 1.8129594595177093e-05, "loss": 1.154591178894043, "memory(GiB)": 74.93, "step": 535, "token_acc": 0.5926477893691009, "train_speed(iter/s)": 0.056695 }, { "epoch": 0.3991868416189244, "grad_norm": 1.3245067788741383, "learning_rate": 1.809562970824462e-05, "loss": 1.157964324951172, "memory(GiB)": 74.93, "step": 540, "token_acc": 0.6192792394428477, "train_speed(iter/s)": 0.056758 }, { "epoch": 0.4028830160783589, "grad_norm": 1.3057962329498682, "learning_rate": 1.806139162439541e-05, "loss": 1.1371761322021485, "memory(GiB)": 74.93, "step": 545, "token_acc": 0.596340150699677, "train_speed(iter/s)": 0.056815 }, { "epoch": 0.4065791905377934, "grad_norm": 1.25005365154622, "learning_rate": 1.8026881499035437e-05, "loss": 1.1124300956726074, "memory(GiB)": 74.93, "step": 550, "token_acc": 0.6204881402543829, "train_speed(iter/s)": 0.056864 }, { "epoch": 0.4065791905377934, "eval_loss": 0.7460726499557495, "eval_runtime": 88.6273, "eval_samples_per_second": 78.926, "eval_steps_per_second": 0.621, "eval_token_acc": 0.6162869352221019, "step": 550 }, { "epoch": 0.4102753649972279, "grad_norm": 1.1926510177467409, "learning_rate": 1.7992100496751054e-05, "loss": 1.1571131706237794, "memory(GiB)": 74.93, "step": 555, "token_acc": 0.6311389759665622, "train_speed(iter/s)": 0.056299 }, { "epoch": 0.4139715394566624, "grad_norm": 1.1989503074947894, "learning_rate": 1.7957049791269684e-05, "loss": 1.1516962051391602, "memory(GiB)": 74.93, "step": 560, "token_acc": 0.5952788231269244, "train_speed(iter/s)": 0.056369 }, { "epoch": 0.41766771391609686, "grad_norm": 1.1212233051313498, "learning_rate": 1.792173056542021e-05, "loss": 1.1592437744140625, "memory(GiB)": 74.93, "step": 565, "token_acc": 0.5976621417797888, "train_speed(iter/s)": 0.056413 }, { "epoch": 0.42136388837553135, "grad_norm": 1.1553604640842632, "learning_rate": 1.7886144011093067e-05, "loss": 1.1524188041687011, "memory(GiB)": 74.93, "step": 570, "token_acc": 0.6424742268041237, "train_speed(iter/s)": 0.056462 }, { "epoch": 0.42506006283496583, "grad_norm": 1.183725532275657, "learning_rate": 1.7850291329200015e-05, "loss": 1.1416030883789063, "memory(GiB)": 74.93, "step": 575, "token_acc": 0.6029700196133371, "train_speed(iter/s)": 0.056533 }, { "epoch": 0.4287562372944003, "grad_norm": 1.2480769087109442, "learning_rate": 1.7814173729633607e-05, "loss": 1.164370059967041, "memory(GiB)": 74.93, "step": 580, "token_acc": 0.6192486281131279, "train_speed(iter/s)": 0.056588 }, { "epoch": 0.43245241175383475, "grad_norm": 1.3104680757325256, "learning_rate": 1.7777792431226384e-05, "loss": 1.119395637512207, "memory(GiB)": 74.93, "step": 585, "token_acc": 0.6305528922978587, "train_speed(iter/s)": 0.056638 }, { "epoch": 0.43614858621326924, "grad_norm": 1.213929814999547, "learning_rate": 1.7741148661709707e-05, "loss": 1.1547592163085938, "memory(GiB)": 74.93, "step": 590, "token_acc": 0.6233905579399142, "train_speed(iter/s)": 0.056711 }, { "epoch": 0.4398447606727037, "grad_norm": 1.2155093557171206, "learning_rate": 1.770424365767236e-05, "loss": 1.1199445724487305, "memory(GiB)": 74.93, "step": 595, "token_acc": 0.6336528221512248, "train_speed(iter/s)": 0.056773 }, { "epoch": 0.4435409351321382, "grad_norm": 1.3908702173841363, "learning_rate": 1.7667078664518796e-05, "loss": 1.157416534423828, "memory(GiB)": 74.93, "step": 600, "token_acc": 0.6181159420289855, "train_speed(iter/s)": 0.056815 }, { "epoch": 0.4435409351321382, "eval_loss": 0.7338850498199463, "eval_runtime": 85.3003, "eval_samples_per_second": 82.004, "eval_steps_per_second": 0.645, "eval_token_acc": 0.6175324420475716, "step": 600 }, { "epoch": 0.4472371095915727, "grad_norm": 1.022281205691788, "learning_rate": 1.7629654936427126e-05, "loss": 1.1211700439453125, "memory(GiB)": 74.93, "step": 605, "token_acc": 0.6267794070427057, "train_speed(iter/s)": 0.056289 }, { "epoch": 0.4509332840510072, "grad_norm": 1.1115715050120814, "learning_rate": 1.7591973736306774e-05, "loss": 1.1568084716796876, "memory(GiB)": 74.93, "step": 610, "token_acc": 0.6001278227524499, "train_speed(iter/s)": 0.056358 }, { "epoch": 0.4546294585104417, "grad_norm": 1.2942894072539404, "learning_rate": 1.755403633575589e-05, "loss": 1.1330131530761718, "memory(GiB)": 74.93, "step": 615, "token_acc": 0.6048237476808905, "train_speed(iter/s)": 0.056424 }, { "epoch": 0.45832563296987616, "grad_norm": 1.2115375753993367, "learning_rate": 1.7515844015018416e-05, "loss": 1.1604066848754884, "memory(GiB)": 74.93, "step": 620, "token_acc": 0.6332541567695962, "train_speed(iter/s)": 0.05648 }, { "epoch": 0.46202180742931065, "grad_norm": 1.1168616761395809, "learning_rate": 1.7477398062940868e-05, "loss": 1.1492230415344238, "memory(GiB)": 74.93, "step": 625, "token_acc": 0.6326703343207787, "train_speed(iter/s)": 0.056541 }, { "epoch": 0.46571798188874514, "grad_norm": 1.3080238975825687, "learning_rate": 1.7438699776928892e-05, "loss": 1.159599494934082, "memory(GiB)": 74.93, "step": 630, "token_acc": 0.5911352329262777, "train_speed(iter/s)": 0.056603 }, { "epoch": 0.4694141563481796, "grad_norm": 1.270157306289422, "learning_rate": 1.739975046290343e-05, "loss": 1.1172502517700196, "memory(GiB)": 74.93, "step": 635, "token_acc": 0.6800878477306003, "train_speed(iter/s)": 0.05664 }, { "epoch": 0.4731103308076141, "grad_norm": 1.1591581275323428, "learning_rate": 1.7360551435256673e-05, "loss": 1.1474403381347655, "memory(GiB)": 74.93, "step": 640, "token_acc": 0.6703857188396557, "train_speed(iter/s)": 0.056691 }, { "epoch": 0.4768065052670486, "grad_norm": 1.3849471969434006, "learning_rate": 1.7321104016807716e-05, "loss": 1.1200141906738281, "memory(GiB)": 74.93, "step": 645, "token_acc": 0.6204099060631939, "train_speed(iter/s)": 0.056741 }, { "epoch": 0.4805026797264831, "grad_norm": 1.2181008696775872, "learning_rate": 1.7281409538757886e-05, "loss": 1.1367115020751952, "memory(GiB)": 74.93, "step": 650, "token_acc": 0.6141581632653061, "train_speed(iter/s)": 0.056787 }, { "epoch": 0.4805026797264831, "eval_loss": 0.7338098287582397, "eval_runtime": 86.3351, "eval_samples_per_second": 81.022, "eval_steps_per_second": 0.637, "eval_token_acc": 0.618567863384408, "step": 650 }, { "epoch": 0.48419885418591757, "grad_norm": 1.2381127917004506, "learning_rate": 1.7241469340645856e-05, "loss": 1.1498327255249023, "memory(GiB)": 74.93, "step": 655, "token_acc": 0.6240238388820386, "train_speed(iter/s)": 0.056305 }, { "epoch": 0.48789502864535206, "grad_norm": 1.3545670040018443, "learning_rate": 1.720128477030241e-05, "loss": 1.123112392425537, "memory(GiB)": 74.93, "step": 660, "token_acc": 0.6101917520357236, "train_speed(iter/s)": 0.05635 }, { "epoch": 0.49159120310478654, "grad_norm": 1.2698188744774948, "learning_rate": 1.716085718380498e-05, "loss": 1.1386995315551758, "memory(GiB)": 74.93, "step": 665, "token_acc": 0.6005629477993859, "train_speed(iter/s)": 0.056398 }, { "epoch": 0.49528737756422103, "grad_norm": 1.4609798611237281, "learning_rate": 1.7120187945431874e-05, "loss": 1.1037940979003906, "memory(GiB)": 74.93, "step": 670, "token_acc": 0.6407727085902178, "train_speed(iter/s)": 0.056444 }, { "epoch": 0.4989835520236555, "grad_norm": 1.1805190661164426, "learning_rate": 1.707927842761623e-05, "loss": 1.1232402801513672, "memory(GiB)": 74.93, "step": 675, "token_acc": 0.5811437403400309, "train_speed(iter/s)": 0.05646 }, { "epoch": 0.50267972648309, "grad_norm": 1.1558010845800675, "learning_rate": 1.7038130010899716e-05, "loss": 1.1340635299682618, "memory(GiB)": 74.93, "step": 680, "token_acc": 0.6523545706371191, "train_speed(iter/s)": 0.056504 }, { "epoch": 0.5063759009425245, "grad_norm": 1.1790896957784056, "learning_rate": 1.6996744083885938e-05, "loss": 1.1378223419189453, "memory(GiB)": 74.93, "step": 685, "token_acc": 0.6573009791400596, "train_speed(iter/s)": 0.056546 }, { "epoch": 0.510072075401959, "grad_norm": 1.2335317128319008, "learning_rate": 1.695512204319357e-05, "loss": 1.1394284248352051, "memory(GiB)": 74.93, "step": 690, "token_acc": 0.6082870568133276, "train_speed(iter/s)": 0.056586 }, { "epoch": 0.5137682498613935, "grad_norm": 0.9893255166681467, "learning_rate": 1.6913265293409235e-05, "loss": 1.1198680877685547, "memory(GiB)": 74.93, "step": 695, "token_acc": 0.547270955165692, "train_speed(iter/s)": 0.05664 }, { "epoch": 0.517464424320828, "grad_norm": 1.1351076610632471, "learning_rate": 1.68711752470401e-05, "loss": 1.1366339683532716, "memory(GiB)": 74.93, "step": 700, "token_acc": 0.6295369211514393, "train_speed(iter/s)": 0.056675 }, { "epoch": 0.517464424320828, "eval_loss": 0.7255228757858276, "eval_runtime": 89.5144, "eval_samples_per_second": 78.144, "eval_steps_per_second": 0.614, "eval_token_acc": 0.6190699907885594, "step": 700 }, { "epoch": 0.5211605987802624, "grad_norm": 1.0862208515121348, "learning_rate": 1.682885332446621e-05, "loss": 1.1369894981384276, "memory(GiB)": 74.93, "step": 705, "token_acc": 0.6288204532248692, "train_speed(iter/s)": 0.056212 }, { "epoch": 0.5248567732396969, "grad_norm": 1.1660653361907225, "learning_rate": 1.6786300953892563e-05, "loss": 1.1410274505615234, "memory(GiB)": 74.93, "step": 710, "token_acc": 0.6100605143721634, "train_speed(iter/s)": 0.056263 }, { "epoch": 0.5285529476991314, "grad_norm": 1.0896922974940084, "learning_rate": 1.674351957130089e-05, "loss": 1.1174249649047852, "memory(GiB)": 74.93, "step": 715, "token_acc": 0.6420308483290489, "train_speed(iter/s)": 0.056309 }, { "epoch": 0.5322491221585659, "grad_norm": 1.152348085956414, "learning_rate": 1.6700510620401223e-05, "loss": 1.1088247299194336, "memory(GiB)": 74.93, "step": 720, "token_acc": 0.6403995560488346, "train_speed(iter/s)": 0.056355 }, { "epoch": 0.5359452966180004, "grad_norm": 1.1236142627513106, "learning_rate": 1.6657275552583172e-05, "loss": 1.137843418121338, "memory(GiB)": 74.93, "step": 725, "token_acc": 0.5981665393430099, "train_speed(iter/s)": 0.056406 }, { "epoch": 0.5396414710774349, "grad_norm": 1.0869362324396392, "learning_rate": 1.6613815826866923e-05, "loss": 1.1183334350585938, "memory(GiB)": 74.93, "step": 730, "token_acc": 0.6076433121019108, "train_speed(iter/s)": 0.056454 }, { "epoch": 0.5433376455368694, "grad_norm": 1.0408539682832916, "learning_rate": 1.6570132909854027e-05, "loss": 1.1498143196105957, "memory(GiB)": 74.93, "step": 735, "token_acc": 0.6524312896405919, "train_speed(iter/s)": 0.0565 }, { "epoch": 0.5470338199963038, "grad_norm": 1.223295875198057, "learning_rate": 1.6526228275677892e-05, "loss": 1.091654109954834, "memory(GiB)": 74.93, "step": 740, "token_acc": 0.6982872200263505, "train_speed(iter/s)": 0.056544 }, { "epoch": 0.5507299944557383, "grad_norm": 1.1558442201312176, "learning_rate": 1.6482103405954056e-05, "loss": 1.1205904006958007, "memory(GiB)": 74.93, "step": 745, "token_acc": 0.6377204884667571, "train_speed(iter/s)": 0.056579 }, { "epoch": 0.5544261689151728, "grad_norm": 1.2784643735837162, "learning_rate": 1.6437759789730154e-05, "loss": 1.1237329483032226, "memory(GiB)": 74.93, "step": 750, "token_acc": 0.6141374837872893, "train_speed(iter/s)": 0.056631 }, { "epoch": 0.5544261689151728, "eval_loss": 0.7271792888641357, "eval_runtime": 87.6966, "eval_samples_per_second": 79.764, "eval_steps_per_second": 0.627, "eval_token_acc": 0.6196194451434468, "step": 750 }, { "epoch": 0.5581223433746073, "grad_norm": 1.2055849293387977, "learning_rate": 1.6393198923435707e-05, "loss": 1.1234511375427245, "memory(GiB)": 74.93, "step": 755, "token_acc": 0.6244901356863398, "train_speed(iter/s)": 0.056217 }, { "epoch": 0.5618185178340418, "grad_norm": 1.1362509527796705, "learning_rate": 1.63484223108316e-05, "loss": 1.125691795349121, "memory(GiB)": 74.93, "step": 760, "token_acc": 0.6037473976405274, "train_speed(iter/s)": 0.05626 }, { "epoch": 0.5655146922934763, "grad_norm": 1.123275540757232, "learning_rate": 1.6303431462959327e-05, "loss": 1.1341413497924804, "memory(GiB)": 74.93, "step": 765, "token_acc": 0.6085106382978723, "train_speed(iter/s)": 0.056308 }, { "epoch": 0.5692108667529108, "grad_norm": 1.015989051360902, "learning_rate": 1.6258227898090037e-05, "loss": 1.1203922271728515, "memory(GiB)": 74.93, "step": 770, "token_acc": 0.601472134595163, "train_speed(iter/s)": 0.056355 }, { "epoch": 0.5729070412123453, "grad_norm": 1.189393051036189, "learning_rate": 1.6212813141673254e-05, "loss": 1.1124958038330077, "memory(GiB)": 74.93, "step": 775, "token_acc": 0.6260790549750114, "train_speed(iter/s)": 0.056399 }, { "epoch": 0.5766032156717797, "grad_norm": 1.1850051513280322, "learning_rate": 1.6167188726285433e-05, "loss": 1.114617919921875, "memory(GiB)": 74.93, "step": 780, "token_acc": 0.5942992874109264, "train_speed(iter/s)": 0.056434 }, { "epoch": 0.5802993901312142, "grad_norm": 1.0681729567626044, "learning_rate": 1.6121356191578213e-05, "loss": 1.1280495643615722, "memory(GiB)": 74.93, "step": 785, "token_acc": 0.705685618729097, "train_speed(iter/s)": 0.056481 }, { "epoch": 0.5839955645906487, "grad_norm": 1.2860183936318812, "learning_rate": 1.607531708422649e-05, "loss": 1.1495230674743653, "memory(GiB)": 74.93, "step": 790, "token_acc": 0.5793650793650794, "train_speed(iter/s)": 0.056516 }, { "epoch": 0.5876917390500832, "grad_norm": 1.0862282113312, "learning_rate": 1.6029072957876196e-05, "loss": 1.1175559997558593, "memory(GiB)": 74.93, "step": 795, "token_acc": 0.6226415094339622, "train_speed(iter/s)": 0.056552 }, { "epoch": 0.5913879135095177, "grad_norm": 1.1331799452220792, "learning_rate": 1.5982625373091877e-05, "loss": 1.0859192848205566, "memory(GiB)": 74.93, "step": 800, "token_acc": 0.597226235192141, "train_speed(iter/s)": 0.056592 }, { "epoch": 0.5913879135095177, "eval_loss": 0.7157755494117737, "eval_runtime": 88.6481, "eval_samples_per_second": 78.907, "eval_steps_per_second": 0.62, "eval_token_acc": 0.6206202370041347, "step": 800 }, { "epoch": 0.5950840879689522, "grad_norm": 1.108802407981979, "learning_rate": 1.593597589730404e-05, "loss": 1.147084617614746, "memory(GiB)": 74.93, "step": 805, "token_acc": 0.6168687401159726, "train_speed(iter/s)": 0.056208 }, { "epoch": 0.5987802624283867, "grad_norm": 0.9423602415844418, "learning_rate": 1.5889126104756245e-05, "loss": 1.1448484420776368, "memory(GiB)": 74.93, "step": 810, "token_acc": 0.5890688259109311, "train_speed(iter/s)": 0.056247 }, { "epoch": 0.6024764368878212, "grad_norm": 1.0816637490179923, "learning_rate": 1.5842077576451988e-05, "loss": 1.1083642959594726, "memory(GiB)": 74.93, "step": 815, "token_acc": 0.6413487738419619, "train_speed(iter/s)": 0.056285 }, { "epoch": 0.6061726113472556, "grad_norm": 1.135732608334688, "learning_rate": 1.5794831900101352e-05, "loss": 1.1130756378173827, "memory(GiB)": 74.93, "step": 820, "token_acc": 0.620497803806735, "train_speed(iter/s)": 0.056338 }, { "epoch": 0.6098687858066901, "grad_norm": 1.0156136928889437, "learning_rate": 1.5747390670067412e-05, "loss": 1.1423524856567382, "memory(GiB)": 74.93, "step": 825, "token_acc": 0.6086384564788424, "train_speed(iter/s)": 0.056378 }, { "epoch": 0.6135649602661246, "grad_norm": 1.233089498837372, "learning_rate": 1.5699755487312446e-05, "loss": 1.1060791969299317, "memory(GiB)": 74.93, "step": 830, "token_acc": 0.6365546218487395, "train_speed(iter/s)": 0.056416 }, { "epoch": 0.6172611347255591, "grad_norm": 1.1731325122439864, "learning_rate": 1.56519279593439e-05, "loss": 1.0863089561462402, "memory(GiB)": 74.93, "step": 835, "token_acc": 0.6160830090791181, "train_speed(iter/s)": 0.056451 }, { "epoch": 0.6209573091849935, "grad_norm": 1.1022360374731142, "learning_rate": 1.560390970016015e-05, "loss": 1.1188045501708985, "memory(GiB)": 74.93, "step": 840, "token_acc": 0.5851091817942646, "train_speed(iter/s)": 0.05649 }, { "epoch": 0.624653483644428, "grad_norm": 1.1163862966216507, "learning_rate": 1.5555702330196024e-05, "loss": 1.1088319778442384, "memory(GiB)": 74.93, "step": 845, "token_acc": 0.6556741028128031, "train_speed(iter/s)": 0.056533 }, { "epoch": 0.6283496581038625, "grad_norm": 1.1694067702393547, "learning_rate": 1.5507307476268126e-05, "loss": 1.1475400924682617, "memory(GiB)": 74.93, "step": 850, "token_acc": 0.6055389221556886, "train_speed(iter/s)": 0.056569 }, { "epoch": 0.6283496581038625, "eval_loss": 0.7119885683059692, "eval_runtime": 87.1877, "eval_samples_per_second": 80.229, "eval_steps_per_second": 0.631, "eval_token_acc": 0.621244721890677, "step": 850 }, { "epoch": 0.6320458325632969, "grad_norm": 1.1865540340685679, "learning_rate": 1.5458726771519946e-05, "loss": 1.135090446472168, "memory(GiB)": 74.93, "step": 855, "token_acc": 0.6295323704676296, "train_speed(iter/s)": 0.056205 }, { "epoch": 0.6357420070227314, "grad_norm": 0.9908463678598523, "learning_rate": 1.5409961855366718e-05, "loss": 1.110205078125, "memory(GiB)": 74.93, "step": 860, "token_acc": 0.6002865329512894, "train_speed(iter/s)": 0.056248 }, { "epoch": 0.6394381814821659, "grad_norm": 1.1394579815051238, "learning_rate": 1.5361014373440125e-05, "loss": 1.131001091003418, "memory(GiB)": 74.93, "step": 865, "token_acc": 0.6846254927726675, "train_speed(iter/s)": 0.056284 }, { "epoch": 0.6431343559416004, "grad_norm": 1.2277455515675866, "learning_rate": 1.5311885977532756e-05, "loss": 1.1217898368835448, "memory(GiB)": 74.93, "step": 870, "token_acc": 0.5979188900747066, "train_speed(iter/s)": 0.056322 }, { "epoch": 0.6468305304010349, "grad_norm": 1.163464153725413, "learning_rate": 1.5262578325542366e-05, "loss": 1.096768569946289, "memory(GiB)": 74.93, "step": 875, "token_acc": 0.6008762322015334, "train_speed(iter/s)": 0.056371 }, { "epoch": 0.6505267048604694, "grad_norm": 1.0920480508914876, "learning_rate": 1.521309308141592e-05, "loss": 1.1257577896118165, "memory(GiB)": 74.93, "step": 880, "token_acc": 0.6577503429355281, "train_speed(iter/s)": 0.056412 }, { "epoch": 0.6542228793199039, "grad_norm": 1.1338180174479229, "learning_rate": 1.5163431915093443e-05, "loss": 1.1262746810913087, "memory(GiB)": 74.93, "step": 885, "token_acc": 0.6306549628629304, "train_speed(iter/s)": 0.056447 }, { "epoch": 0.6579190537793383, "grad_norm": 1.295043254051827, "learning_rate": 1.511359650245168e-05, "loss": 1.1621430397033692, "memory(GiB)": 74.93, "step": 890, "token_acc": 0.6065481230595541, "train_speed(iter/s)": 0.056485 }, { "epoch": 0.6616152282387728, "grad_norm": 1.1985531473315896, "learning_rate": 1.506358852524752e-05, "loss": 1.1280719757080078, "memory(GiB)": 74.93, "step": 895, "token_acc": 0.6419322709163346, "train_speed(iter/s)": 0.056523 }, { "epoch": 0.6653114026982073, "grad_norm": 1.0909942367098966, "learning_rate": 1.5013409671061267e-05, "loss": 1.125238800048828, "memory(GiB)": 74.93, "step": 900, "token_acc": 0.599232245681382, "train_speed(iter/s)": 0.056559 }, { "epoch": 0.6653114026982073, "eval_loss": 0.7135615348815918, "eval_runtime": 87.1706, "eval_samples_per_second": 80.245, "eval_steps_per_second": 0.631, "eval_token_acc": 0.6218034107725374, "step": 900 }, { "epoch": 0.6690075771576418, "grad_norm": 1.1857146226848603, "learning_rate": 1.4963061633239665e-05, "loss": 1.1094846725463867, "memory(GiB)": 74.93, "step": 905, "token_acc": 0.6268454980245374, "train_speed(iter/s)": 0.056203 }, { "epoch": 0.6727037516170763, "grad_norm": 0.9662742881806529, "learning_rate": 1.4912546110838775e-05, "loss": 1.1187602996826171, "memory(GiB)": 74.93, "step": 910, "token_acc": 0.6091391268869849, "train_speed(iter/s)": 0.056241 }, { "epoch": 0.6763999260765108, "grad_norm": 1.0584302453369157, "learning_rate": 1.4861864808566624e-05, "loss": 1.101078701019287, "memory(GiB)": 74.93, "step": 915, "token_acc": 0.5681592039800994, "train_speed(iter/s)": 0.056284 }, { "epoch": 0.6800961005359453, "grad_norm": 1.1605002634031412, "learning_rate": 1.4811019436725684e-05, "loss": 1.146175003051758, "memory(GiB)": 74.93, "step": 920, "token_acc": 0.63498674744415, "train_speed(iter/s)": 0.056321 }, { "epoch": 0.6837922749953798, "grad_norm": 1.0137203677446553, "learning_rate": 1.4760011711155164e-05, "loss": 1.1349545478820802, "memory(GiB)": 74.93, "step": 925, "token_acc": 0.6199203187250996, "train_speed(iter/s)": 0.056361 }, { "epoch": 0.6874884494548142, "grad_norm": 1.183534701619676, "learning_rate": 1.4708843353173084e-05, "loss": 1.0977567672729491, "memory(GiB)": 74.93, "step": 930, "token_acc": 0.6462346760070052, "train_speed(iter/s)": 0.056403 }, { "epoch": 0.6911846239142487, "grad_norm": 1.1575204207505418, "learning_rate": 1.4657516089518211e-05, "loss": 1.1138565063476562, "memory(GiB)": 74.93, "step": 935, "token_acc": 0.6146223888591323, "train_speed(iter/s)": 0.056436 }, { "epoch": 0.6948807983736832, "grad_norm": 1.1418054839263487, "learning_rate": 1.4606031652291772e-05, "loss": 1.1173955917358398, "memory(GiB)": 74.93, "step": 940, "token_acc": 0.6329457364341086, "train_speed(iter/s)": 0.056463 }, { "epoch": 0.6985769728331177, "grad_norm": 1.0817591968148002, "learning_rate": 1.4554391778899016e-05, "loss": 1.0996898651123046, "memory(GiB)": 74.93, "step": 945, "token_acc": 0.6234177215189873, "train_speed(iter/s)": 0.056501 }, { "epoch": 0.7022731472925522, "grad_norm": 1.072385635877129, "learning_rate": 1.4502598211990566e-05, "loss": 1.1042339324951171, "memory(GiB)": 74.93, "step": 950, "token_acc": 0.6252068394925537, "train_speed(iter/s)": 0.056535 }, { "epoch": 0.7022731472925522, "eval_loss": 0.7057685256004333, "eval_runtime": 86.3988, "eval_samples_per_second": 80.962, "eval_steps_per_second": 0.637, "eval_token_acc": 0.62234940217981, "step": 950 }, { "epoch": 0.7059693217519867, "grad_norm": 0.9961167453619919, "learning_rate": 1.4450652699403626e-05, "loss": 1.1219955444335938, "memory(GiB)": 74.93, "step": 955, "token_acc": 0.6370088719898606, "train_speed(iter/s)": 0.056207 }, { "epoch": 0.7096654962114212, "grad_norm": 1.061517038375997, "learning_rate": 1.4398556994102996e-05, "loss": 1.1110521316528321, "memory(GiB)": 74.93, "step": 960, "token_acc": 0.592031029619182, "train_speed(iter/s)": 0.056234 }, { "epoch": 0.7133616706708557, "grad_norm": 1.0852009579100936, "learning_rate": 1.43463128541219e-05, "loss": 1.096040916442871, "memory(GiB)": 74.93, "step": 965, "token_acc": 0.6075691411935954, "train_speed(iter/s)": 0.056271 }, { "epoch": 0.7170578451302901, "grad_norm": 0.9770309231987666, "learning_rate": 1.4293922042502688e-05, "loss": 1.1151371002197266, "memory(GiB)": 74.93, "step": 970, "token_acc": 0.6337025316455697, "train_speed(iter/s)": 0.056306 }, { "epoch": 0.7207540195897246, "grad_norm": 1.1847784978202587, "learning_rate": 1.4241386327237312e-05, "loss": 1.1008172035217285, "memory(GiB)": 74.93, "step": 975, "token_acc": 0.6730158730158731, "train_speed(iter/s)": 0.05634 }, { "epoch": 0.7244501940491591, "grad_norm": 1.143052071292951, "learning_rate": 1.4188707481207677e-05, "loss": 1.083547878265381, "memory(GiB)": 74.93, "step": 980, "token_acc": 0.6250749850029994, "train_speed(iter/s)": 0.056381 }, { "epoch": 0.7281463685085936, "grad_norm": 1.0778857332369403, "learning_rate": 1.4135887282125815e-05, "loss": 1.1583375930786133, "memory(GiB)": 74.93, "step": 985, "token_acc": 0.6521739130434783, "train_speed(iter/s)": 0.056416 }, { "epoch": 0.7318425429680281, "grad_norm": 1.1338338646435362, "learning_rate": 1.4082927512473884e-05, "loss": 1.0937719345092773, "memory(GiB)": 74.93, "step": 990, "token_acc": 0.6181945090739879, "train_speed(iter/s)": 0.056448 }, { "epoch": 0.7355387174274626, "grad_norm": 1.085287732158945, "learning_rate": 1.4029829959444023e-05, "loss": 1.1042760848999023, "memory(GiB)": 74.93, "step": 995, "token_acc": 0.600328947368421, "train_speed(iter/s)": 0.056486 }, { "epoch": 0.7392348918868971, "grad_norm": 1.0122719878977164, "learning_rate": 1.3976596414878044e-05, "loss": 1.1351425170898437, "memory(GiB)": 74.93, "step": 1000, "token_acc": 0.8054474708171206, "train_speed(iter/s)": 0.056528 }, { "epoch": 0.7392348918868971, "eval_loss": 0.7091466784477234, "eval_runtime": 87.3344, "eval_samples_per_second": 80.094, "eval_steps_per_second": 0.63, "eval_token_acc": 0.622888467691853, "step": 1000 }, { "epoch": 0.7429310663463315, "grad_norm": 1.1226018601296495, "learning_rate": 1.392322867520695e-05, "loss": 1.088837242126465, "memory(GiB)": 74.93, "step": 1005, "token_acc": 0.6355053191489362, "train_speed(iter/s)": 0.056225 }, { "epoch": 0.746627240805766, "grad_norm": 1.021565181098161, "learning_rate": 1.3869728541390333e-05, "loss": 1.1350063323974608, "memory(GiB)": 74.93, "step": 1010, "token_acc": 0.6212718064153067, "train_speed(iter/s)": 0.056258 }, { "epoch": 0.7503234152652005, "grad_norm": 1.2612224567220394, "learning_rate": 1.3816097818855575e-05, "loss": 1.1172313690185547, "memory(GiB)": 74.93, "step": 1015, "token_acc": 0.5992337164750958, "train_speed(iter/s)": 0.056287 }, { "epoch": 0.754019589724635, "grad_norm": 1.1387539267847184, "learning_rate": 1.3762338317436948e-05, "loss": 1.1132306098937987, "memory(GiB)": 74.93, "step": 1020, "token_acc": 0.6117302052785923, "train_speed(iter/s)": 0.056327 }, { "epoch": 0.7577157641840695, "grad_norm": 1.358536367466617, "learning_rate": 1.3708451851314511e-05, "loss": 1.1005128860473632, "memory(GiB)": 74.93, "step": 1025, "token_acc": 0.6442417331812998, "train_speed(iter/s)": 0.05636 }, { "epoch": 0.761411938643504, "grad_norm": 1.0707791903089035, "learning_rate": 1.3654440238952913e-05, "loss": 1.0914304733276368, "memory(GiB)": 74.93, "step": 1030, "token_acc": 0.6064616582327754, "train_speed(iter/s)": 0.056391 }, { "epoch": 0.7651081131029385, "grad_norm": 1.116060507051338, "learning_rate": 1.3600305303040007e-05, "loss": 1.1009283065795898, "memory(GiB)": 74.93, "step": 1035, "token_acc": 0.6307870370370371, "train_speed(iter/s)": 0.056425 }, { "epoch": 0.768804287562373, "grad_norm": 1.1278348104888696, "learning_rate": 1.3546048870425356e-05, "loss": 1.1028734207153321, "memory(GiB)": 74.93, "step": 1040, "token_acc": 0.5868608195055875, "train_speed(iter/s)": 0.056459 }, { "epoch": 0.7725004620218074, "grad_norm": 1.1153722062693998, "learning_rate": 1.349167277205858e-05, "loss": 1.124934768676758, "memory(GiB)": 74.93, "step": 1045, "token_acc": 0.6122199592668024, "train_speed(iter/s)": 0.056492 }, { "epoch": 0.7761966364812419, "grad_norm": 1.164884012561426, "learning_rate": 1.3437178842927554e-05, "loss": 1.1385189056396485, "memory(GiB)": 74.93, "step": 1050, "token_acc": 0.6258808456117874, "train_speed(iter/s)": 0.056526 }, { "epoch": 0.7761966364812419, "eval_loss": 0.7029861211776733, "eval_runtime": 88.4673, "eval_samples_per_second": 79.069, "eval_steps_per_second": 0.622, "eval_token_acc": 0.623123948129662, "step": 1050 }, { "epoch": 0.7798928109406764, "grad_norm": 1.3055581766553261, "learning_rate": 1.338256892199651e-05, "loss": 1.1020261764526367, "memory(GiB)": 74.93, "step": 1055, "token_acc": 0.6311363636363636, "train_speed(iter/s)": 0.056221 }, { "epoch": 0.7835889854001109, "grad_norm": 1.0395384668146148, "learning_rate": 1.3327844852143956e-05, "loss": 1.148073959350586, "memory(GiB)": 74.93, "step": 1060, "token_acc": 0.604885993485342, "train_speed(iter/s)": 0.05626 }, { "epoch": 0.7872851598595454, "grad_norm": 1.1665752727714136, "learning_rate": 1.3273008480100495e-05, "loss": 1.0979449272155761, "memory(GiB)": 74.93, "step": 1065, "token_acc": 0.6049382716049383, "train_speed(iter/s)": 0.05629 }, { "epoch": 0.7909813343189799, "grad_norm": 1.041985717329155, "learning_rate": 1.3218061656386517e-05, "loss": 1.1317058563232423, "memory(GiB)": 74.93, "step": 1070, "token_acc": 0.6433460076045627, "train_speed(iter/s)": 0.056314 }, { "epoch": 0.7946775087784144, "grad_norm": 1.0369279649431482, "learning_rate": 1.316300623524972e-05, "loss": 1.1089330673217774, "memory(GiB)": 74.93, "step": 1075, "token_acc": 0.6382868937048504, "train_speed(iter/s)": 0.056354 }, { "epoch": 0.7983736832378489, "grad_norm": 1.1949441156399458, "learning_rate": 1.3107844074602566e-05, "loss": 1.0892942428588868, "memory(GiB)": 74.93, "step": 1080, "token_acc": 0.6408912188728703, "train_speed(iter/s)": 0.056386 }, { "epoch": 0.8020698576972833, "grad_norm": 1.0363420805429473, "learning_rate": 1.305257703595957e-05, "loss": 1.0744206428527832, "memory(GiB)": 74.93, "step": 1085, "token_acc": 0.6147540983606558, "train_speed(iter/s)": 0.056414 }, { "epoch": 0.8057660321567178, "grad_norm": 0.9805753007460783, "learning_rate": 1.2997206984374486e-05, "loss": 1.1048744201660157, "memory(GiB)": 74.93, "step": 1090, "token_acc": 0.6329463792150359, "train_speed(iter/s)": 0.056452 }, { "epoch": 0.8094622066161523, "grad_norm": 1.078880274058704, "learning_rate": 1.2941735788377356e-05, "loss": 1.0897531509399414, "memory(GiB)": 74.93, "step": 1095, "token_acc": 0.6396155899626268, "train_speed(iter/s)": 0.056484 }, { "epoch": 0.8131583810755868, "grad_norm": 1.083885052316346, "learning_rate": 1.2886165319911474e-05, "loss": 1.1432035446166993, "memory(GiB)": 74.93, "step": 1100, "token_acc": 0.5973259929217459, "train_speed(iter/s)": 0.056505 }, { "epoch": 0.8131583810755868, "eval_loss": 0.6945818662643433, "eval_runtime": 86.4586, "eval_samples_per_second": 80.906, "eval_steps_per_second": 0.636, "eval_token_acc": 0.6239354321874054, "step": 1100 }, { "epoch": 0.8168545555350213, "grad_norm": 1.1507994138444235, "learning_rate": 1.2830497454270206e-05, "loss": 1.1136839866638184, "memory(GiB)": 74.93, "step": 1105, "token_acc": 0.6371170793117918, "train_speed(iter/s)": 0.05622 }, { "epoch": 0.8205507299944558, "grad_norm": 1.0133515901515742, "learning_rate": 1.2774734070033692e-05, "loss": 1.1166929244995116, "memory(GiB)": 74.93, "step": 1110, "token_acc": 0.6103855721393034, "train_speed(iter/s)": 0.056253 }, { "epoch": 0.8242469044538903, "grad_norm": 1.1857531032231587, "learning_rate": 1.2718877049005477e-05, "loss": 1.1120613098144532, "memory(GiB)": 74.93, "step": 1115, "token_acc": 0.6248982912937348, "train_speed(iter/s)": 0.056279 }, { "epoch": 0.8279430789133247, "grad_norm": 1.0147593247560383, "learning_rate": 1.2662928276148985e-05, "loss": 1.0828424453735352, "memory(GiB)": 74.93, "step": 1120, "token_acc": 0.6065897858319604, "train_speed(iter/s)": 0.056309 }, { "epoch": 0.8316392533727592, "grad_norm": 1.0535067736037584, "learning_rate": 1.2606889639523925e-05, "loss": 1.082409381866455, "memory(GiB)": 74.93, "step": 1125, "token_acc": 0.6383859286083807, "train_speed(iter/s)": 0.056339 }, { "epoch": 0.8353354278321937, "grad_norm": 1.090903289476391, "learning_rate": 1.255076303022256e-05, "loss": 1.1306575775146483, "memory(GiB)": 74.93, "step": 1130, "token_acc": 0.6113028472821398, "train_speed(iter/s)": 0.056373 }, { "epoch": 0.8390316022916282, "grad_norm": 1.1602057234017449, "learning_rate": 1.2494550342305906e-05, "loss": 1.1157353401184082, "memory(GiB)": 74.93, "step": 1135, "token_acc": 0.629865985960434, "train_speed(iter/s)": 0.0564 }, { "epoch": 0.8427277767510627, "grad_norm": 1.032443656861064, "learning_rate": 1.2438253472739805e-05, "loss": 1.0929494857788087, "memory(GiB)": 74.93, "step": 1140, "token_acc": 0.6280344557556774, "train_speed(iter/s)": 0.056434 }, { "epoch": 0.8464239512104972, "grad_norm": 1.122025726444444, "learning_rate": 1.2381874321330912e-05, "loss": 1.1178958892822266, "memory(GiB)": 74.93, "step": 1145, "token_acc": 0.6517412935323383, "train_speed(iter/s)": 0.056468 }, { "epoch": 0.8501201256699317, "grad_norm": 1.0829851308141574, "learning_rate": 1.2325414790662578e-05, "loss": 1.0894483566284179, "memory(GiB)": 74.93, "step": 1150, "token_acc": 0.6569058077110785, "train_speed(iter/s)": 0.05649 }, { "epoch": 0.8501201256699317, "eval_loss": 0.6932370066642761, "eval_runtime": 86.0146, "eval_samples_per_second": 81.323, "eval_steps_per_second": 0.639, "eval_token_acc": 0.6245899292866097, "step": 1150 }, { "epoch": 0.8538163001293662, "grad_norm": 1.3861087034460704, "learning_rate": 1.2268876786030654e-05, "loss": 1.1001951217651367, "memory(GiB)": 74.93, "step": 1155, "token_acc": 0.630185845691759, "train_speed(iter/s)": 0.056209 }, { "epoch": 0.8575124745888006, "grad_norm": 1.1867682331739955, "learning_rate": 1.2212262215379199e-05, "loss": 1.1211355209350586, "memory(GiB)": 74.93, "step": 1160, "token_acc": 0.6551724137931034, "train_speed(iter/s)": 0.056235 }, { "epoch": 0.8612086490482351, "grad_norm": 1.0901861719096644, "learning_rate": 1.215557298923607e-05, "loss": 1.0956010818481445, "memory(GiB)": 74.93, "step": 1165, "token_acc": 0.6244993324432577, "train_speed(iter/s)": 0.056271 }, { "epoch": 0.8649048235076695, "grad_norm": 1.0190543071260865, "learning_rate": 1.2098811020648475e-05, "loss": 1.1221609115600586, "memory(GiB)": 74.93, "step": 1170, "token_acc": 0.612531328320802, "train_speed(iter/s)": 0.056297 }, { "epoch": 0.868600997967104, "grad_norm": 1.055731899501751, "learning_rate": 1.2041978225118409e-05, "loss": 1.0942396163940429, "memory(GiB)": 74.93, "step": 1175, "token_acc": 0.61580547112462, "train_speed(iter/s)": 0.056324 }, { "epoch": 0.8722971724265385, "grad_norm": 1.1595911679468829, "learning_rate": 1.1985076520537995e-05, "loss": 1.1030941009521484, "memory(GiB)": 74.93, "step": 1180, "token_acc": 0.6299868478737396, "train_speed(iter/s)": 0.056356 }, { "epoch": 0.875993346885973, "grad_norm": 1.1461146239140465, "learning_rate": 1.1928107827124786e-05, "loss": 1.0970783233642578, "memory(GiB)": 74.93, "step": 1185, "token_acc": 0.644696639022261, "train_speed(iter/s)": 0.056381 }, { "epoch": 0.8796895213454075, "grad_norm": 1.0680776701688195, "learning_rate": 1.1871074067356952e-05, "loss": 1.079010009765625, "memory(GiB)": 74.93, "step": 1190, "token_acc": 0.6483679525222552, "train_speed(iter/s)": 0.056408 }, { "epoch": 0.8833856958048419, "grad_norm": 1.1205292458140585, "learning_rate": 1.1813977165908406e-05, "loss": 1.098078155517578, "memory(GiB)": 74.93, "step": 1195, "token_acc": 0.6183456183456183, "train_speed(iter/s)": 0.056441 }, { "epoch": 0.8870818702642764, "grad_norm": 1.073187725881319, "learning_rate": 1.1756819049583861e-05, "loss": 1.1022902488708497, "memory(GiB)": 74.93, "step": 1200, "token_acc": 0.6195414847161572, "train_speed(iter/s)": 0.056472 }, { "epoch": 0.8870818702642764, "eval_loss": 0.6976271271705627, "eval_runtime": 87.7392, "eval_samples_per_second": 79.725, "eval_steps_per_second": 0.627, "eval_token_acc": 0.6255041474569267, "step": 1200 }, { "epoch": 0.8907780447237109, "grad_norm": 1.0836927609908615, "learning_rate": 1.1699601647253791e-05, "loss": 1.0966317176818847, "memory(GiB)": 74.93, "step": 1205, "token_acc": 0.6305779078273592, "train_speed(iter/s)": 0.056207 }, { "epoch": 0.8944742191831454, "grad_norm": 1.1200101176242079, "learning_rate": 1.1642326889789352e-05, "loss": 1.1052473068237305, "memory(GiB)": 74.93, "step": 1210, "token_acc": 0.6330027051397655, "train_speed(iter/s)": 0.05623 }, { "epoch": 0.8981703936425799, "grad_norm": 0.8945893498959235, "learning_rate": 1.158499670999722e-05, "loss": 1.0987310409545898, "memory(GiB)": 74.93, "step": 1215, "token_acc": 0.6409691629955947, "train_speed(iter/s)": 0.05626 }, { "epoch": 0.9018665681020144, "grad_norm": 1.1729053883136484, "learning_rate": 1.1527613042554368e-05, "loss": 1.1048666000366212, "memory(GiB)": 74.93, "step": 1220, "token_acc": 0.6676938880328711, "train_speed(iter/s)": 0.056294 }, { "epoch": 0.9055627425614489, "grad_norm": 1.0443569914858049, "learning_rate": 1.147017782394277e-05, "loss": 1.081749439239502, "memory(GiB)": 74.93, "step": 1225, "token_acc": 0.608612895550797, "train_speed(iter/s)": 0.056319 }, { "epoch": 0.9092589170208834, "grad_norm": 1.2005283092061096, "learning_rate": 1.1412692992384058e-05, "loss": 1.091093158721924, "memory(GiB)": 74.93, "step": 1230, "token_acc": 0.606317160534028, "train_speed(iter/s)": 0.056348 }, { "epoch": 0.9129550914803178, "grad_norm": 1.0896928360432243, "learning_rate": 1.1355160487774119e-05, "loss": 1.1176409721374512, "memory(GiB)": 74.93, "step": 1235, "token_acc": 0.5716694772344013, "train_speed(iter/s)": 0.056377 }, { "epoch": 0.9166512659397523, "grad_norm": 1.09517195359763, "learning_rate": 1.1297582251617618e-05, "loss": 1.1004619598388672, "memory(GiB)": 74.93, "step": 1240, "token_acc": 0.6309497935231472, "train_speed(iter/s)": 0.056401 }, { "epoch": 0.9203474403991868, "grad_norm": 1.0558160321968586, "learning_rate": 1.1239960226962491e-05, "loss": 1.1076683044433593, "memory(GiB)": 74.93, "step": 1245, "token_acc": 0.624376731301939, "train_speed(iter/s)": 0.056433 }, { "epoch": 0.9240436148586213, "grad_norm": 1.167401656088389, "learning_rate": 1.1182296358334373e-05, "loss": 1.0801752090454102, "memory(GiB)": 74.93, "step": 1250, "token_acc": 0.6274625110261688, "train_speed(iter/s)": 0.056468 }, { "epoch": 0.9240436148586213, "eval_loss": 0.6896535158157349, "eval_runtime": 89.0061, "eval_samples_per_second": 78.59, "eval_steps_per_second": 0.618, "eval_token_acc": 0.6259704910690581, "step": 1250 }, { "epoch": 0.9277397893180558, "grad_norm": 1.2651651199409124, "learning_rate": 1.1124592591670964e-05, "loss": 1.0778679847717285, "memory(GiB)": 74.93, "step": 1255, "token_acc": 0.6440798016763074, "train_speed(iter/s)": 0.056224 }, { "epoch": 0.9314359637774903, "grad_norm": 1.0901265302180776, "learning_rate": 1.1066850874256387e-05, "loss": 1.0967378616333008, "memory(GiB)": 74.93, "step": 1260, "token_acc": 0.6274731486715659, "train_speed(iter/s)": 0.056248 }, { "epoch": 0.9351321382369248, "grad_norm": 1.0804226410639166, "learning_rate": 1.1009073154655452e-05, "loss": 1.0889236450195312, "memory(GiB)": 74.93, "step": 1265, "token_acc": 0.620845921450151, "train_speed(iter/s)": 0.056269 }, { "epoch": 0.9388283126963592, "grad_norm": 1.228390945564267, "learning_rate": 1.09512613826479e-05, "loss": 1.1092605590820312, "memory(GiB)": 74.93, "step": 1270, "token_acc": 0.6499229583975347, "train_speed(iter/s)": 0.056301 }, { "epoch": 0.9425244871557937, "grad_norm": 1.179672539170986, "learning_rate": 1.0893417509162624e-05, "loss": 1.099574661254883, "memory(GiB)": 74.93, "step": 1275, "token_acc": 0.6232127838519764, "train_speed(iter/s)": 0.056325 }, { "epoch": 0.9462206616152282, "grad_norm": 1.0309784047078987, "learning_rate": 1.0835543486211815e-05, "loss": 1.1081634521484376, "memory(GiB)": 74.93, "step": 1280, "token_acc": 0.6257142857142857, "train_speed(iter/s)": 0.056352 }, { "epoch": 0.9499168360746627, "grad_norm": 1.1083199849496777, "learning_rate": 1.0777641266825094e-05, "loss": 1.1096603393554687, "memory(GiB)": 74.93, "step": 1285, "token_acc": 0.6357894736842106, "train_speed(iter/s)": 0.056378 }, { "epoch": 0.9536130105340972, "grad_norm": 1.0035577075576465, "learning_rate": 1.0719712804983604e-05, "loss": 1.1045263290405274, "memory(GiB)": 74.93, "step": 1290, "token_acc": 0.6397618260006616, "train_speed(iter/s)": 0.056405 }, { "epoch": 0.9573091849935317, "grad_norm": 1.0502142381441943, "learning_rate": 1.0661760055554083e-05, "loss": 1.082082462310791, "memory(GiB)": 74.93, "step": 1295, "token_acc": 0.6266829865361077, "train_speed(iter/s)": 0.056429 }, { "epoch": 0.9610053594529662, "grad_norm": 1.2499115770499312, "learning_rate": 1.0603784974222862e-05, "loss": 1.098296546936035, "memory(GiB)": 74.93, "step": 1300, "token_acc": 0.6284748309541698, "train_speed(iter/s)": 0.056459 }, { "epoch": 0.9610053594529662, "eval_loss": 0.6888419389724731, "eval_runtime": 87.7552, "eval_samples_per_second": 79.71, "eval_steps_per_second": 0.627, "eval_token_acc": 0.6259658738055717, "step": 1300 }, { "epoch": 0.9647015339124007, "grad_norm": 1.2558210208759852, "learning_rate": 1.054578951742991e-05, "loss": 1.0757410049438476, "memory(GiB)": 74.93, "step": 1305, "token_acc": 0.6296939859059755, "train_speed(iter/s)": 0.056222 }, { "epoch": 0.9683977083718351, "grad_norm": 1.1509712834800971, "learning_rate": 1.048777564230278e-05, "loss": 1.1064401626586915, "memory(GiB)": 74.93, "step": 1310, "token_acc": 0.6144927536231884, "train_speed(iter/s)": 0.056247 }, { "epoch": 0.9720938828312696, "grad_norm": 1.1877122033430165, "learning_rate": 1.0429745306590573e-05, "loss": 1.0995939254760743, "memory(GiB)": 74.93, "step": 1315, "token_acc": 0.6551246537396122, "train_speed(iter/s)": 0.056264 }, { "epoch": 0.9757900572907041, "grad_norm": 1.0334473323989715, "learning_rate": 1.0371700468597886e-05, "loss": 1.0957868576049805, "memory(GiB)": 74.93, "step": 1320, "token_acc": 0.6152882205513784, "train_speed(iter/s)": 0.056289 }, { "epoch": 0.9794862317501386, "grad_norm": 1.0379714843668957, "learning_rate": 1.0313643087118692e-05, "loss": 1.0816888809204102, "memory(GiB)": 74.93, "step": 1325, "token_acc": 0.6423645320197044, "train_speed(iter/s)": 0.056319 }, { "epoch": 0.9831824062095731, "grad_norm": 1.0681169313465444, "learning_rate": 1.0255575121370277e-05, "loss": 1.0688974380493164, "memory(GiB)": 74.93, "step": 1330, "token_acc": 0.6287527459116427, "train_speed(iter/s)": 0.056343 }, { "epoch": 0.9868785806690076, "grad_norm": 1.1171758504896703, "learning_rate": 1.0197498530927102e-05, "loss": 1.099297332763672, "memory(GiB)": 74.93, "step": 1335, "token_acc": 0.6077836745008846, "train_speed(iter/s)": 0.056367 }, { "epoch": 0.9905747551284421, "grad_norm": 1.0576212439483514, "learning_rate": 1.0139415275654671e-05, "loss": 1.0867423057556151, "memory(GiB)": 74.93, "step": 1340, "token_acc": 0.6263262599469496, "train_speed(iter/s)": 0.056396 }, { "epoch": 0.9942709295878766, "grad_norm": 1.258815850774044, "learning_rate": 1.0081327315643406e-05, "loss": 1.1155497550964355, "memory(GiB)": 74.93, "step": 1345, "token_acc": 0.655549765502866, "train_speed(iter/s)": 0.056419 }, { "epoch": 0.997967104047311, "grad_norm": 1.0659691536136329, "learning_rate": 1.0023236611142499e-05, "loss": 1.057703685760498, "memory(GiB)": 74.93, "step": 1350, "token_acc": 0.712, "train_speed(iter/s)": 0.056446 }, { "epoch": 0.997967104047311, "eval_loss": 0.6881307363510132, "eval_runtime": 86.0221, "eval_samples_per_second": 81.316, "eval_steps_per_second": 0.639, "eval_token_acc": 0.626765814704599, "step": 1350 }, { "epoch": 1.0022177046756606, "grad_norm": 1.3069033033680353, "learning_rate": 9.965145122493756e-06, "loss": 1.2448784828186035, "memory(GiB)": 74.93, "step": 1355, "token_acc": 0.6295214105793451, "train_speed(iter/s)": 0.056171 }, { "epoch": 1.0059138791350952, "grad_norm": 0.9882434180756982, "learning_rate": 9.907054810065446e-06, "loss": 1.062336540222168, "memory(GiB)": 74.93, "step": 1360, "token_acc": 0.6483717526527625, "train_speed(iter/s)": 0.056192 }, { "epoch": 1.0096100535945296, "grad_norm": 1.2362454534970095, "learning_rate": 9.848967634186142e-06, "loss": 1.0906942367553711, "memory(GiB)": 74.93, "step": 1365, "token_acc": 0.6448347722536469, "train_speed(iter/s)": 0.056213 }, { "epoch": 1.0133062280539642, "grad_norm": 1.070334993285048, "learning_rate": 9.790885555078575e-06, "loss": 1.0470151901245117, "memory(GiB)": 74.93, "step": 1370, "token_acc": 0.6228728728728729, "train_speed(iter/s)": 0.056237 }, { "epoch": 1.0170024025133986, "grad_norm": 1.0576680139627181, "learning_rate": 9.732810532793465e-06, "loss": 1.0586755752563477, "memory(GiB)": 74.93, "step": 1375, "token_acc": 0.6435643564356436, "train_speed(iter/s)": 0.056266 }, { "epoch": 1.0206985769728332, "grad_norm": 1.0167739538945428, "learning_rate": 9.674744527143419e-06, "loss": 1.059821891784668, "memory(GiB)": 74.93, "step": 1380, "token_acc": 0.6397306397306397, "train_speed(iter/s)": 0.056291 }, { "epoch": 1.0243947514322675, "grad_norm": 1.1268503654686965, "learning_rate": 9.61668949763674e-06, "loss": 1.0377557754516602, "memory(GiB)": 74.93, "step": 1385, "token_acc": 0.6721439749608764, "train_speed(iter/s)": 0.056311 }, { "epoch": 1.0280909258917021, "grad_norm": 0.9931688648143746, "learning_rate": 9.558647403411334e-06, "loss": 1.0480243682861328, "memory(GiB)": 74.93, "step": 1390, "token_acc": 0.6135416666666667, "train_speed(iter/s)": 0.056336 }, { "epoch": 1.0317871003511365, "grad_norm": 1.1339232037274705, "learning_rate": 9.500620203168604e-06, "loss": 1.0579310417175294, "memory(GiB)": 74.93, "step": 1395, "token_acc": 0.6699186991869919, "train_speed(iter/s)": 0.056365 }, { "epoch": 1.0354832748105711, "grad_norm": 0.9738636619210117, "learning_rate": 9.442609855107317e-06, "loss": 1.0384546279907227, "memory(GiB)": 74.93, "step": 1400, "token_acc": 0.6303651505445227, "train_speed(iter/s)": 0.056383 }, { "epoch": 1.0354832748105711, "eval_loss": 0.6841524243354797, "eval_runtime": 86.5188, "eval_samples_per_second": 80.849, "eval_steps_per_second": 0.636, "eval_token_acc": 0.6267785121791868, "step": 1400 }, { "epoch": 1.0391794492700055, "grad_norm": 1.0076575163805248, "learning_rate": 9.38461831685756e-06, "loss": 1.0656241416931151, "memory(GiB)": 74.93, "step": 1405, "token_acc": 0.6295483423818875, "train_speed(iter/s)": 0.056156 }, { "epoch": 1.04287562372944, "grad_norm": 1.0590248393948134, "learning_rate": 9.326647545414647e-06, "loss": 1.0602170944213867, "memory(GiB)": 74.93, "step": 1410, "token_acc": 0.7284836065573771, "train_speed(iter/s)": 0.056177 }, { "epoch": 1.0465717981888745, "grad_norm": 1.0411050083571975, "learning_rate": 9.268699497073102e-06, "loss": 1.0623086929321288, "memory(GiB)": 74.93, "step": 1415, "token_acc": 0.6079059829059829, "train_speed(iter/s)": 0.056203 }, { "epoch": 1.050267972648309, "grad_norm": 1.0820280991464322, "learning_rate": 9.21077612736062e-06, "loss": 1.0742631912231446, "memory(GiB)": 74.93, "step": 1420, "token_acc": 0.6051423324150597, "train_speed(iter/s)": 0.056231 }, { "epoch": 1.0539641471077434, "grad_norm": 1.0150109672389387, "learning_rate": 9.152879390972085e-06, "loss": 1.060621452331543, "memory(GiB)": 74.93, "step": 1425, "token_acc": 0.6677704194260485, "train_speed(iter/s)": 0.056246 }, { "epoch": 1.057660321567178, "grad_norm": 1.0625464742964672, "learning_rate": 9.095011241703623e-06, "loss": 1.1060840606689453, "memory(GiB)": 74.93, "step": 1430, "token_acc": 0.617154288572143, "train_speed(iter/s)": 0.056275 }, { "epoch": 1.0613564960266124, "grad_norm": 1.080121630294682, "learning_rate": 9.037173632386635e-06, "loss": 1.051788902282715, "memory(GiB)": 74.93, "step": 1435, "token_acc": 0.693069306930693, "train_speed(iter/s)": 0.056295 }, { "epoch": 1.065052670486047, "grad_norm": 0.9965862626370368, "learning_rate": 8.979368514821917e-06, "loss": 1.0715249061584473, "memory(GiB)": 74.93, "step": 1440, "token_acc": 0.6563587166602242, "train_speed(iter/s)": 0.05632 }, { "epoch": 1.0687488449454814, "grad_norm": 1.0523645368442776, "learning_rate": 8.921597839713803e-06, "loss": 1.0732128143310546, "memory(GiB)": 74.93, "step": 1445, "token_acc": 0.6195273149941883, "train_speed(iter/s)": 0.056345 }, { "epoch": 1.072445019404916, "grad_norm": 0.9439502959144558, "learning_rate": 8.863863556604312e-06, "loss": 1.0644493103027344, "memory(GiB)": 74.93, "step": 1450, "token_acc": 0.6215469613259669, "train_speed(iter/s)": 0.056369 }, { "epoch": 1.072445019404916, "eval_loss": 0.6834661960601807, "eval_runtime": 87.5557, "eval_samples_per_second": 79.892, "eval_steps_per_second": 0.628, "eval_token_acc": 0.627048622093144, "step": 1450 }, { "epoch": 1.0761411938643504, "grad_norm": 1.1260430229381853, "learning_rate": 8.806167613807374e-06, "loss": 1.0463625907897949, "memory(GiB)": 74.93, "step": 1455, "token_acc": 0.6380742913000977, "train_speed(iter/s)": 0.05615 }, { "epoch": 1.079837368323785, "grad_norm": 1.1262155455903309, "learning_rate": 8.748511958343076e-06, "loss": 1.0758758544921876, "memory(GiB)": 74.93, "step": 1460, "token_acc": 0.6353591160220995, "train_speed(iter/s)": 0.056173 }, { "epoch": 1.0835335427832193, "grad_norm": 1.0836611872394941, "learning_rate": 8.690898535871967e-06, "loss": 1.0662212371826172, "memory(GiB)": 74.93, "step": 1465, "token_acc": 0.6074675324675325, "train_speed(iter/s)": 0.0562 }, { "epoch": 1.087229717242654, "grad_norm": 1.1980862381018496, "learning_rate": 8.633329290629385e-06, "loss": 1.042177963256836, "memory(GiB)": 74.93, "step": 1470, "token_acc": 0.6368200836820084, "train_speed(iter/s)": 0.056225 }, { "epoch": 1.0909258917020883, "grad_norm": 1.1395698139161996, "learning_rate": 8.575806165359852e-06, "loss": 1.0712276458740235, "memory(GiB)": 74.93, "step": 1475, "token_acc": 0.6389548693586699, "train_speed(iter/s)": 0.056249 }, { "epoch": 1.094622066161523, "grad_norm": 1.0531458891625334, "learning_rate": 8.51833110125153e-06, "loss": 1.0721662521362305, "memory(GiB)": 74.93, "step": 1480, "token_acc": 0.6220368744512731, "train_speed(iter/s)": 0.056271 }, { "epoch": 1.0983182406209573, "grad_norm": 0.952355580471414, "learning_rate": 8.460906037870677e-06, "loss": 1.018984603881836, "memory(GiB)": 74.93, "step": 1485, "token_acc": 0.6109256449165402, "train_speed(iter/s)": 0.056292 }, { "epoch": 1.1020144150803919, "grad_norm": 1.0722820285217056, "learning_rate": 8.403532913096231e-06, "loss": 1.0254201889038086, "memory(GiB)": 74.93, "step": 1490, "token_acc": 0.6746411483253588, "train_speed(iter/s)": 0.056313 }, { "epoch": 1.1057105895398263, "grad_norm": 1.0574628279248734, "learning_rate": 8.346213663054388e-06, "loss": 1.0446287155151368, "memory(GiB)": 74.93, "step": 1495, "token_acc": 0.6608030592734225, "train_speed(iter/s)": 0.056333 }, { "epoch": 1.1094067639992609, "grad_norm": 1.0816482005421177, "learning_rate": 8.288950222053287e-06, "loss": 1.0296789169311524, "memory(GiB)": 74.93, "step": 1500, "token_acc": 0.5984496124031008, "train_speed(iter/s)": 0.056359 }, { "epoch": 1.1094067639992609, "eval_loss": 0.6838507056236267, "eval_runtime": 89.1049, "eval_samples_per_second": 78.503, "eval_steps_per_second": 0.617, "eval_token_acc": 0.6276061566591329, "step": 1500 }, { "epoch": 1.1131029384586952, "grad_norm": 1.0669000768005377, "learning_rate": 8.231744522517713e-06, "loss": 1.052156925201416, "memory(GiB)": 74.93, "step": 1505, "token_acc": 0.6264432872990717, "train_speed(iter/s)": 0.056154 }, { "epoch": 1.1167991129181298, "grad_norm": 1.123101456521189, "learning_rate": 8.174598494923893e-06, "loss": 1.0532621383666991, "memory(GiB)": 74.93, "step": 1510, "token_acc": 0.6674074074074074, "train_speed(iter/s)": 0.056174 }, { "epoch": 1.1204952873775642, "grad_norm": 0.9597873108803062, "learning_rate": 8.117514067734365e-06, "loss": 1.0872188568115235, "memory(GiB)": 74.93, "step": 1515, "token_acc": 0.6229354939233406, "train_speed(iter/s)": 0.056193 }, { "epoch": 1.1241914618369988, "grad_norm": 1.01751822081855, "learning_rate": 8.060493167332874e-06, "loss": 1.0647924423217774, "memory(GiB)": 74.93, "step": 1520, "token_acc": 0.6589195979899497, "train_speed(iter/s)": 0.056222 }, { "epoch": 1.1278876362964332, "grad_norm": 1.2668018355322213, "learning_rate": 8.003537717959378e-06, "loss": 1.054795265197754, "memory(GiB)": 74.93, "step": 1525, "token_acc": 0.6280428432327166, "train_speed(iter/s)": 0.056242 }, { "epoch": 1.1315838107558678, "grad_norm": 1.0402787270589529, "learning_rate": 7.946649641645108e-06, "loss": 1.0737996101379395, "memory(GiB)": 74.93, "step": 1530, "token_acc": 0.6400172860847018, "train_speed(iter/s)": 0.056265 }, { "epoch": 1.1352799852153022, "grad_norm": 1.1860588895073847, "learning_rate": 7.889830858147718e-06, "loss": 1.0505868911743164, "memory(GiB)": 74.93, "step": 1535, "token_acc": 0.6243339253996447, "train_speed(iter/s)": 0.056293 }, { "epoch": 1.1389761596747365, "grad_norm": 1.0989591028912902, "learning_rate": 7.833083284886484e-06, "loss": 1.0597726821899414, "memory(GiB)": 74.93, "step": 1540, "token_acc": 0.6668341708542713, "train_speed(iter/s)": 0.056316 }, { "epoch": 1.1426723341341711, "grad_norm": 1.1347824812891065, "learning_rate": 7.7764088368776e-06, "loss": 1.0500106811523438, "memory(GiB)": 74.93, "step": 1545, "token_acc": 0.6302988186240445, "train_speed(iter/s)": 0.056337 }, { "epoch": 1.1463685085936057, "grad_norm": 1.0564162756732445, "learning_rate": 7.719809426669576e-06, "loss": 1.0577827453613282, "memory(GiB)": 74.93, "step": 1550, "token_acc": 0.6201646090534979, "train_speed(iter/s)": 0.056358 }, { "epoch": 1.1463685085936057, "eval_loss": 0.6770405769348145, "eval_runtime": 87.0148, "eval_samples_per_second": 80.389, "eval_steps_per_second": 0.632, "eval_token_acc": 0.6278058533049218, "step": 1550 }, { "epoch": 1.15006468305304, "grad_norm": 1.1665260843525407, "learning_rate": 7.663286964278665e-06, "loss": 1.046430492401123, "memory(GiB)": 74.93, "step": 1555, "token_acc": 0.6295910639909126, "train_speed(iter/s)": 0.056161 }, { "epoch": 1.1537608575124745, "grad_norm": 1.0893384767496972, "learning_rate": 7.606843357124426e-06, "loss": 1.0604162216186523, "memory(GiB)": 74.93, "step": 1560, "token_acc": 0.618162506638343, "train_speed(iter/s)": 0.056181 }, { "epoch": 1.157457031971909, "grad_norm": 1.0091311530942315, "learning_rate": 7.550480509965348e-06, "loss": 1.0764715194702148, "memory(GiB)": 74.93, "step": 1565, "token_acc": 0.6651108518086347, "train_speed(iter/s)": 0.056207 }, { "epoch": 1.1611532064313437, "grad_norm": 0.9991849558827516, "learning_rate": 7.494200324834588e-06, "loss": 1.076918888092041, "memory(GiB)": 74.93, "step": 1570, "token_acc": 0.6519940915805023, "train_speed(iter/s)": 0.056225 }, { "epoch": 1.164849380890778, "grad_norm": 1.1070133372574182, "learning_rate": 7.43800470097576e-06, "loss": 1.0360871315002442, "memory(GiB)": 74.93, "step": 1575, "token_acc": 0.6534121440085975, "train_speed(iter/s)": 0.056247 }, { "epoch": 1.1685455553502124, "grad_norm": 0.9616191113258434, "learning_rate": 7.381895534778852e-06, "loss": 1.071969223022461, "memory(GiB)": 74.93, "step": 1580, "token_acc": 0.6318518518518519, "train_speed(iter/s)": 0.05627 }, { "epoch": 1.172241729809647, "grad_norm": 0.9588896754114927, "learning_rate": 7.3258747197162484e-06, "loss": 1.0856236457824706, "memory(GiB)": 74.93, "step": 1585, "token_acc": 0.6137469586374696, "train_speed(iter/s)": 0.05629 }, { "epoch": 1.1759379042690816, "grad_norm": 1.155114349369357, "learning_rate": 7.269944146278801e-06, "loss": 1.054957962036133, "memory(GiB)": 74.93, "step": 1590, "token_acc": 0.6266263237518911, "train_speed(iter/s)": 0.056314 }, { "epoch": 1.179634078728516, "grad_norm": 1.0144629940415562, "learning_rate": 7.214105701912054e-06, "loss": 1.0508974075317383, "memory(GiB)": 74.93, "step": 1595, "token_acc": 0.6369260827092152, "train_speed(iter/s)": 0.056334 }, { "epoch": 1.1833302531879504, "grad_norm": 1.1824656228465167, "learning_rate": 7.1583612709525405e-06, "loss": 1.0430817604064941, "memory(GiB)": 74.93, "step": 1600, "token_acc": 0.6061151079136691, "train_speed(iter/s)": 0.056355 }, { "epoch": 1.1833302531879504, "eval_loss": 0.674736499786377, "eval_runtime": 85.716, "eval_samples_per_second": 81.607, "eval_steps_per_second": 0.642, "eval_token_acc": 0.6284499615612815, "step": 1600 }, { "epoch": 1.187026427647385, "grad_norm": 1.1524983234954504, "learning_rate": 7.102712734564202e-06, "loss": 1.046616268157959, "memory(GiB)": 74.93, "step": 1605, "token_acc": 0.6345166331770484, "train_speed(iter/s)": 0.056165 }, { "epoch": 1.1907226021068193, "grad_norm": 0.9309819347033588, "learning_rate": 7.047161970674896e-06, "loss": 1.0448005676269532, "memory(GiB)": 74.93, "step": 1610, "token_acc": 0.6130097087378641, "train_speed(iter/s)": 0.056187 }, { "epoch": 1.194418776566254, "grad_norm": 1.0772202352983227, "learning_rate": 6.991710853913025e-06, "loss": 1.0570079803466796, "memory(GiB)": 74.93, "step": 1615, "token_acc": 0.6610324349017817, "train_speed(iter/s)": 0.056205 }, { "epoch": 1.1981149510256883, "grad_norm": 1.1619152201928238, "learning_rate": 6.936361255544288e-06, "loss": 1.044645118713379, "memory(GiB)": 74.93, "step": 1620, "token_acc": 0.6945525291828794, "train_speed(iter/s)": 0.056227 }, { "epoch": 1.201811125485123, "grad_norm": 1.0467564412195258, "learning_rate": 6.881115043408512e-06, "loss": 1.045677661895752, "memory(GiB)": 74.93, "step": 1625, "token_acc": 0.648811228874248, "train_speed(iter/s)": 0.056246 }, { "epoch": 1.2055072999445573, "grad_norm": 1.0325120697680106, "learning_rate": 6.825974081856626e-06, "loss": 1.0619203567504882, "memory(GiB)": 74.93, "step": 1630, "token_acc": 0.6202729044834308, "train_speed(iter/s)": 0.056267 }, { "epoch": 1.209203474403992, "grad_norm": 0.9412938462579274, "learning_rate": 6.770940231687767e-06, "loss": 1.0478931427001954, "memory(GiB)": 74.93, "step": 1635, "token_acc": 0.6356352537199542, "train_speed(iter/s)": 0.056289 }, { "epoch": 1.2128996488634263, "grad_norm": 1.140398149863178, "learning_rate": 6.716015350086449e-06, "loss": 1.0618717193603515, "memory(GiB)": 74.93, "step": 1640, "token_acc": 0.6066892464013548, "train_speed(iter/s)": 0.05631 }, { "epoch": 1.2165958233228609, "grad_norm": 1.0930330137960338, "learning_rate": 6.661201290559918e-06, "loss": 1.0522537231445312, "memory(GiB)": 74.93, "step": 1645, "token_acc": 0.6371971185330714, "train_speed(iter/s)": 0.056329 }, { "epoch": 1.2202919977822952, "grad_norm": 1.0731043610961355, "learning_rate": 6.606499902875585e-06, "loss": 1.0263765335083008, "memory(GiB)": 74.93, "step": 1650, "token_acc": 0.6519023282226007, "train_speed(iter/s)": 0.056348 }, { "epoch": 1.2202919977822952, "eval_loss": 0.6756451725959778, "eval_runtime": 86.9798, "eval_samples_per_second": 80.421, "eval_steps_per_second": 0.632, "eval_token_acc": 0.6288528178004742, "step": 1650 }, { "epoch": 1.2239881722417298, "grad_norm": 1.131205462756531, "learning_rate": 6.5519130329986245e-06, "loss": 1.0687341690063477, "memory(GiB)": 74.93, "step": 1655, "token_acc": 0.6333847797696782, "train_speed(iter/s)": 0.056155 }, { "epoch": 1.2276843467011642, "grad_norm": 1.046052101501651, "learning_rate": 6.497442523029663e-06, "loss": 1.0175907135009765, "memory(GiB)": 74.93, "step": 1660, "token_acc": 0.6453744493392071, "train_speed(iter/s)": 0.056176 }, { "epoch": 1.2313805211605988, "grad_norm": 1.0553291483906215, "learning_rate": 6.443090211142613e-06, "loss": 1.0627668380737305, "memory(GiB)": 74.93, "step": 1665, "token_acc": 0.6409149762624082, "train_speed(iter/s)": 0.056196 }, { "epoch": 1.2350766956200332, "grad_norm": 0.9606710463766085, "learning_rate": 6.388857931522657e-06, "loss": 1.043929672241211, "memory(GiB)": 74.93, "step": 1670, "token_acc": 0.6334586466165414, "train_speed(iter/s)": 0.056218 }, { "epoch": 1.2387728700794678, "grad_norm": 0.9843358834706085, "learning_rate": 6.334747514304338e-06, "loss": 1.0336435317993165, "memory(GiB)": 74.93, "step": 1675, "token_acc": 0.6631016042780749, "train_speed(iter/s)": 0.056238 }, { "epoch": 1.2424690445389022, "grad_norm": 1.0297683983640094, "learning_rate": 6.280760785509802e-06, "loss": 1.0500383377075195, "memory(GiB)": 74.93, "step": 1680, "token_acc": 0.6349254639488896, "train_speed(iter/s)": 0.05626 }, { "epoch": 1.2461652189983368, "grad_norm": 1.0776782375280287, "learning_rate": 6.226899566987177e-06, "loss": 1.0217618942260742, "memory(GiB)": 74.93, "step": 1685, "token_acc": 0.655511811023622, "train_speed(iter/s)": 0.056281 }, { "epoch": 1.2498613934577711, "grad_norm": 1.0846016823921123, "learning_rate": 6.173165676349103e-06, "loss": 1.0370861053466798, "memory(GiB)": 74.93, "step": 1690, "token_acc": 0.6801365964712578, "train_speed(iter/s)": 0.056303 }, { "epoch": 1.2535575679172057, "grad_norm": 1.0790787844363594, "learning_rate": 6.119560926911377e-06, "loss": 1.0697561264038087, "memory(GiB)": 74.93, "step": 1695, "token_acc": 0.6681639528354857, "train_speed(iter/s)": 0.056324 }, { "epoch": 1.2572537423766401, "grad_norm": 1.106497642833312, "learning_rate": 6.066087127631761e-06, "loss": 1.0666908264160155, "memory(GiB)": 74.93, "step": 1700, "token_acc": 0.6533379694019471, "train_speed(iter/s)": 0.056341 }, { "epoch": 1.2572537423766401, "eval_loss": 0.6751002073287964, "eval_runtime": 88.5942, "eval_samples_per_second": 78.955, "eval_steps_per_second": 0.621, "eval_token_acc": 0.6288689782226767, "step": 1700 }, { "epoch": 1.2609499168360747, "grad_norm": 1.0779984264892808, "learning_rate": 6.012746083048966e-06, "loss": 1.0639089584350585, "memory(GiB)": 34.88, "step": 1705, "token_acc": 0.6968838526912181, "train_speed(iter/s)": 14.788094 }, { "epoch": 1.264646091295509, "grad_norm": 1.1027008185153624, "learning_rate": 5.959539593221711e-06, "loss": 1.0941818237304688, "memory(GiB)": 34.88, "step": 1710, "token_acc": 0.6294489092996556, "train_speed(iter/s)": 9.344634 }, { "epoch": 1.2683422657549437, "grad_norm": 1.2059692859973439, "learning_rate": 5.9064694536680135e-06, "loss": 1.0492952346801758, "memory(GiB)": 49.4, "step": 1715, "token_acc": 0.6576319543509273, "train_speed(iter/s)": 6.522706 }, { "epoch": 1.272038440214378, "grad_norm": 1.0913297173697671, "learning_rate": 5.853537455304575e-06, "loss": 1.0665050506591798, "memory(GiB)": 49.4, "step": 1720, "token_acc": 0.6941935483870968, "train_speed(iter/s)": 4.977275 }, { "epoch": 1.2757346146738127, "grad_norm": 1.1326249785449936, "learning_rate": 5.800745384386364e-06, "loss": 1.035014533996582, "memory(GiB)": 49.4, "step": 1725, "token_acc": 0.6055200269269606, "train_speed(iter/s)": 4.1257 }, { "epoch": 1.279430789133247, "grad_norm": 1.011492822170868, "learning_rate": 5.74809502244632e-06, "loss": 1.040954875946045, "memory(GiB)": 49.4, "step": 1730, "token_acc": 0.6559888579387186, "train_speed(iter/s)": 3.505361 }, { "epoch": 1.2831269635926816, "grad_norm": 0.9143549731190831, "learning_rate": 5.695588146235241e-06, "loss": 1.056338119506836, "memory(GiB)": 49.4, "step": 1735, "token_acc": 0.6355591311343524, "train_speed(iter/s)": 3.006185 }, { "epoch": 1.286823138052116, "grad_norm": 1.0541690596505233, "learning_rate": 5.643226527661825e-06, "loss": 1.0424397468566895, "memory(GiB)": 64.42, "step": 1740, "token_acc": 0.6127497621313035, "train_speed(iter/s)": 2.653736 }, { "epoch": 1.2905193125115506, "grad_norm": 1.071302718364978, "learning_rate": 5.591011933732873e-06, "loss": 1.0049684524536133, "memory(GiB)": 64.42, "step": 1745, "token_acc": 0.6237816764132553, "train_speed(iter/s)": 2.414167 }, { "epoch": 1.294215486970985, "grad_norm": 1.0017860936129825, "learning_rate": 5.538946126493659e-06, "loss": 1.048162841796875, "memory(GiB)": 64.42, "step": 1750, "token_acc": 0.6117103235747303, "train_speed(iter/s)": 2.163836 }, { "epoch": 1.294215486970985, "eval_loss": 0.6697070002555847, "eval_runtime": 85.8145, "eval_samples_per_second": 81.513, "eval_steps_per_second": 0.641, "eval_token_acc": 0.6293895746807739, "step": 1750 }, { "epoch": 1.2979116614304196, "grad_norm": 1.1010002294868126, "learning_rate": 5.4870308629684675e-06, "loss": 1.0428232192993163, "memory(GiB)": 74.63, "step": 1755, "token_acc": 0.634660903571061, "train_speed(iter/s)": 1.752193 }, { "epoch": 1.301607835889854, "grad_norm": 1.1351842621603827, "learning_rate": 5.435267895101303e-06, "loss": 1.0705801010131837, "memory(GiB)": 74.63, "step": 1760, "token_acc": 0.663578947368421, "train_speed(iter/s)": 1.629796 }, { "epoch": 1.3053040103492886, "grad_norm": 0.9688327106799416, "learning_rate": 5.383658969696767e-06, "loss": 1.043651008605957, "memory(GiB)": 74.63, "step": 1765, "token_acc": 0.6663619744058501, "train_speed(iter/s)": 1.540319 }, { "epoch": 1.309000184808723, "grad_norm": 1.0196740986171486, "learning_rate": 5.3322058283611045e-06, "loss": 1.066755485534668, "memory(GiB)": 74.63, "step": 1770, "token_acc": 0.6984352773826458, "train_speed(iter/s)": 1.440515 }, { "epoch": 1.3126963592681575, "grad_norm": 0.9324312791356152, "learning_rate": 5.2809102074434505e-06, "loss": 1.0861141204833984, "memory(GiB)": 74.63, "step": 1775, "token_acc": 0.6625352112676056, "train_speed(iter/s)": 1.355437 }, { "epoch": 1.316392533727592, "grad_norm": 1.0475529503023757, "learning_rate": 5.229773837977208e-06, "loss": 1.0537721633911132, "memory(GiB)": 74.63, "step": 1780, "token_acc": 0.6779266161910309, "train_speed(iter/s)": 1.294879 }, { "epoch": 1.3200887081870265, "grad_norm": 0.9281011767547357, "learning_rate": 5.178798445621645e-06, "loss": 1.0430593490600586, "memory(GiB)": 74.63, "step": 1785, "token_acc": 0.6330935251798561, "train_speed(iter/s)": 1.224208 }, { "epoch": 1.3237848826464609, "grad_norm": 1.0483168678654606, "learning_rate": 5.127985750603671e-06, "loss": 1.071333885192871, "memory(GiB)": 74.63, "step": 1790, "token_acc": 0.6417910447761194, "train_speed(iter/s)": 1.162932 }, { "epoch": 1.3274810571058955, "grad_norm": 1.097565660571469, "learning_rate": 5.077337467659768e-06, "loss": 1.0753141403198243, "memory(GiB)": 74.63, "step": 1795, "token_acc": 0.6051001821493625, "train_speed(iter/s)": 1.117195 }, { "epoch": 1.3311772315653299, "grad_norm": 1.063181582729188, "learning_rate": 5.026855305978129e-06, "loss": 1.0764029502868653, "memory(GiB)": 74.63, "step": 1800, "token_acc": 0.6232106339468303, "train_speed(iter/s)": 1.067656 }, { "epoch": 1.3311772315653299, "eval_loss": 0.6690813899040222, "eval_runtime": 85.9692, "eval_samples_per_second": 81.366, "eval_steps_per_second": 0.64, "eval_token_acc": 0.6297716532342776, "step": 1800 }, { "epoch": 1.3348734060247645, "grad_norm": 1.0429392429603475, "learning_rate": 4.976540969140984e-06, "loss": 1.090817928314209, "memory(GiB)": 74.63, "step": 1805, "token_acc": 0.6356786703601108, "train_speed(iter/s)": 0.961744 }, { "epoch": 1.3385695804841988, "grad_norm": 1.0548409670879852, "learning_rate": 4.926396155067114e-06, "loss": 1.0316819190979003, "memory(GiB)": 74.63, "step": 1810, "token_acc": 0.6598138091543833, "train_speed(iter/s)": 0.923472 }, { "epoch": 1.3422657549436332, "grad_norm": 1.0297653617411635, "learning_rate": 4.876422555954543e-06, "loss": 1.03601131439209, "memory(GiB)": 74.63, "step": 1815, "token_acc": 0.6965428937259923, "train_speed(iter/s)": 0.894132 }, { "epoch": 1.3459619294030678, "grad_norm": 1.1178512794477986, "learning_rate": 4.826621858223431e-06, "loss": 1.0318429946899415, "memory(GiB)": 74.63, "step": 1820, "token_acc": 0.6313304721030043, "train_speed(iter/s)": 0.864578 }, { "epoch": 1.3496581038625024, "grad_norm": 1.0401775609610366, "learning_rate": 4.776995742459184e-06, "loss": 1.0820954322814942, "memory(GiB)": 74.63, "step": 1825, "token_acc": 0.6357702349869452, "train_speed(iter/s)": 0.833393 }, { "epoch": 1.3533542783219368, "grad_norm": 1.1053520267340973, "learning_rate": 4.727545883355713e-06, "loss": 1.0570013046264648, "memory(GiB)": 74.63, "step": 1830, "token_acc": 0.6462998102466793, "train_speed(iter/s)": 0.80849 }, { "epoch": 1.3570504527813712, "grad_norm": 1.0129657782670332, "learning_rate": 4.678273949658939e-06, "loss": 1.0589232444763184, "memory(GiB)": 74.63, "step": 1835, "token_acc": 0.6194251734390486, "train_speed(iter/s)": 0.785859 }, { "epoch": 1.3607466272408058, "grad_norm": 0.9863992139542379, "learning_rate": 4.629181604110464e-06, "loss": 1.0515235900878905, "memory(GiB)": 74.63, "step": 1840, "token_acc": 0.6229317851959362, "train_speed(iter/s)": 0.761135 }, { "epoch": 1.3644428017002403, "grad_norm": 1.1494795183000623, "learning_rate": 4.580270503391487e-06, "loss": 1.0223835945129394, "memory(GiB)": 74.63, "step": 1845, "token_acc": 0.6583261432269197, "train_speed(iter/s)": 0.739616 }, { "epoch": 1.3681389761596747, "grad_norm": 1.14471617138646, "learning_rate": 4.531542298066861e-06, "loss": 1.0207533836364746, "memory(GiB)": 74.63, "step": 1850, "token_acc": 0.6551959114139694, "train_speed(iter/s)": 0.721142 }, { "epoch": 1.3681389761596747, "eval_loss": 0.6679942607879639, "eval_runtime": 93.3503, "eval_samples_per_second": 74.933, "eval_steps_per_second": 0.589, "eval_token_acc": 0.6300302199895188, "step": 1850 }, { "epoch": 1.371835150619109, "grad_norm": 1.1238157971359715, "learning_rate": 4.482998632529414e-06, "loss": 1.0442536354064942, "memory(GiB)": 74.63, "step": 1855, "token_acc": 0.6386843397152675, "train_speed(iter/s)": 0.673362 }, { "epoch": 1.3755313250785437, "grad_norm": 0.9044341600768213, "learning_rate": 4.434641144944464e-06, "loss": 1.0640903472900392, "memory(GiB)": 74.63, "step": 1860, "token_acc": 0.6587333602258976, "train_speed(iter/s)": 0.655234 }, { "epoch": 1.3792274995379783, "grad_norm": 1.0166299256206919, "learning_rate": 4.386471467194513e-06, "loss": 1.0587308883666993, "memory(GiB)": 74.63, "step": 1865, "token_acc": 0.6148590947907772, "train_speed(iter/s)": 0.63915 }, { "epoch": 1.3829236739974127, "grad_norm": 1.2786373427724909, "learning_rate": 4.338491224824198e-06, "loss": 1.0438286781311035, "memory(GiB)": 74.63, "step": 1870, "token_acc": 0.6332835077229696, "train_speed(iter/s)": 0.625873 }, { "epoch": 1.386619848456847, "grad_norm": 1.0910902180920756, "learning_rate": 4.290702036985423e-06, "loss": 1.0352885246276855, "memory(GiB)": 74.63, "step": 1875, "token_acc": 0.6918429003021148, "train_speed(iter/s)": 0.610514 }, { "epoch": 1.3903160229162816, "grad_norm": 1.0540455114144576, "learning_rate": 4.243105516382732e-06, "loss": 1.0169889450073242, "memory(GiB)": 74.63, "step": 1880, "token_acc": 0.6479912544411042, "train_speed(iter/s)": 0.59628 }, { "epoch": 1.3940121973757162, "grad_norm": 1.0796012032362492, "learning_rate": 4.1957032692188685e-06, "loss": 1.0289284706115722, "memory(GiB)": 74.63, "step": 1885, "token_acc": 0.6304772536980184, "train_speed(iter/s)": 0.584845 }, { "epoch": 1.3977083718351506, "grad_norm": 0.9497813177866403, "learning_rate": 4.148496895140586e-06, "loss": 1.0058039665222167, "memory(GiB)": 74.63, "step": 1890, "token_acc": 0.6662360034453058, "train_speed(iter/s)": 0.572483 }, { "epoch": 1.401404546294585, "grad_norm": 0.9994791403674819, "learning_rate": 4.101487987184658e-06, "loss": 1.0271056175231934, "memory(GiB)": 74.63, "step": 1895, "token_acc": 0.7174721189591078, "train_speed(iter/s)": 0.559822 }, { "epoch": 1.4051007207540196, "grad_norm": 0.9675552310253457, "learning_rate": 4.054678131724128e-06, "loss": 1.0421775817871093, "memory(GiB)": 74.63, "step": 1900, "token_acc": 0.6403071017274472, "train_speed(iter/s)": 0.549398 }, { "epoch": 1.4051007207540196, "eval_loss": 0.6665124893188477, "eval_runtime": 92.5325, "eval_samples_per_second": 75.595, "eval_steps_per_second": 0.594, "eval_token_acc": 0.6305912175031224, "step": 1900 }, { "epoch": 1.4087968952134542, "grad_norm": 0.9383388424277262, "learning_rate": 4.008068908414764e-06, "loss": 1.0390195846557617, "memory(GiB)": 74.63, "step": 1905, "token_acc": 0.636108220603538, "train_speed(iter/s)": 0.522161 }, { "epoch": 1.4124930696728886, "grad_norm": 1.0404355020365603, "learning_rate": 3.961661890141756e-06, "loss": 1.064806842803955, "memory(GiB)": 74.63, "step": 1910, "token_acc": 0.5955159705159705, "train_speed(iter/s)": 0.512861 }, { "epoch": 1.416189244132323, "grad_norm": 1.1641858092814779, "learning_rate": 3.91545864296665e-06, "loss": 1.0407491683959962, "memory(GiB)": 74.63, "step": 1915, "token_acc": 0.6579710144927536, "train_speed(iter/s)": 0.502749 }, { "epoch": 1.4198854185917575, "grad_norm": 0.9981716234289997, "learning_rate": 3.8694607260744745e-06, "loss": 1.0334474563598632, "memory(GiB)": 74.63, "step": 1920, "token_acc": 0.6448382126348228, "train_speed(iter/s)": 0.494436 }, { "epoch": 1.4235815930511921, "grad_norm": 1.0999406567886463, "learning_rate": 3.8236696917211365e-06, "loss": 1.0606246948242188, "memory(GiB)": 74.63, "step": 1925, "token_acc": 0.6300940438871473, "train_speed(iter/s)": 0.48651 }, { "epoch": 1.4272777675106265, "grad_norm": 1.0161660727647654, "learning_rate": 3.7780870851810515e-06, "loss": 1.076219654083252, "memory(GiB)": 74.63, "step": 1930, "token_acc": 0.6260296540362438, "train_speed(iter/s)": 0.477741 }, { "epoch": 1.430973941970061, "grad_norm": 0.9703902428924409, "learning_rate": 3.7327144446949716e-06, "loss": 1.0812992095947265, "memory(GiB)": 74.63, "step": 1935, "token_acc": 0.630064591896653, "train_speed(iter/s)": 0.470034 }, { "epoch": 1.4346701164294955, "grad_norm": 1.0947535810008933, "learning_rate": 3.687553301418092e-06, "loss": 1.0244592666625976, "memory(GiB)": 74.63, "step": 1940, "token_acc": 0.6301992310380986, "train_speed(iter/s)": 0.463221 }, { "epoch": 1.43836629088893, "grad_norm": 1.0200917528662774, "learning_rate": 3.6426051793683724e-06, "loss": 1.0360092163085937, "memory(GiB)": 74.63, "step": 1945, "token_acc": 0.6446078431372549, "train_speed(iter/s)": 0.45531 }, { "epoch": 1.4420624653483645, "grad_norm": 0.9670618590123606, "learning_rate": 3.5978715953751207e-06, "loss": 1.0297866821289063, "memory(GiB)": 74.63, "step": 1950, "token_acc": 0.6481696687972109, "train_speed(iter/s)": 0.448099 }, { "epoch": 1.4420624653483645, "eval_loss": 0.6662415862083435, "eval_runtime": 87.5872, "eval_samples_per_second": 79.863, "eval_steps_per_second": 0.628, "eval_token_acc": 0.6309225061582752, "step": 1950 }, { "epoch": 1.4457586398077988, "grad_norm": 0.9880600888670725, "learning_rate": 3.5533540590277882e-06, "loss": 1.0223572731018067, "memory(GiB)": 74.63, "step": 1955, "token_acc": 0.6359920144500428, "train_speed(iter/s)": 0.430514 }, { "epoch": 1.4494548142672334, "grad_norm": 0.9593918073057777, "learning_rate": 3.509054072625031e-06, "loss": 1.0360115051269532, "memory(GiB)": 74.63, "step": 1960, "token_acc": 0.6581899775617053, "train_speed(iter/s)": 0.424799 }, { "epoch": 1.453150988726668, "grad_norm": 1.0289280788083641, "learning_rate": 3.4649731311240276e-06, "loss": 1.0378742218017578, "memory(GiB)": 74.63, "step": 1965, "token_acc": 0.6424075531077892, "train_speed(iter/s)": 0.418454 }, { "epoch": 1.4568471631861024, "grad_norm": 1.053788067984888, "learning_rate": 3.4211127220900107e-06, "loss": 1.0713199615478515, "memory(GiB)": 74.63, "step": 1970, "token_acc": 0.632213608957795, "train_speed(iter/s)": 0.412536 }, { "epoch": 1.4605433376455368, "grad_norm": 1.180153902117692, "learning_rate": 3.377474325646074e-06, "loss": 1.0560644149780274, "memory(GiB)": 74.63, "step": 1975, "token_acc": 0.641423703142749, "train_speed(iter/s)": 0.407398 }, { "epoch": 1.4642395121049714, "grad_norm": 0.8918348376917337, "learning_rate": 3.334059414423233e-06, "loss": 1.055532169342041, "memory(GiB)": 74.63, "step": 1980, "token_acc": 0.668722786647315, "train_speed(iter/s)": 0.401897 }, { "epoch": 1.4679356865644058, "grad_norm": 1.109026709845534, "learning_rate": 3.2908694535107144e-06, "loss": 1.027819538116455, "memory(GiB)": 74.63, "step": 1985, "token_acc": 0.661387220098307, "train_speed(iter/s)": 0.396281 }, { "epoch": 1.4716318610238404, "grad_norm": 1.0886246897973584, "learning_rate": 3.247905900406523e-06, "loss": 1.0191631317138672, "memory(GiB)": 74.63, "step": 1990, "token_acc": 0.6097883597883598, "train_speed(iter/s)": 0.391566 }, { "epoch": 1.4753280354832747, "grad_norm": 1.0630977460263966, "learning_rate": 3.2051702049682554e-06, "loss": 1.042071533203125, "memory(GiB)": 74.63, "step": 1995, "token_acc": 0.6236017897091722, "train_speed(iter/s)": 0.386682 }, { "epoch": 1.4790242099427093, "grad_norm": 1.1953007407214893, "learning_rate": 3.162663809364178e-06, "loss": 1.0401007652282714, "memory(GiB)": 74.63, "step": 2000, "token_acc": 0.6173344235486509, "train_speed(iter/s)": 0.381535 }, { "epoch": 1.4790242099427093, "eval_loss": 0.6649311184883118, "eval_runtime": 83.4819, "eval_samples_per_second": 83.791, "eval_steps_per_second": 0.659, "eval_token_acc": 0.63089018531387, "step": 2000 }, { "epoch": 1.4827203844021437, "grad_norm": 1.0030060203161786, "learning_rate": 3.120388148024548e-06, "loss": 1.0528248786926269, "memory(GiB)": 74.63, "step": 2005, "token_acc": 0.6302038823098522, "train_speed(iter/s)": 0.368939 }, { "epoch": 1.4864165588615783, "grad_norm": 1.1306385027348749, "learning_rate": 3.0783446475932145e-06, "loss": 1.061046028137207, "memory(GiB)": 74.63, "step": 2010, "token_acc": 0.6473043478260869, "train_speed(iter/s)": 0.364909 }, { "epoch": 1.4901127333210127, "grad_norm": 1.0935000761259253, "learning_rate": 3.036534726879473e-06, "loss": 1.0255512237548827, "memory(GiB)": 74.63, "step": 2015, "token_acc": 0.65625, "train_speed(iter/s)": 0.360903 }, { "epoch": 1.4938089077804473, "grad_norm": 1.088331528861988, "learning_rate": 2.9949597968101883e-06, "loss": 1.0589797973632813, "memory(GiB)": 74.63, "step": 2020, "token_acc": 0.6325940212150434, "train_speed(iter/s)": 0.356624 }, { "epoch": 1.4975050822398817, "grad_norm": 1.0677052287012947, "learning_rate": 2.953621260382171e-06, "loss": 1.0519143104553224, "memory(GiB)": 74.63, "step": 2025, "token_acc": 0.6626557799742158, "train_speed(iter/s)": 0.352723 }, { "epoch": 1.5012012566993163, "grad_norm": 0.9383180241618552, "learning_rate": 2.9125205126148535e-06, "loss": 1.031491470336914, "memory(GiB)": 74.63, "step": 2030, "token_acc": 0.6123364485981309, "train_speed(iter/s)": 0.349069 }, { "epoch": 1.5048974311587506, "grad_norm": 1.0487719308291952, "learning_rate": 2.871658940503188e-06, "loss": 1.024942398071289, "memory(GiB)": 74.63, "step": 2035, "token_acc": 0.6477366255144033, "train_speed(iter/s)": 0.345173 }, { "epoch": 1.5085936056181852, "grad_norm": 1.0789502013849968, "learning_rate": 2.831037922970855e-06, "loss": 1.0276554107666016, "memory(GiB)": 74.63, "step": 2040, "token_acc": 0.6695604991861096, "train_speed(iter/s)": 0.341604 }, { "epoch": 1.5122897800776198, "grad_norm": 1.0851618366990563, "learning_rate": 2.7906588308237228e-06, "loss": 1.027616596221924, "memory(GiB)": 74.63, "step": 2045, "token_acc": 0.7097625329815304, "train_speed(iter/s)": 0.338222 }, { "epoch": 1.5159859545370542, "grad_norm": 0.9179924796471817, "learning_rate": 2.7505230267036032e-06, "loss": 1.0497385025024415, "memory(GiB)": 74.63, "step": 2050, "token_acc": 0.5937649880095923, "train_speed(iter/s)": 0.334489 }, { "epoch": 1.5159859545370542, "eval_loss": 0.6642535328865051, "eval_runtime": 85.9904, "eval_samples_per_second": 81.346, "eval_steps_per_second": 0.64, "eval_token_acc": 0.6313195908181098, "step": 2050 }, { "epoch": 1.5196821289964886, "grad_norm": 1.0681296921147372, "learning_rate": 2.7106318650422447e-06, "loss": 1.0099181175231933, "memory(GiB)": 74.63, "step": 2055, "token_acc": 0.6372694090953931, "train_speed(iter/s)": 0.325208 }, { "epoch": 1.5233783034559232, "grad_norm": 1.1164983354073834, "learning_rate": 2.6709866920156434e-06, "loss": 1.0027360916137695, "memory(GiB)": 74.63, "step": 2060, "token_acc": 0.631484794275492, "train_speed(iter/s)": 0.321919 }, { "epoch": 1.5270744779153576, "grad_norm": 0.9417253538259095, "learning_rate": 2.6315888454986017e-06, "loss": 1.0374462127685546, "memory(GiB)": 74.63, "step": 2065, "token_acc": 0.6671586715867158, "train_speed(iter/s)": 0.319024 }, { "epoch": 1.530770652374792, "grad_norm": 1.1095932914113171, "learning_rate": 2.5924396550195986e-06, "loss": 1.03175687789917, "memory(GiB)": 74.63, "step": 2070, "token_acc": 0.6316007454959619, "train_speed(iter/s)": 0.315819 }, { "epoch": 1.5344668268342265, "grad_norm": 1.0582702932147185, "learning_rate": 2.5535404417159002e-06, "loss": 1.0430908203125, "memory(GiB)": 74.63, "step": 2075, "token_acc": 0.6477673325499412, "train_speed(iter/s)": 0.312805 }, { "epoch": 1.5381630012936611, "grad_norm": 1.0515415830247143, "learning_rate": 2.514892518288988e-06, "loss": 1.0108471870422364, "memory(GiB)": 74.63, "step": 2080, "token_acc": 0.6291390728476821, "train_speed(iter/s)": 0.310115 }, { "epoch": 1.5418591757530955, "grad_norm": 1.018793664843126, "learning_rate": 2.4764971889602705e-06, "loss": 1.0460142135620116, "memory(GiB)": 74.63, "step": 2085, "token_acc": 0.6321537789427698, "train_speed(iter/s)": 0.307239 }, { "epoch": 1.5455553502125299, "grad_norm": 1.0684231311720556, "learning_rate": 2.4383557494270483e-06, "loss": 1.03402099609375, "memory(GiB)": 74.63, "step": 2090, "token_acc": 0.6098130841121495, "train_speed(iter/s)": 0.304401 }, { "epoch": 1.5492515246719645, "grad_norm": 1.1947182692900764, "learning_rate": 2.400469486818803e-06, "loss": 1.0426679611206056, "memory(GiB)": 74.63, "step": 2095, "token_acc": 0.6819553409776705, "train_speed(iter/s)": 0.301883 }, { "epoch": 1.552947699131399, "grad_norm": 1.1961503070894741, "learning_rate": 2.3628396796537588e-06, "loss": 1.0395529747009278, "memory(GiB)": 74.63, "step": 2100, "token_acc": 0.6641014033499321, "train_speed(iter/s)": 0.299223 }, { "epoch": 1.552947699131399, "eval_loss": 0.6638895273208618, "eval_runtime": 88.4322, "eval_samples_per_second": 79.1, "eval_steps_per_second": 0.622, "eval_token_acc": 0.6315920093638103, "step": 2100 }, { "epoch": 1.5566438735908334, "grad_norm": 1.0207020501497954, "learning_rate": 2.325467597795745e-06, "loss": 1.0622333526611327, "memory(GiB)": 74.63, "step": 2105, "token_acc": 0.638814317673378, "train_speed(iter/s)": 0.291998 }, { "epoch": 1.5603400480502678, "grad_norm": 1.1172734543264464, "learning_rate": 2.2883545024113263e-06, "loss": 1.0403221130371094, "memory(GiB)": 74.63, "step": 2110, "token_acc": 0.6622971285892634, "train_speed(iter/s)": 0.289437 }, { "epoch": 1.5640362225097024, "grad_norm": 1.0571335154576122, "learning_rate": 2.251501645927253e-06, "loss": 1.0463993072509765, "memory(GiB)": 74.63, "step": 2115, "token_acc": 0.636986301369863, "train_speed(iter/s)": 0.28714 }, { "epoch": 1.567732396969137, "grad_norm": 0.9556270442029375, "learning_rate": 2.2149102719882044e-06, "loss": 1.0251903533935547, "memory(GiB)": 74.63, "step": 2120, "token_acc": 0.647495361781076, "train_speed(iter/s)": 0.284896 }, { "epoch": 1.5714285714285714, "grad_norm": 1.041918735454562, "learning_rate": 2.178581615414802e-06, "loss": 1.0483660697937012, "memory(GiB)": 74.63, "step": 2125, "token_acc": 0.5842217484008528, "train_speed(iter/s)": 0.282449 }, { "epoch": 1.5751247458880058, "grad_norm": 1.0827410972952385, "learning_rate": 2.1425169021619518e-06, "loss": 1.0664111137390138, "memory(GiB)": 74.63, "step": 2130, "token_acc": 0.6472742066720911, "train_speed(iter/s)": 0.280246 }, { "epoch": 1.5788209203474404, "grad_norm": 1.0343519334837508, "learning_rate": 2.106717349277475e-06, "loss": 1.0448074340820312, "memory(GiB)": 74.63, "step": 2135, "token_acc": 0.6223404255319149, "train_speed(iter/s)": 0.278222 }, { "epoch": 1.582517094806875, "grad_norm": 0.9536359374215565, "learning_rate": 2.0711841648610254e-06, "loss": 1.0621306419372558, "memory(GiB)": 74.63, "step": 2140, "token_acc": 0.6342119419042496, "train_speed(iter/s)": 0.276006 }, { "epoch": 1.5862132692663093, "grad_norm": 1.072455338512947, "learning_rate": 2.03591854802333e-06, "loss": 1.0556835174560546, "memory(GiB)": 74.63, "step": 2145, "token_acc": 0.7222898903775883, "train_speed(iter/s)": 0.27386 }, { "epoch": 1.5899094437257437, "grad_norm": 1.0210760479008887, "learning_rate": 2.0009216888457206e-06, "loss": 1.0253107070922851, "memory(GiB)": 74.63, "step": 2150, "token_acc": 0.6356216994719155, "train_speed(iter/s)": 0.271885 }, { "epoch": 1.5899094437257437, "eval_loss": 0.6611568927764893, "eval_runtime": 89.4271, "eval_samples_per_second": 78.22, "eval_steps_per_second": 0.615, "eval_token_acc": 0.6316070154701413, "step": 2150 }, { "epoch": 1.5936056181851783, "grad_norm": 1.062555004377403, "learning_rate": 1.966194768339974e-06, "loss": 1.049751091003418, "memory(GiB)": 74.63, "step": 2155, "token_acc": 0.6423422284052106, "train_speed(iter/s)": 0.266286 }, { "epoch": 1.597301792644613, "grad_norm": 0.8629319967495109, "learning_rate": 1.931738958408457e-06, "loss": 1.0435371398925781, "memory(GiB)": 74.63, "step": 2160, "token_acc": 0.6290619251992643, "train_speed(iter/s)": 0.264247 }, { "epoch": 1.6009979671040473, "grad_norm": 0.955539932162413, "learning_rate": 1.8975554218045733e-06, "loss": 1.0308834075927735, "memory(GiB)": 74.63, "step": 2165, "token_acc": 0.6610537751222162, "train_speed(iter/s)": 0.262351 }, { "epoch": 1.6046941415634817, "grad_norm": 0.9624917837200193, "learning_rate": 1.8636453120935428e-06, "loss": 1.0461854934692383, "memory(GiB)": 74.63, "step": 2170, "token_acc": 0.7152838427947599, "train_speed(iter/s)": 0.260619 }, { "epoch": 1.6083903160229163, "grad_norm": 1.1677655720128766, "learning_rate": 1.8300097736134482e-06, "loss": 1.0363172531127929, "memory(GiB)": 74.63, "step": 2175, "token_acc": 0.6848798869524259, "train_speed(iter/s)": 0.258828 }, { "epoch": 1.6120864904823509, "grad_norm": 1.060280622494465, "learning_rate": 1.796649941436638e-06, "loss": 1.0246556282043457, "memory(GiB)": 74.63, "step": 2180, "token_acc": 0.6469820554649266, "train_speed(iter/s)": 0.256928 }, { "epoch": 1.6157826649417852, "grad_norm": 0.9704555618707196, "learning_rate": 1.7635669413314082e-06, "loss": 1.0577556610107421, "memory(GiB)": 74.63, "step": 2185, "token_acc": 0.698159509202454, "train_speed(iter/s)": 0.255252 }, { "epoch": 1.6194788394012196, "grad_norm": 0.9786880620172256, "learning_rate": 1.7307618897240274e-06, "loss": 1.0526361465454102, "memory(GiB)": 74.63, "step": 2190, "token_acc": 0.6385869565217391, "train_speed(iter/s)": 0.253488 }, { "epoch": 1.6231750138606542, "grad_norm": 0.9744613679129237, "learning_rate": 1.6982358936610454e-06, "loss": 1.075265598297119, "memory(GiB)": 74.63, "step": 2195, "token_acc": 0.6133072407045009, "train_speed(iter/s)": 0.251735 }, { "epoch": 1.6268711883200888, "grad_norm": 1.0120755932892964, "learning_rate": 1.6659900507719406e-06, "loss": 1.064041519165039, "memory(GiB)": 74.63, "step": 2200, "token_acc": 0.648406731113498, "train_speed(iter/s)": 0.250141 }, { "epoch": 1.6268711883200888, "eval_loss": 0.6599572896957397, "eval_runtime": 90.9305, "eval_samples_per_second": 76.927, "eval_steps_per_second": 0.605, "eval_token_acc": 0.6317178297938161, "step": 2200 }, { "epoch": 1.6305673627795232, "grad_norm": 1.0770018414163383, "learning_rate": 1.6340254492320873e-06, "loss": 1.0508115768432618, "memory(GiB)": 74.63, "step": 2205, "token_acc": 0.6418808091853472, "train_speed(iter/s)": 0.245446 }, { "epoch": 1.6342635372389576, "grad_norm": 1.0511841490808227, "learning_rate": 1.6023431677260215e-06, "loss": 1.0454177856445312, "memory(GiB)": 74.63, "step": 2210, "token_acc": 0.6532779316712835, "train_speed(iter/s)": 0.243859 }, { "epoch": 1.6379597116983922, "grad_norm": 0.9098679876928407, "learning_rate": 1.570944275411046e-06, "loss": 1.0668581962585448, "memory(GiB)": 74.63, "step": 2215, "token_acc": 0.6121688741721855, "train_speed(iter/s)": 0.242235 }, { "epoch": 1.6416558861578268, "grad_norm": 1.0127053695015762, "learning_rate": 1.5398298318811467e-06, "loss": 1.0175441741943358, "memory(GiB)": 74.63, "step": 2220, "token_acc": 0.6991780821917808, "train_speed(iter/s)": 0.240782 }, { "epoch": 1.6453520606172611, "grad_norm": 1.1031573706590774, "learning_rate": 1.5090008871312433e-06, "loss": 1.0165956497192383, "memory(GiB)": 74.63, "step": 2225, "token_acc": 0.6685121107266436, "train_speed(iter/s)": 0.23932 }, { "epoch": 1.6490482350766955, "grad_norm": 1.0502782153731651, "learning_rate": 1.4784584815217452e-06, "loss": 1.0456388473510743, "memory(GiB)": 74.63, "step": 2230, "token_acc": 0.6672802577082375, "train_speed(iter/s)": 0.237824 }, { "epoch": 1.65274440953613, "grad_norm": 1.003637672944472, "learning_rate": 1.448203645743449e-06, "loss": 1.0287794113159179, "memory(GiB)": 74.63, "step": 2235, "token_acc": 0.6663223140495868, "train_speed(iter/s)": 0.236377 }, { "epoch": 1.6564405839955647, "grad_norm": 1.037599542215698, "learning_rate": 1.4182374007827605e-06, "loss": 1.0127573013305664, "memory(GiB)": 74.63, "step": 2240, "token_acc": 0.6325656132833423, "train_speed(iter/s)": 0.235012 }, { "epoch": 1.660136758454999, "grad_norm": 0.9940434532315588, "learning_rate": 1.3885607578872295e-06, "loss": 1.0367406845092773, "memory(GiB)": 74.63, "step": 2245, "token_acc": 0.6187350835322196, "train_speed(iter/s)": 0.233574 }, { "epoch": 1.6638329329144335, "grad_norm": 0.9200899712617193, "learning_rate": 1.3591747185314342e-06, "loss": 1.0550609588623048, "memory(GiB)": 74.63, "step": 2250, "token_acc": 0.6650768415474297, "train_speed(iter/s)": 0.232175 }, { "epoch": 1.6638329329144335, "eval_loss": 0.6586793661117554, "eval_runtime": 87.5544, "eval_samples_per_second": 79.893, "eval_steps_per_second": 0.628, "eval_token_acc": 0.6320167976045638, "step": 2250 }, { "epoch": 1.667529107373868, "grad_norm": 1.0426265291319847, "learning_rate": 1.3300802743831786e-06, "loss": 1.0567312240600586, "memory(GiB)": 74.63, "step": 2255, "token_acc": 0.6444471182769823, "train_speed(iter/s)": 0.228232 }, { "epoch": 1.6712252818333027, "grad_norm": 1.1036330750940702, "learning_rate": 1.3012784072700335e-06, "loss": 1.0163141250610352, "memory(GiB)": 74.63, "step": 2260, "token_acc": 0.6361031518624641, "train_speed(iter/s)": 0.226993 }, { "epoch": 1.674921456292737, "grad_norm": 1.034827646235815, "learning_rate": 1.272770089146199e-06, "loss": 1.042106819152832, "memory(GiB)": 74.63, "step": 2265, "token_acc": 0.6615910503418272, "train_speed(iter/s)": 0.225676 }, { "epoch": 1.6786176307521714, "grad_norm": 0.9379338873318531, "learning_rate": 1.2445562820597035e-06, "loss": 1.056378173828125, "memory(GiB)": 74.63, "step": 2270, "token_acc": 0.6658767772511849, "train_speed(iter/s)": 0.22441 }, { "epoch": 1.682313805211606, "grad_norm": 1.018955540383726, "learning_rate": 1.2166379381199423e-06, "loss": 1.024850082397461, "memory(GiB)": 74.63, "step": 2275, "token_acc": 0.6339022954679223, "train_speed(iter/s)": 0.223236 }, { "epoch": 1.6860099796710406, "grad_norm": 0.9387152975257087, "learning_rate": 1.1890159994655425e-06, "loss": 1.0364057540893554, "memory(GiB)": 74.63, "step": 2280, "token_acc": 0.6378887070376432, "train_speed(iter/s)": 0.221993 }, { "epoch": 1.689706154130475, "grad_norm": 0.9517285751951058, "learning_rate": 1.1616913982325827e-06, "loss": 1.0173322677612304, "memory(GiB)": 74.63, "step": 2285, "token_acc": 0.63408913213448, "train_speed(iter/s)": 0.220748 }, { "epoch": 1.6934023285899094, "grad_norm": 1.1148106388917103, "learning_rate": 1.1346650565231165e-06, "loss": 1.0427886962890625, "memory(GiB)": 74.63, "step": 2290, "token_acc": 0.640251572327044, "train_speed(iter/s)": 0.219605 }, { "epoch": 1.697098503049344, "grad_norm": 1.1256757463038873, "learning_rate": 1.1079378863740686e-06, "loss": 1.0264497756958009, "memory(GiB)": 74.63, "step": 2295, "token_acc": 0.6556603773584906, "train_speed(iter/s)": 0.21844 }, { "epoch": 1.7007946775087786, "grad_norm": 1.0466875757106615, "learning_rate": 1.0815107897264555e-06, "loss": 1.0546932220458984, "memory(GiB)": 74.63, "step": 2300, "token_acc": 0.6179956896551724, "train_speed(iter/s)": 0.217293 }, { "epoch": 1.7007946775087786, "eval_loss": 0.6585622429847717, "eval_runtime": 86.3947, "eval_samples_per_second": 80.966, "eval_steps_per_second": 0.637, "eval_token_acc": 0.6323053765724668, "step": 2300 }, { "epoch": 1.704490851968213, "grad_norm": 0.9844022346926911, "learning_rate": 1.0553846583949424e-06, "loss": 1.0470151901245117, "memory(GiB)": 74.63, "step": 2305, "token_acc": 0.638003355704698, "train_speed(iter/s)": 0.213982 }, { "epoch": 1.7081870264276473, "grad_norm": 1.002842594215214, "learning_rate": 1.0295603740377591e-06, "loss": 1.0518400192260742, "memory(GiB)": 74.63, "step": 2310, "token_acc": 0.6883333333333334, "train_speed(iter/s)": 0.212941 }, { "epoch": 1.711883200887082, "grad_norm": 0.996843923558558, "learning_rate": 1.0040388081269336e-06, "loss": 1.028696632385254, "memory(GiB)": 74.63, "step": 2315, "token_acc": 0.6513243595310465, "train_speed(iter/s)": 0.211922 }, { "epoch": 1.7155793753465165, "grad_norm": 1.0304373058907095, "learning_rate": 9.788208219188932e-07, "loss": 1.0363618850708007, "memory(GiB)": 74.63, "step": 2320, "token_acc": 0.6015075376884422, "train_speed(iter/s)": 0.210816 }, { "epoch": 1.7192755498059509, "grad_norm": 1.0716438374724575, "learning_rate": 9.539072664254e-07, "loss": 1.065016269683838, "memory(GiB)": 74.63, "step": 2325, "token_acc": 0.6122448979591837, "train_speed(iter/s)": 0.20983 }, { "epoch": 1.7229717242653853, "grad_norm": 1.044849619522368, "learning_rate": 9.292989823848242e-07, "loss": 1.0461166381835938, "memory(GiB)": 74.63, "step": 2330, "token_acc": 0.6681818181818182, "train_speed(iter/s)": 0.208847 }, { "epoch": 1.7266678987248198, "grad_norm": 0.9749773034536726, "learning_rate": 9.049968002337805e-07, "loss": 1.0064781188964844, "memory(GiB)": 74.63, "step": 2335, "token_acc": 0.6454869358669834, "train_speed(iter/s)": 0.207824 }, { "epoch": 1.7303640731842544, "grad_norm": 1.0478755901703891, "learning_rate": 8.810015400790994e-07, "loss": 1.0341422080993652, "memory(GiB)": 74.63, "step": 2340, "token_acc": 0.6453608247422681, "train_speed(iter/s)": 0.20687 }, { "epoch": 1.7340602476436888, "grad_norm": 1.161076200769703, "learning_rate": 8.573140116701573e-07, "loss": 1.031747531890869, "memory(GiB)": 74.63, "step": 2345, "token_acc": 0.633889077917659, "train_speed(iter/s)": 0.205935 }, { "epoch": 1.7377564221031232, "grad_norm": 1.0465828745420171, "learning_rate": 8.339350143715452e-07, "loss": 1.026121234893799, "memory(GiB)": 74.63, "step": 2350, "token_acc": 0.6303341902313625, "train_speed(iter/s)": 0.204941 }, { "epoch": 1.7377564221031232, "eval_loss": 0.6579257845878601, "eval_runtime": 85.2135, "eval_samples_per_second": 82.088, "eval_steps_per_second": 0.645, "eval_token_acc": 0.6323030679407236, "step": 2350 }, { "epoch": 1.7414525965625578, "grad_norm": 0.9803139837060567, "learning_rate": 8.108653371360897e-07, "loss": 1.0249688148498535, "memory(GiB)": 74.63, "step": 2355, "token_acc": 0.6329644032306312, "train_speed(iter/s)": 0.202068 }, { "epoch": 1.7451487710219924, "grad_norm": 1.0773968420983469, "learning_rate": 7.881057584782448e-07, "loss": 1.014153003692627, "memory(GiB)": 74.63, "step": 2360, "token_acc": 0.6533575317604355, "train_speed(iter/s)": 0.201155 }, { "epoch": 1.7488449454814268, "grad_norm": 1.0060807449724751, "learning_rate": 7.656570464477997e-07, "loss": 1.041685199737549, "memory(GiB)": 74.63, "step": 2365, "token_acc": 0.6260771824653428, "train_speed(iter/s)": 0.20029 }, { "epoch": 1.7525411199408611, "grad_norm": 0.9990872446739557, "learning_rate": 7.435199586039721e-07, "loss": 1.025881576538086, "memory(GiB)": 74.63, "step": 2370, "token_acc": 0.6330558125192722, "train_speed(iter/s)": 0.199385 }, { "epoch": 1.7562372944002957, "grad_norm": 1.0713164560199164, "learning_rate": 7.216952419898393e-07, "loss": 1.0439919471740722, "memory(GiB)": 74.63, "step": 2375, "token_acc": 0.6618962432915921, "train_speed(iter/s)": 0.198497 }, { "epoch": 1.7599334688597303, "grad_norm": 1.0964714966010252, "learning_rate": 7.001836331071365e-07, "loss": 1.0411014556884766, "memory(GiB)": 74.63, "step": 2380, "token_acc": 0.6824512534818942, "train_speed(iter/s)": 0.197623 }, { "epoch": 1.7636296433191647, "grad_norm": 0.9737095253362634, "learning_rate": 6.789858578913877e-07, "loss": 1.0455976486206056, "memory(GiB)": 74.63, "step": 2385, "token_acc": 0.6538119252447345, "train_speed(iter/s)": 0.196798 }, { "epoch": 1.767325817778599, "grad_norm": 1.0585968573237603, "learning_rate": 6.581026316874184e-07, "loss": 1.0437522888183595, "memory(GiB)": 74.63, "step": 2390, "token_acc": 0.6448377581120944, "train_speed(iter/s)": 0.195944 }, { "epoch": 1.7710219922380337, "grad_norm": 0.9930747477126893, "learning_rate": 6.375346592252174e-07, "loss": 1.035786247253418, "memory(GiB)": 74.63, "step": 2395, "token_acc": 0.6269207129686539, "train_speed(iter/s)": 0.195132 }, { "epoch": 1.774718166697468, "grad_norm": 0.9303672570135261, "learning_rate": 6.17282634596148e-07, "loss": 1.0481432914733886, "memory(GiB)": 74.63, "step": 2400, "token_acc": 0.6504672897196262, "train_speed(iter/s)": 0.194323 }, { "epoch": 1.774718166697468, "eval_loss": 0.6574872136116028, "eval_runtime": 88.3048, "eval_samples_per_second": 79.214, "eval_steps_per_second": 0.623, "eval_token_acc": 0.6324069563691687, "step": 2400 }, { "epoch": 1.7784143411569024, "grad_norm": 1.0622942862025062, "learning_rate": 5.973472412295256e-07, "loss": 1.019943618774414, "memory(GiB)": 74.63, "step": 2405, "token_acc": 0.631666271628348, "train_speed(iter/s)": 0.191801 }, { "epoch": 1.782110515616337, "grad_norm": 1.0141126472853548, "learning_rate": 5.777291518695593e-07, "loss": 1.0454243659973144, "memory(GiB)": 74.63, "step": 2410, "token_acc": 0.6077097505668935, "train_speed(iter/s)": 0.191007 }, { "epoch": 1.7858066900757716, "grad_norm": 1.0733746716133248, "learning_rate": 5.584290285526473e-07, "loss": 1.036181640625, "memory(GiB)": 74.63, "step": 2415, "token_acc": 0.671865626874625, "train_speed(iter/s)": 0.190213 }, { "epoch": 1.789502864535206, "grad_norm": 1.011543008247962, "learning_rate": 5.394475225850338e-07, "loss": 1.0618670463562012, "memory(GiB)": 74.63, "step": 2420, "token_acc": 0.6783405172413793, "train_speed(iter/s)": 0.189455 }, { "epoch": 1.7931990389946404, "grad_norm": 0.9605401301883022, "learning_rate": 5.207852745208298e-07, "loss": 0.9933710098266602, "memory(GiB)": 74.63, "step": 2425, "token_acc": 0.6471641791044777, "train_speed(iter/s)": 0.188704 }, { "epoch": 1.796895213454075, "grad_norm": 1.1008101055992277, "learning_rate": 5.024429141404019e-07, "loss": 0.999241828918457, "memory(GiB)": 74.63, "step": 2430, "token_acc": 0.6457304163726182, "train_speed(iter/s)": 0.187948 }, { "epoch": 1.8005913879135096, "grad_norm": 0.935629646034127, "learning_rate": 4.844210604291155e-07, "loss": 1.018147087097168, "memory(GiB)": 74.63, "step": 2435, "token_acc": 0.6178369652945924, "train_speed(iter/s)": 0.187233 }, { "epoch": 1.804287562372944, "grad_norm": 0.9808937018928983, "learning_rate": 4.667203215564431e-07, "loss": 1.0448846817016602, "memory(GiB)": 74.63, "step": 2440, "token_acc": 0.6323092170465807, "train_speed(iter/s)": 0.186484 }, { "epoch": 1.8079837368323783, "grad_norm": 1.0392559529080805, "learning_rate": 4.493412948554454e-07, "loss": 1.0690251350402833, "memory(GiB)": 74.63, "step": 2445, "token_acc": 0.6409416581371545, "train_speed(iter/s)": 0.185763 }, { "epoch": 1.811679911291813, "grad_norm": 1.0661940200914148, "learning_rate": 4.3228456680261877e-07, "loss": 1.0110756874084472, "memory(GiB)": 74.63, "step": 2450, "token_acc": 0.649331352154532, "train_speed(iter/s)": 0.185079 }, { "epoch": 1.811679911291813, "eval_loss": 0.6571330428123474, "eval_runtime": 89.3337, "eval_samples_per_second": 78.302, "eval_steps_per_second": 0.616, "eval_token_acc": 0.6325177706928434, "step": 2450 }, { "epoch": 1.8153760857512475, "grad_norm": 0.9674903681690532, "learning_rate": 4.155507129980907e-07, "loss": 1.0614801406860352, "memory(GiB)": 74.63, "step": 2455, "token_acc": 0.6478157805621402, "train_speed(iter/s)": 0.182858 }, { "epoch": 1.819072260210682, "grad_norm": 1.023357784734463, "learning_rate": 3.991402981462045e-07, "loss": 1.0087343215942384, "memory(GiB)": 74.63, "step": 2460, "token_acc": 0.6711140760507005, "train_speed(iter/s)": 0.182142 }, { "epoch": 1.8227684346701163, "grad_norm": 1.1735912820708456, "learning_rate": 3.8305387603646324e-07, "loss": 1.0243083953857421, "memory(GiB)": 74.63, "step": 2465, "token_acc": 0.6599799398194583, "train_speed(iter/s)": 0.181445 }, { "epoch": 1.8264646091295509, "grad_norm": 1.0193068857696008, "learning_rate": 3.6729198952483725e-07, "loss": 1.032374095916748, "memory(GiB)": 74.63, "step": 2470, "token_acc": 0.6700460829493088, "train_speed(iter/s)": 0.180793 }, { "epoch": 1.8301607835889855, "grad_norm": 0.989197160358902, "learning_rate": 3.5185517051544494e-07, "loss": 1.053987693786621, "memory(GiB)": 74.63, "step": 2475, "token_acc": 0.6859160781055256, "train_speed(iter/s)": 0.180141 }, { "epoch": 1.8338569580484199, "grad_norm": 1.0596386275791907, "learning_rate": 3.367439399426087e-07, "loss": 1.0508078575134276, "memory(GiB)": 74.63, "step": 2480, "token_acc": 0.6111356606274856, "train_speed(iter/s)": 0.179489 }, { "epoch": 1.8375531325078542, "grad_norm": 1.0148900997448214, "learning_rate": 3.219588077532687e-07, "loss": 1.0556805610656739, "memory(GiB)": 74.63, "step": 2485, "token_acc": 0.6928414901387875, "train_speed(iter/s)": 0.178863 }, { "epoch": 1.8412493069672888, "grad_norm": 0.9468756707473351, "learning_rate": 3.075002728897747e-07, "loss": 1.0154769897460938, "memory(GiB)": 74.63, "step": 2490, "token_acc": 0.6334152334152334, "train_speed(iter/s)": 0.178234 }, { "epoch": 1.8449454814267234, "grad_norm": 0.9178809513706729, "learning_rate": 2.933688232730536e-07, "loss": 1.0376591682434082, "memory(GiB)": 74.63, "step": 2495, "token_acc": 0.6742112482853223, "train_speed(iter/s)": 0.177603 }, { "epoch": 1.8486416558861578, "grad_norm": 1.0627891032300194, "learning_rate": 2.79564935786143e-07, "loss": 1.0138132095336914, "memory(GiB)": 74.63, "step": 2500, "token_acc": 0.6157316041725401, "train_speed(iter/s)": 0.176992 }, { "epoch": 1.8486416558861578, "eval_loss": 0.6568954586982727, "eval_runtime": 89.4508, "eval_samples_per_second": 78.199, "eval_steps_per_second": 0.615, "eval_token_acc": 0.632513153429357, "step": 2500 }, { "epoch": 1.8523378303455922, "grad_norm": 1.0872052595724289, "learning_rate": 2.660890762580903e-07, "loss": 1.0546483993530273, "memory(GiB)": 74.63, "step": 2505, "token_acc": 0.6424242424242425, "train_speed(iter/s)": 0.175004 }, { "epoch": 1.8560340048050268, "grad_norm": 1.1281209574136644, "learning_rate": 2.5294169944824254e-07, "loss": 1.0317713737487793, "memory(GiB)": 74.63, "step": 2510, "token_acc": 0.6293388429752066, "train_speed(iter/s)": 0.174416 }, { "epoch": 1.8597301792644614, "grad_norm": 0.8926816061055212, "learning_rate": 2.401232490308969e-07, "loss": 1.048653793334961, "memory(GiB)": 74.63, "step": 2515, "token_acc": 0.6237929702587872, "train_speed(iter/s)": 0.173811 }, { "epoch": 1.8634263537238958, "grad_norm": 1.0912285805001078, "learning_rate": 2.2763415758032316e-07, "loss": 1.0199008941650392, "memory(GiB)": 74.63, "step": 2520, "token_acc": 0.632258064516129, "train_speed(iter/s)": 0.173239 }, { "epoch": 1.8671225281833301, "grad_norm": 1.0989085317685814, "learning_rate": 2.1547484655617513e-07, "loss": 1.010093879699707, "memory(GiB)": 74.63, "step": 2525, "token_acc": 0.6342616920651603, "train_speed(iter/s)": 0.172675 }, { "epoch": 1.8708187026427647, "grad_norm": 1.0229802711909943, "learning_rate": 2.0364572628925993e-07, "loss": 1.0246079444885254, "memory(GiB)": 74.63, "step": 2530, "token_acc": 0.717948717948718, "train_speed(iter/s)": 0.172113 }, { "epoch": 1.8745148771021993, "grad_norm": 1.1101947156669076, "learning_rate": 1.921471959676957e-07, "loss": 1.0213122367858887, "memory(GiB)": 74.63, "step": 2535, "token_acc": 0.6377079482439926, "train_speed(iter/s)": 0.171534 }, { "epoch": 1.8782110515616337, "grad_norm": 0.972824509691789, "learning_rate": 1.809796436234379e-07, "loss": 1.0392621040344239, "memory(GiB)": 74.63, "step": 2540, "token_acc": 0.6089108910891089, "train_speed(iter/s)": 0.17099 }, { "epoch": 1.881907226021068, "grad_norm": 1.0893138267742302, "learning_rate": 1.7014344611918753e-07, "loss": 1.0224065780639648, "memory(GiB)": 74.63, "step": 2545, "token_acc": 0.628198149156233, "train_speed(iter/s)": 0.170427 }, { "epoch": 1.8856034004805027, "grad_norm": 1.1303784675436226, "learning_rate": 1.5963896913566923e-07, "loss": 1.0195607185363769, "memory(GiB)": 74.63, "step": 2550, "token_acc": 0.658051689860835, "train_speed(iter/s)": 0.169871 }, { "epoch": 1.8856034004805027, "eval_loss": 0.6567226648330688, "eval_runtime": 88.1394, "eval_samples_per_second": 79.363, "eval_steps_per_second": 0.624, "eval_token_acc": 0.6325870296451402, "step": 2550 }, { "epoch": 1.8892995749399373, "grad_norm": 1.09017259229996, "learning_rate": 1.494665671592943e-07, "loss": 1.0317469596862794, "memory(GiB)": 74.63, "step": 2555, "token_acc": 0.6337277475748854, "train_speed(iter/s)": 0.168117 }, { "epoch": 1.8929957493993717, "grad_norm": 0.9679050962600252, "learning_rate": 1.3962658347019819e-07, "loss": 1.0667352676391602, "memory(GiB)": 74.63, "step": 2560, "token_acc": 0.6295540658700087, "train_speed(iter/s)": 0.167582 }, { "epoch": 1.896691923858806, "grad_norm": 0.88490239893554, "learning_rate": 1.3011935013065303e-07, "loss": 1.0192485809326173, "memory(GiB)": 74.63, "step": 2565, "token_acc": 0.599483204134367, "train_speed(iter/s)": 0.16706 }, { "epoch": 1.9003880983182406, "grad_norm": 1.0244757899454908, "learning_rate": 1.2094518797386657e-07, "loss": 1.0162858963012695, "memory(GiB)": 74.63, "step": 2570, "token_acc": 0.6262672811059908, "train_speed(iter/s)": 0.166543 }, { "epoch": 1.9040842727776752, "grad_norm": 0.9237340665622228, "learning_rate": 1.121044065931498e-07, "loss": 1.0645517349243163, "memory(GiB)": 74.63, "step": 2575, "token_acc": 0.6675933280381255, "train_speed(iter/s)": 0.166012 }, { "epoch": 1.9077804472371096, "grad_norm": 0.9745219678731106, "learning_rate": 1.0359730433147308e-07, "loss": 1.0265457153320312, "memory(GiB)": 74.63, "step": 2580, "token_acc": 0.6550632911392406, "train_speed(iter/s)": 0.165515 }, { "epoch": 1.911476621696544, "grad_norm": 1.0007256420566137, "learning_rate": 9.542416827139855e-08, "loss": 1.0198524475097657, "memory(GiB)": 74.63, "step": 2585, "token_acc": 0.6085481682496607, "train_speed(iter/s)": 0.164991 }, { "epoch": 1.9151727961559786, "grad_norm": 0.9874298790271662, "learning_rate": 8.758527422538798e-08, "loss": 1.0276208877563477, "memory(GiB)": 74.63, "step": 2590, "token_acc": 0.6413793103448275, "train_speed(iter/s)": 0.164496 }, { "epoch": 1.9188689706154132, "grad_norm": 0.985598517098827, "learning_rate": 8.008088672650016e-08, "loss": 1.0311683654785155, "memory(GiB)": 74.63, "step": 2595, "token_acc": 0.6960919540229885, "train_speed(iter/s)": 0.164012 }, { "epoch": 1.9225651450748475, "grad_norm": 0.8074933176611375, "learning_rate": 7.291125901946027e-08, "loss": 1.0470510482788087, "memory(GiB)": 74.63, "step": 2600, "token_acc": 0.6391111111111111, "train_speed(iter/s)": 0.163535 }, { "epoch": 1.9225651450748475, "eval_loss": 0.6566023230552673, "eval_runtime": 88.9043, "eval_samples_per_second": 78.68, "eval_steps_per_second": 0.619, "eval_token_acc": 0.6326458997545924, "step": 2600 }, { "epoch": 1.926261319534282, "grad_norm": 1.0347205106897759, "learning_rate": 6.607663305211675e-08, "loss": 1.0246917724609375, "memory(GiB)": 74.63, "step": 2605, "token_acc": 0.6372442184283812, "train_speed(iter/s)": 0.161902 }, { "epoch": 1.9299574939937165, "grad_norm": 0.9451923481445313, "learning_rate": 5.957723946727445e-08, "loss": 1.030987548828125, "memory(GiB)": 74.63, "step": 2610, "token_acc": 0.655980271270037, "train_speed(iter/s)": 0.161436 }, { "epoch": 1.9336536684531511, "grad_norm": 0.989048068560612, "learning_rate": 5.341329759491087e-08, "loss": 1.043976402282715, "memory(GiB)": 74.63, "step": 2615, "token_acc": 0.6610073571024335, "train_speed(iter/s)": 0.160958 }, { "epoch": 1.9373498429125855, "grad_norm": 0.9059448258322844, "learning_rate": 4.758501544477767e-08, "loss": 1.03828706741333, "memory(GiB)": 74.63, "step": 2620, "token_acc": 0.663670766319773, "train_speed(iter/s)": 0.160484 }, { "epoch": 1.9410460173720199, "grad_norm": 1.0371951958694063, "learning_rate": 4.209258969937624e-08, "loss": 1.0256452560424805, "memory(GiB)": 74.63, "step": 2625, "token_acc": 0.6571687019448214, "train_speed(iter/s)": 0.160045 }, { "epoch": 1.9447421918314545, "grad_norm": 0.9579823005570719, "learning_rate": 3.6936205707325255e-08, "loss": 1.0316158294677735, "memory(GiB)": 74.63, "step": 2630, "token_acc": 0.6658135283363803, "train_speed(iter/s)": 0.159594 }, { "epoch": 1.948438366290889, "grad_norm": 1.185629004014561, "learning_rate": 3.2116037477103454e-08, "loss": 1.0686611175537108, "memory(GiB)": 74.63, "step": 2635, "token_acc": 0.6998087954110899, "train_speed(iter/s)": 0.159158 }, { "epoch": 1.9521345407503234, "grad_norm": 0.9906589709801633, "learning_rate": 2.763224767117767e-08, "loss": 0.9920598983764648, "memory(GiB)": 74.63, "step": 2640, "token_acc": 0.6588921282798834, "train_speed(iter/s)": 0.158729 }, { "epoch": 1.9558307152097578, "grad_norm": 0.9014323974805333, "learning_rate": 2.3484987600512767e-08, "loss": 1.0331963539123534, "memory(GiB)": 74.63, "step": 2645, "token_acc": 0.6714507370054306, "train_speed(iter/s)": 0.158272 }, { "epoch": 1.9595268896691924, "grad_norm": 0.9766018351933058, "learning_rate": 1.9674397219469064e-08, "loss": 1.037597370147705, "memory(GiB)": 74.63, "step": 2650, "token_acc": 0.6561371841155235, "train_speed(iter/s)": 0.157844 }, { "epoch": 1.9595268896691924, "eval_loss": 0.6565667390823364, "eval_runtime": 88.279, "eval_samples_per_second": 79.237, "eval_steps_per_second": 0.623, "eval_token_acc": 0.6325916469086267, "step": 2650 }, { "epoch": 1.963223064128627, "grad_norm": 1.0614662558544963, "learning_rate": 1.620060512107391e-08, "loss": 1.016525936126709, "memory(GiB)": 74.63, "step": 2655, "token_acc": 0.6412867391807452, "train_speed(iter/s)": 0.156391 }, { "epoch": 1.9669192385880614, "grad_norm": 1.234699645190091, "learning_rate": 1.3063728532686225e-08, "loss": 1.0382546424865722, "memory(GiB)": 74.63, "step": 2660, "token_acc": 0.628119293974437, "train_speed(iter/s)": 0.155979 }, { "epoch": 1.9706154130474958, "grad_norm": 1.1176674856308213, "learning_rate": 1.0263873312040818e-08, "loss": 1.0646825790405274, "memory(GiB)": 74.63, "step": 2665, "token_acc": 0.6521344232515894, "train_speed(iter/s)": 0.155534 }, { "epoch": 1.9743115875069304, "grad_norm": 0.9542666956735151, "learning_rate": 7.801133943672323e-09, "loss": 1.047515296936035, "memory(GiB)": 74.63, "step": 2670, "token_acc": 0.632, "train_speed(iter/s)": 0.15513 }, { "epoch": 1.978007761966365, "grad_norm": 0.966385972017561, "learning_rate": 5.675593535731106e-09, "loss": 1.0257146835327149, "memory(GiB)": 74.63, "step": 2675, "token_acc": 0.6467647058823529, "train_speed(iter/s)": 0.15474 }, { "epoch": 1.9817039364257993, "grad_norm": 1.0905550872468757, "learning_rate": 3.887323817173272e-09, "loss": 1.0138104438781739, "memory(GiB)": 74.63, "step": 2680, "token_acc": 0.6310845431255337, "train_speed(iter/s)": 0.154324 }, { "epoch": 1.9854001108852337, "grad_norm": 1.0126426754906144, "learning_rate": 2.436385135348163e-09, "loss": 1.015495491027832, "memory(GiB)": 74.63, "step": 2685, "token_acc": 0.6567026194144838, "train_speed(iter/s)": 0.153915 }, { "epoch": 1.9890962853446683, "grad_norm": 0.8862791092932369, "learning_rate": 1.3228264539522084e-09, "loss": 1.049496841430664, "memory(GiB)": 74.63, "step": 2690, "token_acc": 0.6486280487804879, "train_speed(iter/s)": 0.153518 }, { "epoch": 1.992792459804103, "grad_norm": 1.0787160107890392, "learning_rate": 5.466853513858006e-10, "loss": 1.0067996978759766, "memory(GiB)": 74.63, "step": 2695, "token_acc": 0.6233766233766234, "train_speed(iter/s)": 0.153131 }, { "epoch": 1.9964886342635373, "grad_norm": 1.058938505423735, "learning_rate": 1.0798801947764503e-10, "loss": 1.0397415161132812, "memory(GiB)": 74.63, "step": 2700, "token_acc": 0.6839266450916937, "train_speed(iter/s)": 0.152739 }, { "epoch": 1.9964886342635373, "eval_loss": 0.6565173864364624, "eval_runtime": 87.3486, "eval_samples_per_second": 80.081, "eval_steps_per_second": 0.63, "eval_token_acc": 0.6325489372213771, "step": 2700 }, { "epoch": 1.9994455738310848, "eval_loss": 0.6564235091209412, "eval_runtime": 89.612, "eval_samples_per_second": 78.059, "eval_steps_per_second": 0.614, "eval_token_acc": 0.6324912214277963, "step": 2704 } ], "logging_steps": 5, "max_steps": 2704, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.945781552860365e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }