| { | |
| "best_metric": 0.65642351, | |
| "best_model_checkpoint": "/m2v_intern/zhangzhicheng03/code/face-llm/ms-swift/Emo-CFG_bs-512_data-ATTR_OPEN_EMO_MIC_500k_CAP_78k_RATIONALE_120k_scratch_3B_lr-2e-5/v2-20250515-154834/checkpoint-2704", | |
| "epoch": 1.9994455738310848, | |
| "eval_steps": 50, | |
| "global_step": 2704, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0007392348918868971, | |
| "grad_norm": 7.145988573209412, | |
| "learning_rate": 1.9999993250737395e-05, | |
| "loss": 1.997387409210205, | |
| "memory(GiB)": 30.3, | |
| "step": 1, | |
| "token_acc": 0.5373563218390804, | |
| "train_speed(iter/s)": 0.022542 | |
| }, | |
| { | |
| "epoch": 0.0036961744594344852, | |
| "grad_norm": 4.16117609244522, | |
| "learning_rate": 1.9999831268890388e-05, | |
| "loss": 1.7683343887329102, | |
| "memory(GiB)": 30.88, | |
| "step": 5, | |
| "token_acc": 0.5186522262334536, | |
| "train_speed(iter/s)": 0.047276 | |
| }, | |
| { | |
| "epoch": 0.0073923489188689705, | |
| "grad_norm": 1.9968396412286222, | |
| "learning_rate": 1.999932508125559e-05, | |
| "loss": 1.4955159187316895, | |
| "memory(GiB)": 40.49, | |
| "step": 10, | |
| "token_acc": 0.5502183406113537, | |
| "train_speed(iter/s)": 0.057642 | |
| }, | |
| { | |
| "epoch": 0.011088523378303456, | |
| "grad_norm": 1.6222058869585758, | |
| "learning_rate": 1.9998481454177528e-05, | |
| "loss": 1.4060004234313965, | |
| "memory(GiB)": 40.49, | |
| "step": 15, | |
| "token_acc": 0.6005361930294906, | |
| "train_speed(iter/s)": 0.059714 | |
| }, | |
| { | |
| "epoch": 0.014784697837737941, | |
| "grad_norm": 1.873832697605311, | |
| "learning_rate": 1.9997300416125426e-05, | |
| "loss": 1.3838209152221679, | |
| "memory(GiB)": 40.49, | |
| "step": 20, | |
| "token_acc": 0.607051282051282, | |
| "train_speed(iter/s)": 0.060073 | |
| }, | |
| { | |
| "epoch": 0.018480872297172428, | |
| "grad_norm": 1.659686579754198, | |
| "learning_rate": 1.9995782006954852e-05, | |
| "loss": 1.3265121459960938, | |
| "memory(GiB)": 40.49, | |
| "step": 25, | |
| "token_acc": 0.5963340122199593, | |
| "train_speed(iter/s)": 0.062445 | |
| }, | |
| { | |
| "epoch": 0.02217704675660691, | |
| "grad_norm": 1.8215744972131866, | |
| "learning_rate": 1.9993926277906387e-05, | |
| "loss": 1.3122464179992677, | |
| "memory(GiB)": 54.96, | |
| "step": 30, | |
| "token_acc": 0.5690406976744186, | |
| "train_speed(iter/s)": 0.06221 | |
| }, | |
| { | |
| "epoch": 0.0258732212160414, | |
| "grad_norm": 1.5674638552693054, | |
| "learning_rate": 1.9991733291603873e-05, | |
| "loss": 1.3101771354675293, | |
| "memory(GiB)": 54.96, | |
| "step": 35, | |
| "token_acc": 0.5990697674418605, | |
| "train_speed(iter/s)": 0.061909 | |
| }, | |
| { | |
| "epoch": 0.029569395675475882, | |
| "grad_norm": 1.8121545652568625, | |
| "learning_rate": 1.998920312205231e-05, | |
| "loss": 1.2577611923217773, | |
| "memory(GiB)": 54.96, | |
| "step": 40, | |
| "token_acc": 0.6322274881516587, | |
| "train_speed(iter/s)": 0.062755 | |
| }, | |
| { | |
| "epoch": 0.03326557013491037, | |
| "grad_norm": 1.7654732822991834, | |
| "learning_rate": 1.9986335854635364e-05, | |
| "loss": 1.2739611625671388, | |
| "memory(GiB)": 54.96, | |
| "step": 45, | |
| "token_acc": 0.5852668213457076, | |
| "train_speed(iter/s)": 0.062974 | |
| }, | |
| { | |
| "epoch": 0.036961744594344856, | |
| "grad_norm": 1.3381865017125096, | |
| "learning_rate": 1.9983131586112474e-05, | |
| "loss": 1.2759986877441407, | |
| "memory(GiB)": 54.96, | |
| "step": 50, | |
| "token_acc": 0.6090799517878666, | |
| "train_speed(iter/s)": 0.0626 | |
| }, | |
| { | |
| "epoch": 0.036961744594344856, | |
| "eval_loss": 0.8683156967163086, | |
| "eval_runtime": 85.8388, | |
| "eval_samples_per_second": 81.49, | |
| "eval_steps_per_second": 0.641, | |
| "eval_token_acc": 0.5936057826607904, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.040657919053779336, | |
| "grad_norm": 1.7881784961595093, | |
| "learning_rate": 1.9979590424615597e-05, | |
| "loss": 1.2275705337524414, | |
| "memory(GiB)": 74.93, | |
| "step": 55, | |
| "token_acc": 0.5959502991256328, | |
| "train_speed(iter/s)": 0.055536 | |
| }, | |
| { | |
| "epoch": 0.04435409351321382, | |
| "grad_norm": 1.523952154354778, | |
| "learning_rate": 1.997571248964556e-05, | |
| "loss": 1.2908334732055664, | |
| "memory(GiB)": 74.93, | |
| "step": 60, | |
| "token_acc": 0.6143884892086331, | |
| "train_speed(iter/s)": 0.056437 | |
| }, | |
| { | |
| "epoch": 0.04805026797264831, | |
| "grad_norm": 1.629094517207129, | |
| "learning_rate": 1.9971497912068014e-05, | |
| "loss": 1.262259292602539, | |
| "memory(GiB)": 74.93, | |
| "step": 65, | |
| "token_acc": 0.6196943972835314, | |
| "train_speed(iter/s)": 0.05732 | |
| }, | |
| { | |
| "epoch": 0.0517464424320828, | |
| "grad_norm": 1.4002823703096854, | |
| "learning_rate": 1.9966946834109026e-05, | |
| "loss": 1.2578742980957032, | |
| "memory(GiB)": 74.93, | |
| "step": 70, | |
| "token_acc": 0.5597722960151803, | |
| "train_speed(iter/s)": 0.057646 | |
| }, | |
| { | |
| "epoch": 0.05544261689151728, | |
| "grad_norm": 1.3922622186923228, | |
| "learning_rate": 1.9962059409350286e-05, | |
| "loss": 1.2903871536254883, | |
| "memory(GiB)": 74.93, | |
| "step": 75, | |
| "token_acc": 0.5871787786681404, | |
| "train_speed(iter/s)": 0.058082 | |
| }, | |
| { | |
| "epoch": 0.059138791350951764, | |
| "grad_norm": 1.5357768582962403, | |
| "learning_rate": 1.9956835802723916e-05, | |
| "loss": 1.2582176208496094, | |
| "memory(GiB)": 74.93, | |
| "step": 80, | |
| "token_acc": 0.5863981319322825, | |
| "train_speed(iter/s)": 0.05893 | |
| }, | |
| { | |
| "epoch": 0.06283496581038625, | |
| "grad_norm": 1.5430442318289654, | |
| "learning_rate": 1.9951276190506903e-05, | |
| "loss": 1.2459497451782227, | |
| "memory(GiB)": 74.93, | |
| "step": 85, | |
| "token_acc": 0.5826538176426983, | |
| "train_speed(iter/s)": 0.059014 | |
| }, | |
| { | |
| "epoch": 0.06653114026982074, | |
| "grad_norm": 1.5964710010210896, | |
| "learning_rate": 1.9945380760315153e-05, | |
| "loss": 1.2252405166625977, | |
| "memory(GiB)": 74.93, | |
| "step": 90, | |
| "token_acc": 0.6162060301507538, | |
| "train_speed(iter/s)": 0.059178 | |
| }, | |
| { | |
| "epoch": 0.07022731472925522, | |
| "grad_norm": 1.5295447471837036, | |
| "learning_rate": 1.9939149711097164e-05, | |
| "loss": 1.235156536102295, | |
| "memory(GiB)": 74.93, | |
| "step": 95, | |
| "token_acc": 0.6264632848527847, | |
| "train_speed(iter/s)": 0.059963 | |
| }, | |
| { | |
| "epoch": 0.07392348918868971, | |
| "grad_norm": 1.3740769902051975, | |
| "learning_rate": 1.9932583253127302e-05, | |
| "loss": 1.2441673278808594, | |
| "memory(GiB)": 74.93, | |
| "step": 100, | |
| "token_acc": 0.6724137931034483, | |
| "train_speed(iter/s)": 0.060048 | |
| }, | |
| { | |
| "epoch": 0.07392348918868971, | |
| "eval_loss": 0.8143442273139954, | |
| "eval_runtime": 82.7498, | |
| "eval_samples_per_second": 84.532, | |
| "eval_steps_per_second": 0.665, | |
| "eval_token_acc": 0.6014724453258288, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07761966364812418, | |
| "grad_norm": 1.4813264557157313, | |
| "learning_rate": 1.992568160799872e-05, | |
| "loss": 1.2064315795898437, | |
| "memory(GiB)": 74.93, | |
| "step": 105, | |
| "token_acc": 0.6042131350681537, | |
| "train_speed(iter/s)": 0.056686 | |
| }, | |
| { | |
| "epoch": 0.08131583810755867, | |
| "grad_norm": 1.4543500652550134, | |
| "learning_rate": 1.9918445008615862e-05, | |
| "loss": 1.2109683990478515, | |
| "memory(GiB)": 74.93, | |
| "step": 110, | |
| "token_acc": 0.5906810035842294, | |
| "train_speed(iter/s)": 0.056861 | |
| }, | |
| { | |
| "epoch": 0.08501201256699316, | |
| "grad_norm": 1.4682139055625663, | |
| "learning_rate": 1.9910873699186618e-05, | |
| "loss": 1.2368173599243164, | |
| "memory(GiB)": 74.93, | |
| "step": 115, | |
| "token_acc": 0.5186114596403179, | |
| "train_speed(iter/s)": 0.057469 | |
| }, | |
| { | |
| "epoch": 0.08870818702642765, | |
| "grad_norm": 1.3995801521088191, | |
| "learning_rate": 1.990296793521408e-05, | |
| "loss": 1.2045980453491212, | |
| "memory(GiB)": 74.93, | |
| "step": 120, | |
| "token_acc": 0.5862884160756501, | |
| "train_speed(iter/s)": 0.057707 | |
| }, | |
| { | |
| "epoch": 0.09240436148586213, | |
| "grad_norm": 1.3021560572956736, | |
| "learning_rate": 1.989472798348791e-05, | |
| "loss": 1.2566261291503906, | |
| "memory(GiB)": 74.93, | |
| "step": 125, | |
| "token_acc": 0.5944492254733219, | |
| "train_speed(iter/s)": 0.057808 | |
| }, | |
| { | |
| "epoch": 0.09610053594529662, | |
| "grad_norm": 1.4712133484369752, | |
| "learning_rate": 1.9886154122075344e-05, | |
| "loss": 1.192431640625, | |
| "memory(GiB)": 74.93, | |
| "step": 130, | |
| "token_acc": 0.5911908646003262, | |
| "train_speed(iter/s)": 0.058231 | |
| }, | |
| { | |
| "epoch": 0.0997967104047311, | |
| "grad_norm": 1.3130768997154976, | |
| "learning_rate": 1.9877246640311818e-05, | |
| "loss": 1.2078176498413087, | |
| "memory(GiB)": 74.93, | |
| "step": 135, | |
| "token_acc": 0.6265611990008326, | |
| "train_speed(iter/s)": 0.058455 | |
| }, | |
| { | |
| "epoch": 0.1034928848641656, | |
| "grad_norm": 1.3930149495863555, | |
| "learning_rate": 1.9868005838791185e-05, | |
| "loss": 1.2078091621398925, | |
| "memory(GiB)": 74.93, | |
| "step": 140, | |
| "token_acc": 0.583790628957366, | |
| "train_speed(iter/s)": 0.05851 | |
| }, | |
| { | |
| "epoch": 0.10718905932360008, | |
| "grad_norm": 1.3209379748185142, | |
| "learning_rate": 1.9858432029355584e-05, | |
| "loss": 1.2318389892578125, | |
| "memory(GiB)": 74.93, | |
| "step": 145, | |
| "token_acc": 0.5777182235834609, | |
| "train_speed(iter/s)": 0.058766 | |
| }, | |
| { | |
| "epoch": 0.11088523378303455, | |
| "grad_norm": 1.5171021117309256, | |
| "learning_rate": 1.9848525535084916e-05, | |
| "loss": 1.2017921447753905, | |
| "memory(GiB)": 74.93, | |
| "step": 150, | |
| "token_acc": 0.6249167221852099, | |
| "train_speed(iter/s)": 0.059012 | |
| }, | |
| { | |
| "epoch": 0.11088523378303455, | |
| "eval_loss": 0.8016136884689331, | |
| "eval_runtime": 89.0739, | |
| "eval_samples_per_second": 78.53, | |
| "eval_steps_per_second": 0.617, | |
| "eval_token_acc": 0.6041989394145771, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.11458140824246904, | |
| "grad_norm": 1.3505800383266968, | |
| "learning_rate": 1.983828669028593e-05, | |
| "loss": 1.1807826042175293, | |
| "memory(GiB)": 74.93, | |
| "step": 155, | |
| "token_acc": 0.6214278069142674, | |
| "train_speed(iter/s)": 0.056798 | |
| }, | |
| { | |
| "epoch": 0.11827758270190353, | |
| "grad_norm": 1.3315385057884146, | |
| "learning_rate": 1.9827715840480962e-05, | |
| "loss": 1.1823822021484376, | |
| "memory(GiB)": 74.93, | |
| "step": 160, | |
| "token_acc": 0.6380498145204028, | |
| "train_speed(iter/s)": 0.056862 | |
| }, | |
| { | |
| "epoch": 0.12197375716133801, | |
| "grad_norm": 1.5704533949100516, | |
| "learning_rate": 1.9816813342396245e-05, | |
| "loss": 1.1738862991333008, | |
| "memory(GiB)": 74.93, | |
| "step": 165, | |
| "token_acc": 0.6022625781482585, | |
| "train_speed(iter/s)": 0.057054 | |
| }, | |
| { | |
| "epoch": 0.1256699316207725, | |
| "grad_norm": 1.379238427568713, | |
| "learning_rate": 1.980557956394991e-05, | |
| "loss": 1.1857439041137696, | |
| "memory(GiB)": 74.93, | |
| "step": 170, | |
| "token_acc": 0.620583717357911, | |
| "train_speed(iter/s)": 0.05738 | |
| }, | |
| { | |
| "epoch": 0.129366106080207, | |
| "grad_norm": 1.2583564805641094, | |
| "learning_rate": 1.9794014884239532e-05, | |
| "loss": 1.2060420989990235, | |
| "memory(GiB)": 74.93, | |
| "step": 175, | |
| "token_acc": 0.6253842775581906, | |
| "train_speed(iter/s)": 0.057484 | |
| }, | |
| { | |
| "epoch": 0.13306228053964148, | |
| "grad_norm": 1.3557017685070272, | |
| "learning_rate": 1.9782119693529358e-05, | |
| "loss": 1.2089680671691894, | |
| "memory(GiB)": 74.93, | |
| "step": 180, | |
| "token_acc": 0.6479481641468683, | |
| "train_speed(iter/s)": 0.057624 | |
| }, | |
| { | |
| "epoch": 0.13675845499907596, | |
| "grad_norm": 1.29022693500859, | |
| "learning_rate": 1.9769894393237135e-05, | |
| "loss": 1.1686654090881348, | |
| "memory(GiB)": 74.93, | |
| "step": 185, | |
| "token_acc": 0.6546961325966851, | |
| "train_speed(iter/s)": 0.057936 | |
| }, | |
| { | |
| "epoch": 0.14045462945851045, | |
| "grad_norm": 1.3838281227163587, | |
| "learning_rate": 1.975733939592056e-05, | |
| "loss": 1.2134584426879882, | |
| "memory(GiB)": 74.93, | |
| "step": 190, | |
| "token_acc": 0.6140988372093024, | |
| "train_speed(iter/s)": 0.058017 | |
| }, | |
| { | |
| "epoch": 0.14415080391794494, | |
| "grad_norm": 1.4298593013480252, | |
| "learning_rate": 1.974445512526336e-05, | |
| "loss": 1.1823249816894532, | |
| "memory(GiB)": 74.93, | |
| "step": 195, | |
| "token_acc": 0.5794074793589121, | |
| "train_speed(iter/s)": 0.058072 | |
| }, | |
| { | |
| "epoch": 0.14784697837737942, | |
| "grad_norm": 1.319725742779011, | |
| "learning_rate": 1.9731242016060985e-05, | |
| "loss": 1.237997055053711, | |
| "memory(GiB)": 74.93, | |
| "step": 200, | |
| "token_acc": 0.605226480836237, | |
| "train_speed(iter/s)": 0.058321 | |
| }, | |
| { | |
| "epoch": 0.14784697837737942, | |
| "eval_loss": 0.7794498801231384, | |
| "eval_runtime": 92.8737, | |
| "eval_samples_per_second": 75.317, | |
| "eval_steps_per_second": 0.592, | |
| "eval_token_acc": 0.6082148043319165, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1515431528368139, | |
| "grad_norm": 1.2737166952101895, | |
| "learning_rate": 1.9717700514205963e-05, | |
| "loss": 1.1960806846618652, | |
| "memory(GiB)": 74.93, | |
| "step": 205, | |
| "token_acc": 0.6102411135968183, | |
| "train_speed(iter/s)": 0.056695 | |
| }, | |
| { | |
| "epoch": 0.15523932729624837, | |
| "grad_norm": 1.392555342325795, | |
| "learning_rate": 1.9703831076672807e-05, | |
| "loss": 1.1904547691345215, | |
| "memory(GiB)": 74.93, | |
| "step": 210, | |
| "token_acc": 0.6277756868648852, | |
| "train_speed(iter/s)": 0.056817 | |
| }, | |
| { | |
| "epoch": 0.15893550175568286, | |
| "grad_norm": 1.31415101390755, | |
| "learning_rate": 1.9689634171502642e-05, | |
| "loss": 1.1859335899353027, | |
| "memory(GiB)": 74.93, | |
| "step": 215, | |
| "token_acc": 0.6238479262672811, | |
| "train_speed(iter/s)": 0.056878 | |
| }, | |
| { | |
| "epoch": 0.16263167621511734, | |
| "grad_norm": 1.309289118136438, | |
| "learning_rate": 1.967511027778738e-05, | |
| "loss": 1.1907655715942382, | |
| "memory(GiB)": 74.93, | |
| "step": 220, | |
| "token_acc": 0.6154311649016642, | |
| "train_speed(iter/s)": 0.05712 | |
| }, | |
| { | |
| "epoch": 0.16632785067455183, | |
| "grad_norm": 1.263699514290592, | |
| "learning_rate": 1.966025988565356e-05, | |
| "loss": 1.1906933784484863, | |
| "memory(GiB)": 74.93, | |
| "step": 225, | |
| "token_acc": 0.6170634920634921, | |
| "train_speed(iter/s)": 0.057279 | |
| }, | |
| { | |
| "epoch": 0.17002402513398632, | |
| "grad_norm": 1.2660283851596872, | |
| "learning_rate": 1.9645083496245815e-05, | |
| "loss": 1.2014826774597167, | |
| "memory(GiB)": 74.93, | |
| "step": 230, | |
| "token_acc": 0.5935228023793787, | |
| "train_speed(iter/s)": 0.057316 | |
| }, | |
| { | |
| "epoch": 0.1737201995934208, | |
| "grad_norm": 1.2430321644094078, | |
| "learning_rate": 1.962958162170994e-05, | |
| "loss": 1.189725971221924, | |
| "memory(GiB)": 74.93, | |
| "step": 235, | |
| "token_acc": 0.7096774193548387, | |
| "train_speed(iter/s)": 0.057486 | |
| }, | |
| { | |
| "epoch": 0.1774163740528553, | |
| "grad_norm": 1.3183460023053968, | |
| "learning_rate": 1.961375478517564e-05, | |
| "loss": 1.1756509780883788, | |
| "memory(GiB)": 74.93, | |
| "step": 240, | |
| "token_acc": 0.6048, | |
| "train_speed(iter/s)": 0.057691 | |
| }, | |
| { | |
| "epoch": 0.18111254851228978, | |
| "grad_norm": 1.256458444055883, | |
| "learning_rate": 1.9597603520738853e-05, | |
| "loss": 1.1867225646972657, | |
| "memory(GiB)": 74.93, | |
| "step": 245, | |
| "token_acc": 0.6193625977149729, | |
| "train_speed(iter/s)": 0.057716 | |
| }, | |
| { | |
| "epoch": 0.18480872297172427, | |
| "grad_norm": 1.2002856679979195, | |
| "learning_rate": 1.9581128373443733e-05, | |
| "loss": 1.1792646408081056, | |
| "memory(GiB)": 74.93, | |
| "step": 250, | |
| "token_acc": 0.6049046321525886, | |
| "train_speed(iter/s)": 0.057828 | |
| }, | |
| { | |
| "epoch": 0.18480872297172427, | |
| "eval_loss": 0.7706022262573242, | |
| "eval_runtime": 87.903, | |
| "eval_samples_per_second": 79.576, | |
| "eval_steps_per_second": 0.626, | |
| "eval_token_acc": 0.6092432997735232, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.18850489743115875, | |
| "grad_norm": 1.2231576564807567, | |
| "learning_rate": 1.9564329899264252e-05, | |
| "loss": 1.1703492164611817, | |
| "memory(GiB)": 74.93, | |
| "step": 255, | |
| "token_acc": 0.6188424362408291, | |
| "train_speed(iter/s)": 0.056504 | |
| }, | |
| { | |
| "epoch": 0.19220107189059324, | |
| "grad_norm": 1.1802132132030512, | |
| "learning_rate": 1.954720866508546e-05, | |
| "loss": 1.17109956741333, | |
| "memory(GiB)": 74.93, | |
| "step": 260, | |
| "token_acc": 0.6199639206253759, | |
| "train_speed(iter/s)": 0.056699 | |
| }, | |
| { | |
| "epoch": 0.19589724635002773, | |
| "grad_norm": 1.3103486430673341, | |
| "learning_rate": 1.9529765248684308e-05, | |
| "loss": 1.1841205596923827, | |
| "memory(GiB)": 74.93, | |
| "step": 265, | |
| "token_acc": 0.5825649622799665, | |
| "train_speed(iter/s)": 0.056764 | |
| }, | |
| { | |
| "epoch": 0.1995934208094622, | |
| "grad_norm": 1.3602279829039154, | |
| "learning_rate": 1.951200023871021e-05, | |
| "loss": 1.1760824203491211, | |
| "memory(GiB)": 74.93, | |
| "step": 270, | |
| "token_acc": 0.6165368484122229, | |
| "train_speed(iter/s)": 0.056855 | |
| }, | |
| { | |
| "epoch": 0.2032895952688967, | |
| "grad_norm": 1.2205622648558432, | |
| "learning_rate": 1.949391423466513e-05, | |
| "loss": 1.1814783096313477, | |
| "memory(GiB)": 74.93, | |
| "step": 275, | |
| "token_acc": 0.6155863619333084, | |
| "train_speed(iter/s)": 0.057043 | |
| }, | |
| { | |
| "epoch": 0.2069857697283312, | |
| "grad_norm": 1.2638744947898215, | |
| "learning_rate": 1.9475507846883377e-05, | |
| "loss": 1.1977863311767578, | |
| "memory(GiB)": 74.93, | |
| "step": 280, | |
| "token_acc": 0.6115591397849462, | |
| "train_speed(iter/s)": 0.057131 | |
| }, | |
| { | |
| "epoch": 0.21068194418776567, | |
| "grad_norm": 1.208174719234681, | |
| "learning_rate": 1.9456781696510996e-05, | |
| "loss": 1.1798893928527832, | |
| "memory(GiB)": 74.93, | |
| "step": 285, | |
| "token_acc": 0.6450809464508095, | |
| "train_speed(iter/s)": 0.057208 | |
| }, | |
| { | |
| "epoch": 0.21437811864720016, | |
| "grad_norm": 1.29989102329814, | |
| "learning_rate": 1.943773641548481e-05, | |
| "loss": 1.1305645942687987, | |
| "memory(GiB)": 74.93, | |
| "step": 290, | |
| "token_acc": 0.6185250219490781, | |
| "train_speed(iter/s)": 0.057373 | |
| }, | |
| { | |
| "epoch": 0.21807429310663462, | |
| "grad_norm": 1.2869046963327413, | |
| "learning_rate": 1.9418372646511104e-05, | |
| "loss": 1.1689376831054688, | |
| "memory(GiB)": 74.93, | |
| "step": 295, | |
| "token_acc": 0.639083030472463, | |
| "train_speed(iter/s)": 0.057472 | |
| }, | |
| { | |
| "epoch": 0.2217704675660691, | |
| "grad_norm": 1.3262776988217382, | |
| "learning_rate": 1.939869104304392e-05, | |
| "loss": 1.1520153045654298, | |
| "memory(GiB)": 74.93, | |
| "step": 300, | |
| "token_acc": 0.6394881170018282, | |
| "train_speed(iter/s)": 0.05753 | |
| }, | |
| { | |
| "epoch": 0.2217704675660691, | |
| "eval_loss": 0.7643480896949768, | |
| "eval_runtime": 83.8503, | |
| "eval_samples_per_second": 83.422, | |
| "eval_steps_per_second": 0.656, | |
| "eval_token_acc": 0.6108027805160715, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2254666420255036, | |
| "grad_norm": 1.2789640840585799, | |
| "learning_rate": 1.937869226926302e-05, | |
| "loss": 1.1876554489135742, | |
| "memory(GiB)": 74.93, | |
| "step": 305, | |
| "token_acc": 0.6270755222281735, | |
| "train_speed(iter/s)": 0.056408 | |
| }, | |
| { | |
| "epoch": 0.22916281648493808, | |
| "grad_norm": 1.3387432374923267, | |
| "learning_rate": 1.9358377000051457e-05, | |
| "loss": 1.152684211730957, | |
| "memory(GiB)": 74.93, | |
| "step": 310, | |
| "token_acc": 0.5908354547558435, | |
| "train_speed(iter/s)": 0.056571 | |
| }, | |
| { | |
| "epoch": 0.23285899094437257, | |
| "grad_norm": 1.3158521752921148, | |
| "learning_rate": 1.9337745920972817e-05, | |
| "loss": 1.1474998474121094, | |
| "memory(GiB)": 74.93, | |
| "step": 315, | |
| "token_acc": 0.6453079785035138, | |
| "train_speed(iter/s)": 0.056681 | |
| }, | |
| { | |
| "epoch": 0.23655516540380706, | |
| "grad_norm": 1.3574666032311622, | |
| "learning_rate": 1.9316799728248074e-05, | |
| "loss": 1.1646709442138672, | |
| "memory(GiB)": 74.93, | |
| "step": 320, | |
| "token_acc": 0.6396255850234009, | |
| "train_speed(iter/s)": 0.056747 | |
| }, | |
| { | |
| "epoch": 0.24025133986324154, | |
| "grad_norm": 1.5220421984558397, | |
| "learning_rate": 1.9295539128732096e-05, | |
| "loss": 1.1289070129394532, | |
| "memory(GiB)": 74.93, | |
| "step": 325, | |
| "token_acc": 0.6495638789122627, | |
| "train_speed(iter/s)": 0.056887 | |
| }, | |
| { | |
| "epoch": 0.24394751432267603, | |
| "grad_norm": 1.2325001012228407, | |
| "learning_rate": 1.927396483988979e-05, | |
| "loss": 1.1668661117553711, | |
| "memory(GiB)": 74.93, | |
| "step": 330, | |
| "token_acc": 0.6125099390405513, | |
| "train_speed(iter/s)": 0.05701 | |
| }, | |
| { | |
| "epoch": 0.24764368878211052, | |
| "grad_norm": 1.3455071899618125, | |
| "learning_rate": 1.92520775897719e-05, | |
| "loss": 1.160017967224121, | |
| "memory(GiB)": 74.93, | |
| "step": 335, | |
| "token_acc": 0.6224098234842671, | |
| "train_speed(iter/s)": 0.057069 | |
| }, | |
| { | |
| "epoch": 0.251339863241545, | |
| "grad_norm": 1.1193101615271859, | |
| "learning_rate": 1.922987811699042e-05, | |
| "loss": 1.164522933959961, | |
| "memory(GiB)": 74.93, | |
| "step": 340, | |
| "token_acc": 0.6142303969022265, | |
| "train_speed(iter/s)": 0.057185 | |
| }, | |
| { | |
| "epoch": 0.2550360377009795, | |
| "grad_norm": 1.184835291510033, | |
| "learning_rate": 1.9207367170693688e-05, | |
| "loss": 1.1658490180969239, | |
| "memory(GiB)": 74.93, | |
| "step": 345, | |
| "token_acc": 0.6181616832779624, | |
| "train_speed(iter/s)": 0.057315 | |
| }, | |
| { | |
| "epoch": 0.258732212160414, | |
| "grad_norm": 1.2033091005460579, | |
| "learning_rate": 1.918454551054109e-05, | |
| "loss": 1.174658966064453, | |
| "memory(GiB)": 74.93, | |
| "step": 350, | |
| "token_acc": 0.6646234676007006, | |
| "train_speed(iter/s)": 0.057368 | |
| }, | |
| { | |
| "epoch": 0.258732212160414, | |
| "eval_loss": 0.7548633813858032, | |
| "eval_runtime": 84.0949, | |
| "eval_samples_per_second": 83.18, | |
| "eval_steps_per_second": 0.654, | |
| "eval_token_acc": 0.6124072795776128, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.26242838661984846, | |
| "grad_norm": 1.1971162317002557, | |
| "learning_rate": 1.916141390667744e-05, | |
| "loss": 1.1562774658203125, | |
| "memory(GiB)": 74.93, | |
| "step": 355, | |
| "token_acc": 0.6173011120615911, | |
| "train_speed(iter/s)": 0.056434 | |
| }, | |
| { | |
| "epoch": 0.26612456107928295, | |
| "grad_norm": 1.1301068933758331, | |
| "learning_rate": 1.9137973139706973e-05, | |
| "loss": 1.2061149597167968, | |
| "memory(GiB)": 74.93, | |
| "step": 360, | |
| "token_acc": 0.5783767946088485, | |
| "train_speed(iter/s)": 0.056501 | |
| }, | |
| { | |
| "epoch": 0.26982073553871744, | |
| "grad_norm": 1.2885970736252064, | |
| "learning_rate": 1.9114224000667014e-05, | |
| "loss": 1.1453168869018555, | |
| "memory(GiB)": 74.93, | |
| "step": 365, | |
| "token_acc": 0.6045895851721095, | |
| "train_speed(iter/s)": 0.056637 | |
| }, | |
| { | |
| "epoch": 0.2735169099981519, | |
| "grad_norm": 1.2008587437465796, | |
| "learning_rate": 1.9090167291001278e-05, | |
| "loss": 1.151451015472412, | |
| "memory(GiB)": 74.93, | |
| "step": 370, | |
| "token_acc": 0.6464088397790055, | |
| "train_speed(iter/s)": 0.056724 | |
| }, | |
| { | |
| "epoch": 0.2772130844575864, | |
| "grad_norm": 1.2574733188940939, | |
| "learning_rate": 1.9065803822532825e-05, | |
| "loss": 1.143141269683838, | |
| "memory(GiB)": 74.93, | |
| "step": 375, | |
| "token_acc": 0.6279554937413073, | |
| "train_speed(iter/s)": 0.056779 | |
| }, | |
| { | |
| "epoch": 0.2809092589170209, | |
| "grad_norm": 1.2230232638304774, | |
| "learning_rate": 1.9041134417436674e-05, | |
| "loss": 1.1681084632873535, | |
| "memory(GiB)": 74.93, | |
| "step": 380, | |
| "token_acc": 0.6278735632183908, | |
| "train_speed(iter/s)": 0.0569 | |
| }, | |
| { | |
| "epoch": 0.2846054333764554, | |
| "grad_norm": 1.308574420114396, | |
| "learning_rate": 1.9016159908212044e-05, | |
| "loss": 1.1313629150390625, | |
| "memory(GiB)": 74.93, | |
| "step": 385, | |
| "token_acc": 0.6380670611439843, | |
| "train_speed(iter/s)": 0.056973 | |
| }, | |
| { | |
| "epoch": 0.2883016078358899, | |
| "grad_norm": 1.1949255351547317, | |
| "learning_rate": 1.899088113765426e-05, | |
| "loss": 1.1681228637695313, | |
| "memory(GiB)": 74.93, | |
| "step": 390, | |
| "token_acc": 0.6130952380952381, | |
| "train_speed(iter/s)": 0.057013 | |
| }, | |
| { | |
| "epoch": 0.29199778229532436, | |
| "grad_norm": 1.1669478994026365, | |
| "learning_rate": 1.896529895882633e-05, | |
| "loss": 1.1387041091918946, | |
| "memory(GiB)": 74.93, | |
| "step": 395, | |
| "token_acc": 0.6152671755725191, | |
| "train_speed(iter/s)": 0.05713 | |
| }, | |
| { | |
| "epoch": 0.29569395675475885, | |
| "grad_norm": 1.197646998798649, | |
| "learning_rate": 1.8939414235030137e-05, | |
| "loss": 1.1374378204345703, | |
| "memory(GiB)": 74.93, | |
| "step": 400, | |
| "token_acc": 0.6037667511771098, | |
| "train_speed(iter/s)": 0.057204 | |
| }, | |
| { | |
| "epoch": 0.29569395675475885, | |
| "eval_loss": 0.7541109323501587, | |
| "eval_runtime": 86.4511, | |
| "eval_samples_per_second": 80.913, | |
| "eval_steps_per_second": 0.636, | |
| "eval_token_acc": 0.613960988740803, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.29939013121419333, | |
| "grad_norm": 1.3110127948907355, | |
| "learning_rate": 1.8913227839777305e-05, | |
| "loss": 1.1630861282348632, | |
| "memory(GiB)": 74.93, | |
| "step": 405, | |
| "token_acc": 0.6250439264378588, | |
| "train_speed(iter/s)": 0.056415 | |
| }, | |
| { | |
| "epoch": 0.3030863056736278, | |
| "grad_norm": 1.2066723455387354, | |
| "learning_rate": 1.8886740656759755e-05, | |
| "loss": 1.1657712936401368, | |
| "memory(GiB)": 74.93, | |
| "step": 410, | |
| "token_acc": 0.6286093594424162, | |
| "train_speed(iter/s)": 0.056469 | |
| }, | |
| { | |
| "epoch": 0.3067824801330623, | |
| "grad_norm": 1.214334257623402, | |
| "learning_rate": 1.8859953579819833e-05, | |
| "loss": 1.129319953918457, | |
| "memory(GiB)": 74.93, | |
| "step": 415, | |
| "token_acc": 0.5934997644842205, | |
| "train_speed(iter/s)": 0.056572 | |
| }, | |
| { | |
| "epoch": 0.31047865459249674, | |
| "grad_norm": 1.260998092054749, | |
| "learning_rate": 1.883286751292018e-05, | |
| "loss": 1.125650119781494, | |
| "memory(GiB)": 74.93, | |
| "step": 420, | |
| "token_acc": 0.6005237125400058, | |
| "train_speed(iter/s)": 0.056666 | |
| }, | |
| { | |
| "epoch": 0.3141748290519312, | |
| "grad_norm": 1.1445762169453673, | |
| "learning_rate": 1.880548337011323e-05, | |
| "loss": 1.1848130226135254, | |
| "memory(GiB)": 74.93, | |
| "step": 425, | |
| "token_acc": 0.5819639278557114, | |
| "train_speed(iter/s)": 0.05671 | |
| }, | |
| { | |
| "epoch": 0.3178710035113657, | |
| "grad_norm": 1.2219231261580983, | |
| "learning_rate": 1.8777802075510338e-05, | |
| "loss": 1.1647357940673828, | |
| "memory(GiB)": 74.93, | |
| "step": 430, | |
| "token_acc": 0.6077451592754528, | |
| "train_speed(iter/s)": 0.056776 | |
| }, | |
| { | |
| "epoch": 0.3215671779708002, | |
| "grad_norm": 1.1956915969147472, | |
| "learning_rate": 1.8749824563250615e-05, | |
| "loss": 1.1394176483154297, | |
| "memory(GiB)": 74.93, | |
| "step": 435, | |
| "token_acc": 0.6606451612903226, | |
| "train_speed(iter/s)": 0.056853 | |
| }, | |
| { | |
| "epoch": 0.3252633524302347, | |
| "grad_norm": 1.3354423066052745, | |
| "learning_rate": 1.8721551777469397e-05, | |
| "loss": 1.152536964416504, | |
| "memory(GiB)": 74.93, | |
| "step": 440, | |
| "token_acc": 0.5991432068543452, | |
| "train_speed(iter/s)": 0.056906 | |
| }, | |
| { | |
| "epoch": 0.3289595268896692, | |
| "grad_norm": 1.2562915522841382, | |
| "learning_rate": 1.869298467226639e-05, | |
| "loss": 1.1220308303833009, | |
| "memory(GiB)": 74.93, | |
| "step": 445, | |
| "token_acc": 0.6066666666666667, | |
| "train_speed(iter/s)": 0.056963 | |
| }, | |
| { | |
| "epoch": 0.33265570134910366, | |
| "grad_norm": 1.359582068477731, | |
| "learning_rate": 1.8664124211673468e-05, | |
| "loss": 1.1504764556884766, | |
| "memory(GiB)": 74.93, | |
| "step": 450, | |
| "token_acc": 0.5973016235993597, | |
| "train_speed(iter/s)": 0.057049 | |
| }, | |
| { | |
| "epoch": 0.33265570134910366, | |
| "eval_loss": 0.7460736632347107, | |
| "eval_runtime": 88.8045, | |
| "eval_samples_per_second": 78.769, | |
| "eval_steps_per_second": 0.619, | |
| "eval_token_acc": 0.6144388755116506, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.33635187580853815, | |
| "grad_norm": 1.218975457419414, | |
| "learning_rate": 1.863497136962213e-05, | |
| "loss": 1.1313959121704102, | |
| "memory(GiB)": 74.93, | |
| "step": 455, | |
| "token_acc": 0.6262968874700718, | |
| "train_speed(iter/s)": 0.056354 | |
| }, | |
| { | |
| "epoch": 0.34004805026797263, | |
| "grad_norm": 1.4342194151464063, | |
| "learning_rate": 1.8605527129910663e-05, | |
| "loss": 1.1549379348754882, | |
| "memory(GiB)": 74.93, | |
| "step": 460, | |
| "token_acc": 0.6472244569589702, | |
| "train_speed(iter/s)": 0.056414 | |
| }, | |
| { | |
| "epoch": 0.3437442247274071, | |
| "grad_norm": 1.440358796861357, | |
| "learning_rate": 1.857579248617091e-05, | |
| "loss": 1.129042625427246, | |
| "memory(GiB)": 74.93, | |
| "step": 465, | |
| "token_acc": 0.6356026785714286, | |
| "train_speed(iter/s)": 0.05648 | |
| }, | |
| { | |
| "epoch": 0.3474403991868416, | |
| "grad_norm": 1.2091541968931232, | |
| "learning_rate": 1.854576844183476e-05, | |
| "loss": 1.1230792999267578, | |
| "memory(GiB)": 74.93, | |
| "step": 470, | |
| "token_acc": 0.6001645413410119, | |
| "train_speed(iter/s)": 0.056566 | |
| }, | |
| { | |
| "epoch": 0.3511365736462761, | |
| "grad_norm": 1.212497545728028, | |
| "learning_rate": 1.8515456010100274e-05, | |
| "loss": 1.1627266883850098, | |
| "memory(GiB)": 74.93, | |
| "step": 475, | |
| "token_acc": 0.6375609756097561, | |
| "train_speed(iter/s)": 0.056633 | |
| }, | |
| { | |
| "epoch": 0.3548327481057106, | |
| "grad_norm": 1.257170599310577, | |
| "learning_rate": 1.8484856213897496e-05, | |
| "loss": 1.1552623748779296, | |
| "memory(GiB)": 74.93, | |
| "step": 480, | |
| "token_acc": 0.6367495451788963, | |
| "train_speed(iter/s)": 0.056696 | |
| }, | |
| { | |
| "epoch": 0.35852892256514507, | |
| "grad_norm": 1.3061990827470522, | |
| "learning_rate": 1.8453970085853953e-05, | |
| "loss": 1.1611719131469727, | |
| "memory(GiB)": 74.93, | |
| "step": 485, | |
| "token_acc": 0.5953002610966057, | |
| "train_speed(iter/s)": 0.056777 | |
| }, | |
| { | |
| "epoch": 0.36222509702457956, | |
| "grad_norm": 1.2132042758068045, | |
| "learning_rate": 1.842279866825976e-05, | |
| "loss": 1.1605472564697266, | |
| "memory(GiB)": 74.93, | |
| "step": 490, | |
| "token_acc": 0.6365507776761208, | |
| "train_speed(iter/s)": 0.056851 | |
| }, | |
| { | |
| "epoch": 0.36592127148401404, | |
| "grad_norm": 1.2900699412115835, | |
| "learning_rate": 1.8391343013032505e-05, | |
| "loss": 1.1752688407897949, | |
| "memory(GiB)": 74.93, | |
| "step": 495, | |
| "token_acc": 0.6413404114134041, | |
| "train_speed(iter/s)": 0.056898 | |
| }, | |
| { | |
| "epoch": 0.36961744594344853, | |
| "grad_norm": 1.115509938975403, | |
| "learning_rate": 1.8359604181681703e-05, | |
| "loss": 1.1677565574645996, | |
| "memory(GiB)": 74.93, | |
| "step": 500, | |
| "token_acc": 0.635439360929557, | |
| "train_speed(iter/s)": 0.056967 | |
| }, | |
| { | |
| "epoch": 0.36961744594344853, | |
| "eval_loss": 0.7416162490844727, | |
| "eval_runtime": 87.8438, | |
| "eval_samples_per_second": 79.63, | |
| "eval_steps_per_second": 0.626, | |
| "eval_token_acc": 0.615732863603728, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.373313620402883, | |
| "grad_norm": 1.1864096727600855, | |
| "learning_rate": 1.8327583245273004e-05, | |
| "loss": 1.120311164855957, | |
| "memory(GiB)": 74.93, | |
| "step": 505, | |
| "token_acc": 0.6247582205029013, | |
| "train_speed(iter/s)": 0.056337 | |
| }, | |
| { | |
| "epoch": 0.3770097948623175, | |
| "grad_norm": 1.1558943975448084, | |
| "learning_rate": 1.8295281284392036e-05, | |
| "loss": 1.167508888244629, | |
| "memory(GiB)": 74.93, | |
| "step": 510, | |
| "token_acc": 0.5796680497925312, | |
| "train_speed(iter/s)": 0.056408 | |
| }, | |
| { | |
| "epoch": 0.380705969321752, | |
| "grad_norm": 1.2905670874481943, | |
| "learning_rate": 1.8262699389107933e-05, | |
| "loss": 1.15736083984375, | |
| "memory(GiB)": 74.93, | |
| "step": 515, | |
| "token_acc": 0.6157240272763739, | |
| "train_speed(iter/s)": 0.056454 | |
| }, | |
| { | |
| "epoch": 0.3844021437811865, | |
| "grad_norm": 1.2748847596057926, | |
| "learning_rate": 1.8229838658936566e-05, | |
| "loss": 1.1492805480957031, | |
| "memory(GiB)": 74.93, | |
| "step": 520, | |
| "token_acc": 0.6105889724310777, | |
| "train_speed(iter/s)": 0.056519 | |
| }, | |
| { | |
| "epoch": 0.38809831824062097, | |
| "grad_norm": 1.1876718707954161, | |
| "learning_rate": 1.819670020280343e-05, | |
| "loss": 1.1467121124267579, | |
| "memory(GiB)": 74.93, | |
| "step": 525, | |
| "token_acc": 0.6113826815642458, | |
| "train_speed(iter/s)": 0.056588 | |
| }, | |
| { | |
| "epoch": 0.39179449270005545, | |
| "grad_norm": 1.2841584592867252, | |
| "learning_rate": 1.816328513900622e-05, | |
| "loss": 1.1653972625732423, | |
| "memory(GiB)": 74.93, | |
| "step": 530, | |
| "token_acc": 0.6273197444478248, | |
| "train_speed(iter/s)": 0.056639 | |
| }, | |
| { | |
| "epoch": 0.39549066715948994, | |
| "grad_norm": 1.243754331563731, | |
| "learning_rate": 1.8129594595177093e-05, | |
| "loss": 1.154591178894043, | |
| "memory(GiB)": 74.93, | |
| "step": 535, | |
| "token_acc": 0.5926477893691009, | |
| "train_speed(iter/s)": 0.056695 | |
| }, | |
| { | |
| "epoch": 0.3991868416189244, | |
| "grad_norm": 1.3245067788741383, | |
| "learning_rate": 1.809562970824462e-05, | |
| "loss": 1.157964324951172, | |
| "memory(GiB)": 74.93, | |
| "step": 540, | |
| "token_acc": 0.6192792394428477, | |
| "train_speed(iter/s)": 0.056758 | |
| }, | |
| { | |
| "epoch": 0.4028830160783589, | |
| "grad_norm": 1.3057962329498682, | |
| "learning_rate": 1.806139162439541e-05, | |
| "loss": 1.1371761322021485, | |
| "memory(GiB)": 74.93, | |
| "step": 545, | |
| "token_acc": 0.596340150699677, | |
| "train_speed(iter/s)": 0.056815 | |
| }, | |
| { | |
| "epoch": 0.4065791905377934, | |
| "grad_norm": 1.25005365154622, | |
| "learning_rate": 1.8026881499035437e-05, | |
| "loss": 1.1124300956726074, | |
| "memory(GiB)": 74.93, | |
| "step": 550, | |
| "token_acc": 0.6204881402543829, | |
| "train_speed(iter/s)": 0.056864 | |
| }, | |
| { | |
| "epoch": 0.4065791905377934, | |
| "eval_loss": 0.7460726499557495, | |
| "eval_runtime": 88.6273, | |
| "eval_samples_per_second": 78.926, | |
| "eval_steps_per_second": 0.621, | |
| "eval_token_acc": 0.6162869352221019, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.4102753649972279, | |
| "grad_norm": 1.1926510177467409, | |
| "learning_rate": 1.7992100496751054e-05, | |
| "loss": 1.1571131706237794, | |
| "memory(GiB)": 74.93, | |
| "step": 555, | |
| "token_acc": 0.6311389759665622, | |
| "train_speed(iter/s)": 0.056299 | |
| }, | |
| { | |
| "epoch": 0.4139715394566624, | |
| "grad_norm": 1.1989503074947894, | |
| "learning_rate": 1.7957049791269684e-05, | |
| "loss": 1.1516962051391602, | |
| "memory(GiB)": 74.93, | |
| "step": 560, | |
| "token_acc": 0.5952788231269244, | |
| "train_speed(iter/s)": 0.056369 | |
| }, | |
| { | |
| "epoch": 0.41766771391609686, | |
| "grad_norm": 1.1212233051313498, | |
| "learning_rate": 1.792173056542021e-05, | |
| "loss": 1.1592437744140625, | |
| "memory(GiB)": 74.93, | |
| "step": 565, | |
| "token_acc": 0.5976621417797888, | |
| "train_speed(iter/s)": 0.056413 | |
| }, | |
| { | |
| "epoch": 0.42136388837553135, | |
| "grad_norm": 1.1553604640842632, | |
| "learning_rate": 1.7886144011093067e-05, | |
| "loss": 1.1524188041687011, | |
| "memory(GiB)": 74.93, | |
| "step": 570, | |
| "token_acc": 0.6424742268041237, | |
| "train_speed(iter/s)": 0.056462 | |
| }, | |
| { | |
| "epoch": 0.42506006283496583, | |
| "grad_norm": 1.183725532275657, | |
| "learning_rate": 1.7850291329200015e-05, | |
| "loss": 1.1416030883789063, | |
| "memory(GiB)": 74.93, | |
| "step": 575, | |
| "token_acc": 0.6029700196133371, | |
| "train_speed(iter/s)": 0.056533 | |
| }, | |
| { | |
| "epoch": 0.4287562372944003, | |
| "grad_norm": 1.2480769087109442, | |
| "learning_rate": 1.7814173729633607e-05, | |
| "loss": 1.164370059967041, | |
| "memory(GiB)": 74.93, | |
| "step": 580, | |
| "token_acc": 0.6192486281131279, | |
| "train_speed(iter/s)": 0.056588 | |
| }, | |
| { | |
| "epoch": 0.43245241175383475, | |
| "grad_norm": 1.3104680757325256, | |
| "learning_rate": 1.7777792431226384e-05, | |
| "loss": 1.119395637512207, | |
| "memory(GiB)": 74.93, | |
| "step": 585, | |
| "token_acc": 0.6305528922978587, | |
| "train_speed(iter/s)": 0.056638 | |
| }, | |
| { | |
| "epoch": 0.43614858621326924, | |
| "grad_norm": 1.213929814999547, | |
| "learning_rate": 1.7741148661709707e-05, | |
| "loss": 1.1547592163085938, | |
| "memory(GiB)": 74.93, | |
| "step": 590, | |
| "token_acc": 0.6233905579399142, | |
| "train_speed(iter/s)": 0.056711 | |
| }, | |
| { | |
| "epoch": 0.4398447606727037, | |
| "grad_norm": 1.2155093557171206, | |
| "learning_rate": 1.770424365767236e-05, | |
| "loss": 1.1199445724487305, | |
| "memory(GiB)": 74.93, | |
| "step": 595, | |
| "token_acc": 0.6336528221512248, | |
| "train_speed(iter/s)": 0.056773 | |
| }, | |
| { | |
| "epoch": 0.4435409351321382, | |
| "grad_norm": 1.3908702173841363, | |
| "learning_rate": 1.7667078664518796e-05, | |
| "loss": 1.157416534423828, | |
| "memory(GiB)": 74.93, | |
| "step": 600, | |
| "token_acc": 0.6181159420289855, | |
| "train_speed(iter/s)": 0.056815 | |
| }, | |
| { | |
| "epoch": 0.4435409351321382, | |
| "eval_loss": 0.7338850498199463, | |
| "eval_runtime": 85.3003, | |
| "eval_samples_per_second": 82.004, | |
| "eval_steps_per_second": 0.645, | |
| "eval_token_acc": 0.6175324420475716, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.4472371095915727, | |
| "grad_norm": 1.022281205691788, | |
| "learning_rate": 1.7629654936427126e-05, | |
| "loss": 1.1211700439453125, | |
| "memory(GiB)": 74.93, | |
| "step": 605, | |
| "token_acc": 0.6267794070427057, | |
| "train_speed(iter/s)": 0.056289 | |
| }, | |
| { | |
| "epoch": 0.4509332840510072, | |
| "grad_norm": 1.1115715050120814, | |
| "learning_rate": 1.7591973736306774e-05, | |
| "loss": 1.1568084716796876, | |
| "memory(GiB)": 74.93, | |
| "step": 610, | |
| "token_acc": 0.6001278227524499, | |
| "train_speed(iter/s)": 0.056358 | |
| }, | |
| { | |
| "epoch": 0.4546294585104417, | |
| "grad_norm": 1.2942894072539404, | |
| "learning_rate": 1.755403633575589e-05, | |
| "loss": 1.1330131530761718, | |
| "memory(GiB)": 74.93, | |
| "step": 615, | |
| "token_acc": 0.6048237476808905, | |
| "train_speed(iter/s)": 0.056424 | |
| }, | |
| { | |
| "epoch": 0.45832563296987616, | |
| "grad_norm": 1.2115375753993367, | |
| "learning_rate": 1.7515844015018416e-05, | |
| "loss": 1.1604066848754884, | |
| "memory(GiB)": 74.93, | |
| "step": 620, | |
| "token_acc": 0.6332541567695962, | |
| "train_speed(iter/s)": 0.05648 | |
| }, | |
| { | |
| "epoch": 0.46202180742931065, | |
| "grad_norm": 1.1168616761395809, | |
| "learning_rate": 1.7477398062940868e-05, | |
| "loss": 1.1492230415344238, | |
| "memory(GiB)": 74.93, | |
| "step": 625, | |
| "token_acc": 0.6326703343207787, | |
| "train_speed(iter/s)": 0.056541 | |
| }, | |
| { | |
| "epoch": 0.46571798188874514, | |
| "grad_norm": 1.3080238975825687, | |
| "learning_rate": 1.7438699776928892e-05, | |
| "loss": 1.159599494934082, | |
| "memory(GiB)": 74.93, | |
| "step": 630, | |
| "token_acc": 0.5911352329262777, | |
| "train_speed(iter/s)": 0.056603 | |
| }, | |
| { | |
| "epoch": 0.4694141563481796, | |
| "grad_norm": 1.270157306289422, | |
| "learning_rate": 1.739975046290343e-05, | |
| "loss": 1.1172502517700196, | |
| "memory(GiB)": 74.93, | |
| "step": 635, | |
| "token_acc": 0.6800878477306003, | |
| "train_speed(iter/s)": 0.05664 | |
| }, | |
| { | |
| "epoch": 0.4731103308076141, | |
| "grad_norm": 1.1591581275323428, | |
| "learning_rate": 1.7360551435256673e-05, | |
| "loss": 1.1474403381347655, | |
| "memory(GiB)": 74.93, | |
| "step": 640, | |
| "token_acc": 0.6703857188396557, | |
| "train_speed(iter/s)": 0.056691 | |
| }, | |
| { | |
| "epoch": 0.4768065052670486, | |
| "grad_norm": 1.3849471969434006, | |
| "learning_rate": 1.7321104016807716e-05, | |
| "loss": 1.1200141906738281, | |
| "memory(GiB)": 74.93, | |
| "step": 645, | |
| "token_acc": 0.6204099060631939, | |
| "train_speed(iter/s)": 0.056741 | |
| }, | |
| { | |
| "epoch": 0.4805026797264831, | |
| "grad_norm": 1.2181008696775872, | |
| "learning_rate": 1.7281409538757886e-05, | |
| "loss": 1.1367115020751952, | |
| "memory(GiB)": 74.93, | |
| "step": 650, | |
| "token_acc": 0.6141581632653061, | |
| "train_speed(iter/s)": 0.056787 | |
| }, | |
| { | |
| "epoch": 0.4805026797264831, | |
| "eval_loss": 0.7338098287582397, | |
| "eval_runtime": 86.3351, | |
| "eval_samples_per_second": 81.022, | |
| "eval_steps_per_second": 0.637, | |
| "eval_token_acc": 0.618567863384408, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.48419885418591757, | |
| "grad_norm": 1.2381127917004506, | |
| "learning_rate": 1.7241469340645856e-05, | |
| "loss": 1.1498327255249023, | |
| "memory(GiB)": 74.93, | |
| "step": 655, | |
| "token_acc": 0.6240238388820386, | |
| "train_speed(iter/s)": 0.056305 | |
| }, | |
| { | |
| "epoch": 0.48789502864535206, | |
| "grad_norm": 1.3545670040018443, | |
| "learning_rate": 1.720128477030241e-05, | |
| "loss": 1.123112392425537, | |
| "memory(GiB)": 74.93, | |
| "step": 660, | |
| "token_acc": 0.6101917520357236, | |
| "train_speed(iter/s)": 0.05635 | |
| }, | |
| { | |
| "epoch": 0.49159120310478654, | |
| "grad_norm": 1.2698188744774948, | |
| "learning_rate": 1.716085718380498e-05, | |
| "loss": 1.1386995315551758, | |
| "memory(GiB)": 74.93, | |
| "step": 665, | |
| "token_acc": 0.6005629477993859, | |
| "train_speed(iter/s)": 0.056398 | |
| }, | |
| { | |
| "epoch": 0.49528737756422103, | |
| "grad_norm": 1.4609798611237281, | |
| "learning_rate": 1.7120187945431874e-05, | |
| "loss": 1.1037940979003906, | |
| "memory(GiB)": 74.93, | |
| "step": 670, | |
| "token_acc": 0.6407727085902178, | |
| "train_speed(iter/s)": 0.056444 | |
| }, | |
| { | |
| "epoch": 0.4989835520236555, | |
| "grad_norm": 1.1805190661164426, | |
| "learning_rate": 1.707927842761623e-05, | |
| "loss": 1.1232402801513672, | |
| "memory(GiB)": 74.93, | |
| "step": 675, | |
| "token_acc": 0.5811437403400309, | |
| "train_speed(iter/s)": 0.05646 | |
| }, | |
| { | |
| "epoch": 0.50267972648309, | |
| "grad_norm": 1.1558010845800675, | |
| "learning_rate": 1.7038130010899716e-05, | |
| "loss": 1.1340635299682618, | |
| "memory(GiB)": 74.93, | |
| "step": 680, | |
| "token_acc": 0.6523545706371191, | |
| "train_speed(iter/s)": 0.056504 | |
| }, | |
| { | |
| "epoch": 0.5063759009425245, | |
| "grad_norm": 1.1790896957784056, | |
| "learning_rate": 1.6996744083885938e-05, | |
| "loss": 1.1378223419189453, | |
| "memory(GiB)": 74.93, | |
| "step": 685, | |
| "token_acc": 0.6573009791400596, | |
| "train_speed(iter/s)": 0.056546 | |
| }, | |
| { | |
| "epoch": 0.510072075401959, | |
| "grad_norm": 1.2335317128319008, | |
| "learning_rate": 1.695512204319357e-05, | |
| "loss": 1.1394284248352051, | |
| "memory(GiB)": 74.93, | |
| "step": 690, | |
| "token_acc": 0.6082870568133276, | |
| "train_speed(iter/s)": 0.056586 | |
| }, | |
| { | |
| "epoch": 0.5137682498613935, | |
| "grad_norm": 0.9893255166681467, | |
| "learning_rate": 1.6913265293409235e-05, | |
| "loss": 1.1198680877685547, | |
| "memory(GiB)": 74.93, | |
| "step": 695, | |
| "token_acc": 0.547270955165692, | |
| "train_speed(iter/s)": 0.05664 | |
| }, | |
| { | |
| "epoch": 0.517464424320828, | |
| "grad_norm": 1.1351076610632471, | |
| "learning_rate": 1.68711752470401e-05, | |
| "loss": 1.1366339683532716, | |
| "memory(GiB)": 74.93, | |
| "step": 700, | |
| "token_acc": 0.6295369211514393, | |
| "train_speed(iter/s)": 0.056675 | |
| }, | |
| { | |
| "epoch": 0.517464424320828, | |
| "eval_loss": 0.7255228757858276, | |
| "eval_runtime": 89.5144, | |
| "eval_samples_per_second": 78.144, | |
| "eval_steps_per_second": 0.614, | |
| "eval_token_acc": 0.6190699907885594, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.5211605987802624, | |
| "grad_norm": 1.0862208515121348, | |
| "learning_rate": 1.682885332446621e-05, | |
| "loss": 1.1369894981384276, | |
| "memory(GiB)": 74.93, | |
| "step": 705, | |
| "token_acc": 0.6288204532248692, | |
| "train_speed(iter/s)": 0.056212 | |
| }, | |
| { | |
| "epoch": 0.5248567732396969, | |
| "grad_norm": 1.1660653361907225, | |
| "learning_rate": 1.6786300953892563e-05, | |
| "loss": 1.1410274505615234, | |
| "memory(GiB)": 74.93, | |
| "step": 710, | |
| "token_acc": 0.6100605143721634, | |
| "train_speed(iter/s)": 0.056263 | |
| }, | |
| { | |
| "epoch": 0.5285529476991314, | |
| "grad_norm": 1.0896922974940084, | |
| "learning_rate": 1.674351957130089e-05, | |
| "loss": 1.1174249649047852, | |
| "memory(GiB)": 74.93, | |
| "step": 715, | |
| "token_acc": 0.6420308483290489, | |
| "train_speed(iter/s)": 0.056309 | |
| }, | |
| { | |
| "epoch": 0.5322491221585659, | |
| "grad_norm": 1.152348085956414, | |
| "learning_rate": 1.6700510620401223e-05, | |
| "loss": 1.1088247299194336, | |
| "memory(GiB)": 74.93, | |
| "step": 720, | |
| "token_acc": 0.6403995560488346, | |
| "train_speed(iter/s)": 0.056355 | |
| }, | |
| { | |
| "epoch": 0.5359452966180004, | |
| "grad_norm": 1.1236142627513106, | |
| "learning_rate": 1.6657275552583172e-05, | |
| "loss": 1.137843418121338, | |
| "memory(GiB)": 74.93, | |
| "step": 725, | |
| "token_acc": 0.5981665393430099, | |
| "train_speed(iter/s)": 0.056406 | |
| }, | |
| { | |
| "epoch": 0.5396414710774349, | |
| "grad_norm": 1.0869362324396392, | |
| "learning_rate": 1.6613815826866923e-05, | |
| "loss": 1.1183334350585938, | |
| "memory(GiB)": 74.93, | |
| "step": 730, | |
| "token_acc": 0.6076433121019108, | |
| "train_speed(iter/s)": 0.056454 | |
| }, | |
| { | |
| "epoch": 0.5433376455368694, | |
| "grad_norm": 1.0408539682832916, | |
| "learning_rate": 1.6570132909854027e-05, | |
| "loss": 1.1498143196105957, | |
| "memory(GiB)": 74.93, | |
| "step": 735, | |
| "token_acc": 0.6524312896405919, | |
| "train_speed(iter/s)": 0.0565 | |
| }, | |
| { | |
| "epoch": 0.5470338199963038, | |
| "grad_norm": 1.223295875198057, | |
| "learning_rate": 1.6526228275677892e-05, | |
| "loss": 1.091654109954834, | |
| "memory(GiB)": 74.93, | |
| "step": 740, | |
| "token_acc": 0.6982872200263505, | |
| "train_speed(iter/s)": 0.056544 | |
| }, | |
| { | |
| "epoch": 0.5507299944557383, | |
| "grad_norm": 1.1558442201312176, | |
| "learning_rate": 1.6482103405954056e-05, | |
| "loss": 1.1205904006958007, | |
| "memory(GiB)": 74.93, | |
| "step": 745, | |
| "token_acc": 0.6377204884667571, | |
| "train_speed(iter/s)": 0.056579 | |
| }, | |
| { | |
| "epoch": 0.5544261689151728, | |
| "grad_norm": 1.2784643735837162, | |
| "learning_rate": 1.6437759789730154e-05, | |
| "loss": 1.1237329483032226, | |
| "memory(GiB)": 74.93, | |
| "step": 750, | |
| "token_acc": 0.6141374837872893, | |
| "train_speed(iter/s)": 0.056631 | |
| }, | |
| { | |
| "epoch": 0.5544261689151728, | |
| "eval_loss": 0.7271792888641357, | |
| "eval_runtime": 87.6966, | |
| "eval_samples_per_second": 79.764, | |
| "eval_steps_per_second": 0.627, | |
| "eval_token_acc": 0.6196194451434468, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.5581223433746073, | |
| "grad_norm": 1.2055849293387977, | |
| "learning_rate": 1.6393198923435707e-05, | |
| "loss": 1.1234511375427245, | |
| "memory(GiB)": 74.93, | |
| "step": 755, | |
| "token_acc": 0.6244901356863398, | |
| "train_speed(iter/s)": 0.056217 | |
| }, | |
| { | |
| "epoch": 0.5618185178340418, | |
| "grad_norm": 1.1362509527796705, | |
| "learning_rate": 1.63484223108316e-05, | |
| "loss": 1.125691795349121, | |
| "memory(GiB)": 74.93, | |
| "step": 760, | |
| "token_acc": 0.6037473976405274, | |
| "train_speed(iter/s)": 0.05626 | |
| }, | |
| { | |
| "epoch": 0.5655146922934763, | |
| "grad_norm": 1.123275540757232, | |
| "learning_rate": 1.6303431462959327e-05, | |
| "loss": 1.1341413497924804, | |
| "memory(GiB)": 74.93, | |
| "step": 765, | |
| "token_acc": 0.6085106382978723, | |
| "train_speed(iter/s)": 0.056308 | |
| }, | |
| { | |
| "epoch": 0.5692108667529108, | |
| "grad_norm": 1.015989051360902, | |
| "learning_rate": 1.6258227898090037e-05, | |
| "loss": 1.1203922271728515, | |
| "memory(GiB)": 74.93, | |
| "step": 770, | |
| "token_acc": 0.601472134595163, | |
| "train_speed(iter/s)": 0.056355 | |
| }, | |
| { | |
| "epoch": 0.5729070412123453, | |
| "grad_norm": 1.189393051036189, | |
| "learning_rate": 1.6212813141673254e-05, | |
| "loss": 1.1124958038330077, | |
| "memory(GiB)": 74.93, | |
| "step": 775, | |
| "token_acc": 0.6260790549750114, | |
| "train_speed(iter/s)": 0.056399 | |
| }, | |
| { | |
| "epoch": 0.5766032156717797, | |
| "grad_norm": 1.1850051513280322, | |
| "learning_rate": 1.6167188726285433e-05, | |
| "loss": 1.114617919921875, | |
| "memory(GiB)": 74.93, | |
| "step": 780, | |
| "token_acc": 0.5942992874109264, | |
| "train_speed(iter/s)": 0.056434 | |
| }, | |
| { | |
| "epoch": 0.5802993901312142, | |
| "grad_norm": 1.0681729567626044, | |
| "learning_rate": 1.6121356191578213e-05, | |
| "loss": 1.1280495643615722, | |
| "memory(GiB)": 74.93, | |
| "step": 785, | |
| "token_acc": 0.705685618729097, | |
| "train_speed(iter/s)": 0.056481 | |
| }, | |
| { | |
| "epoch": 0.5839955645906487, | |
| "grad_norm": 1.2860183936318812, | |
| "learning_rate": 1.607531708422649e-05, | |
| "loss": 1.1495230674743653, | |
| "memory(GiB)": 74.93, | |
| "step": 790, | |
| "token_acc": 0.5793650793650794, | |
| "train_speed(iter/s)": 0.056516 | |
| }, | |
| { | |
| "epoch": 0.5876917390500832, | |
| "grad_norm": 1.0862282113312, | |
| "learning_rate": 1.6029072957876196e-05, | |
| "loss": 1.1175559997558593, | |
| "memory(GiB)": 74.93, | |
| "step": 795, | |
| "token_acc": 0.6226415094339622, | |
| "train_speed(iter/s)": 0.056552 | |
| }, | |
| { | |
| "epoch": 0.5913879135095177, | |
| "grad_norm": 1.1331799452220792, | |
| "learning_rate": 1.5982625373091877e-05, | |
| "loss": 1.0859192848205566, | |
| "memory(GiB)": 74.93, | |
| "step": 800, | |
| "token_acc": 0.597226235192141, | |
| "train_speed(iter/s)": 0.056592 | |
| }, | |
| { | |
| "epoch": 0.5913879135095177, | |
| "eval_loss": 0.7157755494117737, | |
| "eval_runtime": 88.6481, | |
| "eval_samples_per_second": 78.907, | |
| "eval_steps_per_second": 0.62, | |
| "eval_token_acc": 0.6206202370041347, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5950840879689522, | |
| "grad_norm": 1.108802407981979, | |
| "learning_rate": 1.593597589730404e-05, | |
| "loss": 1.147084617614746, | |
| "memory(GiB)": 74.93, | |
| "step": 805, | |
| "token_acc": 0.6168687401159726, | |
| "train_speed(iter/s)": 0.056208 | |
| }, | |
| { | |
| "epoch": 0.5987802624283867, | |
| "grad_norm": 0.9423602415844418, | |
| "learning_rate": 1.5889126104756245e-05, | |
| "loss": 1.1448484420776368, | |
| "memory(GiB)": 74.93, | |
| "step": 810, | |
| "token_acc": 0.5890688259109311, | |
| "train_speed(iter/s)": 0.056247 | |
| }, | |
| { | |
| "epoch": 0.6024764368878212, | |
| "grad_norm": 1.0816637490179923, | |
| "learning_rate": 1.5842077576451988e-05, | |
| "loss": 1.1083642959594726, | |
| "memory(GiB)": 74.93, | |
| "step": 815, | |
| "token_acc": 0.6413487738419619, | |
| "train_speed(iter/s)": 0.056285 | |
| }, | |
| { | |
| "epoch": 0.6061726113472556, | |
| "grad_norm": 1.135732608334688, | |
| "learning_rate": 1.5794831900101352e-05, | |
| "loss": 1.1130756378173827, | |
| "memory(GiB)": 74.93, | |
| "step": 820, | |
| "token_acc": 0.620497803806735, | |
| "train_speed(iter/s)": 0.056338 | |
| }, | |
| { | |
| "epoch": 0.6098687858066901, | |
| "grad_norm": 1.0156136928889437, | |
| "learning_rate": 1.5747390670067412e-05, | |
| "loss": 1.1423524856567382, | |
| "memory(GiB)": 74.93, | |
| "step": 825, | |
| "token_acc": 0.6086384564788424, | |
| "train_speed(iter/s)": 0.056378 | |
| }, | |
| { | |
| "epoch": 0.6135649602661246, | |
| "grad_norm": 1.233089498837372, | |
| "learning_rate": 1.5699755487312446e-05, | |
| "loss": 1.1060791969299317, | |
| "memory(GiB)": 74.93, | |
| "step": 830, | |
| "token_acc": 0.6365546218487395, | |
| "train_speed(iter/s)": 0.056416 | |
| }, | |
| { | |
| "epoch": 0.6172611347255591, | |
| "grad_norm": 1.1731325122439864, | |
| "learning_rate": 1.56519279593439e-05, | |
| "loss": 1.0863089561462402, | |
| "memory(GiB)": 74.93, | |
| "step": 835, | |
| "token_acc": 0.6160830090791181, | |
| "train_speed(iter/s)": 0.056451 | |
| }, | |
| { | |
| "epoch": 0.6209573091849935, | |
| "grad_norm": 1.1022360374731142, | |
| "learning_rate": 1.560390970016015e-05, | |
| "loss": 1.1188045501708985, | |
| "memory(GiB)": 74.93, | |
| "step": 840, | |
| "token_acc": 0.5851091817942646, | |
| "train_speed(iter/s)": 0.05649 | |
| }, | |
| { | |
| "epoch": 0.624653483644428, | |
| "grad_norm": 1.1163862966216507, | |
| "learning_rate": 1.5555702330196024e-05, | |
| "loss": 1.1088319778442384, | |
| "memory(GiB)": 74.93, | |
| "step": 845, | |
| "token_acc": 0.6556741028128031, | |
| "train_speed(iter/s)": 0.056533 | |
| }, | |
| { | |
| "epoch": 0.6283496581038625, | |
| "grad_norm": 1.1694067702393547, | |
| "learning_rate": 1.5507307476268126e-05, | |
| "loss": 1.1475400924682617, | |
| "memory(GiB)": 74.93, | |
| "step": 850, | |
| "token_acc": 0.6055389221556886, | |
| "train_speed(iter/s)": 0.056569 | |
| }, | |
| { | |
| "epoch": 0.6283496581038625, | |
| "eval_loss": 0.7119885683059692, | |
| "eval_runtime": 87.1877, | |
| "eval_samples_per_second": 80.229, | |
| "eval_steps_per_second": 0.631, | |
| "eval_token_acc": 0.621244721890677, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.6320458325632969, | |
| "grad_norm": 1.1865540340685679, | |
| "learning_rate": 1.5458726771519946e-05, | |
| "loss": 1.135090446472168, | |
| "memory(GiB)": 74.93, | |
| "step": 855, | |
| "token_acc": 0.6295323704676296, | |
| "train_speed(iter/s)": 0.056205 | |
| }, | |
| { | |
| "epoch": 0.6357420070227314, | |
| "grad_norm": 0.9908463678598523, | |
| "learning_rate": 1.5409961855366718e-05, | |
| "loss": 1.110205078125, | |
| "memory(GiB)": 74.93, | |
| "step": 860, | |
| "token_acc": 0.6002865329512894, | |
| "train_speed(iter/s)": 0.056248 | |
| }, | |
| { | |
| "epoch": 0.6394381814821659, | |
| "grad_norm": 1.1394579815051238, | |
| "learning_rate": 1.5361014373440125e-05, | |
| "loss": 1.131001091003418, | |
| "memory(GiB)": 74.93, | |
| "step": 865, | |
| "token_acc": 0.6846254927726675, | |
| "train_speed(iter/s)": 0.056284 | |
| }, | |
| { | |
| "epoch": 0.6431343559416004, | |
| "grad_norm": 1.2277455515675866, | |
| "learning_rate": 1.5311885977532756e-05, | |
| "loss": 1.1217898368835448, | |
| "memory(GiB)": 74.93, | |
| "step": 870, | |
| "token_acc": 0.5979188900747066, | |
| "train_speed(iter/s)": 0.056322 | |
| }, | |
| { | |
| "epoch": 0.6468305304010349, | |
| "grad_norm": 1.163464153725413, | |
| "learning_rate": 1.5262578325542366e-05, | |
| "loss": 1.096768569946289, | |
| "memory(GiB)": 74.93, | |
| "step": 875, | |
| "token_acc": 0.6008762322015334, | |
| "train_speed(iter/s)": 0.056371 | |
| }, | |
| { | |
| "epoch": 0.6505267048604694, | |
| "grad_norm": 1.0920480508914876, | |
| "learning_rate": 1.521309308141592e-05, | |
| "loss": 1.1257577896118165, | |
| "memory(GiB)": 74.93, | |
| "step": 880, | |
| "token_acc": 0.6577503429355281, | |
| "train_speed(iter/s)": 0.056412 | |
| }, | |
| { | |
| "epoch": 0.6542228793199039, | |
| "grad_norm": 1.1338180174479229, | |
| "learning_rate": 1.5163431915093443e-05, | |
| "loss": 1.1262746810913087, | |
| "memory(GiB)": 74.93, | |
| "step": 885, | |
| "token_acc": 0.6306549628629304, | |
| "train_speed(iter/s)": 0.056447 | |
| }, | |
| { | |
| "epoch": 0.6579190537793383, | |
| "grad_norm": 1.295043254051827, | |
| "learning_rate": 1.511359650245168e-05, | |
| "loss": 1.1621430397033692, | |
| "memory(GiB)": 74.93, | |
| "step": 890, | |
| "token_acc": 0.6065481230595541, | |
| "train_speed(iter/s)": 0.056485 | |
| }, | |
| { | |
| "epoch": 0.6616152282387728, | |
| "grad_norm": 1.1985531473315896, | |
| "learning_rate": 1.506358852524752e-05, | |
| "loss": 1.1280719757080078, | |
| "memory(GiB)": 74.93, | |
| "step": 895, | |
| "token_acc": 0.6419322709163346, | |
| "train_speed(iter/s)": 0.056523 | |
| }, | |
| { | |
| "epoch": 0.6653114026982073, | |
| "grad_norm": 1.0909942367098966, | |
| "learning_rate": 1.5013409671061267e-05, | |
| "loss": 1.125238800048828, | |
| "memory(GiB)": 74.93, | |
| "step": 900, | |
| "token_acc": 0.599232245681382, | |
| "train_speed(iter/s)": 0.056559 | |
| }, | |
| { | |
| "epoch": 0.6653114026982073, | |
| "eval_loss": 0.7135615348815918, | |
| "eval_runtime": 87.1706, | |
| "eval_samples_per_second": 80.245, | |
| "eval_steps_per_second": 0.631, | |
| "eval_token_acc": 0.6218034107725374, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.6690075771576418, | |
| "grad_norm": 1.1857146226848603, | |
| "learning_rate": 1.4963061633239665e-05, | |
| "loss": 1.1094846725463867, | |
| "memory(GiB)": 74.93, | |
| "step": 905, | |
| "token_acc": 0.6268454980245374, | |
| "train_speed(iter/s)": 0.056203 | |
| }, | |
| { | |
| "epoch": 0.6727037516170763, | |
| "grad_norm": 0.9662742881806529, | |
| "learning_rate": 1.4912546110838775e-05, | |
| "loss": 1.1187602996826171, | |
| "memory(GiB)": 74.93, | |
| "step": 910, | |
| "token_acc": 0.6091391268869849, | |
| "train_speed(iter/s)": 0.056241 | |
| }, | |
| { | |
| "epoch": 0.6763999260765108, | |
| "grad_norm": 1.0584302453369157, | |
| "learning_rate": 1.4861864808566624e-05, | |
| "loss": 1.101078701019287, | |
| "memory(GiB)": 74.93, | |
| "step": 915, | |
| "token_acc": 0.5681592039800994, | |
| "train_speed(iter/s)": 0.056284 | |
| }, | |
| { | |
| "epoch": 0.6800961005359453, | |
| "grad_norm": 1.1605002634031412, | |
| "learning_rate": 1.4811019436725684e-05, | |
| "loss": 1.146175003051758, | |
| "memory(GiB)": 74.93, | |
| "step": 920, | |
| "token_acc": 0.63498674744415, | |
| "train_speed(iter/s)": 0.056321 | |
| }, | |
| { | |
| "epoch": 0.6837922749953798, | |
| "grad_norm": 1.0137203677446553, | |
| "learning_rate": 1.4760011711155164e-05, | |
| "loss": 1.1349545478820802, | |
| "memory(GiB)": 74.93, | |
| "step": 925, | |
| "token_acc": 0.6199203187250996, | |
| "train_speed(iter/s)": 0.056361 | |
| }, | |
| { | |
| "epoch": 0.6874884494548142, | |
| "grad_norm": 1.183534701619676, | |
| "learning_rate": 1.4708843353173084e-05, | |
| "loss": 1.0977567672729491, | |
| "memory(GiB)": 74.93, | |
| "step": 930, | |
| "token_acc": 0.6462346760070052, | |
| "train_speed(iter/s)": 0.056403 | |
| }, | |
| { | |
| "epoch": 0.6911846239142487, | |
| "grad_norm": 1.1575204207505418, | |
| "learning_rate": 1.4657516089518211e-05, | |
| "loss": 1.1138565063476562, | |
| "memory(GiB)": 74.93, | |
| "step": 935, | |
| "token_acc": 0.6146223888591323, | |
| "train_speed(iter/s)": 0.056436 | |
| }, | |
| { | |
| "epoch": 0.6948807983736832, | |
| "grad_norm": 1.1418054839263487, | |
| "learning_rate": 1.4606031652291772e-05, | |
| "loss": 1.1173955917358398, | |
| "memory(GiB)": 74.93, | |
| "step": 940, | |
| "token_acc": 0.6329457364341086, | |
| "train_speed(iter/s)": 0.056463 | |
| }, | |
| { | |
| "epoch": 0.6985769728331177, | |
| "grad_norm": 1.0817591968148002, | |
| "learning_rate": 1.4554391778899016e-05, | |
| "loss": 1.0996898651123046, | |
| "memory(GiB)": 74.93, | |
| "step": 945, | |
| "token_acc": 0.6234177215189873, | |
| "train_speed(iter/s)": 0.056501 | |
| }, | |
| { | |
| "epoch": 0.7022731472925522, | |
| "grad_norm": 1.072385635877129, | |
| "learning_rate": 1.4502598211990566e-05, | |
| "loss": 1.1042339324951171, | |
| "memory(GiB)": 74.93, | |
| "step": 950, | |
| "token_acc": 0.6252068394925537, | |
| "train_speed(iter/s)": 0.056535 | |
| }, | |
| { | |
| "epoch": 0.7022731472925522, | |
| "eval_loss": 0.7057685256004333, | |
| "eval_runtime": 86.3988, | |
| "eval_samples_per_second": 80.962, | |
| "eval_steps_per_second": 0.637, | |
| "eval_token_acc": 0.62234940217981, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.7059693217519867, | |
| "grad_norm": 0.9961167453619919, | |
| "learning_rate": 1.4450652699403626e-05, | |
| "loss": 1.1219955444335938, | |
| "memory(GiB)": 74.93, | |
| "step": 955, | |
| "token_acc": 0.6370088719898606, | |
| "train_speed(iter/s)": 0.056207 | |
| }, | |
| { | |
| "epoch": 0.7096654962114212, | |
| "grad_norm": 1.061517038375997, | |
| "learning_rate": 1.4398556994102996e-05, | |
| "loss": 1.1110521316528321, | |
| "memory(GiB)": 74.93, | |
| "step": 960, | |
| "token_acc": 0.592031029619182, | |
| "train_speed(iter/s)": 0.056234 | |
| }, | |
| { | |
| "epoch": 0.7133616706708557, | |
| "grad_norm": 1.0852009579100936, | |
| "learning_rate": 1.43463128541219e-05, | |
| "loss": 1.096040916442871, | |
| "memory(GiB)": 74.93, | |
| "step": 965, | |
| "token_acc": 0.6075691411935954, | |
| "train_speed(iter/s)": 0.056271 | |
| }, | |
| { | |
| "epoch": 0.7170578451302901, | |
| "grad_norm": 0.9770309231987666, | |
| "learning_rate": 1.4293922042502688e-05, | |
| "loss": 1.1151371002197266, | |
| "memory(GiB)": 74.93, | |
| "step": 970, | |
| "token_acc": 0.6337025316455697, | |
| "train_speed(iter/s)": 0.056306 | |
| }, | |
| { | |
| "epoch": 0.7207540195897246, | |
| "grad_norm": 1.1847784978202587, | |
| "learning_rate": 1.4241386327237312e-05, | |
| "loss": 1.1008172035217285, | |
| "memory(GiB)": 74.93, | |
| "step": 975, | |
| "token_acc": 0.6730158730158731, | |
| "train_speed(iter/s)": 0.05634 | |
| }, | |
| { | |
| "epoch": 0.7244501940491591, | |
| "grad_norm": 1.143052071292951, | |
| "learning_rate": 1.4188707481207677e-05, | |
| "loss": 1.083547878265381, | |
| "memory(GiB)": 74.93, | |
| "step": 980, | |
| "token_acc": 0.6250749850029994, | |
| "train_speed(iter/s)": 0.056381 | |
| }, | |
| { | |
| "epoch": 0.7281463685085936, | |
| "grad_norm": 1.0778857332369403, | |
| "learning_rate": 1.4135887282125815e-05, | |
| "loss": 1.1583375930786133, | |
| "memory(GiB)": 74.93, | |
| "step": 985, | |
| "token_acc": 0.6521739130434783, | |
| "train_speed(iter/s)": 0.056416 | |
| }, | |
| { | |
| "epoch": 0.7318425429680281, | |
| "grad_norm": 1.1338338646435362, | |
| "learning_rate": 1.4082927512473884e-05, | |
| "loss": 1.0937719345092773, | |
| "memory(GiB)": 74.93, | |
| "step": 990, | |
| "token_acc": 0.6181945090739879, | |
| "train_speed(iter/s)": 0.056448 | |
| }, | |
| { | |
| "epoch": 0.7355387174274626, | |
| "grad_norm": 1.085287732158945, | |
| "learning_rate": 1.4029829959444023e-05, | |
| "loss": 1.1042760848999023, | |
| "memory(GiB)": 74.93, | |
| "step": 995, | |
| "token_acc": 0.600328947368421, | |
| "train_speed(iter/s)": 0.056486 | |
| }, | |
| { | |
| "epoch": 0.7392348918868971, | |
| "grad_norm": 1.0122719878977164, | |
| "learning_rate": 1.3976596414878044e-05, | |
| "loss": 1.1351425170898437, | |
| "memory(GiB)": 74.93, | |
| "step": 1000, | |
| "token_acc": 0.8054474708171206, | |
| "train_speed(iter/s)": 0.056528 | |
| }, | |
| { | |
| "epoch": 0.7392348918868971, | |
| "eval_loss": 0.7091466784477234, | |
| "eval_runtime": 87.3344, | |
| "eval_samples_per_second": 80.094, | |
| "eval_steps_per_second": 0.63, | |
| "eval_token_acc": 0.622888467691853, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.7429310663463315, | |
| "grad_norm": 1.1226018601296495, | |
| "learning_rate": 1.392322867520695e-05, | |
| "loss": 1.088837242126465, | |
| "memory(GiB)": 74.93, | |
| "step": 1005, | |
| "token_acc": 0.6355053191489362, | |
| "train_speed(iter/s)": 0.056225 | |
| }, | |
| { | |
| "epoch": 0.746627240805766, | |
| "grad_norm": 1.021565181098161, | |
| "learning_rate": 1.3869728541390333e-05, | |
| "loss": 1.1350063323974608, | |
| "memory(GiB)": 74.93, | |
| "step": 1010, | |
| "token_acc": 0.6212718064153067, | |
| "train_speed(iter/s)": 0.056258 | |
| }, | |
| { | |
| "epoch": 0.7503234152652005, | |
| "grad_norm": 1.2612224567220394, | |
| "learning_rate": 1.3816097818855575e-05, | |
| "loss": 1.1172313690185547, | |
| "memory(GiB)": 74.93, | |
| "step": 1015, | |
| "token_acc": 0.5992337164750958, | |
| "train_speed(iter/s)": 0.056287 | |
| }, | |
| { | |
| "epoch": 0.754019589724635, | |
| "grad_norm": 1.1387539267847184, | |
| "learning_rate": 1.3762338317436948e-05, | |
| "loss": 1.1132306098937987, | |
| "memory(GiB)": 74.93, | |
| "step": 1020, | |
| "token_acc": 0.6117302052785923, | |
| "train_speed(iter/s)": 0.056327 | |
| }, | |
| { | |
| "epoch": 0.7577157641840695, | |
| "grad_norm": 1.358536367466617, | |
| "learning_rate": 1.3708451851314511e-05, | |
| "loss": 1.1005128860473632, | |
| "memory(GiB)": 74.93, | |
| "step": 1025, | |
| "token_acc": 0.6442417331812998, | |
| "train_speed(iter/s)": 0.05636 | |
| }, | |
| { | |
| "epoch": 0.761411938643504, | |
| "grad_norm": 1.0707791903089035, | |
| "learning_rate": 1.3654440238952913e-05, | |
| "loss": 1.0914304733276368, | |
| "memory(GiB)": 74.93, | |
| "step": 1030, | |
| "token_acc": 0.6064616582327754, | |
| "train_speed(iter/s)": 0.056391 | |
| }, | |
| { | |
| "epoch": 0.7651081131029385, | |
| "grad_norm": 1.116060507051338, | |
| "learning_rate": 1.3600305303040007e-05, | |
| "loss": 1.1009283065795898, | |
| "memory(GiB)": 74.93, | |
| "step": 1035, | |
| "token_acc": 0.6307870370370371, | |
| "train_speed(iter/s)": 0.056425 | |
| }, | |
| { | |
| "epoch": 0.768804287562373, | |
| "grad_norm": 1.1278348104888696, | |
| "learning_rate": 1.3546048870425356e-05, | |
| "loss": 1.1028734207153321, | |
| "memory(GiB)": 74.93, | |
| "step": 1040, | |
| "token_acc": 0.5868608195055875, | |
| "train_speed(iter/s)": 0.056459 | |
| }, | |
| { | |
| "epoch": 0.7725004620218074, | |
| "grad_norm": 1.1153722062693998, | |
| "learning_rate": 1.349167277205858e-05, | |
| "loss": 1.124934768676758, | |
| "memory(GiB)": 74.93, | |
| "step": 1045, | |
| "token_acc": 0.6122199592668024, | |
| "train_speed(iter/s)": 0.056492 | |
| }, | |
| { | |
| "epoch": 0.7761966364812419, | |
| "grad_norm": 1.164884012561426, | |
| "learning_rate": 1.3437178842927554e-05, | |
| "loss": 1.1385189056396485, | |
| "memory(GiB)": 74.93, | |
| "step": 1050, | |
| "token_acc": 0.6258808456117874, | |
| "train_speed(iter/s)": 0.056526 | |
| }, | |
| { | |
| "epoch": 0.7761966364812419, | |
| "eval_loss": 0.7029861211776733, | |
| "eval_runtime": 88.4673, | |
| "eval_samples_per_second": 79.069, | |
| "eval_steps_per_second": 0.622, | |
| "eval_token_acc": 0.623123948129662, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.7798928109406764, | |
| "grad_norm": 1.3055581766553261, | |
| "learning_rate": 1.338256892199651e-05, | |
| "loss": 1.1020261764526367, | |
| "memory(GiB)": 74.93, | |
| "step": 1055, | |
| "token_acc": 0.6311363636363636, | |
| "train_speed(iter/s)": 0.056221 | |
| }, | |
| { | |
| "epoch": 0.7835889854001109, | |
| "grad_norm": 1.0395384668146148, | |
| "learning_rate": 1.3327844852143956e-05, | |
| "loss": 1.148073959350586, | |
| "memory(GiB)": 74.93, | |
| "step": 1060, | |
| "token_acc": 0.604885993485342, | |
| "train_speed(iter/s)": 0.05626 | |
| }, | |
| { | |
| "epoch": 0.7872851598595454, | |
| "grad_norm": 1.1665752727714136, | |
| "learning_rate": 1.3273008480100495e-05, | |
| "loss": 1.0979449272155761, | |
| "memory(GiB)": 74.93, | |
| "step": 1065, | |
| "token_acc": 0.6049382716049383, | |
| "train_speed(iter/s)": 0.05629 | |
| }, | |
| { | |
| "epoch": 0.7909813343189799, | |
| "grad_norm": 1.041985717329155, | |
| "learning_rate": 1.3218061656386517e-05, | |
| "loss": 1.1317058563232423, | |
| "memory(GiB)": 74.93, | |
| "step": 1070, | |
| "token_acc": 0.6433460076045627, | |
| "train_speed(iter/s)": 0.056314 | |
| }, | |
| { | |
| "epoch": 0.7946775087784144, | |
| "grad_norm": 1.0369279649431482, | |
| "learning_rate": 1.316300623524972e-05, | |
| "loss": 1.1089330673217774, | |
| "memory(GiB)": 74.93, | |
| "step": 1075, | |
| "token_acc": 0.6382868937048504, | |
| "train_speed(iter/s)": 0.056354 | |
| }, | |
| { | |
| "epoch": 0.7983736832378489, | |
| "grad_norm": 1.1949441156399458, | |
| "learning_rate": 1.3107844074602566e-05, | |
| "loss": 1.0892942428588868, | |
| "memory(GiB)": 74.93, | |
| "step": 1080, | |
| "token_acc": 0.6408912188728703, | |
| "train_speed(iter/s)": 0.056386 | |
| }, | |
| { | |
| "epoch": 0.8020698576972833, | |
| "grad_norm": 1.0363420805429473, | |
| "learning_rate": 1.305257703595957e-05, | |
| "loss": 1.0744206428527832, | |
| "memory(GiB)": 74.93, | |
| "step": 1085, | |
| "token_acc": 0.6147540983606558, | |
| "train_speed(iter/s)": 0.056414 | |
| }, | |
| { | |
| "epoch": 0.8057660321567178, | |
| "grad_norm": 0.9805753007460783, | |
| "learning_rate": 1.2997206984374486e-05, | |
| "loss": 1.1048744201660157, | |
| "memory(GiB)": 74.93, | |
| "step": 1090, | |
| "token_acc": 0.6329463792150359, | |
| "train_speed(iter/s)": 0.056452 | |
| }, | |
| { | |
| "epoch": 0.8094622066161523, | |
| "grad_norm": 1.078880274058704, | |
| "learning_rate": 1.2941735788377356e-05, | |
| "loss": 1.0897531509399414, | |
| "memory(GiB)": 74.93, | |
| "step": 1095, | |
| "token_acc": 0.6396155899626268, | |
| "train_speed(iter/s)": 0.056484 | |
| }, | |
| { | |
| "epoch": 0.8131583810755868, | |
| "grad_norm": 1.083885052316346, | |
| "learning_rate": 1.2886165319911474e-05, | |
| "loss": 1.1432035446166993, | |
| "memory(GiB)": 74.93, | |
| "step": 1100, | |
| "token_acc": 0.5973259929217459, | |
| "train_speed(iter/s)": 0.056505 | |
| }, | |
| { | |
| "epoch": 0.8131583810755868, | |
| "eval_loss": 0.6945818662643433, | |
| "eval_runtime": 86.4586, | |
| "eval_samples_per_second": 80.906, | |
| "eval_steps_per_second": 0.636, | |
| "eval_token_acc": 0.6239354321874054, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.8168545555350213, | |
| "grad_norm": 1.1507994138444235, | |
| "learning_rate": 1.2830497454270206e-05, | |
| "loss": 1.1136839866638184, | |
| "memory(GiB)": 74.93, | |
| "step": 1105, | |
| "token_acc": 0.6371170793117918, | |
| "train_speed(iter/s)": 0.05622 | |
| }, | |
| { | |
| "epoch": 0.8205507299944558, | |
| "grad_norm": 1.0133515901515742, | |
| "learning_rate": 1.2774734070033692e-05, | |
| "loss": 1.1166929244995116, | |
| "memory(GiB)": 74.93, | |
| "step": 1110, | |
| "token_acc": 0.6103855721393034, | |
| "train_speed(iter/s)": 0.056253 | |
| }, | |
| { | |
| "epoch": 0.8242469044538903, | |
| "grad_norm": 1.1857531032231587, | |
| "learning_rate": 1.2718877049005477e-05, | |
| "loss": 1.1120613098144532, | |
| "memory(GiB)": 74.93, | |
| "step": 1115, | |
| "token_acc": 0.6248982912937348, | |
| "train_speed(iter/s)": 0.056279 | |
| }, | |
| { | |
| "epoch": 0.8279430789133247, | |
| "grad_norm": 1.0147593247560383, | |
| "learning_rate": 1.2662928276148985e-05, | |
| "loss": 1.0828424453735352, | |
| "memory(GiB)": 74.93, | |
| "step": 1120, | |
| "token_acc": 0.6065897858319604, | |
| "train_speed(iter/s)": 0.056309 | |
| }, | |
| { | |
| "epoch": 0.8316392533727592, | |
| "grad_norm": 1.0535067736037584, | |
| "learning_rate": 1.2606889639523925e-05, | |
| "loss": 1.082409381866455, | |
| "memory(GiB)": 74.93, | |
| "step": 1125, | |
| "token_acc": 0.6383859286083807, | |
| "train_speed(iter/s)": 0.056339 | |
| }, | |
| { | |
| "epoch": 0.8353354278321937, | |
| "grad_norm": 1.090903289476391, | |
| "learning_rate": 1.255076303022256e-05, | |
| "loss": 1.1306575775146483, | |
| "memory(GiB)": 74.93, | |
| "step": 1130, | |
| "token_acc": 0.6113028472821398, | |
| "train_speed(iter/s)": 0.056373 | |
| }, | |
| { | |
| "epoch": 0.8390316022916282, | |
| "grad_norm": 1.1602057234017449, | |
| "learning_rate": 1.2494550342305906e-05, | |
| "loss": 1.1157353401184082, | |
| "memory(GiB)": 74.93, | |
| "step": 1135, | |
| "token_acc": 0.629865985960434, | |
| "train_speed(iter/s)": 0.0564 | |
| }, | |
| { | |
| "epoch": 0.8427277767510627, | |
| "grad_norm": 1.032443656861064, | |
| "learning_rate": 1.2438253472739805e-05, | |
| "loss": 1.0929494857788087, | |
| "memory(GiB)": 74.93, | |
| "step": 1140, | |
| "token_acc": 0.6280344557556774, | |
| "train_speed(iter/s)": 0.056434 | |
| }, | |
| { | |
| "epoch": 0.8464239512104972, | |
| "grad_norm": 1.122025726444444, | |
| "learning_rate": 1.2381874321330912e-05, | |
| "loss": 1.1178958892822266, | |
| "memory(GiB)": 74.93, | |
| "step": 1145, | |
| "token_acc": 0.6517412935323383, | |
| "train_speed(iter/s)": 0.056468 | |
| }, | |
| { | |
| "epoch": 0.8501201256699317, | |
| "grad_norm": 1.0829851308141574, | |
| "learning_rate": 1.2325414790662578e-05, | |
| "loss": 1.0894483566284179, | |
| "memory(GiB)": 74.93, | |
| "step": 1150, | |
| "token_acc": 0.6569058077110785, | |
| "train_speed(iter/s)": 0.05649 | |
| }, | |
| { | |
| "epoch": 0.8501201256699317, | |
| "eval_loss": 0.6932370066642761, | |
| "eval_runtime": 86.0146, | |
| "eval_samples_per_second": 81.323, | |
| "eval_steps_per_second": 0.639, | |
| "eval_token_acc": 0.6245899292866097, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.8538163001293662, | |
| "grad_norm": 1.3861087034460704, | |
| "learning_rate": 1.2268876786030654e-05, | |
| "loss": 1.1001951217651367, | |
| "memory(GiB)": 74.93, | |
| "step": 1155, | |
| "token_acc": 0.630185845691759, | |
| "train_speed(iter/s)": 0.056209 | |
| }, | |
| { | |
| "epoch": 0.8575124745888006, | |
| "grad_norm": 1.1867682331739955, | |
| "learning_rate": 1.2212262215379199e-05, | |
| "loss": 1.1211355209350586, | |
| "memory(GiB)": 74.93, | |
| "step": 1160, | |
| "token_acc": 0.6551724137931034, | |
| "train_speed(iter/s)": 0.056235 | |
| }, | |
| { | |
| "epoch": 0.8612086490482351, | |
| "grad_norm": 1.0901861719096644, | |
| "learning_rate": 1.215557298923607e-05, | |
| "loss": 1.0956010818481445, | |
| "memory(GiB)": 74.93, | |
| "step": 1165, | |
| "token_acc": 0.6244993324432577, | |
| "train_speed(iter/s)": 0.056271 | |
| }, | |
| { | |
| "epoch": 0.8649048235076695, | |
| "grad_norm": 1.0190543071260865, | |
| "learning_rate": 1.2098811020648475e-05, | |
| "loss": 1.1221609115600586, | |
| "memory(GiB)": 74.93, | |
| "step": 1170, | |
| "token_acc": 0.612531328320802, | |
| "train_speed(iter/s)": 0.056297 | |
| }, | |
| { | |
| "epoch": 0.868600997967104, | |
| "grad_norm": 1.055731899501751, | |
| "learning_rate": 1.2041978225118409e-05, | |
| "loss": 1.0942396163940429, | |
| "memory(GiB)": 74.93, | |
| "step": 1175, | |
| "token_acc": 0.61580547112462, | |
| "train_speed(iter/s)": 0.056324 | |
| }, | |
| { | |
| "epoch": 0.8722971724265385, | |
| "grad_norm": 1.1595911679468829, | |
| "learning_rate": 1.1985076520537995e-05, | |
| "loss": 1.1030941009521484, | |
| "memory(GiB)": 74.93, | |
| "step": 1180, | |
| "token_acc": 0.6299868478737396, | |
| "train_speed(iter/s)": 0.056356 | |
| }, | |
| { | |
| "epoch": 0.875993346885973, | |
| "grad_norm": 1.1461146239140465, | |
| "learning_rate": 1.1928107827124786e-05, | |
| "loss": 1.0970783233642578, | |
| "memory(GiB)": 74.93, | |
| "step": 1185, | |
| "token_acc": 0.644696639022261, | |
| "train_speed(iter/s)": 0.056381 | |
| }, | |
| { | |
| "epoch": 0.8796895213454075, | |
| "grad_norm": 1.0680776701688195, | |
| "learning_rate": 1.1871074067356952e-05, | |
| "loss": 1.079010009765625, | |
| "memory(GiB)": 74.93, | |
| "step": 1190, | |
| "token_acc": 0.6483679525222552, | |
| "train_speed(iter/s)": 0.056408 | |
| }, | |
| { | |
| "epoch": 0.8833856958048419, | |
| "grad_norm": 1.1205292458140585, | |
| "learning_rate": 1.1813977165908406e-05, | |
| "loss": 1.098078155517578, | |
| "memory(GiB)": 74.93, | |
| "step": 1195, | |
| "token_acc": 0.6183456183456183, | |
| "train_speed(iter/s)": 0.056441 | |
| }, | |
| { | |
| "epoch": 0.8870818702642764, | |
| "grad_norm": 1.073187725881319, | |
| "learning_rate": 1.1756819049583861e-05, | |
| "loss": 1.1022902488708497, | |
| "memory(GiB)": 74.93, | |
| "step": 1200, | |
| "token_acc": 0.6195414847161572, | |
| "train_speed(iter/s)": 0.056472 | |
| }, | |
| { | |
| "epoch": 0.8870818702642764, | |
| "eval_loss": 0.6976271271705627, | |
| "eval_runtime": 87.7392, | |
| "eval_samples_per_second": 79.725, | |
| "eval_steps_per_second": 0.627, | |
| "eval_token_acc": 0.6255041474569267, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.8907780447237109, | |
| "grad_norm": 1.0836927609908615, | |
| "learning_rate": 1.1699601647253791e-05, | |
| "loss": 1.0966317176818847, | |
| "memory(GiB)": 74.93, | |
| "step": 1205, | |
| "token_acc": 0.6305779078273592, | |
| "train_speed(iter/s)": 0.056207 | |
| }, | |
| { | |
| "epoch": 0.8944742191831454, | |
| "grad_norm": 1.1200101176242079, | |
| "learning_rate": 1.1642326889789352e-05, | |
| "loss": 1.1052473068237305, | |
| "memory(GiB)": 74.93, | |
| "step": 1210, | |
| "token_acc": 0.6330027051397655, | |
| "train_speed(iter/s)": 0.05623 | |
| }, | |
| { | |
| "epoch": 0.8981703936425799, | |
| "grad_norm": 0.8945893498959235, | |
| "learning_rate": 1.158499670999722e-05, | |
| "loss": 1.0987310409545898, | |
| "memory(GiB)": 74.93, | |
| "step": 1215, | |
| "token_acc": 0.6409691629955947, | |
| "train_speed(iter/s)": 0.05626 | |
| }, | |
| { | |
| "epoch": 0.9018665681020144, | |
| "grad_norm": 1.1729053883136484, | |
| "learning_rate": 1.1527613042554368e-05, | |
| "loss": 1.1048666000366212, | |
| "memory(GiB)": 74.93, | |
| "step": 1220, | |
| "token_acc": 0.6676938880328711, | |
| "train_speed(iter/s)": 0.056294 | |
| }, | |
| { | |
| "epoch": 0.9055627425614489, | |
| "grad_norm": 1.0443569914858049, | |
| "learning_rate": 1.147017782394277e-05, | |
| "loss": 1.081749439239502, | |
| "memory(GiB)": 74.93, | |
| "step": 1225, | |
| "token_acc": 0.608612895550797, | |
| "train_speed(iter/s)": 0.056319 | |
| }, | |
| { | |
| "epoch": 0.9092589170208834, | |
| "grad_norm": 1.2005283092061096, | |
| "learning_rate": 1.1412692992384058e-05, | |
| "loss": 1.091093158721924, | |
| "memory(GiB)": 74.93, | |
| "step": 1230, | |
| "token_acc": 0.606317160534028, | |
| "train_speed(iter/s)": 0.056348 | |
| }, | |
| { | |
| "epoch": 0.9129550914803178, | |
| "grad_norm": 1.0896928360432243, | |
| "learning_rate": 1.1355160487774119e-05, | |
| "loss": 1.1176409721374512, | |
| "memory(GiB)": 74.93, | |
| "step": 1235, | |
| "token_acc": 0.5716694772344013, | |
| "train_speed(iter/s)": 0.056377 | |
| }, | |
| { | |
| "epoch": 0.9166512659397523, | |
| "grad_norm": 1.09517195359763, | |
| "learning_rate": 1.1297582251617618e-05, | |
| "loss": 1.1004619598388672, | |
| "memory(GiB)": 74.93, | |
| "step": 1240, | |
| "token_acc": 0.6309497935231472, | |
| "train_speed(iter/s)": 0.056401 | |
| }, | |
| { | |
| "epoch": 0.9203474403991868, | |
| "grad_norm": 1.0558160321968586, | |
| "learning_rate": 1.1239960226962491e-05, | |
| "loss": 1.1076683044433593, | |
| "memory(GiB)": 74.93, | |
| "step": 1245, | |
| "token_acc": 0.624376731301939, | |
| "train_speed(iter/s)": 0.056433 | |
| }, | |
| { | |
| "epoch": 0.9240436148586213, | |
| "grad_norm": 1.167401656088389, | |
| "learning_rate": 1.1182296358334373e-05, | |
| "loss": 1.0801752090454102, | |
| "memory(GiB)": 74.93, | |
| "step": 1250, | |
| "token_acc": 0.6274625110261688, | |
| "train_speed(iter/s)": 0.056468 | |
| }, | |
| { | |
| "epoch": 0.9240436148586213, | |
| "eval_loss": 0.6896535158157349, | |
| "eval_runtime": 89.0061, | |
| "eval_samples_per_second": 78.59, | |
| "eval_steps_per_second": 0.618, | |
| "eval_token_acc": 0.6259704910690581, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.9277397893180558, | |
| "grad_norm": 1.2651651199409124, | |
| "learning_rate": 1.1124592591670964e-05, | |
| "loss": 1.0778679847717285, | |
| "memory(GiB)": 74.93, | |
| "step": 1255, | |
| "token_acc": 0.6440798016763074, | |
| "train_speed(iter/s)": 0.056224 | |
| }, | |
| { | |
| "epoch": 0.9314359637774903, | |
| "grad_norm": 1.0901265302180776, | |
| "learning_rate": 1.1066850874256387e-05, | |
| "loss": 1.0967378616333008, | |
| "memory(GiB)": 74.93, | |
| "step": 1260, | |
| "token_acc": 0.6274731486715659, | |
| "train_speed(iter/s)": 0.056248 | |
| }, | |
| { | |
| "epoch": 0.9351321382369248, | |
| "grad_norm": 1.0804226410639166, | |
| "learning_rate": 1.1009073154655452e-05, | |
| "loss": 1.0889236450195312, | |
| "memory(GiB)": 74.93, | |
| "step": 1265, | |
| "token_acc": 0.620845921450151, | |
| "train_speed(iter/s)": 0.056269 | |
| }, | |
| { | |
| "epoch": 0.9388283126963592, | |
| "grad_norm": 1.228390945564267, | |
| "learning_rate": 1.09512613826479e-05, | |
| "loss": 1.1092605590820312, | |
| "memory(GiB)": 74.93, | |
| "step": 1270, | |
| "token_acc": 0.6499229583975347, | |
| "train_speed(iter/s)": 0.056301 | |
| }, | |
| { | |
| "epoch": 0.9425244871557937, | |
| "grad_norm": 1.179672539170986, | |
| "learning_rate": 1.0893417509162624e-05, | |
| "loss": 1.099574661254883, | |
| "memory(GiB)": 74.93, | |
| "step": 1275, | |
| "token_acc": 0.6232127838519764, | |
| "train_speed(iter/s)": 0.056325 | |
| }, | |
| { | |
| "epoch": 0.9462206616152282, | |
| "grad_norm": 1.0309784047078987, | |
| "learning_rate": 1.0835543486211815e-05, | |
| "loss": 1.1081634521484376, | |
| "memory(GiB)": 74.93, | |
| "step": 1280, | |
| "token_acc": 0.6257142857142857, | |
| "train_speed(iter/s)": 0.056352 | |
| }, | |
| { | |
| "epoch": 0.9499168360746627, | |
| "grad_norm": 1.1083199849496777, | |
| "learning_rate": 1.0777641266825094e-05, | |
| "loss": 1.1096603393554687, | |
| "memory(GiB)": 74.93, | |
| "step": 1285, | |
| "token_acc": 0.6357894736842106, | |
| "train_speed(iter/s)": 0.056378 | |
| }, | |
| { | |
| "epoch": 0.9536130105340972, | |
| "grad_norm": 1.0035577075576465, | |
| "learning_rate": 1.0719712804983604e-05, | |
| "loss": 1.1045263290405274, | |
| "memory(GiB)": 74.93, | |
| "step": 1290, | |
| "token_acc": 0.6397618260006616, | |
| "train_speed(iter/s)": 0.056405 | |
| }, | |
| { | |
| "epoch": 0.9573091849935317, | |
| "grad_norm": 1.0502142381441943, | |
| "learning_rate": 1.0661760055554083e-05, | |
| "loss": 1.082082462310791, | |
| "memory(GiB)": 74.93, | |
| "step": 1295, | |
| "token_acc": 0.6266829865361077, | |
| "train_speed(iter/s)": 0.056429 | |
| }, | |
| { | |
| "epoch": 0.9610053594529662, | |
| "grad_norm": 1.2499115770499312, | |
| "learning_rate": 1.0603784974222862e-05, | |
| "loss": 1.098296546936035, | |
| "memory(GiB)": 74.93, | |
| "step": 1300, | |
| "token_acc": 0.6284748309541698, | |
| "train_speed(iter/s)": 0.056459 | |
| }, | |
| { | |
| "epoch": 0.9610053594529662, | |
| "eval_loss": 0.6888419389724731, | |
| "eval_runtime": 87.7552, | |
| "eval_samples_per_second": 79.71, | |
| "eval_steps_per_second": 0.627, | |
| "eval_token_acc": 0.6259658738055717, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.9647015339124007, | |
| "grad_norm": 1.2558210208759852, | |
| "learning_rate": 1.054578951742991e-05, | |
| "loss": 1.0757410049438476, | |
| "memory(GiB)": 74.93, | |
| "step": 1305, | |
| "token_acc": 0.6296939859059755, | |
| "train_speed(iter/s)": 0.056222 | |
| }, | |
| { | |
| "epoch": 0.9683977083718351, | |
| "grad_norm": 1.1509712834800971, | |
| "learning_rate": 1.048777564230278e-05, | |
| "loss": 1.1064401626586915, | |
| "memory(GiB)": 74.93, | |
| "step": 1310, | |
| "token_acc": 0.6144927536231884, | |
| "train_speed(iter/s)": 0.056247 | |
| }, | |
| { | |
| "epoch": 0.9720938828312696, | |
| "grad_norm": 1.1877122033430165, | |
| "learning_rate": 1.0429745306590573e-05, | |
| "loss": 1.0995939254760743, | |
| "memory(GiB)": 74.93, | |
| "step": 1315, | |
| "token_acc": 0.6551246537396122, | |
| "train_speed(iter/s)": 0.056264 | |
| }, | |
| { | |
| "epoch": 0.9757900572907041, | |
| "grad_norm": 1.0334473323989715, | |
| "learning_rate": 1.0371700468597886e-05, | |
| "loss": 1.0957868576049805, | |
| "memory(GiB)": 74.93, | |
| "step": 1320, | |
| "token_acc": 0.6152882205513784, | |
| "train_speed(iter/s)": 0.056289 | |
| }, | |
| { | |
| "epoch": 0.9794862317501386, | |
| "grad_norm": 1.0379714843668957, | |
| "learning_rate": 1.0313643087118692e-05, | |
| "loss": 1.0816888809204102, | |
| "memory(GiB)": 74.93, | |
| "step": 1325, | |
| "token_acc": 0.6423645320197044, | |
| "train_speed(iter/s)": 0.056319 | |
| }, | |
| { | |
| "epoch": 0.9831824062095731, | |
| "grad_norm": 1.0681169313465444, | |
| "learning_rate": 1.0255575121370277e-05, | |
| "loss": 1.0688974380493164, | |
| "memory(GiB)": 74.93, | |
| "step": 1330, | |
| "token_acc": 0.6287527459116427, | |
| "train_speed(iter/s)": 0.056343 | |
| }, | |
| { | |
| "epoch": 0.9868785806690076, | |
| "grad_norm": 1.1171758504896703, | |
| "learning_rate": 1.0197498530927102e-05, | |
| "loss": 1.099297332763672, | |
| "memory(GiB)": 74.93, | |
| "step": 1335, | |
| "token_acc": 0.6077836745008846, | |
| "train_speed(iter/s)": 0.056367 | |
| }, | |
| { | |
| "epoch": 0.9905747551284421, | |
| "grad_norm": 1.0576212439483514, | |
| "learning_rate": 1.0139415275654671e-05, | |
| "loss": 1.0867423057556151, | |
| "memory(GiB)": 74.93, | |
| "step": 1340, | |
| "token_acc": 0.6263262599469496, | |
| "train_speed(iter/s)": 0.056396 | |
| }, | |
| { | |
| "epoch": 0.9942709295878766, | |
| "grad_norm": 1.258815850774044, | |
| "learning_rate": 1.0081327315643406e-05, | |
| "loss": 1.1155497550964355, | |
| "memory(GiB)": 74.93, | |
| "step": 1345, | |
| "token_acc": 0.655549765502866, | |
| "train_speed(iter/s)": 0.056419 | |
| }, | |
| { | |
| "epoch": 0.997967104047311, | |
| "grad_norm": 1.0659691536136329, | |
| "learning_rate": 1.0023236611142499e-05, | |
| "loss": 1.057703685760498, | |
| "memory(GiB)": 74.93, | |
| "step": 1350, | |
| "token_acc": 0.712, | |
| "train_speed(iter/s)": 0.056446 | |
| }, | |
| { | |
| "epoch": 0.997967104047311, | |
| "eval_loss": 0.6881307363510132, | |
| "eval_runtime": 86.0221, | |
| "eval_samples_per_second": 81.316, | |
| "eval_steps_per_second": 0.639, | |
| "eval_token_acc": 0.626765814704599, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.0022177046756606, | |
| "grad_norm": 1.3069033033680353, | |
| "learning_rate": 9.965145122493756e-06, | |
| "loss": 1.2448784828186035, | |
| "memory(GiB)": 74.93, | |
| "step": 1355, | |
| "token_acc": 0.6295214105793451, | |
| "train_speed(iter/s)": 0.056171 | |
| }, | |
| { | |
| "epoch": 1.0059138791350952, | |
| "grad_norm": 0.9882434180756982, | |
| "learning_rate": 9.907054810065446e-06, | |
| "loss": 1.062336540222168, | |
| "memory(GiB)": 74.93, | |
| "step": 1360, | |
| "token_acc": 0.6483717526527625, | |
| "train_speed(iter/s)": 0.056192 | |
| }, | |
| { | |
| "epoch": 1.0096100535945296, | |
| "grad_norm": 1.2362454534970095, | |
| "learning_rate": 9.848967634186142e-06, | |
| "loss": 1.0906942367553711, | |
| "memory(GiB)": 74.93, | |
| "step": 1365, | |
| "token_acc": 0.6448347722536469, | |
| "train_speed(iter/s)": 0.056213 | |
| }, | |
| { | |
| "epoch": 1.0133062280539642, | |
| "grad_norm": 1.070334993285048, | |
| "learning_rate": 9.790885555078575e-06, | |
| "loss": 1.0470151901245117, | |
| "memory(GiB)": 74.93, | |
| "step": 1370, | |
| "token_acc": 0.6228728728728729, | |
| "train_speed(iter/s)": 0.056237 | |
| }, | |
| { | |
| "epoch": 1.0170024025133986, | |
| "grad_norm": 1.0576680139627181, | |
| "learning_rate": 9.732810532793465e-06, | |
| "loss": 1.0586755752563477, | |
| "memory(GiB)": 74.93, | |
| "step": 1375, | |
| "token_acc": 0.6435643564356436, | |
| "train_speed(iter/s)": 0.056266 | |
| }, | |
| { | |
| "epoch": 1.0206985769728332, | |
| "grad_norm": 1.0167739538945428, | |
| "learning_rate": 9.674744527143419e-06, | |
| "loss": 1.059821891784668, | |
| "memory(GiB)": 74.93, | |
| "step": 1380, | |
| "token_acc": 0.6397306397306397, | |
| "train_speed(iter/s)": 0.056291 | |
| }, | |
| { | |
| "epoch": 1.0243947514322675, | |
| "grad_norm": 1.1268503654686965, | |
| "learning_rate": 9.61668949763674e-06, | |
| "loss": 1.0377557754516602, | |
| "memory(GiB)": 74.93, | |
| "step": 1385, | |
| "token_acc": 0.6721439749608764, | |
| "train_speed(iter/s)": 0.056311 | |
| }, | |
| { | |
| "epoch": 1.0280909258917021, | |
| "grad_norm": 0.9931688648143746, | |
| "learning_rate": 9.558647403411334e-06, | |
| "loss": 1.0480243682861328, | |
| "memory(GiB)": 74.93, | |
| "step": 1390, | |
| "token_acc": 0.6135416666666667, | |
| "train_speed(iter/s)": 0.056336 | |
| }, | |
| { | |
| "epoch": 1.0317871003511365, | |
| "grad_norm": 1.1339232037274705, | |
| "learning_rate": 9.500620203168604e-06, | |
| "loss": 1.0579310417175294, | |
| "memory(GiB)": 74.93, | |
| "step": 1395, | |
| "token_acc": 0.6699186991869919, | |
| "train_speed(iter/s)": 0.056365 | |
| }, | |
| { | |
| "epoch": 1.0354832748105711, | |
| "grad_norm": 0.9738636619210117, | |
| "learning_rate": 9.442609855107317e-06, | |
| "loss": 1.0384546279907227, | |
| "memory(GiB)": 74.93, | |
| "step": 1400, | |
| "token_acc": 0.6303651505445227, | |
| "train_speed(iter/s)": 0.056383 | |
| }, | |
| { | |
| "epoch": 1.0354832748105711, | |
| "eval_loss": 0.6841524243354797, | |
| "eval_runtime": 86.5188, | |
| "eval_samples_per_second": 80.849, | |
| "eval_steps_per_second": 0.636, | |
| "eval_token_acc": 0.6267785121791868, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.0391794492700055, | |
| "grad_norm": 1.0076575163805248, | |
| "learning_rate": 9.38461831685756e-06, | |
| "loss": 1.0656241416931151, | |
| "memory(GiB)": 74.93, | |
| "step": 1405, | |
| "token_acc": 0.6295483423818875, | |
| "train_speed(iter/s)": 0.056156 | |
| }, | |
| { | |
| "epoch": 1.04287562372944, | |
| "grad_norm": 1.0590248393948134, | |
| "learning_rate": 9.326647545414647e-06, | |
| "loss": 1.0602170944213867, | |
| "memory(GiB)": 74.93, | |
| "step": 1410, | |
| "token_acc": 0.7284836065573771, | |
| "train_speed(iter/s)": 0.056177 | |
| }, | |
| { | |
| "epoch": 1.0465717981888745, | |
| "grad_norm": 1.0411050083571975, | |
| "learning_rate": 9.268699497073102e-06, | |
| "loss": 1.0623086929321288, | |
| "memory(GiB)": 74.93, | |
| "step": 1415, | |
| "token_acc": 0.6079059829059829, | |
| "train_speed(iter/s)": 0.056203 | |
| }, | |
| { | |
| "epoch": 1.050267972648309, | |
| "grad_norm": 1.0820280991464322, | |
| "learning_rate": 9.21077612736062e-06, | |
| "loss": 1.0742631912231446, | |
| "memory(GiB)": 74.93, | |
| "step": 1420, | |
| "token_acc": 0.6051423324150597, | |
| "train_speed(iter/s)": 0.056231 | |
| }, | |
| { | |
| "epoch": 1.0539641471077434, | |
| "grad_norm": 1.0150109672389387, | |
| "learning_rate": 9.152879390972085e-06, | |
| "loss": 1.060621452331543, | |
| "memory(GiB)": 74.93, | |
| "step": 1425, | |
| "token_acc": 0.6677704194260485, | |
| "train_speed(iter/s)": 0.056246 | |
| }, | |
| { | |
| "epoch": 1.057660321567178, | |
| "grad_norm": 1.0625464742964672, | |
| "learning_rate": 9.095011241703623e-06, | |
| "loss": 1.1060840606689453, | |
| "memory(GiB)": 74.93, | |
| "step": 1430, | |
| "token_acc": 0.617154288572143, | |
| "train_speed(iter/s)": 0.056275 | |
| }, | |
| { | |
| "epoch": 1.0613564960266124, | |
| "grad_norm": 1.080121630294682, | |
| "learning_rate": 9.037173632386635e-06, | |
| "loss": 1.051788902282715, | |
| "memory(GiB)": 74.93, | |
| "step": 1435, | |
| "token_acc": 0.693069306930693, | |
| "train_speed(iter/s)": 0.056295 | |
| }, | |
| { | |
| "epoch": 1.065052670486047, | |
| "grad_norm": 0.9965862626370368, | |
| "learning_rate": 8.979368514821917e-06, | |
| "loss": 1.0715249061584473, | |
| "memory(GiB)": 74.93, | |
| "step": 1440, | |
| "token_acc": 0.6563587166602242, | |
| "train_speed(iter/s)": 0.05632 | |
| }, | |
| { | |
| "epoch": 1.0687488449454814, | |
| "grad_norm": 1.0523645368442776, | |
| "learning_rate": 8.921597839713803e-06, | |
| "loss": 1.0732128143310546, | |
| "memory(GiB)": 74.93, | |
| "step": 1445, | |
| "token_acc": 0.6195273149941883, | |
| "train_speed(iter/s)": 0.056345 | |
| }, | |
| { | |
| "epoch": 1.072445019404916, | |
| "grad_norm": 0.9439502959144558, | |
| "learning_rate": 8.863863556604312e-06, | |
| "loss": 1.0644493103027344, | |
| "memory(GiB)": 74.93, | |
| "step": 1450, | |
| "token_acc": 0.6215469613259669, | |
| "train_speed(iter/s)": 0.056369 | |
| }, | |
| { | |
| "epoch": 1.072445019404916, | |
| "eval_loss": 0.6834661960601807, | |
| "eval_runtime": 87.5557, | |
| "eval_samples_per_second": 79.892, | |
| "eval_steps_per_second": 0.628, | |
| "eval_token_acc": 0.627048622093144, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.0761411938643504, | |
| "grad_norm": 1.1260430229381853, | |
| "learning_rate": 8.806167613807374e-06, | |
| "loss": 1.0463625907897949, | |
| "memory(GiB)": 74.93, | |
| "step": 1455, | |
| "token_acc": 0.6380742913000977, | |
| "train_speed(iter/s)": 0.05615 | |
| }, | |
| { | |
| "epoch": 1.079837368323785, | |
| "grad_norm": 1.1262155455903309, | |
| "learning_rate": 8.748511958343076e-06, | |
| "loss": 1.0758758544921876, | |
| "memory(GiB)": 74.93, | |
| "step": 1460, | |
| "token_acc": 0.6353591160220995, | |
| "train_speed(iter/s)": 0.056173 | |
| }, | |
| { | |
| "epoch": 1.0835335427832193, | |
| "grad_norm": 1.0836611872394941, | |
| "learning_rate": 8.690898535871967e-06, | |
| "loss": 1.0662212371826172, | |
| "memory(GiB)": 74.93, | |
| "step": 1465, | |
| "token_acc": 0.6074675324675325, | |
| "train_speed(iter/s)": 0.0562 | |
| }, | |
| { | |
| "epoch": 1.087229717242654, | |
| "grad_norm": 1.1980862381018496, | |
| "learning_rate": 8.633329290629385e-06, | |
| "loss": 1.042177963256836, | |
| "memory(GiB)": 74.93, | |
| "step": 1470, | |
| "token_acc": 0.6368200836820084, | |
| "train_speed(iter/s)": 0.056225 | |
| }, | |
| { | |
| "epoch": 1.0909258917020883, | |
| "grad_norm": 1.1395698139161996, | |
| "learning_rate": 8.575806165359852e-06, | |
| "loss": 1.0712276458740235, | |
| "memory(GiB)": 74.93, | |
| "step": 1475, | |
| "token_acc": 0.6389548693586699, | |
| "train_speed(iter/s)": 0.056249 | |
| }, | |
| { | |
| "epoch": 1.094622066161523, | |
| "grad_norm": 1.0531458891625334, | |
| "learning_rate": 8.51833110125153e-06, | |
| "loss": 1.0721662521362305, | |
| "memory(GiB)": 74.93, | |
| "step": 1480, | |
| "token_acc": 0.6220368744512731, | |
| "train_speed(iter/s)": 0.056271 | |
| }, | |
| { | |
| "epoch": 1.0983182406209573, | |
| "grad_norm": 0.952355580471414, | |
| "learning_rate": 8.460906037870677e-06, | |
| "loss": 1.018984603881836, | |
| "memory(GiB)": 74.93, | |
| "step": 1485, | |
| "token_acc": 0.6109256449165402, | |
| "train_speed(iter/s)": 0.056292 | |
| }, | |
| { | |
| "epoch": 1.1020144150803919, | |
| "grad_norm": 1.0722820285217056, | |
| "learning_rate": 8.403532913096231e-06, | |
| "loss": 1.0254201889038086, | |
| "memory(GiB)": 74.93, | |
| "step": 1490, | |
| "token_acc": 0.6746411483253588, | |
| "train_speed(iter/s)": 0.056313 | |
| }, | |
| { | |
| "epoch": 1.1057105895398263, | |
| "grad_norm": 1.0574628279248734, | |
| "learning_rate": 8.346213663054388e-06, | |
| "loss": 1.0446287155151368, | |
| "memory(GiB)": 74.93, | |
| "step": 1495, | |
| "token_acc": 0.6608030592734225, | |
| "train_speed(iter/s)": 0.056333 | |
| }, | |
| { | |
| "epoch": 1.1094067639992609, | |
| "grad_norm": 1.0816482005421177, | |
| "learning_rate": 8.288950222053287e-06, | |
| "loss": 1.0296789169311524, | |
| "memory(GiB)": 74.93, | |
| "step": 1500, | |
| "token_acc": 0.5984496124031008, | |
| "train_speed(iter/s)": 0.056359 | |
| }, | |
| { | |
| "epoch": 1.1094067639992609, | |
| "eval_loss": 0.6838507056236267, | |
| "eval_runtime": 89.1049, | |
| "eval_samples_per_second": 78.503, | |
| "eval_steps_per_second": 0.617, | |
| "eval_token_acc": 0.6276061566591329, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.1131029384586952, | |
| "grad_norm": 1.0669000768005377, | |
| "learning_rate": 8.231744522517713e-06, | |
| "loss": 1.052156925201416, | |
| "memory(GiB)": 74.93, | |
| "step": 1505, | |
| "token_acc": 0.6264432872990717, | |
| "train_speed(iter/s)": 0.056154 | |
| }, | |
| { | |
| "epoch": 1.1167991129181298, | |
| "grad_norm": 1.123101456521189, | |
| "learning_rate": 8.174598494923893e-06, | |
| "loss": 1.0532621383666991, | |
| "memory(GiB)": 74.93, | |
| "step": 1510, | |
| "token_acc": 0.6674074074074074, | |
| "train_speed(iter/s)": 0.056174 | |
| }, | |
| { | |
| "epoch": 1.1204952873775642, | |
| "grad_norm": 0.9597873108803062, | |
| "learning_rate": 8.117514067734365e-06, | |
| "loss": 1.0872188568115235, | |
| "memory(GiB)": 74.93, | |
| "step": 1515, | |
| "token_acc": 0.6229354939233406, | |
| "train_speed(iter/s)": 0.056193 | |
| }, | |
| { | |
| "epoch": 1.1241914618369988, | |
| "grad_norm": 1.01751822081855, | |
| "learning_rate": 8.060493167332874e-06, | |
| "loss": 1.0647924423217774, | |
| "memory(GiB)": 74.93, | |
| "step": 1520, | |
| "token_acc": 0.6589195979899497, | |
| "train_speed(iter/s)": 0.056222 | |
| }, | |
| { | |
| "epoch": 1.1278876362964332, | |
| "grad_norm": 1.2668018355322213, | |
| "learning_rate": 8.003537717959378e-06, | |
| "loss": 1.054795265197754, | |
| "memory(GiB)": 74.93, | |
| "step": 1525, | |
| "token_acc": 0.6280428432327166, | |
| "train_speed(iter/s)": 0.056242 | |
| }, | |
| { | |
| "epoch": 1.1315838107558678, | |
| "grad_norm": 1.0402787270589529, | |
| "learning_rate": 7.946649641645108e-06, | |
| "loss": 1.0737996101379395, | |
| "memory(GiB)": 74.93, | |
| "step": 1530, | |
| "token_acc": 0.6400172860847018, | |
| "train_speed(iter/s)": 0.056265 | |
| }, | |
| { | |
| "epoch": 1.1352799852153022, | |
| "grad_norm": 1.1860588895073847, | |
| "learning_rate": 7.889830858147718e-06, | |
| "loss": 1.0505868911743164, | |
| "memory(GiB)": 74.93, | |
| "step": 1535, | |
| "token_acc": 0.6243339253996447, | |
| "train_speed(iter/s)": 0.056293 | |
| }, | |
| { | |
| "epoch": 1.1389761596747365, | |
| "grad_norm": 1.0989591028912902, | |
| "learning_rate": 7.833083284886484e-06, | |
| "loss": 1.0597726821899414, | |
| "memory(GiB)": 74.93, | |
| "step": 1540, | |
| "token_acc": 0.6668341708542713, | |
| "train_speed(iter/s)": 0.056316 | |
| }, | |
| { | |
| "epoch": 1.1426723341341711, | |
| "grad_norm": 1.1347824812891065, | |
| "learning_rate": 7.7764088368776e-06, | |
| "loss": 1.0500106811523438, | |
| "memory(GiB)": 74.93, | |
| "step": 1545, | |
| "token_acc": 0.6302988186240445, | |
| "train_speed(iter/s)": 0.056337 | |
| }, | |
| { | |
| "epoch": 1.1463685085936057, | |
| "grad_norm": 1.0564162756732445, | |
| "learning_rate": 7.719809426669576e-06, | |
| "loss": 1.0577827453613282, | |
| "memory(GiB)": 74.93, | |
| "step": 1550, | |
| "token_acc": 0.6201646090534979, | |
| "train_speed(iter/s)": 0.056358 | |
| }, | |
| { | |
| "epoch": 1.1463685085936057, | |
| "eval_loss": 0.6770405769348145, | |
| "eval_runtime": 87.0148, | |
| "eval_samples_per_second": 80.389, | |
| "eval_steps_per_second": 0.632, | |
| "eval_token_acc": 0.6278058533049218, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.15006468305304, | |
| "grad_norm": 1.1665260843525407, | |
| "learning_rate": 7.663286964278665e-06, | |
| "loss": 1.046430492401123, | |
| "memory(GiB)": 74.93, | |
| "step": 1555, | |
| "token_acc": 0.6295910639909126, | |
| "train_speed(iter/s)": 0.056161 | |
| }, | |
| { | |
| "epoch": 1.1537608575124745, | |
| "grad_norm": 1.0893384767496972, | |
| "learning_rate": 7.606843357124426e-06, | |
| "loss": 1.0604162216186523, | |
| "memory(GiB)": 74.93, | |
| "step": 1560, | |
| "token_acc": 0.618162506638343, | |
| "train_speed(iter/s)": 0.056181 | |
| }, | |
| { | |
| "epoch": 1.157457031971909, | |
| "grad_norm": 1.0091311530942315, | |
| "learning_rate": 7.550480509965348e-06, | |
| "loss": 1.0764715194702148, | |
| "memory(GiB)": 74.93, | |
| "step": 1565, | |
| "token_acc": 0.6651108518086347, | |
| "train_speed(iter/s)": 0.056207 | |
| }, | |
| { | |
| "epoch": 1.1611532064313437, | |
| "grad_norm": 0.9991849558827516, | |
| "learning_rate": 7.494200324834588e-06, | |
| "loss": 1.076918888092041, | |
| "memory(GiB)": 74.93, | |
| "step": 1570, | |
| "token_acc": 0.6519940915805023, | |
| "train_speed(iter/s)": 0.056225 | |
| }, | |
| { | |
| "epoch": 1.164849380890778, | |
| "grad_norm": 1.1070133372574182, | |
| "learning_rate": 7.43800470097576e-06, | |
| "loss": 1.0360871315002442, | |
| "memory(GiB)": 74.93, | |
| "step": 1575, | |
| "token_acc": 0.6534121440085975, | |
| "train_speed(iter/s)": 0.056247 | |
| }, | |
| { | |
| "epoch": 1.1685455553502124, | |
| "grad_norm": 0.9616191113258434, | |
| "learning_rate": 7.381895534778852e-06, | |
| "loss": 1.071969223022461, | |
| "memory(GiB)": 74.93, | |
| "step": 1580, | |
| "token_acc": 0.6318518518518519, | |
| "train_speed(iter/s)": 0.05627 | |
| }, | |
| { | |
| "epoch": 1.172241729809647, | |
| "grad_norm": 0.9588896754114927, | |
| "learning_rate": 7.3258747197162484e-06, | |
| "loss": 1.0856236457824706, | |
| "memory(GiB)": 74.93, | |
| "step": 1585, | |
| "token_acc": 0.6137469586374696, | |
| "train_speed(iter/s)": 0.05629 | |
| }, | |
| { | |
| "epoch": 1.1759379042690816, | |
| "grad_norm": 1.155114349369357, | |
| "learning_rate": 7.269944146278801e-06, | |
| "loss": 1.054957962036133, | |
| "memory(GiB)": 74.93, | |
| "step": 1590, | |
| "token_acc": 0.6266263237518911, | |
| "train_speed(iter/s)": 0.056314 | |
| }, | |
| { | |
| "epoch": 1.179634078728516, | |
| "grad_norm": 1.0144629940415562, | |
| "learning_rate": 7.214105701912054e-06, | |
| "loss": 1.0508974075317383, | |
| "memory(GiB)": 74.93, | |
| "step": 1595, | |
| "token_acc": 0.6369260827092152, | |
| "train_speed(iter/s)": 0.056334 | |
| }, | |
| { | |
| "epoch": 1.1833302531879504, | |
| "grad_norm": 1.1824656228465167, | |
| "learning_rate": 7.1583612709525405e-06, | |
| "loss": 1.0430817604064941, | |
| "memory(GiB)": 74.93, | |
| "step": 1600, | |
| "token_acc": 0.6061151079136691, | |
| "train_speed(iter/s)": 0.056355 | |
| }, | |
| { | |
| "epoch": 1.1833302531879504, | |
| "eval_loss": 0.674736499786377, | |
| "eval_runtime": 85.716, | |
| "eval_samples_per_second": 81.607, | |
| "eval_steps_per_second": 0.642, | |
| "eval_token_acc": 0.6284499615612815, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.187026427647385, | |
| "grad_norm": 1.1524983234954504, | |
| "learning_rate": 7.102712734564202e-06, | |
| "loss": 1.046616268157959, | |
| "memory(GiB)": 74.93, | |
| "step": 1605, | |
| "token_acc": 0.6345166331770484, | |
| "train_speed(iter/s)": 0.056165 | |
| }, | |
| { | |
| "epoch": 1.1907226021068193, | |
| "grad_norm": 0.9309819347033588, | |
| "learning_rate": 7.047161970674896e-06, | |
| "loss": 1.0448005676269532, | |
| "memory(GiB)": 74.93, | |
| "step": 1610, | |
| "token_acc": 0.6130097087378641, | |
| "train_speed(iter/s)": 0.056187 | |
| }, | |
| { | |
| "epoch": 1.194418776566254, | |
| "grad_norm": 1.0772202352983227, | |
| "learning_rate": 6.991710853913025e-06, | |
| "loss": 1.0570079803466796, | |
| "memory(GiB)": 74.93, | |
| "step": 1615, | |
| "token_acc": 0.6610324349017817, | |
| "train_speed(iter/s)": 0.056205 | |
| }, | |
| { | |
| "epoch": 1.1981149510256883, | |
| "grad_norm": 1.1619152201928238, | |
| "learning_rate": 6.936361255544288e-06, | |
| "loss": 1.044645118713379, | |
| "memory(GiB)": 74.93, | |
| "step": 1620, | |
| "token_acc": 0.6945525291828794, | |
| "train_speed(iter/s)": 0.056227 | |
| }, | |
| { | |
| "epoch": 1.201811125485123, | |
| "grad_norm": 1.0467564412195258, | |
| "learning_rate": 6.881115043408512e-06, | |
| "loss": 1.045677661895752, | |
| "memory(GiB)": 74.93, | |
| "step": 1625, | |
| "token_acc": 0.648811228874248, | |
| "train_speed(iter/s)": 0.056246 | |
| }, | |
| { | |
| "epoch": 1.2055072999445573, | |
| "grad_norm": 1.0325120697680106, | |
| "learning_rate": 6.825974081856626e-06, | |
| "loss": 1.0619203567504882, | |
| "memory(GiB)": 74.93, | |
| "step": 1630, | |
| "token_acc": 0.6202729044834308, | |
| "train_speed(iter/s)": 0.056267 | |
| }, | |
| { | |
| "epoch": 1.209203474403992, | |
| "grad_norm": 0.9412938462579274, | |
| "learning_rate": 6.770940231687767e-06, | |
| "loss": 1.0478931427001954, | |
| "memory(GiB)": 74.93, | |
| "step": 1635, | |
| "token_acc": 0.6356352537199542, | |
| "train_speed(iter/s)": 0.056289 | |
| }, | |
| { | |
| "epoch": 1.2128996488634263, | |
| "grad_norm": 1.140398149863178, | |
| "learning_rate": 6.716015350086449e-06, | |
| "loss": 1.0618717193603515, | |
| "memory(GiB)": 74.93, | |
| "step": 1640, | |
| "token_acc": 0.6066892464013548, | |
| "train_speed(iter/s)": 0.05631 | |
| }, | |
| { | |
| "epoch": 1.2165958233228609, | |
| "grad_norm": 1.0930330137960338, | |
| "learning_rate": 6.661201290559918e-06, | |
| "loss": 1.0522537231445312, | |
| "memory(GiB)": 74.93, | |
| "step": 1645, | |
| "token_acc": 0.6371971185330714, | |
| "train_speed(iter/s)": 0.056329 | |
| }, | |
| { | |
| "epoch": 1.2202919977822952, | |
| "grad_norm": 1.0731043610961355, | |
| "learning_rate": 6.606499902875585e-06, | |
| "loss": 1.0263765335083008, | |
| "memory(GiB)": 74.93, | |
| "step": 1650, | |
| "token_acc": 0.6519023282226007, | |
| "train_speed(iter/s)": 0.056348 | |
| }, | |
| { | |
| "epoch": 1.2202919977822952, | |
| "eval_loss": 0.6756451725959778, | |
| "eval_runtime": 86.9798, | |
| "eval_samples_per_second": 80.421, | |
| "eval_steps_per_second": 0.632, | |
| "eval_token_acc": 0.6288528178004742, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.2239881722417298, | |
| "grad_norm": 1.131205462756531, | |
| "learning_rate": 6.5519130329986245e-06, | |
| "loss": 1.0687341690063477, | |
| "memory(GiB)": 74.93, | |
| "step": 1655, | |
| "token_acc": 0.6333847797696782, | |
| "train_speed(iter/s)": 0.056155 | |
| }, | |
| { | |
| "epoch": 1.2276843467011642, | |
| "grad_norm": 1.046052101501651, | |
| "learning_rate": 6.497442523029663e-06, | |
| "loss": 1.0175907135009765, | |
| "memory(GiB)": 74.93, | |
| "step": 1660, | |
| "token_acc": 0.6453744493392071, | |
| "train_speed(iter/s)": 0.056176 | |
| }, | |
| { | |
| "epoch": 1.2313805211605988, | |
| "grad_norm": 1.0553291483906215, | |
| "learning_rate": 6.443090211142613e-06, | |
| "loss": 1.0627668380737305, | |
| "memory(GiB)": 74.93, | |
| "step": 1665, | |
| "token_acc": 0.6409149762624082, | |
| "train_speed(iter/s)": 0.056196 | |
| }, | |
| { | |
| "epoch": 1.2350766956200332, | |
| "grad_norm": 0.9606710463766085, | |
| "learning_rate": 6.388857931522657e-06, | |
| "loss": 1.043929672241211, | |
| "memory(GiB)": 74.93, | |
| "step": 1670, | |
| "token_acc": 0.6334586466165414, | |
| "train_speed(iter/s)": 0.056218 | |
| }, | |
| { | |
| "epoch": 1.2387728700794678, | |
| "grad_norm": 0.9843358834706085, | |
| "learning_rate": 6.334747514304338e-06, | |
| "loss": 1.0336435317993165, | |
| "memory(GiB)": 74.93, | |
| "step": 1675, | |
| "token_acc": 0.6631016042780749, | |
| "train_speed(iter/s)": 0.056238 | |
| }, | |
| { | |
| "epoch": 1.2424690445389022, | |
| "grad_norm": 1.0297683983640094, | |
| "learning_rate": 6.280760785509802e-06, | |
| "loss": 1.0500383377075195, | |
| "memory(GiB)": 74.93, | |
| "step": 1680, | |
| "token_acc": 0.6349254639488896, | |
| "train_speed(iter/s)": 0.05626 | |
| }, | |
| { | |
| "epoch": 1.2461652189983368, | |
| "grad_norm": 1.0776782375280287, | |
| "learning_rate": 6.226899566987177e-06, | |
| "loss": 1.0217618942260742, | |
| "memory(GiB)": 74.93, | |
| "step": 1685, | |
| "token_acc": 0.655511811023622, | |
| "train_speed(iter/s)": 0.056281 | |
| }, | |
| { | |
| "epoch": 1.2498613934577711, | |
| "grad_norm": 1.0846016823921123, | |
| "learning_rate": 6.173165676349103e-06, | |
| "loss": 1.0370861053466798, | |
| "memory(GiB)": 74.93, | |
| "step": 1690, | |
| "token_acc": 0.6801365964712578, | |
| "train_speed(iter/s)": 0.056303 | |
| }, | |
| { | |
| "epoch": 1.2535575679172057, | |
| "grad_norm": 1.0790787844363594, | |
| "learning_rate": 6.119560926911377e-06, | |
| "loss": 1.0697561264038087, | |
| "memory(GiB)": 74.93, | |
| "step": 1695, | |
| "token_acc": 0.6681639528354857, | |
| "train_speed(iter/s)": 0.056324 | |
| }, | |
| { | |
| "epoch": 1.2572537423766401, | |
| "grad_norm": 1.106497642833312, | |
| "learning_rate": 6.066087127631761e-06, | |
| "loss": 1.0666908264160155, | |
| "memory(GiB)": 74.93, | |
| "step": 1700, | |
| "token_acc": 0.6533379694019471, | |
| "train_speed(iter/s)": 0.056341 | |
| }, | |
| { | |
| "epoch": 1.2572537423766401, | |
| "eval_loss": 0.6751002073287964, | |
| "eval_runtime": 88.5942, | |
| "eval_samples_per_second": 78.955, | |
| "eval_steps_per_second": 0.621, | |
| "eval_token_acc": 0.6288689782226767, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.2609499168360747, | |
| "grad_norm": 1.0779984264892808, | |
| "learning_rate": 6.012746083048966e-06, | |
| "loss": 1.0639089584350585, | |
| "memory(GiB)": 34.88, | |
| "step": 1705, | |
| "token_acc": 0.6968838526912181, | |
| "train_speed(iter/s)": 14.788094 | |
| }, | |
| { | |
| "epoch": 1.264646091295509, | |
| "grad_norm": 1.1027008185153624, | |
| "learning_rate": 5.959539593221711e-06, | |
| "loss": 1.0941818237304688, | |
| "memory(GiB)": 34.88, | |
| "step": 1710, | |
| "token_acc": 0.6294489092996556, | |
| "train_speed(iter/s)": 9.344634 | |
| }, | |
| { | |
| "epoch": 1.2683422657549437, | |
| "grad_norm": 1.2059692859973439, | |
| "learning_rate": 5.9064694536680135e-06, | |
| "loss": 1.0492952346801758, | |
| "memory(GiB)": 49.4, | |
| "step": 1715, | |
| "token_acc": 0.6576319543509273, | |
| "train_speed(iter/s)": 6.522706 | |
| }, | |
| { | |
| "epoch": 1.272038440214378, | |
| "grad_norm": 1.0913297173697671, | |
| "learning_rate": 5.853537455304575e-06, | |
| "loss": 1.0665050506591798, | |
| "memory(GiB)": 49.4, | |
| "step": 1720, | |
| "token_acc": 0.6941935483870968, | |
| "train_speed(iter/s)": 4.977275 | |
| }, | |
| { | |
| "epoch": 1.2757346146738127, | |
| "grad_norm": 1.1326249785449936, | |
| "learning_rate": 5.800745384386364e-06, | |
| "loss": 1.035014533996582, | |
| "memory(GiB)": 49.4, | |
| "step": 1725, | |
| "token_acc": 0.6055200269269606, | |
| "train_speed(iter/s)": 4.1257 | |
| }, | |
| { | |
| "epoch": 1.279430789133247, | |
| "grad_norm": 1.011492822170868, | |
| "learning_rate": 5.74809502244632e-06, | |
| "loss": 1.040954875946045, | |
| "memory(GiB)": 49.4, | |
| "step": 1730, | |
| "token_acc": 0.6559888579387186, | |
| "train_speed(iter/s)": 3.505361 | |
| }, | |
| { | |
| "epoch": 1.2831269635926816, | |
| "grad_norm": 0.9143549731190831, | |
| "learning_rate": 5.695588146235241e-06, | |
| "loss": 1.056338119506836, | |
| "memory(GiB)": 49.4, | |
| "step": 1735, | |
| "token_acc": 0.6355591311343524, | |
| "train_speed(iter/s)": 3.006185 | |
| }, | |
| { | |
| "epoch": 1.286823138052116, | |
| "grad_norm": 1.0541690596505233, | |
| "learning_rate": 5.643226527661825e-06, | |
| "loss": 1.0424397468566895, | |
| "memory(GiB)": 64.42, | |
| "step": 1740, | |
| "token_acc": 0.6127497621313035, | |
| "train_speed(iter/s)": 2.653736 | |
| }, | |
| { | |
| "epoch": 1.2905193125115506, | |
| "grad_norm": 1.071302718364978, | |
| "learning_rate": 5.591011933732873e-06, | |
| "loss": 1.0049684524536133, | |
| "memory(GiB)": 64.42, | |
| "step": 1745, | |
| "token_acc": 0.6237816764132553, | |
| "train_speed(iter/s)": 2.414167 | |
| }, | |
| { | |
| "epoch": 1.294215486970985, | |
| "grad_norm": 1.0017860936129825, | |
| "learning_rate": 5.538946126493659e-06, | |
| "loss": 1.048162841796875, | |
| "memory(GiB)": 64.42, | |
| "step": 1750, | |
| "token_acc": 0.6117103235747303, | |
| "train_speed(iter/s)": 2.163836 | |
| }, | |
| { | |
| "epoch": 1.294215486970985, | |
| "eval_loss": 0.6697070002555847, | |
| "eval_runtime": 85.8145, | |
| "eval_samples_per_second": 81.513, | |
| "eval_steps_per_second": 0.641, | |
| "eval_token_acc": 0.6293895746807739, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.2979116614304196, | |
| "grad_norm": 1.1010002294868126, | |
| "learning_rate": 5.4870308629684675e-06, | |
| "loss": 1.0428232192993163, | |
| "memory(GiB)": 74.63, | |
| "step": 1755, | |
| "token_acc": 0.634660903571061, | |
| "train_speed(iter/s)": 1.752193 | |
| }, | |
| { | |
| "epoch": 1.301607835889854, | |
| "grad_norm": 1.1351842621603827, | |
| "learning_rate": 5.435267895101303e-06, | |
| "loss": 1.0705801010131837, | |
| "memory(GiB)": 74.63, | |
| "step": 1760, | |
| "token_acc": 0.663578947368421, | |
| "train_speed(iter/s)": 1.629796 | |
| }, | |
| { | |
| "epoch": 1.3053040103492886, | |
| "grad_norm": 0.9688327106799416, | |
| "learning_rate": 5.383658969696767e-06, | |
| "loss": 1.043651008605957, | |
| "memory(GiB)": 74.63, | |
| "step": 1765, | |
| "token_acc": 0.6663619744058501, | |
| "train_speed(iter/s)": 1.540319 | |
| }, | |
| { | |
| "epoch": 1.309000184808723, | |
| "grad_norm": 1.0196740986171486, | |
| "learning_rate": 5.3322058283611045e-06, | |
| "loss": 1.066755485534668, | |
| "memory(GiB)": 74.63, | |
| "step": 1770, | |
| "token_acc": 0.6984352773826458, | |
| "train_speed(iter/s)": 1.440515 | |
| }, | |
| { | |
| "epoch": 1.3126963592681575, | |
| "grad_norm": 0.9324312791356152, | |
| "learning_rate": 5.2809102074434505e-06, | |
| "loss": 1.0861141204833984, | |
| "memory(GiB)": 74.63, | |
| "step": 1775, | |
| "token_acc": 0.6625352112676056, | |
| "train_speed(iter/s)": 1.355437 | |
| }, | |
| { | |
| "epoch": 1.316392533727592, | |
| "grad_norm": 1.0475529503023757, | |
| "learning_rate": 5.229773837977208e-06, | |
| "loss": 1.0537721633911132, | |
| "memory(GiB)": 74.63, | |
| "step": 1780, | |
| "token_acc": 0.6779266161910309, | |
| "train_speed(iter/s)": 1.294879 | |
| }, | |
| { | |
| "epoch": 1.3200887081870265, | |
| "grad_norm": 0.9281011767547357, | |
| "learning_rate": 5.178798445621645e-06, | |
| "loss": 1.0430593490600586, | |
| "memory(GiB)": 74.63, | |
| "step": 1785, | |
| "token_acc": 0.6330935251798561, | |
| "train_speed(iter/s)": 1.224208 | |
| }, | |
| { | |
| "epoch": 1.3237848826464609, | |
| "grad_norm": 1.0483168678654606, | |
| "learning_rate": 5.127985750603671e-06, | |
| "loss": 1.071333885192871, | |
| "memory(GiB)": 74.63, | |
| "step": 1790, | |
| "token_acc": 0.6417910447761194, | |
| "train_speed(iter/s)": 1.162932 | |
| }, | |
| { | |
| "epoch": 1.3274810571058955, | |
| "grad_norm": 1.097565660571469, | |
| "learning_rate": 5.077337467659768e-06, | |
| "loss": 1.0753141403198243, | |
| "memory(GiB)": 74.63, | |
| "step": 1795, | |
| "token_acc": 0.6051001821493625, | |
| "train_speed(iter/s)": 1.117195 | |
| }, | |
| { | |
| "epoch": 1.3311772315653299, | |
| "grad_norm": 1.063181582729188, | |
| "learning_rate": 5.026855305978129e-06, | |
| "loss": 1.0764029502868653, | |
| "memory(GiB)": 74.63, | |
| "step": 1800, | |
| "token_acc": 0.6232106339468303, | |
| "train_speed(iter/s)": 1.067656 | |
| }, | |
| { | |
| "epoch": 1.3311772315653299, | |
| "eval_loss": 0.6690813899040222, | |
| "eval_runtime": 85.9692, | |
| "eval_samples_per_second": 81.366, | |
| "eval_steps_per_second": 0.64, | |
| "eval_token_acc": 0.6297716532342776, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.3348734060247645, | |
| "grad_norm": 1.0429392429603475, | |
| "learning_rate": 4.976540969140984e-06, | |
| "loss": 1.090817928314209, | |
| "memory(GiB)": 74.63, | |
| "step": 1805, | |
| "token_acc": 0.6356786703601108, | |
| "train_speed(iter/s)": 0.961744 | |
| }, | |
| { | |
| "epoch": 1.3385695804841988, | |
| "grad_norm": 1.0548409670879852, | |
| "learning_rate": 4.926396155067114e-06, | |
| "loss": 1.0316819190979003, | |
| "memory(GiB)": 74.63, | |
| "step": 1810, | |
| "token_acc": 0.6598138091543833, | |
| "train_speed(iter/s)": 0.923472 | |
| }, | |
| { | |
| "epoch": 1.3422657549436332, | |
| "grad_norm": 1.0297653617411635, | |
| "learning_rate": 4.876422555954543e-06, | |
| "loss": 1.03601131439209, | |
| "memory(GiB)": 74.63, | |
| "step": 1815, | |
| "token_acc": 0.6965428937259923, | |
| "train_speed(iter/s)": 0.894132 | |
| }, | |
| { | |
| "epoch": 1.3459619294030678, | |
| "grad_norm": 1.1178512794477986, | |
| "learning_rate": 4.826621858223431e-06, | |
| "loss": 1.0318429946899415, | |
| "memory(GiB)": 74.63, | |
| "step": 1820, | |
| "token_acc": 0.6313304721030043, | |
| "train_speed(iter/s)": 0.864578 | |
| }, | |
| { | |
| "epoch": 1.3496581038625024, | |
| "grad_norm": 1.0401775609610366, | |
| "learning_rate": 4.776995742459184e-06, | |
| "loss": 1.0820954322814942, | |
| "memory(GiB)": 74.63, | |
| "step": 1825, | |
| "token_acc": 0.6357702349869452, | |
| "train_speed(iter/s)": 0.833393 | |
| }, | |
| { | |
| "epoch": 1.3533542783219368, | |
| "grad_norm": 1.1053520267340973, | |
| "learning_rate": 4.727545883355713e-06, | |
| "loss": 1.0570013046264648, | |
| "memory(GiB)": 74.63, | |
| "step": 1830, | |
| "token_acc": 0.6462998102466793, | |
| "train_speed(iter/s)": 0.80849 | |
| }, | |
| { | |
| "epoch": 1.3570504527813712, | |
| "grad_norm": 1.0129657782670332, | |
| "learning_rate": 4.678273949658939e-06, | |
| "loss": 1.0589232444763184, | |
| "memory(GiB)": 74.63, | |
| "step": 1835, | |
| "token_acc": 0.6194251734390486, | |
| "train_speed(iter/s)": 0.785859 | |
| }, | |
| { | |
| "epoch": 1.3607466272408058, | |
| "grad_norm": 0.9863992139542379, | |
| "learning_rate": 4.629181604110464e-06, | |
| "loss": 1.0515235900878905, | |
| "memory(GiB)": 74.63, | |
| "step": 1840, | |
| "token_acc": 0.6229317851959362, | |
| "train_speed(iter/s)": 0.761135 | |
| }, | |
| { | |
| "epoch": 1.3644428017002403, | |
| "grad_norm": 1.1494795183000623, | |
| "learning_rate": 4.580270503391487e-06, | |
| "loss": 1.0223835945129394, | |
| "memory(GiB)": 74.63, | |
| "step": 1845, | |
| "token_acc": 0.6583261432269197, | |
| "train_speed(iter/s)": 0.739616 | |
| }, | |
| { | |
| "epoch": 1.3681389761596747, | |
| "grad_norm": 1.14471617138646, | |
| "learning_rate": 4.531542298066861e-06, | |
| "loss": 1.0207533836364746, | |
| "memory(GiB)": 74.63, | |
| "step": 1850, | |
| "token_acc": 0.6551959114139694, | |
| "train_speed(iter/s)": 0.721142 | |
| }, | |
| { | |
| "epoch": 1.3681389761596747, | |
| "eval_loss": 0.6679942607879639, | |
| "eval_runtime": 93.3503, | |
| "eval_samples_per_second": 74.933, | |
| "eval_steps_per_second": 0.589, | |
| "eval_token_acc": 0.6300302199895188, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.371835150619109, | |
| "grad_norm": 1.1238157971359715, | |
| "learning_rate": 4.482998632529414e-06, | |
| "loss": 1.0442536354064942, | |
| "memory(GiB)": 74.63, | |
| "step": 1855, | |
| "token_acc": 0.6386843397152675, | |
| "train_speed(iter/s)": 0.673362 | |
| }, | |
| { | |
| "epoch": 1.3755313250785437, | |
| "grad_norm": 0.9044341600768213, | |
| "learning_rate": 4.434641144944464e-06, | |
| "loss": 1.0640903472900392, | |
| "memory(GiB)": 74.63, | |
| "step": 1860, | |
| "token_acc": 0.6587333602258976, | |
| "train_speed(iter/s)": 0.655234 | |
| }, | |
| { | |
| "epoch": 1.3792274995379783, | |
| "grad_norm": 1.0166299256206919, | |
| "learning_rate": 4.386471467194513e-06, | |
| "loss": 1.0587308883666993, | |
| "memory(GiB)": 74.63, | |
| "step": 1865, | |
| "token_acc": 0.6148590947907772, | |
| "train_speed(iter/s)": 0.63915 | |
| }, | |
| { | |
| "epoch": 1.3829236739974127, | |
| "grad_norm": 1.2786373427724909, | |
| "learning_rate": 4.338491224824198e-06, | |
| "loss": 1.0438286781311035, | |
| "memory(GiB)": 74.63, | |
| "step": 1870, | |
| "token_acc": 0.6332835077229696, | |
| "train_speed(iter/s)": 0.625873 | |
| }, | |
| { | |
| "epoch": 1.386619848456847, | |
| "grad_norm": 1.0910902180920756, | |
| "learning_rate": 4.290702036985423e-06, | |
| "loss": 1.0352885246276855, | |
| "memory(GiB)": 74.63, | |
| "step": 1875, | |
| "token_acc": 0.6918429003021148, | |
| "train_speed(iter/s)": 0.610514 | |
| }, | |
| { | |
| "epoch": 1.3903160229162816, | |
| "grad_norm": 1.0540455114144576, | |
| "learning_rate": 4.243105516382732e-06, | |
| "loss": 1.0169889450073242, | |
| "memory(GiB)": 74.63, | |
| "step": 1880, | |
| "token_acc": 0.6479912544411042, | |
| "train_speed(iter/s)": 0.59628 | |
| }, | |
| { | |
| "epoch": 1.3940121973757162, | |
| "grad_norm": 1.0796012032362492, | |
| "learning_rate": 4.1957032692188685e-06, | |
| "loss": 1.0289284706115722, | |
| "memory(GiB)": 74.63, | |
| "step": 1885, | |
| "token_acc": 0.6304772536980184, | |
| "train_speed(iter/s)": 0.584845 | |
| }, | |
| { | |
| "epoch": 1.3977083718351506, | |
| "grad_norm": 0.9497813177866403, | |
| "learning_rate": 4.148496895140586e-06, | |
| "loss": 1.0058039665222167, | |
| "memory(GiB)": 74.63, | |
| "step": 1890, | |
| "token_acc": 0.6662360034453058, | |
| "train_speed(iter/s)": 0.572483 | |
| }, | |
| { | |
| "epoch": 1.401404546294585, | |
| "grad_norm": 0.9994791403674819, | |
| "learning_rate": 4.101487987184658e-06, | |
| "loss": 1.0271056175231934, | |
| "memory(GiB)": 74.63, | |
| "step": 1895, | |
| "token_acc": 0.7174721189591078, | |
| "train_speed(iter/s)": 0.559822 | |
| }, | |
| { | |
| "epoch": 1.4051007207540196, | |
| "grad_norm": 0.9675552310253457, | |
| "learning_rate": 4.054678131724128e-06, | |
| "loss": 1.0421775817871093, | |
| "memory(GiB)": 74.63, | |
| "step": 1900, | |
| "token_acc": 0.6403071017274472, | |
| "train_speed(iter/s)": 0.549398 | |
| }, | |
| { | |
| "epoch": 1.4051007207540196, | |
| "eval_loss": 0.6665124893188477, | |
| "eval_runtime": 92.5325, | |
| "eval_samples_per_second": 75.595, | |
| "eval_steps_per_second": 0.594, | |
| "eval_token_acc": 0.6305912175031224, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.4087968952134542, | |
| "grad_norm": 0.9383388424277262, | |
| "learning_rate": 4.008068908414764e-06, | |
| "loss": 1.0390195846557617, | |
| "memory(GiB)": 74.63, | |
| "step": 1905, | |
| "token_acc": 0.636108220603538, | |
| "train_speed(iter/s)": 0.522161 | |
| }, | |
| { | |
| "epoch": 1.4124930696728886, | |
| "grad_norm": 1.0404355020365603, | |
| "learning_rate": 3.961661890141756e-06, | |
| "loss": 1.064806842803955, | |
| "memory(GiB)": 74.63, | |
| "step": 1910, | |
| "token_acc": 0.5955159705159705, | |
| "train_speed(iter/s)": 0.512861 | |
| }, | |
| { | |
| "epoch": 1.416189244132323, | |
| "grad_norm": 1.1641858092814779, | |
| "learning_rate": 3.91545864296665e-06, | |
| "loss": 1.0407491683959962, | |
| "memory(GiB)": 74.63, | |
| "step": 1915, | |
| "token_acc": 0.6579710144927536, | |
| "train_speed(iter/s)": 0.502749 | |
| }, | |
| { | |
| "epoch": 1.4198854185917575, | |
| "grad_norm": 0.9981716234289997, | |
| "learning_rate": 3.8694607260744745e-06, | |
| "loss": 1.0334474563598632, | |
| "memory(GiB)": 74.63, | |
| "step": 1920, | |
| "token_acc": 0.6448382126348228, | |
| "train_speed(iter/s)": 0.494436 | |
| }, | |
| { | |
| "epoch": 1.4235815930511921, | |
| "grad_norm": 1.0999406567886463, | |
| "learning_rate": 3.8236696917211365e-06, | |
| "loss": 1.0606246948242188, | |
| "memory(GiB)": 74.63, | |
| "step": 1925, | |
| "token_acc": 0.6300940438871473, | |
| "train_speed(iter/s)": 0.48651 | |
| }, | |
| { | |
| "epoch": 1.4272777675106265, | |
| "grad_norm": 1.0161660727647654, | |
| "learning_rate": 3.7780870851810515e-06, | |
| "loss": 1.076219654083252, | |
| "memory(GiB)": 74.63, | |
| "step": 1930, | |
| "token_acc": 0.6260296540362438, | |
| "train_speed(iter/s)": 0.477741 | |
| }, | |
| { | |
| "epoch": 1.430973941970061, | |
| "grad_norm": 0.9703902428924409, | |
| "learning_rate": 3.7327144446949716e-06, | |
| "loss": 1.0812992095947265, | |
| "memory(GiB)": 74.63, | |
| "step": 1935, | |
| "token_acc": 0.630064591896653, | |
| "train_speed(iter/s)": 0.470034 | |
| }, | |
| { | |
| "epoch": 1.4346701164294955, | |
| "grad_norm": 1.0947535810008933, | |
| "learning_rate": 3.687553301418092e-06, | |
| "loss": 1.0244592666625976, | |
| "memory(GiB)": 74.63, | |
| "step": 1940, | |
| "token_acc": 0.6301992310380986, | |
| "train_speed(iter/s)": 0.463221 | |
| }, | |
| { | |
| "epoch": 1.43836629088893, | |
| "grad_norm": 1.0200917528662774, | |
| "learning_rate": 3.6426051793683724e-06, | |
| "loss": 1.0360092163085937, | |
| "memory(GiB)": 74.63, | |
| "step": 1945, | |
| "token_acc": 0.6446078431372549, | |
| "train_speed(iter/s)": 0.45531 | |
| }, | |
| { | |
| "epoch": 1.4420624653483645, | |
| "grad_norm": 0.9670618590123606, | |
| "learning_rate": 3.5978715953751207e-06, | |
| "loss": 1.0297866821289063, | |
| "memory(GiB)": 74.63, | |
| "step": 1950, | |
| "token_acc": 0.6481696687972109, | |
| "train_speed(iter/s)": 0.448099 | |
| }, | |
| { | |
| "epoch": 1.4420624653483645, | |
| "eval_loss": 0.6662415862083435, | |
| "eval_runtime": 87.5872, | |
| "eval_samples_per_second": 79.863, | |
| "eval_steps_per_second": 0.628, | |
| "eval_token_acc": 0.6309225061582752, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.4457586398077988, | |
| "grad_norm": 0.9880600888670725, | |
| "learning_rate": 3.5533540590277882e-06, | |
| "loss": 1.0223572731018067, | |
| "memory(GiB)": 74.63, | |
| "step": 1955, | |
| "token_acc": 0.6359920144500428, | |
| "train_speed(iter/s)": 0.430514 | |
| }, | |
| { | |
| "epoch": 1.4494548142672334, | |
| "grad_norm": 0.9593918073057777, | |
| "learning_rate": 3.509054072625031e-06, | |
| "loss": 1.0360115051269532, | |
| "memory(GiB)": 74.63, | |
| "step": 1960, | |
| "token_acc": 0.6581899775617053, | |
| "train_speed(iter/s)": 0.424799 | |
| }, | |
| { | |
| "epoch": 1.453150988726668, | |
| "grad_norm": 1.0289280788083641, | |
| "learning_rate": 3.4649731311240276e-06, | |
| "loss": 1.0378742218017578, | |
| "memory(GiB)": 74.63, | |
| "step": 1965, | |
| "token_acc": 0.6424075531077892, | |
| "train_speed(iter/s)": 0.418454 | |
| }, | |
| { | |
| "epoch": 1.4568471631861024, | |
| "grad_norm": 1.053788067984888, | |
| "learning_rate": 3.4211127220900107e-06, | |
| "loss": 1.0713199615478515, | |
| "memory(GiB)": 74.63, | |
| "step": 1970, | |
| "token_acc": 0.632213608957795, | |
| "train_speed(iter/s)": 0.412536 | |
| }, | |
| { | |
| "epoch": 1.4605433376455368, | |
| "grad_norm": 1.180153902117692, | |
| "learning_rate": 3.377474325646074e-06, | |
| "loss": 1.0560644149780274, | |
| "memory(GiB)": 74.63, | |
| "step": 1975, | |
| "token_acc": 0.641423703142749, | |
| "train_speed(iter/s)": 0.407398 | |
| }, | |
| { | |
| "epoch": 1.4642395121049714, | |
| "grad_norm": 0.8918348376917337, | |
| "learning_rate": 3.334059414423233e-06, | |
| "loss": 1.055532169342041, | |
| "memory(GiB)": 74.63, | |
| "step": 1980, | |
| "token_acc": 0.668722786647315, | |
| "train_speed(iter/s)": 0.401897 | |
| }, | |
| { | |
| "epoch": 1.4679356865644058, | |
| "grad_norm": 1.109026709845534, | |
| "learning_rate": 3.2908694535107144e-06, | |
| "loss": 1.027819538116455, | |
| "memory(GiB)": 74.63, | |
| "step": 1985, | |
| "token_acc": 0.661387220098307, | |
| "train_speed(iter/s)": 0.396281 | |
| }, | |
| { | |
| "epoch": 1.4716318610238404, | |
| "grad_norm": 1.0886246897973584, | |
| "learning_rate": 3.247905900406523e-06, | |
| "loss": 1.0191631317138672, | |
| "memory(GiB)": 74.63, | |
| "step": 1990, | |
| "token_acc": 0.6097883597883598, | |
| "train_speed(iter/s)": 0.391566 | |
| }, | |
| { | |
| "epoch": 1.4753280354832747, | |
| "grad_norm": 1.0630977460263966, | |
| "learning_rate": 3.2051702049682554e-06, | |
| "loss": 1.042071533203125, | |
| "memory(GiB)": 74.63, | |
| "step": 1995, | |
| "token_acc": 0.6236017897091722, | |
| "train_speed(iter/s)": 0.386682 | |
| }, | |
| { | |
| "epoch": 1.4790242099427093, | |
| "grad_norm": 1.1953007407214893, | |
| "learning_rate": 3.162663809364178e-06, | |
| "loss": 1.0401007652282714, | |
| "memory(GiB)": 74.63, | |
| "step": 2000, | |
| "token_acc": 0.6173344235486509, | |
| "train_speed(iter/s)": 0.381535 | |
| }, | |
| { | |
| "epoch": 1.4790242099427093, | |
| "eval_loss": 0.6649311184883118, | |
| "eval_runtime": 83.4819, | |
| "eval_samples_per_second": 83.791, | |
| "eval_steps_per_second": 0.659, | |
| "eval_token_acc": 0.63089018531387, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.4827203844021437, | |
| "grad_norm": 1.0030060203161786, | |
| "learning_rate": 3.120388148024548e-06, | |
| "loss": 1.0528248786926269, | |
| "memory(GiB)": 74.63, | |
| "step": 2005, | |
| "token_acc": 0.6302038823098522, | |
| "train_speed(iter/s)": 0.368939 | |
| }, | |
| { | |
| "epoch": 1.4864165588615783, | |
| "grad_norm": 1.1306385027348749, | |
| "learning_rate": 3.0783446475932145e-06, | |
| "loss": 1.061046028137207, | |
| "memory(GiB)": 74.63, | |
| "step": 2010, | |
| "token_acc": 0.6473043478260869, | |
| "train_speed(iter/s)": 0.364909 | |
| }, | |
| { | |
| "epoch": 1.4901127333210127, | |
| "grad_norm": 1.0935000761259253, | |
| "learning_rate": 3.036534726879473e-06, | |
| "loss": 1.0255512237548827, | |
| "memory(GiB)": 74.63, | |
| "step": 2015, | |
| "token_acc": 0.65625, | |
| "train_speed(iter/s)": 0.360903 | |
| }, | |
| { | |
| "epoch": 1.4938089077804473, | |
| "grad_norm": 1.088331528861988, | |
| "learning_rate": 2.9949597968101883e-06, | |
| "loss": 1.0589797973632813, | |
| "memory(GiB)": 74.63, | |
| "step": 2020, | |
| "token_acc": 0.6325940212150434, | |
| "train_speed(iter/s)": 0.356624 | |
| }, | |
| { | |
| "epoch": 1.4975050822398817, | |
| "grad_norm": 1.0677052287012947, | |
| "learning_rate": 2.953621260382171e-06, | |
| "loss": 1.0519143104553224, | |
| "memory(GiB)": 74.63, | |
| "step": 2025, | |
| "token_acc": 0.6626557799742158, | |
| "train_speed(iter/s)": 0.352723 | |
| }, | |
| { | |
| "epoch": 1.5012012566993163, | |
| "grad_norm": 0.9383180241618552, | |
| "learning_rate": 2.9125205126148535e-06, | |
| "loss": 1.031491470336914, | |
| "memory(GiB)": 74.63, | |
| "step": 2030, | |
| "token_acc": 0.6123364485981309, | |
| "train_speed(iter/s)": 0.349069 | |
| }, | |
| { | |
| "epoch": 1.5048974311587506, | |
| "grad_norm": 1.0487719308291952, | |
| "learning_rate": 2.871658940503188e-06, | |
| "loss": 1.024942398071289, | |
| "memory(GiB)": 74.63, | |
| "step": 2035, | |
| "token_acc": 0.6477366255144033, | |
| "train_speed(iter/s)": 0.345173 | |
| }, | |
| { | |
| "epoch": 1.5085936056181852, | |
| "grad_norm": 1.0789502013849968, | |
| "learning_rate": 2.831037922970855e-06, | |
| "loss": 1.0276554107666016, | |
| "memory(GiB)": 74.63, | |
| "step": 2040, | |
| "token_acc": 0.6695604991861096, | |
| "train_speed(iter/s)": 0.341604 | |
| }, | |
| { | |
| "epoch": 1.5122897800776198, | |
| "grad_norm": 1.0851618366990563, | |
| "learning_rate": 2.7906588308237228e-06, | |
| "loss": 1.027616596221924, | |
| "memory(GiB)": 74.63, | |
| "step": 2045, | |
| "token_acc": 0.7097625329815304, | |
| "train_speed(iter/s)": 0.338222 | |
| }, | |
| { | |
| "epoch": 1.5159859545370542, | |
| "grad_norm": 0.9179924796471817, | |
| "learning_rate": 2.7505230267036032e-06, | |
| "loss": 1.0497385025024415, | |
| "memory(GiB)": 74.63, | |
| "step": 2050, | |
| "token_acc": 0.5937649880095923, | |
| "train_speed(iter/s)": 0.334489 | |
| }, | |
| { | |
| "epoch": 1.5159859545370542, | |
| "eval_loss": 0.6642535328865051, | |
| "eval_runtime": 85.9904, | |
| "eval_samples_per_second": 81.346, | |
| "eval_steps_per_second": 0.64, | |
| "eval_token_acc": 0.6313195908181098, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.5196821289964886, | |
| "grad_norm": 1.0681296921147372, | |
| "learning_rate": 2.7106318650422447e-06, | |
| "loss": 1.0099181175231933, | |
| "memory(GiB)": 74.63, | |
| "step": 2055, | |
| "token_acc": 0.6372694090953931, | |
| "train_speed(iter/s)": 0.325208 | |
| }, | |
| { | |
| "epoch": 1.5233783034559232, | |
| "grad_norm": 1.1164983354073834, | |
| "learning_rate": 2.6709866920156434e-06, | |
| "loss": 1.0027360916137695, | |
| "memory(GiB)": 74.63, | |
| "step": 2060, | |
| "token_acc": 0.631484794275492, | |
| "train_speed(iter/s)": 0.321919 | |
| }, | |
| { | |
| "epoch": 1.5270744779153576, | |
| "grad_norm": 0.9417253538259095, | |
| "learning_rate": 2.6315888454986017e-06, | |
| "loss": 1.0374462127685546, | |
| "memory(GiB)": 74.63, | |
| "step": 2065, | |
| "token_acc": 0.6671586715867158, | |
| "train_speed(iter/s)": 0.319024 | |
| }, | |
| { | |
| "epoch": 1.530770652374792, | |
| "grad_norm": 1.1095932914113171, | |
| "learning_rate": 2.5924396550195986e-06, | |
| "loss": 1.03175687789917, | |
| "memory(GiB)": 74.63, | |
| "step": 2070, | |
| "token_acc": 0.6316007454959619, | |
| "train_speed(iter/s)": 0.315819 | |
| }, | |
| { | |
| "epoch": 1.5344668268342265, | |
| "grad_norm": 1.0582702932147185, | |
| "learning_rate": 2.5535404417159002e-06, | |
| "loss": 1.0430908203125, | |
| "memory(GiB)": 74.63, | |
| "step": 2075, | |
| "token_acc": 0.6477673325499412, | |
| "train_speed(iter/s)": 0.312805 | |
| }, | |
| { | |
| "epoch": 1.5381630012936611, | |
| "grad_norm": 1.0515415830247143, | |
| "learning_rate": 2.514892518288988e-06, | |
| "loss": 1.0108471870422364, | |
| "memory(GiB)": 74.63, | |
| "step": 2080, | |
| "token_acc": 0.6291390728476821, | |
| "train_speed(iter/s)": 0.310115 | |
| }, | |
| { | |
| "epoch": 1.5418591757530955, | |
| "grad_norm": 1.018793664843126, | |
| "learning_rate": 2.4764971889602705e-06, | |
| "loss": 1.0460142135620116, | |
| "memory(GiB)": 74.63, | |
| "step": 2085, | |
| "token_acc": 0.6321537789427698, | |
| "train_speed(iter/s)": 0.307239 | |
| }, | |
| { | |
| "epoch": 1.5455553502125299, | |
| "grad_norm": 1.0684231311720556, | |
| "learning_rate": 2.4383557494270483e-06, | |
| "loss": 1.03402099609375, | |
| "memory(GiB)": 74.63, | |
| "step": 2090, | |
| "token_acc": 0.6098130841121495, | |
| "train_speed(iter/s)": 0.304401 | |
| }, | |
| { | |
| "epoch": 1.5492515246719645, | |
| "grad_norm": 1.1947182692900764, | |
| "learning_rate": 2.400469486818803e-06, | |
| "loss": 1.0426679611206056, | |
| "memory(GiB)": 74.63, | |
| "step": 2095, | |
| "token_acc": 0.6819553409776705, | |
| "train_speed(iter/s)": 0.301883 | |
| }, | |
| { | |
| "epoch": 1.552947699131399, | |
| "grad_norm": 1.1961503070894741, | |
| "learning_rate": 2.3628396796537588e-06, | |
| "loss": 1.0395529747009278, | |
| "memory(GiB)": 74.63, | |
| "step": 2100, | |
| "token_acc": 0.6641014033499321, | |
| "train_speed(iter/s)": 0.299223 | |
| }, | |
| { | |
| "epoch": 1.552947699131399, | |
| "eval_loss": 0.6638895273208618, | |
| "eval_runtime": 88.4322, | |
| "eval_samples_per_second": 79.1, | |
| "eval_steps_per_second": 0.622, | |
| "eval_token_acc": 0.6315920093638103, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.5566438735908334, | |
| "grad_norm": 1.0207020501497954, | |
| "learning_rate": 2.325467597795745e-06, | |
| "loss": 1.0622333526611327, | |
| "memory(GiB)": 74.63, | |
| "step": 2105, | |
| "token_acc": 0.638814317673378, | |
| "train_speed(iter/s)": 0.291998 | |
| }, | |
| { | |
| "epoch": 1.5603400480502678, | |
| "grad_norm": 1.1172734543264464, | |
| "learning_rate": 2.2883545024113263e-06, | |
| "loss": 1.0403221130371094, | |
| "memory(GiB)": 74.63, | |
| "step": 2110, | |
| "token_acc": 0.6622971285892634, | |
| "train_speed(iter/s)": 0.289437 | |
| }, | |
| { | |
| "epoch": 1.5640362225097024, | |
| "grad_norm": 1.0571335154576122, | |
| "learning_rate": 2.251501645927253e-06, | |
| "loss": 1.0463993072509765, | |
| "memory(GiB)": 74.63, | |
| "step": 2115, | |
| "token_acc": 0.636986301369863, | |
| "train_speed(iter/s)": 0.28714 | |
| }, | |
| { | |
| "epoch": 1.567732396969137, | |
| "grad_norm": 0.9556270442029375, | |
| "learning_rate": 2.2149102719882044e-06, | |
| "loss": 1.0251903533935547, | |
| "memory(GiB)": 74.63, | |
| "step": 2120, | |
| "token_acc": 0.647495361781076, | |
| "train_speed(iter/s)": 0.284896 | |
| }, | |
| { | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 1.041918735454562, | |
| "learning_rate": 2.178581615414802e-06, | |
| "loss": 1.0483660697937012, | |
| "memory(GiB)": 74.63, | |
| "step": 2125, | |
| "token_acc": 0.5842217484008528, | |
| "train_speed(iter/s)": 0.282449 | |
| }, | |
| { | |
| "epoch": 1.5751247458880058, | |
| "grad_norm": 1.0827410972952385, | |
| "learning_rate": 2.1425169021619518e-06, | |
| "loss": 1.0664111137390138, | |
| "memory(GiB)": 74.63, | |
| "step": 2130, | |
| "token_acc": 0.6472742066720911, | |
| "train_speed(iter/s)": 0.280246 | |
| }, | |
| { | |
| "epoch": 1.5788209203474404, | |
| "grad_norm": 1.0343519334837508, | |
| "learning_rate": 2.106717349277475e-06, | |
| "loss": 1.0448074340820312, | |
| "memory(GiB)": 74.63, | |
| "step": 2135, | |
| "token_acc": 0.6223404255319149, | |
| "train_speed(iter/s)": 0.278222 | |
| }, | |
| { | |
| "epoch": 1.582517094806875, | |
| "grad_norm": 0.9536359374215565, | |
| "learning_rate": 2.0711841648610254e-06, | |
| "loss": 1.0621306419372558, | |
| "memory(GiB)": 74.63, | |
| "step": 2140, | |
| "token_acc": 0.6342119419042496, | |
| "train_speed(iter/s)": 0.276006 | |
| }, | |
| { | |
| "epoch": 1.5862132692663093, | |
| "grad_norm": 1.072455338512947, | |
| "learning_rate": 2.03591854802333e-06, | |
| "loss": 1.0556835174560546, | |
| "memory(GiB)": 74.63, | |
| "step": 2145, | |
| "token_acc": 0.7222898903775883, | |
| "train_speed(iter/s)": 0.27386 | |
| }, | |
| { | |
| "epoch": 1.5899094437257437, | |
| "grad_norm": 1.0210760479008887, | |
| "learning_rate": 2.0009216888457206e-06, | |
| "loss": 1.0253107070922851, | |
| "memory(GiB)": 74.63, | |
| "step": 2150, | |
| "token_acc": 0.6356216994719155, | |
| "train_speed(iter/s)": 0.271885 | |
| }, | |
| { | |
| "epoch": 1.5899094437257437, | |
| "eval_loss": 0.6611568927764893, | |
| "eval_runtime": 89.4271, | |
| "eval_samples_per_second": 78.22, | |
| "eval_steps_per_second": 0.615, | |
| "eval_token_acc": 0.6316070154701413, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.5936056181851783, | |
| "grad_norm": 1.062555004377403, | |
| "learning_rate": 1.966194768339974e-06, | |
| "loss": 1.049751091003418, | |
| "memory(GiB)": 74.63, | |
| "step": 2155, | |
| "token_acc": 0.6423422284052106, | |
| "train_speed(iter/s)": 0.266286 | |
| }, | |
| { | |
| "epoch": 1.597301792644613, | |
| "grad_norm": 0.8629319967495109, | |
| "learning_rate": 1.931738958408457e-06, | |
| "loss": 1.0435371398925781, | |
| "memory(GiB)": 74.63, | |
| "step": 2160, | |
| "token_acc": 0.6290619251992643, | |
| "train_speed(iter/s)": 0.264247 | |
| }, | |
| { | |
| "epoch": 1.6009979671040473, | |
| "grad_norm": 0.955539932162413, | |
| "learning_rate": 1.8975554218045733e-06, | |
| "loss": 1.0308834075927735, | |
| "memory(GiB)": 74.63, | |
| "step": 2165, | |
| "token_acc": 0.6610537751222162, | |
| "train_speed(iter/s)": 0.262351 | |
| }, | |
| { | |
| "epoch": 1.6046941415634817, | |
| "grad_norm": 0.9624917837200193, | |
| "learning_rate": 1.8636453120935428e-06, | |
| "loss": 1.0461854934692383, | |
| "memory(GiB)": 74.63, | |
| "step": 2170, | |
| "token_acc": 0.7152838427947599, | |
| "train_speed(iter/s)": 0.260619 | |
| }, | |
| { | |
| "epoch": 1.6083903160229163, | |
| "grad_norm": 1.1677655720128766, | |
| "learning_rate": 1.8300097736134482e-06, | |
| "loss": 1.0363172531127929, | |
| "memory(GiB)": 74.63, | |
| "step": 2175, | |
| "token_acc": 0.6848798869524259, | |
| "train_speed(iter/s)": 0.258828 | |
| }, | |
| { | |
| "epoch": 1.6120864904823509, | |
| "grad_norm": 1.060280622494465, | |
| "learning_rate": 1.796649941436638e-06, | |
| "loss": 1.0246556282043457, | |
| "memory(GiB)": 74.63, | |
| "step": 2180, | |
| "token_acc": 0.6469820554649266, | |
| "train_speed(iter/s)": 0.256928 | |
| }, | |
| { | |
| "epoch": 1.6157826649417852, | |
| "grad_norm": 0.9704555618707196, | |
| "learning_rate": 1.7635669413314082e-06, | |
| "loss": 1.0577556610107421, | |
| "memory(GiB)": 74.63, | |
| "step": 2185, | |
| "token_acc": 0.698159509202454, | |
| "train_speed(iter/s)": 0.255252 | |
| }, | |
| { | |
| "epoch": 1.6194788394012196, | |
| "grad_norm": 0.9786880620172256, | |
| "learning_rate": 1.7307618897240274e-06, | |
| "loss": 1.0526361465454102, | |
| "memory(GiB)": 74.63, | |
| "step": 2190, | |
| "token_acc": 0.6385869565217391, | |
| "train_speed(iter/s)": 0.253488 | |
| }, | |
| { | |
| "epoch": 1.6231750138606542, | |
| "grad_norm": 0.9744613679129237, | |
| "learning_rate": 1.6982358936610454e-06, | |
| "loss": 1.075265598297119, | |
| "memory(GiB)": 74.63, | |
| "step": 2195, | |
| "token_acc": 0.6133072407045009, | |
| "train_speed(iter/s)": 0.251735 | |
| }, | |
| { | |
| "epoch": 1.6268711883200888, | |
| "grad_norm": 1.0120755932892964, | |
| "learning_rate": 1.6659900507719406e-06, | |
| "loss": 1.064041519165039, | |
| "memory(GiB)": 74.63, | |
| "step": 2200, | |
| "token_acc": 0.648406731113498, | |
| "train_speed(iter/s)": 0.250141 | |
| }, | |
| { | |
| "epoch": 1.6268711883200888, | |
| "eval_loss": 0.6599572896957397, | |
| "eval_runtime": 90.9305, | |
| "eval_samples_per_second": 76.927, | |
| "eval_steps_per_second": 0.605, | |
| "eval_token_acc": 0.6317178297938161, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.6305673627795232, | |
| "grad_norm": 1.0770018414163383, | |
| "learning_rate": 1.6340254492320873e-06, | |
| "loss": 1.0508115768432618, | |
| "memory(GiB)": 74.63, | |
| "step": 2205, | |
| "token_acc": 0.6418808091853472, | |
| "train_speed(iter/s)": 0.245446 | |
| }, | |
| { | |
| "epoch": 1.6342635372389576, | |
| "grad_norm": 1.0511841490808227, | |
| "learning_rate": 1.6023431677260215e-06, | |
| "loss": 1.0454177856445312, | |
| "memory(GiB)": 74.63, | |
| "step": 2210, | |
| "token_acc": 0.6532779316712835, | |
| "train_speed(iter/s)": 0.243859 | |
| }, | |
| { | |
| "epoch": 1.6379597116983922, | |
| "grad_norm": 0.9098679876928407, | |
| "learning_rate": 1.570944275411046e-06, | |
| "loss": 1.0668581962585448, | |
| "memory(GiB)": 74.63, | |
| "step": 2215, | |
| "token_acc": 0.6121688741721855, | |
| "train_speed(iter/s)": 0.242235 | |
| }, | |
| { | |
| "epoch": 1.6416558861578268, | |
| "grad_norm": 1.0127053695015762, | |
| "learning_rate": 1.5398298318811467e-06, | |
| "loss": 1.0175441741943358, | |
| "memory(GiB)": 74.63, | |
| "step": 2220, | |
| "token_acc": 0.6991780821917808, | |
| "train_speed(iter/s)": 0.240782 | |
| }, | |
| { | |
| "epoch": 1.6453520606172611, | |
| "grad_norm": 1.1031573706590774, | |
| "learning_rate": 1.5090008871312433e-06, | |
| "loss": 1.0165956497192383, | |
| "memory(GiB)": 74.63, | |
| "step": 2225, | |
| "token_acc": 0.6685121107266436, | |
| "train_speed(iter/s)": 0.23932 | |
| }, | |
| { | |
| "epoch": 1.6490482350766955, | |
| "grad_norm": 1.0502782153731651, | |
| "learning_rate": 1.4784584815217452e-06, | |
| "loss": 1.0456388473510743, | |
| "memory(GiB)": 74.63, | |
| "step": 2230, | |
| "token_acc": 0.6672802577082375, | |
| "train_speed(iter/s)": 0.237824 | |
| }, | |
| { | |
| "epoch": 1.65274440953613, | |
| "grad_norm": 1.003637672944472, | |
| "learning_rate": 1.448203645743449e-06, | |
| "loss": 1.0287794113159179, | |
| "memory(GiB)": 74.63, | |
| "step": 2235, | |
| "token_acc": 0.6663223140495868, | |
| "train_speed(iter/s)": 0.236377 | |
| }, | |
| { | |
| "epoch": 1.6564405839955647, | |
| "grad_norm": 1.037599542215698, | |
| "learning_rate": 1.4182374007827605e-06, | |
| "loss": 1.0127573013305664, | |
| "memory(GiB)": 74.63, | |
| "step": 2240, | |
| "token_acc": 0.6325656132833423, | |
| "train_speed(iter/s)": 0.235012 | |
| }, | |
| { | |
| "epoch": 1.660136758454999, | |
| "grad_norm": 0.9940434532315588, | |
| "learning_rate": 1.3885607578872295e-06, | |
| "loss": 1.0367406845092773, | |
| "memory(GiB)": 74.63, | |
| "step": 2245, | |
| "token_acc": 0.6187350835322196, | |
| "train_speed(iter/s)": 0.233574 | |
| }, | |
| { | |
| "epoch": 1.6638329329144335, | |
| "grad_norm": 0.9200899712617193, | |
| "learning_rate": 1.3591747185314342e-06, | |
| "loss": 1.0550609588623048, | |
| "memory(GiB)": 74.63, | |
| "step": 2250, | |
| "token_acc": 0.6650768415474297, | |
| "train_speed(iter/s)": 0.232175 | |
| }, | |
| { | |
| "epoch": 1.6638329329144335, | |
| "eval_loss": 0.6586793661117554, | |
| "eval_runtime": 87.5544, | |
| "eval_samples_per_second": 79.893, | |
| "eval_steps_per_second": 0.628, | |
| "eval_token_acc": 0.6320167976045638, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.667529107373868, | |
| "grad_norm": 1.0426265291319847, | |
| "learning_rate": 1.3300802743831786e-06, | |
| "loss": 1.0567312240600586, | |
| "memory(GiB)": 74.63, | |
| "step": 2255, | |
| "token_acc": 0.6444471182769823, | |
| "train_speed(iter/s)": 0.228232 | |
| }, | |
| { | |
| "epoch": 1.6712252818333027, | |
| "grad_norm": 1.1036330750940702, | |
| "learning_rate": 1.3012784072700335e-06, | |
| "loss": 1.0163141250610352, | |
| "memory(GiB)": 74.63, | |
| "step": 2260, | |
| "token_acc": 0.6361031518624641, | |
| "train_speed(iter/s)": 0.226993 | |
| }, | |
| { | |
| "epoch": 1.674921456292737, | |
| "grad_norm": 1.034827646235815, | |
| "learning_rate": 1.272770089146199e-06, | |
| "loss": 1.042106819152832, | |
| "memory(GiB)": 74.63, | |
| "step": 2265, | |
| "token_acc": 0.6615910503418272, | |
| "train_speed(iter/s)": 0.225676 | |
| }, | |
| { | |
| "epoch": 1.6786176307521714, | |
| "grad_norm": 0.9379338873318531, | |
| "learning_rate": 1.2445562820597035e-06, | |
| "loss": 1.056378173828125, | |
| "memory(GiB)": 74.63, | |
| "step": 2270, | |
| "token_acc": 0.6658767772511849, | |
| "train_speed(iter/s)": 0.22441 | |
| }, | |
| { | |
| "epoch": 1.682313805211606, | |
| "grad_norm": 1.018955540383726, | |
| "learning_rate": 1.2166379381199423e-06, | |
| "loss": 1.024850082397461, | |
| "memory(GiB)": 74.63, | |
| "step": 2275, | |
| "token_acc": 0.6339022954679223, | |
| "train_speed(iter/s)": 0.223236 | |
| }, | |
| { | |
| "epoch": 1.6860099796710406, | |
| "grad_norm": 0.9387152975257087, | |
| "learning_rate": 1.1890159994655425e-06, | |
| "loss": 1.0364057540893554, | |
| "memory(GiB)": 74.63, | |
| "step": 2280, | |
| "token_acc": 0.6378887070376432, | |
| "train_speed(iter/s)": 0.221993 | |
| }, | |
| { | |
| "epoch": 1.689706154130475, | |
| "grad_norm": 0.9517285751951058, | |
| "learning_rate": 1.1616913982325827e-06, | |
| "loss": 1.0173322677612304, | |
| "memory(GiB)": 74.63, | |
| "step": 2285, | |
| "token_acc": 0.63408913213448, | |
| "train_speed(iter/s)": 0.220748 | |
| }, | |
| { | |
| "epoch": 1.6934023285899094, | |
| "grad_norm": 1.1148106388917103, | |
| "learning_rate": 1.1346650565231165e-06, | |
| "loss": 1.0427886962890625, | |
| "memory(GiB)": 74.63, | |
| "step": 2290, | |
| "token_acc": 0.640251572327044, | |
| "train_speed(iter/s)": 0.219605 | |
| }, | |
| { | |
| "epoch": 1.697098503049344, | |
| "grad_norm": 1.1256757463038873, | |
| "learning_rate": 1.1079378863740686e-06, | |
| "loss": 1.0264497756958009, | |
| "memory(GiB)": 74.63, | |
| "step": 2295, | |
| "token_acc": 0.6556603773584906, | |
| "train_speed(iter/s)": 0.21844 | |
| }, | |
| { | |
| "epoch": 1.7007946775087786, | |
| "grad_norm": 1.0466875757106615, | |
| "learning_rate": 1.0815107897264555e-06, | |
| "loss": 1.0546932220458984, | |
| "memory(GiB)": 74.63, | |
| "step": 2300, | |
| "token_acc": 0.6179956896551724, | |
| "train_speed(iter/s)": 0.217293 | |
| }, | |
| { | |
| "epoch": 1.7007946775087786, | |
| "eval_loss": 0.6585622429847717, | |
| "eval_runtime": 86.3947, | |
| "eval_samples_per_second": 80.966, | |
| "eval_steps_per_second": 0.637, | |
| "eval_token_acc": 0.6323053765724668, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.704490851968213, | |
| "grad_norm": 0.9844022346926911, | |
| "learning_rate": 1.0553846583949424e-06, | |
| "loss": 1.0470151901245117, | |
| "memory(GiB)": 74.63, | |
| "step": 2305, | |
| "token_acc": 0.638003355704698, | |
| "train_speed(iter/s)": 0.213982 | |
| }, | |
| { | |
| "epoch": 1.7081870264276473, | |
| "grad_norm": 1.002842594215214, | |
| "learning_rate": 1.0295603740377591e-06, | |
| "loss": 1.0518400192260742, | |
| "memory(GiB)": 74.63, | |
| "step": 2310, | |
| "token_acc": 0.6883333333333334, | |
| "train_speed(iter/s)": 0.212941 | |
| }, | |
| { | |
| "epoch": 1.711883200887082, | |
| "grad_norm": 0.996843923558558, | |
| "learning_rate": 1.0040388081269336e-06, | |
| "loss": 1.028696632385254, | |
| "memory(GiB)": 74.63, | |
| "step": 2315, | |
| "token_acc": 0.6513243595310465, | |
| "train_speed(iter/s)": 0.211922 | |
| }, | |
| { | |
| "epoch": 1.7155793753465165, | |
| "grad_norm": 1.0304373058907095, | |
| "learning_rate": 9.788208219188932e-07, | |
| "loss": 1.0363618850708007, | |
| "memory(GiB)": 74.63, | |
| "step": 2320, | |
| "token_acc": 0.6015075376884422, | |
| "train_speed(iter/s)": 0.210816 | |
| }, | |
| { | |
| "epoch": 1.7192755498059509, | |
| "grad_norm": 1.0716438374724575, | |
| "learning_rate": 9.539072664254e-07, | |
| "loss": 1.065016269683838, | |
| "memory(GiB)": 74.63, | |
| "step": 2325, | |
| "token_acc": 0.6122448979591837, | |
| "train_speed(iter/s)": 0.20983 | |
| }, | |
| { | |
| "epoch": 1.7229717242653853, | |
| "grad_norm": 1.044849619522368, | |
| "learning_rate": 9.292989823848242e-07, | |
| "loss": 1.0461166381835938, | |
| "memory(GiB)": 74.63, | |
| "step": 2330, | |
| "token_acc": 0.6681818181818182, | |
| "train_speed(iter/s)": 0.208847 | |
| }, | |
| { | |
| "epoch": 1.7266678987248198, | |
| "grad_norm": 0.9749773034536726, | |
| "learning_rate": 9.049968002337805e-07, | |
| "loss": 1.0064781188964844, | |
| "memory(GiB)": 74.63, | |
| "step": 2335, | |
| "token_acc": 0.6454869358669834, | |
| "train_speed(iter/s)": 0.207824 | |
| }, | |
| { | |
| "epoch": 1.7303640731842544, | |
| "grad_norm": 1.0478755901703891, | |
| "learning_rate": 8.810015400790994e-07, | |
| "loss": 1.0341422080993652, | |
| "memory(GiB)": 74.63, | |
| "step": 2340, | |
| "token_acc": 0.6453608247422681, | |
| "train_speed(iter/s)": 0.20687 | |
| }, | |
| { | |
| "epoch": 1.7340602476436888, | |
| "grad_norm": 1.161076200769703, | |
| "learning_rate": 8.573140116701573e-07, | |
| "loss": 1.031747531890869, | |
| "memory(GiB)": 74.63, | |
| "step": 2345, | |
| "token_acc": 0.633889077917659, | |
| "train_speed(iter/s)": 0.205935 | |
| }, | |
| { | |
| "epoch": 1.7377564221031232, | |
| "grad_norm": 1.0465828745420171, | |
| "learning_rate": 8.339350143715452e-07, | |
| "loss": 1.026121234893799, | |
| "memory(GiB)": 74.63, | |
| "step": 2350, | |
| "token_acc": 0.6303341902313625, | |
| "train_speed(iter/s)": 0.204941 | |
| }, | |
| { | |
| "epoch": 1.7377564221031232, | |
| "eval_loss": 0.6579257845878601, | |
| "eval_runtime": 85.2135, | |
| "eval_samples_per_second": 82.088, | |
| "eval_steps_per_second": 0.645, | |
| "eval_token_acc": 0.6323030679407236, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.7414525965625578, | |
| "grad_norm": 0.9803139837060567, | |
| "learning_rate": 8.108653371360897e-07, | |
| "loss": 1.0249688148498535, | |
| "memory(GiB)": 74.63, | |
| "step": 2355, | |
| "token_acc": 0.6329644032306312, | |
| "train_speed(iter/s)": 0.202068 | |
| }, | |
| { | |
| "epoch": 1.7451487710219924, | |
| "grad_norm": 1.0773968420983469, | |
| "learning_rate": 7.881057584782448e-07, | |
| "loss": 1.014153003692627, | |
| "memory(GiB)": 74.63, | |
| "step": 2360, | |
| "token_acc": 0.6533575317604355, | |
| "train_speed(iter/s)": 0.201155 | |
| }, | |
| { | |
| "epoch": 1.7488449454814268, | |
| "grad_norm": 1.0060807449724751, | |
| "learning_rate": 7.656570464477997e-07, | |
| "loss": 1.041685199737549, | |
| "memory(GiB)": 74.63, | |
| "step": 2365, | |
| "token_acc": 0.6260771824653428, | |
| "train_speed(iter/s)": 0.20029 | |
| }, | |
| { | |
| "epoch": 1.7525411199408611, | |
| "grad_norm": 0.9990872446739557, | |
| "learning_rate": 7.435199586039721e-07, | |
| "loss": 1.025881576538086, | |
| "memory(GiB)": 74.63, | |
| "step": 2370, | |
| "token_acc": 0.6330558125192722, | |
| "train_speed(iter/s)": 0.199385 | |
| }, | |
| { | |
| "epoch": 1.7562372944002957, | |
| "grad_norm": 1.0713164560199164, | |
| "learning_rate": 7.216952419898393e-07, | |
| "loss": 1.0439919471740722, | |
| "memory(GiB)": 74.63, | |
| "step": 2375, | |
| "token_acc": 0.6618962432915921, | |
| "train_speed(iter/s)": 0.198497 | |
| }, | |
| { | |
| "epoch": 1.7599334688597303, | |
| "grad_norm": 1.0964714966010252, | |
| "learning_rate": 7.001836331071365e-07, | |
| "loss": 1.0411014556884766, | |
| "memory(GiB)": 74.63, | |
| "step": 2380, | |
| "token_acc": 0.6824512534818942, | |
| "train_speed(iter/s)": 0.197623 | |
| }, | |
| { | |
| "epoch": 1.7636296433191647, | |
| "grad_norm": 0.9737095253362634, | |
| "learning_rate": 6.789858578913877e-07, | |
| "loss": 1.0455976486206056, | |
| "memory(GiB)": 74.63, | |
| "step": 2385, | |
| "token_acc": 0.6538119252447345, | |
| "train_speed(iter/s)": 0.196798 | |
| }, | |
| { | |
| "epoch": 1.767325817778599, | |
| "grad_norm": 1.0585968573237603, | |
| "learning_rate": 6.581026316874184e-07, | |
| "loss": 1.0437522888183595, | |
| "memory(GiB)": 74.63, | |
| "step": 2390, | |
| "token_acc": 0.6448377581120944, | |
| "train_speed(iter/s)": 0.195944 | |
| }, | |
| { | |
| "epoch": 1.7710219922380337, | |
| "grad_norm": 0.9930747477126893, | |
| "learning_rate": 6.375346592252174e-07, | |
| "loss": 1.035786247253418, | |
| "memory(GiB)": 74.63, | |
| "step": 2395, | |
| "token_acc": 0.6269207129686539, | |
| "train_speed(iter/s)": 0.195132 | |
| }, | |
| { | |
| "epoch": 1.774718166697468, | |
| "grad_norm": 0.9303672570135261, | |
| "learning_rate": 6.17282634596148e-07, | |
| "loss": 1.0481432914733886, | |
| "memory(GiB)": 74.63, | |
| "step": 2400, | |
| "token_acc": 0.6504672897196262, | |
| "train_speed(iter/s)": 0.194323 | |
| }, | |
| { | |
| "epoch": 1.774718166697468, | |
| "eval_loss": 0.6574872136116028, | |
| "eval_runtime": 88.3048, | |
| "eval_samples_per_second": 79.214, | |
| "eval_steps_per_second": 0.623, | |
| "eval_token_acc": 0.6324069563691687, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.7784143411569024, | |
| "grad_norm": 1.0622942862025062, | |
| "learning_rate": 5.973472412295256e-07, | |
| "loss": 1.019943618774414, | |
| "memory(GiB)": 74.63, | |
| "step": 2405, | |
| "token_acc": 0.631666271628348, | |
| "train_speed(iter/s)": 0.191801 | |
| }, | |
| { | |
| "epoch": 1.782110515616337, | |
| "grad_norm": 1.0141126472853548, | |
| "learning_rate": 5.777291518695593e-07, | |
| "loss": 1.0454243659973144, | |
| "memory(GiB)": 74.63, | |
| "step": 2410, | |
| "token_acc": 0.6077097505668935, | |
| "train_speed(iter/s)": 0.191007 | |
| }, | |
| { | |
| "epoch": 1.7858066900757716, | |
| "grad_norm": 1.0733746716133248, | |
| "learning_rate": 5.584290285526473e-07, | |
| "loss": 1.036181640625, | |
| "memory(GiB)": 74.63, | |
| "step": 2415, | |
| "token_acc": 0.671865626874625, | |
| "train_speed(iter/s)": 0.190213 | |
| }, | |
| { | |
| "epoch": 1.789502864535206, | |
| "grad_norm": 1.011543008247962, | |
| "learning_rate": 5.394475225850338e-07, | |
| "loss": 1.0618670463562012, | |
| "memory(GiB)": 74.63, | |
| "step": 2420, | |
| "token_acc": 0.6783405172413793, | |
| "train_speed(iter/s)": 0.189455 | |
| }, | |
| { | |
| "epoch": 1.7931990389946404, | |
| "grad_norm": 0.9605401301883022, | |
| "learning_rate": 5.207852745208298e-07, | |
| "loss": 0.9933710098266602, | |
| "memory(GiB)": 74.63, | |
| "step": 2425, | |
| "token_acc": 0.6471641791044777, | |
| "train_speed(iter/s)": 0.188704 | |
| }, | |
| { | |
| "epoch": 1.796895213454075, | |
| "grad_norm": 1.1008101055992277, | |
| "learning_rate": 5.024429141404019e-07, | |
| "loss": 0.999241828918457, | |
| "memory(GiB)": 74.63, | |
| "step": 2430, | |
| "token_acc": 0.6457304163726182, | |
| "train_speed(iter/s)": 0.187948 | |
| }, | |
| { | |
| "epoch": 1.8005913879135096, | |
| "grad_norm": 0.935629646034127, | |
| "learning_rate": 4.844210604291155e-07, | |
| "loss": 1.018147087097168, | |
| "memory(GiB)": 74.63, | |
| "step": 2435, | |
| "token_acc": 0.6178369652945924, | |
| "train_speed(iter/s)": 0.187233 | |
| }, | |
| { | |
| "epoch": 1.804287562372944, | |
| "grad_norm": 0.9808937018928983, | |
| "learning_rate": 4.667203215564431e-07, | |
| "loss": 1.0448846817016602, | |
| "memory(GiB)": 74.63, | |
| "step": 2440, | |
| "token_acc": 0.6323092170465807, | |
| "train_speed(iter/s)": 0.186484 | |
| }, | |
| { | |
| "epoch": 1.8079837368323783, | |
| "grad_norm": 1.0392559529080805, | |
| "learning_rate": 4.493412948554454e-07, | |
| "loss": 1.0690251350402833, | |
| "memory(GiB)": 74.63, | |
| "step": 2445, | |
| "token_acc": 0.6409416581371545, | |
| "train_speed(iter/s)": 0.185763 | |
| }, | |
| { | |
| "epoch": 1.811679911291813, | |
| "grad_norm": 1.0661940200914148, | |
| "learning_rate": 4.3228456680261877e-07, | |
| "loss": 1.0110756874084472, | |
| "memory(GiB)": 74.63, | |
| "step": 2450, | |
| "token_acc": 0.649331352154532, | |
| "train_speed(iter/s)": 0.185079 | |
| }, | |
| { | |
| "epoch": 1.811679911291813, | |
| "eval_loss": 0.6571330428123474, | |
| "eval_runtime": 89.3337, | |
| "eval_samples_per_second": 78.302, | |
| "eval_steps_per_second": 0.616, | |
| "eval_token_acc": 0.6325177706928434, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.8153760857512475, | |
| "grad_norm": 0.9674903681690532, | |
| "learning_rate": 4.155507129980907e-07, | |
| "loss": 1.0614801406860352, | |
| "memory(GiB)": 74.63, | |
| "step": 2455, | |
| "token_acc": 0.6478157805621402, | |
| "train_speed(iter/s)": 0.182858 | |
| }, | |
| { | |
| "epoch": 1.819072260210682, | |
| "grad_norm": 1.023357784734463, | |
| "learning_rate": 3.991402981462045e-07, | |
| "loss": 1.0087343215942384, | |
| "memory(GiB)": 74.63, | |
| "step": 2460, | |
| "token_acc": 0.6711140760507005, | |
| "train_speed(iter/s)": 0.182142 | |
| }, | |
| { | |
| "epoch": 1.8227684346701163, | |
| "grad_norm": 1.1735912820708456, | |
| "learning_rate": 3.8305387603646324e-07, | |
| "loss": 1.0243083953857421, | |
| "memory(GiB)": 74.63, | |
| "step": 2465, | |
| "token_acc": 0.6599799398194583, | |
| "train_speed(iter/s)": 0.181445 | |
| }, | |
| { | |
| "epoch": 1.8264646091295509, | |
| "grad_norm": 1.0193068857696008, | |
| "learning_rate": 3.6729198952483725e-07, | |
| "loss": 1.032374095916748, | |
| "memory(GiB)": 74.63, | |
| "step": 2470, | |
| "token_acc": 0.6700460829493088, | |
| "train_speed(iter/s)": 0.180793 | |
| }, | |
| { | |
| "epoch": 1.8301607835889855, | |
| "grad_norm": 0.989197160358902, | |
| "learning_rate": 3.5185517051544494e-07, | |
| "loss": 1.053987693786621, | |
| "memory(GiB)": 74.63, | |
| "step": 2475, | |
| "token_acc": 0.6859160781055256, | |
| "train_speed(iter/s)": 0.180141 | |
| }, | |
| { | |
| "epoch": 1.8338569580484199, | |
| "grad_norm": 1.0596386275791907, | |
| "learning_rate": 3.367439399426087e-07, | |
| "loss": 1.0508078575134276, | |
| "memory(GiB)": 74.63, | |
| "step": 2480, | |
| "token_acc": 0.6111356606274856, | |
| "train_speed(iter/s)": 0.179489 | |
| }, | |
| { | |
| "epoch": 1.8375531325078542, | |
| "grad_norm": 1.0148900997448214, | |
| "learning_rate": 3.219588077532687e-07, | |
| "loss": 1.0556805610656739, | |
| "memory(GiB)": 74.63, | |
| "step": 2485, | |
| "token_acc": 0.6928414901387875, | |
| "train_speed(iter/s)": 0.178863 | |
| }, | |
| { | |
| "epoch": 1.8412493069672888, | |
| "grad_norm": 0.9468756707473351, | |
| "learning_rate": 3.075002728897747e-07, | |
| "loss": 1.0154769897460938, | |
| "memory(GiB)": 74.63, | |
| "step": 2490, | |
| "token_acc": 0.6334152334152334, | |
| "train_speed(iter/s)": 0.178234 | |
| }, | |
| { | |
| "epoch": 1.8449454814267234, | |
| "grad_norm": 0.9178809513706729, | |
| "learning_rate": 2.933688232730536e-07, | |
| "loss": 1.0376591682434082, | |
| "memory(GiB)": 74.63, | |
| "step": 2495, | |
| "token_acc": 0.6742112482853223, | |
| "train_speed(iter/s)": 0.177603 | |
| }, | |
| { | |
| "epoch": 1.8486416558861578, | |
| "grad_norm": 1.0627891032300194, | |
| "learning_rate": 2.79564935786143e-07, | |
| "loss": 1.0138132095336914, | |
| "memory(GiB)": 74.63, | |
| "step": 2500, | |
| "token_acc": 0.6157316041725401, | |
| "train_speed(iter/s)": 0.176992 | |
| }, | |
| { | |
| "epoch": 1.8486416558861578, | |
| "eval_loss": 0.6568954586982727, | |
| "eval_runtime": 89.4508, | |
| "eval_samples_per_second": 78.199, | |
| "eval_steps_per_second": 0.615, | |
| "eval_token_acc": 0.632513153429357, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.8523378303455922, | |
| "grad_norm": 1.0872052595724289, | |
| "learning_rate": 2.660890762580903e-07, | |
| "loss": 1.0546483993530273, | |
| "memory(GiB)": 74.63, | |
| "step": 2505, | |
| "token_acc": 0.6424242424242425, | |
| "train_speed(iter/s)": 0.175004 | |
| }, | |
| { | |
| "epoch": 1.8560340048050268, | |
| "grad_norm": 1.1281209574136644, | |
| "learning_rate": 2.5294169944824254e-07, | |
| "loss": 1.0317713737487793, | |
| "memory(GiB)": 74.63, | |
| "step": 2510, | |
| "token_acc": 0.6293388429752066, | |
| "train_speed(iter/s)": 0.174416 | |
| }, | |
| { | |
| "epoch": 1.8597301792644614, | |
| "grad_norm": 0.8926816061055212, | |
| "learning_rate": 2.401232490308969e-07, | |
| "loss": 1.048653793334961, | |
| "memory(GiB)": 74.63, | |
| "step": 2515, | |
| "token_acc": 0.6237929702587872, | |
| "train_speed(iter/s)": 0.173811 | |
| }, | |
| { | |
| "epoch": 1.8634263537238958, | |
| "grad_norm": 1.0912285805001078, | |
| "learning_rate": 2.2763415758032316e-07, | |
| "loss": 1.0199008941650392, | |
| "memory(GiB)": 74.63, | |
| "step": 2520, | |
| "token_acc": 0.632258064516129, | |
| "train_speed(iter/s)": 0.173239 | |
| }, | |
| { | |
| "epoch": 1.8671225281833301, | |
| "grad_norm": 1.0989085317685814, | |
| "learning_rate": 2.1547484655617513e-07, | |
| "loss": 1.010093879699707, | |
| "memory(GiB)": 74.63, | |
| "step": 2525, | |
| "token_acc": 0.6342616920651603, | |
| "train_speed(iter/s)": 0.172675 | |
| }, | |
| { | |
| "epoch": 1.8708187026427647, | |
| "grad_norm": 1.0229802711909943, | |
| "learning_rate": 2.0364572628925993e-07, | |
| "loss": 1.0246079444885254, | |
| "memory(GiB)": 74.63, | |
| "step": 2530, | |
| "token_acc": 0.717948717948718, | |
| "train_speed(iter/s)": 0.172113 | |
| }, | |
| { | |
| "epoch": 1.8745148771021993, | |
| "grad_norm": 1.1101947156669076, | |
| "learning_rate": 1.921471959676957e-07, | |
| "loss": 1.0213122367858887, | |
| "memory(GiB)": 74.63, | |
| "step": 2535, | |
| "token_acc": 0.6377079482439926, | |
| "train_speed(iter/s)": 0.171534 | |
| }, | |
| { | |
| "epoch": 1.8782110515616337, | |
| "grad_norm": 0.972824509691789, | |
| "learning_rate": 1.809796436234379e-07, | |
| "loss": 1.0392621040344239, | |
| "memory(GiB)": 74.63, | |
| "step": 2540, | |
| "token_acc": 0.6089108910891089, | |
| "train_speed(iter/s)": 0.17099 | |
| }, | |
| { | |
| "epoch": 1.881907226021068, | |
| "grad_norm": 1.0893138267742302, | |
| "learning_rate": 1.7014344611918753e-07, | |
| "loss": 1.0224065780639648, | |
| "memory(GiB)": 74.63, | |
| "step": 2545, | |
| "token_acc": 0.628198149156233, | |
| "train_speed(iter/s)": 0.170427 | |
| }, | |
| { | |
| "epoch": 1.8856034004805027, | |
| "grad_norm": 1.1303784675436226, | |
| "learning_rate": 1.5963896913566923e-07, | |
| "loss": 1.0195607185363769, | |
| "memory(GiB)": 74.63, | |
| "step": 2550, | |
| "token_acc": 0.658051689860835, | |
| "train_speed(iter/s)": 0.169871 | |
| }, | |
| { | |
| "epoch": 1.8856034004805027, | |
| "eval_loss": 0.6567226648330688, | |
| "eval_runtime": 88.1394, | |
| "eval_samples_per_second": 79.363, | |
| "eval_steps_per_second": 0.624, | |
| "eval_token_acc": 0.6325870296451402, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.8892995749399373, | |
| "grad_norm": 1.09017259229996, | |
| "learning_rate": 1.494665671592943e-07, | |
| "loss": 1.0317469596862794, | |
| "memory(GiB)": 74.63, | |
| "step": 2555, | |
| "token_acc": 0.6337277475748854, | |
| "train_speed(iter/s)": 0.168117 | |
| }, | |
| { | |
| "epoch": 1.8929957493993717, | |
| "grad_norm": 0.9679050962600252, | |
| "learning_rate": 1.3962658347019819e-07, | |
| "loss": 1.0667352676391602, | |
| "memory(GiB)": 74.63, | |
| "step": 2560, | |
| "token_acc": 0.6295540658700087, | |
| "train_speed(iter/s)": 0.167582 | |
| }, | |
| { | |
| "epoch": 1.896691923858806, | |
| "grad_norm": 0.88490239893554, | |
| "learning_rate": 1.3011935013065303e-07, | |
| "loss": 1.0192485809326173, | |
| "memory(GiB)": 74.63, | |
| "step": 2565, | |
| "token_acc": 0.599483204134367, | |
| "train_speed(iter/s)": 0.16706 | |
| }, | |
| { | |
| "epoch": 1.9003880983182406, | |
| "grad_norm": 1.0244757899454908, | |
| "learning_rate": 1.2094518797386657e-07, | |
| "loss": 1.0162858963012695, | |
| "memory(GiB)": 74.63, | |
| "step": 2570, | |
| "token_acc": 0.6262672811059908, | |
| "train_speed(iter/s)": 0.166543 | |
| }, | |
| { | |
| "epoch": 1.9040842727776752, | |
| "grad_norm": 0.9237340665622228, | |
| "learning_rate": 1.121044065931498e-07, | |
| "loss": 1.0645517349243163, | |
| "memory(GiB)": 74.63, | |
| "step": 2575, | |
| "token_acc": 0.6675933280381255, | |
| "train_speed(iter/s)": 0.166012 | |
| }, | |
| { | |
| "epoch": 1.9077804472371096, | |
| "grad_norm": 0.9745219678731106, | |
| "learning_rate": 1.0359730433147308e-07, | |
| "loss": 1.0265457153320312, | |
| "memory(GiB)": 74.63, | |
| "step": 2580, | |
| "token_acc": 0.6550632911392406, | |
| "train_speed(iter/s)": 0.165515 | |
| }, | |
| { | |
| "epoch": 1.911476621696544, | |
| "grad_norm": 1.0007256420566137, | |
| "learning_rate": 9.542416827139855e-08, | |
| "loss": 1.0198524475097657, | |
| "memory(GiB)": 74.63, | |
| "step": 2585, | |
| "token_acc": 0.6085481682496607, | |
| "train_speed(iter/s)": 0.164991 | |
| }, | |
| { | |
| "epoch": 1.9151727961559786, | |
| "grad_norm": 0.9874298790271662, | |
| "learning_rate": 8.758527422538798e-08, | |
| "loss": 1.0276208877563477, | |
| "memory(GiB)": 74.63, | |
| "step": 2590, | |
| "token_acc": 0.6413793103448275, | |
| "train_speed(iter/s)": 0.164496 | |
| }, | |
| { | |
| "epoch": 1.9188689706154132, | |
| "grad_norm": 0.985598517098827, | |
| "learning_rate": 8.008088672650016e-08, | |
| "loss": 1.0311683654785155, | |
| "memory(GiB)": 74.63, | |
| "step": 2595, | |
| "token_acc": 0.6960919540229885, | |
| "train_speed(iter/s)": 0.164012 | |
| }, | |
| { | |
| "epoch": 1.9225651450748475, | |
| "grad_norm": 0.8074933176611375, | |
| "learning_rate": 7.291125901946027e-08, | |
| "loss": 1.0470510482788087, | |
| "memory(GiB)": 74.63, | |
| "step": 2600, | |
| "token_acc": 0.6391111111111111, | |
| "train_speed(iter/s)": 0.163535 | |
| }, | |
| { | |
| "epoch": 1.9225651450748475, | |
| "eval_loss": 0.6566023230552673, | |
| "eval_runtime": 88.9043, | |
| "eval_samples_per_second": 78.68, | |
| "eval_steps_per_second": 0.619, | |
| "eval_token_acc": 0.6326458997545924, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.926261319534282, | |
| "grad_norm": 1.0347205106897759, | |
| "learning_rate": 6.607663305211675e-08, | |
| "loss": 1.0246917724609375, | |
| "memory(GiB)": 74.63, | |
| "step": 2605, | |
| "token_acc": 0.6372442184283812, | |
| "train_speed(iter/s)": 0.161902 | |
| }, | |
| { | |
| "epoch": 1.9299574939937165, | |
| "grad_norm": 0.9451923481445313, | |
| "learning_rate": 5.957723946727445e-08, | |
| "loss": 1.030987548828125, | |
| "memory(GiB)": 74.63, | |
| "step": 2610, | |
| "token_acc": 0.655980271270037, | |
| "train_speed(iter/s)": 0.161436 | |
| }, | |
| { | |
| "epoch": 1.9336536684531511, | |
| "grad_norm": 0.989048068560612, | |
| "learning_rate": 5.341329759491087e-08, | |
| "loss": 1.043976402282715, | |
| "memory(GiB)": 74.63, | |
| "step": 2615, | |
| "token_acc": 0.6610073571024335, | |
| "train_speed(iter/s)": 0.160958 | |
| }, | |
| { | |
| "epoch": 1.9373498429125855, | |
| "grad_norm": 0.9059448258322844, | |
| "learning_rate": 4.758501544477767e-08, | |
| "loss": 1.03828706741333, | |
| "memory(GiB)": 74.63, | |
| "step": 2620, | |
| "token_acc": 0.663670766319773, | |
| "train_speed(iter/s)": 0.160484 | |
| }, | |
| { | |
| "epoch": 1.9410460173720199, | |
| "grad_norm": 1.0371951958694063, | |
| "learning_rate": 4.209258969937624e-08, | |
| "loss": 1.0256452560424805, | |
| "memory(GiB)": 74.63, | |
| "step": 2625, | |
| "token_acc": 0.6571687019448214, | |
| "train_speed(iter/s)": 0.160045 | |
| }, | |
| { | |
| "epoch": 1.9447421918314545, | |
| "grad_norm": 0.9579823005570719, | |
| "learning_rate": 3.6936205707325255e-08, | |
| "loss": 1.0316158294677735, | |
| "memory(GiB)": 74.63, | |
| "step": 2630, | |
| "token_acc": 0.6658135283363803, | |
| "train_speed(iter/s)": 0.159594 | |
| }, | |
| { | |
| "epoch": 1.948438366290889, | |
| "grad_norm": 1.185629004014561, | |
| "learning_rate": 3.2116037477103454e-08, | |
| "loss": 1.0686611175537108, | |
| "memory(GiB)": 74.63, | |
| "step": 2635, | |
| "token_acc": 0.6998087954110899, | |
| "train_speed(iter/s)": 0.159158 | |
| }, | |
| { | |
| "epoch": 1.9521345407503234, | |
| "grad_norm": 0.9906589709801633, | |
| "learning_rate": 2.763224767117767e-08, | |
| "loss": 0.9920598983764648, | |
| "memory(GiB)": 74.63, | |
| "step": 2640, | |
| "token_acc": 0.6588921282798834, | |
| "train_speed(iter/s)": 0.158729 | |
| }, | |
| { | |
| "epoch": 1.9558307152097578, | |
| "grad_norm": 0.9014323974805333, | |
| "learning_rate": 2.3484987600512767e-08, | |
| "loss": 1.0331963539123534, | |
| "memory(GiB)": 74.63, | |
| "step": 2645, | |
| "token_acc": 0.6714507370054306, | |
| "train_speed(iter/s)": 0.158272 | |
| }, | |
| { | |
| "epoch": 1.9595268896691924, | |
| "grad_norm": 0.9766018351933058, | |
| "learning_rate": 1.9674397219469064e-08, | |
| "loss": 1.037597370147705, | |
| "memory(GiB)": 74.63, | |
| "step": 2650, | |
| "token_acc": 0.6561371841155235, | |
| "train_speed(iter/s)": 0.157844 | |
| }, | |
| { | |
| "epoch": 1.9595268896691924, | |
| "eval_loss": 0.6565667390823364, | |
| "eval_runtime": 88.279, | |
| "eval_samples_per_second": 79.237, | |
| "eval_steps_per_second": 0.623, | |
| "eval_token_acc": 0.6325916469086267, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.963223064128627, | |
| "grad_norm": 1.0614662558544963, | |
| "learning_rate": 1.620060512107391e-08, | |
| "loss": 1.016525936126709, | |
| "memory(GiB)": 74.63, | |
| "step": 2655, | |
| "token_acc": 0.6412867391807452, | |
| "train_speed(iter/s)": 0.156391 | |
| }, | |
| { | |
| "epoch": 1.9669192385880614, | |
| "grad_norm": 1.234699645190091, | |
| "learning_rate": 1.3063728532686225e-08, | |
| "loss": 1.0382546424865722, | |
| "memory(GiB)": 74.63, | |
| "step": 2660, | |
| "token_acc": 0.628119293974437, | |
| "train_speed(iter/s)": 0.155979 | |
| }, | |
| { | |
| "epoch": 1.9706154130474958, | |
| "grad_norm": 1.1176674856308213, | |
| "learning_rate": 1.0263873312040818e-08, | |
| "loss": 1.0646825790405274, | |
| "memory(GiB)": 74.63, | |
| "step": 2665, | |
| "token_acc": 0.6521344232515894, | |
| "train_speed(iter/s)": 0.155534 | |
| }, | |
| { | |
| "epoch": 1.9743115875069304, | |
| "grad_norm": 0.9542666956735151, | |
| "learning_rate": 7.801133943672323e-09, | |
| "loss": 1.047515296936035, | |
| "memory(GiB)": 74.63, | |
| "step": 2670, | |
| "token_acc": 0.632, | |
| "train_speed(iter/s)": 0.15513 | |
| }, | |
| { | |
| "epoch": 1.978007761966365, | |
| "grad_norm": 0.966385972017561, | |
| "learning_rate": 5.675593535731106e-09, | |
| "loss": 1.0257146835327149, | |
| "memory(GiB)": 74.63, | |
| "step": 2675, | |
| "token_acc": 0.6467647058823529, | |
| "train_speed(iter/s)": 0.15474 | |
| }, | |
| { | |
| "epoch": 1.9817039364257993, | |
| "grad_norm": 1.0905550872468757, | |
| "learning_rate": 3.887323817173272e-09, | |
| "loss": 1.0138104438781739, | |
| "memory(GiB)": 74.63, | |
| "step": 2680, | |
| "token_acc": 0.6310845431255337, | |
| "train_speed(iter/s)": 0.154324 | |
| }, | |
| { | |
| "epoch": 1.9854001108852337, | |
| "grad_norm": 1.0126426754906144, | |
| "learning_rate": 2.436385135348163e-09, | |
| "loss": 1.015495491027832, | |
| "memory(GiB)": 74.63, | |
| "step": 2685, | |
| "token_acc": 0.6567026194144838, | |
| "train_speed(iter/s)": 0.153915 | |
| }, | |
| { | |
| "epoch": 1.9890962853446683, | |
| "grad_norm": 0.8862791092932369, | |
| "learning_rate": 1.3228264539522084e-09, | |
| "loss": 1.049496841430664, | |
| "memory(GiB)": 74.63, | |
| "step": 2690, | |
| "token_acc": 0.6486280487804879, | |
| "train_speed(iter/s)": 0.153518 | |
| }, | |
| { | |
| "epoch": 1.992792459804103, | |
| "grad_norm": 1.0787160107890392, | |
| "learning_rate": 5.466853513858006e-10, | |
| "loss": 1.0067996978759766, | |
| "memory(GiB)": 74.63, | |
| "step": 2695, | |
| "token_acc": 0.6233766233766234, | |
| "train_speed(iter/s)": 0.153131 | |
| }, | |
| { | |
| "epoch": 1.9964886342635373, | |
| "grad_norm": 1.058938505423735, | |
| "learning_rate": 1.0798801947764503e-10, | |
| "loss": 1.0397415161132812, | |
| "memory(GiB)": 74.63, | |
| "step": 2700, | |
| "token_acc": 0.6839266450916937, | |
| "train_speed(iter/s)": 0.152739 | |
| }, | |
| { | |
| "epoch": 1.9964886342635373, | |
| "eval_loss": 0.6565173864364624, | |
| "eval_runtime": 87.3486, | |
| "eval_samples_per_second": 80.081, | |
| "eval_steps_per_second": 0.63, | |
| "eval_token_acc": 0.6325489372213771, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.9994455738310848, | |
| "eval_loss": 0.6564235091209412, | |
| "eval_runtime": 89.612, | |
| "eval_samples_per_second": 78.059, | |
| "eval_steps_per_second": 0.614, | |
| "eval_token_acc": 0.6324912214277963, | |
| "step": 2704 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2704, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.945781552860365e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |