{ "best_metric": 0.39331543, "best_model_checkpoint": "/group/40174/Zywoou/mm_math_reasoning/ms-swift-exp2/oly_output/SFT_text13k_geomm13k_test_mimi_e5/v0-20250615-110647/checkpoint-600", "epoch": 4.977667493796526, "eval_steps": 50, "global_step": 1005, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004962779156327543, "grad_norm": 4.768475281197631, "learning_rate": 3.921568627450981e-07, "loss": 0.829961359500885, "memory(GiB)": 34.54, "step": 1, "token_acc": 0.7476113479347347, "train_speed(iter/s)": 0.01403 }, { "epoch": 0.02481389578163772, "grad_norm": 4.398648559000095, "learning_rate": 1.96078431372549e-06, "loss": 0.8178610801696777, "memory(GiB)": 82.86, "step": 5, "token_acc": 0.7709430756159729, "train_speed(iter/s)": 0.016462 }, { "epoch": 0.04962779156327544, "grad_norm": 1.8736392022689452, "learning_rate": 3.92156862745098e-06, "loss": 0.7623313903808594, "memory(GiB)": 82.86, "step": 10, "token_acc": 0.7958252706986702, "train_speed(iter/s)": 0.016536 }, { "epoch": 0.07444168734491315, "grad_norm": 2.045825816277766, "learning_rate": 5.882352941176471e-06, "loss": 0.7174508571624756, "memory(GiB)": 82.86, "step": 15, "token_acc": 0.7836124503600792, "train_speed(iter/s)": 0.017279 }, { "epoch": 0.09925558312655088, "grad_norm": 1.4620891759847205, "learning_rate": 7.84313725490196e-06, "loss": 0.6413409233093261, "memory(GiB)": 82.86, "step": 20, "token_acc": 0.7984759662668143, "train_speed(iter/s)": 0.017064 }, { "epoch": 0.12406947890818859, "grad_norm": 1.0676208784737973, "learning_rate": 9.803921568627451e-06, "loss": 0.6111278533935547, "memory(GiB)": 82.86, "step": 25, "token_acc": 0.8271531241409104, "train_speed(iter/s)": 0.016967 }, { "epoch": 0.1488833746898263, "grad_norm": 0.7618181414678857, "learning_rate": 1.1764705882352942e-05, "loss": 0.5910769462585449, "memory(GiB)": 82.86, "step": 30, "token_acc": 0.8199785748096231, "train_speed(iter/s)": 0.016793 }, { "epoch": 0.17369727047146402, "grad_norm": 0.6486866379137493, "learning_rate": 1.3725490196078432e-05, "loss": 0.563320541381836, "memory(GiB)": 82.86, "step": 35, "token_acc": 0.8135446844258112, "train_speed(iter/s)": 0.016772 }, { "epoch": 0.19851116625310175, "grad_norm": 0.6289022048785994, "learning_rate": 1.568627450980392e-05, "loss": 0.5803719520568847, "memory(GiB)": 82.86, "step": 40, "token_acc": 0.8021277151893771, "train_speed(iter/s)": 0.016692 }, { "epoch": 0.22332506203473945, "grad_norm": 0.6554404813269014, "learning_rate": 1.7647058823529414e-05, "loss": 0.5443972110748291, "memory(GiB)": 82.86, "step": 45, "token_acc": 0.8180514241473452, "train_speed(iter/s)": 0.01667 }, { "epoch": 0.24813895781637718, "grad_norm": 0.7064848224001448, "learning_rate": 1.9607843137254903e-05, "loss": 0.5299727439880371, "memory(GiB)": 82.86, "step": 50, "token_acc": 0.8285904060313818, "train_speed(iter/s)": 0.016801 }, { "epoch": 0.24813895781637718, "eval_loss": 0.48841050267219543, "eval_runtime": 36.0679, "eval_samples_per_second": 7.181, "eval_steps_per_second": 0.471, "eval_token_acc": 0.8262438649949989, "step": 50 }, { "epoch": 0.2729528535980149, "grad_norm": 0.5611469832371672, "learning_rate": 1.9999132465602526e-05, "loss": 0.5245039939880372, "memory(GiB)": 82.86, "step": 55, "token_acc": 0.8300031537213912, "train_speed(iter/s)": 0.016271 }, { "epoch": 0.2977667493796526, "grad_norm": 0.6906532228678482, "learning_rate": 1.9995608365087945e-05, "loss": 0.5370721340179443, "memory(GiB)": 88.64, "step": 60, "token_acc": 0.8404785794642585, "train_speed(iter/s)": 0.016239 }, { "epoch": 0.3225806451612903, "grad_norm": 0.8317526593094506, "learning_rate": 1.998937443221316e-05, "loss": 0.5141131401062011, "memory(GiB)": 88.64, "step": 65, "token_acc": 0.8288326753680118, "train_speed(iter/s)": 0.016216 }, { "epoch": 0.34739454094292804, "grad_norm": 0.6085530190810974, "learning_rate": 1.9980432357011672e-05, "loss": 0.5197068214416504, "memory(GiB)": 88.64, "step": 70, "token_acc": 0.8228907383370238, "train_speed(iter/s)": 0.016263 }, { "epoch": 0.37220843672456577, "grad_norm": 0.5289893647507578, "learning_rate": 1.9968784563700586e-05, "loss": 0.5076879978179931, "memory(GiB)": 88.64, "step": 75, "token_acc": 0.8441243900160267, "train_speed(iter/s)": 0.016308 }, { "epoch": 0.3970223325062035, "grad_norm": 0.6566505741518432, "learning_rate": 1.9954434210023388e-05, "loss": 0.5057409286499024, "memory(GiB)": 88.64, "step": 80, "token_acc": 0.8405308008648832, "train_speed(iter/s)": 0.016444 }, { "epoch": 0.4218362282878412, "grad_norm": 0.6488893883989255, "learning_rate": 1.9937385186393888e-05, "loss": 0.5170788764953613, "memory(GiB)": 88.64, "step": 85, "token_acc": 0.8368028094412318, "train_speed(iter/s)": 0.016432 }, { "epoch": 0.4466501240694789, "grad_norm": 0.61918836791994, "learning_rate": 1.9917642114841505e-05, "loss": 0.4992537498474121, "memory(GiB)": 88.64, "step": 90, "token_acc": 0.8348116071872977, "train_speed(iter/s)": 0.016433 }, { "epoch": 0.47146401985111663, "grad_norm": 0.5951609226226164, "learning_rate": 1.9895210347758233e-05, "loss": 0.5035615921020508, "memory(GiB)": 88.64, "step": 95, "token_acc": 0.8535644197481864, "train_speed(iter/s)": 0.016464 }, { "epoch": 0.49627791563275436, "grad_norm": 0.5482278802904141, "learning_rate": 1.9870095966447592e-05, "loss": 0.5053007125854492, "memory(GiB)": 88.64, "step": 100, "token_acc": 0.8401285676802426, "train_speed(iter/s)": 0.01646 }, { "epoch": 0.49627791563275436, "eval_loss": 0.4579505920410156, "eval_runtime": 36.3741, "eval_samples_per_second": 7.12, "eval_steps_per_second": 0.467, "eval_token_acc": 0.8344002233025517, "step": 100 }, { "epoch": 0.5210918114143921, "grad_norm": 0.6730121532635986, "learning_rate": 1.984230577947597e-05, "loss": 0.5037405967712403, "memory(GiB)": 88.64, "step": 105, "token_acc": 0.842359144244386, "train_speed(iter/s)": 0.016306 }, { "epoch": 0.5459057071960298, "grad_norm": 0.5976214012072565, "learning_rate": 1.9811847320826818e-05, "loss": 0.5020250797271728, "memory(GiB)": 88.64, "step": 110, "token_acc": 0.8231459777567204, "train_speed(iter/s)": 0.016276 }, { "epoch": 0.5707196029776674, "grad_norm": 0.549343197081887, "learning_rate": 1.977872884785815e-05, "loss": 0.4945159912109375, "memory(GiB)": 88.64, "step": 115, "token_acc": 0.8355786045950673, "train_speed(iter/s)": 0.016303 }, { "epoch": 0.5955334987593052, "grad_norm": 0.6276703676499341, "learning_rate": 1.9742959339063977e-05, "loss": 0.47115507125854494, "memory(GiB)": 88.64, "step": 120, "token_acc": 0.8467319196172923, "train_speed(iter/s)": 0.016405 }, { "epoch": 0.6203473945409429, "grad_norm": 0.48930503328156055, "learning_rate": 1.9704548491640195e-05, "loss": 0.4859424591064453, "memory(GiB)": 88.64, "step": 125, "token_acc": 0.8370880609513801, "train_speed(iter/s)": 0.016377 }, { "epoch": 0.6451612903225806, "grad_norm": 0.5157224450118135, "learning_rate": 1.966350671885566e-05, "loss": 0.4896842956542969, "memory(GiB)": 88.64, "step": 130, "token_acc": 0.8307861188942663, "train_speed(iter/s)": 0.016357 }, { "epoch": 0.6699751861042184, "grad_norm": 0.6061582190169718, "learning_rate": 1.961984514722914e-05, "loss": 0.47826318740844725, "memory(GiB)": 88.64, "step": 135, "token_acc": 0.851224802030969, "train_speed(iter/s)": 0.016369 }, { "epoch": 0.6947890818858561, "grad_norm": 0.5516185961170281, "learning_rate": 1.957357561351287e-05, "loss": 0.471895694732666, "memory(GiB)": 88.64, "step": 140, "token_acc": 0.8424185959845344, "train_speed(iter/s)": 0.016408 }, { "epoch": 0.7196029776674938, "grad_norm": 0.4947960995048164, "learning_rate": 1.9524710661483594e-05, "loss": 0.47608461380004885, "memory(GiB)": 88.64, "step": 145, "token_acc": 0.8376492687508386, "train_speed(iter/s)": 0.016435 }, { "epoch": 0.7444168734491315, "grad_norm": 0.5952017722562365, "learning_rate": 1.9473263538541916e-05, "loss": 0.4909799575805664, "memory(GiB)": 88.64, "step": 150, "token_acc": 0.8504602706501667, "train_speed(iter/s)": 0.016418 }, { "epoch": 0.7444168734491315, "eval_loss": 0.4375256896018982, "eval_runtime": 37.0115, "eval_samples_per_second": 6.998, "eval_steps_per_second": 0.459, "eval_token_acc": 0.8389802516805843, "step": 150 }, { "epoch": 0.7692307692307693, "grad_norm": 0.5564783950854367, "learning_rate": 1.94192481921209e-05, "loss": 0.4702787399291992, "memory(GiB)": 88.64, "step": 155, "token_acc": 0.8495802700870744, "train_speed(iter/s)": 0.01631 }, { "epoch": 0.794044665012407, "grad_norm": 0.51266416113498, "learning_rate": 1.936267926590488e-05, "loss": 0.4766042709350586, "memory(GiB)": 88.64, "step": 160, "token_acc": 0.85023222889173, "train_speed(iter/s)": 0.016298 }, { "epoch": 0.8188585607940446, "grad_norm": 0.4756301510519784, "learning_rate": 1.9303572095859545e-05, "loss": 0.4743985652923584, "memory(GiB)": 88.64, "step": 165, "token_acc": 0.8259336826336148, "train_speed(iter/s)": 0.016268 }, { "epoch": 0.8436724565756824, "grad_norm": 0.530391667374793, "learning_rate": 1.92419427060743e-05, "loss": 0.47100305557250977, "memory(GiB)": 88.64, "step": 170, "token_acc": 0.8465938389259181, "train_speed(iter/s)": 0.016284 }, { "epoch": 0.8684863523573201, "grad_norm": 0.5711613629006742, "learning_rate": 1.91778078044181e-05, "loss": 0.46791276931762693, "memory(GiB)": 88.64, "step": 175, "token_acc": 0.848610394510885, "train_speed(iter/s)": 0.016279 }, { "epoch": 0.8933002481389578, "grad_norm": 0.5215383014934748, "learning_rate": 1.9111184778009934e-05, "loss": 0.46720128059387206, "memory(GiB)": 88.64, "step": 180, "token_acc": 0.8429917728410017, "train_speed(iter/s)": 0.016337 }, { "epoch": 0.9181141439205955, "grad_norm": 0.5571619722684491, "learning_rate": 1.9042091688505104e-05, "loss": 0.46036605834960936, "memory(GiB)": 88.64, "step": 185, "token_acc": 0.8357287838060233, "train_speed(iter/s)": 0.016345 }, { "epoch": 0.9429280397022333, "grad_norm": 0.59307440115033, "learning_rate": 1.89705472671987e-05, "loss": 0.4740591049194336, "memory(GiB)": 88.64, "step": 190, "token_acc": 0.8355302886053912, "train_speed(iter/s)": 0.016341 }, { "epoch": 0.967741935483871, "grad_norm": 0.5582963618116457, "learning_rate": 1.8896570909947477e-05, "loss": 0.46262359619140625, "memory(GiB)": 88.64, "step": 195, "token_acc": 0.8320320418277263, "train_speed(iter/s)": 0.016366 }, { "epoch": 0.9925558312655087, "grad_norm": 0.5657776208171481, "learning_rate": 1.88201826719116e-05, "loss": 0.462737512588501, "memory(GiB)": 88.64, "step": 200, "token_acc": 0.8392010935601458, "train_speed(iter/s)": 0.01638 }, { "epoch": 0.9925558312655087, "eval_loss": 0.4247380495071411, "eval_runtime": 36.803, "eval_samples_per_second": 7.037, "eval_steps_per_second": 0.462, "eval_token_acc": 0.8428554813798237, "step": 200 }, { "epoch": 1.0148883374689825, "grad_norm": 0.5518905639056666, "learning_rate": 1.874140326211766e-05, "loss": 0.41510915756225586, "memory(GiB)": 88.64, "step": 205, "token_acc": 0.8667589172375147, "train_speed(iter/s)": 0.016368 }, { "epoch": 1.0397022332506203, "grad_norm": 0.586511183543945, "learning_rate": 1.866025403784439e-05, "loss": 0.4149285316467285, "memory(GiB)": 88.64, "step": 210, "token_acc": 0.8553997710922171, "train_speed(iter/s)": 0.016413 }, { "epoch": 1.064516129032258, "grad_norm": 0.7352442626023928, "learning_rate": 1.8576756998832667e-05, "loss": 0.4105654716491699, "memory(GiB)": 88.64, "step": 215, "token_acc": 0.855252843136141, "train_speed(iter/s)": 0.016406 }, { "epoch": 1.0893300248138957, "grad_norm": 0.6822260227811386, "learning_rate": 1.849093478132133e-05, "loss": 0.4040283203125, "memory(GiB)": 88.64, "step": 220, "token_acc": 0.8477043011659889, "train_speed(iter/s)": 0.016437 }, { "epoch": 1.1141439205955335, "grad_norm": 0.5903650939016252, "learning_rate": 1.8402810651910444e-05, "loss": 0.3956918716430664, "memory(GiB)": 88.64, "step": 225, "token_acc": 0.8605695208768427, "train_speed(iter/s)": 0.016458 }, { "epoch": 1.1389578163771712, "grad_norm": 0.4978998007240176, "learning_rate": 1.8312408501253674e-05, "loss": 0.4057618141174316, "memory(GiB)": 88.64, "step": 230, "token_acc": 0.8609171684050843, "train_speed(iter/s)": 0.016455 }, { "epoch": 1.163771712158809, "grad_norm": 0.4872739462988421, "learning_rate": 1.8219752837581466e-05, "loss": 0.40581340789794923, "memory(GiB)": 88.64, "step": 235, "token_acc": 0.8521656572270421, "train_speed(iter/s)": 0.016453 }, { "epoch": 1.1885856079404467, "grad_norm": 0.5234851506683952, "learning_rate": 1.8124868780056814e-05, "loss": 0.3941540479660034, "memory(GiB)": 88.64, "step": 240, "token_acc": 0.8785278129421401, "train_speed(iter/s)": 0.016419 }, { "epoch": 1.2133995037220844, "grad_norm": 0.4141490126306291, "learning_rate": 1.8027782051965408e-05, "loss": 0.3891263008117676, "memory(GiB)": 88.64, "step": 245, "token_acc": 0.8737247809520465, "train_speed(iter/s)": 0.016435 }, { "epoch": 1.2382133995037221, "grad_norm": 0.5328099303792803, "learning_rate": 1.7928518973741967e-05, "loss": 0.4076822280883789, "memory(GiB)": 88.64, "step": 250, "token_acc": 0.8484884195259031, "train_speed(iter/s)": 0.016452 }, { "epoch": 1.2382133995037221, "eval_loss": 0.42026251554489136, "eval_runtime": 36.9837, "eval_samples_per_second": 7.003, "eval_steps_per_second": 0.46, "eval_token_acc": 0.8458235444627945, "step": 250 }, { "epoch": 1.2630272952853598, "grad_norm": 0.6838326859978137, "learning_rate": 1.782710645583473e-05, "loss": 0.4152885913848877, "memory(GiB)": 88.64, "step": 255, "token_acc": 0.8661420845706872, "train_speed(iter/s)": 0.016363 }, { "epoch": 1.2878411910669976, "grad_norm": 0.5427598093088785, "learning_rate": 1.7723571991409986e-05, "loss": 0.40700349807739256, "memory(GiB)": 88.64, "step": 260, "token_acc": 0.8587367250591306, "train_speed(iter/s)": 0.016379 }, { "epoch": 1.3126550868486353, "grad_norm": 0.5151142246135575, "learning_rate": 1.761794364889855e-05, "loss": 0.40430006980895994, "memory(GiB)": 88.64, "step": 265, "token_acc": 0.8575991930060525, "train_speed(iter/s)": 0.016383 }, { "epoch": 1.337468982630273, "grad_norm": 0.5015356534355152, "learning_rate": 1.751025006438643e-05, "loss": 0.40410513877868653, "memory(GiB)": 88.64, "step": 270, "token_acc": 0.8492741510067344, "train_speed(iter/s)": 0.016388 }, { "epoch": 1.3622828784119108, "grad_norm": 0.5321569853483857, "learning_rate": 1.7400520433851457e-05, "loss": 0.40181665420532225, "memory(GiB)": 88.64, "step": 275, "token_acc": 0.863210783537583, "train_speed(iter/s)": 0.016384 }, { "epoch": 1.3870967741935485, "grad_norm": 0.5040061479558677, "learning_rate": 1.728878450524822e-05, "loss": 0.39484846591949463, "memory(GiB)": 88.64, "step": 280, "token_acc": 0.8496746362506398, "train_speed(iter/s)": 0.016389 }, { "epoch": 1.4119106699751862, "grad_norm": 0.49441565611811733, "learning_rate": 1.717507257044331e-05, "loss": 0.38428053855895994, "memory(GiB)": 89.94, "step": 285, "token_acc": 0.8691993353349355, "train_speed(iter/s)": 0.01639 }, { "epoch": 1.436724565756824, "grad_norm": 0.4427481805329584, "learning_rate": 1.7059415457003144e-05, "loss": 0.38771657943725585, "memory(GiB)": 89.94, "step": 290, "token_acc": 0.8611781405251951, "train_speed(iter/s)": 0.016398 }, { "epoch": 1.4615384615384617, "grad_norm": 0.4946161550465835, "learning_rate": 1.694184451983651e-05, "loss": 0.39400653839111327, "memory(GiB)": 89.94, "step": 295, "token_acc": 0.8555248745014329, "train_speed(iter/s)": 0.016388 }, { "epoch": 1.4863523573200992, "grad_norm": 0.4970277193994229, "learning_rate": 1.682239163269422e-05, "loss": 0.3775279998779297, "memory(GiB)": 89.94, "step": 300, "token_acc": 0.870510592163004, "train_speed(iter/s)": 0.016417 }, { "epoch": 1.4863523573200992, "eval_loss": 0.4109738767147064, "eval_runtime": 37.1778, "eval_samples_per_second": 6.967, "eval_steps_per_second": 0.457, "eval_token_acc": 0.8480263310925542, "step": 300 }, { "epoch": 1.5111662531017371, "grad_norm": 0.5261002463932816, "learning_rate": 1.6701089179528032e-05, "loss": 0.3945833683013916, "memory(GiB)": 89.94, "step": 305, "token_acc": 0.8619344282927444, "train_speed(iter/s)": 0.016379 }, { "epoch": 1.5359801488833746, "grad_norm": 0.4575728138284327, "learning_rate": 1.6577970045711293e-05, "loss": 0.392360258102417, "memory(GiB)": 89.94, "step": 310, "token_acc": 0.8602967068297687, "train_speed(iter/s)": 0.016403 }, { "epoch": 1.5607940446650124, "grad_norm": 0.40177603172705895, "learning_rate": 1.6453067609123656e-05, "loss": 0.3860903739929199, "memory(GiB)": 89.94, "step": 315, "token_acc": 0.872881901939308, "train_speed(iter/s)": 0.016402 }, { "epoch": 1.58560794044665, "grad_norm": 0.4454839273958068, "learning_rate": 1.6326415731102226e-05, "loss": 0.4006852149963379, "memory(GiB)": 89.94, "step": 320, "token_acc": 0.8653790030740496, "train_speed(iter/s)": 0.01639 }, { "epoch": 1.6104218362282878, "grad_norm": 0.44655786918995494, "learning_rate": 1.619804874726171e-05, "loss": 0.39271857738494875, "memory(GiB)": 89.94, "step": 325, "token_acc": 0.8654047918520109, "train_speed(iter/s)": 0.0164 }, { "epoch": 1.6352357320099256, "grad_norm": 0.4797627971829061, "learning_rate": 1.6068001458185934e-05, "loss": 0.37825469970703124, "memory(GiB)": 89.94, "step": 330, "token_acc": 0.8561078435708074, "train_speed(iter/s)": 0.016385 }, { "epoch": 1.6600496277915633, "grad_norm": 0.4861294532147035, "learning_rate": 1.5936309119993333e-05, "loss": 0.3874125242233276, "memory(GiB)": 89.94, "step": 335, "token_acc": 0.8552511758857783, "train_speed(iter/s)": 0.016383 }, { "epoch": 1.684863523573201, "grad_norm": 0.41946071511052324, "learning_rate": 1.5803007434778915e-05, "loss": 0.38948085308074953, "memory(GiB)": 89.94, "step": 340, "token_acc": 0.8796996530315503, "train_speed(iter/s)": 0.016394 }, { "epoch": 1.7096774193548387, "grad_norm": 0.48162634486442507, "learning_rate": 1.566813254093538e-05, "loss": 0.38796045780181887, "memory(GiB)": 89.94, "step": 345, "token_acc": 0.8590043182007245, "train_speed(iter/s)": 0.016394 }, { "epoch": 1.7344913151364765, "grad_norm": 0.4580728622127146, "learning_rate": 1.553172100335588e-05, "loss": 0.38542957305908204, "memory(GiB)": 89.94, "step": 350, "token_acc": 0.8623940061939602, "train_speed(iter/s)": 0.016386 }, { "epoch": 1.7344913151364765, "eval_loss": 0.40481674671173096, "eval_runtime": 37.3868, "eval_samples_per_second": 6.928, "eval_steps_per_second": 0.455, "eval_token_acc": 0.8498720662464236, "step": 350 }, { "epoch": 1.759305210918114, "grad_norm": 0.4101110019379735, "learning_rate": 1.5393809803521213e-05, "loss": 0.3963811159133911, "memory(GiB)": 89.94, "step": 355, "token_acc": 0.8648625816625366, "train_speed(iter/s)": 0.016344 }, { "epoch": 1.7841191066997517, "grad_norm": 0.4729658162675547, "learning_rate": 1.5254436329474062e-05, "loss": 0.38416252136230467, "memory(GiB)": 89.94, "step": 360, "token_acc": 0.8674258253238613, "train_speed(iter/s)": 0.01635 }, { "epoch": 1.8089330024813894, "grad_norm": 0.48983508519169017, "learning_rate": 1.5113638365682996e-05, "loss": 0.3992438316345215, "memory(GiB)": 89.94, "step": 365, "token_acc": 0.8770906339598599, "train_speed(iter/s)": 0.016361 }, { "epoch": 1.8337468982630272, "grad_norm": 0.5401851259187025, "learning_rate": 1.4971454082799029e-05, "loss": 0.38352556228637696, "memory(GiB)": 89.94, "step": 370, "token_acc": 0.860992567369929, "train_speed(iter/s)": 0.016358 }, { "epoch": 1.858560794044665, "grad_norm": 0.4230668354018143, "learning_rate": 1.482792202730745e-05, "loss": 0.3897742748260498, "memory(GiB)": 89.94, "step": 375, "token_acc": 0.871132879925645, "train_speed(iter/s)": 0.016351 }, { "epoch": 1.8833746898263026, "grad_norm": 0.4019049115904182, "learning_rate": 1.4683081111077807e-05, "loss": 0.39033985137939453, "memory(GiB)": 89.94, "step": 380, "token_acc": 0.8773624177836983, "train_speed(iter/s)": 0.016354 }, { "epoch": 1.9081885856079404, "grad_norm": 0.5428075344626317, "learning_rate": 1.4536970600814789e-05, "loss": 0.3880493640899658, "memory(GiB)": 89.94, "step": 385, "token_acc": 0.859438589168319, "train_speed(iter/s)": 0.016358 }, { "epoch": 1.933002481389578, "grad_norm": 0.4827683055924454, "learning_rate": 1.4389630107412942e-05, "loss": 0.38936262130737304, "memory(GiB)": 89.94, "step": 390, "token_acc": 0.8606305598661481, "train_speed(iter/s)": 0.016355 }, { "epoch": 1.9578163771712158, "grad_norm": 0.4573581809973034, "learning_rate": 1.424109957521806e-05, "loss": 0.3780574560165405, "memory(GiB)": 89.94, "step": 395, "token_acc": 0.8642330574236937, "train_speed(iter/s)": 0.01638 }, { "epoch": 1.9826302729528535, "grad_norm": 0.4605796202723116, "learning_rate": 1.4091419271198197e-05, "loss": 0.3744480848312378, "memory(GiB)": 89.94, "step": 400, "token_acc": 0.8632874334053888, "train_speed(iter/s)": 0.016393 }, { "epoch": 1.9826302729528535, "eval_loss": 0.40109848976135254, "eval_runtime": 36.9238, "eval_samples_per_second": 7.014, "eval_steps_per_second": 0.46, "eval_token_acc": 0.8518154962666604, "step": 400 }, { "epoch": 2.0049627791563274, "grad_norm": 0.561513288720162, "learning_rate": 1.394062977402717e-05, "loss": 0.36142630577087403, "memory(GiB)": 89.94, "step": 405, "token_acc": 0.8730374487048237, "train_speed(iter/s)": 0.016389 }, { "epoch": 2.029776674937965, "grad_norm": 0.5236938495980928, "learning_rate": 1.378877196308361e-05, "loss": 0.30889334678649905, "memory(GiB)": 89.94, "step": 410, "token_acc": 0.8946576830463284, "train_speed(iter/s)": 0.016398 }, { "epoch": 2.054590570719603, "grad_norm": 0.45558075636252165, "learning_rate": 1.3635887007368467e-05, "loss": 0.3037071228027344, "memory(GiB)": 89.94, "step": 415, "token_acc": 0.8943636184386575, "train_speed(iter/s)": 0.016406 }, { "epoch": 2.0794044665012406, "grad_norm": 0.5039448394161661, "learning_rate": 1.348201635434399e-05, "loss": 0.3107598781585693, "memory(GiB)": 89.94, "step": 420, "token_acc": 0.8958745781247054, "train_speed(iter/s)": 0.016399 }, { "epoch": 2.1042183622828783, "grad_norm": 0.4915435678643858, "learning_rate": 1.3327201718697232e-05, "loss": 0.3129460334777832, "memory(GiB)": 89.94, "step": 425, "token_acc": 0.8708129581052927, "train_speed(iter/s)": 0.016409 }, { "epoch": 2.129032258064516, "grad_norm": 0.4526399583483296, "learning_rate": 1.31714850710311e-05, "loss": 0.3166365146636963, "memory(GiB)": 89.94, "step": 430, "token_acc": 0.89759354279269, "train_speed(iter/s)": 0.016395 }, { "epoch": 2.1538461538461537, "grad_norm": 0.47166304759252065, "learning_rate": 1.3014908626486032e-05, "loss": 0.30022444725036623, "memory(GiB)": 89.94, "step": 435, "token_acc": 0.8798537849342958, "train_speed(iter/s)": 0.016395 }, { "epoch": 2.1786600496277915, "grad_norm": 0.4123439151346761, "learning_rate": 1.2857514833295369e-05, "loss": 0.3049207925796509, "memory(GiB)": 89.94, "step": 440, "token_acc": 0.8795848668205352, "train_speed(iter/s)": 0.016409 }, { "epoch": 2.203473945409429, "grad_norm": 0.405269067838883, "learning_rate": 1.2699346361277538e-05, "loss": 0.3032404661178589, "memory(GiB)": 89.94, "step": 445, "token_acc": 0.893778727363035, "train_speed(iter/s)": 0.016409 }, { "epoch": 2.228287841191067, "grad_norm": 0.37939542721602953, "learning_rate": 1.2540446090268193e-05, "loss": 0.3014317512512207, "memory(GiB)": 89.94, "step": 450, "token_acc": 0.8831544347304245, "train_speed(iter/s)": 0.016419 }, { "epoch": 2.228287841191067, "eval_loss": 0.4072587788105011, "eval_runtime": 37.3946, "eval_samples_per_second": 6.926, "eval_steps_per_second": 0.455, "eval_token_acc": 0.8520934614221581, "step": 450 }, { "epoch": 2.2531017369727047, "grad_norm": 0.3861335333797535, "learning_rate": 1.2380857098495355e-05, "loss": 0.30447826385498045, "memory(GiB)": 89.94, "step": 455, "token_acc": 0.881784783123963, "train_speed(iter/s)": 0.016377 }, { "epoch": 2.2779156327543424, "grad_norm": 0.40103897362549834, "learning_rate": 1.2220622650900833e-05, "loss": 0.306304407119751, "memory(GiB)": 89.94, "step": 460, "token_acc": 0.9054849560829752, "train_speed(iter/s)": 0.016371 }, { "epoch": 2.30272952853598, "grad_norm": 0.41467304098935237, "learning_rate": 1.2059786187410984e-05, "loss": 0.31237101554870605, "memory(GiB)": 89.94, "step": 465, "token_acc": 0.8715191597554331, "train_speed(iter/s)": 0.016373 }, { "epoch": 2.327543424317618, "grad_norm": 0.40212608233202607, "learning_rate": 1.1898391311160067e-05, "loss": 0.30408382415771484, "memory(GiB)": 89.94, "step": 470, "token_acc": 0.8885567438564482, "train_speed(iter/s)": 0.01638 }, { "epoch": 2.3523573200992556, "grad_norm": 0.4506239257919707, "learning_rate": 1.1736481776669307e-05, "loss": 0.2938546180725098, "memory(GiB)": 89.94, "step": 475, "token_acc": 0.9089126511337576, "train_speed(iter/s)": 0.016382 }, { "epoch": 2.3771712158808933, "grad_norm": 0.42538484493612555, "learning_rate": 1.1574101477984966e-05, "loss": 0.3105756759643555, "memory(GiB)": 89.94, "step": 480, "token_acc": 0.8806113552657332, "train_speed(iter/s)": 0.016388 }, { "epoch": 2.401985111662531, "grad_norm": 0.4226912481964332, "learning_rate": 1.1411294436778562e-05, "loss": 0.3021634578704834, "memory(GiB)": 89.94, "step": 485, "token_acc": 0.8833968731418979, "train_speed(iter/s)": 0.01639 }, { "epoch": 2.4267990074441688, "grad_norm": 0.449080842036963, "learning_rate": 1.124810479041248e-05, "loss": 0.3032838344573975, "memory(GiB)": 89.94, "step": 490, "token_acc": 0.877714128906116, "train_speed(iter/s)": 0.016386 }, { "epoch": 2.4516129032258065, "grad_norm": 0.4577775380023549, "learning_rate": 1.1084576779974257e-05, "loss": 0.3055537223815918, "memory(GiB)": 89.94, "step": 495, "token_acc": 0.8952398880779006, "train_speed(iter/s)": 0.016386 }, { "epoch": 2.4764267990074442, "grad_norm": 0.4327610654637386, "learning_rate": 1.092075473828269e-05, "loss": 0.3270174741744995, "memory(GiB)": 89.94, "step": 500, "token_acc": 0.8941933336227983, "train_speed(iter/s)": 0.016386 }, { "epoch": 2.4764267990074442, "eval_loss": 0.3992994427680969, "eval_runtime": 37.0915, "eval_samples_per_second": 6.983, "eval_steps_per_second": 0.458, "eval_token_acc": 0.8543858016794212, "step": 500 }, { "epoch": 2.501240694789082, "grad_norm": 0.43447637201251166, "learning_rate": 1.0756683077869133e-05, "loss": 0.30214927196502683, "memory(GiB)": 89.94, "step": 505, "token_acc": 0.8810097805974094, "train_speed(iter/s)": 0.016353 }, { "epoch": 2.5260545905707197, "grad_norm": 0.4282670862928906, "learning_rate": 1.0592406278937143e-05, "loss": 0.29954004287719727, "memory(GiB)": 89.94, "step": 510, "token_acc": 0.8962592483454521, "train_speed(iter/s)": 0.016353 }, { "epoch": 2.5508684863523574, "grad_norm": 0.4520012444271277, "learning_rate": 1.0427968877303809e-05, "loss": 0.29749062061309817, "memory(GiB)": 89.94, "step": 515, "token_acc": 0.88993587445068, "train_speed(iter/s)": 0.016352 }, { "epoch": 2.575682382133995, "grad_norm": 0.4094260181661943, "learning_rate": 1.0263415452325967e-05, "loss": 0.30545458793640134, "memory(GiB)": 89.94, "step": 520, "token_acc": 0.8991213678952933, "train_speed(iter/s)": 0.01636 }, { "epoch": 2.600496277915633, "grad_norm": 0.4362439165953098, "learning_rate": 1.0098790614814658e-05, "loss": 0.29534034729003905, "memory(GiB)": 89.94, "step": 525, "token_acc": 0.8916807528895793, "train_speed(iter/s)": 0.016369 }, { "epoch": 2.6253101736972706, "grad_norm": 0.4946976315555759, "learning_rate": 9.934138994941023e-06, "loss": 0.3051294803619385, "memory(GiB)": 89.94, "step": 530, "token_acc": 0.8991352720121762, "train_speed(iter/s)": 0.016377 }, { "epoch": 2.6501240694789083, "grad_norm": 0.4359207003478648, "learning_rate": 9.769505230136962e-06, "loss": 0.2859165191650391, "memory(GiB)": 89.94, "step": 535, "token_acc": 0.8830987088713036, "train_speed(iter/s)": 0.016394 }, { "epoch": 2.674937965260546, "grad_norm": 0.45484329583426325, "learning_rate": 9.604933952993822e-06, "loss": 0.2968073606491089, "memory(GiB)": 89.94, "step": 540, "token_acc": 0.893879447175874, "train_speed(iter/s)": 0.016401 }, { "epoch": 2.699751861042184, "grad_norm": 0.39016064380810367, "learning_rate": 9.440469779162407e-06, "loss": 0.30095710754394533, "memory(GiB)": 89.94, "step": 545, "token_acc": 0.8819626291391867, "train_speed(iter/s)": 0.016402 }, { "epoch": 2.7245657568238215, "grad_norm": 0.3894440736965737, "learning_rate": 9.276157295257566e-06, "loss": 0.297087574005127, "memory(GiB)": 89.94, "step": 550, "token_acc": 0.8939326285376584, "train_speed(iter/s)": 0.016414 }, { "epoch": 2.7245657568238215, "eval_loss": 0.39560389518737793, "eval_runtime": 36.8591, "eval_samples_per_second": 7.027, "eval_steps_per_second": 0.461, "eval_token_acc": 0.856389709474076, "step": 550 }, { "epoch": 2.749379652605459, "grad_norm": 0.398345793637711, "learning_rate": 9.112041046770653e-06, "loss": 0.3055715084075928, "memory(GiB)": 89.94, "step": 555, "token_acc": 0.8971510298173304, "train_speed(iter/s)": 0.016389 }, { "epoch": 2.774193548387097, "grad_norm": 0.4307995028287168, "learning_rate": 8.948165525993162e-06, "loss": 0.30808732509613035, "memory(GiB)": 89.94, "step": 560, "token_acc": 0.888341487335637, "train_speed(iter/s)": 0.016392 }, { "epoch": 2.7990074441687343, "grad_norm": 0.42720829430324325, "learning_rate": 8.784575159954748e-06, "loss": 0.29772372245788575, "memory(GiB)": 89.94, "step": 565, "token_acc": 0.8964727272727273, "train_speed(iter/s)": 0.016385 }, { "epoch": 2.8238213399503724, "grad_norm": 0.378943802971695, "learning_rate": 8.621314298378958e-06, "loss": 0.2994475126266479, "memory(GiB)": 89.94, "step": 570, "token_acc": 0.8991428363722879, "train_speed(iter/s)": 0.016389 }, { "epoch": 2.8486352357320097, "grad_norm": 0.39340559598269975, "learning_rate": 8.458427201659926e-06, "loss": 0.3069624662399292, "memory(GiB)": 89.94, "step": 575, "token_acc": 0.8998604899265521, "train_speed(iter/s)": 0.016397 }, { "epoch": 2.873449131513648, "grad_norm": 0.4314024894376894, "learning_rate": 8.295958028863285e-06, "loss": 0.30196504592895507, "memory(GiB)": 89.94, "step": 580, "token_acc": 0.9014831273211464, "train_speed(iter/s)": 0.016408 }, { "epoch": 2.898263027295285, "grad_norm": 0.4237139579995691, "learning_rate": 8.133950825754511e-06, "loss": 0.2988776683807373, "memory(GiB)": 89.94, "step": 585, "token_acc": 0.9028119489350919, "train_speed(iter/s)": 0.016424 }, { "epoch": 2.9230769230769234, "grad_norm": 0.4130611959829037, "learning_rate": 7.972449512858062e-06, "loss": 0.30088846683502196, "memory(GiB)": 89.94, "step": 590, "token_acc": 0.8994833915566345, "train_speed(iter/s)": 0.01642 }, { "epoch": 2.9478908188585606, "grad_norm": 0.4169476470359364, "learning_rate": 7.81149787355039e-06, "loss": 0.3129019021987915, "memory(GiB)": 89.94, "step": 595, "token_acc": 0.8943572216882053, "train_speed(iter/s)": 0.016416 }, { "epoch": 2.9727047146401984, "grad_norm": 0.4840772402114463, "learning_rate": 7.651139542190164e-06, "loss": 0.27456250190734866, "memory(GiB)": 89.94, "step": 600, "token_acc": 0.896454707029423, "train_speed(iter/s)": 0.016428 }, { "epoch": 2.9727047146401984, "eval_loss": 0.3933154344558716, "eval_runtime": 37.5718, "eval_samples_per_second": 6.893, "eval_steps_per_second": 0.452, "eval_token_acc": 0.8575376241538927, "step": 600 }, { "epoch": 2.997518610421836, "grad_norm": 0.3681469729562029, "learning_rate": 7.491417992288927e-06, "loss": 0.29853529930114747, "memory(GiB)": 89.94, "step": 605, "token_acc": 0.8887139445589062, "train_speed(iter/s)": 0.016389 }, { "epoch": 3.0198511166253104, "grad_norm": 0.46754553369454177, "learning_rate": 7.332376524725298e-06, "loss": 0.25056142807006837, "memory(GiB)": 89.94, "step": 610, "token_acc": 0.9005600640073151, "train_speed(iter/s)": 0.016407 }, { "epoch": 3.044665012406948, "grad_norm": 0.3905145343550401, "learning_rate": 7.174058256006012e-06, "loss": 0.24309511184692384, "memory(GiB)": 89.94, "step": 615, "token_acc": 0.9053923110803498, "train_speed(iter/s)": 0.016402 }, { "epoch": 3.069478908188586, "grad_norm": 0.41901102410789404, "learning_rate": 7.016506106576942e-06, "loss": 0.23708744049072267, "memory(GiB)": 89.94, "step": 620, "token_acc": 0.921517517259178, "train_speed(iter/s)": 0.016398 }, { "epoch": 3.094292803970223, "grad_norm": 0.36157357718683, "learning_rate": 6.859762789187259e-06, "loss": 0.23210906982421875, "memory(GiB)": 89.94, "step": 625, "token_acc": 0.9225797451873896, "train_speed(iter/s)": 0.016406 }, { "epoch": 3.119106699751861, "grad_norm": 0.3805761890187129, "learning_rate": 6.703870797309922e-06, "loss": 0.2322997808456421, "memory(GiB)": 89.94, "step": 630, "token_acc": 0.9135864765989655, "train_speed(iter/s)": 0.016415 }, { "epoch": 3.1439205955334986, "grad_norm": 0.3788702415059945, "learning_rate": 6.548872393621578e-06, "loss": 0.22191863059997557, "memory(GiB)": 89.94, "step": 635, "token_acc": 0.9246411397786463, "train_speed(iter/s)": 0.016413 }, { "epoch": 3.1687344913151363, "grad_norm": 0.4063569428162902, "learning_rate": 6.3948095985450755e-06, "loss": 0.24599046707153321, "memory(GiB)": 89.94, "step": 640, "token_acc": 0.9072380405759489, "train_speed(iter/s)": 0.016415 }, { "epoch": 3.193548387096774, "grad_norm": 0.3557259507590387, "learning_rate": 6.241724178857621e-06, "loss": 0.23447060585021973, "memory(GiB)": 89.94, "step": 645, "token_acc": 0.9291399599260514, "train_speed(iter/s)": 0.016413 }, { "epoch": 3.2183622828784118, "grad_norm": 0.3295522367686504, "learning_rate": 6.089657636367698e-06, "loss": 0.23479061126708983, "memory(GiB)": 89.94, "step": 650, "token_acc": 0.9199351455699978, "train_speed(iter/s)": 0.016409 }, { "epoch": 3.2183622828784118, "eval_loss": 0.4014388620853424, "eval_runtime": 37.5517, "eval_samples_per_second": 6.897, "eval_steps_per_second": 0.453, "eval_token_acc": 0.858085413226024, "step": 650 }, { "epoch": 3.2431761786600495, "grad_norm": 0.4240123332842028, "learning_rate": 5.938651196663865e-06, "loss": 0.22697579860687256, "memory(GiB)": 89.94, "step": 655, "token_acc": 0.8996346346571273, "train_speed(iter/s)": 0.016378 }, { "epoch": 3.267990074441687, "grad_norm": 0.35633031030131934, "learning_rate": 5.788745797938372e-06, "loss": 0.2304630994796753, "memory(GiB)": 89.94, "step": 660, "token_acc": 0.9186445328577308, "train_speed(iter/s)": 0.01638 }, { "epoch": 3.292803970223325, "grad_norm": 0.38300755718973123, "learning_rate": 5.6399820798887266e-06, "loss": 0.23805148601531984, "memory(GiB)": 89.94, "step": 665, "token_acc": 0.9132490030897615, "train_speed(iter/s)": 0.016388 }, { "epoch": 3.3176178660049627, "grad_norm": 0.3833393689853116, "learning_rate": 5.492400372700195e-06, "loss": 0.23390157222747804, "memory(GiB)": 89.94, "step": 670, "token_acc": 0.9085175452156439, "train_speed(iter/s)": 0.016384 }, { "epoch": 3.3424317617866004, "grad_norm": 0.3525024053865293, "learning_rate": 5.346040686112189e-06, "loss": 0.23395137786865233, "memory(GiB)": 89.94, "step": 675, "token_acc": 0.9463714867825449, "train_speed(iter/s)": 0.016389 }, { "epoch": 3.367245657568238, "grad_norm": 0.3487934368974318, "learning_rate": 5.200942698571527e-06, "loss": 0.22507448196411134, "memory(GiB)": 89.94, "step": 680, "token_acc": 0.9121472535129486, "train_speed(iter/s)": 0.016392 }, { "epoch": 3.392059553349876, "grad_norm": 0.35248625950301066, "learning_rate": 5.0571457464755226e-06, "loss": 0.23436686992645264, "memory(GiB)": 89.94, "step": 685, "token_acc": 0.9249912229851648, "train_speed(iter/s)": 0.016389 }, { "epoch": 3.4168734491315136, "grad_norm": 0.38081790230168544, "learning_rate": 4.914688813507798e-06, "loss": 0.2353280544281006, "memory(GiB)": 89.94, "step": 690, "token_acc": 0.9169562569412102, "train_speed(iter/s)": 0.016393 }, { "epoch": 3.4416873449131513, "grad_norm": 0.3810939058594302, "learning_rate": 4.773610520069706e-06, "loss": 0.23074874877929688, "memory(GiB)": 89.94, "step": 695, "token_acc": 0.896226352801347, "train_speed(iter/s)": 0.0164 }, { "epoch": 3.466501240694789, "grad_norm": 0.3874910212223461, "learning_rate": 4.633949112810271e-06, "loss": 0.22984590530395507, "memory(GiB)": 89.94, "step": 700, "token_acc": 0.9299721620785648, "train_speed(iter/s)": 0.016397 }, { "epoch": 3.466501240694789, "eval_loss": 0.3995462656021118, "eval_runtime": 37.598, "eval_samples_per_second": 6.889, "eval_steps_per_second": 0.452, "eval_token_acc": 0.8598125188993045, "step": 700 }, { "epoch": 3.4913151364764268, "grad_norm": 0.3616597597470475, "learning_rate": 4.495742454257418e-06, "loss": 0.2223682403564453, "memory(GiB)": 89.94, "step": 705, "token_acc": 0.9105715421148296, "train_speed(iter/s)": 0.016371 }, { "epoch": 3.5161290322580645, "grad_norm": 0.3988361808995699, "learning_rate": 4.359028012553362e-06, "loss": 0.2163018226623535, "memory(GiB)": 89.94, "step": 710, "token_acc": 0.9318655704692665, "train_speed(iter/s)": 0.01638 }, { "epoch": 3.5409429280397022, "grad_norm": 0.3932524566015415, "learning_rate": 4.223842851296907e-06, "loss": 0.23104467391967773, "memory(GiB)": 89.94, "step": 715, "token_acc": 0.9287522767981982, "train_speed(iter/s)": 0.016381 }, { "epoch": 3.56575682382134, "grad_norm": 0.367974151309137, "learning_rate": 4.090223619495419e-06, "loss": 0.23323664665222169, "memory(GiB)": 89.94, "step": 720, "token_acc": 0.9070000777236852, "train_speed(iter/s)": 0.016385 }, { "epoch": 3.5905707196029777, "grad_norm": 0.39001981721162915, "learning_rate": 3.9582065416291926e-06, "loss": 0.22505450248718262, "memory(GiB)": 89.94, "step": 725, "token_acc": 0.9127140748875999, "train_speed(iter/s)": 0.016399 }, { "epoch": 3.6153846153846154, "grad_norm": 0.3767818229832471, "learning_rate": 3.827827407830917e-06, "loss": 0.2194854736328125, "memory(GiB)": 89.94, "step": 730, "token_acc": 0.9305206381130868, "train_speed(iter/s)": 0.0164 }, { "epoch": 3.640198511166253, "grad_norm": 0.41209090580965746, "learning_rate": 3.6991215641828903e-06, "loss": 0.217703914642334, "memory(GiB)": 89.94, "step": 735, "token_acc": 0.918243838028169, "train_speed(iter/s)": 0.016398 }, { "epoch": 3.665012406947891, "grad_norm": 0.37397331270090406, "learning_rate": 3.5721239031346067e-06, "loss": 0.2251272201538086, "memory(GiB)": 89.94, "step": 740, "token_acc": 0.9240038816389786, "train_speed(iter/s)": 0.016395 }, { "epoch": 3.6898263027295286, "grad_norm": 0.40120593317876174, "learning_rate": 3.4468688540433425e-06, "loss": 0.22675325870513915, "memory(GiB)": 89.94, "step": 745, "token_acc": 0.9157191822608735, "train_speed(iter/s)": 0.016404 }, { "epoch": 3.7146401985111663, "grad_norm": 0.36315220537470405, "learning_rate": 3.323390373840276e-06, "loss": 0.23883156776428222, "memory(GiB)": 89.94, "step": 750, "token_acc": 0.9140917431192661, "train_speed(iter/s)": 0.016405 }, { "epoch": 3.7146401985111663, "eval_loss": 0.3998472988605499, "eval_runtime": 37.7998, "eval_samples_per_second": 6.852, "eval_steps_per_second": 0.45, "eval_token_acc": 0.8604498615989393, "step": 750 }, { "epoch": 3.739454094292804, "grad_norm": 0.35204470112010783, "learning_rate": 3.2017219378246734e-06, "loss": 0.22259049415588378, "memory(GiB)": 89.94, "step": 755, "token_acc": 0.9101186207181223, "train_speed(iter/s)": 0.016387 }, { "epoch": 3.764267990074442, "grad_norm": 0.3243960514310002, "learning_rate": 3.0818965305886794e-06, "loss": 0.22415781021118164, "memory(GiB)": 89.94, "step": 760, "token_acc": 0.9288042256686896, "train_speed(iter/s)": 0.016394 }, { "epoch": 3.7890818858560795, "grad_norm": 0.4054124995924202, "learning_rate": 2.963946637075107e-06, "loss": 0.2139434337615967, "memory(GiB)": 89.94, "step": 765, "token_acc": 0.897647245531552, "train_speed(iter/s)": 0.016399 }, { "epoch": 3.8138957816377173, "grad_norm": 0.4068548613060828, "learning_rate": 2.847904233770692e-06, "loss": 0.23969681262969972, "memory(GiB)": 89.94, "step": 770, "token_acc": 0.9088209109362202, "train_speed(iter/s)": 0.016384 }, { "epoch": 3.838709677419355, "grad_norm": 0.37626609353926294, "learning_rate": 2.7338007800372024e-06, "loss": 0.2259690284729004, "memory(GiB)": 89.94, "step": 775, "token_acc": 0.9220249520153551, "train_speed(iter/s)": 0.016383 }, { "epoch": 3.8635235732009927, "grad_norm": 0.3294821061527916, "learning_rate": 2.6216672095827267e-06, "loss": 0.2296532154083252, "memory(GiB)": 89.94, "step": 780, "token_acc": 0.9264005360302354, "train_speed(iter/s)": 0.016383 }, { "epoch": 3.8883374689826304, "grad_norm": 0.4192434658643637, "learning_rate": 2.5115339220754796e-06, "loss": 0.21465823650360108, "memory(GiB)": 89.94, "step": 785, "token_acc": 0.9244031530683692, "train_speed(iter/s)": 0.016387 }, { "epoch": 3.9131513647642677, "grad_norm": 0.3555370222229151, "learning_rate": 2.403430774902373e-06, "loss": 0.23048720359802247, "memory(GiB)": 89.94, "step": 790, "token_acc": 0.9265224313413317, "train_speed(iter/s)": 0.016385 }, { "epoch": 3.937965260545906, "grad_norm": 0.32165359038918095, "learning_rate": 2.2973870750746253e-06, "loss": 0.21675100326538085, "memory(GiB)": 89.94, "step": 795, "token_acc": 0.9277675867070517, "train_speed(iter/s)": 0.016387 }, { "epoch": 3.962779156327543, "grad_norm": 0.3691842237508398, "learning_rate": 2.193431571282548e-06, "loss": 0.22982077598571776, "memory(GiB)": 89.94, "step": 800, "token_acc": 0.9162534196640608, "train_speed(iter/s)": 0.016398 }, { "epoch": 3.962779156327543, "eval_loss": 0.3970131278038025, "eval_runtime": 37.5975, "eval_samples_per_second": 6.889, "eval_steps_per_second": 0.452, "eval_token_acc": 0.8609941615687005, "step": 800 }, { "epoch": 3.9875930521091814, "grad_norm": 0.3711396772864637, "learning_rate": 2.09159244610172e-06, "loss": 0.22216348648071288, "memory(GiB)": 89.94, "step": 805, "token_acc": 0.8936650190172618, "train_speed(iter/s)": 0.016386 }, { "epoch": 4.009925558312655, "grad_norm": 0.48689819654773586, "learning_rate": 1.991897308352624e-06, "loss": 0.21132183074951172, "memory(GiB)": 89.94, "step": 810, "token_acc": 0.9407312130437598, "train_speed(iter/s)": 0.016398 }, { "epoch": 4.034739454094293, "grad_norm": 0.5032884501009073, "learning_rate": 1.8943731856158299e-06, "loss": 0.18714178800582887, "memory(GiB)": 89.94, "step": 815, "token_acc": 0.9151910921344552, "train_speed(iter/s)": 0.016407 }, { "epoch": 4.05955334987593, "grad_norm": 0.47787650705591495, "learning_rate": 1.799046516904751e-06, "loss": 0.19512782096862794, "memory(GiB)": 89.94, "step": 820, "token_acc": 0.9267424518609353, "train_speed(iter/s)": 0.016407 }, { "epoch": 4.084367245657568, "grad_norm": 0.3998650942092245, "learning_rate": 1.7059431454979825e-06, "loss": 0.19887795448303222, "memory(GiB)": 89.94, "step": 825, "token_acc": 0.9344387354439184, "train_speed(iter/s)": 0.016408 }, { "epoch": 4.109181141439206, "grad_norm": 0.3511648376611919, "learning_rate": 1.615088311933114e-06, "loss": 0.20051450729370118, "memory(GiB)": 89.94, "step": 830, "token_acc": 0.94342090168636, "train_speed(iter/s)": 0.016408 }, { "epoch": 4.133995037220844, "grad_norm": 0.36198509668580564, "learning_rate": 1.5265066471639701e-06, "loss": 0.19646989107131957, "memory(GiB)": 89.94, "step": 835, "token_acc": 0.9299070545334905, "train_speed(iter/s)": 0.016403 }, { "epoch": 4.158808933002481, "grad_norm": 0.31845761934810685, "learning_rate": 1.4402221658830963e-06, "loss": 0.1856994390487671, "memory(GiB)": 89.94, "step": 840, "token_acc": 0.9413985177001335, "train_speed(iter/s)": 0.016403 }, { "epoch": 4.183622828784119, "grad_norm": 0.31837636190529284, "learning_rate": 1.3562582600113295e-06, "loss": 0.18745067119598388, "memory(GiB)": 89.94, "step": 845, "token_acc": 0.917115642208662, "train_speed(iter/s)": 0.016401 }, { "epoch": 4.208436724565757, "grad_norm": 0.36629490437638673, "learning_rate": 1.274637692356181e-06, "loss": 0.19683722257614136, "memory(GiB)": 89.94, "step": 850, "token_acc": 0.9203514759298465, "train_speed(iter/s)": 0.0164 }, { "epoch": 4.208436724565757, "eval_loss": 0.4071538746356964, "eval_runtime": 37.7446, "eval_samples_per_second": 6.862, "eval_steps_per_second": 0.45, "eval_token_acc": 0.860109092600777, "step": 850 }, { "epoch": 4.233250620347395, "grad_norm": 0.35352317433834707, "learning_rate": 1.1953825904408033e-06, "loss": 0.1799285888671875, "memory(GiB)": 89.94, "step": 855, "token_acc": 0.9219630589493604, "train_speed(iter/s)": 0.016381 }, { "epoch": 4.258064516129032, "grad_norm": 0.3360379010356317, "learning_rate": 1.118514440505155e-06, "loss": 0.19137413501739503, "memory(GiB)": 89.94, "step": 860, "token_acc": 0.9249040837868416, "train_speed(iter/s)": 0.016385 }, { "epoch": 4.28287841191067, "grad_norm": 0.3435835544317236, "learning_rate": 1.0440540816810395e-06, "loss": 0.1967417359352112, "memory(GiB)": 89.94, "step": 865, "token_acc": 0.9308696646383973, "train_speed(iter/s)": 0.016379 }, { "epoch": 4.3076923076923075, "grad_norm": 0.32859421884913725, "learning_rate": 9.720217003425648e-07, "loss": 0.1809452772140503, "memory(GiB)": 89.94, "step": 870, "token_acc": 0.9282937629449756, "train_speed(iter/s)": 0.016382 }, { "epoch": 4.332506203473946, "grad_norm": 0.3395321554617945, "learning_rate": 9.024368246335735e-07, "loss": 0.18605422973632812, "memory(GiB)": 89.94, "step": 875, "token_acc": 0.9546436861343081, "train_speed(iter/s)": 0.016384 }, { "epoch": 4.357320099255583, "grad_norm": 0.3393236511573925, "learning_rate": 8.353183191735115e-07, "loss": 0.1946401596069336, "memory(GiB)": 89.94, "step": 880, "token_acc": 0.9236204495723095, "train_speed(iter/s)": 0.016384 }, { "epoch": 4.382133995037221, "grad_norm": 0.34078095822336996, "learning_rate": 7.706843799431985e-07, "loss": 0.18827946186065675, "memory(GiB)": 89.94, "step": 885, "token_acc": 0.9302919345345024, "train_speed(iter/s)": 0.016386 }, { "epoch": 4.406947890818858, "grad_norm": 0.353572914930731, "learning_rate": 7.085525293518469e-07, "loss": 0.1933911681175232, "memory(GiB)": 89.94, "step": 890, "token_acc": 0.9303721907052539, "train_speed(iter/s)": 0.016395 }, { "epoch": 4.431761786600497, "grad_norm": 0.34090308953100995, "learning_rate": 6.489396114866942e-07, "loss": 0.18675589561462402, "memory(GiB)": 89.94, "step": 895, "token_acc": 0.9425964870708806, "train_speed(iter/s)": 0.016397 }, { "epoch": 4.456575682382134, "grad_norm": 0.34608919518252224, "learning_rate": 5.918617875465449e-07, "loss": 0.19207412004470825, "memory(GiB)": 89.94, "step": 900, "token_acc": 0.9400631757501285, "train_speed(iter/s)": 0.016399 }, { "epoch": 4.456575682382134, "eval_loss": 0.40813976526260376, "eval_runtime": 37.5655, "eval_samples_per_second": 6.895, "eval_steps_per_second": 0.453, "eval_token_acc": 0.8603184387429927, "step": 900 }, { "epoch": 4.481389578163772, "grad_norm": 0.35859843949116793, "learning_rate": 5.373345314604206e-07, "loss": 0.2071406364440918, "memory(GiB)": 89.94, "step": 905, "token_acc": 0.9223047638884068, "train_speed(iter/s)": 0.016383 }, { "epoch": 4.506203473945409, "grad_norm": 0.3089222557610059, "learning_rate": 4.853726256925407e-07, "loss": 0.1951405882835388, "memory(GiB)": 89.94, "step": 910, "token_acc": 0.9288247402227017, "train_speed(iter/s)": 0.016382 }, { "epoch": 4.5310173697270475, "grad_norm": 0.3365251848175135, "learning_rate": 4.359901572347758e-07, "loss": 0.19402856826782228, "memory(GiB)": 89.94, "step": 915, "token_acc": 0.9069553201289728, "train_speed(iter/s)": 0.016381 }, { "epoch": 4.555831265508685, "grad_norm": 0.352511343998469, "learning_rate": 3.892005137876209e-07, "loss": 0.18840408325195312, "memory(GiB)": 89.94, "step": 920, "token_acc": 0.9290194762860235, "train_speed(iter/s)": 0.016381 }, { "epoch": 4.580645161290323, "grad_norm": 0.31455756384327627, "learning_rate": 3.450163801307582e-07, "loss": 0.1860198974609375, "memory(GiB)": 89.94, "step": 925, "token_acc": 0.9461412451458935, "train_speed(iter/s)": 0.016385 }, { "epoch": 4.60545905707196, "grad_norm": 0.31757641861016306, "learning_rate": 3.034497346841958e-07, "loss": 0.1895312786102295, "memory(GiB)": 89.94, "step": 930, "token_acc": 0.918751311402793, "train_speed(iter/s)": 0.016385 }, { "epoch": 4.630272952853598, "grad_norm": 0.34601426382495787, "learning_rate": 2.6451184626087646e-07, "loss": 0.19062964916229247, "memory(GiB)": 89.94, "step": 935, "token_acc": 0.9377439769272455, "train_speed(iter/s)": 0.016388 }, { "epoch": 4.655086848635236, "grad_norm": 0.3686772430396372, "learning_rate": 2.2821327101168578e-07, "loss": 0.18338959217071532, "memory(GiB)": 89.94, "step": 940, "token_acc": 0.9286465593172508, "train_speed(iter/s)": 0.016389 }, { "epoch": 4.679900744416873, "grad_norm": 0.3329148442481642, "learning_rate": 1.9456384956365149e-07, "loss": 0.17848238945007325, "memory(GiB)": 89.94, "step": 945, "token_acc": 0.9242079340262307, "train_speed(iter/s)": 0.016388 }, { "epoch": 4.704714640198511, "grad_norm": 0.33580830230246933, "learning_rate": 1.6357270435212736e-07, "loss": 0.19694331884384156, "memory(GiB)": 89.94, "step": 950, "token_acc": 0.9275961363852546, "train_speed(iter/s)": 0.01639 }, { "epoch": 4.704714640198511, "eval_loss": 0.4078960418701172, "eval_runtime": 37.2595, "eval_samples_per_second": 6.951, "eval_steps_per_second": 0.456, "eval_token_acc": 0.8605452303970599, "step": 950 }, { "epoch": 4.729528535980149, "grad_norm": 0.3214090891564049, "learning_rate": 1.3524823714768375e-07, "loss": 0.19557987451553344, "memory(GiB)": 89.94, "step": 955, "token_acc": 0.917459338194055, "train_speed(iter/s)": 0.016374 }, { "epoch": 4.754342431761787, "grad_norm": 0.3510644774133595, "learning_rate": 1.0959812677835968e-07, "loss": 0.18159072399139403, "memory(GiB)": 89.94, "step": 960, "token_acc": 0.9338328114497434, "train_speed(iter/s)": 0.016382 }, { "epoch": 4.779156327543424, "grad_norm": 0.31852083171219153, "learning_rate": 8.662932704792793e-08, "loss": 0.18122289180755616, "memory(GiB)": 89.94, "step": 965, "token_acc": 0.9309320132692916, "train_speed(iter/s)": 0.016382 }, { "epoch": 4.803970223325062, "grad_norm": 0.318877312082489, "learning_rate": 6.63480648506909e-08, "loss": 0.19023412466049194, "memory(GiB)": 89.94, "step": 970, "token_acc": 0.9162366937555632, "train_speed(iter/s)": 0.016379 }, { "epoch": 4.8287841191067, "grad_norm": 0.34651828579056343, "learning_rate": 4.8759838483358745e-08, "loss": 0.18698248863220215, "memory(GiB)": 89.94, "step": 975, "token_acc": 0.9286984711087747, "train_speed(iter/s)": 0.016384 }, { "epoch": 4.8535980148883375, "grad_norm": 0.3062976957364449, "learning_rate": 3.386941615445283e-08, "loss": 0.18977559804916383, "memory(GiB)": 89.94, "step": 980, "token_acc": 0.9407679833647572, "train_speed(iter/s)": 0.016386 }, { "epoch": 4.878411910669975, "grad_norm": 0.29457610025451847, "learning_rate": 2.1680834691628627e-08, "loss": 0.19420017004013063, "memory(GiB)": 89.94, "step": 985, "token_acc": 0.9039991194174251, "train_speed(iter/s)": 0.016388 }, { "epoch": 4.903225806451613, "grad_norm": 0.3316519420007468, "learning_rate": 1.2197398447283404e-08, "loss": 0.1931779146194458, "memory(GiB)": 89.94, "step": 990, "token_acc": 0.9029619269298459, "train_speed(iter/s)": 0.016386 }, { "epoch": 4.92803970223325, "grad_norm": 0.34572266929696366, "learning_rate": 5.421678402741659e-09, "loss": 0.1826627492904663, "memory(GiB)": 89.94, "step": 995, "token_acc": 0.9375434960795533, "train_speed(iter/s)": 0.016383 }, { "epoch": 4.9528535980148884, "grad_norm": 0.3466792398716006, "learning_rate": 1.3555114712526796e-09, "loss": 0.18890198469161987, "memory(GiB)": 89.94, "step": 1000, "token_acc": 0.9408911997667525, "train_speed(iter/s)": 0.016393 }, { "epoch": 4.9528535980148884, "eval_loss": 0.40783175826072693, "eval_runtime": 37.8642, "eval_samples_per_second": 6.84, "eval_steps_per_second": 0.449, "eval_token_acc": 0.8604533507013096, "step": 1000 }, { "epoch": 4.977667493796526, "grad_norm": 0.36020979675081666, "learning_rate": 0.0, "loss": 0.1956263303756714, "memory(GiB)": 89.94, "step": 1005, "token_acc": 0.9227749975468551, "train_speed(iter/s)": 0.016386 }, { "epoch": 4.977667493796526, "eval_loss": 0.40774813294410706, "eval_runtime": 37.2531, "eval_samples_per_second": 6.952, "eval_steps_per_second": 0.456, "eval_token_acc": 0.8606568816729083, "step": 1005 } ], "logging_steps": 5, "max_steps": 1005, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2269146859438080.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }