| { | |
| "best_metric": 0.39331543, | |
| "best_model_checkpoint": "/group/40174/Zywoou/mm_math_reasoning/ms-swift-exp2/oly_output/SFT_text13k_geomm13k_test_mimi_e5/v0-20250615-110647/checkpoint-600", | |
| "epoch": 4.977667493796526, | |
| "eval_steps": 50, | |
| "global_step": 1005, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004962779156327543, | |
| "grad_norm": 4.768475281197631, | |
| "learning_rate": 3.921568627450981e-07, | |
| "loss": 0.829961359500885, | |
| "memory(GiB)": 34.54, | |
| "step": 1, | |
| "token_acc": 0.7476113479347347, | |
| "train_speed(iter/s)": 0.01403 | |
| }, | |
| { | |
| "epoch": 0.02481389578163772, | |
| "grad_norm": 4.398648559000095, | |
| "learning_rate": 1.96078431372549e-06, | |
| "loss": 0.8178610801696777, | |
| "memory(GiB)": 82.86, | |
| "step": 5, | |
| "token_acc": 0.7709430756159729, | |
| "train_speed(iter/s)": 0.016462 | |
| }, | |
| { | |
| "epoch": 0.04962779156327544, | |
| "grad_norm": 1.8736392022689452, | |
| "learning_rate": 3.92156862745098e-06, | |
| "loss": 0.7623313903808594, | |
| "memory(GiB)": 82.86, | |
| "step": 10, | |
| "token_acc": 0.7958252706986702, | |
| "train_speed(iter/s)": 0.016536 | |
| }, | |
| { | |
| "epoch": 0.07444168734491315, | |
| "grad_norm": 2.045825816277766, | |
| "learning_rate": 5.882352941176471e-06, | |
| "loss": 0.7174508571624756, | |
| "memory(GiB)": 82.86, | |
| "step": 15, | |
| "token_acc": 0.7836124503600792, | |
| "train_speed(iter/s)": 0.017279 | |
| }, | |
| { | |
| "epoch": 0.09925558312655088, | |
| "grad_norm": 1.4620891759847205, | |
| "learning_rate": 7.84313725490196e-06, | |
| "loss": 0.6413409233093261, | |
| "memory(GiB)": 82.86, | |
| "step": 20, | |
| "token_acc": 0.7984759662668143, | |
| "train_speed(iter/s)": 0.017064 | |
| }, | |
| { | |
| "epoch": 0.12406947890818859, | |
| "grad_norm": 1.0676208784737973, | |
| "learning_rate": 9.803921568627451e-06, | |
| "loss": 0.6111278533935547, | |
| "memory(GiB)": 82.86, | |
| "step": 25, | |
| "token_acc": 0.8271531241409104, | |
| "train_speed(iter/s)": 0.016967 | |
| }, | |
| { | |
| "epoch": 0.1488833746898263, | |
| "grad_norm": 0.7618181414678857, | |
| "learning_rate": 1.1764705882352942e-05, | |
| "loss": 0.5910769462585449, | |
| "memory(GiB)": 82.86, | |
| "step": 30, | |
| "token_acc": 0.8199785748096231, | |
| "train_speed(iter/s)": 0.016793 | |
| }, | |
| { | |
| "epoch": 0.17369727047146402, | |
| "grad_norm": 0.6486866379137493, | |
| "learning_rate": 1.3725490196078432e-05, | |
| "loss": 0.563320541381836, | |
| "memory(GiB)": 82.86, | |
| "step": 35, | |
| "token_acc": 0.8135446844258112, | |
| "train_speed(iter/s)": 0.016772 | |
| }, | |
| { | |
| "epoch": 0.19851116625310175, | |
| "grad_norm": 0.6289022048785994, | |
| "learning_rate": 1.568627450980392e-05, | |
| "loss": 0.5803719520568847, | |
| "memory(GiB)": 82.86, | |
| "step": 40, | |
| "token_acc": 0.8021277151893771, | |
| "train_speed(iter/s)": 0.016692 | |
| }, | |
| { | |
| "epoch": 0.22332506203473945, | |
| "grad_norm": 0.6554404813269014, | |
| "learning_rate": 1.7647058823529414e-05, | |
| "loss": 0.5443972110748291, | |
| "memory(GiB)": 82.86, | |
| "step": 45, | |
| "token_acc": 0.8180514241473452, | |
| "train_speed(iter/s)": 0.01667 | |
| }, | |
| { | |
| "epoch": 0.24813895781637718, | |
| "grad_norm": 0.7064848224001448, | |
| "learning_rate": 1.9607843137254903e-05, | |
| "loss": 0.5299727439880371, | |
| "memory(GiB)": 82.86, | |
| "step": 50, | |
| "token_acc": 0.8285904060313818, | |
| "train_speed(iter/s)": 0.016801 | |
| }, | |
| { | |
| "epoch": 0.24813895781637718, | |
| "eval_loss": 0.48841050267219543, | |
| "eval_runtime": 36.0679, | |
| "eval_samples_per_second": 7.181, | |
| "eval_steps_per_second": 0.471, | |
| "eval_token_acc": 0.8262438649949989, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2729528535980149, | |
| "grad_norm": 0.5611469832371672, | |
| "learning_rate": 1.9999132465602526e-05, | |
| "loss": 0.5245039939880372, | |
| "memory(GiB)": 82.86, | |
| "step": 55, | |
| "token_acc": 0.8300031537213912, | |
| "train_speed(iter/s)": 0.016271 | |
| }, | |
| { | |
| "epoch": 0.2977667493796526, | |
| "grad_norm": 0.6906532228678482, | |
| "learning_rate": 1.9995608365087945e-05, | |
| "loss": 0.5370721340179443, | |
| "memory(GiB)": 88.64, | |
| "step": 60, | |
| "token_acc": 0.8404785794642585, | |
| "train_speed(iter/s)": 0.016239 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "grad_norm": 0.8317526593094506, | |
| "learning_rate": 1.998937443221316e-05, | |
| "loss": 0.5141131401062011, | |
| "memory(GiB)": 88.64, | |
| "step": 65, | |
| "token_acc": 0.8288326753680118, | |
| "train_speed(iter/s)": 0.016216 | |
| }, | |
| { | |
| "epoch": 0.34739454094292804, | |
| "grad_norm": 0.6085530190810974, | |
| "learning_rate": 1.9980432357011672e-05, | |
| "loss": 0.5197068214416504, | |
| "memory(GiB)": 88.64, | |
| "step": 70, | |
| "token_acc": 0.8228907383370238, | |
| "train_speed(iter/s)": 0.016263 | |
| }, | |
| { | |
| "epoch": 0.37220843672456577, | |
| "grad_norm": 0.5289893647507578, | |
| "learning_rate": 1.9968784563700586e-05, | |
| "loss": 0.5076879978179931, | |
| "memory(GiB)": 88.64, | |
| "step": 75, | |
| "token_acc": 0.8441243900160267, | |
| "train_speed(iter/s)": 0.016308 | |
| }, | |
| { | |
| "epoch": 0.3970223325062035, | |
| "grad_norm": 0.6566505741518432, | |
| "learning_rate": 1.9954434210023388e-05, | |
| "loss": 0.5057409286499024, | |
| "memory(GiB)": 88.64, | |
| "step": 80, | |
| "token_acc": 0.8405308008648832, | |
| "train_speed(iter/s)": 0.016444 | |
| }, | |
| { | |
| "epoch": 0.4218362282878412, | |
| "grad_norm": 0.6488893883989255, | |
| "learning_rate": 1.9937385186393888e-05, | |
| "loss": 0.5170788764953613, | |
| "memory(GiB)": 88.64, | |
| "step": 85, | |
| "token_acc": 0.8368028094412318, | |
| "train_speed(iter/s)": 0.016432 | |
| }, | |
| { | |
| "epoch": 0.4466501240694789, | |
| "grad_norm": 0.61918836791994, | |
| "learning_rate": 1.9917642114841505e-05, | |
| "loss": 0.4992537498474121, | |
| "memory(GiB)": 88.64, | |
| "step": 90, | |
| "token_acc": 0.8348116071872977, | |
| "train_speed(iter/s)": 0.016433 | |
| }, | |
| { | |
| "epoch": 0.47146401985111663, | |
| "grad_norm": 0.5951609226226164, | |
| "learning_rate": 1.9895210347758233e-05, | |
| "loss": 0.5035615921020508, | |
| "memory(GiB)": 88.64, | |
| "step": 95, | |
| "token_acc": 0.8535644197481864, | |
| "train_speed(iter/s)": 0.016464 | |
| }, | |
| { | |
| "epoch": 0.49627791563275436, | |
| "grad_norm": 0.5482278802904141, | |
| "learning_rate": 1.9870095966447592e-05, | |
| "loss": 0.5053007125854492, | |
| "memory(GiB)": 88.64, | |
| "step": 100, | |
| "token_acc": 0.8401285676802426, | |
| "train_speed(iter/s)": 0.01646 | |
| }, | |
| { | |
| "epoch": 0.49627791563275436, | |
| "eval_loss": 0.4579505920410156, | |
| "eval_runtime": 36.3741, | |
| "eval_samples_per_second": 7.12, | |
| "eval_steps_per_second": 0.467, | |
| "eval_token_acc": 0.8344002233025517, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5210918114143921, | |
| "grad_norm": 0.6730121532635986, | |
| "learning_rate": 1.984230577947597e-05, | |
| "loss": 0.5037405967712403, | |
| "memory(GiB)": 88.64, | |
| "step": 105, | |
| "token_acc": 0.842359144244386, | |
| "train_speed(iter/s)": 0.016306 | |
| }, | |
| { | |
| "epoch": 0.5459057071960298, | |
| "grad_norm": 0.5976214012072565, | |
| "learning_rate": 1.9811847320826818e-05, | |
| "loss": 0.5020250797271728, | |
| "memory(GiB)": 88.64, | |
| "step": 110, | |
| "token_acc": 0.8231459777567204, | |
| "train_speed(iter/s)": 0.016276 | |
| }, | |
| { | |
| "epoch": 0.5707196029776674, | |
| "grad_norm": 0.549343197081887, | |
| "learning_rate": 1.977872884785815e-05, | |
| "loss": 0.4945159912109375, | |
| "memory(GiB)": 88.64, | |
| "step": 115, | |
| "token_acc": 0.8355786045950673, | |
| "train_speed(iter/s)": 0.016303 | |
| }, | |
| { | |
| "epoch": 0.5955334987593052, | |
| "grad_norm": 0.6276703676499341, | |
| "learning_rate": 1.9742959339063977e-05, | |
| "loss": 0.47115507125854494, | |
| "memory(GiB)": 88.64, | |
| "step": 120, | |
| "token_acc": 0.8467319196172923, | |
| "train_speed(iter/s)": 0.016405 | |
| }, | |
| { | |
| "epoch": 0.6203473945409429, | |
| "grad_norm": 0.48930503328156055, | |
| "learning_rate": 1.9704548491640195e-05, | |
| "loss": 0.4859424591064453, | |
| "memory(GiB)": 88.64, | |
| "step": 125, | |
| "token_acc": 0.8370880609513801, | |
| "train_speed(iter/s)": 0.016377 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 0.5157224450118135, | |
| "learning_rate": 1.966350671885566e-05, | |
| "loss": 0.4896842956542969, | |
| "memory(GiB)": 88.64, | |
| "step": 130, | |
| "token_acc": 0.8307861188942663, | |
| "train_speed(iter/s)": 0.016357 | |
| }, | |
| { | |
| "epoch": 0.6699751861042184, | |
| "grad_norm": 0.6061582190169718, | |
| "learning_rate": 1.961984514722914e-05, | |
| "loss": 0.47826318740844725, | |
| "memory(GiB)": 88.64, | |
| "step": 135, | |
| "token_acc": 0.851224802030969, | |
| "train_speed(iter/s)": 0.016369 | |
| }, | |
| { | |
| "epoch": 0.6947890818858561, | |
| "grad_norm": 0.5516185961170281, | |
| "learning_rate": 1.957357561351287e-05, | |
| "loss": 0.471895694732666, | |
| "memory(GiB)": 88.64, | |
| "step": 140, | |
| "token_acc": 0.8424185959845344, | |
| "train_speed(iter/s)": 0.016408 | |
| }, | |
| { | |
| "epoch": 0.7196029776674938, | |
| "grad_norm": 0.4947960995048164, | |
| "learning_rate": 1.9524710661483594e-05, | |
| "loss": 0.47608461380004885, | |
| "memory(GiB)": 88.64, | |
| "step": 145, | |
| "token_acc": 0.8376492687508386, | |
| "train_speed(iter/s)": 0.016435 | |
| }, | |
| { | |
| "epoch": 0.7444168734491315, | |
| "grad_norm": 0.5952017722562365, | |
| "learning_rate": 1.9473263538541916e-05, | |
| "loss": 0.4909799575805664, | |
| "memory(GiB)": 88.64, | |
| "step": 150, | |
| "token_acc": 0.8504602706501667, | |
| "train_speed(iter/s)": 0.016418 | |
| }, | |
| { | |
| "epoch": 0.7444168734491315, | |
| "eval_loss": 0.4375256896018982, | |
| "eval_runtime": 37.0115, | |
| "eval_samples_per_second": 6.998, | |
| "eval_steps_per_second": 0.459, | |
| "eval_token_acc": 0.8389802516805843, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.7692307692307693, | |
| "grad_norm": 0.5564783950854367, | |
| "learning_rate": 1.94192481921209e-05, | |
| "loss": 0.4702787399291992, | |
| "memory(GiB)": 88.64, | |
| "step": 155, | |
| "token_acc": 0.8495802700870744, | |
| "train_speed(iter/s)": 0.01631 | |
| }, | |
| { | |
| "epoch": 0.794044665012407, | |
| "grad_norm": 0.51266416113498, | |
| "learning_rate": 1.936267926590488e-05, | |
| "loss": 0.4766042709350586, | |
| "memory(GiB)": 88.64, | |
| "step": 160, | |
| "token_acc": 0.85023222889173, | |
| "train_speed(iter/s)": 0.016298 | |
| }, | |
| { | |
| "epoch": 0.8188585607940446, | |
| "grad_norm": 0.4756301510519784, | |
| "learning_rate": 1.9303572095859545e-05, | |
| "loss": 0.4743985652923584, | |
| "memory(GiB)": 88.64, | |
| "step": 165, | |
| "token_acc": 0.8259336826336148, | |
| "train_speed(iter/s)": 0.016268 | |
| }, | |
| { | |
| "epoch": 0.8436724565756824, | |
| "grad_norm": 0.530391667374793, | |
| "learning_rate": 1.92419427060743e-05, | |
| "loss": 0.47100305557250977, | |
| "memory(GiB)": 88.64, | |
| "step": 170, | |
| "token_acc": 0.8465938389259181, | |
| "train_speed(iter/s)": 0.016284 | |
| }, | |
| { | |
| "epoch": 0.8684863523573201, | |
| "grad_norm": 0.5711613629006742, | |
| "learning_rate": 1.91778078044181e-05, | |
| "loss": 0.46791276931762693, | |
| "memory(GiB)": 88.64, | |
| "step": 175, | |
| "token_acc": 0.848610394510885, | |
| "train_speed(iter/s)": 0.016279 | |
| }, | |
| { | |
| "epoch": 0.8933002481389578, | |
| "grad_norm": 0.5215383014934748, | |
| "learning_rate": 1.9111184778009934e-05, | |
| "loss": 0.46720128059387206, | |
| "memory(GiB)": 88.64, | |
| "step": 180, | |
| "token_acc": 0.8429917728410017, | |
| "train_speed(iter/s)": 0.016337 | |
| }, | |
| { | |
| "epoch": 0.9181141439205955, | |
| "grad_norm": 0.5571619722684491, | |
| "learning_rate": 1.9042091688505104e-05, | |
| "loss": 0.46036605834960936, | |
| "memory(GiB)": 88.64, | |
| "step": 185, | |
| "token_acc": 0.8357287838060233, | |
| "train_speed(iter/s)": 0.016345 | |
| }, | |
| { | |
| "epoch": 0.9429280397022333, | |
| "grad_norm": 0.59307440115033, | |
| "learning_rate": 1.89705472671987e-05, | |
| "loss": 0.4740591049194336, | |
| "memory(GiB)": 88.64, | |
| "step": 190, | |
| "token_acc": 0.8355302886053912, | |
| "train_speed(iter/s)": 0.016341 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "grad_norm": 0.5582963618116457, | |
| "learning_rate": 1.8896570909947477e-05, | |
| "loss": 0.46262359619140625, | |
| "memory(GiB)": 88.64, | |
| "step": 195, | |
| "token_acc": 0.8320320418277263, | |
| "train_speed(iter/s)": 0.016366 | |
| }, | |
| { | |
| "epoch": 0.9925558312655087, | |
| "grad_norm": 0.5657776208171481, | |
| "learning_rate": 1.88201826719116e-05, | |
| "loss": 0.462737512588501, | |
| "memory(GiB)": 88.64, | |
| "step": 200, | |
| "token_acc": 0.8392010935601458, | |
| "train_speed(iter/s)": 0.01638 | |
| }, | |
| { | |
| "epoch": 0.9925558312655087, | |
| "eval_loss": 0.4247380495071411, | |
| "eval_runtime": 36.803, | |
| "eval_samples_per_second": 7.037, | |
| "eval_steps_per_second": 0.462, | |
| "eval_token_acc": 0.8428554813798237, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0148883374689825, | |
| "grad_norm": 0.5518905639056666, | |
| "learning_rate": 1.874140326211766e-05, | |
| "loss": 0.41510915756225586, | |
| "memory(GiB)": 88.64, | |
| "step": 205, | |
| "token_acc": 0.8667589172375147, | |
| "train_speed(iter/s)": 0.016368 | |
| }, | |
| { | |
| "epoch": 1.0397022332506203, | |
| "grad_norm": 0.586511183543945, | |
| "learning_rate": 1.866025403784439e-05, | |
| "loss": 0.4149285316467285, | |
| "memory(GiB)": 88.64, | |
| "step": 210, | |
| "token_acc": 0.8553997710922171, | |
| "train_speed(iter/s)": 0.016413 | |
| }, | |
| { | |
| "epoch": 1.064516129032258, | |
| "grad_norm": 0.7352442626023928, | |
| "learning_rate": 1.8576756998832667e-05, | |
| "loss": 0.4105654716491699, | |
| "memory(GiB)": 88.64, | |
| "step": 215, | |
| "token_acc": 0.855252843136141, | |
| "train_speed(iter/s)": 0.016406 | |
| }, | |
| { | |
| "epoch": 1.0893300248138957, | |
| "grad_norm": 0.6822260227811386, | |
| "learning_rate": 1.849093478132133e-05, | |
| "loss": 0.4040283203125, | |
| "memory(GiB)": 88.64, | |
| "step": 220, | |
| "token_acc": 0.8477043011659889, | |
| "train_speed(iter/s)": 0.016437 | |
| }, | |
| { | |
| "epoch": 1.1141439205955335, | |
| "grad_norm": 0.5903650939016252, | |
| "learning_rate": 1.8402810651910444e-05, | |
| "loss": 0.3956918716430664, | |
| "memory(GiB)": 88.64, | |
| "step": 225, | |
| "token_acc": 0.8605695208768427, | |
| "train_speed(iter/s)": 0.016458 | |
| }, | |
| { | |
| "epoch": 1.1389578163771712, | |
| "grad_norm": 0.4978998007240176, | |
| "learning_rate": 1.8312408501253674e-05, | |
| "loss": 0.4057618141174316, | |
| "memory(GiB)": 88.64, | |
| "step": 230, | |
| "token_acc": 0.8609171684050843, | |
| "train_speed(iter/s)": 0.016455 | |
| }, | |
| { | |
| "epoch": 1.163771712158809, | |
| "grad_norm": 0.4872739462988421, | |
| "learning_rate": 1.8219752837581466e-05, | |
| "loss": 0.40581340789794923, | |
| "memory(GiB)": 88.64, | |
| "step": 235, | |
| "token_acc": 0.8521656572270421, | |
| "train_speed(iter/s)": 0.016453 | |
| }, | |
| { | |
| "epoch": 1.1885856079404467, | |
| "grad_norm": 0.5234851506683952, | |
| "learning_rate": 1.8124868780056814e-05, | |
| "loss": 0.3941540479660034, | |
| "memory(GiB)": 88.64, | |
| "step": 240, | |
| "token_acc": 0.8785278129421401, | |
| "train_speed(iter/s)": 0.016419 | |
| }, | |
| { | |
| "epoch": 1.2133995037220844, | |
| "grad_norm": 0.4141490126306291, | |
| "learning_rate": 1.8027782051965408e-05, | |
| "loss": 0.3891263008117676, | |
| "memory(GiB)": 88.64, | |
| "step": 245, | |
| "token_acc": 0.8737247809520465, | |
| "train_speed(iter/s)": 0.016435 | |
| }, | |
| { | |
| "epoch": 1.2382133995037221, | |
| "grad_norm": 0.5328099303792803, | |
| "learning_rate": 1.7928518973741967e-05, | |
| "loss": 0.4076822280883789, | |
| "memory(GiB)": 88.64, | |
| "step": 250, | |
| "token_acc": 0.8484884195259031, | |
| "train_speed(iter/s)": 0.016452 | |
| }, | |
| { | |
| "epoch": 1.2382133995037221, | |
| "eval_loss": 0.42026251554489136, | |
| "eval_runtime": 36.9837, | |
| "eval_samples_per_second": 7.003, | |
| "eval_steps_per_second": 0.46, | |
| "eval_token_acc": 0.8458235444627945, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.2630272952853598, | |
| "grad_norm": 0.6838326859978137, | |
| "learning_rate": 1.782710645583473e-05, | |
| "loss": 0.4152885913848877, | |
| "memory(GiB)": 88.64, | |
| "step": 255, | |
| "token_acc": 0.8661420845706872, | |
| "train_speed(iter/s)": 0.016363 | |
| }, | |
| { | |
| "epoch": 1.2878411910669976, | |
| "grad_norm": 0.5427598093088785, | |
| "learning_rate": 1.7723571991409986e-05, | |
| "loss": 0.40700349807739256, | |
| "memory(GiB)": 88.64, | |
| "step": 260, | |
| "token_acc": 0.8587367250591306, | |
| "train_speed(iter/s)": 0.016379 | |
| }, | |
| { | |
| "epoch": 1.3126550868486353, | |
| "grad_norm": 0.5151142246135575, | |
| "learning_rate": 1.761794364889855e-05, | |
| "loss": 0.40430006980895994, | |
| "memory(GiB)": 88.64, | |
| "step": 265, | |
| "token_acc": 0.8575991930060525, | |
| "train_speed(iter/s)": 0.016383 | |
| }, | |
| { | |
| "epoch": 1.337468982630273, | |
| "grad_norm": 0.5015356534355152, | |
| "learning_rate": 1.751025006438643e-05, | |
| "loss": 0.40410513877868653, | |
| "memory(GiB)": 88.64, | |
| "step": 270, | |
| "token_acc": 0.8492741510067344, | |
| "train_speed(iter/s)": 0.016388 | |
| }, | |
| { | |
| "epoch": 1.3622828784119108, | |
| "grad_norm": 0.5321569853483857, | |
| "learning_rate": 1.7400520433851457e-05, | |
| "loss": 0.40181665420532225, | |
| "memory(GiB)": 88.64, | |
| "step": 275, | |
| "token_acc": 0.863210783537583, | |
| "train_speed(iter/s)": 0.016384 | |
| }, | |
| { | |
| "epoch": 1.3870967741935485, | |
| "grad_norm": 0.5040061479558677, | |
| "learning_rate": 1.728878450524822e-05, | |
| "loss": 0.39484846591949463, | |
| "memory(GiB)": 88.64, | |
| "step": 280, | |
| "token_acc": 0.8496746362506398, | |
| "train_speed(iter/s)": 0.016389 | |
| }, | |
| { | |
| "epoch": 1.4119106699751862, | |
| "grad_norm": 0.49441565611811733, | |
| "learning_rate": 1.717507257044331e-05, | |
| "loss": 0.38428053855895994, | |
| "memory(GiB)": 89.94, | |
| "step": 285, | |
| "token_acc": 0.8691993353349355, | |
| "train_speed(iter/s)": 0.01639 | |
| }, | |
| { | |
| "epoch": 1.436724565756824, | |
| "grad_norm": 0.4427481805329584, | |
| "learning_rate": 1.7059415457003144e-05, | |
| "loss": 0.38771657943725585, | |
| "memory(GiB)": 89.94, | |
| "step": 290, | |
| "token_acc": 0.8611781405251951, | |
| "train_speed(iter/s)": 0.016398 | |
| }, | |
| { | |
| "epoch": 1.4615384615384617, | |
| "grad_norm": 0.4946161550465835, | |
| "learning_rate": 1.694184451983651e-05, | |
| "loss": 0.39400653839111327, | |
| "memory(GiB)": 89.94, | |
| "step": 295, | |
| "token_acc": 0.8555248745014329, | |
| "train_speed(iter/s)": 0.016388 | |
| }, | |
| { | |
| "epoch": 1.4863523573200992, | |
| "grad_norm": 0.4970277193994229, | |
| "learning_rate": 1.682239163269422e-05, | |
| "loss": 0.3775279998779297, | |
| "memory(GiB)": 89.94, | |
| "step": 300, | |
| "token_acc": 0.870510592163004, | |
| "train_speed(iter/s)": 0.016417 | |
| }, | |
| { | |
| "epoch": 1.4863523573200992, | |
| "eval_loss": 0.4109738767147064, | |
| "eval_runtime": 37.1778, | |
| "eval_samples_per_second": 6.967, | |
| "eval_steps_per_second": 0.457, | |
| "eval_token_acc": 0.8480263310925542, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.5111662531017371, | |
| "grad_norm": 0.5261002463932816, | |
| "learning_rate": 1.6701089179528032e-05, | |
| "loss": 0.3945833683013916, | |
| "memory(GiB)": 89.94, | |
| "step": 305, | |
| "token_acc": 0.8619344282927444, | |
| "train_speed(iter/s)": 0.016379 | |
| }, | |
| { | |
| "epoch": 1.5359801488833746, | |
| "grad_norm": 0.4575728138284327, | |
| "learning_rate": 1.6577970045711293e-05, | |
| "loss": 0.392360258102417, | |
| "memory(GiB)": 89.94, | |
| "step": 310, | |
| "token_acc": 0.8602967068297687, | |
| "train_speed(iter/s)": 0.016403 | |
| }, | |
| { | |
| "epoch": 1.5607940446650124, | |
| "grad_norm": 0.40177603172705895, | |
| "learning_rate": 1.6453067609123656e-05, | |
| "loss": 0.3860903739929199, | |
| "memory(GiB)": 89.94, | |
| "step": 315, | |
| "token_acc": 0.872881901939308, | |
| "train_speed(iter/s)": 0.016402 | |
| }, | |
| { | |
| "epoch": 1.58560794044665, | |
| "grad_norm": 0.4454839273958068, | |
| "learning_rate": 1.6326415731102226e-05, | |
| "loss": 0.4006852149963379, | |
| "memory(GiB)": 89.94, | |
| "step": 320, | |
| "token_acc": 0.8653790030740496, | |
| "train_speed(iter/s)": 0.01639 | |
| }, | |
| { | |
| "epoch": 1.6104218362282878, | |
| "grad_norm": 0.44655786918995494, | |
| "learning_rate": 1.619804874726171e-05, | |
| "loss": 0.39271857738494875, | |
| "memory(GiB)": 89.94, | |
| "step": 325, | |
| "token_acc": 0.8654047918520109, | |
| "train_speed(iter/s)": 0.0164 | |
| }, | |
| { | |
| "epoch": 1.6352357320099256, | |
| "grad_norm": 0.4797627971829061, | |
| "learning_rate": 1.6068001458185934e-05, | |
| "loss": 0.37825469970703124, | |
| "memory(GiB)": 89.94, | |
| "step": 330, | |
| "token_acc": 0.8561078435708074, | |
| "train_speed(iter/s)": 0.016385 | |
| }, | |
| { | |
| "epoch": 1.6600496277915633, | |
| "grad_norm": 0.4861294532147035, | |
| "learning_rate": 1.5936309119993333e-05, | |
| "loss": 0.3874125242233276, | |
| "memory(GiB)": 89.94, | |
| "step": 335, | |
| "token_acc": 0.8552511758857783, | |
| "train_speed(iter/s)": 0.016383 | |
| }, | |
| { | |
| "epoch": 1.684863523573201, | |
| "grad_norm": 0.41946071511052324, | |
| "learning_rate": 1.5803007434778915e-05, | |
| "loss": 0.38948085308074953, | |
| "memory(GiB)": 89.94, | |
| "step": 340, | |
| "token_acc": 0.8796996530315503, | |
| "train_speed(iter/s)": 0.016394 | |
| }, | |
| { | |
| "epoch": 1.7096774193548387, | |
| "grad_norm": 0.48162634486442507, | |
| "learning_rate": 1.566813254093538e-05, | |
| "loss": 0.38796045780181887, | |
| "memory(GiB)": 89.94, | |
| "step": 345, | |
| "token_acc": 0.8590043182007245, | |
| "train_speed(iter/s)": 0.016394 | |
| }, | |
| { | |
| "epoch": 1.7344913151364765, | |
| "grad_norm": 0.4580728622127146, | |
| "learning_rate": 1.553172100335588e-05, | |
| "loss": 0.38542957305908204, | |
| "memory(GiB)": 89.94, | |
| "step": 350, | |
| "token_acc": 0.8623940061939602, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 1.7344913151364765, | |
| "eval_loss": 0.40481674671173096, | |
| "eval_runtime": 37.3868, | |
| "eval_samples_per_second": 6.928, | |
| "eval_steps_per_second": 0.455, | |
| "eval_token_acc": 0.8498720662464236, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.759305210918114, | |
| "grad_norm": 0.4101110019379735, | |
| "learning_rate": 1.5393809803521213e-05, | |
| "loss": 0.3963811159133911, | |
| "memory(GiB)": 89.94, | |
| "step": 355, | |
| "token_acc": 0.8648625816625366, | |
| "train_speed(iter/s)": 0.016344 | |
| }, | |
| { | |
| "epoch": 1.7841191066997517, | |
| "grad_norm": 0.4729658162675547, | |
| "learning_rate": 1.5254436329474062e-05, | |
| "loss": 0.38416252136230467, | |
| "memory(GiB)": 89.94, | |
| "step": 360, | |
| "token_acc": 0.8674258253238613, | |
| "train_speed(iter/s)": 0.01635 | |
| }, | |
| { | |
| "epoch": 1.8089330024813894, | |
| "grad_norm": 0.48983508519169017, | |
| "learning_rate": 1.5113638365682996e-05, | |
| "loss": 0.3992438316345215, | |
| "memory(GiB)": 89.94, | |
| "step": 365, | |
| "token_acc": 0.8770906339598599, | |
| "train_speed(iter/s)": 0.016361 | |
| }, | |
| { | |
| "epoch": 1.8337468982630272, | |
| "grad_norm": 0.5401851259187025, | |
| "learning_rate": 1.4971454082799029e-05, | |
| "loss": 0.38352556228637696, | |
| "memory(GiB)": 89.94, | |
| "step": 370, | |
| "token_acc": 0.860992567369929, | |
| "train_speed(iter/s)": 0.016358 | |
| }, | |
| { | |
| "epoch": 1.858560794044665, | |
| "grad_norm": 0.4230668354018143, | |
| "learning_rate": 1.482792202730745e-05, | |
| "loss": 0.3897742748260498, | |
| "memory(GiB)": 89.94, | |
| "step": 375, | |
| "token_acc": 0.871132879925645, | |
| "train_speed(iter/s)": 0.016351 | |
| }, | |
| { | |
| "epoch": 1.8833746898263026, | |
| "grad_norm": 0.4019049115904182, | |
| "learning_rate": 1.4683081111077807e-05, | |
| "loss": 0.39033985137939453, | |
| "memory(GiB)": 89.94, | |
| "step": 380, | |
| "token_acc": 0.8773624177836983, | |
| "train_speed(iter/s)": 0.016354 | |
| }, | |
| { | |
| "epoch": 1.9081885856079404, | |
| "grad_norm": 0.5428075344626317, | |
| "learning_rate": 1.4536970600814789e-05, | |
| "loss": 0.3880493640899658, | |
| "memory(GiB)": 89.94, | |
| "step": 385, | |
| "token_acc": 0.859438589168319, | |
| "train_speed(iter/s)": 0.016358 | |
| }, | |
| { | |
| "epoch": 1.933002481389578, | |
| "grad_norm": 0.4827683055924454, | |
| "learning_rate": 1.4389630107412942e-05, | |
| "loss": 0.38936262130737304, | |
| "memory(GiB)": 89.94, | |
| "step": 390, | |
| "token_acc": 0.8606305598661481, | |
| "train_speed(iter/s)": 0.016355 | |
| }, | |
| { | |
| "epoch": 1.9578163771712158, | |
| "grad_norm": 0.4573581809973034, | |
| "learning_rate": 1.424109957521806e-05, | |
| "loss": 0.3780574560165405, | |
| "memory(GiB)": 89.94, | |
| "step": 395, | |
| "token_acc": 0.8642330574236937, | |
| "train_speed(iter/s)": 0.01638 | |
| }, | |
| { | |
| "epoch": 1.9826302729528535, | |
| "grad_norm": 0.4605796202723116, | |
| "learning_rate": 1.4091419271198197e-05, | |
| "loss": 0.3744480848312378, | |
| "memory(GiB)": 89.94, | |
| "step": 400, | |
| "token_acc": 0.8632874334053888, | |
| "train_speed(iter/s)": 0.016393 | |
| }, | |
| { | |
| "epoch": 1.9826302729528535, | |
| "eval_loss": 0.40109848976135254, | |
| "eval_runtime": 36.9238, | |
| "eval_samples_per_second": 7.014, | |
| "eval_steps_per_second": 0.46, | |
| "eval_token_acc": 0.8518154962666604, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.0049627791563274, | |
| "grad_norm": 0.561513288720162, | |
| "learning_rate": 1.394062977402717e-05, | |
| "loss": 0.36142630577087403, | |
| "memory(GiB)": 89.94, | |
| "step": 405, | |
| "token_acc": 0.8730374487048237, | |
| "train_speed(iter/s)": 0.016389 | |
| }, | |
| { | |
| "epoch": 2.029776674937965, | |
| "grad_norm": 0.5236938495980928, | |
| "learning_rate": 1.378877196308361e-05, | |
| "loss": 0.30889334678649905, | |
| "memory(GiB)": 89.94, | |
| "step": 410, | |
| "token_acc": 0.8946576830463284, | |
| "train_speed(iter/s)": 0.016398 | |
| }, | |
| { | |
| "epoch": 2.054590570719603, | |
| "grad_norm": 0.45558075636252165, | |
| "learning_rate": 1.3635887007368467e-05, | |
| "loss": 0.3037071228027344, | |
| "memory(GiB)": 89.94, | |
| "step": 415, | |
| "token_acc": 0.8943636184386575, | |
| "train_speed(iter/s)": 0.016406 | |
| }, | |
| { | |
| "epoch": 2.0794044665012406, | |
| "grad_norm": 0.5039448394161661, | |
| "learning_rate": 1.348201635434399e-05, | |
| "loss": 0.3107598781585693, | |
| "memory(GiB)": 89.94, | |
| "step": 420, | |
| "token_acc": 0.8958745781247054, | |
| "train_speed(iter/s)": 0.016399 | |
| }, | |
| { | |
| "epoch": 2.1042183622828783, | |
| "grad_norm": 0.4915435678643858, | |
| "learning_rate": 1.3327201718697232e-05, | |
| "loss": 0.3129460334777832, | |
| "memory(GiB)": 89.94, | |
| "step": 425, | |
| "token_acc": 0.8708129581052927, | |
| "train_speed(iter/s)": 0.016409 | |
| }, | |
| { | |
| "epoch": 2.129032258064516, | |
| "grad_norm": 0.4526399583483296, | |
| "learning_rate": 1.31714850710311e-05, | |
| "loss": 0.3166365146636963, | |
| "memory(GiB)": 89.94, | |
| "step": 430, | |
| "token_acc": 0.89759354279269, | |
| "train_speed(iter/s)": 0.016395 | |
| }, | |
| { | |
| "epoch": 2.1538461538461537, | |
| "grad_norm": 0.47166304759252065, | |
| "learning_rate": 1.3014908626486032e-05, | |
| "loss": 0.30022444725036623, | |
| "memory(GiB)": 89.94, | |
| "step": 435, | |
| "token_acc": 0.8798537849342958, | |
| "train_speed(iter/s)": 0.016395 | |
| }, | |
| { | |
| "epoch": 2.1786600496277915, | |
| "grad_norm": 0.4123439151346761, | |
| "learning_rate": 1.2857514833295369e-05, | |
| "loss": 0.3049207925796509, | |
| "memory(GiB)": 89.94, | |
| "step": 440, | |
| "token_acc": 0.8795848668205352, | |
| "train_speed(iter/s)": 0.016409 | |
| }, | |
| { | |
| "epoch": 2.203473945409429, | |
| "grad_norm": 0.405269067838883, | |
| "learning_rate": 1.2699346361277538e-05, | |
| "loss": 0.3032404661178589, | |
| "memory(GiB)": 89.94, | |
| "step": 445, | |
| "token_acc": 0.893778727363035, | |
| "train_speed(iter/s)": 0.016409 | |
| }, | |
| { | |
| "epoch": 2.228287841191067, | |
| "grad_norm": 0.37939542721602953, | |
| "learning_rate": 1.2540446090268193e-05, | |
| "loss": 0.3014317512512207, | |
| "memory(GiB)": 89.94, | |
| "step": 450, | |
| "token_acc": 0.8831544347304245, | |
| "train_speed(iter/s)": 0.016419 | |
| }, | |
| { | |
| "epoch": 2.228287841191067, | |
| "eval_loss": 0.4072587788105011, | |
| "eval_runtime": 37.3946, | |
| "eval_samples_per_second": 6.926, | |
| "eval_steps_per_second": 0.455, | |
| "eval_token_acc": 0.8520934614221581, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.2531017369727047, | |
| "grad_norm": 0.3861335333797535, | |
| "learning_rate": 1.2380857098495355e-05, | |
| "loss": 0.30447826385498045, | |
| "memory(GiB)": 89.94, | |
| "step": 455, | |
| "token_acc": 0.881784783123963, | |
| "train_speed(iter/s)": 0.016377 | |
| }, | |
| { | |
| "epoch": 2.2779156327543424, | |
| "grad_norm": 0.40103897362549834, | |
| "learning_rate": 1.2220622650900833e-05, | |
| "loss": 0.306304407119751, | |
| "memory(GiB)": 89.94, | |
| "step": 460, | |
| "token_acc": 0.9054849560829752, | |
| "train_speed(iter/s)": 0.016371 | |
| }, | |
| { | |
| "epoch": 2.30272952853598, | |
| "grad_norm": 0.41467304098935237, | |
| "learning_rate": 1.2059786187410984e-05, | |
| "loss": 0.31237101554870605, | |
| "memory(GiB)": 89.94, | |
| "step": 465, | |
| "token_acc": 0.8715191597554331, | |
| "train_speed(iter/s)": 0.016373 | |
| }, | |
| { | |
| "epoch": 2.327543424317618, | |
| "grad_norm": 0.40212608233202607, | |
| "learning_rate": 1.1898391311160067e-05, | |
| "loss": 0.30408382415771484, | |
| "memory(GiB)": 89.94, | |
| "step": 470, | |
| "token_acc": 0.8885567438564482, | |
| "train_speed(iter/s)": 0.01638 | |
| }, | |
| { | |
| "epoch": 2.3523573200992556, | |
| "grad_norm": 0.4506239257919707, | |
| "learning_rate": 1.1736481776669307e-05, | |
| "loss": 0.2938546180725098, | |
| "memory(GiB)": 89.94, | |
| "step": 475, | |
| "token_acc": 0.9089126511337576, | |
| "train_speed(iter/s)": 0.016382 | |
| }, | |
| { | |
| "epoch": 2.3771712158808933, | |
| "grad_norm": 0.42538484493612555, | |
| "learning_rate": 1.1574101477984966e-05, | |
| "loss": 0.3105756759643555, | |
| "memory(GiB)": 89.94, | |
| "step": 480, | |
| "token_acc": 0.8806113552657332, | |
| "train_speed(iter/s)": 0.016388 | |
| }, | |
| { | |
| "epoch": 2.401985111662531, | |
| "grad_norm": 0.4226912481964332, | |
| "learning_rate": 1.1411294436778562e-05, | |
| "loss": 0.3021634578704834, | |
| "memory(GiB)": 89.94, | |
| "step": 485, | |
| "token_acc": 0.8833968731418979, | |
| "train_speed(iter/s)": 0.01639 | |
| }, | |
| { | |
| "epoch": 2.4267990074441688, | |
| "grad_norm": 0.449080842036963, | |
| "learning_rate": 1.124810479041248e-05, | |
| "loss": 0.3032838344573975, | |
| "memory(GiB)": 89.94, | |
| "step": 490, | |
| "token_acc": 0.877714128906116, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 2.4516129032258065, | |
| "grad_norm": 0.4577775380023549, | |
| "learning_rate": 1.1084576779974257e-05, | |
| "loss": 0.3055537223815918, | |
| "memory(GiB)": 89.94, | |
| "step": 495, | |
| "token_acc": 0.8952398880779006, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 2.4764267990074442, | |
| "grad_norm": 0.4327610654637386, | |
| "learning_rate": 1.092075473828269e-05, | |
| "loss": 0.3270174741744995, | |
| "memory(GiB)": 89.94, | |
| "step": 500, | |
| "token_acc": 0.8941933336227983, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 2.4764267990074442, | |
| "eval_loss": 0.3992994427680969, | |
| "eval_runtime": 37.0915, | |
| "eval_samples_per_second": 6.983, | |
| "eval_steps_per_second": 0.458, | |
| "eval_token_acc": 0.8543858016794212, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.501240694789082, | |
| "grad_norm": 0.43447637201251166, | |
| "learning_rate": 1.0756683077869133e-05, | |
| "loss": 0.30214927196502683, | |
| "memory(GiB)": 89.94, | |
| "step": 505, | |
| "token_acc": 0.8810097805974094, | |
| "train_speed(iter/s)": 0.016353 | |
| }, | |
| { | |
| "epoch": 2.5260545905707197, | |
| "grad_norm": 0.4282670862928906, | |
| "learning_rate": 1.0592406278937143e-05, | |
| "loss": 0.29954004287719727, | |
| "memory(GiB)": 89.94, | |
| "step": 510, | |
| "token_acc": 0.8962592483454521, | |
| "train_speed(iter/s)": 0.016353 | |
| }, | |
| { | |
| "epoch": 2.5508684863523574, | |
| "grad_norm": 0.4520012444271277, | |
| "learning_rate": 1.0427968877303809e-05, | |
| "loss": 0.29749062061309817, | |
| "memory(GiB)": 89.94, | |
| "step": 515, | |
| "token_acc": 0.88993587445068, | |
| "train_speed(iter/s)": 0.016352 | |
| }, | |
| { | |
| "epoch": 2.575682382133995, | |
| "grad_norm": 0.4094260181661943, | |
| "learning_rate": 1.0263415452325967e-05, | |
| "loss": 0.30545458793640134, | |
| "memory(GiB)": 89.94, | |
| "step": 520, | |
| "token_acc": 0.8991213678952933, | |
| "train_speed(iter/s)": 0.01636 | |
| }, | |
| { | |
| "epoch": 2.600496277915633, | |
| "grad_norm": 0.4362439165953098, | |
| "learning_rate": 1.0098790614814658e-05, | |
| "loss": 0.29534034729003905, | |
| "memory(GiB)": 89.94, | |
| "step": 525, | |
| "token_acc": 0.8916807528895793, | |
| "train_speed(iter/s)": 0.016369 | |
| }, | |
| { | |
| "epoch": 2.6253101736972706, | |
| "grad_norm": 0.4946976315555759, | |
| "learning_rate": 9.934138994941023e-06, | |
| "loss": 0.3051294803619385, | |
| "memory(GiB)": 89.94, | |
| "step": 530, | |
| "token_acc": 0.8991352720121762, | |
| "train_speed(iter/s)": 0.016377 | |
| }, | |
| { | |
| "epoch": 2.6501240694789083, | |
| "grad_norm": 0.4359207003478648, | |
| "learning_rate": 9.769505230136962e-06, | |
| "loss": 0.2859165191650391, | |
| "memory(GiB)": 89.94, | |
| "step": 535, | |
| "token_acc": 0.8830987088713036, | |
| "train_speed(iter/s)": 0.016394 | |
| }, | |
| { | |
| "epoch": 2.674937965260546, | |
| "grad_norm": 0.45484329583426325, | |
| "learning_rate": 9.604933952993822e-06, | |
| "loss": 0.2968073606491089, | |
| "memory(GiB)": 89.94, | |
| "step": 540, | |
| "token_acc": 0.893879447175874, | |
| "train_speed(iter/s)": 0.016401 | |
| }, | |
| { | |
| "epoch": 2.699751861042184, | |
| "grad_norm": 0.39016064380810367, | |
| "learning_rate": 9.440469779162407e-06, | |
| "loss": 0.30095710754394533, | |
| "memory(GiB)": 89.94, | |
| "step": 545, | |
| "token_acc": 0.8819626291391867, | |
| "train_speed(iter/s)": 0.016402 | |
| }, | |
| { | |
| "epoch": 2.7245657568238215, | |
| "grad_norm": 0.3894440736965737, | |
| "learning_rate": 9.276157295257566e-06, | |
| "loss": 0.297087574005127, | |
| "memory(GiB)": 89.94, | |
| "step": 550, | |
| "token_acc": 0.8939326285376584, | |
| "train_speed(iter/s)": 0.016414 | |
| }, | |
| { | |
| "epoch": 2.7245657568238215, | |
| "eval_loss": 0.39560389518737793, | |
| "eval_runtime": 36.8591, | |
| "eval_samples_per_second": 7.027, | |
| "eval_steps_per_second": 0.461, | |
| "eval_token_acc": 0.856389709474076, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.749379652605459, | |
| "grad_norm": 0.398345793637711, | |
| "learning_rate": 9.112041046770653e-06, | |
| "loss": 0.3055715084075928, | |
| "memory(GiB)": 89.94, | |
| "step": 555, | |
| "token_acc": 0.8971510298173304, | |
| "train_speed(iter/s)": 0.016389 | |
| }, | |
| { | |
| "epoch": 2.774193548387097, | |
| "grad_norm": 0.4307995028287168, | |
| "learning_rate": 8.948165525993162e-06, | |
| "loss": 0.30808732509613035, | |
| "memory(GiB)": 89.94, | |
| "step": 560, | |
| "token_acc": 0.888341487335637, | |
| "train_speed(iter/s)": 0.016392 | |
| }, | |
| { | |
| "epoch": 2.7990074441687343, | |
| "grad_norm": 0.42720829430324325, | |
| "learning_rate": 8.784575159954748e-06, | |
| "loss": 0.29772372245788575, | |
| "memory(GiB)": 89.94, | |
| "step": 565, | |
| "token_acc": 0.8964727272727273, | |
| "train_speed(iter/s)": 0.016385 | |
| }, | |
| { | |
| "epoch": 2.8238213399503724, | |
| "grad_norm": 0.378943802971695, | |
| "learning_rate": 8.621314298378958e-06, | |
| "loss": 0.2994475126266479, | |
| "memory(GiB)": 89.94, | |
| "step": 570, | |
| "token_acc": 0.8991428363722879, | |
| "train_speed(iter/s)": 0.016389 | |
| }, | |
| { | |
| "epoch": 2.8486352357320097, | |
| "grad_norm": 0.39340559598269975, | |
| "learning_rate": 8.458427201659926e-06, | |
| "loss": 0.3069624662399292, | |
| "memory(GiB)": 89.94, | |
| "step": 575, | |
| "token_acc": 0.8998604899265521, | |
| "train_speed(iter/s)": 0.016397 | |
| }, | |
| { | |
| "epoch": 2.873449131513648, | |
| "grad_norm": 0.4314024894376894, | |
| "learning_rate": 8.295958028863285e-06, | |
| "loss": 0.30196504592895507, | |
| "memory(GiB)": 89.94, | |
| "step": 580, | |
| "token_acc": 0.9014831273211464, | |
| "train_speed(iter/s)": 0.016408 | |
| }, | |
| { | |
| "epoch": 2.898263027295285, | |
| "grad_norm": 0.4237139579995691, | |
| "learning_rate": 8.133950825754511e-06, | |
| "loss": 0.2988776683807373, | |
| "memory(GiB)": 89.94, | |
| "step": 585, | |
| "token_acc": 0.9028119489350919, | |
| "train_speed(iter/s)": 0.016424 | |
| }, | |
| { | |
| "epoch": 2.9230769230769234, | |
| "grad_norm": 0.4130611959829037, | |
| "learning_rate": 7.972449512858062e-06, | |
| "loss": 0.30088846683502196, | |
| "memory(GiB)": 89.94, | |
| "step": 590, | |
| "token_acc": 0.8994833915566345, | |
| "train_speed(iter/s)": 0.01642 | |
| }, | |
| { | |
| "epoch": 2.9478908188585606, | |
| "grad_norm": 0.4169476470359364, | |
| "learning_rate": 7.81149787355039e-06, | |
| "loss": 0.3129019021987915, | |
| "memory(GiB)": 89.94, | |
| "step": 595, | |
| "token_acc": 0.8943572216882053, | |
| "train_speed(iter/s)": 0.016416 | |
| }, | |
| { | |
| "epoch": 2.9727047146401984, | |
| "grad_norm": 0.4840772402114463, | |
| "learning_rate": 7.651139542190164e-06, | |
| "loss": 0.27456250190734866, | |
| "memory(GiB)": 89.94, | |
| "step": 600, | |
| "token_acc": 0.896454707029423, | |
| "train_speed(iter/s)": 0.016428 | |
| }, | |
| { | |
| "epoch": 2.9727047146401984, | |
| "eval_loss": 0.3933154344558716, | |
| "eval_runtime": 37.5718, | |
| "eval_samples_per_second": 6.893, | |
| "eval_steps_per_second": 0.452, | |
| "eval_token_acc": 0.8575376241538927, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.997518610421836, | |
| "grad_norm": 0.3681469729562029, | |
| "learning_rate": 7.491417992288927e-06, | |
| "loss": 0.29853529930114747, | |
| "memory(GiB)": 89.94, | |
| "step": 605, | |
| "token_acc": 0.8887139445589062, | |
| "train_speed(iter/s)": 0.016389 | |
| }, | |
| { | |
| "epoch": 3.0198511166253104, | |
| "grad_norm": 0.46754553369454177, | |
| "learning_rate": 7.332376524725298e-06, | |
| "loss": 0.25056142807006837, | |
| "memory(GiB)": 89.94, | |
| "step": 610, | |
| "token_acc": 0.9005600640073151, | |
| "train_speed(iter/s)": 0.016407 | |
| }, | |
| { | |
| "epoch": 3.044665012406948, | |
| "grad_norm": 0.3905145343550401, | |
| "learning_rate": 7.174058256006012e-06, | |
| "loss": 0.24309511184692384, | |
| "memory(GiB)": 89.94, | |
| "step": 615, | |
| "token_acc": 0.9053923110803498, | |
| "train_speed(iter/s)": 0.016402 | |
| }, | |
| { | |
| "epoch": 3.069478908188586, | |
| "grad_norm": 0.41901102410789404, | |
| "learning_rate": 7.016506106576942e-06, | |
| "loss": 0.23708744049072267, | |
| "memory(GiB)": 89.94, | |
| "step": 620, | |
| "token_acc": 0.921517517259178, | |
| "train_speed(iter/s)": 0.016398 | |
| }, | |
| { | |
| "epoch": 3.094292803970223, | |
| "grad_norm": 0.36157357718683, | |
| "learning_rate": 6.859762789187259e-06, | |
| "loss": 0.23210906982421875, | |
| "memory(GiB)": 89.94, | |
| "step": 625, | |
| "token_acc": 0.9225797451873896, | |
| "train_speed(iter/s)": 0.016406 | |
| }, | |
| { | |
| "epoch": 3.119106699751861, | |
| "grad_norm": 0.3805761890187129, | |
| "learning_rate": 6.703870797309922e-06, | |
| "loss": 0.2322997808456421, | |
| "memory(GiB)": 89.94, | |
| "step": 630, | |
| "token_acc": 0.9135864765989655, | |
| "train_speed(iter/s)": 0.016415 | |
| }, | |
| { | |
| "epoch": 3.1439205955334986, | |
| "grad_norm": 0.3788702415059945, | |
| "learning_rate": 6.548872393621578e-06, | |
| "loss": 0.22191863059997557, | |
| "memory(GiB)": 89.94, | |
| "step": 635, | |
| "token_acc": 0.9246411397786463, | |
| "train_speed(iter/s)": 0.016413 | |
| }, | |
| { | |
| "epoch": 3.1687344913151363, | |
| "grad_norm": 0.4063569428162902, | |
| "learning_rate": 6.3948095985450755e-06, | |
| "loss": 0.24599046707153321, | |
| "memory(GiB)": 89.94, | |
| "step": 640, | |
| "token_acc": 0.9072380405759489, | |
| "train_speed(iter/s)": 0.016415 | |
| }, | |
| { | |
| "epoch": 3.193548387096774, | |
| "grad_norm": 0.3557259507590387, | |
| "learning_rate": 6.241724178857621e-06, | |
| "loss": 0.23447060585021973, | |
| "memory(GiB)": 89.94, | |
| "step": 645, | |
| "token_acc": 0.9291399599260514, | |
| "train_speed(iter/s)": 0.016413 | |
| }, | |
| { | |
| "epoch": 3.2183622828784118, | |
| "grad_norm": 0.3295522367686504, | |
| "learning_rate": 6.089657636367698e-06, | |
| "loss": 0.23479061126708983, | |
| "memory(GiB)": 89.94, | |
| "step": 650, | |
| "token_acc": 0.9199351455699978, | |
| "train_speed(iter/s)": 0.016409 | |
| }, | |
| { | |
| "epoch": 3.2183622828784118, | |
| "eval_loss": 0.4014388620853424, | |
| "eval_runtime": 37.5517, | |
| "eval_samples_per_second": 6.897, | |
| "eval_steps_per_second": 0.453, | |
| "eval_token_acc": 0.858085413226024, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.2431761786600495, | |
| "grad_norm": 0.4240123332842028, | |
| "learning_rate": 5.938651196663865e-06, | |
| "loss": 0.22697579860687256, | |
| "memory(GiB)": 89.94, | |
| "step": 655, | |
| "token_acc": 0.8996346346571273, | |
| "train_speed(iter/s)": 0.016378 | |
| }, | |
| { | |
| "epoch": 3.267990074441687, | |
| "grad_norm": 0.35633031030131934, | |
| "learning_rate": 5.788745797938372e-06, | |
| "loss": 0.2304630994796753, | |
| "memory(GiB)": 89.94, | |
| "step": 660, | |
| "token_acc": 0.9186445328577308, | |
| "train_speed(iter/s)": 0.01638 | |
| }, | |
| { | |
| "epoch": 3.292803970223325, | |
| "grad_norm": 0.38300755718973123, | |
| "learning_rate": 5.6399820798887266e-06, | |
| "loss": 0.23805148601531984, | |
| "memory(GiB)": 89.94, | |
| "step": 665, | |
| "token_acc": 0.9132490030897615, | |
| "train_speed(iter/s)": 0.016388 | |
| }, | |
| { | |
| "epoch": 3.3176178660049627, | |
| "grad_norm": 0.3833393689853116, | |
| "learning_rate": 5.492400372700195e-06, | |
| "loss": 0.23390157222747804, | |
| "memory(GiB)": 89.94, | |
| "step": 670, | |
| "token_acc": 0.9085175452156439, | |
| "train_speed(iter/s)": 0.016384 | |
| }, | |
| { | |
| "epoch": 3.3424317617866004, | |
| "grad_norm": 0.3525024053865293, | |
| "learning_rate": 5.346040686112189e-06, | |
| "loss": 0.23395137786865233, | |
| "memory(GiB)": 89.94, | |
| "step": 675, | |
| "token_acc": 0.9463714867825449, | |
| "train_speed(iter/s)": 0.016389 | |
| }, | |
| { | |
| "epoch": 3.367245657568238, | |
| "grad_norm": 0.3487934368974318, | |
| "learning_rate": 5.200942698571527e-06, | |
| "loss": 0.22507448196411134, | |
| "memory(GiB)": 89.94, | |
| "step": 680, | |
| "token_acc": 0.9121472535129486, | |
| "train_speed(iter/s)": 0.016392 | |
| }, | |
| { | |
| "epoch": 3.392059553349876, | |
| "grad_norm": 0.35248625950301066, | |
| "learning_rate": 5.0571457464755226e-06, | |
| "loss": 0.23436686992645264, | |
| "memory(GiB)": 89.94, | |
| "step": 685, | |
| "token_acc": 0.9249912229851648, | |
| "train_speed(iter/s)": 0.016389 | |
| }, | |
| { | |
| "epoch": 3.4168734491315136, | |
| "grad_norm": 0.38081790230168544, | |
| "learning_rate": 4.914688813507798e-06, | |
| "loss": 0.2353280544281006, | |
| "memory(GiB)": 89.94, | |
| "step": 690, | |
| "token_acc": 0.9169562569412102, | |
| "train_speed(iter/s)": 0.016393 | |
| }, | |
| { | |
| "epoch": 3.4416873449131513, | |
| "grad_norm": 0.3810939058594302, | |
| "learning_rate": 4.773610520069706e-06, | |
| "loss": 0.23074874877929688, | |
| "memory(GiB)": 89.94, | |
| "step": 695, | |
| "token_acc": 0.896226352801347, | |
| "train_speed(iter/s)": 0.0164 | |
| }, | |
| { | |
| "epoch": 3.466501240694789, | |
| "grad_norm": 0.3874910212223461, | |
| "learning_rate": 4.633949112810271e-06, | |
| "loss": 0.22984590530395507, | |
| "memory(GiB)": 89.94, | |
| "step": 700, | |
| "token_acc": 0.9299721620785648, | |
| "train_speed(iter/s)": 0.016397 | |
| }, | |
| { | |
| "epoch": 3.466501240694789, | |
| "eval_loss": 0.3995462656021118, | |
| "eval_runtime": 37.598, | |
| "eval_samples_per_second": 6.889, | |
| "eval_steps_per_second": 0.452, | |
| "eval_token_acc": 0.8598125188993045, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.4913151364764268, | |
| "grad_norm": 0.3616597597470475, | |
| "learning_rate": 4.495742454257418e-06, | |
| "loss": 0.2223682403564453, | |
| "memory(GiB)": 89.94, | |
| "step": 705, | |
| "token_acc": 0.9105715421148296, | |
| "train_speed(iter/s)": 0.016371 | |
| }, | |
| { | |
| "epoch": 3.5161290322580645, | |
| "grad_norm": 0.3988361808995699, | |
| "learning_rate": 4.359028012553362e-06, | |
| "loss": 0.2163018226623535, | |
| "memory(GiB)": 89.94, | |
| "step": 710, | |
| "token_acc": 0.9318655704692665, | |
| "train_speed(iter/s)": 0.01638 | |
| }, | |
| { | |
| "epoch": 3.5409429280397022, | |
| "grad_norm": 0.3932524566015415, | |
| "learning_rate": 4.223842851296907e-06, | |
| "loss": 0.23104467391967773, | |
| "memory(GiB)": 89.94, | |
| "step": 715, | |
| "token_acc": 0.9287522767981982, | |
| "train_speed(iter/s)": 0.016381 | |
| }, | |
| { | |
| "epoch": 3.56575682382134, | |
| "grad_norm": 0.367974151309137, | |
| "learning_rate": 4.090223619495419e-06, | |
| "loss": 0.23323664665222169, | |
| "memory(GiB)": 89.94, | |
| "step": 720, | |
| "token_acc": 0.9070000777236852, | |
| "train_speed(iter/s)": 0.016385 | |
| }, | |
| { | |
| "epoch": 3.5905707196029777, | |
| "grad_norm": 0.39001981721162915, | |
| "learning_rate": 3.9582065416291926e-06, | |
| "loss": 0.22505450248718262, | |
| "memory(GiB)": 89.94, | |
| "step": 725, | |
| "token_acc": 0.9127140748875999, | |
| "train_speed(iter/s)": 0.016399 | |
| }, | |
| { | |
| "epoch": 3.6153846153846154, | |
| "grad_norm": 0.3767818229832471, | |
| "learning_rate": 3.827827407830917e-06, | |
| "loss": 0.2194854736328125, | |
| "memory(GiB)": 89.94, | |
| "step": 730, | |
| "token_acc": 0.9305206381130868, | |
| "train_speed(iter/s)": 0.0164 | |
| }, | |
| { | |
| "epoch": 3.640198511166253, | |
| "grad_norm": 0.41209090580965746, | |
| "learning_rate": 3.6991215641828903e-06, | |
| "loss": 0.217703914642334, | |
| "memory(GiB)": 89.94, | |
| "step": 735, | |
| "token_acc": 0.918243838028169, | |
| "train_speed(iter/s)": 0.016398 | |
| }, | |
| { | |
| "epoch": 3.665012406947891, | |
| "grad_norm": 0.37397331270090406, | |
| "learning_rate": 3.5721239031346067e-06, | |
| "loss": 0.2251272201538086, | |
| "memory(GiB)": 89.94, | |
| "step": 740, | |
| "token_acc": 0.9240038816389786, | |
| "train_speed(iter/s)": 0.016395 | |
| }, | |
| { | |
| "epoch": 3.6898263027295286, | |
| "grad_norm": 0.40120593317876174, | |
| "learning_rate": 3.4468688540433425e-06, | |
| "loss": 0.22675325870513915, | |
| "memory(GiB)": 89.94, | |
| "step": 745, | |
| "token_acc": 0.9157191822608735, | |
| "train_speed(iter/s)": 0.016404 | |
| }, | |
| { | |
| "epoch": 3.7146401985111663, | |
| "grad_norm": 0.36315220537470405, | |
| "learning_rate": 3.323390373840276e-06, | |
| "loss": 0.23883156776428222, | |
| "memory(GiB)": 89.94, | |
| "step": 750, | |
| "token_acc": 0.9140917431192661, | |
| "train_speed(iter/s)": 0.016405 | |
| }, | |
| { | |
| "epoch": 3.7146401985111663, | |
| "eval_loss": 0.3998472988605499, | |
| "eval_runtime": 37.7998, | |
| "eval_samples_per_second": 6.852, | |
| "eval_steps_per_second": 0.45, | |
| "eval_token_acc": 0.8604498615989393, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.739454094292804, | |
| "grad_norm": 0.35204470112010783, | |
| "learning_rate": 3.2017219378246734e-06, | |
| "loss": 0.22259049415588378, | |
| "memory(GiB)": 89.94, | |
| "step": 755, | |
| "token_acc": 0.9101186207181223, | |
| "train_speed(iter/s)": 0.016387 | |
| }, | |
| { | |
| "epoch": 3.764267990074442, | |
| "grad_norm": 0.3243960514310002, | |
| "learning_rate": 3.0818965305886794e-06, | |
| "loss": 0.22415781021118164, | |
| "memory(GiB)": 89.94, | |
| "step": 760, | |
| "token_acc": 0.9288042256686896, | |
| "train_speed(iter/s)": 0.016394 | |
| }, | |
| { | |
| "epoch": 3.7890818858560795, | |
| "grad_norm": 0.4054124995924202, | |
| "learning_rate": 2.963946637075107e-06, | |
| "loss": 0.2139434337615967, | |
| "memory(GiB)": 89.94, | |
| "step": 765, | |
| "token_acc": 0.897647245531552, | |
| "train_speed(iter/s)": 0.016399 | |
| }, | |
| { | |
| "epoch": 3.8138957816377173, | |
| "grad_norm": 0.4068548613060828, | |
| "learning_rate": 2.847904233770692e-06, | |
| "loss": 0.23969681262969972, | |
| "memory(GiB)": 89.94, | |
| "step": 770, | |
| "token_acc": 0.9088209109362202, | |
| "train_speed(iter/s)": 0.016384 | |
| }, | |
| { | |
| "epoch": 3.838709677419355, | |
| "grad_norm": 0.37626609353926294, | |
| "learning_rate": 2.7338007800372024e-06, | |
| "loss": 0.2259690284729004, | |
| "memory(GiB)": 89.94, | |
| "step": 775, | |
| "token_acc": 0.9220249520153551, | |
| "train_speed(iter/s)": 0.016383 | |
| }, | |
| { | |
| "epoch": 3.8635235732009927, | |
| "grad_norm": 0.3294821061527916, | |
| "learning_rate": 2.6216672095827267e-06, | |
| "loss": 0.2296532154083252, | |
| "memory(GiB)": 89.94, | |
| "step": 780, | |
| "token_acc": 0.9264005360302354, | |
| "train_speed(iter/s)": 0.016383 | |
| }, | |
| { | |
| "epoch": 3.8883374689826304, | |
| "grad_norm": 0.4192434658643637, | |
| "learning_rate": 2.5115339220754796e-06, | |
| "loss": 0.21465823650360108, | |
| "memory(GiB)": 89.94, | |
| "step": 785, | |
| "token_acc": 0.9244031530683692, | |
| "train_speed(iter/s)": 0.016387 | |
| }, | |
| { | |
| "epoch": 3.9131513647642677, | |
| "grad_norm": 0.3555370222229151, | |
| "learning_rate": 2.403430774902373e-06, | |
| "loss": 0.23048720359802247, | |
| "memory(GiB)": 89.94, | |
| "step": 790, | |
| "token_acc": 0.9265224313413317, | |
| "train_speed(iter/s)": 0.016385 | |
| }, | |
| { | |
| "epoch": 3.937965260545906, | |
| "grad_norm": 0.32165359038918095, | |
| "learning_rate": 2.2973870750746253e-06, | |
| "loss": 0.21675100326538085, | |
| "memory(GiB)": 89.94, | |
| "step": 795, | |
| "token_acc": 0.9277675867070517, | |
| "train_speed(iter/s)": 0.016387 | |
| }, | |
| { | |
| "epoch": 3.962779156327543, | |
| "grad_norm": 0.3691842237508398, | |
| "learning_rate": 2.193431571282548e-06, | |
| "loss": 0.22982077598571776, | |
| "memory(GiB)": 89.94, | |
| "step": 800, | |
| "token_acc": 0.9162534196640608, | |
| "train_speed(iter/s)": 0.016398 | |
| }, | |
| { | |
| "epoch": 3.962779156327543, | |
| "eval_loss": 0.3970131278038025, | |
| "eval_runtime": 37.5975, | |
| "eval_samples_per_second": 6.889, | |
| "eval_steps_per_second": 0.452, | |
| "eval_token_acc": 0.8609941615687005, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 3.9875930521091814, | |
| "grad_norm": 0.3711396772864637, | |
| "learning_rate": 2.09159244610172e-06, | |
| "loss": 0.22216348648071288, | |
| "memory(GiB)": 89.94, | |
| "step": 805, | |
| "token_acc": 0.8936650190172618, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 4.009925558312655, | |
| "grad_norm": 0.48689819654773586, | |
| "learning_rate": 1.991897308352624e-06, | |
| "loss": 0.21132183074951172, | |
| "memory(GiB)": 89.94, | |
| "step": 810, | |
| "token_acc": 0.9407312130437598, | |
| "train_speed(iter/s)": 0.016398 | |
| }, | |
| { | |
| "epoch": 4.034739454094293, | |
| "grad_norm": 0.5032884501009073, | |
| "learning_rate": 1.8943731856158299e-06, | |
| "loss": 0.18714178800582887, | |
| "memory(GiB)": 89.94, | |
| "step": 815, | |
| "token_acc": 0.9151910921344552, | |
| "train_speed(iter/s)": 0.016407 | |
| }, | |
| { | |
| "epoch": 4.05955334987593, | |
| "grad_norm": 0.47787650705591495, | |
| "learning_rate": 1.799046516904751e-06, | |
| "loss": 0.19512782096862794, | |
| "memory(GiB)": 89.94, | |
| "step": 820, | |
| "token_acc": 0.9267424518609353, | |
| "train_speed(iter/s)": 0.016407 | |
| }, | |
| { | |
| "epoch": 4.084367245657568, | |
| "grad_norm": 0.3998650942092245, | |
| "learning_rate": 1.7059431454979825e-06, | |
| "loss": 0.19887795448303222, | |
| "memory(GiB)": 89.94, | |
| "step": 825, | |
| "token_acc": 0.9344387354439184, | |
| "train_speed(iter/s)": 0.016408 | |
| }, | |
| { | |
| "epoch": 4.109181141439206, | |
| "grad_norm": 0.3511648376611919, | |
| "learning_rate": 1.615088311933114e-06, | |
| "loss": 0.20051450729370118, | |
| "memory(GiB)": 89.94, | |
| "step": 830, | |
| "token_acc": 0.94342090168636, | |
| "train_speed(iter/s)": 0.016408 | |
| }, | |
| { | |
| "epoch": 4.133995037220844, | |
| "grad_norm": 0.36198509668580564, | |
| "learning_rate": 1.5265066471639701e-06, | |
| "loss": 0.19646989107131957, | |
| "memory(GiB)": 89.94, | |
| "step": 835, | |
| "token_acc": 0.9299070545334905, | |
| "train_speed(iter/s)": 0.016403 | |
| }, | |
| { | |
| "epoch": 4.158808933002481, | |
| "grad_norm": 0.31845761934810685, | |
| "learning_rate": 1.4402221658830963e-06, | |
| "loss": 0.1856994390487671, | |
| "memory(GiB)": 89.94, | |
| "step": 840, | |
| "token_acc": 0.9413985177001335, | |
| "train_speed(iter/s)": 0.016403 | |
| }, | |
| { | |
| "epoch": 4.183622828784119, | |
| "grad_norm": 0.31837636190529284, | |
| "learning_rate": 1.3562582600113295e-06, | |
| "loss": 0.18745067119598388, | |
| "memory(GiB)": 89.94, | |
| "step": 845, | |
| "token_acc": 0.917115642208662, | |
| "train_speed(iter/s)": 0.016401 | |
| }, | |
| { | |
| "epoch": 4.208436724565757, | |
| "grad_norm": 0.36629490437638673, | |
| "learning_rate": 1.274637692356181e-06, | |
| "loss": 0.19683722257614136, | |
| "memory(GiB)": 89.94, | |
| "step": 850, | |
| "token_acc": 0.9203514759298465, | |
| "train_speed(iter/s)": 0.0164 | |
| }, | |
| { | |
| "epoch": 4.208436724565757, | |
| "eval_loss": 0.4071538746356964, | |
| "eval_runtime": 37.7446, | |
| "eval_samples_per_second": 6.862, | |
| "eval_steps_per_second": 0.45, | |
| "eval_token_acc": 0.860109092600777, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.233250620347395, | |
| "grad_norm": 0.35352317433834707, | |
| "learning_rate": 1.1953825904408033e-06, | |
| "loss": 0.1799285888671875, | |
| "memory(GiB)": 89.94, | |
| "step": 855, | |
| "token_acc": 0.9219630589493604, | |
| "train_speed(iter/s)": 0.016381 | |
| }, | |
| { | |
| "epoch": 4.258064516129032, | |
| "grad_norm": 0.3360379010356317, | |
| "learning_rate": 1.118514440505155e-06, | |
| "loss": 0.19137413501739503, | |
| "memory(GiB)": 89.94, | |
| "step": 860, | |
| "token_acc": 0.9249040837868416, | |
| "train_speed(iter/s)": 0.016385 | |
| }, | |
| { | |
| "epoch": 4.28287841191067, | |
| "grad_norm": 0.3435835544317236, | |
| "learning_rate": 1.0440540816810395e-06, | |
| "loss": 0.1967417359352112, | |
| "memory(GiB)": 89.94, | |
| "step": 865, | |
| "token_acc": 0.9308696646383973, | |
| "train_speed(iter/s)": 0.016379 | |
| }, | |
| { | |
| "epoch": 4.3076923076923075, | |
| "grad_norm": 0.32859421884913725, | |
| "learning_rate": 9.720217003425648e-07, | |
| "loss": 0.1809452772140503, | |
| "memory(GiB)": 89.94, | |
| "step": 870, | |
| "token_acc": 0.9282937629449756, | |
| "train_speed(iter/s)": 0.016382 | |
| }, | |
| { | |
| "epoch": 4.332506203473946, | |
| "grad_norm": 0.3395321554617945, | |
| "learning_rate": 9.024368246335735e-07, | |
| "loss": 0.18605422973632812, | |
| "memory(GiB)": 89.94, | |
| "step": 875, | |
| "token_acc": 0.9546436861343081, | |
| "train_speed(iter/s)": 0.016384 | |
| }, | |
| { | |
| "epoch": 4.357320099255583, | |
| "grad_norm": 0.3393236511573925, | |
| "learning_rate": 8.353183191735115e-07, | |
| "loss": 0.1946401596069336, | |
| "memory(GiB)": 89.94, | |
| "step": 880, | |
| "token_acc": 0.9236204495723095, | |
| "train_speed(iter/s)": 0.016384 | |
| }, | |
| { | |
| "epoch": 4.382133995037221, | |
| "grad_norm": 0.34078095822336996, | |
| "learning_rate": 7.706843799431985e-07, | |
| "loss": 0.18827946186065675, | |
| "memory(GiB)": 89.94, | |
| "step": 885, | |
| "token_acc": 0.9302919345345024, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 4.406947890818858, | |
| "grad_norm": 0.353572914930731, | |
| "learning_rate": 7.085525293518469e-07, | |
| "loss": 0.1933911681175232, | |
| "memory(GiB)": 89.94, | |
| "step": 890, | |
| "token_acc": 0.9303721907052539, | |
| "train_speed(iter/s)": 0.016395 | |
| }, | |
| { | |
| "epoch": 4.431761786600497, | |
| "grad_norm": 0.34090308953100995, | |
| "learning_rate": 6.489396114866942e-07, | |
| "loss": 0.18675589561462402, | |
| "memory(GiB)": 89.94, | |
| "step": 895, | |
| "token_acc": 0.9425964870708806, | |
| "train_speed(iter/s)": 0.016397 | |
| }, | |
| { | |
| "epoch": 4.456575682382134, | |
| "grad_norm": 0.34608919518252224, | |
| "learning_rate": 5.918617875465449e-07, | |
| "loss": 0.19207412004470825, | |
| "memory(GiB)": 89.94, | |
| "step": 900, | |
| "token_acc": 0.9400631757501285, | |
| "train_speed(iter/s)": 0.016399 | |
| }, | |
| { | |
| "epoch": 4.456575682382134, | |
| "eval_loss": 0.40813976526260376, | |
| "eval_runtime": 37.5655, | |
| "eval_samples_per_second": 6.895, | |
| "eval_steps_per_second": 0.453, | |
| "eval_token_acc": 0.8603184387429927, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.481389578163772, | |
| "grad_norm": 0.35859843949116793, | |
| "learning_rate": 5.373345314604206e-07, | |
| "loss": 0.2071406364440918, | |
| "memory(GiB)": 89.94, | |
| "step": 905, | |
| "token_acc": 0.9223047638884068, | |
| "train_speed(iter/s)": 0.016383 | |
| }, | |
| { | |
| "epoch": 4.506203473945409, | |
| "grad_norm": 0.3089222557610059, | |
| "learning_rate": 4.853726256925407e-07, | |
| "loss": 0.1951405882835388, | |
| "memory(GiB)": 89.94, | |
| "step": 910, | |
| "token_acc": 0.9288247402227017, | |
| "train_speed(iter/s)": 0.016382 | |
| }, | |
| { | |
| "epoch": 4.5310173697270475, | |
| "grad_norm": 0.3365251848175135, | |
| "learning_rate": 4.359901572347758e-07, | |
| "loss": 0.19402856826782228, | |
| "memory(GiB)": 89.94, | |
| "step": 915, | |
| "token_acc": 0.9069553201289728, | |
| "train_speed(iter/s)": 0.016381 | |
| }, | |
| { | |
| "epoch": 4.555831265508685, | |
| "grad_norm": 0.352511343998469, | |
| "learning_rate": 3.892005137876209e-07, | |
| "loss": 0.18840408325195312, | |
| "memory(GiB)": 89.94, | |
| "step": 920, | |
| "token_acc": 0.9290194762860235, | |
| "train_speed(iter/s)": 0.016381 | |
| }, | |
| { | |
| "epoch": 4.580645161290323, | |
| "grad_norm": 0.31455756384327627, | |
| "learning_rate": 3.450163801307582e-07, | |
| "loss": 0.1860198974609375, | |
| "memory(GiB)": 89.94, | |
| "step": 925, | |
| "token_acc": 0.9461412451458935, | |
| "train_speed(iter/s)": 0.016385 | |
| }, | |
| { | |
| "epoch": 4.60545905707196, | |
| "grad_norm": 0.31757641861016306, | |
| "learning_rate": 3.034497346841958e-07, | |
| "loss": 0.1895312786102295, | |
| "memory(GiB)": 89.94, | |
| "step": 930, | |
| "token_acc": 0.918751311402793, | |
| "train_speed(iter/s)": 0.016385 | |
| }, | |
| { | |
| "epoch": 4.630272952853598, | |
| "grad_norm": 0.34601426382495787, | |
| "learning_rate": 2.6451184626087646e-07, | |
| "loss": 0.19062964916229247, | |
| "memory(GiB)": 89.94, | |
| "step": 935, | |
| "token_acc": 0.9377439769272455, | |
| "train_speed(iter/s)": 0.016388 | |
| }, | |
| { | |
| "epoch": 4.655086848635236, | |
| "grad_norm": 0.3686772430396372, | |
| "learning_rate": 2.2821327101168578e-07, | |
| "loss": 0.18338959217071532, | |
| "memory(GiB)": 89.94, | |
| "step": 940, | |
| "token_acc": 0.9286465593172508, | |
| "train_speed(iter/s)": 0.016389 | |
| }, | |
| { | |
| "epoch": 4.679900744416873, | |
| "grad_norm": 0.3329148442481642, | |
| "learning_rate": 1.9456384956365149e-07, | |
| "loss": 0.17848238945007325, | |
| "memory(GiB)": 89.94, | |
| "step": 945, | |
| "token_acc": 0.9242079340262307, | |
| "train_speed(iter/s)": 0.016388 | |
| }, | |
| { | |
| "epoch": 4.704714640198511, | |
| "grad_norm": 0.33580830230246933, | |
| "learning_rate": 1.6357270435212736e-07, | |
| "loss": 0.19694331884384156, | |
| "memory(GiB)": 89.94, | |
| "step": 950, | |
| "token_acc": 0.9275961363852546, | |
| "train_speed(iter/s)": 0.01639 | |
| }, | |
| { | |
| "epoch": 4.704714640198511, | |
| "eval_loss": 0.4078960418701172, | |
| "eval_runtime": 37.2595, | |
| "eval_samples_per_second": 6.951, | |
| "eval_steps_per_second": 0.456, | |
| "eval_token_acc": 0.8605452303970599, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 4.729528535980149, | |
| "grad_norm": 0.3214090891564049, | |
| "learning_rate": 1.3524823714768375e-07, | |
| "loss": 0.19557987451553344, | |
| "memory(GiB)": 89.94, | |
| "step": 955, | |
| "token_acc": 0.917459338194055, | |
| "train_speed(iter/s)": 0.016374 | |
| }, | |
| { | |
| "epoch": 4.754342431761787, | |
| "grad_norm": 0.3510644774133595, | |
| "learning_rate": 1.0959812677835968e-07, | |
| "loss": 0.18159072399139403, | |
| "memory(GiB)": 89.94, | |
| "step": 960, | |
| "token_acc": 0.9338328114497434, | |
| "train_speed(iter/s)": 0.016382 | |
| }, | |
| { | |
| "epoch": 4.779156327543424, | |
| "grad_norm": 0.31852083171219153, | |
| "learning_rate": 8.662932704792793e-08, | |
| "loss": 0.18122289180755616, | |
| "memory(GiB)": 89.94, | |
| "step": 965, | |
| "token_acc": 0.9309320132692916, | |
| "train_speed(iter/s)": 0.016382 | |
| }, | |
| { | |
| "epoch": 4.803970223325062, | |
| "grad_norm": 0.318877312082489, | |
| "learning_rate": 6.63480648506909e-08, | |
| "loss": 0.19023412466049194, | |
| "memory(GiB)": 89.94, | |
| "step": 970, | |
| "token_acc": 0.9162366937555632, | |
| "train_speed(iter/s)": 0.016379 | |
| }, | |
| { | |
| "epoch": 4.8287841191067, | |
| "grad_norm": 0.34651828579056343, | |
| "learning_rate": 4.8759838483358745e-08, | |
| "loss": 0.18698248863220215, | |
| "memory(GiB)": 89.94, | |
| "step": 975, | |
| "token_acc": 0.9286984711087747, | |
| "train_speed(iter/s)": 0.016384 | |
| }, | |
| { | |
| "epoch": 4.8535980148883375, | |
| "grad_norm": 0.3062976957364449, | |
| "learning_rate": 3.386941615445283e-08, | |
| "loss": 0.18977559804916383, | |
| "memory(GiB)": 89.94, | |
| "step": 980, | |
| "token_acc": 0.9407679833647572, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 4.878411910669975, | |
| "grad_norm": 0.29457610025451847, | |
| "learning_rate": 2.1680834691628627e-08, | |
| "loss": 0.19420017004013063, | |
| "memory(GiB)": 89.94, | |
| "step": 985, | |
| "token_acc": 0.9039991194174251, | |
| "train_speed(iter/s)": 0.016388 | |
| }, | |
| { | |
| "epoch": 4.903225806451613, | |
| "grad_norm": 0.3316519420007468, | |
| "learning_rate": 1.2197398447283404e-08, | |
| "loss": 0.1931779146194458, | |
| "memory(GiB)": 89.94, | |
| "step": 990, | |
| "token_acc": 0.9029619269298459, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 4.92803970223325, | |
| "grad_norm": 0.34572266929696366, | |
| "learning_rate": 5.421678402741659e-09, | |
| "loss": 0.1826627492904663, | |
| "memory(GiB)": 89.94, | |
| "step": 995, | |
| "token_acc": 0.9375434960795533, | |
| "train_speed(iter/s)": 0.016383 | |
| }, | |
| { | |
| "epoch": 4.9528535980148884, | |
| "grad_norm": 0.3466792398716006, | |
| "learning_rate": 1.3555114712526796e-09, | |
| "loss": 0.18890198469161987, | |
| "memory(GiB)": 89.94, | |
| "step": 1000, | |
| "token_acc": 0.9408911997667525, | |
| "train_speed(iter/s)": 0.016393 | |
| }, | |
| { | |
| "epoch": 4.9528535980148884, | |
| "eval_loss": 0.40783175826072693, | |
| "eval_runtime": 37.8642, | |
| "eval_samples_per_second": 6.84, | |
| "eval_steps_per_second": 0.449, | |
| "eval_token_acc": 0.8604533507013096, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 4.977667493796526, | |
| "grad_norm": 0.36020979675081666, | |
| "learning_rate": 0.0, | |
| "loss": 0.1956263303756714, | |
| "memory(GiB)": 89.94, | |
| "step": 1005, | |
| "token_acc": 0.9227749975468551, | |
| "train_speed(iter/s)": 0.016386 | |
| }, | |
| { | |
| "epoch": 4.977667493796526, | |
| "eval_loss": 0.40774813294410706, | |
| "eval_runtime": 37.2531, | |
| "eval_samples_per_second": 6.952, | |
| "eval_steps_per_second": 0.456, | |
| "eval_token_acc": 0.8606568816729083, | |
| "step": 1005 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1005, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2269146859438080.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |