{ "best_metric": 0.50855798, "best_model_checkpoint": "/home/xtommy/output/v3-20250310-030602/checkpoint-100", "epoch": 1.6788218793828893, "eval_steps": 100, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005610098176718092, "grad_norm": 21.143158969575165, "learning_rate": 3.7037037037037036e-07, "loss": 0.9654525518417358, "memory(GiB)": 33.04, "step": 1, "token_acc": 0.7665198237885462, "train_speed(iter/s)": 0.062201 }, { "epoch": 0.028050490883590462, "grad_norm": 15.87742952977485, "learning_rate": 1.8518518518518519e-06, "loss": 0.9609600901603699, "memory(GiB)": 34.92, "step": 5, "token_acc": 0.7755921316740265, "train_speed(iter/s)": 0.138938 }, { "epoch": 0.056100981767180924, "grad_norm": 5.606319501079557, "learning_rate": 3.7037037037037037e-06, "loss": 0.7228862762451171, "memory(GiB)": 34.92, "step": 10, "token_acc": 0.8462011688711166, "train_speed(iter/s)": 0.164215 }, { "epoch": 0.08415147265077139, "grad_norm": 4.732447800873768, "learning_rate": 5.555555555555557e-06, "loss": 0.6514874458312988, "memory(GiB)": 40.13, "step": 15, "token_acc": 0.8574363188167625, "train_speed(iter/s)": 0.173887 }, { "epoch": 0.11220196353436185, "grad_norm": 3.8882854024163236, "learning_rate": 7.4074074074074075e-06, "loss": 0.5766544342041016, "memory(GiB)": 40.13, "step": 20, "token_acc": 0.8238352329534093, "train_speed(iter/s)": 0.175922 }, { "epoch": 0.1402524544179523, "grad_norm": 3.158738313824327, "learning_rate": 9.25925925925926e-06, "loss": 0.5444880485534668, "memory(GiB)": 40.13, "step": 25, "token_acc": 0.8063087991145546, "train_speed(iter/s)": 0.180858 }, { "epoch": 0.16830294530154277, "grad_norm": 3.1734171494699743, "learning_rate": 9.999136119166803e-06, "loss": 0.593137550354004, "memory(GiB)": 40.13, "step": 30, "token_acc": 0.8212754555198285, "train_speed(iter/s)": 0.183523 }, { "epoch": 0.19635343618513323, "grad_norm": 3.092317172885569, "learning_rate": 9.99385792841537e-06, "loss": 0.5608675003051757, "memory(GiB)": 40.13, "step": 35, "token_acc": 0.7960634226353198, "train_speed(iter/s)": 0.186745 }, { "epoch": 0.2244039270687237, "grad_norm": 3.1337584802773026, "learning_rate": 9.983786540671052e-06, "loss": 0.48392491340637206, "memory(GiB)": 46.85, "step": 40, "token_acc": 0.8518429294145989, "train_speed(iter/s)": 0.186859 }, { "epoch": 0.25245441795231416, "grad_norm": 2.8100754489286737, "learning_rate": 9.968931622637652e-06, "loss": 0.5289750099182129, "memory(GiB)": 46.85, "step": 45, "token_acc": 0.831301318736004, "train_speed(iter/s)": 0.186705 }, { "epoch": 0.2805049088359046, "grad_norm": 2.7591739800649386, "learning_rate": 9.949307432339625e-06, "loss": 0.5157864570617676, "memory(GiB)": 46.85, "step": 50, "token_acc": 0.8248620389195469, "train_speed(iter/s)": 0.186574 }, { "epoch": 0.3085553997194951, "grad_norm": 3.1485563943729478, "learning_rate": 9.92493280543695e-06, "loss": 0.4699361324310303, "memory(GiB)": 46.85, "step": 55, "token_acc": 0.8659049909801564, "train_speed(iter/s)": 0.186987 }, { "epoch": 0.33660589060308554, "grad_norm": 2.4744606435719922, "learning_rate": 9.895831137146319e-06, "loss": 0.48673105239868164, "memory(GiB)": 46.85, "step": 60, "token_acc": 0.8440202803459589, "train_speed(iter/s)": 0.186175 }, { "epoch": 0.364656381486676, "grad_norm": 2.910446571594773, "learning_rate": 9.86203035978598e-06, "loss": 0.49613585472106936, "memory(GiB)": 46.85, "step": 65, "token_acc": 0.7877989460883664, "train_speed(iter/s)": 0.18724 }, { "epoch": 0.39270687237026647, "grad_norm": 2.697935385144819, "learning_rate": 9.82356291596578e-06, "loss": 0.49748897552490234, "memory(GiB)": 46.85, "step": 70, "token_acc": 0.8412732784755306, "train_speed(iter/s)": 0.187758 }, { "epoch": 0.42075736325385693, "grad_norm": 2.66839088833363, "learning_rate": 9.78046572744815e-06, "loss": 0.4695857524871826, "memory(GiB)": 46.85, "step": 75, "token_acc": 0.8216541353383459, "train_speed(iter/s)": 0.188863 }, { "epoch": 0.4488078541374474, "grad_norm": 2.5275349483407297, "learning_rate": 9.732780159709912e-06, "loss": 0.49554810523986814, "memory(GiB)": 46.85, "step": 80, "token_acc": 0.8578199052132701, "train_speed(iter/s)": 0.189432 }, { "epoch": 0.47685834502103785, "grad_norm": 2.9933704329230877, "learning_rate": 9.680551982238941e-06, "loss": 0.5124300479888916, "memory(GiB)": 46.85, "step": 85, "token_acc": 0.8131634819532909, "train_speed(iter/s)": 0.190357 }, { "epoch": 0.5049088359046283, "grad_norm": 2.4574221278857937, "learning_rate": 9.623831324603755e-06, "loss": 0.47562427520751954, "memory(GiB)": 46.85, "step": 90, "token_acc": 0.8389690234097895, "train_speed(iter/s)": 0.191037 }, { "epoch": 0.5329593267882188, "grad_norm": 2.7294720250367632, "learning_rate": 9.562672628338233e-06, "loss": 0.5044757843017578, "memory(GiB)": 46.85, "step": 95, "token_acc": 0.8220171390903098, "train_speed(iter/s)": 0.192157 }, { "epoch": 0.5610098176718092, "grad_norm": 2.6006192761537856, "learning_rate": 9.497134594687635e-06, "loss": 0.46201863288879397, "memory(GiB)": 49.72, "step": 100, "token_acc": 0.8605141171512853, "train_speed(iter/s)": 0.192495 }, { "epoch": 0.5610098176718092, "eval_loss": 0.5085579752922058, "eval_runtime": 3.1807, "eval_samples_per_second": 18.235, "eval_steps_per_second": 2.515, "eval_token_acc": 0.8218710369720027, "step": 100 }, { "epoch": 0.5890603085553997, "grad_norm": 2.6462361602139293, "learning_rate": 9.427280128266049e-06, "loss": 0.46499032974243165, "memory(GiB)": 49.72, "step": 105, "token_acc": 0.8108732668450499, "train_speed(iter/s)": 0.174123 }, { "epoch": 0.6171107994389902, "grad_norm": 2.404335987566073, "learning_rate": 9.353176276679397e-06, "loss": 0.4948995113372803, "memory(GiB)": 49.72, "step": 110, "token_acc": 0.8358369098712446, "train_speed(iter/s)": 0.174701 }, { "epoch": 0.6451612903225806, "grad_norm": 2.335982137765959, "learning_rate": 9.274894166171888e-06, "loss": 0.4810373306274414, "memory(GiB)": 49.72, "step": 115, "token_acc": 0.8155024367602692, "train_speed(iter/s)": 0.175308 }, { "epoch": 0.6732117812061711, "grad_norm": 2.691261889996041, "learning_rate": 9.192508933357753e-06, "loss": 0.4936681747436523, "memory(GiB)": 49.72, "step": 120, "token_acc": 0.8138338904602725, "train_speed(iter/s)": 0.176098 }, { "epoch": 0.7012622720897616, "grad_norm": 2.358427438460444, "learning_rate": 9.106099653103729e-06, "loss": 0.47971954345703127, "memory(GiB)": 49.72, "step": 125, "token_acc": 0.840718018632129, "train_speed(iter/s)": 0.177087 }, { "epoch": 0.729312762973352, "grad_norm": 3.0915696354608015, "learning_rate": 9.015749262631537e-06, "loss": 0.47090797424316405, "memory(GiB)": 49.72, "step": 130, "token_acc": 0.8563148997826612, "train_speed(iter/s)": 0.177882 }, { "epoch": 0.7573632538569425, "grad_norm": 2.49717679636567, "learning_rate": 8.921544481913218e-06, "loss": 0.48613691329956055, "memory(GiB)": 49.72, "step": 135, "token_acc": 0.8506247633472169, "train_speed(iter/s)": 0.17847 }, { "epoch": 0.7854137447405329, "grad_norm": 2.4741070615503205, "learning_rate": 8.823575730435694e-06, "loss": 0.5012983798980712, "memory(GiB)": 49.72, "step": 140, "token_acc": 0.8351838144584258, "train_speed(iter/s)": 0.17935 }, { "epoch": 0.8134642356241234, "grad_norm": 2.2485265925756774, "learning_rate": 8.721937040414481e-06, "loss": 0.4845560073852539, "memory(GiB)": 49.72, "step": 145, "token_acc": 0.833864457191417, "train_speed(iter/s)": 0.179967 }, { "epoch": 0.8415147265077139, "grad_norm": 2.40666949019937, "learning_rate": 8.616725966539831e-06, "loss": 0.4853487014770508, "memory(GiB)": 49.72, "step": 150, "token_acc": 0.8687924725561944, "train_speed(iter/s)": 0.180227 }, { "epoch": 0.8695652173913043, "grad_norm": 2.5222048983104144, "learning_rate": 8.508043492341944e-06, "loss": 0.48848953247070315, "memory(GiB)": 49.72, "step": 155, "token_acc": 0.8692421991084696, "train_speed(iter/s)": 0.180912 }, { "epoch": 0.8976157082748948, "grad_norm": 2.8512141417678367, "learning_rate": 8.395993933265102e-06, "loss": 0.48795671463012696, "memory(GiB)": 49.72, "step": 160, "token_acc": 0.8160145115314849, "train_speed(iter/s)": 0.181882 }, { "epoch": 0.9256661991584852, "grad_norm": 2.2204983344675915, "learning_rate": 8.280684836543794e-06, "loss": 0.4629330635070801, "memory(GiB)": 49.72, "step": 165, "token_acc": 0.8467532467532467, "train_speed(iter/s)": 0.18227 }, { "epoch": 0.9537166900420757, "grad_norm": 2.203233035174616, "learning_rate": 8.162226877976886e-06, "loss": 0.45454840660095214, "memory(GiB)": 49.72, "step": 170, "token_acc": 0.8492569002123143, "train_speed(iter/s)": 0.182553 }, { "epoch": 0.9817671809256662, "grad_norm": 2.6557011149780605, "learning_rate": 8.040733755698954e-06, "loss": 0.4734921455383301, "memory(GiB)": 49.72, "step": 175, "token_acc": 0.8423556430446194, "train_speed(iter/s)": 0.182781 }, { "epoch": 1.0056100981767182, "grad_norm": 5.885778658962663, "learning_rate": 7.916322081050708e-06, "loss": 0.454329252243042, "memory(GiB)": 49.72, "step": 180, "token_acc": 0.8476148409893993, "train_speed(iter/s)": 0.18394 }, { "epoch": 1.0336605890603086, "grad_norm": 2.086470683255528, "learning_rate": 7.789111266653285e-06, "loss": 0.25520517826080324, "memory(GiB)": 49.72, "step": 185, "token_acc": 0.9368489583333334, "train_speed(iter/s)": 0.184327 }, { "epoch": 1.061711079943899, "grad_norm": 1.9585352119743946, "learning_rate": 7.6592234117938e-06, "loss": 0.24057292938232422, "memory(GiB)": 49.72, "step": 190, "token_acc": 0.8799342105263158, "train_speed(iter/s)": 0.184701 }, { "epoch": 1.0897615708274895, "grad_norm": 2.13660433129455, "learning_rate": 7.526783185232208e-06, "loss": 0.2536721706390381, "memory(GiB)": 49.72, "step": 195, "token_acc": 0.8971309928274821, "train_speed(iter/s)": 0.185477 }, { "epoch": 1.11781206171108, "grad_norm": 2.735124602693302, "learning_rate": 7.391917705541927e-06, "loss": 0.31099162101745603, "memory(GiB)": 49.72, "step": 200, "token_acc": 0.8841603680578377, "train_speed(iter/s)": 0.185836 }, { "epoch": 1.11781206171108, "eval_loss": 0.5298875570297241, "eval_runtime": 3.1775, "eval_samples_per_second": 18.254, "eval_steps_per_second": 2.518, "eval_token_acc": 0.8268461613501121, "step": 200 }, { "epoch": 1.1458625525946704, "grad_norm": 2.5196173114331852, "learning_rate": 7.254756419099074e-06, "loss": 0.25104780197143556, "memory(GiB)": 49.72, "step": 205, "token_acc": 0.8569846017503047, "train_speed(iter/s)": 0.176838 }, { "epoch": 1.1739130434782608, "grad_norm": 2.19812138169296, "learning_rate": 7.115430975837457e-06, "loss": 0.258597731590271, "memory(GiB)": 49.72, "step": 210, "token_acc": 0.9285714285714286, "train_speed(iter/s)": 0.177277 }, { "epoch": 1.2019635343618513, "grad_norm": 2.0506479675657125, "learning_rate": 6.974075102888535e-06, "loss": 0.23006267547607423, "memory(GiB)": 49.72, "step": 215, "token_acc": 0.9158527422990232, "train_speed(iter/s)": 0.177685 }, { "epoch": 1.230014025245442, "grad_norm": 2.4011901883900815, "learning_rate": 6.830824476227646e-06, "loss": 0.26652209758758544, "memory(GiB)": 49.72, "step": 220, "token_acc": 0.9189260331666228, "train_speed(iter/s)": 0.178144 }, { "epoch": 1.2580645161290323, "grad_norm": 2.133378788275781, "learning_rate": 6.685816590449708e-06, "loss": 0.24832301139831542, "memory(GiB)": 52.59, "step": 225, "token_acc": 0.9205128205128205, "train_speed(iter/s)": 0.178447 }, { "epoch": 1.2861150070126226, "grad_norm": 1.8251373128865753, "learning_rate": 6.539190626799366e-06, "loss": 0.2387561798095703, "memory(GiB)": 55.47, "step": 230, "token_acc": 0.9165306598274656, "train_speed(iter/s)": 0.178791 }, { "epoch": 1.3141654978962132, "grad_norm": 2.1628256679219677, "learning_rate": 6.391087319582264e-06, "loss": 0.2553119421005249, "memory(GiB)": 55.47, "step": 235, "token_acc": 0.9236641221374046, "train_speed(iter/s)": 0.178893 }, { "epoch": 1.3422159887798037, "grad_norm": 2.191493378761956, "learning_rate": 6.241648821085666e-06, "loss": 0.24976880550384523, "memory(GiB)": 55.47, "step": 240, "token_acc": 0.9153605015673981, "train_speed(iter/s)": 0.179274 }, { "epoch": 1.370266479663394, "grad_norm": 2.5207105988104903, "learning_rate": 6.091018565138062e-06, "loss": 0.2626498699188232, "memory(GiB)": 55.47, "step": 245, "token_acc": 0.8613016563493391, "train_speed(iter/s)": 0.179556 }, { "epoch": 1.3983169705469845, "grad_norm": 2.0765712307553956, "learning_rate": 5.939341129438739e-06, "loss": 0.23375244140625, "memory(GiB)": 55.47, "step": 250, "token_acc": 0.907565069326198, "train_speed(iter/s)": 0.179806 }, { "epoch": 1.426367461430575, "grad_norm": 2.1185412816077758, "learning_rate": 5.786762096789431e-06, "loss": 0.25868446826934816, "memory(GiB)": 55.47, "step": 255, "token_acc": 0.9030013049151805, "train_speed(iter/s)": 0.180282 }, { "epoch": 1.4544179523141656, "grad_norm": 1.91620250808909, "learning_rate": 5.633427915361261e-06, "loss": 0.23945794105529786, "memory(GiB)": 55.47, "step": 260, "token_acc": 0.9035011322069326, "train_speed(iter/s)": 0.180408 }, { "epoch": 1.482468443197756, "grad_norm": 2.6109664603512748, "learning_rate": 5.479485758131089e-06, "loss": 0.24080324172973633, "memory(GiB)": 55.47, "step": 265, "token_acc": 0.9038590604026846, "train_speed(iter/s)": 0.18076 }, { "epoch": 1.5105189340813463, "grad_norm": 2.4134207120025866, "learning_rate": 5.325083381622165e-06, "loss": 0.2503629684448242, "memory(GiB)": 55.47, "step": 270, "token_acc": 0.9309480401093893, "train_speed(iter/s)": 0.181035 }, { "epoch": 1.5385694249649369, "grad_norm": 2.0852670092638164, "learning_rate": 5.170368984084695e-06, "loss": 0.24387457370758056, "memory(GiB)": 55.47, "step": 275, "token_acc": 0.9182186234817814, "train_speed(iter/s)": 0.181332 }, { "epoch": 1.5666199158485274, "grad_norm": 2.269858573325917, "learning_rate": 5.01549106325243e-06, "loss": 0.24088690280914307, "memory(GiB)": 55.47, "step": 280, "token_acc": 0.9289772727272727, "train_speed(iter/s)": 0.181602 }, { "epoch": 1.5946704067321178, "grad_norm": 2.3189439894004957, "learning_rate": 4.860598273811793e-06, "loss": 0.25754590034484864, "memory(GiB)": 55.47, "step": 285, "token_acc": 0.9309855154785572, "train_speed(iter/s)": 0.181856 }, { "epoch": 1.6227208976157081, "grad_norm": 2.123604145448152, "learning_rate": 4.705839284720376e-06, "loss": 0.23919339179992677, "memory(GiB)": 55.47, "step": 290, "token_acc": 0.9271794871794872, "train_speed(iter/s)": 0.182 }, { "epoch": 1.6507713884992987, "grad_norm": 2.0839399594164645, "learning_rate": 4.55136263651172e-06, "loss": 0.24165868759155273, "memory(GiB)": 55.47, "step": 295, "token_acc": 0.8970849626733025, "train_speed(iter/s)": 0.181991 }, { "epoch": 1.6788218793828893, "grad_norm": 2.638338783284183, "learning_rate": 4.397316598723385e-06, "loss": 0.26505722999572756, "memory(GiB)": 55.47, "step": 300, "token_acc": 0.9129909365558913, "train_speed(iter/s)": 0.18223 }, { "epoch": 1.6788218793828893, "eval_loss": 0.5159281492233276, "eval_runtime": 3.1746, "eval_samples_per_second": 18.27, "eval_steps_per_second": 2.52, "eval_token_acc": 0.8280167788508438, "step": 300 } ], "logging_steps": 5, "max_steps": 534, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 29991394213888.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }