| { |
| "best_metric": 0.50855798, |
| "best_model_checkpoint": "/home/xtommy/output/v3-20250310-030602/checkpoint-100", |
| "epoch": 1.6788218793828893, |
| "eval_steps": 100, |
| "global_step": 300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005610098176718092, |
| "grad_norm": 21.143158969575165, |
| "learning_rate": 3.7037037037037036e-07, |
| "loss": 0.9654525518417358, |
| "memory(GiB)": 33.04, |
| "step": 1, |
| "token_acc": 0.7665198237885462, |
| "train_speed(iter/s)": 0.062201 |
| }, |
| { |
| "epoch": 0.028050490883590462, |
| "grad_norm": 15.87742952977485, |
| "learning_rate": 1.8518518518518519e-06, |
| "loss": 0.9609600901603699, |
| "memory(GiB)": 34.92, |
| "step": 5, |
| "token_acc": 0.7755921316740265, |
| "train_speed(iter/s)": 0.138938 |
| }, |
| { |
| "epoch": 0.056100981767180924, |
| "grad_norm": 5.606319501079557, |
| "learning_rate": 3.7037037037037037e-06, |
| "loss": 0.7228862762451171, |
| "memory(GiB)": 34.92, |
| "step": 10, |
| "token_acc": 0.8462011688711166, |
| "train_speed(iter/s)": 0.164215 |
| }, |
| { |
| "epoch": 0.08415147265077139, |
| "grad_norm": 4.732447800873768, |
| "learning_rate": 5.555555555555557e-06, |
| "loss": 0.6514874458312988, |
| "memory(GiB)": 40.13, |
| "step": 15, |
| "token_acc": 0.8574363188167625, |
| "train_speed(iter/s)": 0.173887 |
| }, |
| { |
| "epoch": 0.11220196353436185, |
| "grad_norm": 3.8882854024163236, |
| "learning_rate": 7.4074074074074075e-06, |
| "loss": 0.5766544342041016, |
| "memory(GiB)": 40.13, |
| "step": 20, |
| "token_acc": 0.8238352329534093, |
| "train_speed(iter/s)": 0.175922 |
| }, |
| { |
| "epoch": 0.1402524544179523, |
| "grad_norm": 3.158738313824327, |
| "learning_rate": 9.25925925925926e-06, |
| "loss": 0.5444880485534668, |
| "memory(GiB)": 40.13, |
| "step": 25, |
| "token_acc": 0.8063087991145546, |
| "train_speed(iter/s)": 0.180858 |
| }, |
| { |
| "epoch": 0.16830294530154277, |
| "grad_norm": 3.1734171494699743, |
| "learning_rate": 9.999136119166803e-06, |
| "loss": 0.593137550354004, |
| "memory(GiB)": 40.13, |
| "step": 30, |
| "token_acc": 0.8212754555198285, |
| "train_speed(iter/s)": 0.183523 |
| }, |
| { |
| "epoch": 0.19635343618513323, |
| "grad_norm": 3.092317172885569, |
| "learning_rate": 9.99385792841537e-06, |
| "loss": 0.5608675003051757, |
| "memory(GiB)": 40.13, |
| "step": 35, |
| "token_acc": 0.7960634226353198, |
| "train_speed(iter/s)": 0.186745 |
| }, |
| { |
| "epoch": 0.2244039270687237, |
| "grad_norm": 3.1337584802773026, |
| "learning_rate": 9.983786540671052e-06, |
| "loss": 0.48392491340637206, |
| "memory(GiB)": 46.85, |
| "step": 40, |
| "token_acc": 0.8518429294145989, |
| "train_speed(iter/s)": 0.186859 |
| }, |
| { |
| "epoch": 0.25245441795231416, |
| "grad_norm": 2.8100754489286737, |
| "learning_rate": 9.968931622637652e-06, |
| "loss": 0.5289750099182129, |
| "memory(GiB)": 46.85, |
| "step": 45, |
| "token_acc": 0.831301318736004, |
| "train_speed(iter/s)": 0.186705 |
| }, |
| { |
| "epoch": 0.2805049088359046, |
| "grad_norm": 2.7591739800649386, |
| "learning_rate": 9.949307432339625e-06, |
| "loss": 0.5157864570617676, |
| "memory(GiB)": 46.85, |
| "step": 50, |
| "token_acc": 0.8248620389195469, |
| "train_speed(iter/s)": 0.186574 |
| }, |
| { |
| "epoch": 0.3085553997194951, |
| "grad_norm": 3.1485563943729478, |
| "learning_rate": 9.92493280543695e-06, |
| "loss": 0.4699361324310303, |
| "memory(GiB)": 46.85, |
| "step": 55, |
| "token_acc": 0.8659049909801564, |
| "train_speed(iter/s)": 0.186987 |
| }, |
| { |
| "epoch": 0.33660589060308554, |
| "grad_norm": 2.4744606435719922, |
| "learning_rate": 9.895831137146319e-06, |
| "loss": 0.48673105239868164, |
| "memory(GiB)": 46.85, |
| "step": 60, |
| "token_acc": 0.8440202803459589, |
| "train_speed(iter/s)": 0.186175 |
| }, |
| { |
| "epoch": 0.364656381486676, |
| "grad_norm": 2.910446571594773, |
| "learning_rate": 9.86203035978598e-06, |
| "loss": 0.49613585472106936, |
| "memory(GiB)": 46.85, |
| "step": 65, |
| "token_acc": 0.7877989460883664, |
| "train_speed(iter/s)": 0.18724 |
| }, |
| { |
| "epoch": 0.39270687237026647, |
| "grad_norm": 2.697935385144819, |
| "learning_rate": 9.82356291596578e-06, |
| "loss": 0.49748897552490234, |
| "memory(GiB)": 46.85, |
| "step": 70, |
| "token_acc": 0.8412732784755306, |
| "train_speed(iter/s)": 0.187758 |
| }, |
| { |
| "epoch": 0.42075736325385693, |
| "grad_norm": 2.66839088833363, |
| "learning_rate": 9.78046572744815e-06, |
| "loss": 0.4695857524871826, |
| "memory(GiB)": 46.85, |
| "step": 75, |
| "token_acc": 0.8216541353383459, |
| "train_speed(iter/s)": 0.188863 |
| }, |
| { |
| "epoch": 0.4488078541374474, |
| "grad_norm": 2.5275349483407297, |
| "learning_rate": 9.732780159709912e-06, |
| "loss": 0.49554810523986814, |
| "memory(GiB)": 46.85, |
| "step": 80, |
| "token_acc": 0.8578199052132701, |
| "train_speed(iter/s)": 0.189432 |
| }, |
| { |
| "epoch": 0.47685834502103785, |
| "grad_norm": 2.9933704329230877, |
| "learning_rate": 9.680551982238941e-06, |
| "loss": 0.5124300479888916, |
| "memory(GiB)": 46.85, |
| "step": 85, |
| "token_acc": 0.8131634819532909, |
| "train_speed(iter/s)": 0.190357 |
| }, |
| { |
| "epoch": 0.5049088359046283, |
| "grad_norm": 2.4574221278857937, |
| "learning_rate": 9.623831324603755e-06, |
| "loss": 0.47562427520751954, |
| "memory(GiB)": 46.85, |
| "step": 90, |
| "token_acc": 0.8389690234097895, |
| "train_speed(iter/s)": 0.191037 |
| }, |
| { |
| "epoch": 0.5329593267882188, |
| "grad_norm": 2.7294720250367632, |
| "learning_rate": 9.562672628338233e-06, |
| "loss": 0.5044757843017578, |
| "memory(GiB)": 46.85, |
| "step": 95, |
| "token_acc": 0.8220171390903098, |
| "train_speed(iter/s)": 0.192157 |
| }, |
| { |
| "epoch": 0.5610098176718092, |
| "grad_norm": 2.6006192761537856, |
| "learning_rate": 9.497134594687635e-06, |
| "loss": 0.46201863288879397, |
| "memory(GiB)": 49.72, |
| "step": 100, |
| "token_acc": 0.8605141171512853, |
| "train_speed(iter/s)": 0.192495 |
| }, |
| { |
| "epoch": 0.5610098176718092, |
| "eval_loss": 0.5085579752922058, |
| "eval_runtime": 3.1807, |
| "eval_samples_per_second": 18.235, |
| "eval_steps_per_second": 2.515, |
| "eval_token_acc": 0.8218710369720027, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5890603085553997, |
| "grad_norm": 2.6462361602139293, |
| "learning_rate": 9.427280128266049e-06, |
| "loss": 0.46499032974243165, |
| "memory(GiB)": 49.72, |
| "step": 105, |
| "token_acc": 0.8108732668450499, |
| "train_speed(iter/s)": 0.174123 |
| }, |
| { |
| "epoch": 0.6171107994389902, |
| "grad_norm": 2.404335987566073, |
| "learning_rate": 9.353176276679397e-06, |
| "loss": 0.4948995113372803, |
| "memory(GiB)": 49.72, |
| "step": 110, |
| "token_acc": 0.8358369098712446, |
| "train_speed(iter/s)": 0.174701 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 2.335982137765959, |
| "learning_rate": 9.274894166171888e-06, |
| "loss": 0.4810373306274414, |
| "memory(GiB)": 49.72, |
| "step": 115, |
| "token_acc": 0.8155024367602692, |
| "train_speed(iter/s)": 0.175308 |
| }, |
| { |
| "epoch": 0.6732117812061711, |
| "grad_norm": 2.691261889996041, |
| "learning_rate": 9.192508933357753e-06, |
| "loss": 0.4936681747436523, |
| "memory(GiB)": 49.72, |
| "step": 120, |
| "token_acc": 0.8138338904602725, |
| "train_speed(iter/s)": 0.176098 |
| }, |
| { |
| "epoch": 0.7012622720897616, |
| "grad_norm": 2.358427438460444, |
| "learning_rate": 9.106099653103729e-06, |
| "loss": 0.47971954345703127, |
| "memory(GiB)": 49.72, |
| "step": 125, |
| "token_acc": 0.840718018632129, |
| "train_speed(iter/s)": 0.177087 |
| }, |
| { |
| "epoch": 0.729312762973352, |
| "grad_norm": 3.0915696354608015, |
| "learning_rate": 9.015749262631537e-06, |
| "loss": 0.47090797424316405, |
| "memory(GiB)": 49.72, |
| "step": 130, |
| "token_acc": 0.8563148997826612, |
| "train_speed(iter/s)": 0.177882 |
| }, |
| { |
| "epoch": 0.7573632538569425, |
| "grad_norm": 2.49717679636567, |
| "learning_rate": 8.921544481913218e-06, |
| "loss": 0.48613691329956055, |
| "memory(GiB)": 49.72, |
| "step": 135, |
| "token_acc": 0.8506247633472169, |
| "train_speed(iter/s)": 0.17847 |
| }, |
| { |
| "epoch": 0.7854137447405329, |
| "grad_norm": 2.4741070615503205, |
| "learning_rate": 8.823575730435694e-06, |
| "loss": 0.5012983798980712, |
| "memory(GiB)": 49.72, |
| "step": 140, |
| "token_acc": 0.8351838144584258, |
| "train_speed(iter/s)": 0.17935 |
| }, |
| { |
| "epoch": 0.8134642356241234, |
| "grad_norm": 2.2485265925756774, |
| "learning_rate": 8.721937040414481e-06, |
| "loss": 0.4845560073852539, |
| "memory(GiB)": 49.72, |
| "step": 145, |
| "token_acc": 0.833864457191417, |
| "train_speed(iter/s)": 0.179967 |
| }, |
| { |
| "epoch": 0.8415147265077139, |
| "grad_norm": 2.40666949019937, |
| "learning_rate": 8.616725966539831e-06, |
| "loss": 0.4853487014770508, |
| "memory(GiB)": 49.72, |
| "step": 150, |
| "token_acc": 0.8687924725561944, |
| "train_speed(iter/s)": 0.180227 |
| }, |
| { |
| "epoch": 0.8695652173913043, |
| "grad_norm": 2.5222048983104144, |
| "learning_rate": 8.508043492341944e-06, |
| "loss": 0.48848953247070315, |
| "memory(GiB)": 49.72, |
| "step": 155, |
| "token_acc": 0.8692421991084696, |
| "train_speed(iter/s)": 0.180912 |
| }, |
| { |
| "epoch": 0.8976157082748948, |
| "grad_norm": 2.8512141417678367, |
| "learning_rate": 8.395993933265102e-06, |
| "loss": 0.48795671463012696, |
| "memory(GiB)": 49.72, |
| "step": 160, |
| "token_acc": 0.8160145115314849, |
| "train_speed(iter/s)": 0.181882 |
| }, |
| { |
| "epoch": 0.9256661991584852, |
| "grad_norm": 2.2204983344675915, |
| "learning_rate": 8.280684836543794e-06, |
| "loss": 0.4629330635070801, |
| "memory(GiB)": 49.72, |
| "step": 165, |
| "token_acc": 0.8467532467532467, |
| "train_speed(iter/s)": 0.18227 |
| }, |
| { |
| "epoch": 0.9537166900420757, |
| "grad_norm": 2.203233035174616, |
| "learning_rate": 8.162226877976886e-06, |
| "loss": 0.45454840660095214, |
| "memory(GiB)": 49.72, |
| "step": 170, |
| "token_acc": 0.8492569002123143, |
| "train_speed(iter/s)": 0.182553 |
| }, |
| { |
| "epoch": 0.9817671809256662, |
| "grad_norm": 2.6557011149780605, |
| "learning_rate": 8.040733755698954e-06, |
| "loss": 0.4734921455383301, |
| "memory(GiB)": 49.72, |
| "step": 175, |
| "token_acc": 0.8423556430446194, |
| "train_speed(iter/s)": 0.182781 |
| }, |
| { |
| "epoch": 1.0056100981767182, |
| "grad_norm": 5.885778658962663, |
| "learning_rate": 7.916322081050708e-06, |
| "loss": 0.454329252243042, |
| "memory(GiB)": 49.72, |
| "step": 180, |
| "token_acc": 0.8476148409893993, |
| "train_speed(iter/s)": 0.18394 |
| }, |
| { |
| "epoch": 1.0336605890603086, |
| "grad_norm": 2.086470683255528, |
| "learning_rate": 7.789111266653285e-06, |
| "loss": 0.25520517826080324, |
| "memory(GiB)": 49.72, |
| "step": 185, |
| "token_acc": 0.9368489583333334, |
| "train_speed(iter/s)": 0.184327 |
| }, |
| { |
| "epoch": 1.061711079943899, |
| "grad_norm": 1.9585352119743946, |
| "learning_rate": 7.6592234117938e-06, |
| "loss": 0.24057292938232422, |
| "memory(GiB)": 49.72, |
| "step": 190, |
| "token_acc": 0.8799342105263158, |
| "train_speed(iter/s)": 0.184701 |
| }, |
| { |
| "epoch": 1.0897615708274895, |
| "grad_norm": 2.13660433129455, |
| "learning_rate": 7.526783185232208e-06, |
| "loss": 0.2536721706390381, |
| "memory(GiB)": 49.72, |
| "step": 195, |
| "token_acc": 0.8971309928274821, |
| "train_speed(iter/s)": 0.185477 |
| }, |
| { |
| "epoch": 1.11781206171108, |
| "grad_norm": 2.735124602693302, |
| "learning_rate": 7.391917705541927e-06, |
| "loss": 0.31099162101745603, |
| "memory(GiB)": 49.72, |
| "step": 200, |
| "token_acc": 0.8841603680578377, |
| "train_speed(iter/s)": 0.185836 |
| }, |
| { |
| "epoch": 1.11781206171108, |
| "eval_loss": 0.5298875570297241, |
| "eval_runtime": 3.1775, |
| "eval_samples_per_second": 18.254, |
| "eval_steps_per_second": 2.518, |
| "eval_token_acc": 0.8268461613501121, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.1458625525946704, |
| "grad_norm": 2.5196173114331852, |
| "learning_rate": 7.254756419099074e-06, |
| "loss": 0.25104780197143556, |
| "memory(GiB)": 49.72, |
| "step": 205, |
| "token_acc": 0.8569846017503047, |
| "train_speed(iter/s)": 0.176838 |
| }, |
| { |
| "epoch": 1.1739130434782608, |
| "grad_norm": 2.19812138169296, |
| "learning_rate": 7.115430975837457e-06, |
| "loss": 0.258597731590271, |
| "memory(GiB)": 49.72, |
| "step": 210, |
| "token_acc": 0.9285714285714286, |
| "train_speed(iter/s)": 0.177277 |
| }, |
| { |
| "epoch": 1.2019635343618513, |
| "grad_norm": 2.0506479675657125, |
| "learning_rate": 6.974075102888535e-06, |
| "loss": 0.23006267547607423, |
| "memory(GiB)": 49.72, |
| "step": 215, |
| "token_acc": 0.9158527422990232, |
| "train_speed(iter/s)": 0.177685 |
| }, |
| { |
| "epoch": 1.230014025245442, |
| "grad_norm": 2.4011901883900815, |
| "learning_rate": 6.830824476227646e-06, |
| "loss": 0.26652209758758544, |
| "memory(GiB)": 49.72, |
| "step": 220, |
| "token_acc": 0.9189260331666228, |
| "train_speed(iter/s)": 0.178144 |
| }, |
| { |
| "epoch": 1.2580645161290323, |
| "grad_norm": 2.133378788275781, |
| "learning_rate": 6.685816590449708e-06, |
| "loss": 0.24832301139831542, |
| "memory(GiB)": 52.59, |
| "step": 225, |
| "token_acc": 0.9205128205128205, |
| "train_speed(iter/s)": 0.178447 |
| }, |
| { |
| "epoch": 1.2861150070126226, |
| "grad_norm": 1.8251373128865753, |
| "learning_rate": 6.539190626799366e-06, |
| "loss": 0.2387561798095703, |
| "memory(GiB)": 55.47, |
| "step": 230, |
| "token_acc": 0.9165306598274656, |
| "train_speed(iter/s)": 0.178791 |
| }, |
| { |
| "epoch": 1.3141654978962132, |
| "grad_norm": 2.1628256679219677, |
| "learning_rate": 6.391087319582264e-06, |
| "loss": 0.2553119421005249, |
| "memory(GiB)": 55.47, |
| "step": 235, |
| "token_acc": 0.9236641221374046, |
| "train_speed(iter/s)": 0.178893 |
| }, |
| { |
| "epoch": 1.3422159887798037, |
| "grad_norm": 2.191493378761956, |
| "learning_rate": 6.241648821085666e-06, |
| "loss": 0.24976880550384523, |
| "memory(GiB)": 55.47, |
| "step": 240, |
| "token_acc": 0.9153605015673981, |
| "train_speed(iter/s)": 0.179274 |
| }, |
| { |
| "epoch": 1.370266479663394, |
| "grad_norm": 2.5207105988104903, |
| "learning_rate": 6.091018565138062e-06, |
| "loss": 0.2626498699188232, |
| "memory(GiB)": 55.47, |
| "step": 245, |
| "token_acc": 0.8613016563493391, |
| "train_speed(iter/s)": 0.179556 |
| }, |
| { |
| "epoch": 1.3983169705469845, |
| "grad_norm": 2.0765712307553956, |
| "learning_rate": 5.939341129438739e-06, |
| "loss": 0.23375244140625, |
| "memory(GiB)": 55.47, |
| "step": 250, |
| "token_acc": 0.907565069326198, |
| "train_speed(iter/s)": 0.179806 |
| }, |
| { |
| "epoch": 1.426367461430575, |
| "grad_norm": 2.1185412816077758, |
| "learning_rate": 5.786762096789431e-06, |
| "loss": 0.25868446826934816, |
| "memory(GiB)": 55.47, |
| "step": 255, |
| "token_acc": 0.9030013049151805, |
| "train_speed(iter/s)": 0.180282 |
| }, |
| { |
| "epoch": 1.4544179523141656, |
| "grad_norm": 1.91620250808909, |
| "learning_rate": 5.633427915361261e-06, |
| "loss": 0.23945794105529786, |
| "memory(GiB)": 55.47, |
| "step": 260, |
| "token_acc": 0.9035011322069326, |
| "train_speed(iter/s)": 0.180408 |
| }, |
| { |
| "epoch": 1.482468443197756, |
| "grad_norm": 2.6109664603512748, |
| "learning_rate": 5.479485758131089e-06, |
| "loss": 0.24080324172973633, |
| "memory(GiB)": 55.47, |
| "step": 265, |
| "token_acc": 0.9038590604026846, |
| "train_speed(iter/s)": 0.18076 |
| }, |
| { |
| "epoch": 1.5105189340813463, |
| "grad_norm": 2.4134207120025866, |
| "learning_rate": 5.325083381622165e-06, |
| "loss": 0.2503629684448242, |
| "memory(GiB)": 55.47, |
| "step": 270, |
| "token_acc": 0.9309480401093893, |
| "train_speed(iter/s)": 0.181035 |
| }, |
| { |
| "epoch": 1.5385694249649369, |
| "grad_norm": 2.0852670092638164, |
| "learning_rate": 5.170368984084695e-06, |
| "loss": 0.24387457370758056, |
| "memory(GiB)": 55.47, |
| "step": 275, |
| "token_acc": 0.9182186234817814, |
| "train_speed(iter/s)": 0.181332 |
| }, |
| { |
| "epoch": 1.5666199158485274, |
| "grad_norm": 2.269858573325917, |
| "learning_rate": 5.01549106325243e-06, |
| "loss": 0.24088690280914307, |
| "memory(GiB)": 55.47, |
| "step": 280, |
| "token_acc": 0.9289772727272727, |
| "train_speed(iter/s)": 0.181602 |
| }, |
| { |
| "epoch": 1.5946704067321178, |
| "grad_norm": 2.3189439894004957, |
| "learning_rate": 4.860598273811793e-06, |
| "loss": 0.25754590034484864, |
| "memory(GiB)": 55.47, |
| "step": 285, |
| "token_acc": 0.9309855154785572, |
| "train_speed(iter/s)": 0.181856 |
| }, |
| { |
| "epoch": 1.6227208976157081, |
| "grad_norm": 2.123604145448152, |
| "learning_rate": 4.705839284720376e-06, |
| "loss": 0.23919339179992677, |
| "memory(GiB)": 55.47, |
| "step": 290, |
| "token_acc": 0.9271794871794872, |
| "train_speed(iter/s)": 0.182 |
| }, |
| { |
| "epoch": 1.6507713884992987, |
| "grad_norm": 2.0839399594164645, |
| "learning_rate": 4.55136263651172e-06, |
| "loss": 0.24165868759155273, |
| "memory(GiB)": 55.47, |
| "step": 295, |
| "token_acc": 0.8970849626733025, |
| "train_speed(iter/s)": 0.181991 |
| }, |
| { |
| "epoch": 1.6788218793828893, |
| "grad_norm": 2.638338783284183, |
| "learning_rate": 4.397316598723385e-06, |
| "loss": 0.26505722999572756, |
| "memory(GiB)": 55.47, |
| "step": 300, |
| "token_acc": 0.9129909365558913, |
| "train_speed(iter/s)": 0.18223 |
| }, |
| { |
| "epoch": 1.6788218793828893, |
| "eval_loss": 0.5159281492233276, |
| "eval_runtime": 3.1746, |
| "eval_samples_per_second": 18.27, |
| "eval_steps_per_second": 2.52, |
| "eval_token_acc": 0.8280167788508438, |
| "step": 300 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 534, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 29991394213888.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|