{ "best_global_step": 800, "best_metric": 0.44239288568496704, "best_model_checkpoint": "./qwen_semantic_bridge_v2_full/checkpoint-800", "epoch": 5.975609756097561, "eval_steps": 25, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.075046904315197, "grad_norm": 1.53766667842865, "learning_rate": 2.6865671641791044e-06, "loss": 2.4791, "step": 10 }, { "epoch": 0.150093808630394, "grad_norm": 1.524065613746643, "learning_rate": 5.671641791044776e-06, "loss": 2.4315, "step": 20 }, { "epoch": 0.18761726078799248, "eval_loss": 2.34097957611084, "eval_runtime": 5.5956, "eval_samples_per_second": 20.194, "eval_steps_per_second": 5.183, "step": 25 }, { "epoch": 0.225140712945591, "grad_norm": 1.4025851488113403, "learning_rate": 8.656716417910449e-06, "loss": 2.3807, "step": 30 }, { "epoch": 0.300187617260788, "grad_norm": 0.9802243709564209, "learning_rate": 1.164179104477612e-05, "loss": 2.1459, "step": 40 }, { "epoch": 0.37523452157598497, "grad_norm": 0.9669085144996643, "learning_rate": 1.4626865671641794e-05, "loss": 1.9016, "step": 50 }, { "epoch": 0.37523452157598497, "eval_loss": 1.7425719499588013, "eval_runtime": 5.053, "eval_samples_per_second": 22.363, "eval_steps_per_second": 5.739, "step": 50 }, { "epoch": 0.450281425891182, "grad_norm": 1.1177589893341064, "learning_rate": 1.761194029850746e-05, "loss": 1.596, "step": 60 }, { "epoch": 0.525328330206379, "grad_norm": 0.5461925864219666, "learning_rate": 2.0597014925373135e-05, "loss": 1.2615, "step": 70 }, { "epoch": 0.5628517823639775, "eval_loss": 1.0852540731430054, "eval_runtime": 5.0653, "eval_samples_per_second": 22.308, "eval_steps_per_second": 5.725, "step": 75 }, { "epoch": 0.600375234521576, "grad_norm": 0.5370538830757141, "learning_rate": 2.3582089552238806e-05, "loss": 1.1037, "step": 80 }, { "epoch": 0.6754221388367729, "grad_norm": 0.4297517240047455, "learning_rate": 2.656716417910448e-05, "loss": 0.989, "step": 90 }, { "epoch": 0.7504690431519699, "grad_norm": 0.5314680933952332, "learning_rate": 2.955223880597015e-05, "loss": 0.9507, "step": 100 }, { "epoch": 0.7504690431519699, "eval_loss": 0.9193022847175598, "eval_runtime": 5.0765, "eval_samples_per_second": 22.259, "eval_steps_per_second": 5.713, "step": 100 }, { "epoch": 0.8255159474671669, "grad_norm": 0.4320300221443176, "learning_rate": 3.253731343283582e-05, "loss": 0.9321, "step": 110 }, { "epoch": 0.900562851782364, "grad_norm": 0.44459468126296997, "learning_rate": 3.5522388059701495e-05, "loss": 0.8909, "step": 120 }, { "epoch": 0.9380863039399625, "eval_loss": 0.847252368927002, "eval_runtime": 5.0816, "eval_samples_per_second": 22.237, "eval_steps_per_second": 5.707, "step": 125 }, { "epoch": 0.975609756097561, "grad_norm": 0.4286046326160431, "learning_rate": 3.850746268656716e-05, "loss": 0.8575, "step": 130 }, { "epoch": 1.0450281425891181, "grad_norm": 0.4913986921310425, "learning_rate": 4.149253731343284e-05, "loss": 0.859, "step": 140 }, { "epoch": 1.1200750469043153, "grad_norm": 0.47017258405685425, "learning_rate": 4.447761194029851e-05, "loss": 0.8314, "step": 150 }, { "epoch": 1.1200750469043153, "eval_loss": 0.8092061877250671, "eval_runtime": 5.0731, "eval_samples_per_second": 22.274, "eval_steps_per_second": 5.716, "step": 150 }, { "epoch": 1.1951219512195121, "grad_norm": 0.4936457574367523, "learning_rate": 4.7462686567164185e-05, "loss": 0.8098, "step": 160 }, { "epoch": 1.2701688555347093, "grad_norm": 0.46496817469596863, "learning_rate": 5.044776119402985e-05, "loss": 0.7836, "step": 170 }, { "epoch": 1.3076923076923077, "eval_loss": 0.7787837982177734, "eval_runtime": 5.0698, "eval_samples_per_second": 22.289, "eval_steps_per_second": 5.72, "step": 175 }, { "epoch": 1.3452157598499062, "grad_norm": 0.48638296127319336, "learning_rate": 5.343283582089552e-05, "loss": 0.7835, "step": 180 }, { "epoch": 1.4202626641651033, "grad_norm": 0.49088621139526367, "learning_rate": 5.64179104477612e-05, "loss": 0.7774, "step": 190 }, { "epoch": 1.4953095684803002, "grad_norm": 0.49398818612098694, "learning_rate": 5.940298507462687e-05, "loss": 0.77, "step": 200 }, { "epoch": 1.4953095684803002, "eval_loss": 0.7553061842918396, "eval_runtime": 5.0675, "eval_samples_per_second": 22.299, "eval_steps_per_second": 5.723, "step": 200 }, { "epoch": 1.5703564727954973, "grad_norm": 0.5628990530967712, "learning_rate": 6.238805970149254e-05, "loss": 0.7582, "step": 210 }, { "epoch": 1.6454033771106942, "grad_norm": 0.49695977568626404, "learning_rate": 6.537313432835821e-05, "loss": 0.7592, "step": 220 }, { "epoch": 1.6829268292682928, "eval_loss": 0.7301478385925293, "eval_runtime": 5.0727, "eval_samples_per_second": 22.276, "eval_steps_per_second": 5.717, "step": 225 }, { "epoch": 1.720450281425891, "grad_norm": 0.4944610595703125, "learning_rate": 6.835820895522388e-05, "loss": 0.7329, "step": 230 }, { "epoch": 1.7954971857410882, "grad_norm": 0.4966571033000946, "learning_rate": 7.134328358208956e-05, "loss": 0.7303, "step": 240 }, { "epoch": 1.8705440900562853, "grad_norm": 0.5113062858581543, "learning_rate": 7.432835820895523e-05, "loss": 0.7082, "step": 250 }, { "epoch": 1.8705440900562853, "eval_loss": 0.7024827003479004, "eval_runtime": 5.0682, "eval_samples_per_second": 22.296, "eval_steps_per_second": 5.722, "step": 250 }, { "epoch": 1.9455909943714822, "grad_norm": 0.5686495900154114, "learning_rate": 7.731343283582089e-05, "loss": 0.7017, "step": 260 }, { "epoch": 2.0150093808630394, "grad_norm": 0.5748053193092346, "learning_rate": 8.029850746268657e-05, "loss": 0.6874, "step": 270 }, { "epoch": 2.052532833020638, "eval_loss": 0.6817129850387573, "eval_runtime": 5.0735, "eval_samples_per_second": 22.273, "eval_steps_per_second": 5.716, "step": 275 }, { "epoch": 2.0900562851782363, "grad_norm": 0.5935199856758118, "learning_rate": 8.328358208955225e-05, "loss": 0.6522, "step": 280 }, { "epoch": 2.1651031894934336, "grad_norm": 0.5952720046043396, "learning_rate": 8.626865671641792e-05, "loss": 0.6259, "step": 290 }, { "epoch": 2.2401500938086305, "grad_norm": 0.6446412205696106, "learning_rate": 8.925373134328359e-05, "loss": 0.6171, "step": 300 }, { "epoch": 2.2401500938086305, "eval_loss": 0.6408159136772156, "eval_runtime": 5.0734, "eval_samples_per_second": 22.273, "eval_steps_per_second": 5.716, "step": 300 }, { "epoch": 2.3151969981238274, "grad_norm": 0.722795307636261, "learning_rate": 9.223880597014926e-05, "loss": 0.6128, "step": 310 }, { "epoch": 2.3902439024390243, "grad_norm": 0.6945660710334778, "learning_rate": 9.522388059701492e-05, "loss": 0.6027, "step": 320 }, { "epoch": 2.427767354596623, "eval_loss": 0.6130869388580322, "eval_runtime": 5.075, "eval_samples_per_second": 22.266, "eval_steps_per_second": 5.714, "step": 325 }, { "epoch": 2.465290806754221, "grad_norm": 0.7010583877563477, "learning_rate": 9.82089552238806e-05, "loss": 0.5824, "step": 330 }, { "epoch": 2.5403377110694185, "grad_norm": 0.7590329051017761, "learning_rate": 9.999990255427757e-05, "loss": 0.5626, "step": 340 }, { "epoch": 2.6153846153846154, "grad_norm": 0.8965818881988525, "learning_rate": 9.99988062942623e-05, "loss": 0.5603, "step": 350 }, { "epoch": 2.6153846153846154, "eval_loss": 0.5764831304550171, "eval_runtime": 5.073, "eval_samples_per_second": 22.275, "eval_steps_per_second": 5.717, "step": 350 }, { "epoch": 2.6904315196998123, "grad_norm": 0.7385474443435669, "learning_rate": 9.999649199387416e-05, "loss": 0.5426, "step": 360 }, { "epoch": 2.7654784240150097, "grad_norm": 0.8265758156776428, "learning_rate": 9.999295970949272e-05, "loss": 0.5248, "step": 370 }, { "epoch": 2.803001876172608, "eval_loss": 0.5394740104675293, "eval_runtime": 5.0664, "eval_samples_per_second": 22.304, "eval_steps_per_second": 5.724, "step": 375 }, { "epoch": 2.8405253283302065, "grad_norm": 0.7400410771369934, "learning_rate": 9.998820952716934e-05, "loss": 0.5078, "step": 380 }, { "epoch": 2.9155722326454034, "grad_norm": 0.7278188467025757, "learning_rate": 9.998224156262505e-05, "loss": 0.4944, "step": 390 }, { "epoch": 2.9906191369606003, "grad_norm": 0.7803333401679993, "learning_rate": 9.997505596124777e-05, "loss": 0.4893, "step": 400 }, { "epoch": 2.9906191369606003, "eval_loss": 0.49963781237602234, "eval_runtime": 5.0779, "eval_samples_per_second": 22.253, "eval_steps_per_second": 5.711, "step": 400 }, { "epoch": 3.0600375234521575, "grad_norm": 0.8603639602661133, "learning_rate": 9.996665289808871e-05, "loss": 0.431, "step": 410 }, { "epoch": 3.1350844277673544, "grad_norm": 0.70228511095047, "learning_rate": 9.995703257785818e-05, "loss": 0.408, "step": 420 }, { "epoch": 3.172607879924953, "eval_loss": 0.4856467545032501, "eval_runtime": 5.0713, "eval_samples_per_second": 22.282, "eval_steps_per_second": 5.719, "step": 425 }, { "epoch": 3.2101313320825517, "grad_norm": 0.7288469672203064, "learning_rate": 9.994619523492054e-05, "loss": 0.3959, "step": 430 }, { "epoch": 3.2851782363977486, "grad_norm": 0.7244749665260315, "learning_rate": 9.993414113328852e-05, "loss": 0.3995, "step": 440 }, { "epoch": 3.3602251407129455, "grad_norm": 0.7007831931114197, "learning_rate": 9.992087056661677e-05, "loss": 0.3999, "step": 450 }, { "epoch": 3.3602251407129455, "eval_loss": 0.474253386259079, "eval_runtime": 5.0675, "eval_samples_per_second": 22.299, "eval_steps_per_second": 5.723, "step": 450 }, { "epoch": 3.4352720450281424, "grad_norm": 0.8084601759910583, "learning_rate": 9.990638385819472e-05, "loss": 0.3819, "step": 460 }, { "epoch": 3.5103189493433398, "grad_norm": 0.726610541343689, "learning_rate": 9.989068136093873e-05, "loss": 0.3869, "step": 470 }, { "epoch": 3.547842401500938, "eval_loss": 0.46386292576789856, "eval_runtime": 5.0699, "eval_samples_per_second": 22.288, "eval_steps_per_second": 5.72, "step": 475 }, { "epoch": 3.5853658536585367, "grad_norm": 0.7002731561660767, "learning_rate": 9.987376345738344e-05, "loss": 0.3953, "step": 480 }, { "epoch": 3.6604127579737336, "grad_norm": 0.6399222016334534, "learning_rate": 9.985563055967248e-05, "loss": 0.397, "step": 490 }, { "epoch": 3.7354596622889304, "grad_norm": 0.8647626042366028, "learning_rate": 9.983628310954843e-05, "loss": 0.3868, "step": 500 }, { "epoch": 3.7354596622889304, "eval_loss": 0.45724615454673767, "eval_runtime": 5.072, "eval_samples_per_second": 22.279, "eval_steps_per_second": 5.718, "step": 500 }, { "epoch": 3.8105065666041273, "grad_norm": 0.8007352948188782, "learning_rate": 9.981572157834203e-05, "loss": 0.3833, "step": 510 }, { "epoch": 3.8855534709193247, "grad_norm": 0.7353331446647644, "learning_rate": 9.979394646696078e-05, "loss": 0.3873, "step": 520 }, { "epoch": 3.9230769230769234, "eval_loss": 0.44937819242477417, "eval_runtime": 5.0648, "eval_samples_per_second": 22.311, "eval_steps_per_second": 5.726, "step": 525 }, { "epoch": 3.9606003752345216, "grad_norm": 0.725788950920105, "learning_rate": 9.977095830587659e-05, "loss": 0.3809, "step": 530 }, { "epoch": 4.030018761726079, "grad_norm": 0.6748231053352356, "learning_rate": 9.974675765511304e-05, "loss": 0.346, "step": 540 }, { "epoch": 4.105065666041276, "grad_norm": 0.7276281714439392, "learning_rate": 9.972134510423157e-05, "loss": 0.3137, "step": 550 }, { "epoch": 4.105065666041276, "eval_loss": 0.4566233158111572, "eval_runtime": 5.0603, "eval_samples_per_second": 22.331, "eval_steps_per_second": 5.731, "step": 550 }, { "epoch": 4.1801125703564725, "grad_norm": 0.7759580612182617, "learning_rate": 9.969472127231723e-05, "loss": 0.3049, "step": 560 }, { "epoch": 4.25515947467167, "grad_norm": 0.7065677642822266, "learning_rate": 9.966688680796356e-05, "loss": 0.3057, "step": 570 }, { "epoch": 4.2926829268292686, "eval_loss": 0.4608473777770996, "eval_runtime": 5.0627, "eval_samples_per_second": 22.32, "eval_steps_per_second": 5.728, "step": 575 }, { "epoch": 4.330206378986867, "grad_norm": 0.746277391910553, "learning_rate": 9.963784238925675e-05, "loss": 0.3042, "step": 580 }, { "epoch": 4.405253283302064, "grad_norm": 0.8283698558807373, "learning_rate": 9.960758872375922e-05, "loss": 0.312, "step": 590 }, { "epoch": 4.480300187617261, "grad_norm": 0.7889224290847778, "learning_rate": 9.957612654849225e-05, "loss": 0.3153, "step": 600 }, { "epoch": 4.480300187617261, "eval_loss": 0.45748454332351685, "eval_runtime": 5.0739, "eval_samples_per_second": 22.271, "eval_steps_per_second": 5.715, "step": 600 }, { "epoch": 4.5553470919324575, "grad_norm": 0.7301938533782959, "learning_rate": 9.954345662991813e-05, "loss": 0.3169, "step": 610 }, { "epoch": 4.630393996247655, "grad_norm": 0.7097094655036926, "learning_rate": 9.950957976392148e-05, "loss": 0.313, "step": 620 }, { "epoch": 4.6679174484052535, "eval_loss": 0.45044615864753723, "eval_runtime": 5.0663, "eval_samples_per_second": 22.304, "eval_steps_per_second": 5.724, "step": 625 }, { "epoch": 4.705440900562852, "grad_norm": 0.6627222895622253, "learning_rate": 9.947449677578982e-05, "loss": 0.311, "step": 630 }, { "epoch": 4.780487804878049, "grad_norm": 0.831093966960907, "learning_rate": 9.943820852019344e-05, "loss": 0.3187, "step": 640 }, { "epoch": 4.855534709193246, "grad_norm": 0.8561496734619141, "learning_rate": 9.940071588116468e-05, "loss": 0.317, "step": 650 }, { "epoch": 4.855534709193246, "eval_loss": 0.4430959224700928, "eval_runtime": 5.0752, "eval_samples_per_second": 22.265, "eval_steps_per_second": 5.714, "step": 650 }, { "epoch": 4.930581613508442, "grad_norm": 0.7869876623153687, "learning_rate": 9.936201977207631e-05, "loss": 0.3165, "step": 660 }, { "epoch": 5.0, "grad_norm": 2.4095470905303955, "learning_rate": 9.932212113561927e-05, "loss": 0.3139, "step": 670 }, { "epoch": 5.037523452157599, "eval_loss": 0.4734329581260681, "eval_runtime": 5.0708, "eval_samples_per_second": 22.285, "eval_steps_per_second": 5.719, "step": 675 }, { "epoch": 5.075046904315197, "grad_norm": 0.8977411985397339, "learning_rate": 9.92810209437798e-05, "loss": 0.2374, "step": 680 }, { "epoch": 5.150093808630394, "grad_norm": 0.8467944860458374, "learning_rate": 9.923872019781564e-05, "loss": 0.2397, "step": 690 }, { "epoch": 5.225140712945591, "grad_norm": 0.8776218295097351, "learning_rate": 9.919521992823173e-05, "loss": 0.2371, "step": 700 }, { "epoch": 5.225140712945591, "eval_loss": 0.46672776341438293, "eval_runtime": 5.0617, "eval_samples_per_second": 22.324, "eval_steps_per_second": 5.729, "step": 700 }, { "epoch": 5.300187617260788, "grad_norm": 0.8390938639640808, "learning_rate": 9.915052119475505e-05, "loss": 0.2487, "step": 710 }, { "epoch": 5.375234521575985, "grad_norm": 0.8848443627357483, "learning_rate": 9.910462508630885e-05, "loss": 0.2393, "step": 720 }, { "epoch": 5.412757973733584, "eval_loss": 0.4665623605251312, "eval_runtime": 5.0722, "eval_samples_per_second": 22.278, "eval_steps_per_second": 5.717, "step": 725 }, { "epoch": 5.450281425891182, "grad_norm": 0.8097019195556641, "learning_rate": 9.905753272098608e-05, "loss": 0.2439, "step": 730 }, { "epoch": 5.525328330206379, "grad_norm": 0.7588761448860168, "learning_rate": 9.900924524602218e-05, "loss": 0.2477, "step": 740 }, { "epoch": 5.600375234521576, "grad_norm": 0.8408402800559998, "learning_rate": 9.895976383776711e-05, "loss": 0.2484, "step": 750 }, { "epoch": 5.600375234521576, "eval_loss": 0.4602144956588745, "eval_runtime": 5.0743, "eval_samples_per_second": 22.269, "eval_steps_per_second": 5.715, "step": 750 }, { "epoch": 5.6754221388367725, "grad_norm": 0.8723646402359009, "learning_rate": 9.890908970165669e-05, "loss": 0.2533, "step": 760 }, { "epoch": 5.75046904315197, "grad_norm": 0.6956503391265869, "learning_rate": 9.88572240721833e-05, "loss": 0.253, "step": 770 }, { "epoch": 5.7879924953095685, "eval_loss": 0.450476735830307, "eval_runtime": 5.0627, "eval_samples_per_second": 22.32, "eval_steps_per_second": 5.728, "step": 775 }, { "epoch": 5.825515947467167, "grad_norm": 0.9172329306602478, "learning_rate": 9.880416821286569e-05, "loss": 0.2522, "step": 780 }, { "epoch": 5.900562851782364, "grad_norm": 0.7848353981971741, "learning_rate": 9.87499234162183e-05, "loss": 0.2493, "step": 790 }, { "epoch": 5.975609756097561, "grad_norm": 0.7720503211021423, "learning_rate": 9.869449100371973e-05, "loss": 0.2574, "step": 800 }, { "epoch": 5.975609756097561, "eval_loss": 0.44239288568496704, "eval_runtime": 5.0788, "eval_samples_per_second": 22.249, "eval_steps_per_second": 5.71, "step": 800 } ], "logging_steps": 10, "max_steps": 6700, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 15, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5747343525279846e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }