{
  "best_global_step": 800,
  "best_metric": 0.44239288568496704,
  "best_model_checkpoint": "./qwen_semantic_bridge_v2_full/checkpoint-800",
  "epoch": 5.975609756097561,
  "eval_steps": 25,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.075046904315197,
      "grad_norm": 1.53766667842865,
      "learning_rate": 2.6865671641791044e-06,
      "loss": 2.4791,
      "step": 10
    },
    {
      "epoch": 0.150093808630394,
      "grad_norm": 1.524065613746643,
      "learning_rate": 5.671641791044776e-06,
      "loss": 2.4315,
      "step": 20
    },
    {
      "epoch": 0.18761726078799248,
      "eval_loss": 2.34097957611084,
      "eval_runtime": 5.5956,
      "eval_samples_per_second": 20.194,
      "eval_steps_per_second": 5.183,
      "step": 25
    },
    {
      "epoch": 0.225140712945591,
      "grad_norm": 1.4025851488113403,
      "learning_rate": 8.656716417910449e-06,
      "loss": 2.3807,
      "step": 30
    },
    {
      "epoch": 0.300187617260788,
      "grad_norm": 0.9802243709564209,
      "learning_rate": 1.164179104477612e-05,
      "loss": 2.1459,
      "step": 40
    },
    {
      "epoch": 0.37523452157598497,
      "grad_norm": 0.9669085144996643,
      "learning_rate": 1.4626865671641794e-05,
      "loss": 1.9016,
      "step": 50
    },
    {
      "epoch": 0.37523452157598497,
      "eval_loss": 1.7425719499588013,
      "eval_runtime": 5.053,
      "eval_samples_per_second": 22.363,
      "eval_steps_per_second": 5.739,
      "step": 50
    },
    {
      "epoch": 0.450281425891182,
      "grad_norm": 1.1177589893341064,
      "learning_rate": 1.761194029850746e-05,
      "loss": 1.596,
      "step": 60
    },
    {
      "epoch": 0.525328330206379,
      "grad_norm": 0.5461925864219666,
      "learning_rate": 2.0597014925373135e-05,
      "loss": 1.2615,
      "step": 70
    },
    {
      "epoch": 0.5628517823639775,
      "eval_loss": 1.0852540731430054,
      "eval_runtime": 5.0653,
      "eval_samples_per_second": 22.308,
      "eval_steps_per_second": 5.725,
      "step": 75
    },
    {
      "epoch": 0.600375234521576,
      "grad_norm": 0.5370538830757141,
      "learning_rate": 2.3582089552238806e-05,
      "loss": 1.1037,
      "step": 80
    },
    {
      "epoch": 0.6754221388367729,
      "grad_norm": 0.4297517240047455,
      "learning_rate": 2.656716417910448e-05,
      "loss": 0.989,
      "step": 90
    },
    {
      "epoch": 0.7504690431519699,
      "grad_norm": 0.5314680933952332,
      "learning_rate": 2.955223880597015e-05,
      "loss": 0.9507,
      "step": 100
    },
    {
      "epoch": 0.7504690431519699,
      "eval_loss": 0.9193022847175598,
      "eval_runtime": 5.0765,
      "eval_samples_per_second": 22.259,
      "eval_steps_per_second": 5.713,
      "step": 100
    },
    {
      "epoch": 0.8255159474671669,
      "grad_norm": 0.4320300221443176,
      "learning_rate": 3.253731343283582e-05,
      "loss": 0.9321,
      "step": 110
    },
    {
      "epoch": 0.900562851782364,
      "grad_norm": 0.44459468126296997,
      "learning_rate": 3.5522388059701495e-05,
      "loss": 0.8909,
      "step": 120
    },
    {
      "epoch": 0.9380863039399625,
      "eval_loss": 0.847252368927002,
      "eval_runtime": 5.0816,
      "eval_samples_per_second": 22.237,
      "eval_steps_per_second": 5.707,
      "step": 125
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.4286046326160431,
      "learning_rate": 3.850746268656716e-05,
      "loss": 0.8575,
      "step": 130
    },
    {
      "epoch": 1.0450281425891181,
      "grad_norm": 0.4913986921310425,
      "learning_rate": 4.149253731343284e-05,
      "loss": 0.859,
      "step": 140
    },
    {
      "epoch": 1.1200750469043153,
      "grad_norm": 0.47017258405685425,
      "learning_rate": 4.447761194029851e-05,
      "loss": 0.8314,
      "step": 150
    },
    {
      "epoch": 1.1200750469043153,
      "eval_loss": 0.8092061877250671,
      "eval_runtime": 5.0731,
      "eval_samples_per_second": 22.274,
      "eval_steps_per_second": 5.716,
      "step": 150
    },
    {
      "epoch": 1.1951219512195121,
      "grad_norm": 0.4936457574367523,
      "learning_rate": 4.7462686567164185e-05,
      "loss": 0.8098,
      "step": 160
    },
    {
      "epoch": 1.2701688555347093,
      "grad_norm": 0.46496817469596863,
      "learning_rate": 5.044776119402985e-05,
      "loss": 0.7836,
      "step": 170
    },
    {
      "epoch": 1.3076923076923077,
      "eval_loss": 0.7787837982177734,
      "eval_runtime": 5.0698,
      "eval_samples_per_second": 22.289,
      "eval_steps_per_second": 5.72,
      "step": 175
    },
    {
      "epoch": 1.3452157598499062,
      "grad_norm": 0.48638296127319336,
      "learning_rate": 5.343283582089552e-05,
      "loss": 0.7835,
      "step": 180
    },
    {
      "epoch": 1.4202626641651033,
      "grad_norm": 0.49088621139526367,
      "learning_rate": 5.64179104477612e-05,
      "loss": 0.7774,
      "step": 190
    },
    {
      "epoch": 1.4953095684803002,
      "grad_norm": 0.49398818612098694,
      "learning_rate": 5.940298507462687e-05,
      "loss": 0.77,
      "step": 200
    },
    {
      "epoch": 1.4953095684803002,
      "eval_loss": 0.7553061842918396,
      "eval_runtime": 5.0675,
      "eval_samples_per_second": 22.299,
      "eval_steps_per_second": 5.723,
      "step": 200
    },
    {
      "epoch": 1.5703564727954973,
      "grad_norm": 0.5628990530967712,
      "learning_rate": 6.238805970149254e-05,
      "loss": 0.7582,
      "step": 210
    },
    {
      "epoch": 1.6454033771106942,
      "grad_norm": 0.49695977568626404,
      "learning_rate": 6.537313432835821e-05,
      "loss": 0.7592,
      "step": 220
    },
    {
      "epoch": 1.6829268292682928,
      "eval_loss": 0.7301478385925293,
      "eval_runtime": 5.0727,
      "eval_samples_per_second": 22.276,
      "eval_steps_per_second": 5.717,
      "step": 225
    },
    {
      "epoch": 1.720450281425891,
      "grad_norm": 0.4944610595703125,
      "learning_rate": 6.835820895522388e-05,
      "loss": 0.7329,
      "step": 230
    },
    {
      "epoch": 1.7954971857410882,
      "grad_norm": 0.4966571033000946,
      "learning_rate": 7.134328358208956e-05,
      "loss": 0.7303,
      "step": 240
    },
    {
      "epoch": 1.8705440900562853,
      "grad_norm": 0.5113062858581543,
      "learning_rate": 7.432835820895523e-05,
      "loss": 0.7082,
      "step": 250
    },
    {
      "epoch": 1.8705440900562853,
      "eval_loss": 0.7024827003479004,
      "eval_runtime": 5.0682,
      "eval_samples_per_second": 22.296,
      "eval_steps_per_second": 5.722,
      "step": 250
    },
    {
      "epoch": 1.9455909943714822,
      "grad_norm": 0.5686495900154114,
      "learning_rate": 7.731343283582089e-05,
      "loss": 0.7017,
      "step": 260
    },
    {
      "epoch": 2.0150093808630394,
      "grad_norm": 0.5748053193092346,
      "learning_rate": 8.029850746268657e-05,
      "loss": 0.6874,
      "step": 270
    },
    {
      "epoch": 2.052532833020638,
      "eval_loss": 0.6817129850387573,
      "eval_runtime": 5.0735,
      "eval_samples_per_second": 22.273,
      "eval_steps_per_second": 5.716,
      "step": 275
    },
    {
      "epoch": 2.0900562851782363,
      "grad_norm": 0.5935199856758118,
      "learning_rate": 8.328358208955225e-05,
      "loss": 0.6522,
      "step": 280
    },
    {
      "epoch": 2.1651031894934336,
      "grad_norm": 0.5952720046043396,
      "learning_rate": 8.626865671641792e-05,
      "loss": 0.6259,
      "step": 290
    },
    {
      "epoch": 2.2401500938086305,
      "grad_norm": 0.6446412205696106,
      "learning_rate": 8.925373134328359e-05,
      "loss": 0.6171,
      "step": 300
    },
    {
      "epoch": 2.2401500938086305,
      "eval_loss": 0.6408159136772156,
      "eval_runtime": 5.0734,
      "eval_samples_per_second": 22.273,
      "eval_steps_per_second": 5.716,
      "step": 300
    },
    {
      "epoch": 2.3151969981238274,
      "grad_norm": 0.722795307636261,
      "learning_rate": 9.223880597014926e-05,
      "loss": 0.6128,
      "step": 310
    },
    {
      "epoch": 2.3902439024390243,
      "grad_norm": 0.6945660710334778,
      "learning_rate": 9.522388059701492e-05,
      "loss": 0.6027,
      "step": 320
    },
    {
      "epoch": 2.427767354596623,
      "eval_loss": 0.6130869388580322,
      "eval_runtime": 5.075,
      "eval_samples_per_second": 22.266,
      "eval_steps_per_second": 5.714,
      "step": 325
    },
    {
      "epoch": 2.465290806754221,
      "grad_norm": 0.7010583877563477,
      "learning_rate": 9.82089552238806e-05,
      "loss": 0.5824,
      "step": 330
    },
    {
      "epoch": 2.5403377110694185,
      "grad_norm": 0.7590329051017761,
      "learning_rate": 9.999990255427757e-05,
      "loss": 0.5626,
      "step": 340
    },
    {
      "epoch": 2.6153846153846154,
      "grad_norm": 0.8965818881988525,
      "learning_rate": 9.99988062942623e-05,
      "loss": 0.5603,
      "step": 350
    },
    {
      "epoch": 2.6153846153846154,
      "eval_loss": 0.5764831304550171,
      "eval_runtime": 5.073,
      "eval_samples_per_second": 22.275,
      "eval_steps_per_second": 5.717,
      "step": 350
    },
    {
      "epoch": 2.6904315196998123,
      "grad_norm": 0.7385474443435669,
      "learning_rate": 9.999649199387416e-05,
      "loss": 0.5426,
      "step": 360
    },
    {
      "epoch": 2.7654784240150097,
      "grad_norm": 0.8265758156776428,
      "learning_rate": 9.999295970949272e-05,
      "loss": 0.5248,
      "step": 370
    },
    {
      "epoch": 2.803001876172608,
      "eval_loss": 0.5394740104675293,
      "eval_runtime": 5.0664,
      "eval_samples_per_second": 22.304,
      "eval_steps_per_second": 5.724,
      "step": 375
    },
    {
      "epoch": 2.8405253283302065,
      "grad_norm": 0.7400410771369934,
      "learning_rate": 9.998820952716934e-05,
      "loss": 0.5078,
      "step": 380
    },
    {
      "epoch": 2.9155722326454034,
      "grad_norm": 0.7278188467025757,
      "learning_rate": 9.998224156262505e-05,
      "loss": 0.4944,
      "step": 390
    },
    {
      "epoch": 2.9906191369606003,
      "grad_norm": 0.7803333401679993,
      "learning_rate": 9.997505596124777e-05,
      "loss": 0.4893,
      "step": 400
    },
    {
      "epoch": 2.9906191369606003,
      "eval_loss": 0.49963781237602234,
      "eval_runtime": 5.0779,
      "eval_samples_per_second": 22.253,
      "eval_steps_per_second": 5.711,
      "step": 400
    },
    {
      "epoch": 3.0600375234521575,
      "grad_norm": 0.8603639602661133,
      "learning_rate": 9.996665289808871e-05,
      "loss": 0.431,
      "step": 410
    },
    {
      "epoch": 3.1350844277673544,
      "grad_norm": 0.70228511095047,
      "learning_rate": 9.995703257785818e-05,
      "loss": 0.408,
      "step": 420
    },
    {
      "epoch": 3.172607879924953,
      "eval_loss": 0.4856467545032501,
      "eval_runtime": 5.0713,
      "eval_samples_per_second": 22.282,
      "eval_steps_per_second": 5.719,
      "step": 425
    },
    {
      "epoch": 3.2101313320825517,
      "grad_norm": 0.7288469672203064,
      "learning_rate": 9.994619523492054e-05,
      "loss": 0.3959,
      "step": 430
    },
    {
      "epoch": 3.2851782363977486,
      "grad_norm": 0.7244749665260315,
      "learning_rate": 9.993414113328852e-05,
      "loss": 0.3995,
      "step": 440
    },
    {
      "epoch": 3.3602251407129455,
      "grad_norm": 0.7007831931114197,
      "learning_rate": 9.992087056661677e-05,
      "loss": 0.3999,
      "step": 450
    },
    {
      "epoch": 3.3602251407129455,
      "eval_loss": 0.474253386259079,
      "eval_runtime": 5.0675,
      "eval_samples_per_second": 22.299,
      "eval_steps_per_second": 5.723,
      "step": 450
    },
    {
      "epoch": 3.4352720450281424,
      "grad_norm": 0.8084601759910583,
      "learning_rate": 9.990638385819472e-05,
      "loss": 0.3819,
      "step": 460
    },
    {
      "epoch": 3.5103189493433398,
      "grad_norm": 0.726610541343689,
      "learning_rate": 9.989068136093873e-05,
      "loss": 0.3869,
      "step": 470
    },
    {
      "epoch": 3.547842401500938,
      "eval_loss": 0.46386292576789856,
      "eval_runtime": 5.0699,
      "eval_samples_per_second": 22.288,
      "eval_steps_per_second": 5.72,
      "step": 475
    },
    {
      "epoch": 3.5853658536585367,
      "grad_norm": 0.7002731561660767,
      "learning_rate": 9.987376345738344e-05,
      "loss": 0.3953,
      "step": 480
    },
    {
      "epoch": 3.6604127579737336,
      "grad_norm": 0.6399222016334534,
      "learning_rate": 9.985563055967248e-05,
      "loss": 0.397,
      "step": 490
    },
    {
      "epoch": 3.7354596622889304,
      "grad_norm": 0.8647626042366028,
      "learning_rate": 9.983628310954843e-05,
      "loss": 0.3868,
      "step": 500
    },
    {
      "epoch": 3.7354596622889304,
      "eval_loss": 0.45724615454673767,
      "eval_runtime": 5.072,
      "eval_samples_per_second": 22.279,
      "eval_steps_per_second": 5.718,
      "step": 500
    },
    {
      "epoch": 3.8105065666041273,
      "grad_norm": 0.8007352948188782,
      "learning_rate": 9.981572157834203e-05,
      "loss": 0.3833,
      "step": 510
    },
    {
      "epoch": 3.8855534709193247,
      "grad_norm": 0.7353331446647644,
      "learning_rate": 9.979394646696078e-05,
      "loss": 0.3873,
      "step": 520
    },
    {
      "epoch": 3.9230769230769234,
      "eval_loss": 0.44937819242477417,
      "eval_runtime": 5.0648,
      "eval_samples_per_second": 22.311,
      "eval_steps_per_second": 5.726,
      "step": 525
    },
    {
      "epoch": 3.9606003752345216,
      "grad_norm": 0.725788950920105,
      "learning_rate": 9.977095830587659e-05,
      "loss": 0.3809,
      "step": 530
    },
    {
      "epoch": 4.030018761726079,
      "grad_norm": 0.6748231053352356,
      "learning_rate": 9.974675765511304e-05,
      "loss": 0.346,
      "step": 540
    },
    {
      "epoch": 4.105065666041276,
      "grad_norm": 0.7276281714439392,
      "learning_rate": 9.972134510423157e-05,
      "loss": 0.3137,
      "step": 550
    },
    {
      "epoch": 4.105065666041276,
      "eval_loss": 0.4566233158111572,
      "eval_runtime": 5.0603,
      "eval_samples_per_second": 22.331,
      "eval_steps_per_second": 5.731,
      "step": 550
    },
    {
      "epoch": 4.1801125703564725,
      "grad_norm": 0.7759580612182617,
      "learning_rate": 9.969472127231723e-05,
      "loss": 0.3049,
      "step": 560
    },
    {
      "epoch": 4.25515947467167,
      "grad_norm": 0.7065677642822266,
      "learning_rate": 9.966688680796356e-05,
      "loss": 0.3057,
      "step": 570
    },
    {
      "epoch": 4.2926829268292686,
      "eval_loss": 0.4608473777770996,
      "eval_runtime": 5.0627,
      "eval_samples_per_second": 22.32,
      "eval_steps_per_second": 5.728,
      "step": 575
    },
    {
      "epoch": 4.330206378986867,
      "grad_norm": 0.746277391910553,
      "learning_rate": 9.963784238925675e-05,
      "loss": 0.3042,
      "step": 580
    },
    {
      "epoch": 4.405253283302064,
      "grad_norm": 0.8283698558807373,
      "learning_rate": 9.960758872375922e-05,
      "loss": 0.312,
      "step": 590
    },
    {
      "epoch": 4.480300187617261,
      "grad_norm": 0.7889224290847778,
      "learning_rate": 9.957612654849225e-05,
      "loss": 0.3153,
      "step": 600
    },
    {
      "epoch": 4.480300187617261,
      "eval_loss": 0.45748454332351685,
      "eval_runtime": 5.0739,
      "eval_samples_per_second": 22.271,
      "eval_steps_per_second": 5.715,
      "step": 600
    },
    {
      "epoch": 4.5553470919324575,
      "grad_norm": 0.7301938533782959,
      "learning_rate": 9.954345662991813e-05,
      "loss": 0.3169,
      "step": 610
    },
    {
      "epoch": 4.630393996247655,
      "grad_norm": 0.7097094655036926,
      "learning_rate": 9.950957976392148e-05,
      "loss": 0.313,
      "step": 620
    },
    {
      "epoch": 4.6679174484052535,
      "eval_loss": 0.45044615864753723,
      "eval_runtime": 5.0663,
      "eval_samples_per_second": 22.304,
      "eval_steps_per_second": 5.724,
      "step": 625
    },
    {
      "epoch": 4.705440900562852,
      "grad_norm": 0.6627222895622253,
      "learning_rate": 9.947449677578982e-05,
      "loss": 0.311,
      "step": 630
    },
    {
      "epoch": 4.780487804878049,
      "grad_norm": 0.831093966960907,
      "learning_rate": 9.943820852019344e-05,
      "loss": 0.3187,
      "step": 640
    },
    {
      "epoch": 4.855534709193246,
      "grad_norm": 0.8561496734619141,
      "learning_rate": 9.940071588116468e-05,
      "loss": 0.317,
      "step": 650
    },
    {
      "epoch": 4.855534709193246,
      "eval_loss": 0.4430959224700928,
      "eval_runtime": 5.0752,
      "eval_samples_per_second": 22.265,
      "eval_steps_per_second": 5.714,
      "step": 650
    },
    {
      "epoch": 4.930581613508442,
      "grad_norm": 0.7869876623153687,
      "learning_rate": 9.936201977207631e-05,
      "loss": 0.3165,
      "step": 660
    },
    {
      "epoch": 5.0,
      "grad_norm": 2.4095470905303955,
      "learning_rate": 9.932212113561927e-05,
      "loss": 0.3139,
      "step": 670
    },
    {
      "epoch": 5.037523452157599,
      "eval_loss": 0.4734329581260681,
      "eval_runtime": 5.0708,
      "eval_samples_per_second": 22.285,
      "eval_steps_per_second": 5.719,
      "step": 675
    },
    {
      "epoch": 5.075046904315197,
      "grad_norm": 0.8977411985397339,
      "learning_rate": 9.92810209437798e-05,
      "loss": 0.2374,
      "step": 680
    },
    {
      "epoch": 5.150093808630394,
      "grad_norm": 0.8467944860458374,
      "learning_rate": 9.923872019781564e-05,
      "loss": 0.2397,
      "step": 690
    },
    {
      "epoch": 5.225140712945591,
      "grad_norm": 0.8776218295097351,
      "learning_rate": 9.919521992823173e-05,
      "loss": 0.2371,
      "step": 700
    },
    {
      "epoch": 5.225140712945591,
      "eval_loss": 0.46672776341438293,
      "eval_runtime": 5.0617,
      "eval_samples_per_second": 22.324,
      "eval_steps_per_second": 5.729,
      "step": 700
    },
    {
      "epoch": 5.300187617260788,
      "grad_norm": 0.8390938639640808,
      "learning_rate": 9.915052119475505e-05,
      "loss": 0.2487,
      "step": 710
    },
    {
      "epoch": 5.375234521575985,
      "grad_norm": 0.8848443627357483,
      "learning_rate": 9.910462508630885e-05,
      "loss": 0.2393,
      "step": 720
    },
    {
      "epoch": 5.412757973733584,
      "eval_loss": 0.4665623605251312,
      "eval_runtime": 5.0722,
      "eval_samples_per_second": 22.278,
      "eval_steps_per_second": 5.717,
      "step": 725
    },
    {
      "epoch": 5.450281425891182,
      "grad_norm": 0.8097019195556641,
      "learning_rate": 9.905753272098608e-05,
      "loss": 0.2439,
      "step": 730
    },
    {
      "epoch": 5.525328330206379,
      "grad_norm": 0.7588761448860168,
      "learning_rate": 9.900924524602218e-05,
      "loss": 0.2477,
      "step": 740
    },
    {
      "epoch": 5.600375234521576,
      "grad_norm": 0.8408402800559998,
      "learning_rate": 9.895976383776711e-05,
      "loss": 0.2484,
      "step": 750
    },
    {
      "epoch": 5.600375234521576,
      "eval_loss": 0.4602144956588745,
      "eval_runtime": 5.0743,
      "eval_samples_per_second": 22.269,
      "eval_steps_per_second": 5.715,
      "step": 750
    },
    {
      "epoch": 5.6754221388367725,
      "grad_norm": 0.8723646402359009,
      "learning_rate": 9.890908970165669e-05,
      "loss": 0.2533,
      "step": 760
    },
    {
      "epoch": 5.75046904315197,
      "grad_norm": 0.6956503391265869,
      "learning_rate": 9.88572240721833e-05,
      "loss": 0.253,
      "step": 770
    },
    {
      "epoch": 5.7879924953095685,
      "eval_loss": 0.450476735830307,
      "eval_runtime": 5.0627,
      "eval_samples_per_second": 22.32,
      "eval_steps_per_second": 5.728,
      "step": 775
    },
    {
      "epoch": 5.825515947467167,
      "grad_norm": 0.9172329306602478,
      "learning_rate": 9.880416821286569e-05,
      "loss": 0.2522,
      "step": 780
    },
    {
      "epoch": 5.900562851782364,
      "grad_norm": 0.7848353981971741,
      "learning_rate": 9.87499234162183e-05,
      "loss": 0.2493,
      "step": 790
    },
    {
      "epoch": 5.975609756097561,
      "grad_norm": 0.7720503211021423,
      "learning_rate": 9.869449100371973e-05,
      "loss": 0.2574,
      "step": 800
    },
    {
      "epoch": 5.975609756097561,
      "eval_loss": 0.44239288568496704,
      "eval_runtime": 5.0788,
      "eval_samples_per_second": 22.249,
      "eval_steps_per_second": 5.71,
      "step": 800
    }
  ],
  "logging_steps": 10,
  "max_steps": 6700,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 25,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 15,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.5747343525279846e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}