{
"best_global_step": 800,
"best_metric": 0.44239288568496704,
"best_model_checkpoint": "./qwen_semantic_bridge_v2_full/checkpoint-800",
"epoch": 5.975609756097561,
"eval_steps": 25,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.075046904315197,
"grad_norm": 1.53766667842865,
"learning_rate": 2.6865671641791044e-06,
"loss": 2.4791,
"step": 10
},
{
"epoch": 0.150093808630394,
"grad_norm": 1.524065613746643,
"learning_rate": 5.671641791044776e-06,
"loss": 2.4315,
"step": 20
},
{
"epoch": 0.18761726078799248,
"eval_loss": 2.34097957611084,
"eval_runtime": 5.5956,
"eval_samples_per_second": 20.194,
"eval_steps_per_second": 5.183,
"step": 25
},
{
"epoch": 0.225140712945591,
"grad_norm": 1.4025851488113403,
"learning_rate": 8.656716417910449e-06,
"loss": 2.3807,
"step": 30
},
{
"epoch": 0.300187617260788,
"grad_norm": 0.9802243709564209,
"learning_rate": 1.164179104477612e-05,
"loss": 2.1459,
"step": 40
},
{
"epoch": 0.37523452157598497,
"grad_norm": 0.9669085144996643,
"learning_rate": 1.4626865671641794e-05,
"loss": 1.9016,
"step": 50
},
{
"epoch": 0.37523452157598497,
"eval_loss": 1.7425719499588013,
"eval_runtime": 5.053,
"eval_samples_per_second": 22.363,
"eval_steps_per_second": 5.739,
"step": 50
},
{
"epoch": 0.450281425891182,
"grad_norm": 1.1177589893341064,
"learning_rate": 1.761194029850746e-05,
"loss": 1.596,
"step": 60
},
{
"epoch": 0.525328330206379,
"grad_norm": 0.5461925864219666,
"learning_rate": 2.0597014925373135e-05,
"loss": 1.2615,
"step": 70
},
{
"epoch": 0.5628517823639775,
"eval_loss": 1.0852540731430054,
"eval_runtime": 5.0653,
"eval_samples_per_second": 22.308,
"eval_steps_per_second": 5.725,
"step": 75
},
{
"epoch": 0.600375234521576,
"grad_norm": 0.5370538830757141,
"learning_rate": 2.3582089552238806e-05,
"loss": 1.1037,
"step": 80
},
{
"epoch": 0.6754221388367729,
"grad_norm": 0.4297517240047455,
"learning_rate": 2.656716417910448e-05,
"loss": 0.989,
"step": 90
},
{
"epoch": 0.7504690431519699,
"grad_norm": 0.5314680933952332,
"learning_rate": 2.955223880597015e-05,
"loss": 0.9507,
"step": 100
},
{
"epoch": 0.7504690431519699,
"eval_loss": 0.9193022847175598,
"eval_runtime": 5.0765,
"eval_samples_per_second": 22.259,
"eval_steps_per_second": 5.713,
"step": 100
},
{
"epoch": 0.8255159474671669,
"grad_norm": 0.4320300221443176,
"learning_rate": 3.253731343283582e-05,
"loss": 0.9321,
"step": 110
},
{
"epoch": 0.900562851782364,
"grad_norm": 0.44459468126296997,
"learning_rate": 3.5522388059701495e-05,
"loss": 0.8909,
"step": 120
},
{
"epoch": 0.9380863039399625,
"eval_loss": 0.847252368927002,
"eval_runtime": 5.0816,
"eval_samples_per_second": 22.237,
"eval_steps_per_second": 5.707,
"step": 125
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.4286046326160431,
"learning_rate": 3.850746268656716e-05,
"loss": 0.8575,
"step": 130
},
{
"epoch": 1.0450281425891181,
"grad_norm": 0.4913986921310425,
"learning_rate": 4.149253731343284e-05,
"loss": 0.859,
"step": 140
},
{
"epoch": 1.1200750469043153,
"grad_norm": 0.47017258405685425,
"learning_rate": 4.447761194029851e-05,
"loss": 0.8314,
"step": 150
},
{
"epoch": 1.1200750469043153,
"eval_loss": 0.8092061877250671,
"eval_runtime": 5.0731,
"eval_samples_per_second": 22.274,
"eval_steps_per_second": 5.716,
"step": 150
},
{
"epoch": 1.1951219512195121,
"grad_norm": 0.4936457574367523,
"learning_rate": 4.7462686567164185e-05,
"loss": 0.8098,
"step": 160
},
{
"epoch": 1.2701688555347093,
"grad_norm": 0.46496817469596863,
"learning_rate": 5.044776119402985e-05,
"loss": 0.7836,
"step": 170
},
{
"epoch": 1.3076923076923077,
"eval_loss": 0.7787837982177734,
"eval_runtime": 5.0698,
"eval_samples_per_second": 22.289,
"eval_steps_per_second": 5.72,
"step": 175
},
{
"epoch": 1.3452157598499062,
"grad_norm": 0.48638296127319336,
"learning_rate": 5.343283582089552e-05,
"loss": 0.7835,
"step": 180
},
{
"epoch": 1.4202626641651033,
"grad_norm": 0.49088621139526367,
"learning_rate": 5.64179104477612e-05,
"loss": 0.7774,
"step": 190
},
{
"epoch": 1.4953095684803002,
"grad_norm": 0.49398818612098694,
"learning_rate": 5.940298507462687e-05,
"loss": 0.77,
"step": 200
},
{
"epoch": 1.4953095684803002,
"eval_loss": 0.7553061842918396,
"eval_runtime": 5.0675,
"eval_samples_per_second": 22.299,
"eval_steps_per_second": 5.723,
"step": 200
},
{
"epoch": 1.5703564727954973,
"grad_norm": 0.5628990530967712,
"learning_rate": 6.238805970149254e-05,
"loss": 0.7582,
"step": 210
},
{
"epoch": 1.6454033771106942,
"grad_norm": 0.49695977568626404,
"learning_rate": 6.537313432835821e-05,
"loss": 0.7592,
"step": 220
},
{
"epoch": 1.6829268292682928,
"eval_loss": 0.7301478385925293,
"eval_runtime": 5.0727,
"eval_samples_per_second": 22.276,
"eval_steps_per_second": 5.717,
"step": 225
},
{
"epoch": 1.720450281425891,
"grad_norm": 0.4944610595703125,
"learning_rate": 6.835820895522388e-05,
"loss": 0.7329,
"step": 230
},
{
"epoch": 1.7954971857410882,
"grad_norm": 0.4966571033000946,
"learning_rate": 7.134328358208956e-05,
"loss": 0.7303,
"step": 240
},
{
"epoch": 1.8705440900562853,
"grad_norm": 0.5113062858581543,
"learning_rate": 7.432835820895523e-05,
"loss": 0.7082,
"step": 250
},
{
"epoch": 1.8705440900562853,
"eval_loss": 0.7024827003479004,
"eval_runtime": 5.0682,
"eval_samples_per_second": 22.296,
"eval_steps_per_second": 5.722,
"step": 250
},
{
"epoch": 1.9455909943714822,
"grad_norm": 0.5686495900154114,
"learning_rate": 7.731343283582089e-05,
"loss": 0.7017,
"step": 260
},
{
"epoch": 2.0150093808630394,
"grad_norm": 0.5748053193092346,
"learning_rate": 8.029850746268657e-05,
"loss": 0.6874,
"step": 270
},
{
"epoch": 2.052532833020638,
"eval_loss": 0.6817129850387573,
"eval_runtime": 5.0735,
"eval_samples_per_second": 22.273,
"eval_steps_per_second": 5.716,
"step": 275
},
{
"epoch": 2.0900562851782363,
"grad_norm": 0.5935199856758118,
"learning_rate": 8.328358208955225e-05,
"loss": 0.6522,
"step": 280
},
{
"epoch": 2.1651031894934336,
"grad_norm": 0.5952720046043396,
"learning_rate": 8.626865671641792e-05,
"loss": 0.6259,
"step": 290
},
{
"epoch": 2.2401500938086305,
"grad_norm": 0.6446412205696106,
"learning_rate": 8.925373134328359e-05,
"loss": 0.6171,
"step": 300
},
{
"epoch": 2.2401500938086305,
"eval_loss": 0.6408159136772156,
"eval_runtime": 5.0734,
"eval_samples_per_second": 22.273,
"eval_steps_per_second": 5.716,
"step": 300
},
{
"epoch": 2.3151969981238274,
"grad_norm": 0.722795307636261,
"learning_rate": 9.223880597014926e-05,
"loss": 0.6128,
"step": 310
},
{
"epoch": 2.3902439024390243,
"grad_norm": 0.6945660710334778,
"learning_rate": 9.522388059701492e-05,
"loss": 0.6027,
"step": 320
},
{
"epoch": 2.427767354596623,
"eval_loss": 0.6130869388580322,
"eval_runtime": 5.075,
"eval_samples_per_second": 22.266,
"eval_steps_per_second": 5.714,
"step": 325
},
{
"epoch": 2.465290806754221,
"grad_norm": 0.7010583877563477,
"learning_rate": 9.82089552238806e-05,
"loss": 0.5824,
"step": 330
},
{
"epoch": 2.5403377110694185,
"grad_norm": 0.7590329051017761,
"learning_rate": 9.999990255427757e-05,
"loss": 0.5626,
"step": 340
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.8965818881988525,
"learning_rate": 9.99988062942623e-05,
"loss": 0.5603,
"step": 350
},
{
"epoch": 2.6153846153846154,
"eval_loss": 0.5764831304550171,
"eval_runtime": 5.073,
"eval_samples_per_second": 22.275,
"eval_steps_per_second": 5.717,
"step": 350
},
{
"epoch": 2.6904315196998123,
"grad_norm": 0.7385474443435669,
"learning_rate": 9.999649199387416e-05,
"loss": 0.5426,
"step": 360
},
{
"epoch": 2.7654784240150097,
"grad_norm": 0.8265758156776428,
"learning_rate": 9.999295970949272e-05,
"loss": 0.5248,
"step": 370
},
{
"epoch": 2.803001876172608,
"eval_loss": 0.5394740104675293,
"eval_runtime": 5.0664,
"eval_samples_per_second": 22.304,
"eval_steps_per_second": 5.724,
"step": 375
},
{
"epoch": 2.8405253283302065,
"grad_norm": 0.7400410771369934,
"learning_rate": 9.998820952716934e-05,
"loss": 0.5078,
"step": 380
},
{
"epoch": 2.9155722326454034,
"grad_norm": 0.7278188467025757,
"learning_rate": 9.998224156262505e-05,
"loss": 0.4944,
"step": 390
},
{
"epoch": 2.9906191369606003,
"grad_norm": 0.7803333401679993,
"learning_rate": 9.997505596124777e-05,
"loss": 0.4893,
"step": 400
},
{
"epoch": 2.9906191369606003,
"eval_loss": 0.49963781237602234,
"eval_runtime": 5.0779,
"eval_samples_per_second": 22.253,
"eval_steps_per_second": 5.711,
"step": 400
},
{
"epoch": 3.0600375234521575,
"grad_norm": 0.8603639602661133,
"learning_rate": 9.996665289808871e-05,
"loss": 0.431,
"step": 410
},
{
"epoch": 3.1350844277673544,
"grad_norm": 0.70228511095047,
"learning_rate": 9.995703257785818e-05,
"loss": 0.408,
"step": 420
},
{
"epoch": 3.172607879924953,
"eval_loss": 0.4856467545032501,
"eval_runtime": 5.0713,
"eval_samples_per_second": 22.282,
"eval_steps_per_second": 5.719,
"step": 425
},
{
"epoch": 3.2101313320825517,
"grad_norm": 0.7288469672203064,
"learning_rate": 9.994619523492054e-05,
"loss": 0.3959,
"step": 430
},
{
"epoch": 3.2851782363977486,
"grad_norm": 0.7244749665260315,
"learning_rate": 9.993414113328852e-05,
"loss": 0.3995,
"step": 440
},
{
"epoch": 3.3602251407129455,
"grad_norm": 0.7007831931114197,
"learning_rate": 9.992087056661677e-05,
"loss": 0.3999,
"step": 450
},
{
"epoch": 3.3602251407129455,
"eval_loss": 0.474253386259079,
"eval_runtime": 5.0675,
"eval_samples_per_second": 22.299,
"eval_steps_per_second": 5.723,
"step": 450
},
{
"epoch": 3.4352720450281424,
"grad_norm": 0.8084601759910583,
"learning_rate": 9.990638385819472e-05,
"loss": 0.3819,
"step": 460
},
{
"epoch": 3.5103189493433398,
"grad_norm": 0.726610541343689,
"learning_rate": 9.989068136093873e-05,
"loss": 0.3869,
"step": 470
},
{
"epoch": 3.547842401500938,
"eval_loss": 0.46386292576789856,
"eval_runtime": 5.0699,
"eval_samples_per_second": 22.288,
"eval_steps_per_second": 5.72,
"step": 475
},
{
"epoch": 3.5853658536585367,
"grad_norm": 0.7002731561660767,
"learning_rate": 9.987376345738344e-05,
"loss": 0.3953,
"step": 480
},
{
"epoch": 3.6604127579737336,
"grad_norm": 0.6399222016334534,
"learning_rate": 9.985563055967248e-05,
"loss": 0.397,
"step": 490
},
{
"epoch": 3.7354596622889304,
"grad_norm": 0.8647626042366028,
"learning_rate": 9.983628310954843e-05,
"loss": 0.3868,
"step": 500
},
{
"epoch": 3.7354596622889304,
"eval_loss": 0.45724615454673767,
"eval_runtime": 5.072,
"eval_samples_per_second": 22.279,
"eval_steps_per_second": 5.718,
"step": 500
},
{
"epoch": 3.8105065666041273,
"grad_norm": 0.8007352948188782,
"learning_rate": 9.981572157834203e-05,
"loss": 0.3833,
"step": 510
},
{
"epoch": 3.8855534709193247,
"grad_norm": 0.7353331446647644,
"learning_rate": 9.979394646696078e-05,
"loss": 0.3873,
"step": 520
},
{
"epoch": 3.9230769230769234,
"eval_loss": 0.44937819242477417,
"eval_runtime": 5.0648,
"eval_samples_per_second": 22.311,
"eval_steps_per_second": 5.726,
"step": 525
},
{
"epoch": 3.9606003752345216,
"grad_norm": 0.725788950920105,
"learning_rate": 9.977095830587659e-05,
"loss": 0.3809,
"step": 530
},
{
"epoch": 4.030018761726079,
"grad_norm": 0.6748231053352356,
"learning_rate": 9.974675765511304e-05,
"loss": 0.346,
"step": 540
},
{
"epoch": 4.105065666041276,
"grad_norm": 0.7276281714439392,
"learning_rate": 9.972134510423157e-05,
"loss": 0.3137,
"step": 550
},
{
"epoch": 4.105065666041276,
"eval_loss": 0.4566233158111572,
"eval_runtime": 5.0603,
"eval_samples_per_second": 22.331,
"eval_steps_per_second": 5.731,
"step": 550
},
{
"epoch": 4.1801125703564725,
"grad_norm": 0.7759580612182617,
"learning_rate": 9.969472127231723e-05,
"loss": 0.3049,
"step": 560
},
{
"epoch": 4.25515947467167,
"grad_norm": 0.7065677642822266,
"learning_rate": 9.966688680796356e-05,
"loss": 0.3057,
"step": 570
},
{
"epoch": 4.2926829268292686,
"eval_loss": 0.4608473777770996,
"eval_runtime": 5.0627,
"eval_samples_per_second": 22.32,
"eval_steps_per_second": 5.728,
"step": 575
},
{
"epoch": 4.330206378986867,
"grad_norm": 0.746277391910553,
"learning_rate": 9.963784238925675e-05,
"loss": 0.3042,
"step": 580
},
{
"epoch": 4.405253283302064,
"grad_norm": 0.8283698558807373,
"learning_rate": 9.960758872375922e-05,
"loss": 0.312,
"step": 590
},
{
"epoch": 4.480300187617261,
"grad_norm": 0.7889224290847778,
"learning_rate": 9.957612654849225e-05,
"loss": 0.3153,
"step": 600
},
{
"epoch": 4.480300187617261,
"eval_loss": 0.45748454332351685,
"eval_runtime": 5.0739,
"eval_samples_per_second": 22.271,
"eval_steps_per_second": 5.715,
"step": 600
},
{
"epoch": 4.5553470919324575,
"grad_norm": 0.7301938533782959,
"learning_rate": 9.954345662991813e-05,
"loss": 0.3169,
"step": 610
},
{
"epoch": 4.630393996247655,
"grad_norm": 0.7097094655036926,
"learning_rate": 9.950957976392148e-05,
"loss": 0.313,
"step": 620
},
{
"epoch": 4.6679174484052535,
"eval_loss": 0.45044615864753723,
"eval_runtime": 5.0663,
"eval_samples_per_second": 22.304,
"eval_steps_per_second": 5.724,
"step": 625
},
{
"epoch": 4.705440900562852,
"grad_norm": 0.6627222895622253,
"learning_rate": 9.947449677578982e-05,
"loss": 0.311,
"step": 630
},
{
"epoch": 4.780487804878049,
"grad_norm": 0.831093966960907,
"learning_rate": 9.943820852019344e-05,
"loss": 0.3187,
"step": 640
},
{
"epoch": 4.855534709193246,
"grad_norm": 0.8561496734619141,
"learning_rate": 9.940071588116468e-05,
"loss": 0.317,
"step": 650
},
{
"epoch": 4.855534709193246,
"eval_loss": 0.4430959224700928,
"eval_runtime": 5.0752,
"eval_samples_per_second": 22.265,
"eval_steps_per_second": 5.714,
"step": 650
},
{
"epoch": 4.930581613508442,
"grad_norm": 0.7869876623153687,
"learning_rate": 9.936201977207631e-05,
"loss": 0.3165,
"step": 660
},
{
"epoch": 5.0,
"grad_norm": 2.4095470905303955,
"learning_rate": 9.932212113561927e-05,
"loss": 0.3139,
"step": 670
},
{
"epoch": 5.037523452157599,
"eval_loss": 0.4734329581260681,
"eval_runtime": 5.0708,
"eval_samples_per_second": 22.285,
"eval_steps_per_second": 5.719,
"step": 675
},
{
"epoch": 5.075046904315197,
"grad_norm": 0.8977411985397339,
"learning_rate": 9.92810209437798e-05,
"loss": 0.2374,
"step": 680
},
{
"epoch": 5.150093808630394,
"grad_norm": 0.8467944860458374,
"learning_rate": 9.923872019781564e-05,
"loss": 0.2397,
"step": 690
},
{
"epoch": 5.225140712945591,
"grad_norm": 0.8776218295097351,
"learning_rate": 9.919521992823173e-05,
"loss": 0.2371,
"step": 700
},
{
"epoch": 5.225140712945591,
"eval_loss": 0.46672776341438293,
"eval_runtime": 5.0617,
"eval_samples_per_second": 22.324,
"eval_steps_per_second": 5.729,
"step": 700
},
{
"epoch": 5.300187617260788,
"grad_norm": 0.8390938639640808,
"learning_rate": 9.915052119475505e-05,
"loss": 0.2487,
"step": 710
},
{
"epoch": 5.375234521575985,
"grad_norm": 0.8848443627357483,
"learning_rate": 9.910462508630885e-05,
"loss": 0.2393,
"step": 720
},
{
"epoch": 5.412757973733584,
"eval_loss": 0.4665623605251312,
"eval_runtime": 5.0722,
"eval_samples_per_second": 22.278,
"eval_steps_per_second": 5.717,
"step": 725
},
{
"epoch": 5.450281425891182,
"grad_norm": 0.8097019195556641,
"learning_rate": 9.905753272098608e-05,
"loss": 0.2439,
"step": 730
},
{
"epoch": 5.525328330206379,
"grad_norm": 0.7588761448860168,
"learning_rate": 9.900924524602218e-05,
"loss": 0.2477,
"step": 740
},
{
"epoch": 5.600375234521576,
"grad_norm": 0.8408402800559998,
"learning_rate": 9.895976383776711e-05,
"loss": 0.2484,
"step": 750
},
{
"epoch": 5.600375234521576,
"eval_loss": 0.4602144956588745,
"eval_runtime": 5.0743,
"eval_samples_per_second": 22.269,
"eval_steps_per_second": 5.715,
"step": 750
},
{
"epoch": 5.6754221388367725,
"grad_norm": 0.8723646402359009,
"learning_rate": 9.890908970165669e-05,
"loss": 0.2533,
"step": 760
},
{
"epoch": 5.75046904315197,
"grad_norm": 0.6956503391265869,
"learning_rate": 9.88572240721833e-05,
"loss": 0.253,
"step": 770
},
{
"epoch": 5.7879924953095685,
"eval_loss": 0.450476735830307,
"eval_runtime": 5.0627,
"eval_samples_per_second": 22.32,
"eval_steps_per_second": 5.728,
"step": 775
},
{
"epoch": 5.825515947467167,
"grad_norm": 0.9172329306602478,
"learning_rate": 9.880416821286569e-05,
"loss": 0.2522,
"step": 780
},
{
"epoch": 5.900562851782364,
"grad_norm": 0.7848353981971741,
"learning_rate": 9.87499234162183e-05,
"loss": 0.2493,
"step": 790
},
{
"epoch": 5.975609756097561,
"grad_norm": 0.7720503211021423,
"learning_rate": 9.869449100371973e-05,
"loss": 0.2574,
"step": 800
},
{
"epoch": 5.975609756097561,
"eval_loss": 0.44239288568496704,
"eval_runtime": 5.0788,
"eval_samples_per_second": 22.249,
"eval_steps_per_second": 5.71,
"step": 800
}
],
"logging_steps": 10,
"max_steps": 6700,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 25,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 15,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.5747343525279846e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}