p-vector / checkpoint-400 /trainer_state.json
saranshagarwal2020's picture
Upload folder using huggingface_hub
f3a5828 verified
{
"best_global_step": 100,
"best_metric": 0.6752368807792664,
"best_model_checkpoint": "models/dpo_fft_LFM2.5-1.2B-Instruct_argilla__distilabel-math-preference-dpo_20260222_210527/checkpoint-100",
"epoch": 2.7791304347826085,
"eval_steps": 100,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06956521739130435,
"grad_norm": 87.0,
"learning_rate": 4.090909090909091e-07,
"logits/chosen": -1.0601829290390015,
"logits/rejected": -1.0425456762313843,
"logps/chosen": -332.2013244628906,
"logps/rejected": -333.1183776855469,
"loss": 0.6823273181915284,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": 0.015406012535095215,
"rewards/margins": 0.03173117712140083,
"rewards/rejected": -0.01632516458630562,
"step": 10
},
{
"epoch": 0.1391304347826087,
"grad_norm": 98.5,
"learning_rate": 8.636363636363636e-07,
"logits/chosen": -1.0965769290924072,
"logits/rejected": -1.0956510305404663,
"logps/chosen": -328.796875,
"logps/rejected": -312.0242919921875,
"loss": 0.6926839828491211,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.093757264316082,
"rewards/margins": 0.013911411166191101,
"rewards/rejected": 0.0798458456993103,
"step": 20
},
{
"epoch": 0.20869565217391303,
"grad_norm": 96.5,
"learning_rate": 1.318181818181818e-06,
"logits/chosen": -1.1252676248550415,
"logits/rejected": -1.1598210334777832,
"logps/chosen": -326.04327392578125,
"logps/rejected": -303.9259033203125,
"loss": 0.7117842674255371,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.48031529784202576,
"rewards/margins": -0.010448494926095009,
"rewards/rejected": 0.49076375365257263,
"step": 30
},
{
"epoch": 0.2782608695652174,
"grad_norm": 109.0,
"learning_rate": 1.7727272727272727e-06,
"logits/chosen": -1.0572926998138428,
"logits/rejected": -1.069678544998169,
"logps/chosen": -333.5104064941406,
"logps/rejected": -322.76116943359375,
"loss": 0.721163272857666,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 1.2552604675292969,
"rewards/margins": 0.020199721679091454,
"rewards/rejected": 1.2350608110427856,
"step": 40
},
{
"epoch": 0.34782608695652173,
"grad_norm": 106.0,
"learning_rate": 1.99918061692433e-06,
"logits/chosen": -1.116310954093933,
"logits/rejected": -1.126555323600769,
"logps/chosen": -325.90625,
"logps/rejected": -320.7261047363281,
"loss": 0.7112587451934814,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.9580303430557251,
"rewards/margins": 0.02043265663087368,
"rewards/rejected": 0.9375975728034973,
"step": 50
},
{
"epoch": 0.41739130434782606,
"grad_norm": 134.0,
"learning_rate": 1.992633606781968e-06,
"logits/chosen": -1.0915653705596924,
"logits/rejected": -1.0714164972305298,
"logps/chosen": -335.96258544921875,
"logps/rejected": -329.37567138671875,
"loss": 0.6888086795806885,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.24013535678386688,
"rewards/margins": 0.025822216644883156,
"rewards/rejected": 0.21431314945220947,
"step": 60
},
{
"epoch": 0.48695652173913045,
"grad_norm": 168.0,
"learning_rate": 1.9795824849893477e-06,
"logits/chosen": -1.124298334121704,
"logits/rejected": -1.1153584718704224,
"logps/chosen": -319.74371337890625,
"logps/rejected": -317.81964111328125,
"loss": 0.7498865127563477,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": 0.3042285442352295,
"rewards/margins": -0.07379330694675446,
"rewards/rejected": 0.37802186608314514,
"step": 70
},
{
"epoch": 0.5565217391304348,
"grad_norm": 93.5,
"learning_rate": 1.960112767443493e-06,
"logits/chosen": -1.1165910959243774,
"logits/rejected": -1.1083123683929443,
"logps/chosen": -314.81610107421875,
"logps/rejected": -312.41070556640625,
"loss": 0.67913818359375,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.3251148760318756,
"rewards/margins": 0.07726944983005524,
"rewards/rejected": 0.24784541130065918,
"step": 80
},
{
"epoch": 0.6260869565217392,
"grad_norm": 97.5,
"learning_rate": 1.9343520271137762e-06,
"logits/chosen": -1.0576120615005493,
"logits/rejected": -1.0416970252990723,
"logps/chosen": -333.35565185546875,
"logps/rejected": -329.2746276855469,
"loss": 0.6899321556091309,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 1.0298190116882324,
"rewards/margins": 0.062107719480991364,
"rewards/rejected": 0.967711329460144,
"step": 90
},
{
"epoch": 0.6956521739130435,
"grad_norm": 106.0,
"learning_rate": 1.9024690581354698e-06,
"logits/chosen": -1.0332655906677246,
"logits/rejected": -1.0259943008422852,
"logps/chosen": -327.9278564453125,
"logps/rejected": -320.8587951660156,
"loss": 0.6782574653625488,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 1.1065479516983032,
"rewards/margins": 0.09923191368579865,
"rewards/rejected": 1.007315993309021,
"step": 100
},
{
"epoch": 0.6956521739130435,
"eval_logits/chosen": -1.065671682357788,
"eval_logits/rejected": -1.0876761674880981,
"eval_logps/chosen": -315.0599670410156,
"eval_logps/rejected": -316.6776123046875,
"eval_loss": 0.6752368807792664,
"eval_rewards/accuracies": 0.5887096524238586,
"eval_rewards/chosen": 0.8812527060508728,
"eval_rewards/margins": 0.13870203495025635,
"eval_rewards/rejected": 0.7425506114959717,
"eval_runtime": 11.3291,
"eval_samples_per_second": 10.68,
"eval_steps_per_second": 2.736,
"step": 100
},
{
"epoch": 0.7652173913043478,
"grad_norm": 84.0,
"learning_rate": 1.8646727698065862e-06,
"logits/chosen": -1.0779330730438232,
"logits/rejected": -1.0829439163208008,
"logps/chosen": -314.15679931640625,
"logps/rejected": -308.0323181152344,
"loss": 0.6908653736114502,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.6120839715003967,
"rewards/margins": 0.0896262526512146,
"rewards/rejected": 0.5224577188491821,
"step": 110
},
{
"epoch": 0.8347826086956521,
"grad_norm": 101.0,
"learning_rate": 1.821210817734972e-06,
"logits/chosen": -1.0664002895355225,
"logits/rejected": -1.0612239837646484,
"logps/chosen": -334.7059020996094,
"logps/rejected": -321.7742919921875,
"loss": 0.7135319232940673,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.22731296718120575,
"rewards/margins": 0.016000976786017418,
"rewards/rejected": 0.21131198108196259,
"step": 120
},
{
"epoch": 0.9043478260869565,
"grad_norm": 104.5,
"learning_rate": 1.7723679811048902e-06,
"logits/chosen": -1.093273401260376,
"logits/rejected": -1.0916509628295898,
"logps/chosen": -332.2840270996094,
"logps/rejected": -320.5323181152344,
"loss": 0.6850498199462891,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.8852480053901672,
"rewards/margins": 0.07424825429916382,
"rewards/rejected": 0.8109996914863586,
"step": 130
},
{
"epoch": 0.9739130434782609,
"grad_norm": 96.0,
"learning_rate": 1.7184642966958607e-06,
"logits/chosen": -1.065161943435669,
"logits/rejected": -1.0810632705688477,
"logps/chosen": -336.020751953125,
"logps/rejected": -311.6708984375,
"loss": 0.7298181056976318,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.8573455810546875,
"rewards/margins": 0.014898905530571938,
"rewards/rejected": 0.842446506023407,
"step": 140
},
{
"epoch": 1.0417391304347827,
"grad_norm": 169.0,
"learning_rate": 1.6598529618803698e-06,
"logits/chosen": -1.135772705078125,
"logits/rejected": -1.1369844675064087,
"logps/chosen": -297.4026794433594,
"logps/rejected": -306.18194580078125,
"loss": 0.5908462524414062,
"rewards/accuracies": 0.7435897588729858,
"rewards/chosen": 0.7982656359672546,
"rewards/margins": 0.3948451280593872,
"rewards/rejected": 0.4034205377101898,
"step": 150
},
{
"epoch": 1.111304347826087,
"grad_norm": 88.0,
"learning_rate": 1.596918020340805e-06,
"logits/chosen": -1.084324836730957,
"logits/rejected": -1.083601951599121,
"logps/chosen": -326.7928771972656,
"logps/rejected": -318.2250671386719,
"loss": 0.4479428768157959,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.5380933284759521,
"rewards/margins": 0.6464223861694336,
"rewards/rejected": -0.10832903534173965,
"step": 160
},
{
"epoch": 1.1808695652173913,
"grad_norm": 67.0,
"learning_rate": 1.5300718456696778e-06,
"logits/chosen": -1.1100142002105713,
"logits/rejected": -1.1088769435882568,
"logps/chosen": -324.53009033203125,
"logps/rejected": -317.4223937988281,
"loss": 0.4346441745758057,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.8458078503608704,
"rewards/margins": 0.7088065147399902,
"rewards/rejected": 0.13700127601623535,
"step": 170
},
{
"epoch": 1.2504347826086957,
"grad_norm": 163.0,
"learning_rate": 1.4597524393415336e-06,
"logits/chosen": -1.1076725721359253,
"logits/rejected": -1.0804173946380615,
"logps/chosen": -330.8292541503906,
"logps/rejected": -318.1831359863281,
"loss": 0.47336974143981936,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 1.2574979066848755,
"rewards/margins": 0.669701874256134,
"rewards/rejected": 0.5877960920333862,
"step": 180
},
{
"epoch": 1.32,
"grad_norm": 67.5,
"learning_rate": 1.3864205607612647e-06,
"logits/chosen": -1.0184274911880493,
"logits/rejected": -1.0195646286010742,
"logps/chosen": -339.4798583984375,
"logps/rejected": -329.2326965332031,
"loss": 0.47516441345214844,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 1.4096896648406982,
"rewards/margins": 0.6121624112129211,
"rewards/rejected": 0.7975271940231323,
"step": 190
},
{
"epoch": 1.3895652173913042,
"grad_norm": 63.25,
"learning_rate": 1.3105567081938423e-06,
"logits/chosen": -1.0496165752410889,
"logits/rejected": -1.0412036180496216,
"logps/chosen": -321.82830810546875,
"logps/rejected": -301.32464599609375,
"loss": 0.47269201278686523,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 1.2313692569732666,
"rewards/margins": 0.6157368421554565,
"rewards/rejected": 0.615632176399231,
"step": 200
},
{
"epoch": 1.3895652173913042,
"eval_logits/chosen": -1.0566930770874023,
"eval_logits/rejected": -1.07687509059906,
"eval_logps/chosen": -314.4008483886719,
"eval_logps/rejected": -315.7554931640625,
"eval_loss": 0.7017911076545715,
"eval_rewards/accuracies": 0.5,
"eval_rewards/chosen": 1.0130723714828491,
"eval_rewards/margins": 0.08608859032392502,
"eval_rewards/rejected": 0.9269838929176331,
"eval_runtime": 11.2238,
"eval_samples_per_second": 10.781,
"eval_steps_per_second": 2.762,
"step": 200
},
{
"epoch": 1.4591304347826086,
"grad_norm": 70.5,
"learning_rate": 1.2326579703575462e-06,
"logits/chosen": -1.0474871397018433,
"logits/rejected": -1.0500593185424805,
"logps/chosen": -324.48602294921875,
"logps/rejected": -322.1194763183594,
"loss": 0.4632419586181641,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 1.3502912521362305,
"rewards/margins": 0.6482532620429993,
"rewards/rejected": 0.7020379304885864,
"step": 210
},
{
"epoch": 1.528695652173913,
"grad_norm": 104.5,
"learning_rate": 1.1532347693102631e-06,
"logits/chosen": -1.0756328105926514,
"logits/rejected": -1.0997965335845947,
"logps/chosen": -322.89508056640625,
"logps/rejected": -318.676025390625,
"loss": 0.4693108081817627,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.9996536374092102,
"rewards/margins": 0.6115056276321411,
"rewards/rejected": 0.38814812898635864,
"step": 220
},
{
"epoch": 1.5982608695652174,
"grad_norm": 68.0,
"learning_rate": 1.0728075159706879e-06,
"logits/chosen": -1.0858252048492432,
"logits/rejected": -1.0972968339920044,
"logps/chosen": -333.7071533203125,
"logps/rejected": -322.7843322753906,
"loss": 0.537294578552246,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.4641377329826355,
"rewards/margins": 0.43356814980506897,
"rewards/rejected": 0.030569633468985558,
"step": 230
},
{
"epoch": 1.6678260869565218,
"grad_norm": 66.0,
"learning_rate": 9.919032001887214e-07,
"logits/chosen": -1.0909720659255981,
"logits/rejected": -1.0826283693313599,
"logps/chosen": -332.0724792480469,
"logps/rejected": -330.3124694824219,
"loss": 0.4591354846954346,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.42356061935424805,
"rewards/margins": 0.632544219493866,
"rewards/rejected": -0.2089836597442627,
"step": 240
},
{
"epoch": 1.7373913043478262,
"grad_norm": 64.0,
"learning_rate": 9.110519377082173e-07,
"logits/chosen": -1.1404974460601807,
"logits/rejected": -1.1487622261047363,
"logps/chosen": -315.04705810546875,
"logps/rejected": -306.60357666015625,
"loss": 0.47364654541015627,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.552635669708252,
"rewards/margins": 0.5962538719177246,
"rewards/rejected": -0.043618228286504745,
"step": 250
},
{
"epoch": 1.8069565217391306,
"grad_norm": 96.0,
"learning_rate": 8.307834966476883e-07,
"logits/chosen": -1.1199188232421875,
"logits/rejected": -1.110740303993225,
"logps/chosen": -328.4118957519531,
"logps/rejected": -333.0620422363281,
"loss": 0.4562994956970215,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.5946794152259827,
"rewards/margins": 0.6635168790817261,
"rewards/rejected": -0.0688374936580658,
"step": 260
},
{
"epoch": 1.8765217391304347,
"grad_norm": 97.0,
"learning_rate": 7.51623826258785e-07,
"logits/chosen": -1.1038875579833984,
"logits/rejected": -1.1073077917099,
"logps/chosen": -332.9511413574219,
"logps/rejected": -317.7924499511719,
"loss": 0.48169240951538084,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.6237157583236694,
"rewards/margins": 0.5657099485397339,
"rewards/rejected": 0.05800582095980644,
"step": 270
},
{
"epoch": 1.9460869565217391,
"grad_norm": 76.5,
"learning_rate": 6.740916107074371e-07,
"logits/chosen": -1.0971988439559937,
"logits/rejected": -1.1136589050292969,
"logps/chosen": -331.86248779296875,
"logps/rejected": -324.9644470214844,
"loss": 0.4545170307159424,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.8813239336013794,
"rewards/margins": 0.7066205143928528,
"rewards/rejected": 0.1747034788131714,
"step": 280
},
{
"epoch": 2.013913043478261,
"grad_norm": 61.75,
"learning_rate": 5.986948704585895e-07,
"logits/chosen": -1.0902117490768433,
"logits/rejected": -1.0950541496276855,
"logps/chosen": -323.3013916015625,
"logps/rejected": -322.97393798828125,
"loss": 0.45575871467590334,
"rewards/accuracies": 0.8974359035491943,
"rewards/chosen": 0.7955907583236694,
"rewards/margins": 0.6638101935386658,
"rewards/rejected": 0.13178066909313202,
"step": 290
},
{
"epoch": 2.0834782608695654,
"grad_norm": 56.25,
"learning_rate": 5.259276335335521e-07,
"logits/chosen": -1.111509084701538,
"logits/rejected": -1.138770341873169,
"logps/chosen": -332.07989501953125,
"logps/rejected": -317.6449279785156,
"loss": 0.3894503593444824,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.806149959564209,
"rewards/margins": 0.8637164235115051,
"rewards/rejected": -0.05756649374961853,
"step": 300
},
{
"epoch": 2.0834782608695654,
"eval_logits/chosen": -1.0786491632461548,
"eval_logits/rejected": -1.0995056629180908,
"eval_logps/chosen": -317.61407470703125,
"eval_logps/rejected": -318.8610534667969,
"eval_loss": 0.6876804232597351,
"eval_rewards/accuracies": 0.5645161271095276,
"eval_rewards/chosen": 0.37044042348861694,
"eval_rewards/margins": 0.06457632035017014,
"eval_rewards/rejected": 0.305864155292511,
"eval_runtime": 11.2588,
"eval_samples_per_second": 10.747,
"eval_steps_per_second": 2.753,
"step": 300
},
{
"epoch": 2.1530434782608694,
"grad_norm": 59.25,
"learning_rate": 4.5626669845114154e-07,
"logits/chosen": -1.1131139993667603,
"logits/rejected": -1.0847505331039429,
"logps/chosen": -317.81744384765625,
"logps/rejected": -318.0986633300781,
"loss": 0.37284040451049805,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": 0.8261886835098267,
"rewards/margins": 0.8763921856880188,
"rewards/rejected": -0.05020345374941826,
"step": 310
},
{
"epoch": 2.222608695652174,
"grad_norm": 66.0,
"learning_rate": 3.901685100630554e-07,
"logits/chosen": -1.0582597255706787,
"logits/rejected": -1.0536084175109863,
"logps/chosen": -323.5310974121094,
"logps/rejected": -329.77264404296875,
"loss": 0.40012392997741697,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.7232468128204346,
"rewards/margins": 0.8045024871826172,
"rewards/rejected": -0.08125568181276321,
"step": 320
},
{
"epoch": 2.292173913043478,
"grad_norm": 59.75,
"learning_rate": 3.2806616875418757e-07,
"logits/chosen": -1.0983725786209106,
"logits/rejected": -1.0878392457962036,
"logps/chosen": -315.97442626953125,
"logps/rejected": -314.21307373046875,
"loss": 0.4109466552734375,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.6401529908180237,
"rewards/margins": 0.7892977595329285,
"rewards/rejected": -0.14914488792419434,
"step": 330
},
{
"epoch": 2.3617391304347826,
"grad_norm": 56.0,
"learning_rate": 2.7036659260473973e-07,
"logits/chosen": -1.0930839776992798,
"logits/rejected": -1.1155784130096436,
"logps/chosen": -338.72357177734375,
"logps/rejected": -327.84832763671875,
"loss": 0.39486031532287597,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.7087287902832031,
"rewards/margins": 0.8493242263793945,
"rewards/rejected": -0.1405954360961914,
"step": 340
},
{
"epoch": 2.431304347826087,
"grad_norm": 81.0,
"learning_rate": 2.174478511087171e-07,
"logits/chosen": -1.0855780839920044,
"logits/rejected": -1.074064016342163,
"logps/chosen": -330.82952880859375,
"logps/rejected": -330.74505615234375,
"loss": 0.39856863021850586,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.7598364949226379,
"rewards/margins": 0.8086788058280945,
"rewards/rejected": -0.04884239286184311,
"step": 350
},
{
"epoch": 2.5008695652173913,
"grad_norm": 68.0,
"learning_rate": 1.69656687919296e-07,
"logits/chosen": -1.0719494819641113,
"logits/rejected": -1.0662832260131836,
"logps/chosen": -337.28863525390625,
"logps/rejected": -320.1393127441406,
"loss": 0.38623409271240233,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.8269112706184387,
"rewards/margins": 0.8633332252502441,
"rewards/rejected": -0.036422014236450195,
"step": 360
},
{
"epoch": 2.5704347826086957,
"grad_norm": 63.0,
"learning_rate": 1.2730624885297537e-07,
"logits/chosen": -1.0956530570983887,
"logits/rejected": -1.0990194082260132,
"logps/chosen": -330.1253356933594,
"logps/rejected": -323.8518371582031,
"loss": 0.3871379613876343,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.794479489326477,
"rewards/margins": 0.8608804941177368,
"rewards/rejected": -0.066400907933712,
"step": 370
},
{
"epoch": 2.64,
"grad_norm": 64.0,
"learning_rate": 9.067403003948781e-08,
"logits/chosen": -1.1272144317626953,
"logits/rejected": -1.10252845287323,
"logps/chosen": -329.3648681640625,
"logps/rejected": -315.2191162109375,
"loss": 0.36964147090911864,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.839946448802948,
"rewards/margins": 0.9301446080207825,
"rewards/rejected": -0.09019814431667328,
"step": 380
},
{
"epoch": 2.7095652173913045,
"grad_norm": 59.25,
"learning_rate": 6.000005966197386e-08,
"logits/chosen": -1.149733543395996,
"logits/rejected": -1.1311017274856567,
"logps/chosen": -317.32818603515625,
"logps/rejected": -311.5692138671875,
"loss": 0.3780661106109619,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.7798963785171509,
"rewards/margins": 0.8840686082839966,
"rewards/rejected": -0.10417220741510391,
"step": 390
},
{
"epoch": 2.7791304347826085,
"grad_norm": 43.0,
"learning_rate": 3.5485325201341284e-08,
"logits/chosen": -1.104060411453247,
"logits/rejected": -1.1139435768127441,
"logps/chosen": -325.05377197265625,
"logps/rejected": -323.31304931640625,
"loss": 0.40581889152526857,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.7068971991539001,
"rewards/margins": 0.8001037836074829,
"rewards/rejected": -0.09320656955242157,
"step": 400
},
{
"epoch": 2.7791304347826085,
"eval_logits/chosen": -1.0804522037506104,
"eval_logits/rejected": -1.1010961532592773,
"eval_logps/chosen": -318.0834655761719,
"eval_logps/rejected": -319.351806640625,
"eval_loss": 0.6869162917137146,
"eval_rewards/accuracies": 0.5403226017951965,
"eval_rewards/chosen": 0.2765510380268097,
"eval_rewards/margins": 0.06884314864873886,
"eval_rewards/rejected": 0.20770789682865143,
"eval_runtime": 11.2322,
"eval_samples_per_second": 10.773,
"eval_steps_per_second": 2.76,
"step": 400
}
],
"logging_steps": 10,
"max_steps": 432,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}