Hackduke-dpo-llm / hackduke1_DPO /trainer_state.json
SimonWSY's picture
Upload 167 files
c62e26d
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 434,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"learning_rate": 9.986906050861595e-06,
"logits/chosen": -2.1006269454956055,
"logits/rejected": -2.099174737930298,
"logps/chosen": -7.524294376373291,
"logps/rejected": -11.861832618713379,
"loss": 0.6931,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0014334914740175009,
"rewards/margins": 9.918214345816523e-05,
"rewards/rejected": -0.0015326736029237509,
"step": 10
},
{
"epoch": 0.05,
"learning_rate": 9.94769278404799e-06,
"logits/chosen": -2.115718364715576,
"logits/rejected": -2.1131784915924072,
"logps/chosen": -7.616778373718262,
"logps/rejected": -12.535284042358398,
"loss": 0.6911,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.002857386600226164,
"rewards/margins": 0.004104451276361942,
"rewards/rejected": -0.006961838807910681,
"step": 20
},
{
"epoch": 0.07,
"learning_rate": 9.882565582167673e-06,
"logits/chosen": -2.084085464477539,
"logits/rejected": -2.0862631797790527,
"logps/chosen": -7.192718505859375,
"logps/rejected": -12.321676254272461,
"loss": 0.6903,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.006043486762791872,
"rewards/margins": 0.005702839698642492,
"rewards/rejected": -0.011746326461434364,
"step": 30
},
{
"epoch": 0.09,
"learning_rate": 9.79186555412822e-06,
"logits/chosen": -1.9858118295669556,
"logits/rejected": -1.9895610809326172,
"logps/chosen": -8.221541404724121,
"logps/rejected": -12.079745292663574,
"loss": 0.6848,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.0059954021126031876,
"rewards/margins": 0.016938839107751846,
"rewards/rejected": -0.022934241220355034,
"step": 40
},
{
"epoch": 0.12,
"learning_rate": 9.676067748551232e-06,
"logits/chosen": -1.9488677978515625,
"logits/rejected": -1.9480243921279907,
"logps/chosen": -8.750779151916504,
"logps/rejected": -12.39845085144043,
"loss": 0.6836,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.012456322088837624,
"rewards/margins": 0.01962612010538578,
"rewards/rejected": -0.0320824459195137,
"step": 50
},
{
"epoch": 0.14,
"learning_rate": 9.535778665667334e-06,
"logits/chosen": -2.105088472366333,
"logits/rejected": -2.105722665786743,
"logps/chosen": -8.532628059387207,
"logps/rejected": -12.4138765335083,
"loss": 0.6796,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.0170536320656538,
"rewards/margins": 0.028087785467505455,
"rewards/rejected": -0.045141417533159256,
"step": 60
},
{
"epoch": 0.16,
"learning_rate": 9.371733080722911e-06,
"logits/chosen": -1.9830926656723022,
"logits/rejected": -1.982187032699585,
"logps/chosen": -8.544172286987305,
"logps/rejected": -13.114030838012695,
"loss": 0.68,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.01359584927558899,
"rewards/margins": 0.02715850993990898,
"rewards/rejected": -0.04075435549020767,
"step": 70
},
{
"epoch": 0.18,
"learning_rate": 9.184790195536217e-06,
"logits/chosen": -2.089336395263672,
"logits/rejected": -2.0889394283294678,
"logps/chosen": -8.647878646850586,
"logps/rejected": -12.844439506530762,
"loss": 0.6729,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.031301841139793396,
"rewards/margins": 0.04218802973628044,
"rewards/rejected": -0.07348985970020294,
"step": 80
},
{
"epoch": 0.21,
"learning_rate": 8.975929138359423e-06,
"logits/chosen": -2.0116446018218994,
"logits/rejected": -2.0112133026123047,
"logps/chosen": -8.066459655761719,
"logps/rejected": -13.437301635742188,
"loss": 0.6571,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.0288580060005188,
"rewards/margins": 0.07653258740901947,
"rewards/rejected": -0.10539059340953827,
"step": 90
},
{
"epoch": 0.23,
"learning_rate": 8.746243835616392e-06,
"logits/chosen": -2.099787950515747,
"logits/rejected": -2.099290370941162,
"logps/chosen": -8.891599655151367,
"logps/rejected": -12.552549362182617,
"loss": 0.6683,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.05058792978525162,
"rewards/margins": 0.054626502096652985,
"rewards/rejected": -0.1052144318819046,
"step": 100
},
{
"epoch": 0.25,
"learning_rate": 8.496937282375912e-06,
"logits/chosen": -2.0405075550079346,
"logits/rejected": -2.0367274284362793,
"logps/chosen": -7.712469577789307,
"logps/rejected": -13.533961296081543,
"loss": 0.6421,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.030406493693590164,
"rewards/margins": 0.1149085983633995,
"rewards/rejected": -0.14531509578227997,
"step": 110
},
{
"epoch": 0.28,
"learning_rate": 8.229315241569177e-06,
"logits/chosen": -1.9212989807128906,
"logits/rejected": -1.920802116394043,
"logps/chosen": -8.383896827697754,
"logps/rejected": -13.753458976745605,
"loss": 0.6477,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.05628451704978943,
"rewards/margins": 0.10657332837581635,
"rewards/rejected": -0.16285786032676697,
"step": 120
},
{
"epoch": 0.3,
"learning_rate": 7.94477940495245e-06,
"logits/chosen": -2.0202853679656982,
"logits/rejected": -2.0228538513183594,
"logps/chosen": -7.346169471740723,
"logps/rejected": -13.822959899902344,
"loss": 0.6099,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.03167923167347908,
"rewards/margins": 0.19764581322669983,
"rewards/rejected": -0.2293250560760498,
"step": 130
},
{
"epoch": 0.32,
"learning_rate": 7.644820051634813e-06,
"logits/chosen": -1.7893329858779907,
"logits/rejected": -1.786982536315918,
"logps/chosen": -9.868978500366211,
"logps/rejected": -14.2501802444458,
"loss": 0.6516,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.11864396184682846,
"rewards/margins": 0.09951023757457733,
"rewards/rejected": -0.2181541919708252,
"step": 140
},
{
"epoch": 0.35,
"learning_rate": 7.331008242622637e-06,
"logits/chosen": -1.8841416835784912,
"logits/rejected": -1.8860057592391968,
"logps/chosen": -8.504674911499023,
"logps/rejected": -15.18022632598877,
"loss": 0.6069,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.10230400413274765,
"rewards/margins": 0.2147357016801834,
"rewards/rejected": -0.3170396685600281,
"step": 150
},
{
"epoch": 0.37,
"learning_rate": 7.00498759226242e-06,
"logits/chosen": -1.8029648065567017,
"logits/rejected": -1.8041973114013672,
"logps/chosen": -10.013819694519043,
"logps/rejected": -15.499191284179688,
"loss": 0.6368,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1780211329460144,
"rewards/margins": 0.14491881430149078,
"rewards/rejected": -0.3229399621486664,
"step": 160
},
{
"epoch": 0.39,
"learning_rate": 6.668465659679714e-06,
"logits/chosen": -1.7474323511123657,
"logits/rejected": -1.7486388683319092,
"logps/chosen": -9.3012113571167,
"logps/rejected": -16.239866256713867,
"loss": 0.6106,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.17473192512989044,
"rewards/margins": 0.21514467895030975,
"rewards/rejected": -0.3898766040802002,
"step": 170
},
{
"epoch": 0.41,
"learning_rate": 6.323205005302199e-06,
"logits/chosen": -1.8646036386489868,
"logits/rejected": -1.8657925128936768,
"logps/chosen": -9.604146957397461,
"logps/rejected": -14.87494945526123,
"loss": 0.6349,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.18171469867229462,
"rewards/margins": 0.14191558957099915,
"rewards/rejected": -0.32363027334213257,
"step": 180
},
{
"epoch": 0.44,
"learning_rate": 5.971013959309038e-06,
"logits/chosen": -1.7274844646453857,
"logits/rejected": -1.7273458242416382,
"logps/chosen": -9.02622127532959,
"logps/rejected": -16.272247314453125,
"loss": 0.6046,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2149616777896881,
"rewards/margins": 0.22385644912719727,
"rewards/rejected": -0.4388181269168854,
"step": 190
},
{
"epoch": 0.46,
"learning_rate": 5.613737150357528e-06,
"logits/chosen": -1.8614267110824585,
"logits/rejected": -1.8625634908676147,
"logps/chosen": -10.578888893127441,
"logps/rejected": -16.31842803955078,
"loss": 0.6291,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2647446393966675,
"rewards/margins": 0.18538950383663177,
"rewards/rejected": -0.45013418793678284,
"step": 200
},
{
"epoch": 0.48,
"learning_rate": 5.253245844193564e-06,
"logits/chosen": -1.8686116933822632,
"logits/rejected": -1.8661558628082275,
"logps/chosen": -9.932334899902344,
"logps/rejected": -16.823314666748047,
"loss": 0.5866,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.23446519672870636,
"rewards/margins": 0.2885656952857971,
"rewards/rejected": -0.5230308771133423,
"step": 210
},
{
"epoch": 0.51,
"learning_rate": 4.891428142748247e-06,
"logits/chosen": -1.808436393737793,
"logits/rejected": -1.8094003200531006,
"logps/chosen": -11.798791885375977,
"logps/rejected": -17.035802841186523,
"loss": 0.6291,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2741556763648987,
"rewards/margins": 0.16138270497322083,
"rewards/rejected": -0.4355384409427643,
"step": 220
},
{
"epoch": 0.53,
"learning_rate": 4.5301790950536175e-06,
"logits/chosen": -1.7613519430160522,
"logits/rejected": -1.7557151317596436,
"logps/chosen": -10.371234893798828,
"logps/rejected": -18.36960792541504,
"loss": 0.6033,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.280795156955719,
"rewards/margins": 0.2397323101758957,
"rewards/rejected": -0.5205274820327759,
"step": 230
},
{
"epoch": 0.55,
"learning_rate": 4.171390771772399e-06,
"logits/chosen": -1.8589379787445068,
"logits/rejected": -1.8582267761230469,
"logps/chosen": -11.151894569396973,
"logps/rejected": -18.43938446044922,
"loss": 0.6007,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.34404295682907104,
"rewards/margins": 0.2676880955696106,
"rewards/rejected": -0.6117311120033264,
"step": 240
},
{
"epoch": 0.58,
"learning_rate": 3.816942355327191e-06,
"logits/chosen": -1.8235633373260498,
"logits/rejected": -1.8264453411102295,
"logps/chosen": -9.687265396118164,
"logps/rejected": -18.683124542236328,
"loss": 0.5426,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.2572888731956482,
"rewards/margins": 0.3976905941963196,
"rewards/rejected": -0.654979407787323,
"step": 250
},
{
"epoch": 0.6,
"learning_rate": 3.468690297532843e-06,
"logits/chosen": -1.7654492855072021,
"logits/rejected": -1.7625312805175781,
"logps/chosen": -11.64262580871582,
"logps/rejected": -18.569005966186523,
"loss": 0.5844,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3640887141227722,
"rewards/margins": 0.3248264491558075,
"rewards/rejected": -0.6889151930809021,
"step": 260
},
{
"epoch": 0.62,
"learning_rate": 3.1620696915770243e-06,
"logits/chosen": -1.850664496421814,
"logits/rejected": -1.849443793296814,
"logps/chosen": -12.17113208770752,
"logps/rejected": -18.779268264770508,
"loss": 0.5925,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.37226295471191406,
"rewards/margins": 0.2928708791732788,
"rewards/rejected": -0.6651338338851929,
"step": 270
},
{
"epoch": 0.65,
"learning_rate": 2.83058130441221e-06,
"logits/chosen": -1.8435547351837158,
"logits/rejected": -1.8437684774398804,
"logps/chosen": -11.830400466918945,
"logps/rejected": -19.32192611694336,
"loss": 0.5833,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.39819639921188354,
"rewards/margins": 0.3126676678657532,
"rewards/rejected": -0.7108640074729919,
"step": 280
},
{
"epoch": 0.67,
"learning_rate": 2.510455420471369e-06,
"logits/chosen": -1.765302300453186,
"logits/rejected": -1.7661861181259155,
"logps/chosen": -13.742077827453613,
"logps/rejected": -19.282821655273438,
"loss": 0.6334,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5000133514404297,
"rewards/margins": 0.20012502372264862,
"rewards/rejected": -0.7001383900642395,
"step": 290
},
{
"epoch": 0.69,
"learning_rate": 2.2033687245713847e-06,
"logits/chosen": -1.696927785873413,
"logits/rejected": -1.6953802108764648,
"logps/chosen": -12.203340530395508,
"logps/rejected": -18.277769088745117,
"loss": 0.592,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3981890380382538,
"rewards/margins": 0.2977578043937683,
"rewards/rejected": -0.6959468126296997,
"step": 300
},
{
"epoch": 0.71,
"learning_rate": 1.9109296077431373e-06,
"logits/chosen": -1.6897990703582764,
"logits/rejected": -1.6910638809204102,
"logps/chosen": -11.370373725891113,
"logps/rejected": -18.85976791381836,
"loss": 0.5873,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.34721606969833374,
"rewards/margins": 0.327645868062973,
"rewards/rejected": -0.6748619079589844,
"step": 310
},
{
"epoch": 0.74,
"learning_rate": 1.6346697431553554e-06,
"logits/chosen": -1.8375412225723267,
"logits/rejected": -1.838157296180725,
"logps/chosen": -13.91193962097168,
"logps/rejected": -20.724300384521484,
"loss": 0.58,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.5397823452949524,
"rewards/margins": 0.3468281626701355,
"rewards/rejected": -0.8866105079650879,
"step": 320
},
{
"epoch": 0.76,
"learning_rate": 1.3760360638544012e-06,
"logits/chosen": -1.7385616302490234,
"logits/rejected": -1.7387222051620483,
"logps/chosen": -13.660693168640137,
"logps/rejected": -19.53488540649414,
"loss": 0.5843,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4130437970161438,
"rewards/margins": 0.3007003664970398,
"rewards/rejected": -0.7137441635131836,
"step": 330
},
{
"epoch": 0.78,
"learning_rate": 1.1363831843371691e-06,
"logits/chosen": -1.7484130859375,
"logits/rejected": -1.7503455877304077,
"logps/chosen": -12.76932430267334,
"logps/rejected": -18.824031829833984,
"loss": 0.6248,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3822181522846222,
"rewards/margins": 0.25158068537712097,
"rewards/rejected": -0.6337988972663879,
"step": 340
},
{
"epoch": 0.81,
"learning_rate": 9.169663056497713e-07,
"logits/chosen": -1.739941954612732,
"logits/rejected": -1.7391643524169922,
"logps/chosen": -11.454751968383789,
"logps/rejected": -18.73871421813965,
"loss": 0.6145,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4753839075565338,
"rewards/margins": 0.27988919615745544,
"rewards/rejected": -0.7552732229232788,
"step": 350
},
{
"epoch": 0.83,
"learning_rate": 7.189346411720604e-07,
"logits/chosen": -1.7457377910614014,
"logits/rejected": -1.7455562353134155,
"logps/chosen": -12.355423927307129,
"logps/rejected": -18.32061767578125,
"loss": 0.6606,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.49726954102516174,
"rewards/margins": 0.14859595894813538,
"rewards/rejected": -0.6458654999732971,
"step": 360
},
{
"epoch": 0.85,
"learning_rate": 5.433253975210262e-07,
"logits/chosen": -1.8033154010772705,
"logits/rejected": -1.805904746055603,
"logps/chosen": -13.142141342163086,
"logps/rejected": -20.202163696289062,
"loss": 0.5894,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5428677201271057,
"rewards/margins": 0.30698272585868835,
"rewards/rejected": -0.8498504757881165,
"step": 370
},
{
"epoch": 0.88,
"learning_rate": 3.9105834209850536e-07,
"logits/chosen": -1.709399938583374,
"logits/rejected": -1.7103627920150757,
"logps/chosen": -12.403543472290039,
"logps/rejected": -20.584686279296875,
"loss": 0.5906,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.5125871896743774,
"rewards/margins": 0.30313560366630554,
"rewards/rejected": -0.8157228231430054,
"step": 380
},
{
"epoch": 0.9,
"learning_rate": 2.629309857361639e-07,
"logits/chosen": -1.7105987071990967,
"logits/rejected": -1.7081180810928345,
"logps/chosen": -12.209216117858887,
"logps/rejected": -19.7056827545166,
"loss": 0.6038,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.45262184739112854,
"rewards/margins": 0.29007774591445923,
"rewards/rejected": -0.7426996231079102,
"step": 390
},
{
"epoch": 0.92,
"learning_rate": 1.5961440566897913e-07,
"logits/chosen": -1.7665717601776123,
"logits/rejected": -1.766191840171814,
"logps/chosen": -11.442702293395996,
"logps/rejected": -20.167306900024414,
"loss": 0.582,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4547387659549713,
"rewards/margins": 0.3354105055332184,
"rewards/rejected": -0.7901493310928345,
"step": 400
},
{
"epoch": 0.94,
"learning_rate": 8.164973071477178e-08,
"logits/chosen": -1.8613353967666626,
"logits/rejected": -1.8554569482803345,
"logps/chosen": -11.979402542114258,
"logps/rejected": -18.39723777770996,
"loss": 0.6028,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.42835044860839844,
"rewards/margins": 0.24790024757385254,
"rewards/rejected": -0.6762507557868958,
"step": 410
},
{
"epoch": 0.97,
"learning_rate": 2.944530706892046e-08,
"logits/chosen": -1.803637146949768,
"logits/rejected": -1.8033443689346313,
"logps/chosen": -11.866365432739258,
"logps/rejected": -20.201677322387695,
"loss": 0.5503,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.39479631185531616,
"rewards/margins": 0.3996947705745697,
"rewards/rejected": -0.7944909930229187,
"step": 420
},
{
"epoch": 0.99,
"learning_rate": 3.2745595586319843e-09,
"logits/chosen": -1.6975727081298828,
"logits/rejected": -1.6941413879394531,
"logps/chosen": -12.299293518066406,
"logps/rejected": -19.104501724243164,
"loss": 0.5885,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.45436277985572815,
"rewards/margins": 0.2971551716327667,
"rewards/rejected": -0.7515180110931396,
"step": 430
},
{
"epoch": 1.0,
"step": 434,
"total_flos": 1.0192399979927962e+17,
"train_loss": 0.6225535166428385,
"train_runtime": 3531.6202,
"train_samples_per_second": 0.983,
"train_steps_per_second": 0.123
}
],
"logging_steps": 10,
"max_steps": 434,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 1.0192399979927962e+17,
"trial_name": null,
"trial_params": null
}