zephyr-7b-dpo-full_lr1e-7 / trainer_state.json
dlibf's picture
Model save
2889bd7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 2.083333333333333e-09,
"logits/chosen": -2.322427272796631,
"logits/rejected": -2.1875603199005127,
"logps/chosen": -316.933837890625,
"logps/rejected": -257.42218017578125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 2.0833333333333335e-08,
"logits/chosen": -2.3630495071411133,
"logits/rejected": -2.31345272064209,
"logps/chosen": -246.93641662597656,
"logps/rejected": -213.21914672851562,
"loss": 0.6932,
"rewards/accuracies": 0.3819444477558136,
"rewards/chosen": -0.00016181336832232773,
"rewards/margins": -0.0003163775254506618,
"rewards/rejected": 0.00015456414257641882,
"step": 10
},
{
"epoch": 0.04,
"learning_rate": 4.166666666666667e-08,
"logits/chosen": -2.3567471504211426,
"logits/rejected": -2.3047866821289062,
"logps/chosen": -271.8526611328125,
"logps/rejected": -246.7681884765625,
"loss": 0.6932,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 5.23998387507163e-05,
"rewards/margins": 0.0003248125431127846,
"rewards/rejected": -0.0002724127843976021,
"step": 20
},
{
"epoch": 0.06,
"learning_rate": 6.25e-08,
"logits/chosen": -2.4557652473449707,
"logits/rejected": -2.36832332611084,
"logps/chosen": -281.60369873046875,
"logps/rejected": -273.43359375,
"loss": 0.693,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0001846836385084316,
"rewards/margins": 0.0006770413601770997,
"rewards/rejected": -0.0004923577653244138,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 8.333333333333334e-08,
"logits/chosen": -2.3590247631073,
"logits/rejected": -2.282857894897461,
"logps/chosen": -292.39532470703125,
"logps/rejected": -267.86248779296875,
"loss": 0.6924,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0016343205934390426,
"rewards/margins": 0.0015354175120592117,
"rewards/rejected": 9.890317596727982e-05,
"step": 40
},
{
"epoch": 0.1,
"learning_rate": 9.999466228837449e-08,
"logits/chosen": -2.364659070968628,
"logits/rejected": -2.304342269897461,
"logps/chosen": -307.89849853515625,
"logps/rejected": -300.7413330078125,
"loss": 0.6912,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0037586174439638853,
"rewards/margins": 0.0024325354024767876,
"rewards/rejected": 0.0013260821579024196,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 9.980796201712733e-08,
"logits/chosen": -2.3100428581237793,
"logits/rejected": -2.2260587215423584,
"logps/chosen": -253.73074340820312,
"logps/rejected": -225.9529266357422,
"loss": 0.689,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0078201899304986,
"rewards/margins": 0.007128429599106312,
"rewards/rejected": 0.0006917613791301847,
"step": 60
},
{
"epoch": 0.15,
"learning_rate": 9.935551471796358e-08,
"logits/chosen": -2.366241455078125,
"logits/rejected": -2.286762237548828,
"logps/chosen": -277.2716369628906,
"logps/rejected": -247.9556121826172,
"loss": 0.687,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.013750630430877209,
"rewards/margins": 0.012628299184143543,
"rewards/rejected": 0.0011223324108868837,
"step": 70
},
{
"epoch": 0.17,
"learning_rate": 9.863973439298597e-08,
"logits/chosen": -2.3527064323425293,
"logits/rejected": -2.297341823577881,
"logps/chosen": -265.49774169921875,
"logps/rejected": -272.723876953125,
"loss": 0.6841,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.02027452178299427,
"rewards/margins": 0.014950519427657127,
"rewards/rejected": 0.005324001424014568,
"step": 80
},
{
"epoch": 0.19,
"learning_rate": 9.766444003992702e-08,
"logits/chosen": -2.3454525470733643,
"logits/rejected": -2.252537727355957,
"logps/chosen": -262.5143127441406,
"logps/rejected": -250.82666015625,
"loss": 0.6804,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.025243505835533142,
"rewards/margins": 0.027552824467420578,
"rewards/rejected": -0.0023093195632100105,
"step": 90
},
{
"epoch": 0.21,
"learning_rate": 9.643483527614371e-08,
"logits/chosen": -2.3724794387817383,
"logits/rejected": -2.27921724319458,
"logps/chosen": -279.7447204589844,
"logps/rejected": -261.43951416015625,
"loss": 0.6773,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.033204130828380585,
"rewards/margins": 0.02967449091374874,
"rewards/rejected": 0.0035296380519866943,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -2.403949499130249,
"eval_logits/rejected": -2.356935739517212,
"eval_logps/chosen": -259.5997314453125,
"eval_logps/rejected": -262.78692626953125,
"eval_loss": 0.6766896843910217,
"eval_rewards/accuracies": 0.69140625,
"eval_rewards/chosen": 0.028116336092352867,
"eval_rewards/margins": 0.03584778681397438,
"eval_rewards/rejected": -0.007731448858976364,
"eval_runtime": 126.1594,
"eval_samples_per_second": 15.853,
"eval_steps_per_second": 0.254,
"step": 100
},
{
"epoch": 0.23,
"learning_rate": 9.495748057506749e-08,
"logits/chosen": -2.4030067920684814,
"logits/rejected": -2.322254180908203,
"logps/chosen": -312.4465637207031,
"logps/rejected": -271.04425048828125,
"loss": 0.6742,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.030206363648176193,
"rewards/margins": 0.04333298280835152,
"rewards/rejected": -0.0131266163662076,
"step": 110
},
{
"epoch": 0.25,
"learning_rate": 9.324025826323994e-08,
"logits/chosen": -2.3183374404907227,
"logits/rejected": -2.2286434173583984,
"logps/chosen": -277.7622985839844,
"logps/rejected": -234.4459686279297,
"loss": 0.6678,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.009455936029553413,
"rewards/margins": 0.05889623612165451,
"rewards/rejected": -0.04944029822945595,
"step": 120
},
{
"epoch": 0.27,
"learning_rate": 9.12923304646902e-08,
"logits/chosen": -2.3176040649414062,
"logits/rejected": -2.2900288105010986,
"logps/chosen": -262.83428955078125,
"logps/rejected": -294.60919189453125,
"loss": 0.6682,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.00789323914796114,
"rewards/margins": 0.05458872765302658,
"rewards/rejected": -0.062481969594955444,
"step": 130
},
{
"epoch": 0.29,
"learning_rate": 8.912409021703912e-08,
"logits/chosen": -2.4203720092773438,
"logits/rejected": -2.3540279865264893,
"logps/chosen": -292.56103515625,
"logps/rejected": -261.5688171386719,
"loss": 0.6617,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.0033874516375362873,
"rewards/margins": 0.07644981890916824,
"rewards/rejected": -0.07306236028671265,
"step": 140
},
{
"epoch": 0.31,
"learning_rate": 8.67471060201467e-08,
"logits/chosen": -2.361544609069824,
"logits/rejected": -2.2634243965148926,
"logps/chosen": -287.91455078125,
"logps/rejected": -245.2080078125,
"loss": 0.6515,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.019349873065948486,
"rewards/margins": 0.09038561582565308,
"rewards/rejected": -0.10973550379276276,
"step": 150
},
{
"epoch": 0.33,
"learning_rate": 8.417406011315998e-08,
"logits/chosen": -2.306546449661255,
"logits/rejected": -2.24609375,
"logps/chosen": -287.05523681640625,
"logps/rejected": -282.81243896484375,
"loss": 0.6442,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.023438425734639168,
"rewards/margins": 0.12676861882209778,
"rewards/rejected": -0.150207057595253,
"step": 160
},
{
"epoch": 0.36,
"learning_rate": 8.141868080927996e-08,
"logits/chosen": -2.3783812522888184,
"logits/rejected": -2.2990164756774902,
"logps/chosen": -247.52914428710938,
"logps/rejected": -240.2356719970703,
"loss": 0.6507,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.01242762990295887,
"rewards/margins": 0.09276925027370453,
"rewards/rejected": -0.10519689321517944,
"step": 170
},
{
"epoch": 0.38,
"learning_rate": 7.849566924927082e-08,
"logits/chosen": -2.4011363983154297,
"logits/rejected": -2.3967764377593994,
"logps/chosen": -297.4763488769531,
"logps/rejected": -298.1854553222656,
"loss": 0.6404,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.04231880232691765,
"rewards/margins": 0.14232759177684784,
"rewards/rejected": -0.18464641273021698,
"step": 180
},
{
"epoch": 0.4,
"learning_rate": 7.542062096451305e-08,
"logits/chosen": -2.433202028274536,
"logits/rejected": -2.340076446533203,
"logps/chosen": -288.4952697753906,
"logps/rejected": -292.8149719238281,
"loss": 0.6296,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.06186067312955856,
"rewards/margins": 0.15901610255241394,
"rewards/rejected": -0.2208767831325531,
"step": 190
},
{
"epoch": 0.42,
"learning_rate": 7.22099426680959e-08,
"logits/chosen": -2.3200743198394775,
"logits/rejected": -2.2398030757904053,
"logps/chosen": -289.97857666015625,
"logps/rejected": -268.5564270019531,
"loss": 0.6286,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.10072056204080582,
"rewards/margins": 0.14450570940971375,
"rewards/rejected": -0.24522623419761658,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": -2.4022796154022217,
"eval_logits/rejected": -2.3535399436950684,
"eval_logps/chosen": -267.7723693847656,
"eval_logps/rejected": -285.21490478515625,
"eval_loss": 0.6292469501495361,
"eval_rewards/accuracies": 0.7109375,
"eval_rewards/chosen": -0.053610093891620636,
"eval_rewards/margins": 0.17840130627155304,
"eval_rewards/rejected": -0.23201137781143188,
"eval_runtime": 125.7898,
"eval_samples_per_second": 15.9,
"eval_steps_per_second": 0.254,
"step": 200
},
{
"epoch": 0.44,
"learning_rate": 6.888076471790423e-08,
"logits/chosen": -2.3633980751037598,
"logits/rejected": -2.2721266746520996,
"logps/chosen": -278.57403564453125,
"logps/rejected": -257.5547180175781,
"loss": 0.6275,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.019214138388633728,
"rewards/margins": 0.17581240832805634,
"rewards/rejected": -0.19502654671669006,
"step": 210
},
{
"epoch": 0.46,
"learning_rate": 6.545084971874738e-08,
"logits/chosen": -2.2926197052001953,
"logits/rejected": -2.265472650527954,
"logps/chosen": -281.14739990234375,
"logps/rejected": -269.4004821777344,
"loss": 0.6304,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.06231166049838066,
"rewards/margins": 0.19198919832706451,
"rewards/rejected": -0.2543008327484131,
"step": 220
},
{
"epoch": 0.48,
"learning_rate": 6.193849775117709e-08,
"logits/chosen": -2.2452166080474854,
"logits/rejected": -2.2042713165283203,
"logps/chosen": -279.60284423828125,
"logps/rejected": -316.6097717285156,
"loss": 0.6223,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.08384998142719269,
"rewards/margins": 0.22490167617797852,
"rewards/rejected": -0.3087516725063324,
"step": 230
},
{
"epoch": 0.5,
"learning_rate": 5.836244873263989e-08,
"logits/chosen": -2.2598187923431396,
"logits/rejected": -2.182457685470581,
"logps/chosen": -253.05990600585938,
"logps/rejected": -265.4845275878906,
"loss": 0.6227,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.039119161665439606,
"rewards/margins": 0.20275244116783142,
"rewards/rejected": -0.24187159538269043,
"step": 240
},
{
"epoch": 0.52,
"learning_rate": 5.474178243190913e-08,
"logits/chosen": -2.3356399536132812,
"logits/rejected": -2.2523715496063232,
"logps/chosen": -272.73431396484375,
"logps/rejected": -282.30047607421875,
"loss": 0.6136,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.065894715487957,
"rewards/margins": 0.19611124694347382,
"rewards/rejected": -0.2620059847831726,
"step": 250
},
{
"epoch": 0.54,
"learning_rate": 5.10958166702634e-08,
"logits/chosen": -2.310091018676758,
"logits/rejected": -2.2063992023468018,
"logps/chosen": -281.9244384765625,
"logps/rejected": -267.5443420410156,
"loss": 0.6078,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.0009437998523935676,
"rewards/margins": 0.26818323135375977,
"rewards/rejected": -0.2691270709037781,
"step": 260
},
{
"epoch": 0.56,
"learning_rate": 4.744400425255165e-08,
"logits/chosen": -2.290621519088745,
"logits/rejected": -2.2106773853302,
"logps/chosen": -281.05450439453125,
"logps/rejected": -294.99713134765625,
"loss": 0.61,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.0785941556096077,
"rewards/margins": 0.2625763416290283,
"rewards/rejected": -0.34117045998573303,
"step": 270
},
{
"epoch": 0.59,
"learning_rate": 4.3805829178062595e-08,
"logits/chosen": -2.3119542598724365,
"logits/rejected": -2.2208571434020996,
"logps/chosen": -286.83673095703125,
"logps/rejected": -274.24700927734375,
"loss": 0.611,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.13735605776309967,
"rewards/margins": 0.2302558869123459,
"rewards/rejected": -0.36761194467544556,
"step": 280
},
{
"epoch": 0.61,
"learning_rate": 4.020070268495843e-08,
"logits/chosen": -2.2847678661346436,
"logits/rejected": -2.236466884613037,
"logps/chosen": -294.72039794921875,
"logps/rejected": -302.5251770019531,
"loss": 0.6138,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.08734361827373505,
"rewards/margins": 0.2090953290462494,
"rewards/rejected": -0.29643893241882324,
"step": 290
},
{
"epoch": 0.63,
"learning_rate": 3.6647859682920356e-08,
"logits/chosen": -2.302154064178467,
"logits/rejected": -2.2005538940429688,
"logps/chosen": -304.13336181640625,
"logps/rejected": -333.745849609375,
"loss": 0.6161,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.08611747622489929,
"rewards/margins": 0.18742723762989044,
"rewards/rejected": -0.27354469895362854,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": -2.3264002799987793,
"eval_logits/rejected": -2.2759318351745605,
"eval_logps/chosen": -270.8907470703125,
"eval_logps/rejected": -295.5617370605469,
"eval_loss": 0.606550931930542,
"eval_rewards/accuracies": 0.71875,
"eval_rewards/chosen": -0.08479367196559906,
"eval_rewards/margins": 0.25068604946136475,
"eval_rewards/rejected": -0.335479736328125,
"eval_runtime": 126.491,
"eval_samples_per_second": 15.811,
"eval_steps_per_second": 0.253,
"step": 300
},
{
"epoch": 0.65,
"learning_rate": 3.316625612658315e-08,
"logits/chosen": -2.3386969566345215,
"logits/rejected": -2.2128827571868896,
"logps/chosen": -283.69805908203125,
"logps/rejected": -254.1611328125,
"loss": 0.6035,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.10745751857757568,
"rewards/margins": 0.25038108229637146,
"rewards/rejected": -0.35783863067626953,
"step": 310
},
{
"epoch": 0.67,
"learning_rate": 2.9774467877315317e-08,
"logits/chosen": -2.2364108562469482,
"logits/rejected": -2.154195785522461,
"logps/chosen": -254.5322723388672,
"logps/rejected": -277.83294677734375,
"loss": 0.5964,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.08545304834842682,
"rewards/margins": 0.3306066691875458,
"rewards/rejected": -0.4160597324371338,
"step": 320
},
{
"epoch": 0.69,
"learning_rate": 2.6490591592961574e-08,
"logits/chosen": -2.321842908859253,
"logits/rejected": -2.27044677734375,
"logps/chosen": -293.48895263671875,
"logps/rejected": -288.6790466308594,
"loss": 0.5983,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.08705135434865952,
"rewards/margins": 0.27239271998405457,
"rewards/rejected": -0.3594440817832947,
"step": 330
},
{
"epoch": 0.71,
"learning_rate": 2.3332148174343254e-08,
"logits/chosen": -2.2017993927001953,
"logits/rejected": -2.1690361499786377,
"logps/chosen": -269.119873046875,
"logps/rejected": -278.8016662597656,
"loss": 0.6085,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09265842288732529,
"rewards/margins": 0.21494929492473602,
"rewards/rejected": -0.3076077103614807,
"step": 340
},
{
"epoch": 0.73,
"learning_rate": 2.031598928367147e-08,
"logits/chosen": -2.225857734680176,
"logits/rejected": -2.1034350395202637,
"logps/chosen": -263.73974609375,
"logps/rejected": -271.5313415527344,
"loss": 0.5993,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.1507432460784912,
"rewards/margins": 0.24868826568126678,
"rewards/rejected": -0.3994315266609192,
"step": 350
},
{
"epoch": 0.75,
"learning_rate": 1.7458207433638223e-08,
"logits/chosen": -2.311649799346924,
"logits/rejected": -2.1799566745758057,
"logps/chosen": -327.11041259765625,
"logps/rejected": -300.2535095214844,
"loss": 0.6032,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.09650352597236633,
"rewards/margins": 0.29801806807518005,
"rewards/rejected": -0.394521564245224,
"step": 360
},
{
"epoch": 0.77,
"learning_rate": 1.4774050126898163e-08,
"logits/chosen": -2.3074421882629395,
"logits/rejected": -2.187077283859253,
"logps/chosen": -319.43560791015625,
"logps/rejected": -296.7154846191406,
"loss": 0.5996,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.09320107102394104,
"rewards/margins": 0.314532995223999,
"rewards/rejected": -0.40773409605026245,
"step": 370
},
{
"epoch": 0.79,
"learning_rate": 1.2277838504044869e-08,
"logits/chosen": -2.2485527992248535,
"logits/rejected": -2.1728129386901855,
"logps/chosen": -296.4371643066406,
"logps/rejected": -318.8984680175781,
"loss": 0.5896,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.1758451759815216,
"rewards/margins": 0.26139548420906067,
"rewards/rejected": -0.4372406005859375,
"step": 380
},
{
"epoch": 0.82,
"learning_rate": 9.982890934129379e-09,
"logits/chosen": -2.2180557250976562,
"logits/rejected": -2.164605140686035,
"logps/chosen": -286.80487060546875,
"logps/rejected": -304.9507141113281,
"loss": 0.603,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.18554511666297913,
"rewards/margins": 0.17823219299316406,
"rewards/rejected": -0.3637773096561432,
"step": 390
},
{
"epoch": 0.84,
"learning_rate": 7.901451955398791e-09,
"logits/chosen": -2.188835382461548,
"logits/rejected": -2.1698737144470215,
"logps/chosen": -331.4312744140625,
"logps/rejected": -333.14471435546875,
"loss": 0.5908,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.13692975044250488,
"rewards/margins": 0.26922523975372314,
"rewards/rejected": -0.40615496039390564,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": -2.3026418685913086,
"eval_logits/rejected": -2.251879930496216,
"eval_logps/chosen": -272.7594299316406,
"eval_logps/rejected": -300.471435546875,
"eval_loss": 0.6002275347709656,
"eval_rewards/accuracies": 0.72265625,
"eval_rewards/chosen": -0.10348068922758102,
"eval_rewards/margins": 0.2810956835746765,
"eval_rewards/rejected": -0.38457638025283813,
"eval_runtime": 129.5587,
"eval_samples_per_second": 15.437,
"eval_steps_per_second": 0.247,
"step": 400
},
{
"epoch": 0.86,
"learning_rate": 6.044626945386894e-09,
"logits/chosen": -2.229055881500244,
"logits/rejected": -2.1533703804016113,
"logps/chosen": -271.81182861328125,
"logps/rejected": -285.35748291015625,
"loss": 0.61,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.13526737689971924,
"rewards/margins": 0.255347341299057,
"rewards/rejected": -0.39061471819877625,
"step": 410
},
{
"epoch": 0.88,
"learning_rate": 4.422322868919937e-09,
"logits/chosen": -2.2021594047546387,
"logits/rejected": -2.1868810653686523,
"logps/chosen": -285.3330383300781,
"logps/rejected": -319.97686767578125,
"loss": 0.6048,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.09117873013019562,
"rewards/margins": 0.24966660141944885,
"rewards/rejected": -0.34084534645080566,
"step": 420
},
{
"epoch": 0.9,
"learning_rate": 3.043195420172878e-09,
"logits/chosen": -2.312375545501709,
"logits/rejected": -2.2687058448791504,
"logps/chosen": -306.77484130859375,
"logps/rejected": -292.4010314941406,
"loss": 0.6081,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.10561072826385498,
"rewards/margins": 0.275703489780426,
"rewards/rejected": -0.381314218044281,
"step": 430
},
{
"epoch": 0.92,
"learning_rate": 1.914602840795848e-09,
"logits/chosen": -2.260124921798706,
"logits/rejected": -2.2002854347229004,
"logps/chosen": -271.2167053222656,
"logps/rejected": -286.7249755859375,
"loss": 0.6024,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.13646569848060608,
"rewards/margins": 0.19637010991573334,
"rewards/rejected": -0.3328357934951782,
"step": 440
},
{
"epoch": 0.94,
"learning_rate": 1.0425666605112514e-09,
"logits/chosen": -2.1782705783843994,
"logits/rejected": -2.1199605464935303,
"logps/chosen": -253.34414672851562,
"logps/rejected": -260.60638427734375,
"loss": 0.6119,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.09690654277801514,
"rewards/margins": 0.21923665702342987,
"rewards/rejected": -0.3161432147026062,
"step": 450
},
{
"epoch": 0.96,
"learning_rate": 4.317395696473214e-10,
"logits/chosen": -2.216618299484253,
"logits/rejected": -2.1176908016204834,
"logps/chosen": -253.55648803710938,
"logps/rejected": -267.17169189453125,
"loss": 0.593,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.1348879039287567,
"rewards/margins": 0.2610943615436554,
"rewards/rejected": -0.3959822356700897,
"step": 460
},
{
"epoch": 0.98,
"learning_rate": 8.538059502214978e-11,
"logits/chosen": -2.3197951316833496,
"logits/rejected": -2.164881706237793,
"logps/chosen": -302.24468994140625,
"logps/rejected": -304.28656005859375,
"loss": 0.588,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.046127352863550186,
"rewards/margins": 0.3802485764026642,
"rewards/rejected": -0.42637595534324646,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 0.6323943896273688,
"train_runtime": 8575.3767,
"train_samples_per_second": 7.129,
"train_steps_per_second": 0.056
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}