dpo-orca-m4 / trainer_state.json
Dapinsky's picture
phi 2 dpo on orca and m4
a430fee verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.6058631921824107,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013029315960912053,
"grad_norm": 41.75,
"learning_rate": 1.6666666666666667e-06,
"logits/chosen": 0.4338657259941101,
"logits/rejected": 0.4453325867652893,
"logps/chosen": -67.76948547363281,
"logps/rejected": -152.9691162109375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.026058631921824105,
"grad_norm": 36.25,
"learning_rate": 3.3333333333333333e-06,
"logits/chosen": 0.3402215540409088,
"logits/rejected": 0.3878844380378723,
"logps/chosen": -98.9161148071289,
"logps/rejected": -155.82638549804688,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.03908794788273615,
"grad_norm": 64.5,
"learning_rate": 5e-06,
"logits/chosen": 0.38514813780784607,
"logits/rejected": 0.36703822016716003,
"logps/chosen": -93.1368408203125,
"logps/rejected": -161.52493286132812,
"loss": 0.6983,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0311676524579525,
"rewards/margins": -0.002570953220129013,
"rewards/rejected": -0.028596699237823486,
"step": 3
},
{
"epoch": 0.05211726384364821,
"grad_norm": 29.875,
"learning_rate": 6.666666666666667e-06,
"logits/chosen": 0.4961632192134857,
"logits/rejected": 0.49073392152786255,
"logps/chosen": -94.36677551269531,
"logps/rejected": -176.82952880859375,
"loss": 0.7247,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.008490505628287792,
"rewards/margins": -0.055457405745983124,
"rewards/rejected": 0.04696689918637276,
"step": 4
},
{
"epoch": 0.06514657980456026,
"grad_norm": 27.5,
"learning_rate": 8.333333333333334e-06,
"logits/chosen": 0.3893408179283142,
"logits/rejected": 0.41501885652542114,
"logps/chosen": -91.56944274902344,
"logps/rejected": -141.12969970703125,
"loss": 0.6805,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.02342848852276802,
"rewards/margins": 0.03252270072698593,
"rewards/rejected": -0.009094213135540485,
"step": 5
},
{
"epoch": 0.0781758957654723,
"grad_norm": 28.375,
"learning_rate": 1e-05,
"logits/chosen": 0.4950886070728302,
"logits/rejected": 0.5048765540122986,
"logps/chosen": -79.60177612304688,
"logps/rejected": -174.52386474609375,
"loss": 0.6915,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0013483259826898575,
"rewards/margins": 0.008842225186526775,
"rewards/rejected": -0.007493901532143354,
"step": 6
},
{
"epoch": 0.09120521172638436,
"grad_norm": 44.25,
"learning_rate": 1.1666666666666668e-05,
"logits/chosen": 0.3866894543170929,
"logits/rejected": 0.4369007349014282,
"logps/chosen": -73.19027709960938,
"logps/rejected": -144.08810424804688,
"loss": 0.7116,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.015920385718345642,
"rewards/margins": -0.026944227516651154,
"rewards/rejected": 0.011023844592273235,
"step": 7
},
{
"epoch": 0.10423452768729642,
"grad_norm": 63.75,
"learning_rate": 1.3333333333333333e-05,
"logits/chosen": 0.45976200699806213,
"logits/rejected": 0.426272988319397,
"logps/chosen": -71.57977294921875,
"logps/rejected": -137.3433074951172,
"loss": 0.6707,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.01595836505293846,
"rewards/margins": 0.04949212074279785,
"rewards/rejected": -0.03353375196456909,
"step": 8
},
{
"epoch": 0.11726384364820847,
"grad_norm": 54.75,
"learning_rate": 1.5e-05,
"logits/chosen": 0.49033746123313904,
"logits/rejected": 0.48075181245803833,
"logps/chosen": -91.1353759765625,
"logps/rejected": -167.73594665527344,
"loss": 0.6547,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.04745086282491684,
"rewards/margins": 0.08511507511138916,
"rewards/rejected": -0.03766421973705292,
"step": 9
},
{
"epoch": 0.13029315960912052,
"grad_norm": 33.5,
"learning_rate": 1.6666666666666667e-05,
"logits/chosen": 0.5154792070388794,
"logits/rejected": 0.4838900566101074,
"logps/chosen": -96.14872741699219,
"logps/rejected": -157.02932739257812,
"loss": 0.6958,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.024153033271431923,
"rewards/margins": 0.006197445094585419,
"rewards/rejected": 0.017955590039491653,
"step": 10
},
{
"epoch": 0.14332247557003258,
"grad_norm": 35.0,
"learning_rate": 1.8333333333333333e-05,
"logits/chosen": 0.45827457308769226,
"logits/rejected": 0.5124724507331848,
"logps/chosen": -93.97823333740234,
"logps/rejected": -138.24327087402344,
"loss": 0.699,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.01763225719332695,
"rewards/margins": 0.0005271416157484055,
"rewards/rejected": -0.018159402534365654,
"step": 11
},
{
"epoch": 0.1563517915309446,
"grad_norm": 34.5,
"learning_rate": 2e-05,
"logits/chosen": 0.4826943874359131,
"logits/rejected": 0.43963971734046936,
"logps/chosen": -98.74089050292969,
"logps/rejected": -145.690185546875,
"loss": 0.7101,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.030411405488848686,
"rewards/margins": -0.028361458331346512,
"rewards/rejected": -0.0020499457605183125,
"step": 12
},
{
"epoch": 0.16938110749185667,
"grad_norm": 33.25,
"learning_rate": 2.1666666666666667e-05,
"logits/chosen": 0.384093314409256,
"logits/rejected": 0.4154108166694641,
"logps/chosen": -110.437744140625,
"logps/rejected": -170.55215454101562,
"loss": 0.7018,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.000756765715777874,
"rewards/margins": -0.005527975037693977,
"rewards/rejected": 0.004771207459270954,
"step": 13
},
{
"epoch": 0.18241042345276873,
"grad_norm": 32.0,
"learning_rate": 2.3333333333333336e-05,
"logits/chosen": 0.3536284565925598,
"logits/rejected": 0.4306492209434509,
"logps/chosen": -87.72677612304688,
"logps/rejected": -135.49493408203125,
"loss": 0.7118,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.030064944177865982,
"rewards/margins": -0.03109516017138958,
"rewards/rejected": 0.001030217856168747,
"step": 14
},
{
"epoch": 0.19543973941368079,
"grad_norm": 32.5,
"learning_rate": 2.5e-05,
"logits/chosen": 0.4092313051223755,
"logits/rejected": 0.5090660452842712,
"logps/chosen": -95.63008117675781,
"logps/rejected": -135.93472290039062,
"loss": 0.6946,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0016081184148788452,
"rewards/margins": 0.002313855104148388,
"rewards/rejected": -0.003921976778656244,
"step": 15
},
{
"epoch": 0.20846905537459284,
"grad_norm": 33.25,
"learning_rate": 2.6666666666666667e-05,
"logits/chosen": 0.4373230040073395,
"logits/rejected": 0.5158215761184692,
"logps/chosen": -115.45347595214844,
"logps/rejected": -160.17929077148438,
"loss": 0.6503,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.015219582244753838,
"rewards/margins": 0.10696868598461151,
"rewards/rejected": -0.09174911677837372,
"step": 16
},
{
"epoch": 0.22149837133550487,
"grad_norm": 31.25,
"learning_rate": 2.8333333333333335e-05,
"logits/chosen": 0.5184516906738281,
"logits/rejected": 0.5677393674850464,
"logps/chosen": -128.66629028320312,
"logps/rejected": -172.19888305664062,
"loss": 0.635,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.0017459085211157799,
"rewards/margins": 0.12914448976516724,
"rewards/rejected": -0.1308903992176056,
"step": 17
},
{
"epoch": 0.23452768729641693,
"grad_norm": 60.5,
"learning_rate": 3e-05,
"logits/chosen": 0.43745332956314087,
"logits/rejected": 0.4682745337486267,
"logps/chosen": -108.17106628417969,
"logps/rejected": -155.61282348632812,
"loss": 0.6391,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.010563232935965061,
"rewards/margins": 0.1288895308971405,
"rewards/rejected": -0.13945278525352478,
"step": 18
},
{
"epoch": 0.247557003257329,
"grad_norm": 28.375,
"learning_rate": 3.1666666666666666e-05,
"logits/chosen": 0.4536093473434448,
"logits/rejected": 0.4597874581813812,
"logps/chosen": -80.29083251953125,
"logps/rejected": -146.64483642578125,
"loss": 0.6456,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.004713800735771656,
"rewards/margins": 0.10411291569471359,
"rewards/rejected": -0.10882672667503357,
"step": 19
},
{
"epoch": 0.26058631921824105,
"grad_norm": 50.0,
"learning_rate": 3.3333333333333335e-05,
"logits/chosen": 0.46047478914260864,
"logits/rejected": 0.5494062304496765,
"logps/chosen": -103.00077056884766,
"logps/rejected": -168.70933532714844,
"loss": 0.6503,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.048282139003276825,
"rewards/margins": 0.09549374878406525,
"rewards/rejected": -0.14377588033676147,
"step": 20
},
{
"epoch": 0.2736156351791531,
"grad_norm": 31.0,
"learning_rate": 3.5e-05,
"logits/chosen": 0.5022985935211182,
"logits/rejected": 0.5251904726028442,
"logps/chosen": -82.43826293945312,
"logps/rejected": -148.17120361328125,
"loss": 0.6247,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.02719825878739357,
"rewards/margins": 0.15447314083576202,
"rewards/rejected": -0.12727488577365875,
"step": 21
},
{
"epoch": 0.28664495114006516,
"grad_norm": 30.75,
"learning_rate": 3.6666666666666666e-05,
"logits/chosen": 0.4817676544189453,
"logits/rejected": 0.4860598146915436,
"logps/chosen": -101.01628875732422,
"logps/rejected": -146.12977600097656,
"loss": 0.622,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.027572251856327057,
"rewards/margins": 0.15721869468688965,
"rewards/rejected": -0.1847909688949585,
"step": 22
},
{
"epoch": 0.2996742671009772,
"grad_norm": 21.75,
"learning_rate": 3.8333333333333334e-05,
"logits/chosen": 0.48463064432144165,
"logits/rejected": 0.5631467700004578,
"logps/chosen": -81.53482055664062,
"logps/rejected": -135.9483184814453,
"loss": 0.5766,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.021168498322367668,
"rewards/margins": 0.2705130875110626,
"rewards/rejected": -0.2493445873260498,
"step": 23
},
{
"epoch": 0.3127035830618892,
"grad_norm": 35.25,
"learning_rate": 4e-05,
"logits/chosen": 0.38634905219078064,
"logits/rejected": 0.42648378014564514,
"logps/chosen": -97.1165771484375,
"logps/rejected": -161.6883087158203,
"loss": 0.5806,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.008925480768084526,
"rewards/margins": 0.2537250518798828,
"rewards/rejected": -0.2626505196094513,
"step": 24
},
{
"epoch": 0.3257328990228013,
"grad_norm": 27.25,
"learning_rate": 4.166666666666667e-05,
"logits/chosen": 0.41833925247192383,
"logits/rejected": 0.4584392011165619,
"logps/chosen": -89.66869354248047,
"logps/rejected": -150.55813598632812,
"loss": 0.5952,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.019657809287309647,
"rewards/margins": 0.21433238685131073,
"rewards/rejected": -0.23399019241333008,
"step": 25
},
{
"epoch": 0.33876221498371334,
"grad_norm": 50.5,
"learning_rate": 4.3333333333333334e-05,
"logits/chosen": 0.46740618348121643,
"logits/rejected": 0.4832380712032318,
"logps/chosen": -62.494773864746094,
"logps/rejected": -146.53067016601562,
"loss": 0.5411,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0009484302718192339,
"rewards/margins": 0.3466818928718567,
"rewards/rejected": -0.3476303815841675,
"step": 26
},
{
"epoch": 0.3517915309446254,
"grad_norm": 23.0,
"learning_rate": 4.5e-05,
"logits/chosen": 0.45530009269714355,
"logits/rejected": 0.5172832012176514,
"logps/chosen": -85.00700378417969,
"logps/rejected": -136.05020141601562,
"loss": 0.5479,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.001965973526239395,
"rewards/margins": 0.34270864725112915,
"rewards/rejected": -0.34467458724975586,
"step": 27
},
{
"epoch": 0.36482084690553745,
"grad_norm": 23.375,
"learning_rate": 4.666666666666667e-05,
"logits/chosen": 0.46558958292007446,
"logits/rejected": 0.5210444331169128,
"logps/chosen": -105.98873901367188,
"logps/rejected": -163.59945678710938,
"loss": 0.519,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.022590279579162598,
"rewards/margins": 0.4247127175331116,
"rewards/rejected": -0.44730299711227417,
"step": 28
},
{
"epoch": 0.3778501628664495,
"grad_norm": 22.75,
"learning_rate": 4.8333333333333334e-05,
"logits/chosen": 0.4795917868614197,
"logits/rejected": 0.47115039825439453,
"logps/chosen": -107.12705993652344,
"logps/rejected": -142.822509765625,
"loss": 0.5271,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.037488676607608795,
"rewards/margins": 0.3942331075668335,
"rewards/rejected": -0.4317218065261841,
"step": 29
},
{
"epoch": 0.39087947882736157,
"grad_norm": 19.625,
"learning_rate": 5e-05,
"logits/chosen": 0.4289873242378235,
"logits/rejected": 0.5595239996910095,
"logps/chosen": -86.29112243652344,
"logps/rejected": -172.88059997558594,
"loss": 0.459,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.05108689144253731,
"rewards/margins": 0.5967621803283691,
"rewards/rejected": -0.5456752777099609,
"step": 30
},
{
"epoch": 0.40390879478827363,
"grad_norm": 24.375,
"learning_rate": 4.993150684931507e-05,
"logits/chosen": 0.39370930194854736,
"logits/rejected": 0.42319971323013306,
"logps/chosen": -102.44596862792969,
"logps/rejected": -169.67660522460938,
"loss": 0.4393,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.0332549586892128,
"rewards/margins": 0.642684817314148,
"rewards/rejected": -0.6759397387504578,
"step": 31
},
{
"epoch": 0.4169381107491857,
"grad_norm": 20.0,
"learning_rate": 4.986301369863014e-05,
"logits/chosen": 0.49218329787254333,
"logits/rejected": 0.5275806784629822,
"logps/chosen": -74.05796813964844,
"logps/rejected": -133.33255004882812,
"loss": 0.4407,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02741517871618271,
"rewards/margins": 0.6402420997619629,
"rewards/rejected": -0.612826943397522,
"step": 32
},
{
"epoch": 0.42996742671009774,
"grad_norm": 24.75,
"learning_rate": 4.979452054794521e-05,
"logits/chosen": 0.35451555252075195,
"logits/rejected": 0.40355199575424194,
"logps/chosen": -104.55900573730469,
"logps/rejected": -151.27711486816406,
"loss": 0.4234,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.023018483072519302,
"rewards/margins": 0.6792783737182617,
"rewards/rejected": -0.6562598943710327,
"step": 33
},
{
"epoch": 0.44299674267100975,
"grad_norm": 17.5,
"learning_rate": 4.972602739726028e-05,
"logits/chosen": 0.40463435649871826,
"logits/rejected": 0.5144488215446472,
"logps/chosen": -72.91780090332031,
"logps/rejected": -145.31849670410156,
"loss": 0.4111,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.02165827713906765,
"rewards/margins": 0.7402617931365967,
"rewards/rejected": -0.7186034917831421,
"step": 34
},
{
"epoch": 0.4560260586319218,
"grad_norm": 16.5,
"learning_rate": 4.9657534246575346e-05,
"logits/chosen": 0.4734452962875366,
"logits/rejected": 0.5330387353897095,
"logps/chosen": -83.89728546142578,
"logps/rejected": -147.41265869140625,
"loss": 0.3853,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.009855479001998901,
"rewards/margins": 0.8149614930152893,
"rewards/rejected": -0.8248169422149658,
"step": 35
},
{
"epoch": 0.46905537459283386,
"grad_norm": 24.75,
"learning_rate": 4.958904109589041e-05,
"logits/chosen": 0.3432111144065857,
"logits/rejected": 0.39720407128334045,
"logps/chosen": -84.57624053955078,
"logps/rejected": -131.17434692382812,
"loss": 0.4056,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0020672655664384365,
"rewards/margins": 0.7789303064346313,
"rewards/rejected": -0.7809975743293762,
"step": 36
},
{
"epoch": 0.4820846905537459,
"grad_norm": 68.5,
"learning_rate": 4.952054794520548e-05,
"logits/chosen": 0.3694133758544922,
"logits/rejected": 0.42799627780914307,
"logps/chosen": -85.02811431884766,
"logps/rejected": -169.74673461914062,
"loss": 0.3145,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.07135964930057526,
"rewards/margins": 1.2428215742111206,
"rewards/rejected": -1.171462059020996,
"step": 37
},
{
"epoch": 0.495114006514658,
"grad_norm": 18.0,
"learning_rate": 4.945205479452055e-05,
"logits/chosen": 0.4724690318107605,
"logits/rejected": 0.5161466598510742,
"logps/chosen": -79.45156860351562,
"logps/rejected": -183.5731201171875,
"loss": 0.281,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08071783930063248,
"rewards/margins": 1.4206629991531372,
"rewards/rejected": -1.3399451971054077,
"step": 38
},
{
"epoch": 0.50814332247557,
"grad_norm": 10.9375,
"learning_rate": 4.938356164383562e-05,
"logits/chosen": 0.570473313331604,
"logits/rejected": 0.5667930841445923,
"logps/chosen": -67.05783081054688,
"logps/rejected": -160.54501342773438,
"loss": 0.2824,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.012804888188838959,
"rewards/margins": 1.2680517435073853,
"rewards/rejected": -1.255246877670288,
"step": 39
},
{
"epoch": 0.5211726384364821,
"grad_norm": 15.0,
"learning_rate": 4.9315068493150684e-05,
"logits/chosen": 0.3750945031642914,
"logits/rejected": 0.5399055480957031,
"logps/chosen": -80.3337631225586,
"logps/rejected": -150.540771484375,
"loss": 0.2555,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09064020216464996,
"rewards/margins": 1.4325942993164062,
"rewards/rejected": -1.3419541120529175,
"step": 40
},
{
"epoch": 0.5342019543973942,
"grad_norm": 17.625,
"learning_rate": 4.9246575342465756e-05,
"logits/chosen": 0.40898123383522034,
"logits/rejected": 0.3948415219783783,
"logps/chosen": -120.64512634277344,
"logps/rejected": -172.23046875,
"loss": 0.2607,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03111358918249607,
"rewards/margins": 1.4379582405090332,
"rewards/rejected": -1.4068448543548584,
"step": 41
},
{
"epoch": 0.5472312703583062,
"grad_norm": 11.0,
"learning_rate": 4.917808219178082e-05,
"logits/chosen": 0.44859111309051514,
"logits/rejected": 0.4527463912963867,
"logps/chosen": -111.03682708740234,
"logps/rejected": -175.25076293945312,
"loss": 0.23,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07664196938276291,
"rewards/margins": 1.6669435501098633,
"rewards/rejected": -1.590301513671875,
"step": 42
},
{
"epoch": 0.5602605863192183,
"grad_norm": 15.75,
"learning_rate": 4.9109589041095895e-05,
"logits/chosen": 0.4859389662742615,
"logits/rejected": 0.5201914310455322,
"logps/chosen": -78.25588989257812,
"logps/rejected": -162.362548828125,
"loss": 0.2227,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0701964795589447,
"rewards/margins": 1.5760339498519897,
"rewards/rejected": -1.5058374404907227,
"step": 43
},
{
"epoch": 0.5732899022801303,
"grad_norm": 12.125,
"learning_rate": 4.904109589041096e-05,
"logits/chosen": 0.5065852403640747,
"logits/rejected": 0.5527216196060181,
"logps/chosen": -78.39152526855469,
"logps/rejected": -183.5028839111328,
"loss": 0.197,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0381561741232872,
"rewards/margins": 1.9460369348526,
"rewards/rejected": -1.9078807830810547,
"step": 44
},
{
"epoch": 0.5863192182410424,
"grad_norm": 15.75,
"learning_rate": 4.8972602739726034e-05,
"logits/chosen": 0.5216741561889648,
"logits/rejected": 0.6273947954177856,
"logps/chosen": -74.12837982177734,
"logps/rejected": -167.24652099609375,
"loss": 0.1831,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13984212279319763,
"rewards/margins": 2.0655643939971924,
"rewards/rejected": -1.9257222414016724,
"step": 45
},
{
"epoch": 0.5993485342019544,
"grad_norm": 15.125,
"learning_rate": 4.89041095890411e-05,
"logits/chosen": 0.5224686861038208,
"logits/rejected": 0.5461165308952332,
"logps/chosen": -101.55109405517578,
"logps/rejected": -163.4028778076172,
"loss": 0.1841,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01885811612010002,
"rewards/margins": 1.9022661447525024,
"rewards/rejected": -1.8834080696105957,
"step": 46
},
{
"epoch": 0.6123778501628665,
"grad_norm": 10.3125,
"learning_rate": 4.8835616438356167e-05,
"logits/chosen": 0.438764363527298,
"logits/rejected": 0.5729016661643982,
"logps/chosen": -73.1627426147461,
"logps/rejected": -153.8181610107422,
"loss": 0.1734,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08434567600488663,
"rewards/margins": 2.0392439365386963,
"rewards/rejected": -1.9548982381820679,
"step": 47
},
{
"epoch": 0.6254071661237784,
"grad_norm": 8.9375,
"learning_rate": 4.876712328767123e-05,
"logits/chosen": 0.40418195724487305,
"logits/rejected": 0.4241870045661926,
"logps/chosen": -143.9720001220703,
"logps/rejected": -195.26536560058594,
"loss": 0.1135,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0051138997077941895,
"rewards/margins": 2.4568700790405273,
"rewards/rejected": -2.461984157562256,
"step": 48
},
{
"epoch": 0.6384364820846905,
"grad_norm": 16.5,
"learning_rate": 4.8698630136986305e-05,
"logits/chosen": 0.5531054735183716,
"logits/rejected": 0.5722475051879883,
"logps/chosen": -80.95619201660156,
"logps/rejected": -174.85643005371094,
"loss": 0.1363,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.057745300233364105,
"rewards/margins": 2.4418563842773438,
"rewards/rejected": -2.3841114044189453,
"step": 49
},
{
"epoch": 0.6514657980456026,
"grad_norm": 6.6875,
"learning_rate": 4.863013698630137e-05,
"logits/chosen": 0.3978479504585266,
"logits/rejected": 0.575504720211029,
"logps/chosen": -111.10527038574219,
"logps/rejected": -194.09478759765625,
"loss": 0.0979,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.024841848760843277,
"rewards/margins": 2.8879756927490234,
"rewards/rejected": -2.9128177165985107,
"step": 50
},
{
"epoch": 0.6514657980456026,
"eval_logits/chosen": 0.40171119570732117,
"eval_logits/rejected": 0.4472416043281555,
"eval_logps/chosen": -94.96456909179688,
"eval_logps/rejected": -177.69801330566406,
"eval_loss": 0.10980458557605743,
"eval_rewards/accuracies": 0.9985119104385376,
"eval_rewards/chosen": 0.048970796167850494,
"eval_rewards/margins": 2.70963716506958,
"eval_rewards/rejected": -2.6606662273406982,
"eval_runtime": 53.1051,
"eval_samples_per_second": 12.635,
"eval_steps_per_second": 0.791,
"step": 50
},
{
"epoch": 0.6644951140065146,
"grad_norm": 9.9375,
"learning_rate": 4.856164383561644e-05,
"logits/chosen": 0.5971242189407349,
"logits/rejected": 0.5052528381347656,
"logps/chosen": -100.87618255615234,
"logps/rejected": -183.73324584960938,
"loss": 0.1258,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.029725002124905586,
"rewards/margins": 2.650700330734253,
"rewards/rejected": -2.6209752559661865,
"step": 51
},
{
"epoch": 0.6775244299674267,
"grad_norm": 9.5,
"learning_rate": 4.849315068493151e-05,
"logits/chosen": 0.46090734004974365,
"logits/rejected": 0.5332375168800354,
"logps/chosen": -83.30604553222656,
"logps/rejected": -198.97483825683594,
"loss": 0.0769,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07736861705780029,
"rewards/margins": 3.2583978176116943,
"rewards/rejected": -3.1810293197631836,
"step": 52
},
{
"epoch": 0.6905537459283387,
"grad_norm": 10.5,
"learning_rate": 4.8424657534246577e-05,
"logits/chosen": 0.5082046985626221,
"logits/rejected": 0.5545482635498047,
"logps/chosen": -96.28477478027344,
"logps/rejected": -153.87228393554688,
"loss": 0.1285,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.049273423850536346,
"rewards/margins": 2.481055974960327,
"rewards/rejected": -2.4317827224731445,
"step": 53
},
{
"epoch": 0.7035830618892508,
"grad_norm": 11.6875,
"learning_rate": 4.835616438356165e-05,
"logits/chosen": 0.4179171621799469,
"logits/rejected": 0.40184441208839417,
"logps/chosen": -138.70870971679688,
"logps/rejected": -198.06478881835938,
"loss": 0.0812,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07567030191421509,
"rewards/margins": 3.1052422523498535,
"rewards/rejected": -3.0295724868774414,
"step": 54
},
{
"epoch": 0.7166123778501629,
"grad_norm": 4.34375,
"learning_rate": 4.8287671232876716e-05,
"logits/chosen": 0.5138372182846069,
"logits/rejected": 0.5542392730712891,
"logps/chosen": -93.45801544189453,
"logps/rejected": -196.15989685058594,
"loss": 0.0619,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008173711597919464,
"rewards/margins": 3.3829448223114014,
"rewards/rejected": -3.3747713565826416,
"step": 55
},
{
"epoch": 0.7296416938110749,
"grad_norm": 5.9375,
"learning_rate": 4.821917808219178e-05,
"logits/chosen": 0.4723089337348938,
"logits/rejected": 0.5142194032669067,
"logps/chosen": -101.18618774414062,
"logps/rejected": -202.30770874023438,
"loss": 0.0698,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.20023450255393982,
"rewards/margins": 3.583833694458008,
"rewards/rejected": -3.7840681076049805,
"step": 56
},
{
"epoch": 0.742671009771987,
"grad_norm": 4.84375,
"learning_rate": 4.815068493150685e-05,
"logits/chosen": 0.6098852157592773,
"logits/rejected": 0.5306227207183838,
"logps/chosen": -92.79605102539062,
"logps/rejected": -194.44285583496094,
"loss": 0.0694,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09434399008750916,
"rewards/margins": 3.6242706775665283,
"rewards/rejected": -3.5299267768859863,
"step": 57
},
{
"epoch": 0.755700325732899,
"grad_norm": 3.09375,
"learning_rate": 4.808219178082192e-05,
"logits/chosen": 0.596287190914154,
"logits/rejected": 0.5526207685470581,
"logps/chosen": -80.8297348022461,
"logps/rejected": -199.17770385742188,
"loss": 0.0424,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16401855647563934,
"rewards/margins": 4.359023094177246,
"rewards/rejected": -4.195004463195801,
"step": 58
},
{
"epoch": 0.7687296416938111,
"grad_norm": 4.78125,
"learning_rate": 4.801369863013699e-05,
"logits/chosen": 0.5375024080276489,
"logits/rejected": 0.5418161153793335,
"logps/chosen": -94.54348754882812,
"logps/rejected": -179.93148803710938,
"loss": 0.065,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02510090172290802,
"rewards/margins": 3.8914875984191895,
"rewards/rejected": -3.866386890411377,
"step": 59
},
{
"epoch": 0.7817589576547231,
"grad_norm": 3.953125,
"learning_rate": 4.794520547945205e-05,
"logits/chosen": 0.5458413362503052,
"logits/rejected": 0.5163211226463318,
"logps/chosen": -102.55235290527344,
"logps/rejected": -192.88011169433594,
"loss": 0.0422,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03571543097496033,
"rewards/margins": 4.136109352111816,
"rewards/rejected": -4.100393772125244,
"step": 60
},
{
"epoch": 0.7947882736156352,
"grad_norm": 3.390625,
"learning_rate": 4.7876712328767126e-05,
"logits/chosen": 0.44991570711135864,
"logits/rejected": 0.47752076387405396,
"logps/chosen": -71.73591613769531,
"logps/rejected": -166.39166259765625,
"loss": 0.0474,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09097965061664581,
"rewards/margins": 3.9125423431396484,
"rewards/rejected": -3.8215625286102295,
"step": 61
},
{
"epoch": 0.8078175895765473,
"grad_norm": 5.4375,
"learning_rate": 4.780821917808219e-05,
"logits/chosen": 0.5184447169303894,
"logits/rejected": 0.49066781997680664,
"logps/chosen": -96.78662109375,
"logps/rejected": -220.57266235351562,
"loss": 0.044,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.07911338657140732,
"rewards/margins": 4.811767101287842,
"rewards/rejected": -4.890880584716797,
"step": 62
},
{
"epoch": 0.8208469055374593,
"grad_norm": 3.0,
"learning_rate": 4.7739726027397265e-05,
"logits/chosen": 0.5503054857254028,
"logits/rejected": 0.7354578971862793,
"logps/chosen": -76.80421447753906,
"logps/rejected": -210.28140258789062,
"loss": 0.0163,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.024801086634397507,
"rewards/margins": 5.572457790374756,
"rewards/rejected": -5.547656536102295,
"step": 63
},
{
"epoch": 0.8338762214983714,
"grad_norm": 3.46875,
"learning_rate": 4.767123287671233e-05,
"logits/chosen": 0.5171054005622864,
"logits/rejected": 0.512793242931366,
"logps/chosen": -131.59396362304688,
"logps/rejected": -217.56964111328125,
"loss": 0.0153,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10994181782007217,
"rewards/margins": 5.379184246063232,
"rewards/rejected": -5.489125728607178,
"step": 64
},
{
"epoch": 0.8469055374592834,
"grad_norm": 4.03125,
"learning_rate": 4.7602739726027403e-05,
"logits/chosen": 0.44678860902786255,
"logits/rejected": 0.5419712662696838,
"logps/chosen": -104.75637817382812,
"logps/rejected": -201.79806518554688,
"loss": 0.0296,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1282435804605484,
"rewards/margins": 4.831494331359863,
"rewards/rejected": -4.959737777709961,
"step": 65
},
{
"epoch": 0.8599348534201955,
"grad_norm": 2.40625,
"learning_rate": 4.753424657534247e-05,
"logits/chosen": 0.582385778427124,
"logits/rejected": 0.6422931551933289,
"logps/chosen": -94.39370727539062,
"logps/rejected": -199.6475830078125,
"loss": 0.0258,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.04259404167532921,
"rewards/margins": 5.368470191955566,
"rewards/rejected": -5.411064147949219,
"step": 66
},
{
"epoch": 0.8729641693811075,
"grad_norm": 2.6875,
"learning_rate": 4.7465753424657536e-05,
"logits/chosen": 0.5766660571098328,
"logits/rejected": 0.6043537855148315,
"logps/chosen": -102.68363952636719,
"logps/rejected": -214.7265625,
"loss": 0.0178,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2821919322013855,
"rewards/margins": 5.887226581573486,
"rewards/rejected": -6.169419288635254,
"step": 67
},
{
"epoch": 0.8859934853420195,
"grad_norm": 1.0859375,
"learning_rate": 4.73972602739726e-05,
"logits/chosen": 0.4715408682823181,
"logits/rejected": 0.5762664079666138,
"logps/chosen": -86.6288070678711,
"logps/rejected": -225.074951171875,
"loss": 0.0129,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10806949436664581,
"rewards/margins": 6.162431716918945,
"rewards/rejected": -6.270501136779785,
"step": 68
},
{
"epoch": 0.8990228013029316,
"grad_norm": 1.71875,
"learning_rate": 4.7328767123287675e-05,
"logits/chosen": 0.613117516040802,
"logits/rejected": 0.5737402439117432,
"logps/chosen": -71.23908996582031,
"logps/rejected": -197.6245880126953,
"loss": 0.0132,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.04037155583500862,
"rewards/margins": 5.633227825164795,
"rewards/rejected": -5.6735992431640625,
"step": 69
},
{
"epoch": 0.9120521172638436,
"grad_norm": 0.98828125,
"learning_rate": 4.726027397260274e-05,
"logits/chosen": 0.6605570912361145,
"logits/rejected": 0.6310275197029114,
"logps/chosen": -123.74465942382812,
"logps/rejected": -249.78793334960938,
"loss": 0.0076,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3881164789199829,
"rewards/margins": 7.08071231842041,
"rewards/rejected": -7.4688286781311035,
"step": 70
},
{
"epoch": 0.9250814332247557,
"grad_norm": 23.375,
"learning_rate": 4.719178082191781e-05,
"logits/chosen": 0.5911487936973572,
"logits/rejected": 0.6923888325691223,
"logps/chosen": -161.05184936523438,
"logps/rejected": -264.648193359375,
"loss": 0.058,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.7978758811950684,
"rewards/margins": 8.156850814819336,
"rewards/rejected": -8.954728126525879,
"step": 71
},
{
"epoch": 0.9381107491856677,
"grad_norm": 4.5625,
"learning_rate": 4.712328767123288e-05,
"logits/chosen": 0.6496680378913879,
"logits/rejected": 0.6733301281929016,
"logps/chosen": -121.81378173828125,
"logps/rejected": -239.56304931640625,
"loss": 0.0129,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4053517282009125,
"rewards/margins": 7.532309532165527,
"rewards/rejected": -7.937661647796631,
"step": 72
},
{
"epoch": 0.9511400651465798,
"grad_norm": 17.625,
"learning_rate": 4.7054794520547946e-05,
"logits/chosen": 0.5184324383735657,
"logits/rejected": 0.6415278911590576,
"logps/chosen": -105.58231353759766,
"logps/rejected": -222.8607940673828,
"loss": 0.0298,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.48813995718955994,
"rewards/margins": 7.293839454650879,
"rewards/rejected": -7.781979560852051,
"step": 73
},
{
"epoch": 0.9641693811074918,
"grad_norm": 0.5234375,
"learning_rate": 4.698630136986302e-05,
"logits/chosen": 0.5843162536621094,
"logits/rejected": 0.5905658602714539,
"logps/chosen": -100.66535949707031,
"logps/rejected": -242.2615203857422,
"loss": 0.006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5956183075904846,
"rewards/margins": 7.831192493438721,
"rewards/rejected": -8.426811218261719,
"step": 74
},
{
"epoch": 0.9771986970684039,
"grad_norm": 0.90625,
"learning_rate": 4.6917808219178085e-05,
"logits/chosen": 0.6023251414299011,
"logits/rejected": 0.6175463199615479,
"logps/chosen": -74.83623504638672,
"logps/rejected": -226.2584228515625,
"loss": 0.0081,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.32583388686180115,
"rewards/margins": 7.1260175704956055,
"rewards/rejected": -7.4518513679504395,
"step": 75
},
{
"epoch": 0.990228013029316,
"grad_norm": 1.5859375,
"learning_rate": 4.684931506849316e-05,
"logits/chosen": 0.549035906791687,
"logits/rejected": 0.5604692697525024,
"logps/chosen": -106.24671936035156,
"logps/rejected": -224.1392059326172,
"loss": 0.0062,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.48925158381462097,
"rewards/margins": 8.133434295654297,
"rewards/rejected": -8.622686386108398,
"step": 76
},
{
"epoch": 1.003257328990228,
"grad_norm": 3.21875,
"learning_rate": 4.6780821917808224e-05,
"logits/chosen": 0.4611436426639557,
"logits/rejected": 0.5326769948005676,
"logps/chosen": -122.00413513183594,
"logps/rejected": -225.345703125,
"loss": 0.0104,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3694484233856201,
"rewards/margins": 8.14291763305664,
"rewards/rejected": -8.512365341186523,
"step": 77
},
{
"epoch": 1.01628664495114,
"grad_norm": 1.0390625,
"learning_rate": 4.671232876712329e-05,
"logits/chosen": 0.5869070887565613,
"logits/rejected": 0.6033880710601807,
"logps/chosen": -82.62848663330078,
"logps/rejected": -218.4529571533203,
"loss": 0.0067,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.45080143213272095,
"rewards/margins": 7.709619522094727,
"rewards/rejected": -8.160421371459961,
"step": 78
},
{
"epoch": 1.0293159609120521,
"grad_norm": 0.5,
"learning_rate": 4.6643835616438356e-05,
"logits/chosen": 0.6383049488067627,
"logits/rejected": 0.6318773031234741,
"logps/chosen": -85.02655029296875,
"logps/rejected": -236.74661254882812,
"loss": 0.0025,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6345354318618774,
"rewards/margins": 8.56661319732666,
"rewards/rejected": -9.201148986816406,
"step": 79
},
{
"epoch": 1.0423452768729642,
"grad_norm": 6.03125,
"learning_rate": 4.657534246575342e-05,
"logits/chosen": 0.5868783593177795,
"logits/rejected": 0.6521725654602051,
"logps/chosen": -72.04723358154297,
"logps/rejected": -230.14759826660156,
"loss": 0.0064,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3693377375602722,
"rewards/margins": 8.969406127929688,
"rewards/rejected": -9.3387451171875,
"step": 80
},
{
"epoch": 1.0553745928338762,
"grad_norm": 3.875,
"learning_rate": 4.6506849315068495e-05,
"logits/chosen": 0.6232761144638062,
"logits/rejected": 0.7092280983924866,
"logps/chosen": -79.42515563964844,
"logps/rejected": -243.50372314453125,
"loss": 0.0114,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4384864568710327,
"rewards/margins": 9.8868408203125,
"rewards/rejected": -10.325326919555664,
"step": 81
},
{
"epoch": 1.0684039087947883,
"grad_norm": 17.625,
"learning_rate": 4.643835616438356e-05,
"logits/chosen": 0.5587644577026367,
"logits/rejected": 0.507000744342804,
"logps/chosen": -107.61006164550781,
"logps/rejected": -269.83843994140625,
"loss": 0.0298,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.767175555229187,
"rewards/margins": 10.086366653442383,
"rewards/rejected": -10.85354232788086,
"step": 82
},
{
"epoch": 1.0814332247557004,
"grad_norm": 1.765625,
"learning_rate": 4.6369863013698634e-05,
"logits/chosen": 0.7217209339141846,
"logits/rejected": 0.6606077551841736,
"logps/chosen": -112.81648254394531,
"logps/rejected": -288.869384765625,
"loss": 0.0031,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.637757420539856,
"rewards/margins": 11.375957489013672,
"rewards/rejected": -12.013714790344238,
"step": 83
},
{
"epoch": 1.0944625407166124,
"grad_norm": 0.134765625,
"learning_rate": 4.63013698630137e-05,
"logits/chosen": 0.598381757736206,
"logits/rejected": 0.7315313816070557,
"logps/chosen": -107.20101928710938,
"logps/rejected": -281.4562683105469,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0282131433486938,
"rewards/margins": 10.506587982177734,
"rewards/rejected": -11.534799575805664,
"step": 84
},
{
"epoch": 1.1074918566775245,
"grad_norm": 0.341796875,
"learning_rate": 4.623287671232877e-05,
"logits/chosen": 0.5361148118972778,
"logits/rejected": 0.625439465045929,
"logps/chosen": -94.30006408691406,
"logps/rejected": -247.62734985351562,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6392572522163391,
"rewards/margins": 9.514155387878418,
"rewards/rejected": -10.153412818908691,
"step": 85
},
{
"epoch": 1.1205211726384365,
"grad_norm": 0.1318359375,
"learning_rate": 4.616438356164384e-05,
"logits/chosen": 0.4699576199054718,
"logits/rejected": 0.5327920317649841,
"logps/chosen": -99.83711242675781,
"logps/rejected": -277.1376953125,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7559419870376587,
"rewards/margins": 10.79163932800293,
"rewards/rejected": -11.54758071899414,
"step": 86
},
{
"epoch": 1.1335504885993486,
"grad_norm": 21.625,
"learning_rate": 4.609589041095891e-05,
"logits/chosen": 0.5424385666847229,
"logits/rejected": 0.5994272232055664,
"logps/chosen": -126.75860595703125,
"logps/rejected": -259.98785400390625,
"loss": 0.027,
"rewards/accuracies": 0.96875,
"rewards/chosen": -1.0079156160354614,
"rewards/margins": 9.790204048156738,
"rewards/rejected": -10.79811954498291,
"step": 87
},
{
"epoch": 1.1465798045602607,
"grad_norm": 0.1923828125,
"learning_rate": 4.602739726027398e-05,
"logits/chosen": 0.4807354509830475,
"logits/rejected": 0.5597364902496338,
"logps/chosen": -106.52574157714844,
"logps/rejected": -272.2024841308594,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9496315717697144,
"rewards/margins": 10.869487762451172,
"rewards/rejected": -11.81911849975586,
"step": 88
},
{
"epoch": 1.1596091205211727,
"grad_norm": 0.1513671875,
"learning_rate": 4.5958904109589044e-05,
"logits/chosen": 0.4442989230155945,
"logits/rejected": 0.5743086338043213,
"logps/chosen": -126.14883422851562,
"logps/rejected": -257.60479736328125,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.766221821308136,
"rewards/margins": 10.424041748046875,
"rewards/rejected": -11.190263748168945,
"step": 89
},
{
"epoch": 1.1726384364820848,
"grad_norm": 1.109375,
"learning_rate": 4.589041095890411e-05,
"logits/chosen": 0.6463179588317871,
"logits/rejected": 0.7357967495918274,
"logps/chosen": -111.60262298583984,
"logps/rejected": -257.9665222167969,
"loss": 0.004,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7475589513778687,
"rewards/margins": 9.678692817687988,
"rewards/rejected": -10.426251411437988,
"step": 90
},
{
"epoch": 1.1856677524429968,
"grad_norm": 6.6875,
"learning_rate": 4.5821917808219176e-05,
"logits/chosen": 0.48268792033195496,
"logits/rejected": 0.5555750131607056,
"logps/chosen": -109.53272247314453,
"logps/rejected": -254.43492126464844,
"loss": 0.021,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1768321990966797,
"rewards/margins": 10.083324432373047,
"rewards/rejected": -11.260156631469727,
"step": 91
},
{
"epoch": 1.1986970684039089,
"grad_norm": 1.3515625,
"learning_rate": 4.575342465753425e-05,
"logits/chosen": 0.4292120337486267,
"logits/rejected": 0.521615743637085,
"logps/chosen": -95.94520568847656,
"logps/rejected": -253.99993896484375,
"loss": 0.0043,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.871029257774353,
"rewards/margins": 10.545323371887207,
"rewards/rejected": -11.416353225708008,
"step": 92
},
{
"epoch": 1.211726384364821,
"grad_norm": 0.80859375,
"learning_rate": 4.5684931506849315e-05,
"logits/chosen": 0.599204421043396,
"logits/rejected": 0.6558493375778198,
"logps/chosen": -81.71524047851562,
"logps/rejected": -286.6025390625,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6884966492652893,
"rewards/margins": 12.463363647460938,
"rewards/rejected": -13.151860237121582,
"step": 93
},
{
"epoch": 1.224755700325733,
"grad_norm": 18.0,
"learning_rate": 4.561643835616439e-05,
"logits/chosen": 0.5306810140609741,
"logits/rejected": 0.6242883801460266,
"logps/chosen": -123.8375244140625,
"logps/rejected": -301.3340759277344,
"loss": 0.115,
"rewards/accuracies": 0.96875,
"rewards/chosen": -1.471944808959961,
"rewards/margins": 11.491494178771973,
"rewards/rejected": -12.963438034057617,
"step": 94
},
{
"epoch": 1.237785016286645,
"grad_norm": 3.6875,
"learning_rate": 4.5547945205479454e-05,
"logits/chosen": 0.307037353515625,
"logits/rejected": 0.4169548749923706,
"logps/chosen": -104.93318176269531,
"logps/rejected": -298.0616455078125,
"loss": 0.0056,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8200819492340088,
"rewards/margins": 12.302337646484375,
"rewards/rejected": -13.122421264648438,
"step": 95
},
{
"epoch": 1.2508143322475571,
"grad_norm": 0.357421875,
"learning_rate": 4.547945205479453e-05,
"logits/chosen": 0.44628292322158813,
"logits/rejected": 0.5122686624526978,
"logps/chosen": -138.44715881347656,
"logps/rejected": -297.4310302734375,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1169642210006714,
"rewards/margins": 12.18505859375,
"rewards/rejected": -13.302022933959961,
"step": 96
},
{
"epoch": 1.2638436482084692,
"grad_norm": 0.734375,
"learning_rate": 4.541095890410959e-05,
"logits/chosen": 0.4561493992805481,
"logits/rejected": 0.42395809292793274,
"logps/chosen": -97.692626953125,
"logps/rejected": -269.0616149902344,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0600621700286865,
"rewards/margins": 10.464313507080078,
"rewards/rejected": -11.524375915527344,
"step": 97
},
{
"epoch": 1.2768729641693812,
"grad_norm": 0.171875,
"learning_rate": 4.534246575342466e-05,
"logits/chosen": 0.5301443934440613,
"logits/rejected": 0.5689199566841125,
"logps/chosen": -82.25302124023438,
"logps/rejected": -299.8308410644531,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6450921297073364,
"rewards/margins": 12.625539779663086,
"rewards/rejected": -13.270631790161133,
"step": 98
},
{
"epoch": 1.2899022801302933,
"grad_norm": 0.193359375,
"learning_rate": 4.5273972602739725e-05,
"logits/chosen": 0.5289660096168518,
"logits/rejected": 0.5680521726608276,
"logps/chosen": -114.97647094726562,
"logps/rejected": -289.7352294921875,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0461821556091309,
"rewards/margins": 11.369194030761719,
"rewards/rejected": -12.415376663208008,
"step": 99
},
{
"epoch": 1.3029315960912053,
"grad_norm": 12.875,
"learning_rate": 4.520547945205479e-05,
"logits/chosen": 0.5327968597412109,
"logits/rejected": 0.5609641075134277,
"logps/chosen": -102.47958374023438,
"logps/rejected": -250.79983520507812,
"loss": 0.0433,
"rewards/accuracies": 0.96875,
"rewards/chosen": -1.2846791744232178,
"rewards/margins": 10.277151107788086,
"rewards/rejected": -11.561830520629883,
"step": 100
},
{
"epoch": 1.3029315960912053,
"eval_logits/chosen": 0.4522504210472107,
"eval_logits/rejected": 0.5126740336418152,
"eval_logps/chosen": -105.14033508300781,
"eval_logps/rejected": -271.7301330566406,
"eval_loss": 0.010936837643384933,
"eval_rewards/accuracies": 0.9955357313156128,
"eval_rewards/chosen": -0.9686061143875122,
"eval_rewards/margins": 11.095270156860352,
"eval_rewards/rejected": -12.06387710571289,
"eval_runtime": 52.2837,
"eval_samples_per_second": 12.834,
"eval_steps_per_second": 0.803,
"step": 100
},
{
"epoch": 1.3159609120521172,
"grad_norm": 0.3359375,
"learning_rate": 4.5136986301369864e-05,
"logits/chosen": 0.4589378833770752,
"logits/rejected": 0.5487878918647766,
"logps/chosen": -105.76063537597656,
"logps/rejected": -304.374755859375,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6952133774757385,
"rewards/margins": 11.938570022583008,
"rewards/rejected": -12.633784294128418,
"step": 101
},
{
"epoch": 1.3289902280130292,
"grad_norm": 16.875,
"learning_rate": 4.506849315068493e-05,
"logits/chosen": 0.3769131302833557,
"logits/rejected": 0.4298419952392578,
"logps/chosen": -90.59988403320312,
"logps/rejected": -247.70855712890625,
"loss": 0.0366,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.8937082886695862,
"rewards/margins": 10.360432624816895,
"rewards/rejected": -11.254140853881836,
"step": 102
},
{
"epoch": 1.3420195439739413,
"grad_norm": 0.1240234375,
"learning_rate": 4.5e-05,
"logits/chosen": 0.4195227026939392,
"logits/rejected": 0.4982715845108032,
"logps/chosen": -108.6019515991211,
"logps/rejected": -256.0687255859375,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.35523366928100586,
"rewards/margins": 10.765069961547852,
"rewards/rejected": -11.120304107666016,
"step": 103
},
{
"epoch": 1.3550488599348534,
"grad_norm": 0.734375,
"learning_rate": 4.493150684931507e-05,
"logits/chosen": 0.512363851070404,
"logits/rejected": 0.576703667640686,
"logps/chosen": -87.09799194335938,
"logps/rejected": -250.88160705566406,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8132816553115845,
"rewards/margins": 10.275364875793457,
"rewards/rejected": -11.088645935058594,
"step": 104
},
{
"epoch": 1.3680781758957654,
"grad_norm": 0.5390625,
"learning_rate": 4.486301369863014e-05,
"logits/chosen": 0.5740979909896851,
"logits/rejected": 0.6141005158424377,
"logps/chosen": -101.0667495727539,
"logps/rejected": -270.2124328613281,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4534332752227783,
"rewards/margins": 11.378608703613281,
"rewards/rejected": -11.832042694091797,
"step": 105
},
{
"epoch": 1.3811074918566775,
"grad_norm": 0.1689453125,
"learning_rate": 4.479452054794521e-05,
"logits/chosen": 0.4920623004436493,
"logits/rejected": 0.5869815945625305,
"logps/chosen": -78.95692443847656,
"logps/rejected": -261.3721923828125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5587632656097412,
"rewards/margins": 11.233173370361328,
"rewards/rejected": -11.791936874389648,
"step": 106
},
{
"epoch": 1.3941368078175895,
"grad_norm": 0.251953125,
"learning_rate": 4.472602739726028e-05,
"logits/chosen": 0.570668637752533,
"logits/rejected": 0.6403558850288391,
"logps/chosen": -100.12591552734375,
"logps/rejected": -284.8184814453125,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.974056601524353,
"rewards/margins": 11.53510570526123,
"rewards/rejected": -12.509162902832031,
"step": 107
},
{
"epoch": 1.4071661237785016,
"grad_norm": 0.486328125,
"learning_rate": 4.465753424657535e-05,
"logits/chosen": 0.5420557260513306,
"logits/rejected": 0.5884326100349426,
"logps/chosen": -88.60862731933594,
"logps/rejected": -289.9623718261719,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7016857862472534,
"rewards/margins": 12.609970092773438,
"rewards/rejected": -13.31165599822998,
"step": 108
},
{
"epoch": 1.4201954397394136,
"grad_norm": 0.16796875,
"learning_rate": 4.458904109589041e-05,
"logits/chosen": 0.4910571575164795,
"logits/rejected": 0.5071029663085938,
"logps/chosen": -126.79181671142578,
"logps/rejected": -296.6622314453125,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5642073154449463,
"rewards/margins": 11.54067325592041,
"rewards/rejected": -13.104881286621094,
"step": 109
},
{
"epoch": 1.4332247557003257,
"grad_norm": 0.09423828125,
"learning_rate": 4.452054794520548e-05,
"logits/chosen": 0.5247446298599243,
"logits/rejected": 0.47774773836135864,
"logps/chosen": -100.17961883544922,
"logps/rejected": -256.7818908691406,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5451500415802002,
"rewards/margins": 10.933671951293945,
"rewards/rejected": -11.478821754455566,
"step": 110
},
{
"epoch": 1.4462540716612378,
"grad_norm": 0.0615234375,
"learning_rate": 4.4452054794520545e-05,
"logits/chosen": 0.6131365299224854,
"logits/rejected": 0.615870475769043,
"logps/chosen": -91.60357666015625,
"logps/rejected": -277.2375793457031,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4648512601852417,
"rewards/margins": 11.733713150024414,
"rewards/rejected": -12.198564529418945,
"step": 111
},
{
"epoch": 1.4592833876221498,
"grad_norm": 0.6015625,
"learning_rate": 4.438356164383562e-05,
"logits/chosen": 0.7266855239868164,
"logits/rejected": 0.633425235748291,
"logps/chosen": -83.83377075195312,
"logps/rejected": -264.64501953125,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8645696640014648,
"rewards/margins": 11.404350280761719,
"rewards/rejected": -12.268918991088867,
"step": 112
},
{
"epoch": 1.4723127035830619,
"grad_norm": 39.5,
"learning_rate": 4.4315068493150684e-05,
"logits/chosen": 0.6473186016082764,
"logits/rejected": 0.6468358635902405,
"logps/chosen": -145.5900115966797,
"logps/rejected": -300.77301025390625,
"loss": 0.0413,
"rewards/accuracies": 0.96875,
"rewards/chosen": -1.2610886096954346,
"rewards/margins": 12.301548957824707,
"rewards/rejected": -13.562638282775879,
"step": 113
},
{
"epoch": 1.485342019543974,
"grad_norm": 0.400390625,
"learning_rate": 4.424657534246576e-05,
"logits/chosen": 0.4430687427520752,
"logits/rejected": 0.5213119983673096,
"logps/chosen": -133.21205139160156,
"logps/rejected": -270.613525390625,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0954465866088867,
"rewards/margins": 11.330828666687012,
"rewards/rejected": -12.426275253295898,
"step": 114
},
{
"epoch": 1.498371335504886,
"grad_norm": 0.94140625,
"learning_rate": 4.417808219178082e-05,
"logits/chosen": 0.5086010694503784,
"logits/rejected": 0.5820840001106262,
"logps/chosen": -123.90394592285156,
"logps/rejected": -257.7217712402344,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.11330246925354,
"rewards/margins": 10.829158782958984,
"rewards/rejected": -11.942461013793945,
"step": 115
},
{
"epoch": 1.511400651465798,
"grad_norm": 0.1435546875,
"learning_rate": 4.4109589041095896e-05,
"logits/chosen": 0.6593326330184937,
"logits/rejected": 0.6211075186729431,
"logps/chosen": -75.89244842529297,
"logps/rejected": -266.60357666015625,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7435863614082336,
"rewards/margins": 11.823095321655273,
"rewards/rejected": -12.566681861877441,
"step": 116
},
{
"epoch": 1.52442996742671,
"grad_norm": 0.3125,
"learning_rate": 4.404109589041096e-05,
"logits/chosen": 0.44883668422698975,
"logits/rejected": 0.5639724135398865,
"logps/chosen": -93.89613342285156,
"logps/rejected": -286.56451416015625,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0818122625350952,
"rewards/margins": 12.308505058288574,
"rewards/rejected": -13.390316009521484,
"step": 117
},
{
"epoch": 1.5374592833876222,
"grad_norm": 0.294921875,
"learning_rate": 4.3972602739726035e-05,
"logits/chosen": 0.5254025459289551,
"logits/rejected": 0.5744770765304565,
"logps/chosen": -120.49933624267578,
"logps/rejected": -313.8304443359375,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2532737255096436,
"rewards/margins": 13.612017631530762,
"rewards/rejected": -14.8652925491333,
"step": 118
},
{
"epoch": 1.5504885993485342,
"grad_norm": 0.06005859375,
"learning_rate": 4.39041095890411e-05,
"logits/chosen": 0.4704741835594177,
"logits/rejected": 0.5933064222335815,
"logps/chosen": -101.07899475097656,
"logps/rejected": -312.5476989746094,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0991394519805908,
"rewards/margins": 13.127508163452148,
"rewards/rejected": -14.226646423339844,
"step": 119
},
{
"epoch": 1.5635179153094463,
"grad_norm": 0.7265625,
"learning_rate": 4.383561643835617e-05,
"logits/chosen": 0.5662128329277039,
"logits/rejected": 0.5538490414619446,
"logps/chosen": -106.43547058105469,
"logps/rejected": -237.27182006835938,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7144113779067993,
"rewards/margins": 9.923102378845215,
"rewards/rejected": -10.637513160705566,
"step": 120
},
{
"epoch": 1.5765472312703583,
"grad_norm": 0.2734375,
"learning_rate": 4.376712328767123e-05,
"logits/chosen": 0.6062589883804321,
"logits/rejected": 0.6001408100128174,
"logps/chosen": -85.78362274169922,
"logps/rejected": -230.78456115722656,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.44176292419433594,
"rewards/margins": 10.009519577026367,
"rewards/rejected": -10.451282501220703,
"step": 121
},
{
"epoch": 1.5895765472312704,
"grad_norm": 0.890625,
"learning_rate": 4.36986301369863e-05,
"logits/chosen": 0.4221673607826233,
"logits/rejected": 0.5758030414581299,
"logps/chosen": -105.4853744506836,
"logps/rejected": -291.33416748046875,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8794921636581421,
"rewards/margins": 12.619811058044434,
"rewards/rejected": -13.499303817749023,
"step": 122
},
{
"epoch": 1.6026058631921825,
"grad_norm": 11.3125,
"learning_rate": 4.363013698630137e-05,
"logits/chosen": 0.5420705676078796,
"logits/rejected": 0.6151952147483826,
"logps/chosen": -100.22688293457031,
"logps/rejected": -243.79376220703125,
"loss": 0.0141,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1169291734695435,
"rewards/margins": 10.309640884399414,
"rewards/rejected": -11.426569938659668,
"step": 123
},
{
"epoch": 1.6156351791530945,
"grad_norm": 0.1025390625,
"learning_rate": 4.356164383561644e-05,
"logits/chosen": 0.5193166136741638,
"logits/rejected": 0.6056085228919983,
"logps/chosen": -82.8109359741211,
"logps/rejected": -290.5059814453125,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6234797239303589,
"rewards/margins": 13.424489974975586,
"rewards/rejected": -14.047967910766602,
"step": 124
},
{
"epoch": 1.6286644951140063,
"grad_norm": 0.515625,
"learning_rate": 4.349315068493151e-05,
"logits/chosen": 0.524208664894104,
"logits/rejected": 0.4996390640735626,
"logps/chosen": -99.54425811767578,
"logps/rejected": -269.98858642578125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.75706547498703,
"rewards/margins": 11.190351486206055,
"rewards/rejected": -11.947418212890625,
"step": 125
},
{
"epoch": 1.6416938110749184,
"grad_norm": 0.263671875,
"learning_rate": 4.342465753424658e-05,
"logits/chosen": 0.6168690323829651,
"logits/rejected": 0.6482622027397156,
"logps/chosen": -85.97930908203125,
"logps/rejected": -270.2721862792969,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8299384117126465,
"rewards/margins": 11.914796829223633,
"rewards/rejected": -12.744734764099121,
"step": 126
},
{
"epoch": 1.6547231270358305,
"grad_norm": 0.5546875,
"learning_rate": 4.335616438356165e-05,
"logits/chosen": 0.4758910536766052,
"logits/rejected": 0.6165511012077332,
"logps/chosen": -120.85889434814453,
"logps/rejected": -330.94580078125,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3049366474151611,
"rewards/margins": 13.541584014892578,
"rewards/rejected": -14.846521377563477,
"step": 127
},
{
"epoch": 1.6677524429967425,
"grad_norm": 6.8125,
"learning_rate": 4.3287671232876716e-05,
"logits/chosen": 0.4912353754043579,
"logits/rejected": 0.5630989074707031,
"logps/chosen": -99.70421600341797,
"logps/rejected": -262.81793212890625,
"loss": 0.0078,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1004682779312134,
"rewards/margins": 11.206673622131348,
"rewards/rejected": -12.307140350341797,
"step": 128
},
{
"epoch": 1.6807817589576546,
"grad_norm": 0.16015625,
"learning_rate": 4.321917808219178e-05,
"logits/chosen": 0.4782199263572693,
"logits/rejected": 0.525773823261261,
"logps/chosen": -104.79579162597656,
"logps/rejected": -289.299560546875,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9410255551338196,
"rewards/margins": 13.11217212677002,
"rewards/rejected": -14.05319595336914,
"step": 129
},
{
"epoch": 1.6938110749185666,
"grad_norm": 0.21875,
"learning_rate": 4.3150684931506855e-05,
"logits/chosen": 0.4822072684764862,
"logits/rejected": 0.4817202687263489,
"logps/chosen": -86.81942749023438,
"logps/rejected": -299.3095703125,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1278772354125977,
"rewards/margins": 12.366036415100098,
"rewards/rejected": -13.493914604187012,
"step": 130
},
{
"epoch": 1.7068403908794787,
"grad_norm": 0.08642578125,
"learning_rate": 4.308219178082192e-05,
"logits/chosen": 0.5804314613342285,
"logits/rejected": 0.6889848709106445,
"logps/chosen": -91.85730743408203,
"logps/rejected": -298.603515625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0585956573486328,
"rewards/margins": 12.788677215576172,
"rewards/rejected": -13.847272872924805,
"step": 131
},
{
"epoch": 1.7198697068403908,
"grad_norm": 0.072265625,
"learning_rate": 4.301369863013699e-05,
"logits/chosen": 0.4860071837902069,
"logits/rejected": 0.6394906044006348,
"logps/chosen": -122.80025482177734,
"logps/rejected": -303.95257568359375,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.156942367553711,
"rewards/margins": 13.32013988494873,
"rewards/rejected": -14.477082252502441,
"step": 132
},
{
"epoch": 1.7328990228013028,
"grad_norm": 0.15234375,
"learning_rate": 4.294520547945205e-05,
"logits/chosen": 0.4813675880432129,
"logits/rejected": 0.6056811213493347,
"logps/chosen": -89.08052062988281,
"logps/rejected": -268.1934814453125,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8572717308998108,
"rewards/margins": 12.159040451049805,
"rewards/rejected": -13.016312599182129,
"step": 133
},
{
"epoch": 1.7459283387622149,
"grad_norm": 0.05615234375,
"learning_rate": 4.2876712328767126e-05,
"logits/chosen": 0.43135523796081543,
"logits/rejected": 0.5367728471755981,
"logps/chosen": -104.37152099609375,
"logps/rejected": -309.7494201660156,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9332711100578308,
"rewards/margins": 13.174678802490234,
"rewards/rejected": -14.107950210571289,
"step": 134
},
{
"epoch": 1.758957654723127,
"grad_norm": 0.1123046875,
"learning_rate": 4.280821917808219e-05,
"logits/chosen": 0.46707215905189514,
"logits/rejected": 0.545040488243103,
"logps/chosen": -141.20016479492188,
"logps/rejected": -337.659423828125,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5533472299575806,
"rewards/margins": 13.37056827545166,
"rewards/rejected": -14.923914909362793,
"step": 135
},
{
"epoch": 1.771986970684039,
"grad_norm": 0.142578125,
"learning_rate": 4.2739726027397265e-05,
"logits/chosen": 0.45749402046203613,
"logits/rejected": 0.5103408098220825,
"logps/chosen": -97.52786254882812,
"logps/rejected": -218.84869384765625,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6832572221755981,
"rewards/margins": 9.930760383605957,
"rewards/rejected": -10.614017486572266,
"step": 136
},
{
"epoch": 1.785016286644951,
"grad_norm": 0.04638671875,
"learning_rate": 4.267123287671233e-05,
"logits/chosen": 0.6288174986839294,
"logits/rejected": 0.6228695511817932,
"logps/chosen": -118.99038696289062,
"logps/rejected": -292.7908020019531,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2574949264526367,
"rewards/margins": 12.354877471923828,
"rewards/rejected": -13.612371444702148,
"step": 137
},
{
"epoch": 1.798045602605863,
"grad_norm": 0.330078125,
"learning_rate": 4.2602739726027404e-05,
"logits/chosen": 0.4609254002571106,
"logits/rejected": 0.480663537979126,
"logps/chosen": -87.55207824707031,
"logps/rejected": -289.66162109375,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7838760614395142,
"rewards/margins": 12.662479400634766,
"rewards/rejected": -13.446355819702148,
"step": 138
},
{
"epoch": 1.8110749185667752,
"grad_norm": 0.091796875,
"learning_rate": 4.253424657534247e-05,
"logits/chosen": 0.5592811703681946,
"logits/rejected": 0.6325635313987732,
"logps/chosen": -113.62852478027344,
"logps/rejected": -291.84967041015625,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9283789992332458,
"rewards/margins": 12.165189743041992,
"rewards/rejected": -13.093568801879883,
"step": 139
},
{
"epoch": 1.8241042345276872,
"grad_norm": 0.1376953125,
"learning_rate": 4.2465753424657536e-05,
"logits/chosen": 0.5351129174232483,
"logits/rejected": 0.5127934813499451,
"logps/chosen": -173.83511352539062,
"logps/rejected": -313.468994140625,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.008704662322998,
"rewards/margins": 12.894453048706055,
"rewards/rejected": -14.903158187866211,
"step": 140
},
{
"epoch": 1.8371335504885993,
"grad_norm": 25.25,
"learning_rate": 4.23972602739726e-05,
"logits/chosen": 0.5461170673370361,
"logits/rejected": 0.5241893529891968,
"logps/chosen": -90.9225082397461,
"logps/rejected": -266.9288635253906,
"loss": 0.0711,
"rewards/accuracies": 0.96875,
"rewards/chosen": -1.2498632669448853,
"rewards/margins": 11.258042335510254,
"rewards/rejected": -12.507905960083008,
"step": 141
},
{
"epoch": 1.8501628664495113,
"grad_norm": 0.15625,
"learning_rate": 4.232876712328767e-05,
"logits/chosen": 0.4733356535434723,
"logits/rejected": 0.5178252458572388,
"logps/chosen": -120.46127319335938,
"logps/rejected": -303.619384765625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1871830224990845,
"rewards/margins": 12.890132904052734,
"rewards/rejected": -14.077316284179688,
"step": 142
},
{
"epoch": 1.8631921824104234,
"grad_norm": 0.2578125,
"learning_rate": 4.226027397260274e-05,
"logits/chosen": 0.48812466859817505,
"logits/rejected": 0.6284564733505249,
"logps/chosen": -94.5536880493164,
"logps/rejected": -292.870849609375,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.950133204460144,
"rewards/margins": 13.727540969848633,
"rewards/rejected": -14.677675247192383,
"step": 143
},
{
"epoch": 1.8762214983713354,
"grad_norm": 0.0859375,
"learning_rate": 4.219178082191781e-05,
"logits/chosen": 0.6320376992225647,
"logits/rejected": 0.6237307786941528,
"logps/chosen": -152.7342529296875,
"logps/rejected": -295.3027648925781,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.494692325592041,
"rewards/margins": 12.356241226196289,
"rewards/rejected": -13.850934028625488,
"step": 144
},
{
"epoch": 1.8892508143322475,
"grad_norm": 0.2265625,
"learning_rate": 4.212328767123288e-05,
"logits/chosen": 0.7280508279800415,
"logits/rejected": 0.6899917125701904,
"logps/chosen": -79.49422454833984,
"logps/rejected": -290.1501770019531,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7727007865905762,
"rewards/margins": 12.538410186767578,
"rewards/rejected": -13.311111450195312,
"step": 145
},
{
"epoch": 1.9022801302931596,
"grad_norm": 0.875,
"learning_rate": 4.2054794520547946e-05,
"logits/chosen": 0.4206058382987976,
"logits/rejected": 0.5227707624435425,
"logps/chosen": -101.57917785644531,
"logps/rejected": -279.0715637207031,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9858956336975098,
"rewards/margins": 12.150425910949707,
"rewards/rejected": -13.136322021484375,
"step": 146
},
{
"epoch": 1.9153094462540716,
"grad_norm": 1.4609375,
"learning_rate": 4.198630136986302e-05,
"logits/chosen": 0.5245968699455261,
"logits/rejected": 0.6121017932891846,
"logps/chosen": -116.4501953125,
"logps/rejected": -281.0984802246094,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5875823497772217,
"rewards/margins": 12.053236961364746,
"rewards/rejected": -13.640819549560547,
"step": 147
},
{
"epoch": 1.9283387622149837,
"grad_norm": 0.431640625,
"learning_rate": 4.1917808219178085e-05,
"logits/chosen": 0.45093053579330444,
"logits/rejected": 0.587200403213501,
"logps/chosen": -104.39015197753906,
"logps/rejected": -293.44232177734375,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8772008419036865,
"rewards/margins": 12.074963569641113,
"rewards/rejected": -12.952163696289062,
"step": 148
},
{
"epoch": 1.9413680781758957,
"grad_norm": 3.4375,
"learning_rate": 4.184931506849315e-05,
"logits/chosen": 0.48234254121780396,
"logits/rejected": 0.5706640481948853,
"logps/chosen": -147.8875732421875,
"logps/rejected": -278.24407958984375,
"loss": 0.005,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0798665285110474,
"rewards/margins": 11.722947120666504,
"rewards/rejected": -12.802812576293945,
"step": 149
},
{
"epoch": 1.9543973941368078,
"grad_norm": 0.60546875,
"learning_rate": 4.1780821917808224e-05,
"logits/chosen": 0.5278698205947876,
"logits/rejected": 0.635560154914856,
"logps/chosen": -99.79202270507812,
"logps/rejected": -271.11785888671875,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8048182725906372,
"rewards/margins": 12.346576690673828,
"rewards/rejected": -13.151394844055176,
"step": 150
},
{
"epoch": 1.9543973941368078,
"eval_logits/chosen": 0.4638054668903351,
"eval_logits/rejected": 0.5228009223937988,
"eval_logps/chosen": -108.31918334960938,
"eval_logps/rejected": -286.8623046875,
"eval_loss": 0.007638773415237665,
"eval_rewards/accuracies": 0.9955357313156128,
"eval_rewards/chosen": -1.2864917516708374,
"eval_rewards/margins": 12.290605545043945,
"eval_rewards/rejected": -13.57709789276123,
"eval_runtime": 52.2778,
"eval_samples_per_second": 12.835,
"eval_steps_per_second": 0.803,
"step": 150
},
{
"epoch": 1.9674267100977199,
"grad_norm": 1.53125,
"learning_rate": 4.171232876712329e-05,
"logits/chosen": 0.5083509683609009,
"logits/rejected": 0.6153576374053955,
"logps/chosen": -86.2269515991211,
"logps/rejected": -281.91888427734375,
"loss": 0.0038,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8234192132949829,
"rewards/margins": 12.487937927246094,
"rewards/rejected": -13.311358451843262,
"step": 151
},
{
"epoch": 1.980456026058632,
"grad_norm": 0.08740234375,
"learning_rate": 4.1643835616438356e-05,
"logits/chosen": 0.4471871554851532,
"logits/rejected": 0.5222618579864502,
"logps/chosen": -77.19293212890625,
"logps/rejected": -279.3829040527344,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7928668856620789,
"rewards/margins": 12.986977577209473,
"rewards/rejected": -13.779845237731934,
"step": 152
},
{
"epoch": 1.993485342019544,
"grad_norm": 0.38671875,
"learning_rate": 4.157534246575342e-05,
"logits/chosen": 0.5125950574874878,
"logits/rejected": 0.531832218170166,
"logps/chosen": -89.48603057861328,
"logps/rejected": -292.6934509277344,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9772664308547974,
"rewards/margins": 12.81619644165039,
"rewards/rejected": -13.793462753295898,
"step": 153
},
{
"epoch": 2.006514657980456,
"grad_norm": 0.09326171875,
"learning_rate": 4.1506849315068495e-05,
"logits/chosen": 0.5642431378364563,
"logits/rejected": 0.6921492218971252,
"logps/chosen": -109.61473083496094,
"logps/rejected": -336.2562255859375,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3835595846176147,
"rewards/margins": 15.08292007446289,
"rewards/rejected": -16.46647834777832,
"step": 154
},
{
"epoch": 2.019543973941368,
"grad_norm": 0.455078125,
"learning_rate": 4.143835616438356e-05,
"logits/chosen": 0.4728472828865051,
"logits/rejected": 0.5778607726097107,
"logps/chosen": -113.82855987548828,
"logps/rejected": -300.3656005859375,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0702670812606812,
"rewards/margins": 13.495317459106445,
"rewards/rejected": -14.565585136413574,
"step": 155
},
{
"epoch": 2.03257328990228,
"grad_norm": 0.1611328125,
"learning_rate": 4.1369863013698634e-05,
"logits/chosen": 0.558509886264801,
"logits/rejected": 0.5765538215637207,
"logps/chosen": -96.08161163330078,
"logps/rejected": -311.4420471191406,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7950121760368347,
"rewards/margins": 14.138938903808594,
"rewards/rejected": -14.933950424194336,
"step": 156
},
{
"epoch": 2.045602605863192,
"grad_norm": 0.23828125,
"learning_rate": 4.13013698630137e-05,
"logits/chosen": 0.5611923933029175,
"logits/rejected": 0.5538697242736816,
"logps/chosen": -118.36637878417969,
"logps/rejected": -269.89837646484375,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5846986770629883,
"rewards/margins": 11.414068222045898,
"rewards/rejected": -12.998766899108887,
"step": 157
},
{
"epoch": 2.0586319218241043,
"grad_norm": 0.240234375,
"learning_rate": 4.123287671232877e-05,
"logits/chosen": 0.5009916424751282,
"logits/rejected": 0.5371646881103516,
"logps/chosen": -100.47499084472656,
"logps/rejected": -283.9187316894531,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0149474143981934,
"rewards/margins": 11.762290000915527,
"rewards/rejected": -12.777236938476562,
"step": 158
},
{
"epoch": 2.0716612377850163,
"grad_norm": 0.1591796875,
"learning_rate": 4.116438356164384e-05,
"logits/chosen": 0.6033108830451965,
"logits/rejected": 0.6458787322044373,
"logps/chosen": -118.35772705078125,
"logps/rejected": -342.5250244140625,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5086966753005981,
"rewards/margins": 15.292023658752441,
"rewards/rejected": -16.80072021484375,
"step": 159
},
{
"epoch": 2.0846905537459284,
"grad_norm": 0.29296875,
"learning_rate": 4.1095890410958905e-05,
"logits/chosen": 0.5724061131477356,
"logits/rejected": 0.6467206478118896,
"logps/chosen": -95.32568359375,
"logps/rejected": -270.0829772949219,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0148924589157104,
"rewards/margins": 12.349864959716797,
"rewards/rejected": -13.364758491516113,
"step": 160
},
{
"epoch": 2.0977198697068404,
"grad_norm": 0.1328125,
"learning_rate": 4.102739726027398e-05,
"logits/chosen": 0.36649227142333984,
"logits/rejected": 0.4759945273399353,
"logps/chosen": -79.16898345947266,
"logps/rejected": -256.05426025390625,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5944907069206238,
"rewards/margins": 11.262916564941406,
"rewards/rejected": -11.85740852355957,
"step": 161
},
{
"epoch": 2.1107491856677525,
"grad_norm": 0.115234375,
"learning_rate": 4.0958904109589044e-05,
"logits/chosen": 0.4255332350730896,
"logits/rejected": 0.5424034595489502,
"logps/chosen": -146.3050079345703,
"logps/rejected": -328.6482849121094,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6175340414047241,
"rewards/margins": 14.242683410644531,
"rewards/rejected": -15.860215187072754,
"step": 162
},
{
"epoch": 2.1237785016286646,
"grad_norm": 6.78125,
"learning_rate": 4.089041095890411e-05,
"logits/chosen": 0.5109447836875916,
"logits/rejected": 0.5712834596633911,
"logps/chosen": -125.36318969726562,
"logps/rejected": -287.7838134765625,
"loss": 0.012,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4990017414093018,
"rewards/margins": 12.238770484924316,
"rewards/rejected": -13.737771987915039,
"step": 163
},
{
"epoch": 2.1368078175895766,
"grad_norm": 0.054931640625,
"learning_rate": 4.0821917808219176e-05,
"logits/chosen": 0.5000830888748169,
"logits/rejected": 0.5245240926742554,
"logps/chosen": -97.7026596069336,
"logps/rejected": -304.09375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1258344650268555,
"rewards/margins": 12.807943344116211,
"rewards/rejected": -13.9337797164917,
"step": 164
},
{
"epoch": 2.1498371335504887,
"grad_norm": 0.1376953125,
"learning_rate": 4.075342465753425e-05,
"logits/chosen": 0.4336688816547394,
"logits/rejected": 0.6021983623504639,
"logps/chosen": -106.12345123291016,
"logps/rejected": -288.62469482421875,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3127267360687256,
"rewards/margins": 12.904582977294922,
"rewards/rejected": -14.217309951782227,
"step": 165
},
{
"epoch": 2.1628664495114007,
"grad_norm": 0.036865234375,
"learning_rate": 4.0684931506849315e-05,
"logits/chosen": 0.4477992355823517,
"logits/rejected": 0.5709498524665833,
"logps/chosen": -108.85577392578125,
"logps/rejected": -285.3506164550781,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2774969339370728,
"rewards/margins": 11.801679611206055,
"rewards/rejected": -13.07917594909668,
"step": 166
},
{
"epoch": 2.175895765472313,
"grad_norm": 0.037841796875,
"learning_rate": 4.061643835616439e-05,
"logits/chosen": 0.452865868806839,
"logits/rejected": 0.5479907989501953,
"logps/chosen": -110.41411590576172,
"logps/rejected": -293.05035400390625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1979793310165405,
"rewards/margins": 13.181974411010742,
"rewards/rejected": -14.37995433807373,
"step": 167
},
{
"epoch": 2.188925081433225,
"grad_norm": 0.040771484375,
"learning_rate": 4.0547945205479454e-05,
"logits/chosen": 0.4804653823375702,
"logits/rejected": 0.5071645379066467,
"logps/chosen": -93.72543334960938,
"logps/rejected": -326.3215637207031,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0895832777023315,
"rewards/margins": 15.001323699951172,
"rewards/rejected": -16.090906143188477,
"step": 168
},
{
"epoch": 2.201954397394137,
"grad_norm": 0.1396484375,
"learning_rate": 4.047945205479452e-05,
"logits/chosen": 0.3976234197616577,
"logits/rejected": 0.5127770304679871,
"logps/chosen": -86.84957122802734,
"logps/rejected": -272.968505859375,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7831870913505554,
"rewards/margins": 12.802996635437012,
"rewards/rejected": -13.58618450164795,
"step": 169
},
{
"epoch": 2.214983713355049,
"grad_norm": 0.1044921875,
"learning_rate": 4.041095890410959e-05,
"logits/chosen": 0.4045504927635193,
"logits/rejected": 0.45465028285980225,
"logps/chosen": -105.28460693359375,
"logps/rejected": -309.6754150390625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0679643154144287,
"rewards/margins": 14.03729248046875,
"rewards/rejected": -15.105257987976074,
"step": 170
},
{
"epoch": 2.228013029315961,
"grad_norm": 0.03759765625,
"learning_rate": 4.034246575342466e-05,
"logits/chosen": 0.4175838530063629,
"logits/rejected": 0.5390201210975647,
"logps/chosen": -95.82322692871094,
"logps/rejected": -332.502685546875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0852179527282715,
"rewards/margins": 14.905830383300781,
"rewards/rejected": -15.991046905517578,
"step": 171
},
{
"epoch": 2.241042345276873,
"grad_norm": 0.06396484375,
"learning_rate": 4.027397260273973e-05,
"logits/chosen": 0.48719215393066406,
"logits/rejected": 0.5657703876495361,
"logps/chosen": -88.64961242675781,
"logps/rejected": -275.7567138671875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9338966012001038,
"rewards/margins": 13.001708030700684,
"rewards/rejected": -13.935606002807617,
"step": 172
},
{
"epoch": 2.254071661237785,
"grad_norm": 0.0966796875,
"learning_rate": 4.02054794520548e-05,
"logits/chosen": 0.5867688655853271,
"logits/rejected": 0.6384550333023071,
"logps/chosen": -110.77032470703125,
"logps/rejected": -328.6289367675781,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4720832109451294,
"rewards/margins": 14.248076438903809,
"rewards/rejected": -15.720159530639648,
"step": 173
},
{
"epoch": 2.267100977198697,
"grad_norm": 0.039794921875,
"learning_rate": 4.0136986301369864e-05,
"logits/chosen": 0.4327799677848816,
"logits/rejected": 0.4705524742603302,
"logps/chosen": -105.45439147949219,
"logps/rejected": -319.2513122558594,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8988175392150879,
"rewards/margins": 14.017317771911621,
"rewards/rejected": -14.916135787963867,
"step": 174
},
{
"epoch": 2.2801302931596092,
"grad_norm": 0.09619140625,
"learning_rate": 4.006849315068493e-05,
"logits/chosen": 0.5131232738494873,
"logits/rejected": 0.5097309947013855,
"logps/chosen": -120.355712890625,
"logps/rejected": -296.6656494140625,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3266019821166992,
"rewards/margins": 12.798480987548828,
"rewards/rejected": -14.125082969665527,
"step": 175
},
{
"epoch": 2.2931596091205213,
"grad_norm": 0.2255859375,
"learning_rate": 4e-05,
"logits/chosen": 0.4963986575603485,
"logits/rejected": 0.5654538869857788,
"logps/chosen": -119.40376281738281,
"logps/rejected": -269.1568908691406,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.111502766609192,
"rewards/margins": 11.538202285766602,
"rewards/rejected": -12.64970588684082,
"step": 176
},
{
"epoch": 2.3061889250814334,
"grad_norm": 0.201171875,
"learning_rate": 3.993150684931507e-05,
"logits/chosen": 0.5080669522285461,
"logits/rejected": 0.4891076385974884,
"logps/chosen": -112.92520141601562,
"logps/rejected": -291.1544189453125,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3876513242721558,
"rewards/margins": 11.71539306640625,
"rewards/rejected": -13.103044509887695,
"step": 177
},
{
"epoch": 2.3192182410423454,
"grad_norm": 0.2138671875,
"learning_rate": 3.9863013698630135e-05,
"logits/chosen": 0.4692964553833008,
"logits/rejected": 0.5622753500938416,
"logps/chosen": -92.26762390136719,
"logps/rejected": -267.98675537109375,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8580554723739624,
"rewards/margins": 12.327470779418945,
"rewards/rejected": -13.185525894165039,
"step": 178
},
{
"epoch": 2.3322475570032575,
"grad_norm": 9.25,
"learning_rate": 3.979452054794521e-05,
"logits/chosen": 0.5638495683670044,
"logits/rejected": 0.5911377668380737,
"logps/chosen": -117.00182342529297,
"logps/rejected": -285.2914123535156,
"loss": 0.0098,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.443804144859314,
"rewards/margins": 11.933603286743164,
"rewards/rejected": -13.377408981323242,
"step": 179
},
{
"epoch": 2.3452768729641695,
"grad_norm": 0.043212890625,
"learning_rate": 3.9726027397260274e-05,
"logits/chosen": 0.4331457316875458,
"logits/rejected": 0.5054813623428345,
"logps/chosen": -114.8367919921875,
"logps/rejected": -263.3021240234375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0023431777954102,
"rewards/margins": 11.675691604614258,
"rewards/rejected": -12.678034782409668,
"step": 180
},
{
"epoch": 2.3583061889250816,
"grad_norm": 0.2177734375,
"learning_rate": 3.965753424657535e-05,
"logits/chosen": 0.614739716053009,
"logits/rejected": 0.6245816946029663,
"logps/chosen": -94.85420989990234,
"logps/rejected": -277.0835266113281,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3436460494995117,
"rewards/margins": 11.414693832397461,
"rewards/rejected": -12.758339881896973,
"step": 181
},
{
"epoch": 2.3713355048859937,
"grad_norm": 0.134765625,
"learning_rate": 3.958904109589041e-05,
"logits/chosen": 0.5919771790504456,
"logits/rejected": 0.61507648229599,
"logps/chosen": -69.8411865234375,
"logps/rejected": -272.3177795410156,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.573222279548645,
"rewards/margins": 12.539608001708984,
"rewards/rejected": -13.112829208374023,
"step": 182
},
{
"epoch": 2.3843648208469057,
"grad_norm": 0.0966796875,
"learning_rate": 3.952054794520548e-05,
"logits/chosen": 0.48881152272224426,
"logits/rejected": 0.5776315927505493,
"logps/chosen": -89.60847473144531,
"logps/rejected": -293.9697265625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0633786916732788,
"rewards/margins": 13.587398529052734,
"rewards/rejected": -14.650779724121094,
"step": 183
},
{
"epoch": 2.3973941368078178,
"grad_norm": 0.07470703125,
"learning_rate": 3.9452054794520546e-05,
"logits/chosen": 0.6034122705459595,
"logits/rejected": 0.5341907739639282,
"logps/chosen": -82.32555389404297,
"logps/rejected": -266.908203125,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8900930881500244,
"rewards/margins": 12.200946807861328,
"rewards/rejected": -13.091039657592773,
"step": 184
},
{
"epoch": 2.41042345276873,
"grad_norm": 0.039306640625,
"learning_rate": 3.938356164383562e-05,
"logits/chosen": 0.5521727800369263,
"logits/rejected": 0.6301867365837097,
"logps/chosen": -98.17955017089844,
"logps/rejected": -288.569580078125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1262962818145752,
"rewards/margins": 11.977638244628906,
"rewards/rejected": -13.103934288024902,
"step": 185
},
{
"epoch": 2.423452768729642,
"grad_norm": 0.0166015625,
"learning_rate": 3.9315068493150684e-05,
"logits/chosen": 0.5002225041389465,
"logits/rejected": 0.595288097858429,
"logps/chosen": -96.44597625732422,
"logps/rejected": -270.15771484375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1506391763687134,
"rewards/margins": 12.394613265991211,
"rewards/rejected": -13.545251846313477,
"step": 186
},
{
"epoch": 2.436482084690554,
"grad_norm": 0.1865234375,
"learning_rate": 3.924657534246576e-05,
"logits/chosen": 0.5442834496498108,
"logits/rejected": 0.5952669978141785,
"logps/chosen": -104.47047424316406,
"logps/rejected": -306.7992248535156,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0476319789886475,
"rewards/margins": 13.874088287353516,
"rewards/rejected": -14.921720504760742,
"step": 187
},
{
"epoch": 2.449511400651466,
"grad_norm": 0.138671875,
"learning_rate": 3.9178082191780823e-05,
"logits/chosen": 0.38490670919418335,
"logits/rejected": 0.6002693176269531,
"logps/chosen": -87.23043823242188,
"logps/rejected": -338.7787170410156,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8123894929885864,
"rewards/margins": 16.088157653808594,
"rewards/rejected": -16.90054702758789,
"step": 188
},
{
"epoch": 2.462540716612378,
"grad_norm": 0.267578125,
"learning_rate": 3.910958904109589e-05,
"logits/chosen": 0.4915946125984192,
"logits/rejected": 0.5476264953613281,
"logps/chosen": -82.17195892333984,
"logps/rejected": -279.19854736328125,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7035698890686035,
"rewards/margins": 12.01541519165039,
"rewards/rejected": -12.718984603881836,
"step": 189
},
{
"epoch": 2.47557003257329,
"grad_norm": 0.078125,
"learning_rate": 3.904109589041096e-05,
"logits/chosen": 0.5442248582839966,
"logits/rejected": 0.5692495107650757,
"logps/chosen": -118.85929870605469,
"logps/rejected": -289.40765380859375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2475743293762207,
"rewards/margins": 12.606383323669434,
"rewards/rejected": -13.853957176208496,
"step": 190
},
{
"epoch": 2.488599348534202,
"grad_norm": 0.3046875,
"learning_rate": 3.897260273972603e-05,
"logits/chosen": 0.5258509516716003,
"logits/rejected": 0.6596174240112305,
"logps/chosen": -131.38265991210938,
"logps/rejected": -283.6547546386719,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4643476009368896,
"rewards/margins": 11.995124816894531,
"rewards/rejected": -13.45947265625,
"step": 191
},
{
"epoch": 2.5016286644951142,
"grad_norm": 0.0108642578125,
"learning_rate": 3.89041095890411e-05,
"logits/chosen": 0.4301671087741852,
"logits/rejected": 0.5925787091255188,
"logps/chosen": -98.11710357666016,
"logps/rejected": -325.28521728515625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8909515738487244,
"rewards/margins": 15.353211402893066,
"rewards/rejected": -16.24416160583496,
"step": 192
},
{
"epoch": 2.5146579804560263,
"grad_norm": 0.2021484375,
"learning_rate": 3.883561643835617e-05,
"logits/chosen": 0.5148497819900513,
"logits/rejected": 0.5551873445510864,
"logps/chosen": -97.75564575195312,
"logps/rejected": -261.85284423828125,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9065383672714233,
"rewards/margins": 12.213420867919922,
"rewards/rejected": -13.119958877563477,
"step": 193
},
{
"epoch": 2.5276872964169383,
"grad_norm": 0.036865234375,
"learning_rate": 3.8767123287671233e-05,
"logits/chosen": 0.49658170342445374,
"logits/rejected": 0.5841426849365234,
"logps/chosen": -129.8172149658203,
"logps/rejected": -320.80657958984375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4361586570739746,
"rewards/margins": 13.985431671142578,
"rewards/rejected": -15.421590805053711,
"step": 194
},
{
"epoch": 2.5407166123778504,
"grad_norm": 0.032470703125,
"learning_rate": 3.86986301369863e-05,
"logits/chosen": 0.40110084414482117,
"logits/rejected": 0.4429419934749603,
"logps/chosen": -110.05766296386719,
"logps/rejected": -279.5133056640625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.49040687084198,
"rewards/margins": 12.111815452575684,
"rewards/rejected": -13.602222442626953,
"step": 195
},
{
"epoch": 2.5537459283387625,
"grad_norm": 0.047607421875,
"learning_rate": 3.863013698630137e-05,
"logits/chosen": 0.3707536458969116,
"logits/rejected": 0.4637380838394165,
"logps/chosen": -111.06605529785156,
"logps/rejected": -331.0019836425781,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4405083656311035,
"rewards/margins": 14.104761123657227,
"rewards/rejected": -15.545269966125488,
"step": 196
},
{
"epoch": 2.5667752442996745,
"grad_norm": 0.1787109375,
"learning_rate": 3.856164383561644e-05,
"logits/chosen": 0.4742357134819031,
"logits/rejected": 0.5186038613319397,
"logps/chosen": -102.65884399414062,
"logps/rejected": -286.2248229980469,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2666797637939453,
"rewards/margins": 12.227950096130371,
"rewards/rejected": -13.494630813598633,
"step": 197
},
{
"epoch": 2.5798045602605866,
"grad_norm": 3.21875,
"learning_rate": 3.8493150684931505e-05,
"logits/chosen": 0.5423088073730469,
"logits/rejected": 0.5629587173461914,
"logps/chosen": -116.73429870605469,
"logps/rejected": -314.3695068359375,
"loss": 0.0048,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7822774648666382,
"rewards/margins": 12.672046661376953,
"rewards/rejected": -14.454323768615723,
"step": 198
},
{
"epoch": 2.5928338762214986,
"grad_norm": 0.032470703125,
"learning_rate": 3.842465753424658e-05,
"logits/chosen": 0.3940759599208832,
"logits/rejected": 0.5198019742965698,
"logps/chosen": -129.81735229492188,
"logps/rejected": -329.01812744140625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2780612707138062,
"rewards/margins": 14.856027603149414,
"rewards/rejected": -16.13408851623535,
"step": 199
},
{
"epoch": 2.6058631921824107,
"grad_norm": 0.0235595703125,
"learning_rate": 3.8356164383561644e-05,
"logits/chosen": 0.4287755489349365,
"logits/rejected": 0.49127259850502014,
"logps/chosen": -88.73255920410156,
"logps/rejected": -294.54254150390625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.845811128616333,
"rewards/margins": 13.853937149047852,
"rewards/rejected": -14.699748992919922,
"step": 200
},
{
"epoch": 2.6058631921824107,
"eval_logits/chosen": 0.41032981872558594,
"eval_logits/rejected": 0.4839063882827759,
"eval_logps/chosen": -110.02198791503906,
"eval_logps/rejected": -293.51873779296875,
"eval_loss": 0.00859944336116314,
"eval_rewards/accuracies": 0.9955357313156128,
"eval_rewards/chosen": -1.456769585609436,
"eval_rewards/margins": 12.785966873168945,
"eval_rewards/rejected": -14.24273681640625,
"eval_runtime": 52.2735,
"eval_samples_per_second": 12.836,
"eval_steps_per_second": 0.803,
"step": 200
}
],
"logging_steps": 1.0,
"max_steps": 760,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}