FormlessAI's picture
Training in progress, epoch 6, checkpoint
082293f verified
{
"best_global_step": null,
"best_metric": 0.007082384079694748,
"best_model_checkpoint": null,
"epoch": 6.07181328545781,
"eval_steps": 50,
"global_step": 850,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03590664272890485,
"grad_norm": 17.81623077392578,
"learning_rate": 4.55034563588381e-06,
"logits/chosen": -2.323437452316284,
"logits/rejected": -2.70703125,
"logps/chosen": -72.5999984741211,
"logps/rejected": -82.94999694824219,
"loss": 0.6863,
"rewards/accuracies": 0.29374998807907104,
"rewards/chosen": -0.008679199032485485,
"rewards/margins": 0.01282348670065403,
"rewards/rejected": -0.02150878868997097,
"step": 5
},
{
"epoch": 0.0718132854578097,
"grad_norm": 13.013919830322266,
"learning_rate": 1.0238277680738572e-05,
"logits/chosen": -2.299609422683716,
"logits/rejected": -2.8023438453674316,
"logps/chosen": -76.6624984741211,
"logps/rejected": -89.5250015258789,
"loss": 0.5385,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.25728148221969604,
"rewards/margins": 0.44196778535842896,
"rewards/rejected": -0.6999267339706421,
"step": 10
},
{
"epoch": 0.10771992818671454,
"grad_norm": 10.523333549499512,
"learning_rate": 1.5926209725593337e-05,
"logits/chosen": -3.29296875,
"logits/rejected": -3.5562500953674316,
"logps/chosen": -86.69999694824219,
"logps/rejected": -112.17500305175781,
"loss": 0.325,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -1.2952148914337158,
"rewards/margins": 1.656640648841858,
"rewards/rejected": -2.951953172683716,
"step": 15
},
{
"epoch": 0.1436265709156194,
"grad_norm": 8.033699035644531,
"learning_rate": 2.1614141770448097e-05,
"logits/chosen": -4.182031154632568,
"logits/rejected": -4.242968559265137,
"logps/chosen": -109.25,
"logps/rejected": -148.60000610351562,
"loss": 0.2014,
"rewards/accuracies": 0.921875,
"rewards/chosen": -3.4242186546325684,
"rewards/margins": 3.3539061546325684,
"rewards/rejected": -6.779687404632568,
"step": 20
},
{
"epoch": 0.17953321364452424,
"grad_norm": 32.956817626953125,
"learning_rate": 2.730207381530286e-05,
"logits/chosen": -6.021874904632568,
"logits/rejected": -6.440625190734863,
"logps/chosen": -132.1750030517578,
"logps/rejected": -206.5,
"loss": 0.1298,
"rewards/accuracies": 0.953125,
"rewards/chosen": -6.412499904632568,
"rewards/margins": 6.567187309265137,
"rewards/rejected": -12.984375,
"step": 25
},
{
"epoch": 0.21543985637342908,
"grad_norm": 5.681188583374023,
"learning_rate": 3.2990005860157616e-05,
"logits/chosen": -6.560937404632568,
"logits/rejected": -6.815625190734863,
"logps/chosen": -157.5500030517578,
"logps/rejected": -244.64999389648438,
"loss": 0.0997,
"rewards/accuracies": 0.953125,
"rewards/chosen": -8.732812881469727,
"rewards/margins": 7.779687404632568,
"rewards/rejected": -16.496875762939453,
"step": 30
},
{
"epoch": 0.2513464991023339,
"grad_norm": 10.916585922241211,
"learning_rate": 3.8677937905012385e-05,
"logits/chosen": -5.189062595367432,
"logits/rejected": -5.775781154632568,
"logps/chosen": -156.9499969482422,
"logps/rejected": -237.1999969482422,
"loss": 0.1292,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -8.623437881469727,
"rewards/margins": 7.248437404632568,
"rewards/rejected": -15.871874809265137,
"step": 35
},
{
"epoch": 0.2872531418312388,
"grad_norm": 6.150071620941162,
"learning_rate": 4.436586994986715e-05,
"logits/chosen": -4.82421875,
"logits/rejected": -5.315625190734863,
"logps/chosen": -158.4250030517578,
"logps/rejected": -236.89999389648438,
"loss": 0.1002,
"rewards/accuracies": 0.96875,
"rewards/chosen": -8.595312118530273,
"rewards/margins": 7.290625095367432,
"rewards/rejected": -15.868749618530273,
"step": 40
},
{
"epoch": 0.3231597845601436,
"grad_norm": 5.110800743103027,
"learning_rate": 5.005380199472191e-05,
"logits/chosen": -6.574999809265137,
"logits/rejected": -7.1171875,
"logps/chosen": -172.0500030517578,
"logps/rejected": -269.04998779296875,
"loss": 0.086,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -10.365625381469727,
"rewards/margins": 8.489062309265137,
"rewards/rejected": -18.862499237060547,
"step": 45
},
{
"epoch": 0.3590664272890485,
"grad_norm": 7.762596130371094,
"learning_rate": 5.574173403957667e-05,
"logits/chosen": -6.243750095367432,
"logits/rejected": -6.870312690734863,
"logps/chosen": -145.4499969482422,
"logps/rejected": -242.5500030517578,
"loss": 0.0698,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -7.448437690734863,
"rewards/margins": 8.868749618530273,
"rewards/rejected": -16.306249618530273,
"step": 50
},
{
"epoch": 0.3590664272890485,
"eval_logits/chosen": -6.295955657958984,
"eval_logits/rejected": -7.025735378265381,
"eval_logps/chosen": -148.5294189453125,
"eval_logps/rejected": -254.23529052734375,
"eval_loss": 0.04325719177722931,
"eval_rewards/accuracies": 0.9816176295280457,
"eval_rewards/chosen": -7.556985378265381,
"eval_rewards/margins": 9.713234901428223,
"eval_rewards/rejected": -17.257352828979492,
"eval_runtime": 7.5302,
"eval_samples_per_second": 35.457,
"eval_steps_per_second": 2.258,
"step": 50
},
{
"epoch": 0.39497307001795334,
"grad_norm": 4.944767475128174,
"learning_rate": 6.142966608443144e-05,
"logits/chosen": -7.017187595367432,
"logits/rejected": -7.801562309265137,
"logps/chosen": -165.39999389648438,
"logps/rejected": -271.54998779296875,
"loss": 0.0805,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -9.506250381469727,
"rewards/margins": 9.890625,
"rewards/rejected": -19.396875381469727,
"step": 55
},
{
"epoch": 0.43087971274685816,
"grad_norm": 5.226822376251221,
"learning_rate": 6.711759812928619e-05,
"logits/chosen": -7.185937404632568,
"logits/rejected": -8.079687118530273,
"logps/chosen": -187.5,
"logps/rejected": -307.6000061035156,
"loss": 0.076,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -11.8125,
"rewards/margins": 11.323437690734863,
"rewards/rejected": -23.143749237060547,
"step": 60
},
{
"epoch": 0.466786355475763,
"grad_norm": 2.203220844268799,
"learning_rate": 7.280553017414096e-05,
"logits/chosen": -3.3968749046325684,
"logits/rejected": -4.358593940734863,
"logps/chosen": -151.75,
"logps/rejected": -241.5500030517578,
"loss": 0.1146,
"rewards/accuracies": 0.9593750238418579,
"rewards/chosen": -8.064062118530273,
"rewards/margins": 8.042187690734863,
"rewards/rejected": -16.112499237060547,
"step": 65
},
{
"epoch": 0.5026929982046678,
"grad_norm": 1.6759897470474243,
"learning_rate": 7.849346221899571e-05,
"logits/chosen": -2.987499952316284,
"logits/rejected": -3.8515625,
"logps/chosen": -123.80000305175781,
"logps/rejected": -217.60000610351562,
"loss": 0.0679,
"rewards/accuracies": 0.96875,
"rewards/chosen": -5.29296875,
"rewards/margins": 8.653124809265137,
"rewards/rejected": -13.934374809265137,
"step": 70
},
{
"epoch": 0.5385996409335727,
"grad_norm": 3.1203854084014893,
"learning_rate": 8.418139426385048e-05,
"logits/chosen": -9.951562881469727,
"logits/rejected": -10.609375,
"logps/chosen": -246.6999969482422,
"logps/rejected": -369.29998779296875,
"loss": 0.0558,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -17.778125762939453,
"rewards/margins": 11.425000190734863,
"rewards/rejected": -29.212499618530273,
"step": 75
},
{
"epoch": 0.5745062836624776,
"grad_norm": 1.693241834640503,
"learning_rate": 8.986932630870525e-05,
"logits/chosen": -8.556249618530273,
"logits/rejected": -9.748437881469727,
"logps/chosen": -256.1000061035156,
"logps/rejected": -398.79998779296875,
"loss": 0.1092,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -18.521875381469727,
"rewards/margins": 13.868749618530273,
"rewards/rejected": -32.38750076293945,
"step": 80
},
{
"epoch": 0.6104129263913824,
"grad_norm": 1.1957429647445679,
"learning_rate": 9.555725835356e-05,
"logits/chosen": -4.921875,
"logits/rejected": -6.415625095367432,
"logps/chosen": -206.8000030517578,
"logps/rejected": -348.70001220703125,
"loss": 0.014,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -13.831250190734863,
"rewards/margins": 13.28125,
"rewards/rejected": -27.112499237060547,
"step": 85
},
{
"epoch": 0.6463195691202872,
"grad_norm": 1.4185799360275269,
"learning_rate": 0.00010124519039841476,
"logits/chosen": -8.342187881469727,
"logits/rejected": -9.996874809265137,
"logps/chosen": -340.20001220703125,
"logps/rejected": -538.7000122070312,
"loss": 0.0447,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -27.181249618530273,
"rewards/margins": 19.196874618530273,
"rewards/rejected": -46.38750076293945,
"step": 90
},
{
"epoch": 0.6822262118491921,
"grad_norm": 1.2728252410888672,
"learning_rate": 0.00010693312244326953,
"logits/chosen": -8.214062690734863,
"logits/rejected": -9.143750190734863,
"logps/chosen": -374.8999938964844,
"logps/rejected": -553.9000244140625,
"loss": 0.1363,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -31.162500381469727,
"rewards/margins": 16.59375,
"rewards/rejected": -47.724998474121094,
"step": 95
},
{
"epoch": 0.718132854578097,
"grad_norm": 4.153715133666992,
"learning_rate": 0.00011262105448812429,
"logits/chosen": -8.5078125,
"logits/rejected": -10.462499618530273,
"logps/chosen": -284.1000061035156,
"logps/rejected": -456.29998779296875,
"loss": 0.0684,
"rewards/accuracies": 0.96875,
"rewards/chosen": -21.65625,
"rewards/margins": 16.381250381469727,
"rewards/rejected": -38.01250076293945,
"step": 100
},
{
"epoch": 0.718132854578097,
"eval_logits/chosen": -11.941176414489746,
"eval_logits/rejected": -14.220588684082031,
"eval_logps/chosen": -293.1764831542969,
"eval_logps/rejected": -514.4705810546875,
"eval_loss": 0.029092751443386078,
"eval_rewards/accuracies": 0.9926470518112183,
"eval_rewards/chosen": -22.25,
"eval_rewards/margins": 21.441177368164062,
"eval_rewards/rejected": -43.661766052246094,
"eval_runtime": 8.7058,
"eval_samples_per_second": 30.669,
"eval_steps_per_second": 1.953,
"step": 100
},
{
"epoch": 0.7540394973070018,
"grad_norm": 5.066140651702881,
"learning_rate": 0.0001137585950751329,
"logits/chosen": -13.131250381469727,
"logits/rejected": -14.824999809265137,
"logps/chosen": -293.25,
"logps/rejected": -497.3999938964844,
"loss": 0.0586,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -22.725000381469727,
"rewards/margins": 19.806249618530273,
"rewards/rejected": -42.5,
"step": 105
},
{
"epoch": 0.7899461400359067,
"grad_norm": 2.763026714324951,
"learning_rate": 0.00011375840892353745,
"logits/chosen": -10.834375381469727,
"logits/rejected": -12.490625381469727,
"logps/chosen": -248.6999969482422,
"logps/rejected": -443.8999938964844,
"loss": 0.0501,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -17.862499237060547,
"rewards/margins": 18.778125762939453,
"rewards/rejected": -36.63750076293945,
"step": 110
},
{
"epoch": 0.8258527827648114,
"grad_norm": 2.0490527153015137,
"learning_rate": 0.00011375807957890452,
"logits/chosen": -7.245312690734863,
"logits/rejected": -9.215624809265137,
"logps/chosen": -165.6750030517578,
"logps/rejected": -346.3999938964844,
"loss": 0.0537,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -9.360937118530273,
"rewards/margins": 17.399999618530273,
"rewards/rejected": -26.75,
"step": 115
},
{
"epoch": 0.8617594254937163,
"grad_norm": 1.383483648300171,
"learning_rate": 0.00011375760704206321,
"logits/chosen": -5.696875095367432,
"logits/rejected": -7.876562595367432,
"logps/chosen": -145.625,
"logps/rejected": -273.0,
"loss": 0.0769,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -7.3359375,
"rewards/margins": 12.178125381469727,
"rewards/rejected": -19.512500762939453,
"step": 120
},
{
"epoch": 0.8976660682226212,
"grad_norm": 1.2604446411132812,
"learning_rate": 0.00011375699131420316,
"logits/chosen": -6.131249904632568,
"logits/rejected": -8.760937690734863,
"logps/chosen": -154.0,
"logps/rejected": -332.79998779296875,
"loss": 0.0711,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -8.456250190734863,
"rewards/margins": 16.806249618530273,
"rewards/rejected": -25.268749237060547,
"step": 125
},
{
"epoch": 0.933572710951526,
"grad_norm": 3.156709671020508,
"learning_rate": 0.00011375623239687444,
"logits/chosen": -6.84375,
"logits/rejected": -8.912500381469727,
"logps/chosen": -213.52499389648438,
"logps/rejected": -368.04998779296875,
"loss": 0.0617,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -14.483593940734863,
"rewards/margins": 14.620312690734863,
"rewards/rejected": -29.118749618530273,
"step": 130
},
{
"epoch": 0.9694793536804309,
"grad_norm": 1.4515728950500488,
"learning_rate": 0.00011375533029198762,
"logits/chosen": -9.087499618530273,
"logits/rejected": -10.668749809265137,
"logps/chosen": -258.79998779296875,
"logps/rejected": -419.79998779296875,
"loss": 0.0257,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -18.887500762939453,
"rewards/margins": 15.359375,
"rewards/rejected": -34.26874923706055,
"step": 135
},
{
"epoch": 1.0,
"grad_norm": 3.5940001010894775,
"learning_rate": 0.00011375428500181377,
"logits/chosen": -6.693014621734619,
"logits/rejected": -10.113970756530762,
"logps/chosen": -174.76470947265625,
"logps/rejected": -416.3529357910156,
"loss": 0.0226,
"rewards/accuracies": 0.9779411554336548,
"rewards/chosen": -10.352941513061523,
"rewards/margins": 23.735294342041016,
"rewards/rejected": -34.117645263671875,
"step": 140
},
{
"epoch": 1.0359066427289048,
"grad_norm": 3.284130573272705,
"learning_rate": 0.00011375309652898442,
"logits/chosen": -11.453125,
"logits/rejected": -13.603124618530273,
"logps/chosen": -387.0,
"logps/rejected": -576.7999877929688,
"loss": 0.0758,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -32.08124923706055,
"rewards/margins": 18.231250762939453,
"rewards/rejected": -50.32500076293945,
"step": 145
},
{
"epoch": 1.0718132854578097,
"grad_norm": 2.5212385654449463,
"learning_rate": 0.00011375176487649152,
"logits/chosen": -9.884374618530273,
"logits/rejected": -11.106249809265137,
"logps/chosen": -335.29998779296875,
"logps/rejected": -490.0,
"loss": 0.0654,
"rewards/accuracies": 0.96875,
"rewards/chosen": -26.9375,
"rewards/margins": 14.440625190734863,
"rewards/rejected": -41.349998474121094,
"step": 150
},
{
"epoch": 1.0718132854578097,
"eval_logits/chosen": -8.321691513061523,
"eval_logits/rejected": -10.400734901428223,
"eval_logps/chosen": -270.1764831542969,
"eval_logps/rejected": -451.76470947265625,
"eval_loss": 0.04131906107068062,
"eval_rewards/accuracies": 0.9779411554336548,
"eval_rewards/chosen": -19.90441131591797,
"eval_rewards/margins": 17.422794342041016,
"eval_rewards/rejected": -37.35293960571289,
"eval_runtime": 8.5972,
"eval_samples_per_second": 31.057,
"eval_steps_per_second": 1.977,
"step": 150
},
{
"epoch": 1.1077199281867145,
"grad_norm": 0.6480632424354553,
"learning_rate": 0.00011375029004768751,
"logits/chosen": -8.751562118530273,
"logits/rejected": -10.721875190734863,
"logps/chosen": -271.6499938964844,
"logps/rejected": -462.8999938964844,
"loss": 0.0452,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -20.331249237060547,
"rewards/margins": 18.353124618530273,
"rewards/rejected": -38.681251525878906,
"step": 155
},
{
"epoch": 1.1436265709156195,
"grad_norm": 1.824242353439331,
"learning_rate": 0.00011374867204628526,
"logits/chosen": -8.482812881469727,
"logits/rejected": -10.028124809265137,
"logps/chosen": -293.45001220703125,
"logps/rejected": -478.5,
"loss": 0.042,
"rewards/accuracies": 0.984375,
"rewards/chosen": -22.506250381469727,
"rewards/margins": 17.868749618530273,
"rewards/rejected": -40.33124923706055,
"step": 160
},
{
"epoch": 1.1795332136445242,
"grad_norm": 6.0788984298706055,
"learning_rate": 0.00011374691087635812,
"logits/chosen": -9.996874809265137,
"logits/rejected": -11.309374809265137,
"logps/chosen": -378.8999938964844,
"logps/rejected": -676.7000122070312,
"loss": 0.0607,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -31.174999237060547,
"rewards/margins": 29.106250762939453,
"rewards/rejected": -60.26250076293945,
"step": 165
},
{
"epoch": 1.215439856373429,
"grad_norm": 2.306962013244629,
"learning_rate": 0.00011374500654233978,
"logits/chosen": -6.659375190734863,
"logits/rejected": -8.645312309265137,
"logps/chosen": -239.0,
"logps/rejected": -461.29998779296875,
"loss": 0.0665,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -16.965625762939453,
"rewards/margins": 21.640625,
"rewards/rejected": -38.599998474121094,
"step": 170
},
{
"epoch": 1.251346499102334,
"grad_norm": 2.2957208156585693,
"learning_rate": 0.00011374295904902446,
"logits/chosen": -7.462500095367432,
"logits/rejected": -9.553125381469727,
"logps/chosen": -236.8000030517578,
"logps/rejected": -444.5,
"loss": 0.0593,
"rewards/accuracies": 0.984375,
"rewards/chosen": -16.668750762939453,
"rewards/margins": 20.456249237060547,
"rewards/rejected": -37.10625076293945,
"step": 175
},
{
"epoch": 1.2872531418312387,
"grad_norm": 9.126953125,
"learning_rate": 0.00011374076840156666,
"logits/chosen": -6.907812595367432,
"logits/rejected": -9.303125381469727,
"logps/chosen": -205.5,
"logps/rejected": -396.20001220703125,
"loss": 0.0372,
"rewards/accuracies": 0.984375,
"rewards/chosen": -13.418749809265137,
"rewards/margins": 18.612499237060547,
"rewards/rejected": -32.037498474121094,
"step": 180
},
{
"epoch": 1.3231597845601435,
"grad_norm": 1.2403417825698853,
"learning_rate": 0.00011373843460548139,
"logits/chosen": -7.637499809265137,
"logits/rejected": -10.353124618530273,
"logps/chosen": -251.9499969482422,
"logps/rejected": -456.20001220703125,
"loss": 0.0473,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -18.381250381469727,
"rewards/margins": 19.59375,
"rewards/rejected": -37.974998474121094,
"step": 185
},
{
"epoch": 1.3590664272890485,
"grad_norm": 3.5465104579925537,
"learning_rate": 0.00011373595766664395,
"logits/chosen": -10.581250190734863,
"logits/rejected": -12.559374809265137,
"logps/chosen": -374.29998779296875,
"logps/rejected": -551.2000122070312,
"loss": 0.0332,
"rewards/accuracies": 0.984375,
"rewards/chosen": -30.412500381469727,
"rewards/margins": 17.181249618530273,
"rewards/rejected": -47.599998474121094,
"step": 190
},
{
"epoch": 1.3949730700179535,
"grad_norm": 0.9382606744766235,
"learning_rate": 0.00011373333759129004,
"logits/chosen": -12.212499618530273,
"logits/rejected": -13.796875,
"logps/chosen": -380.1000061035156,
"logps/rejected": -573.7999877929688,
"loss": 0.0366,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -31.475000381469727,
"rewards/margins": 18.6875,
"rewards/rejected": -50.20000076293945,
"step": 195
},
{
"epoch": 1.4308797127468582,
"grad_norm": 0.6339579224586487,
"learning_rate": 0.00011373057438601568,
"logits/chosen": -9.418749809265137,
"logits/rejected": -11.771875381469727,
"logps/chosen": -282.20001220703125,
"logps/rejected": -479.29998779296875,
"loss": 0.048,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -21.262500762939453,
"rewards/margins": 19.103124618530273,
"rewards/rejected": -40.36249923706055,
"step": 200
},
{
"epoch": 1.4308797127468582,
"eval_logits/chosen": -6.602941036224365,
"eval_logits/rejected": -9.966911315917969,
"eval_logps/chosen": -166.5294189453125,
"eval_logps/rejected": -406.70587158203125,
"eval_loss": 0.028071066364645958,
"eval_rewards/accuracies": 0.9889705777168274,
"eval_rewards/chosen": -9.369484901428223,
"eval_rewards/margins": 23.360294342041016,
"eval_rewards/rejected": -32.72793960571289,
"eval_runtime": 8.6276,
"eval_samples_per_second": 30.947,
"eval_steps_per_second": 1.97,
"step": 200
},
{
"epoch": 1.466786355475763,
"grad_norm": 3.074913501739502,
"learning_rate": 0.00011372766805777725,
"logits/chosen": -4.8984375,
"logits/rejected": -8.321874618530273,
"logps/chosen": -133.85000610351562,
"logps/rejected": -351.5,
"loss": 0.0452,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.512499809265137,
"rewards/margins": 21.049999237060547,
"rewards/rejected": -27.5625,
"step": 205
},
{
"epoch": 1.502692998204668,
"grad_norm": 3.414430618286133,
"learning_rate": 0.00011372461861389142,
"logits/chosen": -5.293749809265137,
"logits/rejected": -7.599999904632568,
"logps/chosen": -180.0,
"logps/rejected": -397.3999938964844,
"loss": 0.0937,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -11.2421875,
"rewards/margins": 20.878124237060547,
"rewards/rejected": -32.10625076293945,
"step": 210
},
{
"epoch": 1.5385996409335727,
"grad_norm": 1.9072022438049316,
"learning_rate": 0.00011372142606203516,
"logits/chosen": -4.057812690734863,
"logits/rejected": -6.109375,
"logps/chosen": -156.0749969482422,
"logps/rejected": -324.79998779296875,
"loss": 0.0461,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -8.564062118530273,
"rewards/margins": 16.028125762939453,
"rewards/rejected": -24.587499618530273,
"step": 215
},
{
"epoch": 1.5745062836624775,
"grad_norm": 1.2352949380874634,
"learning_rate": 0.00011371809041024573,
"logits/chosen": -5.315625190734863,
"logits/rejected": -7.754687309265137,
"logps/chosen": -189.64999389648438,
"logps/rejected": -392.0,
"loss": 0.0332,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -11.946874618530273,
"rewards/margins": 19.446874618530273,
"rewards/rejected": -31.424999237060547,
"step": 220
},
{
"epoch": 1.6104129263913824,
"grad_norm": 0.5811780095100403,
"learning_rate": 0.00011371461166692062,
"logits/chosen": -7.451562404632568,
"logits/rejected": -10.225000381469727,
"logps/chosen": -218.4499969482422,
"logps/rejected": -433.79998779296875,
"loss": 0.0486,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.824999809265137,
"rewards/margins": 20.75,
"rewards/rejected": -35.57500076293945,
"step": 225
},
{
"epoch": 1.6463195691202872,
"grad_norm": 0.33781036734580994,
"learning_rate": 0.00011371098984081755,
"logits/chosen": -9.0390625,
"logits/rejected": -11.784375190734863,
"logps/chosen": -232.75,
"logps/rejected": -425.79998779296875,
"loss": 0.0465,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -16.509374618530273,
"rewards/margins": 18.971874237060547,
"rewards/rejected": -35.45624923706055,
"step": 230
},
{
"epoch": 1.682226211849192,
"grad_norm": 0.9501491189002991,
"learning_rate": 0.0001137072249410545,
"logits/chosen": -8.489062309265137,
"logits/rejected": -11.706250190734863,
"logps/chosen": -229.6999969482422,
"logps/rejected": -451.3999938964844,
"loss": 0.0196,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -16.003124237060547,
"rewards/margins": 21.600000381469727,
"rewards/rejected": -37.599998474121094,
"step": 235
},
{
"epoch": 1.718132854578097,
"grad_norm": 2.2919723987579346,
"learning_rate": 0.00011370331697710956,
"logits/chosen": -8.271875381469727,
"logits/rejected": -12.678125381469727,
"logps/chosen": -205.75,
"logps/rejected": -511.79998779296875,
"loss": 0.0752,
"rewards/accuracies": 0.984375,
"rewards/chosen": -13.65625,
"rewards/margins": 30.087499618530273,
"rewards/rejected": -43.75,
"step": 240
},
{
"epoch": 1.754039497307002,
"grad_norm": 2.5256083011627197,
"learning_rate": 0.00011369926595882104,
"logits/chosen": -10.071874618530273,
"logits/rejected": -13.768750190734863,
"logps/chosen": -291.04998779296875,
"logps/rejected": -619.2000122070312,
"loss": 0.0377,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -22.446874618530273,
"rewards/margins": 32.150001525878906,
"rewards/rejected": -54.57500076293945,
"step": 245
},
{
"epoch": 1.7899461400359067,
"grad_norm": 1.3631178140640259,
"learning_rate": 0.00011369507189638736,
"logits/chosen": -3.7313232421875,
"logits/rejected": -7.044335842132568,
"logps/chosen": -175.27499389648438,
"logps/rejected": -371.45001220703125,
"loss": 0.0627,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -10.771875381469727,
"rewards/margins": 18.6015625,
"rewards/rejected": -29.353124618530273,
"step": 250
},
{
"epoch": 1.7899461400359067,
"eval_logits/chosen": -0.22195972502231598,
"eval_logits/rejected": -1.6530331373214722,
"eval_logps/chosen": -116.17646789550781,
"eval_logps/rejected": -208.88235473632812,
"eval_loss": 0.07754824310541153,
"eval_rewards/accuracies": 0.9485294222831726,
"eval_rewards/chosen": -4.249080657958984,
"eval_rewards/margins": 8.38786792755127,
"eval_rewards/rejected": -12.650734901428223,
"eval_runtime": 8.5185,
"eval_samples_per_second": 31.344,
"eval_steps_per_second": 1.996,
"step": 250
},
{
"epoch": 1.8258527827648114,
"grad_norm": 1.8161951303482056,
"learning_rate": 0.00011369073480036712,
"logits/chosen": -2.3915038108825684,
"logits/rejected": -3.521484375,
"logps/chosen": -145.4250030517578,
"logps/rejected": -257.20001220703125,
"loss": 0.0791,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.525000095367432,
"rewards/margins": 10.292187690734863,
"rewards/rejected": -17.818750381469727,
"step": 255
},
{
"epoch": 1.8617594254937164,
"grad_norm": 0.3177375793457031,
"learning_rate": 0.00011368625468167889,
"logits/chosen": -8.206250190734863,
"logits/rejected": -10.065625190734863,
"logps/chosen": -325.3500061035156,
"logps/rejected": -509.1000061035156,
"loss": 0.0508,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -26.078125,
"rewards/margins": 17.381250381469727,
"rewards/rejected": -43.42499923706055,
"step": 260
},
{
"epoch": 1.8976660682226212,
"grad_norm": 0.7164928317070007,
"learning_rate": 0.00011368163155160139,
"logits/chosen": -10.165624618530273,
"logits/rejected": -11.875,
"logps/chosen": -410.29998779296875,
"logps/rejected": -585.4000244140625,
"loss": 0.0577,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -34.45624923706055,
"rewards/margins": 16.634374618530273,
"rewards/rejected": -51.07500076293945,
"step": 265
},
{
"epoch": 1.933572710951526,
"grad_norm": 2.070739507675171,
"learning_rate": 0.00011367686542177336,
"logits/chosen": -6.4375,
"logits/rejected": -7.814062595367432,
"logps/chosen": -340.1000061035156,
"logps/rejected": -469.79998779296875,
"loss": 0.0576,
"rewards/accuracies": 0.9593750238418579,
"rewards/chosen": -27.168750762939453,
"rewards/margins": 12.381250381469727,
"rewards/rejected": -39.54999923706055,
"step": 270
},
{
"epoch": 1.969479353680431,
"grad_norm": 1.2207964658737183,
"learning_rate": 0.0001136719563041935,
"logits/chosen": -4.547656059265137,
"logits/rejected": -6.598437309265137,
"logps/chosen": -263.0,
"logps/rejected": -423.8999938964844,
"loss": 0.0322,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -19.46875,
"rewards/margins": 15.509374618530273,
"rewards/rejected": -34.993751525878906,
"step": 275
},
{
"epoch": 2.0,
"grad_norm": 6.416799545288086,
"learning_rate": 0.0001136669042112205,
"logits/chosen": -5.321691036224365,
"logits/rejected": -8.242647171020508,
"logps/chosen": -274.0,
"logps/rejected": -515.6470336914062,
"loss": 0.0428,
"rewards/accuracies": 0.9852941036224365,
"rewards/chosen": -20.264705657958984,
"rewards/margins": 23.514705657958984,
"rewards/rejected": -43.80882263183594,
"step": 280
},
{
"epoch": 2.035906642728905,
"grad_norm": 0.0026381895877420902,
"learning_rate": 0.00011366170915557303,
"logits/chosen": -6.020312309265137,
"logits/rejected": -9.284375190734863,
"logps/chosen": -289.1499938964844,
"logps/rejected": -603.0,
"loss": 0.0676,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -22.331249237060547,
"rewards/margins": 30.418750762939453,
"rewards/rejected": -52.75,
"step": 285
},
{
"epoch": 2.0718132854578095,
"grad_norm": 9.352706909179688,
"learning_rate": 0.00011365637115032964,
"logits/chosen": -5.478906154632568,
"logits/rejected": -9.690625190734863,
"logps/chosen": -263.3500061035156,
"logps/rejected": -613.4000244140625,
"loss": 0.0791,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -19.450000762939453,
"rewards/margins": 34.474998474121094,
"rewards/rejected": -53.9375,
"step": 290
},
{
"epoch": 2.1077199281867145,
"grad_norm": 1.2689452171325684,
"learning_rate": 0.0001136508902089287,
"logits/chosen": -2.486132860183716,
"logits/rejected": -5.1484375,
"logps/chosen": -145.1999969482422,
"logps/rejected": -346.20001220703125,
"loss": 0.1107,
"rewards/accuracies": 0.9593750238418579,
"rewards/chosen": -7.432031154632568,
"rewards/margins": 19.28125,
"rewards/rejected": -26.706249237060547,
"step": 295
},
{
"epoch": 2.1436265709156195,
"grad_norm": 0.04833826795220375,
"learning_rate": 0.00011364526634516852,
"logits/chosen": -5.057812690734863,
"logits/rejected": -8.810937881469727,
"logps/chosen": -240.14999389648438,
"logps/rejected": -519.7999877929688,
"loss": 0.1121,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -17.071874618530273,
"rewards/margins": 27.412500381469727,
"rewards/rejected": -44.462501525878906,
"step": 300
},
{
"epoch": 2.1436265709156195,
"eval_logits/chosen": -4.876838207244873,
"eval_logits/rejected": -8.235294342041016,
"eval_logps/chosen": -253.05882263183594,
"eval_logps/rejected": -514.8235473632812,
"eval_loss": 0.02023915760219097,
"eval_rewards/accuracies": 0.9836230278015137,
"eval_rewards/chosen": -18.169116973876953,
"eval_rewards/margins": 25.522058486938477,
"eval_rewards/rejected": -43.661766052246094,
"eval_runtime": 8.6202,
"eval_samples_per_second": 30.974,
"eval_steps_per_second": 1.972,
"step": 300
},
{
"epoch": 2.1795332136445245,
"grad_norm": 3.4645302295684814,
"learning_rate": 0.00011363949957320717,
"logits/chosen": -4.932812690734863,
"logits/rejected": -7.857812404632568,
"logps/chosen": -253.5500030517578,
"logps/rejected": -483.5,
"loss": 0.0512,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -18.543750762939453,
"rewards/margins": 22.434375762939453,
"rewards/rejected": -40.98749923706055,
"step": 305
},
{
"epoch": 2.215439856373429,
"grad_norm": 0.09185401350259781,
"learning_rate": 0.00011363358990756246,
"logits/chosen": -4.626562595367432,
"logits/rejected": -7.34375,
"logps/chosen": -205.1999969482422,
"logps/rejected": -409.70001220703125,
"loss": 0.0464,
"rewards/accuracies": 0.984375,
"rewards/chosen": -13.615625381469727,
"rewards/margins": 19.90625,
"rewards/rejected": -33.506248474121094,
"step": 310
},
{
"epoch": 2.251346499102334,
"grad_norm": 1.7695757150650024,
"learning_rate": 0.00011362753736311199,
"logits/chosen": -4.758593559265137,
"logits/rejected": -7.090624809265137,
"logps/chosen": -183.60000610351562,
"logps/rejected": -357.79998779296875,
"loss": 0.0192,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -11.206250190734863,
"rewards/margins": 16.837499618530273,
"rewards/rejected": -28.024999618530273,
"step": 315
},
{
"epoch": 2.287253141831239,
"grad_norm": 0.2544287145137787,
"learning_rate": 0.00011362134195509304,
"logits/chosen": -5.989062309265137,
"logits/rejected": -8.729687690734863,
"logps/chosen": -169.6999969482422,
"logps/rejected": -410.1000061035156,
"loss": 0.0094,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -10.1484375,
"rewards/margins": 23.103124618530273,
"rewards/rejected": -33.243751525878906,
"step": 320
},
{
"epoch": 2.3231597845601435,
"grad_norm": 0.007753673940896988,
"learning_rate": 0.00011361500369910252,
"logits/chosen": -7.881249904632568,
"logits/rejected": -10.868749618530273,
"logps/chosen": -216.0,
"logps/rejected": -479.5,
"loss": 0.0083,
"rewards/accuracies": 1.0,
"rewards/chosen": -14.771875381469727,
"rewards/margins": 25.493749618530273,
"rewards/rejected": -40.26250076293945,
"step": 325
},
{
"epoch": 2.3590664272890485,
"grad_norm": 0.980211615562439,
"learning_rate": 0.000113608522611097,
"logits/chosen": -8.223437309265137,
"logits/rejected": -10.987500190734863,
"logps/chosen": -212.89999389648438,
"logps/rejected": -467.20001220703125,
"loss": 0.0133,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -14.378125190734863,
"rewards/margins": 24.59375,
"rewards/rejected": -38.974998474121094,
"step": 330
},
{
"epoch": 2.3949730700179535,
"grad_norm": 0.30649715662002563,
"learning_rate": 0.0001136018987073926,
"logits/chosen": -7.28125,
"logits/rejected": -9.106249809265137,
"logps/chosen": -161.5500030517578,
"logps/rejected": -330.25,
"loss": 0.0511,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -8.954687118530273,
"rewards/margins": 16.278125762939453,
"rewards/rejected": -25.225000381469727,
"step": 335
},
{
"epoch": 2.430879712746858,
"grad_norm": 0.8643183708190918,
"learning_rate": 0.000113595132004665,
"logits/chosen": -8.449999809265137,
"logits/rejected": -9.612500190734863,
"logps/chosen": -192.39999389648438,
"logps/rejected": -365.29998779296875,
"loss": 0.0235,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -12.34375,
"rewards/margins": 16.703125,
"rewards/rejected": -29.043750762939453,
"step": 340
},
{
"epoch": 2.466786355475763,
"grad_norm": 1.0305479764938354,
"learning_rate": 0.00011358822251994936,
"logits/chosen": -7.7890625,
"logits/rejected": -8.912500381469727,
"logps/chosen": -169.97500610351562,
"logps/rejected": -350.1000061035156,
"loss": 0.0523,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -9.7421875,
"rewards/margins": 17.625,
"rewards/rejected": -27.356250762939453,
"step": 345
},
{
"epoch": 2.502692998204668,
"grad_norm": 0.1656515747308731,
"learning_rate": 0.00011358117027064029,
"logits/chosen": -4.857812404632568,
"logits/rejected": -6.118750095367432,
"logps/chosen": -116.25,
"logps/rejected": -289.1499938964844,
"loss": 0.0293,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -4.669531345367432,
"rewards/margins": 16.421875,
"rewards/rejected": -21.103124618530273,
"step": 350
},
{
"epoch": 2.502692998204668,
"eval_logits/chosen": -4.408088207244873,
"eval_logits/rejected": -5.608455657958984,
"eval_logps/chosen": -151.94117736816406,
"eval_logps/rejected": -344.941162109375,
"eval_loss": 0.030790157616138458,
"eval_rewards/accuracies": 0.9852941036224365,
"eval_rewards/chosen": -7.900735378265381,
"eval_rewards/margins": 18.610294342041016,
"eval_rewards/rejected": -26.5,
"eval_runtime": 8.8079,
"eval_samples_per_second": 30.314,
"eval_steps_per_second": 1.93,
"step": 350
},
{
"epoch": 2.5385996409335725,
"grad_norm": 2.206739664077759,
"learning_rate": 0.00011357397527449183,
"logits/chosen": -4.839062690734863,
"logits/rejected": -5.840624809265137,
"logps/chosen": -205.10000610351562,
"logps/rejected": -381.20001220703125,
"loss": 0.0298,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -13.587499618530273,
"rewards/margins": 16.90625,
"rewards/rejected": -30.512500762939453,
"step": 355
},
{
"epoch": 2.5745062836624775,
"grad_norm": 0.4848335385322571,
"learning_rate": 0.00011356663754961738,
"logits/chosen": -5.310937404632568,
"logits/rejected": -6.078125,
"logps/chosen": -284.79998779296875,
"logps/rejected": -437.70001220703125,
"loss": 0.0368,
"rewards/accuracies": 0.984375,
"rewards/chosen": -21.575000762939453,
"rewards/margins": 14.528124809265137,
"rewards/rejected": -36.09375,
"step": 360
},
{
"epoch": 2.6104129263913824,
"grad_norm": 1.6452534198760986,
"learning_rate": 0.00011355915711448965,
"logits/chosen": -5.776562690734863,
"logits/rejected": -6.996874809265137,
"logps/chosen": -325.5,
"logps/rejected": -500.8999938964844,
"loss": 0.0305,
"rewards/accuracies": 0.984375,
"rewards/chosen": -25.756250381469727,
"rewards/margins": 16.803125381469727,
"rewards/rejected": -42.537498474121094,
"step": 365
},
{
"epoch": 2.646319569120287,
"grad_norm": 0.1357572078704834,
"learning_rate": 0.00011355153398794061,
"logits/chosen": -3.745312452316284,
"logits/rejected": -5.971875190734863,
"logps/chosen": -241.9499969482422,
"logps/rejected": -457.1000061035156,
"loss": 0.043,
"rewards/accuracies": 0.984375,
"rewards/chosen": -17.171875,
"rewards/margins": 20.743749618530273,
"rewards/rejected": -37.943748474121094,
"step": 370
},
{
"epoch": 2.682226211849192,
"grad_norm": 0.02111443318426609,
"learning_rate": 0.00011354376818916149,
"logits/chosen": -2.9437499046325684,
"logits/rejected": -5.345312595367432,
"logps/chosen": -217.10000610351562,
"logps/rejected": -448.8999938964844,
"loss": 0.0395,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.665624618530273,
"rewards/margins": 22.515625,
"rewards/rejected": -37.15625,
"step": 375
},
{
"epoch": 2.718132854578097,
"grad_norm": 0.670870304107666,
"learning_rate": 0.00011353585973770268,
"logits/chosen": -2.5054688453674316,
"logits/rejected": -6.078125,
"logps/chosen": -211.9499969482422,
"logps/rejected": -524.2999877929688,
"loss": 0.0439,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -14.5,
"rewards/margins": 30.537500381469727,
"rewards/rejected": -45.037498474121094,
"step": 380
},
{
"epoch": 2.754039497307002,
"grad_norm": 0.44046124815940857,
"learning_rate": 0.00011352780865347368,
"logits/chosen": -3.2164063453674316,
"logits/rejected": -7.603125095367432,
"logps/chosen": -232.9499969482422,
"logps/rejected": -583.7999877929688,
"loss": 0.0776,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -16.34375,
"rewards/margins": 34.71875,
"rewards/rejected": -51.025001525878906,
"step": 385
},
{
"epoch": 2.789946140035907,
"grad_norm": 0.58872389793396,
"learning_rate": 0.0001135196149567431,
"logits/chosen": -5.271874904632568,
"logits/rejected": -8.928125381469727,
"logps/chosen": -300.0,
"logps/rejected": -614.7999877929688,
"loss": 0.0626,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -23.056249618530273,
"rewards/margins": 31.15625,
"rewards/rejected": -54.212501525878906,
"step": 390
},
{
"epoch": 2.8258527827648114,
"grad_norm": 3.51865553855896,
"learning_rate": 0.00011351127866813858,
"logits/chosen": -3.4632811546325684,
"logits/rejected": -7.084374904632568,
"logps/chosen": -203.10000610351562,
"logps/rejected": -554.0999755859375,
"loss": 0.045,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -13.490625381469727,
"rewards/margins": 34.400001525878906,
"rewards/rejected": -47.900001525878906,
"step": 395
},
{
"epoch": 2.8617594254937164,
"grad_norm": 1.450678825378418,
"learning_rate": 0.00011350279980864665,
"logits/chosen": -5.461718559265137,
"logits/rejected": -8.4375,
"logps/chosen": -304.70001220703125,
"logps/rejected": -655.7999877929688,
"loss": 0.0565,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -23.524999618530273,
"rewards/margins": 35.0,
"rewards/rejected": -58.5625,
"step": 400
},
{
"epoch": 2.8617594254937164,
"eval_logits/chosen": -5.751838207244873,
"eval_logits/rejected": -9.095588684082031,
"eval_logps/chosen": -330.8235168457031,
"eval_logps/rejected": -722.11767578125,
"eval_loss": 0.026857540011405945,
"eval_rewards/accuracies": 0.9816176295280457,
"eval_rewards/chosen": -26.066177368164062,
"eval_rewards/margins": 38.764705657958984,
"eval_rewards/rejected": -64.8382339477539,
"eval_runtime": 8.5165,
"eval_samples_per_second": 31.351,
"eval_steps_per_second": 1.996,
"step": 400
},
{
"epoch": 2.8976660682226214,
"grad_norm": 0.8611961603164673,
"learning_rate": 0.00011349417839961291,
"logits/chosen": -3.793750047683716,
"logits/rejected": -7.610937595367432,
"logps/chosen": -244.9499969482422,
"logps/rejected": -608.2999877929688,
"loss": 0.0411,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -17.553125381469727,
"rewards/margins": 35.88750076293945,
"rewards/rejected": -53.4375,
"step": 405
},
{
"epoch": 2.933572710951526,
"grad_norm": 0.9868994951248169,
"learning_rate": 0.0001134854144627417,
"logits/chosen": -2.541015625,
"logits/rejected": -5.84375,
"logps/chosen": -179.35000610351562,
"logps/rejected": -500.79998779296875,
"loss": 0.0966,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -11.278124809265137,
"rewards/margins": 31.325000762939453,
"rewards/rejected": -42.625,
"step": 410
},
{
"epoch": 2.969479353680431,
"grad_norm": 0.8936044573783875,
"learning_rate": 0.00011347650802009624,
"logits/chosen": -3.5687499046325684,
"logits/rejected": -5.125,
"logps/chosen": -250.89999389648438,
"logps/rejected": -453.70001220703125,
"loss": 0.0359,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -17.871875762939453,
"rewards/margins": 19.834375381469727,
"rewards/rejected": -37.73125076293945,
"step": 415
},
{
"epoch": 3.0,
"grad_norm": 1.828643560409546,
"learning_rate": 0.00011346745909409847,
"logits/chosen": -4.330882549285889,
"logits/rejected": -5.753676414489746,
"logps/chosen": -265.941162109375,
"logps/rejected": -454.1176452636719,
"loss": 0.0519,
"rewards/accuracies": 0.9742646813392639,
"rewards/chosen": -19.676469802856445,
"rewards/margins": 18.297794342041016,
"rewards/rejected": -37.992645263671875,
"step": 420
},
{
"epoch": 3.035906642728905,
"grad_norm": 0.023664651438593864,
"learning_rate": 0.00011345826770752906,
"logits/chosen": -3.649218797683716,
"logits/rejected": -5.8515625,
"logps/chosen": -209.39999389648438,
"logps/rejected": -417.3999938964844,
"loss": 0.0118,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -14.103124618530273,
"rewards/margins": 20.243749618530273,
"rewards/rejected": -34.38750076293945,
"step": 425
},
{
"epoch": 3.0718132854578095,
"grad_norm": 0.01370246708393097,
"learning_rate": 0.00011344893388352732,
"logits/chosen": -2.9320311546325684,
"logits/rejected": -5.650000095367432,
"logps/chosen": -178.85000610351562,
"logps/rejected": -392.1000061035156,
"loss": 0.0472,
"rewards/accuracies": 0.984375,
"rewards/chosen": -10.657812118530273,
"rewards/margins": 20.962499618530273,
"rewards/rejected": -31.631250381469727,
"step": 430
},
{
"epoch": 3.1077199281867145,
"grad_norm": 0.00010757453128462657,
"learning_rate": 0.00011343945764559112,
"logits/chosen": -2.715625047683716,
"logits/rejected": -5.421875,
"logps/chosen": -158.60000610351562,
"logps/rejected": -340.20001220703125,
"loss": 0.0307,
"rewards/accuracies": 0.984375,
"rewards/chosen": -8.817187309265137,
"rewards/margins": 17.318750381469727,
"rewards/rejected": -26.162500381469727,
"step": 435
},
{
"epoch": 3.1436265709156195,
"grad_norm": 0.32624539732933044,
"learning_rate": 0.0001134298390175769,
"logits/chosen": -3.8375000953674316,
"logits/rejected": -5.959374904632568,
"logps/chosen": -176.8000030517578,
"logps/rejected": -336.8999938964844,
"loss": 0.0305,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -10.962499618530273,
"rewards/margins": 15.490625381469727,
"rewards/rejected": -26.450000762939453,
"step": 440
},
{
"epoch": 3.1795332136445245,
"grad_norm": 0.16668373346328735,
"learning_rate": 0.00011342007802369956,
"logits/chosen": -4.637499809265137,
"logits/rejected": -6.957812309265137,
"logps/chosen": -203.14999389648438,
"logps/rejected": -389.79998779296875,
"loss": 0.0325,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -13.321874618530273,
"rewards/margins": 17.899999618530273,
"rewards/rejected": -31.21875,
"step": 445
},
{
"epoch": 3.215439856373429,
"grad_norm": 0.008436158299446106,
"learning_rate": 0.00011341017468853234,
"logits/chosen": -5.528124809265137,
"logits/rejected": -8.003125190734863,
"logps/chosen": -208.4499969482422,
"logps/rejected": -445.1000061035156,
"loss": 0.0125,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -13.625,
"rewards/margins": 23.018749237060547,
"rewards/rejected": -36.64374923706055,
"step": 450
},
{
"epoch": 3.215439856373429,
"eval_logits/chosen": -5.895220756530762,
"eval_logits/rejected": -8.316176414489746,
"eval_logps/chosen": -224.41175842285156,
"eval_logps/rejected": -471.76470947265625,
"eval_loss": 0.02364749275147915,
"eval_rewards/accuracies": 0.9889705777168274,
"eval_rewards/chosen": -15.242647171020508,
"eval_rewards/margins": 24.080883026123047,
"eval_rewards/rejected": -39.35293960571289,
"eval_runtime": 8.4254,
"eval_samples_per_second": 31.69,
"eval_steps_per_second": 2.018,
"step": 450
},
{
"epoch": 3.251346499102334,
"grad_norm": 0.6932575702667236,
"learning_rate": 0.00011340012903700693,
"logits/chosen": -6.3359375,
"logits/rejected": -8.709375381469727,
"logps/chosen": -242.10000610351562,
"logps/rejected": -478.70001220703125,
"loss": 0.0263,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -17.1875,
"rewards/margins": 23.299999237060547,
"rewards/rejected": -40.5,
"step": 455
},
{
"epoch": 3.287253141831239,
"grad_norm": 1.7907814979553223,
"learning_rate": 0.0001133899410944132,
"logits/chosen": -7.318749904632568,
"logits/rejected": -9.934374809265137,
"logps/chosen": -268.8999938964844,
"logps/rejected": -554.9000244140625,
"loss": 0.0484,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -20.271875381469727,
"rewards/margins": 27.850000381469727,
"rewards/rejected": -48.150001525878906,
"step": 460
},
{
"epoch": 3.3231597845601435,
"grad_norm": 1.6142306327819824,
"learning_rate": 0.0001133796108863993,
"logits/chosen": -5.221093654632568,
"logits/rejected": -7.642187595367432,
"logps/chosen": -200.6999969482422,
"logps/rejected": -408.20001220703125,
"loss": 0.0293,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -12.826562881469727,
"rewards/margins": 20.359375,
"rewards/rejected": -33.20000076293945,
"step": 465
},
{
"epoch": 3.3590664272890485,
"grad_norm": 1.1235191822052002,
"learning_rate": 0.00011336913843897153,
"logits/chosen": -2.270312547683716,
"logits/rejected": -4.194531440734863,
"logps/chosen": -138.75,
"logps/rejected": -287.20001220703125,
"loss": 0.0528,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -6.962500095367432,
"rewards/margins": 13.737500190734863,
"rewards/rejected": -20.700000762939453,
"step": 470
},
{
"epoch": 3.3949730700179535,
"grad_norm": 0.024500994011759758,
"learning_rate": 0.00011335852377849424,
"logits/chosen": -1.961328148841858,
"logits/rejected": -3.6734375953674316,
"logps/chosen": -136.3249969482422,
"logps/rejected": -296.3999938964844,
"loss": 0.0834,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.860937595367432,
"rewards/margins": 15.140625,
"rewards/rejected": -22.0,
"step": 475
},
{
"epoch": 3.430879712746858,
"grad_norm": 0.7605228424072266,
"learning_rate": 0.00011334776693168985,
"logits/chosen": -3.0999999046325684,
"logits/rejected": -4.667187690734863,
"logps/chosen": -149.60000610351562,
"logps/rejected": -347.70001220703125,
"loss": 0.0254,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -8.057812690734863,
"rewards/margins": 18.965625762939453,
"rewards/rejected": -27.03125,
"step": 480
},
{
"epoch": 3.466786355475763,
"grad_norm": 3.233030319213867,
"learning_rate": 0.0001133368679256387,
"logits/chosen": -4.842187404632568,
"logits/rejected": -6.603125095367432,
"logps/chosen": -202.375,
"logps/rejected": -458.3999938964844,
"loss": 0.0157,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -13.3359375,
"rewards/margins": 24.912500381469727,
"rewards/rejected": -38.1875,
"step": 485
},
{
"epoch": 3.502692998204668,
"grad_norm": 0.12913425266742706,
"learning_rate": 0.000113325826787779,
"logits/chosen": -6.7109375,
"logits/rejected": -8.201562881469727,
"logps/chosen": -253.0500030517578,
"logps/rejected": -495.5,
"loss": 0.0287,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -18.384374618530273,
"rewards/margins": 23.737499237060547,
"rewards/rejected": -42.11249923706055,
"step": 490
},
{
"epoch": 3.5385996409335725,
"grad_norm": 0.36198368668556213,
"learning_rate": 0.00011331464354590684,
"logits/chosen": -5.528124809265137,
"logits/rejected": -7.0,
"logps/chosen": -176.5500030517578,
"logps/rejected": -370.3999938964844,
"loss": 0.0606,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -10.745312690734863,
"rewards/margins": 18.90625,
"rewards/rejected": -29.637500762939453,
"step": 495
},
{
"epoch": 3.5745062836624775,
"grad_norm": 1.8532159328460693,
"learning_rate": 0.000113303318228176,
"logits/chosen": -4.196875095367432,
"logits/rejected": -5.509375095367432,
"logps/chosen": -167.60000610351562,
"logps/rejected": -333.8999938964844,
"loss": 0.0652,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -9.654687881469727,
"rewards/margins": 16.065624237060547,
"rewards/rejected": -25.712499618530273,
"step": 500
},
{
"epoch": 3.5745062836624775,
"eval_logits/chosen": -4.733455657958984,
"eval_logits/rejected": -6.211397171020508,
"eval_logps/chosen": -219.64706420898438,
"eval_logps/rejected": -395.8823547363281,
"eval_loss": 0.014533035457134247,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -14.753676414489746,
"eval_rewards/margins": 16.90441131591797,
"eval_rewards/rejected": -31.676469802856445,
"eval_runtime": 8.5526,
"eval_samples_per_second": 31.219,
"eval_steps_per_second": 1.988,
"step": 500
},
{
"epoch": 3.6104129263913824,
"grad_norm": 0.02227591536939144,
"learning_rate": 0.00011329185086309797,
"logits/chosen": -5.440625190734863,
"logits/rejected": -6.892187595367432,
"logps/chosen": -263.04998779296875,
"logps/rejected": -438.5,
"loss": 0.0284,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -19.5625,
"rewards/margins": 16.524999618530273,
"rewards/rejected": -36.099998474121094,
"step": 505
},
{
"epoch": 3.646319569120287,
"grad_norm": 0.263701468706131,
"learning_rate": 0.00011328024147954178,
"logits/chosen": -6.275000095367432,
"logits/rejected": -7.809374809265137,
"logps/chosen": -318.5,
"logps/rejected": -494.5,
"loss": 0.0307,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -25.049999237060547,
"rewards/margins": 17.087499618530273,
"rewards/rejected": -42.13750076293945,
"step": 510
},
{
"epoch": 3.682226211849192,
"grad_norm": 0.08680078387260437,
"learning_rate": 0.00011326849010673409,
"logits/chosen": -7.175000190734863,
"logits/rejected": -8.606249809265137,
"logps/chosen": -344.70001220703125,
"logps/rejected": -576.4000244140625,
"loss": 0.0187,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -27.662500381469727,
"rewards/margins": 22.403125762939453,
"rewards/rejected": -50.0625,
"step": 515
},
{
"epoch": 3.718132854578097,
"grad_norm": 0.3637928068637848,
"learning_rate": 0.00011325659677425894,
"logits/chosen": -6.360937595367432,
"logits/rejected": -8.0078125,
"logps/chosen": -321.70001220703125,
"logps/rejected": -571.2999877929688,
"loss": 0.0563,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -25.375,
"rewards/margins": 24.231250762939453,
"rewards/rejected": -49.5625,
"step": 520
},
{
"epoch": 3.754039497307002,
"grad_norm": 0.06884779036045074,
"learning_rate": 0.00011324456151205779,
"logits/chosen": -5.998437404632568,
"logits/rejected": -7.151562690734863,
"logps/chosen": -310.5,
"logps/rejected": -512.7999877929688,
"loss": 0.0412,
"rewards/accuracies": 0.984375,
"rewards/chosen": -24.200000762939453,
"rewards/margins": 19.4375,
"rewards/rejected": -43.63750076293945,
"step": 525
},
{
"epoch": 3.789946140035907,
"grad_norm": 0.19923055171966553,
"learning_rate": 0.0001132323843504294,
"logits/chosen": -6.621874809265137,
"logits/rejected": -7.451562404632568,
"logps/chosen": -316.8999938964844,
"logps/rejected": -483.79998779296875,
"loss": 0.0418,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -24.6875,
"rewards/margins": 16.034374237060547,
"rewards/rejected": -40.724998474121094,
"step": 530
},
{
"epoch": 3.8258527827648114,
"grad_norm": 0.025405047461390495,
"learning_rate": 0.00011322006532002976,
"logits/chosen": -7.5390625,
"logits/rejected": -8.484375,
"logps/chosen": -325.1000061035156,
"logps/rejected": -495.0,
"loss": 0.0575,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -25.862499237060547,
"rewards/margins": 16.359375,
"rewards/rejected": -42.224998474121094,
"step": 535
},
{
"epoch": 3.8617594254937164,
"grad_norm": 0.7414963841438293,
"learning_rate": 0.00011320760445187202,
"logits/chosen": -8.795312881469727,
"logits/rejected": -10.037500381469727,
"logps/chosen": -355.8999938964844,
"logps/rejected": -577.0,
"loss": 0.0196,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -28.868749618530273,
"rewards/margins": 21.512500762939453,
"rewards/rejected": -50.38750076293945,
"step": 540
},
{
"epoch": 3.8976660682226214,
"grad_norm": 2.0291175842285156,
"learning_rate": 0.00011319500177732639,
"logits/chosen": -8.653124809265137,
"logits/rejected": -9.693750381469727,
"logps/chosen": -344.0,
"logps/rejected": -551.2999877929688,
"loss": 0.0377,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -27.618749618530273,
"rewards/margins": 19.924999237060547,
"rewards/rejected": -47.537498474121094,
"step": 545
},
{
"epoch": 3.933572710951526,
"grad_norm": 0.06643925607204437,
"learning_rate": 0.00011318225732812008,
"logits/chosen": -6.587500095367432,
"logits/rejected": -7.754687309265137,
"logps/chosen": -280.04998779296875,
"logps/rejected": -456.8999938964844,
"loss": 0.033,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -21.100000381469727,
"rewards/margins": 17.134374618530273,
"rewards/rejected": -38.25,
"step": 550
},
{
"epoch": 3.933572710951526,
"eval_logits/chosen": -5.768382549285889,
"eval_logits/rejected": -6.928308963775635,
"eval_logps/chosen": -264.4117736816406,
"eval_logps/rejected": -437.76470947265625,
"eval_loss": 0.04973261430859566,
"eval_rewards/accuracies": 0.9852941036224365,
"eval_rewards/chosen": -19.34558868408203,
"eval_rewards/margins": 16.566177368164062,
"eval_rewards/rejected": -35.89706039428711,
"eval_runtime": 8.691,
"eval_samples_per_second": 30.721,
"eval_steps_per_second": 1.956,
"step": 550
},
{
"epoch": 3.969479353680431,
"grad_norm": 0.709564208984375,
"learning_rate": 0.00011316937113633724,
"logits/chosen": -5.145312309265137,
"logits/rejected": -6.15625,
"logps/chosen": -254.3000030517578,
"logps/rejected": -400.3999938964844,
"loss": 0.0689,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -18.596874237060547,
"rewards/margins": 13.71875,
"rewards/rejected": -32.318748474121094,
"step": 555
},
{
"epoch": 4.0,
"grad_norm": 0.014319206587970257,
"learning_rate": 0.00011315634323441883,
"logits/chosen": -4.669117450714111,
"logits/rejected": -5.674632549285889,
"logps/chosen": -250.88235473632812,
"logps/rejected": -388.23529052734375,
"loss": 0.0647,
"rewards/accuracies": 0.9632353186607361,
"rewards/chosen": -18.419116973876953,
"rewards/margins": 12.941176414489746,
"rewards/rejected": -31.33823585510254,
"step": 560
},
{
"epoch": 4.0359066427289045,
"grad_norm": 1.3408515453338623,
"learning_rate": 0.0001131431736551626,
"logits/chosen": -4.814062595367432,
"logits/rejected": -6.176562309265137,
"logps/chosen": -249.4499969482422,
"logps/rejected": -412.79998779296875,
"loss": 0.0462,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -18.274999618530273,
"rewards/margins": 15.787500381469727,
"rewards/rejected": -34.068748474121094,
"step": 565
},
{
"epoch": 4.07181328545781,
"grad_norm": 0.39796170592308044,
"learning_rate": 0.00011312986243172293,
"logits/chosen": -4.244531154632568,
"logits/rejected": -6.751562595367432,
"logps/chosen": -225.6999969482422,
"logps/rejected": -486.6000061035156,
"loss": 0.013,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -15.662500381469727,
"rewards/margins": 25.412500381469727,
"rewards/rejected": -41.087501525878906,
"step": 570
},
{
"epoch": 4.1077199281867145,
"grad_norm": 2.608715772628784,
"learning_rate": 0.00011311640959761081,
"logits/chosen": -3.823437452316284,
"logits/rejected": -7.4375,
"logps/chosen": -240.14999389648438,
"logps/rejected": -562.5999755859375,
"loss": 0.053,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -17.225000381469727,
"rewards/margins": 31.493749618530273,
"rewards/rejected": -48.75,
"step": 575
},
{
"epoch": 4.143626570915619,
"grad_norm": 2.569322347640991,
"learning_rate": 0.00011310281518669376,
"logits/chosen": -5.467187404632568,
"logits/rejected": -8.5078125,
"logps/chosen": -370.6000061035156,
"logps/rejected": -627.0,
"loss": 0.0235,
"rewards/accuracies": 0.984375,
"rewards/chosen": -30.15625,
"rewards/margins": 25.225000381469727,
"rewards/rejected": -55.38750076293945,
"step": 580
},
{
"epoch": 4.1795332136445245,
"grad_norm": 7.271477699279785,
"learning_rate": 0.00011308907923319566,
"logits/chosen": -4.629687309265137,
"logits/rejected": -7.557812690734863,
"logps/chosen": -309.3500061035156,
"logps/rejected": -587.4000244140625,
"loss": 0.0519,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -24.274999618530273,
"rewards/margins": 27.075000762939453,
"rewards/rejected": -51.337501525878906,
"step": 585
},
{
"epoch": 4.215439856373429,
"grad_norm": 4.082700729370117,
"learning_rate": 0.00011307520177169676,
"logits/chosen": -4.9609375,
"logits/rejected": -7.151562690734863,
"logps/chosen": -320.79998779296875,
"logps/rejected": -574.0,
"loss": 0.0997,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -25.225000381469727,
"rewards/margins": 25.018749237060547,
"rewards/rejected": -50.23749923706055,
"step": 590
},
{
"epoch": 4.2513464991023335,
"grad_norm": 1.2855074405670166,
"learning_rate": 0.00011306118283713357,
"logits/chosen": -3.370312452316284,
"logits/rejected": -4.561718940734863,
"logps/chosen": -268.54998779296875,
"logps/rejected": -507.8999938964844,
"loss": 0.0289,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -20.206249237060547,
"rewards/margins": 23.3125,
"rewards/rejected": -43.537498474121094,
"step": 595
},
{
"epoch": 4.287253141831239,
"grad_norm": 0.21659362316131592,
"learning_rate": 0.00011304702246479876,
"logits/chosen": -3.285937547683716,
"logits/rejected": -4.146093845367432,
"logps/chosen": -270.3500061035156,
"logps/rejected": -488.20001220703125,
"loss": 0.0174,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -19.931249618530273,
"rewards/margins": 21.193750381469727,
"rewards/rejected": -41.125,
"step": 600
},
{
"epoch": 4.287253141831239,
"eval_logits/chosen": -3.882352828979492,
"eval_logits/rejected": -5.0,
"eval_logps/chosen": -272.76470947265625,
"eval_logps/rejected": -492.8235168457031,
"eval_loss": 0.014545433223247528,
"eval_rewards/accuracies": 0.9926470518112183,
"eval_rewards/chosen": -20.176469802856445,
"eval_rewards/margins": 21.330883026123047,
"eval_rewards/rejected": -41.52941131591797,
"eval_runtime": 8.5511,
"eval_samples_per_second": 31.224,
"eval_steps_per_second": 1.988,
"step": 600
},
{
"epoch": 4.3231597845601435,
"grad_norm": 0.36118730902671814,
"learning_rate": 0.00011303272069034098,
"logits/chosen": -4.321875095367432,
"logits/rejected": -5.332812309265137,
"logps/chosen": -282.45001220703125,
"logps/rejected": -488.0,
"loss": 0.0449,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -21.653125762939453,
"rewards/margins": 19.787500381469727,
"rewards/rejected": -41.45000076293945,
"step": 605
},
{
"epoch": 4.359066427289049,
"grad_norm": 0.5386459231376648,
"learning_rate": 0.00011301827754976498,
"logits/chosen": -5.6015625,
"logits/rejected": -6.984375,
"logps/chosen": -302.29998779296875,
"logps/rejected": -557.7000122070312,
"loss": 0.0153,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -23.518749237060547,
"rewards/margins": 24.784374237060547,
"rewards/rejected": -48.29999923706055,
"step": 610
},
{
"epoch": 4.3949730700179535,
"grad_norm": 4.88869571685791,
"learning_rate": 0.00011300369307943137,
"logits/chosen": -6.215624809265137,
"logits/rejected": -7.949999809265137,
"logps/chosen": -270.1000061035156,
"logps/rejected": -572.2999877929688,
"loss": 0.0224,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -19.809375762939453,
"rewards/margins": 29.512500762939453,
"rewards/rejected": -49.3125,
"step": 615
},
{
"epoch": 4.430879712746858,
"grad_norm": 2.2598838806152344,
"learning_rate": 0.00011298896731605649,
"logits/chosen": -3.9453125,
"logits/rejected": -5.546875,
"logps/chosen": -203.35000610351562,
"logps/rejected": -407.79998779296875,
"loss": 0.0373,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -13.240625381469727,
"rewards/margins": 20.003124237060547,
"rewards/rejected": -33.26250076293945,
"step": 620
},
{
"epoch": 4.466786355475763,
"grad_norm": 0.47177407145500183,
"learning_rate": 0.00011297410029671247,
"logits/chosen": -3.74609375,
"logits/rejected": -5.087500095367432,
"logps/chosen": -281.70001220703125,
"logps/rejected": -448.79998779296875,
"loss": 0.04,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -21.34375,
"rewards/margins": 16.043750762939453,
"rewards/rejected": -37.42499923706055,
"step": 625
},
{
"epoch": 4.502692998204668,
"grad_norm": 0.2046031653881073,
"learning_rate": 0.00011295909205882698,
"logits/chosen": -1.002783179283142,
"logits/rejected": -3.4195313453674316,
"logps/chosen": -189.8249969482422,
"logps/rejected": -390.1000061035156,
"loss": 0.0272,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -11.790624618530273,
"rewards/margins": 19.309375762939453,
"rewards/rejected": -31.09375,
"step": 630
},
{
"epoch": 4.5385996409335725,
"grad_norm": 0.29450154304504395,
"learning_rate": 0.00011294394264018326,
"logits/chosen": -2.082812547683716,
"logits/rejected": -4.514062404632568,
"logps/chosen": -224.8000030517578,
"logps/rejected": -433.79998779296875,
"loss": 0.0158,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -15.709375381469727,
"rewards/margins": 19.975000381469727,
"rewards/rejected": -35.70000076293945,
"step": 635
},
{
"epoch": 4.574506283662478,
"grad_norm": 0.15016689896583557,
"learning_rate": 0.00011292865207891994,
"logits/chosen": -3.026562452316284,
"logits/rejected": -5.426562309265137,
"logps/chosen": -247.1999969482422,
"logps/rejected": -477.1000061035156,
"loss": 0.0044,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -17.771875381469727,
"rewards/margins": 22.262500762939453,
"rewards/rejected": -40.025001525878906,
"step": 640
},
{
"epoch": 4.6104129263913824,
"grad_norm": 3.1011385917663574,
"learning_rate": 0.00011291322041353101,
"logits/chosen": -3.1812500953674316,
"logits/rejected": -5.546875,
"logps/chosen": -231.9499969482422,
"logps/rejected": -464.5,
"loss": 0.0109,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -16.149999618530273,
"rewards/margins": 22.75,
"rewards/rejected": -38.912498474121094,
"step": 645
},
{
"epoch": 4.646319569120287,
"grad_norm": 0.06376684457063675,
"learning_rate": 0.00011289764768286565,
"logits/chosen": -4.444531440734863,
"logits/rejected": -6.487500190734863,
"logps/chosen": -285.45001220703125,
"logps/rejected": -519.7999877929688,
"loss": 0.0149,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -21.674999237060547,
"rewards/margins": 22.8125,
"rewards/rejected": -44.525001525878906,
"step": 650
},
{
"epoch": 4.646319569120287,
"eval_logits/chosen": -4.110294342041016,
"eval_logits/rejected": -6.588235378265381,
"eval_logps/chosen": -257.6470642089844,
"eval_logps/rejected": -516.5882568359375,
"eval_loss": 0.029071472585201263,
"eval_rewards/accuracies": 0.9889705777168274,
"eval_rewards/chosen": -18.647058486938477,
"eval_rewards/margins": 25.28676414489746,
"eval_rewards/rejected": -43.94117736816406,
"eval_runtime": 8.8159,
"eval_samples_per_second": 30.286,
"eval_steps_per_second": 1.928,
"step": 650
},
{
"epoch": 4.682226211849192,
"grad_norm": 3.26318621635437,
"learning_rate": 0.00011288193392612822,
"logits/chosen": -3.928906202316284,
"logits/rejected": -6.315625190734863,
"logps/chosen": -248.64999389648438,
"logps/rejected": -503.79998779296875,
"loss": 0.056,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -18.081249237060547,
"rewards/margins": 24.456249237060547,
"rewards/rejected": -42.537498474121094,
"step": 655
},
{
"epoch": 4.718132854578097,
"grad_norm": 0.18917639553546906,
"learning_rate": 0.00011286607918287803,
"logits/chosen": -2.7222657203674316,
"logits/rejected": -5.206250190734863,
"logps/chosen": -258.1499938964844,
"logps/rejected": -481.79998779296875,
"loss": 0.0583,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -19.225000381469727,
"rewards/margins": 21.606250762939453,
"rewards/rejected": -40.849998474121094,
"step": 660
},
{
"epoch": 4.7540394973070015,
"grad_norm": 0.9138699173927307,
"learning_rate": 0.00011285008349302943,
"logits/chosen": -2.116406202316284,
"logits/rejected": -4.259375095367432,
"logps/chosen": -294.20001220703125,
"logps/rejected": -505.20001220703125,
"loss": 0.0192,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -23.012500762939453,
"rewards/margins": 20.296875,
"rewards/rejected": -43.337501525878906,
"step": 665
},
{
"epoch": 4.789946140035907,
"grad_norm": 0.5865360498428345,
"learning_rate": 0.00011283394689685153,
"logits/chosen": -3.46875,
"logits/rejected": -5.800000190734863,
"logps/chosen": -292.0,
"logps/rejected": -535.2999877929688,
"loss": 0.0238,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -22.268749237060547,
"rewards/margins": 23.831249237060547,
"rewards/rejected": -46.04999923706055,
"step": 670
},
{
"epoch": 4.825852782764811,
"grad_norm": 1.3726475238800049,
"learning_rate": 0.0001128176694349682,
"logits/chosen": -3.3148436546325684,
"logits/rejected": -5.324999809265137,
"logps/chosen": -286.29998779296875,
"logps/rejected": -542.9000244140625,
"loss": 0.0127,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -22.024999618530273,
"rewards/margins": 24.868749618530273,
"rewards/rejected": -46.912498474121094,
"step": 675
},
{
"epoch": 4.861759425493716,
"grad_norm": 10.441712379455566,
"learning_rate": 0.00011280125114835791,
"logits/chosen": -2.067578077316284,
"logits/rejected": -4.528124809265137,
"logps/chosen": -216.75,
"logps/rejected": -488.70001220703125,
"loss": 0.0218,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -15.034375190734863,
"rewards/margins": 26.6875,
"rewards/rejected": -41.712501525878906,
"step": 680
},
{
"epoch": 4.897666068222621,
"grad_norm": 0.9471856355667114,
"learning_rate": 0.00011278469207835369,
"logits/chosen": -0.960888683795929,
"logits/rejected": -3.39453125,
"logps/chosen": -179.25,
"logps/rejected": -433.1000061035156,
"loss": 0.0337,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -11.198437690734863,
"rewards/margins": 24.59375,
"rewards/rejected": -35.787498474121094,
"step": 685
},
{
"epoch": 4.933572710951526,
"grad_norm": 4.058782577514648,
"learning_rate": 0.000112767992266643,
"logits/chosen": -2.7562499046325684,
"logits/rejected": -4.982031345367432,
"logps/chosen": -250.60000610351562,
"logps/rejected": -498.20001220703125,
"loss": 0.0548,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -18.268749237060547,
"rewards/margins": 23.912500381469727,
"rewards/rejected": -42.1875,
"step": 690
},
{
"epoch": 4.9694793536804305,
"grad_norm": 5.3591084480285645,
"learning_rate": 0.00011275115175526756,
"logits/chosen": -3.2984375953674316,
"logits/rejected": -5.337500095367432,
"logps/chosen": -273.29998779296875,
"logps/rejected": -526.4000244140625,
"loss": 0.0574,
"rewards/accuracies": 0.984375,
"rewards/chosen": -20.40625,
"rewards/margins": 24.493749618530273,
"rewards/rejected": -44.900001525878906,
"step": 695
},
{
"epoch": 5.0,
"grad_norm": 3.2741596698760986,
"learning_rate": 0.00011273417058662334,
"logits/chosen": -1.62890625,
"logits/rejected": -3.163602828979492,
"logps/chosen": -178.4705810546875,
"logps/rejected": -387.76470947265625,
"loss": 0.0274,
"rewards/accuracies": 0.9852941036224365,
"rewards/chosen": -10.939338684082031,
"rewards/margins": 20.0,
"rewards/rejected": -30.941177368164062,
"step": 700
},
{
"epoch": 5.0,
"eval_logits/chosen": -1.3389246463775635,
"eval_logits/rejected": -2.8189337253570557,
"eval_logps/chosen": -160.88235473632812,
"eval_logps/rejected": -377.29412841796875,
"eval_loss": 0.009149392135441303,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -8.79411792755127,
"eval_rewards/margins": 20.941177368164062,
"eval_rewards/rejected": -29.757352828979492,
"eval_runtime": 8.8399,
"eval_samples_per_second": 30.204,
"eval_steps_per_second": 1.923,
"step": 700
},
{
"epoch": 5.0359066427289045,
"grad_norm": 0.25538453459739685,
"learning_rate": 0.00011271704880346044,
"logits/chosen": -1.1785156726837158,
"logits/rejected": -2.3316407203674316,
"logps/chosen": -155.8000030517578,
"logps/rejected": -362.29998779296875,
"loss": 0.0164,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -8.415624618530273,
"rewards/margins": 20.053125381469727,
"rewards/rejected": -28.475000381469727,
"step": 705
},
{
"epoch": 5.07181328545781,
"grad_norm": 0.008221164345741272,
"learning_rate": 0.0001126997864488829,
"logits/chosen": -1.918554663658142,
"logits/rejected": -2.684375047683716,
"logps/chosen": -154.64999389648438,
"logps/rejected": -376.79998779296875,
"loss": 0.0105,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -8.796875,
"rewards/margins": 21.412500381469727,
"rewards/rejected": -30.21875,
"step": 710
},
{
"epoch": 5.1077199281867145,
"grad_norm": 0.4989652931690216,
"learning_rate": 0.0001126823835663487,
"logits/chosen": -3.301562547683716,
"logits/rejected": -4.116406440734863,
"logps/chosen": -197.75,
"logps/rejected": -458.0,
"loss": 0.0212,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -12.659375190734863,
"rewards/margins": 25.631250381469727,
"rewards/rejected": -38.287498474121094,
"step": 715
},
{
"epoch": 5.143626570915619,
"grad_norm": 0.927310585975647,
"learning_rate": 0.00011266484019966958,
"logits/chosen": -4.263281345367432,
"logits/rejected": -4.912499904632568,
"logps/chosen": -235.35000610351562,
"logps/rejected": -507.5,
"loss": 0.0091,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -16.387500762939453,
"rewards/margins": 26.818750381469727,
"rewards/rejected": -43.224998474121094,
"step": 720
},
{
"epoch": 5.1795332136445245,
"grad_norm": 0.03083197772502899,
"learning_rate": 0.00011264715639301091,
"logits/chosen": -4.6484375,
"logits/rejected": -5.534375190734863,
"logps/chosen": -255.85000610351562,
"logps/rejected": -538.7000122070312,
"loss": 0.0437,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -18.915624618530273,
"rewards/margins": 27.412500381469727,
"rewards/rejected": -46.349998474121094,
"step": 725
},
{
"epoch": 5.215439856373429,
"grad_norm": 0.13269655406475067,
"learning_rate": 0.00011262933219089168,
"logits/chosen": -4.479687690734863,
"logits/rejected": -5.400000095367432,
"logps/chosen": -285.70001220703125,
"logps/rejected": -533.5999755859375,
"loss": 0.0194,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -21.600000381469727,
"rewards/margins": 24.21875,
"rewards/rejected": -45.837501525878906,
"step": 730
},
{
"epoch": 5.2513464991023335,
"grad_norm": 3.4042835235595703,
"learning_rate": 0.0001126113676381843,
"logits/chosen": -3.9671874046325684,
"logits/rejected": -5.073437690734863,
"logps/chosen": -251.8000030517578,
"logps/rejected": -491.6000061035156,
"loss": 0.0386,
"rewards/accuracies": 0.984375,
"rewards/chosen": -18.399999618530273,
"rewards/margins": 23.25,
"rewards/rejected": -41.650001525878906,
"step": 735
},
{
"epoch": 5.287253141831239,
"grad_norm": 0.5153644680976868,
"learning_rate": 0.00011259326278011449,
"logits/chosen": -4.012499809265137,
"logits/rejected": -5.053124904632568,
"logps/chosen": -254.85000610351562,
"logps/rejected": -454.6000061035156,
"loss": 0.0352,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -18.446874618530273,
"rewards/margins": 19.393749237060547,
"rewards/rejected": -37.849998474121094,
"step": 740
},
{
"epoch": 5.3231597845601435,
"grad_norm": 0.4011136591434479,
"learning_rate": 0.00011257501766226122,
"logits/chosen": -4.405468940734863,
"logits/rejected": -5.582812309265137,
"logps/chosen": -254.9499969482422,
"logps/rejected": -468.0,
"loss": 0.023,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -18.615625381469727,
"rewards/margins": 20.684375762939453,
"rewards/rejected": -39.25,
"step": 745
},
{
"epoch": 5.359066427289049,
"grad_norm": 0.0850766971707344,
"learning_rate": 0.00011255663233055655,
"logits/chosen": -3.621875047683716,
"logits/rejected": -5.439062595367432,
"logps/chosen": -175.75,
"logps/rejected": -402.1000061035156,
"loss": 0.0341,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -10.673437118530273,
"rewards/margins": 22.149999618530273,
"rewards/rejected": -32.837501525878906,
"step": 750
},
{
"epoch": 5.359066427289049,
"eval_logits/chosen": -2.5533087253570557,
"eval_logits/rejected": -4.464154243469238,
"eval_logps/chosen": -134.6764678955078,
"eval_logps/rejected": -359.8823547363281,
"eval_loss": 0.008504279889166355,
"eval_rewards/accuracies": 0.9963235259056091,
"eval_rewards/chosen": -6.143382549285889,
"eval_rewards/margins": 21.83823585510254,
"eval_rewards/rejected": -27.977941513061523,
"eval_runtime": 8.5224,
"eval_samples_per_second": 31.329,
"eval_steps_per_second": 1.995,
"step": 750
},
{
"epoch": 5.3949730700179535,
"grad_norm": 0.8295687437057495,
"learning_rate": 0.00011253810683128554,
"logits/chosen": -3.987499952316284,
"logits/rejected": -6.296875,
"logps/chosen": -207.02499389648438,
"logps/rejected": -479.1000061035156,
"loss": 0.0617,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -13.651562690734863,
"rewards/margins": 26.793750762939453,
"rewards/rejected": -40.443748474121094,
"step": 755
},
{
"epoch": 5.430879712746858,
"grad_norm": 0.09694784134626389,
"learning_rate": 0.0001125194412110861,
"logits/chosen": -5.464062690734863,
"logits/rejected": -7.379687309265137,
"logps/chosen": -310.6000061035156,
"logps/rejected": -564.5999755859375,
"loss": 0.0176,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -24.225000381469727,
"rewards/margins": 24.837499618530273,
"rewards/rejected": -49.025001525878906,
"step": 760
},
{
"epoch": 5.466786355475763,
"grad_norm": 0.035386599600315094,
"learning_rate": 0.00011250063551694892,
"logits/chosen": -5.551562309265137,
"logits/rejected": -7.342187404632568,
"logps/chosen": -306.95001220703125,
"logps/rejected": -559.7000122070312,
"loss": 0.011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -23.700000762939453,
"rewards/margins": 24.424999237060547,
"rewards/rejected": -48.13750076293945,
"step": 765
},
{
"epoch": 5.502692998204668,
"grad_norm": 0.7666543126106262,
"learning_rate": 0.00011248168979621728,
"logits/chosen": -3.92578125,
"logits/rejected": -5.90625,
"logps/chosen": -260.75,
"logps/rejected": -506.8999938964844,
"loss": 0.0214,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -19.481250762939453,
"rewards/margins": 23.65625,
"rewards/rejected": -43.150001525878906,
"step": 770
},
{
"epoch": 5.5385996409335725,
"grad_norm": 0.0009977294830605388,
"learning_rate": 0.00011246260409658705,
"logits/chosen": -4.388281345367432,
"logits/rejected": -7.181250095367432,
"logps/chosen": -261.20001220703125,
"logps/rejected": -591.7000122070312,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": -19.140625,
"rewards/margins": 32.525001525878906,
"rewards/rejected": -51.724998474121094,
"step": 775
},
{
"epoch": 5.574506283662478,
"grad_norm": 0.01044029463082552,
"learning_rate": 0.00011244337846610643,
"logits/chosen": -4.952343940734863,
"logits/rejected": -7.689062595367432,
"logps/chosen": -266.25,
"logps/rejected": -594.2999877929688,
"loss": 0.0375,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -19.524999618530273,
"rewards/margins": 32.75,
"rewards/rejected": -52.26250076293945,
"step": 780
},
{
"epoch": 5.6104129263913824,
"grad_norm": 0.5297859907150269,
"learning_rate": 0.00011242401295317595,
"logits/chosen": -4.126562595367432,
"logits/rejected": -6.842187404632568,
"logps/chosen": -225.5,
"logps/rejected": -552.0,
"loss": 0.0101,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -15.793749809265137,
"rewards/margins": 32.14374923706055,
"rewards/rejected": -47.912498474121094,
"step": 785
},
{
"epoch": 5.646319569120287,
"grad_norm": 5.579909801483154,
"learning_rate": 0.00011240450760654824,
"logits/chosen": -4.430468559265137,
"logits/rejected": -7.059374809265137,
"logps/chosen": -252.64999389648438,
"logps/rejected": -567.7000122070312,
"loss": 0.085,
"rewards/accuracies": 0.984375,
"rewards/chosen": -18.368749618530273,
"rewards/margins": 31.174999237060547,
"rewards/rejected": -49.537498474121094,
"step": 790
},
{
"epoch": 5.682226211849192,
"grad_norm": 3.711723566055298,
"learning_rate": 0.000112384862475328,
"logits/chosen": -4.119531154632568,
"logits/rejected": -6.137499809265137,
"logps/chosen": -228.8000030517578,
"logps/rejected": -467.29998779296875,
"loss": 0.0374,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -16.090625762939453,
"rewards/margins": 23.287500381469727,
"rewards/rejected": -39.375,
"step": 795
},
{
"epoch": 5.718132854578097,
"grad_norm": 0.38980910181999207,
"learning_rate": 0.00011236507760897182,
"logits/chosen": -5.25,
"logits/rejected": -7.279687404632568,
"logps/chosen": -253.60000610351562,
"logps/rejected": -505.0,
"loss": 0.0149,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -18.53125,
"rewards/margins": 24.393749237060547,
"rewards/rejected": -42.9375,
"step": 800
},
{
"epoch": 5.718132854578097,
"eval_logits/chosen": -5.795955657958984,
"eval_logits/rejected": -7.886029243469238,
"eval_logps/chosen": -267.8823547363281,
"eval_logps/rejected": -518.4705810546875,
"eval_loss": 0.0229768268764019,
"eval_rewards/accuracies": 0.9799466133117676,
"eval_rewards/chosen": -19.705883026123047,
"eval_rewards/margins": 24.47058868408203,
"eval_rewards/rejected": -44.132354736328125,
"eval_runtime": 8.5466,
"eval_samples_per_second": 31.241,
"eval_steps_per_second": 1.989,
"step": 800
},
{
"epoch": 5.7540394973070015,
"grad_norm": 0.08777919411659241,
"learning_rate": 0.00011234515305728806,
"logits/chosen": -6.481249809265137,
"logits/rejected": -8.475000381469727,
"logps/chosen": -288.45001220703125,
"logps/rejected": -538.9000244140625,
"loss": 0.0167,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -22.262500762939453,
"rewards/margins": 24.28125,
"rewards/rejected": -46.5625,
"step": 805
},
{
"epoch": 5.789946140035907,
"grad_norm": 0.591503918170929,
"learning_rate": 0.00011232508887043676,
"logits/chosen": -6.189062595367432,
"logits/rejected": -7.951562404632568,
"logps/chosen": -279.8999938964844,
"logps/rejected": -496.5,
"loss": 0.0469,
"rewards/accuracies": 0.984375,
"rewards/chosen": -21.21875,
"rewards/margins": 20.728124618530273,
"rewards/rejected": -41.98749923706055,
"step": 810
},
{
"epoch": 5.825852782764811,
"grad_norm": 0.4492699205875397,
"learning_rate": 0.00011230488509892953,
"logits/chosen": -4.164843559265137,
"logits/rejected": -6.059374809265137,
"logps/chosen": -243.3000030517578,
"logps/rejected": -473.20001220703125,
"loss": 0.0103,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -17.274999618530273,
"rewards/margins": 22.53125,
"rewards/rejected": -39.82500076293945,
"step": 815
},
{
"epoch": 5.861759425493716,
"grad_norm": 3.889610767364502,
"learning_rate": 0.0001122845417936293,
"logits/chosen": -5.235937595367432,
"logits/rejected": -7.465624809265137,
"logps/chosen": -300.8999938964844,
"logps/rejected": -565.2999877929688,
"loss": 0.0517,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -23.375,
"rewards/margins": 25.899999618530273,
"rewards/rejected": -49.3125,
"step": 820
},
{
"epoch": 5.897666068222621,
"grad_norm": 7.49802303314209,
"learning_rate": 0.00011226405900575031,
"logits/chosen": -5.620312690734863,
"logits/rejected": -7.892187595367432,
"logps/chosen": -335.0,
"logps/rejected": -602.4000244140625,
"loss": 0.0668,
"rewards/accuracies": 0.984375,
"rewards/chosen": -26.631250381469727,
"rewards/margins": 26.475000381469727,
"rewards/rejected": -53.099998474121094,
"step": 825
},
{
"epoch": 5.933572710951526,
"grad_norm": 2.100908041000366,
"learning_rate": 0.00011224343678685797,
"logits/chosen": -6.767187595367432,
"logits/rejected": -8.921875,
"logps/chosen": -385.0,
"logps/rejected": -648.7999877929688,
"loss": 0.0552,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -31.762500762939453,
"rewards/margins": 25.825000762939453,
"rewards/rejected": -57.625,
"step": 830
},
{
"epoch": 5.9694793536804305,
"grad_norm": 0.25847867131233215,
"learning_rate": 0.00011222267518886872,
"logits/chosen": -6.832812309265137,
"logits/rejected": -8.893750190734863,
"logps/chosen": -343.79998779296875,
"logps/rejected": -640.9000244140625,
"loss": 0.0192,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -27.587499618530273,
"rewards/margins": 29.162500381469727,
"rewards/rejected": -56.724998474121094,
"step": 835
},
{
"epoch": 6.0,
"grad_norm": 0.0401495099067688,
"learning_rate": 0.00011220177426404981,
"logits/chosen": -5.762867450714111,
"logits/rejected": -8.838234901428223,
"logps/chosen": -296.70587158203125,
"logps/rejected": -677.6470336914062,
"loss": 0.0087,
"rewards/accuracies": 0.9963235259056091,
"rewards/chosen": -22.544116973876953,
"rewards/margins": 37.463233947753906,
"rewards/rejected": -60.014705657958984,
"step": 840
},
{
"epoch": 6.0359066427289045,
"grad_norm": 0.012815682217478752,
"learning_rate": 0.00011218073406501931,
"logits/chosen": -6.901562690734863,
"logits/rejected": -9.628125190734863,
"logps/chosen": -326.0,
"logps/rejected": -691.5999755859375,
"loss": 0.0166,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -25.706249237060547,
"rewards/margins": 36.09375,
"rewards/rejected": -61.79999923706055,
"step": 845
},
{
"epoch": 6.07181328545781,
"grad_norm": 0.030361467972397804,
"learning_rate": 0.0001121595546447459,
"logits/chosen": -6.171875,
"logits/rejected": -8.364062309265137,
"logps/chosen": -288.5,
"logps/rejected": -570.2999877929688,
"loss": 0.0259,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -22.3125,
"rewards/margins": 27.575000762939453,
"rewards/rejected": -49.837501525878906,
"step": 850
},
{
"epoch": 6.07181328545781,
"eval_logits/chosen": -4.895220756530762,
"eval_logits/rejected": -7.121323585510254,
"eval_logps/chosen": -238.05882263183594,
"eval_logps/rejected": -518.5882568359375,
"eval_loss": 0.007082384079694748,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -16.643383026123047,
"eval_rewards/margins": 27.455883026123047,
"eval_rewards/rejected": -44.10293960571289,
"eval_runtime": 8.5014,
"eval_samples_per_second": 31.406,
"eval_steps_per_second": 2.0,
"step": 850
}
],
"logging_steps": 5,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 72,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0001
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}