intervention_agent_2 / trainer_state.json
Abhijnan's picture
Upload 11 files
80ce2f1 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9111617312072893,
"eval_steps": 200,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5.000000000000001e-07,
"logits/chosen": -0.300163596868515,
"logits/rejected": -0.3011459410190582,
"logps/chosen": -418.81268310546875,
"logps/rejected": -421.69482421875,
"loss": 0.6923,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": 0.008436297997832298,
"rewards/margins": 0.001967963995411992,
"rewards/rejected": 0.006468335632234812,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 1.0000000000000002e-06,
"logits/chosen": -0.31174224615097046,
"logits/rejected": -0.3135172724723816,
"logps/chosen": -428.8531799316406,
"logps/rejected": -427.205810546875,
"loss": 0.6951,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.0215766541659832,
"rewards/margins": -0.0034640885423868895,
"rewards/rejected": 0.02504074200987816,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 1.5e-06,
"logits/chosen": -0.2996385097503662,
"logits/rejected": -0.30060532689094543,
"logps/chosen": -416.20086669921875,
"logps/rejected": -412.4971618652344,
"loss": 0.6924,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.030052989721298218,
"rewards/margins": 0.0019294738303869963,
"rewards/rejected": 0.028123509138822556,
"step": 30
},
{
"epoch": 0.01,
"learning_rate": 2.0000000000000003e-06,
"logits/chosen": -0.3022615313529968,
"logits/rejected": -0.3025739789009094,
"logps/chosen": -426.9918518066406,
"logps/rejected": -423.1588439941406,
"loss": 0.692,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.02528352662920952,
"rewards/margins": 0.002774887252599001,
"rewards/rejected": 0.022508641704916954,
"step": 40
},
{
"epoch": 0.02,
"learning_rate": 2.5e-06,
"logits/chosen": -0.30438098311424255,
"logits/rejected": -0.30549854040145874,
"logps/chosen": -421.03363037109375,
"logps/rejected": -421.8212890625,
"loss": 0.6898,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.03513988479971886,
"rewards/margins": 0.007109012454748154,
"rewards/rejected": 0.028030872344970703,
"step": 50
},
{
"epoch": 0.02,
"learning_rate": 3e-06,
"logits/chosen": -0.30687031149864197,
"logits/rejected": -0.3071025013923645,
"logps/chosen": -417.4591369628906,
"logps/rejected": -417.7974548339844,
"loss": 0.6931,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.014512499794363976,
"rewards/margins": 0.00045255664736032486,
"rewards/rejected": 0.014059944078326225,
"step": 60
},
{
"epoch": 0.02,
"learning_rate": 3.5e-06,
"logits/chosen": -0.30733975768089294,
"logits/rejected": -0.3082950711250305,
"logps/chosen": -422.18487548828125,
"logps/rejected": -422.29052734375,
"loss": 0.6887,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.06789219379425049,
"rewards/margins": 0.00933685339987278,
"rewards/rejected": 0.05855534225702286,
"step": 70
},
{
"epoch": 0.02,
"learning_rate": 4.000000000000001e-06,
"logits/chosen": -0.309120774269104,
"logits/rejected": -0.3103254437446594,
"logps/chosen": -424.8710021972656,
"logps/rejected": -423.9234924316406,
"loss": 0.6875,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0613434836268425,
"rewards/margins": 0.011882667429745197,
"rewards/rejected": 0.04946080967783928,
"step": 80
},
{
"epoch": 0.03,
"learning_rate": 4.5e-06,
"logits/chosen": -0.3092747628688812,
"logits/rejected": -0.3102528750896454,
"logps/chosen": -417.56097412109375,
"logps/rejected": -420.48541259765625,
"loss": 0.6882,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.1002880111336708,
"rewards/margins": 0.010797671973705292,
"rewards/rejected": 0.08949033915996552,
"step": 90
},
{
"epoch": 0.03,
"learning_rate": 5e-06,
"logits/chosen": -0.3046155571937561,
"logits/rejected": -0.3053414225578308,
"logps/chosen": -417.95501708984375,
"logps/rejected": -416.2376403808594,
"loss": 0.6748,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.1339004933834076,
"rewards/margins": 0.03802730515599251,
"rewards/rejected": 0.09587319195270538,
"step": 100
},
{
"epoch": 0.03,
"learning_rate": 4.999853306957783e-06,
"logits/chosen": -0.3040740489959717,
"logits/rejected": -0.30468136072158813,
"logps/chosen": -416.46527099609375,
"logps/rejected": -415.51568603515625,
"loss": 0.6714,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.13915565609931946,
"rewards/margins": 0.045606400817632675,
"rewards/rejected": 0.09354925900697708,
"step": 110
},
{
"epoch": 0.04,
"learning_rate": 4.99941324504621e-06,
"logits/chosen": -0.3062252104282379,
"logits/rejected": -0.30699923634529114,
"logps/chosen": -423.4345703125,
"logps/rejected": -421.33477783203125,
"loss": 0.6681,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.25515347719192505,
"rewards/margins": 0.05361776426434517,
"rewards/rejected": 0.20153570175170898,
"step": 120
},
{
"epoch": 0.04,
"learning_rate": 4.998679865908499e-06,
"logits/chosen": -0.3025161623954773,
"logits/rejected": -0.30388832092285156,
"logps/chosen": -421.076416015625,
"logps/rejected": -419.70428466796875,
"loss": 0.6432,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.36134886741638184,
"rewards/margins": 0.10863993316888809,
"rewards/rejected": 0.25270897150039673,
"step": 130
},
{
"epoch": 0.04,
"learning_rate": 4.9976532556099425e-06,
"logits/chosen": -0.29753798246383667,
"logits/rejected": -0.2986024022102356,
"logps/chosen": -423.3164978027344,
"logps/rejected": -420.72918701171875,
"loss": 0.632,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.45896610617637634,
"rewards/margins": 0.13753186166286469,
"rewards/rejected": 0.32143422961235046,
"step": 140
},
{
"epoch": 0.05,
"learning_rate": 4.99633353462781e-06,
"logits/chosen": -0.300027072429657,
"logits/rejected": -0.3015795648097992,
"logps/chosen": -413.91973876953125,
"logps/rejected": -415.4903869628906,
"loss": 0.6428,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.47876471281051636,
"rewards/margins": 0.11648330837488174,
"rewards/rejected": 0.3622814118862152,
"step": 150
},
{
"epoch": 0.05,
"learning_rate": 4.994720857837211e-06,
"logits/chosen": -0.3021107316017151,
"logits/rejected": -0.30334895849227905,
"logps/chosen": -419.66571044921875,
"logps/rejected": -420.95068359375,
"loss": 0.623,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.5215948820114136,
"rewards/margins": 0.1642296016216278,
"rewards/rejected": 0.35736531019210815,
"step": 160
},
{
"epoch": 0.05,
"learning_rate": 4.992815414492917e-06,
"logits/chosen": -0.29045212268829346,
"logits/rejected": -0.29103735089302063,
"logps/chosen": -411.07635498046875,
"logps/rejected": -411.93463134765625,
"loss": 0.6303,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.5044211149215698,
"rewards/margins": 0.16038301587104797,
"rewards/rejected": 0.34403812885284424,
"step": 170
},
{
"epoch": 0.05,
"learning_rate": 4.990617428207153e-06,
"logits/chosen": -0.29839888215065,
"logits/rejected": -0.29893797636032104,
"logps/chosen": -430.1136169433594,
"logps/rejected": -428.3583984375,
"loss": 0.6029,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.5385109782218933,
"rewards/margins": 0.22802197933197021,
"rewards/rejected": 0.3104889690876007,
"step": 180
},
{
"epoch": 0.06,
"learning_rate": 4.988127156923355e-06,
"logits/chosen": -0.2956782281398773,
"logits/rejected": -0.2963833212852478,
"logps/chosen": -415.17071533203125,
"logps/rejected": -414.58148193359375,
"loss": 0.6078,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.4606494903564453,
"rewards/margins": 0.22775804996490479,
"rewards/rejected": 0.23289147019386292,
"step": 190
},
{
"epoch": 0.06,
"learning_rate": 4.985344892885899e-06,
"logits/chosen": -0.29678258299827576,
"logits/rejected": -0.2977609634399414,
"logps/chosen": -416.76275634765625,
"logps/rejected": -419.5223693847656,
"loss": 0.5821,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.44212013483047485,
"rewards/margins": 0.29724568128585815,
"rewards/rejected": 0.1448744386434555,
"step": 200
},
{
"epoch": 0.06,
"eval_logits/chosen": -0.35281771421432495,
"eval_logits/rejected": -0.35360345244407654,
"eval_logps/chosen": -408.5499267578125,
"eval_logps/rejected": -409.8388977050781,
"eval_loss": 0.5728641152381897,
"eval_rewards/accuracies": 0.7260000109672546,
"eval_rewards/chosen": 0.43412691354751587,
"eval_rewards/margins": 0.3201069235801697,
"eval_rewards/rejected": 0.11402001231908798,
"eval_runtime": 351.7745,
"eval_samples_per_second": 1.421,
"eval_steps_per_second": 1.421,
"step": 200
},
{
"epoch": 0.06,
"learning_rate": 4.9822709626058065e-06,
"logits/chosen": -0.29128286242485046,
"logits/rejected": -0.2920396327972412,
"logps/chosen": -416.55322265625,
"logps/rejected": -417.397216796875,
"loss": 0.5743,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.42414647340774536,
"rewards/margins": 0.3180859684944153,
"rewards/rejected": 0.1060604602098465,
"step": 210
},
{
"epoch": 0.07,
"learning_rate": 4.978905726822424e-06,
"logits/chosen": -0.29205116629600525,
"logits/rejected": -0.2932327687740326,
"logps/chosen": -429.031005859375,
"logps/rejected": -432.4542541503906,
"loss": 0.5944,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.34927603602409363,
"rewards/margins": 0.2745763659477234,
"rewards/rejected": 0.07469968497753143,
"step": 220
},
{
"epoch": 0.07,
"learning_rate": 4.975249580461092e-06,
"logits/chosen": -0.29278379678726196,
"logits/rejected": -0.29318395256996155,
"logps/chosen": -415.50640869140625,
"logps/rejected": -414.65631103515625,
"loss": 0.6108,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.20576027035713196,
"rewards/margins": 0.24422487616539001,
"rewards/rejected": -0.038464583456516266,
"step": 230
},
{
"epoch": 0.07,
"learning_rate": 4.971302952586796e-06,
"logits/chosen": -0.2884067893028259,
"logits/rejected": -0.2890322208404541,
"logps/chosen": -411.9427795410156,
"logps/rejected": -418.1693420410156,
"loss": 0.553,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.25125259160995483,
"rewards/margins": 0.39369240403175354,
"rewards/rejected": -0.1424398422241211,
"step": 240
},
{
"epoch": 0.08,
"learning_rate": 4.967066306353816e-06,
"logits/chosen": -0.28915414214134216,
"logits/rejected": -0.29073747992515564,
"logps/chosen": -417.0771484375,
"logps/rejected": -419.65380859375,
"loss": 0.5598,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.25535959005355835,
"rewards/margins": 0.40900731086730957,
"rewards/rejected": -0.15364770591259003,
"step": 250
},
{
"epoch": 0.08,
"learning_rate": 4.962540138951371e-06,
"logits/chosen": -0.2950271964073181,
"logits/rejected": -0.29611852765083313,
"logps/chosen": -420.79681396484375,
"logps/rejected": -425.1570739746094,
"loss": 0.5278,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.24652545154094696,
"rewards/margins": 0.48342761397361755,
"rewards/rejected": -0.2369021624326706,
"step": 260
},
{
"epoch": 0.08,
"learning_rate": 4.957724981545276e-06,
"logits/chosen": -0.28752994537353516,
"logits/rejected": -0.2876993417739868,
"logps/chosen": -413.72808837890625,
"logps/rejected": -418.2240295410156,
"loss": 0.5369,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.144112229347229,
"rewards/margins": 0.48878079652786255,
"rewards/rejected": -0.34466850757598877,
"step": 270
},
{
"epoch": 0.09,
"learning_rate": 4.952621399215598e-06,
"logits/chosen": -0.29713207483291626,
"logits/rejected": -0.29806575179100037,
"logps/chosen": -420.4150390625,
"logps/rejected": -428.95513916015625,
"loss": 0.5325,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.011465489864349365,
"rewards/margins": 0.47427234053611755,
"rewards/rejected": -0.4857378602027893,
"step": 280
},
{
"epoch": 0.09,
"learning_rate": 4.947229990890356e-06,
"logits/chosen": -0.285542756319046,
"logits/rejected": -0.28633180260658264,
"logps/chosen": -420.0926208496094,
"logps/rejected": -423.4457092285156,
"loss": 0.5193,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.08504172414541245,
"rewards/margins": 0.5871935486793518,
"rewards/rejected": -0.6722352504730225,
"step": 290
},
{
"epoch": 0.09,
"learning_rate": 4.941551389275217e-06,
"logits/chosen": -0.2842163145542145,
"logits/rejected": -0.28539806604385376,
"logps/chosen": -421.17822265625,
"logps/rejected": -424.78387451171875,
"loss": 0.5631,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.19560113549232483,
"rewards/margins": 0.5197954177856445,
"rewards/rejected": -0.715396523475647,
"step": 300
},
{
"epoch": 0.09,
"learning_rate": 4.935586260779261e-06,
"logits/chosen": -0.2907197177410126,
"logits/rejected": -0.29180362820625305,
"logps/chosen": -427.5953063964844,
"logps/rejected": -431.76788330078125,
"loss": 0.5331,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.4097444415092468,
"rewards/margins": 0.5406568646430969,
"rewards/rejected": -0.9504014253616333,
"step": 310
},
{
"epoch": 0.1,
"learning_rate": 4.929335305436764e-06,
"logits/chosen": -0.2902284264564514,
"logits/rejected": -0.2910650670528412,
"logps/chosen": -427.05621337890625,
"logps/rejected": -427.26904296875,
"loss": 0.5694,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.4887164533138275,
"rewards/margins": 0.4573966860771179,
"rewards/rejected": -0.9461132287979126,
"step": 320
},
{
"epoch": 0.1,
"learning_rate": 4.922799256825052e-06,
"logits/chosen": -0.30178460478782654,
"logits/rejected": -0.3031577467918396,
"logps/chosen": -432.64544677734375,
"logps/rejected": -437.355712890625,
"loss": 0.5759,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5430983304977417,
"rewards/margins": 0.5063012838363647,
"rewards/rejected": -1.0493996143341064,
"step": 330
},
{
"epoch": 0.1,
"learning_rate": 4.915978881978407e-06,
"logits/chosen": -0.2879001498222351,
"logits/rejected": -0.28882110118865967,
"logps/chosen": -418.3189392089844,
"logps/rejected": -420.14349365234375,
"loss": 0.5114,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.39648348093032837,
"rewards/margins": 0.6386500000953674,
"rewards/rejected": -1.0351333618164062,
"step": 340
},
{
"epoch": 0.11,
"learning_rate": 4.908874981298058e-06,
"logits/chosen": -0.29214486479759216,
"logits/rejected": -0.29305440187454224,
"logps/chosen": -421.3182678222656,
"logps/rejected": -427.06317138671875,
"loss": 0.5628,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5688936114311218,
"rewards/margins": 0.5135782957077026,
"rewards/rejected": -1.0824719667434692,
"step": 350
},
{
"epoch": 0.11,
"learning_rate": 4.901488388458247e-06,
"logits/chosen": -0.2956882119178772,
"logits/rejected": -0.29717716574668884,
"logps/chosen": -429.40850830078125,
"logps/rejected": -432.0194396972656,
"loss": 0.5326,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5142576098442078,
"rewards/margins": 0.5949846506118774,
"rewards/rejected": -1.1092422008514404,
"step": 360
},
{
"epoch": 0.11,
"learning_rate": 4.893819970308394e-06,
"logits/chosen": -0.29191336035728455,
"logits/rejected": -0.2928611636161804,
"logps/chosen": -432.4073181152344,
"logps/rejected": -437.53472900390625,
"loss": 0.5255,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5648801922798157,
"rewards/margins": 0.5746434926986694,
"rewards/rejected": -1.1395236253738403,
"step": 370
},
{
"epoch": 0.12,
"learning_rate": 4.885870626771371e-06,
"logits/chosen": -0.2915678322315216,
"logits/rejected": -0.2924065887928009,
"logps/chosen": -421.0965881347656,
"logps/rejected": -425.9581604003906,
"loss": 0.5565,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6075869798660278,
"rewards/margins": 0.576026201248169,
"rewards/rejected": -1.1836131811141968,
"step": 380
},
{
"epoch": 0.12,
"learning_rate": 4.8776412907378845e-06,
"logits/chosen": -0.29022809863090515,
"logits/rejected": -0.2918907701969147,
"logps/chosen": -422.2085876464844,
"logps/rejected": -425.4307556152344,
"loss": 0.5346,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6413823962211609,
"rewards/margins": 0.6153510808944702,
"rewards/rejected": -1.2567334175109863,
"step": 390
},
{
"epoch": 0.12,
"learning_rate": 4.869132927957007e-06,
"logits/chosen": -0.2912658751010895,
"logits/rejected": -0.292255163192749,
"logps/chosen": -424.4219665527344,
"logps/rejected": -430.76885986328125,
"loss": 0.53,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7030640840530396,
"rewards/margins": 0.6264012455940247,
"rewards/rejected": -1.329465389251709,
"step": 400
},
{
"epoch": 0.12,
"eval_logits/chosen": -0.3515583574771881,
"eval_logits/rejected": -0.35239377617836,
"eval_logps/chosen": -419.6265563964844,
"eval_logps/rejected": -424.9375,
"eval_loss": 0.5038847327232361,
"eval_rewards/accuracies": 0.7379999756813049,
"eval_rewards/chosen": -0.6735388040542603,
"eval_rewards/margins": 0.7223072648048401,
"eval_rewards/rejected": -1.3958461284637451,
"eval_runtime": 375.1774,
"eval_samples_per_second": 1.333,
"eval_steps_per_second": 1.333,
"step": 400
},
{
"epoch": 0.12,
"learning_rate": 4.860346536922834e-06,
"logits/chosen": -0.29377973079681396,
"logits/rejected": -0.294566810131073,
"logps/chosen": -429.86907958984375,
"logps/rejected": -432.5889587402344,
"loss": 0.529,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7517430782318115,
"rewards/margins": 0.6350258588790894,
"rewards/rejected": -1.3867689371109009,
"step": 410
},
{
"epoch": 0.13,
"learning_rate": 4.85128314875731e-06,
"logits/chosen": -0.2876330316066742,
"logits/rejected": -0.2890221178531647,
"logps/chosen": -433.5904846191406,
"logps/rejected": -438.02886962890625,
"loss": 0.5174,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7357751131057739,
"rewards/margins": 0.6541243195533752,
"rewards/rejected": -1.389899492263794,
"step": 420
},
{
"epoch": 0.13,
"learning_rate": 4.841943827089223e-06,
"logits/chosen": -0.30073267221450806,
"logits/rejected": -0.3028663098812103,
"logps/chosen": -438.89056396484375,
"logps/rejected": -444.29443359375,
"loss": 0.5427,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7469267845153809,
"rewards/margins": 0.6645030379295349,
"rewards/rejected": -1.411429762840271,
"step": 430
},
{
"epoch": 0.13,
"learning_rate": 4.832329667929378e-06,
"logits/chosen": -0.30401021242141724,
"logits/rejected": -0.305408775806427,
"logps/chosen": -436.4923400878906,
"logps/rejected": -443.785400390625,
"loss": 0.4856,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7162739634513855,
"rewards/margins": 0.7617406845092773,
"rewards/rejected": -1.478014588356018,
"step": 440
},
{
"epoch": 0.14,
"learning_rate": 4.822441799541979e-06,
"logits/chosen": -0.29748016595840454,
"logits/rejected": -0.2987380027770996,
"logps/chosen": -432.2513122558594,
"logps/rejected": -439.78741455078125,
"loss": 0.5138,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.8289654850959778,
"rewards/margins": 0.7209790349006653,
"rewards/rejected": -1.549944519996643,
"step": 450
},
{
"epoch": 0.14,
"learning_rate": 4.812281382312222e-06,
"logits/chosen": -0.28938063979148865,
"logits/rejected": -0.2903631031513214,
"logps/chosen": -421.52337646484375,
"logps/rejected": -426.65142822265625,
"loss": 0.4934,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7328917980194092,
"rewards/margins": 0.7723864316940308,
"rewards/rejected": -1.5052781105041504,
"step": 460
},
{
"epoch": 0.14,
"learning_rate": 4.801849608610119e-06,
"logits/chosen": -0.2995319366455078,
"logits/rejected": -0.3008275330066681,
"logps/chosen": -437.45916748046875,
"logps/rejected": -443.75799560546875,
"loss": 0.4984,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9069220423698425,
"rewards/margins": 0.7756569981575012,
"rewards/rejected": -1.6825790405273438,
"step": 470
},
{
"epoch": 0.15,
"learning_rate": 4.7911477026505656e-06,
"logits/chosen": -0.2930867373943329,
"logits/rejected": -0.2938670516014099,
"logps/chosen": -436.32305908203125,
"logps/rejected": -439.0968322753906,
"loss": 0.4882,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.054971694946289,
"rewards/margins": 0.8004587292671204,
"rewards/rejected": -1.8554306030273438,
"step": 480
},
{
"epoch": 0.15,
"learning_rate": 4.780176920349675e-06,
"logits/chosen": -0.2880414128303528,
"logits/rejected": -0.2893609404563904,
"logps/chosen": -426.8358459472656,
"logps/rejected": -432.79248046875,
"loss": 0.5123,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1646369695663452,
"rewards/margins": 0.739470362663269,
"rewards/rejected": -1.9041073322296143,
"step": 490
},
{
"epoch": 0.15,
"learning_rate": 4.7689385491773934e-06,
"logits/chosen": -0.3000113070011139,
"logits/rejected": -0.3008071780204773,
"logps/chosen": -442.62860107421875,
"logps/rejected": -446.14825439453125,
"loss": 0.5871,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.3722599744796753,
"rewards/margins": 0.6036561131477356,
"rewards/rejected": -1.9759161472320557,
"step": 500
},
{
"epoch": 0.15,
"learning_rate": 4.7574339080064046e-06,
"logits/chosen": -0.2956729829311371,
"logits/rejected": -0.29699647426605225,
"logps/chosen": -432.17486572265625,
"logps/rejected": -441.1890563964844,
"loss": 0.4989,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2666178941726685,
"rewards/margins": 0.7352741956710815,
"rewards/rejected": -2.001891851425171,
"step": 510
},
{
"epoch": 0.16,
"learning_rate": 4.745664346957362e-06,
"logits/chosen": -0.29319706559181213,
"logits/rejected": -0.2932819724082947,
"logps/chosen": -441.1473083496094,
"logps/rejected": -443.6536560058594,
"loss": 0.5431,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1980129480361938,
"rewards/margins": 0.7274158000946045,
"rewards/rejected": -1.9254287481307983,
"step": 520
},
{
"epoch": 0.16,
"learning_rate": 4.733631247240435e-06,
"logits/chosen": -0.28386861085891724,
"logits/rejected": -0.28545230627059937,
"logps/chosen": -424.7322692871094,
"logps/rejected": -432.74920654296875,
"loss": 0.5172,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3047645092010498,
"rewards/margins": 0.7416442632675171,
"rewards/rejected": -2.0464088916778564,
"step": 530
},
{
"epoch": 0.16,
"learning_rate": 4.721336020993228e-06,
"logits/chosen": -0.29582637548446655,
"logits/rejected": -0.2965632379055023,
"logps/chosen": -428.98992919921875,
"logps/rejected": -436.71533203125,
"loss": 0.5223,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.232280969619751,
"rewards/margins": 0.7531365752220154,
"rewards/rejected": -1.9854176044464111,
"step": 540
},
{
"epoch": 0.17,
"learning_rate": 4.708780111115058e-06,
"logits/chosen": -0.3022860884666443,
"logits/rejected": -0.303489625453949,
"logps/chosen": -434.28936767578125,
"logps/rejected": -439.01043701171875,
"loss": 0.506,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1110032796859741,
"rewards/margins": 0.7986260652542114,
"rewards/rejected": -1.909629225730896,
"step": 550
},
{
"epoch": 0.17,
"learning_rate": 4.6959649910976165e-06,
"logits/chosen": -0.3028009533882141,
"logits/rejected": -0.3035816550254822,
"logps/chosen": -433.6151428222656,
"logps/rejected": -436.40045166015625,
"loss": 0.5109,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.0793737173080444,
"rewards/margins": 0.753380537033081,
"rewards/rejected": -1.832754373550415,
"step": 560
},
{
"epoch": 0.17,
"learning_rate": 4.682892164852057e-06,
"logits/chosen": -0.29320716857910156,
"logits/rejected": -0.29399818181991577,
"logps/chosen": -428.3548889160156,
"logps/rejected": -433.96124267578125,
"loss": 0.5566,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1920106410980225,
"rewards/margins": 0.6818917989730835,
"rewards/rejected": -1.8739025592803955,
"step": 570
},
{
"epoch": 0.18,
"learning_rate": 4.669563166532504e-06,
"logits/chosen": -0.29630088806152344,
"logits/rejected": -0.2984740138053894,
"logps/chosen": -428.59405517578125,
"logps/rejected": -439.8580017089844,
"loss": 0.5099,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0762312412261963,
"rewards/margins": 0.8075464963912964,
"rewards/rejected": -1.8837776184082031,
"step": 580
},
{
"epoch": 0.18,
"learning_rate": 4.655979560356006e-06,
"logits/chosen": -0.299476683139801,
"logits/rejected": -0.30079394578933716,
"logps/chosen": -437.24359130859375,
"logps/rejected": -444.2662048339844,
"loss": 0.4679,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0544074773788452,
"rewards/margins": 0.8957304954528809,
"rewards/rejected": -1.9501378536224365,
"step": 590
},
{
"epoch": 0.18,
"learning_rate": 4.642142940418973e-06,
"logits/chosen": -0.3016494810581207,
"logits/rejected": -0.3028479218482971,
"logps/chosen": -428.2562561035156,
"logps/rejected": -436.1544494628906,
"loss": 0.4446,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.1558116674423218,
"rewards/margins": 0.9864055514335632,
"rewards/rejected": -2.1422171592712402,
"step": 600
},
{
"epoch": 0.18,
"eval_logits/chosen": -0.3611030876636505,
"eval_logits/rejected": -0.36189064383506775,
"eval_logps/chosen": -425.2828674316406,
"eval_logps/rejected": -432.32147216796875,
"eval_loss": 0.4912301301956177,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -1.239168405532837,
"eval_rewards/margins": 0.895074725151062,
"eval_rewards/rejected": -2.1342432498931885,
"eval_runtime": 376.2893,
"eval_samples_per_second": 1.329,
"eval_steps_per_second": 1.329,
"step": 600
},
{
"epoch": 0.19,
"learning_rate": 4.6280549305101065e-06,
"logits/chosen": -0.30701732635498047,
"logits/rejected": -0.30861714482307434,
"logps/chosen": -430.90643310546875,
"logps/rejected": -437.9549255371094,
"loss": 0.545,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.4101296663284302,
"rewards/margins": 0.7359235286712646,
"rewards/rejected": -2.1460530757904053,
"step": 610
},
{
"epoch": 0.19,
"learning_rate": 4.61371718391983e-06,
"logits/chosen": -0.30552786588668823,
"logits/rejected": -0.30662640929222107,
"logps/chosen": -432.50506591796875,
"logps/rejected": -443.53216552734375,
"loss": 0.486,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.181477427482605,
"rewards/margins": 0.9552658796310425,
"rewards/rejected": -2.1367435455322266,
"step": 620
},
{
"epoch": 0.19,
"learning_rate": 4.599131383246277e-06,
"logits/chosen": -0.308699369430542,
"logits/rejected": -0.308963418006897,
"logps/chosen": -443.76837158203125,
"logps/rejected": -452.234130859375,
"loss": 0.5178,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.5818192958831787,
"rewards/margins": 0.70851069688797,
"rewards/rejected": -2.290329933166504,
"step": 630
},
{
"epoch": 0.19,
"learning_rate": 4.584299240197826e-06,
"logits/chosen": -0.29901835322380066,
"logits/rejected": -0.2997357249259949,
"logps/chosen": -437.3292541503906,
"logps/rejected": -438.70013427734375,
"loss": 0.4941,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4868736267089844,
"rewards/margins": 0.9190858602523804,
"rewards/rejected": -2.405959129333496,
"step": 640
},
{
"epoch": 0.2,
"learning_rate": 4.569222495392227e-06,
"logits/chosen": -0.30075928568840027,
"logits/rejected": -0.30218517780303955,
"logps/chosen": -437.5245056152344,
"logps/rejected": -447.72271728515625,
"loss": 0.4425,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6097447872161865,
"rewards/margins": 1.0334211587905884,
"rewards/rejected": -2.6431655883789062,
"step": 650
},
{
"epoch": 0.2,
"learning_rate": 4.553902918152329e-06,
"logits/chosen": -0.3034583628177643,
"logits/rejected": -0.3045238256454468,
"logps/chosen": -439.45159912109375,
"logps/rejected": -448.080322265625,
"loss": 0.4796,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7247978448867798,
"rewards/margins": 1.0205966234207153,
"rewards/rejected": -2.745394229888916,
"step": 660
},
{
"epoch": 0.2,
"learning_rate": 4.5383423062984455e-06,
"logits/chosen": -0.3042409420013428,
"logits/rejected": -0.3053613603115082,
"logps/chosen": -432.8832092285156,
"logps/rejected": -440.6971130371094,
"loss": 0.468,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8063684701919556,
"rewards/margins": 0.9467433094978333,
"rewards/rejected": -2.7531113624572754,
"step": 670
},
{
"epoch": 0.21,
"learning_rate": 4.522542485937369e-06,
"logits/chosen": -0.2990413308143616,
"logits/rejected": -0.3002299666404724,
"logps/chosen": -435.41754150390625,
"logps/rejected": -442.31201171875,
"loss": 0.4606,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9637155532836914,
"rewards/margins": 0.984288215637207,
"rewards/rejected": -2.9480037689208984,
"step": 680
},
{
"epoch": 0.21,
"learning_rate": 4.5065053112480725e-06,
"logits/chosen": -0.3054850697517395,
"logits/rejected": -0.3073977530002594,
"logps/chosen": -433.15771484375,
"logps/rejected": -440.9410095214844,
"loss": 0.4933,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.1777331829071045,
"rewards/margins": 0.8640511631965637,
"rewards/rejected": -3.0417845249176025,
"step": 690
},
{
"epoch": 0.21,
"learning_rate": 4.49023266426411e-06,
"logits/chosen": -0.30031442642211914,
"logits/rejected": -0.3014809787273407,
"logps/chosen": -441.3443908691406,
"logps/rejected": -447.56231689453125,
"loss": 0.5213,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.0933678150177,
"rewards/margins": 0.8899556994438171,
"rewards/rejected": -2.983323574066162,
"step": 700
},
{
"epoch": 0.22,
"learning_rate": 4.473726454652755e-06,
"logits/chosen": -0.2997979521751404,
"logits/rejected": -0.30115145444869995,
"logps/chosen": -440.00372314453125,
"logps/rejected": -449.6446838378906,
"loss": 0.4733,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.0547962188720703,
"rewards/margins": 1.1024844646453857,
"rewards/rejected": -3.157280445098877,
"step": 710
},
{
"epoch": 0.22,
"learning_rate": 4.45698861949089e-06,
"logits/chosen": -0.3066961169242859,
"logits/rejected": -0.3076573610305786,
"logps/chosen": -442.42303466796875,
"logps/rejected": -448.47686767578125,
"loss": 0.5236,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.1251652240753174,
"rewards/margins": 0.8965371251106262,
"rewards/rejected": -3.021702289581299,
"step": 720
},
{
"epoch": 0.22,
"learning_rate": 4.440021123037683e-06,
"logits/chosen": -0.29265230894088745,
"logits/rejected": -0.29371362924575806,
"logps/chosen": -441.66900634765625,
"logps/rejected": -450.8470153808594,
"loss": 0.5327,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.108212947845459,
"rewards/margins": 0.8388462066650391,
"rewards/rejected": -2.947059154510498,
"step": 730
},
{
"epoch": 0.22,
"learning_rate": 4.422825956504073e-06,
"logits/chosen": -0.3069104254245758,
"logits/rejected": -0.3083550035953522,
"logps/chosen": -449.7119140625,
"logps/rejected": -459.4678649902344,
"loss": 0.5117,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.1724143028259277,
"rewards/margins": 0.8879534602165222,
"rewards/rejected": -3.0603675842285156,
"step": 740
},
{
"epoch": 0.23,
"learning_rate": 4.4054051378190915e-06,
"logits/chosen": -0.30406031012535095,
"logits/rejected": -0.30475375056266785,
"logps/chosen": -447.04022216796875,
"logps/rejected": -452.49658203125,
"loss": 0.493,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.098881483078003,
"rewards/margins": 0.9250394105911255,
"rewards/rejected": -3.023920774459839,
"step": 750
},
{
"epoch": 0.23,
"learning_rate": 4.387760711393052e-06,
"logits/chosen": -0.3125828206539154,
"logits/rejected": -0.3135472536087036,
"logps/chosen": -441.21337890625,
"logps/rejected": -446.6187438964844,
"loss": 0.5226,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.1020660400390625,
"rewards/margins": 0.8715343475341797,
"rewards/rejected": -2.973600387573242,
"step": 760
},
{
"epoch": 0.23,
"learning_rate": 4.369894747877627e-06,
"logits/chosen": -0.30844077467918396,
"logits/rejected": -0.3093765676021576,
"logps/chosen": -439.68060302734375,
"logps/rejected": -447.6014709472656,
"loss": 0.4748,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9754493236541748,
"rewards/margins": 1.023809552192688,
"rewards/rejected": -2.9992587566375732,
"step": 770
},
{
"epoch": 0.24,
"learning_rate": 4.3518093439228484e-06,
"logits/chosen": -0.309563547372818,
"logits/rejected": -0.3104109764099121,
"logps/chosen": -442.0809631347656,
"logps/rejected": -449.5039978027344,
"loss": 0.4696,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.0284624099731445,
"rewards/margins": 0.8842188119888306,
"rewards/rejected": -2.9126813411712646,
"step": 780
},
{
"epoch": 0.24,
"learning_rate": 4.333506621931056e-06,
"logits/chosen": -0.3095022737979889,
"logits/rejected": -0.3111112713813782,
"logps/chosen": -441.48736572265625,
"logps/rejected": -452.59466552734375,
"loss": 0.4302,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.7804081439971924,
"rewards/margins": 1.1726500988006592,
"rewards/rejected": -2.9530580043792725,
"step": 790
},
{
"epoch": 0.24,
"learning_rate": 4.3149887298078275e-06,
"logits/chosen": -0.3100133538246155,
"logits/rejected": -0.3110717535018921,
"logps/chosen": -439.8687438964844,
"logps/rejected": -447.7996520996094,
"loss": 0.4705,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8933576345443726,
"rewards/margins": 1.0525233745574951,
"rewards/rejected": -2.9458811283111572,
"step": 800
},
{
"epoch": 0.24,
"eval_logits/chosen": -0.3688412606716156,
"eval_logits/rejected": -0.3696078956127167,
"eval_logps/chosen": -432.4996643066406,
"eval_logps/rejected": -440.57073974609375,
"eval_loss": 0.4888974726200104,
"eval_rewards/accuracies": 0.7599999904632568,
"eval_rewards/chosen": -1.9608467817306519,
"eval_rewards/margins": 0.9983222484588623,
"eval_rewards/rejected": -2.9591689109802246,
"eval_runtime": 376.2946,
"eval_samples_per_second": 1.329,
"eval_steps_per_second": 1.329,
"step": 800
},
{
"epoch": 0.25,
"learning_rate": 4.296257840709906e-06,
"logits/chosen": -0.3060837686061859,
"logits/rejected": -0.30729439854621887,
"logps/chosen": -443.59765625,
"logps/rejected": -454.3882751464844,
"loss": 0.4934,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0910544395446777,
"rewards/margins": 0.964927077293396,
"rewards/rejected": -3.0559818744659424,
"step": 810
},
{
"epoch": 0.25,
"learning_rate": 4.277316152790177e-06,
"logits/chosen": -0.3090333938598633,
"logits/rejected": -0.3097476363182068,
"logps/chosen": -446.78564453125,
"logps/rejected": -453.74859619140625,
"loss": 0.5066,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.2516064643859863,
"rewards/margins": 0.9105945825576782,
"rewards/rejected": -3.162201404571533,
"step": 820
},
{
"epoch": 0.25,
"learning_rate": 4.2581658889397e-06,
"logits/chosen": -0.2983805537223816,
"logits/rejected": -0.29977601766586304,
"logps/chosen": -434.3565979003906,
"logps/rejected": -444.49542236328125,
"loss": 0.4289,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.9821250438690186,
"rewards/margins": 1.0745497941970825,
"rewards/rejected": -3.0566749572753906,
"step": 830
},
{
"epoch": 0.26,
"learning_rate": 4.238809296526847e-06,
"logits/chosen": -0.30951178073883057,
"logits/rejected": -0.31038326025009155,
"logps/chosen": -453.48419189453125,
"logps/rejected": -461.54833984375,
"loss": 0.523,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.3027613162994385,
"rewards/margins": 0.8140772581100464,
"rewards/rejected": -3.1168384552001953,
"step": 840
},
{
"epoch": 0.26,
"learning_rate": 4.219248647133559e-06,
"logits/chosen": -0.3112717568874359,
"logits/rejected": -0.3124113082885742,
"logps/chosen": -437.2984313964844,
"logps/rejected": -447.7862243652344,
"loss": 0.4619,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.230332612991333,
"rewards/margins": 1.0623276233673096,
"rewards/rejected": -3.2926604747772217,
"step": 850
},
{
"epoch": 0.26,
"learning_rate": 4.19948623628877e-06,
"logits/chosen": -0.3127744495868683,
"logits/rejected": -0.31366902589797974,
"logps/chosen": -451.15966796875,
"logps/rejected": -458.08154296875,
"loss": 0.5186,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.2668843269348145,
"rewards/margins": 0.8956031799316406,
"rewards/rejected": -3.162487268447876,
"step": 860
},
{
"epoch": 0.26,
"learning_rate": 4.179524383199016e-06,
"logits/chosen": -0.30885085463523865,
"logits/rejected": -0.3100178837776184,
"logps/chosen": -445.05670166015625,
"logps/rejected": -453.55328369140625,
"loss": 0.4533,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1777684688568115,
"rewards/margins": 1.1419053077697754,
"rewards/rejected": -3.319674253463745,
"step": 870
},
{
"epoch": 0.27,
"learning_rate": 4.159365430476262e-06,
"logits/chosen": -0.30774661898612976,
"logits/rejected": -0.3091534674167633,
"logps/chosen": -445.9901428222656,
"logps/rejected": -453.9535217285156,
"loss": 0.4711,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2285873889923096,
"rewards/margins": 1.0858628749847412,
"rewards/rejected": -3.31445050239563,
"step": 880
},
{
"epoch": 0.27,
"learning_rate": 4.139011743862991e-06,
"logits/chosen": -0.31220975518226624,
"logits/rejected": -0.31295710802078247,
"logps/chosen": -437.8184509277344,
"logps/rejected": -450.45611572265625,
"loss": 0.4411,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.122331142425537,
"rewards/margins": 1.2842220067977905,
"rewards/rejected": -3.406553268432617,
"step": 890
},
{
"epoch": 0.27,
"learning_rate": 4.11846571195457e-06,
"logits/chosen": -0.30749282240867615,
"logits/rejected": -0.3092586398124695,
"logps/chosen": -445.489013671875,
"logps/rejected": -456.45361328125,
"loss": 0.4331,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.122631788253784,
"rewards/margins": 1.2757575511932373,
"rewards/rejected": -3.3983893394470215,
"step": 900
},
{
"epoch": 0.28,
"learning_rate": 4.0977297459189405e-06,
"logits/chosen": -0.31161195039749146,
"logits/rejected": -0.3124944865703583,
"logps/chosen": -448.9032287597656,
"logps/rejected": -456.729248046875,
"loss": 0.4549,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.3067939281463623,
"rewards/margins": 1.165810227394104,
"rewards/rejected": -3.472604274749756,
"step": 910
},
{
"epoch": 0.28,
"learning_rate": 4.076806279213656e-06,
"logits/chosen": -0.311604380607605,
"logits/rejected": -0.312518447637558,
"logps/chosen": -438.07916259765625,
"logps/rejected": -450.47381591796875,
"loss": 0.4232,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2351415157318115,
"rewards/margins": 1.1904911994934082,
"rewards/rejected": -3.425632953643799,
"step": 920
},
{
"epoch": 0.28,
"learning_rate": 4.055697767300302e-06,
"logits/chosen": -0.3170091211795807,
"logits/rejected": -0.31755563616752625,
"logps/chosen": -442.83758544921875,
"logps/rejected": -450.9444885253906,
"loss": 0.5088,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.335662841796875,
"rewards/margins": 1.0687111616134644,
"rewards/rejected": -3.40437388420105,
"step": 930
},
{
"epoch": 0.29,
"learning_rate": 4.034406687356344e-06,
"logits/chosen": -0.3176030218601227,
"logits/rejected": -0.31867748498916626,
"logps/chosen": -438.16229248046875,
"logps/rejected": -446.01806640625,
"loss": 0.5146,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.501446008682251,
"rewards/margins": 0.903441309928894,
"rewards/rejected": -3.4048874378204346,
"step": 940
},
{
"epoch": 0.29,
"learning_rate": 4.012935537984414e-06,
"logits/chosen": -0.31417202949523926,
"logits/rejected": -0.3148192763328552,
"logps/chosen": -435.503173828125,
"logps/rejected": -444.60400390625,
"loss": 0.5049,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.4111835956573486,
"rewards/margins": 0.8773403167724609,
"rewards/rejected": -3.2885234355926514,
"step": 950
},
{
"epoch": 0.29,
"learning_rate": 3.991286838919086e-06,
"logits/chosen": -0.30995315313339233,
"logits/rejected": -0.31148335337638855,
"logps/chosen": -440.8172912597656,
"logps/rejected": -452.94268798828125,
"loss": 0.4584,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.2748212814331055,
"rewards/margins": 1.0933376550674438,
"rewards/rejected": -3.3681588172912598,
"step": 960
},
{
"epoch": 0.29,
"learning_rate": 3.969463130731183e-06,
"logits/chosen": -0.3108167052268982,
"logits/rejected": -0.31207841634750366,
"logps/chosen": -443.76409912109375,
"logps/rejected": -456.50286865234375,
"loss": 0.4063,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.237427234649658,
"rewards/margins": 1.2730433940887451,
"rewards/rejected": -3.5104706287384033,
"step": 970
},
{
"epoch": 0.3,
"learning_rate": 3.947466974529622e-06,
"logits/chosen": -0.3074961304664612,
"logits/rejected": -0.30913347005844116,
"logps/chosen": -451.47320556640625,
"logps/rejected": -461.1958923339844,
"loss": 0.4688,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.525216817855835,
"rewards/margins": 1.2734287977218628,
"rewards/rejected": -3.798645496368408,
"step": 980
},
{
"epoch": 0.3,
"learning_rate": 3.925300951660859e-06,
"logits/chosen": -0.3098825216293335,
"logits/rejected": -0.3106127381324768,
"logps/chosen": -449.3988342285156,
"logps/rejected": -455.8897399902344,
"loss": 0.4974,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.6619412899017334,
"rewards/margins": 1.0037035942077637,
"rewards/rejected": -3.665644884109497,
"step": 990
},
{
"epoch": 0.3,
"learning_rate": 3.9029676634059565e-06,
"logits/chosen": -0.31196895241737366,
"logits/rejected": -0.3131485879421234,
"logps/chosen": -451.7205505371094,
"logps/rejected": -461.85467529296875,
"loss": 0.4296,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.469062328338623,
"rewards/margins": 1.1974780559539795,
"rewards/rejected": -3.6665406227111816,
"step": 1000
},
{
"epoch": 0.3,
"eval_logits/chosen": -0.3799600601196289,
"eval_logits/rejected": -0.3806193768978119,
"eval_logps/chosen": -436.8405456542969,
"eval_logps/rejected": -445.942626953125,
"eval_loss": 0.48261019587516785,
"eval_rewards/accuracies": 0.7459999918937683,
"eval_rewards/chosen": -2.3949320316314697,
"eval_rewards/margins": 1.1014209985733032,
"eval_rewards/rejected": -3.4963533878326416,
"eval_runtime": 377.1489,
"eval_samples_per_second": 1.326,
"eval_steps_per_second": 1.326,
"step": 1000
},
{
"epoch": 0.31,
"learning_rate": 3.880469730675311e-06,
"logits/chosen": -0.31937772035598755,
"logits/rejected": -0.3201027512550354,
"logps/chosen": -444.93267822265625,
"logps/rejected": -454.3338317871094,
"loss": 0.4744,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.4238359928131104,
"rewards/margins": 1.1197197437286377,
"rewards/rejected": -3.543555736541748,
"step": 1010
},
{
"epoch": 0.31,
"learning_rate": 3.857809793701082e-06,
"logits/chosen": -0.3155730664730072,
"logits/rejected": -0.31668931245803833,
"logps/chosen": -447.9942932128906,
"logps/rejected": -458.11199951171875,
"loss": 0.4398,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2663698196411133,
"rewards/margins": 1.3081058263778687,
"rewards/rejected": -3.5744757652282715,
"step": 1020
},
{
"epoch": 0.31,
"learning_rate": 3.834990511727341e-06,
"logits/chosen": -0.3186780512332916,
"logits/rejected": -0.32040825486183167,
"logps/chosen": -445.6949768066406,
"logps/rejected": -458.57244873046875,
"loss": 0.4537,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.343160629272461,
"rewards/margins": 1.2446506023406982,
"rewards/rejected": -3.587811231613159,
"step": 1030
},
{
"epoch": 0.32,
"learning_rate": 3.812014562698002e-06,
"logits/chosen": -0.320089191198349,
"logits/rejected": -0.3210357427597046,
"logps/chosen": -441.82354736328125,
"logps/rejected": -449.4639587402344,
"loss": 0.5402,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.470724582672119,
"rewards/margins": 0.9423478841781616,
"rewards/rejected": -3.4130725860595703,
"step": 1040
},
{
"epoch": 0.32,
"learning_rate": 3.788884642942555e-06,
"logits/chosen": -0.32223668694496155,
"logits/rejected": -0.32441529631614685,
"logps/chosen": -444.36328125,
"logps/rejected": -457.4508361816406,
"loss": 0.4432,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.3148508071899414,
"rewards/margins": 1.2026770114898682,
"rewards/rejected": -3.5175278186798096,
"step": 1050
},
{
"epoch": 0.32,
"learning_rate": 3.765603466859635e-06,
"logits/chosen": -0.31094425916671753,
"logits/rejected": -0.3124980330467224,
"logps/chosen": -439.57025146484375,
"logps/rejected": -453.1656799316406,
"loss": 0.4585,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.423621654510498,
"rewards/margins": 1.1653788089752197,
"rewards/rejected": -3.589000701904297,
"step": 1060
},
{
"epoch": 0.32,
"learning_rate": 3.7421737665984807e-06,
"logits/chosen": -0.32444941997528076,
"logits/rejected": -0.3258149325847626,
"logps/chosen": -444.17742919921875,
"logps/rejected": -454.7518005371094,
"loss": 0.485,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.465573310852051,
"rewards/margins": 1.1133558750152588,
"rewards/rejected": -3.5789291858673096,
"step": 1070
},
{
"epoch": 0.33,
"learning_rate": 3.7185982917382986e-06,
"logits/chosen": -0.32046034932136536,
"logits/rejected": -0.3209912180900574,
"logps/chosen": -449.0337829589844,
"logps/rejected": -456.1290588378906,
"loss": 0.5036,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.421260356903076,
"rewards/margins": 1.0578956604003906,
"rewards/rejected": -3.479156017303467,
"step": 1080
},
{
"epoch": 0.33,
"learning_rate": 3.6948798089655913e-06,
"logits/chosen": -0.3232346177101135,
"logits/rejected": -0.3241461217403412,
"logps/chosen": -448.1339416503906,
"logps/rejected": -455.69970703125,
"loss": 0.4664,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2551751136779785,
"rewards/margins": 1.1089966297149658,
"rewards/rejected": -3.3641715049743652,
"step": 1090
},
{
"epoch": 0.33,
"learning_rate": 3.671021101749476e-06,
"logits/chosen": -0.3160512447357178,
"logits/rejected": -0.3167613744735718,
"logps/chosen": -434.97698974609375,
"logps/rejected": -441.48065185546875,
"loss": 0.4634,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.224177122116089,
"rewards/margins": 1.158850908279419,
"rewards/rejected": -3.383028507232666,
"step": 1100
},
{
"epoch": 0.34,
"learning_rate": 3.6470249700150273e-06,
"logits/chosen": -0.31829750537872314,
"logits/rejected": -0.3188309669494629,
"logps/chosen": -440.1014099121094,
"logps/rejected": -448.751220703125,
"loss": 0.4287,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.001333713531494,
"rewards/margins": 1.3394745588302612,
"rewards/rejected": -3.340808153152466,
"step": 1110
},
{
"epoch": 0.34,
"learning_rate": 3.6228942298146985e-06,
"logits/chosen": -0.31696969270706177,
"logits/rejected": -0.3185669183731079,
"logps/chosen": -436.61090087890625,
"logps/rejected": -446.8427734375,
"loss": 0.4086,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.968629240989685,
"rewards/margins": 1.365235686302185,
"rewards/rejected": -3.33386492729187,
"step": 1120
},
{
"epoch": 0.34,
"learning_rate": 3.598631712997841e-06,
"logits/chosen": -0.3232669234275818,
"logits/rejected": -0.32362625002861023,
"logps/chosen": -445.9930114746094,
"logps/rejected": -456.0194396972656,
"loss": 0.4797,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.1778838634490967,
"rewards/margins": 1.1961848735809326,
"rewards/rejected": -3.3740687370300293,
"step": 1130
},
{
"epoch": 0.35,
"learning_rate": 3.5742402668783797e-06,
"logits/chosen": -0.31457391381263733,
"logits/rejected": -0.31524404883384705,
"logps/chosen": -434.63885498046875,
"logps/rejected": -445.6629943847656,
"loss": 0.4942,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.235532283782959,
"rewards/margins": 1.1538090705871582,
"rewards/rejected": -3.389340877532959,
"step": 1140
},
{
"epoch": 0.35,
"learning_rate": 3.549722753900662e-06,
"logits/chosen": -0.3312085270881653,
"logits/rejected": -0.33145731687545776,
"logps/chosen": -451.101806640625,
"logps/rejected": -457.88909912109375,
"loss": 0.5859,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.525597095489502,
"rewards/margins": 0.7067753672599792,
"rewards/rejected": -3.232372283935547,
"step": 1150
},
{
"epoch": 0.35,
"learning_rate": 3.5250820513035403e-06,
"logits/chosen": -0.3225269615650177,
"logits/rejected": -0.3232432007789612,
"logps/chosen": -438.2666931152344,
"logps/rejected": -450.084716796875,
"loss": 0.4502,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.3596291542053223,
"rewards/margins": 1.153564691543579,
"rewards/rejected": -3.5131936073303223,
"step": 1160
},
{
"epoch": 0.36,
"learning_rate": 3.500321050782717e-06,
"logits/chosen": -0.3299608826637268,
"logits/rejected": -0.33111685514450073,
"logps/chosen": -435.506103515625,
"logps/rejected": -449.41644287109375,
"loss": 0.4587,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.255194664001465,
"rewards/margins": 1.2196067571640015,
"rewards/rejected": -3.474801540374756,
"step": 1170
},
{
"epoch": 0.36,
"learning_rate": 3.4754426581513866e-06,
"logits/chosen": -0.3299122750759125,
"logits/rejected": -0.33067744970321655,
"logps/chosen": -450.20074462890625,
"logps/rejected": -456.9153747558594,
"loss": 0.4929,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.382310390472412,
"rewards/margins": 1.0834969282150269,
"rewards/rejected": -3.4658074378967285,
"step": 1180
},
{
"epoch": 0.36,
"learning_rate": 3.45044979299923e-06,
"logits/chosen": -0.3264179527759552,
"logits/rejected": -0.32756897807121277,
"logps/chosen": -442.2974548339844,
"logps/rejected": -449.1595764160156,
"loss": 0.4977,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.263164520263672,
"rewards/margins": 1.0983796119689941,
"rewards/rejected": -3.361544370651245,
"step": 1190
},
{
"epoch": 0.36,
"learning_rate": 3.425345388349787e-06,
"logits/chosen": -0.31463193893432617,
"logits/rejected": -0.31522423028945923,
"logps/chosen": -442.2705078125,
"logps/rejected": -452.19110107421875,
"loss": 0.501,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.2270102500915527,
"rewards/margins": 1.1489759683609009,
"rewards/rejected": -3.3759865760803223,
"step": 1200
},
{
"epoch": 0.36,
"eval_logits/chosen": -0.39124199748039246,
"eval_logits/rejected": -0.3919140696525574,
"eval_logps/chosen": -434.10162353515625,
"eval_logps/rejected": -443.24261474609375,
"eval_loss": 0.4862767159938812,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -2.1210429668426514,
"eval_rewards/margins": 1.1053153276443481,
"eval_rewards/rejected": -3.22635817527771,
"eval_runtime": 375.0192,
"eval_samples_per_second": 1.333,
"eval_steps_per_second": 1.333,
"step": 1200
},
{
"epoch": 0.37,
"learning_rate": 3.4001323903162476e-06,
"logits/chosen": -0.32597848773002625,
"logits/rejected": -0.32685333490371704,
"logps/chosen": -435.82135009765625,
"logps/rejected": -446.47552490234375,
"loss": 0.4618,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.0256919860839844,
"rewards/margins": 1.2204935550689697,
"rewards/rejected": -3.246185302734375,
"step": 1210
},
{
"epoch": 0.37,
"learning_rate": 3.3748137577557216e-06,
"logits/chosen": -0.3275033235549927,
"logits/rejected": -0.3280579149723053,
"logps/chosen": -438.50384521484375,
"logps/rejected": -447.7579040527344,
"loss": 0.4531,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.154592514038086,
"rewards/margins": 1.1412467956542969,
"rewards/rejected": -3.295839309692383,
"step": 1220
},
{
"epoch": 0.37,
"learning_rate": 3.3493924619219964e-06,
"logits/chosen": -0.3302023112773895,
"logits/rejected": -0.33196666836738586,
"logps/chosen": -454.8751525878906,
"logps/rejected": -466.814453125,
"loss": 0.4865,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.2140915393829346,
"rewards/margins": 1.0115987062454224,
"rewards/rejected": -3.2256903648376465,
"step": 1230
},
{
"epoch": 0.38,
"learning_rate": 3.3238714861168513e-06,
"logits/chosen": -0.3286048173904419,
"logits/rejected": -0.3293796181678772,
"logps/chosen": -436.85308837890625,
"logps/rejected": -445.0462951660156,
"loss": 0.4905,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0171353816986084,
"rewards/margins": 1.1139663457870483,
"rewards/rejected": -3.1311020851135254,
"step": 1240
},
{
"epoch": 0.38,
"learning_rate": 3.29825382533995e-06,
"logits/chosen": -0.3311420679092407,
"logits/rejected": -0.3327622711658478,
"logps/chosen": -444.5484924316406,
"logps/rejected": -455.69732666015625,
"loss": 0.5066,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.358177661895752,
"rewards/margins": 0.9705084562301636,
"rewards/rejected": -3.328686237335205,
"step": 1250
},
{
"epoch": 0.38,
"learning_rate": 3.272542485937369e-06,
"logits/chosen": -0.33226504921913147,
"logits/rejected": -0.3331693708896637,
"logps/chosen": -434.11248779296875,
"logps/rejected": -441.95428466796875,
"loss": 0.4827,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.988227128982544,
"rewards/margins": 1.1914219856262207,
"rewards/rejected": -3.1796488761901855,
"step": 1260
},
{
"epoch": 0.39,
"learning_rate": 3.2467404852487846e-06,
"logits/chosen": -0.33789581060409546,
"logits/rejected": -0.33837661147117615,
"logps/chosen": -445.60009765625,
"logps/rejected": -453.21380615234375,
"loss": 0.4935,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.8867080211639404,
"rewards/margins": 1.2431962490081787,
"rewards/rejected": -3.1299045085906982,
"step": 1270
},
{
"epoch": 0.39,
"learning_rate": 3.2208508512533777e-06,
"logits/chosen": -0.3227623403072357,
"logits/rejected": -0.3246156573295593,
"logps/chosen": -447.2259826660156,
"logps/rejected": -456.6937561035156,
"loss": 0.4514,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.05145001411438,
"rewards/margins": 1.0999400615692139,
"rewards/rejected": -3.151390552520752,
"step": 1280
},
{
"epoch": 0.39,
"learning_rate": 3.1948766222144863e-06,
"logits/chosen": -0.32600507140159607,
"logits/rejected": -0.3266277313232422,
"logps/chosen": -434.4227600097656,
"logps/rejected": -442.1160583496094,
"loss": 0.5228,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.233891010284424,
"rewards/margins": 0.8642382621765137,
"rewards/rejected": -3.0981292724609375,
"step": 1290
},
{
"epoch": 0.39,
"learning_rate": 3.168820846323053e-06,
"logits/chosen": -0.3299737870693207,
"logits/rejected": -0.3313831090927124,
"logps/chosen": -434.67803955078125,
"logps/rejected": -446.71417236328125,
"loss": 0.4392,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.0030479431152344,
"rewards/margins": 1.1440895795822144,
"rewards/rejected": -3.147137403488159,
"step": 1300
},
{
"epoch": 0.4,
"learning_rate": 3.142686581339902e-06,
"logits/chosen": -0.32545098662376404,
"logits/rejected": -0.32752394676208496,
"logps/chosen": -435.9081115722656,
"logps/rejected": -445.0193786621094,
"loss": 0.5154,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0516555309295654,
"rewards/margins": 1.0273317098617554,
"rewards/rejected": -3.0789875984191895,
"step": 1310
},
{
"epoch": 0.4,
"learning_rate": 3.1164768942369058e-06,
"logits/chosen": -0.33717575669288635,
"logits/rejected": -0.33777323365211487,
"logps/chosen": -439.6886291503906,
"logps/rejected": -450.8135681152344,
"loss": 0.4056,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.7847926616668701,
"rewards/margins": 1.3125979900360107,
"rewards/rejected": -3.097390651702881,
"step": 1320
},
{
"epoch": 0.4,
"learning_rate": 3.0901948608370503e-06,
"logits/chosen": -0.3371260166168213,
"logits/rejected": -0.33846548199653625,
"logps/chosen": -436.64190673828125,
"logps/rejected": -450.7660217285156,
"loss": 0.4474,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.9579681158065796,
"rewards/margins": 1.1995084285736084,
"rewards/rejected": -3.1574764251708984,
"step": 1330
},
{
"epoch": 0.41,
"learning_rate": 3.063843565453486e-06,
"logits/chosen": -0.3233332931995392,
"logits/rejected": -0.3235628008842468,
"logps/chosen": -441.6625061035156,
"logps/rejected": -450.6896057128906,
"loss": 0.4454,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.0109899044036865,
"rewards/margins": 1.2036950588226318,
"rewards/rejected": -3.2146849632263184,
"step": 1340
},
{
"epoch": 0.41,
"learning_rate": 3.0374261005275606e-06,
"logits/chosen": -0.32744866609573364,
"logits/rejected": -0.32873040437698364,
"logps/chosen": -438.97955322265625,
"logps/rejected": -452.37255859375,
"loss": 0.4277,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.850262999534607,
"rewards/margins": 1.45210862159729,
"rewards/rejected": -3.3023715019226074,
"step": 1350
},
{
"epoch": 0.41,
"learning_rate": 3.0109455662659126e-06,
"logits/chosen": -0.33421364426612854,
"logits/rejected": -0.33508172631263733,
"logps/chosen": -438.8184509277344,
"logps/rejected": -447.74267578125,
"loss": 0.469,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.3082900047302246,
"rewards/margins": 1.0502725839614868,
"rewards/rejected": -3.358562469482422,
"step": 1360
},
{
"epoch": 0.42,
"learning_rate": 2.984405070276646e-06,
"logits/chosen": -0.3377315402030945,
"logits/rejected": -0.3380245268344879,
"logps/chosen": -440.62689208984375,
"logps/rejected": -448.7757873535156,
"loss": 0.4497,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.239384174346924,
"rewards/margins": 1.1150033473968506,
"rewards/rejected": -3.3543879985809326,
"step": 1370
},
{
"epoch": 0.42,
"learning_rate": 2.9578077272046407e-06,
"logits/chosen": -0.3324066698551178,
"logits/rejected": -0.3327699303627014,
"logps/chosen": -445.11651611328125,
"logps/rejected": -452.57135009765625,
"loss": 0.4627,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.3683180809020996,
"rewards/margins": 1.2064650058746338,
"rewards/rejected": -3.5747828483581543,
"step": 1380
},
{
"epoch": 0.42,
"learning_rate": 2.931156658366032e-06,
"logits/chosen": -0.33288371562957764,
"logits/rejected": -0.33407607674598694,
"logps/chosen": -438.28363037109375,
"logps/rejected": -449.0726623535156,
"loss": 0.4609,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.4992074966430664,
"rewards/margins": 1.1167179346084595,
"rewards/rejected": -3.6159253120422363,
"step": 1390
},
{
"epoch": 0.43,
"learning_rate": 2.9044549913819125e-06,
"logits/chosen": -0.33773019909858704,
"logits/rejected": -0.3393145203590393,
"logps/chosen": -441.80511474609375,
"logps/rejected": -450.90997314453125,
"loss": 0.421,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.251107692718506,
"rewards/margins": 1.231013536453247,
"rewards/rejected": -3.482121706008911,
"step": 1400
},
{
"epoch": 0.43,
"eval_logits/chosen": -0.401915043592453,
"eval_logits/rejected": -0.4025632441043854,
"eval_logps/chosen": -436.2534484863281,
"eval_logps/rejected": -445.6542053222656,
"eval_loss": 0.4834233820438385,
"eval_rewards/accuracies": 0.7580000162124634,
"eval_rewards/chosen": -2.336226463317871,
"eval_rewards/margins": 1.131289005279541,
"eval_rewards/rejected": -3.4675159454345703,
"eval_runtime": 373.3095,
"eval_samples_per_second": 1.339,
"eval_steps_per_second": 1.339,
"step": 1400
},
{
"epoch": 0.43,
"learning_rate": 2.877705859811292e-06,
"logits/chosen": -0.32963141798973083,
"logits/rejected": -0.32958561182022095,
"logps/chosen": -441.468017578125,
"logps/rejected": -452.41632080078125,
"loss": 0.4867,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.3174846172332764,
"rewards/margins": 1.2222645282745361,
"rewards/rejected": -3.5397496223449707,
"step": 1410
},
{
"epoch": 0.43,
"learning_rate": 2.850912402783361e-06,
"logits/chosen": -0.33581605553627014,
"logits/rejected": -0.3373740315437317,
"logps/chosen": -443.38507080078125,
"logps/rejected": -455.705810546875,
"loss": 0.4821,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.5340211391448975,
"rewards/margins": 1.059066653251648,
"rewards/rejected": -3.593087673187256,
"step": 1420
},
{
"epoch": 0.43,
"learning_rate": 2.8240777646290973e-06,
"logits/chosen": -0.3432762026786804,
"logits/rejected": -0.3442252576351166,
"logps/chosen": -455.3641662597656,
"logps/rejected": -465.02032470703125,
"loss": 0.4363,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3768112659454346,
"rewards/margins": 1.3034956455230713,
"rewards/rejected": -3.680307388305664,
"step": 1430
},
{
"epoch": 0.44,
"learning_rate": 2.7972050945122666e-06,
"logits/chosen": -0.3318456709384918,
"logits/rejected": -0.33274808526039124,
"logps/chosen": -442.74029541015625,
"logps/rejected": -453.32745361328125,
"loss": 0.4564,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.556647300720215,
"rewards/margins": 1.2236577272415161,
"rewards/rejected": -3.7803051471710205,
"step": 1440
},
{
"epoch": 0.44,
"learning_rate": 2.7702975460598545e-06,
"logits/chosen": -0.33731141686439514,
"logits/rejected": -0.33812469244003296,
"logps/chosen": -445.42596435546875,
"logps/rejected": -457.1685485839844,
"loss": 0.4487,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.542117118835449,
"rewards/margins": 1.1616876125335693,
"rewards/rejected": -3.7038047313690186,
"step": 1450
},
{
"epoch": 0.44,
"learning_rate": 2.7433582769919752e-06,
"logits/chosen": -0.3384588360786438,
"logits/rejected": -0.33992061018943787,
"logps/chosen": -448.994873046875,
"logps/rejected": -456.4767150878906,
"loss": 0.5548,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.7594006061553955,
"rewards/margins": 0.9322620630264282,
"rewards/rejected": -3.691662549972534,
"step": 1460
},
{
"epoch": 0.45,
"learning_rate": 2.716390448751294e-06,
"logits/chosen": -0.34274882078170776,
"logits/rejected": -0.34329262375831604,
"logps/chosen": -450.77972412109375,
"logps/rejected": -461.76239013671875,
"loss": 0.4976,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.5453426837921143,
"rewards/margins": 1.080673336982727,
"rewards/rejected": -3.6260154247283936,
"step": 1470
},
{
"epoch": 0.45,
"learning_rate": 2.6893972261320265e-06,
"logits/chosen": -0.3363896608352661,
"logits/rejected": -0.33778852224349976,
"logps/chosen": -442.7216796875,
"logps/rejected": -453.9684143066406,
"loss": 0.4628,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.584522008895874,
"rewards/margins": 1.2380025386810303,
"rewards/rejected": -3.8225245475769043,
"step": 1480
},
{
"epoch": 0.45,
"learning_rate": 2.6623817769085268e-06,
"logits/chosen": -0.3299495577812195,
"logits/rejected": -0.3310778737068176,
"logps/chosen": -438.0104064941406,
"logps/rejected": -450.68572998046875,
"loss": 0.4308,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.4091532230377197,
"rewards/margins": 1.258310079574585,
"rewards/rejected": -3.6674628257751465,
"step": 1490
},
{
"epoch": 0.46,
"learning_rate": 2.6353472714635443e-06,
"logits/chosen": -0.3383990526199341,
"logits/rejected": -0.33991554379463196,
"logps/chosen": -453.71038818359375,
"logps/rejected": -466.170166015625,
"loss": 0.4603,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.7626547813415527,
"rewards/margins": 1.127687692642212,
"rewards/rejected": -3.8903422355651855,
"step": 1500
},
{
"epoch": 0.46,
"learning_rate": 2.6082968824161558e-06,
"logits/chosen": -0.3404627740383148,
"logits/rejected": -0.3412095606327057,
"logps/chosen": -446.44281005859375,
"logps/rejected": -454.9615783691406,
"loss": 0.4887,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.626864194869995,
"rewards/margins": 1.2109944820404053,
"rewards/rejected": -3.8378589153289795,
"step": 1510
},
{
"epoch": 0.46,
"learning_rate": 2.5812337842494517e-06,
"logits/chosen": -0.3334888815879822,
"logits/rejected": -0.334361732006073,
"logps/chosen": -437.97979736328125,
"logps/rejected": -449.66864013671875,
"loss": 0.4395,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.6739344596862793,
"rewards/margins": 1.2408138513565063,
"rewards/rejected": -3.914747953414917,
"step": 1520
},
{
"epoch": 0.46,
"learning_rate": 2.554161152937994e-06,
"logits/chosen": -0.34664058685302734,
"logits/rejected": -0.34752577543258667,
"logps/chosen": -452.38983154296875,
"logps/rejected": -458.98046875,
"loss": 0.46,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.5840182304382324,
"rewards/margins": 1.3410053253173828,
"rewards/rejected": -3.9250235557556152,
"step": 1530
},
{
"epoch": 0.47,
"learning_rate": 2.5270821655750997e-06,
"logits/chosen": -0.3402210772037506,
"logits/rejected": -0.3408128619194031,
"logps/chosen": -452.06658935546875,
"logps/rejected": -465.1114807128906,
"loss": 0.383,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.483328342437744,
"rewards/margins": 1.4499397277832031,
"rewards/rejected": -3.9332680702209473,
"step": 1540
},
{
"epoch": 0.47,
"learning_rate": 2.5e-06,
"logits/chosen": -0.33848652243614197,
"logits/rejected": -0.3391149640083313,
"logps/chosen": -447.24407958984375,
"logps/rejected": -456.50933837890625,
"loss": 0.4384,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.932450294494629,
"rewards/margins": 1.1602147817611694,
"rewards/rejected": -4.09266471862793,
"step": 1550
},
{
"epoch": 0.47,
"learning_rate": 2.4729178344249007e-06,
"logits/chosen": -0.34805721044540405,
"logits/rejected": -0.34990328550338745,
"logps/chosen": -457.77520751953125,
"logps/rejected": -467.7879943847656,
"loss": 0.4306,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.852219343185425,
"rewards/margins": 1.3265211582183838,
"rewards/rejected": -4.178740501403809,
"step": 1560
},
{
"epoch": 0.48,
"learning_rate": 2.4458388470620066e-06,
"logits/chosen": -0.34960517287254333,
"logits/rejected": -0.35107699036598206,
"logps/chosen": -457.14569091796875,
"logps/rejected": -467.239990234375,
"loss": 0.4444,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.756500720977783,
"rewards/margins": 1.3199396133422852,
"rewards/rejected": -4.07643985748291,
"step": 1570
},
{
"epoch": 0.48,
"learning_rate": 2.418766215750549e-06,
"logits/chosen": -0.3384454548358917,
"logits/rejected": -0.3394390642642975,
"logps/chosen": -455.9664001464844,
"logps/rejected": -467.4884338378906,
"loss": 0.4289,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.917130708694458,
"rewards/margins": 1.3165969848632812,
"rewards/rejected": -4.23372745513916,
"step": 1580
},
{
"epoch": 0.48,
"learning_rate": 2.3917031175838447e-06,
"logits/chosen": -0.33930128812789917,
"logits/rejected": -0.33957165479660034,
"logps/chosen": -452.30548095703125,
"logps/rejected": -467.23614501953125,
"loss": 0.4339,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.8669447898864746,
"rewards/margins": 1.3872116804122925,
"rewards/rejected": -4.254156589508057,
"step": 1590
},
{
"epoch": 0.49,
"learning_rate": 2.3646527285364565e-06,
"logits/chosen": -0.33700358867645264,
"logits/rejected": -0.33825331926345825,
"logps/chosen": -451.98272705078125,
"logps/rejected": -461.351318359375,
"loss": 0.4821,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.0442519187927246,
"rewards/margins": 1.0982847213745117,
"rewards/rejected": -4.1425371170043945,
"step": 1600
},
{
"epoch": 0.49,
"eval_logits/chosen": -0.41185611486434937,
"eval_logits/rejected": -0.41246527433395386,
"eval_logps/chosen": -441.00274658203125,
"eval_logps/rejected": -451.21136474609375,
"eval_loss": 0.48274433612823486,
"eval_rewards/accuracies": 0.7620000243186951,
"eval_rewards/chosen": -2.811156749725342,
"eval_rewards/margins": 1.2120723724365234,
"eval_rewards/rejected": -4.023228645324707,
"eval_runtime": 376.6555,
"eval_samples_per_second": 1.327,
"eval_steps_per_second": 1.327,
"step": 1600
},
{
"epoch": 0.49,
"learning_rate": 2.3376182230914728e-06,
"logits/chosen": -0.35231637954711914,
"logits/rejected": -0.3524485230445862,
"logps/chosen": -450.71600341796875,
"logps/rejected": -459.95623779296875,
"loss": 0.4562,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.791792869567871,
"rewards/margins": 1.3087048530578613,
"rewards/rejected": -4.100497245788574,
"step": 1610
},
{
"epoch": 0.49,
"learning_rate": 2.3106027738679743e-06,
"logits/chosen": -0.3403882086277008,
"logits/rejected": -0.34152495861053467,
"logps/chosen": -453.09197998046875,
"logps/rejected": -461.7265625,
"loss": 0.5492,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.994368076324463,
"rewards/margins": 0.9577304124832153,
"rewards/rejected": -3.9520981311798096,
"step": 1620
},
{
"epoch": 0.5,
"learning_rate": 2.2836095512487063e-06,
"logits/chosen": -0.34236225485801697,
"logits/rejected": -0.3437976539134979,
"logps/chosen": -448.03765869140625,
"logps/rejected": -458.0298767089844,
"loss": 0.4769,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.8009488582611084,
"rewards/margins": 1.1950900554656982,
"rewards/rejected": -3.9960389137268066,
"step": 1630
},
{
"epoch": 0.5,
"learning_rate": 2.256641723008026e-06,
"logits/chosen": -0.3453958332538605,
"logits/rejected": -0.34628570079803467,
"logps/chosen": -452.4602966308594,
"logps/rejected": -464.2635192871094,
"loss": 0.4904,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.8692307472229004,
"rewards/margins": 1.1883941888809204,
"rewards/rejected": -4.057624816894531,
"step": 1640
},
{
"epoch": 0.5,
"learning_rate": 2.2297024539401463e-06,
"logits/chosen": -0.3422110974788666,
"logits/rejected": -0.34265169501304626,
"logps/chosen": -459.0148010253906,
"logps/rejected": -469.46038818359375,
"loss": 0.4726,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.0340380668640137,
"rewards/margins": 1.1149537563323975,
"rewards/rejected": -4.148991584777832,
"step": 1650
},
{
"epoch": 0.5,
"learning_rate": 2.2027949054877342e-06,
"logits/chosen": -0.34315139055252075,
"logits/rejected": -0.3437284529209137,
"logps/chosen": -448.80657958984375,
"logps/rejected": -458.0669860839844,
"loss": 0.5145,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.8165078163146973,
"rewards/margins": 1.120755672454834,
"rewards/rejected": -3.9372634887695312,
"step": 1660
},
{
"epoch": 0.51,
"learning_rate": 2.175922235370904e-06,
"logits/chosen": -0.34890785813331604,
"logits/rejected": -0.34955543279647827,
"logps/chosen": -448.3866271972656,
"logps/rejected": -457.5038146972656,
"loss": 0.4845,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.5519251823425293,
"rewards/margins": 1.266904592514038,
"rewards/rejected": -3.8188300132751465,
"step": 1670
},
{
"epoch": 0.51,
"learning_rate": 2.1490875972166394e-06,
"logits/chosen": -0.3498338460922241,
"logits/rejected": -0.35048046708106995,
"logps/chosen": -449.01849365234375,
"logps/rejected": -459.8980407714844,
"loss": 0.3836,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.5593769550323486,
"rewards/margins": 1.4853286743164062,
"rewards/rejected": -4.044705390930176,
"step": 1680
},
{
"epoch": 0.51,
"learning_rate": 2.1222941401887087e-06,
"logits/chosen": -0.3391914367675781,
"logits/rejected": -0.3401142954826355,
"logps/chosen": -437.19488525390625,
"logps/rejected": -449.09820556640625,
"loss": 0.4638,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.642850875854492,
"rewards/margins": 1.1874374151229858,
"rewards/rejected": -3.8302879333496094,
"step": 1690
},
{
"epoch": 0.52,
"learning_rate": 2.0955450086180883e-06,
"logits/chosen": -0.3401223123073578,
"logits/rejected": -0.3409723937511444,
"logps/chosen": -453.819580078125,
"logps/rejected": -463.77117919921875,
"loss": 0.4747,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.682774066925049,
"rewards/margins": 1.2849785089492798,
"rewards/rejected": -3.9677529335021973,
"step": 1700
},
{
"epoch": 0.52,
"learning_rate": 2.0688433416339694e-06,
"logits/chosen": -0.3425321877002716,
"logits/rejected": -0.3435406982898712,
"logps/chosen": -441.6337890625,
"logps/rejected": -454.7735290527344,
"loss": 0.4359,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.900296688079834,
"rewards/margins": 1.1836225986480713,
"rewards/rejected": -4.083919525146484,
"step": 1710
},
{
"epoch": 0.52,
"learning_rate": 2.0421922727953597e-06,
"logits/chosen": -0.3457149863243103,
"logits/rejected": -0.3468255400657654,
"logps/chosen": -449.11700439453125,
"logps/rejected": -461.40020751953125,
"loss": 0.4626,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.759221315383911,
"rewards/margins": 1.2033522129058838,
"rewards/rejected": -3.962573528289795,
"step": 1720
},
{
"epoch": 0.53,
"learning_rate": 2.0155949297233542e-06,
"logits/chosen": -0.3487555980682373,
"logits/rejected": -0.34981435537338257,
"logps/chosen": -461.87481689453125,
"logps/rejected": -473.541015625,
"loss": 0.4555,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.762120008468628,
"rewards/margins": 1.2758208513259888,
"rewards/rejected": -4.037940979003906,
"step": 1730
},
{
"epoch": 0.53,
"learning_rate": 1.9890544337340882e-06,
"logits/chosen": -0.3474620282649994,
"logits/rejected": -0.34911760687828064,
"logps/chosen": -446.1351623535156,
"logps/rejected": -461.01678466796875,
"loss": 0.4426,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.9118289947509766,
"rewards/margins": 1.271337866783142,
"rewards/rejected": -4.183166980743408,
"step": 1740
},
{
"epoch": 0.53,
"learning_rate": 1.96257389947244e-06,
"logits/chosen": -0.34583669900894165,
"logits/rejected": -0.3470597565174103,
"logps/chosen": -445.00054931640625,
"logps/rejected": -457.888671875,
"loss": 0.4487,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.823984146118164,
"rewards/margins": 1.3737802505493164,
"rewards/rejected": -4.1977643966674805,
"step": 1750
},
{
"epoch": 0.53,
"learning_rate": 1.936156434546515e-06,
"logits/chosen": -0.3472025990486145,
"logits/rejected": -0.3478149473667145,
"logps/chosen": -450.0955505371094,
"logps/rejected": -459.27166748046875,
"loss": 0.5015,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.8728580474853516,
"rewards/margins": 1.2571897506713867,
"rewards/rejected": -4.130047798156738,
"step": 1760
},
{
"epoch": 0.54,
"learning_rate": 1.90980513916295e-06,
"logits/chosen": -0.3443449139595032,
"logits/rejected": -0.3453408479690552,
"logps/chosen": -450.039306640625,
"logps/rejected": -456.46392822265625,
"loss": 0.4463,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.8687210083007812,
"rewards/margins": 1.327781081199646,
"rewards/rejected": -4.196502208709717,
"step": 1770
},
{
"epoch": 0.54,
"learning_rate": 1.8835231057630955e-06,
"logits/chosen": -0.34365350008010864,
"logits/rejected": -0.34461337327957153,
"logps/chosen": -454.1045837402344,
"logps/rejected": -468.08251953125,
"loss": 0.3981,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.681962490081787,
"rewards/margins": 1.4462287425994873,
"rewards/rejected": -4.128190517425537,
"step": 1780
},
{
"epoch": 0.54,
"learning_rate": 1.8573134186600978e-06,
"logits/chosen": -0.3493928909301758,
"logits/rejected": -0.35027194023132324,
"logps/chosen": -447.32666015625,
"logps/rejected": -458.9419860839844,
"loss": 0.4397,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.6235809326171875,
"rewards/margins": 1.4347044229507446,
"rewards/rejected": -4.058285236358643,
"step": 1790
},
{
"epoch": 0.55,
"learning_rate": 1.8311791536769485e-06,
"logits/chosen": -0.346055805683136,
"logits/rejected": -0.3475271463394165,
"logps/chosen": -442.3778381347656,
"logps/rejected": -458.1031188964844,
"loss": 0.3935,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.6405744552612305,
"rewards/margins": 1.582415223121643,
"rewards/rejected": -4.222989559173584,
"step": 1800
},
{
"epoch": 0.55,
"eval_logits/chosen": -0.41737592220306396,
"eval_logits/rejected": -0.4179980754852295,
"eval_logps/chosen": -440.4134826660156,
"eval_logps/rejected": -450.7027893066406,
"eval_loss": 0.47837841510772705,
"eval_rewards/accuracies": 0.7599999904632568,
"eval_rewards/chosen": -2.7522289752960205,
"eval_rewards/margins": 1.220139503479004,
"eval_rewards/rejected": -3.9723684787750244,
"eval_runtime": 351.6535,
"eval_samples_per_second": 1.422,
"eval_steps_per_second": 1.422,
"step": 1800
},
{
"epoch": 0.55,
"learning_rate": 1.805123377785515e-06,
"logits/chosen": -0.3527616858482361,
"logits/rejected": -0.3528694212436676,
"logps/chosen": -444.4476623535156,
"logps/rejected": -453.0213317871094,
"loss": 0.4432,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.709862232208252,
"rewards/margins": 1.3173949718475342,
"rewards/rejected": -4.027257442474365,
"step": 1810
},
{
"epoch": 0.55,
"learning_rate": 1.7791491487466234e-06,
"logits/chosen": -0.3477206528186798,
"logits/rejected": -0.34793621301651,
"logps/chosen": -444.4949645996094,
"logps/rejected": -456.147705078125,
"loss": 0.4933,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.9446983337402344,
"rewards/margins": 1.1086232662200928,
"rewards/rejected": -4.053321361541748,
"step": 1820
},
{
"epoch": 0.56,
"learning_rate": 1.7532595147512167e-06,
"logits/chosen": -0.34836429357528687,
"logits/rejected": -0.34931057691574097,
"logps/chosen": -448.5811462402344,
"logps/rejected": -460.9894104003906,
"loss": 0.4243,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.7561140060424805,
"rewards/margins": 1.3816736936569214,
"rewards/rejected": -4.137787818908691,
"step": 1830
},
{
"epoch": 0.56,
"learning_rate": 1.7274575140626318e-06,
"logits/chosen": -0.359462171792984,
"logits/rejected": -0.36063042283058167,
"logps/chosen": -448.9486389160156,
"logps/rejected": -458.26776123046875,
"loss": 0.4759,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.6954994201660156,
"rewards/margins": 1.2426683902740479,
"rewards/rejected": -3.9381680488586426,
"step": 1840
},
{
"epoch": 0.56,
"learning_rate": 1.7017461746600506e-06,
"logits/chosen": -0.3540958762168884,
"logits/rejected": -0.3554149866104126,
"logps/chosen": -442.2723083496094,
"logps/rejected": -452.81951904296875,
"loss": 0.479,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.743239402770996,
"rewards/margins": 1.2077919244766235,
"rewards/rejected": -3.95103120803833,
"step": 1850
},
{
"epoch": 0.56,
"learning_rate": 1.6761285138831493e-06,
"logits/chosen": -0.3558579981327057,
"logits/rejected": -0.35607069730758667,
"logps/chosen": -448.01458740234375,
"logps/rejected": -458.3499450683594,
"loss": 0.4367,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.706444025039673,
"rewards/margins": 1.3273353576660156,
"rewards/rejected": -4.033779144287109,
"step": 1860
},
{
"epoch": 0.57,
"learning_rate": 1.6506075380780043e-06,
"logits/chosen": -0.343932569026947,
"logits/rejected": -0.3449569046497345,
"logps/chosen": -449.41534423828125,
"logps/rejected": -461.0784606933594,
"loss": 0.4612,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.849799633026123,
"rewards/margins": 1.247933030128479,
"rewards/rejected": -4.0977325439453125,
"step": 1870
},
{
"epoch": 0.57,
"learning_rate": 1.625186242244279e-06,
"logits/chosen": -0.351362407207489,
"logits/rejected": -0.35285985469818115,
"logps/chosen": -442.25335693359375,
"logps/rejected": -452.58526611328125,
"loss": 0.4487,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.7264816761016846,
"rewards/margins": 1.3034883737564087,
"rewards/rejected": -4.029970169067383,
"step": 1880
},
{
"epoch": 0.57,
"learning_rate": 1.5998676096837534e-06,
"logits/chosen": -0.35466188192367554,
"logits/rejected": -0.35623863339424133,
"logps/chosen": -455.30859375,
"logps/rejected": -466.81817626953125,
"loss": 0.4525,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.76641845703125,
"rewards/margins": 1.3434927463531494,
"rewards/rejected": -4.1099114418029785,
"step": 1890
},
{
"epoch": 0.58,
"learning_rate": 1.574654611650214e-06,
"logits/chosen": -0.353823721408844,
"logits/rejected": -0.3546674847602844,
"logps/chosen": -448.30615234375,
"logps/rejected": -462.4393005371094,
"loss": 0.4049,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.676255226135254,
"rewards/margins": 1.421419382095337,
"rewards/rejected": -4.097674369812012,
"step": 1900
},
{
"epoch": 0.58,
"learning_rate": 1.54955020700077e-06,
"logits/chosen": -0.35255804657936096,
"logits/rejected": -0.35378915071487427,
"logps/chosen": -442.2880859375,
"logps/rejected": -454.832763671875,
"loss": 0.4771,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.8402438163757324,
"rewards/margins": 1.2248531579971313,
"rewards/rejected": -4.065096855163574,
"step": 1910
},
{
"epoch": 0.58,
"learning_rate": 1.5245573418486136e-06,
"logits/chosen": -0.35058295726776123,
"logits/rejected": -0.3520324230194092,
"logps/chosen": -451.47265625,
"logps/rejected": -462.79791259765625,
"loss": 0.4615,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.648658037185669,
"rewards/margins": 1.3944532871246338,
"rewards/rejected": -4.043111324310303,
"step": 1920
},
{
"epoch": 0.59,
"learning_rate": 1.4996789492172836e-06,
"logits/chosen": -0.35444819927215576,
"logits/rejected": -0.35484084486961365,
"logps/chosen": -447.3772888183594,
"logps/rejected": -457.873291015625,
"loss": 0.4392,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.671096086502075,
"rewards/margins": 1.3712053298950195,
"rewards/rejected": -4.042301654815674,
"step": 1930
},
{
"epoch": 0.59,
"learning_rate": 1.4749179486964599e-06,
"logits/chosen": -0.3643060028553009,
"logits/rejected": -0.3653911054134369,
"logps/chosen": -452.032470703125,
"logps/rejected": -464.42193603515625,
"loss": 0.4286,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.5244593620300293,
"rewards/margins": 1.475843071937561,
"rewards/rejected": -4.000302314758301,
"step": 1940
},
{
"epoch": 0.59,
"learning_rate": 1.4502772460993387e-06,
"logits/chosen": -0.35049787163734436,
"logits/rejected": -0.3510446846485138,
"logps/chosen": -448.87518310546875,
"logps/rejected": -457.3994140625,
"loss": 0.491,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.827812910079956,
"rewards/margins": 1.2428455352783203,
"rewards/rejected": -4.0706586837768555,
"step": 1950
},
{
"epoch": 0.6,
"learning_rate": 1.4257597331216211e-06,
"logits/chosen": -0.3531518578529358,
"logits/rejected": -0.3538290858268738,
"logps/chosen": -456.27691650390625,
"logps/rejected": -466.98687744140625,
"loss": 0.4657,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.0542407035827637,
"rewards/margins": 1.164147138595581,
"rewards/rejected": -4.218388080596924,
"step": 1960
},
{
"epoch": 0.6,
"learning_rate": 1.4013682870021594e-06,
"logits/chosen": -0.35849729180336,
"logits/rejected": -0.3595832884311676,
"logps/chosen": -447.4246520996094,
"logps/rejected": -460.0209045410156,
"loss": 0.3725,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.7747585773468018,
"rewards/margins": 1.452606439590454,
"rewards/rejected": -4.227365016937256,
"step": 1970
},
{
"epoch": 0.6,
"learning_rate": 1.3771057701853034e-06,
"logits/chosen": -0.35135719180107117,
"logits/rejected": -0.3521498739719391,
"logps/chosen": -455.69549560546875,
"logps/rejected": -467.37591552734375,
"loss": 0.4899,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.884308338165283,
"rewards/margins": 1.336925983428955,
"rewards/rejected": -4.221234321594238,
"step": 1980
},
{
"epoch": 0.6,
"learning_rate": 1.352975029984974e-06,
"logits/chosen": -0.3514239192008972,
"logits/rejected": -0.35260799527168274,
"logps/chosen": -441.4317932128906,
"logps/rejected": -454.58251953125,
"loss": 0.4829,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.8820691108703613,
"rewards/margins": 1.2067675590515137,
"rewards/rejected": -4.088836669921875,
"step": 1990
},
{
"epoch": 0.61,
"learning_rate": 1.328978898250525e-06,
"logits/chosen": -0.3527238070964813,
"logits/rejected": -0.3534066379070282,
"logps/chosen": -452.95989990234375,
"logps/rejected": -464.698486328125,
"loss": 0.4476,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.7672972679138184,
"rewards/margins": 1.3264329433441162,
"rewards/rejected": -4.093730926513672,
"step": 2000
},
{
"epoch": 0.61,
"eval_logits/chosen": -0.42317765951156616,
"eval_logits/rejected": -0.42379918694496155,
"eval_logps/chosen": -441.13116455078125,
"eval_logps/rejected": -451.5594177246094,
"eval_loss": 0.4796808958053589,
"eval_rewards/accuracies": 0.7559999823570251,
"eval_rewards/chosen": -2.823995590209961,
"eval_rewards/margins": 1.234041452407837,
"eval_rewards/rejected": -4.058037281036377,
"eval_runtime": 351.7707,
"eval_samples_per_second": 1.421,
"eval_steps_per_second": 1.421,
"step": 2000
},
{
"epoch": 0.61,
"learning_rate": 1.305120191034409e-06,
"logits/chosen": -0.34321507811546326,
"logits/rejected": -0.343815416097641,
"logps/chosen": -443.4376525878906,
"logps/rejected": -452.2301330566406,
"loss": 0.4223,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.8989710807800293,
"rewards/margins": 1.322284460067749,
"rewards/rejected": -4.221255302429199,
"step": 2010
},
{
"epoch": 0.61,
"learning_rate": 1.2814017082617025e-06,
"logits/chosen": -0.3508697748184204,
"logits/rejected": -0.35247209668159485,
"logps/chosen": -444.38641357421875,
"logps/rejected": -456.51153564453125,
"loss": 0.4284,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.6340365409851074,
"rewards/margins": 1.4381511211395264,
"rewards/rejected": -4.072187900543213,
"step": 2020
},
{
"epoch": 0.62,
"learning_rate": 1.2578262334015201e-06,
"logits/chosen": -0.34914684295654297,
"logits/rejected": -0.35076671838760376,
"logps/chosen": -441.771728515625,
"logps/rejected": -457.03057861328125,
"loss": 0.4234,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.6355042457580566,
"rewards/margins": 1.4987059831619263,
"rewards/rejected": -4.134210109710693,
"step": 2030
},
{
"epoch": 0.62,
"learning_rate": 1.234396533140365e-06,
"logits/chosen": -0.3611491024494171,
"logits/rejected": -0.3617832660675049,
"logps/chosen": -454.31951904296875,
"logps/rejected": -467.4290466308594,
"loss": 0.435,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.7407212257385254,
"rewards/margins": 1.4744737148284912,
"rewards/rejected": -4.215195178985596,
"step": 2040
},
{
"epoch": 0.62,
"learning_rate": 1.2111153570574454e-06,
"logits/chosen": -0.35015982389450073,
"logits/rejected": -0.35119912028312683,
"logps/chosen": -446.706787109375,
"logps/rejected": -461.330810546875,
"loss": 0.4095,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.59765625,
"rewards/margins": 1.5183773040771484,
"rewards/rejected": -4.116034030914307,
"step": 2050
},
{
"epoch": 0.63,
"learning_rate": 1.187985437301999e-06,
"logits/chosen": -0.35530123114585876,
"logits/rejected": -0.35578909516334534,
"logps/chosen": -438.29974365234375,
"logps/rejected": -452.89483642578125,
"loss": 0.4416,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.729788303375244,
"rewards/margins": 1.507673978805542,
"rewards/rejected": -4.237462043762207,
"step": 2060
},
{
"epoch": 0.63,
"learning_rate": 1.1650094882726599e-06,
"logits/chosen": -0.36762434244155884,
"logits/rejected": -0.36925989389419556,
"logps/chosen": -455.1209411621094,
"logps/rejected": -469.49920654296875,
"loss": 0.4061,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.913037061691284,
"rewards/margins": 1.4348491430282593,
"rewards/rejected": -4.347886562347412,
"step": 2070
},
{
"epoch": 0.63,
"learning_rate": 1.1421902062989178e-06,
"logits/chosen": -0.3690846264362335,
"logits/rejected": -0.3703765869140625,
"logps/chosen": -451.37860107421875,
"logps/rejected": -462.94293212890625,
"loss": 0.4399,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.7418205738067627,
"rewards/margins": 1.415470838546753,
"rewards/rejected": -4.157290935516357,
"step": 2080
},
{
"epoch": 0.63,
"learning_rate": 1.1195302693246879e-06,
"logits/chosen": -0.34830474853515625,
"logits/rejected": -0.34976112842559814,
"logps/chosen": -447.49261474609375,
"logps/rejected": -460.62994384765625,
"loss": 0.4744,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.9276537895202637,
"rewards/margins": 1.2483450174331665,
"rewards/rejected": -4.175999164581299,
"step": 2090
},
{
"epoch": 0.64,
"learning_rate": 1.0970323365940443e-06,
"logits/chosen": -0.358784556388855,
"logits/rejected": -0.35958269238471985,
"logps/chosen": -449.94482421875,
"logps/rejected": -461.4793395996094,
"loss": 0.456,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.9514570236206055,
"rewards/margins": 1.3440725803375244,
"rewards/rejected": -4.295529365539551,
"step": 2100
},
{
"epoch": 0.64,
"learning_rate": 1.0746990483391414e-06,
"logits/chosen": -0.3496165871620178,
"logits/rejected": -0.3507440388202667,
"logps/chosen": -453.05755615234375,
"logps/rejected": -464.02978515625,
"loss": 0.429,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.848665237426758,
"rewards/margins": 1.313674807548523,
"rewards/rejected": -4.162339687347412,
"step": 2110
},
{
"epoch": 0.64,
"learning_rate": 1.052533025470379e-06,
"logits/chosen": -0.3463028073310852,
"logits/rejected": -0.34690287709236145,
"logps/chosen": -443.31134033203125,
"logps/rejected": -455.8688049316406,
"loss": 0.4229,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.8174338340759277,
"rewards/margins": 1.4426196813583374,
"rewards/rejected": -4.260054111480713,
"step": 2120
},
{
"epoch": 0.65,
"learning_rate": 1.0305368692688175e-06,
"logits/chosen": -0.3607487082481384,
"logits/rejected": -0.36121565103530884,
"logps/chosen": -459.01824951171875,
"logps/rejected": -472.31884765625,
"loss": 0.4488,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.872823715209961,
"rewards/margins": 1.4165146350860596,
"rewards/rejected": -4.2893385887146,
"step": 2130
},
{
"epoch": 0.65,
"learning_rate": 1.0087131610809153e-06,
"logits/chosen": -0.34994029998779297,
"logits/rejected": -0.35072094202041626,
"logps/chosen": -442.97589111328125,
"logps/rejected": -453.756591796875,
"loss": 0.555,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.1791205406188965,
"rewards/margins": 1.0200657844543457,
"rewards/rejected": -4.1991868019104,
"step": 2140
},
{
"epoch": 0.65,
"learning_rate": 9.870644620155878e-07,
"logits/chosen": -0.35871225595474243,
"logits/rejected": -0.35941624641418457,
"logps/chosen": -454.1318359375,
"logps/rejected": -464.7295837402344,
"loss": 0.4462,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.9613711833953857,
"rewards/margins": 1.3269731998443604,
"rewards/rejected": -4.288344383239746,
"step": 2150
},
{
"epoch": 0.66,
"learning_rate": 9.655933126436565e-07,
"logits/chosen": -0.3492319583892822,
"logits/rejected": -0.3505721092224121,
"logps/chosen": -444.28515625,
"logps/rejected": -456.6302185058594,
"loss": 0.4471,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.847562313079834,
"rewards/margins": 1.3418700695037842,
"rewards/rejected": -4.1894330978393555,
"step": 2160
},
{
"epoch": 0.66,
"learning_rate": 9.443022326996984e-07,
"logits/chosen": -0.354257732629776,
"logits/rejected": -0.35464176535606384,
"logps/chosen": -444.35089111328125,
"logps/rejected": -453.5,
"loss": 0.4514,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.9952127933502197,
"rewards/margins": 1.3154346942901611,
"rewards/rejected": -4.310647487640381,
"step": 2170
},
{
"epoch": 0.66,
"learning_rate": 9.231937207863459e-07,
"logits/chosen": -0.35797202587127686,
"logits/rejected": -0.3591151833534241,
"logps/chosen": -446.80487060546875,
"logps/rejected": -460.2659606933594,
"loss": 0.4346,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.878164768218994,
"rewards/margins": 1.2440316677093506,
"rewards/rejected": -4.122197151184082,
"step": 2180
},
{
"epoch": 0.67,
"learning_rate": 9.022702540810607e-07,
"logits/chosen": -0.3597440719604492,
"logits/rejected": -0.3606324791908264,
"logps/chosen": -450.4046325683594,
"logps/rejected": -460.8863830566406,
"loss": 0.4151,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.835644483566284,
"rewards/margins": 1.3725159168243408,
"rewards/rejected": -4.208160400390625,
"step": 2190
},
{
"epoch": 0.67,
"learning_rate": 8.815342880454312e-07,
"logits/chosen": -0.3541966378688812,
"logits/rejected": -0.35494524240493774,
"logps/chosen": -455.19146728515625,
"logps/rejected": -470.131103515625,
"loss": 0.4702,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.059727430343628,
"rewards/margins": 1.261040449142456,
"rewards/rejected": -4.320767879486084,
"step": 2200
},
{
"epoch": 0.67,
"eval_logits/chosen": -0.4240322411060333,
"eval_logits/rejected": -0.4246600270271301,
"eval_logps/chosen": -441.68072509765625,
"eval_logps/rejected": -452.262451171875,
"eval_loss": 0.4791676104068756,
"eval_rewards/accuracies": 0.7599999904632568,
"eval_rewards/chosen": -2.878952980041504,
"eval_rewards/margins": 1.2493829727172852,
"eval_rewards/rejected": -4.128335475921631,
"eval_runtime": 351.6609,
"eval_samples_per_second": 1.422,
"eval_steps_per_second": 1.422,
"step": 2200
},
{
"epoch": 0.67,
"learning_rate": 8.609882561370101e-07,
"logits/chosen": -0.3556322455406189,
"logits/rejected": -0.35619792342185974,
"logps/chosen": -446.03204345703125,
"logps/rejected": -455.453125,
"loss": 0.4476,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.709972381591797,
"rewards/margins": 1.3839571475982666,
"rewards/rejected": -4.093929290771484,
"step": 2210
},
{
"epoch": 0.67,
"learning_rate": 8.406345695237394e-07,
"logits/chosen": -0.3541732430458069,
"logits/rejected": -0.35552269220352173,
"logps/chosen": -444.166015625,
"logps/rejected": -460.39617919921875,
"loss": 0.3845,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.691920757293701,
"rewards/margins": 1.684851884841919,
"rewards/rejected": -4.376772880554199,
"step": 2220
},
{
"epoch": 0.68,
"learning_rate": 8.20475616800985e-07,
"logits/chosen": -0.35582807660102844,
"logits/rejected": -0.35650044679641724,
"logps/chosen": -449.7290954589844,
"logps/rejected": -458.966064453125,
"loss": 0.498,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.0267415046691895,
"rewards/margins": 1.181206464767456,
"rewards/rejected": -4.207947254180908,
"step": 2230
},
{
"epoch": 0.68,
"learning_rate": 8.005137637112303e-07,
"logits/chosen": -0.35746604204177856,
"logits/rejected": -0.35817286372184753,
"logps/chosen": -450.47308349609375,
"logps/rejected": -463.4517517089844,
"loss": 0.4951,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.940412759780884,
"rewards/margins": 1.3010506629943848,
"rewards/rejected": -4.2414631843566895,
"step": 2240
},
{
"epoch": 0.68,
"learning_rate": 7.807513528664415e-07,
"logits/chosen": -0.3562454581260681,
"logits/rejected": -0.3569663166999817,
"logps/chosen": -449.81768798828125,
"logps/rejected": -462.03125,
"loss": 0.4975,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.0241804122924805,
"rewards/margins": 1.1527836322784424,
"rewards/rejected": -4.176963806152344,
"step": 2250
},
{
"epoch": 0.69,
"learning_rate": 7.611907034731538e-07,
"logits/chosen": -0.35374173521995544,
"logits/rejected": -0.3544319272041321,
"logps/chosen": -452.8089904785156,
"logps/rejected": -466.8789978027344,
"loss": 0.4872,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.0933475494384766,
"rewards/margins": 1.278378963470459,
"rewards/rejected": -4.371726989746094,
"step": 2260
},
{
"epoch": 0.69,
"learning_rate": 7.418341110603e-07,
"logits/chosen": -0.3625703454017639,
"logits/rejected": -0.363391637802124,
"logps/chosen": -461.72442626953125,
"logps/rejected": -472.3929138183594,
"loss": 0.4361,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.796329975128174,
"rewards/margins": 1.4400124549865723,
"rewards/rejected": -4.236342430114746,
"step": 2270
},
{
"epoch": 0.69,
"learning_rate": 7.226838472098239e-07,
"logits/chosen": -0.35118603706359863,
"logits/rejected": -0.35229939222335815,
"logps/chosen": -450.23895263671875,
"logps/rejected": -462.094482421875,
"loss": 0.4608,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.8915536403656006,
"rewards/margins": 1.3143730163574219,
"rewards/rejected": -4.205926418304443,
"step": 2280
},
{
"epoch": 0.7,
"learning_rate": 7.037421592900942e-07,
"logits/chosen": -0.3532702326774597,
"logits/rejected": -0.3544442057609558,
"logps/chosen": -444.9677734375,
"logps/rejected": -458.2537536621094,
"loss": 0.4259,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.8905608654022217,
"rewards/margins": 1.462416410446167,
"rewards/rejected": -4.352977275848389,
"step": 2290
},
{
"epoch": 0.7,
"learning_rate": 6.850112701921735e-07,
"logits/chosen": -0.35222965478897095,
"logits/rejected": -0.3528757095336914,
"logps/chosen": -441.19976806640625,
"logps/rejected": -455.4878845214844,
"loss": 0.4063,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.7645294666290283,
"rewards/margins": 1.4231407642364502,
"rewards/rejected": -4.1876702308654785,
"step": 2300
},
{
"epoch": 0.7,
"learning_rate": 6.664933780689445e-07,
"logits/chosen": -0.3582982122898102,
"logits/rejected": -0.3593185842037201,
"logps/chosen": -450.44549560546875,
"logps/rejected": -464.33905029296875,
"loss": 0.4102,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.714564800262451,
"rewards/margins": 1.4921871423721313,
"rewards/rejected": -4.206751823425293,
"step": 2310
},
{
"epoch": 0.7,
"learning_rate": 6.481906560771525e-07,
"logits/chosen": -0.357990562915802,
"logits/rejected": -0.3587570786476135,
"logps/chosen": -441.71746826171875,
"logps/rejected": -452.60870361328125,
"loss": 0.4988,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.9209952354431152,
"rewards/margins": 1.2236100435256958,
"rewards/rejected": -4.1446051597595215,
"step": 2320
},
{
"epoch": 0.71,
"learning_rate": 6.301052521223736e-07,
"logits/chosen": -0.3549385070800781,
"logits/rejected": -0.3562348484992981,
"logps/chosen": -450.3941955566406,
"logps/rejected": -462.034912109375,
"loss": 0.4629,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.1530508995056152,
"rewards/margins": 1.2050797939300537,
"rewards/rejected": -4.358130931854248,
"step": 2330
},
{
"epoch": 0.71,
"learning_rate": 6.122392886069486e-07,
"logits/chosen": -0.3575456738471985,
"logits/rejected": -0.3584723174571991,
"logps/chosen": -456.48797607421875,
"logps/rejected": -469.8323669433594,
"loss": 0.403,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.066274881362915,
"rewards/margins": 1.3775126934051514,
"rewards/rejected": -4.443788051605225,
"step": 2340
},
{
"epoch": 0.71,
"learning_rate": 5.945948621809092e-07,
"logits/chosen": -0.34499675035476685,
"logits/rejected": -0.34601226449012756,
"logps/chosen": -444.13818359375,
"logps/rejected": -458.27978515625,
"loss": 0.3784,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.885612726211548,
"rewards/margins": 1.6283729076385498,
"rewards/rejected": -4.513985633850098,
"step": 2350
},
{
"epoch": 0.72,
"learning_rate": 5.771740434959278e-07,
"logits/chosen": -0.36106568574905396,
"logits/rejected": -0.3616113066673279,
"logps/chosen": -451.7525329589844,
"logps/rejected": -462.39361572265625,
"loss": 0.4455,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.004162549972534,
"rewards/margins": 1.2321292161941528,
"rewards/rejected": -4.236291408538818,
"step": 2360
},
{
"epoch": 0.72,
"learning_rate": 5.599788769623174e-07,
"logits/chosen": -0.3459396958351135,
"logits/rejected": -0.3463771939277649,
"logps/chosen": -451.74462890625,
"logps/rejected": -460.52532958984375,
"loss": 0.442,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.0312728881835938,
"rewards/margins": 1.2485148906707764,
"rewards/rejected": -4.279788017272949,
"step": 2370
},
{
"epoch": 0.72,
"learning_rate": 5.430113805091111e-07,
"logits/chosen": -0.34979885816574097,
"logits/rejected": -0.3506646156311035,
"logps/chosen": -452.90667724609375,
"logps/rejected": -459.8733825683594,
"loss": 0.4529,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.2027690410614014,
"rewards/margins": 1.2168538570404053,
"rewards/rejected": -4.419622898101807,
"step": 2380
},
{
"epoch": 0.73,
"learning_rate": 5.262735453472459e-07,
"logits/chosen": -0.3504520058631897,
"logits/rejected": -0.3512795567512512,
"logps/chosen": -448.52685546875,
"logps/rejected": -459.87359619140625,
"loss": 0.3957,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.8900694847106934,
"rewards/margins": 1.5135910511016846,
"rewards/rejected": -4.403660774230957,
"step": 2390
},
{
"epoch": 0.73,
"learning_rate": 5.097673357358906e-07,
"logits/chosen": -0.36047258973121643,
"logits/rejected": -0.36156997084617615,
"logps/chosen": -451.36767578125,
"logps/rejected": -462.75225830078125,
"loss": 0.4152,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.962690830230713,
"rewards/margins": 1.4245904684066772,
"rewards/rejected": -4.3872809410095215,
"step": 2400
},
{
"epoch": 0.73,
"eval_logits/chosen": -0.4258207082748413,
"eval_logits/rejected": -0.4264317452907562,
"eval_logps/chosen": -443.40802001953125,
"eval_logps/rejected": -454.0955810546875,
"eval_loss": 0.4785875976085663,
"eval_rewards/accuracies": 0.7599999904632568,
"eval_rewards/chosen": -3.0516843795776367,
"eval_rewards/margins": 1.2599674463272095,
"eval_rewards/rejected": -4.311651706695557,
"eval_runtime": 351.6671,
"eval_samples_per_second": 1.422,
"eval_steps_per_second": 1.422,
"step": 2400
},
{
"epoch": 0.73,
"learning_rate": 4.934946887519279e-07,
"logits/chosen": -0.36616581678390503,
"logits/rejected": -0.36695989966392517,
"logps/chosen": -457.62567138671875,
"logps/rejected": -470.66485595703125,
"loss": 0.4125,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.906996011734009,
"rewards/margins": 1.469089150428772,
"rewards/rejected": -4.37608528137207,
"step": 2410
},
{
"epoch": 0.74,
"learning_rate": 4.774575140626317e-07,
"logits/chosen": -0.35779887437820435,
"logits/rejected": -0.3586946129798889,
"logps/chosen": -451.4176330566406,
"logps/rejected": -464.3094787597656,
"loss": 0.4281,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.945284605026245,
"rewards/margins": 1.3972845077514648,
"rewards/rejected": -4.342568874359131,
"step": 2420
},
{
"epoch": 0.74,
"learning_rate": 4.6165769370155516e-07,
"logits/chosen": -0.36210596561431885,
"logits/rejected": -0.36279112100601196,
"logps/chosen": -451.7628479003906,
"logps/rejected": -464.893798828125,
"loss": 0.4782,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.2397968769073486,
"rewards/margins": 1.2520904541015625,
"rewards/rejected": -4.49188756942749,
"step": 2430
},
{
"epoch": 0.74,
"learning_rate": 4.4609708184767177e-07,
"logits/chosen": -0.3466174006462097,
"logits/rejected": -0.3471986651420593,
"logps/chosen": -448.8627014160156,
"logps/rejected": -458.1527404785156,
"loss": 0.4647,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.91853404045105,
"rewards/margins": 1.3560994863510132,
"rewards/rejected": -4.274633884429932,
"step": 2440
},
{
"epoch": 0.74,
"learning_rate": 4.307775046077739e-07,
"logits/chosen": -0.3524012863636017,
"logits/rejected": -0.3537690043449402,
"logps/chosen": -445.4092712402344,
"logps/rejected": -460.19635009765625,
"loss": 0.4618,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.2233848571777344,
"rewards/margins": 1.2467783689498901,
"rewards/rejected": -4.470162391662598,
"step": 2450
},
{
"epoch": 0.75,
"learning_rate": 4.1570075980217503e-07,
"logits/chosen": -0.3559108376502991,
"logits/rejected": -0.35668981075286865,
"logps/chosen": -449.45330810546875,
"logps/rejected": -457.7796325683594,
"loss": 0.4718,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.394498348236084,
"rewards/margins": 1.175183892250061,
"rewards/rejected": -4.5696821212768555,
"step": 2460
},
{
"epoch": 0.75,
"learning_rate": 4.008686167537243e-07,
"logits/chosen": -0.36145132780075073,
"logits/rejected": -0.362403005361557,
"logps/chosen": -455.7185974121094,
"logps/rejected": -467.92828369140625,
"loss": 0.427,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.0464015007019043,
"rewards/margins": 1.3699265718460083,
"rewards/rejected": -4.416327953338623,
"step": 2470
},
{
"epoch": 0.75,
"learning_rate": 3.862828160801707e-07,
"logits/chosen": -0.3624842166900635,
"logits/rejected": -0.36328238248825073,
"logps/chosen": -455.2030334472656,
"logps/rejected": -468.92608642578125,
"loss": 0.468,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.084407329559326,
"rewards/margins": 1.2791146039962769,
"rewards/rejected": -4.363522529602051,
"step": 2480
},
{
"epoch": 0.76,
"learning_rate": 3.7194506948989405e-07,
"logits/chosen": -0.3563145697116852,
"logits/rejected": -0.3578342795372009,
"logps/chosen": -448.5079650878906,
"logps/rejected": -462.24542236328125,
"loss": 0.3916,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.866283655166626,
"rewards/margins": 1.522825002670288,
"rewards/rejected": -4.389109134674072,
"step": 2490
},
{
"epoch": 0.76,
"learning_rate": 3.578570595810274e-07,
"logits/chosen": -0.35796427726745605,
"logits/rejected": -0.3588128089904785,
"logps/chosen": -454.2952575683594,
"logps/rejected": -462.9483337402344,
"loss": 0.4564,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.1886661052703857,
"rewards/margins": 1.3093502521514893,
"rewards/rejected": -4.498016357421875,
"step": 2500
},
{
"epoch": 0.76,
"learning_rate": 3.4402043964399527e-07,
"logits/chosen": -0.35277941823005676,
"logits/rejected": -0.35380321741104126,
"logps/chosen": -441.46600341796875,
"logps/rejected": -452.00537109375,
"loss": 0.4007,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.036705493927002,
"rewards/margins": 1.4018588066101074,
"rewards/rejected": -4.438564777374268,
"step": 2510
},
{
"epoch": 0.77,
"learning_rate": 3.304368334674965e-07,
"logits/chosen": -0.3567604124546051,
"logits/rejected": -0.35805758833885193,
"logps/chosen": -449.0523376464844,
"logps/rejected": -461.69842529296875,
"loss": 0.4191,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.938197612762451,
"rewards/margins": 1.5301718711853027,
"rewards/rejected": -4.468369483947754,
"step": 2520
},
{
"epoch": 0.77,
"learning_rate": 3.1710783514794256e-07,
"logits/chosen": -0.35164931416511536,
"logits/rejected": -0.3529738187789917,
"logps/chosen": -449.84173583984375,
"logps/rejected": -464.1533203125,
"loss": 0.5458,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.2846596240997314,
"rewards/margins": 1.069946527481079,
"rewards/rejected": -4.354605674743652,
"step": 2530
},
{
"epoch": 0.77,
"learning_rate": 3.040350089023844e-07,
"logits/chosen": -0.3580131232738495,
"logits/rejected": -0.35896363854408264,
"logps/chosen": -460.78790283203125,
"logps/rejected": -474.05987548828125,
"loss": 0.4396,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.1543052196502686,
"rewards/margins": 1.453919768333435,
"rewards/rejected": -4.6082258224487305,
"step": 2540
},
{
"epoch": 0.77,
"learning_rate": 2.9121988888494297e-07,
"logits/chosen": -0.35557836294174194,
"logits/rejected": -0.356197327375412,
"logps/chosen": -454.5060119628906,
"logps/rejected": -467.5367736816406,
"loss": 0.3968,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.987016201019287,
"rewards/margins": 1.4960193634033203,
"rewards/rejected": -4.483035564422607,
"step": 2550
},
{
"epoch": 0.78,
"learning_rate": 2.786639790067719e-07,
"logits/chosen": -0.35686007142066956,
"logits/rejected": -0.3575289249420166,
"logps/chosen": -457.2361755371094,
"logps/rejected": -470.49212646484375,
"loss": 0.4509,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.249783754348755,
"rewards/margins": 1.2371891736984253,
"rewards/rejected": -4.486972808837891,
"step": 2560
},
{
"epoch": 0.78,
"learning_rate": 2.6636875275956567e-07,
"logits/chosen": -0.3554794192314148,
"logits/rejected": -0.35618001222610474,
"logps/chosen": -455.373291015625,
"logps/rejected": -466.55389404296875,
"loss": 0.5174,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.4816794395446777,
"rewards/margins": 1.009413480758667,
"rewards/rejected": -4.491092681884766,
"step": 2570
},
{
"epoch": 0.78,
"learning_rate": 2.543356530426394e-07,
"logits/chosen": -0.34936192631721497,
"logits/rejected": -0.3497045040130615,
"logps/chosen": -451.0462951660156,
"logps/rejected": -464.62109375,
"loss": 0.4859,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.2196598052978516,
"rewards/margins": 1.3027180433273315,
"rewards/rejected": -4.522377967834473,
"step": 2580
},
{
"epoch": 0.79,
"learning_rate": 2.425660919935954e-07,
"logits/chosen": -0.35678738355636597,
"logits/rejected": -0.35775676369667053,
"logps/chosen": -452.04925537109375,
"logps/rejected": -463.97869873046875,
"loss": 0.4253,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.8534128665924072,
"rewards/margins": 1.3880208730697632,
"rewards/rejected": -4.241434097290039,
"step": 2590
},
{
"epoch": 0.79,
"learning_rate": 2.3106145082260777e-07,
"logits/chosen": -0.35490182042121887,
"logits/rejected": -0.35594433546066284,
"logps/chosen": -456.057373046875,
"logps/rejected": -470.40350341796875,
"loss": 0.4502,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.0541415214538574,
"rewards/margins": 1.4110925197601318,
"rewards/rejected": -4.46523380279541,
"step": 2600
},
{
"epoch": 0.79,
"eval_logits/chosen": -0.4264547824859619,
"eval_logits/rejected": -0.4270709156990051,
"eval_logps/chosen": -443.8349914550781,
"eval_logps/rejected": -454.5430603027344,
"eval_loss": 0.48084381222724915,
"eval_rewards/accuracies": 0.7620000243186951,
"eval_rewards/chosen": -3.0943799018859863,
"eval_rewards/margins": 1.2620201110839844,
"eval_rewards/rejected": -4.356400489807129,
"eval_runtime": 351.5894,
"eval_samples_per_second": 1.422,
"eval_steps_per_second": 1.422,
"step": 2600
},
{
"epoch": 0.79,
"learning_rate": 2.1982307965032563e-07,
"logits/chosen": -0.3585938513278961,
"logits/rejected": -0.3597787618637085,
"logps/chosen": -453.99884033203125,
"logps/rejected": -462.98272705078125,
"loss": 0.5579,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.515160322189331,
"rewards/margins": 0.9192056655883789,
"rewards/rejected": -4.434365749359131,
"step": 2610
},
{
"epoch": 0.8,
"learning_rate": 2.0885229734943501e-07,
"logits/chosen": -0.35792115330696106,
"logits/rejected": -0.35949331521987915,
"logps/chosen": -441.6431579589844,
"logps/rejected": -454.21160888671875,
"loss": 0.4968,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.2611217498779297,
"rewards/margins": 1.2790337800979614,
"rewards/rejected": -4.540155410766602,
"step": 2620
},
{
"epoch": 0.8,
"learning_rate": 1.9815039138988135e-07,
"logits/chosen": -0.3631977438926697,
"logits/rejected": -0.3638666272163391,
"logps/chosen": -448.5018005371094,
"logps/rejected": -460.1982421875,
"loss": 0.452,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.071455478668213,
"rewards/margins": 1.4410284757614136,
"rewards/rejected": -4.512484073638916,
"step": 2630
},
{
"epoch": 0.8,
"learning_rate": 1.8771861768777794e-07,
"logits/chosen": -0.3509594798088074,
"logits/rejected": -0.35208243131637573,
"logps/chosen": -450.60308837890625,
"logps/rejected": -464.2266540527344,
"loss": 0.4278,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.225553512573242,
"rewards/margins": 1.3294174671173096,
"rewards/rejected": -4.554970741271973,
"step": 2640
},
{
"epoch": 0.8,
"learning_rate": 1.7755820045802146e-07,
"logits/chosen": -0.35590630769729614,
"logits/rejected": -0.35736554861068726,
"logps/chosen": -455.400390625,
"logps/rejected": -465.2867126464844,
"loss": 0.4158,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.942516803741455,
"rewards/margins": 1.4671036005020142,
"rewards/rejected": -4.409620761871338,
"step": 2650
},
{
"epoch": 0.81,
"learning_rate": 1.67670332070623e-07,
"logits/chosen": -0.3521929383277893,
"logits/rejected": -0.3526236116886139,
"logps/chosen": -455.163330078125,
"logps/rejected": -469.2591857910156,
"loss": 0.4457,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.1400699615478516,
"rewards/margins": 1.289953589439392,
"rewards/rejected": -4.430023193359375,
"step": 2660
},
{
"epoch": 0.81,
"learning_rate": 1.580561729107777e-07,
"logits/chosen": -0.35622936487197876,
"logits/rejected": -0.356993168592453,
"logps/chosen": -455.1328125,
"logps/rejected": -465.6949157714844,
"loss": 0.4489,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.2420265674591064,
"rewards/margins": 1.3224232196807861,
"rewards/rejected": -4.564449310302734,
"step": 2670
},
{
"epoch": 0.81,
"learning_rate": 1.487168512426901e-07,
"logits/chosen": -0.36213189363479614,
"logits/rejected": -0.3628009557723999,
"logps/chosen": -453.6480407714844,
"logps/rejected": -465.2872619628906,
"loss": 0.4185,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.116283416748047,
"rewards/margins": 1.3114349842071533,
"rewards/rejected": -4.427718162536621,
"step": 2680
},
{
"epoch": 0.82,
"learning_rate": 1.3965346307716676e-07,
"logits/chosen": -0.3530941605567932,
"logits/rejected": -0.35421401262283325,
"logps/chosen": -451.10894775390625,
"logps/rejected": -465.1979064941406,
"loss": 0.376,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.9081177711486816,
"rewards/margins": 1.643689751625061,
"rewards/rejected": -4.551807403564453,
"step": 2690
},
{
"epoch": 0.82,
"learning_rate": 1.3086707204299415e-07,
"logits/chosen": -0.36071377992630005,
"logits/rejected": -0.3618861138820648,
"logps/chosen": -448.95355224609375,
"logps/rejected": -460.8838806152344,
"loss": 0.4524,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.284519672393799,
"rewards/margins": 1.33005690574646,
"rewards/rejected": -4.6145758628845215,
"step": 2700
},
{
"epoch": 0.82,
"learning_rate": 1.223587092621162e-07,
"logits/chosen": -0.3580467998981476,
"logits/rejected": -0.35923272371292114,
"logps/chosen": -451.82769775390625,
"logps/rejected": -463.8099670410156,
"loss": 0.4238,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.3439979553222656,
"rewards/margins": 1.2987263202667236,
"rewards/rejected": -4.64272403717041,
"step": 2710
},
{
"epoch": 0.83,
"learning_rate": 1.1412937322862971e-07,
"logits/chosen": -0.3629991412162781,
"logits/rejected": -0.3639989495277405,
"logps/chosen": -448.5044860839844,
"logps/rejected": -460.817138671875,
"loss": 0.4102,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.0257716178894043,
"rewards/margins": 1.448880910873413,
"rewards/rejected": -4.4746527671813965,
"step": 2720
},
{
"epoch": 0.83,
"learning_rate": 1.0618002969160546e-07,
"logits/chosen": -0.3608396053314209,
"logits/rejected": -0.3618479371070862,
"logps/chosen": -453.93499755859375,
"logps/rejected": -466.81640625,
"loss": 0.4187,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.1413166522979736,
"rewards/margins": 1.3508026599884033,
"rewards/rejected": -4.492118835449219,
"step": 2730
},
{
"epoch": 0.83,
"learning_rate": 9.851161154175337e-08,
"logits/chosen": -0.3562917113304138,
"logits/rejected": -0.35710564255714417,
"logps/chosen": -451.28076171875,
"logps/rejected": -461.2808532714844,
"loss": 0.5024,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.143902540206909,
"rewards/margins": 1.1941773891448975,
"rewards/rejected": -4.338079929351807,
"step": 2740
},
{
"epoch": 0.84,
"learning_rate": 9.112501870194273e-08,
"logits/chosen": -0.3589875102043152,
"logits/rejected": -0.35990768671035767,
"logps/chosen": -452.32000732421875,
"logps/rejected": -461.66033935546875,
"loss": 0.5337,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.512810468673706,
"rewards/margins": 0.9673913717269897,
"rewards/rejected": -4.4802021980285645,
"step": 2750
},
{
"epoch": 0.84,
"learning_rate": 8.402111802159413e-08,
"logits/chosen": -0.3585359454154968,
"logits/rejected": -0.35975727438926697,
"logps/chosen": -454.8050842285156,
"logps/rejected": -465.12615966796875,
"loss": 0.4486,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.2963638305664062,
"rewards/margins": 1.2402369976043701,
"rewards/rejected": -4.5366010665893555,
"step": 2760
},
{
"epoch": 0.84,
"learning_rate": 7.720074317494913e-08,
"logits/chosen": -0.36562293767929077,
"logits/rejected": -0.3664829134941101,
"logps/chosen": -457.26068115234375,
"logps/rejected": -470.1167907714844,
"loss": 0.4503,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.0565972328186035,
"rewards/margins": 1.4593775272369385,
"rewards/rejected": -4.515974521636963,
"step": 2770
},
{
"epoch": 0.84,
"learning_rate": 7.06646945632361e-08,
"logits/chosen": -0.3597029447555542,
"logits/rejected": -0.3601227402687073,
"logps/chosen": -461.0421447753906,
"logps/rejected": -469.86175537109375,
"loss": 0.512,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.246166944503784,
"rewards/margins": 1.1497961282730103,
"rewards/rejected": -4.395963191986084,
"step": 2780
},
{
"epoch": 0.85,
"learning_rate": 6.441373922073946e-08,
"logits/chosen": -0.359005331993103,
"logits/rejected": -0.35974326729774475,
"logps/chosen": -455.99908447265625,
"logps/rejected": -466.95440673828125,
"loss": 0.4367,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.224595546722412,
"rewards/margins": 1.361697793006897,
"rewards/rejected": -4.5862932205200195,
"step": 2790
},
{
"epoch": 0.85,
"learning_rate": 5.844861072478336e-08,
"logits/chosen": -0.3530232608318329,
"logits/rejected": -0.3545222580432892,
"logps/chosen": -443.6240234375,
"logps/rejected": -458.322998046875,
"loss": 0.4834,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.3614017963409424,
"rewards/margins": 1.2071417570114136,
"rewards/rejected": -4.568543434143066,
"step": 2800
},
{
"epoch": 0.85,
"eval_logits/chosen": -0.4272295832633972,
"eval_logits/rejected": -0.42783358693122864,
"eval_logps/chosen": -444.2228088378906,
"eval_logps/rejected": -454.95098876953125,
"eval_loss": 0.48089736700057983,
"eval_rewards/accuracies": 0.7599999904632568,
"eval_rewards/chosen": -3.1331627368927,
"eval_rewards/margins": 1.2640310525894165,
"eval_rewards/rejected": -4.3971943855285645,
"eval_runtime": 351.6656,
"eval_samples_per_second": 1.422,
"eval_steps_per_second": 1.422,
"step": 2800
},
{
"epoch": 0.85,
"learning_rate": 5.2770009109645306e-08,
"logits/chosen": -0.36214134097099304,
"logits/rejected": -0.36288636922836304,
"logps/chosen": -454.91839599609375,
"logps/rejected": -466.05224609375,
"loss": 0.4296,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.1463279724121094,
"rewards/margins": 1.363966703414917,
"rewards/rejected": -4.5102949142456055,
"step": 2810
},
{
"epoch": 0.86,
"learning_rate": 4.7378600784402095e-08,
"logits/chosen": -0.3552590310573578,
"logits/rejected": -0.35652121901512146,
"logps/chosen": -455.6435546875,
"logps/rejected": -465.54693603515625,
"loss": 0.4669,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.2327640056610107,
"rewards/margins": 1.2310270071029663,
"rewards/rejected": -4.463791370391846,
"step": 2820
},
{
"epoch": 0.86,
"learning_rate": 4.22750184547252e-08,
"logits/chosen": -0.3599388301372528,
"logits/rejected": -0.3607821762561798,
"logps/chosen": -456.6576232910156,
"logps/rejected": -469.9142150878906,
"loss": 0.4199,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.1602883338928223,
"rewards/margins": 1.4565389156341553,
"rewards/rejected": -4.616827487945557,
"step": 2830
},
{
"epoch": 0.86,
"learning_rate": 3.745986104862903e-08,
"logits/chosen": -0.35964518785476685,
"logits/rejected": -0.360365092754364,
"logps/chosen": -455.8336486816406,
"logps/rejected": -467.90948486328125,
"loss": 0.4152,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.874156951904297,
"rewards/margins": 1.5480505228042603,
"rewards/rejected": -4.422207832336426,
"step": 2840
},
{
"epoch": 0.87,
"learning_rate": 3.293369364618465e-08,
"logits/chosen": -0.3647812604904175,
"logits/rejected": -0.3658196032047272,
"logps/chosen": -449.73138427734375,
"logps/rejected": -462.46942138671875,
"loss": 0.4729,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.295436382293701,
"rewards/margins": 1.252638816833496,
"rewards/rejected": -4.5480756759643555,
"step": 2850
},
{
"epoch": 0.87,
"learning_rate": 2.869704741320478e-08,
"logits/chosen": -0.35672903060913086,
"logits/rejected": -0.3576185703277588,
"logps/chosen": -449.70294189453125,
"logps/rejected": -459.93096923828125,
"loss": 0.4951,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.523907423019409,
"rewards/margins": 1.1444907188415527,
"rewards/rejected": -4.668398380279541,
"step": 2860
},
{
"epoch": 0.87,
"learning_rate": 2.4750419538908667e-08,
"logits/chosen": -0.3534146547317505,
"logits/rejected": -0.35466113686561584,
"logps/chosen": -452.890869140625,
"logps/rejected": -464.16510009765625,
"loss": 0.4477,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.2099146842956543,
"rewards/margins": 1.3813788890838623,
"rewards/rejected": -4.591293811798096,
"step": 2870
},
{
"epoch": 0.87,
"learning_rate": 2.1094273177576508e-08,
"logits/chosen": -0.36183369159698486,
"logits/rejected": -0.36180374026298523,
"logps/chosen": -455.7822265625,
"logps/rejected": -465.14801025390625,
"loss": 0.4747,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.090353488922119,
"rewards/margins": 1.3213683366775513,
"rewards/rejected": -4.411721706390381,
"step": 2880
},
{
"epoch": 0.88,
"learning_rate": 1.7729037394193792e-08,
"logits/chosen": -0.3579171299934387,
"logits/rejected": -0.35931870341300964,
"logps/chosen": -450.9007263183594,
"logps/rejected": -464.6581115722656,
"loss": 0.4626,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.0921690464019775,
"rewards/margins": 1.4891610145568848,
"rewards/rejected": -4.581330299377441,
"step": 2890
},
{
"epoch": 0.88,
"learning_rate": 1.4655107114101008e-08,
"logits/chosen": -0.36245545744895935,
"logits/rejected": -0.36358946561813354,
"logps/chosen": -452.5899353027344,
"logps/rejected": -467.2527770996094,
"loss": 0.464,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.9442224502563477,
"rewards/margins": 1.4334999322891235,
"rewards/rejected": -4.377722263336182,
"step": 2900
},
{
"epoch": 0.88,
"learning_rate": 1.1872843076645157e-08,
"logits/chosen": -0.35802754759788513,
"logits/rejected": -0.35865747928619385,
"logps/chosen": -454.47662353515625,
"logps/rejected": -465.0662536621094,
"loss": 0.3877,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.96606707572937,
"rewards/margins": 1.5276943445205688,
"rewards/rejected": -4.4937615394592285,
"step": 2910
},
{
"epoch": 0.89,
"learning_rate": 9.382571792846962e-09,
"logits/chosen": -0.3509235084056854,
"logits/rejected": -0.3516360819339752,
"logps/chosen": -443.75469970703125,
"logps/rejected": -453.1048278808594,
"loss": 0.4472,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.1273512840270996,
"rewards/margins": 1.3490197658538818,
"rewards/rejected": -4.476370811462402,
"step": 2920
},
{
"epoch": 0.89,
"learning_rate": 7.1845855070828975e-09,
"logits/chosen": -0.3624979555606842,
"logits/rejected": -0.36296314001083374,
"logps/chosen": -450.0325622558594,
"logps/rejected": -459.88916015625,
"loss": 0.4578,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.340954303741455,
"rewards/margins": 1.343732237815857,
"rewards/rejected": -4.684686660766602,
"step": 2930
},
{
"epoch": 0.89,
"learning_rate": 5.279142162789019e-09,
"logits/chosen": -0.35505902767181396,
"logits/rejected": -0.35619235038757324,
"logps/chosen": -451.9505920410156,
"logps/rejected": -465.6192321777344,
"loss": 0.4539,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.4748854637145996,
"rewards/margins": 1.2840955257415771,
"rewards/rejected": -4.758981227874756,
"step": 2940
},
{
"epoch": 0.9,
"learning_rate": 3.666465372190453e-09,
"logits/chosen": -0.356467604637146,
"logits/rejected": -0.357626736164093,
"logps/chosen": -452.7481384277344,
"logps/rejected": -465.7762145996094,
"loss": 0.472,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.293835401535034,
"rewards/margins": 1.261348009109497,
"rewards/rejected": -4.555183410644531,
"step": 2950
},
{
"epoch": 0.9,
"learning_rate": 2.34674439005822e-09,
"logits/chosen": -0.3525004982948303,
"logits/rejected": -0.35348066687583923,
"logps/chosen": -450.6170959472656,
"logps/rejected": -462.77069091796875,
"loss": 0.3976,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.295048952102661,
"rewards/margins": 1.4584187269210815,
"rewards/rejected": -4.753467559814453,
"step": 2960
},
{
"epoch": 0.9,
"learning_rate": 1.3201340915011685e-09,
"logits/chosen": -0.35318654775619507,
"logits/rejected": -0.35393238067626953,
"logps/chosen": -453.31707763671875,
"logps/rejected": -463.7138671875,
"loss": 0.4291,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.284226655960083,
"rewards/margins": 1.3613402843475342,
"rewards/rejected": -4.645566463470459,
"step": 2970
},
{
"epoch": 0.91,
"learning_rate": 5.86754953789681e-10,
"logits/chosen": -0.35480597615242004,
"logits/rejected": -0.35523343086242676,
"logps/chosen": -449.10235595703125,
"logps/rejected": -461.8785095214844,
"loss": 0.4941,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.270163059234619,
"rewards/margins": 1.221695065498352,
"rewards/rejected": -4.491857528686523,
"step": 2980
},
{
"epoch": 0.91,
"learning_rate": 1.4669304221726077e-10,
"logits/chosen": -0.3551548421382904,
"logits/rejected": -0.3556649386882782,
"logps/chosen": -456.6444396972656,
"logps/rejected": -467.2582092285156,
"loss": 0.4289,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.225339412689209,
"rewards/margins": 1.3719263076782227,
"rewards/rejected": -4.597265243530273,
"step": 2990
},
{
"epoch": 0.91,
"learning_rate": 0.0,
"logits/chosen": -0.35613125562667847,
"logits/rejected": -0.3575323522090912,
"logps/chosen": -453.228759765625,
"logps/rejected": -467.6603088378906,
"loss": 0.416,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.2920143604278564,
"rewards/margins": 1.416282296180725,
"rewards/rejected": -4.708296775817871,
"step": 3000
},
{
"epoch": 0.91,
"eval_logits/chosen": -0.42728757858276367,
"eval_logits/rejected": -0.42789557576179504,
"eval_logps/chosen": -444.3138427734375,
"eval_logps/rejected": -455.0481262207031,
"eval_loss": 0.47960197925567627,
"eval_rewards/accuracies": 0.7599999904632568,
"eval_rewards/chosen": -3.1422641277313232,
"eval_rewards/margins": 1.264641523361206,
"eval_rewards/rejected": -4.406905174255371,
"eval_runtime": 351.5662,
"eval_samples_per_second": 1.422,
"eval_steps_per_second": 1.422,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}