Meta-Llama-3-8B-SecUnalign / trainer_state.json
FlorianJK's picture
Upload SecAlign adapter from Meta-Llama-3-8B-Instruct_dpo_NaiveCompletion_2026-02-13-16-57-22
a551091 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.986221294363257,
"eval_steps": 500,
"global_step": 447,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006680584551148226,
"grad_norm": 0.3030683696269989,
"learning_rate": 0.0001599980242003563,
"logits/chosen": 0.23207515478134155,
"logits/rejected": 0.5862225890159607,
"logps/chosen": -172.75270080566406,
"logps/rejected": -94.1683120727539,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.013361169102296452,
"grad_norm": 0.25661855936050415,
"learning_rate": 0.00015999209689901978,
"logits/chosen": 0.250579833984375,
"logits/rejected": 0.654366672039032,
"logps/chosen": -170.5427703857422,
"logps/rejected": -94.58740234375,
"loss": 0.6682,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.057050298899412155,
"rewards/margins": 0.05467259883880615,
"rewards/rejected": 0.0023776949383318424,
"step": 2
},
{
"epoch": 0.020041753653444676,
"grad_norm": 0.3576202988624573,
"learning_rate": 0.00015998221838876944,
"logits/chosen": 0.2386956363916397,
"logits/rejected": 0.585329532623291,
"logps/chosen": -165.6811981201172,
"logps/rejected": -128.58689880371094,
"loss": 0.6185,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.1190996840596199,
"rewards/margins": 0.12703385949134827,
"rewards/rejected": -0.007934181950986385,
"step": 3
},
{
"epoch": 0.026722338204592903,
"grad_norm": 0.3903304636478424,
"learning_rate": 0.00015996838915755424,
"logits/chosen": 0.2555890679359436,
"logits/rejected": 0.6079045534133911,
"logps/chosen": -192.27386474609375,
"logps/rejected": -115.53170013427734,
"loss": 0.5817,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.24757207930088043,
"rewards/margins": 0.26733314990997314,
"rewards/rejected": -0.0197611041367054,
"step": 4
},
{
"epoch": 0.033402922755741124,
"grad_norm": 0.3334524929523468,
"learning_rate": 0.0001599506098884689,
"logits/chosen": 0.12432317435741425,
"logits/rejected": 0.5883125066757202,
"logps/chosen": -182.6539306640625,
"logps/rejected": -128.27572631835938,
"loss": 0.5411,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.3643491268157959,
"rewards/margins": 0.40698686242103577,
"rewards/rejected": -0.04263775050640106,
"step": 5
},
{
"epoch": 0.04008350730688935,
"grad_norm": 0.32326623797416687,
"learning_rate": 0.00015992888145972026,
"logits/chosen": 0.2656134366989136,
"logits/rejected": 0.6499578952789307,
"logps/chosen": -178.80812072753906,
"logps/rejected": -128.95472717285156,
"loss": 0.5078,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.5447542071342468,
"rewards/margins": 0.5555691719055176,
"rewards/rejected": -0.010814988985657692,
"step": 6
},
{
"epoch": 0.04676409185803758,
"grad_norm": 0.31647607684135437,
"learning_rate": 0.00015990320494458385,
"logits/chosen": 0.2741105556488037,
"logits/rejected": 0.6408557891845703,
"logps/chosen": -173.88148498535156,
"logps/rejected": -119.0029525756836,
"loss": 0.4447,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.6545026302337646,
"rewards/margins": 0.7013029456138611,
"rewards/rejected": -0.046800337731838226,
"step": 7
},
{
"epoch": 0.05344467640918581,
"grad_norm": 0.32367509603500366,
"learning_rate": 0.00015987358161135095,
"logits/chosen": 0.30019262433052063,
"logits/rejected": 0.8890558481216431,
"logps/chosen": -213.016357421875,
"logps/rejected": -108.3148422241211,
"loss": 0.4205,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.7475125193595886,
"rewards/margins": 0.8885018825531006,
"rewards/rejected": -0.14098937809467316,
"step": 8
},
{
"epoch": 0.06012526096033403,
"grad_norm": 0.2946716248989105,
"learning_rate": 0.00015984001292326582,
"logits/chosen": -0.012724779546260834,
"logits/rejected": 0.8889679908752441,
"logps/chosen": -254.97061157226562,
"logps/rejected": -110.16943359375,
"loss": 0.302,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0533387660980225,
"rewards/margins": 1.3181082010269165,
"rewards/rejected": -0.2647695243358612,
"step": 9
},
{
"epoch": 0.06680584551148225,
"grad_norm": 0.3166823089122772,
"learning_rate": 0.0001598025005384535,
"logits/chosen": 0.42483431100845337,
"logits/rejected": 0.5721423029899597,
"logps/chosen": -147.58172607421875,
"logps/rejected": -136.1343536376953,
"loss": 0.3091,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.1222333908081055,
"rewards/margins": 1.2369006872177124,
"rewards/rejected": -0.11466731131076813,
"step": 10
},
{
"epoch": 0.07348643006263048,
"grad_norm": 0.29748114943504333,
"learning_rate": 0.0001597610463098379,
"logits/chosen": 0.12727132439613342,
"logits/rejected": 0.7985308170318604,
"logps/chosen": -216.18954467773438,
"logps/rejected": -112.87439727783203,
"loss": 0.2774,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.1798498630523682,
"rewards/margins": 1.6763423681259155,
"rewards/rejected": -0.49649256467819214,
"step": 11
},
{
"epoch": 0.0801670146137787,
"grad_norm": 0.31257691979408264,
"learning_rate": 0.0001597156522850503,
"logits/chosen": 0.4580366611480713,
"logits/rejected": 0.707663357257843,
"logps/chosen": -148.07301330566406,
"logps/rejected": -148.95001220703125,
"loss": 0.2414,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.0007773637771606,
"rewards/margins": 1.4732861518859863,
"rewards/rejected": -0.4725087881088257,
"step": 12
},
{
"epoch": 0.08684759916492693,
"grad_norm": 0.22815896570682526,
"learning_rate": 0.0001596663207063281,
"logits/chosen": 0.146785706281662,
"logits/rejected": 0.705903172492981,
"logps/chosen": -187.23133850097656,
"logps/rejected": -118.27559661865234,
"loss": 0.1546,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.2817703485488892,
"rewards/margins": 2.2400412559509277,
"rewards/rejected": -0.9582710266113281,
"step": 13
},
{
"epoch": 0.09352818371607516,
"grad_norm": 0.2425857037305832,
"learning_rate": 0.00015961305401040424,
"logits/chosen": 0.1981484740972519,
"logits/rejected": 1.001242995262146,
"logps/chosen": -239.64752197265625,
"logps/rejected": -113.02178192138672,
"loss": 0.152,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.2698098421096802,
"rewards/margins": 2.5462355613708496,
"rewards/rejected": -1.2764257192611694,
"step": 14
},
{
"epoch": 0.10020876826722339,
"grad_norm": 0.26788046956062317,
"learning_rate": 0.00015955585482838668,
"logits/chosen": 0.3972240686416626,
"logits/rejected": 0.6294059753417969,
"logps/chosen": -179.3887481689453,
"logps/rejected": -178.17100524902344,
"loss": 0.137,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.4892669916152954,
"rewards/margins": 2.444074869155884,
"rewards/rejected": -0.9548079967498779,
"step": 15
},
{
"epoch": 0.10688935281837161,
"grad_norm": 0.23353877663612366,
"learning_rate": 0.0001594947259856285,
"logits/chosen": 0.27605149149894714,
"logits/rejected": 0.8782747387886047,
"logps/chosen": -206.30886840820312,
"logps/rejected": -148.47564697265625,
"loss": 0.1094,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5769964456558228,
"rewards/margins": 3.015477180480957,
"rewards/rejected": -1.4384808540344238,
"step": 16
},
{
"epoch": 0.11356993736951983,
"grad_norm": 0.1707775592803955,
"learning_rate": 0.00015942967050158835,
"logits/chosen": 0.40737321972846985,
"logits/rejected": 0.7035366296768188,
"logps/chosen": -146.5780029296875,
"logps/rejected": -146.81468200683594,
"loss": 0.0745,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6468758583068848,
"rewards/margins": 3.2545251846313477,
"rewards/rejected": -1.6076490879058838,
"step": 17
},
{
"epoch": 0.12025052192066805,
"grad_norm": 0.14162331819534302,
"learning_rate": 0.0001593606915896813,
"logits/chosen": 0.42682021856307983,
"logits/rejected": 0.8648378849029541,
"logps/chosen": -172.6350555419922,
"logps/rejected": -148.40040588378906,
"loss": 0.0609,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.636667013168335,
"rewards/margins": 3.53716778755188,
"rewards/rejected": -1.9005005359649658,
"step": 18
},
{
"epoch": 0.1269311064718163,
"grad_norm": 0.14806503057479858,
"learning_rate": 0.00015928779265712004,
"logits/chosen": 0.46036726236343384,
"logits/rejected": 0.6379547119140625,
"logps/chosen": -177.5619659423828,
"logps/rejected": -199.31591796875,
"loss": 0.057,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8629982471466064,
"rewards/margins": 3.892867088317871,
"rewards/rejected": -2.0298686027526855,
"step": 19
},
{
"epoch": 0.1336116910229645,
"grad_norm": 0.077659972012043,
"learning_rate": 0.00015921097730474672,
"logits/chosen": 0.18374812602996826,
"logits/rejected": 1.1149046421051025,
"logps/chosen": -188.0821990966797,
"logps/rejected": -119.98029327392578,
"loss": 0.0272,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9101481437683105,
"rewards/margins": 4.403899192810059,
"rewards/rejected": -2.49375057220459,
"step": 20
},
{
"epoch": 0.14029227557411272,
"grad_norm": 0.16159941256046295,
"learning_rate": 0.000159130249326855,
"logits/chosen": 0.3764076828956604,
"logits/rejected": 0.7519434094429016,
"logps/chosen": -125.24600219726562,
"logps/rejected": -124.57368469238281,
"loss": 0.0331,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4406747817993164,
"rewards/margins": 5.011131286621094,
"rewards/rejected": -2.5704567432403564,
"step": 21
},
{
"epoch": 0.14697286012526095,
"grad_norm": 0.08939576894044876,
"learning_rate": 0.00015904561271100261,
"logits/chosen": 0.384420245885849,
"logits/rejected": 0.878984272480011,
"logps/chosen": -201.50851440429688,
"logps/rejected": -164.9197998046875,
"loss": 0.0237,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3332247734069824,
"rewards/margins": 4.973855972290039,
"rewards/rejected": -2.6406304836273193,
"step": 22
},
{
"epoch": 0.15365344467640918,
"grad_norm": 0.05668778717517853,
"learning_rate": 0.00015895707163781446,
"logits/chosen": 0.34759849309921265,
"logits/rejected": 0.7577741742134094,
"logps/chosen": -176.2628936767578,
"logps/rejected": -150.50823974609375,
"loss": 0.0153,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4216699600219727,
"rewards/margins": 5.509364128112793,
"rewards/rejected": -3.0876946449279785,
"step": 23
},
{
"epoch": 0.1603340292275574,
"grad_norm": 0.07947131991386414,
"learning_rate": 0.00015886463048077603,
"logits/chosen": 0.36052000522613525,
"logits/rejected": 0.8884449005126953,
"logps/chosen": -168.50161743164062,
"logps/rejected": -148.0092010498047,
"loss": 0.0131,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5387940406799316,
"rewards/margins": 5.77685546875,
"rewards/rejected": -3.2380619049072266,
"step": 24
},
{
"epoch": 0.16701461377870563,
"grad_norm": 0.11461421847343445,
"learning_rate": 0.0001587682938060175,
"logits/chosen": 0.6103401184082031,
"logits/rejected": 0.9240366220474243,
"logps/chosen": -148.28834533691406,
"logps/rejected": -149.85528564453125,
"loss": 0.0189,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.372836112976074,
"rewards/margins": 6.2512006759643555,
"rewards/rejected": -3.8783645629882812,
"step": 25
},
{
"epoch": 0.17369519832985386,
"grad_norm": 0.10785702615976334,
"learning_rate": 0.00015866806637208802,
"logits/chosen": 0.5904795527458191,
"logits/rejected": 0.8780027031898499,
"logps/chosen": -167.06285095214844,
"logps/rejected": -168.36965942382812,
"loss": 0.0131,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.5616817474365234,
"rewards/margins": 6.0948076248168945,
"rewards/rejected": -3.533125400543213,
"step": 26
},
{
"epoch": 0.1803757828810021,
"grad_norm": 0.01879948377609253,
"learning_rate": 0.0001585639531297208,
"logits/chosen": 0.4085114598274231,
"logits/rejected": 0.7017614841461182,
"logps/chosen": -182.0274200439453,
"logps/rejected": -219.75445556640625,
"loss": 0.0052,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3172097206115723,
"rewards/margins": 6.493253231048584,
"rewards/rejected": -4.176044464111328,
"step": 27
},
{
"epoch": 0.18705636743215032,
"grad_norm": 0.0552218034863472,
"learning_rate": 0.00015845595922158858,
"logits/chosen": 0.4179205596446991,
"logits/rejected": 0.8640294075012207,
"logps/chosen": -163.02249145507812,
"logps/rejected": -173.25906372070312,
"loss": 0.008,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3230319023132324,
"rewards/margins": 6.0196757316589355,
"rewards/rejected": -3.696643829345703,
"step": 28
},
{
"epoch": 0.19373695198329854,
"grad_norm": 0.020818833261728287,
"learning_rate": 0.0001583440899820494,
"logits/chosen": 0.37070727348327637,
"logits/rejected": 0.8000531196594238,
"logps/chosen": -153.59396362304688,
"logps/rejected": -157.2525634765625,
"loss": 0.0051,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.583785057067871,
"rewards/margins": 6.723965644836426,
"rewards/rejected": -4.140180587768555,
"step": 29
},
{
"epoch": 0.20041753653444677,
"grad_norm": 0.022928601130843163,
"learning_rate": 0.00015822835093688343,
"logits/chosen": 0.37007975578308105,
"logits/rejected": 0.6836897730827332,
"logps/chosen": -200.75411987304688,
"logps/rejected": -197.43679809570312,
"loss": 0.0048,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.2626662254333496,
"rewards/margins": 6.830512046813965,
"rewards/rejected": -4.567845821380615,
"step": 30
},
{
"epoch": 0.207098121085595,
"grad_norm": 0.017020611092448235,
"learning_rate": 0.00015810874780301971,
"logits/chosen": 0.21970444917678833,
"logits/rejected": 0.9238815307617188,
"logps/chosen": -153.52171325683594,
"logps/rejected": -130.63345336914062,
"loss": 0.0039,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0072364807128906,
"rewards/margins": 6.51116418838501,
"rewards/rejected": -4.503928184509277,
"step": 31
},
{
"epoch": 0.21377870563674323,
"grad_norm": 0.05750842019915581,
"learning_rate": 0.000157985286488254,
"logits/chosen": 0.29518210887908936,
"logits/rejected": 0.8406018018722534,
"logps/chosen": -179.242919921875,
"logps/rejected": -190.29794311523438,
"loss": 0.0136,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4710826873779297,
"rewards/margins": 7.065474033355713,
"rewards/rejected": -4.594390392303467,
"step": 32
},
{
"epoch": 0.22045929018789143,
"grad_norm": 0.012446871027350426,
"learning_rate": 0.00015785797309095684,
"logits/chosen": 0.45642712712287903,
"logits/rejected": 0.922579288482666,
"logps/chosen": -174.11033630371094,
"logps/rejected": -154.8909149169922,
"loss": 0.0031,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4129574298858643,
"rewards/margins": 7.372279167175293,
"rewards/rejected": -4.959321975708008,
"step": 33
},
{
"epoch": 0.22713987473903965,
"grad_norm": 0.016282295808196068,
"learning_rate": 0.00015772681389977238,
"logits/chosen": 0.18731066584587097,
"logits/rejected": 0.7407510280609131,
"logps/chosen": -175.7775421142578,
"logps/rejected": -211.89059448242188,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7385621070861816,
"rewards/margins": 8.315619468688965,
"rewards/rejected": -5.577057361602783,
"step": 34
},
{
"epoch": 0.23382045929018788,
"grad_norm": 0.060803525149822235,
"learning_rate": 0.00015759181539330767,
"logits/chosen": 0.18983310461044312,
"logits/rejected": 0.9841374754905701,
"logps/chosen": -193.72940063476562,
"logps/rejected": -158.21124267578125,
"loss": 0.0069,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.655390501022339,
"rewards/margins": 7.861417293548584,
"rewards/rejected": -5.206027030944824,
"step": 35
},
{
"epoch": 0.2405010438413361,
"grad_norm": 0.014457982033491135,
"learning_rate": 0.0001574529842398127,
"logits/chosen": 0.35429176688194275,
"logits/rejected": 0.8492611050605774,
"logps/chosen": -193.26327514648438,
"logps/rejected": -194.78515625,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4261109828948975,
"rewards/margins": 7.886996269226074,
"rewards/rejected": -5.460885524749756,
"step": 36
},
{
"epoch": 0.24718162839248434,
"grad_norm": 0.014244547113776207,
"learning_rate": 0.00015731032729685116,
"logits/chosen": 0.5617164969444275,
"logits/rejected": 0.648545503616333,
"logps/chosen": -139.7461700439453,
"logps/rejected": -200.63014221191406,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.470811367034912,
"rewards/margins": 7.392573833465576,
"rewards/rejected": -4.921762466430664,
"step": 37
},
{
"epoch": 0.2538622129436326,
"grad_norm": 0.019575441256165504,
"learning_rate": 0.0001571638516109614,
"logits/chosen": 0.5932599306106567,
"logits/rejected": 0.6393769979476929,
"logps/chosen": -131.13328552246094,
"logps/rejected": -227.80514526367188,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6353516578674316,
"rewards/margins": 8.202313423156738,
"rewards/rejected": -5.566961288452148,
"step": 38
},
{
"epoch": 0.2605427974947808,
"grad_norm": 0.012481000274419785,
"learning_rate": 0.00015701356441730864,
"logits/chosen": 0.3244459331035614,
"logits/rejected": 0.839286208152771,
"logps/chosen": -183.51742553710938,
"logps/rejected": -186.67271423339844,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6852662563323975,
"rewards/margins": 8.163836479187012,
"rewards/rejected": -5.478569984436035,
"step": 39
},
{
"epoch": 0.267223382045929,
"grad_norm": 0.0029381215572357178,
"learning_rate": 0.00015685947313932744,
"logits/chosen": 0.4828110635280609,
"logits/rejected": 0.9766800999641418,
"logps/chosen": -176.82705688476562,
"logps/rejected": -181.6543731689453,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.744213819503784,
"rewards/margins": 8.242889404296875,
"rewards/rejected": -5.498675346374512,
"step": 40
},
{
"epoch": 0.2739039665970772,
"grad_norm": 0.04062644764780998,
"learning_rate": 0.00015670158538835517,
"logits/chosen": 0.3162868022918701,
"logits/rejected": 1.0131462812423706,
"logps/chosen": -176.71170043945312,
"logps/rejected": -139.9007110595703,
"loss": 0.0038,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.691941976547241,
"rewards/margins": 8.195215225219727,
"rewards/rejected": -5.5032734870910645,
"step": 41
},
{
"epoch": 0.28058455114822545,
"grad_norm": 0.007433526683598757,
"learning_rate": 0.00015653990896325587,
"logits/chosen": 0.14636696875095367,
"logits/rejected": 0.8240803480148315,
"logps/chosen": -202.27883911132812,
"logps/rejected": -203.40887451171875,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.634958267211914,
"rewards/margins": 8.168269157409668,
"rewards/rejected": -5.533310890197754,
"step": 42
},
{
"epoch": 0.2872651356993737,
"grad_norm": 0.007627190090715885,
"learning_rate": 0.00015637445185003504,
"logits/chosen": 0.6393700838088989,
"logits/rejected": 0.8101121783256531,
"logps/chosen": -132.88674926757812,
"logps/rejected": -221.85333251953125,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.062117576599121,
"rewards/margins": 8.964439392089844,
"rewards/rejected": -5.902321815490723,
"step": 43
},
{
"epoch": 0.2939457202505219,
"grad_norm": 0.054270580410957336,
"learning_rate": 0.00015620522222144543,
"logits/chosen": 0.43403318524360657,
"logits/rejected": 1.0408865213394165,
"logps/chosen": -193.0514678955078,
"logps/rejected": -174.92593383789062,
"loss": 0.0087,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.494380235671997,
"rewards/margins": 8.408964157104492,
"rewards/rejected": -5.914583206176758,
"step": 44
},
{
"epoch": 0.30062630480167013,
"grad_norm": 0.0030595879070460796,
"learning_rate": 0.00015603222843658292,
"logits/chosen": 0.22928494215011597,
"logits/rejected": 1.1178843975067139,
"logps/chosen": -190.24087524414062,
"logps/rejected": -148.67306518554688,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1091623306274414,
"rewards/margins": 8.748310089111328,
"rewards/rejected": -5.6391472816467285,
"step": 45
},
{
"epoch": 0.30730688935281836,
"grad_norm": 0.016060445457696915,
"learning_rate": 0.00015585547904047405,
"logits/chosen": 0.34157004952430725,
"logits/rejected": 0.8962047696113586,
"logps/chosen": -220.03871154785156,
"logps/rejected": -241.0406494140625,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8193817138671875,
"rewards/margins": 9.287981986999512,
"rewards/rejected": -6.468600273132324,
"step": 46
},
{
"epoch": 0.3139874739039666,
"grad_norm": 0.025302594527602196,
"learning_rate": 0.00015567498276365365,
"logits/chosen": 0.25260353088378906,
"logits/rejected": 1.1292093992233276,
"logps/chosen": -183.6875,
"logps/rejected": -163.08114624023438,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.396714687347412,
"rewards/margins": 8.208773612976074,
"rewards/rejected": -5.812058925628662,
"step": 47
},
{
"epoch": 0.3206680584551148,
"grad_norm": 0.010507023893296719,
"learning_rate": 0.00015549074852173368,
"logits/chosen": 0.3296445608139038,
"logits/rejected": 1.195299506187439,
"logps/chosen": -186.541748046875,
"logps/rejected": -154.01535034179688,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7873411178588867,
"rewards/margins": 8.289080619812012,
"rewards/rejected": -5.501739501953125,
"step": 48
},
{
"epoch": 0.32734864300626304,
"grad_norm": 0.00997474230825901,
"learning_rate": 0.00015530278541496292,
"logits/chosen": 0.3625122308731079,
"logits/rejected": 1.0217641592025757,
"logps/chosen": -140.2364959716797,
"logps/rejected": -147.66769409179688,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.462085723876953,
"rewards/margins": 8.511833190917969,
"rewards/rejected": -6.049746990203857,
"step": 49
},
{
"epoch": 0.33402922755741127,
"grad_norm": 0.012138652615249157,
"learning_rate": 0.00015511110272777733,
"logits/chosen": 0.5160237550735474,
"logits/rejected": 0.9644973874092102,
"logps/chosen": -168.16012573242188,
"logps/rejected": -173.67738342285156,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5186946392059326,
"rewards/margins": 8.781152725219727,
"rewards/rejected": -6.262458801269531,
"step": 50
},
{
"epoch": 0.3407098121085595,
"grad_norm": 0.013811074197292328,
"learning_rate": 0.00015491570992834155,
"logits/chosen": 0.45710933208465576,
"logits/rejected": 0.9616823196411133,
"logps/chosen": -134.65382385253906,
"logps/rejected": -165.4757537841797,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.9232048988342285,
"rewards/margins": 8.802949905395508,
"rewards/rejected": -5.879745006561279,
"step": 51
},
{
"epoch": 0.3473903966597077,
"grad_norm": 0.003540891222655773,
"learning_rate": 0.00015471661666808116,
"logits/chosen": 0.5379828214645386,
"logits/rejected": 0.9311552047729492,
"logps/chosen": -166.2515869140625,
"logps/rejected": -226.817626953125,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7226719856262207,
"rewards/margins": 9.967523574829102,
"rewards/rejected": -7.244852542877197,
"step": 52
},
{
"epoch": 0.35407098121085595,
"grad_norm": 0.046603377908468246,
"learning_rate": 0.00015451383278120595,
"logits/chosen": 0.482962429523468,
"logits/rejected": 0.8294156789779663,
"logps/chosen": -173.58953857421875,
"logps/rejected": -164.0615692138672,
"loss": 0.0038,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1282169818878174,
"rewards/margins": 8.867436408996582,
"rewards/rejected": -5.739219665527344,
"step": 53
},
{
"epoch": 0.3607515657620042,
"grad_norm": 0.02518213540315628,
"learning_rate": 0.00015430736828422423,
"logits/chosen": 0.2752196192741394,
"logits/rejected": 1.1645457744598389,
"logps/chosen": -199.2365264892578,
"logps/rejected": -145.12139892578125,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.693560838699341,
"rewards/margins": 8.52237319946289,
"rewards/rejected": -5.828812599182129,
"step": 54
},
{
"epoch": 0.3674321503131524,
"grad_norm": 0.043834567070007324,
"learning_rate": 0.00015409723337544802,
"logits/chosen": 0.4665435254573822,
"logits/rejected": 0.968167781829834,
"logps/chosen": -175.5590362548828,
"logps/rejected": -201.70126342773438,
"loss": 0.0038,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8582777976989746,
"rewards/margins": 9.58537769317627,
"rewards/rejected": -6.727100372314453,
"step": 55
},
{
"epoch": 0.37411273486430063,
"grad_norm": 0.02259964495897293,
"learning_rate": 0.0001538834384344892,
"logits/chosen": 0.26438379287719727,
"logits/rejected": 0.9069167375564575,
"logps/chosen": -190.77597045898438,
"logps/rejected": -203.91160583496094,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2503395080566406,
"rewards/margins": 9.630882263183594,
"rewards/rejected": -6.380542278289795,
"step": 56
},
{
"epoch": 0.38079331941544886,
"grad_norm": 0.00614876439794898,
"learning_rate": 0.00015366599402174703,
"logits/chosen": 0.19551321864128113,
"logits/rejected": 1.087761640548706,
"logps/chosen": -226.3440399169922,
"logps/rejected": -211.6017608642578,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.834458112716675,
"rewards/margins": 9.265405654907227,
"rewards/rejected": -6.430947303771973,
"step": 57
},
{
"epoch": 0.3874739039665971,
"grad_norm": 0.0061728451400995255,
"learning_rate": 0.00015344491087788633,
"logits/chosen": 0.3328951299190521,
"logits/rejected": 0.9735016822814941,
"logps/chosen": -172.78604125976562,
"logps/rejected": -179.15667724609375,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.01570725440979,
"rewards/margins": 9.582061767578125,
"rewards/rejected": -6.566355228424072,
"step": 58
},
{
"epoch": 0.3941544885177453,
"grad_norm": 0.0012235452886670828,
"learning_rate": 0.00015322019992330702,
"logits/chosen": 0.4755726158618927,
"logits/rejected": 0.8112475872039795,
"logps/chosen": -154.75039672851562,
"logps/rejected": -201.81065368652344,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7610487937927246,
"rewards/margins": 9.628297805786133,
"rewards/rejected": -6.86724853515625,
"step": 59
},
{
"epoch": 0.40083507306889354,
"grad_norm": 0.027884479612112045,
"learning_rate": 0.00015299187225760469,
"logits/chosen": 0.17077681422233582,
"logits/rejected": 0.9662021398544312,
"logps/chosen": -171.18963623046875,
"logps/rejected": -194.4243927001953,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.726745128631592,
"rewards/margins": 8.980852127075195,
"rewards/rejected": -6.254107475280762,
"step": 60
},
{
"epoch": 0.40751565762004177,
"grad_norm": 0.012179279699921608,
"learning_rate": 0.00015275993915902234,
"logits/chosen": 0.3292789161205292,
"logits/rejected": 0.7503560185432434,
"logps/chosen": -175.94473266601562,
"logps/rejected": -210.44918823242188,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.879286289215088,
"rewards/margins": 9.31381607055664,
"rewards/rejected": -6.434530258178711,
"step": 61
},
{
"epoch": 0.41419624217119,
"grad_norm": 0.0661226138472557,
"learning_rate": 0.00015252441208389334,
"logits/chosen": 0.48564329743385315,
"logits/rejected": 0.6276620030403137,
"logps/chosen": -124.23971557617188,
"logps/rejected": -221.02198791503906,
"loss": 0.0085,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.9825873374938965,
"rewards/margins": 9.826833724975586,
"rewards/rejected": -6.844245910644531,
"step": 62
},
{
"epoch": 0.4208768267223382,
"grad_norm": 0.004125866107642651,
"learning_rate": 0.00015228530266607547,
"logits/chosen": 0.27578750252723694,
"logits/rejected": 1.1573642492294312,
"logps/chosen": -197.88607788085938,
"logps/rejected": -168.08474731445312,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4376907348632812,
"rewards/margins": 9.881976127624512,
"rewards/rejected": -6.444284915924072,
"step": 63
},
{
"epoch": 0.42755741127348645,
"grad_norm": 0.003275892697274685,
"learning_rate": 0.00015204262271637626,
"logits/chosen": 0.6008961796760559,
"logits/rejected": 1.0901458263397217,
"logps/chosen": -185.306396484375,
"logps/rejected": -202.59896850585938,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3544652462005615,
"rewards/margins": 9.659427642822266,
"rewards/rejected": -6.304962158203125,
"step": 64
},
{
"epoch": 0.4342379958246347,
"grad_norm": 0.006054996512830257,
"learning_rate": 0.00015179638422196966,
"logits/chosen": 0.6647149324417114,
"logits/rejected": 1.1419864892959595,
"logps/chosen": -122.66973876953125,
"logps/rejected": -164.39332580566406,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.812810182571411,
"rewards/margins": 8.833253860473633,
"rewards/rejected": -6.020443439483643,
"step": 65
},
{
"epoch": 0.44091858037578285,
"grad_norm": 0.10174047201871872,
"learning_rate": 0.00015154659934580396,
"logits/chosen": 0.3900471329689026,
"logits/rejected": 0.7904311418533325,
"logps/chosen": -191.52008056640625,
"logps/rejected": -219.46322631835938,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0001416206359863,
"rewards/margins": 9.584066390991211,
"rewards/rejected": -6.583923816680908,
"step": 66
},
{
"epoch": 0.4475991649269311,
"grad_norm": 0.024550337344408035,
"learning_rate": 0.00015129328042600085,
"logits/chosen": 0.33518272638320923,
"logits/rejected": 0.980431079864502,
"logps/chosen": -177.56134033203125,
"logps/rejected": -182.8128662109375,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8862996101379395,
"rewards/margins": 10.160045623779297,
"rewards/rejected": -7.273746490478516,
"step": 67
},
{
"epoch": 0.4542797494780793,
"grad_norm": 0.003426956245675683,
"learning_rate": 0.00015103643997524613,
"logits/chosen": 0.4200201630592346,
"logits/rejected": 0.9040957689285278,
"logps/chosen": -223.8280487060547,
"logps/rejected": -291.58636474609375,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0368287563323975,
"rewards/margins": 10.13497543334961,
"rewards/rejected": -7.098146438598633,
"step": 68
},
{
"epoch": 0.46096033402922754,
"grad_norm": 0.003953616600483656,
"learning_rate": 0.00015077609068017158,
"logits/chosen": 0.6471998691558838,
"logits/rejected": 1.0458431243896484,
"logps/chosen": -170.61874389648438,
"logps/rejected": -188.6359100341797,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.938751220703125,
"rewards/margins": 9.758254051208496,
"rewards/rejected": -6.819502830505371,
"step": 69
},
{
"epoch": 0.46764091858037576,
"grad_norm": 0.004953494295477867,
"learning_rate": 0.00015051224540072833,
"logits/chosen": 0.43476998805999756,
"logits/rejected": 0.8195680379867554,
"logps/chosen": -158.63465881347656,
"logps/rejected": -213.90623474121094,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.9456708431243896,
"rewards/margins": 9.627756118774414,
"rewards/rejected": -6.68208646774292,
"step": 70
},
{
"epoch": 0.474321503131524,
"grad_norm": 0.017343413084745407,
"learning_rate": 0.00015024491716955155,
"logits/chosen": 0.5195021629333496,
"logits/rejected": 0.8773779273033142,
"logps/chosen": -136.5732879638672,
"logps/rejected": -194.87118530273438,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0906143188476562,
"rewards/margins": 9.986026763916016,
"rewards/rejected": -6.895412445068359,
"step": 71
},
{
"epoch": 0.4810020876826722,
"grad_norm": 0.008785056881606579,
"learning_rate": 0.00014997411919131688,
"logits/chosen": 0.5617407560348511,
"logits/rejected": 0.8960329294204712,
"logps/chosen": -163.903076171875,
"logps/rejected": -235.6993408203125,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1653547286987305,
"rewards/margins": 9.831454277038574,
"rewards/rejected": -6.666099548339844,
"step": 72
},
{
"epoch": 0.48768267223382045,
"grad_norm": 0.008580000139772892,
"learning_rate": 0.00014969986484208804,
"logits/chosen": 0.44829118251800537,
"logits/rejected": 0.797157347202301,
"logps/chosen": -136.55909729003906,
"logps/rejected": -190.699951171875,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.086763381958008,
"rewards/margins": 9.880305290222168,
"rewards/rejected": -6.79354190826416,
"step": 73
},
{
"epoch": 0.4943632567849687,
"grad_norm": 0.0018130596727132797,
"learning_rate": 0.0001494221676686562,
"logits/chosen": 0.3543122410774231,
"logits/rejected": 0.8907362222671509,
"logps/chosen": -172.5308837890625,
"logps/rejected": -191.48504638671875,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.289935350418091,
"rewards/margins": 9.893710136413574,
"rewards/rejected": -6.603774547576904,
"step": 74
},
{
"epoch": 0.5010438413361169,
"grad_norm": 0.009158944711089134,
"learning_rate": 0.00014914104138787066,
"logits/chosen": 0.2868165373802185,
"logits/rejected": 1.0966495275497437,
"logps/chosen": -196.56903076171875,
"logps/rejected": -199.89080810546875,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.9512715339660645,
"rewards/margins": 9.58918571472168,
"rewards/rejected": -6.637913703918457,
"step": 75
},
{
"epoch": 0.5077244258872652,
"grad_norm": 0.07474283874034882,
"learning_rate": 0.0001488564998859617,
"logits/chosen": 0.3425526022911072,
"logits/rejected": 0.9148508310317993,
"logps/chosen": -136.7733917236328,
"logps/rejected": -164.56271362304688,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.152519941329956,
"rewards/margins": 9.47095775604248,
"rewards/rejected": -6.318437576293945,
"step": 76
},
{
"epoch": 0.5144050104384134,
"grad_norm": 0.009138119406998158,
"learning_rate": 0.00014856855721785415,
"logits/chosen": 0.40808552503585815,
"logits/rejected": 1.0737789869308472,
"logps/chosen": -209.9980926513672,
"logps/rejected": -198.7295379638672,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3080811500549316,
"rewards/margins": 10.055679321289062,
"rewards/rejected": -6.747599124908447,
"step": 77
},
{
"epoch": 0.5210855949895616,
"grad_norm": 0.014706511981785297,
"learning_rate": 0.0001482772276064736,
"logits/chosen": 0.3385770916938782,
"logits/rejected": 0.9576174020767212,
"logps/chosen": -195.38328552246094,
"logps/rejected": -244.77413940429688,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3216552734375,
"rewards/margins": 10.63253402709961,
"rewards/rejected": -7.310879230499268,
"step": 78
},
{
"epoch": 0.5277661795407098,
"grad_norm": 0.08255796134471893,
"learning_rate": 0.00014798252544204361,
"logits/chosen": 0.6491683125495911,
"logits/rejected": 0.956652045249939,
"logps/chosen": -123.93269348144531,
"logps/rejected": -199.91995239257812,
"loss": 0.0024,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1648573875427246,
"rewards/margins": 10.019696235656738,
"rewards/rejected": -6.854838848114014,
"step": 79
},
{
"epoch": 0.534446764091858,
"grad_norm": 0.007771719712764025,
"learning_rate": 0.00014768446528137493,
"logits/chosen": 0.4897231459617615,
"logits/rejected": 1.0199142694473267,
"logps/chosen": -155.5507049560547,
"logps/rejected": -182.4271697998047,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8543262481689453,
"rewards/margins": 9.827340126037598,
"rewards/rejected": -6.9730143547058105,
"step": 80
},
{
"epoch": 0.5411273486430063,
"grad_norm": 0.0030905655585229397,
"learning_rate": 0.00014738306184714658,
"logits/chosen": 0.38616618514060974,
"logits/rejected": 1.0583640336990356,
"logps/chosen": -181.91766357421875,
"logps/rejected": -216.1337890625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.401458501815796,
"rewards/margins": 10.563034057617188,
"rewards/rejected": -7.1615753173828125,
"step": 81
},
{
"epoch": 0.5478079331941544,
"grad_norm": 0.0021012630313634872,
"learning_rate": 0.0001470783300271785,
"logits/chosen": 0.2328442931175232,
"logits/rejected": 0.9704186916351318,
"logps/chosen": -179.87081909179688,
"logps/rejected": -180.01779174804688,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.238947868347168,
"rewards/margins": 9.60936164855957,
"rewards/rejected": -6.370413780212402,
"step": 82
},
{
"epoch": 0.5544885177453027,
"grad_norm": 0.04149683937430382,
"learning_rate": 0.0001467702848736962,
"logits/chosen": 0.34571874141693115,
"logits/rejected": 0.7980022430419922,
"logps/chosen": -121.0694580078125,
"logps/rejected": -168.23104858398438,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.338881254196167,
"rewards/margins": 10.039016723632812,
"rewards/rejected": -6.700134754180908,
"step": 83
},
{
"epoch": 0.5611691022964509,
"grad_norm": 0.019571220502257347,
"learning_rate": 0.0001464589416025873,
"logits/chosen": 0.46891170740127563,
"logits/rejected": 0.9267268180847168,
"logps/chosen": -137.26438903808594,
"logps/rejected": -200.45758056640625,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.055349349975586,
"rewards/margins": 10.106170654296875,
"rewards/rejected": -7.050821304321289,
"step": 84
},
{
"epoch": 0.5678496868475992,
"grad_norm": 0.003243156708776951,
"learning_rate": 0.00014614431559264993,
"logits/chosen": 0.5420711636543274,
"logits/rejected": 0.8642836213111877,
"logps/chosen": -153.92691040039062,
"logps/rejected": -197.93504333496094,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.184312105178833,
"rewards/margins": 10.237167358398438,
"rewards/rejected": -7.052854537963867,
"step": 85
},
{
"epoch": 0.5745302713987473,
"grad_norm": 0.0025193586479872465,
"learning_rate": 0.00014582642238483302,
"logits/chosen": 0.39913490414619446,
"logits/rejected": 1.0773463249206543,
"logps/chosen": -165.12124633789062,
"logps/rejected": -179.71060180664062,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0881643295288086,
"rewards/margins": 10.198201179504395,
"rewards/rejected": -7.110036849975586,
"step": 86
},
{
"epoch": 0.5812108559498956,
"grad_norm": 0.0010882590431720018,
"learning_rate": 0.00014550527768146876,
"logits/chosen": 0.26313066482543945,
"logits/rejected": 1.0774075984954834,
"logps/chosen": -205.03379821777344,
"logps/rejected": -198.33372497558594,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.51377272605896,
"rewards/margins": 10.64950180053711,
"rewards/rejected": -7.13572883605957,
"step": 87
},
{
"epoch": 0.5878914405010438,
"grad_norm": 0.0017716643633320928,
"learning_rate": 0.0001451808973454969,
"logits/chosen": 0.45582157373428345,
"logits/rejected": 1.0374003648757935,
"logps/chosen": -170.92657470703125,
"logps/rejected": -203.78793334960938,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.345822811126709,
"rewards/margins": 10.260464668273926,
"rewards/rejected": -6.914642810821533,
"step": 88
},
{
"epoch": 0.5945720250521921,
"grad_norm": 0.0024310583248734474,
"learning_rate": 0.0001448532973996812,
"logits/chosen": 0.2980806827545166,
"logits/rejected": 1.233319878578186,
"logps/chosen": -184.93099975585938,
"logps/rejected": -159.75274658203125,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6145825386047363,
"rewards/margins": 9.834606170654297,
"rewards/rejected": -6.220023155212402,
"step": 89
},
{
"epoch": 0.6012526096033403,
"grad_norm": 0.0589352585375309,
"learning_rate": 0.00014452249402581818,
"logits/chosen": 0.43372777104377747,
"logits/rejected": 1.0277624130249023,
"logps/chosen": -192.56484985351562,
"logps/rejected": -211.60171508789062,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5561540126800537,
"rewards/margins": 10.808131217956543,
"rewards/rejected": -7.251977920532227,
"step": 90
},
{
"epoch": 0.6079331941544885,
"grad_norm": 0.0015669305576011539,
"learning_rate": 0.00014418850356393744,
"logits/chosen": 0.4860686659812927,
"logits/rejected": 0.8380767107009888,
"logps/chosen": -135.60870361328125,
"logps/rejected": -176.65524291992188,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0018136501312256,
"rewards/margins": 9.445623397827148,
"rewards/rejected": -6.443809509277344,
"step": 91
},
{
"epoch": 0.6146137787056367,
"grad_norm": 0.0015854910016059875,
"learning_rate": 0.0001438513425114949,
"logits/chosen": 0.24818000197410583,
"logits/rejected": 1.0943326950073242,
"logps/chosen": -241.65411376953125,
"logps/rejected": -227.60733032226562,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6303694248199463,
"rewards/margins": 10.740053176879883,
"rewards/rejected": -7.109683990478516,
"step": 92
},
{
"epoch": 0.621294363256785,
"grad_norm": 0.06920412927865982,
"learning_rate": 0.00014351102752255778,
"logits/chosen": 0.4860806465148926,
"logits/rejected": 0.9092923402786255,
"logps/chosen": -194.13475036621094,
"logps/rejected": -225.56948852539062,
"loss": 0.0041,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6053314208984375,
"rewards/margins": 10.585766792297363,
"rewards/rejected": -6.980436325073242,
"step": 93
},
{
"epoch": 0.6279749478079332,
"grad_norm": 0.08025512099266052,
"learning_rate": 0.00014316757540698186,
"logits/chosen": 0.7010557651519775,
"logits/rejected": 0.9352781772613525,
"logps/chosen": -147.06973266601562,
"logps/rejected": -196.08779907226562,
"loss": 0.0038,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.9433255195617676,
"rewards/margins": 9.722973823547363,
"rewards/rejected": -6.779648303985596,
"step": 94
},
{
"epoch": 0.6346555323590815,
"grad_norm": 0.008386881090700626,
"learning_rate": 0.0001428210031295814,
"logits/chosen": 0.6669158339500427,
"logits/rejected": 1.0123670101165771,
"logps/chosen": -142.99716186523438,
"logps/rejected": -229.2365264892578,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2911269664764404,
"rewards/margins": 10.677725791931152,
"rewards/rejected": -7.386599063873291,
"step": 95
},
{
"epoch": 0.6413361169102296,
"grad_norm": 0.01323284488171339,
"learning_rate": 0.00014247132780929091,
"logits/chosen": 0.5180701613426208,
"logits/rejected": 1.1706314086914062,
"logps/chosen": -155.06080627441406,
"logps/rejected": -180.8107452392578,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.032214641571045,
"rewards/margins": 10.255094528198242,
"rewards/rejected": -7.2228803634643555,
"step": 96
},
{
"epoch": 0.6480167014613779,
"grad_norm": 0.006948838476091623,
"learning_rate": 0.00014211856671831973,
"logits/chosen": 0.4092825651168823,
"logits/rejected": 0.9364159107208252,
"logps/chosen": -182.318603515625,
"logps/rejected": -217.2329864501953,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8276522159576416,
"rewards/margins": 10.5891695022583,
"rewards/rejected": -7.76151704788208,
"step": 97
},
{
"epoch": 0.6546972860125261,
"grad_norm": 0.006203535012900829,
"learning_rate": 0.00014176273728129879,
"logits/chosen": 0.38271790742874146,
"logits/rejected": 0.9502532482147217,
"logps/chosen": -154.72279357910156,
"logps/rejected": -199.6136016845703,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.429482936859131,
"rewards/margins": 10.404837608337402,
"rewards/rejected": -6.9753546714782715,
"step": 98
},
{
"epoch": 0.6613778705636744,
"grad_norm": 0.13446320593357086,
"learning_rate": 0.00014140385707442002,
"logits/chosen": 0.5989990234375,
"logits/rejected": 0.9662174582481384,
"logps/chosen": -157.0098114013672,
"logps/rejected": -225.11416625976562,
"loss": 0.0122,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3204352855682373,
"rewards/margins": 10.797783851623535,
"rewards/rejected": -7.477348804473877,
"step": 99
},
{
"epoch": 0.6680584551148225,
"grad_norm": 0.01066310703754425,
"learning_rate": 0.000141041943824568,
"logits/chosen": 0.4070190191268921,
"logits/rejected": 0.9769365191459656,
"logps/chosen": -178.2224884033203,
"logps/rejected": -192.3170928955078,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1867244243621826,
"rewards/margins": 10.239897727966309,
"rewards/rejected": -7.053173065185547,
"step": 100
},
{
"epoch": 0.6747390396659708,
"grad_norm": 0.004759893286973238,
"learning_rate": 0.00014067701540844443,
"logits/chosen": 0.38733264803886414,
"logits/rejected": 0.9725967049598694,
"logps/chosen": -163.582763671875,
"logps/rejected": -200.43734741210938,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2934625148773193,
"rewards/margins": 10.841292381286621,
"rewards/rejected": -7.547830104827881,
"step": 101
},
{
"epoch": 0.681419624217119,
"grad_norm": 0.06473153084516525,
"learning_rate": 0.00014030908985168528,
"logits/chosen": 0.4140828251838684,
"logits/rejected": 1.0431495904922485,
"logps/chosen": -152.635986328125,
"logps/rejected": -180.54466247558594,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3867475986480713,
"rewards/margins": 10.397819519042969,
"rewards/rejected": -7.011071681976318,
"step": 102
},
{
"epoch": 0.6881002087682673,
"grad_norm": 0.041281044483184814,
"learning_rate": 0.00013993818532797008,
"logits/chosen": 0.5268477201461792,
"logits/rejected": 0.9111321568489075,
"logps/chosen": -168.5952911376953,
"logps/rejected": -216.87393188476562,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4760243892669678,
"rewards/margins": 10.898151397705078,
"rewards/rejected": -7.422128677368164,
"step": 103
},
{
"epoch": 0.6947807933194154,
"grad_norm": 0.025793112814426422,
"learning_rate": 0.0001395643201581245,
"logits/chosen": 0.5013325214385986,
"logits/rejected": 0.8294863700866699,
"logps/chosen": -146.08856201171875,
"logps/rejected": -211.77963256835938,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.200756311416626,
"rewards/margins": 10.40932559967041,
"rewards/rejected": -7.208569526672363,
"step": 104
},
{
"epoch": 0.7014613778705637,
"grad_norm": 0.0016948337433859706,
"learning_rate": 0.00013918751280921527,
"logits/chosen": 0.3914889395236969,
"logits/rejected": 0.7578558325767517,
"logps/chosen": -137.25982666015625,
"logps/rejected": -199.191650390625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.24421763420105,
"rewards/margins": 10.255487442016602,
"rewards/rejected": -7.0112690925598145,
"step": 105
},
{
"epoch": 0.7081419624217119,
"grad_norm": 0.0030184737406671047,
"learning_rate": 0.000138807781893638,
"logits/chosen": 0.47848889231681824,
"logits/rejected": 1.006292462348938,
"logps/chosen": -141.8809814453125,
"logps/rejected": -167.41851806640625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.821171998977661,
"rewards/margins": 10.16208553314209,
"rewards/rejected": -7.34091329574585,
"step": 106
},
{
"epoch": 0.7148225469728601,
"grad_norm": 0.0008359827334061265,
"learning_rate": 0.00013842514616819795,
"logits/chosen": 0.553492546081543,
"logits/rejected": 1.0241787433624268,
"logps/chosen": -167.76641845703125,
"logps/rejected": -223.5001678466797,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6072874069213867,
"rewards/margins": 11.082305908203125,
"rewards/rejected": -7.475017070770264,
"step": 107
},
{
"epoch": 0.7215031315240084,
"grad_norm": 0.0023584573063999414,
"learning_rate": 0.00013803962453318332,
"logits/chosen": 0.4638338088989258,
"logits/rejected": 1.1057603359222412,
"logps/chosen": -137.7595672607422,
"logps/rejected": -139.77247619628906,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4429550170898438,
"rewards/margins": 10.039093017578125,
"rewards/rejected": -6.596138000488281,
"step": 108
},
{
"epoch": 0.7281837160751565,
"grad_norm": 0.0010937312617897987,
"learning_rate": 0.00013765123603143187,
"logits/chosen": 0.5443668365478516,
"logits/rejected": 1.1449006795883179,
"logps/chosen": -195.05966186523438,
"logps/rejected": -221.8257598876953,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.427518606185913,
"rewards/margins": 10.773955345153809,
"rewards/rejected": -7.346436023712158,
"step": 109
},
{
"epoch": 0.7348643006263048,
"grad_norm": 0.005583143327385187,
"learning_rate": 0.00013725999984739014,
"logits/chosen": 0.5298680067062378,
"logits/rejected": 0.9729467034339905,
"logps/chosen": -179.80484008789062,
"logps/rejected": -245.03363037109375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2088518142700195,
"rewards/margins": 10.993154525756836,
"rewards/rejected": -7.784302711486816,
"step": 110
},
{
"epoch": 0.741544885177453,
"grad_norm": 0.002138708718121052,
"learning_rate": 0.000136865935306166,
"logits/chosen": 0.3394404649734497,
"logits/rejected": 1.089263916015625,
"logps/chosen": -182.08953857421875,
"logps/rejected": -191.70120239257812,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5019545555114746,
"rewards/margins": 10.58714485168457,
"rewards/rejected": -7.0851898193359375,
"step": 111
},
{
"epoch": 0.7482254697286013,
"grad_norm": 0.0015473664971068501,
"learning_rate": 0.00013646906187257392,
"logits/chosen": 0.2853686213493347,
"logits/rejected": 1.1013542413711548,
"logps/chosen": -190.174560546875,
"logps/rejected": -198.12901306152344,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5565602779388428,
"rewards/margins": 11.150199890136719,
"rewards/rejected": -7.593639373779297,
"step": 112
},
{
"epoch": 0.7549060542797494,
"grad_norm": 0.0007681334391236305,
"learning_rate": 0.00013606939915017366,
"logits/chosen": 0.5230981111526489,
"logits/rejected": 1.0380409955978394,
"logps/chosen": -145.13079833984375,
"logps/rejected": -166.50582885742188,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.462148427963257,
"rewards/margins": 10.77736759185791,
"rewards/rejected": -7.315218925476074,
"step": 113
},
{
"epoch": 0.7615866388308977,
"grad_norm": 0.0014410597505047917,
"learning_rate": 0.00013566696688030176,
"logits/chosen": 0.21239212155342102,
"logits/rejected": 0.8984243869781494,
"logps/chosen": -207.76397705078125,
"logps/rejected": -216.22067260742188,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6719069480895996,
"rewards/margins": 11.330296516418457,
"rewards/rejected": -7.658390045166016,
"step": 114
},
{
"epoch": 0.7682672233820459,
"grad_norm": 0.006789306178689003,
"learning_rate": 0.00013526178494109668,
"logits/chosen": 0.433882474899292,
"logits/rejected": 1.0958993434906006,
"logps/chosen": -198.0896453857422,
"logps/rejected": -203.357666015625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.284191370010376,
"rewards/margins": 11.191015243530273,
"rewards/rejected": -7.906824111938477,
"step": 115
},
{
"epoch": 0.7749478079331942,
"grad_norm": 0.0014478856464847922,
"learning_rate": 0.00013485387334651668,
"logits/chosen": 0.32002153992652893,
"logits/rejected": 1.0241080522537231,
"logps/chosen": -151.71717834472656,
"logps/rejected": -176.21511840820312,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.194765567779541,
"rewards/margins": 10.118311882019043,
"rewards/rejected": -6.923546314239502,
"step": 116
},
{
"epoch": 0.7816283924843423,
"grad_norm": 0.008634977042675018,
"learning_rate": 0.00013444325224535127,
"logits/chosen": 0.43800267577171326,
"logits/rejected": 0.780940592288971,
"logps/chosen": -107.12379455566406,
"logps/rejected": -186.25064086914062,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.841928243637085,
"rewards/margins": 10.173989295959473,
"rewards/rejected": -7.332061290740967,
"step": 117
},
{
"epoch": 0.7883089770354906,
"grad_norm": 0.0018734281184151769,
"learning_rate": 0.00013402994192022622,
"logits/chosen": 0.4912789762020111,
"logits/rejected": 0.8206208944320679,
"logps/chosen": -147.78268432617188,
"logps/rejected": -207.8667755126953,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.988457202911377,
"rewards/margins": 10.380359649658203,
"rewards/rejected": -7.391902446746826,
"step": 118
},
{
"epoch": 0.7949895615866388,
"grad_norm": 0.0015072772512212396,
"learning_rate": 0.00013361396278660124,
"logits/chosen": 0.3757545053958893,
"logits/rejected": 1.0709331035614014,
"logps/chosen": -150.7517852783203,
"logps/rejected": -183.0996551513672,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7918334007263184,
"rewards/margins": 10.951468467712402,
"rewards/rejected": -7.159633636474609,
"step": 119
},
{
"epoch": 0.8016701461377871,
"grad_norm": 0.001006263424642384,
"learning_rate": 0.00013319533539176199,
"logits/chosen": 0.6779341697692871,
"logits/rejected": 0.8323581218719482,
"logps/chosen": -162.09359741210938,
"logps/rejected": -241.7459259033203,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6300947666168213,
"rewards/margins": 11.384780883789062,
"rewards/rejected": -7.7546868324279785,
"step": 120
},
{
"epoch": 0.8083507306889353,
"grad_norm": 0.0022658442612737417,
"learning_rate": 0.00013277408041380487,
"logits/chosen": 0.7190150022506714,
"logits/rejected": 1.058363676071167,
"logps/chosen": -142.59063720703125,
"logps/rejected": -214.18865966796875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.351846218109131,
"rewards/margins": 10.205248832702637,
"rewards/rejected": -6.853403091430664,
"step": 121
},
{
"epoch": 0.8150313152400835,
"grad_norm": 0.0006775332149118185,
"learning_rate": 0.0001323502186606158,
"logits/chosen": 0.46084725856781006,
"logits/rejected": 1.1550092697143555,
"logps/chosen": -164.25515747070312,
"logps/rejected": -191.22218322753906,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5589873790740967,
"rewards/margins": 11.493250846862793,
"rewards/rejected": -7.934264183044434,
"step": 122
},
{
"epoch": 0.8217118997912317,
"grad_norm": 0.0040510594844818115,
"learning_rate": 0.0001319237710688423,
"logits/chosen": 0.19903920590877533,
"logits/rejected": 1.0057140588760376,
"logps/chosen": -189.11306762695312,
"logps/rejected": -211.69046020507812,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7651314735412598,
"rewards/margins": 11.223003387451172,
"rewards/rejected": -7.457871437072754,
"step": 123
},
{
"epoch": 0.82839248434238,
"grad_norm": 0.04469997435808182,
"learning_rate": 0.00013149475870285934,
"logits/chosen": 0.5379782319068909,
"logits/rejected": 1.0742489099502563,
"logps/chosen": -186.49488830566406,
"logps/rejected": -206.31185913085938,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.179280996322632,
"rewards/margins": 10.138044357299805,
"rewards/rejected": -6.9587626457214355,
"step": 124
},
{
"epoch": 0.8350730688935282,
"grad_norm": 0.0012717200443148613,
"learning_rate": 0.00013106320275372893,
"logits/chosen": 0.5852510333061218,
"logits/rejected": 1.0125198364257812,
"logps/chosen": -156.62803649902344,
"logps/rejected": -211.2324981689453,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8485283851623535,
"rewards/margins": 11.504820823669434,
"rewards/rejected": -7.6562933921813965,
"step": 125
},
{
"epoch": 0.8417536534446765,
"grad_norm": 0.01169919315725565,
"learning_rate": 0.00013062912453815336,
"logits/chosen": 0.6571882367134094,
"logits/rejected": 0.9026492238044739,
"logps/chosen": -148.38502502441406,
"logps/rejected": -236.60284423828125,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.63680362701416,
"rewards/margins": 11.450029373168945,
"rewards/rejected": -7.813226699829102,
"step": 126
},
{
"epoch": 0.8484342379958246,
"grad_norm": 0.0030007236637175083,
"learning_rate": 0.00013019254549742217,
"logits/chosen": 0.45202672481536865,
"logits/rejected": 1.183500051498413,
"logps/chosen": -163.0814971923828,
"logps/rejected": -198.718017578125,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5360257625579834,
"rewards/margins": 10.435393333435059,
"rewards/rejected": -6.899367809295654,
"step": 127
},
{
"epoch": 0.8551148225469729,
"grad_norm": 0.0010185908759012818,
"learning_rate": 0.00012975348719635322,
"logits/chosen": 0.6836375594139099,
"logits/rejected": 1.2013335227966309,
"logps/chosen": -171.42530822753906,
"logps/rejected": -199.51210021972656,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4464898109436035,
"rewards/margins": 11.003561019897461,
"rewards/rejected": -7.557069778442383,
"step": 128
},
{
"epoch": 0.8617954070981211,
"grad_norm": 0.0015769846504554152,
"learning_rate": 0.00012931197132222738,
"logits/chosen": 0.23104141652584076,
"logits/rejected": 1.1797983646392822,
"logps/chosen": -193.76705932617188,
"logps/rejected": -194.70799255371094,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.822854518890381,
"rewards/margins": 11.82970905303955,
"rewards/rejected": -8.006855010986328,
"step": 129
},
{
"epoch": 0.8684759916492694,
"grad_norm": 0.0023490425664931536,
"learning_rate": 0.00012886801968371733,
"logits/chosen": 0.9613852500915527,
"logits/rejected": 1.0451734066009521,
"logps/chosen": -153.423828125,
"logps/rejected": -238.90798950195312,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0533547401428223,
"rewards/margins": 11.23104190826416,
"rewards/rejected": -8.177685737609863,
"step": 130
},
{
"epoch": 0.8751565762004175,
"grad_norm": 0.007329499814659357,
"learning_rate": 0.00012842165420981028,
"logits/chosen": 0.3598926365375519,
"logits/rejected": 1.2872174978256226,
"logps/chosen": -184.6826629638672,
"logps/rejected": -160.8395233154297,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.779690980911255,
"rewards/margins": 10.888734817504883,
"rewards/rejected": -7.109042644500732,
"step": 131
},
{
"epoch": 0.8818371607515657,
"grad_norm": 0.0005828408757224679,
"learning_rate": 0.00012797289694872483,
"logits/chosen": 0.5109158158302307,
"logits/rejected": 1.1522945165634155,
"logps/chosen": -181.67935180664062,
"logps/rejected": -195.45596313476562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.14690899848938,
"rewards/margins": 10.518176078796387,
"rewards/rejected": -7.371266841888428,
"step": 132
},
{
"epoch": 0.888517745302714,
"grad_norm": 0.006235368084162474,
"learning_rate": 0.00012752177006682193,
"logits/chosen": 0.3775724470615387,
"logits/rejected": 0.9961695671081543,
"logps/chosen": -177.385009765625,
"logps/rejected": -198.3971405029297,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.827141046524048,
"rewards/margins": 11.361750602722168,
"rewards/rejected": -7.534610271453857,
"step": 133
},
{
"epoch": 0.8951983298538622,
"grad_norm": 0.006079867482185364,
"learning_rate": 0.00012706829584750989,
"logits/chosen": 0.7383792400360107,
"logits/rejected": 0.7893859148025513,
"logps/chosen": -103.3727798461914,
"logps/rejected": -196.89271545410156,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7119593620300293,
"rewards/margins": 10.615793228149414,
"rewards/rejected": -7.903831958770752,
"step": 134
},
{
"epoch": 0.9018789144050104,
"grad_norm": 0.0013690602499991655,
"learning_rate": 0.00012661249669014364,
"logits/chosen": 0.6606283187866211,
"logits/rejected": 0.932214081287384,
"logps/chosen": -162.7115936279297,
"logps/rejected": -225.05245971679688,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3914923667907715,
"rewards/margins": 11.141890525817871,
"rewards/rejected": -7.750399589538574,
"step": 135
},
{
"epoch": 0.9085594989561586,
"grad_norm": 0.00651137251406908,
"learning_rate": 0.0001261543951089186,
"logits/chosen": 0.31962257623672485,
"logits/rejected": 1.2235876321792603,
"logps/chosen": -223.0352325439453,
"logps/rejected": -209.3647003173828,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4260380268096924,
"rewards/margins": 10.789265632629395,
"rewards/rejected": -7.363227367401123,
"step": 136
},
{
"epoch": 0.9152400835073069,
"grad_norm": 0.0014856844209134579,
"learning_rate": 0.0001256940137317583,
"logits/chosen": 0.48557624220848083,
"logits/rejected": 0.7384285926818848,
"logps/chosen": -142.57444763183594,
"logps/rejected": -209.3195037841797,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1010584831237793,
"rewards/margins": 10.799681663513184,
"rewards/rejected": -7.6986236572265625,
"step": 137
},
{
"epoch": 0.9219206680584551,
"grad_norm": 0.004290735349059105,
"learning_rate": 0.00012523137529919673,
"logits/chosen": 0.49820947647094727,
"logits/rejected": 1.187336802482605,
"logps/chosen": -181.5707550048828,
"logps/rejected": -202.8837127685547,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.54292893409729,
"rewards/margins": 10.928933143615723,
"rewards/rejected": -7.3860039710998535,
"step": 138
},
{
"epoch": 0.9286012526096034,
"grad_norm": 0.003620475996285677,
"learning_rate": 0.00012476650266325513,
"logits/chosen": 0.25639641284942627,
"logits/rejected": 1.1532477140426636,
"logps/chosen": -194.07142639160156,
"logps/rejected": -179.5195770263672,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.796419620513916,
"rewards/margins": 10.853509902954102,
"rewards/rejected": -7.057089805603027,
"step": 139
},
{
"epoch": 0.9352818371607515,
"grad_norm": 0.007752254139631987,
"learning_rate": 0.00012429941878631324,
"logits/chosen": 0.5540122389793396,
"logits/rejected": 0.8194867372512817,
"logps/chosen": -103.79815673828125,
"logps/rejected": -197.13775634765625,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.879499912261963,
"rewards/margins": 11.494918823242188,
"rewards/rejected": -8.615418434143066,
"step": 140
},
{
"epoch": 0.9419624217118998,
"grad_norm": 0.0009465285111218691,
"learning_rate": 0.00012383014673997497,
"logits/chosen": 0.7288948893547058,
"logits/rejected": 0.9930503964424133,
"logps/chosen": -141.9916534423828,
"logps/rejected": -216.1266326904297,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5408124923706055,
"rewards/margins": 11.739241600036621,
"rewards/rejected": -8.1984281539917,
"step": 141
},
{
"epoch": 0.948643006263048,
"grad_norm": 0.0011045335559174418,
"learning_rate": 0.00012335870970392888,
"logits/chosen": 0.5019521117210388,
"logits/rejected": 1.1962378025054932,
"logps/chosen": -200.11459350585938,
"logps/rejected": -179.86489868164062,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4744818210601807,
"rewards/margins": 11.273345947265625,
"rewards/rejected": -7.798864841461182,
"step": 142
},
{
"epoch": 0.9553235908141963,
"grad_norm": 0.024925533682107925,
"learning_rate": 0.0001228851309648032,
"logits/chosen": 0.6946430802345276,
"logits/rejected": 0.8380041122436523,
"logps/chosen": -127.19657135009766,
"logps/rejected": -245.40231323242188,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5805623531341553,
"rewards/margins": 12.349920272827148,
"rewards/rejected": -8.769356727600098,
"step": 143
},
{
"epoch": 0.9620041753653444,
"grad_norm": 0.0006273960461840034,
"learning_rate": 0.0001224094339150155,
"logits/chosen": 0.32063305377960205,
"logits/rejected": 1.1136928796768188,
"logps/chosen": -182.1404266357422,
"logps/rejected": -192.31224060058594,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.556812047958374,
"rewards/margins": 11.931962966918945,
"rewards/rejected": -8.375152587890625,
"step": 144
},
{
"epoch": 0.9686847599164927,
"grad_norm": 0.013907409273087978,
"learning_rate": 0.0001219316420516173,
"logits/chosen": 0.3725297451019287,
"logits/rejected": 1.098531723022461,
"logps/chosen": -166.97886657714844,
"logps/rejected": -225.6486358642578,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5993576049804688,
"rewards/margins": 11.865777015686035,
"rewards/rejected": -8.266420364379883,
"step": 145
},
{
"epoch": 0.9753653444676409,
"grad_norm": 0.008556756190955639,
"learning_rate": 0.00012145177897513349,
"logits/chosen": 0.4221380949020386,
"logits/rejected": 1.1614536046981812,
"logps/chosen": -148.64877319335938,
"logps/rejected": -174.18954467773438,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.369095802307129,
"rewards/margins": 10.837663650512695,
"rewards/rejected": -7.468567371368408,
"step": 146
},
{
"epoch": 0.9820459290187892,
"grad_norm": 0.01212705485522747,
"learning_rate": 0.0001209698683883964,
"logits/chosen": 0.2313823103904724,
"logits/rejected": 1.3470351696014404,
"logps/chosen": -226.58926391601562,
"logps/rejected": -192.19192504882812,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.370995044708252,
"rewards/margins": 11.233190536499023,
"rewards/rejected": -7.8621954917907715,
"step": 147
},
{
"epoch": 0.9887265135699373,
"grad_norm": 0.0021595736034214497,
"learning_rate": 0.00012048593409537522,
"logits/chosen": 0.33942633867263794,
"logits/rejected": 1.0542570352554321,
"logps/chosen": -157.76954650878906,
"logps/rejected": -191.26931762695312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.679450273513794,
"rewards/margins": 11.785932540893555,
"rewards/rejected": -8.106481552124023,
"step": 148
},
{
"epoch": 0.9954070981210856,
"grad_norm": 0.011656565591692924,
"learning_rate": 0.00012000000000000002,
"logits/chosen": 0.5189492702484131,
"logits/rejected": 1.1787470579147339,
"logps/chosen": -188.0403289794922,
"logps/rejected": -217.3338165283203,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.895737648010254,
"rewards/margins": 11.713899612426758,
"rewards/rejected": -7.8181633949279785,
"step": 149
},
{
"epoch": 1.0020876826722338,
"grad_norm": 0.0011344770900905132,
"learning_rate": 0.00011951209010498108,
"logits/chosen": 0.4898693561553955,
"logits/rejected": 0.9766267538070679,
"logps/chosen": -173.29742431640625,
"logps/rejected": -209.09970092773438,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.360459089279175,
"rewards/margins": 10.934969902038574,
"rewards/rejected": -7.574510097503662,
"step": 150
},
{
"epoch": 1.008768267223382,
"grad_norm": 0.0007721176953054965,
"learning_rate": 0.0001190222285106234,
"logits/chosen": 0.40774810314178467,
"logits/rejected": 1.1688655614852905,
"logps/chosen": -172.59530639648438,
"logps/rejected": -188.3610076904297,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3076529502868652,
"rewards/margins": 11.588592529296875,
"rewards/rejected": -8.280941009521484,
"step": 151
},
{
"epoch": 1.0154488517745304,
"grad_norm": 0.0009479824802838266,
"learning_rate": 0.00011853043941363599,
"logits/chosen": 0.5603910684585571,
"logits/rejected": 1.2070255279541016,
"logps/chosen": -171.68991088867188,
"logps/rejected": -202.64581298828125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.053154706954956,
"rewards/margins": 11.318292617797852,
"rewards/rejected": -8.265138626098633,
"step": 152
},
{
"epoch": 1.0221294363256785,
"grad_norm": 0.0007491550641134381,
"learning_rate": 0.00011803674710593694,
"logits/chosen": 0.5270896553993225,
"logits/rejected": 0.9994916915893555,
"logps/chosen": -143.41436767578125,
"logps/rejected": -200.78506469726562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.390288829803467,
"rewards/margins": 11.925804138183594,
"rewards/rejected": -8.535514831542969,
"step": 153
},
{
"epoch": 1.0288100208768267,
"grad_norm": 0.0009609659318812191,
"learning_rate": 0.00011754117597345342,
"logits/chosen": 0.44007253646850586,
"logits/rejected": 1.2215617895126343,
"logps/chosen": -137.77011108398438,
"logps/rejected": -157.72262573242188,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.361788034439087,
"rewards/margins": 10.739198684692383,
"rewards/rejected": -7.377409934997559,
"step": 154
},
{
"epoch": 1.0354906054279749,
"grad_norm": 0.00042587798088788986,
"learning_rate": 0.00011704375049491706,
"logits/chosen": 0.5652292370796204,
"logits/rejected": 0.945173978805542,
"logps/chosen": -145.53587341308594,
"logps/rejected": -234.8934326171875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2489700317382812,
"rewards/margins": 11.937629699707031,
"rewards/rejected": -8.68865966796875,
"step": 155
},
{
"epoch": 1.0421711899791233,
"grad_norm": 0.0036174345295876265,
"learning_rate": 0.00011654449524065499,
"logits/chosen": 0.6072965860366821,
"logits/rejected": 0.9700977802276611,
"logps/chosen": -164.41070556640625,
"logps/rejected": -208.70692443847656,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4178385734558105,
"rewards/margins": 11.409514427185059,
"rewards/rejected": -7.991675853729248,
"step": 156
},
{
"epoch": 1.0488517745302715,
"grad_norm": 0.0013750857906416059,
"learning_rate": 0.00011604343487137601,
"logits/chosen": 0.4669913947582245,
"logits/rejected": 0.8744410276412964,
"logps/chosen": -136.6684112548828,
"logps/rejected": -221.99180603027344,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.308939218521118,
"rewards/margins": 11.851880073547363,
"rewards/rejected": -8.542940139770508,
"step": 157
},
{
"epoch": 1.0555323590814196,
"grad_norm": 0.0015664240345358849,
"learning_rate": 0.00011554059413695259,
"logits/chosen": 0.33473482728004456,
"logits/rejected": 0.9933146834373474,
"logps/chosen": -148.87991333007812,
"logps/rejected": -174.86378479003906,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.450471878051758,
"rewards/margins": 11.176468849182129,
"rewards/rejected": -7.725996494293213,
"step": 158
},
{
"epoch": 1.0622129436325678,
"grad_norm": 0.0005284909857437015,
"learning_rate": 0.00011503599787519838,
"logits/chosen": 0.5126962065696716,
"logits/rejected": 1.1356326341629028,
"logps/chosen": -137.66116333007812,
"logps/rejected": -207.24697875976562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.605351448059082,
"rewards/margins": 12.428322792053223,
"rewards/rejected": -8.82297134399414,
"step": 159
},
{
"epoch": 1.068893528183716,
"grad_norm": 0.0012326446594670415,
"learning_rate": 0.00011452967101064118,
"logits/chosen": 0.34365200996398926,
"logits/rejected": 1.3188669681549072,
"logps/chosen": -196.2615509033203,
"logps/rejected": -177.1357879638672,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4942498207092285,
"rewards/margins": 11.286696434020996,
"rewards/rejected": -7.792446136474609,
"step": 160
},
{
"epoch": 1.0755741127348644,
"grad_norm": 0.0008088697213679552,
"learning_rate": 0.00011402163855329199,
"logits/chosen": 0.6658412218093872,
"logits/rejected": 0.9367747902870178,
"logps/chosen": -148.8216094970703,
"logps/rejected": -237.14321899414062,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0566344261169434,
"rewards/margins": 11.411201477050781,
"rewards/rejected": -8.35456657409668,
"step": 161
},
{
"epoch": 1.0822546972860125,
"grad_norm": 0.0008485389407724142,
"learning_rate": 0.00011351192559740949,
"logits/chosen": 0.7383836507797241,
"logits/rejected": 1.149275541305542,
"logps/chosen": -137.4690704345703,
"logps/rejected": -209.24774169921875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.417785882949829,
"rewards/margins": 11.462677001953125,
"rewards/rejected": -8.044892311096191,
"step": 162
},
{
"epoch": 1.0889352818371607,
"grad_norm": 0.0005739150219596922,
"learning_rate": 0.0001130005573202606,
"logits/chosen": 0.5681736469268799,
"logits/rejected": 0.9365695714950562,
"logps/chosen": -163.19027709960938,
"logps/rejected": -242.56704711914062,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6804287433624268,
"rewards/margins": 11.32785415649414,
"rewards/rejected": -7.647425651550293,
"step": 163
},
{
"epoch": 1.0956158663883089,
"grad_norm": 0.004342256113886833,
"learning_rate": 0.00011248755898087684,
"logits/chosen": 0.4348772168159485,
"logits/rejected": 0.9407061338424683,
"logps/chosen": -164.66812133789062,
"logps/rejected": -211.21351623535156,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6066393852233887,
"rewards/margins": 11.529861450195312,
"rewards/rejected": -7.923222541809082,
"step": 164
},
{
"epoch": 1.1022964509394573,
"grad_norm": 0.0008769746054895222,
"learning_rate": 0.00011197295591880657,
"logits/chosen": 0.6984822750091553,
"logits/rejected": 1.179412841796875,
"logps/chosen": -115.85931396484375,
"logps/rejected": -204.60218811035156,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4029932022094727,
"rewards/margins": 11.5657958984375,
"rewards/rejected": -8.162801742553711,
"step": 165
},
{
"epoch": 1.1089770354906054,
"grad_norm": 0.052136704325675964,
"learning_rate": 0.00011145677355286353,
"logits/chosen": 0.35074859857559204,
"logits/rejected": 1.297317385673523,
"logps/chosen": -179.23977661132812,
"logps/rejected": -196.0187530517578,
"loss": 0.0041,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2048532962799072,
"rewards/margins": 10.744298934936523,
"rewards/rejected": -7.539445877075195,
"step": 166
},
{
"epoch": 1.1156576200417536,
"grad_norm": 0.004496046341955662,
"learning_rate": 0.00011093903737987102,
"logits/chosen": 0.4773109555244446,
"logits/rejected": 0.854899525642395,
"logps/chosen": -135.96896362304688,
"logps/rejected": -217.36251831054688,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.521961212158203,
"rewards/margins": 12.137231826782227,
"rewards/rejected": -8.61527156829834,
"step": 167
},
{
"epoch": 1.1223382045929018,
"grad_norm": 0.0014830527361482382,
"learning_rate": 0.0001104197729734027,
"logits/chosen": 0.4430505335330963,
"logits/rejected": 1.0172383785247803,
"logps/chosen": -130.85362243652344,
"logps/rejected": -194.0697479248047,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6902472972869873,
"rewards/margins": 11.970958709716797,
"rewards/rejected": -8.280712127685547,
"step": 168
},
{
"epoch": 1.1290187891440502,
"grad_norm": 0.0010357052087783813,
"learning_rate": 0.00010989900598251933,
"logits/chosen": 0.5143346190452576,
"logits/rejected": 1.06320321559906,
"logps/chosen": -163.08689880371094,
"logps/rejected": -201.315673828125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4902918338775635,
"rewards/margins": 11.922452926635742,
"rewards/rejected": -8.432162284851074,
"step": 169
},
{
"epoch": 1.1356993736951984,
"grad_norm": 0.003559267381206155,
"learning_rate": 0.00010937676213050178,
"logits/chosen": 0.5508258938789368,
"logits/rejected": 0.8938766121864319,
"logps/chosen": -177.08377075195312,
"logps/rejected": -236.21572875976562,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5201761722564697,
"rewards/margins": 11.900284767150879,
"rewards/rejected": -8.380107879638672,
"step": 170
},
{
"epoch": 1.1423799582463465,
"grad_norm": 0.0015716857742518187,
"learning_rate": 0.00010885306721358045,
"logits/chosen": 0.5690699219703674,
"logits/rejected": 1.3002761602401733,
"logps/chosen": -158.36329650878906,
"logps/rejected": -188.37022399902344,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6385347843170166,
"rewards/margins": 11.156852722167969,
"rewards/rejected": -7.518317699432373,
"step": 171
},
{
"epoch": 1.1490605427974947,
"grad_norm": 0.00707374420017004,
"learning_rate": 0.00010832794709966112,
"logits/chosen": 0.3383929431438446,
"logits/rejected": 1.2127389907836914,
"logps/chosen": -209.92825317382812,
"logps/rejected": -226.083740234375,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8767664432525635,
"rewards/margins": 12.013040542602539,
"rewards/rejected": -8.136273384094238,
"step": 172
},
{
"epoch": 1.155741127348643,
"grad_norm": 0.0012596473097801208,
"learning_rate": 0.00010780142772704712,
"logits/chosen": 0.6638607978820801,
"logits/rejected": 1.1403982639312744,
"logps/chosen": -157.58836364746094,
"logps/rejected": -212.12307739257812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.922273635864258,
"rewards/margins": 11.61031436920166,
"rewards/rejected": -7.688040256500244,
"step": 173
},
{
"epoch": 1.1624217118997913,
"grad_norm": 0.0015774251660332084,
"learning_rate": 0.00010727353510315816,
"logits/chosen": 0.5496118068695068,
"logits/rejected": 1.1589007377624512,
"logps/chosen": -145.5523223876953,
"logps/rejected": -200.73741149902344,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.641279697418213,
"rewards/margins": 12.338908195495605,
"rewards/rejected": -8.697628021240234,
"step": 174
},
{
"epoch": 1.1691022964509394,
"grad_norm": 0.015453093685209751,
"learning_rate": 0.00010674429530324574,
"logits/chosen": 0.5172752141952515,
"logits/rejected": 1.1510488986968994,
"logps/chosen": -155.7484588623047,
"logps/rejected": -177.63027954101562,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.400092363357544,
"rewards/margins": 10.950430870056152,
"rewards/rejected": -7.5503387451171875,
"step": 175
},
{
"epoch": 1.1757828810020876,
"grad_norm": 0.0012826044112443924,
"learning_rate": 0.00010621373446910502,
"logits/chosen": 0.5946585536003113,
"logits/rejected": 1.1312611103057861,
"logps/chosen": -182.75096130371094,
"logps/rejected": -220.31317138671875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6229946613311768,
"rewards/margins": 11.344440460205078,
"rewards/rejected": -7.7214460372924805,
"step": 176
},
{
"epoch": 1.182463465553236,
"grad_norm": 0.04078257828950882,
"learning_rate": 0.00010568187880778373,
"logits/chosen": 0.43110349774360657,
"logits/rejected": 1.0659563541412354,
"logps/chosen": -184.86537170410156,
"logps/rejected": -219.26332092285156,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9776532649993896,
"rewards/margins": 12.578085899353027,
"rewards/rejected": -8.600433349609375,
"step": 177
},
{
"epoch": 1.1891440501043842,
"grad_norm": 0.0012859440175816417,
"learning_rate": 0.00010514875459028743,
"logits/chosen": 0.4430413842201233,
"logits/rejected": 0.9718153476715088,
"logps/chosen": -130.32984924316406,
"logps/rejected": -205.24440002441406,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.349815845489502,
"rewards/margins": 10.951803207397461,
"rewards/rejected": -7.601987361907959,
"step": 178
},
{
"epoch": 1.1958246346555323,
"grad_norm": 0.018363507464528084,
"learning_rate": 0.00010461438815028219,
"logits/chosen": 0.2948397397994995,
"logits/rejected": 0.872904360294342,
"logps/chosen": -151.6688690185547,
"logps/rejected": -215.8356170654297,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.273361921310425,
"rewards/margins": 10.712809562683105,
"rewards/rejected": -7.43944787979126,
"step": 179
},
{
"epoch": 1.2025052192066805,
"grad_norm": 0.0017469325102865696,
"learning_rate": 0.00010407880588279352,
"logits/chosen": 0.4774348735809326,
"logits/rejected": 0.7673771977424622,
"logps/chosen": -142.30799865722656,
"logps/rejected": -235.41400146484375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5183143615722656,
"rewards/margins": 12.07205867767334,
"rewards/rejected": -8.553744316101074,
"step": 180
},
{
"epoch": 1.209185803757829,
"grad_norm": 0.0007553395698778331,
"learning_rate": 0.00010354203424290271,
"logits/chosen": 0.7918840050697327,
"logits/rejected": 1.1450823545455933,
"logps/chosen": -131.16664123535156,
"logps/rejected": -211.7412109375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.386751174926758,
"rewards/margins": 10.85494327545166,
"rewards/rejected": -7.468192100524902,
"step": 181
},
{
"epoch": 1.215866388308977,
"grad_norm": 0.0012744672130793333,
"learning_rate": 0.0001030040997444402,
"logits/chosen": 0.45629051327705383,
"logits/rejected": 0.9976529479026794,
"logps/chosen": -133.558349609375,
"logps/rejected": -192.5741424560547,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5578696727752686,
"rewards/margins": 11.33415699005127,
"rewards/rejected": -7.77628755569458,
"step": 182
},
{
"epoch": 1.2225469728601253,
"grad_norm": 0.00047452072612941265,
"learning_rate": 0.00010246502895867568,
"logits/chosen": 0.8382459878921509,
"logits/rejected": 0.7453187108039856,
"logps/chosen": -130.78721618652344,
"logps/rejected": -261.71197509765625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.311988592147827,
"rewards/margins": 12.276436805725098,
"rewards/rejected": -8.964447975158691,
"step": 183
},
{
"epoch": 1.2292275574112734,
"grad_norm": 0.0086443442851305,
"learning_rate": 0.0001019248485130059,
"logits/chosen": 0.4247654676437378,
"logits/rejected": 1.1591222286224365,
"logps/chosen": -167.55833435058594,
"logps/rejected": -179.604248046875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1582260131835938,
"rewards/margins": 11.254338264465332,
"rewards/rejected": -8.096114158630371,
"step": 184
},
{
"epoch": 1.2359081419624216,
"grad_norm": 0.001544467406347394,
"learning_rate": 0.0001013835850896391,
"logits/chosen": 0.5242801308631897,
"logits/rejected": 1.4609646797180176,
"logps/chosen": -179.2179412841797,
"logps/rejected": -167.85665893554688,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4588305950164795,
"rewards/margins": 11.103114128112793,
"rewards/rejected": -7.64428186416626,
"step": 185
},
{
"epoch": 1.24258872651357,
"grad_norm": 0.0018972865073010325,
"learning_rate": 0.00010084126542427725,
"logits/chosen": 0.6289225220680237,
"logits/rejected": 0.774864137172699,
"logps/chosen": -153.0395965576172,
"logps/rejected": -283.26702880859375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.832301616668701,
"rewards/margins": 12.483742713928223,
"rewards/rejected": -8.65144157409668,
"step": 186
},
{
"epoch": 1.2492693110647182,
"grad_norm": 0.0005702719208784401,
"learning_rate": 0.0001002979163047954,
"logits/chosen": 0.5563308596611023,
"logits/rejected": 1.0048775672912598,
"logps/chosen": -160.7752685546875,
"logps/rejected": -206.54483032226562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5111823081970215,
"rewards/margins": 11.941852569580078,
"rewards/rejected": -8.430670738220215,
"step": 187
},
{
"epoch": 1.2559498956158663,
"grad_norm": 0.0005867581348866224,
"learning_rate": 9.975356456991849e-05,
"logits/chosen": 0.8396896123886108,
"logits/rejected": 1.0179424285888672,
"logps/chosen": -144.9906768798828,
"logps/rejected": -236.50119018554688,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.715435028076172,
"rewards/margins": 11.944793701171875,
"rewards/rejected": -8.229358673095703,
"step": 188
},
{
"epoch": 1.2626304801670147,
"grad_norm": 0.00043765606824308634,
"learning_rate": 9.920823710789562e-05,
"logits/chosen": 0.36594095826148987,
"logits/rejected": 1.1739063262939453,
"logps/chosen": -170.1721954345703,
"logps/rejected": -185.6078643798828,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8701069355010986,
"rewards/margins": 11.88235092163086,
"rewards/rejected": -8.01224422454834,
"step": 189
},
{
"epoch": 1.269311064718163,
"grad_norm": 0.00041320393211208284,
"learning_rate": 9.866196085517186e-05,
"logits/chosen": 0.4379793703556061,
"logits/rejected": 1.0098289251327515,
"logps/chosen": -138.4647216796875,
"logps/rejected": -204.14564514160156,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3668053150177,
"rewards/margins": 11.314434051513672,
"rewards/rejected": -7.947629928588867,
"step": 190
},
{
"epoch": 1.275991649269311,
"grad_norm": 0.0015216560568660498,
"learning_rate": 9.81147627950579e-05,
"logits/chosen": 0.5741678476333618,
"logits/rejected": 1.0209534168243408,
"logps/chosen": -117.83605194091797,
"logps/rejected": -161.2305908203125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5719006061553955,
"rewards/margins": 11.671441078186035,
"rewards/rejected": -8.099540710449219,
"step": 191
},
{
"epoch": 1.2826722338204593,
"grad_norm": 0.0032318232115358114,
"learning_rate": 9.756666995639703e-05,
"logits/chosen": 0.5298473238945007,
"logits/rejected": 1.1004164218902588,
"logps/chosen": -155.68045043945312,
"logps/rejected": -188.41229248046875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.450162887573242,
"rewards/margins": 11.770234107971191,
"rewards/rejected": -8.32007122039795,
"step": 192
},
{
"epoch": 1.2893528183716074,
"grad_norm": 0.0007364507764577866,
"learning_rate": 9.701770941223014e-05,
"logits/chosen": 0.38831934332847595,
"logits/rejected": 0.9996166229248047,
"logps/chosen": -149.40911865234375,
"logps/rejected": -181.202880859375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.700319290161133,
"rewards/margins": 11.73774528503418,
"rewards/rejected": -8.037424087524414,
"step": 193
},
{
"epoch": 1.2960334029227558,
"grad_norm": 0.05426819995045662,
"learning_rate": 9.646790827845844e-05,
"logits/chosen": 0.45380842685699463,
"logits/rejected": 1.102933645248413,
"logps/chosen": -132.97164916992188,
"logps/rejected": -154.08615112304688,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5674822330474854,
"rewards/margins": 10.895905494689941,
"rewards/rejected": -7.328424453735352,
"step": 194
},
{
"epoch": 1.302713987473904,
"grad_norm": 0.0008307976531796157,
"learning_rate": 9.5917293712504e-05,
"logits/chosen": 0.4264935255050659,
"logits/rejected": 1.022092580795288,
"logps/chosen": -174.43560791015625,
"logps/rejected": -219.11038208007812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.602630853652954,
"rewards/margins": 12.087784767150879,
"rewards/rejected": -8.485153198242188,
"step": 195
},
{
"epoch": 1.3093945720250522,
"grad_norm": 0.005923965014517307,
"learning_rate": 9.536589291196837e-05,
"logits/chosen": 0.38043221831321716,
"logits/rejected": 1.2655529975891113,
"logps/chosen": -219.9903106689453,
"logps/rejected": -224.3175048828125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.751121997833252,
"rewards/margins": 12.251858711242676,
"rewards/rejected": -8.500736236572266,
"step": 196
},
{
"epoch": 1.3160751565762006,
"grad_norm": 0.008259938098490238,
"learning_rate": 9.481373311328927e-05,
"logits/chosen": 0.3577250838279724,
"logits/rejected": 1.0607563257217407,
"logps/chosen": -192.20773315429688,
"logps/rejected": -223.62220764160156,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.199789047241211,
"rewards/margins": 12.089982032775879,
"rewards/rejected": -7.890193939208984,
"step": 197
},
{
"epoch": 1.3227557411273487,
"grad_norm": 0.011571898125112057,
"learning_rate": 9.426084159039497e-05,
"logits/chosen": 0.6079049110412598,
"logits/rejected": 0.6202818155288696,
"logps/chosen": -129.2443389892578,
"logps/rejected": -254.39288330078125,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.321871042251587,
"rewards/margins": 12.733055114746094,
"rewards/rejected": -9.41118335723877,
"step": 198
},
{
"epoch": 1.329436325678497,
"grad_norm": 0.003518011188134551,
"learning_rate": 9.370724565335733e-05,
"logits/chosen": 0.5599907636642456,
"logits/rejected": 0.6773720383644104,
"logps/chosen": -142.21826171875,
"logps/rejected": -256.0460510253906,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.407008171081543,
"rewards/margins": 12.075630187988281,
"rewards/rejected": -8.668621063232422,
"step": 199
},
{
"epoch": 1.336116910229645,
"grad_norm": 0.00033826244180090725,
"learning_rate": 9.315297264704276e-05,
"logits/chosen": 0.36998850107192993,
"logits/rejected": 1.1482012271881104,
"logps/chosen": -161.43829345703125,
"logps/rejected": -188.72938537597656,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6240572929382324,
"rewards/margins": 12.082491874694824,
"rewards/rejected": -8.45843505859375,
"step": 200
},
{
"epoch": 1.3427974947807932,
"grad_norm": 0.013436054810881615,
"learning_rate": 9.259804994976145e-05,
"logits/chosen": 0.4420285224914551,
"logits/rejected": 1.2852901220321655,
"logps/chosen": -216.61392211914062,
"logps/rejected": -218.47210693359375,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.93255877494812,
"rewards/margins": 12.457169532775879,
"rewards/rejected": -8.524611473083496,
"step": 201
},
{
"epoch": 1.3494780793319414,
"grad_norm": 0.0006867127376608551,
"learning_rate": 9.204250497191507e-05,
"logits/chosen": 0.3688305616378784,
"logits/rejected": 1.0229319334030151,
"logps/chosen": -187.53517150878906,
"logps/rejected": -212.68167114257812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5623180866241455,
"rewards/margins": 12.298954010009766,
"rewards/rejected": -8.736637115478516,
"step": 202
},
{
"epoch": 1.3561586638830898,
"grad_norm": 0.006973725743591785,
"learning_rate": 9.148636515464286e-05,
"logits/chosen": 0.6111980676651001,
"logits/rejected": 1.1670434474945068,
"logps/chosen": -150.9393768310547,
"logps/rejected": -190.71719360351562,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.552490234375,
"rewards/margins": 11.955738067626953,
"rewards/rejected": -8.40324878692627,
"step": 203
},
{
"epoch": 1.362839248434238,
"grad_norm": 0.0029133392963558435,
"learning_rate": 9.092965796846615e-05,
"logits/chosen": 0.43748268485069275,
"logits/rejected": 0.8366719484329224,
"logps/chosen": -141.9459686279297,
"logps/rejected": -248.22157287597656,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4116930961608887,
"rewards/margins": 13.216766357421875,
"rewards/rejected": -9.805072784423828,
"step": 204
},
{
"epoch": 1.3695198329853862,
"grad_norm": 0.0006192025612108409,
"learning_rate": 9.037241091193146e-05,
"logits/chosen": 0.7180970907211304,
"logits/rejected": 1.0650739669799805,
"logps/chosen": -103.74508666992188,
"logps/rejected": -204.15809631347656,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.131009101867676,
"rewards/margins": 11.749822616577148,
"rewards/rejected": -8.618814468383789,
"step": 205
},
{
"epoch": 1.3762004175365345,
"grad_norm": 0.0005728130927309394,
"learning_rate": 8.981465151025214e-05,
"logits/chosen": 0.41642457246780396,
"logits/rejected": 0.8988510370254517,
"logps/chosen": -156.42083740234375,
"logps/rejected": -248.08001708984375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5374138355255127,
"rewards/margins": 12.660049438476562,
"rewards/rejected": -9.122636795043945,
"step": 206
},
{
"epoch": 1.3828810020876827,
"grad_norm": 0.03418273478746414,
"learning_rate": 8.925640731394891e-05,
"logits/chosen": 0.40667593479156494,
"logits/rejected": 1.2365777492523193,
"logps/chosen": -173.7451934814453,
"logps/rejected": -170.29013061523438,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2876858711242676,
"rewards/margins": 12.047090530395508,
"rewards/rejected": -8.759405136108398,
"step": 207
},
{
"epoch": 1.389561586638831,
"grad_norm": 0.00034636727650649846,
"learning_rate": 8.869770589748885e-05,
"logits/chosen": 0.6443273425102234,
"logits/rejected": 1.0803853273391724,
"logps/chosen": -149.794189453125,
"logps/rejected": -238.55108642578125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2578701972961426,
"rewards/margins": 11.839977264404297,
"rewards/rejected": -8.58210563659668,
"step": 208
},
{
"epoch": 1.396242171189979,
"grad_norm": 0.0011885147541761398,
"learning_rate": 8.813857485792346e-05,
"logits/chosen": 0.8002556562423706,
"logits/rejected": 0.8125602602958679,
"logps/chosen": -113.92469787597656,
"logps/rejected": -232.79171752929688,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2659642696380615,
"rewards/margins": 12.342820167541504,
"rewards/rejected": -9.07685661315918,
"step": 209
},
{
"epoch": 1.4029227557411272,
"grad_norm": 0.003782590152695775,
"learning_rate": 8.757904181352548e-05,
"logits/chosen": 0.5000881552696228,
"logits/rejected": 0.8626303672790527,
"logps/chosen": -140.60691833496094,
"logps/rejected": -224.2894287109375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.359609365463257,
"rewards/margins": 12.288975715637207,
"rewards/rejected": -8.929367065429688,
"step": 210
},
{
"epoch": 1.4096033402922756,
"grad_norm": 0.0002241008187411353,
"learning_rate": 8.701913440242459e-05,
"logits/chosen": 0.7766547799110413,
"logits/rejected": 0.6242997050285339,
"logps/chosen": -118.01210021972656,
"logps/rejected": -280.14080810546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.590362310409546,
"rewards/margins": 12.91071605682373,
"rewards/rejected": -9.320354461669922,
"step": 211
},
{
"epoch": 1.4162839248434238,
"grad_norm": 0.0009039054857566953,
"learning_rate": 8.645888028124245e-05,
"logits/chosen": 0.45011216402053833,
"logits/rejected": 1.036590814590454,
"logps/chosen": -150.9805145263672,
"logps/rejected": -212.8893585205078,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.292708396911621,
"rewards/margins": 12.373250961303711,
"rewards/rejected": -9.080541610717773,
"step": 212
},
{
"epoch": 1.422964509394572,
"grad_norm": 0.000518288929015398,
"learning_rate": 8.589830712372634e-05,
"logits/chosen": 0.4165090322494507,
"logits/rejected": 1.087862491607666,
"logps/chosen": -157.97787475585938,
"logps/rejected": -206.57342529296875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5738751888275146,
"rewards/margins": 12.204456329345703,
"rewards/rejected": -8.630579948425293,
"step": 213
},
{
"epoch": 1.4296450939457204,
"grad_norm": 0.0008133598603308201,
"learning_rate": 8.533744261938238e-05,
"logits/chosen": 0.2943029999732971,
"logits/rejected": 1.3612968921661377,
"logps/chosen": -220.78323364257812,
"logps/rejected": -184.81143188476562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.250751495361328,
"rewards/margins": 12.059762001037598,
"rewards/rejected": -8.80901050567627,
"step": 214
},
{
"epoch": 1.4363256784968685,
"grad_norm": 0.0004915079916827381,
"learning_rate": 8.477631447210778e-05,
"logits/chosen": 0.24841757118701935,
"logits/rejected": 1.2294715642929077,
"logps/chosen": -194.55015563964844,
"logps/rejected": -207.66958618164062,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5389163494110107,
"rewards/margins": 12.835709571838379,
"rewards/rejected": -9.296793937683105,
"step": 215
},
{
"epoch": 1.4430062630480167,
"grad_norm": 0.004459040705114603,
"learning_rate": 8.421495039882238e-05,
"logits/chosen": 0.6321086287498474,
"logits/rejected": 0.9829391837120056,
"logps/chosen": -146.0184326171875,
"logps/rejected": -238.7043914794922,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0472970008850098,
"rewards/margins": 13.108593940734863,
"rewards/rejected": -10.061296463012695,
"step": 216
},
{
"epoch": 1.4496868475991649,
"grad_norm": 0.002996882889419794,
"learning_rate": 8.365337812809957e-05,
"logits/chosen": 0.5133633613586426,
"logits/rejected": 1.3615784645080566,
"logps/chosen": -159.088134765625,
"logps/rejected": -166.50949096679688,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.693748712539673,
"rewards/margins": 11.46066951751709,
"rewards/rejected": -7.766920566558838,
"step": 217
},
{
"epoch": 1.456367432150313,
"grad_norm": 0.007244854234158993,
"learning_rate": 8.309162539879668e-05,
"logits/chosen": 0.5689404010772705,
"logits/rejected": 0.7958050966262817,
"logps/chosen": -136.24407958984375,
"logps/rejected": -253.57131958007812,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1574044227600098,
"rewards/margins": 12.73931884765625,
"rewards/rejected": -9.581913948059082,
"step": 218
},
{
"epoch": 1.4630480167014615,
"grad_norm": 0.0009683882817625999,
"learning_rate": 8.252971995868472e-05,
"logits/chosen": 0.4068656861782074,
"logits/rejected": 1.104844093322754,
"logps/chosen": -153.0896453857422,
"logps/rejected": -200.84996032714844,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2435359954833984,
"rewards/margins": 11.93062973022461,
"rewards/rejected": -8.687093734741211,
"step": 219
},
{
"epoch": 1.4697286012526096,
"grad_norm": 0.00201588892377913,
"learning_rate": 8.196768956307795e-05,
"logits/chosen": 0.5268558263778687,
"logits/rejected": 0.9168641567230225,
"logps/chosen": -178.9090118408203,
"logps/rejected": -254.7775115966797,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.295229911804199,
"rewards/margins": 13.071582794189453,
"rewards/rejected": -9.77635383605957,
"step": 220
},
{
"epoch": 1.4764091858037578,
"grad_norm": 0.00043293953058309853,
"learning_rate": 8.140556197346273e-05,
"logits/chosen": 0.45406419038772583,
"logits/rejected": 1.1222517490386963,
"logps/chosen": -201.90655517578125,
"logps/rejected": -249.59384155273438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7309956550598145,
"rewards/margins": 12.679603576660156,
"rewards/rejected": -8.948606491088867,
"step": 221
},
{
"epoch": 1.4830897703549062,
"grad_norm": 0.0047185542061924934,
"learning_rate": 8.084336495612638e-05,
"logits/chosen": 0.5548102259635925,
"logits/rejected": 1.0701701641082764,
"logps/chosen": -137.0140838623047,
"logps/rejected": -196.58352661132812,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2548980712890625,
"rewards/margins": 12.659390449523926,
"rewards/rejected": -9.404491424560547,
"step": 222
},
{
"epoch": 1.4897703549060544,
"grad_norm": 0.0012326568830758333,
"learning_rate": 8.02811262807855e-05,
"logits/chosen": 0.7784193158149719,
"logits/rejected": 1.0100144147872925,
"logps/chosen": -135.61509704589844,
"logps/rejected": -227.09725952148438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.833930730819702,
"rewards/margins": 11.9942045211792,
"rewards/rejected": -9.160274505615234,
"step": 223
},
{
"epoch": 1.4964509394572025,
"grad_norm": 0.003940995782613754,
"learning_rate": 7.971887371921452e-05,
"logits/chosen": 0.45579829812049866,
"logits/rejected": 1.0411605834960938,
"logps/chosen": -151.62644958496094,
"logps/rejected": -205.2895965576172,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.106025218963623,
"rewards/margins": 12.4378023147583,
"rewards/rejected": -9.331775665283203,
"step": 224
},
{
"epoch": 1.5031315240083507,
"grad_norm": 0.0006113062263466418,
"learning_rate": 7.915663504387365e-05,
"logits/chosen": 0.7197325229644775,
"logits/rejected": 1.078650712966919,
"logps/chosen": -145.16757202148438,
"logps/rejected": -241.98052978515625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.173128128051758,
"rewards/margins": 13.138956069946289,
"rewards/rejected": -9.965826988220215,
"step": 225
},
{
"epoch": 1.5098121085594989,
"grad_norm": 0.0055145323276519775,
"learning_rate": 7.859443802653728e-05,
"logits/chosen": 0.3827665150165558,
"logits/rejected": 1.2257570028305054,
"logps/chosen": -188.14073181152344,
"logps/rejected": -200.613037109375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5312018394470215,
"rewards/margins": 12.372119903564453,
"rewards/rejected": -8.840917587280273,
"step": 226
},
{
"epoch": 1.516492693110647,
"grad_norm": 0.001715692924335599,
"learning_rate": 7.803231043692206e-05,
"logits/chosen": 0.560634195804596,
"logits/rejected": 0.9691610932350159,
"logps/chosen": -154.68666076660156,
"logps/rejected": -228.98910522460938,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.682804584503174,
"rewards/margins": 14.054306030273438,
"rewards/rejected": -10.371501922607422,
"step": 227
},
{
"epoch": 1.5231732776617954,
"grad_norm": 0.00392462033778429,
"learning_rate": 7.74702800413153e-05,
"logits/chosen": 0.47522974014282227,
"logits/rejected": 1.1297662258148193,
"logps/chosen": -155.83364868164062,
"logps/rejected": -188.842041015625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.877500534057617,
"rewards/margins": 12.252093315124512,
"rewards/rejected": -8.374591827392578,
"step": 228
},
{
"epoch": 1.5298538622129436,
"grad_norm": 0.000706602178979665,
"learning_rate": 7.690837460120337e-05,
"logits/chosen": 0.3643675446510315,
"logits/rejected": 0.6753086447715759,
"logps/chosen": -132.07772827148438,
"logps/rejected": -268.08905029296875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.478888988494873,
"rewards/margins": 13.154215812683105,
"rewards/rejected": -9.675326347351074,
"step": 229
},
{
"epoch": 1.536534446764092,
"grad_norm": 0.003365734126418829,
"learning_rate": 7.634662187190045e-05,
"logits/chosen": 0.648161768913269,
"logits/rejected": 1.1449542045593262,
"logps/chosen": -159.03543090820312,
"logps/rejected": -207.7163848876953,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.997041702270508,
"rewards/margins": 12.427491188049316,
"rewards/rejected": -9.430448532104492,
"step": 230
},
{
"epoch": 1.5432150313152402,
"grad_norm": 0.0004856723826378584,
"learning_rate": 7.578504960117764e-05,
"logits/chosen": 0.792070209980011,
"logits/rejected": 1.158513069152832,
"logps/chosen": -141.17575073242188,
"logps/rejected": -243.63555908203125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.499164581298828,
"rewards/margins": 12.730960845947266,
"rewards/rejected": -9.231797218322754,
"step": 231
},
{
"epoch": 1.5498956158663884,
"grad_norm": 0.0010725751053541899,
"learning_rate": 7.522368552789226e-05,
"logits/chosen": 0.6130843162536621,
"logits/rejected": 0.9492489099502563,
"logps/chosen": -115.36308288574219,
"logps/rejected": -209.8251953125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.06745982170105,
"rewards/margins": 11.86876106262207,
"rewards/rejected": -8.801301002502441,
"step": 232
},
{
"epoch": 1.5565762004175365,
"grad_norm": 0.004260468762367964,
"learning_rate": 7.466255738061765e-05,
"logits/chosen": 0.4129951298236847,
"logits/rejected": 1.1070955991744995,
"logps/chosen": -170.56251525878906,
"logps/rejected": -215.829345703125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.782398223876953,
"rewards/margins": 12.914801597595215,
"rewards/rejected": -9.132403373718262,
"step": 233
},
{
"epoch": 1.5632567849686847,
"grad_norm": 0.00023358217731583863,
"learning_rate": 7.410169287627369e-05,
"logits/chosen": 0.7674708962440491,
"logits/rejected": 0.951148509979248,
"logps/chosen": -153.4148406982422,
"logps/rejected": -276.1761474609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.604790210723877,
"rewards/margins": 13.036596298217773,
"rewards/rejected": -9.431807518005371,
"step": 234
},
{
"epoch": 1.5699373695198329,
"grad_norm": 0.0016559605719521642,
"learning_rate": 7.354111971875756e-05,
"logits/chosen": 0.47153300046920776,
"logits/rejected": 1.1286731958389282,
"logps/chosen": -154.3711700439453,
"logps/rejected": -184.73593139648438,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.253800868988037,
"rewards/margins": 11.487798690795898,
"rewards/rejected": -8.233997344970703,
"step": 235
},
{
"epoch": 1.5766179540709813,
"grad_norm": 0.0021103417966514826,
"learning_rate": 7.298086559757542e-05,
"logits/chosen": 0.5945063233375549,
"logits/rejected": 1.0322848558425903,
"logps/chosen": -128.76268005371094,
"logps/rejected": -196.54275512695312,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2880306243896484,
"rewards/margins": 12.563961029052734,
"rewards/rejected": -9.27592945098877,
"step": 236
},
{
"epoch": 1.5832985386221294,
"grad_norm": 0.001216527889482677,
"learning_rate": 7.242095818647454e-05,
"logits/chosen": 0.31693902611732483,
"logits/rejected": 1.3508694171905518,
"logps/chosen": -172.87911987304688,
"logps/rejected": -180.53089904785156,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3253848552703857,
"rewards/margins": 11.522850036621094,
"rewards/rejected": -8.197465896606445,
"step": 237
},
{
"epoch": 1.5899791231732776,
"grad_norm": 0.0009002351434901357,
"learning_rate": 7.186142514207653e-05,
"logits/chosen": 0.4034768342971802,
"logits/rejected": 1.1458234786987305,
"logps/chosen": -154.97938537597656,
"logps/rejected": -189.22506713867188,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2304248809814453,
"rewards/margins": 11.878856658935547,
"rewards/rejected": -8.648432731628418,
"step": 238
},
{
"epoch": 1.596659707724426,
"grad_norm": 0.00505381915718317,
"learning_rate": 7.130229410251116e-05,
"logits/chosen": 0.3664909899234772,
"logits/rejected": 0.9492464065551758,
"logps/chosen": -182.10992431640625,
"logps/rejected": -226.52783203125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0609524250030518,
"rewards/margins": 11.703436851501465,
"rewards/rejected": -8.642484664916992,
"step": 239
},
{
"epoch": 1.6033402922755742,
"grad_norm": 0.0007440568879246712,
"learning_rate": 7.074359268605111e-05,
"logits/chosen": 0.717093825340271,
"logits/rejected": 0.741437554359436,
"logps/chosen": -150.6142120361328,
"logps/rejected": -280.0047912597656,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4058785438537598,
"rewards/margins": 13.219608306884766,
"rewards/rejected": -9.813730239868164,
"step": 240
},
{
"epoch": 1.6100208768267223,
"grad_norm": 0.0016615077620372176,
"learning_rate": 7.018534848974788e-05,
"logits/chosen": 0.5540122985839844,
"logits/rejected": 1.253514289855957,
"logps/chosen": -193.7939453125,
"logps/rejected": -226.57171630859375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.350304126739502,
"rewards/margins": 13.100728988647461,
"rewards/rejected": -9.750425338745117,
"step": 241
},
{
"epoch": 1.6167014613778705,
"grad_norm": 0.0004508822748903185,
"learning_rate": 6.962758908806857e-05,
"logits/chosen": 0.47170671820640564,
"logits/rejected": 1.2200703620910645,
"logps/chosen": -208.1016845703125,
"logps/rejected": -205.09820556640625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2683615684509277,
"rewards/margins": 12.15200138092041,
"rewards/rejected": -8.883639335632324,
"step": 242
},
{
"epoch": 1.6233820459290187,
"grad_norm": 0.0014309012331068516,
"learning_rate": 6.907034203153386e-05,
"logits/chosen": 0.28136053681373596,
"logits/rejected": 1.2956924438476562,
"logps/chosen": -197.4197235107422,
"logps/rejected": -165.1903533935547,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7073264122009277,
"rewards/margins": 12.170064926147461,
"rewards/rejected": -8.462738037109375,
"step": 243
},
{
"epoch": 1.6300626304801669,
"grad_norm": 0.0004903227090835571,
"learning_rate": 6.851363484535715e-05,
"logits/chosen": 0.5373984575271606,
"logits/rejected": 1.427203893661499,
"logps/chosen": -200.0109405517578,
"logps/rejected": -210.7964630126953,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.652834892272949,
"rewards/margins": 12.517841339111328,
"rewards/rejected": -8.865007400512695,
"step": 244
},
{
"epoch": 1.6367432150313153,
"grad_norm": 0.0005771399009972811,
"learning_rate": 6.795749502808498e-05,
"logits/chosen": 0.38623395562171936,
"logits/rejected": 1.2658888101577759,
"logps/chosen": -161.20318603515625,
"logps/rejected": -193.91897583007812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.443969249725342,
"rewards/margins": 12.874672889709473,
"rewards/rejected": -9.430703163146973,
"step": 245
},
{
"epoch": 1.6434237995824634,
"grad_norm": 0.0004550297453533858,
"learning_rate": 6.74019500502386e-05,
"logits/chosen": 0.31895774602890015,
"logits/rejected": 1.2252631187438965,
"logps/chosen": -179.82730102539062,
"logps/rejected": -198.14735412597656,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.135148525238037,
"rewards/margins": 11.938846588134766,
"rewards/rejected": -8.803698539733887,
"step": 246
},
{
"epoch": 1.6501043841336118,
"grad_norm": 0.0006330306641757488,
"learning_rate": 6.684702735295725e-05,
"logits/chosen": 0.59703129529953,
"logits/rejected": 1.1867053508758545,
"logps/chosen": -145.04815673828125,
"logps/rejected": -198.34249877929688,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.332218647003174,
"rewards/margins": 12.267135620117188,
"rewards/rejected": -8.934917449951172,
"step": 247
},
{
"epoch": 1.65678496868476,
"grad_norm": 0.0033348165452480316,
"learning_rate": 6.629275434664267e-05,
"logits/chosen": 0.3704008460044861,
"logits/rejected": 1.013413429260254,
"logps/chosen": -181.12796020507812,
"logps/rejected": -244.9725341796875,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5755138397216797,
"rewards/margins": 12.979727745056152,
"rewards/rejected": -9.404214859008789,
"step": 248
},
{
"epoch": 1.6634655532359082,
"grad_norm": 0.0004896700265817344,
"learning_rate": 6.573915840960506e-05,
"logits/chosen": 0.3934718668460846,
"logits/rejected": 1.2252607345581055,
"logps/chosen": -178.19854736328125,
"logps/rejected": -183.23167419433594,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2613682746887207,
"rewards/margins": 12.15888500213623,
"rewards/rejected": -8.897516250610352,
"step": 249
},
{
"epoch": 1.6701461377870563,
"grad_norm": 0.0017181966686621308,
"learning_rate": 6.518626688671075e-05,
"logits/chosen": 0.6154460906982422,
"logits/rejected": 0.8937150239944458,
"logps/chosen": -150.39620971679688,
"logps/rejected": -242.8111114501953,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4164161682128906,
"rewards/margins": 13.423528671264648,
"rewards/rejected": -10.007112503051758,
"step": 250
},
{
"epoch": 1.6768267223382045,
"grad_norm": 0.00037543641519732773,
"learning_rate": 6.463410708803162e-05,
"logits/chosen": 0.6243424415588379,
"logits/rejected": 1.0089101791381836,
"logps/chosen": -118.72608947753906,
"logps/rejected": -188.4051971435547,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.186236619949341,
"rewards/margins": 11.791353225708008,
"rewards/rejected": -8.605117797851562,
"step": 251
},
{
"epoch": 1.6835073068893527,
"grad_norm": 0.012711738236248493,
"learning_rate": 6.408270628749605e-05,
"logits/chosen": 0.49875199794769287,
"logits/rejected": 1.288053274154663,
"logps/chosen": -181.6597442626953,
"logps/rejected": -191.96499633789062,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1369545459747314,
"rewards/margins": 11.313640594482422,
"rewards/rejected": -8.17668628692627,
"step": 252
},
{
"epoch": 1.690187891440501,
"grad_norm": 0.0029580378904938698,
"learning_rate": 6.353209172154159e-05,
"logits/chosen": 0.5064088106155396,
"logits/rejected": 0.9738377928733826,
"logps/chosen": -151.7202606201172,
"logps/rejected": -202.4073028564453,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.9301891326904297,
"rewards/margins": 11.945601463317871,
"rewards/rejected": -9.015412330627441,
"step": 253
},
{
"epoch": 1.6968684759916492,
"grad_norm": 0.001020129886455834,
"learning_rate": 6.298229058776986e-05,
"logits/chosen": 0.700631320476532,
"logits/rejected": 0.908123254776001,
"logps/chosen": -131.9924774169922,
"logps/rejected": -255.37557983398438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.146085262298584,
"rewards/margins": 12.487560272216797,
"rewards/rejected": -9.341474533081055,
"step": 254
},
{
"epoch": 1.7035490605427976,
"grad_norm": 0.0003875437832903117,
"learning_rate": 6.243333004360298e-05,
"logits/chosen": 0.5261640548706055,
"logits/rejected": 1.0396316051483154,
"logps/chosen": -147.33441162109375,
"logps/rejected": -225.6119384765625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4174602031707764,
"rewards/margins": 13.032320022583008,
"rewards/rejected": -9.614859580993652,
"step": 255
},
{
"epoch": 1.7102296450939458,
"grad_norm": 0.0135966707020998,
"learning_rate": 6.188523720494211e-05,
"logits/chosen": 0.20291149616241455,
"logits/rejected": 1.127085566520691,
"logps/chosen": -208.96347045898438,
"logps/rejected": -220.78175354003906,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3484673500061035,
"rewards/margins": 11.46480655670166,
"rewards/rejected": -8.116339683532715,
"step": 256
},
{
"epoch": 1.716910229645094,
"grad_norm": 0.0002224293421022594,
"learning_rate": 6.133803914482815e-05,
"logits/chosen": 0.7359347939491272,
"logits/rejected": 0.9675177931785583,
"logps/chosen": -172.76893615722656,
"logps/rejected": -252.0592498779297,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5859246253967285,
"rewards/margins": 12.79989242553711,
"rewards/rejected": -9.213968276977539,
"step": 257
},
{
"epoch": 1.7235908141962422,
"grad_norm": 0.002110100816935301,
"learning_rate": 6.0791762892104416e-05,
"logits/chosen": 0.6298213601112366,
"logits/rejected": 1.1915262937545776,
"logps/chosen": -159.96487426757812,
"logps/rejected": -225.63790893554688,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.81847882270813,
"rewards/margins": 12.683730125427246,
"rewards/rejected": -8.865250587463379,
"step": 258
},
{
"epoch": 1.7302713987473903,
"grad_norm": 0.001850843895226717,
"learning_rate": 6.024643543008152e-05,
"logits/chosen": 0.6120001077651978,
"logits/rejected": 0.9265426397323608,
"logps/chosen": -136.79991149902344,
"logps/rejected": -205.8091583251953,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2155539989471436,
"rewards/margins": 12.165740966796875,
"rewards/rejected": -8.950185775756836,
"step": 259
},
{
"epoch": 1.7369519832985385,
"grad_norm": 0.009307686239480972,
"learning_rate": 5.970208369520461e-05,
"logits/chosen": 0.41133901476860046,
"logits/rejected": 1.244918942451477,
"logps/chosen": -170.95782470703125,
"logps/rejected": -175.3126678466797,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.113586187362671,
"rewards/margins": 11.052433013916016,
"rewards/rejected": -7.938846588134766,
"step": 260
},
{
"epoch": 1.743632567849687,
"grad_norm": 0.0012803918216377497,
"learning_rate": 5.915873457572276e-05,
"logits/chosen": 0.633670449256897,
"logits/rejected": 1.3272953033447266,
"logps/chosen": -195.4368896484375,
"logps/rejected": -219.29466247558594,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7701711654663086,
"rewards/margins": 12.498077392578125,
"rewards/rejected": -8.7279052734375,
"step": 261
},
{
"epoch": 1.750313152400835,
"grad_norm": 0.002625958528369665,
"learning_rate": 5.861641491036095e-05,
"logits/chosen": 0.4034227430820465,
"logits/rejected": 1.1840628385543823,
"logps/chosen": -175.64141845703125,
"logps/rejected": -210.2655792236328,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4675025939941406,
"rewards/margins": 12.683855056762695,
"rewards/rejected": -9.216352462768555,
"step": 262
},
{
"epoch": 1.7569937369519835,
"grad_norm": 0.0024748044088482857,
"learning_rate": 5.807515148699412e-05,
"logits/chosen": 0.6530991196632385,
"logits/rejected": 1.0524541139602661,
"logps/chosen": -141.74876403808594,
"logps/rejected": -196.0273895263672,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1580276489257812,
"rewards/margins": 12.469364166259766,
"rewards/rejected": -9.311336517333984,
"step": 263
},
{
"epoch": 1.7636743215031316,
"grad_norm": 0.009714308194816113,
"learning_rate": 5.75349710413243e-05,
"logits/chosen": 0.5566273927688599,
"logits/rejected": 0.9325087666511536,
"logps/chosen": -132.86610412597656,
"logps/rejected": -217.09593200683594,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0257492065429688,
"rewards/margins": 11.39592170715332,
"rewards/rejected": -8.370172500610352,
"step": 264
},
{
"epoch": 1.7703549060542798,
"grad_norm": 0.00043294430361129344,
"learning_rate": 5.699590025555984e-05,
"logits/chosen": 0.5268535017967224,
"logits/rejected": 1.4419851303100586,
"logps/chosen": -161.57455444335938,
"logps/rejected": -173.24090576171875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.010202884674072,
"rewards/margins": 12.556886672973633,
"rewards/rejected": -8.546682357788086,
"step": 265
},
{
"epoch": 1.777035490605428,
"grad_norm": 0.0034687798470258713,
"learning_rate": 5.645796575709731e-05,
"logits/chosen": 0.5220036506652832,
"logits/rejected": 1.2265688180923462,
"logps/chosen": -148.57662963867188,
"logps/rejected": -177.5288543701172,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.401430130004883,
"rewards/margins": 12.56801700592041,
"rewards/rejected": -9.166585922241211,
"step": 266
},
{
"epoch": 1.7837160751565762,
"grad_norm": 0.0005332541186362505,
"learning_rate": 5.592119411720651e-05,
"logits/chosen": 0.631429135799408,
"logits/rejected": 1.1897830963134766,
"logps/chosen": -164.9969482421875,
"logps/rejected": -215.81735229492188,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4762606620788574,
"rewards/margins": 12.587698936462402,
"rewards/rejected": -9.111437797546387,
"step": 267
},
{
"epoch": 1.7903966597077243,
"grad_norm": 0.0007586259744130075,
"learning_rate": 5.5385611849717856e-05,
"logits/chosen": 0.7947689294815063,
"logits/rejected": 1.1861993074417114,
"logps/chosen": -150.67178344726562,
"logps/rejected": -223.85858154296875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8535823822021484,
"rewards/margins": 13.847382545471191,
"rewards/rejected": -9.993799209594727,
"step": 268
},
{
"epoch": 1.7970772442588725,
"grad_norm": 0.002334814053028822,
"learning_rate": 5.4851245409712585e-05,
"logits/chosen": 0.7192946672439575,
"logits/rejected": 1.0370632410049438,
"logps/chosen": -134.57066345214844,
"logps/rejected": -200.06007385253906,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.05899715423584,
"rewards/margins": 12.443685531616211,
"rewards/rejected": -9.384687423706055,
"step": 269
},
{
"epoch": 1.803757828810021,
"grad_norm": 0.01562592014670372,
"learning_rate": 5.4318121192216314e-05,
"logits/chosen": 0.5374943017959595,
"logits/rejected": 1.2269227504730225,
"logps/chosen": -183.84536743164062,
"logps/rejected": -224.18148803710938,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.26055645942688,
"rewards/margins": 12.747486114501953,
"rewards/rejected": -9.486929893493652,
"step": 270
},
{
"epoch": 1.810438413361169,
"grad_norm": 0.00042984873289242387,
"learning_rate": 5.378626553089499e-05,
"logits/chosen": 0.4885158836841583,
"logits/rejected": 1.0233110189437866,
"logps/chosen": -206.98069763183594,
"logps/rejected": -265.67486572265625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3298757076263428,
"rewards/margins": 13.249074935913086,
"rewards/rejected": -9.91919994354248,
"step": 271
},
{
"epoch": 1.8171189979123175,
"grad_norm": 0.004903177265077829,
"learning_rate": 5.3255704696754276e-05,
"logits/chosen": 0.3477621078491211,
"logits/rejected": 1.3452774286270142,
"logps/chosen": -222.60406494140625,
"logps/rejected": -210.79745483398438,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.442991256713867,
"rewards/margins": 12.528120994567871,
"rewards/rejected": -9.08513069152832,
"step": 272
},
{
"epoch": 1.8237995824634656,
"grad_norm": 0.0014413794269785285,
"learning_rate": 5.272646489684186e-05,
"logits/chosen": 0.5790391564369202,
"logits/rejected": 0.7671089172363281,
"logps/chosen": -113.13287353515625,
"logps/rejected": -217.6890106201172,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.147888422012329,
"rewards/margins": 13.171778678894043,
"rewards/rejected": -10.023889541625977,
"step": 273
},
{
"epoch": 1.8304801670146138,
"grad_norm": 0.00028897402808070183,
"learning_rate": 5.219857227295291e-05,
"logits/chosen": 0.709730327129364,
"logits/rejected": 0.8601136207580566,
"logps/chosen": -156.91653442382812,
"logps/rejected": -295.93157958984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7537806034088135,
"rewards/margins": 13.796693801879883,
"rewards/rejected": -10.042911529541016,
"step": 274
},
{
"epoch": 1.837160751565762,
"grad_norm": 0.0007451316341757774,
"learning_rate": 5.167205290033892e-05,
"logits/chosen": 0.658225953578949,
"logits/rejected": 1.08082115650177,
"logps/chosen": -171.94277954101562,
"logps/rejected": -237.1277618408203,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.034437894821167,
"rewards/margins": 12.588874816894531,
"rewards/rejected": -9.554437637329102,
"step": 275
},
{
"epoch": 1.8438413361169101,
"grad_norm": 0.02490099146962166,
"learning_rate": 5.114693278641957e-05,
"logits/chosen": 0.6082062721252441,
"logits/rejected": 0.9871134161949158,
"logps/chosen": -178.09619140625,
"logps/rejected": -247.61810302734375,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1589179039001465,
"rewards/margins": 12.523808479309082,
"rewards/rejected": -9.364890098571777,
"step": 276
},
{
"epoch": 1.8505219206680583,
"grad_norm": 0.005023833829909563,
"learning_rate": 5.062323786949824e-05,
"logits/chosen": 0.6513435244560242,
"logits/rejected": 1.0219300985336304,
"logps/chosen": -171.38177490234375,
"logps/rejected": -259.73443603515625,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3252928256988525,
"rewards/margins": 12.778488159179688,
"rewards/rejected": -9.453194618225098,
"step": 277
},
{
"epoch": 1.8572025052192067,
"grad_norm": 0.001171917305327952,
"learning_rate": 5.0100994017480704e-05,
"logits/chosen": 0.7154785990715027,
"logits/rejected": 1.23942232131958,
"logps/chosen": -158.58740234375,
"logps/rejected": -199.68759155273438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3045573234558105,
"rewards/margins": 12.734079360961914,
"rewards/rejected": -9.429522514343262,
"step": 278
},
{
"epoch": 1.8638830897703549,
"grad_norm": 0.0008877121144905686,
"learning_rate": 4.958022702659731e-05,
"logits/chosen": 0.3124326765537262,
"logits/rejected": 1.3723382949829102,
"logps/chosen": -222.2159423828125,
"logps/rejected": -199.4228057861328,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.163365364074707,
"rewards/margins": 13.488718032836914,
"rewards/rejected": -9.325352668762207,
"step": 279
},
{
"epoch": 1.8705636743215033,
"grad_norm": 0.0010831408435478806,
"learning_rate": 4.9060962620129e-05,
"logits/chosen": 0.4300207495689392,
"logits/rejected": 1.1081445217132568,
"logps/chosen": -144.10047912597656,
"logps/rejected": -205.8716583251953,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.295637607574463,
"rewards/margins": 12.61944580078125,
"rewards/rejected": -9.323808670043945,
"step": 280
},
{
"epoch": 1.8772442588726515,
"grad_norm": 0.00025624182308092713,
"learning_rate": 4.854322644713648e-05,
"logits/chosen": 0.6763560175895691,
"logits/rejected": 0.9413488507270813,
"logps/chosen": -142.95899963378906,
"logps/rejected": -257.82391357421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.138211727142334,
"rewards/margins": 13.099564552307129,
"rewards/rejected": -9.961352348327637,
"step": 281
},
{
"epoch": 1.8839248434237996,
"grad_norm": 0.004967350512742996,
"learning_rate": 4.8027044081193434e-05,
"logits/chosen": 0.9007304906845093,
"logits/rejected": 1.1363219022750854,
"logps/chosen": -121.35027313232422,
"logps/rejected": -230.96900939941406,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0166122913360596,
"rewards/margins": 12.013561248779297,
"rewards/rejected": -8.996949195861816,
"step": 282
},
{
"epoch": 1.8906054279749478,
"grad_norm": 0.0005215193377807736,
"learning_rate": 4.751244101912317e-05,
"logits/chosen": 0.4525222182273865,
"logits/rejected": 1.0277119874954224,
"logps/chosen": -166.93800354003906,
"logps/rejected": -224.22732543945312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6909801959991455,
"rewards/margins": 12.965056419372559,
"rewards/rejected": -9.274076461791992,
"step": 283
},
{
"epoch": 1.897286012526096,
"grad_norm": 0.0002354821190237999,
"learning_rate": 4.6999442679739404e-05,
"logits/chosen": 0.5379772782325745,
"logits/rejected": 1.0427289009094238,
"logps/chosen": -121.2558364868164,
"logps/rejected": -200.19229125976562,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4199142456054688,
"rewards/margins": 12.075815200805664,
"rewards/rejected": -8.655900955200195,
"step": 284
},
{
"epoch": 1.9039665970772441,
"grad_norm": 0.00031139524071477354,
"learning_rate": 4.648807440259054e-05,
"logits/chosen": 0.6613008975982666,
"logits/rejected": 1.0043450593948364,
"logps/chosen": -117.17173767089844,
"logps/rejected": -229.09197998046875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0674209594726562,
"rewards/margins": 12.071032524108887,
"rewards/rejected": -9.003610610961914,
"step": 285
},
{
"epoch": 1.9106471816283925,
"grad_norm": 0.0003046044730581343,
"learning_rate": 4.5978361446708026e-05,
"logits/chosen": 0.5342459678649902,
"logits/rejected": 1.357889175415039,
"logps/chosen": -207.97657775878906,
"logps/rejected": -223.646240234375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.909088611602783,
"rewards/margins": 12.76576042175293,
"rewards/rejected": -8.856673240661621,
"step": 286
},
{
"epoch": 1.9173277661795407,
"grad_norm": 0.003803228959441185,
"learning_rate": 4.547032898935883e-05,
"logits/chosen": 0.42278778553009033,
"logits/rejected": 0.9761509895324707,
"logps/chosen": -219.009765625,
"logps/rejected": -267.2040100097656,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5123653411865234,
"rewards/margins": 12.867091178894043,
"rewards/rejected": -9.354723930358887,
"step": 287
},
{
"epoch": 1.924008350730689,
"grad_norm": 0.0037957087624818087,
"learning_rate": 4.496400212480165e-05,
"logits/chosen": 0.5482458472251892,
"logits/rejected": 1.2564729452133179,
"logps/chosen": -150.4049072265625,
"logps/rejected": -185.18603515625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.50251841545105,
"rewards/margins": 12.41536808013916,
"rewards/rejected": -8.912851333618164,
"step": 288
},
{
"epoch": 1.9306889352818373,
"grad_norm": 0.00025647124857641757,
"learning_rate": 4.445940586304742e-05,
"logits/chosen": 0.2872941493988037,
"logits/rejected": 1.2851579189300537,
"logps/chosen": -186.39442443847656,
"logps/rejected": -196.6807861328125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.469214916229248,
"rewards/margins": 12.677154541015625,
"rewards/rejected": -9.207941055297852,
"step": 289
},
{
"epoch": 1.9373695198329854,
"grad_norm": 0.002922415267676115,
"learning_rate": 4.3956565128623996e-05,
"logits/chosen": 0.42484214901924133,
"logits/rejected": 0.8704441785812378,
"logps/chosen": -172.12071228027344,
"logps/rejected": -256.0958557128906,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3208203315734863,
"rewards/margins": 13.219375610351562,
"rewards/rejected": -9.898555755615234,
"step": 290
},
{
"epoch": 1.9440501043841336,
"grad_norm": 0.00034838722785934806,
"learning_rate": 4.3455504759345056e-05,
"logits/chosen": 0.3946557641029358,
"logits/rejected": 1.1197338104248047,
"logps/chosen": -186.8988037109375,
"logps/rejected": -234.5850830078125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.767469882965088,
"rewards/margins": 13.301392555236816,
"rewards/rejected": -9.53392219543457,
"step": 291
},
{
"epoch": 1.9507306889352818,
"grad_norm": 0.0006544087664224207,
"learning_rate": 4.295624950508295e-05,
"logits/chosen": 0.4687265455722809,
"logits/rejected": 1.0979089736938477,
"logps/chosen": -126.43232727050781,
"logps/rejected": -183.7377166748047,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1935901641845703,
"rewards/margins": 11.762184143066406,
"rewards/rejected": -8.56859302520752,
"step": 292
},
{
"epoch": 1.95741127348643,
"grad_norm": 0.00044172187335789204,
"learning_rate": 4.24588240265466e-05,
"logits/chosen": 0.8599841594696045,
"logits/rejected": 0.7370025515556335,
"logps/chosen": -138.1736297607422,
"logps/rejected": -296.47723388671875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7924275398254395,
"rewards/margins": 14.83957290649414,
"rewards/rejected": -11.04714584350586,
"step": 293
},
{
"epoch": 1.9640918580375781,
"grad_norm": 0.001833213260397315,
"learning_rate": 4.1963252894063056e-05,
"logits/chosen": 0.5297300815582275,
"logits/rejected": 0.8016726970672607,
"logps/chosen": -158.78738403320312,
"logps/rejected": -272.85113525390625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3024632930755615,
"rewards/margins": 13.22446060180664,
"rewards/rejected": -9.921996116638184,
"step": 294
},
{
"epoch": 1.9707724425887265,
"grad_norm": 0.0010583212133497,
"learning_rate": 4.146956058636406e-05,
"logits/chosen": 0.3446853756904602,
"logits/rejected": 1.0419793128967285,
"logps/chosen": -150.81710815429688,
"logps/rejected": -210.25636291503906,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4412221908569336,
"rewards/margins": 12.799392700195312,
"rewards/rejected": -9.358169555664062,
"step": 295
},
{
"epoch": 1.9774530271398747,
"grad_norm": 0.001035829191096127,
"learning_rate": 4.097777148937663e-05,
"logits/chosen": 0.5304365754127502,
"logits/rejected": 0.9279758334159851,
"logps/chosen": -155.19656372070312,
"logps/rejected": -245.78414916992188,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.409811496734619,
"rewards/margins": 12.613784790039062,
"rewards/rejected": -9.203972816467285,
"step": 296
},
{
"epoch": 1.984133611691023,
"grad_norm": 0.0017729728715494275,
"learning_rate": 4.048790989501893e-05,
"logits/chosen": 0.5875007510185242,
"logits/rejected": 1.1323366165161133,
"logps/chosen": -140.05502319335938,
"logps/rejected": -218.2332000732422,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.322587251663208,
"rewards/margins": 13.057868003845215,
"rewards/rejected": -9.73528003692627,
"step": 297
},
{
"epoch": 1.9908141962421713,
"grad_norm": 0.0009540588362142444,
"learning_rate": 4.0000000000000024e-05,
"logits/chosen": 0.5278233289718628,
"logits/rejected": 1.2491222620010376,
"logps/chosen": -195.366943359375,
"logps/rejected": -243.4252471923828,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.871669292449951,
"rewards/margins": 13.229048728942871,
"rewards/rejected": -9.357379913330078,
"step": 298
},
{
"epoch": 1.9974947807933194,
"grad_norm": 0.000431572028901428,
"learning_rate": 3.951406590462479e-05,
"logits/chosen": 0.8149689435958862,
"logits/rejected": 0.9918652772903442,
"logps/chosen": -132.57635498046875,
"logps/rejected": -227.2015380859375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2055227756500244,
"rewards/margins": 12.865432739257812,
"rewards/rejected": -9.659910202026367,
"step": 299
},
{
"epoch": 2.0041753653444676,
"grad_norm": 0.000286836177110672,
"learning_rate": 3.9030131611603605e-05,
"logits/chosen": 0.5436272621154785,
"logits/rejected": 1.0624536275863647,
"logps/chosen": -145.8037872314453,
"logps/rejected": -211.19740295410156,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.796835422515869,
"rewards/margins": 13.231882095336914,
"rewards/rejected": -9.435047149658203,
"step": 300
},
{
"epoch": 2.010855949895616,
"grad_norm": 0.012853999622166157,
"learning_rate": 3.854822102486654e-05,
"logits/chosen": 0.36405637860298157,
"logits/rejected": 1.1111705303192139,
"logps/chosen": -160.4470672607422,
"logps/rejected": -197.8324737548828,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6854968070983887,
"rewards/margins": 12.909117698669434,
"rewards/rejected": -9.223621368408203,
"step": 301
},
{
"epoch": 2.017536534446764,
"grad_norm": 0.0005720301996916533,
"learning_rate": 3.8068357948382715e-05,
"logits/chosen": 0.554153561592102,
"logits/rejected": 1.1136068105697632,
"logps/chosen": -136.82049560546875,
"logps/rejected": -201.56800842285156,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.509455442428589,
"rewards/margins": 13.126138687133789,
"rewards/rejected": -9.616683006286621,
"step": 302
},
{
"epoch": 2.024217118997912,
"grad_norm": 0.0013834366109222174,
"learning_rate": 3.759056608498451e-05,
"logits/chosen": 0.23741646111011505,
"logits/rejected": 1.0043878555297852,
"logps/chosen": -145.64682006835938,
"logps/rejected": -184.83242797851562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1809093952178955,
"rewards/margins": 12.4395751953125,
"rewards/rejected": -9.258666038513184,
"step": 303
},
{
"epoch": 2.0308977035490607,
"grad_norm": 0.00023377075558528304,
"learning_rate": 3.7114869035196815e-05,
"logits/chosen": 0.5432067513465881,
"logits/rejected": 1.153996467590332,
"logps/chosen": -167.0084686279297,
"logps/rejected": -215.29714965820312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3663644790649414,
"rewards/margins": 12.530076026916504,
"rewards/rejected": -9.163711547851562,
"step": 304
},
{
"epoch": 2.037578288100209,
"grad_norm": 0.0050998348742723465,
"learning_rate": 3.6641290296071134e-05,
"logits/chosen": 0.6319041848182678,
"logits/rejected": 1.0745893716812134,
"logps/chosen": -189.58924865722656,
"logps/rejected": -271.85601806640625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.620217800140381,
"rewards/margins": 13.474288940429688,
"rewards/rejected": -9.854070663452148,
"step": 305
},
{
"epoch": 2.044258872651357,
"grad_norm": 0.0010785937774926424,
"learning_rate": 3.616985326002506e-05,
"logits/chosen": 0.603534460067749,
"logits/rejected": 0.9770067930221558,
"logps/chosen": -131.9829864501953,
"logps/rejected": -214.7379608154297,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.453455924987793,
"rewards/margins": 12.670695304870605,
"rewards/rejected": -9.217240333557129,
"step": 306
},
{
"epoch": 2.0509394572025053,
"grad_norm": 0.00044056694605387747,
"learning_rate": 3.570058121368678e-05,
"logits/chosen": 0.7243631482124329,
"logits/rejected": 1.2042169570922852,
"logps/chosen": -167.11720275878906,
"logps/rejected": -231.396728515625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.567816972732544,
"rewards/margins": 12.468470573425293,
"rewards/rejected": -8.900653839111328,
"step": 307
},
{
"epoch": 2.0576200417536534,
"grad_norm": 0.0021745203994214535,
"learning_rate": 3.5233497336744907e-05,
"logits/chosen": 0.3905419111251831,
"logits/rejected": 0.9584081172943115,
"logps/chosen": -159.60000610351562,
"logps/rejected": -249.20263671875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5410196781158447,
"rewards/margins": 13.085066795349121,
"rewards/rejected": -9.544046401977539,
"step": 308
},
{
"epoch": 2.0643006263048016,
"grad_norm": 0.003901979187503457,
"learning_rate": 3.476862470080329e-05,
"logits/chosen": 0.3295363187789917,
"logits/rejected": 1.1402533054351807,
"logps/chosen": -155.79783630371094,
"logps/rejected": -215.50086975097656,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6909849643707275,
"rewards/margins": 12.775930404663086,
"rewards/rejected": -9.084944725036621,
"step": 309
},
{
"epoch": 2.0709812108559498,
"grad_norm": 0.0003340893890708685,
"learning_rate": 3.4305986268241716e-05,
"logits/chosen": 0.5111173391342163,
"logits/rejected": 1.2973905801773071,
"logps/chosen": -179.60287475585938,
"logps/rejected": -192.02505493164062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3571372032165527,
"rewards/margins": 11.964003562927246,
"rewards/rejected": -8.606865882873535,
"step": 310
},
{
"epoch": 2.077661795407098,
"grad_norm": 0.001990803051739931,
"learning_rate": 3.3845604891081396e-05,
"logits/chosen": 0.4202510118484497,
"logits/rejected": 0.9384629130363464,
"logps/chosen": -138.1681365966797,
"logps/rejected": -208.4151153564453,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3490982055664062,
"rewards/margins": 12.660693168640137,
"rewards/rejected": -9.311594009399414,
"step": 311
},
{
"epoch": 2.0843423799582466,
"grad_norm": 0.0005957921384833753,
"learning_rate": 3.338750330985638e-05,
"logits/chosen": 0.28172874450683594,
"logits/rejected": 1.208389163017273,
"logps/chosen": -174.9169158935547,
"logps/rejected": -188.84228515625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.366245746612549,
"rewards/margins": 12.091064453125,
"rewards/rejected": -8.724818229675293,
"step": 312
},
{
"epoch": 2.0910229645093947,
"grad_norm": 0.00024528999347239733,
"learning_rate": 3.293170415249015e-05,
"logits/chosen": 0.39942628145217896,
"logits/rejected": 0.9915326833724976,
"logps/chosen": -144.50625610351562,
"logps/rejected": -200.2137451171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.865654706954956,
"rewards/margins": 12.166391372680664,
"rewards/rejected": -9.300737380981445,
"step": 313
},
{
"epoch": 2.097703549060543,
"grad_norm": 0.000993763329461217,
"learning_rate": 3.247822993317809e-05,
"logits/chosen": 0.22010907530784607,
"logits/rejected": 1.199426531791687,
"logps/chosen": -169.88729858398438,
"logps/rejected": -175.47982788085938,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4611406326293945,
"rewards/margins": 12.266228675842285,
"rewards/rejected": -8.80508804321289,
"step": 314
},
{
"epoch": 2.104384133611691,
"grad_norm": 0.0003548046515788883,
"learning_rate": 3.202710305127518e-05,
"logits/chosen": 0.47131308913230896,
"logits/rejected": 1.1059740781784058,
"logps/chosen": -186.29026794433594,
"logps/rejected": -250.04234313964844,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3379273414611816,
"rewards/margins": 13.453146934509277,
"rewards/rejected": -10.115219116210938,
"step": 315
},
{
"epoch": 2.1110647181628392,
"grad_norm": 0.0008554239757359028,
"learning_rate": 3.157834579018972e-05,
"logits/chosen": 0.513791024684906,
"logits/rejected": 1.2864115238189697,
"logps/chosen": -196.79559326171875,
"logps/rejected": -204.86273193359375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6547086238861084,
"rewards/margins": 12.088726997375488,
"rewards/rejected": -8.434019088745117,
"step": 316
},
{
"epoch": 2.1177453027139874,
"grad_norm": 0.003397479420527816,
"learning_rate": 3.113198031628267e-05,
"logits/chosen": 0.6464900374412537,
"logits/rejected": 1.0889447927474976,
"logps/chosen": -180.78738403320312,
"logps/rejected": -245.6759033203125,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6549572944641113,
"rewards/margins": 13.39869499206543,
"rewards/rejected": -9.74373722076416,
"step": 317
},
{
"epoch": 2.1244258872651356,
"grad_norm": 0.0010641631670296192,
"learning_rate": 3.0688028677772615e-05,
"logits/chosen": 0.11827311664819717,
"logits/rejected": 1.2251101732254028,
"logps/chosen": -204.45823669433594,
"logps/rejected": -189.8067626953125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.575129985809326,
"rewards/margins": 12.471538543701172,
"rewards/rejected": -8.896408081054688,
"step": 318
},
{
"epoch": 2.1311064718162838,
"grad_norm": 0.0023302403278648853,
"learning_rate": 3.0246512803646787e-05,
"logits/chosen": 0.5905887484550476,
"logits/rejected": 1.3115737438201904,
"logps/chosen": -160.69956970214844,
"logps/rejected": -200.94296264648438,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5085368156433105,
"rewards/margins": 13.034270286560059,
"rewards/rejected": -9.525733947753906,
"step": 319
},
{
"epoch": 2.137787056367432,
"grad_norm": 0.002920424798503518,
"learning_rate": 2.980745450257782e-05,
"logits/chosen": 0.793439507484436,
"logits/rejected": 1.0042898654937744,
"logps/chosen": -121.60202026367188,
"logps/rejected": -215.73321533203125,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.618967294692993,
"rewards/margins": 13.545495986938477,
"rewards/rejected": -9.926527976989746,
"step": 320
},
{
"epoch": 2.1444676409185806,
"grad_norm": 0.002082268940284848,
"learning_rate": 2.9370875461846675e-05,
"logits/chosen": 0.6137429475784302,
"logits/rejected": 1.13056218624115,
"logps/chosen": -131.8321533203125,
"logps/rejected": -184.77304077148438,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.843712329864502,
"rewards/margins": 11.69206428527832,
"rewards/rejected": -8.848353385925293,
"step": 321
},
{
"epoch": 2.1511482254697287,
"grad_norm": 0.0028337843250483274,
"learning_rate": 2.8936797246271074e-05,
"logits/chosen": 0.6332962512969971,
"logits/rejected": 0.9748692512512207,
"logps/chosen": -129.75018310546875,
"logps/rejected": -253.5199432373047,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8473052978515625,
"rewards/margins": 12.578422546386719,
"rewards/rejected": -9.731117248535156,
"step": 322
},
{
"epoch": 2.157828810020877,
"grad_norm": 0.0006396812968887389,
"learning_rate": 2.8505241297140674e-05,
"logits/chosen": 0.5061097145080566,
"logits/rejected": 0.7550954818725586,
"logps/chosen": -132.68972778320312,
"logps/rejected": -245.25881958007812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3926968574523926,
"rewards/margins": 13.062835693359375,
"rewards/rejected": -9.67013931274414,
"step": 323
},
{
"epoch": 2.164509394572025,
"grad_norm": 0.010461756028234959,
"learning_rate": 2.807622893115773e-05,
"logits/chosen": 0.2978319525718689,
"logits/rejected": 1.3880324363708496,
"logps/chosen": -172.8788299560547,
"logps/rejected": -175.39361572265625,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.776562452316284,
"rewards/margins": 12.051597595214844,
"rewards/rejected": -8.27503490447998,
"step": 324
},
{
"epoch": 2.1711899791231732,
"grad_norm": 0.0011066849110648036,
"learning_rate": 2.7649781339384224e-05,
"logits/chosen": 0.4691530466079712,
"logits/rejected": 1.1270420551300049,
"logps/chosen": -153.82896423339844,
"logps/rejected": -219.26405334472656,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.471506357192993,
"rewards/margins": 12.436012268066406,
"rewards/rejected": -8.964506149291992,
"step": 325
},
{
"epoch": 2.1778705636743214,
"grad_norm": 0.0004933910095132887,
"learning_rate": 2.7225919586195133e-05,
"logits/chosen": 0.42578768730163574,
"logits/rejected": 0.9353082776069641,
"logps/chosen": -145.8098907470703,
"logps/rejected": -232.33059692382812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.495699882507324,
"rewards/margins": 13.425835609436035,
"rewards/rejected": -9.930134773254395,
"step": 326
},
{
"epoch": 2.1845511482254696,
"grad_norm": 0.0005303247016854584,
"learning_rate": 2.6804664608238035e-05,
"logits/chosen": 0.38650327920913696,
"logits/rejected": 0.703351616859436,
"logps/chosen": -127.17084503173828,
"logps/rejected": -228.81396484375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2400314807891846,
"rewards/margins": 14.320934295654297,
"rewards/rejected": -11.080903053283691,
"step": 327
},
{
"epoch": 2.1912317327766178,
"grad_norm": 0.0009748793672770262,
"learning_rate": 2.6386037213398786e-05,
"logits/chosen": 0.4487416744232178,
"logits/rejected": 1.1172525882720947,
"logps/chosen": -160.41439819335938,
"logps/rejected": -252.93212890625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4267497062683105,
"rewards/margins": 12.388287544250488,
"rewards/rejected": -8.961536407470703,
"step": 328
},
{
"epoch": 2.1979123173277664,
"grad_norm": 0.0002127394254785031,
"learning_rate": 2.5970058079773816e-05,
"logits/chosen": 0.5297442674636841,
"logits/rejected": 1.090707778930664,
"logps/chosen": -146.61062622070312,
"logps/rejected": -222.10076904296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.185678720474243,
"rewards/margins": 12.19675064086914,
"rewards/rejected": -9.011072158813477,
"step": 329
},
{
"epoch": 2.2045929018789145,
"grad_norm": 0.00024031591601669788,
"learning_rate": 2.5556747754648718e-05,
"logits/chosen": 0.3269326686859131,
"logits/rejected": 1.258778691291809,
"logps/chosen": -171.55960083007812,
"logps/rejected": -198.94358825683594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.729045867919922,
"rewards/margins": 12.932186126708984,
"rewards/rejected": -9.203140258789062,
"step": 330
},
{
"epoch": 2.2112734864300627,
"grad_norm": 0.0011042467085644603,
"learning_rate": 2.5146126653483355e-05,
"logits/chosen": 0.5408863425254822,
"logits/rejected": 0.9578089118003845,
"logps/chosen": -144.37818908691406,
"logps/rejected": -230.30712890625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7335550785064697,
"rewards/margins": 13.26760196685791,
"rewards/rejected": -9.53404712677002,
"step": 331
},
{
"epoch": 2.217954070981211,
"grad_norm": 0.000509345147293061,
"learning_rate": 2.4738215058903343e-05,
"logits/chosen": 0.23819413781166077,
"logits/rejected": 1.2504065036773682,
"logps/chosen": -200.8672637939453,
"logps/rejected": -195.00680541992188,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.456376075744629,
"rewards/margins": 11.623927116394043,
"rewards/rejected": -8.167550086975098,
"step": 332
},
{
"epoch": 2.224634655532359,
"grad_norm": 0.0002333044249098748,
"learning_rate": 2.4333033119698267e-05,
"logits/chosen": 0.47582709789276123,
"logits/rejected": 0.9356101155281067,
"logps/chosen": -141.839599609375,
"logps/rejected": -238.50204467773438,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.40701961517334,
"rewards/margins": 13.678300857543945,
"rewards/rejected": -10.271280288696289,
"step": 333
},
{
"epoch": 2.2313152400835072,
"grad_norm": 0.0004591036995407194,
"learning_rate": 2.393060084982639e-05,
"logits/chosen": 0.37793320417404175,
"logits/rejected": 1.3741904497146606,
"logps/chosen": -185.0458984375,
"logps/rejected": -205.19021606445312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4873390197753906,
"rewards/margins": 13.392151832580566,
"rewards/rejected": -9.90481185913086,
"step": 334
},
{
"epoch": 2.2379958246346554,
"grad_norm": 0.00019663709099404514,
"learning_rate": 2.3530938127426098e-05,
"logits/chosen": 0.4617392420768738,
"logits/rejected": 1.034355878829956,
"logps/chosen": -171.1890106201172,
"logps/rejected": -242.88619995117188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.821230411529541,
"rewards/margins": 14.595293045043945,
"rewards/rejected": -10.774063110351562,
"step": 335
},
{
"epoch": 2.2446764091858036,
"grad_norm": 0.0010577983921393752,
"learning_rate": 2.3134064693834022e-05,
"logits/chosen": 0.5003147721290588,
"logits/rejected": 1.3415861129760742,
"logps/chosen": -184.5679473876953,
"logps/rejected": -215.14340209960938,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.04392671585083,
"rewards/margins": 12.149682998657227,
"rewards/rejected": -9.105756759643555,
"step": 336
},
{
"epoch": 2.2513569937369518,
"grad_norm": 0.0007893574074842036,
"learning_rate": 2.274000015260988e-05,
"logits/chosen": 0.5415345430374146,
"logits/rejected": 0.9807635545730591,
"logps/chosen": -128.60086059570312,
"logps/rejected": -237.58474731445312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.333793878555298,
"rewards/margins": 13.319954872131348,
"rewards/rejected": -9.986162185668945,
"step": 337
},
{
"epoch": 2.2580375782881004,
"grad_norm": 0.003091056365519762,
"learning_rate": 2.234876396856817e-05,
"logits/chosen": 0.5783103704452515,
"logits/rejected": 1.0742615461349487,
"logps/chosen": -175.44082641601562,
"logps/rejected": -240.58963012695312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.027278423309326,
"rewards/margins": 14.690441131591797,
"rewards/rejected": -10.663162231445312,
"step": 338
},
{
"epoch": 2.2647181628392485,
"grad_norm": 0.0008333343430422246,
"learning_rate": 2.1960375466816685e-05,
"logits/chosen": 0.4626193344593048,
"logits/rejected": 1.3312710523605347,
"logps/chosen": -179.23594665527344,
"logps/rejected": -204.65113830566406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.346703290939331,
"rewards/margins": 12.753974914550781,
"rewards/rejected": -9.407270431518555,
"step": 339
},
{
"epoch": 2.2713987473903967,
"grad_norm": 0.00037302737473510206,
"learning_rate": 2.1574853831802062e-05,
"logits/chosen": 0.592528760433197,
"logits/rejected": 1.1724703311920166,
"logps/chosen": -141.2781524658203,
"logps/rejected": -205.59986877441406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.087510824203491,
"rewards/margins": 11.936580657958984,
"rewards/rejected": -8.849069595336914,
"step": 340
},
{
"epoch": 2.278079331941545,
"grad_norm": 0.00047880798229016364,
"learning_rate": 2.1192218106362004e-05,
"logits/chosen": 0.4639968276023865,
"logits/rejected": 1.1247094869613647,
"logps/chosen": -167.94326782226562,
"logps/rejected": -223.88681030273438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.112114429473877,
"rewards/margins": 12.083959579467773,
"rewards/rejected": -8.971845626831055,
"step": 341
},
{
"epoch": 2.284759916492693,
"grad_norm": 0.0006552104605361819,
"learning_rate": 2.0812487190784765e-05,
"logits/chosen": 0.4680987298488617,
"logits/rejected": 1.1858527660369873,
"logps/chosen": -148.18118286132812,
"logps/rejected": -187.73463439941406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.776710033416748,
"rewards/margins": 12.323975563049316,
"rewards/rejected": -8.54726505279541,
"step": 342
},
{
"epoch": 2.2914405010438412,
"grad_norm": 0.0006427404005080462,
"learning_rate": 2.0435679841875517e-05,
"logits/chosen": 0.5988658666610718,
"logits/rejected": 0.8582466244697571,
"logps/chosen": -140.76695251464844,
"logps/rejected": -239.03146362304688,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.353756904602051,
"rewards/margins": 13.393622398376465,
"rewards/rejected": -10.039865493774414,
"step": 343
},
{
"epoch": 2.2981210855949894,
"grad_norm": 0.000361485785106197,
"learning_rate": 2.0061814672029964e-05,
"logits/chosen": 0.6378865838050842,
"logits/rejected": 1.3802528381347656,
"logps/chosen": -165.37449645996094,
"logps/rejected": -207.62039184570312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.260084390640259,
"rewards/margins": 12.411645889282227,
"rewards/rejected": -9.151561737060547,
"step": 344
},
{
"epoch": 2.304801670146138,
"grad_norm": 0.002754014451056719,
"learning_rate": 1.9690910148314746e-05,
"logits/chosen": 0.32854142785072327,
"logits/rejected": 1.3232519626617432,
"logps/chosen": -195.93017578125,
"logps/rejected": -183.62518310546875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4481635093688965,
"rewards/margins": 12.422575950622559,
"rewards/rejected": -8.974411964416504,
"step": 345
},
{
"epoch": 2.311482254697286,
"grad_norm": 0.0057357000187039375,
"learning_rate": 1.9322984591555593e-05,
"logits/chosen": 0.5361195802688599,
"logits/rejected": 1.024005651473999,
"logps/chosen": -166.576904296875,
"logps/rejected": -230.3949432373047,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.127915859222412,
"rewards/margins": 12.562141418457031,
"rewards/rejected": -9.434226036071777,
"step": 346
},
{
"epoch": 2.3181628392484344,
"grad_norm": 0.0005040675168856978,
"learning_rate": 1.8958056175432064e-05,
"logits/chosen": 0.4197937250137329,
"logits/rejected": 0.8660867214202881,
"logps/chosen": -144.34555053710938,
"logps/rejected": -255.72872924804688,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1396241188049316,
"rewards/margins": 13.073751449584961,
"rewards/rejected": -9.934127807617188,
"step": 347
},
{
"epoch": 2.3248434237995825,
"grad_norm": 0.000678808952216059,
"learning_rate": 1.8596142925580008e-05,
"logits/chosen": 0.6340179443359375,
"logits/rejected": 0.8392489552497864,
"logps/chosen": -140.9353485107422,
"logps/rejected": -238.82012939453125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.632033586502075,
"rewards/margins": 13.527902603149414,
"rewards/rejected": -9.895870208740234,
"step": 348
},
{
"epoch": 2.3315240083507307,
"grad_norm": 0.003711126046255231,
"learning_rate": 1.823726271870122e-05,
"logits/chosen": 0.6699475049972534,
"logits/rejected": 1.1883944272994995,
"logps/chosen": -143.63088989257812,
"logps/rejected": -210.8107147216797,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.578331470489502,
"rewards/margins": 12.837823867797852,
"rewards/rejected": -9.259491920471191,
"step": 349
},
{
"epoch": 2.338204592901879,
"grad_norm": 0.00021632130665238947,
"learning_rate": 1.7881433281680297e-05,
"logits/chosen": 0.5508967041969299,
"logits/rejected": 1.1000094413757324,
"logps/chosen": -149.94349670410156,
"logps/rejected": -218.51022338867188,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.072751522064209,
"rewards/margins": 12.319024085998535,
"rewards/rejected": -9.246273040771484,
"step": 350
},
{
"epoch": 2.344885177453027,
"grad_norm": 0.003893829882144928,
"learning_rate": 1.752867219070912e-05,
"logits/chosen": 0.4527266323566437,
"logits/rejected": 1.2279529571533203,
"logps/chosen": -193.52725219726562,
"logps/rejected": -230.76287841796875,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5734779834747314,
"rewards/margins": 13.716750144958496,
"rewards/rejected": -10.143270492553711,
"step": 351
},
{
"epoch": 2.351565762004175,
"grad_norm": 0.004190846811980009,
"learning_rate": 1.717899687041861e-05,
"logits/chosen": 0.4203382134437561,
"logits/rejected": 0.896210253238678,
"logps/chosen": -174.0313262939453,
"logps/rejected": -229.40170288085938,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.109071969985962,
"rewards/margins": 12.539839744567871,
"rewards/rejected": -9.430768013000488,
"step": 352
},
{
"epoch": 2.3582463465553234,
"grad_norm": 0.0006010103388689458,
"learning_rate": 1.6832424593018145e-05,
"logits/chosen": 0.3124818801879883,
"logits/rejected": 1.187449336051941,
"logps/chosen": -175.65945434570312,
"logps/rejected": -194.64474487304688,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.353351354598999,
"rewards/margins": 12.400053977966309,
"rewards/rejected": -9.04670238494873,
"step": 353
},
{
"epoch": 2.364926931106472,
"grad_norm": 0.0013186561409384012,
"learning_rate": 1.648897247744224e-05,
"logits/chosen": 0.5556175112724304,
"logits/rejected": 0.9724099636077881,
"logps/chosen": -175.44541931152344,
"logps/rejected": -247.06283569335938,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4358012676239014,
"rewards/margins": 13.10071086883545,
"rewards/rejected": -9.664909362792969,
"step": 354
},
{
"epoch": 2.37160751565762,
"grad_norm": 0.0004016379243694246,
"learning_rate": 1.6148657488505116e-05,
"logits/chosen": 0.5522690415382385,
"logits/rejected": 1.1395349502563477,
"logps/chosen": -172.11297607421875,
"logps/rejected": -244.9354248046875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4882326126098633,
"rewards/margins": 13.11169719696045,
"rewards/rejected": -9.623465538024902,
"step": 355
},
{
"epoch": 2.3782881002087684,
"grad_norm": 0.003400665707886219,
"learning_rate": 1.581149643606257e-05,
"logits/chosen": 0.4829326868057251,
"logits/rejected": 1.142820954322815,
"logps/chosen": -204.84255981445312,
"logps/rejected": -255.23028564453125,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.039217472076416,
"rewards/margins": 12.172881126403809,
"rewards/rejected": -9.13366413116455,
"step": 356
},
{
"epoch": 2.3849686847599165,
"grad_norm": 0.00029609608463943005,
"learning_rate": 1.5477505974181858e-05,
"logits/chosen": 0.4647904932498932,
"logits/rejected": 1.0525555610656738,
"logps/chosen": -152.51803588867188,
"logps/rejected": -240.22915649414062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.57979154586792,
"rewards/margins": 14.005621910095215,
"rewards/rejected": -10.425830841064453,
"step": 357
},
{
"epoch": 2.3916492693110647,
"grad_norm": 0.0010005651274695992,
"learning_rate": 1.5146702600318795e-05,
"logits/chosen": 0.6249891519546509,
"logits/rejected": 1.248299241065979,
"logps/chosen": -153.75282287597656,
"logps/rejected": -212.9418182373047,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2799882888793945,
"rewards/margins": 12.723299980163574,
"rewards/rejected": -9.44331169128418,
"step": 358
},
{
"epoch": 2.398329853862213,
"grad_norm": 0.0012129038805142045,
"learning_rate": 1.4819102654503143e-05,
"logits/chosen": 0.5324082970619202,
"logits/rejected": 0.9918537139892578,
"logps/chosen": -194.02386474609375,
"logps/rejected": -250.39137268066406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.799360513687134,
"rewards/margins": 13.776371955871582,
"rewards/rejected": -9.977011680603027,
"step": 359
},
{
"epoch": 2.405010438413361,
"grad_norm": 0.0011025875573977828,
"learning_rate": 1.4494722318531272e-05,
"logits/chosen": 0.5399448871612549,
"logits/rejected": 0.688692569732666,
"logps/chosen": -115.42459869384766,
"logps/rejected": -260.16656494140625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4390218257904053,
"rewards/margins": 13.882972717285156,
"rewards/rejected": -10.443949699401855,
"step": 360
},
{
"epoch": 2.411691022964509,
"grad_norm": 0.00040039754821918905,
"learning_rate": 1.4173577615167014e-05,
"logits/chosen": 0.5479378700256348,
"logits/rejected": 1.0258996486663818,
"logps/chosen": -148.38746643066406,
"logps/rejected": -235.00123596191406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.461238384246826,
"rewards/margins": 12.922686576843262,
"rewards/rejected": -9.461446762084961,
"step": 361
},
{
"epoch": 2.418371607515658,
"grad_norm": 0.00040944863576442003,
"learning_rate": 1.3855684407350087e-05,
"logits/chosen": 0.305247962474823,
"logits/rejected": 0.8850731253623962,
"logps/chosen": -183.35250854492188,
"logps/rejected": -245.67764282226562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0578300952911377,
"rewards/margins": 12.985921859741211,
"rewards/rejected": -9.928091049194336,
"step": 362
},
{
"epoch": 2.425052192066806,
"grad_norm": 0.0013041149359196424,
"learning_rate": 1.3541058397412719e-05,
"logits/chosen": 0.9205292463302612,
"logits/rejected": 1.027268409729004,
"logps/chosen": -139.70252990722656,
"logps/rejected": -251.70785522460938,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5465292930603027,
"rewards/margins": 14.001523971557617,
"rewards/rejected": -10.454995155334473,
"step": 363
},
{
"epoch": 2.431732776617954,
"grad_norm": 0.002102568047121167,
"learning_rate": 1.3229715126303835e-05,
"logits/chosen": 0.5301153659820557,
"logits/rejected": 1.125875473022461,
"logps/chosen": -195.70343017578125,
"logps/rejected": -248.96484375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5196890830993652,
"rewards/margins": 13.107810974121094,
"rewards/rejected": -9.588122367858887,
"step": 364
},
{
"epoch": 2.4384133611691023,
"grad_norm": 0.000353420153260231,
"learning_rate": 1.292166997282152e-05,
"logits/chosen": 0.5683207511901855,
"logits/rejected": 1.1430654525756836,
"logps/chosen": -187.99229431152344,
"logps/rejected": -233.15504455566406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5683434009552,
"rewards/margins": 12.799301147460938,
"rewards/rejected": -9.230957984924316,
"step": 365
},
{
"epoch": 2.4450939457202505,
"grad_norm": 0.0007050547283142805,
"learning_rate": 1.2616938152853435e-05,
"logits/chosen": 0.7088552117347717,
"logits/rejected": 1.1877071857452393,
"logps/chosen": -175.97979736328125,
"logps/rejected": -242.88336181640625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2787091732025146,
"rewards/margins": 13.82058048248291,
"rewards/rejected": -10.5418701171875,
"step": 366
},
{
"epoch": 2.4517745302713987,
"grad_norm": 0.0010725195752456784,
"learning_rate": 1.2315534718625082e-05,
"logits/chosen": 0.5767889022827148,
"logits/rejected": 1.0672720670700073,
"logps/chosen": -131.60899353027344,
"logps/rejected": -218.16946411132812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5861661434173584,
"rewards/margins": 12.36983871459961,
"rewards/rejected": -8.783672332763672,
"step": 367
},
{
"epoch": 2.458455114822547,
"grad_norm": 0.0022976077161729336,
"learning_rate": 1.2017474557956424e-05,
"logits/chosen": 0.5557492971420288,
"logits/rejected": 1.0859577655792236,
"logps/chosen": -138.25811767578125,
"logps/rejected": -223.6555633544922,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.32340669631958,
"rewards/margins": 12.937990188598633,
"rewards/rejected": -9.614583015441895,
"step": 368
},
{
"epoch": 2.465135699373695,
"grad_norm": 0.0001732637465465814,
"learning_rate": 1.1722772393526402e-05,
"logits/chosen": 0.3511442542076111,
"logits/rejected": 1.180907130241394,
"logps/chosen": -212.04290771484375,
"logps/rejected": -232.52906799316406,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6756913661956787,
"rewards/margins": 14.446080207824707,
"rewards/rejected": -10.770389556884766,
"step": 369
},
{
"epoch": 2.471816283924843,
"grad_norm": 0.0014034606283530593,
"learning_rate": 1.1431442782145878e-05,
"logits/chosen": 0.4913908839225769,
"logits/rejected": 1.1496714353561401,
"logps/chosen": -186.36854553222656,
"logps/rejected": -201.04025268554688,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4986705780029297,
"rewards/margins": 12.48499870300293,
"rewards/rejected": -8.986328125,
"step": 370
},
{
"epoch": 2.478496868475992,
"grad_norm": 0.005758744198828936,
"learning_rate": 1.1143500114038335e-05,
"logits/chosen": 0.4534023106098175,
"logits/rejected": 1.1205188035964966,
"logps/chosen": -162.62062072753906,
"logps/rejected": -195.78395080566406,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2878878116607666,
"rewards/margins": 13.010221481323242,
"rewards/rejected": -9.722332954406738,
"step": 371
},
{
"epoch": 2.48517745302714,
"grad_norm": 0.006383196916431189,
"learning_rate": 1.0858958612129346e-05,
"logits/chosen": 0.4629024565219879,
"logits/rejected": 1.1753510236740112,
"logps/chosen": -151.94247436523438,
"logps/rejected": -198.45919799804688,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5583202838897705,
"rewards/margins": 13.165674209594727,
"rewards/rejected": -9.607353210449219,
"step": 372
},
{
"epoch": 2.491858037578288,
"grad_norm": 0.00029526828438974917,
"learning_rate": 1.0577832331343835e-05,
"logits/chosen": 0.46336808800697327,
"logits/rejected": 0.9336625933647156,
"logps/chosen": -151.4046630859375,
"logps/rejected": -246.76052856445312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5535144805908203,
"rewards/margins": 14.210418701171875,
"rewards/rejected": -10.656903266906738,
"step": 373
},
{
"epoch": 2.4985386221294363,
"grad_norm": 0.001064629526808858,
"learning_rate": 1.0300135157911985e-05,
"logits/chosen": 0.3790948688983917,
"logits/rejected": 1.0435997247695923,
"logps/chosen": -197.76539611816406,
"logps/rejected": -259.93133544921875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1729063987731934,
"rewards/margins": 13.631752967834473,
"rewards/rejected": -10.45884895324707,
"step": 374
},
{
"epoch": 2.5052192066805845,
"grad_norm": 0.00039556881529279053,
"learning_rate": 1.0025880808683133e-05,
"logits/chosen": 0.7378631830215454,
"logits/rejected": 1.0038983821868896,
"logps/chosen": -138.7704315185547,
"logps/rejected": -267.25146484375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.345494270324707,
"rewards/margins": 13.454019546508789,
"rewards/rejected": -10.108526229858398,
"step": 375
},
{
"epoch": 2.5118997912317327,
"grad_norm": 0.0034411298111081123,
"learning_rate": 9.755082830448477e-06,
"logits/chosen": 0.6844637393951416,
"logits/rejected": 1.1692044734954834,
"logps/chosen": -152.147216796875,
"logps/rejected": -240.3649139404297,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.52048397064209,
"rewards/margins": 13.183599472045898,
"rewards/rejected": -9.663114547729492,
"step": 376
},
{
"epoch": 2.518580375782881,
"grad_norm": 0.005600621923804283,
"learning_rate": 9.487754599271714e-06,
"logits/chosen": 0.5809310674667358,
"logits/rejected": 1.2069010734558105,
"logps/chosen": -136.8238067626953,
"logps/rejected": -205.76683044433594,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8006017208099365,
"rewards/margins": 13.183314323425293,
"rewards/rejected": -9.382713317871094,
"step": 377
},
{
"epoch": 2.5252609603340295,
"grad_norm": 0.0022874092683196068,
"learning_rate": 9.223909319828448e-06,
"logits/chosen": 0.2795962691307068,
"logits/rejected": 0.9856559634208679,
"logps/chosen": -157.48626708984375,
"logps/rejected": -198.1659698486328,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.247584342956543,
"rewards/margins": 11.768815994262695,
"rewards/rejected": -8.521230697631836,
"step": 378
},
{
"epoch": 2.5319415448851776,
"grad_norm": 0.0010204276768490672,
"learning_rate": 8.96356002475388e-06,
"logits/chosen": 0.4792221188545227,
"logits/rejected": 1.0078630447387695,
"logps/chosen": -155.52247619628906,
"logps/rejected": -202.2192840576172,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.309563159942627,
"rewards/margins": 12.788346290588379,
"rewards/rejected": -9.478784561157227,
"step": 379
},
{
"epoch": 2.538622129436326,
"grad_norm": 0.002153388923034072,
"learning_rate": 8.706719573999166e-06,
"logits/chosen": 0.697721540927887,
"logits/rejected": 1.042283058166504,
"logps/chosen": -162.91702270507812,
"logps/rejected": -281.65045166015625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.598088502883911,
"rewards/margins": 14.490654945373535,
"rewards/rejected": -10.89256763458252,
"step": 380
},
{
"epoch": 2.545302713987474,
"grad_norm": 0.0033939930144697428,
"learning_rate": 8.45340065419606e-06,
"logits/chosen": 0.2647485136985779,
"logits/rejected": 1.3145473003387451,
"logps/chosen": -185.42364501953125,
"logps/rejected": -169.73617553710938,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.510795831680298,
"rewards/margins": 11.805034637451172,
"rewards/rejected": -8.294239044189453,
"step": 381
},
{
"epoch": 2.551983298538622,
"grad_norm": 0.0015864988090470433,
"learning_rate": 8.203615778030358e-06,
"logits/chosen": 0.3910033106803894,
"logits/rejected": 1.1290589570999146,
"logps/chosen": -173.88612365722656,
"logps/rejected": -196.45225524902344,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.66280460357666,
"rewards/margins": 12.923318862915039,
"rewards/rejected": -9.260513305664062,
"step": 382
},
{
"epoch": 2.5586638830897703,
"grad_norm": 0.0010147824650630355,
"learning_rate": 7.957377283623775e-06,
"logits/chosen": 0.60319983959198,
"logits/rejected": 1.0238069295883179,
"logps/chosen": -127.49191284179688,
"logps/rejected": -210.27113342285156,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0743024349212646,
"rewards/margins": 12.275171279907227,
"rewards/rejected": -9.200868606567383,
"step": 383
},
{
"epoch": 2.5653444676409185,
"grad_norm": 0.01197250746190548,
"learning_rate": 7.71469733392456e-06,
"logits/chosen": 0.37566909193992615,
"logits/rejected": 1.2771214246749878,
"logps/chosen": -190.87774658203125,
"logps/rejected": -219.00123596191406,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.659575939178467,
"rewards/margins": 13.514084815979004,
"rewards/rejected": -9.854509353637695,
"step": 384
},
{
"epoch": 2.5720250521920667,
"grad_norm": 0.0003030757943633944,
"learning_rate": 7.475587916106674e-06,
"logits/chosen": 0.5149967670440674,
"logits/rejected": 1.0121179819107056,
"logps/chosen": -154.53530883789062,
"logps/rejected": -224.7379913330078,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.215111017227173,
"rewards/margins": 12.857205390930176,
"rewards/rejected": -9.642094612121582,
"step": 385
},
{
"epoch": 2.578705636743215,
"grad_norm": 0.014440486207604408,
"learning_rate": 7.240060840977654e-06,
"logits/chosen": 0.5663985013961792,
"logits/rejected": 1.17019784450531,
"logps/chosen": -182.380615234375,
"logps/rejected": -235.32057189941406,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6025519371032715,
"rewards/margins": 12.833649635314941,
"rewards/rejected": -9.231098175048828,
"step": 386
},
{
"epoch": 2.585386221294363,
"grad_norm": 0.0010652203345671296,
"learning_rate": 7.008127742395339e-06,
"logits/chosen": 0.7321959137916565,
"logits/rejected": 1.1040430068969727,
"logps/chosen": -130.3069610595703,
"logps/rejected": -207.42813110351562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5689282417297363,
"rewards/margins": 12.16445255279541,
"rewards/rejected": -9.595523834228516,
"step": 387
},
{
"epoch": 2.5920668058455116,
"grad_norm": 0.0002396242634858936,
"learning_rate": 6.779800076692989e-06,
"logits/chosen": 0.557038426399231,
"logits/rejected": 1.1275995969772339,
"logps/chosen": -152.3614959716797,
"logps/rejected": -220.40699768066406,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3268535137176514,
"rewards/margins": 13.34984302520752,
"rewards/rejected": -10.022990226745605,
"step": 388
},
{
"epoch": 2.59874739039666,
"grad_norm": 0.0006751392502337694,
"learning_rate": 6.555089122113671e-06,
"logits/chosen": 0.5842954516410828,
"logits/rejected": 1.438590407371521,
"logps/chosen": -168.39236450195312,
"logps/rejected": -195.80142211914062,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2229514122009277,
"rewards/margins": 12.052131652832031,
"rewards/rejected": -8.829178810119629,
"step": 389
},
{
"epoch": 2.605427974947808,
"grad_norm": 0.00407389784231782,
"learning_rate": 6.334005978252968e-06,
"logits/chosen": 0.6431280374526978,
"logits/rejected": 1.1001299619674683,
"logps/chosen": -151.1575469970703,
"logps/rejected": -224.49334716796875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.211059331893921,
"rewards/margins": 12.562589645385742,
"rewards/rejected": -9.351531028747559,
"step": 390
},
{
"epoch": 2.612108559498956,
"grad_norm": 0.00046424585161730647,
"learning_rate": 6.116561565510806e-06,
"logits/chosen": 0.6559157371520996,
"logits/rejected": 0.8570265173912048,
"logps/chosen": -117.59468841552734,
"logps/rejected": -238.4303436279297,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.9642975330352783,
"rewards/margins": 11.914740562438965,
"rewards/rejected": -8.95044231414795,
"step": 391
},
{
"epoch": 2.6187891440501043,
"grad_norm": 0.000544650130905211,
"learning_rate": 5.902766624551994e-06,
"logits/chosen": 0.4319482445716858,
"logits/rejected": 1.1018319129943848,
"logps/chosen": -171.65264892578125,
"logps/rejected": -213.88772583007812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3273277282714844,
"rewards/margins": 12.443902015686035,
"rewards/rejected": -9.11657428741455,
"step": 392
},
{
"epoch": 2.6254697286012525,
"grad_norm": 0.0002654242271091789,
"learning_rate": 5.6926317157757825e-06,
"logits/chosen": 0.29998600482940674,
"logits/rejected": 1.0530532598495483,
"logps/chosen": -153.1922149658203,
"logps/rejected": -202.36703491210938,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9652388095855713,
"rewards/margins": 13.307126998901367,
"rewards/rejected": -9.341888427734375,
"step": 393
},
{
"epoch": 2.632150313152401,
"grad_norm": 0.014262525364756584,
"learning_rate": 5.486167218794069e-06,
"logits/chosen": 0.49952131509780884,
"logits/rejected": 1.1781392097473145,
"logps/chosen": -181.53782653808594,
"logps/rejected": -179.46331787109375,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.316577911376953,
"rewards/margins": 12.234426498413086,
"rewards/rejected": -8.917847633361816,
"step": 394
},
{
"epoch": 2.6388308977035493,
"grad_norm": 0.013617642223834991,
"learning_rate": 5.283383331918872e-06,
"logits/chosen": 0.6029332280158997,
"logits/rejected": 1.200974464416504,
"logps/chosen": -167.70443725585938,
"logps/rejected": -210.56065368652344,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7498412132263184,
"rewards/margins": 12.447210311889648,
"rewards/rejected": -8.697368621826172,
"step": 395
},
{
"epoch": 2.6455114822546975,
"grad_norm": 0.01435225922614336,
"learning_rate": 5.084290071658462e-06,
"logits/chosen": 0.39357277750968933,
"logits/rejected": 1.2007172107696533,
"logps/chosen": -171.33279418945312,
"logps/rejected": -187.95843505859375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.256293535232544,
"rewards/margins": 12.046555519104004,
"rewards/rejected": -8.790261268615723,
"step": 396
},
{
"epoch": 2.6521920668058456,
"grad_norm": 0.002334900200366974,
"learning_rate": 4.888897272222677e-06,
"logits/chosen": 0.3714907169342041,
"logits/rejected": 1.1416910886764526,
"logps/chosen": -168.70376586914062,
"logps/rejected": -207.44976806640625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.877674102783203,
"rewards/margins": 12.441089630126953,
"rewards/rejected": -8.563414573669434,
"step": 397
},
{
"epoch": 2.658872651356994,
"grad_norm": 0.001730743213556707,
"learning_rate": 4.697214585037087e-06,
"logits/chosen": 0.5332534313201904,
"logits/rejected": 1.249335765838623,
"logps/chosen": -158.67762756347656,
"logps/rejected": -190.1533203125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0616326332092285,
"rewards/margins": 12.478856086730957,
"rewards/rejected": -9.417223930358887,
"step": 398
},
{
"epoch": 2.665553235908142,
"grad_norm": 0.003226222237572074,
"learning_rate": 4.5092514782663255e-06,
"logits/chosen": 0.6613823771476746,
"logits/rejected": 1.0511373281478882,
"logps/chosen": -144.17306518554688,
"logps/rejected": -228.23768615722656,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6547493934631348,
"rewards/margins": 12.782587051391602,
"rewards/rejected": -9.127838134765625,
"step": 399
},
{
"epoch": 2.67223382045929,
"grad_norm": 0.0005425635608844459,
"learning_rate": 4.325017236346378e-06,
"logits/chosen": 0.5887613296508789,
"logits/rejected": 1.0347230434417725,
"logps/chosen": -158.33848571777344,
"logps/rejected": -240.96356201171875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1818087100982666,
"rewards/margins": 13.404643058776855,
"rewards/rejected": -10.222835540771484,
"step": 400
},
{
"epoch": 2.6789144050104383,
"grad_norm": 0.00024664445663802326,
"learning_rate": 4.144520959525959e-06,
"logits/chosen": 0.5648880004882812,
"logits/rejected": 0.8261204957962036,
"logps/chosen": -149.58128356933594,
"logps/rejected": -245.0674591064453,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.261422872543335,
"rewards/margins": 12.966839790344238,
"rewards/rejected": -9.70541763305664,
"step": 401
},
{
"epoch": 2.6855949895615865,
"grad_norm": 0.0005279434262774885,
"learning_rate": 3.967771563417096e-06,
"logits/chosen": 0.7538788914680481,
"logits/rejected": 1.1771303415298462,
"logps/chosen": -135.6904754638672,
"logps/rejected": -227.23272705078125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.337376832962036,
"rewards/margins": 13.063421249389648,
"rewards/rejected": -9.726044654846191,
"step": 402
},
{
"epoch": 2.6922755741127347,
"grad_norm": 0.00802143756300211,
"learning_rate": 3.794777778554615e-06,
"logits/chosen": 0.4799140691757202,
"logits/rejected": 1.0762194395065308,
"logps/chosen": -169.96649169921875,
"logps/rejected": -227.5594940185547,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.182255744934082,
"rewards/margins": 12.846413612365723,
"rewards/rejected": -9.66415786743164,
"step": 403
},
{
"epoch": 2.698956158663883,
"grad_norm": 0.00045062805293127894,
"learning_rate": 3.6255481499649725e-06,
"logits/chosen": 0.4726150333881378,
"logits/rejected": 1.337223768234253,
"logps/chosen": -163.80133056640625,
"logps/rejected": -204.79391479492188,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.530905246734619,
"rewards/margins": 12.308239936828613,
"rewards/rejected": -8.777335166931152,
"step": 404
},
{
"epoch": 2.7056367432150314,
"grad_norm": 0.0015004087472334504,
"learning_rate": 3.460091036744162e-06,
"logits/chosen": 0.27585047483444214,
"logits/rejected": 1.1584709882736206,
"logps/chosen": -174.3064422607422,
"logps/rejected": -199.953125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2562003135681152,
"rewards/margins": 12.504425048828125,
"rewards/rejected": -9.248224258422852,
"step": 405
},
{
"epoch": 2.7123173277661796,
"grad_norm": 0.00020969973411411047,
"learning_rate": 3.2984146116448447e-06,
"logits/chosen": 0.5917679667472839,
"logits/rejected": 1.226263403892517,
"logps/chosen": -160.92601013183594,
"logps/rejected": -188.79367065429688,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2227373123168945,
"rewards/margins": 12.113557815551758,
"rewards/rejected": -8.890820503234863,
"step": 406
},
{
"epoch": 2.718997912317328,
"grad_norm": 0.000391695968573913,
"learning_rate": 3.140526860672557e-06,
"logits/chosen": 0.3402010500431061,
"logits/rejected": 0.9619426727294922,
"logps/chosen": -139.92054748535156,
"logps/rejected": -229.22862243652344,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.010129928588867,
"rewards/margins": 13.08873176574707,
"rewards/rejected": -10.078601837158203,
"step": 407
},
{
"epoch": 2.725678496868476,
"grad_norm": 0.0002957108954433352,
"learning_rate": 2.9864355826913873e-06,
"logits/chosen": 0.6995745897293091,
"logits/rejected": 0.9052819013595581,
"logps/chosen": -123.17851257324219,
"logps/rejected": -251.5882568359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.108473777770996,
"rewards/margins": 12.585604667663574,
"rewards/rejected": -9.477129936218262,
"step": 408
},
{
"epoch": 2.732359081419624,
"grad_norm": 0.0005900960532017052,
"learning_rate": 2.83614838903862e-06,
"logits/chosen": 0.4034777283668518,
"logits/rejected": 1.2409098148345947,
"logps/chosen": -199.7462158203125,
"logps/rejected": -198.4483642578125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5001864433288574,
"rewards/margins": 12.815964698791504,
"rewards/rejected": -9.315777778625488,
"step": 409
},
{
"epoch": 2.7390396659707723,
"grad_norm": 0.0002275588922202587,
"learning_rate": 2.689672703148869e-06,
"logits/chosen": 0.4764283299446106,
"logits/rejected": 1.1419066190719604,
"logps/chosen": -169.3687744140625,
"logps/rejected": -220.18045043945312,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3647985458374023,
"rewards/margins": 13.468372344970703,
"rewards/rejected": -10.1035737991333,
"step": 410
},
{
"epoch": 2.745720250521921,
"grad_norm": 0.0013009293470531702,
"learning_rate": 2.5470157601873035e-06,
"logits/chosen": 0.36611929535865784,
"logits/rejected": 1.0027097463607788,
"logps/chosen": -189.9754638671875,
"logps/rejected": -249.90679931640625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5394551753997803,
"rewards/margins": 13.063716888427734,
"rewards/rejected": -9.524261474609375,
"step": 411
},
{
"epoch": 2.752400835073069,
"grad_norm": 0.0015072039095684886,
"learning_rate": 2.4081846066923697e-06,
"logits/chosen": 0.4824734628200531,
"logits/rejected": 1.1585693359375,
"logps/chosen": -161.98866271972656,
"logps/rejected": -216.73757934570312,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2364959716796875,
"rewards/margins": 13.164081573486328,
"rewards/rejected": -9.92758560180664,
"step": 412
},
{
"epoch": 2.7590814196242173,
"grad_norm": 0.0006356340018101037,
"learning_rate": 2.273186100227651e-06,
"logits/chosen": 0.564692497253418,
"logits/rejected": 1.2192484140396118,
"logps/chosen": -133.49551391601562,
"logps/rejected": -193.18711853027344,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1987099647521973,
"rewards/margins": 11.772148132324219,
"rewards/rejected": -8.573437690734863,
"step": 413
},
{
"epoch": 2.7657620041753654,
"grad_norm": 0.00035454286262393,
"learning_rate": 2.1420269090431712e-06,
"logits/chosen": 0.42993149161338806,
"logits/rejected": 1.0948328971862793,
"logps/chosen": -169.48936462402344,
"logps/rejected": -220.0684814453125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.176632881164551,
"rewards/margins": 12.51456069946289,
"rewards/rejected": -9.33792781829834,
"step": 414
},
{
"epoch": 2.7724425887265136,
"grad_norm": 0.0007007503882050514,
"learning_rate": 2.0147135117460204e-06,
"logits/chosen": 0.38136231899261475,
"logits/rejected": 1.3104112148284912,
"logps/chosen": -161.56480407714844,
"logps/rejected": -181.53868103027344,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7145438194274902,
"rewards/margins": 13.201973915100098,
"rewards/rejected": -9.48742961883545,
"step": 415
},
{
"epoch": 2.779123173277662,
"grad_norm": 0.0013630822068080306,
"learning_rate": 1.891252196980311e-06,
"logits/chosen": 0.724543035030365,
"logits/rejected": 1.2720431089401245,
"logps/chosen": -174.0122833251953,
"logps/rejected": -216.48367309570312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.654141664505005,
"rewards/margins": 13.36253547668457,
"rewards/rejected": -9.708393096923828,
"step": 416
},
{
"epoch": 2.78580375782881,
"grad_norm": 0.0008407873101532459,
"learning_rate": 1.7716490631165984e-06,
"logits/chosen": 0.5323563814163208,
"logits/rejected": 1.2073959112167358,
"logps/chosen": -149.6080322265625,
"logps/rejected": -223.80166625976562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5268023014068604,
"rewards/margins": 12.292684555053711,
"rewards/rejected": -8.76588249206543,
"step": 417
},
{
"epoch": 2.792484342379958,
"grad_norm": 0.00027841454721055925,
"learning_rate": 1.6559100179506015e-06,
"logits/chosen": 0.46936628222465515,
"logits/rejected": 1.0593383312225342,
"logps/chosen": -169.1297607421875,
"logps/rejected": -212.625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.026738166809082,
"rewards/margins": 12.370702743530273,
"rewards/rejected": -9.343963623046875,
"step": 418
},
{
"epoch": 2.7991649269311063,
"grad_norm": 0.0007251783972606063,
"learning_rate": 1.5440407784114285e-06,
"logits/chosen": 0.5382225513458252,
"logits/rejected": 1.2113491296768188,
"logps/chosen": -153.16236877441406,
"logps/rejected": -216.06365966796875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2709617614746094,
"rewards/margins": 12.992986679077148,
"rewards/rejected": -9.722025871276855,
"step": 419
},
{
"epoch": 2.8058455114822545,
"grad_norm": 0.000872993899974972,
"learning_rate": 1.4360468702791885e-06,
"logits/chosen": 0.6498620510101318,
"logits/rejected": 1.0502396821975708,
"logps/chosen": -168.02088928222656,
"logps/rejected": -243.26052856445312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.300208568572998,
"rewards/margins": 13.005234718322754,
"rewards/rejected": -9.705026626586914,
"step": 420
},
{
"epoch": 2.812526096033403,
"grad_norm": 0.00035026189289055765,
"learning_rate": 1.3319336279119832e-06,
"logits/chosen": 0.5399425029754639,
"logits/rejected": 1.0422205924987793,
"logps/chosen": -145.86790466308594,
"logps/rejected": -231.82078552246094,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.275618076324463,
"rewards/margins": 12.443395614624023,
"rewards/rejected": -9.167777061462402,
"step": 421
},
{
"epoch": 2.8192066805845513,
"grad_norm": 0.00047282990999519825,
"learning_rate": 1.2317061939825092e-06,
"logits/chosen": 0.6304022669792175,
"logits/rejected": 0.9782839417457581,
"logps/chosen": -132.8438262939453,
"logps/rejected": -230.8707275390625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1279447078704834,
"rewards/margins": 12.440394401550293,
"rewards/rejected": -9.312448501586914,
"step": 422
},
{
"epoch": 2.8258872651356994,
"grad_norm": 0.0015838721301406622,
"learning_rate": 1.1353695192239767e-06,
"logits/chosen": 0.34126338362693787,
"logits/rejected": 0.872474193572998,
"logps/chosen": -150.72293090820312,
"logps/rejected": -219.77139282226562,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3702316284179688,
"rewards/margins": 13.250913619995117,
"rewards/rejected": -9.880681037902832,
"step": 423
},
{
"epoch": 2.8325678496868476,
"grad_norm": 0.0009925226913765073,
"learning_rate": 1.042928362185558e-06,
"logits/chosen": 0.5055323243141174,
"logits/rejected": 1.2043384313583374,
"logps/chosen": -165.8092041015625,
"logps/rejected": -205.6092529296875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0547263622283936,
"rewards/margins": 12.708995819091797,
"rewards/rejected": -9.654268264770508,
"step": 424
},
{
"epoch": 2.8392484342379958,
"grad_norm": 0.0010521183721721172,
"learning_rate": 9.543872889974027e-07,
"logits/chosen": 0.6694621443748474,
"logits/rejected": 1.0147770643234253,
"logps/chosen": -141.67022705078125,
"logps/rejected": -271.6776428222656,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1093451976776123,
"rewards/margins": 13.3922119140625,
"rewards/rejected": -10.282866477966309,
"step": 425
},
{
"epoch": 2.845929018789144,
"grad_norm": 0.0004929823335260153,
"learning_rate": 8.697506731450222e-07,
"logits/chosen": 0.36508888006210327,
"logits/rejected": 1.2144858837127686,
"logps/chosen": -174.45252990722656,
"logps/rejected": -199.98072814941406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2696728706359863,
"rewards/margins": 12.374869346618652,
"rewards/rejected": -9.105195999145508,
"step": 426
},
{
"epoch": 2.852609603340292,
"grad_norm": 0.0011409110156819224,
"learning_rate": 7.890226952532942e-07,
"logits/chosen": 0.38490644097328186,
"logits/rejected": 1.3086551427841187,
"logps/chosen": -209.907470703125,
"logps/rejected": -200.3457489013672,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.552093982696533,
"rewards/margins": 12.373093605041504,
"rewards/rejected": -8.820999145507812,
"step": 427
},
{
"epoch": 2.8592901878914407,
"grad_norm": 0.0007044864469207823,
"learning_rate": 7.122073428799781e-07,
"logits/chosen": 0.5145556926727295,
"logits/rejected": 1.0260547399520874,
"logps/chosen": -135.60379028320312,
"logps/rejected": -226.86810302734375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.268608570098877,
"rewards/margins": 13.123603820800781,
"rewards/rejected": -9.854996681213379,
"step": 428
},
{
"epoch": 2.865970772442589,
"grad_norm": 0.0031695598736405373,
"learning_rate": 6.393084103187264e-07,
"logits/chosen": 0.532058596611023,
"logits/rejected": 0.8780273795127869,
"logps/chosen": -164.82818603515625,
"logps/rejected": -250.14654541015625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7977402210235596,
"rewards/margins": 13.179388046264648,
"rewards/rejected": -9.381647109985352,
"step": 429
},
{
"epoch": 2.872651356993737,
"grad_norm": 0.0007318177376873791,
"learning_rate": 5.703294984116525e-07,
"logits/chosen": 0.3651660084724426,
"logits/rejected": 0.9196643829345703,
"logps/chosen": -173.27577209472656,
"logps/rejected": -234.96435546875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.602478504180908,
"rewards/margins": 13.61583423614502,
"rewards/rejected": -10.013355255126953,
"step": 430
},
{
"epoch": 2.8793319415448853,
"grad_norm": 0.0004672574286814779,
"learning_rate": 5.052740143714996e-07,
"logits/chosen": 0.3767249882221222,
"logits/rejected": 0.877831757068634,
"logps/chosen": -180.11639404296875,
"logps/rejected": -272.79217529296875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4903178215026855,
"rewards/margins": 13.622541427612305,
"rewards/rejected": -10.132223129272461,
"step": 431
},
{
"epoch": 2.8860125260960334,
"grad_norm": 0.006710366811603308,
"learning_rate": 4.441451716133216e-07,
"logits/chosen": 0.3890739977359772,
"logits/rejected": 1.1784958839416504,
"logps/chosen": -192.0306396484375,
"logps/rejected": -246.87294006347656,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.150397777557373,
"rewards/margins": 13.305784225463867,
"rewards/rejected": -10.15538501739502,
"step": 432
},
{
"epoch": 2.8926931106471816,
"grad_norm": 0.0009442153386771679,
"learning_rate": 3.8694598959575725e-07,
"logits/chosen": 0.49244481325149536,
"logits/rejected": 1.0682514905929565,
"logps/chosen": -193.1286163330078,
"logps/rejected": -267.5054626464844,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.750030040740967,
"rewards/margins": 13.207242012023926,
"rewards/rejected": -9.457212448120117,
"step": 433
},
{
"epoch": 2.8993736951983298,
"grad_norm": 0.0028639482334256172,
"learning_rate": 3.3367929367190463e-07,
"logits/chosen": 0.610533595085144,
"logits/rejected": 1.2630019187927246,
"logps/chosen": -129.1243438720703,
"logps/rejected": -186.8758544921875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5877702236175537,
"rewards/margins": 12.923406600952148,
"rewards/rejected": -9.335637092590332,
"step": 434
},
{
"epoch": 2.906054279749478,
"grad_norm": 0.0031200747471302748,
"learning_rate": 2.843477149497265e-07,
"logits/chosen": 0.7550321817398071,
"logits/rejected": 0.9643117189407349,
"logps/chosen": -83.94383239746094,
"logps/rejected": -206.9544219970703,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.38393497467041,
"rewards/margins": 12.600971221923828,
"rewards/rejected": -9.217036247253418,
"step": 435
},
{
"epoch": 2.912734864300626,
"grad_norm": 0.0013908625114709139,
"learning_rate": 2.3895369016211813e-07,
"logits/chosen": 0.6865592002868652,
"logits/rejected": 0.957063615322113,
"logps/chosen": -148.91688537597656,
"logps/rejected": -231.92611694335938,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5575623512268066,
"rewards/margins": 13.301969528198242,
"rewards/rejected": -9.744406700134277,
"step": 436
},
{
"epoch": 2.9194154488517743,
"grad_norm": 0.04500668868422508,
"learning_rate": 1.9749946154651534e-07,
"logits/chosen": 0.4031934142112732,
"logits/rejected": 0.901584267616272,
"logps/chosen": -159.9291229248047,
"logps/rejected": -242.07891845703125,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2847237586975098,
"rewards/margins": 12.680036544799805,
"rewards/rejected": -9.395313262939453,
"step": 437
},
{
"epoch": 2.926096033402923,
"grad_norm": 0.000297738763038069,
"learning_rate": 1.5998707673419156e-07,
"logits/chosen": 0.6217765808105469,
"logits/rejected": 1.300815463066101,
"logps/chosen": -175.9449920654297,
"logps/rejected": -220.9844207763672,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2398884296417236,
"rewards/margins": 12.802839279174805,
"rewards/rejected": -9.562949180603027,
"step": 438
},
{
"epoch": 2.932776617954071,
"grad_norm": 0.007137789856642485,
"learning_rate": 1.2641838864905887e-07,
"logits/chosen": 0.5026199817657471,
"logits/rejected": 1.145331621170044,
"logps/chosen": -209.8468017578125,
"logps/rejected": -262.7084045410156,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.754572868347168,
"rewards/margins": 13.609201431274414,
"rewards/rejected": -9.854629516601562,
"step": 439
},
{
"epoch": 2.9394572025052192,
"grad_norm": 0.00042287795804440975,
"learning_rate": 9.679505541615008e-08,
"logits/chosen": 0.3595789670944214,
"logits/rejected": 1.4266026020050049,
"logps/chosen": -190.64529418945312,
"logps/rejected": -162.8024139404297,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.641660451889038,
"rewards/margins": 11.447403907775879,
"rewards/rejected": -7.80574369430542,
"step": 440
},
{
"epoch": 2.9461377870563674,
"grad_norm": 0.0004641091509256512,
"learning_rate": 7.11185402797554e-08,
"logits/chosen": 0.4248248338699341,
"logits/rejected": 0.8252262473106384,
"logps/chosen": -162.13511657714844,
"logps/rejected": -238.3690948486328,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.014636516571045,
"rewards/margins": 13.756202697753906,
"rewards/rejected": -9.741565704345703,
"step": 441
},
{
"epoch": 2.9528183716075156,
"grad_norm": 0.00071021041367203,
"learning_rate": 4.9390111531115724e-08,
"logits/chosen": 0.8295226097106934,
"logits/rejected": 1.1506669521331787,
"logps/chosen": -161.70565795898438,
"logps/rejected": -239.46661376953125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0058834552764893,
"rewards/margins": 13.02005386352539,
"rewards/rejected": -10.01417064666748,
"step": 442
},
{
"epoch": 2.9594989561586638,
"grad_norm": 0.0019515601452440023,
"learning_rate": 3.1610842445761736e-08,
"logits/chosen": 0.4108167588710785,
"logits/rejected": 1.0816458463668823,
"logps/chosen": -161.15087890625,
"logps/rejected": -216.76573181152344,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.153501272201538,
"rewards/margins": 12.347149848937988,
"rewards/rejected": -9.193648338317871,
"step": 443
},
{
"epoch": 2.9661795407098124,
"grad_norm": 0.0006106442306190729,
"learning_rate": 1.7781611230551775e-08,
"logits/chosen": 0.5510632991790771,
"logits/rejected": 1.0120404958724976,
"logps/chosen": -161.36676025390625,
"logps/rejected": -249.424560546875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.663705825805664,
"rewards/margins": 13.109761238098145,
"rewards/rejected": -9.44605541229248,
"step": 444
},
{
"epoch": 2.9728601252609606,
"grad_norm": 0.0006236585322767496,
"learning_rate": 7.903100980222178e-09,
"logits/chosen": 0.658979594707489,
"logits/rejected": 1.351822018623352,
"logps/chosen": -174.4720458984375,
"logps/rejected": -205.27960205078125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2610790729522705,
"rewards/margins": 12.586731910705566,
"rewards/rejected": -9.325651168823242,
"step": 445
},
{
"epoch": 2.9795407098121087,
"grad_norm": 0.00028068042593076825,
"learning_rate": 1.975799643707532e-09,
"logits/chosen": 0.5611605048179626,
"logits/rejected": 1.0852856636047363,
"logps/chosen": -155.2922821044922,
"logps/rejected": -229.93069458007812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4737355709075928,
"rewards/margins": 12.924980163574219,
"rewards/rejected": -9.451244354248047,
"step": 446
},
{
"epoch": 2.986221294363257,
"grad_norm": 0.007568780332803726,
"learning_rate": 0.0,
"logits/chosen": 0.4653575122356415,
"logits/rejected": 1.259974718093872,
"logps/chosen": -172.01187133789062,
"logps/rejected": -166.48036193847656,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4418039321899414,
"rewards/margins": 11.374272346496582,
"rewards/rejected": -7.932469367980957,
"step": 447
},
{
"epoch": 2.986221294363257,
"step": 447,
"total_flos": 0.0,
"train_loss": 0.015126691275923332,
"train_runtime": 9353.1303,
"train_samples_per_second": 6.145,
"train_steps_per_second": 0.048
}
],
"logging_steps": 1.0,
"max_steps": 447,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}