q3_8b_secalign_xml / checkpoint-600 /trainer_state.json
thavens's picture
Upload folder using huggingface_hub
a444f23 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00333889816360601,
"grad_norm": 0.8906780481338501,
"learning_rate": 0.0,
"logits/chosen": -2.6484375,
"logits/rejected": -2.7734375,
"logps/chosen": -182.0,
"logps/rejected": -244.5,
"loss": 1.5239,
"rewards/accuracies": 0.421875,
"rewards/chosen": 0.02294921875,
"rewards/margins": -0.33056640625,
"rewards/rejected": 0.3515625,
"step": 1
},
{
"epoch": 0.00667779632721202,
"grad_norm": 0.8854408860206604,
"learning_rate": 1.6000000000000003e-05,
"logits/chosen": -3.2109375,
"logits/rejected": -2.6640625,
"logps/chosen": -133.5,
"logps/rejected": -258.5,
"loss": 1.103,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.0283203125,
"rewards/margins": 0.0703125,
"rewards/rejected": -0.09814453125,
"step": 2
},
{
"epoch": 0.01001669449081803,
"grad_norm": 1.1921889781951904,
"learning_rate": 3.2000000000000005e-05,
"logits/chosen": -3.0,
"logits/rejected": -2.6640625,
"logps/chosen": -130.0,
"logps/rejected": -226.0,
"loss": 1.5938,
"rewards/accuracies": 0.28125,
"rewards/chosen": -0.44921875,
"rewards/margins": -0.763671875,
"rewards/rejected": 0.3125,
"step": 3
},
{
"epoch": 0.01335559265442404,
"grad_norm": 0.9537683725357056,
"learning_rate": 4.8e-05,
"logits/chosen": -2.890625,
"logits/rejected": -2.8125,
"logps/chosen": -147.5,
"logps/rejected": -281.0,
"loss": 1.0503,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.02099609375,
"rewards/margins": 0.28369140625,
"rewards/rejected": -0.26171875,
"step": 4
},
{
"epoch": 0.01669449081803005,
"grad_norm": 1.2217826843261719,
"learning_rate": 6.400000000000001e-05,
"logits/chosen": -2.90625,
"logits/rejected": -2.7734375,
"logps/chosen": -135.75,
"logps/rejected": -232.5,
"loss": 1.3145,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.1611328125,
"rewards/margins": 0.0048828125,
"rewards/rejected": 0.15606689453125,
"step": 5
},
{
"epoch": 0.02003338898163606,
"grad_norm": 1.0575661659240723,
"learning_rate": 8e-05,
"logits/chosen": -3.0,
"logits/rejected": -2.640625,
"logps/chosen": -142.5,
"logps/rejected": -260.5,
"loss": 1.1968,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.104736328125,
"rewards/margins": -0.081298828125,
"rewards/rejected": 0.185546875,
"step": 6
},
{
"epoch": 0.02337228714524207,
"grad_norm": 1.3832917213439941,
"learning_rate": 9.6e-05,
"logits/chosen": -3.140625,
"logits/rejected": -2.9375,
"logps/chosen": -119.25,
"logps/rejected": -241.0,
"loss": 1.1948,
"rewards/accuracies": 0.390625,
"rewards/chosen": 0.16455078125,
"rewards/margins": -0.1988525390625,
"rewards/rejected": 0.36328125,
"step": 7
},
{
"epoch": 0.02671118530884808,
"grad_norm": 0.9630353450775146,
"learning_rate": 0.000112,
"logits/chosen": -2.7734375,
"logits/rejected": -2.625,
"logps/chosen": -159.5,
"logps/rejected": -256.0,
"loss": 0.8555,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.0166015625,
"rewards/margins": 1.04296875,
"rewards/rejected": -1.02734375,
"step": 8
},
{
"epoch": 0.03005008347245409,
"grad_norm": 1.3834831714630127,
"learning_rate": 0.00012800000000000002,
"logits/chosen": -2.921875,
"logits/rejected": -2.703125,
"logps/chosen": -143.5,
"logps/rejected": -224.5,
"loss": 1.1067,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.2060546875,
"rewards/margins": 0.4296875,
"rewards/rejected": -0.222686767578125,
"step": 9
},
{
"epoch": 0.0333889816360601,
"grad_norm": 1.5977782011032104,
"learning_rate": 0.000144,
"logits/chosen": -2.7890625,
"logits/rejected": -2.7890625,
"logps/chosen": -160.0,
"logps/rejected": -271.0,
"loss": 0.896,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.0777587890625,
"rewards/margins": 0.669921875,
"rewards/rejected": -0.5927734375,
"step": 10
},
{
"epoch": 0.03672787979966611,
"grad_norm": 0.9674336910247803,
"learning_rate": 0.00016,
"logits/chosen": -2.484375,
"logits/rejected": -2.734375,
"logps/chosen": -182.0,
"logps/rejected": -217.0,
"loss": 0.4347,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.208984375,
"rewards/margins": 1.55078125,
"rewards/rejected": -1.34375,
"step": 11
},
{
"epoch": 0.04006677796327212,
"grad_norm": 1.0390831232070923,
"learning_rate": 0.00015999950159857832,
"logits/chosen": -2.953125,
"logits/rejected": -2.78125,
"logps/chosen": -149.0,
"logps/rejected": -259.5,
"loss": 0.3765,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.3333740234375,
"rewards/margins": 2.6015625,
"rewards/rejected": -2.265625,
"step": 12
},
{
"epoch": 0.04340567612687813,
"grad_norm": 0.7539263963699341,
"learning_rate": 0.00015999800640052332,
"logits/chosen": -2.9609375,
"logits/rejected": -2.734375,
"logps/chosen": -164.5,
"logps/rejected": -295.0,
"loss": 0.2193,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.7236328125,
"rewards/margins": 4.15625,
"rewards/rejected": -3.4296875,
"step": 13
},
{
"epoch": 0.04674457429048414,
"grad_norm": 0.8638622760772705,
"learning_rate": 0.00015999551442446528,
"logits/chosen": -3.2890625,
"logits/rejected": -2.515625,
"logps/chosen": -144.0,
"logps/rejected": -289.0,
"loss": 0.2724,
"rewards/accuracies": 0.921875,
"rewards/chosen": 0.4716796875,
"rewards/margins": 4.484375,
"rewards/rejected": -4.015625,
"step": 14
},
{
"epoch": 0.05008347245409015,
"grad_norm": 0.5347347855567932,
"learning_rate": 0.00015999202570145425,
"logits/chosen": -3.2734375,
"logits/rejected": -2.5546875,
"logps/chosen": -113.0,
"logps/rejected": -288.0,
"loss": 0.1353,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.955078125,
"rewards/margins": 5.78125,
"rewards/rejected": -4.828125,
"step": 15
},
{
"epoch": 0.05342237061769616,
"grad_norm": 0.5463722348213196,
"learning_rate": 0.0001599875402749599,
"logits/chosen": -3.359375,
"logits/rejected": -2.7734375,
"logps/chosen": -138.0,
"logps/rejected": -241.0,
"loss": 0.1262,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.4609375,
"rewards/margins": 6.0625,
"rewards/rejected": -4.609375,
"step": 16
},
{
"epoch": 0.05676126878130217,
"grad_norm": 0.6224486231803894,
"learning_rate": 0.00015998205820087077,
"logits/chosen": -3.625,
"logits/rejected": -2.78125,
"logps/chosen": -105.0,
"logps/rejected": -271.0,
"loss": 0.1513,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.890625,
"rewards/margins": 7.015625,
"rewards/rejected": -5.125,
"step": 17
},
{
"epoch": 0.06010016694490818,
"grad_norm": 0.31991323828697205,
"learning_rate": 0.00015997557954749368,
"logits/chosen": -3.71875,
"logits/rejected": -2.6171875,
"logps/chosen": -113.5,
"logps/rejected": -292.0,
"loss": 0.0388,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6953125,
"rewards/margins": 8.3125,
"rewards/rejected": -5.625,
"step": 18
},
{
"epoch": 0.06343906510851419,
"grad_norm": 0.1611785888671875,
"learning_rate": 0.00015996810439555294,
"logits/chosen": -3.7578125,
"logits/rejected": -2.9140625,
"logps/chosen": -121.25,
"logps/rejected": -330.0,
"loss": 0.0091,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.328125,
"rewards/margins": 8.875,
"rewards/rejected": -6.515625,
"step": 19
},
{
"epoch": 0.0667779632721202,
"grad_norm": 0.2885970175266266,
"learning_rate": 0.00015995963283818918,
"logits/chosen": -4.03125,
"logits/rejected": -2.6953125,
"logps/chosen": -117.75,
"logps/rejected": -312.0,
"loss": 0.0381,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.296875,
"rewards/margins": 8.28125,
"rewards/rejected": -6.0,
"step": 20
},
{
"epoch": 0.07011686143572621,
"grad_norm": 0.1798153966665268,
"learning_rate": 0.00015995016498095827,
"logits/chosen": -3.6953125,
"logits/rejected": -2.9453125,
"logps/chosen": -124.0,
"logps/rejected": -300.0,
"loss": 0.0241,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.921875,
"rewards/margins": 9.875,
"rewards/rejected": -6.9375,
"step": 21
},
{
"epoch": 0.07345575959933222,
"grad_norm": 0.04417094215750694,
"learning_rate": 0.0001599397009418301,
"logits/chosen": -3.7421875,
"logits/rejected": -2.703125,
"logps/chosen": -129.5,
"logps/rejected": -347.0,
"loss": 0.0048,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4921875,
"rewards/margins": 10.15625,
"rewards/rejected": -7.671875,
"step": 22
},
{
"epoch": 0.07679465776293823,
"grad_norm": 0.0934915617108345,
"learning_rate": 0.00015992824085118694,
"logits/chosen": -3.421875,
"logits/rejected": -2.9765625,
"logps/chosen": -108.0,
"logps/rejected": -303.0,
"loss": 0.008,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8359375,
"rewards/margins": 10.65625,
"rewards/rejected": -7.8125,
"step": 23
},
{
"epoch": 0.08013355592654424,
"grad_norm": 0.03649423271417618,
"learning_rate": 0.00015991578485182194,
"logits/chosen": -3.96875,
"logits/rejected": -2.7890625,
"logps/chosen": -100.25,
"logps/rejected": -345.0,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4296875,
"rewards/margins": 12.0,
"rewards/rejected": -8.59375,
"step": 24
},
{
"epoch": 0.08347245409015025,
"grad_norm": 0.29851219058036804,
"learning_rate": 0.00015990233309893726,
"logits/chosen": -3.8203125,
"logits/rejected": -3.1484375,
"logps/chosen": -110.25,
"logps/rejected": -313.0,
"loss": 0.0403,
"rewards/accuracies": 0.984375,
"rewards/chosen": 3.0703125,
"rewards/margins": 10.59375,
"rewards/rejected": -7.5,
"step": 25
},
{
"epoch": 0.08681135225375626,
"grad_norm": 0.09218787401914597,
"learning_rate": 0.00015988788576014228,
"logits/chosen": -4.1796875,
"logits/rejected": -3.015625,
"logps/chosen": -113.25,
"logps/rejected": -331.0,
"loss": 0.0081,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2578125,
"rewards/margins": 12.0,
"rewards/rejected": -8.75,
"step": 26
},
{
"epoch": 0.09015025041736227,
"grad_norm": 0.010930394753813744,
"learning_rate": 0.0001598724430154513,
"logits/chosen": -4.625,
"logits/rejected": -3.0625,
"logps/chosen": -75.0,
"logps/rejected": -328.0,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5,
"rewards/margins": 12.875,
"rewards/rejected": -9.375,
"step": 27
},
{
"epoch": 0.09348914858096828,
"grad_norm": 0.025569891557097435,
"learning_rate": 0.00015985600505728152,
"logits/chosen": -4.5625,
"logits/rejected": -3.1015625,
"logps/chosen": -81.75,
"logps/rejected": -301.0,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8828125,
"rewards/margins": 11.96875,
"rewards/rejected": -9.09375,
"step": 28
},
{
"epoch": 0.09682804674457429,
"grad_norm": 0.00938540231436491,
"learning_rate": 0.00015983857209045046,
"logits/chosen": -3.8984375,
"logits/rejected": -3.03125,
"logps/chosen": -154.0,
"logps/rejected": -353.0,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.40625,
"rewards/margins": 12.96875,
"rewards/rejected": -9.5625,
"step": 29
},
{
"epoch": 0.1001669449081803,
"grad_norm": 0.00449951458722353,
"learning_rate": 0.00015982014433217346,
"logits/chosen": -4.359375,
"logits/rejected": -3.15625,
"logps/chosen": -110.25,
"logps/rejected": -349.0,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6875,
"rewards/margins": 14.40625,
"rewards/rejected": -10.6875,
"step": 30
},
{
"epoch": 0.10350584307178631,
"grad_norm": 0.1915261447429657,
"learning_rate": 0.0001598007220120611,
"logits/chosen": -4.546875,
"logits/rejected": -3.09375,
"logps/chosen": -108.25,
"logps/rejected": -386.0,
"loss": 0.006,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6953125,
"rewards/margins": 15.75,
"rewards/rejected": -12.0625,
"step": 31
},
{
"epoch": 0.10684474123539232,
"grad_norm": 0.015207415446639061,
"learning_rate": 0.0001597803053721162,
"logits/chosen": -4.921875,
"logits/rejected": -3.2734375,
"logps/chosen": -102.25,
"logps/rejected": -355.0,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0859375,
"rewards/margins": 13.65625,
"rewards/rejected": -10.5625,
"step": 32
},
{
"epoch": 0.11018363939899833,
"grad_norm": 0.052224867045879364,
"learning_rate": 0.00015975889466673073,
"logits/chosen": -4.84375,
"logits/rejected": -3.2265625,
"logps/chosen": -107.75,
"logps/rejected": -387.0,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1796875,
"rewards/margins": 15.375,
"rewards/rejected": -12.21875,
"step": 33
},
{
"epoch": 0.11352253756260434,
"grad_norm": 0.018112409859895706,
"learning_rate": 0.0001597364901626829,
"logits/chosen": -4.640625,
"logits/rejected": -3.3828125,
"logps/chosen": -121.5,
"logps/rejected": -367.0,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.53125,
"rewards/margins": 15.65625,
"rewards/rejected": -12.125,
"step": 34
},
{
"epoch": 0.11686143572621036,
"grad_norm": 0.03322592005133629,
"learning_rate": 0.00015971309213913366,
"logits/chosen": -4.984375,
"logits/rejected": -3.4296875,
"logps/chosen": -89.75,
"logps/rejected": -365.0,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5078125,
"rewards/margins": 15.53125,
"rewards/rejected": -12.0625,
"step": 35
},
{
"epoch": 0.12020033388981637,
"grad_norm": 0.051106907427310944,
"learning_rate": 0.00015968870088762315,
"logits/chosen": -4.609375,
"logits/rejected": -3.28125,
"logps/chosen": -98.75,
"logps/rejected": -359.0,
"loss": 0.0032,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.296875,
"rewards/margins": 16.15625,
"rewards/rejected": -12.90625,
"step": 36
},
{
"epoch": 0.12353923205342238,
"grad_norm": 0.040621671825647354,
"learning_rate": 0.00015966331671206724,
"logits/chosen": -4.4375,
"logits/rejected": -3.2421875,
"logps/chosen": -120.0,
"logps/rejected": -406.0,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8671875,
"rewards/margins": 17.4375,
"rewards/rejected": -13.625,
"step": 37
},
{
"epoch": 0.12687813021702837,
"grad_norm": 0.053807105869054794,
"learning_rate": 0.00015963693992875367,
"logits/chosen": -4.546875,
"logits/rejected": -3.3203125,
"logps/chosen": -117.0,
"logps/rejected": -378.0,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3984375,
"rewards/margins": 17.0625,
"rewards/rejected": -13.6875,
"step": 38
},
{
"epoch": 0.1302170283806344,
"grad_norm": 0.008678439073264599,
"learning_rate": 0.00015960957086633812,
"logits/chosen": -4.6875,
"logits/rejected": -3.5078125,
"logps/chosen": -108.0,
"logps/rejected": -368.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8515625,
"rewards/margins": 18.6875,
"rewards/rejected": -14.84375,
"step": 39
},
{
"epoch": 0.1335559265442404,
"grad_norm": 0.05349210277199745,
"learning_rate": 0.00015958120986584007,
"logits/chosen": -4.5625,
"logits/rejected": -3.3203125,
"logps/chosen": -121.0,
"logps/rejected": -348.0,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1953125,
"rewards/margins": 16.15625,
"rewards/rejected": -13.0,
"step": 40
},
{
"epoch": 0.13689482470784642,
"grad_norm": 0.01919226534664631,
"learning_rate": 0.00015955185728063859,
"logits/chosen": -4.671875,
"logits/rejected": -3.65625,
"logps/chosen": -111.75,
"logps/rejected": -348.0,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3515625,
"rewards/margins": 16.0,
"rewards/rejected": -12.625,
"step": 41
},
{
"epoch": 0.14023372287145242,
"grad_norm": 0.003901825286448002,
"learning_rate": 0.0001595215134764679,
"logits/chosen": -4.890625,
"logits/rejected": -3.484375,
"logps/chosen": -96.75,
"logps/rejected": -401.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0390625,
"rewards/margins": 17.4375,
"rewards/rejected": -14.40625,
"step": 42
},
{
"epoch": 0.14357262103505844,
"grad_norm": 1.0987324714660645,
"learning_rate": 0.00015949017883141293,
"logits/chosen": -4.4375,
"logits/rejected": -3.5625,
"logps/chosen": -132.25,
"logps/rejected": -354.0,
"loss": 1.1098,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.125,
"rewards/margins": 16.625,
"rewards/rejected": -14.4375,
"step": 43
},
{
"epoch": 0.14691151919866444,
"grad_norm": 0.0008750148699618876,
"learning_rate": 0.00015945785373590446,
"logits/chosen": -4.78125,
"logits/rejected": -3.578125,
"logps/chosen": -112.75,
"logps/rejected": -404.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.40625,
"rewards/margins": 20.1875,
"rewards/rejected": -16.8125,
"step": 44
},
{
"epoch": 0.15025041736227046,
"grad_norm": 0.00039893100620247424,
"learning_rate": 0.0001594245385927143,
"logits/chosen": -4.796875,
"logits/rejected": -3.6875,
"logps/chosen": -109.0,
"logps/rejected": -436.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.90625,
"rewards/margins": 21.0625,
"rewards/rejected": -17.25,
"step": 45
},
{
"epoch": 0.15358931552587646,
"grad_norm": 3.564608414308168e-05,
"learning_rate": 0.00015939023381695034,
"logits/chosen": -5.015625,
"logits/rejected": -3.5546875,
"logps/chosen": -122.0,
"logps/rejected": -404.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9453125,
"rewards/margins": 20.5,
"rewards/rejected": -16.5625,
"step": 46
},
{
"epoch": 0.15692821368948248,
"grad_norm": 0.15678206086158752,
"learning_rate": 0.0001593549398360513,
"logits/chosen": -5.25,
"logits/rejected": -3.546875,
"logps/chosen": -103.75,
"logps/rejected": -430.0,
"loss": 0.0052,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.765625,
"rewards/margins": 21.25,
"rewards/rejected": -17.5,
"step": 47
},
{
"epoch": 0.16026711185308848,
"grad_norm": 0.003343533491715789,
"learning_rate": 0.00015931865708978144,
"logits/chosen": -4.828125,
"logits/rejected": -3.75,
"logps/chosen": -115.25,
"logps/rejected": -455.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.46875,
"rewards/margins": 22.1875,
"rewards/rejected": -18.75,
"step": 48
},
{
"epoch": 0.1636060100166945,
"grad_norm": 0.0013559595681726933,
"learning_rate": 0.0001592813860302251,
"logits/chosen": -4.78125,
"logits/rejected": -3.515625,
"logps/chosen": -98.0,
"logps/rejected": -460.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1484375,
"rewards/margins": 22.1875,
"rewards/rejected": -19.0,
"step": 49
},
{
"epoch": 0.1669449081803005,
"grad_norm": 0.010648728348314762,
"learning_rate": 0.00015924312712178095,
"logits/chosen": -5.34375,
"logits/rejected": -3.6953125,
"logps/chosen": -101.5,
"logps/rejected": -440.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.828125,
"rewards/margins": 21.875,
"rewards/rejected": -19.0625,
"step": 50
},
{
"epoch": 0.17028380634390652,
"grad_norm": 0.0414469912648201,
"learning_rate": 0.00015920388084115635,
"logits/chosen": -4.78125,
"logits/rejected": -3.6484375,
"logps/chosen": -111.0,
"logps/rejected": -429.0,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6484375,
"rewards/margins": 22.875,
"rewards/rejected": -19.25,
"step": 51
},
{
"epoch": 0.17362270450751252,
"grad_norm": 0.0018979490268975496,
"learning_rate": 0.00015916364767736143,
"logits/chosen": -5.078125,
"logits/rejected": -3.546875,
"logps/chosen": -132.25,
"logps/rejected": -401.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0,
"rewards/margins": 19.875,
"rewards/rejected": -16.875,
"step": 52
},
{
"epoch": 0.17696160267111852,
"grad_norm": 1.743229768180754e-05,
"learning_rate": 0.00015912242813170274,
"logits/chosen": -4.984375,
"logits/rejected": -3.7265625,
"logps/chosen": -125.25,
"logps/rejected": -457.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4140625,
"rewards/margins": 23.25,
"rewards/rejected": -19.875,
"step": 53
},
{
"epoch": 0.18030050083472454,
"grad_norm": 0.13024184107780457,
"learning_rate": 0.00015908022271777733,
"logits/chosen": -5.515625,
"logits/rejected": -3.796875,
"logps/chosen": -83.0,
"logps/rejected": -467.0,
"loss": 0.0043,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3828125,
"rewards/margins": 24.0,
"rewards/rejected": -20.625,
"step": 54
},
{
"epoch": 0.18363939899833054,
"grad_norm": 0.0001471200812375173,
"learning_rate": 0.0001590370319614662,
"logits/chosen": -4.859375,
"logits/rejected": -3.703125,
"logps/chosen": -111.5,
"logps/rejected": -470.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.546875,
"rewards/margins": 24.5625,
"rewards/rejected": -21.0,
"step": 55
},
{
"epoch": 0.18697829716193656,
"grad_norm": 0.0001591207692399621,
"learning_rate": 0.00015899285640092763,
"logits/chosen": -5.046875,
"logits/rejected": -3.8046875,
"logps/chosen": -115.75,
"logps/rejected": -455.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.703125,
"rewards/margins": 23.625,
"rewards/rejected": -19.875,
"step": 56
},
{
"epoch": 0.19031719532554256,
"grad_norm": 0.0009253205498680472,
"learning_rate": 0.00015894769658659073,
"logits/chosen": -4.65625,
"logits/rejected": -3.65625,
"logps/chosen": -122.25,
"logps/rejected": -446.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.40625,
"rewards/margins": 22.0625,
"rewards/rejected": -18.5625,
"step": 57
},
{
"epoch": 0.19365609348914858,
"grad_norm": 4.7780202294234186e-05,
"learning_rate": 0.00015890155308114837,
"logits/chosen": -5.234375,
"logits/rejected": -3.4765625,
"logps/chosen": -113.75,
"logps/rejected": -467.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6640625,
"rewards/margins": 24.1875,
"rewards/rejected": -20.5,
"step": 58
},
{
"epoch": 0.19699499165275458,
"grad_norm": 3.8026719266781583e-05,
"learning_rate": 0.00015885442645955026,
"logits/chosen": -5.0625,
"logits/rejected": -3.890625,
"logps/chosen": -119.75,
"logps/rejected": -399.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.171875,
"rewards/margins": 22.9375,
"rewards/rejected": -18.75,
"step": 59
},
{
"epoch": 0.2003338898163606,
"grad_norm": 0.00020460848463699222,
"learning_rate": 0.00015880631730899578,
"logits/chosen": -4.5859375,
"logits/rejected": -3.53125,
"logps/chosen": -99.0,
"logps/rejected": -449.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6328125,
"rewards/margins": 22.875,
"rewards/rejected": -19.25,
"step": 60
},
{
"epoch": 0.2036727879799666,
"grad_norm": 0.00048449577298015356,
"learning_rate": 0.0001587572262289267,
"logits/chosen": -5.546875,
"logits/rejected": -3.9453125,
"logps/chosen": -87.5,
"logps/rejected": -464.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.96875,
"rewards/margins": 24.4375,
"rewards/rejected": -21.4375,
"step": 61
},
{
"epoch": 0.20701168614357263,
"grad_norm": 0.14330030977725983,
"learning_rate": 0.00015870715383101955,
"logits/chosen": -5.671875,
"logits/rejected": -3.7265625,
"logps/chosen": -80.25,
"logps/rejected": -457.0,
"loss": 0.0031,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5234375,
"rewards/margins": 23.875,
"rewards/rejected": -20.375,
"step": 62
},
{
"epoch": 0.21035058430717862,
"grad_norm": 5.157471969141625e-05,
"learning_rate": 0.00015865610073917825,
"logits/chosen": -5.65625,
"logits/rejected": -3.875,
"logps/chosen": -96.5,
"logps/rejected": -399.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.0703125,
"rewards/margins": 23.125,
"rewards/rejected": -19.0625,
"step": 63
},
{
"epoch": 0.21368948247078465,
"grad_norm": 0.001272167544811964,
"learning_rate": 0.0001586040675895261,
"logits/chosen": -5.03125,
"logits/rejected": -3.828125,
"logps/chosen": -126.75,
"logps/rejected": -453.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1796875,
"rewards/margins": 23.4375,
"rewards/rejected": -20.3125,
"step": 64
},
{
"epoch": 0.21702838063439064,
"grad_norm": 0.10344758629798889,
"learning_rate": 0.00015855105503039804,
"logits/chosen": -5.0,
"logits/rejected": -3.65625,
"logps/chosen": -93.75,
"logps/rejected": -463.0,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.921875,
"rewards/margins": 24.0,
"rewards/rejected": -20.0625,
"step": 65
},
{
"epoch": 0.22036727879799667,
"grad_norm": 0.000509591365698725,
"learning_rate": 0.00015849706372233238,
"logits/chosen": -5.390625,
"logits/rejected": -3.7109375,
"logps/chosen": -99.5,
"logps/rejected": -421.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.34375,
"rewards/margins": 22.625,
"rewards/rejected": -19.3125,
"step": 66
},
{
"epoch": 0.22370617696160267,
"grad_norm": 0.0010495522292330861,
"learning_rate": 0.0001584420943380628,
"logits/chosen": -5.265625,
"logits/rejected": -3.9375,
"logps/chosen": -109.25,
"logps/rejected": -458.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3203125,
"rewards/margins": 23.1875,
"rewards/rejected": -19.875,
"step": 67
},
{
"epoch": 0.2270450751252087,
"grad_norm": 0.00022398516011890024,
"learning_rate": 0.0001583861475625097,
"logits/chosen": -5.296875,
"logits/rejected": -3.6796875,
"logps/chosen": -100.0,
"logps/rejected": -489.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6796875,
"rewards/margins": 24.6875,
"rewards/rejected": -21.0,
"step": 68
},
{
"epoch": 0.2303839732888147,
"grad_norm": 1.072521808964666e-05,
"learning_rate": 0.00015832922409277198,
"logits/chosen": -4.859375,
"logits/rejected": -3.9765625,
"logps/chosen": -124.75,
"logps/rejected": -395.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.640625,
"rewards/margins": 22.625,
"rewards/rejected": -18.9375,
"step": 69
},
{
"epoch": 0.2337228714524207,
"grad_norm": 0.00017957530508283526,
"learning_rate": 0.00015827132463811804,
"logits/chosen": -5.09375,
"logits/rejected": -3.828125,
"logps/chosen": -106.5,
"logps/rejected": -407.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.609375,
"rewards/margins": 22.5,
"rewards/rejected": -18.875,
"step": 70
},
{
"epoch": 0.2370617696160267,
"grad_norm": 0.0003923263284377754,
"learning_rate": 0.00015821244991997717,
"logits/chosen": -4.671875,
"logits/rejected": -3.53125,
"logps/chosen": -102.25,
"logps/rejected": -459.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9453125,
"rewards/margins": 23.875,
"rewards/rejected": -20.0,
"step": 71
},
{
"epoch": 0.24040066777963273,
"grad_norm": 1.9827170035568997e-05,
"learning_rate": 0.0001581526006719304,
"logits/chosen": -5.53125,
"logits/rejected": -3.84375,
"logps/chosen": -89.75,
"logps/rejected": -465.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.28125,
"rewards/margins": 24.75,
"rewards/rejected": -20.4375,
"step": 72
},
{
"epoch": 0.24373956594323873,
"grad_norm": 0.2003210037946701,
"learning_rate": 0.0001580917776397016,
"logits/chosen": -4.875,
"logits/rejected": -3.6875,
"logps/chosen": -123.0,
"logps/rejected": -445.0,
"loss": 0.0063,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.0546875,
"rewards/margins": 23.4375,
"rewards/rejected": -19.3125,
"step": 73
},
{
"epoch": 0.24707846410684475,
"grad_norm": 4.490726860240102e-05,
"learning_rate": 0.0001580299815811478,
"logits/chosen": -5.046875,
"logits/rejected": -3.6328125,
"logps/chosen": -96.25,
"logps/rejected": -463.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6015625,
"rewards/margins": 24.4375,
"rewards/rejected": -20.8125,
"step": 74
},
{
"epoch": 0.25041736227045075,
"grad_norm": 0.0011267291847616434,
"learning_rate": 0.00015796721326625013,
"logits/chosen": -5.078125,
"logits/rejected": -3.5625,
"logps/chosen": -99.25,
"logps/rejected": -478.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8046875,
"rewards/margins": 25.25,
"rewards/rejected": -21.4375,
"step": 75
},
{
"epoch": 0.25375626043405675,
"grad_norm": 2.087617986035184e-06,
"learning_rate": 0.00015790347347710405,
"logits/chosen": -5.046875,
"logits/rejected": -3.53125,
"logps/chosen": -118.0,
"logps/rejected": -464.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9296875,
"rewards/margins": 24.4375,
"rewards/rejected": -20.5,
"step": 76
},
{
"epoch": 0.2570951585976628,
"grad_norm": 0.00048489755135960877,
"learning_rate": 0.00015783876300790956,
"logits/chosen": -5.078125,
"logits/rejected": -3.6015625,
"logps/chosen": -102.25,
"logps/rejected": -466.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3359375,
"rewards/margins": 23.375,
"rewards/rejected": -20.0,
"step": 77
},
{
"epoch": 0.2604340567612688,
"grad_norm": 5.250661706668325e-05,
"learning_rate": 0.0001577730826649614,
"logits/chosen": -4.984375,
"logits/rejected": -3.484375,
"logps/chosen": -123.25,
"logps/rejected": -448.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.234375,
"rewards/margins": 23.6875,
"rewards/rejected": -19.5,
"step": 78
},
{
"epoch": 0.2637729549248748,
"grad_norm": 2.072748429782223e-05,
"learning_rate": 0.00015770643326663898,
"logits/chosen": -5.25,
"logits/rejected": -3.359375,
"logps/chosen": -80.25,
"logps/rejected": -486.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.46875,
"rewards/margins": 23.625,
"rewards/rejected": -20.1875,
"step": 79
},
{
"epoch": 0.2671118530884808,
"grad_norm": 0.002645147731527686,
"learning_rate": 0.0001576388156433962,
"logits/chosen": -5.15625,
"logits/rejected": -3.515625,
"logps/chosen": -119.0,
"logps/rejected": -456.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.765625,
"rewards/margins": 24.0625,
"rewards/rejected": -20.3125,
"step": 80
},
{
"epoch": 0.2704507512520868,
"grad_norm": 1.2537796465039719e-05,
"learning_rate": 0.00015757023063775106,
"logits/chosen": -4.71875,
"logits/rejected": -3.5078125,
"logps/chosen": -119.25,
"logps/rejected": -425.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.2421875,
"rewards/margins": 23.25,
"rewards/rejected": -19.0,
"step": 81
},
{
"epoch": 0.27378964941569284,
"grad_norm": 0.0001700354478089139,
"learning_rate": 0.00015750067910427513,
"logits/chosen": -5.125,
"logits/rejected": -3.59375,
"logps/chosen": -108.5,
"logps/rejected": -414.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7421875,
"rewards/margins": 22.0625,
"rewards/rejected": -18.3125,
"step": 82
},
{
"epoch": 0.27712854757929883,
"grad_norm": 5.510517439688556e-05,
"learning_rate": 0.000157430161909583,
"logits/chosen": -5.0,
"logits/rejected": -3.40625,
"logps/chosen": -102.0,
"logps/rejected": -452.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8828125,
"rewards/margins": 22.875,
"rewards/rejected": -19.0,
"step": 83
},
{
"epoch": 0.28046744574290483,
"grad_norm": 0.0010590353049337864,
"learning_rate": 0.00015735867993232143,
"logits/chosen": -5.078125,
"logits/rejected": -3.5703125,
"logps/chosen": -105.25,
"logps/rejected": -399.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.2265625,
"rewards/margins": 21.9375,
"rewards/rejected": -17.6875,
"step": 84
},
{
"epoch": 0.2838063439065108,
"grad_norm": 9.394536027684808e-05,
"learning_rate": 0.0001572862340631584,
"logits/chosen": -4.984375,
"logits/rejected": -3.6875,
"logps/chosen": -90.75,
"logps/rejected": -433.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.3125,
"rewards/margins": 22.875,
"rewards/rejected": -18.5625,
"step": 85
},
{
"epoch": 0.2871452420701169,
"grad_norm": 0.0001689967029960826,
"learning_rate": 0.00015721282520477197,
"logits/chosen": -4.828125,
"logits/rejected": -3.65625,
"logps/chosen": -95.5,
"logps/rejected": -429.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9375,
"rewards/margins": 22.5,
"rewards/rejected": -18.5625,
"step": 86
},
{
"epoch": 0.2904841402337229,
"grad_norm": 9.815259545575827e-06,
"learning_rate": 0.00015713845427183922,
"logits/chosen": -5.09375,
"logits/rejected": -3.4140625,
"logps/chosen": -87.5,
"logps/rejected": -453.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.53125,
"rewards/margins": 24.1875,
"rewards/rejected": -19.625,
"step": 87
},
{
"epoch": 0.2938230383973289,
"grad_norm": 2.0043949916725978e-05,
"learning_rate": 0.0001570631221910245,
"logits/chosen": -4.703125,
"logits/rejected": -3.5546875,
"logps/chosen": -127.0,
"logps/rejected": -440.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.265625,
"rewards/margins": 23.9375,
"rewards/rejected": -19.6875,
"step": 88
},
{
"epoch": 0.29716193656093487,
"grad_norm": 0.000917528523132205,
"learning_rate": 0.00015698682990096828,
"logits/chosen": -4.90625,
"logits/rejected": -3.5625,
"logps/chosen": -111.25,
"logps/rejected": -377.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.546875,
"rewards/margins": 22.25,
"rewards/rejected": -17.6875,
"step": 89
},
{
"epoch": 0.3005008347245409,
"grad_norm": 8.072228229139e-05,
"learning_rate": 0.00015690957835227522,
"logits/chosen": -5.28125,
"logits/rejected": -3.4765625,
"logps/chosen": -79.25,
"logps/rejected": -458.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.96875,
"rewards/margins": 22.8125,
"rewards/rejected": -18.8125,
"step": 90
},
{
"epoch": 0.3038397328881469,
"grad_norm": 4.4778818846680224e-05,
"learning_rate": 0.00015683136850750236,
"logits/chosen": -4.453125,
"logits/rejected": -3.28125,
"logps/chosen": -104.25,
"logps/rejected": -458.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4765625,
"rewards/margins": 24.0625,
"rewards/rejected": -20.5625,
"step": 91
},
{
"epoch": 0.3071786310517529,
"grad_norm": 5.669149322784506e-05,
"learning_rate": 0.00015675220134114712,
"logits/chosen": -4.796875,
"logits/rejected": -3.4765625,
"logps/chosen": -94.25,
"logps/rejected": -428.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.1796875,
"rewards/margins": 23.1875,
"rewards/rejected": -19.0625,
"step": 92
},
{
"epoch": 0.3105175292153589,
"grad_norm": 7.771019227220677e-06,
"learning_rate": 0.00015667207783963516,
"logits/chosen": -5.09375,
"logits/rejected": -3.359375,
"logps/chosen": -102.25,
"logps/rejected": -465.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.90625,
"rewards/margins": 24.0625,
"rewards/rejected": -20.125,
"step": 93
},
{
"epoch": 0.31385642737896496,
"grad_norm": 0.0015844438457861543,
"learning_rate": 0.00015659099900130826,
"logits/chosen": -4.9375,
"logits/rejected": -3.265625,
"logps/chosen": -97.25,
"logps/rejected": -508.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8046875,
"rewards/margins": 25.125,
"rewards/rejected": -21.375,
"step": 94
},
{
"epoch": 0.31719532554257096,
"grad_norm": 0.0008710987749509513,
"learning_rate": 0.00015650896583641158,
"logits/chosen": -5.140625,
"logits/rejected": -3.3046875,
"logps/chosen": -102.5,
"logps/rejected": -449.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6171875,
"rewards/margins": 23.875,
"rewards/rejected": -20.25,
"step": 95
},
{
"epoch": 0.32053422370617696,
"grad_norm": 2.450255806252244e-06,
"learning_rate": 0.00015642597936708127,
"logits/chosen": -4.6875,
"logits/rejected": -3.3515625,
"logps/chosen": -124.5,
"logps/rejected": -473.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.375,
"rewards/margins": 25.0,
"rewards/rejected": -20.625,
"step": 96
},
{
"epoch": 0.32387312186978295,
"grad_norm": 4.694379822467454e-05,
"learning_rate": 0.00015634204062733167,
"logits/chosen": -5.015625,
"logits/rejected": -3.59375,
"logps/chosen": -103.75,
"logps/rejected": -385.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.6875,
"rewards/margins": 23.0,
"rewards/rejected": -18.3125,
"step": 97
},
{
"epoch": 0.327212020033389,
"grad_norm": 0.0004240713897161186,
"learning_rate": 0.00015625715066304246,
"logits/chosen": -4.9375,
"logits/rejected": -3.671875,
"logps/chosen": -103.75,
"logps/rejected": -385.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.875,
"rewards/margins": 22.125,
"rewards/rejected": -18.3125,
"step": 98
},
{
"epoch": 0.330550918196995,
"grad_norm": 0.864057183265686,
"learning_rate": 0.00015617131053194565,
"logits/chosen": -4.609375,
"logits/rejected": -3.4453125,
"logps/chosen": -128.0,
"logps/rejected": -453.0,
"loss": 0.0811,
"rewards/accuracies": 0.984375,
"rewards/chosen": 3.671875,
"rewards/margins": 22.625,
"rewards/rejected": -18.9375,
"step": 99
},
{
"epoch": 0.333889816360601,
"grad_norm": 0.0037979809567332268,
"learning_rate": 0.0001560845213036123,
"logits/chosen": -4.65625,
"logits/rejected": -3.65625,
"logps/chosen": -113.5,
"logps/rejected": -393.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.890625,
"rewards/margins": 24.0625,
"rewards/rejected": -20.25,
"step": 100
},
{
"epoch": 0.337228714524207,
"grad_norm": 0.0002999906719196588,
"learning_rate": 0.00015599678405943927,
"logits/chosen": -5.171875,
"logits/rejected": -3.7890625,
"logps/chosen": -104.75,
"logps/rejected": -471.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7890625,
"rewards/margins": 27.0625,
"rewards/rejected": -23.25,
"step": 101
},
{
"epoch": 0.34056761268781305,
"grad_norm": 0.38996586203575134,
"learning_rate": 0.00015590809989263576,
"logits/chosen": -5.265625,
"logits/rejected": -3.3359375,
"logps/chosen": -95.25,
"logps/rejected": -461.0,
"loss": 0.0173,
"rewards/accuracies": 0.984375,
"rewards/chosen": 3.4140625,
"rewards/margins": 24.8125,
"rewards/rejected": -21.4375,
"step": 102
},
{
"epoch": 0.34390651085141904,
"grad_norm": 0.0001715045509627089,
"learning_rate": 0.00015581846990820965,
"logits/chosen": -5.375,
"logits/rejected": -3.953125,
"logps/chosen": -119.75,
"logps/rejected": -469.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8359375,
"rewards/margins": 27.1875,
"rewards/rejected": -23.3125,
"step": 103
},
{
"epoch": 0.34724540901502504,
"grad_norm": 0.0018466322217136621,
"learning_rate": 0.00015572789522295372,
"logits/chosen": -5.21875,
"logits/rejected": -4.0390625,
"logps/chosen": -117.25,
"logps/rejected": -507.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6015625,
"rewards/margins": 29.3125,
"rewards/rejected": -25.625,
"step": 104
},
{
"epoch": 0.35058430717863104,
"grad_norm": 1.7793492588680238e-05,
"learning_rate": 0.00015563637696543173,
"logits/chosen": -5.375,
"logits/rejected": -3.765625,
"logps/chosen": -122.5,
"logps/rejected": -574.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1953125,
"rewards/margins": 32.0,
"rewards/rejected": -28.75,
"step": 105
},
{
"epoch": 0.35392320534223703,
"grad_norm": 0.0019883729983121157,
"learning_rate": 0.00015554391627596446,
"logits/chosen": -5.75,
"logits/rejected": -3.84375,
"logps/chosen": -74.75,
"logps/rejected": -558.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4296875,
"rewards/margins": 30.75,
"rewards/rejected": -27.25,
"step": 106
},
{
"epoch": 0.3572621035058431,
"grad_norm": 0.0002132615481968969,
"learning_rate": 0.0001554505143066154,
"logits/chosen": -5.515625,
"logits/rejected": -3.8984375,
"logps/chosen": -118.5,
"logps/rejected": -496.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.265625,
"rewards/margins": 30.0,
"rewards/rejected": -25.75,
"step": 107
},
{
"epoch": 0.3606010016694491,
"grad_norm": 0.018147334456443787,
"learning_rate": 0.0001553561722211764,
"logits/chosen": -5.5625,
"logits/rejected": -3.9375,
"logps/chosen": -98.75,
"logps/rejected": -535.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8828125,
"rewards/margins": 32.5,
"rewards/rejected": -28.5625,
"step": 108
},
{
"epoch": 0.3639398998330551,
"grad_norm": 0.00014658304280601442,
"learning_rate": 0.00015526089119515316,
"logits/chosen": -5.796875,
"logits/rejected": -3.875,
"logps/chosen": -107.5,
"logps/rejected": -476.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.1171875,
"rewards/margins": 30.375,
"rewards/rejected": -26.25,
"step": 109
},
{
"epoch": 0.3672787979966611,
"grad_norm": 0.5631054043769836,
"learning_rate": 0.00015516467241575066,
"logits/chosen": -5.46875,
"logits/rejected": -3.9453125,
"logps/chosen": -91.0,
"logps/rejected": -548.0,
"loss": 0.063,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.0703125,
"rewards/margins": 32.875,
"rewards/rejected": -28.75,
"step": 110
},
{
"epoch": 0.37061769616026713,
"grad_norm": 0.0015704554971307516,
"learning_rate": 0.00015506751708185837,
"logits/chosen": -5.359375,
"logits/rejected": -3.9609375,
"logps/chosen": -97.0,
"logps/rejected": -463.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.515625,
"rewards/margins": 29.625,
"rewards/rejected": -25.125,
"step": 111
},
{
"epoch": 0.3739565943238731,
"grad_norm": 0.03998275473713875,
"learning_rate": 0.00015496942640403515,
"logits/chosen": -5.5625,
"logits/rejected": -3.6484375,
"logps/chosen": -90.0,
"logps/rejected": -502.0,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.375,
"rewards/margins": 30.125,
"rewards/rejected": -26.6875,
"step": 112
},
{
"epoch": 0.3772954924874791,
"grad_norm": 1.5272264136001468e-05,
"learning_rate": 0.00015487040160449433,
"logits/chosen": -5.21875,
"logits/rejected": -3.859375,
"logps/chosen": -108.0,
"logps/rejected": -489.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.109375,
"rewards/margins": 29.0625,
"rewards/rejected": -24.9375,
"step": 113
},
{
"epoch": 0.3806343906510851,
"grad_norm": 2.4422410206170753e-05,
"learning_rate": 0.00015477044391708848,
"logits/chosen": -5.28125,
"logits/rejected": -3.7890625,
"logps/chosen": -119.0,
"logps/rejected": -501.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.375,
"rewards/margins": 30.1875,
"rewards/rejected": -25.8125,
"step": 114
},
{
"epoch": 0.38397328881469117,
"grad_norm": 0.009292054921388626,
"learning_rate": 0.00015466955458729386,
"logits/chosen": -5.640625,
"logits/rejected": -3.6484375,
"logps/chosen": -84.75,
"logps/rejected": -505.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.1171875,
"rewards/margins": 29.5,
"rewards/rejected": -25.3125,
"step": 115
},
{
"epoch": 0.38731218697829717,
"grad_norm": 0.016696617007255554,
"learning_rate": 0.00015456773487219517,
"logits/chosen": -5.109375,
"logits/rejected": -3.640625,
"logps/chosen": -92.5,
"logps/rejected": -484.0,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.28125,
"rewards/margins": 29.375,
"rewards/rejected": -25.0625,
"step": 116
},
{
"epoch": 0.39065108514190316,
"grad_norm": 3.30902221321594e-05,
"learning_rate": 0.00015446498604046967,
"logits/chosen": -4.9375,
"logits/rejected": -3.3359375,
"logps/chosen": -110.25,
"logps/rejected": -536.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.453125,
"rewards/margins": 29.875,
"rewards/rejected": -25.5,
"step": 117
},
{
"epoch": 0.39398998330550916,
"grad_norm": 0.007862205617129803,
"learning_rate": 0.00015436130937237144,
"logits/chosen": -4.28125,
"logits/rejected": -3.265625,
"logps/chosen": -102.5,
"logps/rejected": -476.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.703125,
"rewards/margins": 29.0,
"rewards/rejected": -24.25,
"step": 118
},
{
"epoch": 0.3973288814691152,
"grad_norm": 0.00031752747599966824,
"learning_rate": 0.00015425670615971544,
"logits/chosen": -4.84375,
"logits/rejected": -3.0859375,
"logps/chosen": -105.75,
"logps/rejected": -473.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.78125,
"rewards/margins": 29.6875,
"rewards/rejected": -24.9375,
"step": 119
},
{
"epoch": 0.4006677796327212,
"grad_norm": 3.322392512927763e-05,
"learning_rate": 0.00015415117770586144,
"logits/chosen": -5.234375,
"logits/rejected": -3.1875,
"logps/chosen": -75.25,
"logps/rejected": -491.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.9375,
"rewards/margins": 29.8125,
"rewards/rejected": -24.8125,
"step": 120
},
{
"epoch": 0.4040066777963272,
"grad_norm": 0.00014395274047274143,
"learning_rate": 0.00015404472532569771,
"logits/chosen": -5.078125,
"logits/rejected": -3.109375,
"logps/chosen": -107.0,
"logps/rejected": -468.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.90625,
"rewards/margins": 28.4375,
"rewards/rejected": -23.5625,
"step": 121
},
{
"epoch": 0.4073455759599332,
"grad_norm": 3.165722773701418e-06,
"learning_rate": 0.0001539373503456247,
"logits/chosen": -4.765625,
"logits/rejected": -3.125,
"logps/chosen": -110.25,
"logps/rejected": -497.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.875,
"rewards/margins": 28.125,
"rewards/rejected": -23.1875,
"step": 122
},
{
"epoch": 0.41068447412353926,
"grad_norm": 0.05887475982308388,
"learning_rate": 0.00015382905410353846,
"logits/chosen": -4.84375,
"logits/rejected": -3.375,
"logps/chosen": -121.0,
"logps/rejected": -435.0,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.03125,
"rewards/margins": 24.9375,
"rewards/rejected": -20.9375,
"step": 123
},
{
"epoch": 0.41402337228714525,
"grad_norm": 0.0702575147151947,
"learning_rate": 0.00015371983794881404,
"logits/chosen": -4.890625,
"logits/rejected": -3.0546875,
"logps/chosen": -69.25,
"logps/rejected": -460.0,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.84375,
"rewards/margins": 26.8125,
"rewards/rejected": -23.0,
"step": 124
},
{
"epoch": 0.41736227045075125,
"grad_norm": 0.01235484890639782,
"learning_rate": 0.0001536097032422886,
"logits/chosen": -4.9375,
"logits/rejected": -2.921875,
"logps/chosen": -102.5,
"logps/rejected": -492.0,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.59375,
"rewards/margins": 27.1875,
"rewards/rejected": -22.625,
"step": 125
},
{
"epoch": 0.42070116861435725,
"grad_norm": 0.0033392056357115507,
"learning_rate": 0.00015349865135624448,
"logits/chosen": -5.078125,
"logits/rejected": -2.8125,
"logps/chosen": -93.5,
"logps/rejected": -439.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.34375,
"rewards/margins": 25.375,
"rewards/rejected": -21.0,
"step": 126
},
{
"epoch": 0.4240400667779633,
"grad_norm": 0.0017836507176980376,
"learning_rate": 0.0001533866836743922,
"logits/chosen": -5.21875,
"logits/rejected": -3.15625,
"logps/chosen": -95.25,
"logps/rejected": -440.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.171875,
"rewards/margins": 27.5,
"rewards/rejected": -22.3125,
"step": 127
},
{
"epoch": 0.4273789649415693,
"grad_norm": 1.02886324384599e-05,
"learning_rate": 0.00015327380159185295,
"logits/chosen": -4.8125,
"logits/rejected": -3.0,
"logps/chosen": -98.0,
"logps/rejected": -496.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.90625,
"rewards/margins": 28.75,
"rewards/rejected": -23.875,
"step": 128
},
{
"epoch": 0.4307178631051753,
"grad_norm": 0.0003149933472741395,
"learning_rate": 0.00015316000651514157,
"logits/chosen": -5.140625,
"logits/rejected": -2.9765625,
"logps/chosen": -82.25,
"logps/rejected": -471.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.78125,
"rewards/margins": 27.5625,
"rewards/rejected": -22.75,
"step": 129
},
{
"epoch": 0.4340567612687813,
"grad_norm": 5.283300197334029e-05,
"learning_rate": 0.0001530452998621487,
"logits/chosen": -4.8125,
"logits/rejected": -3.0390625,
"logps/chosen": -112.0,
"logps/rejected": -428.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.859375,
"rewards/margins": 26.0,
"rewards/rejected": -21.125,
"step": 130
},
{
"epoch": 0.4373956594323873,
"grad_norm": 7.664081931579858e-05,
"learning_rate": 0.00015292968306212336,
"logits/chosen": -4.453125,
"logits/rejected": -2.9375,
"logps/chosen": -101.5,
"logps/rejected": -450.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.203125,
"rewards/margins": 26.875,
"rewards/rejected": -21.6875,
"step": 131
},
{
"epoch": 0.44073455759599334,
"grad_norm": 0.00020385748939588666,
"learning_rate": 0.00015281315755565498,
"logits/chosen": -4.765625,
"logits/rejected": -3.1015625,
"logps/chosen": -95.25,
"logps/rejected": -448.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.859375,
"rewards/margins": 27.125,
"rewards/rejected": -22.25,
"step": 132
},
{
"epoch": 0.44407345575959933,
"grad_norm": 0.000807323376648128,
"learning_rate": 0.0001526957247946555,
"logits/chosen": -5.078125,
"logits/rejected": -2.75,
"logps/chosen": -98.75,
"logps/rejected": -494.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.765625,
"rewards/margins": 28.4375,
"rewards/rejected": -23.6875,
"step": 133
},
{
"epoch": 0.44741235392320533,
"grad_norm": 0.0005043560522608459,
"learning_rate": 0.0001525773862423413,
"logits/chosen": -4.984375,
"logits/rejected": -2.921875,
"logps/chosen": -100.0,
"logps/rejected": -462.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.578125,
"rewards/margins": 27.375,
"rewards/rejected": -22.8125,
"step": 134
},
{
"epoch": 0.4507512520868113,
"grad_norm": 0.0002074016520055011,
"learning_rate": 0.00015245814337321492,
"logits/chosen": -4.8125,
"logits/rejected": -2.71875,
"logps/chosen": -106.0,
"logps/rejected": -532.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.640625,
"rewards/margins": 29.4375,
"rewards/rejected": -24.8125,
"step": 135
},
{
"epoch": 0.4540901502504174,
"grad_norm": 8.115387754514813e-05,
"learning_rate": 0.0001523379976730468,
"logits/chosen": -4.90625,
"logits/rejected": -2.90625,
"logps/chosen": -88.5,
"logps/rejected": -502.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.8125,
"rewards/margins": 28.5,
"rewards/rejected": -23.6875,
"step": 136
},
{
"epoch": 0.4574290484140234,
"grad_norm": 3.2275711419060826e-06,
"learning_rate": 0.00015221695063885664,
"logits/chosen": -4.75,
"logits/rejected": -2.8046875,
"logps/chosen": -85.0,
"logps/rejected": -494.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.03125,
"rewards/margins": 29.1875,
"rewards/rejected": -24.1875,
"step": 137
},
{
"epoch": 0.4607679465776294,
"grad_norm": 0.0006476023118011653,
"learning_rate": 0.00015209500377889472,
"logits/chosen": -4.6875,
"logits/rejected": -2.8359375,
"logps/chosen": -107.75,
"logps/rejected": -498.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.671875,
"rewards/margins": 28.5,
"rewards/rejected": -23.8125,
"step": 138
},
{
"epoch": 0.46410684474123537,
"grad_norm": 4.4746982894139364e-05,
"learning_rate": 0.00015197215861262325,
"logits/chosen": -4.796875,
"logits/rejected": -2.65625,
"logps/chosen": -102.0,
"logps/rejected": -566.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.140625,
"rewards/margins": 30.875,
"rewards/rejected": -25.75,
"step": 139
},
{
"epoch": 0.4674457429048414,
"grad_norm": 0.05361659452319145,
"learning_rate": 0.00015184841667069748,
"logits/chosen": -4.796875,
"logits/rejected": -2.9609375,
"logps/chosen": -98.75,
"logps/rejected": -447.0,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.78125,
"rewards/margins": 28.0625,
"rewards/rejected": -23.25,
"step": 140
},
{
"epoch": 0.4707846410684474,
"grad_norm": 0.00023327009694185108,
"learning_rate": 0.0001517237794949463,
"logits/chosen": -4.609375,
"logits/rejected": -3.03125,
"logps/chosen": -103.5,
"logps/rejected": -476.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.4453125,
"rewards/margins": 28.5625,
"rewards/rejected": -24.125,
"step": 141
},
{
"epoch": 0.4741235392320534,
"grad_norm": 0.2315979301929474,
"learning_rate": 0.00015159824863835336,
"logits/chosen": -5.484375,
"logits/rejected": -2.578125,
"logps/chosen": -76.5,
"logps/rejected": -536.0,
"loss": 0.0126,
"rewards/accuracies": 0.984375,
"rewards/chosen": 3.59375,
"rewards/margins": 29.0,
"rewards/rejected": -25.4375,
"step": 142
},
{
"epoch": 0.4774624373956594,
"grad_norm": 0.1798364669084549,
"learning_rate": 0.00015147182566503764,
"logits/chosen": -5.078125,
"logits/rejected": -2.9296875,
"logps/chosen": -101.25,
"logps/rejected": -478.0,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.078125,
"rewards/margins": 28.4375,
"rewards/rejected": -23.4375,
"step": 143
},
{
"epoch": 0.48080133555926546,
"grad_norm": 2.0006180420750752e-05,
"learning_rate": 0.00015134451215023385,
"logits/chosen": -4.34375,
"logits/rejected": -2.96875,
"logps/chosen": -103.0,
"logps/rejected": -493.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.109375,
"rewards/margins": 29.0,
"rewards/rejected": -23.875,
"step": 144
},
{
"epoch": 0.48414023372287146,
"grad_norm": 0.0023838214110583067,
"learning_rate": 0.0001512163096802729,
"logits/chosen": -4.96875,
"logits/rejected": -2.828125,
"logps/chosen": -107.25,
"logps/rejected": -492.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.078125,
"rewards/margins": 28.0625,
"rewards/rejected": -23.9375,
"step": 145
},
{
"epoch": 0.48747913188647746,
"grad_norm": 0.006409293040633202,
"learning_rate": 0.00015108721985256215,
"logits/chosen": -4.765625,
"logits/rejected": -2.734375,
"logps/chosen": -79.0,
"logps/rejected": -548.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.625,
"rewards/margins": 31.6875,
"rewards/rejected": -27.0,
"step": 146
},
{
"epoch": 0.49081803005008345,
"grad_norm": 0.00015124543278943747,
"learning_rate": 0.00015095724427556544,
"logits/chosen": -4.84375,
"logits/rejected": -2.8515625,
"logps/chosen": -102.25,
"logps/rejected": -460.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.859375,
"rewards/margins": 27.0625,
"rewards/rejected": -22.1875,
"step": 147
},
{
"epoch": 0.4941569282136895,
"grad_norm": 0.00012339219392742962,
"learning_rate": 0.00015082638456878308,
"logits/chosen": -4.75,
"logits/rejected": -2.9921875,
"logps/chosen": -89.0,
"logps/rejected": -481.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.2265625,
"rewards/margins": 28.9375,
"rewards/rejected": -24.6875,
"step": 148
},
{
"epoch": 0.4974958263772955,
"grad_norm": 1.230505313287722e-05,
"learning_rate": 0.0001506946423627316,
"logits/chosen": -5.0,
"logits/rejected": -3.1171875,
"logps/chosen": -82.5,
"logps/rejected": -489.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.59375,
"rewards/margins": 29.125,
"rewards/rejected": -24.5,
"step": 149
},
{
"epoch": 0.5008347245409015,
"grad_norm": 2.1004785594413988e-05,
"learning_rate": 0.00015056201929892368,
"logits/chosen": -4.25,
"logits/rejected": -2.9765625,
"logps/chosen": -112.25,
"logps/rejected": -434.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.171875,
"rewards/margins": 27.0625,
"rewards/rejected": -22.875,
"step": 150
},
{
"epoch": 0.5041736227045075,
"grad_norm": 0.00019315003009978682,
"learning_rate": 0.00015042851702984732,
"logits/chosen": -4.84375,
"logits/rejected": -2.875,
"logps/chosen": -108.75,
"logps/rejected": -448.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.625,
"rewards/margins": 27.5625,
"rewards/rejected": -22.9375,
"step": 151
},
{
"epoch": 0.5075125208681135,
"grad_norm": 0.003308866871520877,
"learning_rate": 0.00015029413721894558,
"logits/chosen": -4.75,
"logits/rejected": -2.984375,
"logps/chosen": -122.5,
"logps/rejected": -488.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.609375,
"rewards/margins": 28.625,
"rewards/rejected": -24.0625,
"step": 152
},
{
"epoch": 0.5108514190317195,
"grad_norm": 5.397659606387606e-06,
"learning_rate": 0.00015015888154059568,
"logits/chosen": -4.53125,
"logits/rejected": -3.1640625,
"logps/chosen": -101.5,
"logps/rejected": -505.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.75,
"rewards/margins": 29.3125,
"rewards/rejected": -24.5625,
"step": 153
},
{
"epoch": 0.5141903171953256,
"grad_norm": 0.10684552043676376,
"learning_rate": 0.00015002275168008816,
"logits/chosen": -5.46875,
"logits/rejected": -3.015625,
"logps/chosen": -71.75,
"logps/rejected": -500.0,
"loss": 0.005,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.2578125,
"rewards/margins": 29.4375,
"rewards/rejected": -25.125,
"step": 154
},
{
"epoch": 0.5175292153589316,
"grad_norm": 3.8425196180469356e-07,
"learning_rate": 0.00014988574933360593,
"logits/chosen": -4.71875,
"logits/rejected": -2.9375,
"logps/chosen": -101.25,
"logps/rejected": -507.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.765625,
"rewards/margins": 29.75,
"rewards/rejected": -25.0,
"step": 155
},
{
"epoch": 0.5208681135225376,
"grad_norm": 7.517022822867148e-06,
"learning_rate": 0.0001497478762082031,
"logits/chosen": -5.125,
"logits/rejected": -2.609375,
"logps/chosen": -96.75,
"logps/rejected": -532.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.515625,
"rewards/margins": 32.0,
"rewards/rejected": -27.4375,
"step": 156
},
{
"epoch": 0.5242070116861436,
"grad_norm": 0.00013846807996742427,
"learning_rate": 0.00014960913402178373,
"logits/chosen": -5.328125,
"logits/rejected": -2.953125,
"logps/chosen": -86.25,
"logps/rejected": -520.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.015625,
"rewards/margins": 30.5625,
"rewards/rejected": -25.5625,
"step": 157
},
{
"epoch": 0.5275459098497496,
"grad_norm": 3.7317280657589436e-05,
"learning_rate": 0.00014946952450308035,
"logits/chosen": -5.03125,
"logits/rejected": -3.0859375,
"logps/chosen": -83.5,
"logps/rejected": -505.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.546875,
"rewards/margins": 30.625,
"rewards/rejected": -26.0625,
"step": 158
},
{
"epoch": 0.5308848080133556,
"grad_norm": 1.3853728887625039e-05,
"learning_rate": 0.00014932904939163257,
"logits/chosen": -5.125,
"logits/rejected": -2.8359375,
"logps/chosen": -76.5,
"logps/rejected": -490.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.859375,
"rewards/margins": 29.8125,
"rewards/rejected": -25.0625,
"step": 159
},
{
"epoch": 0.5342237061769616,
"grad_norm": 6.569054676219821e-06,
"learning_rate": 0.00014918771043776524,
"logits/chosen": -4.734375,
"logits/rejected": -3.0859375,
"logps/chosen": -116.75,
"logps/rejected": -504.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.453125,
"rewards/margins": 29.75,
"rewards/rejected": -25.25,
"step": 160
},
{
"epoch": 0.5375626043405676,
"grad_norm": 0.018823888152837753,
"learning_rate": 0.00014904550940256675,
"logits/chosen": -4.6875,
"logits/rejected": -2.8203125,
"logps/chosen": -103.75,
"logps/rejected": -521.0,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.953125,
"rewards/margins": 30.125,
"rewards/rejected": -25.1875,
"step": 161
},
{
"epoch": 0.5409015025041736,
"grad_norm": 0.01165434904396534,
"learning_rate": 0.00014890244805786706,
"logits/chosen": -5.03125,
"logits/rejected": -2.9921875,
"logps/chosen": -100.25,
"logps/rejected": -465.0,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.46875,
"rewards/margins": 29.375,
"rewards/rejected": -24.875,
"step": 162
},
{
"epoch": 0.5442404006677797,
"grad_norm": 2.0660480004153214e-05,
"learning_rate": 0.00014875852818621563,
"logits/chosen": -4.6875,
"logits/rejected": -2.8671875,
"logps/chosen": -112.0,
"logps/rejected": -467.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.6875,
"rewards/margins": 29.5625,
"rewards/rejected": -24.8125,
"step": 163
},
{
"epoch": 0.5475792988313857,
"grad_norm": 0.002862096531316638,
"learning_rate": 0.00014861375158085915,
"logits/chosen": -4.46875,
"logits/rejected": -2.9921875,
"logps/chosen": -110.75,
"logps/rejected": -487.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.703125,
"rewards/margins": 29.5625,
"rewards/rejected": -24.8125,
"step": 164
},
{
"epoch": 0.5509181969949917,
"grad_norm": 0.0027916012331843376,
"learning_rate": 0.00014846812004571928,
"logits/chosen": -4.953125,
"logits/rejected": -3.1015625,
"logps/chosen": -92.25,
"logps/rejected": -482.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.234375,
"rewards/margins": 29.375,
"rewards/rejected": -25.125,
"step": 165
},
{
"epoch": 0.5542570951585977,
"grad_norm": 0.00024799967650324106,
"learning_rate": 0.0001483216353953701,
"logits/chosen": -4.71875,
"logits/rejected": -2.9296875,
"logps/chosen": -108.0,
"logps/rejected": -451.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.6875,
"rewards/margins": 28.5625,
"rewards/rejected": -23.9375,
"step": 166
},
{
"epoch": 0.5575959933222037,
"grad_norm": 0.0007697382825426757,
"learning_rate": 0.00014817429945501563,
"logits/chosen": -4.578125,
"logits/rejected": -2.875,
"logps/chosen": -109.5,
"logps/rejected": -505.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.0859375,
"rewards/margins": 29.0,
"rewards/rejected": -24.875,
"step": 167
},
{
"epoch": 0.5609348914858097,
"grad_norm": 3.113675120403059e-05,
"learning_rate": 0.00014802611406046685,
"logits/chosen": -4.953125,
"logits/rejected": -3.140625,
"logps/chosen": -89.5,
"logps/rejected": -470.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.8125,
"rewards/margins": 29.25,
"rewards/rejected": -24.5,
"step": 168
},
{
"epoch": 0.5642737896494157,
"grad_norm": 0.0006815637461841106,
"learning_rate": 0.00014787708105811905,
"logits/chosen": -4.984375,
"logits/rejected": -2.984375,
"logps/chosen": -88.375,
"logps/rejected": -471.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.859375,
"rewards/margins": 29.375,
"rewards/rejected": -24.5,
"step": 169
},
{
"epoch": 0.5676126878130217,
"grad_norm": 0.00014252612891141325,
"learning_rate": 0.00014772720230492878,
"logits/chosen": -5.15625,
"logits/rejected": -2.8203125,
"logps/chosen": -103.25,
"logps/rejected": -525.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.703125,
"rewards/margins": 30.3125,
"rewards/rejected": -25.625,
"step": 170
},
{
"epoch": 0.5709515859766278,
"grad_norm": 2.218190456915181e-05,
"learning_rate": 0.00014757647966839058,
"logits/chosen": -5.0,
"logits/rejected": -2.8359375,
"logps/chosen": -81.875,
"logps/rejected": -514.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.15625,
"rewards/margins": 30.375,
"rewards/rejected": -26.25,
"step": 171
},
{
"epoch": 0.5742904841402338,
"grad_norm": 0.00024572337861172855,
"learning_rate": 0.0001474249150265139,
"logits/chosen": -4.921875,
"logits/rejected": -2.921875,
"logps/chosen": -77.125,
"logps/rejected": -491.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.734375,
"rewards/margins": 29.8125,
"rewards/rejected": -25.0625,
"step": 172
},
{
"epoch": 0.5776293823038398,
"grad_norm": 0.062237586826086044,
"learning_rate": 0.00014727251026779953,
"logits/chosen": -4.921875,
"logits/rejected": -3.0625,
"logps/chosen": -104.25,
"logps/rejected": -480.0,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.84375,
"rewards/margins": 28.3125,
"rewards/rejected": -24.5,
"step": 173
},
{
"epoch": 0.5809682804674458,
"grad_norm": 0.010632738471031189,
"learning_rate": 0.0001471192672912162,
"logits/chosen": -5.0,
"logits/rejected": -2.859375,
"logps/chosen": -91.0,
"logps/rejected": -540.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.53125,
"rewards/margins": 31.75,
"rewards/rejected": -27.25,
"step": 174
},
{
"epoch": 0.5843071786310517,
"grad_norm": 0.004395844414830208,
"learning_rate": 0.00014696518800617686,
"logits/chosen": -4.796875,
"logits/rejected": -3.109375,
"logps/chosen": -100.25,
"logps/rejected": -480.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.703125,
"rewards/margins": 27.8125,
"rewards/rejected": -24.0625,
"step": 175
},
{
"epoch": 0.5876460767946577,
"grad_norm": 0.0008318977779708803,
"learning_rate": 0.00014681027433251486,
"logits/chosen": -4.546875,
"logits/rejected": -3.078125,
"logps/chosen": -108.25,
"logps/rejected": -473.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.765625,
"rewards/margins": 29.4375,
"rewards/rejected": -24.625,
"step": 176
},
{
"epoch": 0.5909849749582637,
"grad_norm": 0.47385621070861816,
"learning_rate": 0.00014665452820046006,
"logits/chosen": -4.71875,
"logits/rejected": -3.015625,
"logps/chosen": -92.25,
"logps/rejected": -467.0,
"loss": 0.0391,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.125,
"rewards/margins": 27.625,
"rewards/rejected": -23.5625,
"step": 177
},
{
"epoch": 0.5943238731218697,
"grad_norm": 2.916028734034626e-07,
"learning_rate": 0.00014649795155061485,
"logits/chosen": -4.921875,
"logits/rejected": -2.828125,
"logps/chosen": -86.25,
"logps/rejected": -518.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.65625,
"rewards/margins": 30.625,
"rewards/rejected": -26.0,
"step": 178
},
{
"epoch": 0.5976627712854758,
"grad_norm": 0.06330767273902893,
"learning_rate": 0.00014634054633392982,
"logits/chosen": -4.5,
"logits/rejected": -2.75,
"logps/chosen": -98.25,
"logps/rejected": -497.0,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.875,
"rewards/margins": 29.6875,
"rewards/rejected": -24.875,
"step": 179
},
{
"epoch": 0.6010016694490818,
"grad_norm": 0.0008164051687344909,
"learning_rate": 0.00014618231451167955,
"logits/chosen": -4.640625,
"logits/rejected": -2.65625,
"logps/chosen": -94.75,
"logps/rejected": -432.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.46875,
"rewards/margins": 26.5625,
"rewards/rejected": -21.0625,
"step": 180
},
{
"epoch": 0.6043405676126878,
"grad_norm": 6.780491821700707e-05,
"learning_rate": 0.00014602325805543822,
"logits/chosen": -4.265625,
"logits/rejected": -2.84375,
"logps/chosen": -97.25,
"logps/rejected": -453.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.875,
"rewards/margins": 27.5,
"rewards/rejected": -22.5625,
"step": 181
},
{
"epoch": 0.6076794657762938,
"grad_norm": 4.8602585593471304e-05,
"learning_rate": 0.00014586337894705487,
"logits/chosen": -4.1875,
"logits/rejected": -2.78125,
"logps/chosen": -94.0,
"logps/rejected": -409.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.140625,
"rewards/margins": 26.125,
"rewards/rejected": -21.0,
"step": 182
},
{
"epoch": 0.6110183639398998,
"grad_norm": 0.0002642914478201419,
"learning_rate": 0.00014570267917862891,
"logits/chosen": -4.421875,
"logits/rejected": -2.703125,
"logps/chosen": -99.25,
"logps/rejected": -434.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.171875,
"rewards/margins": 24.4375,
"rewards/rejected": -19.25,
"step": 183
},
{
"epoch": 0.6143572621035058,
"grad_norm": 0.0028488298412412405,
"learning_rate": 0.00014554116075248514,
"logits/chosen": -4.171875,
"logits/rejected": -2.6171875,
"logps/chosen": -100.25,
"logps/rejected": -429.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.015625,
"rewards/margins": 25.125,
"rewards/rejected": -20.125,
"step": 184
},
{
"epoch": 0.6176961602671118,
"grad_norm": 0.00043352670036256313,
"learning_rate": 0.0001453788256811489,
"logits/chosen": -4.078125,
"logits/rejected": -2.7578125,
"logps/chosen": -100.25,
"logps/rejected": -406.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03125,
"rewards/margins": 24.6875,
"rewards/rejected": -18.6875,
"step": 185
},
{
"epoch": 0.6210350584307178,
"grad_norm": 0.0006578704342246056,
"learning_rate": 0.00014521567598732097,
"logits/chosen": -4.2421875,
"logits/rejected": -2.515625,
"logps/chosen": -83.0,
"logps/rejected": -400.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6875,
"rewards/margins": 23.375,
"rewards/rejected": -17.6875,
"step": 186
},
{
"epoch": 0.6243739565943238,
"grad_norm": 0.0001582528348080814,
"learning_rate": 0.00014505171370385233,
"logits/chosen": -4.2265625,
"logits/rejected": -2.5546875,
"logps/chosen": -87.25,
"logps/rejected": -462.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.125,
"rewards/margins": 26.0625,
"rewards/rejected": -19.9375,
"step": 187
},
{
"epoch": 0.6277128547579299,
"grad_norm": 2.958109871542547e-05,
"learning_rate": 0.00014488694087371883,
"logits/chosen": -4.234375,
"logits/rejected": -2.1796875,
"logps/chosen": -85.75,
"logps/rejected": -447.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.078125,
"rewards/margins": 25.5625,
"rewards/rejected": -19.5,
"step": 188
},
{
"epoch": 0.6310517529215359,
"grad_norm": 0.0010354547994211316,
"learning_rate": 0.00014472135954999581,
"logits/chosen": -4.0,
"logits/rejected": -2.5546875,
"logps/chosen": -90.75,
"logps/rejected": -452.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.515625,
"rewards/margins": 25.4375,
"rewards/rejected": -19.9375,
"step": 189
},
{
"epoch": 0.6343906510851419,
"grad_norm": 0.0017567307222634554,
"learning_rate": 0.00014455497179583244,
"logits/chosen": -4.3125,
"logits/rejected": -2.5546875,
"logps/chosen": -79.0,
"logps/rejected": -453.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.09375,
"rewards/margins": 23.875,
"rewards/rejected": -18.8125,
"step": 190
},
{
"epoch": 0.6377295492487479,
"grad_norm": 1.4471517715719528e-05,
"learning_rate": 0.00014438777968442607,
"logits/chosen": -3.8671875,
"logits/rejected": -2.5390625,
"logps/chosen": -99.75,
"logps/rejected": -490.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.640625,
"rewards/margins": 25.8125,
"rewards/rejected": -20.1875,
"step": 191
},
{
"epoch": 0.6410684474123539,
"grad_norm": 1.1293011993984692e-05,
"learning_rate": 0.00014421978529899633,
"logits/chosen": -3.8046875,
"logits/rejected": -2.4296875,
"logps/chosen": -70.375,
"logps/rejected": -495.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.09375,
"rewards/margins": 26.25,
"rewards/rejected": -21.1875,
"step": 192
},
{
"epoch": 0.6444073455759599,
"grad_norm": 0.031924691051244736,
"learning_rate": 0.00014405099073275924,
"logits/chosen": -3.953125,
"logits/rejected": -2.3359375,
"logps/chosen": -111.0,
"logps/rejected": -445.0,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.71875,
"rewards/margins": 24.75,
"rewards/rejected": -19.0625,
"step": 193
},
{
"epoch": 0.6477462437395659,
"grad_norm": 8.30372482596431e-06,
"learning_rate": 0.00014388139808890112,
"logits/chosen": -3.796875,
"logits/rejected": -2.421875,
"logps/chosen": -108.75,
"logps/rejected": -434.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.234375,
"rewards/margins": 24.5625,
"rewards/rejected": -18.375,
"step": 194
},
{
"epoch": 0.6510851419031719,
"grad_norm": 0.016556670889258385,
"learning_rate": 0.00014371100948055226,
"logits/chosen": -4.296875,
"logits/rejected": -2.4921875,
"logps/chosen": -88.5,
"logps/rejected": -417.0,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.015625,
"rewards/margins": 23.25,
"rewards/rejected": -18.25,
"step": 195
},
{
"epoch": 0.654424040066778,
"grad_norm": 0.0003383951261639595,
"learning_rate": 0.0001435398270307609,
"logits/chosen": -3.8203125,
"logits/rejected": -2.5625,
"logps/chosen": -96.25,
"logps/rejected": -386.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03125,
"rewards/margins": 23.625,
"rewards/rejected": -17.625,
"step": 196
},
{
"epoch": 0.657762938230384,
"grad_norm": 0.00017920513346325606,
"learning_rate": 0.00014336785287246632,
"logits/chosen": -4.0625,
"logits/rejected": -2.4140625,
"logps/chosen": -78.75,
"logps/rejected": -421.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.890625,
"rewards/margins": 24.0,
"rewards/rejected": -18.125,
"step": 197
},
{
"epoch": 0.66110183639399,
"grad_norm": 0.0014842700911685824,
"learning_rate": 0.00014319508914847274,
"logits/chosen": -3.640625,
"logits/rejected": -2.4765625,
"logps/chosen": -107.5,
"logps/rejected": -433.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578125,
"rewards/margins": 23.0625,
"rewards/rejected": -17.4375,
"step": 198
},
{
"epoch": 0.664440734557596,
"grad_norm": 0.0002902350970543921,
"learning_rate": 0.00014302153801142226,
"logits/chosen": -3.671875,
"logits/rejected": -2.359375,
"logps/chosen": -105.25,
"logps/rejected": -386.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.234375,
"rewards/margins": 23.0,
"rewards/rejected": -16.6875,
"step": 199
},
{
"epoch": 0.667779632721202,
"grad_norm": 0.000622183782979846,
"learning_rate": 0.00014284720162376823,
"logits/chosen": -4.0546875,
"logits/rejected": -2.265625,
"logps/chosen": -93.0,
"logps/rejected": -482.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.34375,
"rewards/margins": 24.6875,
"rewards/rejected": -19.3125,
"step": 200
},
{
"epoch": 0.671118530884808,
"grad_norm": 0.032146863639354706,
"learning_rate": 0.0001426720821577482,
"logits/chosen": -4.1171875,
"logits/rejected": -2.390625,
"logps/chosen": -70.0,
"logps/rejected": -463.0,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.25,
"rewards/margins": 24.3125,
"rewards/rejected": -19.0625,
"step": 201
},
{
"epoch": 0.674457429048414,
"grad_norm": 0.0003144640941172838,
"learning_rate": 0.000142496181795357,
"logits/chosen": -4.125,
"logits/rejected": -2.4296875,
"logps/chosen": -91.0,
"logps/rejected": -429.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.671875,
"rewards/margins": 23.3125,
"rewards/rejected": -17.625,
"step": 202
},
{
"epoch": 0.67779632721202,
"grad_norm": 0.00027810977189801633,
"learning_rate": 0.00014231950272831936,
"logits/chosen": -3.9375,
"logits/rejected": -2.4453125,
"logps/chosen": -92.5,
"logps/rejected": -425.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.21875,
"rewards/margins": 23.8125,
"rewards/rejected": -18.6875,
"step": 203
},
{
"epoch": 0.6811352253756261,
"grad_norm": 0.12220592051744461,
"learning_rate": 0.00014214204715806271,
"logits/chosen": -3.8828125,
"logits/rejected": -2.5234375,
"logps/chosen": -100.5,
"logps/rejected": -428.0,
"loss": 0.008,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.890625,
"rewards/margins": 24.1875,
"rewards/rejected": -18.25,
"step": 204
},
{
"epoch": 0.6844741235392321,
"grad_norm": 1.1589469067985192e-05,
"learning_rate": 0.00014196381729568983,
"logits/chosen": -3.6640625,
"logits/rejected": -2.3828125,
"logps/chosen": -126.5,
"logps/rejected": -467.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.015625,
"rewards/margins": 25.3125,
"rewards/rejected": -19.25,
"step": 205
},
{
"epoch": 0.6878130217028381,
"grad_norm": 2.4499566279700957e-05,
"learning_rate": 0.00014178481536195113,
"logits/chosen": -4.203125,
"logits/rejected": -2.6015625,
"logps/chosen": -71.25,
"logps/rejected": -402.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.75,
"rewards/margins": 23.5,
"rewards/rejected": -17.75,
"step": 206
},
{
"epoch": 0.6911519198664441,
"grad_norm": 0.00026768725365400314,
"learning_rate": 0.000141605043587217,
"logits/chosen": -4.0546875,
"logits/rejected": -2.296875,
"logps/chosen": -73.75,
"logps/rejected": -426.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.3125,
"rewards/margins": 23.4375,
"rewards/rejected": -18.0625,
"step": 207
},
{
"epoch": 0.6944908180300501,
"grad_norm": 4.719466232927516e-05,
"learning_rate": 0.0001414245042114502,
"logits/chosen": -4.0,
"logits/rejected": -2.4453125,
"logps/chosen": -62.75,
"logps/rejected": -405.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 23.4375,
"rewards/rejected": -17.75,
"step": 208
},
{
"epoch": 0.6978297161936561,
"grad_norm": 0.0002969894267152995,
"learning_rate": 0.00014124319948417773,
"logits/chosen": -3.84375,
"logits/rejected": -2.2734375,
"logps/chosen": -88.5,
"logps/rejected": -420.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.359375,
"rewards/margins": 23.1875,
"rewards/rejected": -17.875,
"step": 209
},
{
"epoch": 0.7011686143572621,
"grad_norm": 0.0002236636937595904,
"learning_rate": 0.000141061131664463,
"logits/chosen": -4.0625,
"logits/rejected": -2.4609375,
"logps/chosen": -71.25,
"logps/rejected": -454.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5,
"rewards/margins": 25.0,
"rewards/rejected": -19.5,
"step": 210
},
{
"epoch": 0.7045075125208681,
"grad_norm": 1.3881902305001859e-05,
"learning_rate": 0.00014087830302087742,
"logits/chosen": -3.875,
"logits/rejected": -2.359375,
"logps/chosen": -76.0,
"logps/rejected": -468.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.515625,
"rewards/margins": 25.0625,
"rewards/rejected": -19.5625,
"step": 211
},
{
"epoch": 0.7078464106844741,
"grad_norm": 0.018235376104712486,
"learning_rate": 0.00014069471583147249,
"logits/chosen": -3.71875,
"logits/rejected": -2.484375,
"logps/chosen": -114.5,
"logps/rejected": -384.0,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.15625,
"rewards/margins": 21.75,
"rewards/rejected": -16.625,
"step": 212
},
{
"epoch": 0.7111853088480802,
"grad_norm": 0.00014337942411657423,
"learning_rate": 0.00014051037238375103,
"logits/chosen": -3.7421875,
"logits/rejected": -2.5234375,
"logps/chosen": -102.0,
"logps/rejected": -421.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.4375,
"rewards/margins": 23.1875,
"rewards/rejected": -17.75,
"step": 213
},
{
"epoch": 0.7145242070116862,
"grad_norm": 3.2657169413141673e-06,
"learning_rate": 0.00014032527497463901,
"logits/chosen": -3.5234375,
"logits/rejected": -2.2578125,
"logps/chosen": -108.0,
"logps/rejected": -398.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.796875,
"rewards/margins": 23.0,
"rewards/rejected": -17.1875,
"step": 214
},
{
"epoch": 0.7178631051752922,
"grad_norm": 0.0009526070207357407,
"learning_rate": 0.00014013942591045668,
"logits/chosen": -3.671875,
"logits/rejected": -2.140625,
"logps/chosen": -107.75,
"logps/rejected": -423.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.84375,
"rewards/margins": 22.75,
"rewards/rejected": -16.9375,
"step": 215
},
{
"epoch": 0.7212020033388982,
"grad_norm": 0.0013299849815666676,
"learning_rate": 0.00013995282750689001,
"logits/chosen": -3.6171875,
"logits/rejected": -2.2578125,
"logps/chosen": -91.625,
"logps/rejected": -461.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.65625,
"rewards/margins": 24.4375,
"rewards/rejected": -18.75,
"step": 216
},
{
"epoch": 0.7245409015025042,
"grad_norm": 0.0005564424791373312,
"learning_rate": 0.00013976548208896177,
"logits/chosen": -3.3125,
"logits/rejected": -2.3515625,
"logps/chosen": -120.5,
"logps/rejected": -394.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.203125,
"rewards/margins": 23.5,
"rewards/rejected": -17.3125,
"step": 217
},
{
"epoch": 0.7278797996661102,
"grad_norm": 0.0002660456520970911,
"learning_rate": 0.00013957739199100248,
"logits/chosen": -3.734375,
"logits/rejected": -2.3203125,
"logps/chosen": -87.5,
"logps/rejected": -453.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.390625,
"rewards/margins": 24.125,
"rewards/rejected": -18.6875,
"step": 218
},
{
"epoch": 0.7312186978297162,
"grad_norm": 8.14365193946287e-05,
"learning_rate": 0.00013938855955662142,
"logits/chosen": -3.140625,
"logits/rejected": -2.3515625,
"logps/chosen": -108.75,
"logps/rejected": -404.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.921875,
"rewards/margins": 23.125,
"rewards/rejected": -17.125,
"step": 219
},
{
"epoch": 0.7345575959933222,
"grad_norm": 3.825878229690716e-05,
"learning_rate": 0.00013919898713867754,
"logits/chosen": -3.8359375,
"logits/rejected": -2.4296875,
"logps/chosen": -91.75,
"logps/rejected": -420.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.1875,
"rewards/margins": 23.8125,
"rewards/rejected": -17.59375,
"step": 220
},
{
"epoch": 0.7378964941569283,
"grad_norm": 0.0003690333687700331,
"learning_rate": 0.00013900867709924978,
"logits/chosen": -3.90625,
"logits/rejected": -2.2734375,
"logps/chosen": -94.25,
"logps/rejected": -448.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.390625,
"rewards/margins": 24.1875,
"rewards/rejected": -18.8125,
"step": 221
},
{
"epoch": 0.7412353923205343,
"grad_norm": 0.0014959557447582483,
"learning_rate": 0.00013881763180960809,
"logits/chosen": -3.7890625,
"logits/rejected": -2.4609375,
"logps/chosen": -119.75,
"logps/rejected": -417.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 23.8125,
"rewards/rejected": -18.125,
"step": 222
},
{
"epoch": 0.7445742904841403,
"grad_norm": 0.0005811589653603733,
"learning_rate": 0.00013862585365018352,
"logits/chosen": -4.0,
"logits/rejected": -2.390625,
"logps/chosen": -109.5,
"logps/rejected": -429.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.8125,
"rewards/margins": 23.6875,
"rewards/rejected": -17.875,
"step": 223
},
{
"epoch": 0.7479131886477463,
"grad_norm": 0.00026676716515794396,
"learning_rate": 0.00013843334501053878,
"logits/chosen": -3.921875,
"logits/rejected": -2.390625,
"logps/chosen": -88.75,
"logps/rejected": -430.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.46875,
"rewards/margins": 24.0,
"rewards/rejected": -18.5,
"step": 224
},
{
"epoch": 0.7512520868113522,
"grad_norm": 0.00046272281906567514,
"learning_rate": 0.00013824010828933833,
"logits/chosen": -3.96875,
"logits/rejected": -2.3515625,
"logps/chosen": -70.25,
"logps/rejected": -453.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.53125,
"rewards/margins": 24.8125,
"rewards/rejected": -19.1875,
"step": 225
},
{
"epoch": 0.7545909849749582,
"grad_norm": 0.0005486281588673592,
"learning_rate": 0.0001380461458943186,
"logits/chosen": -3.8046875,
"logits/rejected": -2.34375,
"logps/chosen": -90.75,
"logps/rejected": -458.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.453125,
"rewards/margins": 25.1875,
"rewards/rejected": -19.75,
"step": 226
},
{
"epoch": 0.7579298831385642,
"grad_norm": 0.005520923994481564,
"learning_rate": 0.00013785146024225797,
"logits/chosen": -4.359375,
"logits/rejected": -2.46875,
"logps/chosen": -94.0,
"logps/rejected": -392.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 22.75,
"rewards/rejected": -17.125,
"step": 227
},
{
"epoch": 0.7612687813021702,
"grad_norm": 8.077368693193421e-05,
"learning_rate": 0.0001376560537589465,
"logits/chosen": -3.5625,
"logits/rejected": -2.2734375,
"logps/chosen": -82.0,
"logps/rejected": -415.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.796875,
"rewards/margins": 23.125,
"rewards/rejected": -18.375,
"step": 228
},
{
"epoch": 0.7646076794657763,
"grad_norm": 0.00024681369541212916,
"learning_rate": 0.000137459928879156,
"logits/chosen": -4.21875,
"logits/rejected": -2.15625,
"logps/chosen": -76.25,
"logps/rejected": -468.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.03125,
"rewards/margins": 24.75,
"rewards/rejected": -19.75,
"step": 229
},
{
"epoch": 0.7679465776293823,
"grad_norm": 2.9520870157284662e-06,
"learning_rate": 0.00013726308804660938,
"logits/chosen": -3.796875,
"logits/rejected": -2.40625,
"logps/chosen": -99.5,
"logps/rejected": -455.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 25.3125,
"rewards/rejected": -20.0,
"step": 230
},
{
"epoch": 0.7712854757929883,
"grad_norm": 0.4641003906726837,
"learning_rate": 0.00013706553371395044,
"logits/chosen": -3.8046875,
"logits/rejected": -2.375,
"logps/chosen": -94.75,
"logps/rejected": -438.0,
"loss": 0.0183,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.0625,
"rewards/margins": 23.5,
"rewards/rejected": -18.375,
"step": 231
},
{
"epoch": 0.7746243739565943,
"grad_norm": 0.0002731184067670256,
"learning_rate": 0.00013686726834271316,
"logits/chosen": -3.71875,
"logits/rejected": -2.3203125,
"logps/chosen": -89.25,
"logps/rejected": -367.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 21.8125,
"rewards/rejected": -16.03125,
"step": 232
},
{
"epoch": 0.7779632721202003,
"grad_norm": 0.02955230325460434,
"learning_rate": 0.00013666829440329113,
"logits/chosen": -4.1875,
"logits/rejected": -2.203125,
"logps/chosen": -59.125,
"logps/rejected": -431.0,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 23.625,
"rewards/rejected": -18.375,
"step": 233
},
{
"epoch": 0.7813021702838063,
"grad_norm": 3.0029235858819447e-05,
"learning_rate": 0.00013646861437490674,
"logits/chosen": -3.7265625,
"logits/rejected": -2.2265625,
"logps/chosen": -98.25,
"logps/rejected": -416.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.625,
"rewards/margins": 23.375,
"rewards/rejected": -17.75,
"step": 234
},
{
"epoch": 0.7846410684474123,
"grad_norm": 0.018931280821561813,
"learning_rate": 0.00013626823074558019,
"logits/chosen": -3.890625,
"logits/rejected": -2.203125,
"logps/chosen": -89.0,
"logps/rejected": -415.0,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.03125,
"rewards/margins": 22.75,
"rewards/rejected": -17.6875,
"step": 235
},
{
"epoch": 0.7879799666110183,
"grad_norm": 3.846998879453167e-05,
"learning_rate": 0.00013606714601209865,
"logits/chosen": -3.5234375,
"logits/rejected": -2.1953125,
"logps/chosen": -92.0,
"logps/rejected": -404.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.515625,
"rewards/margins": 22.625,
"rewards/rejected": -17.125,
"step": 236
},
{
"epoch": 0.7913188647746243,
"grad_norm": 0.00047268884372897446,
"learning_rate": 0.00013586536267998504,
"logits/chosen": -3.96875,
"logits/rejected": -2.1171875,
"logps/chosen": -90.25,
"logps/rejected": -482.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.109375,
"rewards/margins": 25.9375,
"rewards/rejected": -19.8125,
"step": 237
},
{
"epoch": 0.7946577629382304,
"grad_norm": 0.00010255785309709609,
"learning_rate": 0.00013566288326346683,
"logits/chosen": -3.9765625,
"logits/rejected": -2.140625,
"logps/chosen": -76.0,
"logps/rejected": -383.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5,
"rewards/margins": 22.9375,
"rewards/rejected": -17.4375,
"step": 238
},
{
"epoch": 0.7979966611018364,
"grad_norm": 0.00026641954900696874,
"learning_rate": 0.0001354597102854448,
"logits/chosen": -3.71875,
"logits/rejected": -2.21875,
"logps/chosen": -88.25,
"logps/rejected": -368.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.875,
"rewards/margins": 22.1875,
"rewards/rejected": -16.3125,
"step": 239
},
{
"epoch": 0.8013355592654424,
"grad_norm": 0.0015892143128439784,
"learning_rate": 0.00013525584627746142,
"logits/chosen": -3.6328125,
"logits/rejected": -2.03125,
"logps/chosen": -98.5,
"logps/rejected": -400.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.515625,
"rewards/margins": 22.375,
"rewards/rejected": -16.90625,
"step": 240
},
{
"epoch": 0.8046744574290484,
"grad_norm": 0.0008768016705289483,
"learning_rate": 0.0001350512937796695,
"logits/chosen": -4.09375,
"logits/rejected": -2.12890625,
"logps/chosen": -61.875,
"logps/rejected": -414.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.453125,
"rewards/margins": 22.75,
"rewards/rejected": -17.375,
"step": 241
},
{
"epoch": 0.8080133555926544,
"grad_norm": 0.0008365919347852468,
"learning_rate": 0.00013484605534080045,
"logits/chosen": -3.484375,
"logits/rejected": -2.02734375,
"logps/chosen": -90.25,
"logps/rejected": -382.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.65625,
"rewards/margins": 21.625,
"rewards/rejected": -15.96875,
"step": 242
},
{
"epoch": 0.8113522537562604,
"grad_norm": 0.0011707853991538286,
"learning_rate": 0.00013464013351813248,
"logits/chosen": -3.8125,
"logits/rejected": -2.1953125,
"logps/chosen": -75.5,
"logps/rejected": -402.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 22.5,
"rewards/rejected": -16.75,
"step": 243
},
{
"epoch": 0.8146911519198664,
"grad_norm": 0.05999299883842468,
"learning_rate": 0.00013443353087745885,
"logits/chosen": -3.546875,
"logits/rejected": -2.0390625,
"logps/chosen": -119.75,
"logps/rejected": -389.0,
"loss": 0.004,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.296875,
"rewards/margins": 21.375,
"rewards/rejected": -15.09375,
"step": 244
},
{
"epoch": 0.8180300500834724,
"grad_norm": 1.330207032879116e-05,
"learning_rate": 0.00013422624999305578,
"logits/chosen": -3.25,
"logits/rejected": -2.0859375,
"logps/chosen": -101.0,
"logps/rejected": -434.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.625,
"rewards/margins": 23.3125,
"rewards/rejected": -17.625,
"step": 245
},
{
"epoch": 0.8213689482470785,
"grad_norm": 0.001336806220933795,
"learning_rate": 0.00013401829344765045,
"logits/chosen": -3.65625,
"logits/rejected": -2.078125,
"logps/chosen": -93.75,
"logps/rejected": -427.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.109375,
"rewards/margins": 22.0625,
"rewards/rejected": -17.0,
"step": 246
},
{
"epoch": 0.8247078464106845,
"grad_norm": 1.3096532711642794e-05,
"learning_rate": 0.00013380966383238883,
"logits/chosen": -3.5625,
"logits/rejected": -2.125,
"logps/chosen": -95.25,
"logps/rejected": -477.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.21875,
"rewards/margins": 24.9375,
"rewards/rejected": -19.625,
"step": 247
},
{
"epoch": 0.8280467445742905,
"grad_norm": 0.0002120399149134755,
"learning_rate": 0.00013360036374680334,
"logits/chosen": -3.5625,
"logits/rejected": -1.796875,
"logps/chosen": -82.75,
"logps/rejected": -436.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5,
"rewards/margins": 23.5625,
"rewards/rejected": -18.0625,
"step": 248
},
{
"epoch": 0.8313856427378965,
"grad_norm": 0.0001271862565772608,
"learning_rate": 0.0001333903957987805,
"logits/chosen": -3.46875,
"logits/rejected": -2.15625,
"logps/chosen": -80.25,
"logps/rejected": -415.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.859375,
"rewards/margins": 24.25,
"rewards/rejected": -18.4375,
"step": 249
},
{
"epoch": 0.8347245409015025,
"grad_norm": 1.912344669108279e-05,
"learning_rate": 0.00013317976260452836,
"logits/chosen": -3.4765625,
"logits/rejected": -1.74609375,
"logps/chosen": -108.75,
"logps/rejected": -422.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.1875,
"rewards/margins": 22.9375,
"rewards/rejected": -16.78125,
"step": 250
},
{
"epoch": 0.8380634390651085,
"grad_norm": 0.0015317240031436086,
"learning_rate": 0.00013296846678854406,
"logits/chosen": -3.7265625,
"logits/rejected": -2.109375,
"logps/chosen": -84.0,
"logps/rejected": -411.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 23.625,
"rewards/rejected": -18.0,
"step": 251
},
{
"epoch": 0.8414023372287145,
"grad_norm": 0.013954302296042442,
"learning_rate": 0.0001327565109835809,
"logits/chosen": -3.578125,
"logits/rejected": -2.078125,
"logps/chosen": -119.5,
"logps/rejected": -471.0,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.25,
"rewards/margins": 24.0,
"rewards/rejected": -18.75,
"step": 252
},
{
"epoch": 0.8447412353923205,
"grad_norm": 0.0019699318800121546,
"learning_rate": 0.00013254389783061584,
"logits/chosen": -3.6953125,
"logits/rejected": -1.88671875,
"logps/chosen": -105.75,
"logps/rejected": -401.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.484375,
"rewards/margins": 22.625,
"rewards/rejected": -17.15625,
"step": 253
},
{
"epoch": 0.8480801335559266,
"grad_norm": 0.03861398622393608,
"learning_rate": 0.00013233062997881627,
"logits/chosen": -3.453125,
"logits/rejected": -2.0859375,
"logps/chosen": -104.75,
"logps/rejected": -402.0,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5625,
"rewards/margins": 22.25,
"rewards/rejected": -16.6875,
"step": 254
},
{
"epoch": 0.8514190317195326,
"grad_norm": 8.354683086508885e-05,
"learning_rate": 0.00013211671008550718,
"logits/chosen": -3.7265625,
"logits/rejected": -1.89453125,
"logps/chosen": -89.25,
"logps/rejected": -417.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.46875,
"rewards/margins": 23.0625,
"rewards/rejected": -17.625,
"step": 255
},
{
"epoch": 0.8547579298831386,
"grad_norm": 0.015042081475257874,
"learning_rate": 0.0001319021408161381,
"logits/chosen": -3.671875,
"logits/rejected": -2.078125,
"logps/chosen": -89.0,
"logps/rejected": -421.0,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.78125,
"rewards/margins": 24.375,
"rewards/rejected": -18.5625,
"step": 256
},
{
"epoch": 0.8580968280467446,
"grad_norm": 0.0004783151962328702,
"learning_rate": 0.0001316869248442497,
"logits/chosen": -3.5078125,
"logits/rejected": -2.15625,
"logps/chosen": -100.25,
"logps/rejected": -445.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.21875,
"rewards/margins": 24.3125,
"rewards/rejected": -19.0625,
"step": 257
},
{
"epoch": 0.8614357262103506,
"grad_norm": 0.11549096554517746,
"learning_rate": 0.00013147106485144068,
"logits/chosen": -3.5703125,
"logits/rejected": -1.9765625,
"logps/chosen": -98.0,
"logps/rejected": -449.0,
"loss": 0.0047,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.953125,
"rewards/margins": 23.625,
"rewards/rejected": -18.625,
"step": 258
},
{
"epoch": 0.8647746243739566,
"grad_norm": 0.005697562824934721,
"learning_rate": 0.00013125456352733423,
"logits/chosen": -3.5859375,
"logits/rejected": -2.0859375,
"logps/chosen": -104.5,
"logps/rejected": -425.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.3125,
"rewards/margins": 23.375,
"rewards/rejected": -18.0625,
"step": 259
},
{
"epoch": 0.8681135225375626,
"grad_norm": 0.07381512969732285,
"learning_rate": 0.0001310374235695445,
"logits/chosen": -3.328125,
"logits/rejected": -1.92578125,
"logps/chosen": -98.25,
"logps/rejected": -465.0,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 25.9375,
"rewards/rejected": -20.625,
"step": 260
},
{
"epoch": 0.8714524207011686,
"grad_norm": 0.004491521045565605,
"learning_rate": 0.00013081964768364308,
"logits/chosen": -3.546875,
"logits/rejected": -2.140625,
"logps/chosen": -95.5,
"logps/rejected": -450.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.28125,
"rewards/margins": 24.875,
"rewards/rejected": -19.625,
"step": 261
},
{
"epoch": 0.8747913188647746,
"grad_norm": 0.00013118820788804442,
"learning_rate": 0.0001306012385831253,
"logits/chosen": -3.765625,
"logits/rejected": -2.109375,
"logps/chosen": -106.25,
"logps/rejected": -419.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.734375,
"rewards/margins": 23.4375,
"rewards/rejected": -18.75,
"step": 262
},
{
"epoch": 0.8781302170283807,
"grad_norm": 0.00017179777205456048,
"learning_rate": 0.00013038219898937629,
"logits/chosen": -3.875,
"logits/rejected": -2.1171875,
"logps/chosen": -63.125,
"logps/rejected": -422.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.171875,
"rewards/margins": 23.75,
"rewards/rejected": -18.5625,
"step": 263
},
{
"epoch": 0.8814691151919867,
"grad_norm": 0.00010286461474606767,
"learning_rate": 0.00013016253163163714,
"logits/chosen": -3.4765625,
"logits/rejected": -2.03125,
"logps/chosen": -100.25,
"logps/rejected": -409.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.046875,
"rewards/margins": 22.875,
"rewards/rejected": -17.8125,
"step": 264
},
{
"epoch": 0.8848080133555927,
"grad_norm": 0.0005210632225498557,
"learning_rate": 0.000129942239246971,
"logits/chosen": -3.765625,
"logits/rejected": -2.1015625,
"logps/chosen": -90.75,
"logps/rejected": -473.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.671875,
"rewards/margins": 26.4375,
"rewards/rejected": -20.6875,
"step": 265
},
{
"epoch": 0.8881469115191987,
"grad_norm": 0.0008880659588612616,
"learning_rate": 0.00012972132458022878,
"logits/chosen": -3.53125,
"logits/rejected": -1.99609375,
"logps/chosen": -104.5,
"logps/rejected": -401.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.890625,
"rewards/margins": 22.1875,
"rewards/rejected": -17.1875,
"step": 266
},
{
"epoch": 0.8914858096828047,
"grad_norm": 7.1963854679779615e-06,
"learning_rate": 0.00012949979038401503,
"logits/chosen": -3.2734375,
"logits/rejected": -2.125,
"logps/chosen": -110.0,
"logps/rejected": -424.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5,
"rewards/margins": 24.125,
"rewards/rejected": -18.625,
"step": 267
},
{
"epoch": 0.8948247078464107,
"grad_norm": 0.0002867156290449202,
"learning_rate": 0.00012927763941865378,
"logits/chosen": -4.1875,
"logits/rejected": -1.87109375,
"logps/chosen": -86.75,
"logps/rejected": -478.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.21875,
"rewards/margins": 26.6875,
"rewards/rejected": -21.4375,
"step": 268
},
{
"epoch": 0.8981636060100167,
"grad_norm": 0.7056188583374023,
"learning_rate": 0.00012905487445215394,
"logits/chosen": -4.0703125,
"logits/rejected": -2.078125,
"logps/chosen": -92.75,
"logps/rejected": -399.0,
"loss": 0.2617,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.546875,
"rewards/margins": 22.75,
"rewards/rejected": -17.25,
"step": 269
},
{
"epoch": 0.9015025041736227,
"grad_norm": 3.4810282159014605e-06,
"learning_rate": 0.0001288314982601749,
"logits/chosen": -4.1171875,
"logits/rejected": -2.2109375,
"logps/chosen": -105.25,
"logps/rejected": -473.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5625,
"rewards/margins": 27.75,
"rewards/rejected": -22.1875,
"step": 270
},
{
"epoch": 0.9048414023372288,
"grad_norm": 2.5784022000152618e-05,
"learning_rate": 0.00012860751362599193,
"logits/chosen": -4.0078125,
"logits/rejected": -2.296875,
"logps/chosen": -79.125,
"logps/rejected": -472.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.34375,
"rewards/margins": 28.875,
"rewards/rejected": -23.5625,
"step": 271
},
{
"epoch": 0.9081803005008348,
"grad_norm": 5.8086599892703816e-05,
"learning_rate": 0.00012838292334046156,
"logits/chosen": -4.59375,
"logits/rejected": -2.25,
"logps/chosen": -84.0,
"logps/rejected": -514.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.390625,
"rewards/margins": 30.25,
"rewards/rejected": -25.875,
"step": 272
},
{
"epoch": 0.9115191986644408,
"grad_norm": 2.144264362868853e-05,
"learning_rate": 0.00012815773020198674,
"logits/chosen": -4.703125,
"logits/rejected": -2.265625,
"logps/chosen": -87.75,
"logps/rejected": -630.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8046875,
"rewards/margins": 35.375,
"rewards/rejected": -31.625,
"step": 273
},
{
"epoch": 0.9148580968280468,
"grad_norm": 0.010493806563317776,
"learning_rate": 0.00012793193701648195,
"logits/chosen": -4.046875,
"logits/rejected": -2.328125,
"logps/chosen": -98.0,
"logps/rejected": -568.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.09375,
"rewards/margins": 35.625,
"rewards/rejected": -31.5625,
"step": 274
},
{
"epoch": 0.9181969949916527,
"grad_norm": 0.0006047628121450543,
"learning_rate": 0.0001277055465973383,
"logits/chosen": -4.796875,
"logits/rejected": -2.640625,
"logps/chosen": -93.0,
"logps/rejected": -584.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6484375,
"rewards/margins": 37.375,
"rewards/rejected": -33.8125,
"step": 275
},
{
"epoch": 0.9215358931552587,
"grad_norm": 0.0015730452723801136,
"learning_rate": 0.0001274785617653885,
"logits/chosen": -5.0,
"logits/rejected": -2.8203125,
"logps/chosen": -99.25,
"logps/rejected": -501.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8671875,
"rewards/margins": 33.8125,
"rewards/rejected": -29.9375,
"step": 276
},
{
"epoch": 0.9248747913188647,
"grad_norm": 2.2766906113247387e-05,
"learning_rate": 0.00012725098534887162,
"logits/chosen": -4.59375,
"logits/rejected": -2.7265625,
"logps/chosen": -106.75,
"logps/rejected": -640.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3203125,
"rewards/margins": 41.875,
"rewards/rejected": -38.5,
"step": 277
},
{
"epoch": 0.9282136894824707,
"grad_norm": 4.425516090122983e-05,
"learning_rate": 0.00012702282018339786,
"logits/chosen": -5.125,
"logits/rejected": -2.9453125,
"logps/chosen": -107.75,
"logps/rejected": -592.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0625,
"rewards/margins": 40.75,
"rewards/rejected": -37.625,
"step": 278
},
{
"epoch": 0.9315525876460768,
"grad_norm": 0.002112521091476083,
"learning_rate": 0.00012679406911191333,
"logits/chosen": -4.90625,
"logits/rejected": -2.921875,
"logps/chosen": -108.75,
"logps/rejected": -564.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.59375,
"rewards/margins": 38.75,
"rewards/rejected": -35.25,
"step": 279
},
{
"epoch": 0.9348914858096828,
"grad_norm": 6.724369995936286e-06,
"learning_rate": 0.00012656473498466446,
"logits/chosen": -5.1875,
"logits/rejected": -2.9921875,
"logps/chosen": -115.75,
"logps/rejected": -666.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.9921875,
"rewards/margins": 45.5,
"rewards/rejected": -42.625,
"step": 280
},
{
"epoch": 0.9382303839732888,
"grad_norm": 9.46976160776103e-06,
"learning_rate": 0.00012633482065916267,
"logits/chosen": -5.375,
"logits/rejected": -2.875,
"logps/chosen": -127.75,
"logps/rejected": -700.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.328125,
"rewards/margins": 46.375,
"rewards/rejected": -44.0,
"step": 281
},
{
"epoch": 0.9415692821368948,
"grad_norm": 0.0018350208410993218,
"learning_rate": 0.00012610432900014864,
"logits/chosen": -5.140625,
"logits/rejected": -3.078125,
"logps/chosen": -132.5,
"logps/rejected": -660.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.2890625,
"rewards/margins": 46.25,
"rewards/rejected": -43.875,
"step": 282
},
{
"epoch": 0.9449081803005008,
"grad_norm": 0.5556798577308655,
"learning_rate": 0.0001258732628795566,
"logits/chosen": -5.234375,
"logits/rejected": -3.1875,
"logps/chosen": -106.5,
"logps/rejected": -670.0,
"loss": 0.023,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.984375,
"rewards/margins": 47.125,
"rewards/rejected": -44.25,
"step": 283
},
{
"epoch": 0.9482470784641068,
"grad_norm": 5.686655276804231e-05,
"learning_rate": 0.00012564162517647863,
"logits/chosen": -5.375,
"logits/rejected": -3.03125,
"logps/chosen": -91.0,
"logps/rejected": -756.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.84375,
"rewards/margins": 51.375,
"rewards/rejected": -48.5,
"step": 284
},
{
"epoch": 0.9515859766277128,
"grad_norm": 1.8947954231407493e-05,
"learning_rate": 0.00012540941877712877,
"logits/chosen": -4.875,
"logits/rejected": -2.9921875,
"logps/chosen": -112.5,
"logps/rejected": -714.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5703125,
"rewards/margins": 50.625,
"rewards/rejected": -47.0,
"step": 285
},
{
"epoch": 0.9549248747913188,
"grad_norm": 2.8031481633661315e-05,
"learning_rate": 0.00012517664657480694,
"logits/chosen": -4.765625,
"logits/rejected": -2.890625,
"logps/chosen": -124.25,
"logps/rejected": -710.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1015625,
"rewards/margins": 48.375,
"rewards/rejected": -45.375,
"step": 286
},
{
"epoch": 0.9582637729549248,
"grad_norm": 1.9544756924005924e-06,
"learning_rate": 0.00012494331146986314,
"logits/chosen": -5.109375,
"logits/rejected": -2.875,
"logps/chosen": -100.0,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.59375,
"rewards/margins": 48.75,
"rewards/rejected": -45.125,
"step": 287
},
{
"epoch": 0.9616026711185309,
"grad_norm": 1.1154845103167332e-10,
"learning_rate": 0.00012470941636966103,
"logits/chosen": -5.15625,
"logits/rejected": -2.8828125,
"logps/chosen": -94.0,
"logps/rejected": -734.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9609375,
"rewards/margins": 51.375,
"rewards/rejected": -47.5,
"step": 288
},
{
"epoch": 0.9649415692821369,
"grad_norm": 1.1422841453168076e-05,
"learning_rate": 0.00012447496418854188,
"logits/chosen": -5.1875,
"logits/rejected": -2.921875,
"logps/chosen": -122.25,
"logps/rejected": -720.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.640625,
"rewards/margins": 50.25,
"rewards/rejected": -46.625,
"step": 289
},
{
"epoch": 0.9682804674457429,
"grad_norm": 1.922987102886964e-08,
"learning_rate": 0.00012423995784778817,
"logits/chosen": -5.0625,
"logits/rejected": -3.0,
"logps/chosen": -109.0,
"logps/rejected": -722.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.578125,
"rewards/margins": 50.125,
"rewards/rejected": -46.625,
"step": 290
},
{
"epoch": 0.9716193656093489,
"grad_norm": 0.7204757332801819,
"learning_rate": 0.00012400440027558732,
"logits/chosen": -5.09375,
"logits/rejected": -2.859375,
"logps/chosen": -120.75,
"logps/rejected": -694.0,
"loss": 0.0087,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.546875,
"rewards/margins": 45.875,
"rewards/rejected": -43.25,
"step": 291
},
{
"epoch": 0.9749582637729549,
"grad_norm": 7.915846822470485e-08,
"learning_rate": 0.000123768294406995,
"logits/chosen": -5.03125,
"logits/rejected": -3.1484375,
"logps/chosen": -118.75,
"logps/rejected": -678.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.5625,
"rewards/margins": 49.125,
"rewards/rejected": -44.625,
"step": 292
},
{
"epoch": 0.9782971619365609,
"grad_norm": 3.635519169620238e-05,
"learning_rate": 0.00012353164318389874,
"logits/chosen": -4.96875,
"logits/rejected": -3.109375,
"logps/chosen": -99.25,
"logps/rejected": -630.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.265625,
"rewards/margins": 45.25,
"rewards/rejected": -41.0,
"step": 293
},
{
"epoch": 0.9816360601001669,
"grad_norm": 2.1648361325787846e-06,
"learning_rate": 0.00012329444955498116,
"logits/chosen": -4.9375,
"logits/rejected": -2.734375,
"logps/chosen": -107.75,
"logps/rejected": -664.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.734375,
"rewards/margins": 47.625,
"rewards/rejected": -42.875,
"step": 294
},
{
"epoch": 0.9849749582637729,
"grad_norm": 4.366306711744983e-07,
"learning_rate": 0.00012305671647568338,
"logits/chosen": -4.890625,
"logits/rejected": -2.9375,
"logps/chosen": -116.75,
"logps/rejected": -668.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.921875,
"rewards/margins": 46.375,
"rewards/rejected": -41.5,
"step": 295
},
{
"epoch": 0.988313856427379,
"grad_norm": 9.796775884751696e-06,
"learning_rate": 0.00012281844690816793,
"logits/chosen": -4.3125,
"logits/rejected": -2.65625,
"logps/chosen": -115.75,
"logps/rejected": -678.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.609375,
"rewards/margins": 46.375,
"rewards/rejected": -41.75,
"step": 296
},
{
"epoch": 0.991652754590985,
"grad_norm": 1.267866196030809e-06,
"learning_rate": 0.0001225796438212822,
"logits/chosen": -4.484375,
"logits/rejected": -2.828125,
"logps/chosen": -81.0,
"logps/rejected": -646.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.265625,
"rewards/margins": 46.75,
"rewards/rejected": -41.375,
"step": 297
},
{
"epoch": 0.994991652754591,
"grad_norm": 7.400533519330565e-10,
"learning_rate": 0.00012234031019052103,
"logits/chosen": -5.25,
"logits/rejected": -2.8125,
"logps/chosen": -72.75,
"logps/rejected": -664.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.203125,
"rewards/margins": 46.125,
"rewards/rejected": -41.0,
"step": 298
},
{
"epoch": 0.998330550918197,
"grad_norm": 1.766308876938183e-08,
"learning_rate": 0.00012210044899799003,
"logits/chosen": -4.171875,
"logits/rejected": -2.765625,
"logps/chosen": -90.5,
"logps/rejected": -684.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.875,
"rewards/margins": 48.0,
"rewards/rejected": -42.125,
"step": 299
},
{
"epoch": 1.0,
"grad_norm": 3.3598018944758223e-06,
"learning_rate": 0.00012186006323236816,
"logits/chosen": -4.0625,
"logits/rejected": -2.6875,
"logps/chosen": -91.0,
"logps/rejected": -680.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.40625,
"rewards/margins": 48.5,
"rewards/rejected": -44.0,
"step": 300
},
{
"epoch": 1.003338898163606,
"grad_norm": 4.096188988000904e-08,
"learning_rate": 0.00012161915588887058,
"logits/chosen": -4.53125,
"logits/rejected": -2.8671875,
"logps/chosen": -94.25,
"logps/rejected": -626.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.234375,
"rewards/margins": 43.875,
"rewards/rejected": -38.625,
"step": 301
},
{
"epoch": 1.006677796327212,
"grad_norm": 1.8207643734058365e-05,
"learning_rate": 0.0001213777299692114,
"logits/chosen": -4.5,
"logits/rejected": -2.609375,
"logps/chosen": -110.5,
"logps/rejected": -604.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578125,
"rewards/margins": 41.5,
"rewards/rejected": -35.875,
"step": 302
},
{
"epoch": 1.010016694490818,
"grad_norm": 3.5500662409759798e-09,
"learning_rate": 0.00012113578848156614,
"logits/chosen": -4.546875,
"logits/rejected": -2.71875,
"logps/chosen": -87.0,
"logps/rejected": -658.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.546875,
"rewards/margins": 46.25,
"rewards/rejected": -40.625,
"step": 303
},
{
"epoch": 1.013355592654424,
"grad_norm": 0.0002687852829694748,
"learning_rate": 0.00012089333444053437,
"logits/chosen": -4.46875,
"logits/rejected": -2.6796875,
"logps/chosen": -76.5,
"logps/rejected": -600.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 41.25,
"rewards/rejected": -35.875,
"step": 304
},
{
"epoch": 1.01669449081803,
"grad_norm": 7.73879182958126e-09,
"learning_rate": 0.000120650370867102,
"logits/chosen": -3.875,
"logits/rejected": -2.84375,
"logps/chosen": -98.0,
"logps/rejected": -630.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.703125,
"rewards/margins": 44.25,
"rewards/rejected": -38.5,
"step": 305
},
{
"epoch": 1.020033388981636,
"grad_norm": 1.89209330869744e-07,
"learning_rate": 0.0001204069007886038,
"logits/chosen": -4.4375,
"logits/rejected": -2.703125,
"logps/chosen": -91.0,
"logps/rejected": -594.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.984375,
"rewards/margins": 41.5,
"rewards/rejected": -36.5,
"step": 306
},
{
"epoch": 1.023372287145242,
"grad_norm": 6.296863119814589e-08,
"learning_rate": 0.0001201629272386856,
"logits/chosen": -4.140625,
"logits/rejected": -2.8359375,
"logps/chosen": -80.5,
"logps/rejected": -604.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.234375,
"rewards/margins": 43.125,
"rewards/rejected": -38.0,
"step": 307
},
{
"epoch": 1.026711185308848,
"grad_norm": 1.2057295862177853e-05,
"learning_rate": 0.00011991845325726657,
"logits/chosen": -4.875,
"logits/rejected": -2.9765625,
"logps/chosen": -71.125,
"logps/rejected": -582.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.90625,
"rewards/margins": 41.125,
"rewards/rejected": -36.25,
"step": 308
},
{
"epoch": 1.0300500834724542,
"grad_norm": 1.0573453579354464e-07,
"learning_rate": 0.00011967348189050114,
"logits/chosen": -4.828125,
"logits/rejected": -2.8671875,
"logps/chosen": -80.25,
"logps/rejected": -632.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0,
"rewards/margins": 44.625,
"rewards/rejected": -38.75,
"step": 309
},
{
"epoch": 1.0333889816360602,
"grad_norm": 8.637631481178687e-07,
"learning_rate": 0.00011942801619074128,
"logits/chosen": -4.84375,
"logits/rejected": -2.6953125,
"logps/chosen": -76.25,
"logps/rejected": -642.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.515625,
"rewards/margins": 44.5,
"rewards/rejected": -38.875,
"step": 310
},
{
"epoch": 1.0367278797996662,
"grad_norm": 2.945505617191202e-09,
"learning_rate": 0.00011918205921649828,
"logits/chosen": -4.46875,
"logits/rejected": -2.7421875,
"logps/chosen": -92.75,
"logps/rejected": -596.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.90625,
"rewards/margins": 43.25,
"rewards/rejected": -37.375,
"step": 311
},
{
"epoch": 1.0400667779632722,
"grad_norm": 3.719551671110821e-07,
"learning_rate": 0.00011893561403240484,
"logits/chosen": -4.53125,
"logits/rejected": -2.7265625,
"logps/chosen": -84.125,
"logps/rejected": -622.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.671875,
"rewards/margins": 42.75,
"rewards/rejected": -37.125,
"step": 312
},
{
"epoch": 1.0434056761268782,
"grad_norm": 4.653336205251435e-08,
"learning_rate": 0.0001186886837091767,
"logits/chosen": -4.5625,
"logits/rejected": -2.5625,
"logps/chosen": -93.0,
"logps/rejected": -646.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.40625,
"rewards/margins": 43.375,
"rewards/rejected": -38.125,
"step": 313
},
{
"epoch": 1.0467445742904842,
"grad_norm": 3.053295358768082e-07,
"learning_rate": 0.00011844127132357443,
"logits/chosen": -3.7421875,
"logits/rejected": -2.75,
"logps/chosen": -102.5,
"logps/rejected": -616.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.546875,
"rewards/margins": 42.625,
"rewards/rejected": -38.0,
"step": 314
},
{
"epoch": 1.0500834724540902,
"grad_norm": 2.1130126981461217e-08,
"learning_rate": 0.00011819337995836521,
"logits/chosen": -4.046875,
"logits/rejected": -2.765625,
"logps/chosen": -117.25,
"logps/rejected": -642.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.171875,
"rewards/margins": 43.125,
"rewards/rejected": -37.0,
"step": 315
},
{
"epoch": 1.0534223706176962,
"grad_norm": 3.6415363524611166e-07,
"learning_rate": 0.00011794501270228418,
"logits/chosen": -4.28125,
"logits/rejected": -2.484375,
"logps/chosen": -103.25,
"logps/rejected": -600.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578125,
"rewards/margins": 40.0,
"rewards/rejected": -34.375,
"step": 316
},
{
"epoch": 1.0567612687813022,
"grad_norm": 2.995850678644274e-08,
"learning_rate": 0.00011769617264999628,
"logits/chosen": -4.40625,
"logits/rejected": -2.71875,
"logps/chosen": -94.75,
"logps/rejected": -610.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.171875,
"rewards/margins": 41.75,
"rewards/rejected": -36.625,
"step": 317
},
{
"epoch": 1.0601001669449082,
"grad_norm": 1.6123553880920127e-10,
"learning_rate": 0.0001174468629020573,
"logits/chosen": -4.484375,
"logits/rejected": -2.703125,
"logps/chosen": -90.75,
"logps/rejected": -628.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.515625,
"rewards/margins": 43.875,
"rewards/rejected": -38.375,
"step": 318
},
{
"epoch": 1.0634390651085142,
"grad_norm": 5.522470019059256e-07,
"learning_rate": 0.00011719708656487565,
"logits/chosen": -4.484375,
"logits/rejected": -2.6953125,
"logps/chosen": -86.25,
"logps/rejected": -606.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.3125,
"rewards/margins": 41.625,
"rewards/rejected": -36.25,
"step": 319
},
{
"epoch": 1.0667779632721202,
"grad_norm": 3.308125773315851e-09,
"learning_rate": 0.0001169468467506733,
"logits/chosen": -4.5,
"logits/rejected": -2.65625,
"logps/chosen": -88.75,
"logps/rejected": -600.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.53125,
"rewards/margins": 41.375,
"rewards/rejected": -35.75,
"step": 320
},
{
"epoch": 1.0701168614357262,
"grad_norm": 9.489669139384205e-08,
"learning_rate": 0.00011669614657744725,
"logits/chosen": -4.203125,
"logits/rejected": -2.6953125,
"logps/chosen": -99.75,
"logps/rejected": -578.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.453125,
"rewards/margins": 40.25,
"rewards/rejected": -34.875,
"step": 321
},
{
"epoch": 1.0734557595993321,
"grad_norm": 5.833362126672625e-10,
"learning_rate": 0.0001164449891689306,
"logits/chosen": -4.625,
"logits/rejected": -2.6875,
"logps/chosen": -80.25,
"logps/rejected": -660.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 44.125,
"rewards/rejected": -38.875,
"step": 322
},
{
"epoch": 1.0767946577629381,
"grad_norm": 3.6675750436643284e-08,
"learning_rate": 0.00011619337765455356,
"logits/chosen": -4.53125,
"logits/rejected": -2.703125,
"logps/chosen": -71.875,
"logps/rejected": -656.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 44.625,
"rewards/rejected": -39.0,
"step": 323
},
{
"epoch": 1.0801335559265441,
"grad_norm": 2.6781217457028106e-06,
"learning_rate": 0.00011594131516940455,
"logits/chosen": -4.46875,
"logits/rejected": -2.6875,
"logps/chosen": -88.0,
"logps/rejected": -562.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.140625,
"rewards/margins": 38.5,
"rewards/rejected": -33.375,
"step": 324
},
{
"epoch": 1.0834724540901504,
"grad_norm": 4.066099279498303e-07,
"learning_rate": 0.00011568880485419107,
"logits/chosen": -4.921875,
"logits/rejected": -2.75,
"logps/chosen": -74.0,
"logps/rejected": -572.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.703125,
"rewards/margins": 41.375,
"rewards/rejected": -35.75,
"step": 325
},
{
"epoch": 1.0868113522537564,
"grad_norm": 9.277933088469581e-09,
"learning_rate": 0.00011543584985520065,
"logits/chosen": -4.359375,
"logits/rejected": -2.90625,
"logps/chosen": -86.0,
"logps/rejected": -602.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.765625,
"rewards/margins": 41.0,
"rewards/rejected": -36.25,
"step": 326
},
{
"epoch": 1.0901502504173624,
"grad_norm": 5.080758036513089e-09,
"learning_rate": 0.00011518245332426155,
"logits/chosen": -3.8828125,
"logits/rejected": -2.7578125,
"logps/chosen": -111.0,
"logps/rejected": -662.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.3125,
"rewards/margins": 43.875,
"rewards/rejected": -38.5,
"step": 327
},
{
"epoch": 1.0934891485809684,
"grad_norm": 9.527740729708967e-08,
"learning_rate": 0.00011492861841870358,
"logits/chosen": -4.2109375,
"logits/rejected": -2.671875,
"logps/chosen": -108.75,
"logps/rejected": -582.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.65625,
"rewards/margins": 41.875,
"rewards/rejected": -36.25,
"step": 328
},
{
"epoch": 1.0968280467445743,
"grad_norm": 1.594116305625448e-08,
"learning_rate": 0.00011467434830131869,
"logits/chosen": -4.84375,
"logits/rejected": -2.59375,
"logps/chosen": -70.5,
"logps/rejected": -666.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.40625,
"rewards/margins": 44.75,
"rewards/rejected": -39.375,
"step": 329
},
{
"epoch": 1.1001669449081803,
"grad_norm": 3.618521532189334e-06,
"learning_rate": 0.00011441964614032151,
"logits/chosen": -4.1875,
"logits/rejected": -2.7890625,
"logps/chosen": -101.75,
"logps/rejected": -572.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.53125,
"rewards/margins": 39.25,
"rewards/rejected": -33.8125,
"step": 330
},
{
"epoch": 1.1035058430717863,
"grad_norm": 2.0210993625369156e-06,
"learning_rate": 0.00011416451510931009,
"logits/chosen": -4.140625,
"logits/rejected": -2.6171875,
"logps/chosen": -98.0,
"logps/rejected": -553.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.03125,
"rewards/margins": 38.625,
"rewards/rejected": -33.5625,
"step": 331
},
{
"epoch": 1.1068447412353923,
"grad_norm": 1.149137052181004e-08,
"learning_rate": 0.00011390895838722613,
"logits/chosen": -4.171875,
"logits/rejected": -2.578125,
"logps/chosen": -115.0,
"logps/rejected": -566.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.015625,
"rewards/margins": 39.625,
"rewards/rejected": -34.625,
"step": 332
},
{
"epoch": 1.1101836393989983,
"grad_norm": 6.3274292472215166e-09,
"learning_rate": 0.00011365297915831545,
"logits/chosen": -4.328125,
"logits/rejected": -2.53125,
"logps/chosen": -108.75,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03125,
"rewards/margins": 45.75,
"rewards/rejected": -39.75,
"step": 333
},
{
"epoch": 1.1135225375626043,
"grad_norm": 5.55765291210264e-06,
"learning_rate": 0.00011339658061208833,
"logits/chosen": -4.28125,
"logits/rejected": -2.7109375,
"logps/chosen": -105.5,
"logps/rejected": -616.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.359375,
"rewards/margins": 42.875,
"rewards/rejected": -36.625,
"step": 334
},
{
"epoch": 1.1168614357262103,
"grad_norm": 0.001700929249636829,
"learning_rate": 0.0001131397659432798,
"logits/chosen": -4.4375,
"logits/rejected": -2.78125,
"logps/chosen": -102.5,
"logps/rejected": -564.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578125,
"rewards/margins": 39.625,
"rewards/rejected": -34.125,
"step": 335
},
{
"epoch": 1.1202003338898163,
"grad_norm": 6.786502240174741e-07,
"learning_rate": 0.00011288253835180975,
"logits/chosen": -4.453125,
"logits/rejected": -2.8984375,
"logps/chosen": -91.5,
"logps/rejected": -548.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.21875,
"rewards/margins": 37.75,
"rewards/rejected": -32.5,
"step": 336
},
{
"epoch": 1.1235392320534223,
"grad_norm": 1.9758540759085008e-07,
"learning_rate": 0.00011262490104274313,
"logits/chosen": -3.96875,
"logits/rejected": -2.6640625,
"logps/chosen": -100.25,
"logps/rejected": -580.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.546875,
"rewards/margins": 40.25,
"rewards/rejected": -34.6875,
"step": 337
},
{
"epoch": 1.1268781302170283,
"grad_norm": 2.381622380198678e-06,
"learning_rate": 0.00011236685722624995,
"logits/chosen": -4.71875,
"logits/rejected": -2.734375,
"logps/chosen": -92.75,
"logps/rejected": -592.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.171875,
"rewards/margins": 41.5,
"rewards/rejected": -36.375,
"step": 338
},
{
"epoch": 1.1302170283806343,
"grad_norm": 1.006454340313212e-06,
"learning_rate": 0.0001121084101175653,
"logits/chosen": -4.296875,
"logits/rejected": -2.6015625,
"logps/chosen": -77.625,
"logps/rejected": -644.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.609375,
"rewards/margins": 43.75,
"rewards/rejected": -38.125,
"step": 339
},
{
"epoch": 1.1335559265442403,
"grad_norm": 2.7868235719097356e-08,
"learning_rate": 0.00011184956293694941,
"logits/chosen": -3.8125,
"logits/rejected": -2.8046875,
"logps/chosen": -107.0,
"logps/rejected": -586.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.15625,
"rewards/margins": 41.0,
"rewards/rejected": -35.875,
"step": 340
},
{
"epoch": 1.1368948247078463,
"grad_norm": 5.081914196125581e-07,
"learning_rate": 0.00011159031890964731,
"logits/chosen": -4.890625,
"logits/rejected": -2.703125,
"logps/chosen": -92.0,
"logps/rejected": -568.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.640625,
"rewards/margins": 40.75,
"rewards/rejected": -35.125,
"step": 341
},
{
"epoch": 1.1402337228714523,
"grad_norm": 3.7901945688645355e-06,
"learning_rate": 0.00011133068126584881,
"logits/chosen": -4.53125,
"logits/rejected": -2.7265625,
"logps/chosen": -76.5,
"logps/rejected": -576.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.21875,
"rewards/margins": 41.875,
"rewards/rejected": -36.625,
"step": 342
},
{
"epoch": 1.1435726210350585,
"grad_norm": 1.784697616358244e-07,
"learning_rate": 0.00011107065324064816,
"logits/chosen": -3.90625,
"logits/rejected": -2.625,
"logps/chosen": -103.0,
"logps/rejected": -638.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.078125,
"rewards/margins": 41.625,
"rewards/rejected": -36.5,
"step": 343
},
{
"epoch": 1.1469115191986645,
"grad_norm": 1.5315407608795795e-06,
"learning_rate": 0.00011081023807400378,
"logits/chosen": -4.359375,
"logits/rejected": -2.703125,
"logps/chosen": -91.0,
"logps/rejected": -568.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.859375,
"rewards/margins": 40.125,
"rewards/rejected": -34.375,
"step": 344
},
{
"epoch": 1.1502504173622705,
"grad_norm": 1.3057597669785537e-08,
"learning_rate": 0.000110549439010698,
"logits/chosen": -4.390625,
"logits/rejected": -2.6640625,
"logps/chosen": -88.5,
"logps/rejected": -674.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.0625,
"rewards/margins": 45.0,
"rewards/rejected": -40.0,
"step": 345
},
{
"epoch": 1.1535893155258765,
"grad_norm": 9.364051578586441e-08,
"learning_rate": 0.0001102882593002963,
"logits/chosen": -4.1015625,
"logits/rejected": -2.6328125,
"logps/chosen": -105.25,
"logps/rejected": -604.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.109375,
"rewards/margins": 41.875,
"rewards/rejected": -36.625,
"step": 346
},
{
"epoch": 1.1569282136894825,
"grad_norm": 2.3089357092942464e-09,
"learning_rate": 0.00011002670219710718,
"logits/chosen": -4.25,
"logits/rejected": -2.703125,
"logps/chosen": -85.5,
"logps/rejected": -644.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.890625,
"rewards/margins": 45.75,
"rewards/rejected": -39.75,
"step": 347
},
{
"epoch": 1.1602671118530885,
"grad_norm": 1.3028484318056144e-06,
"learning_rate": 0.0001097647709601415,
"logits/chosen": -4.203125,
"logits/rejected": -2.7265625,
"logps/chosen": -108.75,
"logps/rejected": -608.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0625,
"rewards/margins": 41.75,
"rewards/rejected": -35.75,
"step": 348
},
{
"epoch": 1.1636060100166945,
"grad_norm": 4.899620709153396e-09,
"learning_rate": 0.00010950246885307172,
"logits/chosen": -4.453125,
"logits/rejected": -2.625,
"logps/chosen": -81.0,
"logps/rejected": -632.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.125,
"rewards/margins": 42.625,
"rewards/rejected": -37.5,
"step": 349
},
{
"epoch": 1.1669449081803005,
"grad_norm": 2.0634535857766423e-08,
"learning_rate": 0.00010923979914419147,
"logits/chosen": -4.390625,
"logits/rejected": -2.7734375,
"logps/chosen": -96.5,
"logps/rejected": -620.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.390625,
"rewards/margins": 42.25,
"rewards/rejected": -37.0,
"step": 350
},
{
"epoch": 1.1702838063439065,
"grad_norm": 1.040544761465867e-09,
"learning_rate": 0.00010897676510637461,
"logits/chosen": -4.34375,
"logits/rejected": -2.828125,
"logps/chosen": -98.0,
"logps/rejected": -644.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 44.125,
"rewards/rejected": -38.375,
"step": 351
},
{
"epoch": 1.1736227045075125,
"grad_norm": 3.300164053143817e-07,
"learning_rate": 0.00010871337001703463,
"logits/chosen": -4.6875,
"logits/rejected": -2.71875,
"logps/chosen": -88.25,
"logps/rejected": -636.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.03125,
"rewards/margins": 43.875,
"rewards/rejected": -38.75,
"step": 352
},
{
"epoch": 1.1769616026711185,
"grad_norm": 0.002110698726028204,
"learning_rate": 0.00010844961715808369,
"logits/chosen": -4.65625,
"logits/rejected": -2.484375,
"logps/chosen": -78.25,
"logps/rejected": -672.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5625,
"rewards/margins": 44.5,
"rewards/rejected": -38.875,
"step": 353
},
{
"epoch": 1.1803005008347245,
"grad_norm": 3.753486854662924e-09,
"learning_rate": 0.00010818550981589181,
"logits/chosen": -4.6875,
"logits/rejected": -2.671875,
"logps/chosen": -75.5,
"logps/rejected": -656.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.484375,
"rewards/margins": 46.5,
"rewards/rejected": -41.125,
"step": 354
},
{
"epoch": 1.1836393989983305,
"grad_norm": 7.482569230887748e-08,
"learning_rate": 0.00010792105128124584,
"logits/chosen": -4.75,
"logits/rejected": -2.6328125,
"logps/chosen": -69.25,
"logps/rejected": -636.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.0,
"rewards/margins": 43.375,
"rewards/rejected": -38.5,
"step": 355
},
{
"epoch": 1.1869782971619365,
"grad_norm": 4.6094406069840943e-10,
"learning_rate": 0.00010765624484930851,
"logits/chosen": -4.1875,
"logits/rejected": -2.8046875,
"logps/chosen": -95.0,
"logps/rejected": -644.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 43.5,
"rewards/rejected": -38.125,
"step": 356
},
{
"epoch": 1.1903171953255425,
"grad_norm": 7.443770932979987e-09,
"learning_rate": 0.00010739109381957741,
"logits/chosen": -3.671875,
"logits/rejected": -2.609375,
"logps/chosen": -109.25,
"logps/rejected": -600.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.859375,
"rewards/margins": 41.25,
"rewards/rejected": -35.375,
"step": 357
},
{
"epoch": 1.1936560934891487,
"grad_norm": 2.510856802473427e-07,
"learning_rate": 0.00010712560149584376,
"logits/chosen": -3.71875,
"logits/rejected": -2.7578125,
"logps/chosen": -118.75,
"logps/rejected": -552.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.15625,
"rewards/margins": 38.75,
"rewards/rejected": -32.625,
"step": 358
},
{
"epoch": 1.1969949916527547,
"grad_norm": 6.122155582488631e-08,
"learning_rate": 0.00010685977118615136,
"logits/chosen": -4.875,
"logits/rejected": -2.6953125,
"logps/chosen": -59.75,
"logps/rejected": -688.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.859375,
"rewards/margins": 46.375,
"rewards/rejected": -41.5,
"step": 359
},
{
"epoch": 1.2003338898163607,
"grad_norm": 5.218083742875024e-08,
"learning_rate": 0.00010659360620275533,
"logits/chosen": -4.640625,
"logits/rejected": -2.640625,
"logps/chosen": -79.25,
"logps/rejected": -618.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.328125,
"rewards/margins": 42.625,
"rewards/rejected": -37.25,
"step": 360
},
{
"epoch": 1.2036727879799667,
"grad_norm": 8.644967238069512e-06,
"learning_rate": 0.0001063271098620808,
"logits/chosen": -4.2421875,
"logits/rejected": -2.6796875,
"logps/chosen": -92.25,
"logps/rejected": -610.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.71875,
"rewards/margins": 42.625,
"rewards/rejected": -36.875,
"step": 361
},
{
"epoch": 1.2070116861435727,
"grad_norm": 3.447842118475819e-07,
"learning_rate": 0.00010606028548468168,
"logits/chosen": -4.40625,
"logits/rejected": -2.578125,
"logps/chosen": -87.0,
"logps/rejected": -584.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.375,
"rewards/margins": 40.875,
"rewards/rejected": -35.5,
"step": 362
},
{
"epoch": 1.2103505843071787,
"grad_norm": 1.8150290159724136e-08,
"learning_rate": 0.00010579313639519917,
"logits/chosen": -4.703125,
"logits/rejected": -2.7578125,
"logps/chosen": -82.25,
"logps/rejected": -634.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.3125,
"rewards/margins": 43.0,
"rewards/rejected": -37.75,
"step": 363
},
{
"epoch": 1.2136894824707847,
"grad_norm": 1.4830611944198608,
"learning_rate": 0.00010552566592232041,
"logits/chosen": -4.46875,
"logits/rejected": -2.671875,
"logps/chosen": -90.25,
"logps/rejected": -616.0,
"loss": 0.0253,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.59375,
"rewards/margins": 41.25,
"rewards/rejected": -36.625,
"step": 364
},
{
"epoch": 1.2170283806343907,
"grad_norm": 1.2266583837572398e-07,
"learning_rate": 0.00010525787739873704,
"logits/chosen": -4.5625,
"logits/rejected": -2.5859375,
"logps/chosen": -87.0,
"logps/rejected": -590.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6875,
"rewards/margins": 40.75,
"rewards/rejected": -35.0,
"step": 365
},
{
"epoch": 1.2203672787979967,
"grad_norm": 1.2345830327831209e-05,
"learning_rate": 0.00010498977416110356,
"logits/chosen": -4.1640625,
"logits/rejected": -2.5546875,
"logps/chosen": -98.0,
"logps/rejected": -624.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0625,
"rewards/margins": 44.125,
"rewards/rejected": -38.0,
"step": 366
},
{
"epoch": 1.2237061769616027,
"grad_norm": 3.0638256376391837e-09,
"learning_rate": 0.0001047213595499958,
"logits/chosen": -4.5625,
"logits/rejected": -2.4453125,
"logps/chosen": -82.5,
"logps/rejected": -624.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.296875,
"rewards/margins": 45.375,
"rewards/rejected": -39.25,
"step": 367
},
{
"epoch": 1.2270450751252087,
"grad_norm": 0.10086002945899963,
"learning_rate": 0.0001044526369098694,
"logits/chosen": -3.921875,
"logits/rejected": -2.6484375,
"logps/chosen": -111.75,
"logps/rejected": -642.0,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.015625,
"rewards/margins": 45.75,
"rewards/rejected": -39.625,
"step": 368
},
{
"epoch": 1.2303839732888147,
"grad_norm": 7.665768020359565e-10,
"learning_rate": 0.00010418360958901803,
"logits/chosen": -4.296875,
"logits/rejected": -2.6875,
"logps/chosen": -92.0,
"logps/rejected": -650.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.96875,
"rewards/margins": 45.5,
"rewards/rejected": -39.5,
"step": 369
},
{
"epoch": 1.2337228714524207,
"grad_norm": 3.2425892526610767e-10,
"learning_rate": 0.00010391428093953162,
"logits/chosen": -4.140625,
"logits/rejected": -2.5390625,
"logps/chosen": -85.75,
"logps/rejected": -696.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.140625,
"rewards/margins": 49.375,
"rewards/rejected": -43.25,
"step": 370
},
{
"epoch": 1.2370617696160267,
"grad_norm": 1.0531423413340235e-06,
"learning_rate": 0.00010364465431725476,
"logits/chosen": -4.0078125,
"logits/rejected": -2.515625,
"logps/chosen": -89.75,
"logps/rejected": -628.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.84375,
"rewards/margins": 44.375,
"rewards/rejected": -38.625,
"step": 371
},
{
"epoch": 1.2404006677796326,
"grad_norm": 6.026751719900858e-08,
"learning_rate": 0.00010337473308174466,
"logits/chosen": -4.171875,
"logits/rejected": -2.5625,
"logps/chosen": -79.25,
"logps/rejected": -672.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.34375,
"rewards/margins": 48.125,
"rewards/rejected": -42.75,
"step": 372
},
{
"epoch": 1.2437395659432386,
"grad_norm": 0.011823623441159725,
"learning_rate": 0.0001031045205962296,
"logits/chosen": -4.609375,
"logits/rejected": -2.5625,
"logps/chosen": -76.75,
"logps/rejected": -620.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.375,
"rewards/margins": 43.875,
"rewards/rejected": -38.5,
"step": 373
},
{
"epoch": 1.2470784641068446,
"grad_norm": 7.656896923435852e-05,
"learning_rate": 0.00010283402022756673,
"logits/chosen": -4.28125,
"logits/rejected": -2.6171875,
"logps/chosen": -84.0,
"logps/rejected": -656.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.375,
"rewards/margins": 48.0,
"rewards/rejected": -41.625,
"step": 374
},
{
"epoch": 1.2504173622704506,
"grad_norm": 6.547962616565428e-09,
"learning_rate": 0.00010256323534620024,
"logits/chosen": -4.171875,
"logits/rejected": -2.3203125,
"logps/chosen": -105.5,
"logps/rejected": -690.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.09375,
"rewards/margins": 49.0,
"rewards/rejected": -42.875,
"step": 375
},
{
"epoch": 1.2537562604340566,
"grad_norm": 2.6672342556821604e-08,
"learning_rate": 0.00010229216932611939,
"logits/chosen": -4.28125,
"logits/rejected": -2.640625,
"logps/chosen": -79.5,
"logps/rejected": -646.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.640625,
"rewards/margins": 46.75,
"rewards/rejected": -41.125,
"step": 376
},
{
"epoch": 1.2570951585976629,
"grad_norm": 1.1736976723852877e-08,
"learning_rate": 0.0001020208255448164,
"logits/chosen": -4.453125,
"logits/rejected": -2.46875,
"logps/chosen": -79.5,
"logps/rejected": -652.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9375,
"rewards/margins": 47.375,
"rewards/rejected": -41.375,
"step": 377
},
{
"epoch": 1.2604340567612689,
"grad_norm": 6.4667449173327896e-09,
"learning_rate": 0.00010174920738324448,
"logits/chosen": -4.0703125,
"logits/rejected": -2.4765625,
"logps/chosen": -102.75,
"logps/rejected": -616.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.296875,
"rewards/margins": 46.75,
"rewards/rejected": -40.5,
"step": 378
},
{
"epoch": 1.2637729549248748,
"grad_norm": 2.800082654630387e-07,
"learning_rate": 0.00010147731822577554,
"logits/chosen": -4.46875,
"logits/rejected": -2.484375,
"logps/chosen": -65.875,
"logps/rejected": -662.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.890625,
"rewards/margins": 48.5,
"rewards/rejected": -42.5,
"step": 379
},
{
"epoch": 1.2671118530884808,
"grad_norm": 5.510163436639459e-09,
"learning_rate": 0.00010120516146015814,
"logits/chosen": -4.046875,
"logits/rejected": -2.3671875,
"logps/chosen": -112.5,
"logps/rejected": -726.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03125,
"rewards/margins": 50.25,
"rewards/rejected": -44.125,
"step": 380
},
{
"epoch": 1.2704507512520868,
"grad_norm": 5.70578917447051e-10,
"learning_rate": 0.00010093274047747526,
"logits/chosen": -4.34375,
"logits/rejected": -2.484375,
"logps/chosen": -85.5,
"logps/rejected": -646.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.953125,
"rewards/margins": 46.625,
"rewards/rejected": -40.75,
"step": 381
},
{
"epoch": 1.2737896494156928,
"grad_norm": 2.0008934509263554e-09,
"learning_rate": 0.00010066005867210204,
"logits/chosen": -4.375,
"logits/rejected": -2.4453125,
"logps/chosen": -77.5,
"logps/rejected": -648.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.875,
"rewards/margins": 47.5,
"rewards/rejected": -41.625,
"step": 382
},
{
"epoch": 1.2771285475792988,
"grad_norm": 3.104088364125346e-08,
"learning_rate": 0.00010038711944166345,
"logits/chosen": -4.515625,
"logits/rejected": -2.5078125,
"logps/chosen": -72.5,
"logps/rejected": -622.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.828125,
"rewards/margins": 46.125,
"rewards/rejected": -40.25,
"step": 383
},
{
"epoch": 1.2804674457429048,
"grad_norm": 4.240581347403349e-06,
"learning_rate": 0.00010011392618699203,
"logits/chosen": -4.828125,
"logits/rejected": -2.421875,
"logps/chosen": -66.125,
"logps/rejected": -696.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03125,
"rewards/margins": 48.75,
"rewards/rejected": -42.75,
"step": 384
},
{
"epoch": 1.2838063439065108,
"grad_norm": 1.7224666137849454e-11,
"learning_rate": 9.984048231208542e-05,
"logits/chosen": -4.78125,
"logits/rejected": -2.4765625,
"logps/chosen": -66.125,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.984375,
"rewards/margins": 50.375,
"rewards/rejected": -44.375,
"step": 385
},
{
"epoch": 1.2871452420701168,
"grad_norm": 1.3602375226184904e-08,
"learning_rate": 9.956679122406401e-05,
"logits/chosen": -4.6875,
"logits/rejected": -2.359375,
"logps/chosen": -77.25,
"logps/rejected": -714.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5625,
"rewards/margins": 50.0,
"rewards/rejected": -44.625,
"step": 386
},
{
"epoch": 1.2904841402337228,
"grad_norm": 2.335872886760626e-05,
"learning_rate": 9.92928563331285e-05,
"logits/chosen": -4.03125,
"logits/rejected": -2.4453125,
"logps/chosen": -96.75,
"logps/rejected": -648.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.84375,
"rewards/margins": 46.5,
"rewards/rejected": -40.625,
"step": 387
},
{
"epoch": 1.2938230383973288,
"grad_norm": 1.2561605444716406e-06,
"learning_rate": 9.901868105251735e-05,
"logits/chosen": -4.203125,
"logits/rejected": -2.40625,
"logps/chosen": -70.5,
"logps/rejected": -648.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.46875,
"rewards/margins": 46.375,
"rewards/rejected": -40.875,
"step": 388
},
{
"epoch": 1.2971619365609348,
"grad_norm": 3.965501704783492e-09,
"learning_rate": 9.874426879846435e-05,
"logits/chosen": -4.0625,
"logits/rejected": -2.3515625,
"logps/chosen": -101.75,
"logps/rejected": -640.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.203125,
"rewards/margins": 46.75,
"rewards/rejected": -40.5,
"step": 389
},
{
"epoch": 1.300500834724541,
"grad_norm": 7.816532132665088e-08,
"learning_rate": 9.846962299015589e-05,
"logits/chosen": -4.09375,
"logits/rejected": -2.5078125,
"logps/chosen": -95.0,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.78125,
"rewards/margins": 50.25,
"rewards/rejected": -44.5,
"step": 390
},
{
"epoch": 1.303839732888147,
"grad_norm": 5.069846320537863e-09,
"learning_rate": 9.819474704968854e-05,
"logits/chosen": -4.671875,
"logits/rejected": -2.390625,
"logps/chosen": -81.25,
"logps/rejected": -716.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6875,
"rewards/margins": 50.5,
"rewards/rejected": -44.75,
"step": 391
},
{
"epoch": 1.307178631051753,
"grad_norm": 1.4890460988681298e-06,
"learning_rate": 9.791964440202621e-05,
"logits/chosen": -4.03125,
"logits/rejected": -2.234375,
"logps/chosen": -81.25,
"logps/rejected": -746.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.71875,
"rewards/margins": 50.25,
"rewards/rejected": -44.5,
"step": 392
},
{
"epoch": 1.310517529215359,
"grad_norm": 6.790008200141529e-08,
"learning_rate": 9.764431847495775e-05,
"logits/chosen": -3.578125,
"logits/rejected": -2.171875,
"logps/chosen": -107.75,
"logps/rejected": -750.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 51.125,
"rewards/rejected": -45.5,
"step": 393
},
{
"epoch": 1.313856427378965,
"grad_norm": 1.523936225567013e-05,
"learning_rate": 9.7368772699054e-05,
"logits/chosen": -4.78125,
"logits/rejected": -2.375,
"logps/chosen": -77.25,
"logps/rejected": -652.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.453125,
"rewards/margins": 46.0,
"rewards/rejected": -40.625,
"step": 394
},
{
"epoch": 1.317195325542571,
"grad_norm": 0.0042861769907176495,
"learning_rate": 9.709301050762508e-05,
"logits/chosen": -4.203125,
"logits/rejected": -2.3125,
"logps/chosen": -86.75,
"logps/rejected": -686.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.359375,
"rewards/margins": 48.5,
"rewards/rejected": -42.125,
"step": 395
},
{
"epoch": 1.320534223706177,
"grad_norm": 3.386242610758927e-07,
"learning_rate": 9.681703533667771e-05,
"logits/chosen": -4.375,
"logits/rejected": -2.4765625,
"logps/chosen": -84.5,
"logps/rejected": -660.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.28125,
"rewards/margins": 47.5,
"rewards/rejected": -42.125,
"step": 396
},
{
"epoch": 1.323873121869783,
"grad_norm": 4.3367620605749835e-07,
"learning_rate": 9.65408506248723e-05,
"logits/chosen": -4.265625,
"logits/rejected": -2.375,
"logps/chosen": -96.5,
"logps/rejected": -666.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.078125,
"rewards/margins": 48.375,
"rewards/rejected": -42.375,
"step": 397
},
{
"epoch": 1.327212020033389,
"grad_norm": 1.0023908370015988e-09,
"learning_rate": 9.626445981348023e-05,
"logits/chosen": -4.09375,
"logits/rejected": -2.3125,
"logps/chosen": -83.0,
"logps/rejected": -670.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.796875,
"rewards/margins": 48.625,
"rewards/rejected": -42.875,
"step": 398
},
{
"epoch": 1.330550918196995,
"grad_norm": 3.0025191222193826e-08,
"learning_rate": 9.598786634634082e-05,
"logits/chosen": -3.8125,
"logits/rejected": -2.375,
"logps/chosen": -101.5,
"logps/rejected": -626.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.546875,
"rewards/margins": 46.25,
"rewards/rejected": -40.75,
"step": 399
},
{
"epoch": 1.333889816360601,
"grad_norm": 7.900404774829894e-09,
"learning_rate": 9.571107366981845e-05,
"logits/chosen": -4.71875,
"logits/rejected": -2.53125,
"logps/chosen": -64.375,
"logps/rejected": -598.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 44.75,
"rewards/rejected": -39.0,
"step": 400
},
{
"epoch": 1.337228714524207,
"grad_norm": 0.001412657555192709,
"learning_rate": 9.543408523275976e-05,
"logits/chosen": -4.421875,
"logits/rejected": -2.5078125,
"logps/chosen": -79.25,
"logps/rejected": -608.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.765625,
"rewards/margins": 44.5,
"rewards/rejected": -38.625,
"step": 401
},
{
"epoch": 1.340567612687813,
"grad_norm": 3.320309076570993e-07,
"learning_rate": 9.51569044864505e-05,
"logits/chosen": -4.609375,
"logits/rejected": -2.375,
"logps/chosen": -78.5,
"logps/rejected": -740.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.015625,
"rewards/margins": 53.125,
"rewards/rejected": -47.125,
"step": 402
},
{
"epoch": 1.343906510851419,
"grad_norm": 5.0411297358721185e-09,
"learning_rate": 9.487953488457264e-05,
"logits/chosen": -4.2890625,
"logits/rejected": -2.46875,
"logps/chosen": -86.875,
"logps/rejected": -702.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.40625,
"rewards/margins": 48.25,
"rewards/rejected": -42.75,
"step": 403
},
{
"epoch": 1.347245409015025,
"grad_norm": 8.494340875131456e-08,
"learning_rate": 9.460197988316126e-05,
"logits/chosen": -4.265625,
"logits/rejected": -2.5,
"logps/chosen": -87.75,
"logps/rejected": -650.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.78125,
"rewards/margins": 47.0,
"rewards/rejected": -41.125,
"step": 404
},
{
"epoch": 1.350584307178631,
"grad_norm": 5.050285835750401e-06,
"learning_rate": 9.432424294056155e-05,
"logits/chosen": -4.90625,
"logits/rejected": -2.359375,
"logps/chosen": -59.75,
"logps/rejected": -740.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.703125,
"rewards/margins": 52.125,
"rewards/rejected": -46.375,
"step": 405
},
{
"epoch": 1.353923205342237,
"grad_norm": 9.864572803053306e-07,
"learning_rate": 9.404632751738566e-05,
"logits/chosen": -3.7890625,
"logits/rejected": -2.40625,
"logps/chosen": -78.75,
"logps/rejected": -734.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.796875,
"rewards/margins": 50.25,
"rewards/rejected": -45.5,
"step": 406
},
{
"epoch": 1.357262103505843,
"grad_norm": 3.688290962600149e-05,
"learning_rate": 9.376823707646968e-05,
"logits/chosen": -4.328125,
"logits/rejected": -2.5703125,
"logps/chosen": -99.25,
"logps/rejected": -598.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5,
"rewards/margins": 43.875,
"rewards/rejected": -38.375,
"step": 407
},
{
"epoch": 1.360601001669449,
"grad_norm": 3.8295453919090505e-08,
"learning_rate": 9.348997508283038e-05,
"logits/chosen": -4.15625,
"logits/rejected": -2.2578125,
"logps/chosen": -88.5,
"logps/rejected": -680.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6875,
"rewards/margins": 47.875,
"rewards/rejected": -42.25,
"step": 408
},
{
"epoch": 1.363939899833055,
"grad_norm": 1.983067576816211e-08,
"learning_rate": 9.321154500362208e-05,
"logits/chosen": -4.546875,
"logits/rejected": -2.375,
"logps/chosen": -80.5,
"logps/rejected": -708.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.90625,
"rewards/margins": 50.25,
"rewards/rejected": -44.375,
"step": 409
},
{
"epoch": 1.367278797996661,
"grad_norm": 5.22494865151657e-08,
"learning_rate": 9.293295030809347e-05,
"logits/chosen": -4.140625,
"logits/rejected": -2.5546875,
"logps/chosen": -94.75,
"logps/rejected": -656.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.703125,
"rewards/margins": 46.5,
"rewards/rejected": -40.875,
"step": 410
},
{
"epoch": 1.3706176961602672,
"grad_norm": 7.317971153497638e-07,
"learning_rate": 9.265419446754433e-05,
"logits/chosen": -4.1875,
"logits/rejected": -2.5234375,
"logps/chosen": -81.75,
"logps/rejected": -624.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.546875,
"rewards/margins": 47.0,
"rewards/rejected": -41.375,
"step": 411
},
{
"epoch": 1.3739565943238732,
"grad_norm": 5.5238370322285846e-08,
"learning_rate": 9.237528095528238e-05,
"logits/chosen": -4.46875,
"logits/rejected": -2.3671875,
"logps/chosen": -86.25,
"logps/rejected": -618.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6875,
"rewards/margins": 43.75,
"rewards/rejected": -38.0,
"step": 412
},
{
"epoch": 1.3772954924874792,
"grad_norm": 1.3657646125153633e-09,
"learning_rate": 9.209621324657987e-05,
"logits/chosen": -4.71875,
"logits/rejected": -2.4921875,
"logps/chosen": -74.75,
"logps/rejected": -622.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9375,
"rewards/margins": 45.75,
"rewards/rejected": -39.875,
"step": 413
},
{
"epoch": 1.3806343906510852,
"grad_norm": 6.379238470799464e-07,
"learning_rate": 9.181699481863039e-05,
"logits/chosen": -4.1875,
"logits/rejected": -2.3046875,
"logps/chosen": -80.25,
"logps/rejected": -732.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.0625,
"rewards/margins": 50.5,
"rewards/rejected": -45.5,
"step": 414
},
{
"epoch": 1.3839732888146912,
"grad_norm": 3.687435423671559e-08,
"learning_rate": 9.153762915050547e-05,
"logits/chosen": -4.375,
"logits/rejected": -2.4140625,
"logps/chosen": -90.0,
"logps/rejected": -648.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.875,
"rewards/margins": 49.125,
"rewards/rejected": -43.375,
"step": 415
},
{
"epoch": 1.3873121869782972,
"grad_norm": 1.0608222282826318e-06,
"learning_rate": 9.125811972311125e-05,
"logits/chosen": -4.140625,
"logits/rejected": -2.4609375,
"logps/chosen": -94.0,
"logps/rejected": -628.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.21875,
"rewards/margins": 45.875,
"rewards/rejected": -39.625,
"step": 416
},
{
"epoch": 1.3906510851419032,
"grad_norm": 1.4249868840465751e-08,
"learning_rate": 9.097847001914515e-05,
"logits/chosen": -3.921875,
"logits/rejected": -2.40625,
"logps/chosen": -104.0,
"logps/rejected": -658.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 47.375,
"rewards/rejected": -41.625,
"step": 417
},
{
"epoch": 1.3939899833055092,
"grad_norm": 2.5101855953835184e-06,
"learning_rate": 9.069868352305241e-05,
"logits/chosen": -4.15625,
"logits/rejected": -2.4140625,
"logps/chosen": -96.25,
"logps/rejected": -658.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.265625,
"rewards/margins": 47.875,
"rewards/rejected": -41.625,
"step": 418
},
{
"epoch": 1.3973288814691152,
"grad_norm": 1.0253067728172027e-08,
"learning_rate": 9.041876372098271e-05,
"logits/chosen": -4.21875,
"logits/rejected": -2.4140625,
"logps/chosen": -95.25,
"logps/rejected": -700.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.546875,
"rewards/margins": 49.625,
"rewards/rejected": -43.0,
"step": 419
},
{
"epoch": 1.4006677796327212,
"grad_norm": 0.41543394327163696,
"learning_rate": 9.013871410074674e-05,
"logits/chosen": -4.5625,
"logits/rejected": -2.359375,
"logps/chosen": -86.5,
"logps/rejected": -650.0,
"loss": 0.1123,
"rewards/accuracies": 0.984375,
"rewards/chosen": 6.6875,
"rewards/margins": 46.0,
"rewards/rejected": -39.25,
"step": 420
},
{
"epoch": 1.4040066777963272,
"grad_norm": 0.0006037292769178748,
"learning_rate": 8.985853815177269e-05,
"logits/chosen": -4.265625,
"logits/rejected": -2.34375,
"logps/chosen": -97.25,
"logps/rejected": -626.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.09375,
"rewards/margins": 46.125,
"rewards/rejected": -40.125,
"step": 421
},
{
"epoch": 1.4073455759599331,
"grad_norm": 4.558990129854834e-10,
"learning_rate": 8.957823936506292e-05,
"logits/chosen": -4.0546875,
"logits/rejected": -2.390625,
"logps/chosen": -94.75,
"logps/rejected": -668.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.609375,
"rewards/margins": 50.25,
"rewards/rejected": -44.625,
"step": 422
},
{
"epoch": 1.4106844741235394,
"grad_norm": 1.3576600395026617e-05,
"learning_rate": 8.929782123315022e-05,
"logits/chosen": -3.6953125,
"logits/rejected": -2.1953125,
"logps/chosen": -106.25,
"logps/rejected": -712.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.375,
"rewards/margins": 49.75,
"rewards/rejected": -44.375,
"step": 423
},
{
"epoch": 1.4140233722871454,
"grad_norm": 1.5348656789981874e-09,
"learning_rate": 8.901728725005449e-05,
"logits/chosen": -4.1875,
"logits/rejected": -2.3515625,
"logps/chosen": -106.25,
"logps/rejected": -714.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0,
"rewards/margins": 49.375,
"rewards/rejected": -43.375,
"step": 424
},
{
"epoch": 1.4173622704507514,
"grad_norm": 1.130615490119169e-09,
"learning_rate": 8.873664091123909e-05,
"logits/chosen": -4.0546875,
"logits/rejected": -2.4140625,
"logps/chosen": -95.75,
"logps/rejected": -684.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.671875,
"rewards/margins": 50.375,
"rewards/rejected": -44.75,
"step": 425
},
{
"epoch": 1.4207011686143574,
"grad_norm": 6.090725257479335e-09,
"learning_rate": 8.845588571356745e-05,
"logits/chosen": -4.59375,
"logits/rejected": -2.578125,
"logps/chosen": -89.25,
"logps/rejected": -726.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.953125,
"rewards/margins": 52.25,
"rewards/rejected": -46.125,
"step": 426
},
{
"epoch": 1.4240400667779634,
"grad_norm": 6.303178088273853e-05,
"learning_rate": 8.817502515525927e-05,
"logits/chosen": -3.921875,
"logits/rejected": -2.53125,
"logps/chosen": -88.5,
"logps/rejected": -716.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.328125,
"rewards/margins": 52.0,
"rewards/rejected": -46.625,
"step": 427
},
{
"epoch": 1.4273789649415694,
"grad_norm": 1.215575307078609e-09,
"learning_rate": 8.789406273584708e-05,
"logits/chosen": -4.71875,
"logits/rejected": -2.3515625,
"logps/chosen": -67.0,
"logps/rejected": -822.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.34375,
"rewards/margins": 57.25,
"rewards/rejected": -51.875,
"step": 428
},
{
"epoch": 1.4307178631051753,
"grad_norm": 1.2602808965311851e-05,
"learning_rate": 8.761300195613267e-05,
"logits/chosen": -4.140625,
"logits/rejected": -2.390625,
"logps/chosen": -77.75,
"logps/rejected": -776.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.796875,
"rewards/margins": 56.0,
"rewards/rejected": -50.25,
"step": 429
},
{
"epoch": 1.4340567612687813,
"grad_norm": 1.7794310469554375e-10,
"learning_rate": 8.733184631814326e-05,
"logits/chosen": -4.671875,
"logits/rejected": -2.6015625,
"logps/chosen": -98.0,
"logps/rejected": -682.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.453125,
"rewards/margins": 51.125,
"rewards/rejected": -45.625,
"step": 430
},
{
"epoch": 1.4373956594323873,
"grad_norm": 4.9994866913039004e-08,
"learning_rate": 8.705059932508816e-05,
"logits/chosen": -4.3125,
"logits/rejected": -2.359375,
"logps/chosen": -94.5,
"logps/rejected": -694.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.8125,
"rewards/margins": 50.625,
"rewards/rejected": -44.75,
"step": 431
},
{
"epoch": 1.4407345575959933,
"grad_norm": 1.9163441322689323e-07,
"learning_rate": 8.676926448131487e-05,
"logits/chosen": -4.734375,
"logits/rejected": -2.3984375,
"logps/chosen": -83.75,
"logps/rejected": -736.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.625,
"rewards/margins": 53.75,
"rewards/rejected": -48.125,
"step": 432
},
{
"epoch": 1.4440734557595993,
"grad_norm": 4.263274178128995e-09,
"learning_rate": 8.648784529226552e-05,
"logits/chosen": -4.375,
"logits/rejected": -2.40625,
"logps/chosen": -81.0,
"logps/rejected": -728.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.171875,
"rewards/margins": 54.625,
"rewards/rejected": -48.5,
"step": 433
},
{
"epoch": 1.4474123539232053,
"grad_norm": 0.0010009096004068851,
"learning_rate": 8.620634526443326e-05,
"logits/chosen": -4.671875,
"logits/rejected": -2.578125,
"logps/chosen": -74.75,
"logps/rejected": -668.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.96875,
"rewards/margins": 50.625,
"rewards/rejected": -44.75,
"step": 434
},
{
"epoch": 1.4507512520868113,
"grad_norm": 1.0308641940071084e-08,
"learning_rate": 8.59247679053184e-05,
"logits/chosen": -4.75,
"logits/rejected": -2.578125,
"logps/chosen": -59.25,
"logps/rejected": -718.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.328125,
"rewards/margins": 53.875,
"rewards/rejected": -48.5,
"step": 435
},
{
"epoch": 1.4540901502504173,
"grad_norm": 5.3008247959951404e-06,
"learning_rate": 8.564311672338488e-05,
"logits/chosen": -4.0390625,
"logits/rejected": -2.28125,
"logps/chosen": -94.5,
"logps/rejected": -728.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5,
"rewards/margins": 53.125,
"rewards/rejected": -47.625,
"step": 436
},
{
"epoch": 1.4574290484140233,
"grad_norm": 9.894330105453264e-06,
"learning_rate": 8.536139522801641e-05,
"logits/chosen": -4.734375,
"logits/rejected": -2.2265625,
"logps/chosen": -63.25,
"logps/rejected": -778.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.625,
"rewards/margins": 55.875,
"rewards/rejected": -50.25,
"step": 437
},
{
"epoch": 1.4607679465776293,
"grad_norm": 1.3490364381141262e-07,
"learning_rate": 8.507960692947287e-05,
"logits/chosen": -4.0078125,
"logits/rejected": -2.4453125,
"logps/chosen": -133.5,
"logps/rejected": -640.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6875,
"rewards/margins": 49.125,
"rewards/rejected": -43.375,
"step": 438
},
{
"epoch": 1.4641068447412353,
"grad_norm": 1.2006904359651571e-09,
"learning_rate": 8.479775533884641e-05,
"logits/chosen": -4.515625,
"logits/rejected": -2.625,
"logps/chosen": -75.5,
"logps/rejected": -702.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.4375,
"rewards/margins": 52.5,
"rewards/rejected": -47.0,
"step": 439
},
{
"epoch": 1.4674457429048413,
"grad_norm": 2.2769681695677946e-09,
"learning_rate": 8.45158439680179e-05,
"logits/chosen": -4.625,
"logits/rejected": -2.65625,
"logps/chosen": -99.0,
"logps/rejected": -644.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.328125,
"rewards/margins": 50.125,
"rewards/rejected": -43.875,
"step": 440
},
{
"epoch": 1.4707846410684473,
"grad_norm": 3.575904156116749e-11,
"learning_rate": 8.423387632961305e-05,
"logits/chosen": -4.5,
"logits/rejected": -2.4375,
"logps/chosen": -77.5,
"logps/rejected": -824.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.140625,
"rewards/margins": 60.625,
"rewards/rejected": -54.5,
"step": 441
},
{
"epoch": 1.4741235392320533,
"grad_norm": 5.85626310112275e-07,
"learning_rate": 8.395185593695866e-05,
"logits/chosen": -4.21875,
"logits/rejected": -2.5859375,
"logps/chosen": -103.25,
"logps/rejected": -632.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.109375,
"rewards/margins": 48.5,
"rewards/rejected": -43.375,
"step": 442
},
{
"epoch": 1.4774624373956593,
"grad_norm": 3.311103853320674e-07,
"learning_rate": 8.366978630403886e-05,
"logits/chosen": -4.0625,
"logits/rejected": -2.421875,
"logps/chosen": -96.25,
"logps/rejected": -808.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 57.375,
"rewards/rejected": -51.875,
"step": 443
},
{
"epoch": 1.4808013355592655,
"grad_norm": 0.003105483716353774,
"learning_rate": 8.338767094545124e-05,
"logits/chosen": -4.453125,
"logits/rejected": -2.328125,
"logps/chosen": -85.25,
"logps/rejected": -762.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.40625,
"rewards/margins": 54.5,
"rewards/rejected": -49.125,
"step": 444
},
{
"epoch": 1.4841402337228715,
"grad_norm": 7.841013575671241e-05,
"learning_rate": 8.310551337636326e-05,
"logits/chosen": -4.296875,
"logits/rejected": -2.4609375,
"logps/chosen": -107.75,
"logps/rejected": -752.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.203125,
"rewards/margins": 55.0,
"rewards/rejected": -49.75,
"step": 445
},
{
"epoch": 1.4874791318864775,
"grad_norm": 9.359698793787175e-09,
"learning_rate": 8.282331711246822e-05,
"logits/chosen": -4.21875,
"logits/rejected": -2.359375,
"logps/chosen": -80.0,
"logps/rejected": -728.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 53.0,
"rewards/rejected": -47.5,
"step": 446
},
{
"epoch": 1.4908180300500835,
"grad_norm": 1.995251464048664e-10,
"learning_rate": 8.25410856699416e-05,
"logits/chosen": -4.1484375,
"logits/rejected": -2.390625,
"logps/chosen": -98.25,
"logps/rejected": -766.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03125,
"rewards/margins": 56.25,
"rewards/rejected": -50.25,
"step": 447
},
{
"epoch": 1.4941569282136895,
"grad_norm": 3.125167458151168e-09,
"learning_rate": 8.225882256539724e-05,
"logits/chosen": -3.859375,
"logits/rejected": -2.5,
"logps/chosen": -117.5,
"logps/rejected": -774.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.1875,
"rewards/margins": 55.625,
"rewards/rejected": -50.375,
"step": 448
},
{
"epoch": 1.4974958263772955,
"grad_norm": 1.1834481172812161e-09,
"learning_rate": 8.197653131584339e-05,
"logits/chosen": -4.0625,
"logits/rejected": -2.4140625,
"logps/chosen": -114.75,
"logps/rejected": -666.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.234375,
"rewards/margins": 49.625,
"rewards/rejected": -44.375,
"step": 449
},
{
"epoch": 1.5008347245409015,
"grad_norm": 9.292146913031374e-11,
"learning_rate": 8.169421543863914e-05,
"logits/chosen": -4.40625,
"logits/rejected": -2.5390625,
"logps/chosen": -88.0,
"logps/rejected": -702.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0625,
"rewards/margins": 53.0,
"rewards/rejected": -46.875,
"step": 450
},
{
"epoch": 1.5041736227045075,
"grad_norm": 5.541923586704911e-10,
"learning_rate": 8.14118784514503e-05,
"logits/chosen": -4.609375,
"logits/rejected": -2.390625,
"logps/chosen": -100.25,
"logps/rejected": -770.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5625,
"rewards/margins": 56.375,
"rewards/rejected": -50.75,
"step": 451
},
{
"epoch": 1.5075125208681135,
"grad_norm": 2.15681126292111e-08,
"learning_rate": 8.112952387220578e-05,
"logits/chosen": -4.15625,
"logits/rejected": -2.3984375,
"logps/chosen": -87.25,
"logps/rejected": -762.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.171875,
"rewards/margins": 57.375,
"rewards/rejected": -51.125,
"step": 452
},
{
"epoch": 1.5108514190317195,
"grad_norm": 2.62340899004343e-10,
"learning_rate": 8.084715521905363e-05,
"logits/chosen": -4.03125,
"logits/rejected": -2.53125,
"logps/chosen": -100.75,
"logps/rejected": -674.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.703125,
"rewards/margins": 50.875,
"rewards/rejected": -45.125,
"step": 453
},
{
"epoch": 1.5141903171953257,
"grad_norm": 8.177906130057977e-10,
"learning_rate": 8.056477601031736e-05,
"logits/chosen": -4.40625,
"logits/rejected": -2.65625,
"logps/chosen": -84.25,
"logps/rejected": -710.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.953125,
"rewards/margins": 54.125,
"rewards/rejected": -48.0,
"step": 454
},
{
"epoch": 1.5175292153589317,
"grad_norm": 1.9370509970334515e-12,
"learning_rate": 8.028238976445194e-05,
"logits/chosen": -4.0546875,
"logits/rejected": -2.390625,
"logps/chosen": -92.5,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9375,
"rewards/margins": 53.25,
"rewards/rejected": -47.25,
"step": 455
},
{
"epoch": 1.5208681135225377,
"grad_norm": 1.649963672634147e-11,
"learning_rate": 8e-05,
"logits/chosen": -4.484375,
"logits/rejected": -2.359375,
"logps/chosen": -64.625,
"logps/rejected": -846.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.0,
"rewards/margins": 61.125,
"rewards/rejected": -56.125,
"step": 456
},
{
"epoch": 1.5242070116861437,
"grad_norm": 3.1389786272484343e-06,
"learning_rate": 7.971761023554807e-05,
"logits/chosen": -4.4375,
"logits/rejected": -2.4375,
"logps/chosen": -73.75,
"logps/rejected": -720.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.234375,
"rewards/margins": 52.0,
"rewards/rejected": -46.75,
"step": 457
},
{
"epoch": 1.5275459098497497,
"grad_norm": 6.32845509329627e-09,
"learning_rate": 7.943522398968266e-05,
"logits/chosen": -3.9453125,
"logits/rejected": -2.5546875,
"logps/chosen": -94.75,
"logps/rejected": -714.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6875,
"rewards/margins": 53.5,
"rewards/rejected": -47.75,
"step": 458
},
{
"epoch": 1.5308848080133557,
"grad_norm": 6.801514462750902e-09,
"learning_rate": 7.915284478094637e-05,
"logits/chosen": -4.359375,
"logits/rejected": -2.578125,
"logps/chosen": -86.5,
"logps/rejected": -726.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.34375,
"rewards/margins": 53.875,
"rewards/rejected": -48.5,
"step": 459
},
{
"epoch": 1.5342237061769617,
"grad_norm": 3.1257162191877796e-08,
"learning_rate": 7.887047612779426e-05,
"logits/chosen": -3.9140625,
"logits/rejected": -2.3984375,
"logps/chosen": -92.0,
"logps/rejected": -764.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.84375,
"rewards/margins": 56.75,
"rewards/rejected": -50.875,
"step": 460
},
{
"epoch": 1.5375626043405677,
"grad_norm": 2.6219735271837408e-09,
"learning_rate": 7.858812154854972e-05,
"logits/chosen": -4.265625,
"logits/rejected": -2.5625,
"logps/chosen": -87.5,
"logps/rejected": -730.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 53.625,
"rewards/rejected": -48.25,
"step": 461
},
{
"epoch": 1.5409015025041737,
"grad_norm": 1.4608904086443886e-09,
"learning_rate": 7.83057845613609e-05,
"logits/chosen": -4.0625,
"logits/rejected": -2.296875,
"logps/chosen": -89.75,
"logps/rejected": -672.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.859375,
"rewards/margins": 50.75,
"rewards/rejected": -44.875,
"step": 462
},
{
"epoch": 1.5442404006677797,
"grad_norm": 1.4853892338351216e-08,
"learning_rate": 7.802346868415662e-05,
"logits/chosen": -4.421875,
"logits/rejected": -2.3671875,
"logps/chosen": -91.75,
"logps/rejected": -742.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578125,
"rewards/margins": 54.125,
"rewards/rejected": -48.5,
"step": 463
},
{
"epoch": 1.5475792988313857,
"grad_norm": 8.356595344594098e-07,
"learning_rate": 7.774117743460278e-05,
"logits/chosen": -4.0546875,
"logits/rejected": -2.375,
"logps/chosen": -115.25,
"logps/rejected": -800.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.09375,
"rewards/margins": 56.375,
"rewards/rejected": -50.375,
"step": 464
},
{
"epoch": 1.5509181969949917,
"grad_norm": 3.025586556759663e-05,
"learning_rate": 7.745891433005843e-05,
"logits/chosen": -4.703125,
"logits/rejected": -2.5859375,
"logps/chosen": -81.5,
"logps/rejected": -696.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.421875,
"rewards/margins": 52.25,
"rewards/rejected": -46.875,
"step": 465
},
{
"epoch": 1.5542570951585977,
"grad_norm": 7.671878243797892e-09,
"learning_rate": 7.717668288753181e-05,
"logits/chosen": -4.1875,
"logits/rejected": -2.390625,
"logps/chosen": -98.5,
"logps/rejected": -730.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.046875,
"rewards/margins": 53.875,
"rewards/rejected": -47.875,
"step": 466
},
{
"epoch": 1.5575959933222037,
"grad_norm": 1.2859992466474068e-06,
"learning_rate": 7.68944866236368e-05,
"logits/chosen": -3.9921875,
"logits/rejected": -2.4296875,
"logps/chosen": -106.5,
"logps/rejected": -734.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.890625,
"rewards/margins": 53.125,
"rewards/rejected": -48.375,
"step": 467
},
{
"epoch": 1.5609348914858097,
"grad_norm": 8.435268483708569e-08,
"learning_rate": 7.661232905454879e-05,
"logits/chosen": -4.234375,
"logits/rejected": -2.3984375,
"logps/chosen": -101.5,
"logps/rejected": -796.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.25,
"rewards/margins": 57.625,
"rewards/rejected": -52.25,
"step": 468
},
{
"epoch": 1.5642737896494157,
"grad_norm": 0.0006341390544548631,
"learning_rate": 7.633021369596119e-05,
"logits/chosen": -4.53125,
"logits/rejected": -2.3984375,
"logps/chosen": -63.625,
"logps/rejected": -780.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.21875,
"rewards/margins": 56.75,
"rewards/rejected": -51.5,
"step": 469
},
{
"epoch": 1.5676126878130217,
"grad_norm": 2.392030573616921e-09,
"learning_rate": 7.604814406304135e-05,
"logits/chosen": -4.234375,
"logits/rejected": -2.578125,
"logps/chosen": -99.0,
"logps/rejected": -736.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.796875,
"rewards/margins": 53.875,
"rewards/rejected": -48.125,
"step": 470
},
{
"epoch": 1.5709515859766277,
"grad_norm": 7.502531235159537e-11,
"learning_rate": 7.576612367038695e-05,
"logits/chosen": -4.296875,
"logits/rejected": -2.453125,
"logps/chosen": -95.25,
"logps/rejected": -764.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.640625,
"rewards/margins": 55.0,
"rewards/rejected": -49.25,
"step": 471
},
{
"epoch": 1.5742904841402336,
"grad_norm": 3.8160608006876373e-10,
"learning_rate": 7.548415603198213e-05,
"logits/chosen": -4.34375,
"logits/rejected": -2.4375,
"logps/chosen": -105.0,
"logps/rejected": -636.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.421875,
"rewards/margins": 50.125,
"rewards/rejected": -43.625,
"step": 472
},
{
"epoch": 1.5776293823038396,
"grad_norm": 4.221317073671571e-09,
"learning_rate": 7.520224466115363e-05,
"logits/chosen": -4.296875,
"logits/rejected": -2.578125,
"logps/chosen": -90.5,
"logps/rejected": -722.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.390625,
"rewards/margins": 53.75,
"rewards/rejected": -48.25,
"step": 473
},
{
"epoch": 1.5809682804674456,
"grad_norm": 2.064158427517171e-10,
"learning_rate": 7.492039307052718e-05,
"logits/chosen": -4.453125,
"logits/rejected": -2.3828125,
"logps/chosen": -85.0,
"logps/rejected": -772.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5625,
"rewards/margins": 56.375,
"rewards/rejected": -50.875,
"step": 474
},
{
"epoch": 1.5843071786310516,
"grad_norm": 4.211127446751561e-09,
"learning_rate": 7.46386047719836e-05,
"logits/chosen": -4.34375,
"logits/rejected": -2.4609375,
"logps/chosen": -88.25,
"logps/rejected": -668.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.765625,
"rewards/margins": 52.0,
"rewards/rejected": -46.375,
"step": 475
},
{
"epoch": 1.5876460767946576,
"grad_norm": 2.4332844986929558e-05,
"learning_rate": 7.435688327661512e-05,
"logits/chosen": -4.015625,
"logits/rejected": -2.5,
"logps/chosen": -103.75,
"logps/rejected": -644.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.390625,
"rewards/margins": 48.75,
"rewards/rejected": -43.375,
"step": 476
},
{
"epoch": 1.5909849749582636,
"grad_norm": 1.4354889499301748e-10,
"learning_rate": 7.407523209468162e-05,
"logits/chosen": -4.1015625,
"logits/rejected": -2.3515625,
"logps/chosen": -98.5,
"logps/rejected": -740.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.234375,
"rewards/margins": 56.375,
"rewards/rejected": -50.125,
"step": 477
},
{
"epoch": 1.5943238731218696,
"grad_norm": 3.726325359387772e-10,
"learning_rate": 7.379365473556675e-05,
"logits/chosen": -4.203125,
"logits/rejected": -2.40625,
"logps/chosen": -98.25,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.890625,
"rewards/margins": 52.75,
"rewards/rejected": -46.875,
"step": 478
},
{
"epoch": 1.5976627712854758,
"grad_norm": 1.74457059909372e-09,
"learning_rate": 7.35121547077345e-05,
"logits/chosen": -4.078125,
"logits/rejected": -2.421875,
"logps/chosen": -89.25,
"logps/rejected": -650.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.953125,
"rewards/margins": 49.125,
"rewards/rejected": -43.125,
"step": 479
},
{
"epoch": 1.6010016694490818,
"grad_norm": 1.4945060300419755e-08,
"learning_rate": 7.323073551868516e-05,
"logits/chosen": -4.234375,
"logits/rejected": -2.390625,
"logps/chosen": -95.5,
"logps/rejected": -736.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.34375,
"rewards/margins": 56.375,
"rewards/rejected": -51.0,
"step": 480
},
{
"epoch": 1.6043405676126878,
"grad_norm": 3.741529397416343e-08,
"learning_rate": 7.294940067491189e-05,
"logits/chosen": -4.515625,
"logits/rejected": -2.28125,
"logps/chosen": -73.625,
"logps/rejected": -770.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.203125,
"rewards/margins": 55.875,
"rewards/rejected": -50.625,
"step": 481
},
{
"epoch": 1.6076794657762938,
"grad_norm": 3.76367270682465e-11,
"learning_rate": 7.266815368185677e-05,
"logits/chosen": -4.640625,
"logits/rejected": -2.46875,
"logps/chosen": -74.5,
"logps/rejected": -730.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.75,
"rewards/margins": 54.25,
"rewards/rejected": -48.5,
"step": 482
},
{
"epoch": 1.6110183639398998,
"grad_norm": 3.4839501950045815e-08,
"learning_rate": 7.238699804386737e-05,
"logits/chosen": -4.21875,
"logits/rejected": -2.453125,
"logps/chosen": -94.25,
"logps/rejected": -652.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.609375,
"rewards/margins": 49.125,
"rewards/rejected": -43.5,
"step": 483
},
{
"epoch": 1.6143572621035058,
"grad_norm": 1.3145572408390649e-09,
"learning_rate": 7.210593726415293e-05,
"logits/chosen": -4.546875,
"logits/rejected": -2.6015625,
"logps/chosen": -74.5,
"logps/rejected": -650.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.671875,
"rewards/margins": 49.5,
"rewards/rejected": -43.875,
"step": 484
},
{
"epoch": 1.6176961602671118,
"grad_norm": 0.010065467096865177,
"learning_rate": 7.182497484474075e-05,
"logits/chosen": -4.3125,
"logits/rejected": -2.484375,
"logps/chosen": -75.5,
"logps/rejected": -772.0,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.234375,
"rewards/margins": 58.125,
"rewards/rejected": -51.875,
"step": 485
},
{
"epoch": 1.6210350584307178,
"grad_norm": 1.0117897839001522e-11,
"learning_rate": 7.154411428643258e-05,
"logits/chosen": -4.4765625,
"logits/rejected": -2.515625,
"logps/chosen": -90.5,
"logps/rejected": -698.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.875,
"rewards/margins": 53.0,
"rewards/rejected": -47.0,
"step": 486
},
{
"epoch": 1.6243739565943238,
"grad_norm": 4.363270704743627e-07,
"learning_rate": 7.126335908876092e-05,
"logits/chosen": -4.65625,
"logits/rejected": -2.4609375,
"logps/chosen": -72.625,
"logps/rejected": -710.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.40625,
"rewards/margins": 54.25,
"rewards/rejected": -48.875,
"step": 487
},
{
"epoch": 1.62771285475793,
"grad_norm": 1.109244749386562e-05,
"learning_rate": 7.098271274994556e-05,
"logits/chosen": -4.375,
"logits/rejected": -2.546875,
"logps/chosen": -91.75,
"logps/rejected": -726.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.421875,
"rewards/margins": 52.375,
"rewards/rejected": -46.875,
"step": 488
},
{
"epoch": 1.631051752921536,
"grad_norm": 3.7534000907335496e-10,
"learning_rate": 7.070217876684981e-05,
"logits/chosen": -4.15625,
"logits/rejected": -2.4140625,
"logps/chosen": -101.75,
"logps/rejected": -774.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.234375,
"rewards/margins": 55.25,
"rewards/rejected": -50.0,
"step": 489
},
{
"epoch": 1.634390651085142,
"grad_norm": 1.1033153946016228e-08,
"learning_rate": 7.042176063493708e-05,
"logits/chosen": -4.0234375,
"logits/rejected": -2.5234375,
"logps/chosen": -102.75,
"logps/rejected": -642.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.4375,
"rewards/margins": 48.0,
"rewards/rejected": -42.625,
"step": 490
},
{
"epoch": 1.637729549248748,
"grad_norm": 4.122978625176188e-10,
"learning_rate": 7.014146184822732e-05,
"logits/chosen": -4.359375,
"logits/rejected": -2.671875,
"logps/chosen": -105.75,
"logps/rejected": -720.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.84375,
"rewards/margins": 54.375,
"rewards/rejected": -48.5,
"step": 491
},
{
"epoch": 1.641068447412354,
"grad_norm": 1.314667042606743e-05,
"learning_rate": 6.98612858992533e-05,
"logits/chosen": -4.53125,
"logits/rejected": -2.4453125,
"logps/chosen": -80.5,
"logps/rejected": -732.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.921875,
"rewards/margins": 54.25,
"rewards/rejected": -48.25,
"step": 492
},
{
"epoch": 1.64440734557596,
"grad_norm": 1.1127249122111493e-10,
"learning_rate": 6.958123627901733e-05,
"logits/chosen": -3.9921875,
"logits/rejected": -2.578125,
"logps/chosen": -111.25,
"logps/rejected": -686.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.25,
"rewards/margins": 51.25,
"rewards/rejected": -45.0,
"step": 493
},
{
"epoch": 1.647746243739566,
"grad_norm": 2.7546880687623343e-07,
"learning_rate": 6.930131647694761e-05,
"logits/chosen": -4.328125,
"logits/rejected": -2.25,
"logps/chosen": -80.0,
"logps/rejected": -714.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.4375,
"rewards/margins": 53.0,
"rewards/rejected": -47.5,
"step": 494
},
{
"epoch": 1.651085141903172,
"grad_norm": 4.5741360699125266e-10,
"learning_rate": 6.90215299808549e-05,
"logits/chosen": -4.4375,
"logits/rejected": -2.375,
"logps/chosen": -83.25,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.328125,
"rewards/margins": 53.75,
"rewards/rejected": -47.375,
"step": 495
},
{
"epoch": 1.654424040066778,
"grad_norm": 3.2220714274444617e-07,
"learning_rate": 6.874188027688877e-05,
"logits/chosen": -4.515625,
"logits/rejected": -2.296875,
"logps/chosen": -82.25,
"logps/rejected": -738.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.15625,
"rewards/margins": 53.25,
"rewards/rejected": -48.125,
"step": 496
},
{
"epoch": 1.657762938230384,
"grad_norm": 7.868575124803101e-09,
"learning_rate": 6.846237084949454e-05,
"logits/chosen": -4.34375,
"logits/rejected": -2.421875,
"logps/chosen": -85.5,
"logps/rejected": -754.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 56.0,
"rewards/rejected": -50.25,
"step": 497
},
{
"epoch": 1.66110183639399,
"grad_norm": 4.101905481945778e-10,
"learning_rate": 6.818300518136964e-05,
"logits/chosen": -4.359375,
"logits/rejected": -2.3984375,
"logps/chosen": -91.5,
"logps/rejected": -756.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.421875,
"rewards/margins": 55.375,
"rewards/rejected": -49.0,
"step": 498
},
{
"epoch": 1.664440734557596,
"grad_norm": 1.494113721633994e-10,
"learning_rate": 6.790378675342013e-05,
"logits/chosen": -4.234375,
"logits/rejected": -2.421875,
"logps/chosen": -85.0,
"logps/rejected": -716.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.921875,
"rewards/margins": 54.0,
"rewards/rejected": -48.125,
"step": 499
},
{
"epoch": 1.667779632721202,
"grad_norm": 7.279717627317606e-12,
"learning_rate": 6.762471904471765e-05,
"logits/chosen": -4.78125,
"logits/rejected": -2.4375,
"logps/chosen": -70.625,
"logps/rejected": -748.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.890625,
"rewards/margins": 56.75,
"rewards/rejected": -50.75,
"step": 500
},
{
"epoch": 1.671118530884808,
"grad_norm": 3.969352402322102e-09,
"learning_rate": 6.73458055324557e-05,
"logits/chosen": -4.296875,
"logits/rejected": -2.4140625,
"logps/chosen": -72.25,
"logps/rejected": -828.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 59.5,
"rewards/rejected": -53.875,
"step": 501
},
{
"epoch": 1.674457429048414,
"grad_norm": 1.531030875412398e-07,
"learning_rate": 6.706704969190657e-05,
"logits/chosen": -4.359375,
"logits/rejected": -2.59375,
"logps/chosen": -81.5,
"logps/rejected": -666.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.75,
"rewards/margins": 51.25,
"rewards/rejected": -45.625,
"step": 502
},
{
"epoch": 1.67779632721202,
"grad_norm": 8.170403797969072e-10,
"learning_rate": 6.678845499637793e-05,
"logits/chosen": -4.2890625,
"logits/rejected": -2.3515625,
"logps/chosen": -81.0,
"logps/rejected": -702.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5625,
"rewards/margins": 52.5,
"rewards/rejected": -47.0,
"step": 503
},
{
"epoch": 1.681135225375626,
"grad_norm": 2.2174371713812313e-12,
"learning_rate": 6.651002491716963e-05,
"logits/chosen": -4.1953125,
"logits/rejected": -2.390625,
"logps/chosen": -98.25,
"logps/rejected": -716.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.53125,
"rewards/margins": 53.875,
"rewards/rejected": -48.375,
"step": 504
},
{
"epoch": 1.684474123539232,
"grad_norm": 3.214521704375528e-10,
"learning_rate": 6.623176292353034e-05,
"logits/chosen": -4.46875,
"logits/rejected": -2.4296875,
"logps/chosen": -73.25,
"logps/rejected": -736.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.78125,
"rewards/margins": 54.75,
"rewards/rejected": -49.0,
"step": 505
},
{
"epoch": 1.687813021702838,
"grad_norm": 1.9167500919792246e-10,
"learning_rate": 6.595367248261435e-05,
"logits/chosen": -4.703125,
"logits/rejected": -2.3515625,
"logps/chosen": -59.5,
"logps/rejected": -838.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.609375,
"rewards/margins": 59.5,
"rewards/rejected": -54.0,
"step": 506
},
{
"epoch": 1.691151919866444,
"grad_norm": 8.242089677423792e-08,
"learning_rate": 6.567575705943849e-05,
"logits/chosen": -4.28125,
"logits/rejected": -2.40625,
"logps/chosen": -79.5,
"logps/rejected": -688.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.3125,
"rewards/margins": 51.625,
"rewards/rejected": -46.375,
"step": 507
},
{
"epoch": 1.69449081803005,
"grad_norm": 1.890362355538855e-08,
"learning_rate": 6.539802011683875e-05,
"logits/chosen": -3.8125,
"logits/rejected": -2.46875,
"logps/chosen": -100.0,
"logps/rejected": -724.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.96875,
"rewards/margins": 52.375,
"rewards/rejected": -47.25,
"step": 508
},
{
"epoch": 1.697829716193656,
"grad_norm": 1.2278357497397252e-10,
"learning_rate": 6.51204651154274e-05,
"logits/chosen": -4.2109375,
"logits/rejected": -2.4296875,
"logps/chosen": -97.75,
"logps/rejected": -740.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.46875,
"rewards/margins": 53.5,
"rewards/rejected": -48.0,
"step": 509
},
{
"epoch": 1.701168614357262,
"grad_norm": 9.539391498947225e-09,
"learning_rate": 6.484309551354952e-05,
"logits/chosen": -4.78125,
"logits/rejected": -2.5078125,
"logps/chosen": -75.75,
"logps/rejected": -736.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.75,
"rewards/margins": 55.125,
"rewards/rejected": -49.375,
"step": 510
},
{
"epoch": 1.704507512520868,
"grad_norm": 7.297229043246034e-09,
"learning_rate": 6.456591476724026e-05,
"logits/chosen": -4.53125,
"logits/rejected": -2.10546875,
"logps/chosen": -81.25,
"logps/rejected": -768.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03125,
"rewards/margins": 55.25,
"rewards/rejected": -49.25,
"step": 511
},
{
"epoch": 1.707846410684474,
"grad_norm": 1.4188211716614774e-09,
"learning_rate": 6.428892633018158e-05,
"logits/chosen": -4.59375,
"logits/rejected": -2.4453125,
"logps/chosen": -83.875,
"logps/rejected": -718.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 53.125,
"rewards/rejected": -47.375,
"step": 512
},
{
"epoch": 1.7111853088480802,
"grad_norm": 3.5108431717389976e-08,
"learning_rate": 6.401213365365921e-05,
"logits/chosen": -4.15625,
"logits/rejected": -2.421875,
"logps/chosen": -95.75,
"logps/rejected": -766.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.796875,
"rewards/margins": 56.125,
"rewards/rejected": -50.25,
"step": 513
},
{
"epoch": 1.7145242070116862,
"grad_norm": 6.461980950334123e-11,
"learning_rate": 6.373554018651981e-05,
"logits/chosen": -4.5625,
"logits/rejected": -2.34375,
"logps/chosen": -75.25,
"logps/rejected": -836.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.53125,
"rewards/margins": 60.5,
"rewards/rejected": -55.125,
"step": 514
},
{
"epoch": 1.7178631051752922,
"grad_norm": 2.938630450088908e-10,
"learning_rate": 6.345914937512772e-05,
"logits/chosen": -4.375,
"logits/rejected": -2.625,
"logps/chosen": -88.75,
"logps/rejected": -720.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.859375,
"rewards/margins": 54.0,
"rewards/rejected": -48.25,
"step": 515
},
{
"epoch": 1.7212020033388982,
"grad_norm": 7.139490387775282e-11,
"learning_rate": 6.318296466332232e-05,
"logits/chosen": -4.46875,
"logits/rejected": -2.4375,
"logps/chosen": -83.0,
"logps/rejected": -770.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.109375,
"rewards/margins": 56.0,
"rewards/rejected": -49.875,
"step": 516
},
{
"epoch": 1.7245409015025042,
"grad_norm": 6.043272549050016e-10,
"learning_rate": 6.290698949237494e-05,
"logits/chosen": -3.9921875,
"logits/rejected": -2.265625,
"logps/chosen": -99.5,
"logps/rejected": -764.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.625,
"rewards/margins": 55.375,
"rewards/rejected": -49.75,
"step": 517
},
{
"epoch": 1.7278797996661102,
"grad_norm": 5.376046829042025e-06,
"learning_rate": 6.2631227300946e-05,
"logits/chosen": -4.5,
"logits/rejected": -2.484375,
"logps/chosen": -75.75,
"logps/rejected": -698.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.1875,
"rewards/margins": 52.75,
"rewards/rejected": -46.625,
"step": 518
},
{
"epoch": 1.7312186978297162,
"grad_norm": 1.3798023834610262e-10,
"learning_rate": 6.235568152504226e-05,
"logits/chosen": -4.265625,
"logits/rejected": -2.46875,
"logps/chosen": -78.5,
"logps/rejected": -682.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.78125,
"rewards/margins": 52.5,
"rewards/rejected": -46.5,
"step": 519
},
{
"epoch": 1.7345575959933222,
"grad_norm": 0.0643463060259819,
"learning_rate": 6.20803555979738e-05,
"logits/chosen": -4.578125,
"logits/rejected": -2.4453125,
"logps/chosen": -90.5,
"logps/rejected": -688.0,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.046875,
"rewards/margins": 51.625,
"rewards/rejected": -46.5,
"step": 520
},
{
"epoch": 1.7378964941569284,
"grad_norm": 4.92718896794031e-08,
"learning_rate": 6.18052529503115e-05,
"logits/chosen": -4.359375,
"logits/rejected": -2.421875,
"logps/chosen": -78.0,
"logps/rejected": -712.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.359375,
"rewards/margins": 51.5,
"rewards/rejected": -46.125,
"step": 521
},
{
"epoch": 1.7412353923205344,
"grad_norm": 1.0779488235357348e-07,
"learning_rate": 6.153037700984412e-05,
"logits/chosen": -4.390625,
"logits/rejected": -2.4453125,
"logps/chosen": -80.125,
"logps/rejected": -744.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.96875,
"rewards/margins": 54.625,
"rewards/rejected": -49.625,
"step": 522
},
{
"epoch": 1.7445742904841404,
"grad_norm": 2.592769610121337e-10,
"learning_rate": 6.125573120153565e-05,
"logits/chosen": -4.4375,
"logits/rejected": -2.5,
"logps/chosen": -102.5,
"logps/rejected": -734.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.609375,
"rewards/margins": 53.875,
"rewards/rejected": -48.125,
"step": 523
},
{
"epoch": 1.7479131886477464,
"grad_norm": 1.1691867474183937e-09,
"learning_rate": 6.098131894748267e-05,
"logits/chosen": -4.6875,
"logits/rejected": -2.53125,
"logps/chosen": -80.0,
"logps/rejected": -680.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.65625,
"rewards/margins": 51.375,
"rewards/rejected": -45.75,
"step": 524
},
{
"epoch": 1.7512520868113524,
"grad_norm": 5.292809746038074e-08,
"learning_rate": 6.070714366687152e-05,
"logits/chosen": -4.34375,
"logits/rejected": -2.4140625,
"logps/chosen": -94.75,
"logps/rejected": -686.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 51.25,
"rewards/rejected": -45.75,
"step": 525
},
{
"epoch": 1.7545909849749584,
"grad_norm": 6.529545237832224e-10,
"learning_rate": 6.0433208775936015e-05,
"logits/chosen": -4.71875,
"logits/rejected": -2.4921875,
"logps/chosen": -75.0,
"logps/rejected": -686.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578125,
"rewards/margins": 52.375,
"rewards/rejected": -46.875,
"step": 526
},
{
"epoch": 1.7579298831385644,
"grad_norm": 1.5020368664409034e-05,
"learning_rate": 6.015951768791461e-05,
"logits/chosen": -4.25,
"logits/rejected": -2.3359375,
"logps/chosen": -77.0,
"logps/rejected": -730.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.75,
"rewards/margins": 53.75,
"rewards/rejected": -48.0,
"step": 527
},
{
"epoch": 1.7612687813021703,
"grad_norm": 1.7290001778746955e-05,
"learning_rate": 5.9886073813008015e-05,
"logits/chosen": -4.53125,
"logits/rejected": -2.328125,
"logps/chosen": -73.5,
"logps/rejected": -662.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.09375,
"rewards/margins": 49.625,
"rewards/rejected": -44.375,
"step": 528
},
{
"epoch": 1.7646076794657763,
"grad_norm": 2.072624738502782e-09,
"learning_rate": 5.961288055833656e-05,
"logits/chosen": -3.734375,
"logits/rejected": -2.5546875,
"logps/chosen": -115.25,
"logps/rejected": -608.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5625,
"rewards/margins": 47.0,
"rewards/rejected": -41.5,
"step": 529
},
{
"epoch": 1.7679465776293823,
"grad_norm": 0.013802506029605865,
"learning_rate": 5.9339941327897977e-05,
"logits/chosen": -4.4375,
"logits/rejected": -2.71875,
"logps/chosen": -79.75,
"logps/rejected": -756.0,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.125,
"rewards/margins": 54.5,
"rewards/rejected": -49.375,
"step": 530
},
{
"epoch": 1.7712854757929883,
"grad_norm": 2.484111405465228e-08,
"learning_rate": 5.906725952252476e-05,
"logits/chosen": -4.515625,
"logits/rejected": -2.4921875,
"logps/chosen": -86.0,
"logps/rejected": -762.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.671875,
"rewards/margins": 55.5,
"rewards/rejected": -49.875,
"step": 531
},
{
"epoch": 1.7746243739565943,
"grad_norm": 1.7635564120155323e-09,
"learning_rate": 5.879483853984187e-05,
"logits/chosen": -4.09375,
"logits/rejected": -2.4765625,
"logps/chosen": -94.75,
"logps/rejected": -716.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.640625,
"rewards/margins": 53.25,
"rewards/rejected": -47.625,
"step": 532
},
{
"epoch": 1.7779632721202003,
"grad_norm": 2.7364133181606576e-10,
"learning_rate": 5.852268177422451e-05,
"logits/chosen": -4.1796875,
"logits/rejected": -2.4140625,
"logps/chosen": -93.75,
"logps/rejected": -836.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 57.875,
"rewards/rejected": -52.625,
"step": 533
},
{
"epoch": 1.7813021702838063,
"grad_norm": 5.411819259393269e-08,
"learning_rate": 5.8250792616755554e-05,
"logits/chosen": -4.28125,
"logits/rejected": -2.46875,
"logps/chosen": -82.875,
"logps/rejected": -712.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.828125,
"rewards/margins": 52.25,
"rewards/rejected": -46.375,
"step": 534
},
{
"epoch": 1.7846410684474123,
"grad_norm": 1.0748289758222285e-10,
"learning_rate": 5.7979174455183625e-05,
"logits/chosen": -4.390625,
"logits/rejected": -2.4296875,
"logps/chosen": -98.25,
"logps/rejected": -714.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.515625,
"rewards/margins": 52.5,
"rewards/rejected": -47.125,
"step": 535
},
{
"epoch": 1.7879799666110183,
"grad_norm": 5.165533001338929e-10,
"learning_rate": 5.7707830673880635e-05,
"logits/chosen": -4.46875,
"logits/rejected": -2.5703125,
"logps/chosen": -71.75,
"logps/rejected": -698.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.8125,
"rewards/margins": 53.625,
"rewards/rejected": -47.75,
"step": 536
},
{
"epoch": 1.7913188647746243,
"grad_norm": 3.2588772569397406e-08,
"learning_rate": 5.743676465379977e-05,
"logits/chosen": -4.375,
"logits/rejected": -2.4609375,
"logps/chosen": -71.25,
"logps/rejected": -658.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.703125,
"rewards/margins": 49.875,
"rewards/rejected": -44.125,
"step": 537
},
{
"epoch": 1.7946577629382303,
"grad_norm": 1.4472433917944727e-08,
"learning_rate": 5.71659797724333e-05,
"logits/chosen": -4.0625,
"logits/rejected": -2.5078125,
"logps/chosen": -92.75,
"logps/rejected": -732.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.84375,
"rewards/margins": 52.0,
"rewards/rejected": -46.125,
"step": 538
},
{
"epoch": 1.7979966611018363,
"grad_norm": 5.101317128719529e-07,
"learning_rate": 5.6895479403770415e-05,
"logits/chosen": -4.4375,
"logits/rejected": -2.5703125,
"logps/chosen": -84.75,
"logps/rejected": -694.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.65625,
"rewards/margins": 52.625,
"rewards/rejected": -47.0,
"step": 539
},
{
"epoch": 1.8013355592654423,
"grad_norm": 1.3090671302506962e-07,
"learning_rate": 5.6625266918255355e-05,
"logits/chosen": -4.59375,
"logits/rejected": -2.453125,
"logps/chosen": -78.375,
"logps/rejected": -634.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.953125,
"rewards/margins": 48.875,
"rewards/rejected": -42.875,
"step": 540
},
{
"epoch": 1.8046744574290483,
"grad_norm": 5.188319392168683e-12,
"learning_rate": 5.6355345682745285e-05,
"logits/chosen": -4.5546875,
"logits/rejected": -2.3828125,
"logps/chosen": -84.125,
"logps/rejected": -796.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.375,
"rewards/margins": 57.875,
"rewards/rejected": -52.5,
"step": 541
},
{
"epoch": 1.8080133555926543,
"grad_norm": 2.993649275140342e-07,
"learning_rate": 5.608571906046841e-05,
"logits/chosen": -4.609375,
"logits/rejected": -2.4453125,
"logps/chosen": -70.75,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.828125,
"rewards/margins": 52.5,
"rewards/rejected": -47.75,
"step": 542
},
{
"epoch": 1.8113522537562603,
"grad_norm": 3.541090380990153e-12,
"learning_rate": 5.5816390410982e-05,
"logits/chosen": -4.21875,
"logits/rejected": -2.5,
"logps/chosen": -96.0,
"logps/rejected": -802.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.21875,
"rewards/margins": 58.5,
"rewards/rejected": -52.375,
"step": 543
},
{
"epoch": 1.8146911519198663,
"grad_norm": 2.76508788088492e-10,
"learning_rate": 5.5547363090130596e-05,
"logits/chosen": -4.0234375,
"logits/rejected": -2.4609375,
"logps/chosen": -98.75,
"logps/rejected": -828.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.28125,
"rewards/margins": 58.25,
"rewards/rejected": -53.0,
"step": 544
},
{
"epoch": 1.8180300500834723,
"grad_norm": 1.0243940096188453e-06,
"learning_rate": 5.5278640450004216e-05,
"logits/chosen": -4.046875,
"logits/rejected": -2.453125,
"logps/chosen": -108.0,
"logps/rejected": -680.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.765625,
"rewards/margins": 50.25,
"rewards/rejected": -44.375,
"step": 545
},
{
"epoch": 1.8213689482470785,
"grad_norm": 1.3393444575626745e-08,
"learning_rate": 5.501022583889647e-05,
"logits/chosen": -4.171875,
"logits/rejected": -2.3359375,
"logps/chosen": -109.25,
"logps/rejected": -684.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.8125,
"rewards/margins": 50.625,
"rewards/rejected": -44.75,
"step": 546
},
{
"epoch": 1.8247078464106845,
"grad_norm": 1.8362791820436541e-07,
"learning_rate": 5.474212260126299e-05,
"logits/chosen": -4.46875,
"logits/rejected": -2.484375,
"logps/chosen": -86.5,
"logps/rejected": -684.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.59375,
"rewards/margins": 50.25,
"rewards/rejected": -44.75,
"step": 547
},
{
"epoch": 1.8280467445742905,
"grad_norm": 3.473892036254256e-07,
"learning_rate": 5.4474334077679604e-05,
"logits/chosen": -4.453125,
"logits/rejected": -2.40625,
"logps/chosen": -88.0,
"logps/rejected": -816.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.703125,
"rewards/margins": 58.5,
"rewards/rejected": -52.875,
"step": 548
},
{
"epoch": 1.8313856427378965,
"grad_norm": 1.5503935912875022e-08,
"learning_rate": 5.4206863604800853e-05,
"logits/chosen": -4.484375,
"logits/rejected": -2.375,
"logps/chosen": -89.0,
"logps/rejected": -662.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0625,
"rewards/margins": 50.125,
"rewards/rejected": -44.0,
"step": 549
},
{
"epoch": 1.8347245409015025,
"grad_norm": 3.2570995900371713e-10,
"learning_rate": 5.393971451531833e-05,
"logits/chosen": -4.578125,
"logits/rejected": -2.4375,
"logps/chosen": -81.75,
"logps/rejected": -710.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.84375,
"rewards/margins": 53.5,
"rewards/rejected": -47.625,
"step": 550
},
{
"epoch": 1.8380634390651085,
"grad_norm": 3.935995973591844e-09,
"learning_rate": 5.36728901379192e-05,
"logits/chosen": -4.328125,
"logits/rejected": -2.3515625,
"logps/chosen": -98.5,
"logps/rejected": -694.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.71875,
"rewards/margins": 52.5,
"rewards/rejected": -46.625,
"step": 551
},
{
"epoch": 1.8414023372287145,
"grad_norm": 1.3496583850525212e-09,
"learning_rate": 5.34063937972447e-05,
"logits/chosen": -4.0390625,
"logits/rejected": -2.3515625,
"logps/chosen": -109.75,
"logps/rejected": -802.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578125,
"rewards/margins": 58.875,
"rewards/rejected": -53.25,
"step": 552
},
{
"epoch": 1.8447412353923205,
"grad_norm": 2.9108708776703907e-09,
"learning_rate": 5.3140228813848656e-05,
"logits/chosen": -3.796875,
"logits/rejected": -2.40625,
"logps/chosen": -103.25,
"logps/rejected": -772.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.859375,
"rewards/margins": 57.0,
"rewards/rejected": -51.125,
"step": 553
},
{
"epoch": 1.8480801335559267,
"grad_norm": 3.852580630336888e-05,
"learning_rate": 5.287439850415627e-05,
"logits/chosen": -4.421875,
"logits/rejected": -2.4765625,
"logps/chosen": -91.5,
"logps/rejected": -692.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.65625,
"rewards/margins": 50.375,
"rewards/rejected": -44.75,
"step": 554
},
{
"epoch": 1.8514190317195327,
"grad_norm": 7.022646464349691e-09,
"learning_rate": 5.260890618042261e-05,
"logits/chosen": -4.671875,
"logits/rejected": -2.484375,
"logps/chosen": -78.125,
"logps/rejected": -714.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.71875,
"rewards/margins": 54.0,
"rewards/rejected": -48.125,
"step": 555
},
{
"epoch": 1.8547579298831387,
"grad_norm": 9.849737345191123e-12,
"learning_rate": 5.234375515069149e-05,
"logits/chosen": -4.25,
"logits/rejected": -2.4140625,
"logps/chosen": -76.0,
"logps/rejected": -690.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734375,
"rewards/margins": 52.625,
"rewards/rejected": -46.75,
"step": 556
},
{
"epoch": 1.8580968280467447,
"grad_norm": 7.22921613487415e-05,
"learning_rate": 5.207894871875419e-05,
"logits/chosen": -4.328125,
"logits/rejected": -2.4921875,
"logps/chosen": -90.5,
"logps/rejected": -716.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.765625,
"rewards/margins": 52.5,
"rewards/rejected": -46.75,
"step": 557
},
{
"epoch": 1.8614357262103507,
"grad_norm": 5.320074936143726e-10,
"learning_rate": 5.1814490184108204e-05,
"logits/chosen": -3.96875,
"logits/rejected": -2.3828125,
"logps/chosen": -94.25,
"logps/rejected": -784.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.734375,
"rewards/margins": 57.0,
"rewards/rejected": -52.375,
"step": 558
},
{
"epoch": 1.8647746243739567,
"grad_norm": 1.9208787449542797e-08,
"learning_rate": 5.155038284191632e-05,
"logits/chosen": -4.671875,
"logits/rejected": -2.4765625,
"logps/chosen": -84.0,
"logps/rejected": -688.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.015625,
"rewards/margins": 52.75,
"rewards/rejected": -46.75,
"step": 559
},
{
"epoch": 1.8681135225375627,
"grad_norm": 1.4424077487973364e-09,
"learning_rate": 5.1286629982965375e-05,
"logits/chosen": -4.6875,
"logits/rejected": -2.4140625,
"logps/chosen": -95.5,
"logps/rejected": -722.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.421875,
"rewards/margins": 52.625,
"rewards/rejected": -47.375,
"step": 560
},
{
"epoch": 1.8714524207011687,
"grad_norm": 2.7382171530199173e-10,
"learning_rate": 5.102323489362542e-05,
"logits/chosen": -4.484375,
"logits/rejected": -2.484375,
"logps/chosen": -78.5,
"logps/rejected": -700.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.328125,
"rewards/margins": 52.375,
"rewards/rejected": -47.125,
"step": 561
},
{
"epoch": 1.8747913188647747,
"grad_norm": 3.5231098594046273e-10,
"learning_rate": 5.076020085580856e-05,
"logits/chosen": -4.625,
"logits/rejected": -2.2734375,
"logps/chosen": -75.25,
"logps/rejected": -728.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03125,
"rewards/margins": 53.5,
"rewards/rejected": -47.375,
"step": 562
},
{
"epoch": 1.8781302170283807,
"grad_norm": 3.248313618087195e-08,
"learning_rate": 5.049753114692829e-05,
"logits/chosen": -4.46875,
"logits/rejected": -2.2890625,
"logps/chosen": -75.75,
"logps/rejected": -694.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578125,
"rewards/margins": 52.125,
"rewards/rejected": -46.5,
"step": 563
},
{
"epoch": 1.8814691151919867,
"grad_norm": 4.1881906831520155e-09,
"learning_rate": 5.023522903985853e-05,
"logits/chosen": -4.203125,
"logits/rejected": -2.484375,
"logps/chosen": -80.75,
"logps/rejected": -768.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.421875,
"rewards/margins": 55.375,
"rewards/rejected": -49.875,
"step": 564
},
{
"epoch": 1.8848080133555927,
"grad_norm": 1.2319371300861803e-08,
"learning_rate": 4.9973297802892824e-05,
"logits/chosen": -4.625,
"logits/rejected": -2.4609375,
"logps/chosen": -77.25,
"logps/rejected": -694.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.328125,
"rewards/margins": 51.5,
"rewards/rejected": -46.25,
"step": 565
},
{
"epoch": 1.8881469115191987,
"grad_norm": 1.6527255258802498e-11,
"learning_rate": 4.971174069970375e-05,
"logits/chosen": -4.453125,
"logits/rejected": -2.40625,
"logps/chosen": -76.25,
"logps/rejected": -786.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.25,
"rewards/margins": 55.5,
"rewards/rejected": -50.25,
"step": 566
},
{
"epoch": 1.8914858096828047,
"grad_norm": 1.6659939205965202e-08,
"learning_rate": 4.945056098930204e-05,
"logits/chosen": -4.40625,
"logits/rejected": -2.375,
"logps/chosen": -86.25,
"logps/rejected": -710.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.90625,
"rewards/margins": 52.0,
"rewards/rejected": -47.125,
"step": 567
},
{
"epoch": 1.8948247078464107,
"grad_norm": 4.976216327889915e-09,
"learning_rate": 4.9189761925996226e-05,
"logits/chosen": -3.7890625,
"logits/rejected": -2.4453125,
"logps/chosen": -102.25,
"logps/rejected": -716.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.1875,
"rewards/margins": 52.75,
"rewards/rejected": -47.625,
"step": 568
},
{
"epoch": 1.8981636060100167,
"grad_norm": 5.617463716411919e-10,
"learning_rate": 4.8929346759351856e-05,
"logits/chosen": -4.453125,
"logits/rejected": -2.59375,
"logps/chosen": -72.75,
"logps/rejected": -670.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.875,
"rewards/margins": 51.25,
"rewards/rejected": -45.375,
"step": 569
},
{
"epoch": 1.9015025041736227,
"grad_norm": 8.406910168012871e-10,
"learning_rate": 4.8669318734151205e-05,
"logits/chosen": -4.203125,
"logits/rejected": -2.5859375,
"logps/chosen": -106.25,
"logps/rejected": -636.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.984375,
"rewards/margins": 49.125,
"rewards/rejected": -43.0,
"step": 570
},
{
"epoch": 1.9048414023372287,
"grad_norm": 1.8631781983913243e-07,
"learning_rate": 4.840968109035271e-05,
"logits/chosen": -4.59375,
"logits/rejected": -2.53125,
"logps/chosen": -64.625,
"logps/rejected": -740.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.984375,
"rewards/margins": 54.375,
"rewards/rejected": -49.5,
"step": 571
},
{
"epoch": 1.9081803005008346,
"grad_norm": 2.908484009189749e-10,
"learning_rate": 4.8150437063050605e-05,
"logits/chosen": -4.359375,
"logits/rejected": -2.6640625,
"logps/chosen": -81.5,
"logps/rejected": -746.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.25,
"rewards/margins": 55.875,
"rewards/rejected": -50.625,
"step": 572
},
{
"epoch": 1.9115191986644406,
"grad_norm": 1.1384876188458293e-06,
"learning_rate": 4.7891589882434714e-05,
"logits/chosen": -4.3125,
"logits/rejected": -2.4765625,
"logps/chosen": -72.625,
"logps/rejected": -748.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.78125,
"rewards/margins": 55.875,
"rewards/rejected": -51.0,
"step": 573
},
{
"epoch": 1.9148580968280466,
"grad_norm": 1.7041387398442076e-11,
"learning_rate": 4.763314277375008e-05,
"logits/chosen": -3.7578125,
"logits/rejected": -2.4609375,
"logps/chosen": -98.25,
"logps/rejected": -724.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.21875,
"rewards/margins": 54.0,
"rewards/rejected": -47.75,
"step": 574
},
{
"epoch": 1.9181969949916526,
"grad_norm": 0.0002684830396901816,
"learning_rate": 4.7375098957256905e-05,
"logits/chosen": -3.9609375,
"logits/rejected": -2.265625,
"logps/chosen": -120.0,
"logps/rejected": -628.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.53125,
"rewards/margins": 46.625,
"rewards/rejected": -41.125,
"step": 575
},
{
"epoch": 1.9215358931552586,
"grad_norm": 1.264372144760273e-07,
"learning_rate": 4.711746164819026e-05,
"logits/chosen": -4.0234375,
"logits/rejected": -2.34375,
"logps/chosen": -105.0,
"logps/rejected": -732.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.828125,
"rewards/margins": 53.25,
"rewards/rejected": -47.375,
"step": 576
},
{
"epoch": 1.9248747913188646,
"grad_norm": 0.00022175066987983882,
"learning_rate": 4.6860234056720215e-05,
"logits/chosen": -4.4296875,
"logits/rejected": -2.40625,
"logps/chosen": -81.0,
"logps/rejected": -684.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.125,
"rewards/margins": 51.0,
"rewards/rejected": -45.0,
"step": 577
},
{
"epoch": 1.9282136894824706,
"grad_norm": 1.208756259529764e-07,
"learning_rate": 4.6603419387911695e-05,
"logits/chosen": -4.3125,
"logits/rejected": -2.4921875,
"logps/chosen": -77.75,
"logps/rejected": -732.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.390625,
"rewards/margins": 53.375,
"rewards/rejected": -47.875,
"step": 578
},
{
"epoch": 1.9315525876460768,
"grad_norm": 5.137899550256009e-10,
"learning_rate": 4.63470208416846e-05,
"logits/chosen": -4.359375,
"logits/rejected": -2.3984375,
"logps/chosen": -86.0,
"logps/rejected": -726.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9375,
"rewards/margins": 54.25,
"rewards/rejected": -48.375,
"step": 579
},
{
"epoch": 1.9348914858096828,
"grad_norm": 8.149926067346769e-09,
"learning_rate": 4.609104161277392e-05,
"logits/chosen": -4.015625,
"logits/rejected": -2.484375,
"logps/chosen": -103.75,
"logps/rejected": -732.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.671875,
"rewards/margins": 54.0,
"rewards/rejected": -48.375,
"step": 580
},
{
"epoch": 1.9382303839732888,
"grad_norm": 1.3904052131863409e-08,
"learning_rate": 4.5835484890689914e-05,
"logits/chosen": -4.046875,
"logits/rejected": -2.3125,
"logps/chosen": -115.0,
"logps/rejected": -730.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.203125,
"rewards/margins": 53.75,
"rewards/rejected": -47.5,
"step": 581
},
{
"epoch": 1.9415692821368948,
"grad_norm": 8.70739924607733e-08,
"learning_rate": 4.558035385967853e-05,
"logits/chosen": -4.5,
"logits/rejected": -2.4609375,
"logps/chosen": -84.75,
"logps/rejected": -720.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.390625,
"rewards/margins": 53.625,
"rewards/rejected": -48.25,
"step": 582
},
{
"epoch": 1.9449081803005008,
"grad_norm": 0.8842443227767944,
"learning_rate": 4.532565169868134e-05,
"logits/chosen": -4.375,
"logits/rejected": -2.1484375,
"logps/chosen": -95.25,
"logps/rejected": -716.0,
"loss": 0.2373,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.78125,
"rewards/margins": 50.625,
"rewards/rejected": -44.875,
"step": 583
},
{
"epoch": 1.9482470784641068,
"grad_norm": 9.874440820567543e-09,
"learning_rate": 4.507138158129642e-05,
"logits/chosen": -4.421875,
"logits/rejected": -2.3125,
"logps/chosen": -84.0,
"logps/rejected": -822.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.75,
"rewards/margins": 58.5,
"rewards/rejected": -52.75,
"step": 584
},
{
"epoch": 1.9515859766277128,
"grad_norm": 1.259659737629093e-11,
"learning_rate": 4.481754667573846e-05,
"logits/chosen": -4.5625,
"logits/rejected": -2.4921875,
"logps/chosen": -80.75,
"logps/rejected": -732.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0,
"rewards/margins": 54.375,
"rewards/rejected": -48.25,
"step": 585
},
{
"epoch": 1.9549248747913188,
"grad_norm": 7.668348178668793e-10,
"learning_rate": 4.4564150144799346e-05,
"logits/chosen": -4.140625,
"logits/rejected": -2.3984375,
"logps/chosen": -93.75,
"logps/rejected": -740.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.8125,
"rewards/margins": 55.25,
"rewards/rejected": -49.375,
"step": 586
},
{
"epoch": 1.9582637729549248,
"grad_norm": 1.0073026857071454e-08,
"learning_rate": 4.431119514580897e-05,
"logits/chosen": -4.546875,
"logits/rejected": -2.6328125,
"logps/chosen": -80.0,
"logps/rejected": -708.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.671875,
"rewards/margins": 53.125,
"rewards/rejected": -47.375,
"step": 587
},
{
"epoch": 1.961602671118531,
"grad_norm": 4.297049116530616e-08,
"learning_rate": 4.405868483059548e-05,
"logits/chosen": -3.6171875,
"logits/rejected": -2.4296875,
"logps/chosen": -112.5,
"logps/rejected": -678.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.875,
"rewards/margins": 50.125,
"rewards/rejected": -45.25,
"step": 588
},
{
"epoch": 1.964941569282137,
"grad_norm": 1.041563432724324e-10,
"learning_rate": 4.3806622345446465e-05,
"logits/chosen": -4.109375,
"logits/rejected": -2.453125,
"logps/chosen": -109.0,
"logps/rejected": -664.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.09375,
"rewards/margins": 49.875,
"rewards/rejected": -43.75,
"step": 589
},
{
"epoch": 1.968280467445743,
"grad_norm": 3.955933670918288e-12,
"learning_rate": 4.3555010831069425e-05,
"logits/chosen": -4.578125,
"logits/rejected": -2.578125,
"logps/chosen": -89.0,
"logps/rejected": -658.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.984375,
"rewards/margins": 51.75,
"rewards/rejected": -45.875,
"step": 590
},
{
"epoch": 1.971619365609349,
"grad_norm": 2.0798443074454553e-06,
"learning_rate": 4.330385342255275e-05,
"logits/chosen": -3.9921875,
"logits/rejected": -2.3515625,
"logps/chosen": -97.0,
"logps/rejected": -726.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.265625,
"rewards/margins": 51.875,
"rewards/rejected": -47.625,
"step": 591
},
{
"epoch": 1.974958263772955,
"grad_norm": 1.0061744770695213e-09,
"learning_rate": 4.305315324932675e-05,
"logits/chosen": -4.46875,
"logits/rejected": -2.4375,
"logps/chosen": -108.25,
"logps/rejected": -688.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.15625,
"rewards/margins": 51.125,
"rewards/rejected": -45.875,
"step": 592
},
{
"epoch": 1.978297161936561,
"grad_norm": 1.4001835246801875e-09,
"learning_rate": 4.280291343512439e-05,
"logits/chosen": -4.203125,
"logits/rejected": -2.40625,
"logps/chosen": -94.75,
"logps/rejected": -726.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.296875,
"rewards/margins": 53.875,
"rewards/rejected": -48.5,
"step": 593
},
{
"epoch": 1.981636060100167,
"grad_norm": 7.307075083895498e-11,
"learning_rate": 4.255313709794271e-05,
"logits/chosen": -4.28125,
"logits/rejected": -2.65625,
"logps/chosen": -98.0,
"logps/rejected": -684.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.4375,
"rewards/margins": 51.5,
"rewards/rejected": -46.125,
"step": 594
},
{
"epoch": 1.984974958263773,
"grad_norm": 2.983379931986718e-10,
"learning_rate": 4.230382735000376e-05,
"logits/chosen": -4.40625,
"logits/rejected": -2.5,
"logps/chosen": -79.25,
"logps/rejected": -700.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.796875,
"rewards/margins": 52.375,
"rewards/rejected": -46.5,
"step": 595
},
{
"epoch": 1.988313856427379,
"grad_norm": 7.869925156001045e-09,
"learning_rate": 4.2054987297715805e-05,
"logits/chosen": -4.28125,
"logits/rejected": -2.20703125,
"logps/chosen": -92.0,
"logps/rejected": -730.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.515625,
"rewards/margins": 52.75,
"rewards/rejected": -47.25,
"step": 596
},
{
"epoch": 1.991652754590985,
"grad_norm": 5.2440864983793745e-09,
"learning_rate": 4.180662004163484e-05,
"logits/chosen": -4.765625,
"logits/rejected": -2.578125,
"logps/chosen": -77.25,
"logps/rejected": -690.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.65625,
"rewards/margins": 52.5,
"rewards/rejected": -46.875,
"step": 597
},
{
"epoch": 1.994991652754591,
"grad_norm": 2.159956588587697e-11,
"learning_rate": 4.1558728676425566e-05,
"logits/chosen": -4.609375,
"logits/rejected": -2.578125,
"logps/chosen": -72.25,
"logps/rejected": -674.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.46875,
"rewards/margins": 50.875,
"rewards/rejected": -45.25,
"step": 598
},
{
"epoch": 1.998330550918197,
"grad_norm": 9.040629911361009e-10,
"learning_rate": 4.131131629082335e-05,
"logits/chosen": -4.3203125,
"logits/rejected": -2.5703125,
"logps/chosen": -101.5,
"logps/rejected": -704.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.421875,
"rewards/margins": 52.625,
"rewards/rejected": -47.25,
"step": 599
},
{
"epoch": 2.0,
"grad_norm": 6.309617674560286e-06,
"learning_rate": 4.106438596759518e-05,
"logits/chosen": -3.515625,
"logits/rejected": -2.25,
"logps/chosen": -97.5,
"logps/rejected": -792.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.9375,
"rewards/margins": 60.25,
"rewards/rejected": -55.25,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 900,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}