s300-1.3.0L-GRPO-it2 / trainer_state.json
PocketDoc's picture
Upload folder using huggingface_hub
fcb0207 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0543293718166384,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 313.2109375,
"epoch": 0.006791171477079796,
"grad_norm": 1.1658549601952852,
"kl": 0.0003286600112915039,
"learning_rate": 0.0,
"loss": -0.0057,
"reward": 0.4946059621870518,
"reward_std": 0.3181956857442856,
"rewards/preference_model_reward": 0.4946059621870518,
"rewards/preference_model_reward/std": 0.3181956773623824,
"step": 1
},
{
"clip_ratio": 0.0,
"epoch": 0.013582342954159592,
"grad_norm": 1.1661357306239586,
"kl": 0.0003286600112915039,
"learning_rate": 8e-08,
"loss": -0.0057,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 379.8359375,
"epoch": 0.02037351443123939,
"grad_norm": 0.9395065263567765,
"kl": 0.00034868717193603516,
"learning_rate": 1.6e-07,
"loss": 0.0023,
"reward": 0.4930149093270302,
"reward_std": 0.24642075644806027,
"rewards/preference_model_reward": 0.4930149093270302,
"rewards/preference_model_reward/std": 0.24642075458541512,
"step": 3
},
{
"clip_ratio": 0.0005332227265171241,
"epoch": 0.027164685908319185,
"grad_norm": 0.9559331452106371,
"kl": 0.00041222572326660156,
"learning_rate": 2.4e-07,
"loss": 0.0023,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 345.19921875,
"epoch": 0.03395585738539898,
"grad_norm": 1.0659097324920952,
"kl": 0.000335693359375,
"learning_rate": 3.2e-07,
"loss": -0.0052,
"reward": 0.5752501655369997,
"reward_std": 0.2759046368300915,
"rewards/preference_model_reward": 0.5752501655369997,
"rewards/preference_model_reward/std": 0.2759046256542206,
"step": 5
},
{
"clip_ratio": 0.00037182615051278844,
"epoch": 0.04074702886247878,
"grad_norm": 1.0694980180681193,
"kl": 0.0003333091735839844,
"learning_rate": 4e-07,
"loss": -0.0052,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 348.94921875,
"epoch": 0.04753820033955857,
"grad_norm": 1.0374782935041194,
"kl": 0.0003757476806640625,
"learning_rate": 4.8e-07,
"loss": 0.0011,
"reward": 0.5447775591164827,
"reward_std": 0.25542482268065214,
"rewards/preference_model_reward": 0.5447775591164827,
"rewards/preference_model_reward/std": 0.25542481429874897,
"step": 7
},
{
"clip_ratio": 0.00040936221739684697,
"epoch": 0.05432937181663837,
"grad_norm": 1.094306972800085,
"kl": 0.00039839744567871094,
"learning_rate": 5.6e-07,
"loss": 0.0011,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 305.2890625,
"epoch": 0.06112054329371817,
"grad_norm": 1.0830810632499763,
"kl": 0.00036144256591796875,
"learning_rate": 6.4e-07,
"loss": -0.0091,
"reward": 0.44168128073215485,
"reward_std": 0.29115105979144573,
"rewards/preference_model_reward": 0.44168128073215485,
"rewards/preference_model_reward/std": 0.29115105606615543,
"step": 9
},
{
"clip_ratio": 0.00021800790273118764,
"epoch": 0.06791171477079797,
"grad_norm": 1.0686900063629763,
"kl": 0.00036394596099853516,
"learning_rate": 7.2e-07,
"loss": -0.0091,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 393.38671875,
"epoch": 0.07470288624787776,
"grad_norm": 0.9072962579375315,
"kl": 0.00039005279541015625,
"learning_rate": 8e-07,
"loss": -0.0006,
"reward": 0.3613455481827259,
"reward_std": 0.25880729779601097,
"rewards/preference_model_reward": 0.3613455481827259,
"rewards/preference_model_reward/std": 0.2588073033839464,
"step": 11
},
{
"clip_ratio": 0.0003210598952136934,
"epoch": 0.08149405772495756,
"grad_norm": 0.9072123086741707,
"kl": 0.0003895759582519531,
"learning_rate": 8.799999999999999e-07,
"loss": -0.0006,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 367.1328125,
"epoch": 0.08828522920203735,
"grad_norm": 1.031552247551817,
"kl": 0.0003743171691894531,
"learning_rate": 9.6e-07,
"loss": 0.0019,
"reward": 0.48316149413585663,
"reward_std": 0.2832601722329855,
"rewards/preference_model_reward": 0.48316149413585663,
"rewards/preference_model_reward/std": 0.2832601750269532,
"step": 13
},
{
"clip_ratio": 0.00029072189227008494,
"epoch": 0.09507640067911714,
"grad_norm": 1.020095801170388,
"kl": 0.00040650367736816406,
"learning_rate": 1.04e-06,
"loss": 0.0018,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 361.61328125,
"epoch": 0.10186757215619695,
"grad_norm": 0.9054135077436368,
"kl": 0.000415802001953125,
"learning_rate": 1.12e-06,
"loss": -0.0095,
"reward": 0.6715657562017441,
"reward_std": 0.26127034425735474,
"rewards/preference_model_reward": 0.6715657562017441,
"rewards/preference_model_reward/std": 0.2612703386694193,
"step": 15
},
{
"clip_ratio": 0.00016775515268818708,
"epoch": 0.10865874363327674,
"grad_norm": 0.9126437934856992,
"kl": 0.00041675567626953125,
"learning_rate": 1.2e-06,
"loss": -0.0095,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 269.35546875,
"epoch": 0.11544991511035653,
"grad_norm": 1.3411908637656689,
"kl": 0.0004000663757324219,
"learning_rate": 1.28e-06,
"loss": -0.0131,
"reward": 0.5323104355484247,
"reward_std": 0.2701443340629339,
"rewards/preference_model_reward": 0.5323104355484247,
"rewards/preference_model_reward/std": 0.2701443322002888,
"step": 17
},
{
"clip_ratio": 0.0002587076596682891,
"epoch": 0.12224108658743633,
"grad_norm": 1.161588313842898,
"kl": 0.0004303455352783203,
"learning_rate": 1.3600000000000001e-06,
"loss": -0.013,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 310.2265625,
"epoch": 0.12903225806451613,
"grad_norm": 1.0392498163961779,
"kl": 0.0005252361297607422,
"learning_rate": 1.44e-06,
"loss": -0.0118,
"reward": 0.5056532379239798,
"reward_std": 0.24357289634644985,
"rewards/preference_model_reward": 0.5056532379239798,
"rewards/preference_model_reward/std": 0.24357289355248213,
"step": 19
},
{
"clip_ratio": 0.0002536060192142031,
"epoch": 0.13582342954159593,
"grad_norm": 1.0214823504151023,
"kl": 0.0005702972412109375,
"learning_rate": 1.5199999999999998e-06,
"loss": -0.0119,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 437.74609375,
"epoch": 0.14261460101867574,
"grad_norm": 0.9892882217675961,
"kl": 0.0005764961242675781,
"learning_rate": 1.6e-06,
"loss": 0.0017,
"reward": 0.6471737138926983,
"reward_std": 0.28144341707229614,
"rewards/preference_model_reward": 0.6471737138926983,
"rewards/preference_model_reward/std": 0.2814434114843607,
"step": 21
},
{
"clip_ratio": 0.00024805667453620117,
"epoch": 0.1494057724957555,
"grad_norm": 0.8611996612129298,
"kl": 0.0006487369537353516,
"learning_rate": 1.6799999999999998e-06,
"loss": 0.0016,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 288.34375,
"epoch": 0.15619694397283532,
"grad_norm": 1.5143681967656712,
"kl": 0.0007748603820800781,
"learning_rate": 1.7599999999999999e-06,
"loss": -0.0053,
"reward": 0.735472559928894,
"reward_std": 0.3065086118876934,
"rewards/preference_model_reward": 0.735472559928894,
"rewards/preference_model_reward/std": 0.3065086044371128,
"step": 23
},
{
"clip_ratio": 0.00037343262283684453,
"epoch": 0.16298811544991512,
"grad_norm": 1.1570786184830504,
"kl": 0.0008769035339355469,
"learning_rate": 1.84e-06,
"loss": -0.0053,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 381.19140625,
"epoch": 0.1697792869269949,
"grad_norm": 0.8177439257742645,
"kl": 0.0009984970092773438,
"learning_rate": 1.92e-06,
"loss": -0.0008,
"reward": 0.6719343699514866,
"reward_std": 0.2588585498742759,
"rewards/preference_model_reward": 0.6719343699514866,
"rewards/preference_model_reward/std": 0.2588585487101227,
"step": 25
},
{
"clip_ratio": 0.0002841594987330609,
"epoch": 0.1765704584040747,
"grad_norm": 0.8284952878885445,
"kl": 0.0011463165283203125,
"learning_rate": 2e-06,
"loss": -0.0009,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 293.75390625,
"epoch": 0.1833616298811545,
"grad_norm": 1.0612960800439712,
"kl": 0.001338958740234375,
"learning_rate": 2e-06,
"loss": 0.0062,
"reward": 0.5385704413056374,
"reward_std": 0.2679297383874655,
"rewards/preference_model_reward": 0.5385704413056374,
"rewards/preference_model_reward/std": 0.2679297402501106,
"step": 27
},
{
"clip_ratio": 0.0003048431572096888,
"epoch": 0.19015280135823429,
"grad_norm": 1.0535161088618343,
"kl": 0.0014734268188476562,
"learning_rate": 2e-06,
"loss": 0.0061,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 406.28125,
"epoch": 0.1969439728353141,
"grad_norm": 0.9688907390375263,
"kl": 0.0017023086547851562,
"learning_rate": 2e-06,
"loss": -0.0025,
"reward": 0.5423681996762753,
"reward_std": 0.26603892212733626,
"rewards/preference_model_reward": 0.5423681996762753,
"rewards/preference_model_reward/std": 0.26603891397826374,
"step": 29
},
{
"clip_ratio": 0.0004548036777123343,
"epoch": 0.2037351443123939,
"grad_norm": 1.056840409643398,
"kl": 0.0018987655639648438,
"learning_rate": 2e-06,
"loss": -0.0026,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 441.890625,
"epoch": 0.21052631578947367,
"grad_norm": 0.8637647596491498,
"kl": 0.0018672943115234375,
"learning_rate": 2e-06,
"loss": -0.0001,
"reward": 0.6307996772229671,
"reward_std": 0.24354635225608945,
"rewards/preference_model_reward": 0.6307996772229671,
"rewards/preference_model_reward/std": 0.24354635691270232,
"step": 31
},
{
"clip_ratio": 0.00037552460526057985,
"epoch": 0.21731748726655348,
"grad_norm": 0.8722655459487942,
"kl": 0.0019969940185546875,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 305.55859375,
"epoch": 0.22410865874363328,
"grad_norm": 1.1940804556285844,
"kl": 0.002285003662109375,
"learning_rate": 2e-06,
"loss": 0.0097,
"reward": 0.5836777277290821,
"reward_std": 0.30259183794260025,
"rewards/preference_model_reward": 0.5836777277290821,
"rewards/preference_model_reward/std": 0.30259183421730995,
"step": 33
},
{
"clip_ratio": 0.00024095292246784084,
"epoch": 0.23089983022071306,
"grad_norm": 1.3077624309877895,
"kl": 0.00241851806640625,
"learning_rate": 2e-06,
"loss": 0.0097,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 313.63671875,
"epoch": 0.23769100169779286,
"grad_norm": 1.0665132545980283,
"kl": 0.0022363662719726562,
"learning_rate": 2e-06,
"loss": -0.0159,
"reward": 0.594868753105402,
"reward_std": 0.2588401613757014,
"rewards/preference_model_reward": 0.594868753105402,
"rewards/preference_model_reward/std": 0.25884015765041113,
"step": 35
},
{
"clip_ratio": 0.0003941934592148755,
"epoch": 0.24448217317487267,
"grad_norm": 1.076309978844894,
"kl": 0.00234222412109375,
"learning_rate": 2e-06,
"loss": -0.016,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 396.75,
"epoch": 0.25127334465195245,
"grad_norm": 1.0267247550638525,
"kl": 0.002719879150390625,
"learning_rate": 2e-06,
"loss": -0.0067,
"reward": 0.5739418976008892,
"reward_std": 0.2615406233817339,
"rewards/preference_model_reward": 0.5739418976008892,
"rewards/preference_model_reward/std": 0.26154061406850815,
"step": 37
},
{
"clip_ratio": 0.00040432674541079905,
"epoch": 0.25806451612903225,
"grad_norm": 0.9172779196341795,
"kl": 0.0029506683349609375,
"learning_rate": 2e-06,
"loss": -0.0068,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 327.375,
"epoch": 0.26485568760611206,
"grad_norm": 1.086086180237484,
"kl": 0.00321197509765625,
"learning_rate": 2e-06,
"loss": -0.0043,
"reward": 0.6816785149276257,
"reward_std": 0.2440826129168272,
"rewards/preference_model_reward": 0.6816785149276257,
"rewards/preference_model_reward/std": 0.24408261477947235,
"step": 39
},
{
"clip_ratio": 0.00011399965751479613,
"epoch": 0.27164685908319186,
"grad_norm": 1.1219880555909083,
"kl": 0.0034637451171875,
"learning_rate": 2e-06,
"loss": -0.0044,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 399.7109375,
"epoch": 0.27843803056027167,
"grad_norm": 0.959926063361782,
"kl": 0.00341796875,
"learning_rate": 2e-06,
"loss": 0.0028,
"reward": 0.5822978504002094,
"reward_std": 0.2836025133728981,
"rewards/preference_model_reward": 0.5822978504002094,
"rewards/preference_model_reward/std": 0.28360251151025295,
"step": 41
},
{
"clip_ratio": 0.0002537540549383266,
"epoch": 0.28522920203735147,
"grad_norm": 0.9737278797687372,
"kl": 0.0036487579345703125,
"learning_rate": 2e-06,
"loss": 0.0027,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 466.25390625,
"epoch": 0.2920203735144312,
"grad_norm": 0.8110847666278888,
"kl": 0.0040760040283203125,
"learning_rate": 2e-06,
"loss": 0.0015,
"reward": 0.7039333023130894,
"reward_std": 0.24644886306487024,
"rewards/preference_model_reward": 0.7039333023130894,
"rewards/preference_model_reward/std": 0.2464488591067493,
"step": 43
},
{
"clip_ratio": 0.00036502836792351445,
"epoch": 0.298811544991511,
"grad_norm": 0.8251482957928842,
"kl": 0.00433349609375,
"learning_rate": 2e-06,
"loss": 0.0015,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 326.37890625,
"epoch": 0.30560271646859083,
"grad_norm": 0.9093484912696803,
"kl": 0.004657745361328125,
"learning_rate": 2e-06,
"loss": -0.0035,
"reward": 0.5050893509760499,
"reward_std": 0.23734514694660902,
"rewards/preference_model_reward": 0.5050893509760499,
"rewards/preference_model_reward/std": 0.23734513530507684,
"step": 45
},
{
"clip_ratio": 0.00032999391623889096,
"epoch": 0.31239388794567063,
"grad_norm": 0.8994019896097728,
"kl": 0.0048351287841796875,
"learning_rate": 2e-06,
"loss": -0.0034,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 378.30859375,
"epoch": 0.31918505942275044,
"grad_norm": 1.0879235017943376,
"kl": 0.004894256591796875,
"learning_rate": 2e-06,
"loss": -0.0055,
"reward": 0.7660590410232544,
"reward_std": 0.24720557313412428,
"rewards/preference_model_reward": 0.7660590410232544,
"rewards/preference_model_reward/std": 0.24720556661486626,
"step": 47
},
{
"clip_ratio": 0.00039132226856963825,
"epoch": 0.32597623089983024,
"grad_norm": 0.9067283296578724,
"kl": 0.00513458251953125,
"learning_rate": 2e-06,
"loss": -0.0055,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 309.609375,
"epoch": 0.33276740237691,
"grad_norm": 1.417503400810007,
"kl": 0.005855560302734375,
"learning_rate": 2e-06,
"loss": -0.0132,
"reward": 0.6303279139101505,
"reward_std": 0.29637874104082584,
"rewards/preference_model_reward": 0.6303279139101505,
"rewards/preference_model_reward/std": 0.29637873359024525,
"step": 49
},
{
"clip_ratio": 0.00033494585659354925,
"epoch": 0.3395585738539898,
"grad_norm": 1.2450962191215096,
"kl": 0.006114959716796875,
"learning_rate": 2e-06,
"loss": -0.0133,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 333.18359375,
"epoch": 0.3463497453310696,
"grad_norm": 1.044571491400686,
"kl": 0.0055866241455078125,
"learning_rate": 2e-06,
"loss": -0.0093,
"reward": 0.4536947198212147,
"reward_std": 0.25838964246213436,
"rewards/preference_model_reward": 0.4536947198212147,
"rewards/preference_model_reward/std": 0.25838964618742466,
"step": 51
},
{
"clip_ratio": 0.00034966163912031334,
"epoch": 0.3531409168081494,
"grad_norm": 1.0319046384759274,
"kl": 0.0059833526611328125,
"learning_rate": 2e-06,
"loss": -0.0094,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 293.0390625,
"epoch": 0.3599320882852292,
"grad_norm": 1.0209389153993411,
"kl": 0.00640106201171875,
"learning_rate": 2e-06,
"loss": -0.0063,
"reward": 0.6109997481107712,
"reward_std": 0.27825887873768806,
"rewards/preference_model_reward": 0.6109997481107712,
"rewards/preference_model_reward/std": 0.2782588703557849,
"step": 53
},
{
"clip_ratio": 0.00033825510763563216,
"epoch": 0.366723259762309,
"grad_norm": 1.1686483370046787,
"kl": 0.0067596435546875,
"learning_rate": 2e-06,
"loss": -0.0064,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 385.97265625,
"epoch": 0.3735144312393888,
"grad_norm": 1.2276766352908126,
"kl": 0.00732421875,
"learning_rate": 2e-06,
"loss": -0.0131,
"reward": 0.5218756012618542,
"reward_std": 0.32545966282486916,
"rewards/preference_model_reward": 0.5218756012618542,
"rewards/preference_model_reward/std": 0.32545965164899826,
"step": 55
},
{
"clip_ratio": 0.00023414421593770385,
"epoch": 0.38030560271646857,
"grad_norm": 1.123144036568887,
"kl": 0.0076751708984375,
"learning_rate": 2e-06,
"loss": -0.0131,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 340.5625,
"epoch": 0.3870967741935484,
"grad_norm": 0.9127683567363678,
"kl": 0.00778961181640625,
"learning_rate": 2e-06,
"loss": 0.0072,
"reward": 0.5140541326254606,
"reward_std": 0.23541682050563395,
"rewards/preference_model_reward": 0.5140541326254606,
"rewards/preference_model_reward/std": 0.23541681352071464,
"step": 57
},
{
"clip_ratio": 0.00020465617490117438,
"epoch": 0.3938879456706282,
"grad_norm": 0.9240907400149151,
"kl": 0.00821685791015625,
"learning_rate": 2e-06,
"loss": 0.0071,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 400.1171875,
"epoch": 0.400679117147708,
"grad_norm": 1.2755019834131804,
"kl": 0.00856781005859375,
"learning_rate": 2e-06,
"loss": -0.0049,
"reward": 0.5248972652480006,
"reward_std": 0.2680952288210392,
"rewards/preference_model_reward": 0.5248972652480006,
"rewards/preference_model_reward/std": 0.26809522369876504,
"step": 59
},
{
"clip_ratio": 0.00044215139951120364,
"epoch": 0.4074702886247878,
"grad_norm": 1.0137811933079786,
"kl": 0.008953094482421875,
"learning_rate": 2e-06,
"loss": -0.005,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 384.32421875,
"epoch": 0.4142614601018676,
"grad_norm": 0.8023274386554214,
"kl": 0.00972747802734375,
"learning_rate": 2e-06,
"loss": -0.007,
"reward": 0.7109678499400616,
"reward_std": 0.22113378904759884,
"rewards/preference_model_reward": 0.7109678499400616,
"rewards/preference_model_reward/std": 0.22113378625363111,
"step": 61
},
{
"clip_ratio": 0.00018141404598281952,
"epoch": 0.42105263157894735,
"grad_norm": 0.8315096106222806,
"kl": 0.01013946533203125,
"learning_rate": 2e-06,
"loss": -0.007,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 364.890625,
"epoch": 0.42784380305602715,
"grad_norm": 0.9668397185089038,
"kl": 0.0107421875,
"learning_rate": 2e-06,
"loss": 0.0031,
"reward": 0.6806612908840179,
"reward_std": 0.2775609251111746,
"rewards/preference_model_reward": 0.6806612908840179,
"rewards/preference_model_reward/std": 0.27756091207265854,
"step": 63
},
{
"clip_ratio": 0.00043790563358925283,
"epoch": 0.43463497453310695,
"grad_norm": 0.9404870267113553,
"kl": 0.01111602783203125,
"learning_rate": 2e-06,
"loss": 0.0031,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 324.453125,
"epoch": 0.44142614601018676,
"grad_norm": 1.0311940848708463,
"kl": 0.009700775146484375,
"learning_rate": 2e-06,
"loss": 0.0037,
"reward": 0.6332272328436375,
"reward_std": 0.25745808193460107,
"rewards/preference_model_reward": 0.6332272328436375,
"rewards/preference_model_reward/std": 0.2574580740183592,
"step": 65
},
{
"clip_ratio": 0.0003352455223648576,
"epoch": 0.44821731748726656,
"grad_norm": 1.041796930728353,
"kl": 0.009906768798828125,
"learning_rate": 2e-06,
"loss": 0.0037,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 382.29296875,
"epoch": 0.45500848896434637,
"grad_norm": 0.9676368782571226,
"kl": 0.01045989990234375,
"learning_rate": 2e-06,
"loss": 0.0018,
"reward": 0.6579710729420185,
"reward_std": 0.2728098388761282,
"rewards/preference_model_reward": 0.6579710729420185,
"rewards/preference_model_reward/std": 0.2728098277002573,
"step": 67
},
{
"clip_ratio": 0.0004586090890370542,
"epoch": 0.4617996604414261,
"grad_norm": 0.9733174758393729,
"kl": 0.01084136962890625,
"learning_rate": 2e-06,
"loss": 0.0018,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 329.83984375,
"epoch": 0.4685908319185059,
"grad_norm": 0.9029956997185686,
"kl": 0.01184844970703125,
"learning_rate": 2e-06,
"loss": -0.0082,
"reward": 0.6309689432382584,
"reward_std": 0.2526115436339751,
"rewards/preference_model_reward": 0.6309689432382584,
"rewards/preference_model_reward/std": 0.2526115436339751,
"step": 69
},
{
"clip_ratio": 0.0002797141951305093,
"epoch": 0.47538200339558573,
"grad_norm": 0.8995188914064342,
"kl": 0.01227569580078125,
"learning_rate": 2e-06,
"loss": -0.0082,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 349.5234375,
"epoch": 0.48217317487266553,
"grad_norm": 1.1446636787137474,
"kl": 0.01305389404296875,
"learning_rate": 2e-06,
"loss": -0.0037,
"reward": 0.5975633077323437,
"reward_std": 0.27538682520389557,
"rewards/preference_model_reward": 0.5975633077323437,
"rewards/preference_model_reward/std": 0.2753868168219924,
"step": 71
},
{
"clip_ratio": 0.00037468447590072174,
"epoch": 0.48896434634974534,
"grad_norm": 1.0052357390576183,
"kl": 0.013519287109375,
"learning_rate": 2e-06,
"loss": -0.0037,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 423.359375,
"epoch": 0.49575551782682514,
"grad_norm": 0.9195418190581706,
"kl": 0.01299285888671875,
"learning_rate": 2e-06,
"loss": -0.0037,
"reward": 0.6704220920801163,
"reward_std": 0.2678078021854162,
"rewards/preference_model_reward": 0.6704220920801163,
"rewards/preference_model_reward/std": 0.26780780404806137,
"step": 73
},
{
"clip_ratio": 0.0002866502591132303,
"epoch": 0.5025466893039049,
"grad_norm": 1.097145506469791,
"kl": 0.01361083984375,
"learning_rate": 2e-06,
"loss": -0.0038,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 396.50390625,
"epoch": 0.5093378607809848,
"grad_norm": 0.9564676492531332,
"kl": 0.01482391357421875,
"learning_rate": 2e-06,
"loss": -0.0056,
"reward": 0.6842997781932354,
"reward_std": 0.25731130968779325,
"rewards/preference_model_reward": 0.6842997781932354,
"rewards/preference_model_reward/std": 0.25731129944324493,
"step": 75
},
{
"clip_ratio": 0.00033780875583033776,
"epoch": 0.5161290322580645,
"grad_norm": 0.9451343780893788,
"kl": 0.0153350830078125,
"learning_rate": 2e-06,
"loss": -0.0056,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 303.171875,
"epoch": 0.5229202037351444,
"grad_norm": 1.028583508211413,
"kl": 0.0159912109375,
"learning_rate": 2e-06,
"loss": -0.005,
"reward": 0.7383619099855423,
"reward_std": 0.24221469252370298,
"rewards/preference_model_reward": 0.7383619099855423,
"rewards/preference_model_reward/std": 0.2422146894969046,
"step": 77
},
{
"clip_ratio": 0.0004982726022717543,
"epoch": 0.5297113752122241,
"grad_norm": 0.9589189463847356,
"kl": 0.01644134521484375,
"learning_rate": 2e-06,
"loss": -0.0051,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 407.79296875,
"epoch": 0.5365025466893039,
"grad_norm": 1.0543595544736126,
"kl": 0.0185699462890625,
"learning_rate": 2e-06,
"loss": -0.0072,
"reward": 0.6588779911398888,
"reward_std": 0.2795031704008579,
"rewards/preference_model_reward": 0.6588779911398888,
"rewards/preference_model_reward/std": 0.2795031573623419,
"step": 79
},
{
"clip_ratio": 0.00030141435672703665,
"epoch": 0.5432937181663837,
"grad_norm": 1.0758354050017793,
"kl": 0.018951416015625,
"learning_rate": 2e-06,
"loss": -0.0073,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 436.18359375,
"epoch": 0.5500848896434635,
"grad_norm": 0.781066685649031,
"kl": 0.0158233642578125,
"learning_rate": 2e-06,
"loss": -0.0022,
"reward": 0.7797287777066231,
"reward_std": 0.2147242482751608,
"rewards/preference_model_reward": 0.7797287777066231,
"rewards/preference_model_reward/std": 0.21472424920648336,
"step": 81
},
{
"clip_ratio": 0.0003111153791905963,
"epoch": 0.5568760611205433,
"grad_norm": 0.7433718148656135,
"kl": 0.0160369873046875,
"learning_rate": 2e-06,
"loss": -0.0023,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 383.03125,
"epoch": 0.5636672325976231,
"grad_norm": 0.8390742957401245,
"kl": 0.01795196533203125,
"learning_rate": 2e-06,
"loss": -0.0066,
"reward": 0.6639863140881062,
"reward_std": 0.2489920537918806,
"rewards/preference_model_reward": 0.6639863140881062,
"rewards/preference_model_reward/std": 0.24899205192923546,
"step": 83
},
{
"clip_ratio": 0.00024405560725426767,
"epoch": 0.5704584040747029,
"grad_norm": 0.8449552595534167,
"kl": 0.0183258056640625,
"learning_rate": 2e-06,
"loss": -0.0066,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 406.01953125,
"epoch": 0.5772495755517827,
"grad_norm": 0.9325132487643661,
"kl": 0.01790618896484375,
"learning_rate": 2e-06,
"loss": -0.0019,
"reward": 0.7170650623738766,
"reward_std": 0.2119898241944611,
"rewards/preference_model_reward": 0.7170650623738766,
"rewards/preference_model_reward/std": 0.21198982140049338,
"step": 85
},
{
"clip_ratio": 0.000315660272462992,
"epoch": 0.5840407470288624,
"grad_norm": 0.8561811844655657,
"kl": 0.0183258056640625,
"learning_rate": 2e-06,
"loss": -0.002,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 373.5390625,
"epoch": 0.5908319185059423,
"grad_norm": 0.9759881466782773,
"kl": 0.01984405517578125,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 0.6445738822221756,
"reward_std": 0.24776791501790285,
"rewards/preference_model_reward": 0.6445738822221756,
"rewards/preference_model_reward/std": 0.2477679206058383,
"step": 87
},
{
"clip_ratio": 0.0004009394979220815,
"epoch": 0.597623089983022,
"grad_norm": 0.9762704806385095,
"kl": 0.0201568603515625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 370.92578125,
"epoch": 0.6044142614601019,
"grad_norm": 1.0730306390874693,
"kl": 0.01903533935546875,
"learning_rate": 2e-06,
"loss": -0.0058,
"reward": 0.7051359005272388,
"reward_std": 0.2701743124052882,
"rewards/preference_model_reward": 0.7051359005272388,
"rewards/preference_model_reward/std": 0.2701743012294173,
"step": 89
},
{
"clip_ratio": 0.0002958584018415422,
"epoch": 0.6112054329371817,
"grad_norm": 11.272175064066806,
"kl": 0.0193634033203125,
"learning_rate": 2e-06,
"loss": -0.0058,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 293.8515625,
"epoch": 0.6179966044142614,
"grad_norm": 1.2532081169597848,
"kl": 0.024749755859375,
"learning_rate": 2e-06,
"loss": -0.0107,
"reward": 0.5385027192533016,
"reward_std": 0.2979753892868757,
"rewards/preference_model_reward": 0.5385027192533016,
"rewards/preference_model_reward/std": 0.2979753725230694,
"step": 91
},
{
"clip_ratio": 0.000490643553348491,
"epoch": 0.6247877758913413,
"grad_norm": 1.2110410142820385,
"kl": 0.025421142578125,
"learning_rate": 2e-06,
"loss": -0.0108,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 339.84765625,
"epoch": 0.631578947368421,
"grad_norm": 0.9347491475779949,
"kl": 0.0230560302734375,
"learning_rate": 2e-06,
"loss": -0.0029,
"reward": 0.6585216615349054,
"reward_std": 0.2630395647138357,
"rewards/preference_model_reward": 0.6585216615349054,
"rewards/preference_model_reward/std": 0.2630395656451583,
"step": 93
},
{
"clip_ratio": 0.0002996912953676656,
"epoch": 0.6383701188455009,
"grad_norm": 0.958835774432475,
"kl": 0.0236053466796875,
"learning_rate": 2e-06,
"loss": -0.0029,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 363.64453125,
"epoch": 0.6451612903225806,
"grad_norm": 0.8272066984013973,
"kl": 0.020721435546875,
"learning_rate": 2e-06,
"loss": -0.0064,
"reward": 0.8653465658426285,
"reward_std": 0.1561016822233796,
"rewards/preference_model_reward": 0.8653465658426285,
"rewards/preference_model_reward/std": 0.15610167011618614,
"step": 95
},
{
"clip_ratio": 0.0002611535892356187,
"epoch": 0.6519524617996605,
"grad_norm": 0.8209441010001175,
"kl": 0.021331787109375,
"learning_rate": 2e-06,
"loss": -0.0064,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 337.61328125,
"epoch": 0.6587436332767402,
"grad_norm": 0.9707335419756125,
"kl": 0.0257415771484375,
"learning_rate": 2e-06,
"loss": -0.003,
"reward": 0.6451991870999336,
"reward_std": 0.24027688056230545,
"rewards/preference_model_reward": 0.6451991870999336,
"rewards/preference_model_reward/std": 0.24027687963098288,
"step": 97
},
{
"clip_ratio": 0.000394973511902208,
"epoch": 0.66553480475382,
"grad_norm": 0.9686579468801768,
"kl": 0.0262908935546875,
"learning_rate": 2e-06,
"loss": -0.003,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 407.8203125,
"epoch": 0.6723259762308998,
"grad_norm": 1.0490178740647227,
"kl": 0.0236663818359375,
"learning_rate": 2e-06,
"loss": -0.0056,
"reward": 0.6188035290688276,
"reward_std": 0.23126866854727268,
"rewards/preference_model_reward": 0.6188035290688276,
"rewards/preference_model_reward/std": 0.23126865550875664,
"step": 99
},
{
"clip_ratio": 0.0002818793718688539,
"epoch": 0.6791171477079796,
"grad_norm": 0.9455082290037559,
"kl": 0.024139404296875,
"learning_rate": 2e-06,
"loss": -0.0056,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 423.52734375,
"epoch": 0.6859083191850595,
"grad_norm": 0.8251731717477002,
"kl": 0.02386474609375,
"learning_rate": 2e-06,
"loss": -0.0005,
"reward": 0.8521472215652466,
"reward_std": 0.2337375245988369,
"rewards/preference_model_reward": 0.8521472215652466,
"rewards/preference_model_reward/std": 0.2337375171482563,
"step": 101
},
{
"clip_ratio": 0.0003581516129997908,
"epoch": 0.6926994906621392,
"grad_norm": 0.9184460532438401,
"kl": 0.024505615234375,
"learning_rate": 2e-06,
"loss": -0.0005,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 435.6171875,
"epoch": 0.6994906621392191,
"grad_norm": 1.0860106347765217,
"kl": 0.0284881591796875,
"learning_rate": 2e-06,
"loss": -0.0005,
"reward": 0.7299420312047005,
"reward_std": 0.2999110519886017,
"rewards/preference_model_reward": 0.7299420312047005,
"rewards/preference_model_reward/std": 0.29991103895008564,
"step": 103
},
{
"clip_ratio": 0.00037642833285644883,
"epoch": 0.7062818336162988,
"grad_norm": 1.0035915159320414,
"kl": 0.029083251953125,
"learning_rate": 2e-06,
"loss": -0.0005,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 314.51171875,
"epoch": 0.7130730050933786,
"grad_norm": 0.9150031551000603,
"kl": 0.0306854248046875,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.6948796380311251,
"reward_std": 0.23453088384121656,
"rewards/preference_model_reward": 0.6948796380311251,
"rewards/preference_model_reward/std": 0.23453087732195854,
"step": 105
},
{
"clip_ratio": 0.00046619037311756983,
"epoch": 0.7198641765704584,
"grad_norm": 0.9057949433659955,
"kl": 0.0311737060546875,
"learning_rate": 2e-06,
"loss": -0.0015,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 347.51171875,
"epoch": 0.7266553480475382,
"grad_norm": 1.058402386367552,
"kl": 0.029266357421875,
"learning_rate": 2e-06,
"loss": -0.0072,
"reward": 0.7044901698827744,
"reward_std": 0.27433344163000584,
"rewards/preference_model_reward": 0.7044901698827744,
"rewards/preference_model_reward/std": 0.2743334397673607,
"step": 107
},
{
"clip_ratio": 0.0002761358628049493,
"epoch": 0.733446519524618,
"grad_norm": 1.0936652673840959,
"kl": 0.02972412109375,
"learning_rate": 2e-06,
"loss": -0.0073,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 332.05078125,
"epoch": 0.7402376910016978,
"grad_norm": 1.1862717131671745,
"kl": 0.0320892333984375,
"learning_rate": 2e-06,
"loss": -0.0043,
"reward": 0.6147979386150837,
"reward_std": 0.30731342919170856,
"rewards/preference_model_reward": 0.6147979386150837,
"rewards/preference_model_reward/std": 0.30731342546641827,
"step": 109
},
{
"clip_ratio": 0.00022946991703065578,
"epoch": 0.7470288624787776,
"grad_norm": 1.1478795200225282,
"kl": 0.0328369140625,
"learning_rate": 2e-06,
"loss": -0.0044,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 365.30859375,
"epoch": 0.7538200339558574,
"grad_norm": 0.5771078052939254,
"kl": 0.0330810546875,
"learning_rate": 2e-06,
"loss": -0.0051,
"reward": 0.7481855824589729,
"reward_std": 0.1441822382621467,
"rewards/preference_model_reward": 0.7481855824589729,
"rewards/preference_model_reward/std": 0.1441822382621467,
"step": 111
},
{
"clip_ratio": 0.00022052829172025668,
"epoch": 0.7606112054329371,
"grad_norm": 0.5662276617721039,
"kl": 0.033966064453125,
"learning_rate": 2e-06,
"loss": -0.0051,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 448.23046875,
"epoch": 0.767402376910017,
"grad_norm": 0.9029133714996065,
"kl": 0.0342559814453125,
"learning_rate": 2e-06,
"loss": -0.002,
"reward": 0.7898782268166542,
"reward_std": 0.24535357393324375,
"rewards/preference_model_reward": 0.7898782268166542,
"rewards/preference_model_reward/std": 0.24535356741398573,
"step": 113
},
{
"clip_ratio": 0.00040923262531578075,
"epoch": 0.7741935483870968,
"grad_norm": 0.8879456702067533,
"kl": 0.0348358154296875,
"learning_rate": 2e-06,
"loss": -0.002,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 377.7265625,
"epoch": 0.7809847198641766,
"grad_norm": 0.825741580156274,
"kl": 0.035675048828125,
"learning_rate": 2e-06,
"loss": -0.0054,
"reward": 0.7737665809690952,
"reward_std": 0.22196321282535791,
"rewards/preference_model_reward": 0.7737665809690952,
"rewards/preference_model_reward/std": 0.22196321096271276,
"step": 115
},
{
"clip_ratio": 0.00022133419952297118,
"epoch": 0.7877758913412564,
"grad_norm": 0.8402195364210961,
"kl": 0.03631591796875,
"learning_rate": 2e-06,
"loss": -0.0055,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 343.15234375,
"epoch": 0.7945670628183361,
"grad_norm": 0.8554129883813052,
"kl": 0.0380859375,
"learning_rate": 2e-06,
"loss": 0.004,
"reward": 0.8098233491182327,
"reward_std": 0.21788090001791716,
"rewards/preference_model_reward": 0.8098233491182327,
"rewards/preference_model_reward/std": 0.21788090048357844,
"step": 117
},
{
"clip_ratio": 0.0003840877070615534,
"epoch": 0.801358234295416,
"grad_norm": 0.8638777221139845,
"kl": 0.0389556884765625,
"learning_rate": 2e-06,
"loss": 0.0039,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 421.3046875,
"epoch": 0.8081494057724957,
"grad_norm": 0.6793458238067924,
"kl": 0.03741455078125,
"learning_rate": 2e-06,
"loss": -0.0004,
"reward": 0.7930763997137547,
"reward_std": 0.16387870162725449,
"rewards/preference_model_reward": 0.7930763997137547,
"rewards/preference_model_reward/std": 0.1638787006959319,
"step": 119
},
{
"clip_ratio": 0.000321710589560098,
"epoch": 0.8149405772495756,
"grad_norm": 0.6962155610206633,
"kl": 0.0380096435546875,
"learning_rate": 2e-06,
"loss": -0.0004,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 421.8359375,
"epoch": 0.8217317487266553,
"grad_norm": 0.9565211478116729,
"kl": 0.0385589599609375,
"learning_rate": 2e-06,
"loss": -0.0026,
"reward": 0.7755959965288639,
"reward_std": 0.23735665366984904,
"rewards/preference_model_reward": 0.7755959965288639,
"rewards/preference_model_reward/std": 0.23735664342530072,
"step": 121
},
{
"clip_ratio": 0.0003308643190393923,
"epoch": 0.8285229202037352,
"grad_norm": 1.0302822979795947,
"kl": 0.0387725830078125,
"learning_rate": 2e-06,
"loss": -0.0027,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 421.22265625,
"epoch": 0.8353140916808149,
"grad_norm": 1.156154466362683,
"kl": 0.03570556640625,
"learning_rate": 2e-06,
"loss": -0.0075,
"reward": 0.8653455749154091,
"reward_std": 0.17953538481378928,
"rewards/preference_model_reward": 0.8653455749154091,
"rewards/preference_model_reward/std": 0.179535374045372,
"step": 123
},
{
"clip_ratio": 0.000370404065506591,
"epoch": 0.8421052631578947,
"grad_norm": 1.0589963494452141,
"kl": 0.036041259765625,
"learning_rate": 2e-06,
"loss": -0.0074,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 361.0546875,
"epoch": 0.8488964346349746,
"grad_norm": 0.8042794508114846,
"kl": 0.041229248046875,
"learning_rate": 2e-06,
"loss": -0.0115,
"reward": 0.7803249768912792,
"reward_std": 0.2213962199166417,
"rewards/preference_model_reward": 0.7803249768912792,
"rewards/preference_model_reward/std": 0.221396217122674,
"step": 125
},
{
"clip_ratio": 0.0002928560206783004,
"epoch": 0.8556876061120543,
"grad_norm": 0.8094825207954228,
"kl": 0.0414276123046875,
"learning_rate": 2e-06,
"loss": -0.0115,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 367.30859375,
"epoch": 0.8624787775891342,
"grad_norm": 0.999252643417212,
"kl": 0.0374755859375,
"learning_rate": 2e-06,
"loss": -0.0009,
"reward": 0.7459425553679466,
"reward_std": 0.27967323176562786,
"rewards/preference_model_reward": 0.7459425553679466,
"rewards/preference_model_reward/std": 0.27967322804033756,
"step": 127
},
{
"clip_ratio": 0.00029575879580079345,
"epoch": 0.8692699490662139,
"grad_norm": 1.4461960825342008,
"kl": 0.037933349609375,
"learning_rate": 2e-06,
"loss": -0.0009,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 354.65234375,
"epoch": 0.8760611205432938,
"grad_norm": 1.0554062228696781,
"kl": 0.0406494140625,
"learning_rate": 2e-06,
"loss": -0.0028,
"reward": 0.7245122045278549,
"reward_std": 0.2613325589336455,
"rewards/preference_model_reward": 0.7245122045278549,
"rewards/preference_model_reward/std": 0.2613325538113713,
"step": 129
},
{
"clip_ratio": 0.00030801673892710824,
"epoch": 0.8828522920203735,
"grad_norm": 1.0792443458054302,
"kl": 0.04156494140625,
"learning_rate": 2e-06,
"loss": -0.0029,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 294.59375,
"epoch": 0.8896434634974533,
"grad_norm": 1.0374198522552833,
"kl": 0.0457000732421875,
"learning_rate": 2e-06,
"loss": -0.0079,
"reward": 0.7023610211908817,
"reward_std": 0.2407909445464611,
"rewards/preference_model_reward": 0.7023610211908817,
"rewards/preference_model_reward/std": 0.24079094640910625,
"step": 131
},
{
"clip_ratio": 0.00040924851782619953,
"epoch": 0.8964346349745331,
"grad_norm": 0.9902604514127353,
"kl": 0.046234130859375,
"learning_rate": 2e-06,
"loss": -0.008,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 363.4609375,
"epoch": 0.9032258064516129,
"grad_norm": 1.060000229364824,
"kl": 0.042449951171875,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.7669150531291962,
"reward_std": 0.2660955060273409,
"rewards/preference_model_reward": 0.7669150531291962,
"rewards/preference_model_reward/std": 0.26609550788998604,
"step": 133
},
{
"clip_ratio": 0.00026973526109941304,
"epoch": 0.9100169779286927,
"grad_norm": 0.9936418006191627,
"kl": 0.043121337890625,
"learning_rate": 2e-06,
"loss": -0.0007,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 357.22265625,
"epoch": 0.9168081494057725,
"grad_norm": 0.7982535557698439,
"kl": 0.051361083984375,
"learning_rate": 2e-06,
"loss": -0.0028,
"reward": 0.8265567198395729,
"reward_std": 0.21873105503618717,
"rewards/preference_model_reward": 0.8265567198395729,
"rewards/preference_model_reward/std": 0.21873104479163885,
"step": 135
},
{
"clip_ratio": 0.0003086660126427887,
"epoch": 0.9235993208828522,
"grad_norm": 0.7943180189345149,
"kl": 0.05242919921875,
"learning_rate": 2e-06,
"loss": -0.0029,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 320.4296875,
"epoch": 0.9303904923599321,
"grad_norm": 1.1156997998544687,
"kl": 0.061492919921875,
"learning_rate": 2e-06,
"loss": -0.004,
"reward": 0.7904198691248894,
"reward_std": 0.216522429138422,
"rewards/preference_model_reward": 0.7904198691248894,
"rewards/preference_model_reward/std": 0.21652242727577686,
"step": 137
},
{
"clip_ratio": 0.00037556743882305454,
"epoch": 0.9371816638370118,
"grad_norm": 1.017141741754524,
"kl": 0.057098388671875,
"learning_rate": 2e-06,
"loss": -0.0041,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 340.8046875,
"epoch": 0.9439728353140917,
"grad_norm": 0.7728732979067988,
"kl": 0.047637939453125,
"learning_rate": 2e-06,
"loss": -0.0115,
"reward": 0.7384522631764412,
"reward_std": 0.20472911931574345,
"rewards/preference_model_reward": 0.7384522631764412,
"rewards/preference_model_reward/std": 0.20472911186516285,
"step": 139
},
{
"clip_ratio": 0.00038547036820091307,
"epoch": 0.9507640067911715,
"grad_norm": 0.8459776106505102,
"kl": 0.0485687255859375,
"learning_rate": 2e-06,
"loss": -0.0116,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 296.31640625,
"epoch": 0.9575551782682513,
"grad_norm": 1.0555084972169586,
"kl": 0.051055908203125,
"learning_rate": 2e-06,
"loss": -0.0092,
"reward": 0.7005413100123405,
"reward_std": 0.2579723782837391,
"rewards/preference_model_reward": 0.7005413100123405,
"rewards/preference_model_reward/std": 0.2579723745584488,
"step": 141
},
{
"clip_ratio": 0.00016731805226299912,
"epoch": 0.9643463497453311,
"grad_norm": 1.120763560485812,
"kl": 0.051971435546875,
"learning_rate": 2e-06,
"loss": -0.0093,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 377.7734375,
"epoch": 0.9711375212224108,
"grad_norm": 0.9897948935948673,
"kl": 0.0577392578125,
"learning_rate": 2e-06,
"loss": -0.0067,
"reward": 0.7895881161093712,
"reward_std": 0.23325645178556442,
"rewards/preference_model_reward": 0.7895881161093712,
"rewards/preference_model_reward/std": 0.23325645574368536,
"step": 143
},
{
"clip_ratio": 0.00022080717917560833,
"epoch": 0.9779286926994907,
"grad_norm": 1.4320582249764962,
"kl": 0.05889892578125,
"learning_rate": 2e-06,
"loss": -0.0067,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 402.28515625,
"epoch": 0.9847198641765704,
"grad_norm": 0.880015088177386,
"kl": 0.055999755859375,
"learning_rate": 2e-06,
"loss": -0.0012,
"reward": 0.7704083099961281,
"reward_std": 0.22987801115959883,
"rewards/preference_model_reward": 0.7704083099961281,
"rewards/preference_model_reward/std": 0.22987799905240536,
"step": 145
},
{
"clip_ratio": 0.00039433108031516895,
"epoch": 0.9915110356536503,
"grad_norm": 0.8831875786359421,
"kl": 0.056640625,
"learning_rate": 2e-06,
"loss": -0.0013,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 471.2734375,
"epoch": 1.0067911714770799,
"grad_norm": 0.8457751003945985,
"kl": 0.0511474609375,
"learning_rate": 2e-06,
"loss": -0.0095,
"reward": 0.7864305526018143,
"reward_std": 0.19768846221268177,
"rewards/preference_model_reward": 0.7864305526018143,
"rewards/preference_model_reward/std": 0.19768844894133508,
"step": 147
},
{
"clip_ratio": 0.000296334306767676,
"epoch": 1.0135823429541595,
"grad_norm": 0.8298995880374975,
"kl": 0.05181884765625,
"learning_rate": 2e-06,
"loss": -0.0096,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 455.83984375,
"epoch": 1.0203735144312394,
"grad_norm": 0.8194267156898773,
"kl": 0.052215576171875,
"learning_rate": 2e-06,
"loss": 0.0078,
"reward": 0.7310561165213585,
"reward_std": 0.21428223699331284,
"rewards/preference_model_reward": 0.7310561165213585,
"rewards/preference_model_reward/std": 0.21428224071860313,
"step": 149
},
{
"clip_ratio": 0.00043329124673618935,
"epoch": 1.0271646859083192,
"grad_norm": 0.8240545782064995,
"kl": 0.05267333984375,
"learning_rate": 2e-06,
"loss": 0.0078,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 370.30859375,
"epoch": 1.033955857385399,
"grad_norm": 0.760034227142178,
"kl": 0.057525634765625,
"learning_rate": 2e-06,
"loss": 0.0031,
"reward": 0.9029070436954498,
"reward_std": 0.15814024256542325,
"rewards/preference_model_reward": 0.9029070436954498,
"rewards/preference_model_reward/std": 0.15814024163410068,
"step": 151
},
{
"clip_ratio": 0.00020186395067867124,
"epoch": 1.0407470288624787,
"grad_norm": 0.7614495802573132,
"kl": 0.057952880859375,
"learning_rate": 2e-06,
"loss": 0.003,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 314.09375,
"epoch": 1.0475382003395586,
"grad_norm": 0.8397238705572047,
"kl": 0.0572509765625,
"learning_rate": 2e-06,
"loss": -0.0023,
"reward": 0.8715953528881073,
"reward_std": 0.16063255118206143,
"rewards/preference_model_reward": 0.8715953528881073,
"rewards/preference_model_reward/std": 0.1606325414031744,
"step": 153
},
{
"clip_ratio": 0.00035122232020512456,
"epoch": 1.0543293718166384,
"grad_norm": 0.6751976056560207,
"kl": 0.057525634765625,
"learning_rate": 2e-06,
"loss": -0.0023,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 338.484375,
"epoch": 1.061120543293718,
"grad_norm": 0.8883816112044249,
"kl": 0.053192138671875,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.6162162441760302,
"reward_std": 0.22661381494253874,
"rewards/preference_model_reward": 0.6162162441760302,
"rewards/preference_model_reward/std": 0.22661380283534527,
"step": 155
},
{
"clip_ratio": 0.00026881803660216974,
"epoch": 1.067911714770798,
"grad_norm": 0.8996727390846359,
"kl": 0.0533447265625,
"learning_rate": 2e-06,
"loss": -0.0024,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 396.72265625,
"epoch": 1.0747028862478778,
"grad_norm": 1.3700956028048723,
"kl": 0.0594482421875,
"learning_rate": 2e-06,
"loss": -0.0047,
"reward": 0.8017089515924454,
"reward_std": 0.2577928486280143,
"rewards/preference_model_reward": 0.8017089515924454,
"rewards/preference_model_reward/std": 0.25779283652082086,
"step": 157
},
{
"clip_ratio": 0.0002924563341366593,
"epoch": 1.0814940577249577,
"grad_norm": 0.989617562955661,
"kl": 0.060272216796875,
"learning_rate": 2e-06,
"loss": -0.0047,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 342.76953125,
"epoch": 1.0882852292020373,
"grad_norm": 0.9655234182650031,
"kl": 0.058380126953125,
"learning_rate": 2e-06,
"loss": -0.0101,
"reward": 0.7295246534049511,
"reward_std": 0.2533393930643797,
"rewards/preference_model_reward": 0.7295246534049511,
"rewards/preference_model_reward/std": 0.2533393818885088,
"step": 159
},
{
"clip_ratio": 0.00045342851990426425,
"epoch": 1.0950764006791172,
"grad_norm": 1.2087305406513047,
"kl": 0.05926513671875,
"learning_rate": 2e-06,
"loss": -0.0102,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 436.4921875,
"epoch": 1.101867572156197,
"grad_norm": 0.6940296847029844,
"kl": 0.066864013671875,
"learning_rate": 2e-06,
"loss": -0.0046,
"reward": 0.8526426330208778,
"reward_std": 0.1869236477650702,
"rewards/preference_model_reward": 0.8526426330208778,
"rewards/preference_model_reward/std": 0.18692364171147346,
"step": 161
},
{
"clip_ratio": 0.0005265060426609125,
"epoch": 1.1086587436332767,
"grad_norm": 0.7298134766362122,
"kl": 0.06866455078125,
"learning_rate": 2e-06,
"loss": -0.0047,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 451.22265625,
"epoch": 1.1154499151103565,
"grad_norm": 0.7453936371234922,
"kl": 0.06085205078125,
"learning_rate": 2e-06,
"loss": -0.0002,
"reward": 0.8591005653142929,
"reward_std": 0.15921288449317217,
"rewards/preference_model_reward": 0.8591005653142929,
"rewards/preference_model_reward/std": 0.15921288169920444,
"step": 163
},
{
"clip_ratio": 0.0002904113680415321,
"epoch": 1.1222410865874364,
"grad_norm": 0.669910512576077,
"kl": 0.06268310546875,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 407.80078125,
"epoch": 1.129032258064516,
"grad_norm": 0.909192169606273,
"kl": 0.06158447265625,
"learning_rate": 2e-06,
"loss": 0.0011,
"reward": 0.7943635508418083,
"reward_std": 0.25198143534362316,
"rewards/preference_model_reward": 0.7943635508418083,
"rewards/preference_model_reward/std": 0.25198143906891346,
"step": 165
},
{
"clip_ratio": 0.00025186290713463677,
"epoch": 1.1358234295415959,
"grad_norm": 0.9692712527409699,
"kl": 0.061767578125,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 325.6484375,
"epoch": 1.1426146010186757,
"grad_norm": 1.1179428269106786,
"kl": 0.0601806640625,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 0.9444838091731071,
"reward_std": 0.11847700248472393,
"rewards/preference_model_reward": 0.9444838091731071,
"rewards/preference_model_reward/std": 0.11847699934151024,
"step": 167
},
{
"clip_ratio": 0.0002802106146191363,
"epoch": 1.1494057724957556,
"grad_norm": 0.6405441409913845,
"kl": 0.06036376953125,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 353.76953125,
"epoch": 1.1561969439728352,
"grad_norm": 0.7782378270820641,
"kl": 0.06005859375,
"learning_rate": 2e-06,
"loss": -0.0016,
"reward": 0.8752250149846077,
"reward_std": 0.21684391144663095,
"rewards/preference_model_reward": 0.8752250149846077,
"rewards/preference_model_reward/std": 0.21684390027076006,
"step": 169
},
{
"clip_ratio": 0.00020744246558024315,
"epoch": 1.162988115449915,
"grad_norm": 0.850587103078974,
"kl": 0.06024169921875,
"learning_rate": 2e-06,
"loss": -0.0016,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 370.96484375,
"epoch": 1.169779286926995,
"grad_norm": 0.8701439536858631,
"kl": 0.054901123046875,
"learning_rate": 2e-06,
"loss": -0.0031,
"reward": 0.8039593771100044,
"reward_std": 0.18934147339314222,
"rewards/preference_model_reward": 0.8039593771100044,
"rewards/preference_model_reward/std": 0.18934146966785192,
"step": 171
},
{
"clip_ratio": 0.0003747612081497209,
"epoch": 1.1765704584040746,
"grad_norm": 0.8335090925710185,
"kl": 0.054718017578125,
"learning_rate": 2e-06,
"loss": -0.0031,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 485.390625,
"epoch": 1.1833616298811545,
"grad_norm": 0.6304735929486551,
"kl": 0.0595703125,
"learning_rate": 2e-06,
"loss": -0.0031,
"reward": 0.9124765843153,
"reward_std": 0.17223104648292065,
"rewards/preference_model_reward": 0.9124765843153,
"rewards/preference_model_reward/std": 0.17223104741424322,
"step": 173
},
{
"clip_ratio": 0.0004532161146926228,
"epoch": 1.1901528013582343,
"grad_norm": 0.5908523851183375,
"kl": 0.059295654296875,
"learning_rate": 2e-06,
"loss": -0.0031,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 424.578125,
"epoch": 1.1969439728353142,
"grad_norm": 0.8992704878494081,
"kl": 0.06396484375,
"learning_rate": 2e-06,
"loss": -0.0031,
"reward": 0.804033525288105,
"reward_std": 0.2301926789805293,
"rewards/preference_model_reward": 0.804033525288105,
"rewards/preference_model_reward/std": 0.2301926789805293,
"step": 175
},
{
"clip_ratio": 0.0003341037518111989,
"epoch": 1.2037351443123938,
"grad_norm": 0.9196956764410971,
"kl": 0.06427001953125,
"learning_rate": 2e-06,
"loss": -0.0031,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 431.2421875,
"epoch": 1.2105263157894737,
"grad_norm": 0.7220744702495395,
"kl": 0.058349609375,
"learning_rate": 2e-06,
"loss": -0.0077,
"reward": 0.7273744996637106,
"reward_std": 0.2286776825785637,
"rewards/preference_model_reward": 0.7273744996637106,
"rewards/preference_model_reward/std": 0.22867767792195082,
"step": 177
},
{
"clip_ratio": 0.0003454066154517932,
"epoch": 1.2173174872665535,
"grad_norm": 0.8716110246400824,
"kl": 0.058624267578125,
"learning_rate": 2e-06,
"loss": -0.0078,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 378.078125,
"epoch": 1.2241086587436332,
"grad_norm": 0.6663285066650045,
"kl": 0.0638427734375,
"learning_rate": 2e-06,
"loss": -0.0059,
"reward": 0.9165123328566551,
"reward_std": 0.16831889003515244,
"rewards/preference_model_reward": 0.9165123328566551,
"rewards/preference_model_reward/std": 0.16831888817250729,
"step": 179
},
{
"clip_ratio": 0.00019379381046746857,
"epoch": 1.230899830220713,
"grad_norm": 0.7055628007733394,
"kl": 0.0640869140625,
"learning_rate": 2e-06,
"loss": -0.006,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 422.25,
"epoch": 1.237691001697793,
"grad_norm": 0.7762768084046409,
"kl": 0.06463623046875,
"learning_rate": 2e-06,
"loss": -0.0043,
"reward": 0.9134577289223671,
"reward_std": 0.1393869406019803,
"rewards/preference_model_reward": 0.9134577289223671,
"rewards/preference_model_reward/std": 0.13938693181262352,
"step": 181
},
{
"clip_ratio": 0.0003034833280253224,
"epoch": 1.2444821731748728,
"grad_norm": 0.6186463874614553,
"kl": 0.064910888671875,
"learning_rate": 2e-06,
"loss": -0.0044,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 354.81640625,
"epoch": 1.2512733446519524,
"grad_norm": 0.8899638180343089,
"kl": 0.07208251953125,
"learning_rate": 2e-06,
"loss": -0.0069,
"reward": 0.7580052837729454,
"reward_std": 0.19877337908837944,
"rewards/preference_model_reward": 0.7580052837729454,
"rewards/preference_model_reward/std": 0.1987733746645972,
"step": 183
},
{
"clip_ratio": 0.0004808369849342853,
"epoch": 1.2580645161290323,
"grad_norm": 0.8688793068634583,
"kl": 0.072906494140625,
"learning_rate": 2e-06,
"loss": -0.007,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 391.66796875,
"epoch": 1.2648556876061121,
"grad_norm": 0.7448326779186991,
"kl": 0.0616455078125,
"learning_rate": 2e-06,
"loss": -0.007,
"reward": 0.8168806880712509,
"reward_std": 0.16082846838980913,
"rewards/preference_model_reward": 0.8168806880712509,
"rewards/preference_model_reward/std": 0.1608284618705511,
"step": 185
},
{
"clip_ratio": 0.00020933753694407642,
"epoch": 1.2716468590831917,
"grad_norm": 0.7190507214269751,
"kl": 0.06219482421875,
"learning_rate": 2e-06,
"loss": -0.0071,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 483.05078125,
"epoch": 1.2784380305602716,
"grad_norm": 0.8188458679532237,
"kl": 0.0731201171875,
"learning_rate": 2e-06,
"loss": -0.005,
"reward": 0.8175918683409691,
"reward_std": 0.22522686189040542,
"rewards/preference_model_reward": 0.8175918683409691,
"rewards/preference_model_reward/std": 0.22522685630246997,
"step": 187
},
{
"clip_ratio": 0.0003966744334320538,
"epoch": 1.2852292020373515,
"grad_norm": 1.0458363130153399,
"kl": 0.073944091796875,
"learning_rate": 2e-06,
"loss": -0.005,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 549.33203125,
"epoch": 1.2920203735144313,
"grad_norm": 0.9152240807229773,
"kl": 0.059906005859375,
"learning_rate": 2e-06,
"loss": 0.0056,
"reward": 0.8635009974241257,
"reward_std": 0.22975661419332027,
"rewards/preference_model_reward": 0.8635009974241257,
"rewards/preference_model_reward/std": 0.22975661046802998,
"step": 189
},
{
"clip_ratio": 0.00035369363831705414,
"epoch": 1.298811544991511,
"grad_norm": 0.9207458262412241,
"kl": 0.06072998046875,
"learning_rate": 2e-06,
"loss": 0.0056,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 442.76171875,
"epoch": 1.3056027164685908,
"grad_norm": 0.6570145377312667,
"kl": 0.068359375,
"learning_rate": 2e-06,
"loss": -0.0025,
"reward": 0.9078862443566322,
"reward_std": 0.15442213136702776,
"rewards/preference_model_reward": 0.9078862443566322,
"rewards/preference_model_reward/std": 0.15442212857306004,
"step": 191
},
{
"clip_ratio": 0.000297196668725519,
"epoch": 1.3123938879456707,
"grad_norm": 0.6739036562465582,
"kl": 0.069244384765625,
"learning_rate": 2e-06,
"loss": -0.0025,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 387.12890625,
"epoch": 1.3191850594227503,
"grad_norm": 0.8574027394239306,
"kl": 0.064239501953125,
"learning_rate": 2e-06,
"loss": -0.0045,
"reward": 0.824281245470047,
"reward_std": 0.2169697443023324,
"rewards/preference_model_reward": 0.824281245470047,
"rewards/preference_model_reward/std": 0.2169697443023324,
"step": 193
},
{
"clip_ratio": 0.0002967717100545997,
"epoch": 1.3259762308998302,
"grad_norm": 0.7290085924664141,
"kl": 0.065155029296875,
"learning_rate": 2e-06,
"loss": -0.0045,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 465.8203125,
"epoch": 1.33276740237691,
"grad_norm": 0.7309486318631441,
"kl": 0.07769775390625,
"learning_rate": 2e-06,
"loss": 0.0008,
"reward": 0.9312445744872093,
"reward_std": 0.15284666204388486,
"rewards/preference_model_reward": 0.9312445744872093,
"rewards/preference_model_reward/std": 0.15284666297520744,
"step": 195
},
{
"clip_ratio": 0.0002473961694704485,
"epoch": 1.33955857385399,
"grad_norm": 0.8256107757168715,
"kl": 0.07861328125,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 430.015625,
"epoch": 1.3463497453310695,
"grad_norm": 0.7651818013398796,
"kl": 0.074951171875,
"learning_rate": 2e-06,
"loss": -0.0032,
"reward": 0.903270959854126,
"reward_std": 0.15746590262278914,
"rewards/preference_model_reward": 0.903270959854126,
"rewards/preference_model_reward/std": 0.15746590006165206,
"step": 197
},
{
"clip_ratio": 0.00028777409352187533,
"epoch": 1.3531409168081494,
"grad_norm": 0.695431821161012,
"kl": 0.0751953125,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 464.35546875,
"epoch": 1.3599320882852293,
"grad_norm": 0.6072593985440756,
"kl": 0.068878173828125,
"learning_rate": 2e-06,
"loss": -0.0046,
"reward": 0.8996652066707611,
"reward_std": 0.15736312349326909,
"rewards/preference_model_reward": 0.8996652066707611,
"rewards/preference_model_reward/std": 0.15736312372609973,
"step": 199
},
{
"clip_ratio": 0.00029625675415445585,
"epoch": 1.366723259762309,
"grad_norm": 0.6467946669294912,
"kl": 0.069000244140625,
"learning_rate": 2e-06,
"loss": -0.0046,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 453.8671875,
"epoch": 1.3735144312393888,
"grad_norm": 0.6011320960228079,
"kl": 0.0772705078125,
"learning_rate": 2e-06,
"loss": -0.0019,
"reward": 0.9229187443852425,
"reward_std": 0.1588728630449623,
"rewards/preference_model_reward": 0.9229187443852425,
"rewards/preference_model_reward/std": 0.15887285268399864,
"step": 201
},
{
"clip_ratio": 0.0004093242150702281,
"epoch": 1.3803056027164686,
"grad_norm": 0.6166416974233886,
"kl": 0.07733154296875,
"learning_rate": 2e-06,
"loss": -0.0019,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 522.33203125,
"epoch": 1.3870967741935485,
"grad_norm": 0.821145885871034,
"kl": 0.071319580078125,
"learning_rate": 2e-06,
"loss": -0.0032,
"reward": 0.7927182205021381,
"reward_std": 0.15776410112448502,
"rewards/preference_model_reward": 0.7927182205021381,
"rewards/preference_model_reward/std": 0.1577640951873036,
"step": 203
},
{
"clip_ratio": 0.00040718307718634605,
"epoch": 1.3938879456706281,
"grad_norm": 0.6436174776451671,
"kl": 0.071380615234375,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 448.05859375,
"epoch": 1.400679117147708,
"grad_norm": 0.9941878622739161,
"kl": 0.067535400390625,
"learning_rate": 2e-06,
"loss": 0.0005,
"reward": 0.8290302827954292,
"reward_std": 0.20328299421817064,
"rewards/preference_model_reward": 0.8290302827954292,
"rewards/preference_model_reward/std": 0.20328299049288034,
"step": 205
},
{
"clip_ratio": 0.0004087122197233839,
"epoch": 1.4074702886247878,
"grad_norm": 0.758001459822151,
"kl": 0.066619873046875,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 393.47265625,
"epoch": 1.4142614601018675,
"grad_norm": 0.8719842748126723,
"kl": 0.067352294921875,
"learning_rate": 2e-06,
"loss": -0.0102,
"reward": 0.8164683133363724,
"reward_std": 0.22312493529170752,
"rewards/preference_model_reward": 0.8164683133363724,
"rewards/preference_model_reward/std": 0.22312493529170752,
"step": 207
},
{
"clip_ratio": 0.0002680147335922811,
"epoch": 1.4210526315789473,
"grad_norm": 0.8534423110795387,
"kl": 0.067230224609375,
"learning_rate": 2e-06,
"loss": -0.0103,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 468.77734375,
"epoch": 1.4278438030560272,
"grad_norm": 0.8752993311606954,
"kl": 0.0692138671875,
"learning_rate": 2e-06,
"loss": -0.002,
"reward": 0.8254417404532433,
"reward_std": 0.24196279793977737,
"rewards/preference_model_reward": 0.8254417404532433,
"rewards/preference_model_reward/std": 0.24196279048919678,
"step": 209
},
{
"clip_ratio": 0.0003697285646921955,
"epoch": 1.434634974533107,
"grad_norm": 1.0128390633646167,
"kl": 0.069732666015625,
"learning_rate": 2e-06,
"loss": -0.0021,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 467.9765625,
"epoch": 1.4414261460101867,
"grad_norm": 0.9508406874837924,
"kl": 0.07574462890625,
"learning_rate": 2e-06,
"loss": 0.0036,
"reward": 0.7649775668978691,
"reward_std": 0.27024316415190697,
"rewards/preference_model_reward": 0.7649775668978691,
"rewards/preference_model_reward/std": 0.2702431622892618,
"step": 211
},
{
"clip_ratio": 0.0004117395801586099,
"epoch": 1.4482173174872666,
"grad_norm": 0.9621517839351592,
"kl": 0.07647705078125,
"learning_rate": 2e-06,
"loss": 0.0035,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 458.64453125,
"epoch": 1.4550084889643464,
"grad_norm": 0.810982449228881,
"kl": 0.08087158203125,
"learning_rate": 2e-06,
"loss": -0.001,
"reward": 0.8479541018605232,
"reward_std": 0.22648475086316466,
"rewards/preference_model_reward": 0.8479541018605232,
"rewards/preference_model_reward/std": 0.2264847457408905,
"step": 213
},
{
"clip_ratio": 0.0003555577713996172,
"epoch": 1.461799660441426,
"grad_norm": 0.80101478641754,
"kl": 0.08135986328125,
"learning_rate": 2e-06,
"loss": -0.001,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 574.29296875,
"epoch": 1.468590831918506,
"grad_norm": 0.9452762932433028,
"kl": 0.0784912109375,
"learning_rate": 2e-06,
"loss": -0.0021,
"reward": 0.874680757522583,
"reward_std": 0.1911849994212389,
"rewards/preference_model_reward": 0.874680757522583,
"rewards/preference_model_reward/std": 0.1911849956959486,
"step": 215
},
{
"clip_ratio": 0.00028368424500513356,
"epoch": 1.4753820033955858,
"grad_norm": 0.8371740511238688,
"kl": 0.078369140625,
"learning_rate": 2e-06,
"loss": -0.0021,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 426.421875,
"epoch": 1.4821731748726656,
"grad_norm": 1.0152679244033544,
"kl": 0.077392578125,
"learning_rate": 2e-06,
"loss": -0.0064,
"reward": 0.8196995928883553,
"reward_std": 0.1846959013491869,
"rewards/preference_model_reward": 0.8196995928883553,
"rewards/preference_model_reward/std": 0.1846959034446627,
"step": 217
},
{
"clip_ratio": 0.0004560712586680893,
"epoch": 1.4889643463497453,
"grad_norm": 0.8237042745863938,
"kl": 0.07757568359375,
"learning_rate": 2e-06,
"loss": -0.0064,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 442.375,
"epoch": 1.4957555178268251,
"grad_norm": 0.7237731143426731,
"kl": 0.07708740234375,
"learning_rate": 2e-06,
"loss": 0.0046,
"reward": 0.9068370684981346,
"reward_std": 0.16706688702106476,
"rewards/preference_model_reward": 0.9068370684981346,
"rewards/preference_model_reward/std": 0.1670668888837099,
"step": 219
},
{
"clip_ratio": 0.00027096804114989936,
"epoch": 1.5025466893039048,
"grad_norm": 0.7076398007967601,
"kl": 0.0770263671875,
"learning_rate": 2e-06,
"loss": 0.0046,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 444.953125,
"epoch": 1.5093378607809846,
"grad_norm": 0.869997583364439,
"kl": 0.082275390625,
"learning_rate": 2e-06,
"loss": 0.0055,
"reward": 0.7990810945630074,
"reward_std": 0.18924825318390504,
"rewards/preference_model_reward": 0.7990810945630074,
"rewards/preference_model_reward/std": 0.18924825073918328,
"step": 221
},
{
"clip_ratio": 0.0004075437354913447,
"epoch": 1.5161290322580645,
"grad_norm": 0.9017629071808718,
"kl": 0.0819091796875,
"learning_rate": 2e-06,
"loss": 0.0055,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 446.15234375,
"epoch": 1.5229202037351444,
"grad_norm": 0.7451783779397045,
"kl": 0.074432373046875,
"learning_rate": 2e-06,
"loss": 0.007,
"reward": 0.9096781089901924,
"reward_std": 0.16962200123816729,
"rewards/preference_model_reward": 0.9096781089901924,
"rewards/preference_model_reward/std": 0.16962200321722776,
"step": 223
},
{
"clip_ratio": 0.00042257635868736543,
"epoch": 1.5297113752122242,
"grad_norm": 0.7058693281421018,
"kl": 0.07421875,
"learning_rate": 2e-06,
"loss": 0.007,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 465.25,
"epoch": 1.5365025466893039,
"grad_norm": 0.6935303191071553,
"kl": 0.070648193359375,
"learning_rate": 2e-06,
"loss": -0.0079,
"reward": 0.8487136289477348,
"reward_std": 0.1877696868032217,
"rewards/preference_model_reward": 0.8487136289477348,
"rewards/preference_model_reward/std": 0.18776968773454428,
"step": 225
},
{
"clip_ratio": 0.00021090211157570593,
"epoch": 1.5432937181663837,
"grad_norm": 0.7145245121074538,
"kl": 0.0701904296875,
"learning_rate": 2e-06,
"loss": -0.008,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 381.94140625,
"epoch": 1.5500848896434634,
"grad_norm": 0.7994299854278107,
"kl": 0.07879638671875,
"learning_rate": 2e-06,
"loss": 0.001,
"reward": 0.8791565969586372,
"reward_std": 0.1796162803657353,
"rewards/preference_model_reward": 0.8791565969586372,
"rewards/preference_model_reward/std": 0.17961628688499331,
"step": 227
},
{
"clip_ratio": 0.00023016269369691145,
"epoch": 1.5568760611205432,
"grad_norm": 0.8175812413468173,
"kl": 0.07855224609375,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 568.01171875,
"epoch": 1.563667232597623,
"grad_norm": 0.6094276281977793,
"kl": 0.0706787109375,
"learning_rate": 2e-06,
"loss": -0.0011,
"reward": 0.8882196396589279,
"reward_std": 0.13881529681384563,
"rewards/preference_model_reward": 0.8882196396589279,
"rewards/preference_model_reward/std": 0.13881529681384563,
"step": 229
},
{
"clip_ratio": 0.00029951393207738874,
"epoch": 1.570458404074703,
"grad_norm": 1.7950954969207382,
"kl": 0.07061767578125,
"learning_rate": 2e-06,
"loss": -0.0011,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 423.640625,
"epoch": 1.5772495755517828,
"grad_norm": 0.821807107387434,
"kl": 0.07537841796875,
"learning_rate": 2e-06,
"loss": -0.0032,
"reward": 0.8326325863599777,
"reward_std": 0.222384094260633,
"rewards/preference_model_reward": 0.8326325863599777,
"rewards/preference_model_reward/std": 0.22238408401608467,
"step": 231
},
{
"clip_ratio": 0.0003851418096019188,
"epoch": 1.5840407470288624,
"grad_norm": 0.7724457393034863,
"kl": 0.07427978515625,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 450.34765625,
"epoch": 1.5908319185059423,
"grad_norm": 0.9314695228796444,
"kl": 0.070556640625,
"learning_rate": 2e-06,
"loss": 0.0058,
"reward": 0.815672542899847,
"reward_std": 0.22120152600109577,
"rewards/preference_model_reward": 0.815672542899847,
"rewards/preference_model_reward/std": 0.2212015176191926,
"step": 233
},
{
"clip_ratio": 0.00026556486773188226,
"epoch": 1.597623089983022,
"grad_norm": 0.8986538432374337,
"kl": 0.07061767578125,
"learning_rate": 2e-06,
"loss": 0.0058,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 403.4140625,
"epoch": 1.6044142614601018,
"grad_norm": 1.0978197262889375,
"kl": 0.080474853515625,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.8273614943027496,
"reward_std": 0.20897717960178852,
"rewards/preference_model_reward": 0.8273614943027496,
"rewards/preference_model_reward/std": 0.208977174712345,
"step": 235
},
{
"clip_ratio": 0.0002703697555261897,
"epoch": 1.6112054329371817,
"grad_norm": 1.0483625406174306,
"kl": 0.079681396484375,
"learning_rate": 2e-06,
"loss": -0.0008,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 526.9453125,
"epoch": 1.6179966044142615,
"grad_norm": 0.5732340261918957,
"kl": 0.068572998046875,
"learning_rate": 2e-06,
"loss": 0.0007,
"reward": 0.9326564967632294,
"reward_std": 0.14762203209102154,
"rewards/preference_model_reward": 0.9326564967632294,
"rewards/preference_model_reward/std": 0.1476220367476344,
"step": 237
},
{
"clip_ratio": 0.0003460040388745256,
"epoch": 1.6247877758913414,
"grad_norm": 0.5752766993082183,
"kl": 0.068756103515625,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 386.03515625,
"epoch": 1.631578947368421,
"grad_norm": 0.6056663933056317,
"kl": 0.075103759765625,
"learning_rate": 2e-06,
"loss": -0.0021,
"reward": 0.8928236216306686,
"reward_std": 0.12510262243449688,
"rewards/preference_model_reward": 0.8928236216306686,
"rewards/preference_model_reward/std": 0.12510262383148074,
"step": 239
},
{
"clip_ratio": 0.0002695773910090793,
"epoch": 1.6383701188455009,
"grad_norm": 0.6137961375915928,
"kl": 0.07489013671875,
"learning_rate": 2e-06,
"loss": -0.0022,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 407.54296875,
"epoch": 1.6451612903225805,
"grad_norm": 0.42071122454808335,
"kl": 0.0718994140625,
"learning_rate": 2e-06,
"loss": 0.0013,
"reward": 0.9794645011425018,
"reward_std": 0.08479468178120442,
"rewards/preference_model_reward": 0.9794645011425018,
"rewards/preference_model_reward/std": 0.08479468079167418,
"step": 241
},
{
"clip_ratio": 0.00025576782900316175,
"epoch": 1.6519524617996604,
"grad_norm": 0.3919119735586361,
"kl": 0.07177734375,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 411.42578125,
"epoch": 1.6587436332767402,
"grad_norm": 0.6548770476035524,
"kl": 0.0699462890625,
"learning_rate": 2e-06,
"loss": -0.0013,
"reward": 0.8988784328103065,
"reward_std": 0.1754322163760662,
"rewards/preference_model_reward": 0.8988784328103065,
"rewards/preference_model_reward/std": 0.17543221032246947,
"step": 243
},
{
"clip_ratio": 0.0002102269518218236,
"epoch": 1.66553480475382,
"grad_norm": 0.6267254419265091,
"kl": 0.06951904296875,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 466.73828125,
"epoch": 1.6723259762309,
"grad_norm": 0.7815256424102707,
"kl": 0.070068359375,
"learning_rate": 2e-06,
"loss": -0.0002,
"reward": 0.8519543707370758,
"reward_std": 0.22265557665377855,
"rewards/preference_model_reward": 0.8519543707370758,
"rewards/preference_model_reward/std": 0.22265558131039143,
"step": 245
},
{
"clip_ratio": 0.00024074002521956572,
"epoch": 1.6791171477079796,
"grad_norm": 0.7771753382163754,
"kl": 0.0697021484375,
"learning_rate": 2e-06,
"loss": -0.0003,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 468.9140625,
"epoch": 1.6859083191850595,
"grad_norm": 0.8258890143100246,
"kl": 0.064239501953125,
"learning_rate": 2e-06,
"loss": 0.0068,
"reward": 0.6989383921027184,
"reward_std": 0.21238140459172428,
"rewards/preference_model_reward": 0.6989383921027184,
"rewards/preference_model_reward/std": 0.212381407385692,
"step": 247
},
{
"clip_ratio": 0.00029818056009389693,
"epoch": 1.692699490662139,
"grad_norm": 0.8433009778411044,
"kl": 0.06402587890625,
"learning_rate": 2e-06,
"loss": 0.0068,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 461.68359375,
"epoch": 1.699490662139219,
"grad_norm": 0.564079360506697,
"kl": 0.081146240234375,
"learning_rate": 2e-06,
"loss": -0.0013,
"reward": 0.813681973144412,
"reward_std": 0.15440709423273802,
"rewards/preference_model_reward": 0.813681973144412,
"rewards/preference_model_reward/std": 0.1544070926029235,
"step": 249
},
{
"clip_ratio": 0.00033701639495120617,
"epoch": 1.7062818336162988,
"grad_norm": 0.5478613289915798,
"kl": 0.08160400390625,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 322.9296875,
"epoch": 1.7130730050933787,
"grad_norm": 1.071419601384365,
"kl": 0.07757568359375,
"learning_rate": 2e-06,
"loss": 0.0057,
"reward": 0.729640144854784,
"reward_std": 0.24793746508657932,
"rewards/preference_model_reward": 0.729640144854784,
"rewards/preference_model_reward/std": 0.24793746136128902,
"step": 251
},
{
"clip_ratio": 0.0003556678657332668,
"epoch": 1.7198641765704585,
"grad_norm": 1.0183565424453855,
"kl": 0.078369140625,
"learning_rate": 2e-06,
"loss": 0.0057,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 507.984375,
"epoch": 1.7266553480475382,
"grad_norm": 0.8616822486915081,
"kl": 0.07220458984375,
"learning_rate": 2e-06,
"loss": 0.0018,
"reward": 0.8504317253828049,
"reward_std": 0.2000572015531361,
"rewards/preference_model_reward": 0.8504317253828049,
"rewards/preference_model_reward/std": 0.20005719922482967,
"step": 253
},
{
"clip_ratio": 0.00037185212204349227,
"epoch": 1.733446519524618,
"grad_norm": 0.7593220266059924,
"kl": 0.07269287109375,
"learning_rate": 2e-06,
"loss": 0.0018,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 422.05078125,
"epoch": 1.7402376910016977,
"grad_norm": 0.8256725125473834,
"kl": 0.06494140625,
"learning_rate": 2e-06,
"loss": 0.0065,
"reward": 0.851757287979126,
"reward_std": 0.2138998694717884,
"rewards/preference_model_reward": 0.851757287979126,
"rewards/preference_model_reward/std": 0.21389986481517553,
"step": 255
},
{
"clip_ratio": 0.00029054368678771425,
"epoch": 1.7470288624787775,
"grad_norm": 0.8768036487045519,
"kl": 0.065338134765625,
"learning_rate": 2e-06,
"loss": 0.0064,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 606.53515625,
"epoch": 1.7538200339558574,
"grad_norm": 0.7028967106094474,
"kl": 0.071868896484375,
"learning_rate": 2e-06,
"loss": 0.0011,
"reward": 0.908553458750248,
"reward_std": 0.1910355999134481,
"rewards/preference_model_reward": 0.908553458750248,
"rewards/preference_model_reward/std": 0.19103559292852879,
"step": 257
},
{
"clip_ratio": 0.0003378584806341678,
"epoch": 1.7606112054329373,
"grad_norm": 0.6669227299061052,
"kl": 0.07257080078125,
"learning_rate": 2e-06,
"loss": 0.0011,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 484.171875,
"epoch": 1.7674023769100171,
"grad_norm": 0.589142092316522,
"kl": 0.07623291015625,
"learning_rate": 2e-06,
"loss": 0.0014,
"reward": 0.9417537078261375,
"reward_std": 0.15645512472838163,
"rewards/preference_model_reward": 0.9417537078261375,
"rewards/preference_model_reward/std": 0.15645511914044619,
"step": 259
},
{
"clip_ratio": 0.0003804455500358017,
"epoch": 1.7741935483870968,
"grad_norm": 0.5638842375184904,
"kl": 0.07666015625,
"learning_rate": 2e-06,
"loss": 0.0014,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 457.60546875,
"epoch": 1.7809847198641766,
"grad_norm": 0.8955077816264015,
"kl": 0.08245849609375,
"learning_rate": 2e-06,
"loss": -0.0022,
"reward": 0.8265820220112801,
"reward_std": 0.21266387501964346,
"rewards/preference_model_reward": 0.8265820220112801,
"rewards/preference_model_reward/std": 0.21266387071227655,
"step": 261
},
{
"clip_ratio": 0.0004603340476023732,
"epoch": 1.7877758913412563,
"grad_norm": 0.8188440887100487,
"kl": 0.082763671875,
"learning_rate": 2e-06,
"loss": -0.0022,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 559.12890625,
"epoch": 1.7945670628183361,
"grad_norm": 0.7420003421814586,
"kl": 0.06298828125,
"learning_rate": 2e-06,
"loss": -0.0019,
"reward": 0.8031069450080395,
"reward_std": 0.18254950502887368,
"rewards/preference_model_reward": 0.8031069450080395,
"rewards/preference_model_reward/std": 0.1825494933873415,
"step": 263
},
{
"clip_ratio": 0.0002911918672907632,
"epoch": 1.801358234295416,
"grad_norm": 0.7226550220045151,
"kl": 0.06256103515625,
"learning_rate": 2e-06,
"loss": -0.002,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 393.953125,
"epoch": 1.8081494057724958,
"grad_norm": 0.8002731044836809,
"kl": 0.077423095703125,
"learning_rate": 2e-06,
"loss": -0.0042,
"reward": 0.8755168691277504,
"reward_std": 0.2164273001253605,
"rewards/preference_model_reward": 0.8755168691277504,
"rewards/preference_model_reward/std": 0.2164272964000702,
"step": 265
},
{
"clip_ratio": 0.0002747558801274863,
"epoch": 1.8149405772495757,
"grad_norm": 0.7854043108449068,
"kl": 0.07794189453125,
"learning_rate": 2e-06,
"loss": -0.0042,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 393.8359375,
"epoch": 1.8217317487266553,
"grad_norm": 0.9479972413749124,
"kl": 0.08074951171875,
"learning_rate": 2e-06,
"loss": -0.0107,
"reward": 0.8254084438085556,
"reward_std": 0.1835477078857366,
"rewards/preference_model_reward": 0.8254084438085556,
"rewards/preference_model_reward/std": 0.18354770765290596,
"step": 267
},
{
"clip_ratio": 0.0003044331115233945,
"epoch": 1.8285229202037352,
"grad_norm": 0.8555828495346359,
"kl": 0.08099365234375,
"learning_rate": 2e-06,
"loss": -0.0107,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 493.96484375,
"epoch": 1.8353140916808148,
"grad_norm": 0.6643254122645025,
"kl": 0.07830810546875,
"learning_rate": 2e-06,
"loss": 0.0029,
"reward": 0.8742033094167709,
"reward_std": 0.17645000852644444,
"rewards/preference_model_reward": 0.8742033094167709,
"rewards/preference_model_reward/std": 0.1764500148128718,
"step": 269
},
{
"clip_ratio": 0.00037674069881177275,
"epoch": 1.8421052631578947,
"grad_norm": 0.7454046347235634,
"kl": 0.0787353515625,
"learning_rate": 2e-06,
"loss": 0.0029,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 429.734375,
"epoch": 1.8488964346349746,
"grad_norm": 0.7255152868461135,
"kl": 0.07861328125,
"learning_rate": 2e-06,
"loss": -0.001,
"reward": 0.8850414156913757,
"reward_std": 0.17634376138448715,
"rewards/preference_model_reward": 0.8850414156913757,
"rewards/preference_model_reward/std": 0.1763437541667372,
"step": 271
},
{
"clip_ratio": 0.0002247508855361957,
"epoch": 1.8556876061120544,
"grad_norm": 0.7200469091029748,
"kl": 0.07861328125,
"learning_rate": 2e-06,
"loss": -0.001,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 404.3515625,
"epoch": 1.8624787775891343,
"grad_norm": 0.7158241923732864,
"kl": 0.07818603515625,
"learning_rate": 2e-06,
"loss": -0.0003,
"reward": 0.8412534669041634,
"reward_std": 0.1782067846506834,
"rewards/preference_model_reward": 0.8412534669041634,
"rewards/preference_model_reward/std": 0.17820678371936083,
"step": 273
},
{
"clip_ratio": 0.00020900591698591597,
"epoch": 1.869269949066214,
"grad_norm": 0.7312235131971142,
"kl": 0.07806396484375,
"learning_rate": 2e-06,
"loss": -0.0004,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 447.046875,
"epoch": 1.8760611205432938,
"grad_norm": 0.7216675721694292,
"kl": 0.074462890625,
"learning_rate": 2e-06,
"loss": 0.0012,
"reward": 0.8510579615831375,
"reward_std": 0.24259257689118385,
"rewards/preference_model_reward": 0.8510579615831375,
"rewards/preference_model_reward/std": 0.24259258434176445,
"step": 275
},
{
"clip_ratio": 0.0003766012450796552,
"epoch": 1.8828522920203734,
"grad_norm": 0.7280400520472221,
"kl": 0.074951171875,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 453.65625,
"epoch": 1.8896434634974533,
"grad_norm": 0.8584892428121671,
"kl": 0.079376220703125,
"learning_rate": 2e-06,
"loss": 0.0052,
"reward": 0.8504569008946419,
"reward_std": 0.24195780232548714,
"rewards/preference_model_reward": 0.8504569008946419,
"rewards/preference_model_reward/std": 0.24195779021829367,
"step": 277
},
{
"clip_ratio": 0.000481072609545663,
"epoch": 1.8964346349745331,
"grad_norm": 0.8623823368670106,
"kl": 0.079437255859375,
"learning_rate": 2e-06,
"loss": 0.0051,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 428.984375,
"epoch": 1.903225806451613,
"grad_norm": 0.65090353158444,
"kl": 0.07568359375,
"learning_rate": 2e-06,
"loss": -0.0078,
"reward": 0.9181569963693619,
"reward_std": 0.17035586189012975,
"rewards/preference_model_reward": 0.9181569963693619,
"rewards/preference_model_reward/std": 0.17035586202109698,
"step": 279
},
{
"clip_ratio": 0.0002422129618935287,
"epoch": 1.9100169779286928,
"grad_norm": 0.6478462369679827,
"kl": 0.07586669921875,
"learning_rate": 2e-06,
"loss": -0.0078,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 486.6953125,
"epoch": 1.9168081494057725,
"grad_norm": 0.7048516661988414,
"kl": 0.07611083984375,
"learning_rate": 2e-06,
"loss": 0.0012,
"reward": 0.8832282423973083,
"reward_std": 0.20259307883679867,
"rewards/preference_model_reward": 0.8832282423973083,
"rewards/preference_model_reward/std": 0.20259307883679867,
"step": 281
},
{
"clip_ratio": 0.00031240142743627075,
"epoch": 1.9235993208828521,
"grad_norm": 0.6842247007649929,
"kl": 0.076324462890625,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 446.12109375,
"epoch": 1.930390492359932,
"grad_norm": 0.7414576613573336,
"kl": 0.08099365234375,
"learning_rate": 2e-06,
"loss": 0.0029,
"reward": 0.8914339393377304,
"reward_std": 0.15328312013298273,
"rewards/preference_model_reward": 0.8914339393377304,
"rewards/preference_model_reward/std": 0.1532831130316481,
"step": 283
},
{
"clip_ratio": 0.00038151090484461747,
"epoch": 1.9371816638370118,
"grad_norm": 0.7360576429069317,
"kl": 0.08160400390625,
"learning_rate": 2e-06,
"loss": 0.0029,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 477.34765625,
"epoch": 1.9439728353140917,
"grad_norm": 0.8076039752992614,
"kl": 0.07861328125,
"learning_rate": 2e-06,
"loss": -0.0035,
"reward": 0.8031719997525215,
"reward_std": 0.22500982508063316,
"rewards/preference_model_reward": 0.8031719997525215,
"rewards/preference_model_reward/std": 0.22500981856137514,
"step": 285
},
{
"clip_ratio": 0.000369085326383356,
"epoch": 1.9507640067911716,
"grad_norm": 0.8009414546069689,
"kl": 0.07916259765625,
"learning_rate": 2e-06,
"loss": -0.0035,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 456.796875,
"epoch": 1.9575551782682514,
"grad_norm": 0.7202067622191793,
"kl": 0.072509765625,
"learning_rate": 2e-06,
"loss": -0.0031,
"reward": 0.9239321351051331,
"reward_std": 0.1347129621426575,
"rewards/preference_model_reward": 0.9239321351051331,
"rewards/preference_model_reward/std": 0.13471295684576035,
"step": 287
},
{
"clip_ratio": 0.00027696689903677907,
"epoch": 1.964346349745331,
"grad_norm": 0.4938288163422082,
"kl": 0.0726318359375,
"learning_rate": 2e-06,
"loss": -0.0031,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 364.72265625,
"epoch": 1.9711375212224107,
"grad_norm": 0.7669478819520723,
"kl": 0.08953857421875,
"learning_rate": 2e-06,
"loss": -0.0036,
"reward": 0.8362223468720913,
"reward_std": 0.19234392209909856,
"rewards/preference_model_reward": 0.8362223468720913,
"rewards/preference_model_reward/std": 0.19234391255304217,
"step": 289
},
{
"clip_ratio": 0.000403772370191291,
"epoch": 1.9779286926994906,
"grad_norm": 2.6810045962105375,
"kl": 0.089599609375,
"learning_rate": 2e-06,
"loss": -0.0037,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 457.55859375,
"epoch": 1.9847198641765704,
"grad_norm": 0.8500553867554066,
"kl": 0.07928466796875,
"learning_rate": 2e-06,
"loss": -0.0072,
"reward": 0.845167201012373,
"reward_std": 0.1875469057704322,
"rewards/preference_model_reward": 0.845167201012373,
"rewards/preference_model_reward/std": 0.18754690227797255,
"step": 291
},
{
"clip_ratio": 0.0004131398400204489,
"epoch": 1.9915110356536503,
"grad_norm": 0.8328407125726051,
"kl": 0.07965087890625,
"learning_rate": 2e-06,
"loss": -0.0073,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 464.4375,
"epoch": 2.00679117147708,
"grad_norm": 0.7832085644539297,
"kl": 0.08221435546875,
"learning_rate": 2e-06,
"loss": 0.001,
"reward": 0.8732399269938469,
"reward_std": 0.22111400961875916,
"rewards/preference_model_reward": 0.8732399269938469,
"rewards/preference_model_reward/std": 0.22111400589346886,
"step": 293
},
{
"clip_ratio": 0.0002739406791079091,
"epoch": 2.0135823429541597,
"grad_norm": 0.7473092873628361,
"kl": 0.082275390625,
"learning_rate": 2e-06,
"loss": 0.0009,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 540.265625,
"epoch": 2.0203735144312396,
"grad_norm": 0.4811242877169514,
"kl": 0.07806396484375,
"learning_rate": 2e-06,
"loss": 0.0044,
"reward": 0.9485599547624588,
"reward_std": 0.12165670236572623,
"rewards/preference_model_reward": 0.9485599547624588,
"rewards/preference_model_reward/std": 0.12165670190006495,
"step": 295
},
{
"clip_ratio": 0.00019723791228898335,
"epoch": 2.027164685908319,
"grad_norm": 0.48620380717534856,
"kl": 0.0780029296875,
"learning_rate": 2e-06,
"loss": 0.0043,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 438.375,
"epoch": 2.033955857385399,
"grad_norm": 0.6998442696000419,
"kl": 0.08050537109375,
"learning_rate": 2e-06,
"loss": 0.0042,
"reward": 0.8833131715655327,
"reward_std": 0.18213203502818942,
"rewards/preference_model_reward": 0.8833131715655327,
"rewards/preference_model_reward/std": 0.18213202757760882,
"step": 297
},
{
"clip_ratio": 0.00025544029949742253,
"epoch": 2.0407470288624787,
"grad_norm": 0.7155677820275104,
"kl": 0.0804443359375,
"learning_rate": 2e-06,
"loss": 0.0043,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 417.74609375,
"epoch": 2.0475382003395586,
"grad_norm": 0.6253438347592372,
"kl": 0.0816650390625,
"learning_rate": 2e-06,
"loss": -0.0002,
"reward": 0.9034418389201164,
"reward_std": 0.13920773862628266,
"rewards/preference_model_reward": 0.9034418389201164,
"rewards/preference_model_reward/std": 0.13920772803248838,
"step": 299
},
{
"clip_ratio": 0.0002523756156733725,
"epoch": 2.0543293718166384,
"grad_norm": 0.6171154618996508,
"kl": 0.08148193359375,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 625,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}