s300-1.3.0L-GRPO-it1 / trainer_state.json
PocketDoc's picture
Upload folder using huggingface_hub
d574e2a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2546689303904924,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 154.90625,
"epoch": 0.0008488964346349745,
"grad_norm": 1.373261530904728,
"kl": 0.0003566741943359375,
"learning_rate": 0.0,
"loss": -0.0035,
"reward": 0.12956976890563965,
"reward_std": 0.10243552178144455,
"rewards/preference_model_reward": 0.12956976890563965,
"rewards/preference_model_reward/std": 0.10243552923202515,
"step": 1
},
{
"clip_ratio": 0.0,
"epoch": 0.001697792869269949,
"grad_norm": 1.373744508768238,
"kl": 0.0003566741943359375,
"learning_rate": 1e-07,
"loss": -0.0035,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 426.03125,
"epoch": 0.0025466893039049238,
"grad_norm": 0.01976094802569778,
"kl": 0.0003337860107421875,
"learning_rate": 2e-07,
"loss": -0.0,
"reward": 0.007162425667047501,
"reward_std": 0.002347785048186779,
"rewards/preference_model_reward": 0.007162425667047501,
"rewards/preference_model_reward/std": 0.0023477852810174227,
"step": 3
},
{
"clip_ratio": 0.00029364757938310504,
"epoch": 0.003395585738539898,
"grad_norm": 0.019704841225345854,
"kl": 0.000339508056640625,
"learning_rate": 3e-07,
"loss": -0.0001,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 205.59375,
"epoch": 0.004244482173174873,
"grad_norm": 0.8550671585380242,
"kl": 0.000408172607421875,
"learning_rate": 4e-07,
"loss": 0.0011,
"reward": 0.0704927146434784,
"reward_std": 0.06750915944576263,
"rewards/preference_model_reward": 0.0704927146434784,
"rewards/preference_model_reward/std": 0.06750915199518204,
"step": 5
},
{
"clip_ratio": 0.0,
"epoch": 0.0050933786078098476,
"grad_norm": 0.7361877708957172,
"kl": 0.0003910064697265625,
"learning_rate": 5e-07,
"loss": 0.0011,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 227.15625,
"epoch": 0.005942275042444821,
"grad_norm": 0.38706524237759515,
"kl": 0.0003662109375,
"learning_rate": 6e-07,
"loss": -0.0007,
"reward": 0.03637976944446564,
"reward_std": 0.037161991000175476,
"rewards/preference_model_reward": 0.03637976944446564,
"rewards/preference_model_reward/std": 0.037161991000175476,
"step": 7
},
{
"clip_ratio": 0.0003041362506337464,
"epoch": 0.006791171477079796,
"grad_norm": 0.3886457621776694,
"kl": 0.000339508056640625,
"learning_rate": 7e-07,
"loss": -0.0007,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 384.0625,
"epoch": 0.007640067911714771,
"grad_norm": 1.3360350931326528,
"kl": 0.0003337860107421875,
"learning_rate": 8e-07,
"loss": 0.0003,
"reward": 0.2957379221916199,
"reward_std": 0.1667662262916565,
"rewards/preference_model_reward": 0.2957379221916199,
"rewards/preference_model_reward/std": 0.1667662262916565,
"step": 9
},
{
"clip_ratio": 0.0004007347160950303,
"epoch": 0.008488964346349746,
"grad_norm": 1.3492099330380622,
"kl": 0.00035858154296875,
"learning_rate": 9e-07,
"loss": 0.0003,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 271.34375,
"epoch": 0.00933786078098472,
"grad_norm": 0.5844743318217549,
"kl": 0.0004730224609375,
"learning_rate": 1e-06,
"loss": -0.0055,
"reward": 0.06409081071615219,
"reward_std": 0.05993795394897461,
"rewards/preference_model_reward": 0.06409081071615219,
"rewards/preference_model_reward/std": 0.05993795767426491,
"step": 11
},
{
"clip_ratio": 0.0003285869024693966,
"epoch": 0.010186757215619695,
"grad_norm": 0.5853835659686467,
"kl": 0.0005340576171875,
"learning_rate": 1.1e-06,
"loss": -0.0055,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 323.875,
"epoch": 0.011035653650254669,
"grad_norm": 0.5037341718615376,
"kl": 0.00077056884765625,
"learning_rate": 1.2e-06,
"loss": -0.0005,
"reward": 0.05134192109107971,
"reward_std": 0.05402546375989914,
"rewards/preference_model_reward": 0.05134192109107971,
"rewards/preference_model_reward/std": 0.05402546748518944,
"step": 13
},
{
"clip_ratio": 0.00019549165153875947,
"epoch": 0.011884550084889643,
"grad_norm": 0.5354006783262033,
"kl": 0.00098419189453125,
"learning_rate": 1.3e-06,
"loss": -0.0006,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 389.53125,
"epoch": 0.012733446519524618,
"grad_norm": 0.4376233615388839,
"kl": 0.00057220458984375,
"learning_rate": 1.4e-06,
"loss": 0.0,
"reward": 0.07725013792514801,
"reward_std": 0.0637926235795021,
"rewards/preference_model_reward": 0.07725013792514801,
"rewards/preference_model_reward/std": 0.0637926235795021,
"step": 15
},
{
"clip_ratio": 0.0001568605366628617,
"epoch": 0.013582342954159592,
"grad_norm": 0.43520351533225,
"kl": 0.000732421875,
"learning_rate": 1.5e-06,
"loss": -0.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 225.625,
"epoch": 0.014431239388794566,
"grad_norm": 0.009342079121610454,
"kl": 0.00156402587890625,
"learning_rate": 1.6e-06,
"loss": -0.0,
"reward": 0.003957257140427828,
"reward_std": 0.0007005692459642887,
"rewards/preference_model_reward": 0.003957257140427828,
"rewards/preference_model_reward/std": 0.0007005691877566278,
"step": 17
},
{
"clip_ratio": 0.0,
"epoch": 0.015280135823429542,
"grad_norm": 0.009135995120751321,
"kl": 0.0016937255859375,
"learning_rate": 1.6999999999999998e-06,
"loss": -0.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 618.53125,
"epoch": 0.016129032258064516,
"grad_norm": 0.8342082066432049,
"kl": 0.00165557861328125,
"learning_rate": 1.8e-06,
"loss": -0.003,
"reward": 0.30697351694107056,
"reward_std": 0.12625738978385925,
"rewards/preference_model_reward": 0.30697351694107056,
"rewards/preference_model_reward/std": 0.12625740468502045,
"step": 19
},
{
"clip_ratio": 0.00025422428734600544,
"epoch": 0.01697792869269949,
"grad_norm": 0.8045973414054722,
"kl": 0.00189208984375,
"learning_rate": 1.8999999999999998e-06,
"loss": -0.0032,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 341.0,
"epoch": 0.017826825127334467,
"grad_norm": 0.15747378780960536,
"kl": 0.0018157958984375,
"learning_rate": 2e-06,
"loss": 0.0013,
"reward": 0.025423400104045868,
"reward_std": 0.022510820999741554,
"rewards/preference_model_reward": 0.025423400104045868,
"rewards/preference_model_reward/std": 0.022510822862386703,
"step": 21
},
{
"clip_ratio": 0.00045937151298858225,
"epoch": 0.01867572156196944,
"grad_norm": 0.15703129987750525,
"kl": 0.0022125244140625,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 232.5625,
"epoch": 0.019524617996604415,
"grad_norm": 0.4416727840666515,
"kl": 0.0036468505859375,
"learning_rate": 2e-06,
"loss": 0.0003,
"reward": 0.04581625759601593,
"reward_std": 0.0430915392935276,
"rewards/preference_model_reward": 0.04581625759601593,
"rewards/preference_model_reward/std": 0.0430915392935276,
"step": 23
},
{
"clip_ratio": 0.0006831242935732007,
"epoch": 0.02037351443123939,
"grad_norm": 0.44514421266142573,
"kl": 0.0042724609375,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 212.34375,
"epoch": 0.021222410865874362,
"grad_norm": 0.965193272956362,
"kl": 0.006103515625,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.11097941547632217,
"reward_std": 0.0762963593006134,
"rewards/preference_model_reward": 0.11097941547632217,
"rewards/preference_model_reward/std": 0.0762963593006134,
"step": 25
},
{
"clip_ratio": 0.0,
"epoch": 0.022071307300509338,
"grad_norm": 0.9125624994776861,
"kl": 0.00677490234375,
"learning_rate": 2e-06,
"loss": -0.0016,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 270.6875,
"epoch": 0.022920203735144314,
"grad_norm": 1.021646949062738,
"kl": 0.00738525390625,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.14990350604057312,
"reward_std": 0.10197865962982178,
"rewards/preference_model_reward": 0.14990350604057312,
"rewards/preference_model_reward/std": 0.10197865217924118,
"step": 27
},
{
"clip_ratio": 0.0003397603868506849,
"epoch": 0.023769100169779286,
"grad_norm": 1.0520153952968034,
"kl": 0.00872802734375,
"learning_rate": 2e-06,
"loss": -0.0027,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 758.28125,
"epoch": 0.02461799660441426,
"grad_norm": 0.7119844877358423,
"kl": 0.006744384765625,
"learning_rate": 2e-06,
"loss": 0.0009,
"reward": 0.09009624272584915,
"reward_std": 0.09022250026464462,
"rewards/preference_model_reward": 0.09009624272584915,
"rewards/preference_model_reward/std": 0.09022250026464462,
"step": 29
},
{
"clip_ratio": 0.0006554110441356897,
"epoch": 0.025466893039049237,
"grad_norm": 0.5478668890905144,
"kl": 0.007568359375,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 317.96875,
"epoch": 0.02631578947368421,
"grad_norm": 0.3101174990761895,
"kl": 0.00958251953125,
"learning_rate": 2e-06,
"loss": 0.0011,
"reward": 0.05284169688820839,
"reward_std": 0.028878774493932724,
"rewards/preference_model_reward": 0.05284169688820839,
"rewards/preference_model_reward/std": 0.028878774493932724,
"step": 31
},
{
"clip_ratio": 0.0008713441202417016,
"epoch": 0.027164685908319185,
"grad_norm": 0.2969256230681903,
"kl": 0.01007080078125,
"learning_rate": 2e-06,
"loss": 0.0009,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 346.5,
"epoch": 0.02801358234295416,
"grad_norm": 0.5964158292526848,
"kl": 0.01025390625,
"learning_rate": 2e-06,
"loss": -0.0002,
"reward": 0.10442396998405457,
"reward_std": 0.0710761621594429,
"rewards/preference_model_reward": 0.10442396998405457,
"rewards/preference_model_reward/std": 0.0710761621594429,
"step": 33
},
{
"clip_ratio": 0.001167232054285705,
"epoch": 0.028862478777589132,
"grad_norm": 0.6459875868432908,
"kl": 0.011474609375,
"learning_rate": 2e-06,
"loss": -0.0004,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 325.125,
"epoch": 0.029711375212224108,
"grad_norm": 0.8665540483399039,
"kl": 0.012451171875,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.20713286101818085,
"reward_std": 0.08856458961963654,
"rewards/preference_model_reward": 0.20713286101818085,
"rewards/preference_model_reward/std": 0.08856458961963654,
"step": 35
},
{
"clip_ratio": 0.0004757290589623153,
"epoch": 0.030560271646859084,
"grad_norm": 0.8458082294842536,
"kl": 0.013916015625,
"learning_rate": 2e-06,
"loss": -0.001,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 259.5625,
"epoch": 0.031409168081494056,
"grad_norm": 0.6451884349368371,
"kl": 0.018798828125,
"learning_rate": 2e-06,
"loss": -0.0021,
"reward": 0.07655464112758636,
"reward_std": 0.06220533698797226,
"rewards/preference_model_reward": 0.07655464112758636,
"rewards/preference_model_reward/std": 0.06220533698797226,
"step": 37
},
{
"clip_ratio": 0.00036129303043708205,
"epoch": 0.03225806451612903,
"grad_norm": 0.6180427670131671,
"kl": 0.020751953125,
"learning_rate": 2e-06,
"loss": -0.0023,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 217.6875,
"epoch": 0.03310696095076401,
"grad_norm": 0.15623847767793025,
"kl": 0.027099609375,
"learning_rate": 2e-06,
"loss": 0.0006,
"reward": 0.01216259878128767,
"reward_std": 0.015137026086449623,
"rewards/preference_model_reward": 0.01216259878128767,
"rewards/preference_model_reward/std": 0.015137026086449623,
"step": 39
},
{
"clip_ratio": 0.0,
"epoch": 0.03395585738539898,
"grad_norm": 0.16166488666255469,
"kl": 0.02880859375,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 340.25,
"epoch": 0.03480475382003396,
"grad_norm": 1.2594757575533009,
"kl": 0.022216796875,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 0.36821770668029785,
"reward_std": 0.10101611167192459,
"rewards/preference_model_reward": 0.36821770668029785,
"rewards/preference_model_reward/std": 0.101016104221344,
"step": 41
},
{
"clip_ratio": 0.0007307034684345126,
"epoch": 0.035653650254668934,
"grad_norm": 1.027698571984344,
"kl": 0.0245361328125,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 303.3125,
"epoch": 0.0365025466893039,
"grad_norm": 0.4510741866786348,
"kl": 0.0252685546875,
"learning_rate": 2e-06,
"loss": 0.0,
"reward": 0.056258413940668106,
"reward_std": 0.05226214602589607,
"rewards/preference_model_reward": 0.056258413940668106,
"rewards/preference_model_reward/std": 0.05226214602589607,
"step": 43
},
{
"clip_ratio": 0.0008270645630545914,
"epoch": 0.03735144312393888,
"grad_norm": 0.4538945689960808,
"kl": 0.02685546875,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 696.4375,
"epoch": 0.038200339558573854,
"grad_norm": 0.6552940518465648,
"kl": 0.02490234375,
"learning_rate": 2e-06,
"loss": 0.001,
"reward": 0.28145015239715576,
"reward_std": 0.09661795943975449,
"rewards/preference_model_reward": 0.28145015239715576,
"rewards/preference_model_reward/std": 0.09661795198917389,
"step": 45
},
{
"clip_ratio": 0.0003099275636486709,
"epoch": 0.03904923599320883,
"grad_norm": 0.6063830680426688,
"kl": 0.026123046875,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 185.5625,
"epoch": 0.039898132427843805,
"grad_norm": 0.8803723441580334,
"kl": 0.036376953125,
"learning_rate": 2e-06,
"loss": 0.0057,
"reward": 0.14047113060951233,
"reward_std": 0.07379527390003204,
"rewards/preference_model_reward": 0.14047113060951233,
"rewards/preference_model_reward/std": 0.07379526644945145,
"step": 47
},
{
"clip_ratio": 0.00016545334074180573,
"epoch": 0.04074702886247878,
"grad_norm": 0.8768664086923781,
"kl": 0.03857421875,
"learning_rate": 2e-06,
"loss": 0.0053,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 409.84375,
"epoch": 0.04159592529711375,
"grad_norm": 0.790693661049164,
"kl": 0.0303955078125,
"learning_rate": 2e-06,
"loss": 0.0013,
"reward": 0.16792196035385132,
"reward_std": 0.08975110948085785,
"rewards/preference_model_reward": 0.16792196035385132,
"rewards/preference_model_reward/std": 0.08975110203027725,
"step": 49
},
{
"clip_ratio": 0.0004502690862864256,
"epoch": 0.042444821731748725,
"grad_norm": 0.7661168129852652,
"kl": 0.03173828125,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 186.875,
"epoch": 0.0432937181663837,
"grad_norm": 0.5403939950650409,
"kl": 0.044189453125,
"learning_rate": 2e-06,
"loss": -0.0019,
"reward": 0.06892818212509155,
"reward_std": 0.036003705114126205,
"rewards/preference_model_reward": 0.06892818212509155,
"rewards/preference_model_reward/std": 0.036003705114126205,
"step": 51
},
{
"clip_ratio": 0.002485671080648899,
"epoch": 0.044142614601018676,
"grad_norm": 0.5730308488231836,
"kl": 0.047119140625,
"learning_rate": 2e-06,
"loss": -0.0021,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 335.25,
"epoch": 0.04499151103565365,
"grad_norm": 0.7389454596845632,
"kl": 0.041748046875,
"learning_rate": 2e-06,
"loss": -0.0063,
"reward": 0.28060293197631836,
"reward_std": 0.07954739779233932,
"rewards/preference_model_reward": 0.28060293197631836,
"rewards/preference_model_reward/std": 0.07954739034175873,
"step": 53
},
{
"clip_ratio": 0.0005645600031130016,
"epoch": 0.04584040747028863,
"grad_norm": 0.7275148836573597,
"kl": 0.04296875,
"learning_rate": 2e-06,
"loss": -0.0066,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 321.6875,
"epoch": 0.0466893039049236,
"grad_norm": 0.673383864573446,
"kl": 0.031494140625,
"learning_rate": 2e-06,
"loss": 0.0018,
"reward": 0.13006240129470825,
"reward_std": 0.07470076531171799,
"rewards/preference_model_reward": 0.13006240129470825,
"rewards/preference_model_reward/std": 0.07470076531171799,
"step": 55
},
{
"clip_ratio": 0.00018761330284178257,
"epoch": 0.04753820033955857,
"grad_norm": 0.5932243482520125,
"kl": 0.03125,
"learning_rate": 2e-06,
"loss": 0.0015,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 629.0625,
"epoch": 0.04838709677419355,
"grad_norm": 0.5943555323097186,
"kl": 0.03759765625,
"learning_rate": 2e-06,
"loss": 0.0022,
"reward": 0.16671660542488098,
"reward_std": 0.08239807188510895,
"rewards/preference_model_reward": 0.16671660542488098,
"rewards/preference_model_reward/std": 0.08239807188510895,
"step": 57
},
{
"clip_ratio": 0.0007027126266621053,
"epoch": 0.04923599320882852,
"grad_norm": 0.5944317831243726,
"kl": 0.0390625,
"learning_rate": 2e-06,
"loss": 0.002,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 221.90625,
"epoch": 0.0500848896434635,
"grad_norm": 0.4673677413132102,
"kl": 0.06689453125,
"learning_rate": 2e-06,
"loss": -0.0017,
"reward": 0.031242549419403076,
"reward_std": 0.04061814025044441,
"rewards/preference_model_reward": 0.031242549419403076,
"rewards/preference_model_reward/std": 0.04061814025044441,
"step": 59
},
{
"clip_ratio": 0.0010640884283930063,
"epoch": 0.050933786078098474,
"grad_norm": 0.46769299125491254,
"kl": 0.0693359375,
"learning_rate": 2e-06,
"loss": -0.0019,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 235.625,
"epoch": 0.05178268251273345,
"grad_norm": 0.9433079745488324,
"kl": 0.041259765625,
"learning_rate": 2e-06,
"loss": -0.0046,
"reward": 0.3223969340324402,
"reward_std": 0.08566058427095413,
"rewards/preference_model_reward": 0.3223969340324402,
"rewards/preference_model_reward/std": 0.08566058427095413,
"step": 61
},
{
"clip_ratio": 0.0005407010903581977,
"epoch": 0.05263157894736842,
"grad_norm": 0.8827749256609521,
"kl": 0.043212890625,
"learning_rate": 2e-06,
"loss": -0.0051,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 321.21875,
"epoch": 0.053480475382003394,
"grad_norm": 1.376741075127532,
"kl": 0.072265625,
"learning_rate": 2e-06,
"loss": -0.0027,
"reward": 0.18953999876976013,
"reward_std": 0.0982605516910553,
"rewards/preference_model_reward": 0.18953999876976013,
"rewards/preference_model_reward/std": 0.0982605367898941,
"step": 63
},
{
"clip_ratio": 0.0017892650794237852,
"epoch": 0.05432937181663837,
"grad_norm": 1.0129637166861898,
"kl": 0.076171875,
"learning_rate": 2e-06,
"loss": -0.003,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 225.25,
"epoch": 0.055178268251273345,
"grad_norm": 1.0280761369580704,
"kl": 0.08203125,
"learning_rate": 2e-06,
"loss": -0.0121,
"reward": 0.3603067398071289,
"reward_std": 0.09477485716342926,
"rewards/preference_model_reward": 0.3603067398071289,
"rewards/preference_model_reward/std": 0.09477484971284866,
"step": 65
},
{
"clip_ratio": 0.00028635968919843435,
"epoch": 0.05602716468590832,
"grad_norm": 1.012643043603393,
"kl": 0.08544921875,
"learning_rate": 2e-06,
"loss": -0.0126,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 718.90625,
"epoch": 0.056876061120543296,
"grad_norm": 0.5544711698032301,
"kl": 0.049560546875,
"learning_rate": 2e-06,
"loss": 0.0015,
"reward": 0.0976465493440628,
"reward_std": 0.08025789260864258,
"rewards/preference_model_reward": 0.0976465493440628,
"rewards/preference_model_reward/std": 0.08025789260864258,
"step": 67
},
{
"clip_ratio": 0.0004293117090128362,
"epoch": 0.057724957555178265,
"grad_norm": 0.5758752923955168,
"kl": 0.05029296875,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 313.8125,
"epoch": 0.05857385398981324,
"grad_norm": 0.5384984962195909,
"kl": 0.08056640625,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.037503279745578766,
"reward_std": 0.050285980105400085,
"rewards/preference_model_reward": 0.037503279745578766,
"rewards/preference_model_reward/std": 0.05028597638010979,
"step": 69
},
{
"clip_ratio": 0.000297203310765326,
"epoch": 0.059422750424448216,
"grad_norm": 0.5193566163583858,
"kl": 0.08251953125,
"learning_rate": 2e-06,
"loss": -0.0026,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 289.84375,
"epoch": 0.06027164685908319,
"grad_norm": 1.5206690115948938,
"kl": 0.0849609375,
"learning_rate": 2e-06,
"loss": 0.0006,
"reward": 0.3103345036506653,
"reward_std": 0.14627772569656372,
"rewards/preference_model_reward": 0.3103345036506653,
"rewards/preference_model_reward/std": 0.14627772569656372,
"step": 71
},
{
"clip_ratio": 0.0004342186148278415,
"epoch": 0.06112054329371817,
"grad_norm": 1.4731091282595996,
"kl": 0.0927734375,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 338.90625,
"epoch": 0.06196943972835314,
"grad_norm": 0.5932547022811241,
"kl": 0.0556640625,
"learning_rate": 2e-06,
"loss": -0.0004,
"reward": 0.10221391916275024,
"reward_std": 0.07499799132347107,
"rewards/preference_model_reward": 0.10221391916275024,
"rewards/preference_model_reward/std": 0.07499799132347107,
"step": 73
},
{
"clip_ratio": 0.00036755931796506047,
"epoch": 0.06281833616298811,
"grad_norm": 0.5752683509803187,
"kl": 0.05810546875,
"learning_rate": 2e-06,
"loss": -0.0006,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 311.28125,
"epoch": 0.0636672325976231,
"grad_norm": 0.5598161958043475,
"kl": 0.07080078125,
"learning_rate": 2e-06,
"loss": -0.0015,
"reward": 0.07023796439170837,
"reward_std": 0.06094999983906746,
"rewards/preference_model_reward": 0.07023796439170837,
"rewards/preference_model_reward/std": 0.06094999611377716,
"step": 75
},
{
"clip_ratio": 9.596929157851264e-05,
"epoch": 0.06451612903225806,
"grad_norm": 0.5569802415068833,
"kl": 0.07275390625,
"learning_rate": 2e-06,
"loss": -0.0018,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 227.625,
"epoch": 0.06536502546689305,
"grad_norm": 1.2813688126905285,
"kl": 0.09521484375,
"learning_rate": 2e-06,
"loss": -0.0028,
"reward": 0.1808125078678131,
"reward_std": 0.10328490287065506,
"rewards/preference_model_reward": 0.1808125078678131,
"rewards/preference_model_reward/std": 0.10328490287065506,
"step": 77
},
{
"clip_ratio": 0.0006906483322381973,
"epoch": 0.06621392190152801,
"grad_norm": 1.2343717842035047,
"kl": 0.09765625,
"learning_rate": 2e-06,
"loss": -0.0035,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 239.25,
"epoch": 0.06706281833616298,
"grad_norm": 1.2476565697443593,
"kl": 0.09423828125,
"learning_rate": 2e-06,
"loss": 0.0004,
"reward": 0.3275872468948364,
"reward_std": 0.10609177500009537,
"rewards/preference_model_reward": 0.3275872468948364,
"rewards/preference_model_reward/std": 0.10609177500009537,
"step": 79
},
{
"clip_ratio": 0.00013171759201213717,
"epoch": 0.06791171477079797,
"grad_norm": 1.191757488980422,
"kl": 0.09765625,
"learning_rate": 2e-06,
"loss": -0.0003,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 356.5,
"epoch": 0.06876061120543293,
"grad_norm": 0.10573846819647138,
"kl": 0.07666015625,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 0.023218905553221703,
"reward_std": 0.013258407823741436,
"rewards/preference_model_reward": 0.023218905553221703,
"rewards/preference_model_reward/std": 0.013258407823741436,
"step": 81
},
{
"clip_ratio": 0.0003507659712340683,
"epoch": 0.06960950764006792,
"grad_norm": 0.1103870844147634,
"kl": 0.0771484375,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 388.15625,
"epoch": 0.07045840407470289,
"grad_norm": 1.1684465615315396,
"kl": 0.11474609375,
"learning_rate": 2e-06,
"loss": -0.0114,
"reward": 0.16451352834701538,
"reward_std": 0.12782737612724304,
"rewards/preference_model_reward": 0.16451352834701538,
"rewards/preference_model_reward/std": 0.12782739102840424,
"step": 83
},
{
"clip_ratio": 0.00016542727826163173,
"epoch": 0.07130730050933787,
"grad_norm": 1.1475084655450116,
"kl": 0.115234375,
"learning_rate": 2e-06,
"loss": -0.012,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 717.90625,
"epoch": 0.07215619694397284,
"grad_norm": 0.1671276234234309,
"kl": 0.0439453125,
"learning_rate": 2e-06,
"loss": 0.0003,
"reward": 0.004496478941291571,
"reward_std": 0.01374930702149868,
"rewards/preference_model_reward": 0.004496478941291571,
"rewards/preference_model_reward/std": 0.01374930702149868,
"step": 85
},
{
"clip_ratio": 0.0011028368026018143,
"epoch": 0.0730050933786078,
"grad_norm": 0.11851554080319895,
"kl": 0.04150390625,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 597.875,
"epoch": 0.07385398981324279,
"grad_norm": 0.8930226438210058,
"kl": 0.10107421875,
"learning_rate": 2e-06,
"loss": -0.0002,
"reward": 0.2464292198419571,
"reward_std": 0.1284564882516861,
"rewards/preference_model_reward": 0.2464292198419571,
"rewards/preference_model_reward/std": 0.1284564733505249,
"step": 87
},
{
"clip_ratio": 0.0003217374032828957,
"epoch": 0.07470288624787776,
"grad_norm": 0.8976677967365754,
"kl": 0.10205078125,
"learning_rate": 2e-06,
"loss": -0.0007,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 309.0,
"epoch": 0.07555178268251274,
"grad_norm": 0.8544800016185599,
"kl": 0.1005859375,
"learning_rate": 2e-06,
"loss": -0.0008,
"reward": 0.11918962001800537,
"reward_std": 0.08243891596794128,
"rewards/preference_model_reward": 0.11918962001800537,
"rewards/preference_model_reward/std": 0.08243890851736069,
"step": 89
},
{
"clip_ratio": 0.0003866804763674736,
"epoch": 0.07640067911714771,
"grad_norm": 0.9945706586547373,
"kl": 0.10107421875,
"learning_rate": 2e-06,
"loss": -0.0013,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 603.15625,
"epoch": 0.07724957555178268,
"grad_norm": 0.6302983763196456,
"kl": 0.10009765625,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.13992220163345337,
"reward_std": 0.08116798847913742,
"rewards/preference_model_reward": 0.13992220163345337,
"rewards/preference_model_reward/std": 0.08116798847913742,
"step": 91
},
{
"clip_ratio": 0.00037479729508049786,
"epoch": 0.07809847198641766,
"grad_norm": 0.6164704370528037,
"kl": 0.099609375,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 312.15625,
"epoch": 0.07894736842105263,
"grad_norm": 1.3844161112543023,
"kl": 0.10888671875,
"learning_rate": 2e-06,
"loss": -0.0053,
"reward": 0.36870962381362915,
"reward_std": 0.134088933467865,
"rewards/preference_model_reward": 0.36870962381362915,
"rewards/preference_model_reward/std": 0.1340889185667038,
"step": 93
},
{
"clip_ratio": 0.001708789262920618,
"epoch": 0.07979626485568761,
"grad_norm": 1.2611380216933197,
"kl": 0.10888671875,
"learning_rate": 2e-06,
"loss": -0.0061,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 345.84375,
"epoch": 0.08064516129032258,
"grad_norm": 0.7996421963852738,
"kl": 0.1142578125,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.11349868029356003,
"reward_std": 0.08194027096033096,
"rewards/preference_model_reward": 0.11349868029356003,
"rewards/preference_model_reward/std": 0.08194026350975037,
"step": 95
},
{
"clip_ratio": 0.000727824226487428,
"epoch": 0.08149405772495756,
"grad_norm": 0.851345288015861,
"kl": 0.11474609375,
"learning_rate": 2e-06,
"loss": -0.0004,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 496.8125,
"epoch": 0.08234295415959253,
"grad_norm": 0.7233456374057684,
"kl": 0.0771484375,
"learning_rate": 2e-06,
"loss": 0.0005,
"reward": 0.062306515872478485,
"reward_std": 0.06466341018676758,
"rewards/preference_model_reward": 0.062306515872478485,
"rewards/preference_model_reward/std": 0.06466341018676758,
"step": 97
},
{
"clip_ratio": 0.0024484877940267324,
"epoch": 0.0831918505942275,
"grad_norm": 0.5356124474945269,
"kl": 0.0751953125,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 346.46875,
"epoch": 0.08404074702886248,
"grad_norm": 0.9856700443791508,
"kl": 0.08447265625,
"learning_rate": 2e-06,
"loss": -0.0026,
"reward": 0.128938689827919,
"reward_std": 0.10464771091938019,
"rewards/preference_model_reward": 0.128938689827919,
"rewards/preference_model_reward/std": 0.10464771091938019,
"step": 99
},
{
"clip_ratio": 0.00026480897213332355,
"epoch": 0.08488964346349745,
"grad_norm": 0.9630686055340228,
"kl": 0.08642578125,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 265.71875,
"epoch": 0.08573853989813243,
"grad_norm": 1.4006922971408575,
"kl": 0.099609375,
"learning_rate": 2e-06,
"loss": 0.0017,
"reward": 0.20826829969882965,
"reward_std": 0.08805741369724274,
"rewards/preference_model_reward": 0.20826829969882965,
"rewards/preference_model_reward/std": 0.08805741369724274,
"step": 101
},
{
"clip_ratio": 0.00047059552161954343,
"epoch": 0.0865874363327674,
"grad_norm": 1.0093723749145036,
"kl": 0.099609375,
"learning_rate": 2e-06,
"loss": 0.0014,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 352.34375,
"epoch": 0.08743633276740238,
"grad_norm": 0.7907866558806076,
"kl": 0.099609375,
"learning_rate": 2e-06,
"loss": -0.0069,
"reward": 0.1357034146785736,
"reward_std": 0.08183176815509796,
"rewards/preference_model_reward": 0.1357034146785736,
"rewards/preference_model_reward/std": 0.08183176815509796,
"step": 103
},
{
"clip_ratio": 0.00016943408991210163,
"epoch": 0.08828522920203735,
"grad_norm": 0.8163727108346147,
"kl": 0.1005859375,
"learning_rate": 2e-06,
"loss": -0.0074,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 405.1875,
"epoch": 0.08913412563667232,
"grad_norm": 1.2455238091620369,
"kl": 0.1015625,
"learning_rate": 2e-06,
"loss": -0.0006,
"reward": 0.32809197902679443,
"reward_std": 0.13614021241664886,
"rewards/preference_model_reward": 0.32809197902679443,
"rewards/preference_model_reward/std": 0.13614021241664886,
"step": 105
},
{
"clip_ratio": 0.00047510667354799807,
"epoch": 0.0899830220713073,
"grad_norm": 1.175271310166888,
"kl": 0.1044921875,
"learning_rate": 2e-06,
"loss": -0.0012,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 507.28125,
"epoch": 0.09083191850594227,
"grad_norm": 1.4891793433061915,
"kl": 0.09521484375,
"learning_rate": 2e-06,
"loss": 0.0058,
"reward": 0.3271234631538391,
"reward_std": 0.11905878782272339,
"rewards/preference_model_reward": 0.3271234631538391,
"rewards/preference_model_reward/std": 0.11905878782272339,
"step": 107
},
{
"clip_ratio": 0.0007332629174925387,
"epoch": 0.09168081494057725,
"grad_norm": 0.9610577492843277,
"kl": 0.09814453125,
"learning_rate": 2e-06,
"loss": 0.0054,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 677.375,
"epoch": 0.09252971137521222,
"grad_norm": 0.5710959299764182,
"kl": 0.1005859375,
"learning_rate": 2e-06,
"loss": 0.0005,
"reward": 0.3096367120742798,
"reward_std": 0.08406942337751389,
"rewards/preference_model_reward": 0.3096367120742798,
"rewards/preference_model_reward/std": 0.08406941592693329,
"step": 109
},
{
"clip_ratio": 0.00041483400855213404,
"epoch": 0.0933786078098472,
"grad_norm": 0.5381736695457521,
"kl": 0.10205078125,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 408.78125,
"epoch": 0.09422750424448217,
"grad_norm": 1.04134918622721,
"kl": 0.130859375,
"learning_rate": 2e-06,
"loss": -0.0019,
"reward": 0.16213001310825348,
"reward_std": 0.09974581748247147,
"rewards/preference_model_reward": 0.16213001310825348,
"rewards/preference_model_reward/std": 0.09974581748247147,
"step": 111
},
{
"clip_ratio": 0.0008149376371875405,
"epoch": 0.09507640067911714,
"grad_norm": 0.9160731616594122,
"kl": 0.1328125,
"learning_rate": 2e-06,
"loss": -0.0024,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 373.96875,
"epoch": 0.09592529711375213,
"grad_norm": 1.0946577521442298,
"kl": 0.12158203125,
"learning_rate": 2e-06,
"loss": 0.007,
"reward": 0.2697640657424927,
"reward_std": 0.10352278500795364,
"rewards/preference_model_reward": 0.2697640657424927,
"rewards/preference_model_reward/std": 0.10352278500795364,
"step": 113
},
{
"clip_ratio": 0.0013311142101883888,
"epoch": 0.0967741935483871,
"grad_norm": 0.9667455701612728,
"kl": 0.12353515625,
"learning_rate": 2e-06,
"loss": 0.0065,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 306.0,
"epoch": 0.09762308998302208,
"grad_norm": 1.3217916635151856,
"kl": 0.12890625,
"learning_rate": 2e-06,
"loss": 0.0095,
"reward": 0.26093003153800964,
"reward_std": 0.125474750995636,
"rewards/preference_model_reward": 0.26093003153800964,
"rewards/preference_model_reward/std": 0.125474750995636,
"step": 115
},
{
"clip_ratio": 0.0,
"epoch": 0.09847198641765705,
"grad_norm": 1.249179472861132,
"kl": 0.1318359375,
"learning_rate": 2e-06,
"loss": 0.0088,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 418.71875,
"epoch": 0.09932088285229201,
"grad_norm": 1.0159825028105314,
"kl": 0.126953125,
"learning_rate": 2e-06,
"loss": 0.0004,
"reward": 0.21273019909858704,
"reward_std": 0.10706693679094315,
"rewards/preference_model_reward": 0.21273019909858704,
"rewards/preference_model_reward/std": 0.10706692934036255,
"step": 117
},
{
"clip_ratio": 0.00037145469104871154,
"epoch": 0.100169779286927,
"grad_norm": 0.932911697631841,
"kl": 0.12890625,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 555.0625,
"epoch": 0.10101867572156197,
"grad_norm": 1.3593075889098412,
"kl": 0.1123046875,
"learning_rate": 2e-06,
"loss": 0.0152,
"reward": 0.42055854201316833,
"reward_std": 0.1595481038093567,
"rewards/preference_model_reward": 0.42055854201316833,
"rewards/preference_model_reward/std": 0.1595481038093567,
"step": 119
},
{
"clip_ratio": 0.0009689436410553753,
"epoch": 0.10186757215619695,
"grad_norm": 1.5262381686745565,
"kl": 0.115234375,
"learning_rate": 2e-06,
"loss": 0.0144,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 357.5625,
"epoch": 0.10271646859083192,
"grad_norm": 0.6284715836457387,
"kl": 0.1533203125,
"learning_rate": 2e-06,
"loss": -0.0006,
"reward": 0.5042369365692139,
"reward_std": 0.05617382749915123,
"rewards/preference_model_reward": 0.5042369365692139,
"rewards/preference_model_reward/std": 0.05617383494973183,
"step": 121
},
{
"clip_ratio": 0.0016699727857485414,
"epoch": 0.1035653650254669,
"grad_norm": 0.5676292275365208,
"kl": 0.154296875,
"learning_rate": 2e-06,
"loss": -0.0009,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 413.6875,
"epoch": 0.10441426146010187,
"grad_norm": 0.6166063462451017,
"kl": 0.1240234375,
"learning_rate": 2e-06,
"loss": -0.0012,
"reward": 0.4225958585739136,
"reward_std": 0.06309302896261215,
"rewards/preference_model_reward": 0.4225958585739136,
"rewards/preference_model_reward/std": 0.06309301406145096,
"step": 123
},
{
"clip_ratio": 0.001325472490862012,
"epoch": 0.10526315789473684,
"grad_norm": 0.5030461455471626,
"kl": 0.1240234375,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 654.625,
"epoch": 0.10611205432937182,
"grad_norm": 0.9061656748551837,
"kl": 0.1416015625,
"learning_rate": 2e-06,
"loss": -0.0003,
"reward": 0.26992088556289673,
"reward_std": 0.12210524082183838,
"rewards/preference_model_reward": 0.26992088556289673,
"rewards/preference_model_reward/std": 0.12210523337125778,
"step": 125
},
{
"clip_ratio": 0.0003366470627952367,
"epoch": 0.10696095076400679,
"grad_norm": 0.8859074333947582,
"kl": 0.142578125,
"learning_rate": 2e-06,
"loss": -0.0008,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 318.96875,
"epoch": 0.10780984719864177,
"grad_norm": 0.7830937452002261,
"kl": 0.07763671875,
"learning_rate": 2e-06,
"loss": -0.0029,
"reward": 0.31064295768737793,
"reward_std": 0.0781577080488205,
"rewards/preference_model_reward": 0.31064295768737793,
"rewards/preference_model_reward/std": 0.07815771549940109,
"step": 127
},
{
"clip_ratio": 0.0004959848592989147,
"epoch": 0.10865874363327674,
"grad_norm": 0.7465755882530519,
"kl": 0.07861328125,
"learning_rate": 2e-06,
"loss": -0.0034,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 285.4375,
"epoch": 0.10950764006791172,
"grad_norm": 0.6716474625246404,
"kl": 0.150390625,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.4774589240550995,
"reward_std": 0.04319845512509346,
"rewards/preference_model_reward": 0.4774589240550995,
"rewards/preference_model_reward/std": 0.04319845885038376,
"step": 129
},
{
"clip_ratio": 0.0006459264550358057,
"epoch": 0.11035653650254669,
"grad_norm": 0.4277286893768862,
"kl": 0.15234375,
"learning_rate": 2e-06,
"loss": -0.0016,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 230.78125,
"epoch": 0.11120543293718166,
"grad_norm": 0.9935245836857972,
"kl": 0.169921875,
"learning_rate": 2e-06,
"loss": -0.0023,
"reward": 0.3256058692932129,
"reward_std": 0.07717268913984299,
"rewards/preference_model_reward": 0.3256058692932129,
"rewards/preference_model_reward/std": 0.07717268913984299,
"step": 131
},
{
"clip_ratio": 0.00220286101102829,
"epoch": 0.11205432937181664,
"grad_norm": 0.9136352224117723,
"kl": 0.16796875,
"learning_rate": 2e-06,
"loss": -0.0028,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 350.46875,
"epoch": 0.11290322580645161,
"grad_norm": 0.5870925563537904,
"kl": 0.1640625,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 0.08773044496774673,
"reward_std": 0.06896770745515823,
"rewards/preference_model_reward": 0.08773044496774673,
"rewards/preference_model_reward/std": 0.06896770745515823,
"step": 133
},
{
"clip_ratio": 0.0005350956926122308,
"epoch": 0.11375212224108659,
"grad_norm": 0.5639384325042808,
"kl": 0.16015625,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 528.0625,
"epoch": 0.11460101867572156,
"grad_norm": 0.48480451064325536,
"kl": 0.12109375,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.4244787096977234,
"reward_std": 0.05737914890050888,
"rewards/preference_model_reward": 0.4244787096977234,
"rewards/preference_model_reward/std": 0.05737914890050888,
"step": 135
},
{
"clip_ratio": 0.000636638724245131,
"epoch": 0.11544991511035653,
"grad_norm": 0.4122336602818054,
"kl": 0.11328125,
"learning_rate": 2e-06,
"loss": -0.0017,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 257.625,
"epoch": 0.11629881154499151,
"grad_norm": 1.4922729095300895,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": -0.0004,
"reward": 0.3075970411300659,
"reward_std": 0.10743933171033859,
"rewards/preference_model_reward": 0.3075970411300659,
"rewards/preference_model_reward/std": 0.10743933171033859,
"step": 137
},
{
"clip_ratio": 0.0006169785629026592,
"epoch": 0.11714770797962648,
"grad_norm": 1.1808501339893651,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": -0.001,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 306.9375,
"epoch": 0.11799660441426146,
"grad_norm": 1.0160072383114551,
"kl": 0.16796875,
"learning_rate": 2e-06,
"loss": -0.0061,
"reward": 0.46265456080436707,
"reward_std": 0.11661313474178314,
"rewards/preference_model_reward": 0.46265456080436707,
"rewards/preference_model_reward/std": 0.11661314219236374,
"step": 139
},
{
"clip_ratio": 0.0003987574018537998,
"epoch": 0.11884550084889643,
"grad_norm": 0.9531276983222098,
"kl": 0.166015625,
"learning_rate": 2e-06,
"loss": -0.0068,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 503.5,
"epoch": 0.11969439728353141,
"grad_norm": 1.1419798492524438,
"kl": 0.1787109375,
"learning_rate": 2e-06,
"loss": 0.0048,
"reward": 0.20830851793289185,
"reward_std": 0.13461197912693024,
"rewards/preference_model_reward": 0.20830851793289185,
"rewards/preference_model_reward/std": 0.13461197912693024,
"step": 141
},
{
"clip_ratio": 0.00026064369012601674,
"epoch": 0.12054329371816638,
"grad_norm": 1.072087725228961,
"kl": 0.1806640625,
"learning_rate": 2e-06,
"loss": 0.004,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 384.1875,
"epoch": 0.12139219015280135,
"grad_norm": 1.363810772987835,
"kl": 0.1884765625,
"learning_rate": 2e-06,
"loss": -0.0039,
"reward": 0.24709536135196686,
"reward_std": 0.13785149157047272,
"rewards/preference_model_reward": 0.24709536135196686,
"rewards/preference_model_reward/std": 0.1378515064716339,
"step": 143
},
{
"clip_ratio": 0.00023824731761123985,
"epoch": 0.12224108658743633,
"grad_norm": 1.280172314576326,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": -0.0047,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 425.90625,
"epoch": 0.1230899830220713,
"grad_norm": 0.5570698552144494,
"kl": 0.115234375,
"learning_rate": 2e-06,
"loss": -0.0017,
"reward": 0.12276525795459747,
"reward_std": 0.06314485520124435,
"rewards/preference_model_reward": 0.12276525795459747,
"rewards/preference_model_reward/std": 0.06314485520124435,
"step": 145
},
{
"clip_ratio": 0.0003630488063208759,
"epoch": 0.12393887945670629,
"grad_norm": 0.5707319334424644,
"kl": 0.11474609375,
"learning_rate": 2e-06,
"loss": -0.002,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 331.625,
"epoch": 0.12478777589134125,
"grad_norm": 0.8955034856932126,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": -0.002,
"reward": 0.06658346205949783,
"reward_std": 0.05682339146733284,
"rewards/preference_model_reward": 0.06658346205949783,
"rewards/preference_model_reward/std": 0.05682339146733284,
"step": 147
},
{
"clip_ratio": 0.0011066581355407834,
"epoch": 0.12563667232597622,
"grad_norm": 0.6244418580201853,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": -0.0023,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 393.15625,
"epoch": 0.1264855687606112,
"grad_norm": 1.2895611117206778,
"kl": 0.1875,
"learning_rate": 2e-06,
"loss": -0.0053,
"reward": 0.25148850679397583,
"reward_std": 0.12007515132427216,
"rewards/preference_model_reward": 0.25148850679397583,
"rewards/preference_model_reward/std": 0.12007514387369156,
"step": 149
},
{
"clip_ratio": 0.0006402829312719405,
"epoch": 0.1273344651952462,
"grad_norm": 1.288543979995357,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": -0.0061,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 440.78125,
"epoch": 0.12818336162988114,
"grad_norm": 1.2555636888842705,
"kl": 0.193359375,
"learning_rate": 2e-06,
"loss": -0.0041,
"reward": 0.2623947262763977,
"reward_std": 0.11035064607858658,
"rewards/preference_model_reward": 0.2623947262763977,
"rewards/preference_model_reward/std": 0.11035064607858658,
"step": 151
},
{
"clip_ratio": 0.003197396406903863,
"epoch": 0.12903225806451613,
"grad_norm": 1.1625036279466894,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": -0.0047,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 271.59375,
"epoch": 0.1298811544991511,
"grad_norm": 1.096093984535528,
"kl": 0.251953125,
"learning_rate": 2e-06,
"loss": -0.0016,
"reward": 0.3598458170890808,
"reward_std": 0.08515099436044693,
"rewards/preference_model_reward": 0.3598458170890808,
"rewards/preference_model_reward/std": 0.08515099436044693,
"step": 153
},
{
"clip_ratio": 0.0012925309129059315,
"epoch": 0.1307300509337861,
"grad_norm": 1.00178630558753,
"kl": 0.255859375,
"learning_rate": 2e-06,
"loss": -0.0023,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 415.5,
"epoch": 0.13157894736842105,
"grad_norm": 1.1081030432870604,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": 0.0115,
"reward": 0.3061649799346924,
"reward_std": 0.11071331799030304,
"rewards/preference_model_reward": 0.3061649799346924,
"rewards/preference_model_reward/std": 0.11071331799030304,
"step": 155
},
{
"clip_ratio": 0.0003086737706325948,
"epoch": 0.13242784380305603,
"grad_norm": 1.0874086536996357,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": 0.0109,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 417.03125,
"epoch": 0.133276740237691,
"grad_norm": 0.7599994731662741,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.48063063621520996,
"reward_std": 0.06754690408706665,
"rewards/preference_model_reward": 0.48063063621520996,
"rewards/preference_model_reward/std": 0.06754691153764725,
"step": 157
},
{
"clip_ratio": 0.0018070859368890524,
"epoch": 0.13412563667232597,
"grad_norm": 0.597446098810492,
"kl": 0.1650390625,
"learning_rate": 2e-06,
"loss": -0.001,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 383.9375,
"epoch": 0.13497453310696095,
"grad_norm": 0.7677380389143111,
"kl": 0.2041015625,
"learning_rate": 2e-06,
"loss": -0.0046,
"reward": 0.1170925423502922,
"reward_std": 0.07494159787893295,
"rewards/preference_model_reward": 0.1170925423502922,
"rewards/preference_model_reward/std": 0.07494159787893295,
"step": 159
},
{
"clip_ratio": 0.0012223758967593312,
"epoch": 0.13582342954159593,
"grad_norm": 0.7399797210592777,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": -0.0051,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 722.25,
"epoch": 0.1366723259762309,
"grad_norm": 0.8765706673008182,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.3232240676879883,
"reward_std": 0.09787525236606598,
"rewards/preference_model_reward": 0.3232240676879883,
"rewards/preference_model_reward/std": 0.09787525236606598,
"step": 161
},
{
"clip_ratio": 0.0010285093449056149,
"epoch": 0.13752122241086587,
"grad_norm": 0.762896466304412,
"kl": 0.1875,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 714.09375,
"epoch": 0.13837011884550085,
"grad_norm": 1.1627137317630705,
"kl": 0.12353515625,
"learning_rate": 2e-06,
"loss": 0.0081,
"reward": 0.3034539818763733,
"reward_std": 0.15702968835830688,
"rewards/preference_model_reward": 0.3034539818763733,
"rewards/preference_model_reward/std": 0.15702970325946808,
"step": 163
},
{
"clip_ratio": 0.0006113144336268306,
"epoch": 0.13921901528013583,
"grad_norm": 1.110258121456432,
"kl": 0.12158203125,
"learning_rate": 2e-06,
"loss": 0.0073,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 391.21875,
"epoch": 0.1400679117147708,
"grad_norm": 1.089926535448989,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": 0.0037,
"reward": 0.22248202562332153,
"reward_std": 0.11353754997253418,
"rewards/preference_model_reward": 0.22248202562332153,
"rewards/preference_model_reward/std": 0.11353754997253418,
"step": 165
},
{
"clip_ratio": 0.0005485577858053148,
"epoch": 0.14091680814940577,
"grad_norm": 0.9711244878108193,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": 0.0031,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 426.21875,
"epoch": 0.14176570458404075,
"grad_norm": 1.2634303413205847,
"kl": 0.2392578125,
"learning_rate": 2e-06,
"loss": 0.0015,
"reward": 0.23580655455589294,
"reward_std": 0.12878787517547607,
"rewards/preference_model_reward": 0.23580655455589294,
"rewards/preference_model_reward/std": 0.12878787517547607,
"step": 167
},
{
"clip_ratio": 0.0008147264015860856,
"epoch": 0.14261460101867574,
"grad_norm": 1.2475202369294187,
"kl": 0.2392578125,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 420.84375,
"epoch": 0.1434634974533107,
"grad_norm": 0.9256116658563932,
"kl": 0.1806640625,
"learning_rate": 2e-06,
"loss": 0.0017,
"reward": 0.19842864573001862,
"reward_std": 0.09470146149396896,
"rewards/preference_model_reward": 0.19842864573001862,
"rewards/preference_model_reward/std": 0.09470146149396896,
"step": 169
},
{
"clip_ratio": 0.0016190335154533386,
"epoch": 0.14431239388794567,
"grad_norm": 0.8308927390156897,
"kl": 0.1796875,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 511.125,
"epoch": 0.14516129032258066,
"grad_norm": 0.9408637846666039,
"kl": 0.181640625,
"learning_rate": 2e-06,
"loss": 0.0081,
"reward": 0.19552090764045715,
"reward_std": 0.1133582592010498,
"rewards/preference_model_reward": 0.19552090764045715,
"rewards/preference_model_reward/std": 0.11335825175046921,
"step": 171
},
{
"clip_ratio": 0.000971193250734359,
"epoch": 0.1460101867572156,
"grad_norm": 0.946800803701003,
"kl": 0.181640625,
"learning_rate": 2e-06,
"loss": 0.0076,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 645.34375,
"epoch": 0.1468590831918506,
"grad_norm": 0.8208513143064141,
"kl": 0.22265625,
"learning_rate": 2e-06,
"loss": 0.002,
"reward": 0.4708970785140991,
"reward_std": 0.10244568437337875,
"rewards/preference_model_reward": 0.4708970785140991,
"rewards/preference_model_reward/std": 0.10244568437337875,
"step": 173
},
{
"clip_ratio": 0.0008575035026296973,
"epoch": 0.14770797962648557,
"grad_norm": 0.777907252016811,
"kl": 0.220703125,
"learning_rate": 2e-06,
"loss": 0.0015,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 352.5625,
"epoch": 0.14855687606112053,
"grad_norm": 0.7724021747968359,
"kl": 0.26171875,
"learning_rate": 2e-06,
"loss": -0.0009,
"reward": 0.28050410747528076,
"reward_std": 0.07774435728788376,
"rewards/preference_model_reward": 0.28050410747528076,
"rewards/preference_model_reward/std": 0.07774436473846436,
"step": 175
},
{
"clip_ratio": 0.0007047850522212684,
"epoch": 0.1494057724957555,
"grad_norm": 0.8617498914678887,
"kl": 0.26171875,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 528.8125,
"epoch": 0.1502546689303905,
"grad_norm": 1.0109383309762656,
"kl": 0.19921875,
"learning_rate": 2e-06,
"loss": 0.0031,
"reward": 0.400208055973053,
"reward_std": 0.11498203873634338,
"rewards/preference_model_reward": 0.400208055973053,
"rewards/preference_model_reward/std": 0.11498204618692398,
"step": 177
},
{
"clip_ratio": 0.0015235163737088442,
"epoch": 0.15110356536502548,
"grad_norm": 0.9751593675624571,
"kl": 0.19921875,
"learning_rate": 2e-06,
"loss": 0.0024,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 364.84375,
"epoch": 0.15195246179966043,
"grad_norm": 1.0436702904156618,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": -0.0041,
"reward": 0.431307315826416,
"reward_std": 0.10992471128702164,
"rewards/preference_model_reward": 0.431307315826416,
"rewards/preference_model_reward/std": 0.10992471128702164,
"step": 179
},
{
"clip_ratio": 0.00033796619391068816,
"epoch": 0.15280135823429541,
"grad_norm": 0.9638853656528253,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": -0.0048,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 421.03125,
"epoch": 0.1536502546689304,
"grad_norm": 1.0786137221446535,
"kl": 0.2158203125,
"learning_rate": 2e-06,
"loss": 0.0106,
"reward": 0.26917120814323425,
"reward_std": 0.11035769432783127,
"rewards/preference_model_reward": 0.26917120814323425,
"rewards/preference_model_reward/std": 0.11035769432783127,
"step": 181
},
{
"clip_ratio": 0.0005866018473170698,
"epoch": 0.15449915110356535,
"grad_norm": 1.0797645934269513,
"kl": 0.21484375,
"learning_rate": 2e-06,
"loss": 0.01,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 340.40625,
"epoch": 0.15534804753820033,
"grad_norm": 0.9516406822692116,
"kl": 0.21484375,
"learning_rate": 2e-06,
"loss": -0.0008,
"reward": 0.4257683753967285,
"reward_std": 0.08008842915296555,
"rewards/preference_model_reward": 0.4257683753967285,
"rewards/preference_model_reward/std": 0.08008842915296555,
"step": 183
},
{
"clip_ratio": 0.0007616700022481382,
"epoch": 0.15619694397283532,
"grad_norm": 1.0243333961812868,
"kl": 0.2158203125,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 309.3125,
"epoch": 0.1570458404074703,
"grad_norm": 1.0966611105512216,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": 0.0,
"reward": 0.44364723563194275,
"reward_std": 0.08954507112503052,
"rewards/preference_model_reward": 0.44364723563194275,
"rewards/preference_model_reward/std": 0.08954507112503052,
"step": 185
},
{
"clip_ratio": 0.0007273735827766359,
"epoch": 0.15789473684210525,
"grad_norm": 0.8995850425186181,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": -0.0006,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 350.03125,
"epoch": 0.15874363327674024,
"grad_norm": 1.0365032710666704,
"kl": 0.1943359375,
"learning_rate": 2e-06,
"loss": -0.0025,
"reward": 0.503156304359436,
"reward_std": 0.06975705921649933,
"rewards/preference_model_reward": 0.503156304359436,
"rewards/preference_model_reward/std": 0.06975706666707993,
"step": 187
},
{
"clip_ratio": 0.003309250809252262,
"epoch": 0.15959252971137522,
"grad_norm": 0.7719048533542018,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": -0.0028,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 469.40625,
"epoch": 0.16044142614601017,
"grad_norm": 1.2152567376815486,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": -0.0018,
"reward": 0.2653403878211975,
"reward_std": 0.13634686172008514,
"rewards/preference_model_reward": 0.2653403878211975,
"rewards/preference_model_reward/std": 0.13634686172008514,
"step": 189
},
{
"clip_ratio": 0.00013139564543962479,
"epoch": 0.16129032258064516,
"grad_norm": 1.1599347877086612,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": -0.0027,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 387.0625,
"epoch": 0.16213921901528014,
"grad_norm": 1.1561729382206092,
"kl": 0.21484375,
"learning_rate": 2e-06,
"loss": 0.0003,
"reward": 0.3497307002544403,
"reward_std": 0.11881572753190994,
"rewards/preference_model_reward": 0.3497307002544403,
"rewards/preference_model_reward/std": 0.11881572753190994,
"step": 191
},
{
"clip_ratio": 0.0006593581638298929,
"epoch": 0.16298811544991512,
"grad_norm": 1.0934785615900324,
"kl": 0.216796875,
"learning_rate": 2e-06,
"loss": -0.0005,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 466.8125,
"epoch": 0.16383701188455008,
"grad_norm": 0.5519710984838219,
"kl": 0.1845703125,
"learning_rate": 2e-06,
"loss": 0.0013,
"reward": 0.10650002956390381,
"reward_std": 0.059550777077674866,
"rewards/preference_model_reward": 0.10650002956390381,
"rewards/preference_model_reward/std": 0.059550777077674866,
"step": 193
},
{
"clip_ratio": 0.0010145865380764008,
"epoch": 0.16468590831918506,
"grad_norm": 0.8499180844539812,
"kl": 0.185546875,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 556.59375,
"epoch": 0.16553480475382004,
"grad_norm": 0.9414095054306995,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": 0.0036,
"reward": 0.35372745990753174,
"reward_std": 0.11836274713277817,
"rewards/preference_model_reward": 0.35372745990753174,
"rewards/preference_model_reward/std": 0.11836273968219757,
"step": 195
},
{
"clip_ratio": 0.00011072463530581445,
"epoch": 0.166383701188455,
"grad_norm": 0.8517491157753083,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": 0.0031,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 441.21875,
"epoch": 0.16723259762308998,
"grad_norm": 1.0176591850675871,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": -0.003,
"reward": 0.32411473989486694,
"reward_std": 0.10493102669715881,
"rewards/preference_model_reward": 0.32411473989486694,
"rewards/preference_model_reward/std": 0.10493102669715881,
"step": 197
},
{
"clip_ratio": 0.0005816287593916059,
"epoch": 0.16808149405772496,
"grad_norm": 0.9532792399626693,
"kl": 0.2041015625,
"learning_rate": 2e-06,
"loss": -0.0036,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 693.71875,
"epoch": 0.16893039049235994,
"grad_norm": 0.6157709458820001,
"kl": 0.17578125,
"learning_rate": 2e-06,
"loss": 0.0012,
"reward": 0.5120692849159241,
"reward_std": 0.08368350565433502,
"rewards/preference_model_reward": 0.5120692849159241,
"rewards/preference_model_reward/std": 0.08368349820375443,
"step": 199
},
{
"clip_ratio": 0.000810971308965236,
"epoch": 0.1697792869269949,
"grad_norm": 0.5744963108218382,
"kl": 0.1748046875,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 499.34375,
"epoch": 0.17062818336162988,
"grad_norm": 1.171086543231942,
"kl": 0.181640625,
"learning_rate": 2e-06,
"loss": -0.0044,
"reward": 0.3279721438884735,
"reward_std": 0.14420974254608154,
"rewards/preference_model_reward": 0.3279721438884735,
"rewards/preference_model_reward/std": 0.14420974254608154,
"step": 201
},
{
"clip_ratio": 0.0011094075161963701,
"epoch": 0.17147707979626486,
"grad_norm": 1.121059254698811,
"kl": 0.1806640625,
"learning_rate": 2e-06,
"loss": -0.0052,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 292.6875,
"epoch": 0.17232597623089982,
"grad_norm": 0.8874841846577225,
"kl": 0.1083984375,
"learning_rate": 2e-06,
"loss": -0.0089,
"reward": 0.27286988496780396,
"reward_std": 0.09546167403459549,
"rewards/preference_model_reward": 0.27286988496780396,
"rewards/preference_model_reward/std": 0.09546167403459549,
"step": 203
},
{
"clip_ratio": 0.00020234723342582583,
"epoch": 0.1731748726655348,
"grad_norm": 0.8726296645362996,
"kl": 0.1083984375,
"learning_rate": 2e-06,
"loss": -0.0095,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 504.875,
"epoch": 0.17402376910016978,
"grad_norm": 2.7391485484558045,
"kl": 0.1904296875,
"learning_rate": 2e-06,
"loss": -0.0018,
"reward": 0.41756588220596313,
"reward_std": 0.11001207679510117,
"rewards/preference_model_reward": 0.41756588220596313,
"rewards/preference_model_reward/std": 0.11001206934452057,
"step": 205
},
{
"clip_ratio": 0.0005510338814929128,
"epoch": 0.17487266553480477,
"grad_norm": 0.8607142192205527,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": -0.002,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 452.84375,
"epoch": 0.17572156196943972,
"grad_norm": 0.4685652506244187,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": 0.0003,
"reward": 0.48838815093040466,
"reward_std": 0.039198972284793854,
"rewards/preference_model_reward": 0.48838815093040466,
"rewards/preference_model_reward/std": 0.03919896483421326,
"step": 207
},
{
"clip_ratio": 0.001379701541736722,
"epoch": 0.1765704584040747,
"grad_norm": 0.36885619072170894,
"kl": 0.1904296875,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 457.8125,
"epoch": 0.1774193548387097,
"grad_norm": 0.660983587706389,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": -0.0025,
"reward": 0.08188852667808533,
"reward_std": 0.06869849562644958,
"rewards/preference_model_reward": 0.08188852667808533,
"rewards/preference_model_reward/std": 0.06869849562644958,
"step": 209
},
{
"clip_ratio": 0.0004064367385581136,
"epoch": 0.17826825127334464,
"grad_norm": 0.5900136843275542,
"kl": 0.1728515625,
"learning_rate": 2e-06,
"loss": -0.0029,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 536.34375,
"epoch": 0.17911714770797962,
"grad_norm": 1.192737456986498,
"kl": 0.208984375,
"learning_rate": 2e-06,
"loss": 0.0126,
"reward": 0.25954583287239075,
"reward_std": 0.09377846866846085,
"rewards/preference_model_reward": 0.25954583287239075,
"rewards/preference_model_reward/std": 0.09377846121788025,
"step": 211
},
{
"clip_ratio": 0.0005144176539033651,
"epoch": 0.1799660441426146,
"grad_norm": 0.9034533154555449,
"kl": 0.19921875,
"learning_rate": 2e-06,
"loss": 0.0122,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 503.5625,
"epoch": 0.1808149405772496,
"grad_norm": 0.9811032516295555,
"kl": 0.208984375,
"learning_rate": 2e-06,
"loss": 0.0015,
"reward": 0.22673586010932922,
"reward_std": 0.1181153729557991,
"rewards/preference_model_reward": 0.22673586010932922,
"rewards/preference_model_reward/std": 0.1181153655052185,
"step": 213
},
{
"clip_ratio": 0.0006308910087682307,
"epoch": 0.18166383701188454,
"grad_norm": 1.2177987016178247,
"kl": 0.2060546875,
"learning_rate": 2e-06,
"loss": 0.0009,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 380.9375,
"epoch": 0.18251273344651953,
"grad_norm": 1.1883998105439184,
"kl": 0.2041015625,
"learning_rate": 2e-06,
"loss": 0.0026,
"reward": 0.3673512935638428,
"reward_std": 0.11825986206531525,
"rewards/preference_model_reward": 0.3673512935638428,
"rewards/preference_model_reward/std": 0.11825986206531525,
"step": 215
},
{
"clip_ratio": 0.0004939221544191241,
"epoch": 0.1833616298811545,
"grad_norm": 1.1187570941679779,
"kl": 0.2060546875,
"learning_rate": 2e-06,
"loss": 0.0018,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 482.78125,
"epoch": 0.18421052631578946,
"grad_norm": 1.2336234795768202,
"kl": 0.1943359375,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.22679108381271362,
"reward_std": 0.13555686175823212,
"rewards/preference_model_reward": 0.22679108381271362,
"rewards/preference_model_reward/std": 0.13555686175823212,
"step": 217
},
{
"clip_ratio": 0.0012310510501265526,
"epoch": 0.18505942275042445,
"grad_norm": 1.1626357202169864,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 319.25,
"epoch": 0.18590831918505943,
"grad_norm": 0.90605092796741,
"kl": 0.18359375,
"learning_rate": 2e-06,
"loss": -0.0023,
"reward": 0.44145655632019043,
"reward_std": 0.09349598735570908,
"rewards/preference_model_reward": 0.44145655632019043,
"rewards/preference_model_reward/std": 0.09349598735570908,
"step": 219
},
{
"clip_ratio": 0.0001996077917283401,
"epoch": 0.1867572156196944,
"grad_norm": 0.8461078416569522,
"kl": 0.185546875,
"learning_rate": 2e-06,
"loss": -0.0028,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 469.25,
"epoch": 0.18760611205432937,
"grad_norm": 1.2596387776384212,
"kl": 0.2177734375,
"learning_rate": 2e-06,
"loss": 0.0053,
"reward": 0.3222573697566986,
"reward_std": 0.10504135489463806,
"rewards/preference_model_reward": 0.3222573697566986,
"rewards/preference_model_reward/std": 0.10504135489463806,
"step": 221
},
{
"clip_ratio": 0.00020092798513360322,
"epoch": 0.18845500848896435,
"grad_norm": 0.9722158375203807,
"kl": 0.21875,
"learning_rate": 2e-06,
"loss": 0.0048,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 360.34375,
"epoch": 0.18930390492359933,
"grad_norm": 1.365145072141222,
"kl": 0.2392578125,
"learning_rate": 2e-06,
"loss": 0.0032,
"reward": 0.4276666045188904,
"reward_std": 0.12780673801898956,
"rewards/preference_model_reward": 0.4276666045188904,
"rewards/preference_model_reward/std": 0.12780673801898956,
"step": 223
},
{
"clip_ratio": 0.0016522787045687437,
"epoch": 0.19015280135823429,
"grad_norm": 1.286088004349321,
"kl": 0.2421875,
"learning_rate": 2e-06,
"loss": 0.0023,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 517.375,
"epoch": 0.19100169779286927,
"grad_norm": 1.0225752401513326,
"kl": 0.2138671875,
"learning_rate": 2e-06,
"loss": 0.0013,
"reward": 0.3944551348686218,
"reward_std": 0.11224386841058731,
"rewards/preference_model_reward": 0.3944551348686218,
"rewards/preference_model_reward/std": 0.11224386096000671,
"step": 225
},
{
"clip_ratio": 0.0007841808255761862,
"epoch": 0.19185059422750425,
"grad_norm": 0.9149056778209514,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 291.59375,
"epoch": 0.1926994906621392,
"grad_norm": 1.288986938092742,
"kl": 0.2734375,
"learning_rate": 2e-06,
"loss": -0.0003,
"reward": 0.4484859108924866,
"reward_std": 0.04202309623360634,
"rewards/preference_model_reward": 0.4484859108924866,
"rewards/preference_model_reward/std": 0.04202309623360634,
"step": 227
},
{
"clip_ratio": 0.00215684762224555,
"epoch": 0.1935483870967742,
"grad_norm": 0.4977519171941583,
"kl": 0.275390625,
"learning_rate": 2e-06,
"loss": -0.0004,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 518.6875,
"epoch": 0.19439728353140917,
"grad_norm": 1.1492317219584003,
"kl": 0.23046875,
"learning_rate": 2e-06,
"loss": 0.007,
"reward": 0.39957520365715027,
"reward_std": 0.12616394460201263,
"rewards/preference_model_reward": 0.39957520365715027,
"rewards/preference_model_reward/std": 0.12616392970085144,
"step": 229
},
{
"clip_ratio": 0.0006631789728999138,
"epoch": 0.19524617996604415,
"grad_norm": 1.1237887647036364,
"kl": 0.2314453125,
"learning_rate": 2e-06,
"loss": 0.0063,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 264.15625,
"epoch": 0.1960950764006791,
"grad_norm": 1.5446214982606454,
"kl": 0.3828125,
"learning_rate": 2e-06,
"loss": -0.003,
"reward": 0.051013268530368805,
"reward_std": 0.04769134148955345,
"rewards/preference_model_reward": 0.051013268530368805,
"rewards/preference_model_reward/std": 0.04769134148955345,
"step": 231
},
{
"clip_ratio": 0.0015723377000540495,
"epoch": 0.1969439728353141,
"grad_norm": 0.7937791917154097,
"kl": 0.37890625,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 653.4375,
"epoch": 0.19779286926994907,
"grad_norm": 1.285817858002923,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": -0.0031,
"reward": 0.4576322138309479,
"reward_std": 0.12770842015743256,
"rewards/preference_model_reward": 0.4576322138309479,
"rewards/preference_model_reward/std": 0.12770840525627136,
"step": 233
},
{
"clip_ratio": 0.0005655796267092228,
"epoch": 0.19864176570458403,
"grad_norm": 0.9988328328952946,
"kl": 0.1943359375,
"learning_rate": 2e-06,
"loss": -0.0037,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 625.875,
"epoch": 0.199490662139219,
"grad_norm": 0.5736428283590878,
"kl": 0.3359375,
"learning_rate": 2e-06,
"loss": 0.0007,
"reward": 0.5476886034011841,
"reward_std": 0.019688162952661514,
"rewards/preference_model_reward": 0.5476886034011841,
"rewards/preference_model_reward/std": 0.01968817040324211,
"step": 235
},
{
"clip_ratio": 0.004047113005071878,
"epoch": 0.200339558573854,
"grad_norm": 0.3004588551896628,
"kl": 0.267578125,
"learning_rate": 2e-06,
"loss": 0.0006,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 397.1875,
"epoch": 0.20118845500848898,
"grad_norm": 0.7407875649985818,
"kl": 0.255859375,
"learning_rate": 2e-06,
"loss": -0.0036,
"reward": 0.5196166038513184,
"reward_std": 0.0838971957564354,
"rewards/preference_model_reward": 0.5196166038513184,
"rewards/preference_model_reward/std": 0.0838971957564354,
"step": 237
},
{
"clip_ratio": 0.000780011061578989,
"epoch": 0.20203735144312393,
"grad_norm": 0.7989807862404035,
"kl": 0.251953125,
"learning_rate": 2e-06,
"loss": -0.004,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 663.1875,
"epoch": 0.2028862478777589,
"grad_norm": 1.1362959567666375,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": 0.0049,
"reward": 0.3731197118759155,
"reward_std": 0.13519592583179474,
"rewards/preference_model_reward": 0.3731197118759155,
"rewards/preference_model_reward/std": 0.13519594073295593,
"step": 239
},
{
"clip_ratio": 0.0008930441690608859,
"epoch": 0.2037351443123939,
"grad_norm": 0.9826027457053308,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": 0.0043,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 638.65625,
"epoch": 0.20458404074702885,
"grad_norm": 1.2194567876973144,
"kl": 0.12890625,
"learning_rate": 2e-06,
"loss": 0.0058,
"reward": 0.4657590687274933,
"reward_std": 0.16892072558403015,
"rewards/preference_model_reward": 0.4657590687274933,
"rewards/preference_model_reward/std": 0.16892069578170776,
"step": 241
},
{
"clip_ratio": 0.0005752947181463242,
"epoch": 0.20543293718166383,
"grad_norm": 2.051110848359452,
"kl": 0.12451171875,
"learning_rate": 2e-06,
"loss": 0.0051,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 337.1875,
"epoch": 0.20628183361629882,
"grad_norm": 0.4124833822582155,
"kl": 0.23046875,
"learning_rate": 2e-06,
"loss": -0.0006,
"reward": 0.5514668226242065,
"reward_std": 0.03596644848585129,
"rewards/preference_model_reward": 0.5514668226242065,
"rewards/preference_model_reward/std": 0.03596644848585129,
"step": 243
},
{
"clip_ratio": 0.0035508163273334503,
"epoch": 0.2071307300509338,
"grad_norm": 0.4171934728891184,
"kl": 0.224609375,
"learning_rate": 2e-06,
"loss": -0.0008,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 408.9375,
"epoch": 0.20797962648556875,
"grad_norm": 2.9889867378208055,
"kl": 0.2333984375,
"learning_rate": 2e-06,
"loss": 0.0025,
"reward": 0.2543810307979584,
"reward_std": 0.11507824808359146,
"rewards/preference_model_reward": 0.2543810307979584,
"rewards/preference_model_reward/std": 0.11507824808359146,
"step": 245
},
{
"clip_ratio": 0.002395933959633112,
"epoch": 0.20882852292020374,
"grad_norm": 1.1132478545817386,
"kl": 0.2314453125,
"learning_rate": 2e-06,
"loss": 0.0024,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 451.6875,
"epoch": 0.20967741935483872,
"grad_norm": 0.16838590128648132,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": 0.0009,
"reward": 0.5358486175537109,
"reward_std": 0.017411619424819946,
"rewards/preference_model_reward": 0.5358486175537109,
"rewards/preference_model_reward/std": 0.0174116063863039,
"step": 247
},
{
"clip_ratio": 0.0004304340109229088,
"epoch": 0.21052631578947367,
"grad_norm": 0.16083907124687224,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 373.0,
"epoch": 0.21137521222410866,
"grad_norm": 1.336981028115652,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": -0.0051,
"reward": 0.15268400311470032,
"reward_std": 0.11865763366222382,
"rewards/preference_model_reward": 0.15268400311470032,
"rewards/preference_model_reward/std": 0.11865763366222382,
"step": 249
},
{
"clip_ratio": 0.0005081939161755145,
"epoch": 0.21222410865874364,
"grad_norm": 1.2864787994059055,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": -0.0058,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 593.5625,
"epoch": 0.21307300509337862,
"grad_norm": 0.692959080153562,
"kl": 0.126953125,
"learning_rate": 2e-06,
"loss": -0.0002,
"reward": 0.09329767525196075,
"reward_std": 0.08787816017866135,
"rewards/preference_model_reward": 0.09329767525196075,
"rewards/preference_model_reward/std": 0.08787816762924194,
"step": 251
},
{
"clip_ratio": 0.0005807211855426431,
"epoch": 0.21392190152801357,
"grad_norm": 0.724372938551975,
"kl": 0.12451171875,
"learning_rate": 2e-06,
"loss": -0.0006,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 262.46875,
"epoch": 0.21477079796264856,
"grad_norm": 1.1375236466013237,
"kl": 0.220703125,
"learning_rate": 2e-06,
"loss": 0.0023,
"reward": 0.20732049643993378,
"reward_std": 0.09915804862976074,
"rewards/preference_model_reward": 0.20732049643993378,
"rewards/preference_model_reward/std": 0.09915804862976074,
"step": 253
},
{
"clip_ratio": 0.0012860854621976614,
"epoch": 0.21561969439728354,
"grad_norm": 1.1117646406622659,
"kl": 0.21875,
"learning_rate": 2e-06,
"loss": 0.0015,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 488.15625,
"epoch": 0.2164685908319185,
"grad_norm": 0.83190430608096,
"kl": 0.19921875,
"learning_rate": 2e-06,
"loss": -0.0001,
"reward": 0.16658729314804077,
"reward_std": 0.0986584797501564,
"rewards/preference_model_reward": 0.16658729314804077,
"rewards/preference_model_reward/std": 0.0986584797501564,
"step": 255
},
{
"clip_ratio": 0.00012755101488437504,
"epoch": 0.21731748726655348,
"grad_norm": 0.6949799847951833,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": -0.0005,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 306.8125,
"epoch": 0.21816638370118846,
"grad_norm": 0.8460209137088387,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.13335028290748596,
"reward_std": 0.08512399345636368,
"rewards/preference_model_reward": 0.13335028290748596,
"rewards/preference_model_reward/std": 0.08512399345636368,
"step": 257
},
{
"clip_ratio": 0.0,
"epoch": 0.21901528013582344,
"grad_norm": 0.8304477251507847,
"kl": 0.1943359375,
"learning_rate": 2e-06,
"loss": -0.003,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 381.78125,
"epoch": 0.2198641765704584,
"grad_norm": 1.097994057597742,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": -0.004,
"reward": 0.20581325888633728,
"reward_std": 0.11145073920488358,
"rewards/preference_model_reward": 0.20581325888633728,
"rewards/preference_model_reward/std": 0.11145073920488358,
"step": 259
},
{
"clip_ratio": 0.000915912794880569,
"epoch": 0.22071307300509338,
"grad_norm": 1.0577251056985664,
"kl": 0.1904296875,
"learning_rate": 2e-06,
"loss": -0.0049,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 410.8125,
"epoch": 0.22156196943972836,
"grad_norm": 0.9488227851729216,
"kl": 0.150390625,
"learning_rate": 2e-06,
"loss": 0.01,
"reward": 0.14306291937828064,
"reward_std": 0.08440835028886795,
"rewards/preference_model_reward": 0.14306291937828064,
"rewards/preference_model_reward/std": 0.08440835028886795,
"step": 261
},
{
"clip_ratio": 0.0006555670406669378,
"epoch": 0.22241086587436332,
"grad_norm": 0.9625461445287203,
"kl": 0.1494140625,
"learning_rate": 2e-06,
"loss": 0.0097,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 441.15625,
"epoch": 0.2232597623089983,
"grad_norm": 1.3378528546974586,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": 0.0111,
"reward": 0.3541616201400757,
"reward_std": 0.14822125434875488,
"rewards/preference_model_reward": 0.3541616201400757,
"rewards/preference_model_reward/std": 0.14822125434875488,
"step": 263
},
{
"clip_ratio": 0.0002928848844021559,
"epoch": 0.22410865874363328,
"grad_norm": 1.44097954206129,
"kl": 0.2021484375,
"learning_rate": 2e-06,
"loss": 0.0103,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 328.5625,
"epoch": 0.22495755517826826,
"grad_norm": 1.0594223141668664,
"kl": 0.228515625,
"learning_rate": 2e-06,
"loss": -0.0016,
"reward": 0.4411153793334961,
"reward_std": 0.11199039965867996,
"rewards/preference_model_reward": 0.4411153793334961,
"rewards/preference_model_reward/std": 0.11199039220809937,
"step": 265
},
{
"clip_ratio": 0.0003811471979133785,
"epoch": 0.22580645161290322,
"grad_norm": 1.048487426692034,
"kl": 0.2314453125,
"learning_rate": 2e-06,
"loss": -0.0024,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 501.90625,
"epoch": 0.2266553480475382,
"grad_norm": 0.764207880163219,
"kl": 0.1845703125,
"learning_rate": 2e-06,
"loss": -0.0022,
"reward": 0.4925358295440674,
"reward_std": 0.09214819222688675,
"rewards/preference_model_reward": 0.4925358295440674,
"rewards/preference_model_reward/std": 0.09214819967746735,
"step": 267
},
{
"clip_ratio": 0.000623343454208225,
"epoch": 0.22750424448217318,
"grad_norm": 0.7021748192613388,
"kl": 0.1845703125,
"learning_rate": 2e-06,
"loss": -0.0027,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 406.78125,
"epoch": 0.22835314091680814,
"grad_norm": 1.2534216873477357,
"kl": 0.2109375,
"learning_rate": 2e-06,
"loss": 0.0004,
"reward": 0.346091091632843,
"reward_std": 0.11527692526578903,
"rewards/preference_model_reward": 0.346091091632843,
"rewards/preference_model_reward/std": 0.11527692526578903,
"step": 269
},
{
"clip_ratio": 0.0006988497916609049,
"epoch": 0.22920203735144312,
"grad_norm": 1.028430046135459,
"kl": 0.2119140625,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 474.875,
"epoch": 0.2300509337860781,
"grad_norm": 13.308680917152811,
"kl": 0.7265625,
"learning_rate": 2e-06,
"loss": -0.0046,
"reward": 0.39890241622924805,
"reward_std": 0.10348767042160034,
"rewards/preference_model_reward": 0.39890241622924805,
"rewards/preference_model_reward/std": 0.10348766297101974,
"step": 271
},
{
"clip_ratio": 0.0009435814572498202,
"epoch": 0.23089983022071306,
"grad_norm": 0.9508081839275468,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": -0.0057,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 335.78125,
"epoch": 0.23174872665534804,
"grad_norm": 0.9495076380585504,
"kl": 0.1650390625,
"learning_rate": 2e-06,
"loss": -0.0052,
"reward": 0.19895681738853455,
"reward_std": 0.10379483550786972,
"rewards/preference_model_reward": 0.19895681738853455,
"rewards/preference_model_reward/std": 0.10379482805728912,
"step": 273
},
{
"clip_ratio": 0.0,
"epoch": 0.23259762308998302,
"grad_norm": 0.9358387855912544,
"kl": 0.1650390625,
"learning_rate": 2e-06,
"loss": -0.0059,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 430.8125,
"epoch": 0.233446519524618,
"grad_norm": 1.172070244878363,
"kl": 0.1748046875,
"learning_rate": 2e-06,
"loss": 0.0189,
"reward": 0.4197525382041931,
"reward_std": 0.11745458096265793,
"rewards/preference_model_reward": 0.4197525382041931,
"rewards/preference_model_reward/std": 0.11745458096265793,
"step": 275
},
{
"clip_ratio": 0.00059707515174523,
"epoch": 0.23429541595925296,
"grad_norm": 1.1205887604531841,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": 0.0182,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 380.84375,
"epoch": 0.23514431239388794,
"grad_norm": 1.0667324137631118,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.4866299033164978,
"reward_std": 0.10881662368774414,
"rewards/preference_model_reward": 0.4866299033164978,
"rewards/preference_model_reward/std": 0.10881662368774414,
"step": 277
},
{
"clip_ratio": 0.0003319675161037594,
"epoch": 0.23599320882852293,
"grad_norm": 1.4205333564695013,
"kl": 0.2265625,
"learning_rate": 2e-06,
"loss": -0.0021,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 409.96875,
"epoch": 0.23684210526315788,
"grad_norm": 1.2256158038709593,
"kl": 0.1904296875,
"learning_rate": 2e-06,
"loss": -0.0022,
"reward": 0.3310420513153076,
"reward_std": 0.13203255832195282,
"rewards/preference_model_reward": 0.3310420513153076,
"rewards/preference_model_reward/std": 0.13203254342079163,
"step": 279
},
{
"clip_ratio": 0.00029370313859544694,
"epoch": 0.23769100169779286,
"grad_norm": 1.140780771833912,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": -0.0031,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 299.8125,
"epoch": 0.23853989813242785,
"grad_norm": 0.5773592648416564,
"kl": 0.18359375,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.06664206832647324,
"reward_std": 0.046818241477012634,
"rewards/preference_model_reward": 0.06664206832647324,
"rewards/preference_model_reward/std": 0.046818237751722336,
"step": 281
},
{
"clip_ratio": 0.002476999070495367,
"epoch": 0.23938879456706283,
"grad_norm": 0.5215923415469397,
"kl": 0.1796875,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 641.5,
"epoch": 0.24023769100169778,
"grad_norm": 0.3036782782176285,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": -0.0012,
"reward": 0.4948778748512268,
"reward_std": 0.042491715401411057,
"rewards/preference_model_reward": 0.4948778748512268,
"rewards/preference_model_reward/std": 0.042491719126701355,
"step": 283
},
{
"clip_ratio": 0.0014662991743534803,
"epoch": 0.24108658743633277,
"grad_norm": 1.2693793919900047,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 326.25,
"epoch": 0.24193548387096775,
"grad_norm": 0.9299842399613616,
"kl": 0.2353515625,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.46875959634780884,
"reward_std": 0.09210902452468872,
"rewards/preference_model_reward": 0.46875959634780884,
"rewards/preference_model_reward/std": 0.09210902452468872,
"step": 285
},
{
"clip_ratio": 0.0002825378905981779,
"epoch": 0.2427843803056027,
"grad_norm": 1.269996090531689,
"kl": 0.236328125,
"learning_rate": 2e-06,
"loss": -0.0031,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 358.0,
"epoch": 0.2436332767402377,
"grad_norm": 0.845178527409992,
"kl": 0.1904296875,
"learning_rate": 2e-06,
"loss": -0.0032,
"reward": 0.14985330402851105,
"reward_std": 0.07015400379896164,
"rewards/preference_model_reward": 0.14985330402851105,
"rewards/preference_model_reward/std": 0.07015399634838104,
"step": 287
},
{
"clip_ratio": 0.002010664436966181,
"epoch": 0.24448217317487267,
"grad_norm": 0.7374485292445523,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": -0.0037,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 494.5625,
"epoch": 0.24533106960950765,
"grad_norm": 0.9545115708280292,
"kl": 0.2119140625,
"learning_rate": 2e-06,
"loss": 0.0043,
"reward": 0.44061005115509033,
"reward_std": 0.1013847216963768,
"rewards/preference_model_reward": 0.44061005115509033,
"rewards/preference_model_reward/std": 0.1013847216963768,
"step": 289
},
{
"clip_ratio": 0.0008882409892976284,
"epoch": 0.2461799660441426,
"grad_norm": 1.0760062283966654,
"kl": 0.212890625,
"learning_rate": 2e-06,
"loss": 0.0037,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 622.78125,
"epoch": 0.2470288624787776,
"grad_norm": 0.2204401442573945,
"kl": 0.216796875,
"learning_rate": 2e-06,
"loss": 0.0006,
"reward": 0.5982410907745361,
"reward_std": 0.01771736703813076,
"rewards/preference_model_reward": 0.5982410907745361,
"rewards/preference_model_reward/std": 0.01771736331284046,
"step": 291
},
{
"clip_ratio": 0.000654590898193419,
"epoch": 0.24787775891341257,
"grad_norm": 0.14373274603213665,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 474.4375,
"epoch": 0.24872665534804753,
"grad_norm": 1.0199416518092594,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": -0.0064,
"reward": 0.3177827000617981,
"reward_std": 0.13039816915988922,
"rewards/preference_model_reward": 0.3177827000617981,
"rewards/preference_model_reward/std": 0.13039815425872803,
"step": 293
},
{
"clip_ratio": 0.0005917281378060579,
"epoch": 0.2495755517826825,
"grad_norm": 0.9840967197672208,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": -0.0071,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 397.0625,
"epoch": 0.25042444821731746,
"grad_norm": 1.8823223194664953,
"kl": 0.2099609375,
"learning_rate": 2e-06,
"loss": -0.0006,
"reward": 0.31412482261657715,
"reward_std": 0.1166299358010292,
"rewards/preference_model_reward": 0.31412482261657715,
"rewards/preference_model_reward/std": 0.11662992835044861,
"step": 295
},
{
"clip_ratio": 0.0003579020267352462,
"epoch": 0.25127334465195245,
"grad_norm": 1.1537409422856328,
"kl": 0.208984375,
"learning_rate": 2e-06,
"loss": -0.0011,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 628.5625,
"epoch": 0.25212224108658743,
"grad_norm": 1.1927096956384737,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": 0.0164,
"reward": 0.3190678358078003,
"reward_std": 0.14439481496810913,
"rewards/preference_model_reward": 0.3190678358078003,
"rewards/preference_model_reward/std": 0.14439481496810913,
"step": 297
},
{
"clip_ratio": 0.000986331608146429,
"epoch": 0.2529711375212224,
"grad_norm": 1.1803756478909,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": 0.0156,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 357.5,
"epoch": 0.2538200339558574,
"grad_norm": 2.207926479632091,
"kl": 0.2109375,
"learning_rate": 2e-06,
"loss": -0.0057,
"reward": 0.3464009761810303,
"reward_std": 0.1354563981294632,
"rewards/preference_model_reward": 0.3464009761810303,
"rewards/preference_model_reward/std": 0.135456383228302,
"step": 299
},
{
"clip_ratio": 0.0006329367170110345,
"epoch": 0.2546689303904924,
"grad_norm": 1.4143240393325465,
"kl": 0.2099609375,
"learning_rate": 2e-06,
"loss": -0.0064,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 625,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}