s600-1.3.0L-GRPO-it4 / trainer_state.json
PocketDoc's picture
Upload folder using huggingface_hub
eba011d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7802340702210663,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 369.8125,
"epoch": 0.0013003901170351106,
"grad_norm": 0.9659074481719256,
"kl": 0.0002956390380859375,
"learning_rate": 0.0,
"loss": 0.0102,
"reward": 0.24122674763202667,
"reward_std": 0.35857653617858887,
"rewards/preference_model_reward": 0.24122674763202667,
"rewards/preference_model_reward/std": 0.3657657206058502,
"step": 1
},
{
"clip_ratio": 0.0,
"epoch": 0.002600780234070221,
"grad_norm": 0.9657764983988186,
"kl": 0.0002956390380859375,
"learning_rate": 1e-07,
"loss": 0.0102,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 355.59375,
"epoch": 0.0039011703511053317,
"grad_norm": 0.9911250453286248,
"kl": 0.0003662109375,
"learning_rate": 2e-07,
"loss": -0.0076,
"reward": 0.1862872838973999,
"reward_std": 0.22117774188518524,
"rewards/preference_model_reward": 0.1862872838973999,
"rewards/preference_model_reward/std": 0.3502456843852997,
"step": 3
},
{
"clip_ratio": 0.000235976796830073,
"epoch": 0.005201560468140442,
"grad_norm": 0.9993331560979716,
"kl": 0.0003719329833984375,
"learning_rate": 3e-07,
"loss": -0.0076,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 376.03125,
"epoch": 0.006501950585175552,
"grad_norm": 0.9789094069833814,
"kl": 0.000347137451171875,
"learning_rate": 4e-07,
"loss": 0.0023,
"reward": 0.2914609909057617,
"reward_std": 0.3521167039871216,
"rewards/preference_model_reward": 0.2914609909057617,
"rewards/preference_model_reward/std": 0.4191484749317169,
"step": 5
},
{
"clip_ratio": 0.00034768745535984635,
"epoch": 0.007802340702210663,
"grad_norm": 0.9859506359104134,
"kl": 0.00034332275390625,
"learning_rate": 5e-07,
"loss": 0.0023,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 250.21875,
"epoch": 0.009102730819245773,
"grad_norm": 1.3649252351615753,
"kl": 0.000370025634765625,
"learning_rate": 6e-07,
"loss": 0.0062,
"reward": 0.4924929738044739,
"reward_std": 0.4304487109184265,
"rewards/preference_model_reward": 0.4924929738044739,
"rewards/preference_model_reward/std": 0.465331107378006,
"step": 7
},
{
"clip_ratio": 0.0,
"epoch": 0.010403120936280884,
"grad_norm": 1.059378833132842,
"kl": 0.0004215240478515625,
"learning_rate": 7e-07,
"loss": 0.0062,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 358.71875,
"epoch": 0.011703511053315995,
"grad_norm": 1.1643210223877094,
"kl": 0.00040435791015625,
"learning_rate": 8e-07,
"loss": -0.0024,
"reward": 0.42163607478141785,
"reward_std": 0.40762829780578613,
"rewards/preference_model_reward": 0.42163607478141785,
"rewards/preference_model_reward/std": 0.43679776787757874,
"step": 9
},
{
"clip_ratio": 0.00018502894090488553,
"epoch": 0.013003901170351105,
"grad_norm": 1.0909138168951602,
"kl": 0.00040435791015625,
"learning_rate": 9e-07,
"loss": -0.0025,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 337.875,
"epoch": 0.014304291287386216,
"grad_norm": 1.1375050760237047,
"kl": 0.00042724609375,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 0.39657002687454224,
"reward_std": 0.25011393427848816,
"rewards/preference_model_reward": 0.39657002687454224,
"rewards/preference_model_reward/std": 0.4063413441181183,
"step": 11
},
{
"clip_ratio": 0.00011823614477179945,
"epoch": 0.015604681404421327,
"grad_norm": 1.1070268098404832,
"kl": 0.000438690185546875,
"learning_rate": 1.1e-06,
"loss": 0.0026,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 229.90625,
"epoch": 0.016905071521456438,
"grad_norm": 0.8427732256393701,
"kl": 0.000579833984375,
"learning_rate": 1.2e-06,
"loss": 0.0031,
"reward": 0.2741852104663849,
"reward_std": 0.23615789413452148,
"rewards/preference_model_reward": 0.2741852104663849,
"rewards/preference_model_reward/std": 0.4120958745479584,
"step": 13
},
{
"clip_ratio": 0.00015812776109669358,
"epoch": 0.018205461638491547,
"grad_norm": 0.811295051358225,
"kl": 0.0005950927734375,
"learning_rate": 1.3e-06,
"loss": 0.003,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 319.40625,
"epoch": 0.01950585175552666,
"grad_norm": 0.015205465689863728,
"kl": 0.000701904296875,
"learning_rate": 1.4e-06,
"loss": 0.0001,
"reward": 0.00646161288022995,
"reward_std": 0.005409592762589455,
"rewards/preference_model_reward": 0.00646161288022995,
"rewards/preference_model_reward/std": 0.0058777108788490295,
"step": 15
},
{
"clip_ratio": 0.000241475339862518,
"epoch": 0.02080624187256177,
"grad_norm": 0.01476566433090852,
"kl": 0.000766754150390625,
"learning_rate": 1.5e-06,
"loss": 0.0001,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 251.5625,
"epoch": 0.022106631989596878,
"grad_norm": 1.132463593759137,
"kl": 0.001007080078125,
"learning_rate": 1.6e-06,
"loss": -0.0035,
"reward": 0.4667004942893982,
"reward_std": 0.4533562660217285,
"rewards/preference_model_reward": 0.4667004942893982,
"rewards/preference_model_reward/std": 0.4472948908805847,
"step": 17
},
{
"clip_ratio": 9.563886851537973e-05,
"epoch": 0.02340702210663199,
"grad_norm": 1.2516838066095344,
"kl": 0.0012054443359375,
"learning_rate": 1.6999999999999998e-06,
"loss": -0.0038,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 317.25,
"epoch": 0.0247074122236671,
"grad_norm": 1.3550285048912802,
"kl": 0.0013885498046875,
"learning_rate": 1.8e-06,
"loss": 0.0264,
"reward": 0.6398348808288574,
"reward_std": 0.41185781359672546,
"rewards/preference_model_reward": 0.6398348808288574,
"rewards/preference_model_reward/std": 0.4352080225944519,
"step": 19
},
{
"clip_ratio": 0.0002283816138515249,
"epoch": 0.02600780234070221,
"grad_norm": 1.3188454766806155,
"kl": 0.0018310546875,
"learning_rate": 1.8999999999999998e-06,
"loss": 0.0262,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 430.40625,
"epoch": 0.027308192457737322,
"grad_norm": 1.1882515797674884,
"kl": 0.0024871826171875,
"learning_rate": 2e-06,
"loss": 0.0227,
"reward": 0.46726664900779724,
"reward_std": 0.27668142318725586,
"rewards/preference_model_reward": 0.46726664900779724,
"rewards/preference_model_reward/std": 0.49030089378356934,
"step": 21
},
{
"clip_ratio": 5.293245703796856e-05,
"epoch": 0.02860858257477243,
"grad_norm": 1.144548816920826,
"kl": 0.003204345703125,
"learning_rate": 2e-06,
"loss": 0.0224,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 344.28125,
"epoch": 0.02990897269180754,
"grad_norm": 1.2911067063161903,
"kl": 0.00341796875,
"learning_rate": 2e-06,
"loss": -0.0015,
"reward": 0.5998943448066711,
"reward_std": 0.45733675360679626,
"rewards/preference_model_reward": 0.5998943448066711,
"rewards/preference_model_reward/std": 0.449917197227478,
"step": 23
},
{
"clip_ratio": 0.00026819598861038685,
"epoch": 0.031209362808842653,
"grad_norm": 1.2967596040632892,
"kl": 0.004364013671875,
"learning_rate": 2e-06,
"loss": -0.0017,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 393.21875,
"epoch": 0.032509752925877766,
"grad_norm": 1.5694696539232997,
"kl": 0.005126953125,
"learning_rate": 2e-06,
"loss": 0.0094,
"reward": 0.3718755841255188,
"reward_std": 0.3863350749015808,
"rewards/preference_model_reward": 0.3718755841255188,
"rewards/preference_model_reward/std": 0.4579065144062042,
"step": 25
},
{
"clip_ratio": 0.0005104803130961955,
"epoch": 0.033810143042912875,
"grad_norm": 1.4430846873009133,
"kl": 0.0064697265625,
"learning_rate": 2e-06,
"loss": 0.009,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 147.09375,
"epoch": 0.035110533159947985,
"grad_norm": 0.6793569926930156,
"kl": 0.0111083984375,
"learning_rate": 2e-06,
"loss": -0.0008,
"reward": 0.2612653970718384,
"reward_std": 0.29103392362594604,
"rewards/preference_model_reward": 0.2612653970718384,
"rewards/preference_model_reward/std": 0.3302207589149475,
"step": 27
},
{
"clip_ratio": 0.0024005018640309572,
"epoch": 0.036410923276983094,
"grad_norm": 0.8626605965309095,
"kl": 0.015380859375,
"learning_rate": 2e-06,
"loss": -0.0009,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 306.21875,
"epoch": 0.0377113133940182,
"grad_norm": 1.1377329684795203,
"kl": 0.0126953125,
"learning_rate": 2e-06,
"loss": -0.0029,
"reward": 0.7407833337783813,
"reward_std": 0.4136916995048523,
"rewards/preference_model_reward": 0.7407833337783813,
"rewards/preference_model_reward/std": 0.42335739731788635,
"step": 29
},
{
"clip_ratio": 0.0006882185116410255,
"epoch": 0.03901170351105332,
"grad_norm": 1.153060303007809,
"kl": 0.01519775390625,
"learning_rate": 2e-06,
"loss": -0.0034,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 197.21875,
"epoch": 0.04031209362808843,
"grad_norm": 0.3843055910687809,
"kl": 0.019775390625,
"learning_rate": 2e-06,
"loss": -0.0021,
"reward": 0.49551475048065186,
"reward_std": 0.14911304414272308,
"rewards/preference_model_reward": 0.49551475048065186,
"rewards/preference_model_reward/std": 0.45444655418395996,
"step": 31
},
{
"clip_ratio": 0.00019171778694726527,
"epoch": 0.04161248374512354,
"grad_norm": 0.35937765329925253,
"kl": 0.02392578125,
"learning_rate": 2e-06,
"loss": -0.0022,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 327.9375,
"epoch": 0.04291287386215865,
"grad_norm": 1.037959425094293,
"kl": 0.0235595703125,
"learning_rate": 2e-06,
"loss": 0.0051,
"reward": 0.2412111759185791,
"reward_std": 0.3637581765651703,
"rewards/preference_model_reward": 0.2412111759185791,
"rewards/preference_model_reward/std": 0.35813963413238525,
"step": 33
},
{
"clip_ratio": 0.0002937716490123421,
"epoch": 0.044213263979193757,
"grad_norm": 1.012063287494272,
"kl": 0.0269775390625,
"learning_rate": 2e-06,
"loss": 0.0048,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 351.1875,
"epoch": 0.045513654096228866,
"grad_norm": 0.049977121036559484,
"kl": 0.0267333984375,
"learning_rate": 2e-06,
"loss": 0.0007,
"reward": 0.5103617906570435,
"reward_std": 0.013033521361649036,
"rewards/preference_model_reward": 0.5103617906570435,
"rewards/preference_model_reward/std": 0.49780330061912537,
"step": 35
},
{
"clip_ratio": 0.00022163119865581393,
"epoch": 0.04681404421326398,
"grad_norm": 0.050415747625479665,
"kl": 0.0289306640625,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 386.84375,
"epoch": 0.04811443433029909,
"grad_norm": 1.320382086078505,
"kl": 0.03466796875,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.7545909881591797,
"reward_std": 0.3815248906612396,
"rewards/preference_model_reward": 0.7545909881591797,
"rewards/preference_model_reward/std": 0.38322117924690247,
"step": 37
},
{
"clip_ratio": 0.00017478244262747467,
"epoch": 0.0494148244473342,
"grad_norm": 1.3588405193554765,
"kl": 0.036865234375,
"learning_rate": 2e-06,
"loss": -0.0011,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 275.09375,
"epoch": 0.05071521456436931,
"grad_norm": 0.9774108213555497,
"kl": 0.031494140625,
"learning_rate": 2e-06,
"loss": 0.001,
"reward": 0.8388960361480713,
"reward_std": 0.26034486293792725,
"rewards/preference_model_reward": 0.8388960361480713,
"rewards/preference_model_reward/std": 0.3219398558139801,
"step": 39
},
{
"clip_ratio": 0.00045662192860618234,
"epoch": 0.05201560468140442,
"grad_norm": 0.9753549692487811,
"kl": 0.034423828125,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 329.625,
"epoch": 0.053315994798439535,
"grad_norm": 1.163784763708709,
"kl": 0.107421875,
"learning_rate": 2e-06,
"loss": -0.0101,
"reward": 0.6221742630004883,
"reward_std": 0.35088545083999634,
"rewards/preference_model_reward": 0.6221742630004883,
"rewards/preference_model_reward/std": 0.4535573124885559,
"step": 41
},
{
"clip_ratio": 0.0014115219237282872,
"epoch": 0.054616384915474644,
"grad_norm": 1.1725293952887486,
"kl": 0.09033203125,
"learning_rate": 2e-06,
"loss": -0.0106,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 360.25,
"epoch": 0.055916775032509754,
"grad_norm": 0.8648268899377819,
"kl": 0.03857421875,
"learning_rate": 2e-06,
"loss": -0.0036,
"reward": 0.86008620262146,
"reward_std": 0.29710525274276733,
"rewards/preference_model_reward": 0.86008620262146,
"rewards/preference_model_reward/std": 0.32698217034339905,
"step": 43
},
{
"clip_ratio": 0.0002532459911890328,
"epoch": 0.05721716514954486,
"grad_norm": 0.8200773607213043,
"kl": 0.041015625,
"learning_rate": 2e-06,
"loss": -0.004,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 305.84375,
"epoch": 0.05851755526657997,
"grad_norm": 0.5893143524753656,
"kl": 0.0625,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.6891697645187378,
"reward_std": 0.1563209444284439,
"rewards/preference_model_reward": 0.6891697645187378,
"rewards/preference_model_reward/std": 0.38344231247901917,
"step": 45
},
{
"clip_ratio": 0.00037541432539001107,
"epoch": 0.05981794538361508,
"grad_norm": 0.5686876994070226,
"kl": 0.06689453125,
"learning_rate": 2e-06,
"loss": -0.0017,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 241.5,
"epoch": 0.0611183355006502,
"grad_norm": 0.9353983139355063,
"kl": 0.07275390625,
"learning_rate": 2e-06,
"loss": -0.0027,
"reward": 0.9130573272705078,
"reward_std": 0.26053106784820557,
"rewards/preference_model_reward": 0.9130573272705078,
"rewards/preference_model_reward/std": 0.2611267864704132,
"step": 47
},
{
"clip_ratio": 0.0003936196444556117,
"epoch": 0.06241872561768531,
"grad_norm": 0.6672689260021744,
"kl": 0.0771484375,
"learning_rate": 2e-06,
"loss": -0.0029,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 325.53125,
"epoch": 0.06371911573472042,
"grad_norm": 0.7173309068130417,
"kl": 0.078125,
"learning_rate": 2e-06,
"loss": -0.0089,
"reward": 0.8731744885444641,
"reward_std": 0.2636582851409912,
"rewards/preference_model_reward": 0.8731744885444641,
"rewards/preference_model_reward/std": 0.2623332142829895,
"step": 49
},
{
"clip_ratio": 0.0003933516563847661,
"epoch": 0.06501950585175553,
"grad_norm": 0.6845578633909412,
"kl": 0.08203125,
"learning_rate": 2e-06,
"loss": -0.0092,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 309.125,
"epoch": 0.06631989596879063,
"grad_norm": 1.0072549333125307,
"kl": 0.0771484375,
"learning_rate": 2e-06,
"loss": -0.0025,
"reward": 0.7915850877761841,
"reward_std": 0.3451814651489258,
"rewards/preference_model_reward": 0.7915850877761841,
"rewards/preference_model_reward/std": 0.34321537613868713,
"step": 51
},
{
"clip_ratio": 9.516558930044994e-05,
"epoch": 0.06762028608582575,
"grad_norm": 0.9815790015455084,
"kl": 0.080078125,
"learning_rate": 2e-06,
"loss": -0.0029,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 380.3125,
"epoch": 0.06892067620286085,
"grad_norm": 1.3876607215088368,
"kl": 0.07373046875,
"learning_rate": 2e-06,
"loss": 0.026,
"reward": 0.8868893384933472,
"reward_std": 0.2691570520401001,
"rewards/preference_model_reward": 0.8868893384933472,
"rewards/preference_model_reward/std": 0.30143871903419495,
"step": 53
},
{
"clip_ratio": 0.00025676062796264887,
"epoch": 0.07022106631989597,
"grad_norm": 1.32392587242523,
"kl": 0.078125,
"learning_rate": 2e-06,
"loss": 0.0255,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 296.34375,
"epoch": 0.07152145643693109,
"grad_norm": 0.6600106697929482,
"kl": 0.09228515625,
"learning_rate": 2e-06,
"loss": 0.0022,
"reward": 0.9307242631912231,
"reward_std": 0.15007582306861877,
"rewards/preference_model_reward": 0.9307242631912231,
"rewards/preference_model_reward/std": 0.22033238410949707,
"step": 55
},
{
"clip_ratio": 0.0,
"epoch": 0.07282184655396619,
"grad_norm": 0.6509839555905929,
"kl": 0.0966796875,
"learning_rate": 2e-06,
"loss": 0.0019,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 374.15625,
"epoch": 0.0741222366710013,
"grad_norm": 0.8166372061077617,
"kl": 0.06640625,
"learning_rate": 2e-06,
"loss": -0.0058,
"reward": 0.8706126809120178,
"reward_std": 0.27126792073249817,
"rewards/preference_model_reward": 0.8706126809120178,
"rewards/preference_model_reward/std": 0.29494142532348633,
"step": 57
},
{
"clip_ratio": 0.0006189570995047688,
"epoch": 0.0754226267880364,
"grad_norm": 0.7575930393174628,
"kl": 0.06884765625,
"learning_rate": 2e-06,
"loss": -0.0062,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 273.53125,
"epoch": 0.07672301690507152,
"grad_norm": 0.35564737874795666,
"kl": 0.10546875,
"learning_rate": 2e-06,
"loss": -0.0117,
"reward": 0.929673433303833,
"reward_std": 0.20910362899303436,
"rewards/preference_model_reward": 0.929673433303833,
"rewards/preference_model_reward/std": 0.23424802720546722,
"step": 59
},
{
"clip_ratio": 0.0006005860050208867,
"epoch": 0.07802340702210664,
"grad_norm": 0.34766362031349507,
"kl": 0.109375,
"learning_rate": 2e-06,
"loss": -0.0119,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 345.0625,
"epoch": 0.07932379713914174,
"grad_norm": 1.186646095697512,
"kl": 0.09228515625,
"learning_rate": 2e-06,
"loss": -0.0028,
"reward": 0.7294732332229614,
"reward_std": 0.35979652404785156,
"rewards/preference_model_reward": 0.7294732332229614,
"rewards/preference_model_reward/std": 0.35507434606552124,
"step": 61
},
{
"clip_ratio": 0.00039078935515135527,
"epoch": 0.08062418725617686,
"grad_norm": 1.1224238133769298,
"kl": 0.0947265625,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 526.75,
"epoch": 0.08192457737321196,
"grad_norm": 1.2409103678829725,
"kl": 0.0849609375,
"learning_rate": 2e-06,
"loss": 0.0099,
"reward": 0.6618127822875977,
"reward_std": 0.3134193420410156,
"rewards/preference_model_reward": 0.6618127822875977,
"rewards/preference_model_reward/std": 0.44231337308883667,
"step": 63
},
{
"clip_ratio": 0.0005315542221069336,
"epoch": 0.08322496749024708,
"grad_norm": 1.20783409819399,
"kl": 0.087890625,
"learning_rate": 2e-06,
"loss": 0.0093,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 321.375,
"epoch": 0.08452535760728218,
"grad_norm": 1.1952606760232154,
"kl": 0.1201171875,
"learning_rate": 2e-06,
"loss": 0.0018,
"reward": 0.8502117395401001,
"reward_std": 0.28802555799484253,
"rewards/preference_model_reward": 0.8502117395401001,
"rewards/preference_model_reward/std": 0.29467442631721497,
"step": 65
},
{
"clip_ratio": 0.0005706611555069685,
"epoch": 0.0858257477243173,
"grad_norm": 0.9932076087850403,
"kl": 0.123046875,
"learning_rate": 2e-06,
"loss": 0.0014,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 382.40625,
"epoch": 0.08712613784135241,
"grad_norm": 0.9026575096116797,
"kl": 0.12255859375,
"learning_rate": 2e-06,
"loss": 0.0029,
"reward": 0.5317609906196594,
"reward_std": 0.22324004769325256,
"rewards/preference_model_reward": 0.5317609906196594,
"rewards/preference_model_reward/std": 0.4589392840862274,
"step": 67
},
{
"clip_ratio": 0.0009551330585964024,
"epoch": 0.08842652795838751,
"grad_norm": 0.8034448562270808,
"kl": 0.1240234375,
"learning_rate": 2e-06,
"loss": 0.0025,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 352.125,
"epoch": 0.08972691807542263,
"grad_norm": 1.051559244432883,
"kl": 0.1005859375,
"learning_rate": 2e-06,
"loss": -0.0008,
"reward": 0.8645539879798889,
"reward_std": 0.30358821153640747,
"rewards/preference_model_reward": 0.8645539879798889,
"rewards/preference_model_reward/std": 0.3146204948425293,
"step": 69
},
{
"clip_ratio": 0.0004191896296106279,
"epoch": 0.09102730819245773,
"grad_norm": 1.138873373412801,
"kl": 0.10302734375,
"learning_rate": 2e-06,
"loss": -0.0013,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 394.9375,
"epoch": 0.09232769830949285,
"grad_norm": 0.7693875160579493,
"kl": 0.10498046875,
"learning_rate": 2e-06,
"loss": 0.0012,
"reward": 0.9269964694976807,
"reward_std": 0.1615072637796402,
"rewards/preference_model_reward": 0.9269964694976807,
"rewards/preference_model_reward/std": 0.2366172969341278,
"step": 71
},
{
"clip_ratio": 0.0,
"epoch": 0.09362808842652796,
"grad_norm": 0.721838481847161,
"kl": 0.107421875,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 484.0,
"epoch": 0.09492847854356307,
"grad_norm": 0.67467714871582,
"kl": 0.10546875,
"learning_rate": 2e-06,
"loss": 0.0017,
"reward": 0.9037147760391235,
"reward_std": 0.14057296514511108,
"rewards/preference_model_reward": 0.9037147760391235,
"rewards/preference_model_reward/std": 0.21866993606090546,
"step": 73
},
{
"clip_ratio": 0.00014462298713624477,
"epoch": 0.09622886866059818,
"grad_norm": 0.6527573526740783,
"kl": 0.10888671875,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 352.8125,
"epoch": 0.09752925877763328,
"grad_norm": 1.0730024250380301,
"kl": 0.1083984375,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.8341162204742432,
"reward_std": 0.3151201903820038,
"rewards/preference_model_reward": 0.8341162204742432,
"rewards/preference_model_reward/std": 0.32914233207702637,
"step": 75
},
{
"clip_ratio": 0.00022457953309640288,
"epoch": 0.0988296488946684,
"grad_norm": 1.4755587484249004,
"kl": 0.11328125,
"learning_rate": 2e-06,
"loss": -0.0019,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 478.6875,
"epoch": 0.10013003901170352,
"grad_norm": 0.6070744376376411,
"kl": 0.109375,
"learning_rate": 2e-06,
"loss": 0.0073,
"reward": 0.9789870977401733,
"reward_std": 0.07569272816181183,
"rewards/preference_model_reward": 0.9789870977401733,
"rewards/preference_model_reward/std": 0.1074473112821579,
"step": 77
},
{
"clip_ratio": 0.0003504111082293093,
"epoch": 0.10143042912873862,
"grad_norm": 0.5334363278812777,
"kl": 0.115234375,
"learning_rate": 2e-06,
"loss": 0.007,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 348.0625,
"epoch": 0.10273081924577374,
"grad_norm": 0.7094682971060341,
"kl": 0.10791015625,
"learning_rate": 2e-06,
"loss": -0.0005,
"reward": 0.9031955003738403,
"reward_std": 0.1537085920572281,
"rewards/preference_model_reward": 0.9031955003738403,
"rewards/preference_model_reward/std": 0.23537583649158478,
"step": 79
},
{
"clip_ratio": 9.012256487039849e-05,
"epoch": 0.10403120936280884,
"grad_norm": 0.6528569724682338,
"kl": 0.1103515625,
"learning_rate": 2e-06,
"loss": -0.0009,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 254.34375,
"epoch": 0.10533159947984395,
"grad_norm": 0.015979283278672672,
"kl": 0.126953125,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.503002941608429,
"reward_std": 0.0007813164265826344,
"rewards/preference_model_reward": 0.503002941608429,
"rewards/preference_model_reward/std": 0.5049507021903992,
"step": 81
},
{
"clip_ratio": 0.0016290850471705198,
"epoch": 0.10663198959687907,
"grad_norm": 0.013573220175038038,
"kl": 0.115234375,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 252.90625,
"epoch": 0.10793237971391417,
"grad_norm": 0.34340056371333394,
"kl": 0.181640625,
"learning_rate": 2e-06,
"loss": -0.0008,
"reward": 0.9807977676391602,
"reward_std": 0.07286863774061203,
"rewards/preference_model_reward": 0.9807977676391602,
"rewards/preference_model_reward/std": 0.07266176491975784,
"step": 83
},
{
"clip_ratio": 0.0008525701705366373,
"epoch": 0.10923276983094929,
"grad_norm": 0.18116145894909788,
"kl": 0.138671875,
"learning_rate": 2e-06,
"loss": -0.0009,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 331.875,
"epoch": 0.11053315994798439,
"grad_norm": 0.48967266770314716,
"kl": 0.1328125,
"learning_rate": 2e-06,
"loss": 0.0,
"reward": 0.9723169803619385,
"reward_std": 0.11073215305805206,
"rewards/preference_model_reward": 0.9723169803619385,
"rewards/preference_model_reward/std": 0.15659891068935394,
"step": 85
},
{
"clip_ratio": 0.0005794943426735699,
"epoch": 0.11183355006501951,
"grad_norm": 0.4092702632564478,
"kl": 0.12109375,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 357.0,
"epoch": 0.11313394018205461,
"grad_norm": 0.32187360662760156,
"kl": 0.10546875,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.5838743448257446,
"reward_std": 0.09063759446144104,
"rewards/preference_model_reward": 0.5838743448257446,
"rewards/preference_model_reward/std": 0.4262309968471527,
"step": 87
},
{
"clip_ratio": 0.0007115560583770275,
"epoch": 0.11443433029908973,
"grad_norm": 0.31072273769841946,
"kl": 0.09814453125,
"learning_rate": 2e-06,
"loss": -0.0016,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 335.3125,
"epoch": 0.11573472041612484,
"grad_norm": 1.1176337391124187,
"kl": 0.09716796875,
"learning_rate": 2e-06,
"loss": 0.0161,
"reward": 0.8303125500679016,
"reward_std": 0.26434481143951416,
"rewards/preference_model_reward": 0.8303125500679016,
"rewards/preference_model_reward/std": 0.2824815809726715,
"step": 89
},
{
"clip_ratio": 0.0020064229611307383,
"epoch": 0.11703511053315994,
"grad_norm": 1.1042794016878086,
"kl": 0.09375,
"learning_rate": 2e-06,
"loss": 0.0155,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 349.125,
"epoch": 0.11833550065019506,
"grad_norm": 0.6566935601055136,
"kl": 0.09814453125,
"learning_rate": 2e-06,
"loss": -0.007,
"reward": 0.8883383274078369,
"reward_std": 0.14250892400741577,
"rewards/preference_model_reward": 0.8883383274078369,
"rewards/preference_model_reward/std": 0.22842475771903992,
"step": 91
},
{
"clip_ratio": 9.124087227974087e-05,
"epoch": 0.11963589076723016,
"grad_norm": 0.6051172269491196,
"kl": 0.09521484375,
"learning_rate": 2e-06,
"loss": -0.0073,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 431.71875,
"epoch": 0.12093628088426528,
"grad_norm": 1.1072727024624063,
"kl": 0.091796875,
"learning_rate": 2e-06,
"loss": 0.0009,
"reward": 0.7813946008682251,
"reward_std": 0.2200125753879547,
"rewards/preference_model_reward": 0.7813946008682251,
"rewards/preference_model_reward/std": 0.3781771957874298,
"step": 93
},
{
"clip_ratio": 0.00021242158254608512,
"epoch": 0.1222366710013004,
"grad_norm": 1.130829779328361,
"kl": 0.09033203125,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 380.34375,
"epoch": 0.1235370611183355,
"grad_norm": 1.1837052245253927,
"kl": 0.083984375,
"learning_rate": 2e-06,
"loss": 0.0048,
"reward": 0.6728702187538147,
"reward_std": 0.3235069513320923,
"rewards/preference_model_reward": 0.6728702187538147,
"rewards/preference_model_reward/std": 0.3584347665309906,
"step": 95
},
{
"clip_ratio": 0.0003149464901071042,
"epoch": 0.12483745123537061,
"grad_norm": 1.0580700210395666,
"kl": 0.08447265625,
"learning_rate": 2e-06,
"loss": 0.0043,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 491.96875,
"epoch": 0.12613784135240572,
"grad_norm": 14.99699485585395,
"kl": 2.140625,
"learning_rate": 2e-06,
"loss": -0.0011,
"reward": 0.6248654127120972,
"reward_std": 0.30046606063842773,
"rewards/preference_model_reward": 0.6248654127120972,
"rewards/preference_model_reward/std": 0.3810950815677643,
"step": 97
},
{
"clip_ratio": 0.0009762371191754937,
"epoch": 0.12743823146944083,
"grad_norm": 56.741945106919694,
"kl": 0.16015625,
"learning_rate": 2e-06,
"loss": 0.002,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 446.78125,
"epoch": 0.12873862158647595,
"grad_norm": 1.25074838363305,
"kl": 0.0830078125,
"learning_rate": 2e-06,
"loss": -0.0056,
"reward": 0.8078266978263855,
"reward_std": 0.33678656816482544,
"rewards/preference_model_reward": 0.8078266978263855,
"rewards/preference_model_reward/std": 0.37187322974205017,
"step": 99
},
{
"clip_ratio": 0.0002822041278705001,
"epoch": 0.13003901170351106,
"grad_norm": 1.2248064666505254,
"kl": 0.08447265625,
"learning_rate": 2e-06,
"loss": -0.0062,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 321.34375,
"epoch": 0.13133940182054615,
"grad_norm": 0.8820706667295893,
"kl": 0.083984375,
"learning_rate": 2e-06,
"loss": -0.0028,
"reward": 0.8281782269477844,
"reward_std": 0.25918400287628174,
"rewards/preference_model_reward": 0.8281782269477844,
"rewards/preference_model_reward/std": 0.3205867111682892,
"step": 101
},
{
"clip_ratio": 0.0001883239165181294,
"epoch": 0.13263979193758127,
"grad_norm": 0.8633439241133061,
"kl": 0.0849609375,
"learning_rate": 2e-06,
"loss": -0.0033,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 516.125,
"epoch": 0.13394018205461639,
"grad_norm": 0.9179254188048204,
"kl": 0.1162109375,
"learning_rate": 2e-06,
"loss": -0.0012,
"reward": 0.8762021064758301,
"reward_std": 0.1672360599040985,
"rewards/preference_model_reward": 0.8762021064758301,
"rewards/preference_model_reward/std": 0.26448386907577515,
"step": 103
},
{
"clip_ratio": 0.0004665090818889439,
"epoch": 0.1352405721716515,
"grad_norm": 0.8518295339560051,
"kl": 0.119140625,
"learning_rate": 2e-06,
"loss": -0.0017,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 459.09375,
"epoch": 0.13654096228868662,
"grad_norm": 1.3294307657803117,
"kl": 0.09375,
"learning_rate": 2e-06,
"loss": -0.0053,
"reward": 0.834820032119751,
"reward_std": 0.20883573591709137,
"rewards/preference_model_reward": 0.834820032119751,
"rewards/preference_model_reward/std": 0.33552286028862,
"step": 105
},
{
"clip_ratio": 0.00029856746550649405,
"epoch": 0.1378413524057217,
"grad_norm": 1.2984474405085682,
"kl": 0.09423828125,
"learning_rate": 2e-06,
"loss": -0.0061,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 511.125,
"epoch": 0.13914174252275682,
"grad_norm": 1.336937835070471,
"kl": 0.09716796875,
"learning_rate": 2e-06,
"loss": 0.006,
"reward": 0.8500386476516724,
"reward_std": 0.262530118227005,
"rewards/preference_model_reward": 0.8500386476516724,
"rewards/preference_model_reward/std": 0.33816027641296387,
"step": 107
},
{
"clip_ratio": 0.00040964456275105476,
"epoch": 0.14044213263979194,
"grad_norm": 1.3064575603389494,
"kl": 0.09765625,
"learning_rate": 2e-06,
"loss": 0.0052,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 581.71875,
"epoch": 0.14174252275682706,
"grad_norm": 0.7932427250097717,
"kl": 0.11328125,
"learning_rate": 2e-06,
"loss": 0.0012,
"reward": 0.954128086566925,
"reward_std": 0.1318557858467102,
"rewards/preference_model_reward": 0.954128086566925,
"rewards/preference_model_reward/std": 0.18926787376403809,
"step": 109
},
{
"clip_ratio": 0.0003079274611081928,
"epoch": 0.14304291287386217,
"grad_norm": 0.7211363567303583,
"kl": 0.1142578125,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 625.71875,
"epoch": 0.14434330299089726,
"grad_norm": 1.5659320313843499,
"kl": 0.09423828125,
"learning_rate": 2e-06,
"loss": 0.012,
"reward": 0.5640057921409607,
"reward_std": 0.3570883274078369,
"rewards/preference_model_reward": 0.5640057921409607,
"rewards/preference_model_reward/std": 0.4593791365623474,
"step": 111
},
{
"clip_ratio": 0.0002680362085811794,
"epoch": 0.14564369310793238,
"grad_norm": 1.465110556785164,
"kl": 0.095703125,
"learning_rate": 2e-06,
"loss": 0.0113,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 488.59375,
"epoch": 0.1469440832249675,
"grad_norm": 1.409521260931884,
"kl": 0.111328125,
"learning_rate": 2e-06,
"loss": 0.0261,
"reward": 0.8961158990859985,
"reward_std": 0.2921155095100403,
"rewards/preference_model_reward": 0.8961158990859985,
"rewards/preference_model_reward/std": 0.29145917296409607,
"step": 113
},
{
"clip_ratio": 0.0003246698761358857,
"epoch": 0.1482444733420026,
"grad_norm": 1.2950122397091601,
"kl": 0.11328125,
"learning_rate": 2e-06,
"loss": 0.0253,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 351.40625,
"epoch": 0.14954486345903772,
"grad_norm": 0.7110353333635794,
"kl": 0.1240234375,
"learning_rate": 2e-06,
"loss": -0.0015,
"reward": 0.9187972545623779,
"reward_std": 0.23119348287582397,
"rewards/preference_model_reward": 0.9187972545623779,
"rewards/preference_model_reward/std": 0.22826893627643585,
"step": 115
},
{
"clip_ratio": 0.0002654148265719414,
"epoch": 0.1508452535760728,
"grad_norm": 0.6454362421608829,
"kl": 0.12158203125,
"learning_rate": 2e-06,
"loss": -0.0018,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 415.0,
"epoch": 0.15214564369310793,
"grad_norm": 1.3938640333126824,
"kl": 0.115234375,
"learning_rate": 2e-06,
"loss": 0.0042,
"reward": 0.7364434003829956,
"reward_std": 0.3461056351661682,
"rewards/preference_model_reward": 0.7364434003829956,
"rewards/preference_model_reward/std": 0.3824731409549713,
"step": 117
},
{
"clip_ratio": 0.00016578214126639068,
"epoch": 0.15344603381014305,
"grad_norm": 1.1848160683983286,
"kl": 0.1162109375,
"learning_rate": 2e-06,
"loss": 0.0035,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 461.65625,
"epoch": 0.15474642392717816,
"grad_norm": 1.2327690933799054,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.7404364347457886,
"reward_std": 0.22904622554779053,
"rewards/preference_model_reward": 0.7404364347457886,
"rewards/preference_model_reward/std": 0.4136257469654083,
"step": 119
},
{
"clip_ratio": 0.0004128115251660347,
"epoch": 0.15604681404421328,
"grad_norm": 1.1864725204703228,
"kl": 0.1376953125,
"learning_rate": 2e-06,
"loss": -0.0031,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 576.6875,
"epoch": 0.15734720416124837,
"grad_norm": 1.2507452373907348,
"kl": 0.10791015625,
"learning_rate": 2e-06,
"loss": 0.009,
"reward": 0.667312741279602,
"reward_std": 0.2824709415435791,
"rewards/preference_model_reward": 0.667312741279602,
"rewards/preference_model_reward/std": 0.4015900194644928,
"step": 121
},
{
"clip_ratio": 0.0003672163584269583,
"epoch": 0.15864759427828348,
"grad_norm": 1.173660124640144,
"kl": 0.109375,
"learning_rate": 2e-06,
"loss": 0.0084,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 409.59375,
"epoch": 0.1599479843953186,
"grad_norm": 0.8600581560919843,
"kl": 0.123046875,
"learning_rate": 2e-06,
"loss": -0.02,
"reward": 0.8616656064987183,
"reward_std": 0.2982789874076843,
"rewards/preference_model_reward": 0.8616656064987183,
"rewards/preference_model_reward/std": 0.30772268772125244,
"step": 123
},
{
"clip_ratio": 6.778741953894496e-05,
"epoch": 0.16124837451235371,
"grad_norm": 0.8288777030969212,
"kl": 0.123046875,
"learning_rate": 2e-06,
"loss": -0.0205,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 436.96875,
"epoch": 0.1625487646293888,
"grad_norm": 1.7782879022760736,
"kl": 0.10986328125,
"learning_rate": 2e-06,
"loss": -0.0159,
"reward": 0.8924694061279297,
"reward_std": 0.2801273465156555,
"rewards/preference_model_reward": 0.8924694061279297,
"rewards/preference_model_reward/std": 0.2758771777153015,
"step": 125
},
{
"clip_ratio": 0.00029164942679926753,
"epoch": 0.16384915474642392,
"grad_norm": 0.7890220661652254,
"kl": 0.1123046875,
"learning_rate": 2e-06,
"loss": -0.0162,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 499.90625,
"epoch": 0.16514954486345904,
"grad_norm": 0.5249880512825823,
"kl": 0.12109375,
"learning_rate": 2e-06,
"loss": 0.0085,
"reward": 0.9754809141159058,
"reward_std": 0.06936999410390854,
"rewards/preference_model_reward": 0.9754809141159058,
"rewards/preference_model_reward/std": 0.09967197477817535,
"step": 127
},
{
"clip_ratio": 6.330716860247776e-05,
"epoch": 0.16644993498049415,
"grad_norm": 0.48145633919403286,
"kl": 0.1220703125,
"learning_rate": 2e-06,
"loss": 0.0082,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 476.0,
"epoch": 0.16775032509752927,
"grad_norm": 0.8581044560913536,
"kl": 0.12890625,
"learning_rate": 2e-06,
"loss": -0.0003,
"reward": 0.9375989437103271,
"reward_std": 0.14338433742523193,
"rewards/preference_model_reward": 0.9375989437103271,
"rewards/preference_model_reward/std": 0.2093113660812378,
"step": 129
},
{
"clip_ratio": 0.00023169601627159864,
"epoch": 0.16905071521456436,
"grad_norm": 0.6716469385695882,
"kl": 0.1298828125,
"learning_rate": 2e-06,
"loss": -0.0006,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 417.53125,
"epoch": 0.17035110533159947,
"grad_norm": 0.9531216293434532,
"kl": 0.1220703125,
"learning_rate": 2e-06,
"loss": -0.0145,
"reward": 0.874575138092041,
"reward_std": 0.312376469373703,
"rewards/preference_model_reward": 0.874575138092041,
"rewards/preference_model_reward/std": 0.325155109167099,
"step": 131
},
{
"clip_ratio": 0.00021792339975945652,
"epoch": 0.1716514954486346,
"grad_norm": 0.9321159155810473,
"kl": 0.1240234375,
"learning_rate": 2e-06,
"loss": -0.0151,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 444.84375,
"epoch": 0.1729518855656697,
"grad_norm": 0.9100648016845723,
"kl": 0.1220703125,
"learning_rate": 2e-06,
"loss": -0.0057,
"reward": 0.1512603610754013,
"reward_std": 0.15578344464302063,
"rewards/preference_model_reward": 0.1512603610754013,
"rewards/preference_model_reward/std": 0.2468794584274292,
"step": 133
},
{
"clip_ratio": 0.0009341657860204577,
"epoch": 0.17425227568270482,
"grad_norm": 0.8979224574809616,
"kl": 0.1259765625,
"learning_rate": 2e-06,
"loss": -0.0062,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 499.71875,
"epoch": 0.1755526657997399,
"grad_norm": 1.1433790438627716,
"kl": 0.26171875,
"learning_rate": 2e-06,
"loss": 0.0028,
"reward": 0.90513676404953,
"reward_std": 0.23498843610286713,
"rewards/preference_model_reward": 0.90513676404953,
"rewards/preference_model_reward/std": 0.24004317820072174,
"step": 135
},
{
"clip_ratio": 0.0007208128226920962,
"epoch": 0.17685305591677503,
"grad_norm": 1.0273440654539872,
"kl": 0.2197265625,
"learning_rate": 2e-06,
"loss": 0.0022,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 511.1875,
"epoch": 0.17815344603381014,
"grad_norm": 0.3778536448237069,
"kl": 0.1318359375,
"learning_rate": 2e-06,
"loss": 0.0016,
"reward": 0.9848020076751709,
"reward_std": 0.060791999101638794,
"rewards/preference_model_reward": 0.9848020076751709,
"rewards/preference_model_reward/std": 0.0859728679060936,
"step": 137
},
{
"clip_ratio": 0.00015050009824335575,
"epoch": 0.17945383615084526,
"grad_norm": 0.38068416605149247,
"kl": 0.1337890625,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 330.78125,
"epoch": 0.18075422626788037,
"grad_norm": 1.121182505147895,
"kl": 0.140625,
"learning_rate": 2e-06,
"loss": -0.0082,
"reward": 0.6605119705200195,
"reward_std": 0.357056200504303,
"rewards/preference_model_reward": 0.6605119705200195,
"rewards/preference_model_reward/std": 0.3964306712150574,
"step": 139
},
{
"clip_ratio": 0.00019087232067249715,
"epoch": 0.18205461638491546,
"grad_norm": 1.0796848772767726,
"kl": 0.142578125,
"learning_rate": 2e-06,
"loss": -0.0089,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 576.625,
"epoch": 0.18335500650195058,
"grad_norm": 1.3173408698053826,
"kl": 0.1484375,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.5397460460662842,
"reward_std": 0.27076956629753113,
"rewards/preference_model_reward": 0.5397460460662842,
"rewards/preference_model_reward/std": 0.43955907225608826,
"step": 141
},
{
"clip_ratio": 0.000534381833858788,
"epoch": 0.1846553966189857,
"grad_norm": 1.2508652015275377,
"kl": 0.150390625,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 417.5625,
"epoch": 0.1859557867360208,
"grad_norm": 1.0095689522807012,
"kl": 0.1298828125,
"learning_rate": 2e-06,
"loss": -0.0072,
"reward": 0.8610371351242065,
"reward_std": 0.2966481149196625,
"rewards/preference_model_reward": 0.8610371351242065,
"rewards/preference_model_reward/std": 0.30670949816703796,
"step": 143
},
{
"clip_ratio": 5.617977512883954e-05,
"epoch": 0.18725617685305593,
"grad_norm": 0.9826898380088299,
"kl": 0.1298828125,
"learning_rate": 2e-06,
"loss": -0.0078,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 549.96875,
"epoch": 0.18855656697009102,
"grad_norm": 1.7766319258647452,
"kl": 0.158203125,
"learning_rate": 2e-06,
"loss": 0.0135,
"reward": 0.6295263171195984,
"reward_std": 0.4187769293785095,
"rewards/preference_model_reward": 0.6295263171195984,
"rewards/preference_model_reward/std": 0.42381730675697327,
"step": 145
},
{
"clip_ratio": 0.0003444340836722404,
"epoch": 0.18985695708712613,
"grad_norm": 1.7285319530442194,
"kl": 0.1484375,
"learning_rate": 2e-06,
"loss": 0.0125,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 396.78125,
"epoch": 0.19115734720416125,
"grad_norm": 1.1051043019440878,
"kl": 0.1318359375,
"learning_rate": 2e-06,
"loss": -0.0055,
"reward": 0.5800995826721191,
"reward_std": 0.32392293214797974,
"rewards/preference_model_reward": 0.5800995826721191,
"rewards/preference_model_reward/std": 0.3707711398601532,
"step": 147
},
{
"clip_ratio": 0.0003981678746640682,
"epoch": 0.19245773732119636,
"grad_norm": 1.0453400928741006,
"kl": 0.1328125,
"learning_rate": 2e-06,
"loss": -0.0062,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 334.28125,
"epoch": 0.19375812743823148,
"grad_norm": 0.03748773956154849,
"kl": 0.1376953125,
"learning_rate": 2e-06,
"loss": 0.0,
"reward": 0.9981102347373962,
"reward_std": 0.0075591248460114,
"rewards/preference_model_reward": 0.9981102347373962,
"rewards/preference_model_reward/std": 0.010690220631659031,
"step": 149
},
{
"clip_ratio": 8.632597018731758e-05,
"epoch": 0.19505851755526657,
"grad_norm": 0.03228936227298709,
"kl": 0.1328125,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 447.875,
"epoch": 0.19635890767230169,
"grad_norm": 0.05752238966724317,
"kl": 0.15234375,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 0.9974650144577026,
"reward_std": 0.010140029713511467,
"rewards/preference_model_reward": 0.9974650144577026,
"rewards/preference_model_reward/std": 0.014340158551931381,
"step": 151
},
{
"clip_ratio": 0.00020508613670244813,
"epoch": 0.1976592977893368,
"grad_norm": 0.04811959314845179,
"kl": 0.1416015625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 550.0625,
"epoch": 0.19895968790637192,
"grad_norm": 1.0016776671397591,
"kl": 0.1474609375,
"learning_rate": 2e-06,
"loss": -0.0062,
"reward": 0.912561297416687,
"reward_std": 0.25192081928253174,
"rewards/preference_model_reward": 0.912561297416687,
"rewards/preference_model_reward/std": 0.25600963830947876,
"step": 153
},
{
"clip_ratio": 0.0003616804606281221,
"epoch": 0.20026007802340703,
"grad_norm": 0.9381106713638003,
"kl": 0.1376953125,
"learning_rate": 2e-06,
"loss": -0.0068,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 779.6875,
"epoch": 0.20156046814044212,
"grad_norm": 1.9176344667399166,
"kl": 0.1240234375,
"learning_rate": 2e-06,
"loss": 0.0209,
"reward": 0.46237650513648987,
"reward_std": 0.37269675731658936,
"rewards/preference_model_reward": 0.46237650513648987,
"rewards/preference_model_reward/std": 0.4106147587299347,
"step": 155
},
{
"clip_ratio": 0.0005603111931122839,
"epoch": 0.20286085825747724,
"grad_norm": 1.9416642185631487,
"kl": 0.11962890625,
"learning_rate": 2e-06,
"loss": 0.0196,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 561.15625,
"epoch": 0.20416124837451236,
"grad_norm": 1.246090268048283,
"kl": 0.1171875,
"learning_rate": 2e-06,
"loss": 0.0074,
"reward": 0.8197011351585388,
"reward_std": 0.32638221979141235,
"rewards/preference_model_reward": 0.8197011351585388,
"rewards/preference_model_reward/std": 0.3211716413497925,
"step": 157
},
{
"clip_ratio": 0.00015653572336304933,
"epoch": 0.20546163849154747,
"grad_norm": 1.1878525300972134,
"kl": 0.115234375,
"learning_rate": 2e-06,
"loss": 0.0067,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 498.78125,
"epoch": 0.2067620286085826,
"grad_norm": 1.6471484937824736,
"kl": 0.1083984375,
"learning_rate": 2e-06,
"loss": 0.0099,
"reward": 0.6723287105560303,
"reward_std": 0.42400574684143066,
"rewards/preference_model_reward": 0.6723287105560303,
"rewards/preference_model_reward/std": 0.42361700534820557,
"step": 159
},
{
"clip_ratio": 0.00019252923084422946,
"epoch": 0.20806241872561768,
"grad_norm": 1.5640304604076711,
"kl": 0.10888671875,
"learning_rate": 2e-06,
"loss": 0.0089,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 452.59375,
"epoch": 0.2093628088426528,
"grad_norm": 1.1976569308957934,
"kl": 0.1337890625,
"learning_rate": 2e-06,
"loss": -0.0028,
"reward": 0.8096756935119629,
"reward_std": 0.33807146549224854,
"rewards/preference_model_reward": 0.8096756935119629,
"rewards/preference_model_reward/std": 0.34619417786598206,
"step": 161
},
{
"clip_ratio": 0.00016228496679104865,
"epoch": 0.2106631989596879,
"grad_norm": 1.2198444194251261,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": -0.0036,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 612.21875,
"epoch": 0.21196358907672302,
"grad_norm": 1.6710655753119792,
"kl": 0.1328125,
"learning_rate": 2e-06,
"loss": 0.0194,
"reward": 0.7978184223175049,
"reward_std": 0.26372814178466797,
"rewards/preference_model_reward": 0.7978184223175049,
"rewards/preference_model_reward/std": 0.3677564263343811,
"step": 163
},
{
"clip_ratio": 0.0002533727674745023,
"epoch": 0.21326397919375814,
"grad_norm": 1.4829182389200866,
"kl": 0.1328125,
"learning_rate": 2e-06,
"loss": 0.0188,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 368.71875,
"epoch": 0.21456436931079323,
"grad_norm": 1.0731863888448565,
"kl": 0.1279296875,
"learning_rate": 2e-06,
"loss": 0.0004,
"reward": 0.7712475061416626,
"reward_std": 0.3398103713989258,
"rewards/preference_model_reward": 0.7712475061416626,
"rewards/preference_model_reward/std": 0.34702128171920776,
"step": 165
},
{
"clip_ratio": 0.0001748301729094237,
"epoch": 0.21586475942782835,
"grad_norm": 1.0142522280669701,
"kl": 0.12890625,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 613.0,
"epoch": 0.21716514954486346,
"grad_norm": 1.1219818347809352,
"kl": 0.1357421875,
"learning_rate": 2e-06,
"loss": 0.0129,
"reward": 0.5442590713500977,
"reward_std": 0.2779287099838257,
"rewards/preference_model_reward": 0.5442590713500977,
"rewards/preference_model_reward/std": 0.41137993335723877,
"step": 167
},
{
"clip_ratio": 0.00020389427663758397,
"epoch": 0.21846553966189858,
"grad_norm": 1.1061402577957113,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": 0.0122,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 612.90625,
"epoch": 0.21976592977893367,
"grad_norm": 1.72194342380149,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": 0.0029,
"reward": 0.684001624584198,
"reward_std": 0.3655635118484497,
"rewards/preference_model_reward": 0.684001624584198,
"rewards/preference_model_reward/std": 0.4322417080402374,
"step": 169
},
{
"clip_ratio": 0.000274754042038694,
"epoch": 0.22106631989596878,
"grad_norm": 1.6428549273522806,
"kl": 0.138671875,
"learning_rate": 2e-06,
"loss": 0.002,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 634.28125,
"epoch": 0.2223667100130039,
"grad_norm": 1.614101972009106,
"kl": 0.138671875,
"learning_rate": 2e-06,
"loss": 0.0056,
"reward": 0.8601099848747253,
"reward_std": 0.31807541847229004,
"rewards/preference_model_reward": 0.8601099848747253,
"rewards/preference_model_reward/std": 0.3315056562423706,
"step": 171
},
{
"clip_ratio": 0.00013853044947609305,
"epoch": 0.22366710013003901,
"grad_norm": 1.3296533646627524,
"kl": 0.1396484375,
"learning_rate": 2e-06,
"loss": 0.0049,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 401.59375,
"epoch": 0.22496749024707413,
"grad_norm": 0.6357224318885739,
"kl": 0.1552734375,
"learning_rate": 2e-06,
"loss": -0.0,
"reward": 0.9281669855117798,
"reward_std": 0.1928824484348297,
"rewards/preference_model_reward": 0.9281669855117798,
"rewards/preference_model_reward/std": 0.19411809742450714,
"step": 173
},
{
"clip_ratio": 0.00015087510109879076,
"epoch": 0.22626788036410922,
"grad_norm": 0.5759303474167251,
"kl": 0.158203125,
"learning_rate": 2e-06,
"loss": -0.0004,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 489.71875,
"epoch": 0.22756827048114434,
"grad_norm": 1.6254811605839956,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": -0.0008,
"reward": 0.6483066082000732,
"reward_std": 0.4315972328186035,
"rewards/preference_model_reward": 0.6483066082000732,
"rewards/preference_model_reward/std": 0.4604679346084595,
"step": 175
},
{
"clip_ratio": 0.00034807526390068233,
"epoch": 0.22886866059817945,
"grad_norm": 1.5654975897789054,
"kl": 0.138671875,
"learning_rate": 2e-06,
"loss": -0.0018,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 418.9375,
"epoch": 0.23016905071521457,
"grad_norm": 1.2843572461072899,
"kl": 0.1611328125,
"learning_rate": 2e-06,
"loss": -0.0046,
"reward": 0.7664402723312378,
"reward_std": 0.3110997676849365,
"rewards/preference_model_reward": 0.7664402723312378,
"rewards/preference_model_reward/std": 0.367339551448822,
"step": 177
},
{
"clip_ratio": 0.0,
"epoch": 0.23146944083224968,
"grad_norm": 1.1110477641764491,
"kl": 0.1640625,
"learning_rate": 2e-06,
"loss": -0.0052,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 358.21875,
"epoch": 0.23276983094928477,
"grad_norm": 0.8712719346022294,
"kl": 0.140625,
"learning_rate": 2e-06,
"loss": -0.0053,
"reward": 0.8626149892807007,
"reward_std": 0.2925213575363159,
"rewards/preference_model_reward": 0.8626149892807007,
"rewards/preference_model_reward/std": 0.2878483831882477,
"step": 179
},
{
"clip_ratio": 0.0,
"epoch": 0.2340702210663199,
"grad_norm": 0.8070501204024879,
"kl": 0.142578125,
"learning_rate": 2e-06,
"loss": -0.0058,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 647.03125,
"epoch": 0.235370611183355,
"grad_norm": 1.4675397620762083,
"kl": 0.134765625,
"learning_rate": 2e-06,
"loss": 0.0117,
"reward": 0.8469977378845215,
"reward_std": 0.3276008367538452,
"rewards/preference_model_reward": 0.8469977378845215,
"rewards/preference_model_reward/std": 0.32416415214538574,
"step": 181
},
{
"clip_ratio": 0.0002926147426478565,
"epoch": 0.23667100130039012,
"grad_norm": 2.070965238661399,
"kl": 0.134765625,
"learning_rate": 2e-06,
"loss": 0.0109,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 421.0625,
"epoch": 0.23797139141742524,
"grad_norm": 1.1744315012326634,
"kl": 0.1611328125,
"learning_rate": 2e-06,
"loss": 0.0164,
"reward": 0.8896687030792236,
"reward_std": 0.22899243235588074,
"rewards/preference_model_reward": 0.8896687030792236,
"rewards/preference_model_reward/std": 0.29047808051109314,
"step": 183
},
{
"clip_ratio": 0.00016585712728556246,
"epoch": 0.23927178153446033,
"grad_norm": 1.1088031941295666,
"kl": 0.1640625,
"learning_rate": 2e-06,
"loss": 0.0157,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 305.1875,
"epoch": 0.24057217165149544,
"grad_norm": 0.012167435784803771,
"kl": 0.17578125,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 185
},
{
"clip_ratio": 0.0,
"epoch": 0.24187256176853056,
"grad_norm": 0.010662640913732074,
"kl": 0.166015625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 279.5,
"epoch": 0.24317295188556567,
"grad_norm": 2.798587143064415,
"kl": 0.1484375,
"learning_rate": 2e-06,
"loss": -0.0066,
"reward": 0.6557672023773193,
"reward_std": 0.332293838262558,
"rewards/preference_model_reward": 0.6557672023773193,
"rewards/preference_model_reward/std": 0.34837016463279724,
"step": 187
},
{
"clip_ratio": 0.001388114527799189,
"epoch": 0.2444733420026008,
"grad_norm": 0.8537099565058428,
"kl": 0.142578125,
"learning_rate": 2e-06,
"loss": -0.0068,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 385.15625,
"epoch": 0.24577373211963588,
"grad_norm": 0.8363775477713209,
"kl": 0.1376953125,
"learning_rate": 2e-06,
"loss": 0.0038,
"reward": 0.8794984221458435,
"reward_std": 0.28465738892555237,
"rewards/preference_model_reward": 0.8794984221458435,
"rewards/preference_model_reward/std": 0.283857524394989,
"step": 189
},
{
"clip_ratio": 0.00017409646534360945,
"epoch": 0.247074122236671,
"grad_norm": 0.816021420016002,
"kl": 0.1318359375,
"learning_rate": 2e-06,
"loss": 0.0032,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 356.96875,
"epoch": 0.2483745123537061,
"grad_norm": 0.9815328749481499,
"kl": 0.138671875,
"learning_rate": 2e-06,
"loss": -0.0094,
"reward": 0.6524101495742798,
"reward_std": 0.3220939040184021,
"rewards/preference_model_reward": 0.6524101495742798,
"rewards/preference_model_reward/std": 0.34822776913642883,
"step": 191
},
{
"clip_ratio": 0.0002863642293959856,
"epoch": 0.24967490247074123,
"grad_norm": 0.9330643460254814,
"kl": 0.134765625,
"learning_rate": 2e-06,
"loss": -0.01,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 257.71875,
"epoch": 0.25097529258777634,
"grad_norm": 1.1018955579208387,
"kl": 0.30859375,
"learning_rate": 2e-06,
"loss": -0.0052,
"reward": 0.7511432766914368,
"reward_std": 0.3835994601249695,
"rewards/preference_model_reward": 0.7511432766914368,
"rewards/preference_model_reward/std": 0.3915899097919464,
"step": 193
},
{
"clip_ratio": 0.0005864645936526358,
"epoch": 0.25227568270481143,
"grad_norm": 1.0859444991839702,
"kl": 0.2421875,
"learning_rate": 2e-06,
"loss": -0.0058,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 410.21875,
"epoch": 0.2535760728218466,
"grad_norm": 0.012556309626161617,
"kl": 0.126953125,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 195
},
{
"clip_ratio": 0.0,
"epoch": 0.25487646293888166,
"grad_norm": 0.010950174010336853,
"kl": 0.1162109375,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 270.40625,
"epoch": 0.25617685305591675,
"grad_norm": 0.4413920366005579,
"kl": 0.109375,
"learning_rate": 2e-06,
"loss": -0.0097,
"reward": 0.5270646810531616,
"reward_std": 0.2283352017402649,
"rewards/preference_model_reward": 0.5270646810531616,
"rewards/preference_model_reward/std": 0.46704450249671936,
"step": 197
},
{
"clip_ratio": 0.0012252123560756445,
"epoch": 0.2574772431729519,
"grad_norm": 0.420238367270396,
"kl": 0.1005859375,
"learning_rate": 2e-06,
"loss": -0.01,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 284.4375,
"epoch": 0.258777633289987,
"grad_norm": 0.648882313337497,
"kl": 0.1015625,
"learning_rate": 2e-06,
"loss": -0.0029,
"reward": 0.9249469637870789,
"reward_std": 0.2125411331653595,
"rewards/preference_model_reward": 0.9249469637870789,
"rewards/preference_model_reward/std": 0.24472835659980774,
"step": 199
},
{
"clip_ratio": 0.0009765520226210356,
"epoch": 0.26007802340702213,
"grad_norm": 0.5803957439533651,
"kl": 0.09765625,
"learning_rate": 2e-06,
"loss": -0.0033,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 347.0625,
"epoch": 0.2613784135240572,
"grad_norm": 0.8843805814111732,
"kl": 0.103515625,
"learning_rate": 2e-06,
"loss": -0.0003,
"reward": 0.4381202459335327,
"reward_std": 0.27952808141708374,
"rewards/preference_model_reward": 0.4381202459335327,
"rewards/preference_model_reward/std": 0.4532867968082428,
"step": 201
},
{
"clip_ratio": 0.00026075675850734115,
"epoch": 0.2626788036410923,
"grad_norm": 0.8236946250468278,
"kl": 0.1015625,
"learning_rate": 2e-06,
"loss": -0.0008,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 334.375,
"epoch": 0.26397919375812745,
"grad_norm": 0.8793077051463641,
"kl": 0.1142578125,
"learning_rate": 2e-06,
"loss": 0.0059,
"reward": 0.8583904504776001,
"reward_std": 0.26961272954940796,
"rewards/preference_model_reward": 0.8583904504776001,
"rewards/preference_model_reward/std": 0.27106156945228577,
"step": 203
},
{
"clip_ratio": 0.0005615265690721571,
"epoch": 0.26527958387516254,
"grad_norm": 0.8213872997514073,
"kl": 0.14453125,
"learning_rate": 2e-06,
"loss": 0.0054,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 244.21875,
"epoch": 0.2665799739921977,
"grad_norm": 0.920439324543163,
"kl": 0.10791015625,
"learning_rate": 2e-06,
"loss": 0.0037,
"reward": 0.6516105532646179,
"reward_std": 0.3531697392463684,
"rewards/preference_model_reward": 0.6516105532646179,
"rewards/preference_model_reward/std": 0.38390058279037476,
"step": 205
},
{
"clip_ratio": 0.0003897629212588072,
"epoch": 0.26788036410923277,
"grad_norm": 0.8873190929889052,
"kl": 0.107421875,
"learning_rate": 2e-06,
"loss": 0.0031,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 285.71875,
"epoch": 0.26918075422626786,
"grad_norm": 0.9107935937419545,
"kl": 0.1337890625,
"learning_rate": 2e-06,
"loss": 0.0073,
"reward": 0.6827311515808105,
"reward_std": 0.33245402574539185,
"rewards/preference_model_reward": 0.6827311515808105,
"rewards/preference_model_reward/std": 0.32757312059402466,
"step": 207
},
{
"clip_ratio": 0.0003240827936679125,
"epoch": 0.270481144343303,
"grad_norm": 0.8964363171955139,
"kl": 0.1357421875,
"learning_rate": 2e-06,
"loss": 0.0066,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 419.9375,
"epoch": 0.2717815344603381,
"grad_norm": 0.8564481881429168,
"kl": 0.0869140625,
"learning_rate": 2e-06,
"loss": -0.0051,
"reward": 0.8694363832473755,
"reward_std": 0.25172746181488037,
"rewards/preference_model_reward": 0.8694363832473755,
"rewards/preference_model_reward/std": 0.24772705137729645,
"step": 209
},
{
"clip_ratio": 0.00022542427177540958,
"epoch": 0.27308192457737324,
"grad_norm": 0.7335746351550806,
"kl": 0.08544921875,
"learning_rate": 2e-06,
"loss": -0.0055,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 430.09375,
"epoch": 0.2743823146944083,
"grad_norm": 1.0673059450451972,
"kl": 0.09228515625,
"learning_rate": 2e-06,
"loss": 0.0069,
"reward": 0.720138430595398,
"reward_std": 0.3222920894622803,
"rewards/preference_model_reward": 0.720138430595398,
"rewards/preference_model_reward/std": 0.41566386818885803,
"step": 211
},
{
"clip_ratio": 8.36400140542537e-05,
"epoch": 0.2756827048114434,
"grad_norm": 1.1400954432908579,
"kl": 0.09375,
"learning_rate": 2e-06,
"loss": 0.0061,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 350.75,
"epoch": 0.27698309492847856,
"grad_norm": 1.3462434191662358,
"kl": 0.0791015625,
"learning_rate": 2e-06,
"loss": -0.0097,
"reward": 0.5960279703140259,
"reward_std": 0.4318947196006775,
"rewards/preference_model_reward": 0.5960279703140259,
"rewards/preference_model_reward/std": 0.4450107216835022,
"step": 213
},
{
"clip_ratio": 0.0005943958531133831,
"epoch": 0.27828348504551365,
"grad_norm": 1.3958238267778045,
"kl": 0.080078125,
"learning_rate": 2e-06,
"loss": -0.0106,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 489.34375,
"epoch": 0.2795838751625488,
"grad_norm": 1.027540012470293,
"kl": 0.0869140625,
"learning_rate": 2e-06,
"loss": -0.0014,
"reward": 0.844211220741272,
"reward_std": 0.2946808934211731,
"rewards/preference_model_reward": 0.844211220741272,
"rewards/preference_model_reward/std": 0.29101303219795227,
"step": 215
},
{
"clip_ratio": 0.00011487863957881927,
"epoch": 0.2808842652795839,
"grad_norm": 0.9445019155913764,
"kl": 0.08837890625,
"learning_rate": 2e-06,
"loss": -0.002,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 319.25,
"epoch": 0.28218465539661897,
"grad_norm": 0.8593232598517545,
"kl": 0.1005859375,
"learning_rate": 2e-06,
"loss": 0.0038,
"reward": 0.8116539120674133,
"reward_std": 0.2266491800546646,
"rewards/preference_model_reward": 0.8116539120674133,
"rewards/preference_model_reward/std": 0.3688415586948395,
"step": 217
},
{
"clip_ratio": 0.0,
"epoch": 0.2834850455136541,
"grad_norm": 0.9536555216415759,
"kl": 0.10205078125,
"learning_rate": 2e-06,
"loss": 0.0032,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 285.4375,
"epoch": 0.2847854356306892,
"grad_norm": 0.0920547152120324,
"kl": 0.111328125,
"learning_rate": 2e-06,
"loss": -0.0,
"reward": 0.5229940414428711,
"reward_std": 0.036096930503845215,
"rewards/preference_model_reward": 0.5229940414428711,
"rewards/preference_model_reward/std": 0.47642675042152405,
"step": 219
},
{
"clip_ratio": 0.0006749940221197903,
"epoch": 0.28608582574772434,
"grad_norm": 0.09505637629837431,
"kl": 0.111328125,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 315.3125,
"epoch": 0.28738621586475943,
"grad_norm": 0.44900720903133046,
"kl": 0.11669921875,
"learning_rate": 2e-06,
"loss": -0.0047,
"reward": 0.9324289560317993,
"reward_std": 0.14278283715248108,
"rewards/preference_model_reward": 0.9324289560317993,
"rewards/preference_model_reward/std": 0.16829460859298706,
"step": 221
},
{
"clip_ratio": 0.00019348411296959966,
"epoch": 0.2886866059817945,
"grad_norm": 0.42608100511796476,
"kl": 0.1171875,
"learning_rate": 2e-06,
"loss": -0.005,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 253.90625,
"epoch": 0.28998699609882966,
"grad_norm": 0.48780598580355755,
"kl": 0.11669921875,
"learning_rate": 2e-06,
"loss": 0.0005,
"reward": 0.43476933240890503,
"reward_std": 0.1322799175977707,
"rewards/preference_model_reward": 0.43476933240890503,
"rewards/preference_model_reward/std": 0.4707281291484833,
"step": 223
},
{
"clip_ratio": 0.00040084568900056183,
"epoch": 0.29128738621586475,
"grad_norm": 0.44729368284423715,
"kl": 0.1181640625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 471.46875,
"epoch": 0.2925877763328999,
"grad_norm": 0.6225599540084386,
"kl": 0.205078125,
"learning_rate": 2e-06,
"loss": -0.0135,
"reward": 0.938056230545044,
"reward_std": 0.1692640632390976,
"rewards/preference_model_reward": 0.938056230545044,
"rewards/preference_model_reward/std": 0.24374790489673615,
"step": 225
},
{
"clip_ratio": 0.00016879897157195956,
"epoch": 0.293888166449935,
"grad_norm": 0.566535614416212,
"kl": 0.1328125,
"learning_rate": 2e-06,
"loss": -0.0139,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 219.40625,
"epoch": 0.2951885565669701,
"grad_norm": 1.1863022123947027,
"kl": 0.1171875,
"learning_rate": 2e-06,
"loss": 0.0027,
"reward": 0.5798717737197876,
"reward_std": 0.3581033945083618,
"rewards/preference_model_reward": 0.5798717737197876,
"rewards/preference_model_reward/std": 0.3897789716720581,
"step": 227
},
{
"clip_ratio": 0.0,
"epoch": 0.2964889466840052,
"grad_norm": 0.8735255960345151,
"kl": 0.1220703125,
"learning_rate": 2e-06,
"loss": 0.0023,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 296.8125,
"epoch": 0.2977893368010403,
"grad_norm": 0.8794603051578461,
"kl": 0.11279296875,
"learning_rate": 2e-06,
"loss": -0.0072,
"reward": 0.79021155834198,
"reward_std": 0.31384900212287903,
"rewards/preference_model_reward": 0.79021155834198,
"rewards/preference_model_reward/std": 0.3670634329319,
"step": 229
},
{
"clip_ratio": 0.0,
"epoch": 0.29908972691807545,
"grad_norm": 0.9006009359994098,
"kl": 0.1162109375,
"learning_rate": 2e-06,
"loss": -0.0078,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 373.9375,
"epoch": 0.30039011703511054,
"grad_norm": 0.8110636965631363,
"kl": 0.103515625,
"learning_rate": 2e-06,
"loss": 0.0017,
"reward": 0.8521493673324585,
"reward_std": 0.1953742802143097,
"rewards/preference_model_reward": 0.8521493673324585,
"rewards/preference_model_reward/std": 0.31055518984794617,
"step": 231
},
{
"clip_ratio": 0.00011916110815946013,
"epoch": 0.3016905071521456,
"grad_norm": 0.7852075153190419,
"kl": 0.10595703125,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 264.8125,
"epoch": 0.30299089726918077,
"grad_norm": 0.6520439692812087,
"kl": 0.125,
"learning_rate": 2e-06,
"loss": 0.0003,
"reward": 0.9159287214279175,
"reward_std": 0.2180308848619461,
"rewards/preference_model_reward": 0.9159287214279175,
"rewards/preference_model_reward/std": 0.24606594443321228,
"step": 233
},
{
"clip_ratio": 0.0004468331462703645,
"epoch": 0.30429128738621586,
"grad_norm": 0.5532931586806176,
"kl": 0.12890625,
"learning_rate": 2e-06,
"loss": -0.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 319.75,
"epoch": 0.305591677503251,
"grad_norm": 0.776988546935505,
"kl": 0.142578125,
"learning_rate": 2e-06,
"loss": 0.0032,
"reward": 0.9222438931465149,
"reward_std": 0.23244205117225647,
"rewards/preference_model_reward": 0.9222438931465149,
"rewards/preference_model_reward/std": 0.23122651875019073,
"step": 235
},
{
"clip_ratio": 0.0007608620799146593,
"epoch": 0.3068920676202861,
"grad_norm": 0.7011384841526471,
"kl": 0.146484375,
"learning_rate": 2e-06,
"loss": 0.0027,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 375.25,
"epoch": 0.3081924577373212,
"grad_norm": 0.6230777256393368,
"kl": 0.126953125,
"learning_rate": 2e-06,
"loss": -0.0075,
"reward": 0.9193332195281982,
"reward_std": 0.17760136723518372,
"rewards/preference_model_reward": 0.9193332195281982,
"rewards/preference_model_reward/std": 0.2603200674057007,
"step": 237
},
{
"clip_ratio": 0.0006001516012474895,
"epoch": 0.3094928478543563,
"grad_norm": 0.5912438243448386,
"kl": 0.130859375,
"learning_rate": 2e-06,
"loss": -0.0079,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 222.125,
"epoch": 0.3107932379713914,
"grad_norm": 0.7506171412047739,
"kl": 0.150390625,
"learning_rate": 2e-06,
"loss": -0.0032,
"reward": 0.6538297533988953,
"reward_std": 0.2938784062862396,
"rewards/preference_model_reward": 0.6538297533988953,
"rewards/preference_model_reward/std": 0.3540026843547821,
"step": 239
},
{
"clip_ratio": 0.00028041156474500895,
"epoch": 0.31209362808842656,
"grad_norm": 0.64002825525906,
"kl": 0.15234375,
"learning_rate": 2e-06,
"loss": -0.0037,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 405.5,
"epoch": 0.31339401820546164,
"grad_norm": 0.38597198495676954,
"kl": 0.1240234375,
"learning_rate": 2e-06,
"loss": -0.0012,
"reward": 0.970663845539093,
"reward_std": 0.0886102095246315,
"rewards/preference_model_reward": 0.970663845539093,
"rewards/preference_model_reward/std": 0.12682799994945526,
"step": 241
},
{
"clip_ratio": 0.0,
"epoch": 0.31469440832249673,
"grad_norm": 0.35045812420514433,
"kl": 0.126953125,
"learning_rate": 2e-06,
"loss": -0.0015,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 332.40625,
"epoch": 0.3159947984395319,
"grad_norm": 1.1616464740874977,
"kl": 0.140625,
"learning_rate": 2e-06,
"loss": 0.0007,
"reward": 0.7326881289482117,
"reward_std": 0.3448137640953064,
"rewards/preference_model_reward": 0.7326881289482117,
"rewards/preference_model_reward/std": 0.4094682037830353,
"step": 243
},
{
"clip_ratio": 0.00034368172055110335,
"epoch": 0.31729518855656696,
"grad_norm": 1.0573890331902724,
"kl": 0.142578125,
"learning_rate": 2e-06,
"loss": -0.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 335.9375,
"epoch": 0.31859557867360205,
"grad_norm": 0.7961750832151863,
"kl": 0.1416015625,
"learning_rate": 2e-06,
"loss": -0.0001,
"reward": 0.8184847235679626,
"reward_std": 0.24248003959655762,
"rewards/preference_model_reward": 0.8184847235679626,
"rewards/preference_model_reward/std": 0.3097231090068817,
"step": 245
},
{
"clip_ratio": 0.00016528925334569067,
"epoch": 0.3198959687906372,
"grad_norm": 0.731756658994661,
"kl": 0.1435546875,
"learning_rate": 2e-06,
"loss": -0.0006,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 330.21875,
"epoch": 0.3211963589076723,
"grad_norm": 0.8198850650973981,
"kl": 0.15234375,
"learning_rate": 2e-06,
"loss": -0.0025,
"reward": 0.800827145576477,
"reward_std": 0.22814823687076569,
"rewards/preference_model_reward": 0.800827145576477,
"rewards/preference_model_reward/std": 0.3556991219520569,
"step": 247
},
{
"clip_ratio": 0.00036711152642965317,
"epoch": 0.32249674902470743,
"grad_norm": 0.7758595322860915,
"kl": 0.16015625,
"learning_rate": 2e-06,
"loss": -0.0031,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 333.9375,
"epoch": 0.3237971391417425,
"grad_norm": 0.7816768910697072,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": -0.0009,
"reward": 0.9013230204582214,
"reward_std": 0.20031380653381348,
"rewards/preference_model_reward": 0.9013230204582214,
"rewards/preference_model_reward/std": 0.2634318470954895,
"step": 249
},
{
"clip_ratio": 0.0006623025983572006,
"epoch": 0.3250975292587776,
"grad_norm": 0.700949755432084,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 387.375,
"epoch": 0.32639791937581275,
"grad_norm": 0.5236384680893461,
"kl": 0.166015625,
"learning_rate": 2e-06,
"loss": 0.0012,
"reward": 0.9541411399841309,
"reward_std": 0.12308676540851593,
"rewards/preference_model_reward": 0.9541411399841309,
"rewards/preference_model_reward/std": 0.13573099672794342,
"step": 251
},
{
"clip_ratio": 0.00028014881536364555,
"epoch": 0.32769830949284784,
"grad_norm": 0.5281527733024369,
"kl": 0.1689453125,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 320.25,
"epoch": 0.328998699609883,
"grad_norm": 0.9528739475463192,
"kl": 0.193359375,
"learning_rate": 2e-06,
"loss": -0.0015,
"reward": 0.9271292686462402,
"reward_std": 0.16556303203105927,
"rewards/preference_model_reward": 0.9271292686462402,
"rewards/preference_model_reward/std": 0.24194052815437317,
"step": 253
},
{
"clip_ratio": 0.0003772066265810281,
"epoch": 0.33029908972691807,
"grad_norm": 0.661148563447169,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": -0.0018,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 318.65625,
"epoch": 0.33159947984395316,
"grad_norm": 0.9604652912869995,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": -0.0038,
"reward": 0.7251725196838379,
"reward_std": 0.2921797037124634,
"rewards/preference_model_reward": 0.7251725196838379,
"rewards/preference_model_reward/std": 0.3959549367427826,
"step": 255
},
{
"clip_ratio": 0.0005459666135720909,
"epoch": 0.3328998699609883,
"grad_norm": 0.9173313255466033,
"kl": 0.185546875,
"learning_rate": 2e-06,
"loss": -0.0045,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 326.59375,
"epoch": 0.3342002600780234,
"grad_norm": 0.15271026747443167,
"kl": 0.1806640625,
"learning_rate": 2e-06,
"loss": -0.0001,
"reward": 0.9912102222442627,
"reward_std": 0.03515896201133728,
"rewards/preference_model_reward": 0.9912102222442627,
"rewards/preference_model_reward/std": 0.049722280353307724,
"step": 257
},
{
"clip_ratio": 0.0,
"epoch": 0.33550065019505854,
"grad_norm": 0.13315090840044477,
"kl": 0.1796875,
"learning_rate": 2e-06,
"loss": -0.0002,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 474.03125,
"epoch": 0.3368010403120936,
"grad_norm": 0.023749172386806034,
"kl": 0.1767578125,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 259
},
{
"clip_ratio": 0.0,
"epoch": 0.3381014304291287,
"grad_norm": 0.020046705197983922,
"kl": 0.162109375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 352.9375,
"epoch": 0.33940182054616386,
"grad_norm": 0.6360843528660727,
"kl": 0.1552734375,
"learning_rate": 2e-06,
"loss": -0.0005,
"reward": 0.93841552734375,
"reward_std": 0.13651104271411896,
"rewards/preference_model_reward": 0.93841552734375,
"rewards/preference_model_reward/std": 0.19995808601379395,
"step": 261
},
{
"clip_ratio": 0.0007453379803337157,
"epoch": 0.34070221066319895,
"grad_norm": 0.6012787093360856,
"kl": 0.1435546875,
"learning_rate": 2e-06,
"loss": -0.001,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 545.40625,
"epoch": 0.3420026007802341,
"grad_norm": 1.5555168702461082,
"kl": 0.19921875,
"learning_rate": 2e-06,
"loss": 0.0238,
"reward": 0.7650834918022156,
"reward_std": 0.3637867867946625,
"rewards/preference_model_reward": 0.7650834918022156,
"rewards/preference_model_reward/std": 0.3750750422477722,
"step": 263
},
{
"clip_ratio": 0.0010029254481196404,
"epoch": 0.3433029908972692,
"grad_norm": 1.7494042707337192,
"kl": 0.1611328125,
"learning_rate": 2e-06,
"loss": 0.0227,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 380.75,
"epoch": 0.34460338101430427,
"grad_norm": 1.1102015498823388,
"kl": 0.150390625,
"learning_rate": 2e-06,
"loss": -0.0044,
"reward": 0.8403390645980835,
"reward_std": 0.31802237033843994,
"rewards/preference_model_reward": 0.8403390645980835,
"rewards/preference_model_reward/std": 0.32403555512428284,
"step": 265
},
{
"clip_ratio": 7.552870374638587e-05,
"epoch": 0.3459037711313394,
"grad_norm": 0.9753672523996855,
"kl": 0.1455078125,
"learning_rate": 2e-06,
"loss": -0.0051,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 420.5,
"epoch": 0.3472041612483745,
"grad_norm": 0.7074828188717931,
"kl": 0.169921875,
"learning_rate": 2e-06,
"loss": -0.0001,
"reward": 0.7929283380508423,
"reward_std": 0.18768203258514404,
"rewards/preference_model_reward": 0.7929283380508423,
"rewards/preference_model_reward/std": 0.3012048006057739,
"step": 267
},
{
"clip_ratio": 0.0006710141897201538,
"epoch": 0.34850455136540964,
"grad_norm": 0.6706438170937675,
"kl": 0.166015625,
"learning_rate": 2e-06,
"loss": -0.0005,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 358.5625,
"epoch": 0.34980494148244473,
"grad_norm": 1.045237660141156,
"kl": 0.15234375,
"learning_rate": 2e-06,
"loss": 0.0025,
"reward": 0.8770852088928223,
"reward_std": 0.23402594029903412,
"rewards/preference_model_reward": 0.8770852088928223,
"rewards/preference_model_reward/std": 0.2798316180706024,
"step": 269
},
{
"clip_ratio": 0.000235777348279953,
"epoch": 0.3511053315994798,
"grad_norm": 0.7229933430375373,
"kl": 0.15234375,
"learning_rate": 2e-06,
"loss": 0.0022,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 455.875,
"epoch": 0.35240572171651496,
"grad_norm": 0.07309145002425353,
"kl": 0.130859375,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 271
},
{
"clip_ratio": 0.0,
"epoch": 0.35370611183355005,
"grad_norm": 0.01529023255881576,
"kl": 0.1259765625,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 334.6875,
"epoch": 0.3550065019505852,
"grad_norm": 0.4251362023871739,
"kl": 0.1513671875,
"learning_rate": 2e-06,
"loss": -0.0023,
"reward": 0.9609469175338745,
"reward_std": 0.10973211377859116,
"rewards/preference_model_reward": 0.9609469175338745,
"rewards/preference_model_reward/std": 0.1577332317829132,
"step": 273
},
{
"clip_ratio": 0.000310945266392082,
"epoch": 0.3563068920676203,
"grad_norm": 0.34692654187177197,
"kl": 0.1474609375,
"learning_rate": 2e-06,
"loss": -0.0025,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 345.65625,
"epoch": 0.3576072821846554,
"grad_norm": 0.8323153038827732,
"kl": 0.1298828125,
"learning_rate": 2e-06,
"loss": -0.0006,
"reward": 0.8540354371070862,
"reward_std": 0.2715786099433899,
"rewards/preference_model_reward": 0.8540354371070862,
"rewards/preference_model_reward/std": 0.2714146673679352,
"step": 275
},
{
"clip_ratio": 0.00025343496236018836,
"epoch": 0.3589076723016905,
"grad_norm": 0.8386551476075588,
"kl": 0.126953125,
"learning_rate": 2e-06,
"loss": -0.0012,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 400.875,
"epoch": 0.3602080624187256,
"grad_norm": 3.018201149851659,
"kl": 0.14453125,
"learning_rate": 2e-06,
"loss": -0.0039,
"reward": 0.7919412851333618,
"reward_std": 0.20601463317871094,
"rewards/preference_model_reward": 0.7919412851333618,
"rewards/preference_model_reward/std": 0.35613295435905457,
"step": 277
},
{
"clip_ratio": 0.00024119633599184453,
"epoch": 0.36150845253576075,
"grad_norm": 2.3752234333701847,
"kl": 0.69140625,
"learning_rate": 2e-06,
"loss": -0.0035,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 420.6875,
"epoch": 0.36280884265279584,
"grad_norm": 1.4330759769641548,
"kl": 0.1376953125,
"learning_rate": 2e-06,
"loss": -0.0142,
"reward": 0.5622725486755371,
"reward_std": 0.40393969416618347,
"rewards/preference_model_reward": 0.5622725486755371,
"rewards/preference_model_reward/std": 0.4006726145744324,
"step": 279
},
{
"clip_ratio": 0.00024359519011341035,
"epoch": 0.3641092327698309,
"grad_norm": 1.2330239115295205,
"kl": 0.1376953125,
"learning_rate": 2e-06,
"loss": -0.0151,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 327.84375,
"epoch": 0.36540962288686607,
"grad_norm": 1.0374054211943582,
"kl": 0.126953125,
"learning_rate": 2e-06,
"loss": -0.0044,
"reward": 0.840008020401001,
"reward_std": 0.35583558678627014,
"rewards/preference_model_reward": 0.840008020401001,
"rewards/preference_model_reward/std": 0.3535732328891754,
"step": 281
},
{
"clip_ratio": 0.0007460214546881616,
"epoch": 0.36671001300390116,
"grad_norm": 0.9016990592456968,
"kl": 0.1279296875,
"learning_rate": 2e-06,
"loss": -0.005,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 431.0625,
"epoch": 0.3680104031209363,
"grad_norm": 1.5216553850491168,
"kl": 0.134765625,
"learning_rate": 2e-06,
"loss": 0.0085,
"reward": 0.6559486389160156,
"reward_std": 0.4345345199108124,
"rewards/preference_model_reward": 0.6559486389160156,
"rewards/preference_model_reward/std": 0.42781931161880493,
"step": 283
},
{
"clip_ratio": 0.00047609303146600723,
"epoch": 0.3693107932379714,
"grad_norm": 5.409857191769975,
"kl": 0.134765625,
"learning_rate": 2e-06,
"loss": 0.0076,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 390.53125,
"epoch": 0.3706111833550065,
"grad_norm": 1.1576263309647625,
"kl": 0.1357421875,
"learning_rate": 2e-06,
"loss": 0.0014,
"reward": 0.8136157989501953,
"reward_std": 0.23213137686252594,
"rewards/preference_model_reward": 0.8136157989501953,
"rewards/preference_model_reward/std": 0.298849493265152,
"step": 285
},
{
"clip_ratio": 0.00016891787527129054,
"epoch": 0.3719115734720416,
"grad_norm": 0.8952466212055566,
"kl": 0.1357421875,
"learning_rate": 2e-06,
"loss": 0.0009,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 468.46875,
"epoch": 0.3732119635890767,
"grad_norm": 0.23014091366454292,
"kl": 0.14453125,
"learning_rate": 2e-06,
"loss": 0.0005,
"reward": 0.9814756512641907,
"reward_std": 0.05683635175228119,
"rewards/preference_model_reward": 0.9814756512641907,
"rewards/preference_model_reward/std": 0.08128068596124649,
"step": 287
},
{
"clip_ratio": 0.0008712065173313022,
"epoch": 0.37451235370611186,
"grad_norm": 0.22543453641754027,
"kl": 0.1435546875,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 527.40625,
"epoch": 0.37581274382314694,
"grad_norm": 1.0195256953878606,
"kl": 0.1171875,
"learning_rate": 2e-06,
"loss": 0.0058,
"reward": 0.48845207691192627,
"reward_std": 0.331182062625885,
"rewards/preference_model_reward": 0.48845207691192627,
"rewards/preference_model_reward/std": 0.469135046005249,
"step": 289
},
{
"clip_ratio": 0.0002331826981389895,
"epoch": 0.37711313394018203,
"grad_norm": 1.0989081166309522,
"kl": 0.1171875,
"learning_rate": 2e-06,
"loss": 0.0051,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 318.4375,
"epoch": 0.3784135240572172,
"grad_norm": 1.0743112389929244,
"kl": 0.1357421875,
"learning_rate": 2e-06,
"loss": 0.0019,
"reward": 0.6902080178260803,
"reward_std": 0.3703380525112152,
"rewards/preference_model_reward": 0.6902080178260803,
"rewards/preference_model_reward/std": 0.42161616683006287,
"step": 291
},
{
"clip_ratio": 0.000206069671548903,
"epoch": 0.37971391417425226,
"grad_norm": 1.233125880130113,
"kl": 0.1376953125,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 395.25,
"epoch": 0.3810143042912874,
"grad_norm": 1.107176758039471,
"kl": 0.1435546875,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.6763293743133545,
"reward_std": 0.34847384691238403,
"rewards/preference_model_reward": 0.6763293743133545,
"rewards/preference_model_reward/std": 0.3937572240829468,
"step": 293
},
{
"clip_ratio": 0.0004974036128260195,
"epoch": 0.3823146944083225,
"grad_norm": 1.1451654931732194,
"kl": 0.1435546875,
"learning_rate": 2e-06,
"loss": -0.0008,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 377.09375,
"epoch": 0.3836150845253576,
"grad_norm": 0.7690758743063623,
"kl": 0.162109375,
"learning_rate": 2e-06,
"loss": 0.0054,
"reward": 0.4330715835094452,
"reward_std": 0.17991632223129272,
"rewards/preference_model_reward": 0.4330715835094452,
"rewards/preference_model_reward/std": 0.46172231435775757,
"step": 295
},
{
"clip_ratio": 0.00045062549179419875,
"epoch": 0.38491547464239273,
"grad_norm": 0.7270920776936544,
"kl": 0.1630859375,
"learning_rate": 2e-06,
"loss": 0.0049,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 356.03125,
"epoch": 0.3862158647594278,
"grad_norm": 0.8363253049863949,
"kl": 0.15625,
"learning_rate": 2e-06,
"loss": 0.0016,
"reward": 0.8998174667358398,
"reward_std": 0.2283022552728653,
"rewards/preference_model_reward": 0.8998174667358398,
"rewards/preference_model_reward/std": 0.26539289951324463,
"step": 297
},
{
"clip_ratio": 9.889240755001083e-05,
"epoch": 0.38751625487646296,
"grad_norm": 0.7879079112961523,
"kl": 0.158203125,
"learning_rate": 2e-06,
"loss": 0.001,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 345.65625,
"epoch": 0.38881664499349805,
"grad_norm": 0.9537917167423159,
"kl": 0.15625,
"learning_rate": 2e-06,
"loss": 0.0029,
"reward": 0.8426351547241211,
"reward_std": 0.32086896896362305,
"rewards/preference_model_reward": 0.8426351547241211,
"rewards/preference_model_reward/std": 0.3234107196331024,
"step": 299
},
{
"clip_ratio": 0.0001863636280177161,
"epoch": 0.39011703511053314,
"grad_norm": 1.0680774123073455,
"kl": 0.158203125,
"learning_rate": 2e-06,
"loss": 0.0022,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 315.40625,
"epoch": 0.3914174252275683,
"grad_norm": 0.8700504381068667,
"kl": 0.1572265625,
"learning_rate": 2e-06,
"loss": -0.0018,
"reward": 0.8363662958145142,
"reward_std": 0.22255460917949677,
"rewards/preference_model_reward": 0.8363662958145142,
"rewards/preference_model_reward/std": 0.3514332175254822,
"step": 301
},
{
"clip_ratio": 0.0,
"epoch": 0.39271781534460337,
"grad_norm": 0.8462340517851735,
"kl": 0.1591796875,
"learning_rate": 2e-06,
"loss": -0.0025,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 561.96875,
"epoch": 0.3940182054616385,
"grad_norm": 1.0311732527803947,
"kl": 0.12255859375,
"learning_rate": 2e-06,
"loss": 0.0068,
"reward": 0.8814795017242432,
"reward_std": 0.2578521966934204,
"rewards/preference_model_reward": 0.8814795017242432,
"rewards/preference_model_reward/std": 0.2664722502231598,
"step": 303
},
{
"clip_ratio": 0.00029304379131644964,
"epoch": 0.3953185955786736,
"grad_norm": 0.8897993747480035,
"kl": 0.12353515625,
"learning_rate": 2e-06,
"loss": 0.0061,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 345.625,
"epoch": 0.3966189856957087,
"grad_norm": 0.5314694953876299,
"kl": 0.169921875,
"learning_rate": 2e-06,
"loss": -0.0038,
"reward": 0.9106444716453552,
"reward_std": 0.15523825585842133,
"rewards/preference_model_reward": 0.9106444716453552,
"rewards/preference_model_reward/std": 0.23427554965019226,
"step": 305
},
{
"clip_ratio": 0.0,
"epoch": 0.39791937581274384,
"grad_norm": 0.5193425208826081,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": -0.0042,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 438.3125,
"epoch": 0.3992197659297789,
"grad_norm": 0.7071936845424766,
"kl": 0.1650390625,
"learning_rate": 2e-06,
"loss": 0.0,
"reward": 0.9416297078132629,
"reward_std": 0.233481302857399,
"rewards/preference_model_reward": 0.9416297078132629,
"rewards/preference_model_reward/std": 0.23015134036540985,
"step": 307
},
{
"clip_ratio": 0.00029302731854841113,
"epoch": 0.40052015604681407,
"grad_norm": 0.6689390834175738,
"kl": 0.166015625,
"learning_rate": 2e-06,
"loss": -0.0004,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 212.5,
"epoch": 0.40182054616384916,
"grad_norm": 0.17393097771051252,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": -0.0011,
"reward": 0.4954475164413452,
"reward_std": 0.07749561965465546,
"rewards/preference_model_reward": 0.4954475164413452,
"rewards/preference_model_reward/std": 0.48258301615715027,
"step": 309
},
{
"clip_ratio": 0.0002689617977011949,
"epoch": 0.40312093628088425,
"grad_norm": 0.14360471990489682,
"kl": 0.1884765625,
"learning_rate": 2e-06,
"loss": -0.0012,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 405.125,
"epoch": 0.4044213263979194,
"grad_norm": 1.3861640957004484,
"kl": 0.1572265625,
"learning_rate": 2e-06,
"loss": 0.0066,
"reward": 0.7110024094581604,
"reward_std": 0.28312164545059204,
"rewards/preference_model_reward": 0.7110024094581604,
"rewards/preference_model_reward/std": 0.4174264371395111,
"step": 311
},
{
"clip_ratio": 0.0,
"epoch": 0.4057217165149545,
"grad_norm": 1.0765104391828029,
"kl": 0.1572265625,
"learning_rate": 2e-06,
"loss": 0.006,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 246.65625,
"epoch": 0.4070221066319896,
"grad_norm": 0.2721463731071287,
"kl": 0.1796875,
"learning_rate": 2e-06,
"loss": 0.001,
"reward": 0.9796858429908752,
"reward_std": 0.08125662803649902,
"rewards/preference_model_reward": 0.9796858429908752,
"rewards/preference_model_reward/std": 0.11491423845291138,
"step": 313
},
{
"clip_ratio": 0.0,
"epoch": 0.4083224967490247,
"grad_norm": 0.23033629785859774,
"kl": 0.1806640625,
"learning_rate": 2e-06,
"loss": 0.0008,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 255.21875,
"epoch": 0.4096228868660598,
"grad_norm": 0.6419391137466428,
"kl": 0.1884765625,
"learning_rate": 2e-06,
"loss": -0.0005,
"reward": 0.8585046529769897,
"reward_std": 0.2577616572380066,
"rewards/preference_model_reward": 0.8585046529769897,
"rewards/preference_model_reward/std": 0.26389265060424805,
"step": 315
},
{
"clip_ratio": 0.00030229747062548995,
"epoch": 0.41092327698309494,
"grad_norm": 0.6006179379691705,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": -0.0011,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 363.5625,
"epoch": 0.41222366710013003,
"grad_norm": 3.170040530471923,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": -0.0034,
"reward": 0.7158905267715454,
"reward_std": 0.40732401609420776,
"rewards/preference_model_reward": 0.7158905267715454,
"rewards/preference_model_reward/std": 0.4021960496902466,
"step": 317
},
{
"clip_ratio": 0.00033682904904708266,
"epoch": 0.4135240572171652,
"grad_norm": 1.2493860673019876,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": -0.0037,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 357.0,
"epoch": 0.41482444733420026,
"grad_norm": 0.8704204592205625,
"kl": 0.1826171875,
"learning_rate": 2e-06,
"loss": -0.0114,
"reward": 0.6861220598220825,
"reward_std": 0.3007145822048187,
"rewards/preference_model_reward": 0.6861220598220825,
"rewards/preference_model_reward/std": 0.3603835999965668,
"step": 319
},
{
"clip_ratio": 0.0003130662371404469,
"epoch": 0.41612483745123535,
"grad_norm": 0.8570575968724828,
"kl": 0.18359375,
"learning_rate": 2e-06,
"loss": -0.0121,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 396.59375,
"epoch": 0.4174252275682705,
"grad_norm": 1.2062034927710477,
"kl": 0.326171875,
"learning_rate": 2e-06,
"loss": 0.002,
"reward": 0.9455615282058716,
"reward_std": 0.21775373816490173,
"rewards/preference_model_reward": 0.9455615282058716,
"rewards/preference_model_reward/std": 0.21650560200214386,
"step": 321
},
{
"clip_ratio": 0.0002604166802484542,
"epoch": 0.4187256176853056,
"grad_norm": 2.819982155660782,
"kl": 0.2138671875,
"learning_rate": 2e-06,
"loss": 0.0016,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 351.5,
"epoch": 0.42002600780234073,
"grad_norm": 1.0161054442586306,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.8240683078765869,
"reward_std": 0.27943116426467896,
"rewards/preference_model_reward": 0.8240683078765869,
"rewards/preference_model_reward/std": 0.3035885691642761,
"step": 323
},
{
"clip_ratio": 0.00038074731128290296,
"epoch": 0.4213263979193758,
"grad_norm": 0.9288478971908941,
"kl": 0.1787109375,
"learning_rate": 2e-06,
"loss": -0.0012,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 398.34375,
"epoch": 0.4226267880364109,
"grad_norm": 0.5555980131095809,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": 0.0018,
"reward": 0.9582957029342651,
"reward_std": 0.12391936033964157,
"rewards/preference_model_reward": 0.9582957029342651,
"rewards/preference_model_reward/std": 0.16838383674621582,
"step": 325
},
{
"clip_ratio": 0.00029583554714918137,
"epoch": 0.42392717815344605,
"grad_norm": 0.4895345891264094,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": 0.0014,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 313.0625,
"epoch": 0.42522756827048114,
"grad_norm": 0.22580993498032037,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": -0.0009,
"reward": 0.9758968353271484,
"reward_std": 0.06588973104953766,
"rewards/preference_model_reward": 0.9758968353271484,
"rewards/preference_model_reward/std": 0.09488161653280258,
"step": 327
},
{
"clip_ratio": 0.0,
"epoch": 0.4265279583875163,
"grad_norm": 0.19771147507803757,
"kl": 0.1904296875,
"learning_rate": 2e-06,
"loss": -0.001,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 388.65625,
"epoch": 0.42782834850455137,
"grad_norm": 1.073532799875203,
"kl": 0.18359375,
"learning_rate": 2e-06,
"loss": -0.0031,
"reward": 0.5786818265914917,
"reward_std": 0.37675005197525024,
"rewards/preference_model_reward": 0.5786818265914917,
"rewards/preference_model_reward/std": 0.38477084040641785,
"step": 329
},
{
"clip_ratio": 0.00016030779806897044,
"epoch": 0.42912873862158646,
"grad_norm": 1.059287777640083,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": -0.0039,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 412.75,
"epoch": 0.4304291287386216,
"grad_norm": 0.7820217805815579,
"kl": 0.169921875,
"learning_rate": 2e-06,
"loss": -0.0028,
"reward": 0.8559874892234802,
"reward_std": 0.1847127079963684,
"rewards/preference_model_reward": 0.8559874892234802,
"rewards/preference_model_reward/std": 0.295710951089859,
"step": 331
},
{
"clip_ratio": 8.996042015496641e-05,
"epoch": 0.4317295188556567,
"grad_norm": 0.7329357485601158,
"kl": 0.169921875,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 373.0625,
"epoch": 0.43302990897269183,
"grad_norm": 0.36553915145989274,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": -0.0017,
"reward": 0.9780337810516357,
"reward_std": 0.08786486089229584,
"rewards/preference_model_reward": 0.9780337810516357,
"rewards/preference_model_reward/std": 0.12425968050956726,
"step": 333
},
{
"clip_ratio": 0.00035080796806141734,
"epoch": 0.4343302990897269,
"grad_norm": 0.3148042609460324,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": -0.002,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 341.875,
"epoch": 0.435630689206762,
"grad_norm": 0.5432127291313187,
"kl": 0.1875,
"learning_rate": 2e-06,
"loss": 0.0011,
"reward": 0.9440739750862122,
"reward_std": 0.14797841012477875,
"rewards/preference_model_reward": 0.9440739750862122,
"rewards/preference_model_reward/std": 0.21356752514839172,
"step": 335
},
{
"clip_ratio": 0.00030156815773807466,
"epoch": 0.43693107932379716,
"grad_norm": 0.49846748282978737,
"kl": 0.1884765625,
"learning_rate": 2e-06,
"loss": 0.0007,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 291.15625,
"epoch": 0.43823146944083224,
"grad_norm": 0.5736680592738835,
"kl": 0.1748046875,
"learning_rate": 2e-06,
"loss": -0.003,
"reward": 0.8900240659713745,
"reward_std": 0.24245613813400269,
"rewards/preference_model_reward": 0.8900240659713745,
"rewards/preference_model_reward/std": 0.2557799518108368,
"step": 337
},
{
"clip_ratio": 8.4745763160754e-05,
"epoch": 0.43953185955786733,
"grad_norm": 0.54020537665832,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": -0.0034,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 423.21875,
"epoch": 0.4408322496749025,
"grad_norm": 0.9480813610454677,
"kl": 0.2373046875,
"learning_rate": 2e-06,
"loss": -0.0017,
"reward": 0.5154986381530762,
"reward_std": 0.3188796639442444,
"rewards/preference_model_reward": 0.5154986381530762,
"rewards/preference_model_reward/std": 0.4238376319408417,
"step": 339
},
{
"clip_ratio": 0.00020426370610948652,
"epoch": 0.44213263979193757,
"grad_norm": 0.9501018563704864,
"kl": 0.240234375,
"learning_rate": 2e-06,
"loss": -0.0024,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 420.59375,
"epoch": 0.4434330299089727,
"grad_norm": 0.845647715547278,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": -0.0021,
"reward": 0.8065922260284424,
"reward_std": 0.2574193477630615,
"rewards/preference_model_reward": 0.8065922260284424,
"rewards/preference_model_reward/std": 0.29339519143104553,
"step": 341
},
{
"clip_ratio": 0.00031662482069805264,
"epoch": 0.4447334200260078,
"grad_norm": 0.8751571372465456,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": -0.0026,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 357.15625,
"epoch": 0.4460338101430429,
"grad_norm": 0.730275481610921,
"kl": 0.234375,
"learning_rate": 2e-06,
"loss": -0.0063,
"reward": 0.900632381439209,
"reward_std": 0.2498612105846405,
"rewards/preference_model_reward": 0.900632381439209,
"rewards/preference_model_reward/std": 0.256816565990448,
"step": 343
},
{
"clip_ratio": 0.00030949688516557217,
"epoch": 0.44733420026007803,
"grad_norm": 0.6892449500597292,
"kl": 0.2373046875,
"learning_rate": 2e-06,
"loss": -0.0068,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 284.4375,
"epoch": 0.4486345903771131,
"grad_norm": 0.8200871031591095,
"kl": 0.166015625,
"learning_rate": 2e-06,
"loss": -0.0001,
"reward": 0.8631854057312012,
"reward_std": 0.19240587949752808,
"rewards/preference_model_reward": 0.8631854057312012,
"rewards/preference_model_reward/std": 0.30161845684051514,
"step": 345
},
{
"clip_ratio": 0.0002665245265234262,
"epoch": 0.44993498049414826,
"grad_norm": 0.7466247621387166,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": -0.0004,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 385.46875,
"epoch": 0.45123537061118335,
"grad_norm": 1.0742987274568128,
"kl": 0.216796875,
"learning_rate": 2e-06,
"loss": 0.0,
"reward": 0.8062537908554077,
"reward_std": 0.22240078449249268,
"rewards/preference_model_reward": 0.8062537908554077,
"rewards/preference_model_reward/std": 0.36671730875968933,
"step": 347
},
{
"clip_ratio": 8.4373947174754e-05,
"epoch": 0.45253576072821844,
"grad_norm": 0.8886627945755464,
"kl": 0.21875,
"learning_rate": 2e-06,
"loss": -0.0005,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 548.28125,
"epoch": 0.4538361508452536,
"grad_norm": 0.3764239056895551,
"kl": 0.212890625,
"learning_rate": 2e-06,
"loss": 0.0004,
"reward": 0.9814817309379578,
"reward_std": 0.07407312840223312,
"rewards/preference_model_reward": 0.9814817309379578,
"rewards/preference_model_reward/std": 0.1047552078962326,
"step": 349
},
{
"clip_ratio": 0.00011498251114971936,
"epoch": 0.45513654096228867,
"grad_norm": 0.3492401522936101,
"kl": 0.2158203125,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 340.4375,
"epoch": 0.4564369310793238,
"grad_norm": 0.8053126037042443,
"kl": 0.265625,
"learning_rate": 2e-06,
"loss": 0.0022,
"reward": 0.8822987079620361,
"reward_std": 0.19238536059856415,
"rewards/preference_model_reward": 0.8822987079620361,
"rewards/preference_model_reward/std": 0.24105463922023773,
"step": 351
},
{
"clip_ratio": 0.0005687876255251467,
"epoch": 0.4577373211963589,
"grad_norm": 0.7744649363558931,
"kl": 0.267578125,
"learning_rate": 2e-06,
"loss": 0.0016,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 340.15625,
"epoch": 0.459037711313394,
"grad_norm": 0.4163188535794671,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": -0.0004,
"reward": 0.9581733345985413,
"reward_std": 0.09610553830862045,
"rewards/preference_model_reward": 0.9581733345985413,
"rewards/preference_model_reward/std": 0.14029455184936523,
"step": 353
},
{
"clip_ratio": 9.426847827853635e-05,
"epoch": 0.46033810143042914,
"grad_norm": 0.37210922204260644,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": -0.0007,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 482.875,
"epoch": 0.4616384915474642,
"grad_norm": 0.443699445192129,
"kl": 0.2255859375,
"learning_rate": 2e-06,
"loss": -0.002,
"reward": 0.9650442600250244,
"reward_std": 0.09929230064153671,
"rewards/preference_model_reward": 0.9650442600250244,
"rewards/preference_model_reward/std": 0.14262951910495758,
"step": 355
},
{
"clip_ratio": 7.941549847600982e-05,
"epoch": 0.46293888166449937,
"grad_norm": 0.3525018468251512,
"kl": 0.2275390625,
"learning_rate": 2e-06,
"loss": -0.0022,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 541.96875,
"epoch": 0.46423927178153446,
"grad_norm": 0.037646114609881515,
"kl": 0.26171875,
"learning_rate": 2e-06,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 357
},
{
"clip_ratio": 0.0,
"epoch": 0.46553966189856955,
"grad_norm": 0.022323449072510736,
"kl": 0.248046875,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 486.5,
"epoch": 0.4668400520156047,
"grad_norm": 0.047585204521953244,
"kl": 0.1884765625,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.9968596696853638,
"reward_std": 0.012561214156448841,
"rewards/preference_model_reward": 0.9968596696853638,
"rewards/preference_model_reward/std": 0.017764244228601456,
"step": 359
},
{
"clip_ratio": 0.0003033555403817445,
"epoch": 0.4681404421326398,
"grad_norm": 0.04158159498021269,
"kl": 0.1748046875,
"learning_rate": 2e-06,
"loss": -0.0008,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 453.0625,
"epoch": 0.4694408322496749,
"grad_norm": 0.4699175632650768,
"kl": 0.18359375,
"learning_rate": 2e-06,
"loss": -0.0061,
"reward": 0.9376707673072815,
"reward_std": 0.1413489133119583,
"rewards/preference_model_reward": 0.9376707673072815,
"rewards/preference_model_reward/std": 0.1997252255678177,
"step": 361
},
{
"clip_ratio": 0.0010629449971020222,
"epoch": 0.47074122236671,
"grad_norm": 0.4443824446386433,
"kl": 0.1748046875,
"learning_rate": 2e-06,
"loss": -0.0065,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 431.8125,
"epoch": 0.4720416124837451,
"grad_norm": 0.5928527775265698,
"kl": 0.193359375,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.9526033997535706,
"reward_std": 0.18958628177642822,
"rewards/preference_model_reward": 0.9526033997535706,
"rewards/preference_model_reward/std": 0.19250237941741943,
"step": 363
},
{
"clip_ratio": 0.000205172153073363,
"epoch": 0.47334200260078024,
"grad_norm": 0.5475249341379819,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": -0.0003,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 413.6875,
"epoch": 0.47464239271781533,
"grad_norm": 0.9292294703178672,
"kl": 0.17578125,
"learning_rate": 2e-06,
"loss": -0.0052,
"reward": 0.8800602555274963,
"reward_std": 0.22069776058197021,
"rewards/preference_model_reward": 0.8800602555274963,
"rewards/preference_model_reward/std": 0.2782754898071289,
"step": 365
},
{
"clip_ratio": 0.0006039842264726758,
"epoch": 0.4759427828348505,
"grad_norm": 0.8278406545693311,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": -0.0058,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 696.46875,
"epoch": 0.47724317295188556,
"grad_norm": 1.7035086337060783,
"kl": 0.1640625,
"learning_rate": 2e-06,
"loss": 0.0226,
"reward": 0.7321191430091858,
"reward_std": 0.39982855319976807,
"rewards/preference_model_reward": 0.7321191430091858,
"rewards/preference_model_reward/std": 0.39758551120758057,
"step": 367
},
{
"clip_ratio": 9.947569924406707e-05,
"epoch": 0.47854356306892065,
"grad_norm": 1.6398878874883134,
"kl": 0.162109375,
"learning_rate": 2e-06,
"loss": 0.0212,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 485.0625,
"epoch": 0.4798439531859558,
"grad_norm": 1.0250430126617982,
"kl": 0.154296875,
"learning_rate": 2e-06,
"loss": 0.0073,
"reward": 0.846257746219635,
"reward_std": 0.2683815360069275,
"rewards/preference_model_reward": 0.846257746219635,
"rewards/preference_model_reward/std": 0.2826293110847473,
"step": 369
},
{
"clip_ratio": 0.0001257861586054787,
"epoch": 0.4811443433029909,
"grad_norm": 1.001787366115385,
"kl": 0.1533203125,
"learning_rate": 2e-06,
"loss": 0.0065,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 570.53125,
"epoch": 0.48244473342002603,
"grad_norm": 1.3025780868642676,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": 0.025,
"reward": 0.8562784790992737,
"reward_std": 0.3221096098423004,
"rewards/preference_model_reward": 0.8562784790992737,
"rewards/preference_model_reward/std": 0.3274212181568146,
"step": 371
},
{
"clip_ratio": 0.0,
"epoch": 0.4837451235370611,
"grad_norm": 1.2670088001321425,
"kl": 0.13671875,
"learning_rate": 2e-06,
"loss": 0.0241,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 557.25,
"epoch": 0.4850455136540962,
"grad_norm": 0.396947988021042,
"kl": 0.1318359375,
"learning_rate": 2e-06,
"loss": 0.0003,
"reward": 0.9703431129455566,
"reward_std": 0.08718589693307877,
"rewards/preference_model_reward": 0.9703431129455566,
"rewards/preference_model_reward/std": 0.08997520059347153,
"step": 373
},
{
"clip_ratio": 8.115790114970878e-05,
"epoch": 0.48634590377113135,
"grad_norm": 0.3588001484684231,
"kl": 0.1318359375,
"learning_rate": 2e-06,
"loss": -0.0,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 402.75,
"epoch": 0.48764629388816644,
"grad_norm": 1.0349158159751568,
"kl": 0.17578125,
"learning_rate": 2e-06,
"loss": -0.0081,
"reward": 0.5225076675415039,
"reward_std": 0.3107752799987793,
"rewards/preference_model_reward": 0.5225076675415039,
"rewards/preference_model_reward/std": 0.4560634195804596,
"step": 375
},
{
"clip_ratio": 0.0003945899079553783,
"epoch": 0.4889466840052016,
"grad_norm": 1.0294645469560724,
"kl": 0.17578125,
"learning_rate": 2e-06,
"loss": -0.0089,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 439.65625,
"epoch": 0.49024707412223667,
"grad_norm": 0.12293364932946799,
"kl": 0.1650390625,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.9920886754989624,
"reward_std": 0.03164532035589218,
"rewards/preference_model_reward": 0.9920886754989624,
"rewards/preference_model_reward/std": 0.044753238558769226,
"step": 377
},
{
"clip_ratio": 0.0,
"epoch": 0.49154746423927176,
"grad_norm": 0.11828689260322871,
"kl": 0.1630859375,
"learning_rate": 2e-06,
"loss": 0.0,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 567.90625,
"epoch": 0.4928478543563069,
"grad_norm": 0.8350931514014152,
"kl": 0.1669921875,
"learning_rate": 2e-06,
"loss": -0.0049,
"reward": 0.8592836856842041,
"reward_std": 0.22580870985984802,
"rewards/preference_model_reward": 0.8592836856842041,
"rewards/preference_model_reward/std": 0.28813624382019043,
"step": 379
},
{
"clip_ratio": 0.00024522157036699355,
"epoch": 0.494148244473342,
"grad_norm": 0.7878789832148902,
"kl": 0.1669921875,
"learning_rate": 2e-06,
"loss": -0.0054,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 458.40625,
"epoch": 0.49544863459037713,
"grad_norm": 1.3819275414815289,
"kl": 0.14453125,
"learning_rate": 2e-06,
"loss": 0.0319,
"reward": 0.6078048348426819,
"reward_std": 0.39275383949279785,
"rewards/preference_model_reward": 0.6078048348426819,
"rewards/preference_model_reward/std": 0.4531180262565613,
"step": 381
},
{
"clip_ratio": 4.539676956483163e-05,
"epoch": 0.4967490247074122,
"grad_norm": 1.3480746769326,
"kl": 0.14453125,
"learning_rate": 2e-06,
"loss": 0.0309,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 530.90625,
"epoch": 0.4980494148244473,
"grad_norm": 1.0915508320963603,
"kl": 0.1630859375,
"learning_rate": 2e-06,
"loss": 0.0096,
"reward": 0.9106493592262268,
"reward_std": 0.26455527544021606,
"rewards/preference_model_reward": 0.9106493592262268,
"rewards/preference_model_reward/std": 0.26605573296546936,
"step": 383
},
{
"clip_ratio": 0.0006032996461726725,
"epoch": 0.49934980494148246,
"grad_norm": 1.0062396438915018,
"kl": 0.1640625,
"learning_rate": 2e-06,
"loss": 0.0089,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 404.28125,
"epoch": 0.5006501950585176,
"grad_norm": 0.815339253376103,
"kl": 0.16015625,
"learning_rate": 2e-06,
"loss": 0.0019,
"reward": 0.6707455515861511,
"reward_std": 0.2263924926519394,
"rewards/preference_model_reward": 0.6707455515861511,
"rewards/preference_model_reward/std": 0.40456080436706543,
"step": 385
},
{
"clip_ratio": 8.218277798732743e-05,
"epoch": 0.5019505851755527,
"grad_norm": 0.8169128748708187,
"kl": 0.1611328125,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 481.03125,
"epoch": 0.5032509752925878,
"grad_norm": 1.187671199008333,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": 0.0014,
"reward": 0.8174352049827576,
"reward_std": 0.3356226086616516,
"rewards/preference_model_reward": 0.8174352049827576,
"rewards/preference_model_reward/std": 0.35391342639923096,
"step": 387
},
{
"clip_ratio": 8.300133049488068e-05,
"epoch": 0.5045513654096229,
"grad_norm": 1.1173042462876819,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 485.21875,
"epoch": 0.505851755526658,
"grad_norm": 0.9197781852352046,
"kl": 0.1630859375,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.6285791993141174,
"reward_std": 0.27514463663101196,
"rewards/preference_model_reward": 0.6285791993141174,
"rewards/preference_model_reward/std": 0.43319857120513916,
"step": 389
},
{
"clip_ratio": 0.0,
"epoch": 0.5071521456436932,
"grad_norm": 1.0705592825656165,
"kl": 0.1640625,
"learning_rate": 2e-06,
"loss": -0.0014,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 440.9375,
"epoch": 0.5084525357607282,
"grad_norm": 1.2228961699827323,
"kl": 0.1513671875,
"learning_rate": 2e-06,
"loss": -0.0048,
"reward": 0.5957476496696472,
"reward_std": 0.40599602460861206,
"rewards/preference_model_reward": 0.5957476496696472,
"rewards/preference_model_reward/std": 0.4048064649105072,
"step": 391
},
{
"clip_ratio": 6.316321378108114e-05,
"epoch": 0.5097529258777633,
"grad_norm": 1.1490955590907963,
"kl": 0.1533203125,
"learning_rate": 2e-06,
"loss": -0.0056,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 458.34375,
"epoch": 0.5110533159947984,
"grad_norm": 1.2320099034869054,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": 0.025,
"reward": 0.7658855319023132,
"reward_std": 0.33958619832992554,
"rewards/preference_model_reward": 0.7658855319023132,
"rewards/preference_model_reward/std": 0.36572444438934326,
"step": 393
},
{
"clip_ratio": 0.0,
"epoch": 0.5123537061118335,
"grad_norm": 1.1909134434235307,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": 0.024,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 335.21875,
"epoch": 0.5136540962288687,
"grad_norm": 0.4436447730361839,
"kl": 0.1728515625,
"learning_rate": 2e-06,
"loss": -0.0077,
"reward": 0.4475706219673157,
"reward_std": 0.1472529023885727,
"rewards/preference_model_reward": 0.4475706219673157,
"rewards/preference_model_reward/std": 0.4426521062850952,
"step": 395
},
{
"clip_ratio": 0.0003680627851281315,
"epoch": 0.5149544863459038,
"grad_norm": 0.44133914560284426,
"kl": 0.1748046875,
"learning_rate": 2e-06,
"loss": -0.0081,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 486.40625,
"epoch": 0.5162548764629389,
"grad_norm": 1.027174535394778,
"kl": 0.142578125,
"learning_rate": 2e-06,
"loss": 0.0247,
"reward": 0.5502924919128418,
"reward_std": 0.3194156885147095,
"rewards/preference_model_reward": 0.5502924919128418,
"rewards/preference_model_reward/std": 0.404392808675766,
"step": 397
},
{
"clip_ratio": 7.11642496753484e-05,
"epoch": 0.517555266579974,
"grad_norm": 0.9543587594708557,
"kl": 0.1455078125,
"learning_rate": 2e-06,
"loss": 0.024,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 689.125,
"epoch": 0.5188556566970091,
"grad_norm": 0.4271536388521303,
"kl": 0.154296875,
"learning_rate": 2e-06,
"loss": 0.0007,
"reward": 0.48311230540275574,
"reward_std": 0.1016714870929718,
"rewards/preference_model_reward": 0.48311230540275574,
"rewards/preference_model_reward/std": 0.4811513423919678,
"step": 399
},
{
"clip_ratio": 0.0003734786878339946,
"epoch": 0.5201560468140443,
"grad_norm": 0.5666357587808477,
"kl": 0.154296875,
"learning_rate": 2e-06,
"loss": 0.0005,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 407.125,
"epoch": 0.5214564369310793,
"grad_norm": 0.3796776014953804,
"kl": 0.1845703125,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 0.9658941030502319,
"reward_std": 0.09363856911659241,
"rewards/preference_model_reward": 0.9658941030502319,
"rewards/preference_model_reward/std": 0.1329784095287323,
"step": 401
},
{
"clip_ratio": 0.0003180578933097422,
"epoch": 0.5227568270481144,
"grad_norm": 0.3647794628722391,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 474.34375,
"epoch": 0.5240572171651495,
"grad_norm": 0.5011856408174054,
"kl": 0.1533203125,
"learning_rate": 2e-06,
"loss": -0.006,
"reward": 0.9527279734611511,
"reward_std": 0.09915541857481003,
"rewards/preference_model_reward": 0.9527279734611511,
"rewards/preference_model_reward/std": 0.1460685133934021,
"step": 403
},
{
"clip_ratio": 5.3544656111625955e-05,
"epoch": 0.5253576072821846,
"grad_norm": 0.4587873209186705,
"kl": 0.15625,
"learning_rate": 2e-06,
"loss": -0.0064,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 339.0,
"epoch": 0.5266579973992198,
"grad_norm": 0.6507606398747722,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": -0.0007,
"reward": 0.9481613636016846,
"reward_std": 0.20735451579093933,
"rewards/preference_model_reward": 0.9481613636016846,
"rewards/preference_model_reward/std": 0.20464079082012177,
"step": 405
},
{
"clip_ratio": 0.0006553526036441326,
"epoch": 0.5279583875162549,
"grad_norm": 0.5683220760296074,
"kl": 0.1787109375,
"learning_rate": 2e-06,
"loss": -0.0011,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 397.34375,
"epoch": 0.52925877763329,
"grad_norm": 0.46341453825755263,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 0.9362279772758484,
"reward_std": 0.11489400267601013,
"rewards/preference_model_reward": 0.9362279772758484,
"rewards/preference_model_reward/std": 0.17247511446475983,
"step": 407
},
{
"clip_ratio": 0.0,
"epoch": 0.5305591677503251,
"grad_norm": 0.435386222165232,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": -0.0001,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 415.125,
"epoch": 0.5318595578673602,
"grad_norm": 0.5058190173177521,
"kl": 0.1650390625,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.8493468165397644,
"reward_std": 0.14191898703575134,
"rewards/preference_model_reward": 0.8493468165397644,
"rewards/preference_model_reward/std": 0.24982212483882904,
"step": 409
},
{
"clip_ratio": 0.000360783189535141,
"epoch": 0.5331599479843954,
"grad_norm": 0.49350987150616776,
"kl": 0.1640625,
"learning_rate": 2e-06,
"loss": -0.0027,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 438.96875,
"epoch": 0.5344603381014305,
"grad_norm": 0.11849056766792637,
"kl": 0.2060546875,
"learning_rate": 2e-06,
"loss": 0.0004,
"reward": 0.993826150894165,
"reward_std": 0.024695372208952904,
"rewards/preference_model_reward": 0.993826150894165,
"rewards/preference_model_reward/std": 0.03492453321814537,
"step": 411
},
{
"clip_ratio": 5.724754009861499e-05,
"epoch": 0.5357607282184655,
"grad_norm": 0.11249502423658887,
"kl": 0.2060546875,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 484.28125,
"epoch": 0.5370611183355006,
"grad_norm": 0.4928407072097148,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": -0.002,
"reward": 0.9763680100440979,
"reward_std": 0.08725623041391373,
"rewards/preference_model_reward": 0.9763680100440979,
"rewards/preference_model_reward/std": 0.0860796794295311,
"step": 413
},
{
"clip_ratio": 0.0006810087943449616,
"epoch": 0.5383615084525357,
"grad_norm": 0.25338692807833185,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": -0.0022,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 419.28125,
"epoch": 0.5396618985695709,
"grad_norm": 0.1908824259718182,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": 0.0005,
"reward": 0.9887747764587402,
"reward_std": 0.04490102827548981,
"rewards/preference_model_reward": 0.9887747764587402,
"rewards/preference_model_reward/std": 0.06349964439868927,
"step": 415
},
{
"clip_ratio": 6.952168769203126e-05,
"epoch": 0.540962288686606,
"grad_norm": 0.1787252912416434,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": 0.0004,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 633.4375,
"epoch": 0.5422626788036411,
"grad_norm": 0.9123177644363701,
"kl": 0.193359375,
"learning_rate": 2e-06,
"loss": 0.0143,
"reward": 0.9695033431053162,
"reward_std": 0.12198655307292938,
"rewards/preference_model_reward": 0.9695033431053162,
"rewards/preference_model_reward/std": 0.17251503467559814,
"step": 417
},
{
"clip_ratio": 9.883376333164051e-05,
"epoch": 0.5435630689206762,
"grad_norm": 0.9162834478633793,
"kl": 0.193359375,
"learning_rate": 2e-06,
"loss": 0.0137,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 411.03125,
"epoch": 0.5448634590377113,
"grad_norm": 0.7437466671675131,
"kl": 0.1826171875,
"learning_rate": 2e-06,
"loss": -0.0012,
"reward": 0.8951805830001831,
"reward_std": 0.1884705126285553,
"rewards/preference_model_reward": 0.8951805830001831,
"rewards/preference_model_reward/std": 0.2830054759979248,
"step": 419
},
{
"clip_ratio": 0.0,
"epoch": 0.5461638491547465,
"grad_norm": 0.6744284019984438,
"kl": 0.18359375,
"learning_rate": 2e-06,
"loss": -0.0017,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 480.03125,
"epoch": 0.5474642392717816,
"grad_norm": 0.46776056087514883,
"kl": 0.21484375,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.9680905342102051,
"reward_std": 0.08884235471487045,
"rewards/preference_model_reward": 0.9680905342102051,
"rewards/preference_model_reward/std": 0.12778013944625854,
"step": 421
},
{
"clip_ratio": 0.00018961718888022006,
"epoch": 0.5487646293888166,
"grad_norm": 0.4036332714818817,
"kl": 0.2138671875,
"learning_rate": 2e-06,
"loss": -0.0027,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 435.0625,
"epoch": 0.5500650195058517,
"grad_norm": 0.8298529686690673,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": -0.013,
"reward": 0.8935482501983643,
"reward_std": 0.29745957255363464,
"rewards/preference_model_reward": 0.8935482501983643,
"rewards/preference_model_reward/std": 0.2955591082572937,
"step": 423
},
{
"clip_ratio": 0.0002922832500189543,
"epoch": 0.5513654096228868,
"grad_norm": 0.7804747694003971,
"kl": 0.1943359375,
"learning_rate": 2e-06,
"loss": -0.0136,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 576.15625,
"epoch": 0.552665799739922,
"grad_norm": 0.5767173027038802,
"kl": 0.2265625,
"learning_rate": 2e-06,
"loss": -0.0018,
"reward": 0.9546047449111938,
"reward_std": 0.12968912720680237,
"rewards/preference_model_reward": 0.9546047449111938,
"rewards/preference_model_reward/std": 0.18622736632823944,
"step": 425
},
{
"clip_ratio": 0.0,
"epoch": 0.5539661898569571,
"grad_norm": 0.5385672477616652,
"kl": 0.2265625,
"learning_rate": 2e-06,
"loss": -0.0022,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 481.125,
"epoch": 0.5552665799739922,
"grad_norm": 0.7743174896574186,
"kl": 0.2001953125,
"learning_rate": 2e-06,
"loss": -0.012,
"reward": 0.885873019695282,
"reward_std": 0.18265797197818756,
"rewards/preference_model_reward": 0.885873019695282,
"rewards/preference_model_reward/std": 0.27932146191596985,
"step": 427
},
{
"clip_ratio": 0.00013702338037546724,
"epoch": 0.5565669700910273,
"grad_norm": 0.7444105184385629,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": -0.0126,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 559.375,
"epoch": 0.5578673602080624,
"grad_norm": 1.1477604387216325,
"kl": 0.224609375,
"learning_rate": 2e-06,
"loss": -0.0163,
"reward": 0.8528430461883545,
"reward_std": 0.22550532221794128,
"rewards/preference_model_reward": 0.8528430461883545,
"rewards/preference_model_reward/std": 0.34753158688545227,
"step": 429
},
{
"clip_ratio": 0.00011868027650052682,
"epoch": 0.5591677503250976,
"grad_norm": 1.0326779519681246,
"kl": 0.2255859375,
"learning_rate": 2e-06,
"loss": -0.017,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 500.3125,
"epoch": 0.5604681404421327,
"grad_norm": 0.9071727949790038,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": -0.0084,
"reward": 0.8113287091255188,
"reward_std": 0.19848331809043884,
"rewards/preference_model_reward": 0.8113287091255188,
"rewards/preference_model_reward/std": 0.33614689111709595,
"step": 431
},
{
"clip_ratio": 7.271669892361388e-05,
"epoch": 0.5617685305591678,
"grad_norm": 0.8455173031641725,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": -0.009,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 604.71875,
"epoch": 0.5630689206762028,
"grad_norm": 1.3857673138070365,
"kl": 0.2099609375,
"learning_rate": 2e-06,
"loss": 0.0168,
"reward": 0.8483308553695679,
"reward_std": 0.325148344039917,
"rewards/preference_model_reward": 0.8483308553695679,
"rewards/preference_model_reward/std": 0.32090431451797485,
"step": 433
},
{
"clip_ratio": 0.0003254468902014196,
"epoch": 0.5643693107932379,
"grad_norm": 1.3314635785340656,
"kl": 0.2158203125,
"learning_rate": 2e-06,
"loss": 0.0157,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 512.28125,
"epoch": 0.5656697009102731,
"grad_norm": 0.5989680573760588,
"kl": 0.2177734375,
"learning_rate": 2e-06,
"loss": 0.0099,
"reward": 0.9779109358787537,
"reward_std": 0.08835619688034058,
"rewards/preference_model_reward": 0.9779109358787537,
"rewards/preference_model_reward/std": 0.12495452910661697,
"step": 435
},
{
"clip_ratio": 0.0,
"epoch": 0.5669700910273082,
"grad_norm": 0.5634736482208081,
"kl": 0.21875,
"learning_rate": 2e-06,
"loss": 0.0094,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 463.28125,
"epoch": 0.5682704811443433,
"grad_norm": 0.33184977159461365,
"kl": 0.2333984375,
"learning_rate": 2e-06,
"loss": -0.0026,
"reward": 0.9590626955032349,
"reward_std": 0.10807390511035919,
"rewards/preference_model_reward": 0.9590626955032349,
"rewards/preference_model_reward/std": 0.11148703843355179,
"step": 437
},
{
"clip_ratio": 0.0,
"epoch": 0.5695708712613784,
"grad_norm": 0.3161029328314251,
"kl": 0.234375,
"learning_rate": 2e-06,
"loss": -0.0028,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 465.53125,
"epoch": 0.5708712613784135,
"grad_norm": 0.9314909195166072,
"kl": 0.2197265625,
"learning_rate": 2e-06,
"loss": -0.0062,
"reward": 0.9457427859306335,
"reward_std": 0.17563901841640472,
"rewards/preference_model_reward": 0.9457427859306335,
"rewards/preference_model_reward/std": 0.17885959148406982,
"step": 439
},
{
"clip_ratio": 0.0005498891696333885,
"epoch": 0.5721716514954487,
"grad_norm": 0.6109650911373344,
"kl": 0.2216796875,
"learning_rate": 2e-06,
"loss": -0.0065,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 580.375,
"epoch": 0.5734720416124838,
"grad_norm": 0.02581098267581845,
"kl": 0.21484375,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 441
},
{
"clip_ratio": 0.0,
"epoch": 0.5747724317295189,
"grad_norm": 0.01979962238447969,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 447.3125,
"epoch": 0.576072821846554,
"grad_norm": 0.4804861704479814,
"kl": 0.228515625,
"learning_rate": 2e-06,
"loss": 0.0016,
"reward": 0.9702784419059753,
"reward_std": 0.09500616788864136,
"rewards/preference_model_reward": 0.9702784419059753,
"rewards/preference_model_reward/std": 0.13557977974414825,
"step": 443
},
{
"clip_ratio": 6.139489414636046e-05,
"epoch": 0.577373211963589,
"grad_norm": 0.4764924275579863,
"kl": 0.21875,
"learning_rate": 2e-06,
"loss": 0.0013,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 396.3125,
"epoch": 0.5786736020806242,
"grad_norm": 0.9864886855099723,
"kl": 0.1943359375,
"learning_rate": 2e-06,
"loss": -0.0046,
"reward": 0.8668668270111084,
"reward_std": 0.297611802816391,
"rewards/preference_model_reward": 0.8668668270111084,
"rewards/preference_model_reward/std": 0.30994293093681335,
"step": 445
},
{
"clip_ratio": 0.0,
"epoch": 0.5799739921976593,
"grad_norm": 0.8321524897920206,
"kl": 0.1904296875,
"learning_rate": 2e-06,
"loss": -0.0051,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 427.9375,
"epoch": 0.5812743823146944,
"grad_norm": 1.1409128393837655,
"kl": 0.2275390625,
"learning_rate": 2e-06,
"loss": 0.0028,
"reward": 0.795079231262207,
"reward_std": 0.31667160987854004,
"rewards/preference_model_reward": 0.795079231262207,
"rewards/preference_model_reward/std": 0.34156370162963867,
"step": 447
},
{
"clip_ratio": 0.00022159164655022323,
"epoch": 0.5825747724317295,
"grad_norm": 1.0738948095730507,
"kl": 0.2275390625,
"learning_rate": 2e-06,
"loss": 0.002,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 552.40625,
"epoch": 0.5838751625487646,
"grad_norm": 1.507954566094587,
"kl": 0.162109375,
"learning_rate": 2e-06,
"loss": -0.0013,
"reward": 0.5813151597976685,
"reward_std": 0.38100069761276245,
"rewards/preference_model_reward": 0.5813151597976685,
"rewards/preference_model_reward/std": 0.4741608202457428,
"step": 449
},
{
"clip_ratio": 0.0003506769426167011,
"epoch": 0.5851755526657998,
"grad_norm": 1.4082782499892905,
"kl": 0.162109375,
"learning_rate": 2e-06,
"loss": -0.0023,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 568.09375,
"epoch": 0.5864759427828349,
"grad_norm": 1.5418403027222796,
"kl": 0.1796875,
"learning_rate": 2e-06,
"loss": 0.0157,
"reward": 0.6295768618583679,
"reward_std": 0.4063897132873535,
"rewards/preference_model_reward": 0.6295768618583679,
"rewards/preference_model_reward/std": 0.4200841784477234,
"step": 451
},
{
"clip_ratio": 0.00017533147183712572,
"epoch": 0.58777633289987,
"grad_norm": 1.4908308242341415,
"kl": 0.1806640625,
"learning_rate": 2e-06,
"loss": 0.0147,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 414.71875,
"epoch": 0.5890767230169051,
"grad_norm": 0.3291705830428915,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": -0.0022,
"reward": 0.9669902324676514,
"reward_std": 0.0894487202167511,
"rewards/preference_model_reward": 0.9669902324676514,
"rewards/preference_model_reward/std": 0.10228119045495987,
"step": 453
},
{
"clip_ratio": 0.0001948475546669215,
"epoch": 0.5903771131339401,
"grad_norm": 0.3089967159758185,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": -0.0025,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 589.6875,
"epoch": 0.5916775032509753,
"grad_norm": 0.777625282632689,
"kl": 0.1748046875,
"learning_rate": 2e-06,
"loss": 0.0116,
"reward": 0.9618591070175171,
"reward_std": 0.12204061448574066,
"rewards/preference_model_reward": 0.9618591070175171,
"rewards/preference_model_reward/std": 0.1741509884595871,
"step": 455
},
{
"clip_ratio": 0.0,
"epoch": 0.5929778933680104,
"grad_norm": 0.7325722818853695,
"kl": 0.17578125,
"learning_rate": 2e-06,
"loss": 0.011,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 505.3125,
"epoch": 0.5942782834850455,
"grad_norm": 0.5246149237440364,
"kl": 0.185546875,
"learning_rate": 2e-06,
"loss": -0.0037,
"reward": 0.9700804948806763,
"reward_std": 0.11967816203832626,
"rewards/preference_model_reward": 0.9700804948806763,
"rewards/preference_model_reward/std": 0.1692504733800888,
"step": 457
},
{
"clip_ratio": 0.00017141705029644072,
"epoch": 0.5955786736020806,
"grad_norm": 0.4704269534446643,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": -0.004,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 418.4375,
"epoch": 0.5968790637191157,
"grad_norm": 0.99830704358775,
"kl": 0.208984375,
"learning_rate": 2e-06,
"loss": 0.0132,
"reward": 0.8735105395317078,
"reward_std": 0.1765914112329483,
"rewards/preference_model_reward": 0.8735105395317078,
"rewards/preference_model_reward/std": 0.27725955843925476,
"step": 459
},
{
"clip_ratio": 0.0006676804041489959,
"epoch": 0.5981794538361509,
"grad_norm": 0.9276264800319912,
"kl": 0.2099609375,
"learning_rate": 2e-06,
"loss": 0.0127,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 600.625,
"epoch": 0.599479843953186,
"grad_norm": 1.5304815918050108,
"kl": 0.1826171875,
"learning_rate": 2e-06,
"loss": 0.0092,
"reward": 0.8300485610961914,
"reward_std": 0.36765021085739136,
"rewards/preference_model_reward": 0.8300485610961914,
"rewards/preference_model_reward/std": 0.3629445433616638,
"step": 461
},
{
"clip_ratio": 9.010268695419654e-05,
"epoch": 0.6007802340702211,
"grad_norm": 1.4944547262094985,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": 0.0081,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 556.125,
"epoch": 0.6020806241872562,
"grad_norm": 0.018760959897438687,
"kl": 0.20703125,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 463
},
{
"clip_ratio": 0.0,
"epoch": 0.6033810143042913,
"grad_norm": 0.01660210108604342,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 490.59375,
"epoch": 0.6046814044213265,
"grad_norm": 0.27288074072764534,
"kl": 0.205078125,
"learning_rate": 2e-06,
"loss": -0.0053,
"reward": 0.9808217287063599,
"reward_std": 0.07671315968036652,
"rewards/preference_model_reward": 0.9808217287063599,
"rewards/preference_model_reward/std": 0.1084887906908989,
"step": 465
},
{
"clip_ratio": 0.00041510548908263445,
"epoch": 0.6059817945383615,
"grad_norm": 0.2036263491928161,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": -0.0055,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 446.375,
"epoch": 0.6072821846553966,
"grad_norm": 0.9138687673589736,
"kl": 0.1748046875,
"learning_rate": 2e-06,
"loss": 0.01,
"reward": 0.5181146264076233,
"reward_std": 0.2147461473941803,
"rewards/preference_model_reward": 0.5181146264076233,
"rewards/preference_model_reward/std": 0.4774966835975647,
"step": 467
},
{
"clip_ratio": 0.0008817376801744103,
"epoch": 0.6085825747724317,
"grad_norm": 0.884284869476855,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": 0.0093,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 578.03125,
"epoch": 0.6098829648894668,
"grad_norm": 1.373228383519465,
"kl": 0.1767578125,
"learning_rate": 2e-06,
"loss": 0.0312,
"reward": 0.8574967980384827,
"reward_std": 0.3164219260215759,
"rewards/preference_model_reward": 0.8574967980384827,
"rewards/preference_model_reward/std": 0.3132801949977875,
"step": 469
},
{
"clip_ratio": 0.0002780166978482157,
"epoch": 0.611183355006502,
"grad_norm": 1.2773340380778375,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": 0.0302,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 598.875,
"epoch": 0.6124837451235371,
"grad_norm": 1.4916055964066572,
"kl": 0.1689453125,
"learning_rate": 2e-06,
"loss": 0.0378,
"reward": 0.7772074341773987,
"reward_std": 0.3698381781578064,
"rewards/preference_model_reward": 0.7772074341773987,
"rewards/preference_model_reward/std": 0.36533990502357483,
"step": 471
},
{
"clip_ratio": 4.626202644431032e-05,
"epoch": 0.6137841352405722,
"grad_norm": 1.4536924831752602,
"kl": 0.169921875,
"learning_rate": 2e-06,
"loss": 0.0366,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 505.875,
"epoch": 0.6150845253576073,
"grad_norm": 0.7544413510474123,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": 0.0057,
"reward": 0.9283210635185242,
"reward_std": 0.18047307431697845,
"rewards/preference_model_reward": 0.9283210635185242,
"rewards/preference_model_reward/std": 0.2003507763147354,
"step": 473
},
{
"clip_ratio": 0.0003117350279353559,
"epoch": 0.6163849154746424,
"grad_norm": 0.7045547292296246,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": 0.0051,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 668.59375,
"epoch": 0.6176853055916776,
"grad_norm": 0.7354870473019032,
"kl": 0.1591796875,
"learning_rate": 2e-06,
"loss": 0.0047,
"reward": 0.9718723893165588,
"reward_std": 0.11251037567853928,
"rewards/preference_model_reward": 0.9718723893165588,
"rewards/preference_model_reward/std": 0.15911369025707245,
"step": 475
},
{
"clip_ratio": 3.537569136824459e-05,
"epoch": 0.6189856957087126,
"grad_norm": 0.6857184543038295,
"kl": 0.1591796875,
"learning_rate": 2e-06,
"loss": 0.0041,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 677.09375,
"epoch": 0.6202860858257477,
"grad_norm": 1.7321781240572358,
"kl": 0.15625,
"learning_rate": 2e-06,
"loss": 0.0378,
"reward": 0.797584056854248,
"reward_std": 0.3873823583126068,
"rewards/preference_model_reward": 0.797584056854248,
"rewards/preference_model_reward/std": 0.38197755813598633,
"step": 477
},
{
"clip_ratio": 4.785605051438324e-05,
"epoch": 0.6215864759427828,
"grad_norm": 1.7078683184871906,
"kl": 0.15625,
"learning_rate": 2e-06,
"loss": 0.0366,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 439.28125,
"epoch": 0.6228868660598179,
"grad_norm": 1.3124273700119886,
"kl": 0.1796875,
"learning_rate": 2e-06,
"loss": -0.0063,
"reward": 0.6355161666870117,
"reward_std": 0.4067726731300354,
"rewards/preference_model_reward": 0.6355161666870117,
"rewards/preference_model_reward/std": 0.4306652247905731,
"step": 479
},
{
"clip_ratio": 6.122948980191723e-05,
"epoch": 0.6241872561768531,
"grad_norm": 1.2747010389997053,
"kl": 0.1796875,
"learning_rate": 2e-06,
"loss": -0.0073,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 476.8125,
"epoch": 0.6254876462938882,
"grad_norm": 1.1771905194134227,
"kl": 0.1669921875,
"learning_rate": 2e-06,
"loss": 0.0047,
"reward": 0.8450658321380615,
"reward_std": 0.32613372802734375,
"rewards/preference_model_reward": 0.8450658321380615,
"rewards/preference_model_reward/std": 0.32565778493881226,
"step": 481
},
{
"clip_ratio": 6.566850788658485e-05,
"epoch": 0.6267880364109233,
"grad_norm": 1.0949478432903983,
"kl": 0.16796875,
"learning_rate": 2e-06,
"loss": 0.0037,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 411.1875,
"epoch": 0.6280884265279584,
"grad_norm": 1.0345750868470442,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": 0.0024,
"reward": 0.7638916373252869,
"reward_std": 0.3289935886859894,
"rewards/preference_model_reward": 0.7638916373252869,
"rewards/preference_model_reward/std": 0.33954110741615295,
"step": 483
},
{
"clip_ratio": 6.652474985457957e-05,
"epoch": 0.6293888166449935,
"grad_norm": 0.9527524423711179,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": 0.0017,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 548.3125,
"epoch": 0.6306892067620286,
"grad_norm": 1.0248342945427826,
"kl": 0.19921875,
"learning_rate": 2e-06,
"loss": 0.0007,
"reward": 0.8802142143249512,
"reward_std": 0.28954002261161804,
"rewards/preference_model_reward": 0.8802142143249512,
"rewards/preference_model_reward/std": 0.28504154086112976,
"step": 485
},
{
"clip_ratio": 4.90099992020987e-05,
"epoch": 0.6319895968790638,
"grad_norm": 0.9824475068268235,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": -0.0,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 438.90625,
"epoch": 0.6332899869960988,
"grad_norm": 1.0170620577468696,
"kl": 0.181640625,
"learning_rate": 2e-06,
"loss": -0.011,
"reward": 0.840887188911438,
"reward_std": 0.21575048565864563,
"rewards/preference_model_reward": 0.840887188911438,
"rewards/preference_model_reward/std": 0.3409208357334137,
"step": 487
},
{
"clip_ratio": 0.00013467390090227127,
"epoch": 0.6345903771131339,
"grad_norm": 0.9962152448073786,
"kl": 0.18359375,
"learning_rate": 2e-06,
"loss": -0.0118,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 453.25,
"epoch": 0.635890767230169,
"grad_norm": 1.1268507254797382,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": 0.0048,
"reward": 0.837617039680481,
"reward_std": 0.21851858496665955,
"rewards/preference_model_reward": 0.837617039680481,
"rewards/preference_model_reward/std": 0.34588855504989624,
"step": 489
},
{
"clip_ratio": 0.0,
"epoch": 0.6371911573472041,
"grad_norm": 1.0885462907631436,
"kl": 0.1953125,
"learning_rate": 2e-06,
"loss": 0.0039,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 427.09375,
"epoch": 0.6384915474642393,
"grad_norm": 0.9071689083575298,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": -0.0026,
"reward": 0.8059304356575012,
"reward_std": 0.20044738054275513,
"rewards/preference_model_reward": 0.8059304356575012,
"rewards/preference_model_reward/std": 0.341531902551651,
"step": 491
},
{
"clip_ratio": 6.603274960070848e-05,
"epoch": 0.6397919375812744,
"grad_norm": 0.8345599641034357,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 478.375,
"epoch": 0.6410923276983095,
"grad_norm": 0.6624858525413615,
"kl": 0.197265625,
"learning_rate": 2e-06,
"loss": -0.0033,
"reward": 0.9255695939064026,
"reward_std": 0.14829862117767334,
"rewards/preference_model_reward": 0.9255695939064026,
"rewards/preference_model_reward/std": 0.21973776817321777,
"step": 493
},
{
"clip_ratio": 0.0,
"epoch": 0.6423927178153446,
"grad_norm": 0.6312105576362554,
"kl": 0.2001953125,
"learning_rate": 2e-06,
"loss": -0.0038,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 525.28125,
"epoch": 0.6436931079323797,
"grad_norm": 1.5486110651386202,
"kl": 0.20703125,
"learning_rate": 2e-06,
"loss": 0.0059,
"reward": 0.8713526725769043,
"reward_std": 0.2359280288219452,
"rewards/preference_model_reward": 0.8713526725769043,
"rewards/preference_model_reward/std": 0.2887003421783447,
"step": 495
},
{
"clip_ratio": 0.0002505861921235919,
"epoch": 0.6449934980494149,
"grad_norm": 1.0616419797076526,
"kl": 0.208984375,
"learning_rate": 2e-06,
"loss": 0.0053,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 513.0,
"epoch": 0.64629388816645,
"grad_norm": 1.2046309521967586,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": 0.024,
"reward": 0.8967607021331787,
"reward_std": 0.28214776515960693,
"rewards/preference_model_reward": 0.8967607021331787,
"rewards/preference_model_reward/std": 0.28318798542022705,
"step": 497
},
{
"clip_ratio": 0.00011981013813056052,
"epoch": 0.647594278283485,
"grad_norm": 1.1245818999020487,
"kl": 0.193359375,
"learning_rate": 2e-06,
"loss": 0.0229,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 500.34375,
"epoch": 0.6488946684005201,
"grad_norm": 0.8253153021127959,
"kl": 0.19921875,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.9368200302124023,
"reward_std": 0.16158036887645721,
"rewards/preference_model_reward": 0.9368200302124023,
"rewards/preference_model_reward/std": 0.2238502949476242,
"step": 499
},
{
"clip_ratio": 0.00017198966816067696,
"epoch": 0.6501950585175552,
"grad_norm": 0.7544071871520531,
"kl": 0.2041015625,
"learning_rate": 2e-06,
"loss": -0.0029,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 427.03125,
"epoch": 0.6514954486345904,
"grad_norm": 0.8134258338741699,
"kl": 0.2158203125,
"learning_rate": 2e-06,
"loss": -0.0062,
"reward": 0.9170088171958923,
"reward_std": 0.17870065569877625,
"rewards/preference_model_reward": 0.9170088171958923,
"rewards/preference_model_reward/std": 0.26252105832099915,
"step": 501
},
{
"clip_ratio": 0.0001375194697175175,
"epoch": 0.6527958387516255,
"grad_norm": 0.7216707899969623,
"kl": 0.2177734375,
"learning_rate": 2e-06,
"loss": -0.0067,
"step": 502
},
{
"clip_ratio": 0.0,
"completion_length": 518.40625,
"epoch": 0.6540962288686606,
"grad_norm": 0.7084971694449992,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": -0.0019,
"reward": 0.9319340586662292,
"reward_std": 0.19666869938373566,
"rewards/preference_model_reward": 0.9319340586662292,
"rewards/preference_model_reward/std": 0.21102942526340485,
"step": 503
},
{
"clip_ratio": 0.00019161276577506214,
"epoch": 0.6553966189856957,
"grad_norm": 0.6297038100275056,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": -0.0023,
"step": 504
},
{
"clip_ratio": 0.0,
"completion_length": 534.25,
"epoch": 0.6566970091027308,
"grad_norm": 1.2200861351672945,
"kl": 0.212890625,
"learning_rate": 2e-06,
"loss": 0.0057,
"reward": 0.870198130607605,
"reward_std": 0.2922201156616211,
"rewards/preference_model_reward": 0.870198130607605,
"rewards/preference_model_reward/std": 0.2885199189186096,
"step": 505
},
{
"clip_ratio": 0.0006603770307265222,
"epoch": 0.657997399219766,
"grad_norm": 1.1025383311332084,
"kl": 0.21484375,
"learning_rate": 2e-06,
"loss": 0.0049,
"step": 506
},
{
"clip_ratio": 0.0,
"completion_length": 350.8125,
"epoch": 0.659297789336801,
"grad_norm": 0.5929542711439004,
"kl": 0.228515625,
"learning_rate": 2e-06,
"loss": -0.004,
"reward": 0.9226664304733276,
"reward_std": 0.19836580753326416,
"rewards/preference_model_reward": 0.9226664304733276,
"rewards/preference_model_reward/std": 0.19793914258480072,
"step": 507
},
{
"clip_ratio": 0.00015906358021311462,
"epoch": 0.6605981794538361,
"grad_norm": 0.5486817244564233,
"kl": 0.23046875,
"learning_rate": 2e-06,
"loss": -0.0045,
"step": 508
},
{
"clip_ratio": 0.0,
"completion_length": 548.5625,
"epoch": 0.6618985695708712,
"grad_norm": 0.3288568622069386,
"kl": 0.2119140625,
"learning_rate": 2e-06,
"loss": 0.0014,
"reward": 0.9740253686904907,
"reward_std": 0.057107504457235336,
"rewards/preference_model_reward": 0.9740253686904907,
"rewards/preference_model_reward/std": 0.0837172269821167,
"step": 509
},
{
"clip_ratio": 9.279881487600505e-05,
"epoch": 0.6631989596879063,
"grad_norm": 0.3017876628121043,
"kl": 0.21484375,
"learning_rate": 2e-06,
"loss": 0.0012,
"step": 510
},
{
"clip_ratio": 0.0,
"completion_length": 501.96875,
"epoch": 0.6644993498049415,
"grad_norm": 0.2552289537545978,
"kl": 0.2177734375,
"learning_rate": 2e-06,
"loss": -0.0022,
"reward": 0.9818136692047119,
"reward_std": 0.05234856531023979,
"rewards/preference_model_reward": 0.9818136692047119,
"rewards/preference_model_reward/std": 0.07513560354709625,
"step": 511
},
{
"clip_ratio": 0.0,
"epoch": 0.6657997399219766,
"grad_norm": 0.21243026144301005,
"kl": 0.220703125,
"learning_rate": 2e-06,
"loss": -0.0024,
"step": 512
},
{
"clip_ratio": 0.0,
"completion_length": 522.84375,
"epoch": 0.6671001300390117,
"grad_norm": 0.03160928214041918,
"kl": 0.248046875,
"learning_rate": 2e-06,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 513
},
{
"clip_ratio": 0.0,
"epoch": 0.6684005201560468,
"grad_norm": 0.019785740981705057,
"kl": 0.23828125,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 514
},
{
"clip_ratio": 0.0,
"completion_length": 619.25,
"epoch": 0.6697009102730819,
"grad_norm": 0.9295754723585831,
"kl": 0.1962890625,
"learning_rate": 2e-06,
"loss": 0.0107,
"reward": 0.947670578956604,
"reward_std": 0.14302663505077362,
"rewards/preference_model_reward": 0.947670578956604,
"rewards/preference_model_reward/std": 0.20596152544021606,
"step": 515
},
{
"clip_ratio": 8.934033394325525e-05,
"epoch": 0.6710013003901171,
"grad_norm": 0.8958440484535811,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": 0.01,
"step": 516
},
{
"clip_ratio": 0.0,
"completion_length": 457.53125,
"epoch": 0.6723016905071522,
"grad_norm": 0.53034749494269,
"kl": 0.236328125,
"learning_rate": 2e-06,
"loss": -0.0036,
"reward": 0.49002784490585327,
"reward_std": 0.14473380148410797,
"rewards/preference_model_reward": 0.49002784490585327,
"rewards/preference_model_reward/std": 0.48859700560569763,
"step": 517
},
{
"clip_ratio": 0.0005512780044227839,
"epoch": 0.6736020806241872,
"grad_norm": 0.692962639214083,
"kl": 0.2314453125,
"learning_rate": 2e-06,
"loss": -0.004,
"step": 518
},
{
"clip_ratio": 0.0,
"completion_length": 509.84375,
"epoch": 0.6749024707412223,
"grad_norm": 0.8076317202453783,
"kl": 0.20703125,
"learning_rate": 2e-06,
"loss": 0.0028,
"reward": 0.8944128155708313,
"reward_std": 0.14827537536621094,
"rewards/preference_model_reward": 0.8944128155708313,
"rewards/preference_model_reward/std": 0.2325102984905243,
"step": 519
},
{
"clip_ratio": 0.0001351351384073496,
"epoch": 0.6762028608582574,
"grad_norm": 0.7475623142861337,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": 0.0022,
"step": 520
},
{
"clip_ratio": 0.0,
"completion_length": 436.84375,
"epoch": 0.6775032509752926,
"grad_norm": 0.028377255891001926,
"kl": 0.201171875,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 521
},
{
"clip_ratio": 0.0,
"epoch": 0.6788036410923277,
"grad_norm": 0.012551554297778673,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 522
},
{
"clip_ratio": 0.0,
"completion_length": 711.125,
"epoch": 0.6801040312093628,
"grad_norm": 1.5471897459570736,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": 0.0266,
"reward": 0.843010425567627,
"reward_std": 0.2676996886730194,
"rewards/preference_model_reward": 0.843010425567627,
"rewards/preference_model_reward/std": 0.34891876578330994,
"step": 523
},
{
"clip_ratio": 0.0002701242920011282,
"epoch": 0.6814044213263979,
"grad_norm": 1.4410403152995823,
"kl": 0.16796875,
"learning_rate": 2e-06,
"loss": 0.0254,
"step": 524
},
{
"clip_ratio": 0.0,
"completion_length": 646.4375,
"epoch": 0.682704811443433,
"grad_norm": 1.6486514561580368,
"kl": 0.1689453125,
"learning_rate": 2e-06,
"loss": 0.0186,
"reward": 0.7798171043395996,
"reward_std": 0.3787343502044678,
"rewards/preference_model_reward": 0.7798171043395996,
"rewards/preference_model_reward/std": 0.37885910272598267,
"step": 525
},
{
"clip_ratio": 0.0002811163431033492,
"epoch": 0.6840052015604682,
"grad_norm": 1.4654130660789888,
"kl": 0.1640625,
"learning_rate": 2e-06,
"loss": 0.0176,
"step": 526
},
{
"clip_ratio": 0.0,
"completion_length": 517.375,
"epoch": 0.6853055916775033,
"grad_norm": 1.3509256509461836,
"kl": 0.15625,
"learning_rate": 2e-06,
"loss": 0.0132,
"reward": 0.4524506628513336,
"reward_std": 0.4075589179992676,
"rewards/preference_model_reward": 0.4524506628513336,
"rewards/preference_model_reward/std": 0.4056318700313568,
"step": 527
},
{
"clip_ratio": 0.0001703468442428857,
"epoch": 0.6866059817945384,
"grad_norm": 1.3169959570019343,
"kl": 0.15234375,
"learning_rate": 2e-06,
"loss": 0.0121,
"step": 528
},
{
"clip_ratio": 0.0,
"completion_length": 699.96875,
"epoch": 0.6879063719115734,
"grad_norm": 1.659880219331862,
"kl": 0.150390625,
"learning_rate": 2e-06,
"loss": -0.0022,
"reward": 0.5921683311462402,
"reward_std": 0.3830801844596863,
"rewards/preference_model_reward": 0.5921683311462402,
"rewards/preference_model_reward/std": 0.4823853075504303,
"step": 529
},
{
"clip_ratio": 3.313892011647113e-05,
"epoch": 0.6892067620286085,
"grad_norm": 1.5933404503889264,
"kl": 0.150390625,
"learning_rate": 2e-06,
"loss": -0.0034,
"step": 530
},
{
"clip_ratio": 0.0,
"completion_length": 536.59375,
"epoch": 0.6905071521456437,
"grad_norm": 1.4789545393321204,
"kl": 0.193359375,
"learning_rate": 2e-06,
"loss": 0.0052,
"reward": 0.48997482657432556,
"reward_std": 0.4091135859489441,
"rewards/preference_model_reward": 0.48997482657432556,
"rewards/preference_model_reward/std": 0.40547576546669006,
"step": 531
},
{
"clip_ratio": 0.00037025794154033065,
"epoch": 0.6918075422626788,
"grad_norm": 1.3836275497987043,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": 0.0041,
"step": 532
},
{
"clip_ratio": 0.0,
"completion_length": 500.71875,
"epoch": 0.6931079323797139,
"grad_norm": 0.3199567384300615,
"kl": 0.1630859375,
"learning_rate": 2e-06,
"loss": -0.0051,
"reward": 0.957996129989624,
"reward_std": 0.07893598824739456,
"rewards/preference_model_reward": 0.957996129989624,
"rewards/preference_model_reward/std": 0.11781778186559677,
"step": 533
},
{
"clip_ratio": 0.0,
"epoch": 0.694408322496749,
"grad_norm": 0.30160406098099796,
"kl": 0.162109375,
"learning_rate": 2e-06,
"loss": -0.0053,
"step": 534
},
{
"clip_ratio": 0.0,
"completion_length": 622.21875,
"epoch": 0.6957087126137841,
"grad_norm": 0.9950320411701277,
"kl": 0.1064453125,
"learning_rate": 2e-06,
"loss": 0.0192,
"reward": 0.45777231454849243,
"reward_std": 0.27751585841178894,
"rewards/preference_model_reward": 0.45777231454849243,
"rewards/preference_model_reward/std": 0.46042200922966003,
"step": 535
},
{
"clip_ratio": 0.0001473418960813433,
"epoch": 0.6970091027308193,
"grad_norm": 0.9759058209911393,
"kl": 0.1064453125,
"learning_rate": 2e-06,
"loss": 0.0185,
"step": 536
},
{
"clip_ratio": 0.0,
"completion_length": 479.5,
"epoch": 0.6983094928478544,
"grad_norm": 0.6121602073781033,
"kl": 0.162109375,
"learning_rate": 2e-06,
"loss": 0.0008,
"reward": 0.9411002397537231,
"reward_std": 0.14587333798408508,
"rewards/preference_model_reward": 0.9411002397537231,
"rewards/preference_model_reward/std": 0.18111251294612885,
"step": 537
},
{
"clip_ratio": 0.00011307100794510916,
"epoch": 0.6996098829648895,
"grad_norm": 0.5716256653179863,
"kl": 0.162109375,
"learning_rate": 2e-06,
"loss": 0.0003,
"step": 538
},
{
"clip_ratio": 0.0,
"completion_length": 578.65625,
"epoch": 0.7009102730819246,
"grad_norm": 1.4127144788915404,
"kl": 0.1416015625,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.7771836519241333,
"reward_std": 0.29650747776031494,
"rewards/preference_model_reward": 0.7771836519241333,
"rewards/preference_model_reward/std": 0.36120009422302246,
"step": 539
},
{
"clip_ratio": 0.00010380351159255952,
"epoch": 0.7022106631989596,
"grad_norm": 1.1777179730274026,
"kl": 0.1435546875,
"learning_rate": 2e-06,
"loss": -0.0032,
"step": 540
},
{
"clip_ratio": 0.0,
"completion_length": 523.4375,
"epoch": 0.7035110533159948,
"grad_norm": 1.4681000227125094,
"kl": 0.1630859375,
"learning_rate": 2e-06,
"loss": 0.0038,
"reward": 0.5395978689193726,
"reward_std": 0.40768900513648987,
"rewards/preference_model_reward": 0.5395978689193726,
"rewards/preference_model_reward/std": 0.46992650628089905,
"step": 541
},
{
"clip_ratio": 0.00012938569125253707,
"epoch": 0.7048114434330299,
"grad_norm": 1.3613455454415246,
"kl": 0.1650390625,
"learning_rate": 2e-06,
"loss": 0.0028,
"step": 542
},
{
"clip_ratio": 0.0,
"completion_length": 477.03125,
"epoch": 0.706111833550065,
"grad_norm": 1.0553742279151677,
"kl": 0.158203125,
"learning_rate": 2e-06,
"loss": 0.0223,
"reward": 0.4630056619644165,
"reward_std": 0.27500906586647034,
"rewards/preference_model_reward": 0.4630056619644165,
"rewards/preference_model_reward/std": 0.4385357201099396,
"step": 543
},
{
"clip_ratio": 5.340739153325558e-05,
"epoch": 0.7074122236671001,
"grad_norm": 0.9682034434189447,
"kl": 0.16015625,
"learning_rate": 2e-06,
"loss": 0.0216,
"step": 544
},
{
"clip_ratio": 0.0,
"completion_length": 450.53125,
"epoch": 0.7087126137841352,
"grad_norm": 1.428990857463978,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": -0.0085,
"reward": 0.7226020693778992,
"reward_std": 0.2753547132015228,
"rewards/preference_model_reward": 0.7226020693778992,
"rewards/preference_model_reward/std": 0.40645530819892883,
"step": 545
},
{
"clip_ratio": 6.157635652925819e-05,
"epoch": 0.7100130039011704,
"grad_norm": 1.0153576138262457,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": -0.0091,
"step": 546
},
{
"clip_ratio": 0.0,
"completion_length": 381.375,
"epoch": 0.7113133940182055,
"grad_norm": 0.7961865635923656,
"kl": 0.1728515625,
"learning_rate": 2e-06,
"loss": -0.0011,
"reward": 0.9013676643371582,
"reward_std": 0.23820459842681885,
"rewards/preference_model_reward": 0.9013676643371582,
"rewards/preference_model_reward/std": 0.24647004902362823,
"step": 547
},
{
"clip_ratio": 0.0,
"epoch": 0.7126137841352406,
"grad_norm": 0.6685334773440631,
"kl": 0.173828125,
"learning_rate": 2e-06,
"loss": -0.0017,
"step": 548
},
{
"clip_ratio": 0.0,
"completion_length": 397.71875,
"epoch": 0.7139141742522757,
"grad_norm": 0.509873094151191,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": 0.0058,
"reward": 0.9610089063644409,
"reward_std": 0.10700845718383789,
"rewards/preference_model_reward": 0.9610089063644409,
"rewards/preference_model_reward/std": 0.15405260026454926,
"step": 549
},
{
"clip_ratio": 7.24008132237941e-05,
"epoch": 0.7152145643693107,
"grad_norm": 0.46289523230086543,
"kl": 0.19140625,
"learning_rate": 2e-06,
"loss": 0.0054,
"step": 550
},
{
"clip_ratio": 0.0,
"completion_length": 479.90625,
"epoch": 0.716514954486346,
"grad_norm": 0.7777436132276164,
"kl": 0.1708984375,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 0.9174792766571045,
"reward_std": 0.22822144627571106,
"rewards/preference_model_reward": 0.9174792766571045,
"rewards/preference_model_reward/std": 0.22594521939754486,
"step": 551
},
{
"clip_ratio": 0.0002606313209980726,
"epoch": 0.717815344603381,
"grad_norm": 0.6834258660506523,
"kl": 0.171875,
"learning_rate": 2e-06,
"loss": -0.0005,
"step": 552
},
{
"clip_ratio": 0.0,
"completion_length": 413.5,
"epoch": 0.7191157347204161,
"grad_norm": 0.505052776040059,
"kl": 0.189453125,
"learning_rate": 2e-06,
"loss": -0.0026,
"reward": 0.4816475510597229,
"reward_std": 0.10513729602098465,
"rewards/preference_model_reward": 0.4816475510597229,
"rewards/preference_model_reward/std": 0.4959093928337097,
"step": 553
},
{
"clip_ratio": 0.00036867460585199296,
"epoch": 0.7204161248374512,
"grad_norm": 0.42150869730426016,
"kl": 0.1904296875,
"learning_rate": 2e-06,
"loss": -0.003,
"step": 554
},
{
"clip_ratio": 0.0,
"completion_length": 518.28125,
"epoch": 0.7217165149544863,
"grad_norm": 1.4834420478857298,
"kl": 0.177734375,
"learning_rate": 2e-06,
"loss": -0.0091,
"reward": 0.5543996095657349,
"reward_std": 0.425930917263031,
"rewards/preference_model_reward": 0.5543996095657349,
"rewards/preference_model_reward/std": 0.42936739325523376,
"step": 555
},
{
"clip_ratio": 0.00012094823614461347,
"epoch": 0.7230169050715215,
"grad_norm": 1.6593571165239325,
"kl": 0.1767578125,
"learning_rate": 2e-06,
"loss": -0.0102,
"step": 556
},
{
"clip_ratio": 0.0,
"completion_length": 377.9375,
"epoch": 0.7243172951885566,
"grad_norm": 0.8643935299403351,
"kl": 0.1806640625,
"learning_rate": 2e-06,
"loss": -0.0059,
"reward": 0.7919847965240479,
"reward_std": 0.3126184642314911,
"rewards/preference_model_reward": 0.7919847965240479,
"rewards/preference_model_reward/std": 0.3270798325538635,
"step": 557
},
{
"clip_ratio": 0.0002360784710617736,
"epoch": 0.7256176853055917,
"grad_norm": 0.8316400864873517,
"kl": 0.181640625,
"learning_rate": 2e-06,
"loss": -0.0066,
"step": 558
},
{
"clip_ratio": 0.0,
"completion_length": 293.21875,
"epoch": 0.7269180754226268,
"grad_norm": 0.8796910032990837,
"kl": 0.2255859375,
"learning_rate": 2e-06,
"loss": -0.0119,
"reward": 0.7465857863426208,
"reward_std": 0.3626917600631714,
"rewards/preference_model_reward": 0.7465857863426208,
"rewards/preference_model_reward/std": 0.35811734199523926,
"step": 559
},
{
"clip_ratio": 0.0,
"epoch": 0.7282184655396619,
"grad_norm": 0.8380735342987923,
"kl": 0.2265625,
"learning_rate": 2e-06,
"loss": -0.0126,
"step": 560
},
{
"clip_ratio": 0.0,
"completion_length": 408.03125,
"epoch": 0.729518855656697,
"grad_norm": 0.512186793959597,
"kl": 0.21484375,
"learning_rate": 2e-06,
"loss": -0.0057,
"reward": 0.9450701475143433,
"reward_std": 0.2109910547733307,
"rewards/preference_model_reward": 0.9450701475143433,
"rewards/preference_model_reward/std": 0.20757848024368286,
"step": 561
},
{
"clip_ratio": 0.0,
"epoch": 0.7308192457737321,
"grad_norm": 0.4831254168108591,
"kl": 0.216796875,
"learning_rate": 2e-06,
"loss": -0.0061,
"step": 562
},
{
"clip_ratio": 0.0,
"completion_length": 563.46875,
"epoch": 0.7321196358907672,
"grad_norm": 1.6126446142051276,
"kl": 0.1533203125,
"learning_rate": 2e-06,
"loss": 0.0137,
"reward": 0.5664808750152588,
"reward_std": 0.41850006580352783,
"rewards/preference_model_reward": 0.5664808750152588,
"rewards/preference_model_reward/std": 0.41366487741470337,
"step": 563
},
{
"clip_ratio": 0.0,
"epoch": 0.7334200260078023,
"grad_norm": 1.4735887582214497,
"kl": 0.1552734375,
"learning_rate": 2e-06,
"loss": 0.0128,
"step": 564
},
{
"clip_ratio": 0.0,
"completion_length": 552.4375,
"epoch": 0.7347204161248374,
"grad_norm": 1.0717577501192503,
"kl": 0.18359375,
"learning_rate": 2e-06,
"loss": 0.0034,
"reward": 0.913676917552948,
"reward_std": 0.1866173893213272,
"rewards/preference_model_reward": 0.913676917552948,
"rewards/preference_model_reward/std": 0.2740388512611389,
"step": 565
},
{
"clip_ratio": 0.00016552054148633033,
"epoch": 0.7360208062418726,
"grad_norm": 1.0305545524954813,
"kl": 0.185546875,
"learning_rate": 2e-06,
"loss": 0.0026,
"step": 566
},
{
"clip_ratio": 0.0,
"completion_length": 494.28125,
"epoch": 0.7373211963589077,
"grad_norm": 0.40807537593869514,
"kl": 0.1767578125,
"learning_rate": 2e-06,
"loss": -0.0033,
"reward": 0.9716057777404785,
"reward_std": 0.1135769933462143,
"rewards/preference_model_reward": 0.9716057777404785,
"rewards/preference_model_reward/std": 0.16062211990356445,
"step": 567
},
{
"clip_ratio": 0.0,
"epoch": 0.7386215864759428,
"grad_norm": 0.3435926951880284,
"kl": 0.1796875,
"learning_rate": 2e-06,
"loss": -0.0036,
"step": 568
},
{
"clip_ratio": 0.0,
"completion_length": 578.71875,
"epoch": 0.7399219765929779,
"grad_norm": 1.3861764799614187,
"kl": 0.203125,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.805233359336853,
"reward_std": 0.35668760538101196,
"rewards/preference_model_reward": 0.805233359336853,
"rewards/preference_model_reward/std": 0.35277989506721497,
"step": 569
},
{
"clip_ratio": 9.771350596565753e-05,
"epoch": 0.741222366710013,
"grad_norm": 1.241876701253313,
"kl": 0.2060546875,
"learning_rate": 2e-06,
"loss": -0.0034,
"step": 570
},
{
"clip_ratio": 0.0,
"completion_length": 512.9375,
"epoch": 0.7425227568270482,
"grad_norm": 0.5960426098398067,
"kl": 0.1845703125,
"learning_rate": 2e-06,
"loss": 0.017,
"reward": 0.9699637293815613,
"reward_std": 0.12014515697956085,
"rewards/preference_model_reward": 0.9699637293815613,
"rewards/preference_model_reward/std": 0.16991090774536133,
"step": 571
},
{
"clip_ratio": 0.0003436754341237247,
"epoch": 0.7438231469440832,
"grad_norm": 0.5183287490814747,
"kl": 0.1875,
"learning_rate": 2e-06,
"loss": 0.0166,
"step": 572
},
{
"clip_ratio": 0.0,
"completion_length": 461.4375,
"epoch": 0.7451235370611183,
"grad_norm": 1.5416862528532593,
"kl": 0.212890625,
"learning_rate": 2e-06,
"loss": -0.0011,
"reward": 0.9567732214927673,
"reward_std": 0.15070945024490356,
"rewards/preference_model_reward": 0.9567732214927673,
"rewards/preference_model_reward/std": 0.14909282326698303,
"step": 573
},
{
"clip_ratio": 0.0004944581887684762,
"epoch": 0.7464239271781534,
"grad_norm": 0.5001084547889261,
"kl": 0.2158203125,
"learning_rate": 2e-06,
"loss": -0.0013,
"step": 574
},
{
"clip_ratio": 0.0,
"completion_length": 439.15625,
"epoch": 0.7477243172951885,
"grad_norm": 0.3547953762763419,
"kl": 0.2080078125,
"learning_rate": 2e-06,
"loss": 0.0005,
"reward": 0.9812067747116089,
"reward_std": 0.07517301291227341,
"rewards/preference_model_reward": 0.9812067747116089,
"rewards/preference_model_reward/std": 0.10631068795919418,
"step": 575
},
{
"clip_ratio": 0.0002910027978941798,
"epoch": 0.7490247074122237,
"grad_norm": 0.3164538486238686,
"kl": 0.2119140625,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 576
},
{
"clip_ratio": 0.0,
"completion_length": 392.625,
"epoch": 0.7503250975292588,
"grad_norm": 0.6989008521134648,
"kl": 0.232421875,
"learning_rate": 2e-06,
"loss": -0.0004,
"reward": 0.44408947229385376,
"reward_std": 0.18061134219169617,
"rewards/preference_model_reward": 0.44408947229385376,
"rewards/preference_model_reward/std": 0.47470250725746155,
"step": 577
},
{
"clip_ratio": 0.0004471123975235969,
"epoch": 0.7516254876462939,
"grad_norm": 0.6192349801951256,
"kl": 0.236328125,
"learning_rate": 2e-06,
"loss": -0.0009,
"step": 578
},
{
"clip_ratio": 0.0,
"completion_length": 499.84375,
"epoch": 0.752925877763329,
"grad_norm": 1.164670357893636,
"kl": 0.251953125,
"learning_rate": 2e-06,
"loss": 0.0097,
"reward": 0.8941581845283508,
"reward_std": 0.27277839183807373,
"rewards/preference_model_reward": 0.8941581845283508,
"rewards/preference_model_reward/std": 0.268955260515213,
"step": 579
},
{
"clip_ratio": 0.0004549244767986238,
"epoch": 0.7542262678803641,
"grad_norm": 1.0205324380629648,
"kl": 0.251953125,
"learning_rate": 2e-06,
"loss": 0.0088,
"step": 580
},
{
"clip_ratio": 0.0,
"completion_length": 536.03125,
"epoch": 0.7555266579973993,
"grad_norm": 1.1026359782739126,
"kl": 0.236328125,
"learning_rate": 2e-06,
"loss": 0.011,
"reward": 0.9393888115882874,
"reward_std": 0.16562925279140472,
"rewards/preference_model_reward": 0.9393888115882874,
"rewards/preference_model_reward/std": 0.23851299285888672,
"step": 581
},
{
"clip_ratio": 0.00014628437929786742,
"epoch": 0.7568270481144344,
"grad_norm": 0.9887761797962026,
"kl": 0.23828125,
"learning_rate": 2e-06,
"loss": 0.0103,
"step": 582
},
{
"clip_ratio": 0.0,
"completion_length": 478.5,
"epoch": 0.7581274382314694,
"grad_norm": 0.36250831588618054,
"kl": 0.234375,
"learning_rate": 2e-06,
"loss": -0.0024,
"reward": 0.9745615720748901,
"reward_std": 0.08803649246692657,
"rewards/preference_model_reward": 0.9745615720748901,
"rewards/preference_model_reward/std": 0.1251751035451889,
"step": 583
},
{
"clip_ratio": 0.0007471668068319559,
"epoch": 0.7594278283485045,
"grad_norm": 0.3337433977585642,
"kl": 0.2333984375,
"learning_rate": 2e-06,
"loss": -0.0027,
"step": 584
},
{
"clip_ratio": 0.0,
"completion_length": 474.875,
"epoch": 0.7607282184655396,
"grad_norm": 1.1970923812923153,
"kl": 0.25,
"learning_rate": 2e-06,
"loss": 0.0039,
"reward": 0.42788398265838623,
"reward_std": 0.3588111996650696,
"rewards/preference_model_reward": 0.42788398265838623,
"rewards/preference_model_reward/std": 0.405824214220047,
"step": 585
},
{
"clip_ratio": 0.0007635854999534786,
"epoch": 0.7620286085825748,
"grad_norm": 1.1707496444316037,
"kl": 0.1865234375,
"learning_rate": 2e-06,
"loss": 0.0031,
"step": 586
},
{
"clip_ratio": 0.0,
"completion_length": 535.5,
"epoch": 0.7633289986996099,
"grad_norm": 1.2912574830783405,
"kl": 0.25390625,
"learning_rate": 2e-06,
"loss": -0.0089,
"reward": 0.8629287481307983,
"reward_std": 0.2172880619764328,
"rewards/preference_model_reward": 0.8629287481307983,
"rewards/preference_model_reward/std": 0.33283141255378723,
"step": 587
},
{
"clip_ratio": 0.00011639429430942982,
"epoch": 0.764629388816645,
"grad_norm": 1.448212757233341,
"kl": 0.255859375,
"learning_rate": 2e-06,
"loss": -0.0098,
"step": 588
},
{
"clip_ratio": 0.0,
"completion_length": 401.46875,
"epoch": 0.7659297789336801,
"grad_norm": 0.5876962392618675,
"kl": 0.2734375,
"learning_rate": 2e-06,
"loss": -0.002,
"reward": 0.9528677463531494,
"reward_std": 0.1301821768283844,
"rewards/preference_model_reward": 0.9528677463531494,
"rewards/preference_model_reward/std": 0.18388831615447998,
"step": 589
},
{
"clip_ratio": 0.0006060305167920887,
"epoch": 0.7672301690507152,
"grad_norm": 0.5076147378502047,
"kl": 0.271484375,
"learning_rate": 2e-06,
"loss": -0.0025,
"step": 590
},
{
"clip_ratio": 0.0,
"completion_length": 485.46875,
"epoch": 0.7685305591677504,
"grad_norm": 1.2814868227092682,
"kl": 0.2333984375,
"learning_rate": 2e-06,
"loss": 0.0029,
"reward": 0.7781925797462463,
"reward_std": 0.36613741517066956,
"rewards/preference_model_reward": 0.7781925797462463,
"rewards/preference_model_reward/std": 0.363247811794281,
"step": 591
},
{
"clip_ratio": 0.0,
"epoch": 0.7698309492847855,
"grad_norm": 1.22748966955473,
"kl": 0.234375,
"learning_rate": 2e-06,
"loss": 0.0019,
"step": 592
},
{
"clip_ratio": 0.0,
"completion_length": 270.25,
"epoch": 0.7711313394018205,
"grad_norm": 0.011637130289628438,
"kl": 0.279296875,
"learning_rate": 2e-06,
"loss": 0.0001,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 593
},
{
"clip_ratio": 0.0,
"epoch": 0.7724317295188556,
"grad_norm": 0.010229706612576805,
"kl": 0.263671875,
"learning_rate": 2e-06,
"loss": 0.0001,
"step": 594
},
{
"clip_ratio": 0.0,
"completion_length": 489.0,
"epoch": 0.7737321196358907,
"grad_norm": 0.02168733906561227,
"kl": 0.234375,
"learning_rate": 2e-06,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/preference_model_reward": 1.0,
"rewards/preference_model_reward/std": 0.0,
"step": 595
},
{
"clip_ratio": 0.0,
"epoch": 0.7750325097529259,
"grad_norm": 0.014641318125133244,
"kl": 0.2177734375,
"learning_rate": 2e-06,
"loss": 0.0002,
"step": 596
},
{
"clip_ratio": 0.0,
"completion_length": 453.65625,
"epoch": 0.776332899869961,
"grad_norm": 0.8255334973681036,
"kl": 0.259765625,
"learning_rate": 2e-06,
"loss": -0.0101,
"reward": 0.8930084705352783,
"reward_std": 0.26911142468452454,
"rewards/preference_model_reward": 0.8930084705352783,
"rewards/preference_model_reward/std": 0.2652178108692169,
"step": 597
},
{
"clip_ratio": 0.0011451852042227983,
"epoch": 0.7776332899869961,
"grad_norm": 0.7541031893153506,
"kl": 0.232421875,
"learning_rate": 2e-06,
"loss": -0.0107,
"step": 598
},
{
"clip_ratio": 0.0,
"completion_length": 515.0,
"epoch": 0.7789336801040312,
"grad_norm": 0.8665016761502143,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": 0.0142,
"reward": 0.9606556296348572,
"reward_std": 0.14067065715789795,
"rewards/preference_model_reward": 0.9606556296348572,
"rewards/preference_model_reward/std": 0.16641117632389069,
"step": 599
},
{
"clip_ratio": 0.0006148220272734761,
"epoch": 0.7802340702210663,
"grad_norm": 0.7376855642027353,
"kl": 0.1875,
"learning_rate": 2e-06,
"loss": 0.0136,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 2048,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}