| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.7802340702210663, |
| "eval_steps": 500, |
| "global_step": 600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 369.8125, |
| "epoch": 0.0013003901170351106, |
| "grad_norm": 0.9659074481719256, |
| "kl": 0.0002956390380859375, |
| "learning_rate": 0.0, |
| "loss": 0.0102, |
| "reward": 0.24122674763202667, |
| "reward_std": 0.35857653617858887, |
| "rewards/preference_model_reward": 0.24122674763202667, |
| "rewards/preference_model_reward/std": 0.3657657206058502, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.002600780234070221, |
| "grad_norm": 0.9657764983988186, |
| "kl": 0.0002956390380859375, |
| "learning_rate": 1e-07, |
| "loss": 0.0102, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 355.59375, |
| "epoch": 0.0039011703511053317, |
| "grad_norm": 0.9911250453286248, |
| "kl": 0.0003662109375, |
| "learning_rate": 2e-07, |
| "loss": -0.0076, |
| "reward": 0.1862872838973999, |
| "reward_std": 0.22117774188518524, |
| "rewards/preference_model_reward": 0.1862872838973999, |
| "rewards/preference_model_reward/std": 0.3502456843852997, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.000235976796830073, |
| "epoch": 0.005201560468140442, |
| "grad_norm": 0.9993331560979716, |
| "kl": 0.0003719329833984375, |
| "learning_rate": 3e-07, |
| "loss": -0.0076, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 376.03125, |
| "epoch": 0.006501950585175552, |
| "grad_norm": 0.9789094069833814, |
| "kl": 0.000347137451171875, |
| "learning_rate": 4e-07, |
| "loss": 0.0023, |
| "reward": 0.2914609909057617, |
| "reward_std": 0.3521167039871216, |
| "rewards/preference_model_reward": 0.2914609909057617, |
| "rewards/preference_model_reward/std": 0.4191484749317169, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.00034768745535984635, |
| "epoch": 0.007802340702210663, |
| "grad_norm": 0.9859506359104134, |
| "kl": 0.00034332275390625, |
| "learning_rate": 5e-07, |
| "loss": 0.0023, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 250.21875, |
| "epoch": 0.009102730819245773, |
| "grad_norm": 1.3649252351615753, |
| "kl": 0.000370025634765625, |
| "learning_rate": 6e-07, |
| "loss": 0.0062, |
| "reward": 0.4924929738044739, |
| "reward_std": 0.4304487109184265, |
| "rewards/preference_model_reward": 0.4924929738044739, |
| "rewards/preference_model_reward/std": 0.465331107378006, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.010403120936280884, |
| "grad_norm": 1.059378833132842, |
| "kl": 0.0004215240478515625, |
| "learning_rate": 7e-07, |
| "loss": 0.0062, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.71875, |
| "epoch": 0.011703511053315995, |
| "grad_norm": 1.1643210223877094, |
| "kl": 0.00040435791015625, |
| "learning_rate": 8e-07, |
| "loss": -0.0024, |
| "reward": 0.42163607478141785, |
| "reward_std": 0.40762829780578613, |
| "rewards/preference_model_reward": 0.42163607478141785, |
| "rewards/preference_model_reward/std": 0.43679776787757874, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.00018502894090488553, |
| "epoch": 0.013003901170351105, |
| "grad_norm": 1.0909138168951602, |
| "kl": 0.00040435791015625, |
| "learning_rate": 9e-07, |
| "loss": -0.0025, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 337.875, |
| "epoch": 0.014304291287386216, |
| "grad_norm": 1.1375050760237047, |
| "kl": 0.00042724609375, |
| "learning_rate": 1e-06, |
| "loss": 0.0027, |
| "reward": 0.39657002687454224, |
| "reward_std": 0.25011393427848816, |
| "rewards/preference_model_reward": 0.39657002687454224, |
| "rewards/preference_model_reward/std": 0.4063413441181183, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.00011823614477179945, |
| "epoch": 0.015604681404421327, |
| "grad_norm": 1.1070268098404832, |
| "kl": 0.000438690185546875, |
| "learning_rate": 1.1e-06, |
| "loss": 0.0026, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 229.90625, |
| "epoch": 0.016905071521456438, |
| "grad_norm": 0.8427732256393701, |
| "kl": 0.000579833984375, |
| "learning_rate": 1.2e-06, |
| "loss": 0.0031, |
| "reward": 0.2741852104663849, |
| "reward_std": 0.23615789413452148, |
| "rewards/preference_model_reward": 0.2741852104663849, |
| "rewards/preference_model_reward/std": 0.4120958745479584, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.00015812776109669358, |
| "epoch": 0.018205461638491547, |
| "grad_norm": 0.811295051358225, |
| "kl": 0.0005950927734375, |
| "learning_rate": 1.3e-06, |
| "loss": 0.003, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.40625, |
| "epoch": 0.01950585175552666, |
| "grad_norm": 0.015205465689863728, |
| "kl": 0.000701904296875, |
| "learning_rate": 1.4e-06, |
| "loss": 0.0001, |
| "reward": 0.00646161288022995, |
| "reward_std": 0.005409592762589455, |
| "rewards/preference_model_reward": 0.00646161288022995, |
| "rewards/preference_model_reward/std": 0.0058777108788490295, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.000241475339862518, |
| "epoch": 0.02080624187256177, |
| "grad_norm": 0.01476566433090852, |
| "kl": 0.000766754150390625, |
| "learning_rate": 1.5e-06, |
| "loss": 0.0001, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 251.5625, |
| "epoch": 0.022106631989596878, |
| "grad_norm": 1.132463593759137, |
| "kl": 0.001007080078125, |
| "learning_rate": 1.6e-06, |
| "loss": -0.0035, |
| "reward": 0.4667004942893982, |
| "reward_std": 0.4533562660217285, |
| "rewards/preference_model_reward": 0.4667004942893982, |
| "rewards/preference_model_reward/std": 0.4472948908805847, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 9.563886851537973e-05, |
| "epoch": 0.02340702210663199, |
| "grad_norm": 1.2516838066095344, |
| "kl": 0.0012054443359375, |
| "learning_rate": 1.6999999999999998e-06, |
| "loss": -0.0038, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 317.25, |
| "epoch": 0.0247074122236671, |
| "grad_norm": 1.3550285048912802, |
| "kl": 0.0013885498046875, |
| "learning_rate": 1.8e-06, |
| "loss": 0.0264, |
| "reward": 0.6398348808288574, |
| "reward_std": 0.41185781359672546, |
| "rewards/preference_model_reward": 0.6398348808288574, |
| "rewards/preference_model_reward/std": 0.4352080225944519, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0002283816138515249, |
| "epoch": 0.02600780234070221, |
| "grad_norm": 1.3188454766806155, |
| "kl": 0.0018310546875, |
| "learning_rate": 1.8999999999999998e-06, |
| "loss": 0.0262, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.40625, |
| "epoch": 0.027308192457737322, |
| "grad_norm": 1.1882515797674884, |
| "kl": 0.0024871826171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0227, |
| "reward": 0.46726664900779724, |
| "reward_std": 0.27668142318725586, |
| "rewards/preference_model_reward": 0.46726664900779724, |
| "rewards/preference_model_reward/std": 0.49030089378356934, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 5.293245703796856e-05, |
| "epoch": 0.02860858257477243, |
| "grad_norm": 1.144548816920826, |
| "kl": 0.003204345703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0224, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 344.28125, |
| "epoch": 0.02990897269180754, |
| "grad_norm": 1.2911067063161903, |
| "kl": 0.00341796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0015, |
| "reward": 0.5998943448066711, |
| "reward_std": 0.45733675360679626, |
| "rewards/preference_model_reward": 0.5998943448066711, |
| "rewards/preference_model_reward/std": 0.449917197227478, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.00026819598861038685, |
| "epoch": 0.031209362808842653, |
| "grad_norm": 1.2967596040632892, |
| "kl": 0.004364013671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0017, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.21875, |
| "epoch": 0.032509752925877766, |
| "grad_norm": 1.5694696539232997, |
| "kl": 0.005126953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0094, |
| "reward": 0.3718755841255188, |
| "reward_std": 0.3863350749015808, |
| "rewards/preference_model_reward": 0.3718755841255188, |
| "rewards/preference_model_reward/std": 0.4579065144062042, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0005104803130961955, |
| "epoch": 0.033810143042912875, |
| "grad_norm": 1.4430846873009133, |
| "kl": 0.0064697265625, |
| "learning_rate": 2e-06, |
| "loss": 0.009, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.09375, |
| "epoch": 0.035110533159947985, |
| "grad_norm": 0.6793569926930156, |
| "kl": 0.0111083984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0008, |
| "reward": 0.2612653970718384, |
| "reward_std": 0.29103392362594604, |
| "rewards/preference_model_reward": 0.2612653970718384, |
| "rewards/preference_model_reward/std": 0.3302207589149475, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0024005018640309572, |
| "epoch": 0.036410923276983094, |
| "grad_norm": 0.8626605965309095, |
| "kl": 0.015380859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 306.21875, |
| "epoch": 0.0377113133940182, |
| "grad_norm": 1.1377329684795203, |
| "kl": 0.0126953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "reward": 0.7407833337783813, |
| "reward_std": 0.4136916995048523, |
| "rewards/preference_model_reward": 0.7407833337783813, |
| "rewards/preference_model_reward/std": 0.42335739731788635, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0006882185116410255, |
| "epoch": 0.03901170351105332, |
| "grad_norm": 1.153060303007809, |
| "kl": 0.01519775390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0034, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.21875, |
| "epoch": 0.04031209362808843, |
| "grad_norm": 0.3843055910687809, |
| "kl": 0.019775390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0021, |
| "reward": 0.49551475048065186, |
| "reward_std": 0.14911304414272308, |
| "rewards/preference_model_reward": 0.49551475048065186, |
| "rewards/preference_model_reward/std": 0.45444655418395996, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.00019171778694726527, |
| "epoch": 0.04161248374512354, |
| "grad_norm": 0.35937765329925253, |
| "kl": 0.02392578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 327.9375, |
| "epoch": 0.04291287386215865, |
| "grad_norm": 1.037959425094293, |
| "kl": 0.0235595703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0051, |
| "reward": 0.2412111759185791, |
| "reward_std": 0.3637581765651703, |
| "rewards/preference_model_reward": 0.2412111759185791, |
| "rewards/preference_model_reward/std": 0.35813963413238525, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0002937716490123421, |
| "epoch": 0.044213263979193757, |
| "grad_norm": 1.012063287494272, |
| "kl": 0.0269775390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0048, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 351.1875, |
| "epoch": 0.045513654096228866, |
| "grad_norm": 0.049977121036559484, |
| "kl": 0.0267333984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "reward": 0.5103617906570435, |
| "reward_std": 0.013033521361649036, |
| "rewards/preference_model_reward": 0.5103617906570435, |
| "rewards/preference_model_reward/std": 0.49780330061912537, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.00022163119865581393, |
| "epoch": 0.04681404421326398, |
| "grad_norm": 0.050415747625479665, |
| "kl": 0.0289306640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 386.84375, |
| "epoch": 0.04811443433029909, |
| "grad_norm": 1.320382086078505, |
| "kl": 0.03466796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "reward": 0.7545909881591797, |
| "reward_std": 0.3815248906612396, |
| "rewards/preference_model_reward": 0.7545909881591797, |
| "rewards/preference_model_reward/std": 0.38322117924690247, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.00017478244262747467, |
| "epoch": 0.0494148244473342, |
| "grad_norm": 1.3588405193554765, |
| "kl": 0.036865234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 275.09375, |
| "epoch": 0.05071521456436931, |
| "grad_norm": 0.9774108213555497, |
| "kl": 0.031494140625, |
| "learning_rate": 2e-06, |
| "loss": 0.001, |
| "reward": 0.8388960361480713, |
| "reward_std": 0.26034486293792725, |
| "rewards/preference_model_reward": 0.8388960361480713, |
| "rewards/preference_model_reward/std": 0.3219398558139801, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.00045662192860618234, |
| "epoch": 0.05201560468140442, |
| "grad_norm": 0.9753549692487811, |
| "kl": 0.034423828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 329.625, |
| "epoch": 0.053315994798439535, |
| "grad_norm": 1.163784763708709, |
| "kl": 0.107421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0101, |
| "reward": 0.6221742630004883, |
| "reward_std": 0.35088545083999634, |
| "rewards/preference_model_reward": 0.6221742630004883, |
| "rewards/preference_model_reward/std": 0.4535573124885559, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0014115219237282872, |
| "epoch": 0.054616384915474644, |
| "grad_norm": 1.1725293952887486, |
| "kl": 0.09033203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0106, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 360.25, |
| "epoch": 0.055916775032509754, |
| "grad_norm": 0.8648268899377819, |
| "kl": 0.03857421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0036, |
| "reward": 0.86008620262146, |
| "reward_std": 0.29710525274276733, |
| "rewards/preference_model_reward": 0.86008620262146, |
| "rewards/preference_model_reward/std": 0.32698217034339905, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0002532459911890328, |
| "epoch": 0.05721716514954486, |
| "grad_norm": 0.8200773607213043, |
| "kl": 0.041015625, |
| "learning_rate": 2e-06, |
| "loss": -0.004, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.84375, |
| "epoch": 0.05851755526657997, |
| "grad_norm": 0.5893143524753656, |
| "kl": 0.0625, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "reward": 0.6891697645187378, |
| "reward_std": 0.1563209444284439, |
| "rewards/preference_model_reward": 0.6891697645187378, |
| "rewards/preference_model_reward/std": 0.38344231247901917, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.00037541432539001107, |
| "epoch": 0.05981794538361508, |
| "grad_norm": 0.5686876994070226, |
| "kl": 0.06689453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0017, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 241.5, |
| "epoch": 0.0611183355006502, |
| "grad_norm": 0.9353983139355063, |
| "kl": 0.07275390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0027, |
| "reward": 0.9130573272705078, |
| "reward_std": 0.26053106784820557, |
| "rewards/preference_model_reward": 0.9130573272705078, |
| "rewards/preference_model_reward/std": 0.2611267864704132, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0003936196444556117, |
| "epoch": 0.06241872561768531, |
| "grad_norm": 0.6672689260021744, |
| "kl": 0.0771484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.53125, |
| "epoch": 0.06371911573472042, |
| "grad_norm": 0.7173309068130417, |
| "kl": 0.078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0089, |
| "reward": 0.8731744885444641, |
| "reward_std": 0.2636582851409912, |
| "rewards/preference_model_reward": 0.8731744885444641, |
| "rewards/preference_model_reward/std": 0.2623332142829895, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0003933516563847661, |
| "epoch": 0.06501950585175553, |
| "grad_norm": 0.6845578633909412, |
| "kl": 0.08203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0092, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 309.125, |
| "epoch": 0.06631989596879063, |
| "grad_norm": 1.0072549333125307, |
| "kl": 0.0771484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "reward": 0.7915850877761841, |
| "reward_std": 0.3451814651489258, |
| "rewards/preference_model_reward": 0.7915850877761841, |
| "rewards/preference_model_reward/std": 0.34321537613868713, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 9.516558930044994e-05, |
| "epoch": 0.06762028608582575, |
| "grad_norm": 0.9815790015455084, |
| "kl": 0.080078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 380.3125, |
| "epoch": 0.06892067620286085, |
| "grad_norm": 1.3876607215088368, |
| "kl": 0.07373046875, |
| "learning_rate": 2e-06, |
| "loss": 0.026, |
| "reward": 0.8868893384933472, |
| "reward_std": 0.2691570520401001, |
| "rewards/preference_model_reward": 0.8868893384933472, |
| "rewards/preference_model_reward/std": 0.30143871903419495, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.00025676062796264887, |
| "epoch": 0.07022106631989597, |
| "grad_norm": 1.32392587242523, |
| "kl": 0.078125, |
| "learning_rate": 2e-06, |
| "loss": 0.0255, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 296.34375, |
| "epoch": 0.07152145643693109, |
| "grad_norm": 0.6600106697929482, |
| "kl": 0.09228515625, |
| "learning_rate": 2e-06, |
| "loss": 0.0022, |
| "reward": 0.9307242631912231, |
| "reward_std": 0.15007582306861877, |
| "rewards/preference_model_reward": 0.9307242631912231, |
| "rewards/preference_model_reward/std": 0.22033238410949707, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.07282184655396619, |
| "grad_norm": 0.6509839555905929, |
| "kl": 0.0966796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0019, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 374.15625, |
| "epoch": 0.0741222366710013, |
| "grad_norm": 0.8166372061077617, |
| "kl": 0.06640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0058, |
| "reward": 0.8706126809120178, |
| "reward_std": 0.27126792073249817, |
| "rewards/preference_model_reward": 0.8706126809120178, |
| "rewards/preference_model_reward/std": 0.29494142532348633, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0006189570995047688, |
| "epoch": 0.0754226267880364, |
| "grad_norm": 0.7575930393174628, |
| "kl": 0.06884765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0062, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 273.53125, |
| "epoch": 0.07672301690507152, |
| "grad_norm": 0.35564737874795666, |
| "kl": 0.10546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0117, |
| "reward": 0.929673433303833, |
| "reward_std": 0.20910362899303436, |
| "rewards/preference_model_reward": 0.929673433303833, |
| "rewards/preference_model_reward/std": 0.23424802720546722, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0006005860050208867, |
| "epoch": 0.07802340702210664, |
| "grad_norm": 0.34766362031349507, |
| "kl": 0.109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0119, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.0625, |
| "epoch": 0.07932379713914174, |
| "grad_norm": 1.186646095697512, |
| "kl": 0.09228515625, |
| "learning_rate": 2e-06, |
| "loss": -0.0028, |
| "reward": 0.7294732332229614, |
| "reward_std": 0.35979652404785156, |
| "rewards/preference_model_reward": 0.7294732332229614, |
| "rewards/preference_model_reward/std": 0.35507434606552124, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.00039078935515135527, |
| "epoch": 0.08062418725617686, |
| "grad_norm": 1.1224238133769298, |
| "kl": 0.0947265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.75, |
| "epoch": 0.08192457737321196, |
| "grad_norm": 1.2409103678829725, |
| "kl": 0.0849609375, |
| "learning_rate": 2e-06, |
| "loss": 0.0099, |
| "reward": 0.6618127822875977, |
| "reward_std": 0.3134193420410156, |
| "rewards/preference_model_reward": 0.6618127822875977, |
| "rewards/preference_model_reward/std": 0.44231337308883667, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0005315542221069336, |
| "epoch": 0.08322496749024708, |
| "grad_norm": 1.20783409819399, |
| "kl": 0.087890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0093, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 321.375, |
| "epoch": 0.08452535760728218, |
| "grad_norm": 1.1952606760232154, |
| "kl": 0.1201171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0018, |
| "reward": 0.8502117395401001, |
| "reward_std": 0.28802555799484253, |
| "rewards/preference_model_reward": 0.8502117395401001, |
| "rewards/preference_model_reward/std": 0.29467442631721497, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0005706611555069685, |
| "epoch": 0.0858257477243173, |
| "grad_norm": 0.9932076087850403, |
| "kl": 0.123046875, |
| "learning_rate": 2e-06, |
| "loss": 0.0014, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 382.40625, |
| "epoch": 0.08712613784135241, |
| "grad_norm": 0.9026575096116797, |
| "kl": 0.12255859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0029, |
| "reward": 0.5317609906196594, |
| "reward_std": 0.22324004769325256, |
| "rewards/preference_model_reward": 0.5317609906196594, |
| "rewards/preference_model_reward/std": 0.4589392840862274, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0009551330585964024, |
| "epoch": 0.08842652795838751, |
| "grad_norm": 0.8034448562270808, |
| "kl": 0.1240234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0025, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 352.125, |
| "epoch": 0.08972691807542263, |
| "grad_norm": 1.051559244432883, |
| "kl": 0.1005859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0008, |
| "reward": 0.8645539879798889, |
| "reward_std": 0.30358821153640747, |
| "rewards/preference_model_reward": 0.8645539879798889, |
| "rewards/preference_model_reward/std": 0.3146204948425293, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0004191896296106279, |
| "epoch": 0.09102730819245773, |
| "grad_norm": 1.138873373412801, |
| "kl": 0.10302734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0013, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 394.9375, |
| "epoch": 0.09232769830949285, |
| "grad_norm": 0.7693875160579493, |
| "kl": 0.10498046875, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "reward": 0.9269964694976807, |
| "reward_std": 0.1615072637796402, |
| "rewards/preference_model_reward": 0.9269964694976807, |
| "rewards/preference_model_reward/std": 0.2366172969341278, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.09362808842652796, |
| "grad_norm": 0.721838481847161, |
| "kl": 0.107421875, |
| "learning_rate": 2e-06, |
| "loss": 0.0008, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.0, |
| "epoch": 0.09492847854356307, |
| "grad_norm": 0.67467714871582, |
| "kl": 0.10546875, |
| "learning_rate": 2e-06, |
| "loss": 0.0017, |
| "reward": 0.9037147760391235, |
| "reward_std": 0.14057296514511108, |
| "rewards/preference_model_reward": 0.9037147760391235, |
| "rewards/preference_model_reward/std": 0.21866993606090546, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.00014462298713624477, |
| "epoch": 0.09622886866059818, |
| "grad_norm": 0.6527573526740783, |
| "kl": 0.10888671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0013, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 352.8125, |
| "epoch": 0.09752925877763328, |
| "grad_norm": 1.0730024250380301, |
| "kl": 0.1083984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "reward": 0.8341162204742432, |
| "reward_std": 0.3151201903820038, |
| "rewards/preference_model_reward": 0.8341162204742432, |
| "rewards/preference_model_reward/std": 0.32914233207702637, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.00022457953309640288, |
| "epoch": 0.0988296488946684, |
| "grad_norm": 1.4755587484249004, |
| "kl": 0.11328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0019, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.6875, |
| "epoch": 0.10013003901170352, |
| "grad_norm": 0.6070744376376411, |
| "kl": 0.109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0073, |
| "reward": 0.9789870977401733, |
| "reward_std": 0.07569272816181183, |
| "rewards/preference_model_reward": 0.9789870977401733, |
| "rewards/preference_model_reward/std": 0.1074473112821579, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0003504111082293093, |
| "epoch": 0.10143042912873862, |
| "grad_norm": 0.5334363278812777, |
| "kl": 0.115234375, |
| "learning_rate": 2e-06, |
| "loss": 0.007, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 348.0625, |
| "epoch": 0.10273081924577374, |
| "grad_norm": 0.7094682971060341, |
| "kl": 0.10791015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "reward": 0.9031955003738403, |
| "reward_std": 0.1537085920572281, |
| "rewards/preference_model_reward": 0.9031955003738403, |
| "rewards/preference_model_reward/std": 0.23537583649158478, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 9.012256487039849e-05, |
| "epoch": 0.10403120936280884, |
| "grad_norm": 0.6528569724682338, |
| "kl": 0.1103515625, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 254.34375, |
| "epoch": 0.10533159947984395, |
| "grad_norm": 0.015979283278672672, |
| "kl": 0.126953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 0.503002941608429, |
| "reward_std": 0.0007813164265826344, |
| "rewards/preference_model_reward": 0.503002941608429, |
| "rewards/preference_model_reward/std": 0.5049507021903992, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0016290850471705198, |
| "epoch": 0.10663198959687907, |
| "grad_norm": 0.013573220175038038, |
| "kl": 0.115234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 252.90625, |
| "epoch": 0.10793237971391417, |
| "grad_norm": 0.34340056371333394, |
| "kl": 0.181640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0008, |
| "reward": 0.9807977676391602, |
| "reward_std": 0.07286863774061203, |
| "rewards/preference_model_reward": 0.9807977676391602, |
| "rewards/preference_model_reward/std": 0.07266176491975784, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0008525701705366373, |
| "epoch": 0.10923276983094929, |
| "grad_norm": 0.18116145894909788, |
| "kl": 0.138671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 331.875, |
| "epoch": 0.11053315994798439, |
| "grad_norm": 0.48967266770314716, |
| "kl": 0.1328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0, |
| "reward": 0.9723169803619385, |
| "reward_std": 0.11073215305805206, |
| "rewards/preference_model_reward": 0.9723169803619385, |
| "rewards/preference_model_reward/std": 0.15659891068935394, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0005794943426735699, |
| "epoch": 0.11183355006501951, |
| "grad_norm": 0.4092702632564478, |
| "kl": 0.12109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.0, |
| "epoch": 0.11313394018205461, |
| "grad_norm": 0.32187360662760156, |
| "kl": 0.10546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "reward": 0.5838743448257446, |
| "reward_std": 0.09063759446144104, |
| "rewards/preference_model_reward": 0.5838743448257446, |
| "rewards/preference_model_reward/std": 0.4262309968471527, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0007115560583770275, |
| "epoch": 0.11443433029908973, |
| "grad_norm": 0.31072273769841946, |
| "kl": 0.09814453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0016, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 335.3125, |
| "epoch": 0.11573472041612484, |
| "grad_norm": 1.1176337391124187, |
| "kl": 0.09716796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0161, |
| "reward": 0.8303125500679016, |
| "reward_std": 0.26434481143951416, |
| "rewards/preference_model_reward": 0.8303125500679016, |
| "rewards/preference_model_reward/std": 0.2824815809726715, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0020064229611307383, |
| "epoch": 0.11703511053315994, |
| "grad_norm": 1.1042794016878086, |
| "kl": 0.09375, |
| "learning_rate": 2e-06, |
| "loss": 0.0155, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.125, |
| "epoch": 0.11833550065019506, |
| "grad_norm": 0.6566935601055136, |
| "kl": 0.09814453125, |
| "learning_rate": 2e-06, |
| "loss": -0.007, |
| "reward": 0.8883383274078369, |
| "reward_std": 0.14250892400741577, |
| "rewards/preference_model_reward": 0.8883383274078369, |
| "rewards/preference_model_reward/std": 0.22842475771903992, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 9.124087227974087e-05, |
| "epoch": 0.11963589076723016, |
| "grad_norm": 0.6051172269491196, |
| "kl": 0.09521484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0073, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.71875, |
| "epoch": 0.12093628088426528, |
| "grad_norm": 1.1072727024624063, |
| "kl": 0.091796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0009, |
| "reward": 0.7813946008682251, |
| "reward_std": 0.2200125753879547, |
| "rewards/preference_model_reward": 0.7813946008682251, |
| "rewards/preference_model_reward/std": 0.3781771957874298, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.00021242158254608512, |
| "epoch": 0.1222366710013004, |
| "grad_norm": 1.130829779328361, |
| "kl": 0.09033203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0004, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 380.34375, |
| "epoch": 0.1235370611183355, |
| "grad_norm": 1.1837052245253927, |
| "kl": 0.083984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0048, |
| "reward": 0.6728702187538147, |
| "reward_std": 0.3235069513320923, |
| "rewards/preference_model_reward": 0.6728702187538147, |
| "rewards/preference_model_reward/std": 0.3584347665309906, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0003149464901071042, |
| "epoch": 0.12483745123537061, |
| "grad_norm": 1.0580700210395666, |
| "kl": 0.08447265625, |
| "learning_rate": 2e-06, |
| "loss": 0.0043, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 491.96875, |
| "epoch": 0.12613784135240572, |
| "grad_norm": 14.99699485585395, |
| "kl": 2.140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "reward": 0.6248654127120972, |
| "reward_std": 0.30046606063842773, |
| "rewards/preference_model_reward": 0.6248654127120972, |
| "rewards/preference_model_reward/std": 0.3810950815677643, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0009762371191754937, |
| "epoch": 0.12743823146944083, |
| "grad_norm": 56.741945106919694, |
| "kl": 0.16015625, |
| "learning_rate": 2e-06, |
| "loss": 0.002, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.78125, |
| "epoch": 0.12873862158647595, |
| "grad_norm": 1.25074838363305, |
| "kl": 0.0830078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0056, |
| "reward": 0.8078266978263855, |
| "reward_std": 0.33678656816482544, |
| "rewards/preference_model_reward": 0.8078266978263855, |
| "rewards/preference_model_reward/std": 0.37187322974205017, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0002822041278705001, |
| "epoch": 0.13003901170351106, |
| "grad_norm": 1.2248064666505254, |
| "kl": 0.08447265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0062, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 321.34375, |
| "epoch": 0.13133940182054615, |
| "grad_norm": 0.8820706667295893, |
| "kl": 0.083984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0028, |
| "reward": 0.8281782269477844, |
| "reward_std": 0.25918400287628174, |
| "rewards/preference_model_reward": 0.8281782269477844, |
| "rewards/preference_model_reward/std": 0.3205867111682892, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0001883239165181294, |
| "epoch": 0.13263979193758127, |
| "grad_norm": 0.8633439241133061, |
| "kl": 0.0849609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0033, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 516.125, |
| "epoch": 0.13394018205461639, |
| "grad_norm": 0.9179254188048204, |
| "kl": 0.1162109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0012, |
| "reward": 0.8762021064758301, |
| "reward_std": 0.1672360599040985, |
| "rewards/preference_model_reward": 0.8762021064758301, |
| "rewards/preference_model_reward/std": 0.26448386907577515, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0004665090818889439, |
| "epoch": 0.1352405721716515, |
| "grad_norm": 0.8518295339560051, |
| "kl": 0.119140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0017, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.09375, |
| "epoch": 0.13654096228868662, |
| "grad_norm": 1.3294307657803117, |
| "kl": 0.09375, |
| "learning_rate": 2e-06, |
| "loss": -0.0053, |
| "reward": 0.834820032119751, |
| "reward_std": 0.20883573591709137, |
| "rewards/preference_model_reward": 0.834820032119751, |
| "rewards/preference_model_reward/std": 0.33552286028862, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.00029856746550649405, |
| "epoch": 0.1378413524057217, |
| "grad_norm": 1.2984474405085682, |
| "kl": 0.09423828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0061, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.125, |
| "epoch": 0.13914174252275682, |
| "grad_norm": 1.336937835070471, |
| "kl": 0.09716796875, |
| "learning_rate": 2e-06, |
| "loss": 0.006, |
| "reward": 0.8500386476516724, |
| "reward_std": 0.262530118227005, |
| "rewards/preference_model_reward": 0.8500386476516724, |
| "rewards/preference_model_reward/std": 0.33816027641296387, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.00040964456275105476, |
| "epoch": 0.14044213263979194, |
| "grad_norm": 1.3064575603389494, |
| "kl": 0.09765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0052, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 581.71875, |
| "epoch": 0.14174252275682706, |
| "grad_norm": 0.7932427250097717, |
| "kl": 0.11328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "reward": 0.954128086566925, |
| "reward_std": 0.1318557858467102, |
| "rewards/preference_model_reward": 0.954128086566925, |
| "rewards/preference_model_reward/std": 0.18926787376403809, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0003079274611081928, |
| "epoch": 0.14304291287386217, |
| "grad_norm": 0.7211363567303583, |
| "kl": 0.1142578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0008, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.71875, |
| "epoch": 0.14434330299089726, |
| "grad_norm": 1.5659320313843499, |
| "kl": 0.09423828125, |
| "learning_rate": 2e-06, |
| "loss": 0.012, |
| "reward": 0.5640057921409607, |
| "reward_std": 0.3570883274078369, |
| "rewards/preference_model_reward": 0.5640057921409607, |
| "rewards/preference_model_reward/std": 0.4593791365623474, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0002680362085811794, |
| "epoch": 0.14564369310793238, |
| "grad_norm": 1.465110556785164, |
| "kl": 0.095703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0113, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.59375, |
| "epoch": 0.1469440832249675, |
| "grad_norm": 1.409521260931884, |
| "kl": 0.111328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0261, |
| "reward": 0.8961158990859985, |
| "reward_std": 0.2921155095100403, |
| "rewards/preference_model_reward": 0.8961158990859985, |
| "rewards/preference_model_reward/std": 0.29145917296409607, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0003246698761358857, |
| "epoch": 0.1482444733420026, |
| "grad_norm": 1.2950122397091601, |
| "kl": 0.11328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0253, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 351.40625, |
| "epoch": 0.14954486345903772, |
| "grad_norm": 0.7110353333635794, |
| "kl": 0.1240234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0015, |
| "reward": 0.9187972545623779, |
| "reward_std": 0.23119348287582397, |
| "rewards/preference_model_reward": 0.9187972545623779, |
| "rewards/preference_model_reward/std": 0.22826893627643585, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0002654148265719414, |
| "epoch": 0.1508452535760728, |
| "grad_norm": 0.6454362421608829, |
| "kl": 0.12158203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0018, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 415.0, |
| "epoch": 0.15214564369310793, |
| "grad_norm": 1.3938640333126824, |
| "kl": 0.115234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0042, |
| "reward": 0.7364434003829956, |
| "reward_std": 0.3461056351661682, |
| "rewards/preference_model_reward": 0.7364434003829956, |
| "rewards/preference_model_reward/std": 0.3824731409549713, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.00016578214126639068, |
| "epoch": 0.15344603381014305, |
| "grad_norm": 1.1848160683983286, |
| "kl": 0.1162109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0035, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.65625, |
| "epoch": 0.15474642392717816, |
| "grad_norm": 1.2327690933799054, |
| "kl": 0.13671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "reward": 0.7404364347457886, |
| "reward_std": 0.22904622554779053, |
| "rewards/preference_model_reward": 0.7404364347457886, |
| "rewards/preference_model_reward/std": 0.4136257469654083, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0004128115251660347, |
| "epoch": 0.15604681404421328, |
| "grad_norm": 1.1864725204703228, |
| "kl": 0.1376953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.6875, |
| "epoch": 0.15734720416124837, |
| "grad_norm": 1.2507452373907348, |
| "kl": 0.10791015625, |
| "learning_rate": 2e-06, |
| "loss": 0.009, |
| "reward": 0.667312741279602, |
| "reward_std": 0.2824709415435791, |
| "rewards/preference_model_reward": 0.667312741279602, |
| "rewards/preference_model_reward/std": 0.4015900194644928, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0003672163584269583, |
| "epoch": 0.15864759427828348, |
| "grad_norm": 1.173660124640144, |
| "kl": 0.109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0084, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 409.59375, |
| "epoch": 0.1599479843953186, |
| "grad_norm": 0.8600581560919843, |
| "kl": 0.123046875, |
| "learning_rate": 2e-06, |
| "loss": -0.02, |
| "reward": 0.8616656064987183, |
| "reward_std": 0.2982789874076843, |
| "rewards/preference_model_reward": 0.8616656064987183, |
| "rewards/preference_model_reward/std": 0.30772268772125244, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 6.778741953894496e-05, |
| "epoch": 0.16124837451235371, |
| "grad_norm": 0.8288777030969212, |
| "kl": 0.123046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0205, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.96875, |
| "epoch": 0.1625487646293888, |
| "grad_norm": 1.7782879022760736, |
| "kl": 0.10986328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0159, |
| "reward": 0.8924694061279297, |
| "reward_std": 0.2801273465156555, |
| "rewards/preference_model_reward": 0.8924694061279297, |
| "rewards/preference_model_reward/std": 0.2758771777153015, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.00029164942679926753, |
| "epoch": 0.16384915474642392, |
| "grad_norm": 0.7890220661652254, |
| "kl": 0.1123046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0162, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.90625, |
| "epoch": 0.16514954486345904, |
| "grad_norm": 0.5249880512825823, |
| "kl": 0.12109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0085, |
| "reward": 0.9754809141159058, |
| "reward_std": 0.06936999410390854, |
| "rewards/preference_model_reward": 0.9754809141159058, |
| "rewards/preference_model_reward/std": 0.09967197477817535, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 6.330716860247776e-05, |
| "epoch": 0.16644993498049415, |
| "grad_norm": 0.48145633919403286, |
| "kl": 0.1220703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0082, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.0, |
| "epoch": 0.16775032509752927, |
| "grad_norm": 0.8581044560913536, |
| "kl": 0.12890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0003, |
| "reward": 0.9375989437103271, |
| "reward_std": 0.14338433742523193, |
| "rewards/preference_model_reward": 0.9375989437103271, |
| "rewards/preference_model_reward/std": 0.2093113660812378, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.00023169601627159864, |
| "epoch": 0.16905071521456436, |
| "grad_norm": 0.6716469385695882, |
| "kl": 0.1298828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0006, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 417.53125, |
| "epoch": 0.17035110533159947, |
| "grad_norm": 0.9531216293434532, |
| "kl": 0.1220703125, |
| "learning_rate": 2e-06, |
| "loss": -0.0145, |
| "reward": 0.874575138092041, |
| "reward_std": 0.312376469373703, |
| "rewards/preference_model_reward": 0.874575138092041, |
| "rewards/preference_model_reward/std": 0.325155109167099, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.00021792339975945652, |
| "epoch": 0.1716514954486346, |
| "grad_norm": 0.9321159155810473, |
| "kl": 0.1240234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0151, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.84375, |
| "epoch": 0.1729518855656697, |
| "grad_norm": 0.9100648016845723, |
| "kl": 0.1220703125, |
| "learning_rate": 2e-06, |
| "loss": -0.0057, |
| "reward": 0.1512603610754013, |
| "reward_std": 0.15578344464302063, |
| "rewards/preference_model_reward": 0.1512603610754013, |
| "rewards/preference_model_reward/std": 0.2468794584274292, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0009341657860204577, |
| "epoch": 0.17425227568270482, |
| "grad_norm": 0.8979224574809616, |
| "kl": 0.1259765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0062, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.71875, |
| "epoch": 0.1755526657997399, |
| "grad_norm": 1.1433790438627716, |
| "kl": 0.26171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0028, |
| "reward": 0.90513676404953, |
| "reward_std": 0.23498843610286713, |
| "rewards/preference_model_reward": 0.90513676404953, |
| "rewards/preference_model_reward/std": 0.24004317820072174, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0007208128226920962, |
| "epoch": 0.17685305591677503, |
| "grad_norm": 1.0273440654539872, |
| "kl": 0.2197265625, |
| "learning_rate": 2e-06, |
| "loss": 0.0022, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.1875, |
| "epoch": 0.17815344603381014, |
| "grad_norm": 0.3778536448237069, |
| "kl": 0.1318359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0016, |
| "reward": 0.9848020076751709, |
| "reward_std": 0.060791999101638794, |
| "rewards/preference_model_reward": 0.9848020076751709, |
| "rewards/preference_model_reward/std": 0.0859728679060936, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.00015050009824335575, |
| "epoch": 0.17945383615084526, |
| "grad_norm": 0.38068416605149247, |
| "kl": 0.1337890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0013, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.78125, |
| "epoch": 0.18075422626788037, |
| "grad_norm": 1.121182505147895, |
| "kl": 0.140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0082, |
| "reward": 0.6605119705200195, |
| "reward_std": 0.357056200504303, |
| "rewards/preference_model_reward": 0.6605119705200195, |
| "rewards/preference_model_reward/std": 0.3964306712150574, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.00019087232067249715, |
| "epoch": 0.18205461638491546, |
| "grad_norm": 1.0796848772767726, |
| "kl": 0.142578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0089, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.625, |
| "epoch": 0.18335500650195058, |
| "grad_norm": 1.3173408698053826, |
| "kl": 0.1484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "reward": 0.5397460460662842, |
| "reward_std": 0.27076956629753113, |
| "rewards/preference_model_reward": 0.5397460460662842, |
| "rewards/preference_model_reward/std": 0.43955907225608826, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.000534381833858788, |
| "epoch": 0.1846553966189857, |
| "grad_norm": 1.2508652015275377, |
| "kl": 0.150390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 417.5625, |
| "epoch": 0.1859557867360208, |
| "grad_norm": 1.0095689522807012, |
| "kl": 0.1298828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0072, |
| "reward": 0.8610371351242065, |
| "reward_std": 0.2966481149196625, |
| "rewards/preference_model_reward": 0.8610371351242065, |
| "rewards/preference_model_reward/std": 0.30670949816703796, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 5.617977512883954e-05, |
| "epoch": 0.18725617685305593, |
| "grad_norm": 0.9826898380088299, |
| "kl": 0.1298828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0078, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.96875, |
| "epoch": 0.18855656697009102, |
| "grad_norm": 1.7766319258647452, |
| "kl": 0.158203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0135, |
| "reward": 0.6295263171195984, |
| "reward_std": 0.4187769293785095, |
| "rewards/preference_model_reward": 0.6295263171195984, |
| "rewards/preference_model_reward/std": 0.42381730675697327, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0003444340836722404, |
| "epoch": 0.18985695708712613, |
| "grad_norm": 1.7285319530442194, |
| "kl": 0.1484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0125, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.78125, |
| "epoch": 0.19115734720416125, |
| "grad_norm": 1.1051043019440878, |
| "kl": 0.1318359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0055, |
| "reward": 0.5800995826721191, |
| "reward_std": 0.32392293214797974, |
| "rewards/preference_model_reward": 0.5800995826721191, |
| "rewards/preference_model_reward/std": 0.3707711398601532, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0003981678746640682, |
| "epoch": 0.19245773732119636, |
| "grad_norm": 1.0453400928741006, |
| "kl": 0.1328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0062, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 334.28125, |
| "epoch": 0.19375812743823148, |
| "grad_norm": 0.03748773956154849, |
| "kl": 0.1376953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0, |
| "reward": 0.9981102347373962, |
| "reward_std": 0.0075591248460114, |
| "rewards/preference_model_reward": 0.9981102347373962, |
| "rewards/preference_model_reward/std": 0.010690220631659031, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 8.632597018731758e-05, |
| "epoch": 0.19505851755526657, |
| "grad_norm": 0.03228936227298709, |
| "kl": 0.1328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.875, |
| "epoch": 0.19635890767230169, |
| "grad_norm": 0.05752238966724317, |
| "kl": 0.15234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 0.9974650144577026, |
| "reward_std": 0.010140029713511467, |
| "rewards/preference_model_reward": 0.9974650144577026, |
| "rewards/preference_model_reward/std": 0.014340158551931381, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.00020508613670244813, |
| "epoch": 0.1976592977893368, |
| "grad_norm": 0.04811959314845179, |
| "kl": 0.1416015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 550.0625, |
| "epoch": 0.19895968790637192, |
| "grad_norm": 1.0016776671397591, |
| "kl": 0.1474609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0062, |
| "reward": 0.912561297416687, |
| "reward_std": 0.25192081928253174, |
| "rewards/preference_model_reward": 0.912561297416687, |
| "rewards/preference_model_reward/std": 0.25600963830947876, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0003616804606281221, |
| "epoch": 0.20026007802340703, |
| "grad_norm": 0.9381106713638003, |
| "kl": 0.1376953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0068, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 779.6875, |
| "epoch": 0.20156046814044212, |
| "grad_norm": 1.9176344667399166, |
| "kl": 0.1240234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0209, |
| "reward": 0.46237650513648987, |
| "reward_std": 0.37269675731658936, |
| "rewards/preference_model_reward": 0.46237650513648987, |
| "rewards/preference_model_reward/std": 0.4106147587299347, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0005603111931122839, |
| "epoch": 0.20286085825747724, |
| "grad_norm": 1.9416642185631487, |
| "kl": 0.11962890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0196, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 561.15625, |
| "epoch": 0.20416124837451236, |
| "grad_norm": 1.246090268048283, |
| "kl": 0.1171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0074, |
| "reward": 0.8197011351585388, |
| "reward_std": 0.32638221979141235, |
| "rewards/preference_model_reward": 0.8197011351585388, |
| "rewards/preference_model_reward/std": 0.3211716413497925, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.00015653572336304933, |
| "epoch": 0.20546163849154747, |
| "grad_norm": 1.1878525300972134, |
| "kl": 0.115234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0067, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 498.78125, |
| "epoch": 0.2067620286085826, |
| "grad_norm": 1.6471484937824736, |
| "kl": 0.1083984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0099, |
| "reward": 0.6723287105560303, |
| "reward_std": 0.42400574684143066, |
| "rewards/preference_model_reward": 0.6723287105560303, |
| "rewards/preference_model_reward/std": 0.42361700534820557, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.00019252923084422946, |
| "epoch": 0.20806241872561768, |
| "grad_norm": 1.5640304604076711, |
| "kl": 0.10888671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0089, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.59375, |
| "epoch": 0.2093628088426528, |
| "grad_norm": 1.1976569308957934, |
| "kl": 0.1337890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0028, |
| "reward": 0.8096756935119629, |
| "reward_std": 0.33807146549224854, |
| "rewards/preference_model_reward": 0.8096756935119629, |
| "rewards/preference_model_reward/std": 0.34619417786598206, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.00016228496679104865, |
| "epoch": 0.2106631989596879, |
| "grad_norm": 1.2198444194251261, |
| "kl": 0.13671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0036, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.21875, |
| "epoch": 0.21196358907672302, |
| "grad_norm": 1.6710655753119792, |
| "kl": 0.1328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0194, |
| "reward": 0.7978184223175049, |
| "reward_std": 0.26372814178466797, |
| "rewards/preference_model_reward": 0.7978184223175049, |
| "rewards/preference_model_reward/std": 0.3677564263343811, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0002533727674745023, |
| "epoch": 0.21326397919375814, |
| "grad_norm": 1.4829182389200866, |
| "kl": 0.1328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0188, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 368.71875, |
| "epoch": 0.21456436931079323, |
| "grad_norm": 1.0731863888448565, |
| "kl": 0.1279296875, |
| "learning_rate": 2e-06, |
| "loss": 0.0004, |
| "reward": 0.7712475061416626, |
| "reward_std": 0.3398103713989258, |
| "rewards/preference_model_reward": 0.7712475061416626, |
| "rewards/preference_model_reward/std": 0.34702128171920776, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0001748301729094237, |
| "epoch": 0.21586475942782835, |
| "grad_norm": 1.0142522280669701, |
| "kl": 0.12890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.0, |
| "epoch": 0.21716514954486346, |
| "grad_norm": 1.1219818347809352, |
| "kl": 0.1357421875, |
| "learning_rate": 2e-06, |
| "loss": 0.0129, |
| "reward": 0.5442590713500977, |
| "reward_std": 0.2779287099838257, |
| "rewards/preference_model_reward": 0.5442590713500977, |
| "rewards/preference_model_reward/std": 0.41137993335723877, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.00020389427663758397, |
| "epoch": 0.21846553966189858, |
| "grad_norm": 1.1061402577957113, |
| "kl": 0.13671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0122, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.90625, |
| "epoch": 0.21976592977893367, |
| "grad_norm": 1.72194342380149, |
| "kl": 0.13671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0029, |
| "reward": 0.684001624584198, |
| "reward_std": 0.3655635118484497, |
| "rewards/preference_model_reward": 0.684001624584198, |
| "rewards/preference_model_reward/std": 0.4322417080402374, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.000274754042038694, |
| "epoch": 0.22106631989596878, |
| "grad_norm": 1.6428549273522806, |
| "kl": 0.138671875, |
| "learning_rate": 2e-06, |
| "loss": 0.002, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.28125, |
| "epoch": 0.2223667100130039, |
| "grad_norm": 1.614101972009106, |
| "kl": 0.138671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0056, |
| "reward": 0.8601099848747253, |
| "reward_std": 0.31807541847229004, |
| "rewards/preference_model_reward": 0.8601099848747253, |
| "rewards/preference_model_reward/std": 0.3315056562423706, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.00013853044947609305, |
| "epoch": 0.22366710013003901, |
| "grad_norm": 1.3296533646627524, |
| "kl": 0.1396484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0049, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 401.59375, |
| "epoch": 0.22496749024707413, |
| "grad_norm": 0.6357224318885739, |
| "kl": 0.1552734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0, |
| "reward": 0.9281669855117798, |
| "reward_std": 0.1928824484348297, |
| "rewards/preference_model_reward": 0.9281669855117798, |
| "rewards/preference_model_reward/std": 0.19411809742450714, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.00015087510109879076, |
| "epoch": 0.22626788036410922, |
| "grad_norm": 0.5759303474167251, |
| "kl": 0.158203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0004, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.71875, |
| "epoch": 0.22756827048114434, |
| "grad_norm": 1.6254811605839956, |
| "kl": 0.13671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0008, |
| "reward": 0.6483066082000732, |
| "reward_std": 0.4315972328186035, |
| "rewards/preference_model_reward": 0.6483066082000732, |
| "rewards/preference_model_reward/std": 0.4604679346084595, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.00034807526390068233, |
| "epoch": 0.22886866059817945, |
| "grad_norm": 1.5654975897789054, |
| "kl": 0.138671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0018, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 418.9375, |
| "epoch": 0.23016905071521457, |
| "grad_norm": 1.2843572461072899, |
| "kl": 0.1611328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0046, |
| "reward": 0.7664402723312378, |
| "reward_std": 0.3110997676849365, |
| "rewards/preference_model_reward": 0.7664402723312378, |
| "rewards/preference_model_reward/std": 0.367339551448822, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.23146944083224968, |
| "grad_norm": 1.1110477641764491, |
| "kl": 0.1640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0052, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.21875, |
| "epoch": 0.23276983094928477, |
| "grad_norm": 0.8712719346022294, |
| "kl": 0.140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0053, |
| "reward": 0.8626149892807007, |
| "reward_std": 0.2925213575363159, |
| "rewards/preference_model_reward": 0.8626149892807007, |
| "rewards/preference_model_reward/std": 0.2878483831882477, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.2340702210663199, |
| "grad_norm": 0.8070501204024879, |
| "kl": 0.142578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0058, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.03125, |
| "epoch": 0.235370611183355, |
| "grad_norm": 1.4675397620762083, |
| "kl": 0.134765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0117, |
| "reward": 0.8469977378845215, |
| "reward_std": 0.3276008367538452, |
| "rewards/preference_model_reward": 0.8469977378845215, |
| "rewards/preference_model_reward/std": 0.32416415214538574, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0002926147426478565, |
| "epoch": 0.23667100130039012, |
| "grad_norm": 2.070965238661399, |
| "kl": 0.134765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0109, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.0625, |
| "epoch": 0.23797139141742524, |
| "grad_norm": 1.1744315012326634, |
| "kl": 0.1611328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0164, |
| "reward": 0.8896687030792236, |
| "reward_std": 0.22899243235588074, |
| "rewards/preference_model_reward": 0.8896687030792236, |
| "rewards/preference_model_reward/std": 0.29047808051109314, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.00016585712728556246, |
| "epoch": 0.23927178153446033, |
| "grad_norm": 1.1088031941295666, |
| "kl": 0.1640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0157, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.1875, |
| "epoch": 0.24057217165149544, |
| "grad_norm": 0.012167435784803771, |
| "kl": 0.17578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.24187256176853056, |
| "grad_norm": 0.010662640913732074, |
| "kl": 0.166015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 279.5, |
| "epoch": 0.24317295188556567, |
| "grad_norm": 2.798587143064415, |
| "kl": 0.1484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0066, |
| "reward": 0.6557672023773193, |
| "reward_std": 0.332293838262558, |
| "rewards/preference_model_reward": 0.6557672023773193, |
| "rewards/preference_model_reward/std": 0.34837016463279724, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.001388114527799189, |
| "epoch": 0.2444733420026008, |
| "grad_norm": 0.8537099565058428, |
| "kl": 0.142578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0068, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 385.15625, |
| "epoch": 0.24577373211963588, |
| "grad_norm": 0.8363775477713209, |
| "kl": 0.1376953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0038, |
| "reward": 0.8794984221458435, |
| "reward_std": 0.28465738892555237, |
| "rewards/preference_model_reward": 0.8794984221458435, |
| "rewards/preference_model_reward/std": 0.283857524394989, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.00017409646534360945, |
| "epoch": 0.247074122236671, |
| "grad_norm": 0.816021420016002, |
| "kl": 0.1318359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0032, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 356.96875, |
| "epoch": 0.2483745123537061, |
| "grad_norm": 0.9815328749481499, |
| "kl": 0.138671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0094, |
| "reward": 0.6524101495742798, |
| "reward_std": 0.3220939040184021, |
| "rewards/preference_model_reward": 0.6524101495742798, |
| "rewards/preference_model_reward/std": 0.34822776913642883, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0002863642293959856, |
| "epoch": 0.24967490247074123, |
| "grad_norm": 0.9330643460254814, |
| "kl": 0.134765625, |
| "learning_rate": 2e-06, |
| "loss": -0.01, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 257.71875, |
| "epoch": 0.25097529258777634, |
| "grad_norm": 1.1018955579208387, |
| "kl": 0.30859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0052, |
| "reward": 0.7511432766914368, |
| "reward_std": 0.3835994601249695, |
| "rewards/preference_model_reward": 0.7511432766914368, |
| "rewards/preference_model_reward/std": 0.3915899097919464, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0005864645936526358, |
| "epoch": 0.25227568270481143, |
| "grad_norm": 1.0859444991839702, |
| "kl": 0.2421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0058, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.21875, |
| "epoch": 0.2535760728218466, |
| "grad_norm": 0.012556309626161617, |
| "kl": 0.126953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.25487646293888166, |
| "grad_norm": 0.010950174010336853, |
| "kl": 0.1162109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 270.40625, |
| "epoch": 0.25617685305591675, |
| "grad_norm": 0.4413920366005579, |
| "kl": 0.109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0097, |
| "reward": 0.5270646810531616, |
| "reward_std": 0.2283352017402649, |
| "rewards/preference_model_reward": 0.5270646810531616, |
| "rewards/preference_model_reward/std": 0.46704450249671936, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0012252123560756445, |
| "epoch": 0.2574772431729519, |
| "grad_norm": 0.420238367270396, |
| "kl": 0.1005859375, |
| "learning_rate": 2e-06, |
| "loss": -0.01, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 284.4375, |
| "epoch": 0.258777633289987, |
| "grad_norm": 0.648882313337497, |
| "kl": 0.1015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "reward": 0.9249469637870789, |
| "reward_std": 0.2125411331653595, |
| "rewards/preference_model_reward": 0.9249469637870789, |
| "rewards/preference_model_reward/std": 0.24472835659980774, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.0009765520226210356, |
| "epoch": 0.26007802340702213, |
| "grad_norm": 0.5803957439533651, |
| "kl": 0.09765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0033, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.0625, |
| "epoch": 0.2613784135240572, |
| "grad_norm": 0.8843805814111732, |
| "kl": 0.103515625, |
| "learning_rate": 2e-06, |
| "loss": -0.0003, |
| "reward": 0.4381202459335327, |
| "reward_std": 0.27952808141708374, |
| "rewards/preference_model_reward": 0.4381202459335327, |
| "rewards/preference_model_reward/std": 0.4532867968082428, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.00026075675850734115, |
| "epoch": 0.2626788036410923, |
| "grad_norm": 0.8236946250468278, |
| "kl": 0.1015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0008, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 334.375, |
| "epoch": 0.26397919375812745, |
| "grad_norm": 0.8793077051463641, |
| "kl": 0.1142578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0059, |
| "reward": 0.8583904504776001, |
| "reward_std": 0.26961272954940796, |
| "rewards/preference_model_reward": 0.8583904504776001, |
| "rewards/preference_model_reward/std": 0.27106156945228577, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0005615265690721571, |
| "epoch": 0.26527958387516254, |
| "grad_norm": 0.8213872997514073, |
| "kl": 0.14453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0054, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 244.21875, |
| "epoch": 0.2665799739921977, |
| "grad_norm": 0.920439324543163, |
| "kl": 0.10791015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0037, |
| "reward": 0.6516105532646179, |
| "reward_std": 0.3531697392463684, |
| "rewards/preference_model_reward": 0.6516105532646179, |
| "rewards/preference_model_reward/std": 0.38390058279037476, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0003897629212588072, |
| "epoch": 0.26788036410923277, |
| "grad_norm": 0.8873190929889052, |
| "kl": 0.107421875, |
| "learning_rate": 2e-06, |
| "loss": 0.0031, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 285.71875, |
| "epoch": 0.26918075422626786, |
| "grad_norm": 0.9107935937419545, |
| "kl": 0.1337890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0073, |
| "reward": 0.6827311515808105, |
| "reward_std": 0.33245402574539185, |
| "rewards/preference_model_reward": 0.6827311515808105, |
| "rewards/preference_model_reward/std": 0.32757312059402466, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0003240827936679125, |
| "epoch": 0.270481144343303, |
| "grad_norm": 0.8964363171955139, |
| "kl": 0.1357421875, |
| "learning_rate": 2e-06, |
| "loss": 0.0066, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.9375, |
| "epoch": 0.2717815344603381, |
| "grad_norm": 0.8564481881429168, |
| "kl": 0.0869140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0051, |
| "reward": 0.8694363832473755, |
| "reward_std": 0.25172746181488037, |
| "rewards/preference_model_reward": 0.8694363832473755, |
| "rewards/preference_model_reward/std": 0.24772705137729645, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.00022542427177540958, |
| "epoch": 0.27308192457737324, |
| "grad_norm": 0.7335746351550806, |
| "kl": 0.08544921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0055, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.09375, |
| "epoch": 0.2743823146944083, |
| "grad_norm": 1.0673059450451972, |
| "kl": 0.09228515625, |
| "learning_rate": 2e-06, |
| "loss": 0.0069, |
| "reward": 0.720138430595398, |
| "reward_std": 0.3222920894622803, |
| "rewards/preference_model_reward": 0.720138430595398, |
| "rewards/preference_model_reward/std": 0.41566386818885803, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 8.36400140542537e-05, |
| "epoch": 0.2756827048114434, |
| "grad_norm": 1.1400954432908579, |
| "kl": 0.09375, |
| "learning_rate": 2e-06, |
| "loss": 0.0061, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 350.75, |
| "epoch": 0.27698309492847856, |
| "grad_norm": 1.3462434191662358, |
| "kl": 0.0791015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0097, |
| "reward": 0.5960279703140259, |
| "reward_std": 0.4318947196006775, |
| "rewards/preference_model_reward": 0.5960279703140259, |
| "rewards/preference_model_reward/std": 0.4450107216835022, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0005943958531133831, |
| "epoch": 0.27828348504551365, |
| "grad_norm": 1.3958238267778045, |
| "kl": 0.080078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0106, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.34375, |
| "epoch": 0.2795838751625488, |
| "grad_norm": 1.027540012470293, |
| "kl": 0.0869140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "reward": 0.844211220741272, |
| "reward_std": 0.2946808934211731, |
| "rewards/preference_model_reward": 0.844211220741272, |
| "rewards/preference_model_reward/std": 0.29101303219795227, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.00011487863957881927, |
| "epoch": 0.2808842652795839, |
| "grad_norm": 0.9445019155913764, |
| "kl": 0.08837890625, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.25, |
| "epoch": 0.28218465539661897, |
| "grad_norm": 0.8593232598517545, |
| "kl": 0.1005859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0038, |
| "reward": 0.8116539120674133, |
| "reward_std": 0.2266491800546646, |
| "rewards/preference_model_reward": 0.8116539120674133, |
| "rewards/preference_model_reward/std": 0.3688415586948395, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.2834850455136541, |
| "grad_norm": 0.9536555216415759, |
| "kl": 0.10205078125, |
| "learning_rate": 2e-06, |
| "loss": 0.0032, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 285.4375, |
| "epoch": 0.2847854356306892, |
| "grad_norm": 0.0920547152120324, |
| "kl": 0.111328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0, |
| "reward": 0.5229940414428711, |
| "reward_std": 0.036096930503845215, |
| "rewards/preference_model_reward": 0.5229940414428711, |
| "rewards/preference_model_reward/std": 0.47642675042152405, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0006749940221197903, |
| "epoch": 0.28608582574772434, |
| "grad_norm": 0.09505637629837431, |
| "kl": 0.111328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0001, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.3125, |
| "epoch": 0.28738621586475943, |
| "grad_norm": 0.44900720903133046, |
| "kl": 0.11669921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0047, |
| "reward": 0.9324289560317993, |
| "reward_std": 0.14278283715248108, |
| "rewards/preference_model_reward": 0.9324289560317993, |
| "rewards/preference_model_reward/std": 0.16829460859298706, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.00019348411296959966, |
| "epoch": 0.2886866059817945, |
| "grad_norm": 0.42608100511796476, |
| "kl": 0.1171875, |
| "learning_rate": 2e-06, |
| "loss": -0.005, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 253.90625, |
| "epoch": 0.28998699609882966, |
| "grad_norm": 0.48780598580355755, |
| "kl": 0.11669921875, |
| "learning_rate": 2e-06, |
| "loss": 0.0005, |
| "reward": 0.43476933240890503, |
| "reward_std": 0.1322799175977707, |
| "rewards/preference_model_reward": 0.43476933240890503, |
| "rewards/preference_model_reward/std": 0.4707281291484833, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.00040084568900056183, |
| "epoch": 0.29128738621586475, |
| "grad_norm": 0.44729368284423715, |
| "kl": 0.1181640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 471.46875, |
| "epoch": 0.2925877763328999, |
| "grad_norm": 0.6225599540084386, |
| "kl": 0.205078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0135, |
| "reward": 0.938056230545044, |
| "reward_std": 0.1692640632390976, |
| "rewards/preference_model_reward": 0.938056230545044, |
| "rewards/preference_model_reward/std": 0.24374790489673615, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.00016879897157195956, |
| "epoch": 0.293888166449935, |
| "grad_norm": 0.566535614416212, |
| "kl": 0.1328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0139, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 219.40625, |
| "epoch": 0.2951885565669701, |
| "grad_norm": 1.1863022123947027, |
| "kl": 0.1171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0027, |
| "reward": 0.5798717737197876, |
| "reward_std": 0.3581033945083618, |
| "rewards/preference_model_reward": 0.5798717737197876, |
| "rewards/preference_model_reward/std": 0.3897789716720581, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.2964889466840052, |
| "grad_norm": 0.8735255960345151, |
| "kl": 0.1220703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0023, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 296.8125, |
| "epoch": 0.2977893368010403, |
| "grad_norm": 0.8794603051578461, |
| "kl": 0.11279296875, |
| "learning_rate": 2e-06, |
| "loss": -0.0072, |
| "reward": 0.79021155834198, |
| "reward_std": 0.31384900212287903, |
| "rewards/preference_model_reward": 0.79021155834198, |
| "rewards/preference_model_reward/std": 0.3670634329319, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.29908972691807545, |
| "grad_norm": 0.9006009359994098, |
| "kl": 0.1162109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0078, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 373.9375, |
| "epoch": 0.30039011703511054, |
| "grad_norm": 0.8110636965631363, |
| "kl": 0.103515625, |
| "learning_rate": 2e-06, |
| "loss": 0.0017, |
| "reward": 0.8521493673324585, |
| "reward_std": 0.1953742802143097, |
| "rewards/preference_model_reward": 0.8521493673324585, |
| "rewards/preference_model_reward/std": 0.31055518984794617, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.00011916110815946013, |
| "epoch": 0.3016905071521456, |
| "grad_norm": 0.7852075153190419, |
| "kl": 0.10595703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0013, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.8125, |
| "epoch": 0.30299089726918077, |
| "grad_norm": 0.6520439692812087, |
| "kl": 0.125, |
| "learning_rate": 2e-06, |
| "loss": 0.0003, |
| "reward": 0.9159287214279175, |
| "reward_std": 0.2180308848619461, |
| "rewards/preference_model_reward": 0.9159287214279175, |
| "rewards/preference_model_reward/std": 0.24606594443321228, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0004468331462703645, |
| "epoch": 0.30429128738621586, |
| "grad_norm": 0.5532931586806176, |
| "kl": 0.12890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.75, |
| "epoch": 0.305591677503251, |
| "grad_norm": 0.776988546935505, |
| "kl": 0.142578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0032, |
| "reward": 0.9222438931465149, |
| "reward_std": 0.23244205117225647, |
| "rewards/preference_model_reward": 0.9222438931465149, |
| "rewards/preference_model_reward/std": 0.23122651875019073, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0007608620799146593, |
| "epoch": 0.3068920676202861, |
| "grad_norm": 0.7011384841526471, |
| "kl": 0.146484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0027, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 375.25, |
| "epoch": 0.3081924577373212, |
| "grad_norm": 0.6230777256393368, |
| "kl": 0.126953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0075, |
| "reward": 0.9193332195281982, |
| "reward_std": 0.17760136723518372, |
| "rewards/preference_model_reward": 0.9193332195281982, |
| "rewards/preference_model_reward/std": 0.2603200674057007, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0006001516012474895, |
| "epoch": 0.3094928478543563, |
| "grad_norm": 0.5912438243448386, |
| "kl": 0.130859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0079, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.125, |
| "epoch": 0.3107932379713914, |
| "grad_norm": 0.7506171412047739, |
| "kl": 0.150390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "reward": 0.6538297533988953, |
| "reward_std": 0.2938784062862396, |
| "rewards/preference_model_reward": 0.6538297533988953, |
| "rewards/preference_model_reward/std": 0.3540026843547821, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.00028041156474500895, |
| "epoch": 0.31209362808842656, |
| "grad_norm": 0.64002825525906, |
| "kl": 0.15234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0037, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 405.5, |
| "epoch": 0.31339401820546164, |
| "grad_norm": 0.38597198495676954, |
| "kl": 0.1240234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0012, |
| "reward": 0.970663845539093, |
| "reward_std": 0.0886102095246315, |
| "rewards/preference_model_reward": 0.970663845539093, |
| "rewards/preference_model_reward/std": 0.12682799994945526, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.31469440832249673, |
| "grad_norm": 0.35045812420514433, |
| "kl": 0.126953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0015, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 332.40625, |
| "epoch": 0.3159947984395319, |
| "grad_norm": 1.1616464740874977, |
| "kl": 0.140625, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "reward": 0.7326881289482117, |
| "reward_std": 0.3448137640953064, |
| "rewards/preference_model_reward": 0.7326881289482117, |
| "rewards/preference_model_reward/std": 0.4094682037830353, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.00034368172055110335, |
| "epoch": 0.31729518855656696, |
| "grad_norm": 1.0573890331902724, |
| "kl": 0.142578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 335.9375, |
| "epoch": 0.31859557867360205, |
| "grad_norm": 0.7961750832151863, |
| "kl": 0.1416015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0001, |
| "reward": 0.8184847235679626, |
| "reward_std": 0.24248003959655762, |
| "rewards/preference_model_reward": 0.8184847235679626, |
| "rewards/preference_model_reward/std": 0.3097231090068817, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.00016528925334569067, |
| "epoch": 0.3198959687906372, |
| "grad_norm": 0.731756658994661, |
| "kl": 0.1435546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0006, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.21875, |
| "epoch": 0.3211963589076723, |
| "grad_norm": 0.8198850650973981, |
| "kl": 0.15234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "reward": 0.800827145576477, |
| "reward_std": 0.22814823687076569, |
| "rewards/preference_model_reward": 0.800827145576477, |
| "rewards/preference_model_reward/std": 0.3556991219520569, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.00036711152642965317, |
| "epoch": 0.32249674902470743, |
| "grad_norm": 0.7758595322860915, |
| "kl": 0.16015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 333.9375, |
| "epoch": 0.3237971391417425, |
| "grad_norm": 0.7816768910697072, |
| "kl": 0.1708984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "reward": 0.9013230204582214, |
| "reward_std": 0.20031380653381348, |
| "rewards/preference_model_reward": 0.9013230204582214, |
| "rewards/preference_model_reward/std": 0.2634318470954895, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0006623025983572006, |
| "epoch": 0.3250975292587776, |
| "grad_norm": 0.700949755432084, |
| "kl": 0.173828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 387.375, |
| "epoch": 0.32639791937581275, |
| "grad_norm": 0.5236384680893461, |
| "kl": 0.166015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "reward": 0.9541411399841309, |
| "reward_std": 0.12308676540851593, |
| "rewards/preference_model_reward": 0.9541411399841309, |
| "rewards/preference_model_reward/std": 0.13573099672794342, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.00028014881536364555, |
| "epoch": 0.32769830949284784, |
| "grad_norm": 0.5281527733024369, |
| "kl": 0.1689453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0008, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 320.25, |
| "epoch": 0.328998699609883, |
| "grad_norm": 0.9528739475463192, |
| "kl": 0.193359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0015, |
| "reward": 0.9271292686462402, |
| "reward_std": 0.16556303203105927, |
| "rewards/preference_model_reward": 0.9271292686462402, |
| "rewards/preference_model_reward/std": 0.24194052815437317, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0003772066265810281, |
| "epoch": 0.33029908972691807, |
| "grad_norm": 0.661148563447169, |
| "kl": 0.1953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0018, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 318.65625, |
| "epoch": 0.33159947984395316, |
| "grad_norm": 0.9604652912869995, |
| "kl": 0.1865234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0038, |
| "reward": 0.7251725196838379, |
| "reward_std": 0.2921797037124634, |
| "rewards/preference_model_reward": 0.7251725196838379, |
| "rewards/preference_model_reward/std": 0.3959549367427826, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0005459666135720909, |
| "epoch": 0.3328998699609883, |
| "grad_norm": 0.9173313255466033, |
| "kl": 0.185546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0045, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 326.59375, |
| "epoch": 0.3342002600780234, |
| "grad_norm": 0.15271026747443167, |
| "kl": 0.1806640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0001, |
| "reward": 0.9912102222442627, |
| "reward_std": 0.03515896201133728, |
| "rewards/preference_model_reward": 0.9912102222442627, |
| "rewards/preference_model_reward/std": 0.049722280353307724, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.33550065019505854, |
| "grad_norm": 0.13315090840044477, |
| "kl": 0.1796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.03125, |
| "epoch": 0.3368010403120936, |
| "grad_norm": 0.023749172386806034, |
| "kl": 0.1767578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.3381014304291287, |
| "grad_norm": 0.020046705197983922, |
| "kl": 0.162109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 352.9375, |
| "epoch": 0.33940182054616386, |
| "grad_norm": 0.6360843528660727, |
| "kl": 0.1552734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "reward": 0.93841552734375, |
| "reward_std": 0.13651104271411896, |
| "rewards/preference_model_reward": 0.93841552734375, |
| "rewards/preference_model_reward/std": 0.19995808601379395, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0007453379803337157, |
| "epoch": 0.34070221066319895, |
| "grad_norm": 0.6012787093360856, |
| "kl": 0.1435546875, |
| "learning_rate": 2e-06, |
| "loss": -0.001, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 545.40625, |
| "epoch": 0.3420026007802341, |
| "grad_norm": 1.5555168702461082, |
| "kl": 0.19921875, |
| "learning_rate": 2e-06, |
| "loss": 0.0238, |
| "reward": 0.7650834918022156, |
| "reward_std": 0.3637867867946625, |
| "rewards/preference_model_reward": 0.7650834918022156, |
| "rewards/preference_model_reward/std": 0.3750750422477722, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0010029254481196404, |
| "epoch": 0.3433029908972692, |
| "grad_norm": 1.7494042707337192, |
| "kl": 0.1611328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0227, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 380.75, |
| "epoch": 0.34460338101430427, |
| "grad_norm": 1.1102015498823388, |
| "kl": 0.150390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0044, |
| "reward": 0.8403390645980835, |
| "reward_std": 0.31802237033843994, |
| "rewards/preference_model_reward": 0.8403390645980835, |
| "rewards/preference_model_reward/std": 0.32403555512428284, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 7.552870374638587e-05, |
| "epoch": 0.3459037711313394, |
| "grad_norm": 0.9753672523996855, |
| "kl": 0.1455078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0051, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 420.5, |
| "epoch": 0.3472041612483745, |
| "grad_norm": 0.7074828188717931, |
| "kl": 0.169921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0001, |
| "reward": 0.7929283380508423, |
| "reward_std": 0.18768203258514404, |
| "rewards/preference_model_reward": 0.7929283380508423, |
| "rewards/preference_model_reward/std": 0.3012048006057739, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0006710141897201538, |
| "epoch": 0.34850455136540964, |
| "grad_norm": 0.6706438170937675, |
| "kl": 0.166015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.5625, |
| "epoch": 0.34980494148244473, |
| "grad_norm": 1.045237660141156, |
| "kl": 0.15234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0025, |
| "reward": 0.8770852088928223, |
| "reward_std": 0.23402594029903412, |
| "rewards/preference_model_reward": 0.8770852088928223, |
| "rewards/preference_model_reward/std": 0.2798316180706024, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.000235777348279953, |
| "epoch": 0.3511053315994798, |
| "grad_norm": 0.7229933430375373, |
| "kl": 0.15234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0022, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.875, |
| "epoch": 0.35240572171651496, |
| "grad_norm": 0.07309145002425353, |
| "kl": 0.130859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.35370611183355005, |
| "grad_norm": 0.01529023255881576, |
| "kl": 0.1259765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 334.6875, |
| "epoch": 0.3550065019505852, |
| "grad_norm": 0.4251362023871739, |
| "kl": 0.1513671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0023, |
| "reward": 0.9609469175338745, |
| "reward_std": 0.10973211377859116, |
| "rewards/preference_model_reward": 0.9609469175338745, |
| "rewards/preference_model_reward/std": 0.1577332317829132, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.000310945266392082, |
| "epoch": 0.3563068920676203, |
| "grad_norm": 0.34692654187177197, |
| "kl": 0.1474609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.65625, |
| "epoch": 0.3576072821846554, |
| "grad_norm": 0.8323153038827732, |
| "kl": 0.1298828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0006, |
| "reward": 0.8540354371070862, |
| "reward_std": 0.2715786099433899, |
| "rewards/preference_model_reward": 0.8540354371070862, |
| "rewards/preference_model_reward/std": 0.2714146673679352, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.00025343496236018836, |
| "epoch": 0.3589076723016905, |
| "grad_norm": 0.8386551476075588, |
| "kl": 0.126953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0012, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 400.875, |
| "epoch": 0.3602080624187256, |
| "grad_norm": 3.018201149851659, |
| "kl": 0.14453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0039, |
| "reward": 0.7919412851333618, |
| "reward_std": 0.20601463317871094, |
| "rewards/preference_model_reward": 0.7919412851333618, |
| "rewards/preference_model_reward/std": 0.35613295435905457, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.00024119633599184453, |
| "epoch": 0.36150845253576075, |
| "grad_norm": 2.3752234333701847, |
| "kl": 0.69140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0035, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 420.6875, |
| "epoch": 0.36280884265279584, |
| "grad_norm": 1.4330759769641548, |
| "kl": 0.1376953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0142, |
| "reward": 0.5622725486755371, |
| "reward_std": 0.40393969416618347, |
| "rewards/preference_model_reward": 0.5622725486755371, |
| "rewards/preference_model_reward/std": 0.4006726145744324, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.00024359519011341035, |
| "epoch": 0.3641092327698309, |
| "grad_norm": 1.2330239115295205, |
| "kl": 0.1376953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0151, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 327.84375, |
| "epoch": 0.36540962288686607, |
| "grad_norm": 1.0374054211943582, |
| "kl": 0.126953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0044, |
| "reward": 0.840008020401001, |
| "reward_std": 0.35583558678627014, |
| "rewards/preference_model_reward": 0.840008020401001, |
| "rewards/preference_model_reward/std": 0.3535732328891754, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0007460214546881616, |
| "epoch": 0.36671001300390116, |
| "grad_norm": 0.9016990592456968, |
| "kl": 0.1279296875, |
| "learning_rate": 2e-06, |
| "loss": -0.005, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.0625, |
| "epoch": 0.3680104031209363, |
| "grad_norm": 1.5216553850491168, |
| "kl": 0.134765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0085, |
| "reward": 0.6559486389160156, |
| "reward_std": 0.4345345199108124, |
| "rewards/preference_model_reward": 0.6559486389160156, |
| "rewards/preference_model_reward/std": 0.42781931161880493, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.00047609303146600723, |
| "epoch": 0.3693107932379714, |
| "grad_norm": 5.409857191769975, |
| "kl": 0.134765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0076, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 390.53125, |
| "epoch": 0.3706111833550065, |
| "grad_norm": 1.1576263309647625, |
| "kl": 0.1357421875, |
| "learning_rate": 2e-06, |
| "loss": 0.0014, |
| "reward": 0.8136157989501953, |
| "reward_std": 0.23213137686252594, |
| "rewards/preference_model_reward": 0.8136157989501953, |
| "rewards/preference_model_reward/std": 0.298849493265152, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.00016891787527129054, |
| "epoch": 0.3719115734720416, |
| "grad_norm": 0.8952466212055566, |
| "kl": 0.1357421875, |
| "learning_rate": 2e-06, |
| "loss": 0.0009, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 468.46875, |
| "epoch": 0.3732119635890767, |
| "grad_norm": 0.23014091366454292, |
| "kl": 0.14453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0005, |
| "reward": 0.9814756512641907, |
| "reward_std": 0.05683635175228119, |
| "rewards/preference_model_reward": 0.9814756512641907, |
| "rewards/preference_model_reward/std": 0.08128068596124649, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0008712065173313022, |
| "epoch": 0.37451235370611186, |
| "grad_norm": 0.22543453641754027, |
| "kl": 0.1435546875, |
| "learning_rate": 2e-06, |
| "loss": 0.0003, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.40625, |
| "epoch": 0.37581274382314694, |
| "grad_norm": 1.0195256953878606, |
| "kl": 0.1171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0058, |
| "reward": 0.48845207691192627, |
| "reward_std": 0.331182062625885, |
| "rewards/preference_model_reward": 0.48845207691192627, |
| "rewards/preference_model_reward/std": 0.469135046005249, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.0002331826981389895, |
| "epoch": 0.37711313394018203, |
| "grad_norm": 1.0989081166309522, |
| "kl": 0.1171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0051, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 318.4375, |
| "epoch": 0.3784135240572172, |
| "grad_norm": 1.0743112389929244, |
| "kl": 0.1357421875, |
| "learning_rate": 2e-06, |
| "loss": 0.0019, |
| "reward": 0.6902080178260803, |
| "reward_std": 0.3703380525112152, |
| "rewards/preference_model_reward": 0.6902080178260803, |
| "rewards/preference_model_reward/std": 0.42161616683006287, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.000206069671548903, |
| "epoch": 0.37971391417425226, |
| "grad_norm": 1.233125880130113, |
| "kl": 0.1376953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 395.25, |
| "epoch": 0.3810143042912874, |
| "grad_norm": 1.107176758039471, |
| "kl": 0.1435546875, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 0.6763293743133545, |
| "reward_std": 0.34847384691238403, |
| "rewards/preference_model_reward": 0.6763293743133545, |
| "rewards/preference_model_reward/std": 0.3937572240829468, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0004974036128260195, |
| "epoch": 0.3823146944083225, |
| "grad_norm": 1.1451654931732194, |
| "kl": 0.1435546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0008, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 377.09375, |
| "epoch": 0.3836150845253576, |
| "grad_norm": 0.7690758743063623, |
| "kl": 0.162109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0054, |
| "reward": 0.4330715835094452, |
| "reward_std": 0.17991632223129272, |
| "rewards/preference_model_reward": 0.4330715835094452, |
| "rewards/preference_model_reward/std": 0.46172231435775757, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.00045062549179419875, |
| "epoch": 0.38491547464239273, |
| "grad_norm": 0.7270920776936544, |
| "kl": 0.1630859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0049, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 356.03125, |
| "epoch": 0.3862158647594278, |
| "grad_norm": 0.8363253049863949, |
| "kl": 0.15625, |
| "learning_rate": 2e-06, |
| "loss": 0.0016, |
| "reward": 0.8998174667358398, |
| "reward_std": 0.2283022552728653, |
| "rewards/preference_model_reward": 0.8998174667358398, |
| "rewards/preference_model_reward/std": 0.26539289951324463, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 9.889240755001083e-05, |
| "epoch": 0.38751625487646296, |
| "grad_norm": 0.7879079112961523, |
| "kl": 0.158203125, |
| "learning_rate": 2e-06, |
| "loss": 0.001, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.65625, |
| "epoch": 0.38881664499349805, |
| "grad_norm": 0.9537917167423159, |
| "kl": 0.15625, |
| "learning_rate": 2e-06, |
| "loss": 0.0029, |
| "reward": 0.8426351547241211, |
| "reward_std": 0.32086896896362305, |
| "rewards/preference_model_reward": 0.8426351547241211, |
| "rewards/preference_model_reward/std": 0.3234107196331024, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0001863636280177161, |
| "epoch": 0.39011703511053314, |
| "grad_norm": 1.0680774123073455, |
| "kl": 0.158203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0022, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.40625, |
| "epoch": 0.3914174252275683, |
| "grad_norm": 0.8700504381068667, |
| "kl": 0.1572265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0018, |
| "reward": 0.8363662958145142, |
| "reward_std": 0.22255460917949677, |
| "rewards/preference_model_reward": 0.8363662958145142, |
| "rewards/preference_model_reward/std": 0.3514332175254822, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.39271781534460337, |
| "grad_norm": 0.8462340517851735, |
| "kl": 0.1591796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 561.96875, |
| "epoch": 0.3940182054616385, |
| "grad_norm": 1.0311732527803947, |
| "kl": 0.12255859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0068, |
| "reward": 0.8814795017242432, |
| "reward_std": 0.2578521966934204, |
| "rewards/preference_model_reward": 0.8814795017242432, |
| "rewards/preference_model_reward/std": 0.2664722502231598, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.00029304379131644964, |
| "epoch": 0.3953185955786736, |
| "grad_norm": 0.8897993747480035, |
| "kl": 0.12353515625, |
| "learning_rate": 2e-06, |
| "loss": 0.0061, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.625, |
| "epoch": 0.3966189856957087, |
| "grad_norm": 0.5314694953876299, |
| "kl": 0.169921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0038, |
| "reward": 0.9106444716453552, |
| "reward_std": 0.15523825585842133, |
| "rewards/preference_model_reward": 0.9106444716453552, |
| "rewards/preference_model_reward/std": 0.23427554965019226, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.39791937581274384, |
| "grad_norm": 0.5193425208826081, |
| "kl": 0.1708984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0042, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.3125, |
| "epoch": 0.3992197659297789, |
| "grad_norm": 0.7071936845424766, |
| "kl": 0.1650390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0, |
| "reward": 0.9416297078132629, |
| "reward_std": 0.233481302857399, |
| "rewards/preference_model_reward": 0.9416297078132629, |
| "rewards/preference_model_reward/std": 0.23015134036540985, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.00029302731854841113, |
| "epoch": 0.40052015604681407, |
| "grad_norm": 0.6689390834175738, |
| "kl": 0.166015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0004, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.5, |
| "epoch": 0.40182054616384916, |
| "grad_norm": 0.17393097771051252, |
| "kl": 0.189453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "reward": 0.4954475164413452, |
| "reward_std": 0.07749561965465546, |
| "rewards/preference_model_reward": 0.4954475164413452, |
| "rewards/preference_model_reward/std": 0.48258301615715027, |
| "step": 309 |
| }, |
| { |
| "clip_ratio": 0.0002689617977011949, |
| "epoch": 0.40312093628088425, |
| "grad_norm": 0.14360471990489682, |
| "kl": 0.1884765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0012, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 405.125, |
| "epoch": 0.4044213263979194, |
| "grad_norm": 1.3861640957004484, |
| "kl": 0.1572265625, |
| "learning_rate": 2e-06, |
| "loss": 0.0066, |
| "reward": 0.7110024094581604, |
| "reward_std": 0.28312164545059204, |
| "rewards/preference_model_reward": 0.7110024094581604, |
| "rewards/preference_model_reward/std": 0.4174264371395111, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.4057217165149545, |
| "grad_norm": 1.0765104391828029, |
| "kl": 0.1572265625, |
| "learning_rate": 2e-06, |
| "loss": 0.006, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 246.65625, |
| "epoch": 0.4070221066319896, |
| "grad_norm": 0.2721463731071287, |
| "kl": 0.1796875, |
| "learning_rate": 2e-06, |
| "loss": 0.001, |
| "reward": 0.9796858429908752, |
| "reward_std": 0.08125662803649902, |
| "rewards/preference_model_reward": 0.9796858429908752, |
| "rewards/preference_model_reward/std": 0.11491423845291138, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.4083224967490247, |
| "grad_norm": 0.23033629785859774, |
| "kl": 0.1806640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0008, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 255.21875, |
| "epoch": 0.4096228868660598, |
| "grad_norm": 0.6419391137466428, |
| "kl": 0.1884765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "reward": 0.8585046529769897, |
| "reward_std": 0.2577616572380066, |
| "rewards/preference_model_reward": 0.8585046529769897, |
| "rewards/preference_model_reward/std": 0.26389265060424805, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.00030229747062548995, |
| "epoch": 0.41092327698309494, |
| "grad_norm": 0.6006179379691705, |
| "kl": 0.189453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 363.5625, |
| "epoch": 0.41222366710013003, |
| "grad_norm": 3.170040530471923, |
| "kl": 0.1962890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0034, |
| "reward": 0.7158905267715454, |
| "reward_std": 0.40732401609420776, |
| "rewards/preference_model_reward": 0.7158905267715454, |
| "rewards/preference_model_reward/std": 0.4021960496902466, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.00033682904904708266, |
| "epoch": 0.4135240572171652, |
| "grad_norm": 1.2493860673019876, |
| "kl": 0.1953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0037, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.0, |
| "epoch": 0.41482444733420026, |
| "grad_norm": 0.8704204592205625, |
| "kl": 0.1826171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0114, |
| "reward": 0.6861220598220825, |
| "reward_std": 0.3007145822048187, |
| "rewards/preference_model_reward": 0.6861220598220825, |
| "rewards/preference_model_reward/std": 0.3603835999965668, |
| "step": 319 |
| }, |
| { |
| "clip_ratio": 0.0003130662371404469, |
| "epoch": 0.41612483745123535, |
| "grad_norm": 0.8570575968724828, |
| "kl": 0.18359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0121, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.59375, |
| "epoch": 0.4174252275682705, |
| "grad_norm": 1.2062034927710477, |
| "kl": 0.326171875, |
| "learning_rate": 2e-06, |
| "loss": 0.002, |
| "reward": 0.9455615282058716, |
| "reward_std": 0.21775373816490173, |
| "rewards/preference_model_reward": 0.9455615282058716, |
| "rewards/preference_model_reward/std": 0.21650560200214386, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0002604166802484542, |
| "epoch": 0.4187256176853056, |
| "grad_norm": 2.819982155660782, |
| "kl": 0.2138671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0016, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 351.5, |
| "epoch": 0.42002600780234073, |
| "grad_norm": 1.0161054442586306, |
| "kl": 0.177734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "reward": 0.8240683078765869, |
| "reward_std": 0.27943116426467896, |
| "rewards/preference_model_reward": 0.8240683078765869, |
| "rewards/preference_model_reward/std": 0.3035885691642761, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.00038074731128290296, |
| "epoch": 0.4213263979193758, |
| "grad_norm": 0.9288478971908941, |
| "kl": 0.1787109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0012, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 398.34375, |
| "epoch": 0.4226267880364109, |
| "grad_norm": 0.5555980131095809, |
| "kl": 0.203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0018, |
| "reward": 0.9582957029342651, |
| "reward_std": 0.12391936033964157, |
| "rewards/preference_model_reward": 0.9582957029342651, |
| "rewards/preference_model_reward/std": 0.16838383674621582, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.00029583554714918137, |
| "epoch": 0.42392717815344605, |
| "grad_norm": 0.4895345891264094, |
| "kl": 0.203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0014, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.0625, |
| "epoch": 0.42522756827048114, |
| "grad_norm": 0.22580993498032037, |
| "kl": 0.19140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "reward": 0.9758968353271484, |
| "reward_std": 0.06588973104953766, |
| "rewards/preference_model_reward": 0.9758968353271484, |
| "rewards/preference_model_reward/std": 0.09488161653280258, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.4265279583875163, |
| "grad_norm": 0.19771147507803757, |
| "kl": 0.1904296875, |
| "learning_rate": 2e-06, |
| "loss": -0.001, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 388.65625, |
| "epoch": 0.42782834850455137, |
| "grad_norm": 1.073532799875203, |
| "kl": 0.18359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "reward": 0.5786818265914917, |
| "reward_std": 0.37675005197525024, |
| "rewards/preference_model_reward": 0.5786818265914917, |
| "rewards/preference_model_reward/std": 0.38477084040641785, |
| "step": 329 |
| }, |
| { |
| "clip_ratio": 0.00016030779806897044, |
| "epoch": 0.42912873862158646, |
| "grad_norm": 1.059287777640083, |
| "kl": 0.1865234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0039, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 412.75, |
| "epoch": 0.4304291287386216, |
| "grad_norm": 0.7820217805815579, |
| "kl": 0.169921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0028, |
| "reward": 0.8559874892234802, |
| "reward_std": 0.1847127079963684, |
| "rewards/preference_model_reward": 0.8559874892234802, |
| "rewards/preference_model_reward/std": 0.295710951089859, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 8.996042015496641e-05, |
| "epoch": 0.4317295188556567, |
| "grad_norm": 0.7329357485601158, |
| "kl": 0.169921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 373.0625, |
| "epoch": 0.43302990897269183, |
| "grad_norm": 0.36553915145989274, |
| "kl": 0.201171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0017, |
| "reward": 0.9780337810516357, |
| "reward_std": 0.08786486089229584, |
| "rewards/preference_model_reward": 0.9780337810516357, |
| "rewards/preference_model_reward/std": 0.12425968050956726, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.00035080796806141734, |
| "epoch": 0.4343302990897269, |
| "grad_norm": 0.3148042609460324, |
| "kl": 0.201171875, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 341.875, |
| "epoch": 0.435630689206762, |
| "grad_norm": 0.5432127291313187, |
| "kl": 0.1875, |
| "learning_rate": 2e-06, |
| "loss": 0.0011, |
| "reward": 0.9440739750862122, |
| "reward_std": 0.14797841012477875, |
| "rewards/preference_model_reward": 0.9440739750862122, |
| "rewards/preference_model_reward/std": 0.21356752514839172, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.00030156815773807466, |
| "epoch": 0.43693107932379716, |
| "grad_norm": 0.49846748282978737, |
| "kl": 0.1884765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 291.15625, |
| "epoch": 0.43823146944083224, |
| "grad_norm": 0.5736680592738835, |
| "kl": 0.1748046875, |
| "learning_rate": 2e-06, |
| "loss": -0.003, |
| "reward": 0.8900240659713745, |
| "reward_std": 0.24245613813400269, |
| "rewards/preference_model_reward": 0.8900240659713745, |
| "rewards/preference_model_reward/std": 0.2557799518108368, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 8.4745763160754e-05, |
| "epoch": 0.43953185955786733, |
| "grad_norm": 0.54020537665832, |
| "kl": 0.177734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0034, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.21875, |
| "epoch": 0.4408322496749025, |
| "grad_norm": 0.9480813610454677, |
| "kl": 0.2373046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0017, |
| "reward": 0.5154986381530762, |
| "reward_std": 0.3188796639442444, |
| "rewards/preference_model_reward": 0.5154986381530762, |
| "rewards/preference_model_reward/std": 0.4238376319408417, |
| "step": 339 |
| }, |
| { |
| "clip_ratio": 0.00020426370610948652, |
| "epoch": 0.44213263979193757, |
| "grad_norm": 0.9501018563704864, |
| "kl": 0.240234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 420.59375, |
| "epoch": 0.4434330299089727, |
| "grad_norm": 0.845647715547278, |
| "kl": 0.201171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0021, |
| "reward": 0.8065922260284424, |
| "reward_std": 0.2574193477630615, |
| "rewards/preference_model_reward": 0.8065922260284424, |
| "rewards/preference_model_reward/std": 0.29339519143104553, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.00031662482069805264, |
| "epoch": 0.4447334200260078, |
| "grad_norm": 0.8751571372465456, |
| "kl": 0.203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0026, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.15625, |
| "epoch": 0.4460338101430429, |
| "grad_norm": 0.730275481610921, |
| "kl": 0.234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0063, |
| "reward": 0.900632381439209, |
| "reward_std": 0.2498612105846405, |
| "rewards/preference_model_reward": 0.900632381439209, |
| "rewards/preference_model_reward/std": 0.256816565990448, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.00030949688516557217, |
| "epoch": 0.44733420026007803, |
| "grad_norm": 0.6892449500597292, |
| "kl": 0.2373046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0068, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 284.4375, |
| "epoch": 0.4486345903771131, |
| "grad_norm": 0.8200871031591095, |
| "kl": 0.166015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0001, |
| "reward": 0.8631854057312012, |
| "reward_std": 0.19240587949752808, |
| "rewards/preference_model_reward": 0.8631854057312012, |
| "rewards/preference_model_reward/std": 0.30161845684051514, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0002665245265234262, |
| "epoch": 0.44993498049414826, |
| "grad_norm": 0.7466247621387166, |
| "kl": 0.177734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0004, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 385.46875, |
| "epoch": 0.45123537061118335, |
| "grad_norm": 1.0742987274568128, |
| "kl": 0.216796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0, |
| "reward": 0.8062537908554077, |
| "reward_std": 0.22240078449249268, |
| "rewards/preference_model_reward": 0.8062537908554077, |
| "rewards/preference_model_reward/std": 0.36671730875968933, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 8.4373947174754e-05, |
| "epoch": 0.45253576072821844, |
| "grad_norm": 0.8886627945755464, |
| "kl": 0.21875, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 548.28125, |
| "epoch": 0.4538361508452536, |
| "grad_norm": 0.3764239056895551, |
| "kl": 0.212890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0004, |
| "reward": 0.9814817309379578, |
| "reward_std": 0.07407312840223312, |
| "rewards/preference_model_reward": 0.9814817309379578, |
| "rewards/preference_model_reward/std": 0.1047552078962326, |
| "step": 349 |
| }, |
| { |
| "clip_ratio": 0.00011498251114971936, |
| "epoch": 0.45513654096228867, |
| "grad_norm": 0.3492401522936101, |
| "kl": 0.2158203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.4375, |
| "epoch": 0.4564369310793238, |
| "grad_norm": 0.8053126037042443, |
| "kl": 0.265625, |
| "learning_rate": 2e-06, |
| "loss": 0.0022, |
| "reward": 0.8822987079620361, |
| "reward_std": 0.19238536059856415, |
| "rewards/preference_model_reward": 0.8822987079620361, |
| "rewards/preference_model_reward/std": 0.24105463922023773, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0005687876255251467, |
| "epoch": 0.4577373211963589, |
| "grad_norm": 0.7744649363558931, |
| "kl": 0.267578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0016, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.15625, |
| "epoch": 0.459037711313394, |
| "grad_norm": 0.4163188535794671, |
| "kl": 0.1962890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0004, |
| "reward": 0.9581733345985413, |
| "reward_std": 0.09610553830862045, |
| "rewards/preference_model_reward": 0.9581733345985413, |
| "rewards/preference_model_reward/std": 0.14029455184936523, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 9.426847827853635e-05, |
| "epoch": 0.46033810143042914, |
| "grad_norm": 0.37210922204260644, |
| "kl": 0.197265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.875, |
| "epoch": 0.4616384915474642, |
| "grad_norm": 0.443699445192129, |
| "kl": 0.2255859375, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "reward": 0.9650442600250244, |
| "reward_std": 0.09929230064153671, |
| "rewards/preference_model_reward": 0.9650442600250244, |
| "rewards/preference_model_reward/std": 0.14262951910495758, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 7.941549847600982e-05, |
| "epoch": 0.46293888166449937, |
| "grad_norm": 0.3525018468251512, |
| "kl": 0.2275390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.96875, |
| "epoch": 0.46423927178153446, |
| "grad_norm": 0.037646114609881515, |
| "kl": 0.26171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.46553966189856955, |
| "grad_norm": 0.022323449072510736, |
| "kl": 0.248046875, |
| "learning_rate": 2e-06, |
| "loss": 0.0003, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.5, |
| "epoch": 0.4668400520156047, |
| "grad_norm": 0.047585204521953244, |
| "kl": 0.1884765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "reward": 0.9968596696853638, |
| "reward_std": 0.012561214156448841, |
| "rewards/preference_model_reward": 0.9968596696853638, |
| "rewards/preference_model_reward/std": 0.017764244228601456, |
| "step": 359 |
| }, |
| { |
| "clip_ratio": 0.0003033555403817445, |
| "epoch": 0.4681404421326398, |
| "grad_norm": 0.04158159498021269, |
| "kl": 0.1748046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0008, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 453.0625, |
| "epoch": 0.4694408322496749, |
| "grad_norm": 0.4699175632650768, |
| "kl": 0.18359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0061, |
| "reward": 0.9376707673072815, |
| "reward_std": 0.1413489133119583, |
| "rewards/preference_model_reward": 0.9376707673072815, |
| "rewards/preference_model_reward/std": 0.1997252255678177, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0010629449971020222, |
| "epoch": 0.47074122236671, |
| "grad_norm": 0.4443824446386433, |
| "kl": 0.1748046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0065, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.8125, |
| "epoch": 0.4720416124837451, |
| "grad_norm": 0.5928527775265698, |
| "kl": 0.193359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 0.9526033997535706, |
| "reward_std": 0.18958628177642822, |
| "rewards/preference_model_reward": 0.9526033997535706, |
| "rewards/preference_model_reward/std": 0.19250237941741943, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.000205172153073363, |
| "epoch": 0.47334200260078024, |
| "grad_norm": 0.5475249341379819, |
| "kl": 0.1865234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0003, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.6875, |
| "epoch": 0.47464239271781533, |
| "grad_norm": 0.9292294703178672, |
| "kl": 0.17578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0052, |
| "reward": 0.8800602555274963, |
| "reward_std": 0.22069776058197021, |
| "rewards/preference_model_reward": 0.8800602555274963, |
| "rewards/preference_model_reward/std": 0.2782754898071289, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0006039842264726758, |
| "epoch": 0.4759427828348505, |
| "grad_norm": 0.8278406545693311, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0058, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.46875, |
| "epoch": 0.47724317295188556, |
| "grad_norm": 1.7035086337060783, |
| "kl": 0.1640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0226, |
| "reward": 0.7321191430091858, |
| "reward_std": 0.39982855319976807, |
| "rewards/preference_model_reward": 0.7321191430091858, |
| "rewards/preference_model_reward/std": 0.39758551120758057, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 9.947569924406707e-05, |
| "epoch": 0.47854356306892065, |
| "grad_norm": 1.6398878874883134, |
| "kl": 0.162109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0212, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.0625, |
| "epoch": 0.4798439531859558, |
| "grad_norm": 1.0250430126617982, |
| "kl": 0.154296875, |
| "learning_rate": 2e-06, |
| "loss": 0.0073, |
| "reward": 0.846257746219635, |
| "reward_std": 0.2683815360069275, |
| "rewards/preference_model_reward": 0.846257746219635, |
| "rewards/preference_model_reward/std": 0.2826293110847473, |
| "step": 369 |
| }, |
| { |
| "clip_ratio": 0.0001257861586054787, |
| "epoch": 0.4811443433029909, |
| "grad_norm": 1.001787366115385, |
| "kl": 0.1533203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0065, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.53125, |
| "epoch": 0.48244473342002603, |
| "grad_norm": 1.3025780868642676, |
| "kl": 0.13671875, |
| "learning_rate": 2e-06, |
| "loss": 0.025, |
| "reward": 0.8562784790992737, |
| "reward_std": 0.3221096098423004, |
| "rewards/preference_model_reward": 0.8562784790992737, |
| "rewards/preference_model_reward/std": 0.3274212181568146, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.4837451235370611, |
| "grad_norm": 1.2670088001321425, |
| "kl": 0.13671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0241, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 557.25, |
| "epoch": 0.4850455136540962, |
| "grad_norm": 0.396947988021042, |
| "kl": 0.1318359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0003, |
| "reward": 0.9703431129455566, |
| "reward_std": 0.08718589693307877, |
| "rewards/preference_model_reward": 0.9703431129455566, |
| "rewards/preference_model_reward/std": 0.08997520059347153, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 8.115790114970878e-05, |
| "epoch": 0.48634590377113135, |
| "grad_norm": 0.3588001484684231, |
| "kl": 0.1318359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.75, |
| "epoch": 0.48764629388816644, |
| "grad_norm": 1.0349158159751568, |
| "kl": 0.17578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0081, |
| "reward": 0.5225076675415039, |
| "reward_std": 0.3107752799987793, |
| "rewards/preference_model_reward": 0.5225076675415039, |
| "rewards/preference_model_reward/std": 0.4560634195804596, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0003945899079553783, |
| "epoch": 0.4889466840052016, |
| "grad_norm": 1.0294645469560724, |
| "kl": 0.17578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0089, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.65625, |
| "epoch": 0.49024707412223667, |
| "grad_norm": 0.12293364932946799, |
| "kl": 0.1650390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 0.9920886754989624, |
| "reward_std": 0.03164532035589218, |
| "rewards/preference_model_reward": 0.9920886754989624, |
| "rewards/preference_model_reward/std": 0.044753238558769226, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.49154746423927176, |
| "grad_norm": 0.11828689260322871, |
| "kl": 0.1630859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.90625, |
| "epoch": 0.4928478543563069, |
| "grad_norm": 0.8350931514014152, |
| "kl": 0.1669921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0049, |
| "reward": 0.8592836856842041, |
| "reward_std": 0.22580870985984802, |
| "rewards/preference_model_reward": 0.8592836856842041, |
| "rewards/preference_model_reward/std": 0.28813624382019043, |
| "step": 379 |
| }, |
| { |
| "clip_ratio": 0.00024522157036699355, |
| "epoch": 0.494148244473342, |
| "grad_norm": 0.7878789832148902, |
| "kl": 0.1669921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0054, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 458.40625, |
| "epoch": 0.49544863459037713, |
| "grad_norm": 1.3819275414815289, |
| "kl": 0.14453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0319, |
| "reward": 0.6078048348426819, |
| "reward_std": 0.39275383949279785, |
| "rewards/preference_model_reward": 0.6078048348426819, |
| "rewards/preference_model_reward/std": 0.4531180262565613, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 4.539676956483163e-05, |
| "epoch": 0.4967490247074122, |
| "grad_norm": 1.3480746769326, |
| "kl": 0.14453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0309, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 530.90625, |
| "epoch": 0.4980494148244473, |
| "grad_norm": 1.0915508320963603, |
| "kl": 0.1630859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0096, |
| "reward": 0.9106493592262268, |
| "reward_std": 0.26455527544021606, |
| "rewards/preference_model_reward": 0.9106493592262268, |
| "rewards/preference_model_reward/std": 0.26605573296546936, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0006032996461726725, |
| "epoch": 0.49934980494148246, |
| "grad_norm": 1.0062396438915018, |
| "kl": 0.1640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0089, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.28125, |
| "epoch": 0.5006501950585176, |
| "grad_norm": 0.815339253376103, |
| "kl": 0.16015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0019, |
| "reward": 0.6707455515861511, |
| "reward_std": 0.2263924926519394, |
| "rewards/preference_model_reward": 0.6707455515861511, |
| "rewards/preference_model_reward/std": 0.40456080436706543, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 8.218277798732743e-05, |
| "epoch": 0.5019505851755527, |
| "grad_norm": 0.8169128748708187, |
| "kl": 0.1611328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0013, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 481.03125, |
| "epoch": 0.5032509752925878, |
| "grad_norm": 1.187671199008333, |
| "kl": 0.177734375, |
| "learning_rate": 2e-06, |
| "loss": 0.0014, |
| "reward": 0.8174352049827576, |
| "reward_std": 0.3356226086616516, |
| "rewards/preference_model_reward": 0.8174352049827576, |
| "rewards/preference_model_reward/std": 0.35391342639923096, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 8.300133049488068e-05, |
| "epoch": 0.5045513654096229, |
| "grad_norm": 1.1173042462876819, |
| "kl": 0.177734375, |
| "learning_rate": 2e-06, |
| "loss": 0.0005, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.21875, |
| "epoch": 0.505851755526658, |
| "grad_norm": 0.9197781852352046, |
| "kl": 0.1630859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "reward": 0.6285791993141174, |
| "reward_std": 0.27514463663101196, |
| "rewards/preference_model_reward": 0.6285791993141174, |
| "rewards/preference_model_reward/std": 0.43319857120513916, |
| "step": 389 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5071521456436932, |
| "grad_norm": 1.0705592825656165, |
| "kl": 0.1640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.9375, |
| "epoch": 0.5084525357607282, |
| "grad_norm": 1.2228961699827323, |
| "kl": 0.1513671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0048, |
| "reward": 0.5957476496696472, |
| "reward_std": 0.40599602460861206, |
| "rewards/preference_model_reward": 0.5957476496696472, |
| "rewards/preference_model_reward/std": 0.4048064649105072, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 6.316321378108114e-05, |
| "epoch": 0.5097529258777633, |
| "grad_norm": 1.1490955590907963, |
| "kl": 0.1533203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0056, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 458.34375, |
| "epoch": 0.5110533159947984, |
| "grad_norm": 1.2320099034869054, |
| "kl": 0.189453125, |
| "learning_rate": 2e-06, |
| "loss": 0.025, |
| "reward": 0.7658855319023132, |
| "reward_std": 0.33958619832992554, |
| "rewards/preference_model_reward": 0.7658855319023132, |
| "rewards/preference_model_reward/std": 0.36572444438934326, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5123537061118335, |
| "grad_norm": 1.1909134434235307, |
| "kl": 0.1923828125, |
| "learning_rate": 2e-06, |
| "loss": 0.024, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 335.21875, |
| "epoch": 0.5136540962288687, |
| "grad_norm": 0.4436447730361839, |
| "kl": 0.1728515625, |
| "learning_rate": 2e-06, |
| "loss": -0.0077, |
| "reward": 0.4475706219673157, |
| "reward_std": 0.1472529023885727, |
| "rewards/preference_model_reward": 0.4475706219673157, |
| "rewards/preference_model_reward/std": 0.4426521062850952, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0003680627851281315, |
| "epoch": 0.5149544863459038, |
| "grad_norm": 0.44133914560284426, |
| "kl": 0.1748046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0081, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.40625, |
| "epoch": 0.5162548764629389, |
| "grad_norm": 1.027174535394778, |
| "kl": 0.142578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0247, |
| "reward": 0.5502924919128418, |
| "reward_std": 0.3194156885147095, |
| "rewards/preference_model_reward": 0.5502924919128418, |
| "rewards/preference_model_reward/std": 0.404392808675766, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 7.11642496753484e-05, |
| "epoch": 0.517555266579974, |
| "grad_norm": 0.9543587594708557, |
| "kl": 0.1455078125, |
| "learning_rate": 2e-06, |
| "loss": 0.024, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.125, |
| "epoch": 0.5188556566970091, |
| "grad_norm": 0.4271536388521303, |
| "kl": 0.154296875, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "reward": 0.48311230540275574, |
| "reward_std": 0.1016714870929718, |
| "rewards/preference_model_reward": 0.48311230540275574, |
| "rewards/preference_model_reward/std": 0.4811513423919678, |
| "step": 399 |
| }, |
| { |
| "clip_ratio": 0.0003734786878339946, |
| "epoch": 0.5201560468140443, |
| "grad_norm": 0.5666357587808477, |
| "kl": 0.154296875, |
| "learning_rate": 2e-06, |
| "loss": 0.0005, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 407.125, |
| "epoch": 0.5214564369310793, |
| "grad_norm": 0.3796776014953804, |
| "kl": 0.1845703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 0.9658941030502319, |
| "reward_std": 0.09363856911659241, |
| "rewards/preference_model_reward": 0.9658941030502319, |
| "rewards/preference_model_reward/std": 0.1329784095287323, |
| "step": 401 |
| }, |
| { |
| "clip_ratio": 0.0003180578933097422, |
| "epoch": 0.5227568270481144, |
| "grad_norm": 0.3647794628722391, |
| "kl": 0.1865234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0001, |
| "step": 402 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.34375, |
| "epoch": 0.5240572171651495, |
| "grad_norm": 0.5011856408174054, |
| "kl": 0.1533203125, |
| "learning_rate": 2e-06, |
| "loss": -0.006, |
| "reward": 0.9527279734611511, |
| "reward_std": 0.09915541857481003, |
| "rewards/preference_model_reward": 0.9527279734611511, |
| "rewards/preference_model_reward/std": 0.1460685133934021, |
| "step": 403 |
| }, |
| { |
| "clip_ratio": 5.3544656111625955e-05, |
| "epoch": 0.5253576072821846, |
| "grad_norm": 0.4587873209186705, |
| "kl": 0.15625, |
| "learning_rate": 2e-06, |
| "loss": -0.0064, |
| "step": 404 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 339.0, |
| "epoch": 0.5266579973992198, |
| "grad_norm": 0.6507606398747722, |
| "kl": 0.177734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "reward": 0.9481613636016846, |
| "reward_std": 0.20735451579093933, |
| "rewards/preference_model_reward": 0.9481613636016846, |
| "rewards/preference_model_reward/std": 0.20464079082012177, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0006553526036441326, |
| "epoch": 0.5279583875162549, |
| "grad_norm": 0.5683220760296074, |
| "kl": 0.1787109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "step": 406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 397.34375, |
| "epoch": 0.52925877763329, |
| "grad_norm": 0.46341453825755263, |
| "kl": 0.1865234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 0.9362279772758484, |
| "reward_std": 0.11489400267601013, |
| "rewards/preference_model_reward": 0.9362279772758484, |
| "rewards/preference_model_reward/std": 0.17247511446475983, |
| "step": 407 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5305591677503251, |
| "grad_norm": 0.435386222165232, |
| "kl": 0.189453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0001, |
| "step": 408 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 415.125, |
| "epoch": 0.5318595578673602, |
| "grad_norm": 0.5058190173177521, |
| "kl": 0.1650390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "reward": 0.8493468165397644, |
| "reward_std": 0.14191898703575134, |
| "rewards/preference_model_reward": 0.8493468165397644, |
| "rewards/preference_model_reward/std": 0.24982212483882904, |
| "step": 409 |
| }, |
| { |
| "clip_ratio": 0.000360783189535141, |
| "epoch": 0.5331599479843954, |
| "grad_norm": 0.49350987150616776, |
| "kl": 0.1640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0027, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.96875, |
| "epoch": 0.5344603381014305, |
| "grad_norm": 0.11849056766792637, |
| "kl": 0.2060546875, |
| "learning_rate": 2e-06, |
| "loss": 0.0004, |
| "reward": 0.993826150894165, |
| "reward_std": 0.024695372208952904, |
| "rewards/preference_model_reward": 0.993826150894165, |
| "rewards/preference_model_reward/std": 0.03492453321814537, |
| "step": 411 |
| }, |
| { |
| "clip_ratio": 5.724754009861499e-05, |
| "epoch": 0.5357607282184655, |
| "grad_norm": 0.11249502423658887, |
| "kl": 0.2060546875, |
| "learning_rate": 2e-06, |
| "loss": 0.0003, |
| "step": 412 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.28125, |
| "epoch": 0.5370611183355006, |
| "grad_norm": 0.4928407072097148, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "reward": 0.9763680100440979, |
| "reward_std": 0.08725623041391373, |
| "rewards/preference_model_reward": 0.9763680100440979, |
| "rewards/preference_model_reward/std": 0.0860796794295311, |
| "step": 413 |
| }, |
| { |
| "clip_ratio": 0.0006810087943449616, |
| "epoch": 0.5383615084525357, |
| "grad_norm": 0.25338692807833185, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "step": 414 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.28125, |
| "epoch": 0.5396618985695709, |
| "grad_norm": 0.1908824259718182, |
| "kl": 0.1962890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0005, |
| "reward": 0.9887747764587402, |
| "reward_std": 0.04490102827548981, |
| "rewards/preference_model_reward": 0.9887747764587402, |
| "rewards/preference_model_reward/std": 0.06349964439868927, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 6.952168769203126e-05, |
| "epoch": 0.540962288686606, |
| "grad_norm": 0.1787252912416434, |
| "kl": 0.1953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0004, |
| "step": 416 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.4375, |
| "epoch": 0.5422626788036411, |
| "grad_norm": 0.9123177644363701, |
| "kl": 0.193359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0143, |
| "reward": 0.9695033431053162, |
| "reward_std": 0.12198655307292938, |
| "rewards/preference_model_reward": 0.9695033431053162, |
| "rewards/preference_model_reward/std": 0.17251503467559814, |
| "step": 417 |
| }, |
| { |
| "clip_ratio": 9.883376333164051e-05, |
| "epoch": 0.5435630689206762, |
| "grad_norm": 0.9162834478633793, |
| "kl": 0.193359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0137, |
| "step": 418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.03125, |
| "epoch": 0.5448634590377113, |
| "grad_norm": 0.7437466671675131, |
| "kl": 0.1826171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0012, |
| "reward": 0.8951805830001831, |
| "reward_std": 0.1884705126285553, |
| "rewards/preference_model_reward": 0.8951805830001831, |
| "rewards/preference_model_reward/std": 0.2830054759979248, |
| "step": 419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5461638491547465, |
| "grad_norm": 0.6744284019984438, |
| "kl": 0.18359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0017, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.03125, |
| "epoch": 0.5474642392717816, |
| "grad_norm": 0.46776056087514883, |
| "kl": 0.21484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "reward": 0.9680905342102051, |
| "reward_std": 0.08884235471487045, |
| "rewards/preference_model_reward": 0.9680905342102051, |
| "rewards/preference_model_reward/std": 0.12778013944625854, |
| "step": 421 |
| }, |
| { |
| "clip_ratio": 0.00018961718888022006, |
| "epoch": 0.5487646293888166, |
| "grad_norm": 0.4036332714818817, |
| "kl": 0.2138671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0027, |
| "step": 422 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.0625, |
| "epoch": 0.5500650195058517, |
| "grad_norm": 0.8298529686690673, |
| "kl": 0.1953125, |
| "learning_rate": 2e-06, |
| "loss": -0.013, |
| "reward": 0.8935482501983643, |
| "reward_std": 0.29745957255363464, |
| "rewards/preference_model_reward": 0.8935482501983643, |
| "rewards/preference_model_reward/std": 0.2955591082572937, |
| "step": 423 |
| }, |
| { |
| "clip_ratio": 0.0002922832500189543, |
| "epoch": 0.5513654096228868, |
| "grad_norm": 0.7804747694003971, |
| "kl": 0.1943359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0136, |
| "step": 424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.15625, |
| "epoch": 0.552665799739922, |
| "grad_norm": 0.5767173027038802, |
| "kl": 0.2265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0018, |
| "reward": 0.9546047449111938, |
| "reward_std": 0.12968912720680237, |
| "rewards/preference_model_reward": 0.9546047449111938, |
| "rewards/preference_model_reward/std": 0.18622736632823944, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5539661898569571, |
| "grad_norm": 0.5385672477616652, |
| "kl": 0.2265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "step": 426 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 481.125, |
| "epoch": 0.5552665799739922, |
| "grad_norm": 0.7743174896574186, |
| "kl": 0.2001953125, |
| "learning_rate": 2e-06, |
| "loss": -0.012, |
| "reward": 0.885873019695282, |
| "reward_std": 0.18265797197818756, |
| "rewards/preference_model_reward": 0.885873019695282, |
| "rewards/preference_model_reward/std": 0.27932146191596985, |
| "step": 427 |
| }, |
| { |
| "clip_ratio": 0.00013702338037546724, |
| "epoch": 0.5565669700910273, |
| "grad_norm": 0.7444105184385629, |
| "kl": 0.201171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0126, |
| "step": 428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 559.375, |
| "epoch": 0.5578673602080624, |
| "grad_norm": 1.1477604387216325, |
| "kl": 0.224609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0163, |
| "reward": 0.8528430461883545, |
| "reward_std": 0.22550532221794128, |
| "rewards/preference_model_reward": 0.8528430461883545, |
| "rewards/preference_model_reward/std": 0.34753158688545227, |
| "step": 429 |
| }, |
| { |
| "clip_ratio": 0.00011868027650052682, |
| "epoch": 0.5591677503250976, |
| "grad_norm": 1.0326779519681246, |
| "kl": 0.2255859375, |
| "learning_rate": 2e-06, |
| "loss": -0.017, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.3125, |
| "epoch": 0.5604681404421327, |
| "grad_norm": 0.9071727949790038, |
| "kl": 0.1962890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0084, |
| "reward": 0.8113287091255188, |
| "reward_std": 0.19848331809043884, |
| "rewards/preference_model_reward": 0.8113287091255188, |
| "rewards/preference_model_reward/std": 0.33614689111709595, |
| "step": 431 |
| }, |
| { |
| "clip_ratio": 7.271669892361388e-05, |
| "epoch": 0.5617685305591678, |
| "grad_norm": 0.8455173031641725, |
| "kl": 0.197265625, |
| "learning_rate": 2e-06, |
| "loss": -0.009, |
| "step": 432 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 604.71875, |
| "epoch": 0.5630689206762028, |
| "grad_norm": 1.3857673138070365, |
| "kl": 0.2099609375, |
| "learning_rate": 2e-06, |
| "loss": 0.0168, |
| "reward": 0.8483308553695679, |
| "reward_std": 0.325148344039917, |
| "rewards/preference_model_reward": 0.8483308553695679, |
| "rewards/preference_model_reward/std": 0.32090431451797485, |
| "step": 433 |
| }, |
| { |
| "clip_ratio": 0.0003254468902014196, |
| "epoch": 0.5643693107932379, |
| "grad_norm": 1.3314635785340656, |
| "kl": 0.2158203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0157, |
| "step": 434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.28125, |
| "epoch": 0.5656697009102731, |
| "grad_norm": 0.5989680573760588, |
| "kl": 0.2177734375, |
| "learning_rate": 2e-06, |
| "loss": 0.0099, |
| "reward": 0.9779109358787537, |
| "reward_std": 0.08835619688034058, |
| "rewards/preference_model_reward": 0.9779109358787537, |
| "rewards/preference_model_reward/std": 0.12495452910661697, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5669700910273082, |
| "grad_norm": 0.5634736482208081, |
| "kl": 0.21875, |
| "learning_rate": 2e-06, |
| "loss": 0.0094, |
| "step": 436 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 463.28125, |
| "epoch": 0.5682704811443433, |
| "grad_norm": 0.33184977159461365, |
| "kl": 0.2333984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0026, |
| "reward": 0.9590626955032349, |
| "reward_std": 0.10807390511035919, |
| "rewards/preference_model_reward": 0.9590626955032349, |
| "rewards/preference_model_reward/std": 0.11148703843355179, |
| "step": 437 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5695708712613784, |
| "grad_norm": 0.3161029328314251, |
| "kl": 0.234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0028, |
| "step": 438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.53125, |
| "epoch": 0.5708712613784135, |
| "grad_norm": 0.9314909195166072, |
| "kl": 0.2197265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0062, |
| "reward": 0.9457427859306335, |
| "reward_std": 0.17563901841640472, |
| "rewards/preference_model_reward": 0.9457427859306335, |
| "rewards/preference_model_reward/std": 0.17885959148406982, |
| "step": 439 |
| }, |
| { |
| "clip_ratio": 0.0005498891696333885, |
| "epoch": 0.5721716514954487, |
| "grad_norm": 0.6109650911373344, |
| "kl": 0.2216796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0065, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 580.375, |
| "epoch": 0.5734720416124838, |
| "grad_norm": 0.02581098267581845, |
| "kl": 0.21484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 441 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5747724317295189, |
| "grad_norm": 0.01979962238447969, |
| "kl": 0.201171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.3125, |
| "epoch": 0.576072821846554, |
| "grad_norm": 0.4804861704479814, |
| "kl": 0.228515625, |
| "learning_rate": 2e-06, |
| "loss": 0.0016, |
| "reward": 0.9702784419059753, |
| "reward_std": 0.09500616788864136, |
| "rewards/preference_model_reward": 0.9702784419059753, |
| "rewards/preference_model_reward/std": 0.13557977974414825, |
| "step": 443 |
| }, |
| { |
| "clip_ratio": 6.139489414636046e-05, |
| "epoch": 0.577373211963589, |
| "grad_norm": 0.4764924275579863, |
| "kl": 0.21875, |
| "learning_rate": 2e-06, |
| "loss": 0.0013, |
| "step": 444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.3125, |
| "epoch": 0.5786736020806242, |
| "grad_norm": 0.9864886855099723, |
| "kl": 0.1943359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0046, |
| "reward": 0.8668668270111084, |
| "reward_std": 0.297611802816391, |
| "rewards/preference_model_reward": 0.8668668270111084, |
| "rewards/preference_model_reward/std": 0.30994293093681335, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5799739921976593, |
| "grad_norm": 0.8321524897920206, |
| "kl": 0.1904296875, |
| "learning_rate": 2e-06, |
| "loss": -0.0051, |
| "step": 446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.9375, |
| "epoch": 0.5812743823146944, |
| "grad_norm": 1.1409128393837655, |
| "kl": 0.2275390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0028, |
| "reward": 0.795079231262207, |
| "reward_std": 0.31667160987854004, |
| "rewards/preference_model_reward": 0.795079231262207, |
| "rewards/preference_model_reward/std": 0.34156370162963867, |
| "step": 447 |
| }, |
| { |
| "clip_ratio": 0.00022159164655022323, |
| "epoch": 0.5825747724317295, |
| "grad_norm": 1.0738948095730507, |
| "kl": 0.2275390625, |
| "learning_rate": 2e-06, |
| "loss": 0.002, |
| "step": 448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.40625, |
| "epoch": 0.5838751625487646, |
| "grad_norm": 1.507954566094587, |
| "kl": 0.162109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0013, |
| "reward": 0.5813151597976685, |
| "reward_std": 0.38100069761276245, |
| "rewards/preference_model_reward": 0.5813151597976685, |
| "rewards/preference_model_reward/std": 0.4741608202457428, |
| "step": 449 |
| }, |
| { |
| "clip_ratio": 0.0003506769426167011, |
| "epoch": 0.5851755526657998, |
| "grad_norm": 1.4082782499892905, |
| "kl": 0.162109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0023, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 568.09375, |
| "epoch": 0.5864759427828349, |
| "grad_norm": 1.5418403027222796, |
| "kl": 0.1796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0157, |
| "reward": 0.6295768618583679, |
| "reward_std": 0.4063897132873535, |
| "rewards/preference_model_reward": 0.6295768618583679, |
| "rewards/preference_model_reward/std": 0.4200841784477234, |
| "step": 451 |
| }, |
| { |
| "clip_ratio": 0.00017533147183712572, |
| "epoch": 0.58777633289987, |
| "grad_norm": 1.4908308242341415, |
| "kl": 0.1806640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0147, |
| "step": 452 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.71875, |
| "epoch": 0.5890767230169051, |
| "grad_norm": 0.3291705830428915, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "reward": 0.9669902324676514, |
| "reward_std": 0.0894487202167511, |
| "rewards/preference_model_reward": 0.9669902324676514, |
| "rewards/preference_model_reward/std": 0.10228119045495987, |
| "step": 453 |
| }, |
| { |
| "clip_ratio": 0.0001948475546669215, |
| "epoch": 0.5903771131339401, |
| "grad_norm": 0.3089967159758185, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "step": 454 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.6875, |
| "epoch": 0.5916775032509753, |
| "grad_norm": 0.777625282632689, |
| "kl": 0.1748046875, |
| "learning_rate": 2e-06, |
| "loss": 0.0116, |
| "reward": 0.9618591070175171, |
| "reward_std": 0.12204061448574066, |
| "rewards/preference_model_reward": 0.9618591070175171, |
| "rewards/preference_model_reward/std": 0.1741509884595871, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.5929778933680104, |
| "grad_norm": 0.7325722818853695, |
| "kl": 0.17578125, |
| "learning_rate": 2e-06, |
| "loss": 0.011, |
| "step": 456 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.3125, |
| "epoch": 0.5942782834850455, |
| "grad_norm": 0.5246149237440364, |
| "kl": 0.185546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0037, |
| "reward": 0.9700804948806763, |
| "reward_std": 0.11967816203832626, |
| "rewards/preference_model_reward": 0.9700804948806763, |
| "rewards/preference_model_reward/std": 0.1692504733800888, |
| "step": 457 |
| }, |
| { |
| "clip_ratio": 0.00017141705029644072, |
| "epoch": 0.5955786736020806, |
| "grad_norm": 0.4704269534446643, |
| "kl": 0.1865234375, |
| "learning_rate": 2e-06, |
| "loss": -0.004, |
| "step": 458 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 418.4375, |
| "epoch": 0.5968790637191157, |
| "grad_norm": 0.99830704358775, |
| "kl": 0.208984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0132, |
| "reward": 0.8735105395317078, |
| "reward_std": 0.1765914112329483, |
| "rewards/preference_model_reward": 0.8735105395317078, |
| "rewards/preference_model_reward/std": 0.27725955843925476, |
| "step": 459 |
| }, |
| { |
| "clip_ratio": 0.0006676804041489959, |
| "epoch": 0.5981794538361509, |
| "grad_norm": 0.9276264800319912, |
| "kl": 0.2099609375, |
| "learning_rate": 2e-06, |
| "loss": 0.0127, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 600.625, |
| "epoch": 0.599479843953186, |
| "grad_norm": 1.5304815918050108, |
| "kl": 0.1826171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0092, |
| "reward": 0.8300485610961914, |
| "reward_std": 0.36765021085739136, |
| "rewards/preference_model_reward": 0.8300485610961914, |
| "rewards/preference_model_reward/std": 0.3629445433616638, |
| "step": 461 |
| }, |
| { |
| "clip_ratio": 9.010268695419654e-05, |
| "epoch": 0.6007802340702211, |
| "grad_norm": 1.4944547262094985, |
| "kl": 0.189453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0081, |
| "step": 462 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 556.125, |
| "epoch": 0.6020806241872562, |
| "grad_norm": 0.018760959897438687, |
| "kl": 0.20703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 463 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.6033810143042913, |
| "grad_norm": 0.01660210108604342, |
| "kl": 0.197265625, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 464 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 490.59375, |
| "epoch": 0.6046814044213265, |
| "grad_norm": 0.27288074072764534, |
| "kl": 0.205078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0053, |
| "reward": 0.9808217287063599, |
| "reward_std": 0.07671315968036652, |
| "rewards/preference_model_reward": 0.9808217287063599, |
| "rewards/preference_model_reward/std": 0.1084887906908989, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.00041510548908263445, |
| "epoch": 0.6059817945383615, |
| "grad_norm": 0.2036263491928161, |
| "kl": 0.1962890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0055, |
| "step": 466 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.375, |
| "epoch": 0.6072821846553966, |
| "grad_norm": 0.9138687673589736, |
| "kl": 0.1748046875, |
| "learning_rate": 2e-06, |
| "loss": 0.01, |
| "reward": 0.5181146264076233, |
| "reward_std": 0.2147461473941803, |
| "rewards/preference_model_reward": 0.5181146264076233, |
| "rewards/preference_model_reward/std": 0.4774966835975647, |
| "step": 467 |
| }, |
| { |
| "clip_ratio": 0.0008817376801744103, |
| "epoch": 0.6085825747724317, |
| "grad_norm": 0.884284869476855, |
| "kl": 0.1708984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0093, |
| "step": 468 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 578.03125, |
| "epoch": 0.6098829648894668, |
| "grad_norm": 1.373228383519465, |
| "kl": 0.1767578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0312, |
| "reward": 0.8574967980384827, |
| "reward_std": 0.3164219260215759, |
| "rewards/preference_model_reward": 0.8574967980384827, |
| "rewards/preference_model_reward/std": 0.3132801949977875, |
| "step": 469 |
| }, |
| { |
| "clip_ratio": 0.0002780166978482157, |
| "epoch": 0.611183355006502, |
| "grad_norm": 1.2773340380778375, |
| "kl": 0.173828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0302, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.875, |
| "epoch": 0.6124837451235371, |
| "grad_norm": 1.4916055964066572, |
| "kl": 0.1689453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0378, |
| "reward": 0.7772074341773987, |
| "reward_std": 0.3698381781578064, |
| "rewards/preference_model_reward": 0.7772074341773987, |
| "rewards/preference_model_reward/std": 0.36533990502357483, |
| "step": 471 |
| }, |
| { |
| "clip_ratio": 4.626202644431032e-05, |
| "epoch": 0.6137841352405722, |
| "grad_norm": 1.4536924831752602, |
| "kl": 0.169921875, |
| "learning_rate": 2e-06, |
| "loss": 0.0366, |
| "step": 472 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.875, |
| "epoch": 0.6150845253576073, |
| "grad_norm": 0.7544413510474123, |
| "kl": 0.173828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0057, |
| "reward": 0.9283210635185242, |
| "reward_std": 0.18047307431697845, |
| "rewards/preference_model_reward": 0.9283210635185242, |
| "rewards/preference_model_reward/std": 0.2003507763147354, |
| "step": 473 |
| }, |
| { |
| "clip_ratio": 0.0003117350279353559, |
| "epoch": 0.6163849154746424, |
| "grad_norm": 0.7045547292296246, |
| "kl": 0.173828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0051, |
| "step": 474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 668.59375, |
| "epoch": 0.6176853055916776, |
| "grad_norm": 0.7354870473019032, |
| "kl": 0.1591796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0047, |
| "reward": 0.9718723893165588, |
| "reward_std": 0.11251037567853928, |
| "rewards/preference_model_reward": 0.9718723893165588, |
| "rewards/preference_model_reward/std": 0.15911369025707245, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 3.537569136824459e-05, |
| "epoch": 0.6189856957087126, |
| "grad_norm": 0.6857184543038295, |
| "kl": 0.1591796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0041, |
| "step": 476 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.09375, |
| "epoch": 0.6202860858257477, |
| "grad_norm": 1.7321781240572358, |
| "kl": 0.15625, |
| "learning_rate": 2e-06, |
| "loss": 0.0378, |
| "reward": 0.797584056854248, |
| "reward_std": 0.3873823583126068, |
| "rewards/preference_model_reward": 0.797584056854248, |
| "rewards/preference_model_reward/std": 0.38197755813598633, |
| "step": 477 |
| }, |
| { |
| "clip_ratio": 4.785605051438324e-05, |
| "epoch": 0.6215864759427828, |
| "grad_norm": 1.7078683184871906, |
| "kl": 0.15625, |
| "learning_rate": 2e-06, |
| "loss": 0.0366, |
| "step": 478 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.28125, |
| "epoch": 0.6228868660598179, |
| "grad_norm": 1.3124273700119886, |
| "kl": 0.1796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0063, |
| "reward": 0.6355161666870117, |
| "reward_std": 0.4067726731300354, |
| "rewards/preference_model_reward": 0.6355161666870117, |
| "rewards/preference_model_reward/std": 0.4306652247905731, |
| "step": 479 |
| }, |
| { |
| "clip_ratio": 6.122948980191723e-05, |
| "epoch": 0.6241872561768531, |
| "grad_norm": 1.2747010389997053, |
| "kl": 0.1796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0073, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.8125, |
| "epoch": 0.6254876462938882, |
| "grad_norm": 1.1771905194134227, |
| "kl": 0.1669921875, |
| "learning_rate": 2e-06, |
| "loss": 0.0047, |
| "reward": 0.8450658321380615, |
| "reward_std": 0.32613372802734375, |
| "rewards/preference_model_reward": 0.8450658321380615, |
| "rewards/preference_model_reward/std": 0.32565778493881226, |
| "step": 481 |
| }, |
| { |
| "clip_ratio": 6.566850788658485e-05, |
| "epoch": 0.6267880364109233, |
| "grad_norm": 1.0949478432903983, |
| "kl": 0.16796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0037, |
| "step": 482 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.1875, |
| "epoch": 0.6280884265279584, |
| "grad_norm": 1.0345750868470442, |
| "kl": 0.1708984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0024, |
| "reward": 0.7638916373252869, |
| "reward_std": 0.3289935886859894, |
| "rewards/preference_model_reward": 0.7638916373252869, |
| "rewards/preference_model_reward/std": 0.33954110741615295, |
| "step": 483 |
| }, |
| { |
| "clip_ratio": 6.652474985457957e-05, |
| "epoch": 0.6293888166449935, |
| "grad_norm": 0.9527524423711179, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0017, |
| "step": 484 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 548.3125, |
| "epoch": 0.6306892067620286, |
| "grad_norm": 1.0248342945427826, |
| "kl": 0.19921875, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "reward": 0.8802142143249512, |
| "reward_std": 0.28954002261161804, |
| "rewards/preference_model_reward": 0.8802142143249512, |
| "rewards/preference_model_reward/std": 0.28504154086112976, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 4.90099992020987e-05, |
| "epoch": 0.6319895968790638, |
| "grad_norm": 0.9824475068268235, |
| "kl": 0.1923828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0, |
| "step": 486 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.90625, |
| "epoch": 0.6332899869960988, |
| "grad_norm": 1.0170620577468696, |
| "kl": 0.181640625, |
| "learning_rate": 2e-06, |
| "loss": -0.011, |
| "reward": 0.840887188911438, |
| "reward_std": 0.21575048565864563, |
| "rewards/preference_model_reward": 0.840887188911438, |
| "rewards/preference_model_reward/std": 0.3409208357334137, |
| "step": 487 |
| }, |
| { |
| "clip_ratio": 0.00013467390090227127, |
| "epoch": 0.6345903771131339, |
| "grad_norm": 0.9962152448073786, |
| "kl": 0.18359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0118, |
| "step": 488 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 453.25, |
| "epoch": 0.635890767230169, |
| "grad_norm": 1.1268507254797382, |
| "kl": 0.1923828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0048, |
| "reward": 0.837617039680481, |
| "reward_std": 0.21851858496665955, |
| "rewards/preference_model_reward": 0.837617039680481, |
| "rewards/preference_model_reward/std": 0.34588855504989624, |
| "step": 489 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.6371911573472041, |
| "grad_norm": 1.0885462907631436, |
| "kl": 0.1953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0039, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.09375, |
| "epoch": 0.6384915474642393, |
| "grad_norm": 0.9071689083575298, |
| "kl": 0.201171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0026, |
| "reward": 0.8059304356575012, |
| "reward_std": 0.20044738054275513, |
| "rewards/preference_model_reward": 0.8059304356575012, |
| "rewards/preference_model_reward/std": 0.341531902551651, |
| "step": 491 |
| }, |
| { |
| "clip_ratio": 6.603274960070848e-05, |
| "epoch": 0.6397919375812744, |
| "grad_norm": 0.8345599641034357, |
| "kl": 0.203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "step": 492 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.375, |
| "epoch": 0.6410923276983095, |
| "grad_norm": 0.6624858525413615, |
| "kl": 0.197265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0033, |
| "reward": 0.9255695939064026, |
| "reward_std": 0.14829862117767334, |
| "rewards/preference_model_reward": 0.9255695939064026, |
| "rewards/preference_model_reward/std": 0.21973776817321777, |
| "step": 493 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.6423927178153446, |
| "grad_norm": 0.6312105576362554, |
| "kl": 0.2001953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0038, |
| "step": 494 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 525.28125, |
| "epoch": 0.6436931079323797, |
| "grad_norm": 1.5486110651386202, |
| "kl": 0.20703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0059, |
| "reward": 0.8713526725769043, |
| "reward_std": 0.2359280288219452, |
| "rewards/preference_model_reward": 0.8713526725769043, |
| "rewards/preference_model_reward/std": 0.2887003421783447, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0002505861921235919, |
| "epoch": 0.6449934980494149, |
| "grad_norm": 1.0616419797076526, |
| "kl": 0.208984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0053, |
| "step": 496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 513.0, |
| "epoch": 0.64629388816645, |
| "grad_norm": 1.2046309521967586, |
| "kl": 0.189453125, |
| "learning_rate": 2e-06, |
| "loss": 0.024, |
| "reward": 0.8967607021331787, |
| "reward_std": 0.28214776515960693, |
| "rewards/preference_model_reward": 0.8967607021331787, |
| "rewards/preference_model_reward/std": 0.28318798542022705, |
| "step": 497 |
| }, |
| { |
| "clip_ratio": 0.00011981013813056052, |
| "epoch": 0.647594278283485, |
| "grad_norm": 1.1245818999020487, |
| "kl": 0.193359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0229, |
| "step": 498 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.34375, |
| "epoch": 0.6488946684005201, |
| "grad_norm": 0.8253153021127959, |
| "kl": 0.19921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "reward": 0.9368200302124023, |
| "reward_std": 0.16158036887645721, |
| "rewards/preference_model_reward": 0.9368200302124023, |
| "rewards/preference_model_reward/std": 0.2238502949476242, |
| "step": 499 |
| }, |
| { |
| "clip_ratio": 0.00017198966816067696, |
| "epoch": 0.6501950585175552, |
| "grad_norm": 0.7544071871520531, |
| "kl": 0.2041015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "step": 500 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.03125, |
| "epoch": 0.6514954486345904, |
| "grad_norm": 0.8134258338741699, |
| "kl": 0.2158203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0062, |
| "reward": 0.9170088171958923, |
| "reward_std": 0.17870065569877625, |
| "rewards/preference_model_reward": 0.9170088171958923, |
| "rewards/preference_model_reward/std": 0.26252105832099915, |
| "step": 501 |
| }, |
| { |
| "clip_ratio": 0.0001375194697175175, |
| "epoch": 0.6527958387516255, |
| "grad_norm": 0.7216707899969623, |
| "kl": 0.2177734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0067, |
| "step": 502 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.40625, |
| "epoch": 0.6540962288686606, |
| "grad_norm": 0.7084971694449992, |
| "kl": 0.201171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0019, |
| "reward": 0.9319340586662292, |
| "reward_std": 0.19666869938373566, |
| "rewards/preference_model_reward": 0.9319340586662292, |
| "rewards/preference_model_reward/std": 0.21102942526340485, |
| "step": 503 |
| }, |
| { |
| "clip_ratio": 0.00019161276577506214, |
| "epoch": 0.6553966189856957, |
| "grad_norm": 0.6297038100275056, |
| "kl": 0.203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0023, |
| "step": 504 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.25, |
| "epoch": 0.6566970091027308, |
| "grad_norm": 1.2200861351672945, |
| "kl": 0.212890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0057, |
| "reward": 0.870198130607605, |
| "reward_std": 0.2922201156616211, |
| "rewards/preference_model_reward": 0.870198130607605, |
| "rewards/preference_model_reward/std": 0.2885199189186096, |
| "step": 505 |
| }, |
| { |
| "clip_ratio": 0.0006603770307265222, |
| "epoch": 0.657997399219766, |
| "grad_norm": 1.1025383311332084, |
| "kl": 0.21484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0049, |
| "step": 506 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 350.8125, |
| "epoch": 0.659297789336801, |
| "grad_norm": 0.5929542711439004, |
| "kl": 0.228515625, |
| "learning_rate": 2e-06, |
| "loss": -0.004, |
| "reward": 0.9226664304733276, |
| "reward_std": 0.19836580753326416, |
| "rewards/preference_model_reward": 0.9226664304733276, |
| "rewards/preference_model_reward/std": 0.19793914258480072, |
| "step": 507 |
| }, |
| { |
| "clip_ratio": 0.00015906358021311462, |
| "epoch": 0.6605981794538361, |
| "grad_norm": 0.5486817244564233, |
| "kl": 0.23046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0045, |
| "step": 508 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 548.5625, |
| "epoch": 0.6618985695708712, |
| "grad_norm": 0.3288568622069386, |
| "kl": 0.2119140625, |
| "learning_rate": 2e-06, |
| "loss": 0.0014, |
| "reward": 0.9740253686904907, |
| "reward_std": 0.057107504457235336, |
| "rewards/preference_model_reward": 0.9740253686904907, |
| "rewards/preference_model_reward/std": 0.0837172269821167, |
| "step": 509 |
| }, |
| { |
| "clip_ratio": 9.279881487600505e-05, |
| "epoch": 0.6631989596879063, |
| "grad_norm": 0.3017876628121043, |
| "kl": 0.21484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "step": 510 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.96875, |
| "epoch": 0.6644993498049415, |
| "grad_norm": 0.2552289537545978, |
| "kl": 0.2177734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "reward": 0.9818136692047119, |
| "reward_std": 0.05234856531023979, |
| "rewards/preference_model_reward": 0.9818136692047119, |
| "rewards/preference_model_reward/std": 0.07513560354709625, |
| "step": 511 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.6657997399219766, |
| "grad_norm": 0.21243026144301005, |
| "kl": 0.220703125, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "step": 512 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.84375, |
| "epoch": 0.6671001300390117, |
| "grad_norm": 0.03160928214041918, |
| "kl": 0.248046875, |
| "learning_rate": 2e-06, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 513 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.6684005201560468, |
| "grad_norm": 0.019785740981705057, |
| "kl": 0.23828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 514 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.25, |
| "epoch": 0.6697009102730819, |
| "grad_norm": 0.9295754723585831, |
| "kl": 0.1962890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0107, |
| "reward": 0.947670578956604, |
| "reward_std": 0.14302663505077362, |
| "rewards/preference_model_reward": 0.947670578956604, |
| "rewards/preference_model_reward/std": 0.20596152544021606, |
| "step": 515 |
| }, |
| { |
| "clip_ratio": 8.934033394325525e-05, |
| "epoch": 0.6710013003901171, |
| "grad_norm": 0.8958440484535811, |
| "kl": 0.19140625, |
| "learning_rate": 2e-06, |
| "loss": 0.01, |
| "step": 516 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.53125, |
| "epoch": 0.6723016905071522, |
| "grad_norm": 0.53034749494269, |
| "kl": 0.236328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0036, |
| "reward": 0.49002784490585327, |
| "reward_std": 0.14473380148410797, |
| "rewards/preference_model_reward": 0.49002784490585327, |
| "rewards/preference_model_reward/std": 0.48859700560569763, |
| "step": 517 |
| }, |
| { |
| "clip_ratio": 0.0005512780044227839, |
| "epoch": 0.6736020806241872, |
| "grad_norm": 0.692962639214083, |
| "kl": 0.2314453125, |
| "learning_rate": 2e-06, |
| "loss": -0.004, |
| "step": 518 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.84375, |
| "epoch": 0.6749024707412223, |
| "grad_norm": 0.8076317202453783, |
| "kl": 0.20703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0028, |
| "reward": 0.8944128155708313, |
| "reward_std": 0.14827537536621094, |
| "rewards/preference_model_reward": 0.8944128155708313, |
| "rewards/preference_model_reward/std": 0.2325102984905243, |
| "step": 519 |
| }, |
| { |
| "clip_ratio": 0.0001351351384073496, |
| "epoch": 0.6762028608582574, |
| "grad_norm": 0.7475623142861337, |
| "kl": 0.203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0022, |
| "step": 520 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.84375, |
| "epoch": 0.6775032509752926, |
| "grad_norm": 0.028377255891001926, |
| "kl": 0.201171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 521 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.6788036410923277, |
| "grad_norm": 0.012551554297778673, |
| "kl": 0.1923828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 522 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.125, |
| "epoch": 0.6801040312093628, |
| "grad_norm": 1.5471897459570736, |
| "kl": 0.173828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0266, |
| "reward": 0.843010425567627, |
| "reward_std": 0.2676996886730194, |
| "rewards/preference_model_reward": 0.843010425567627, |
| "rewards/preference_model_reward/std": 0.34891876578330994, |
| "step": 523 |
| }, |
| { |
| "clip_ratio": 0.0002701242920011282, |
| "epoch": 0.6814044213263979, |
| "grad_norm": 1.4410403152995823, |
| "kl": 0.16796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0254, |
| "step": 524 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.4375, |
| "epoch": 0.682704811443433, |
| "grad_norm": 1.6486514561580368, |
| "kl": 0.1689453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0186, |
| "reward": 0.7798171043395996, |
| "reward_std": 0.3787343502044678, |
| "rewards/preference_model_reward": 0.7798171043395996, |
| "rewards/preference_model_reward/std": 0.37885910272598267, |
| "step": 525 |
| }, |
| { |
| "clip_ratio": 0.0002811163431033492, |
| "epoch": 0.6840052015604682, |
| "grad_norm": 1.4654130660789888, |
| "kl": 0.1640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0176, |
| "step": 526 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.375, |
| "epoch": 0.6853055916775033, |
| "grad_norm": 1.3509256509461836, |
| "kl": 0.15625, |
| "learning_rate": 2e-06, |
| "loss": 0.0132, |
| "reward": 0.4524506628513336, |
| "reward_std": 0.4075589179992676, |
| "rewards/preference_model_reward": 0.4524506628513336, |
| "rewards/preference_model_reward/std": 0.4056318700313568, |
| "step": 527 |
| }, |
| { |
| "clip_ratio": 0.0001703468442428857, |
| "epoch": 0.6866059817945384, |
| "grad_norm": 1.3169959570019343, |
| "kl": 0.15234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0121, |
| "step": 528 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 699.96875, |
| "epoch": 0.6879063719115734, |
| "grad_norm": 1.659880219331862, |
| "kl": 0.150390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "reward": 0.5921683311462402, |
| "reward_std": 0.3830801844596863, |
| "rewards/preference_model_reward": 0.5921683311462402, |
| "rewards/preference_model_reward/std": 0.4823853075504303, |
| "step": 529 |
| }, |
| { |
| "clip_ratio": 3.313892011647113e-05, |
| "epoch": 0.6892067620286085, |
| "grad_norm": 1.5933404503889264, |
| "kl": 0.150390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0034, |
| "step": 530 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 536.59375, |
| "epoch": 0.6905071521456437, |
| "grad_norm": 1.4789545393321204, |
| "kl": 0.193359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0052, |
| "reward": 0.48997482657432556, |
| "reward_std": 0.4091135859489441, |
| "rewards/preference_model_reward": 0.48997482657432556, |
| "rewards/preference_model_reward/std": 0.40547576546669006, |
| "step": 531 |
| }, |
| { |
| "clip_ratio": 0.00037025794154033065, |
| "epoch": 0.6918075422626788, |
| "grad_norm": 1.3836275497987043, |
| "kl": 0.19140625, |
| "learning_rate": 2e-06, |
| "loss": 0.0041, |
| "step": 532 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.71875, |
| "epoch": 0.6931079323797139, |
| "grad_norm": 0.3199567384300615, |
| "kl": 0.1630859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0051, |
| "reward": 0.957996129989624, |
| "reward_std": 0.07893598824739456, |
| "rewards/preference_model_reward": 0.957996129989624, |
| "rewards/preference_model_reward/std": 0.11781778186559677, |
| "step": 533 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.694408322496749, |
| "grad_norm": 0.30160406098099796, |
| "kl": 0.162109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0053, |
| "step": 534 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.21875, |
| "epoch": 0.6957087126137841, |
| "grad_norm": 0.9950320411701277, |
| "kl": 0.1064453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0192, |
| "reward": 0.45777231454849243, |
| "reward_std": 0.27751585841178894, |
| "rewards/preference_model_reward": 0.45777231454849243, |
| "rewards/preference_model_reward/std": 0.46042200922966003, |
| "step": 535 |
| }, |
| { |
| "clip_ratio": 0.0001473418960813433, |
| "epoch": 0.6970091027308193, |
| "grad_norm": 0.9759058209911393, |
| "kl": 0.1064453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0185, |
| "step": 536 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.5, |
| "epoch": 0.6983094928478544, |
| "grad_norm": 0.6121602073781033, |
| "kl": 0.162109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0008, |
| "reward": 0.9411002397537231, |
| "reward_std": 0.14587333798408508, |
| "rewards/preference_model_reward": 0.9411002397537231, |
| "rewards/preference_model_reward/std": 0.18111251294612885, |
| "step": 537 |
| }, |
| { |
| "clip_ratio": 0.00011307100794510916, |
| "epoch": 0.6996098829648895, |
| "grad_norm": 0.5716256653179863, |
| "kl": 0.162109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0003, |
| "step": 538 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 578.65625, |
| "epoch": 0.7009102730819246, |
| "grad_norm": 1.4127144788915404, |
| "kl": 0.1416015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "reward": 0.7771836519241333, |
| "reward_std": 0.29650747776031494, |
| "rewards/preference_model_reward": 0.7771836519241333, |
| "rewards/preference_model_reward/std": 0.36120009422302246, |
| "step": 539 |
| }, |
| { |
| "clip_ratio": 0.00010380351159255952, |
| "epoch": 0.7022106631989596, |
| "grad_norm": 1.1777179730274026, |
| "kl": 0.1435546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "step": 540 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.4375, |
| "epoch": 0.7035110533159948, |
| "grad_norm": 1.4681000227125094, |
| "kl": 0.1630859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0038, |
| "reward": 0.5395978689193726, |
| "reward_std": 0.40768900513648987, |
| "rewards/preference_model_reward": 0.5395978689193726, |
| "rewards/preference_model_reward/std": 0.46992650628089905, |
| "step": 541 |
| }, |
| { |
| "clip_ratio": 0.00012938569125253707, |
| "epoch": 0.7048114434330299, |
| "grad_norm": 1.3613455454415246, |
| "kl": 0.1650390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0028, |
| "step": 542 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.03125, |
| "epoch": 0.706111833550065, |
| "grad_norm": 1.0553742279151677, |
| "kl": 0.158203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0223, |
| "reward": 0.4630056619644165, |
| "reward_std": 0.27500906586647034, |
| "rewards/preference_model_reward": 0.4630056619644165, |
| "rewards/preference_model_reward/std": 0.4385357201099396, |
| "step": 543 |
| }, |
| { |
| "clip_ratio": 5.340739153325558e-05, |
| "epoch": 0.7074122236671001, |
| "grad_norm": 0.9682034434189447, |
| "kl": 0.16015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0216, |
| "step": 544 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 450.53125, |
| "epoch": 0.7087126137841352, |
| "grad_norm": 1.428990857463978, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0085, |
| "reward": 0.7226020693778992, |
| "reward_std": 0.2753547132015228, |
| "rewards/preference_model_reward": 0.7226020693778992, |
| "rewards/preference_model_reward/std": 0.40645530819892883, |
| "step": 545 |
| }, |
| { |
| "clip_ratio": 6.157635652925819e-05, |
| "epoch": 0.7100130039011704, |
| "grad_norm": 1.0153576138262457, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0091, |
| "step": 546 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 381.375, |
| "epoch": 0.7113133940182055, |
| "grad_norm": 0.7961865635923656, |
| "kl": 0.1728515625, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "reward": 0.9013676643371582, |
| "reward_std": 0.23820459842681885, |
| "rewards/preference_model_reward": 0.9013676643371582, |
| "rewards/preference_model_reward/std": 0.24647004902362823, |
| "step": 547 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.7126137841352406, |
| "grad_norm": 0.6685334773440631, |
| "kl": 0.173828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0017, |
| "step": 548 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 397.71875, |
| "epoch": 0.7139141742522757, |
| "grad_norm": 0.509873094151191, |
| "kl": 0.189453125, |
| "learning_rate": 2e-06, |
| "loss": 0.0058, |
| "reward": 0.9610089063644409, |
| "reward_std": 0.10700845718383789, |
| "rewards/preference_model_reward": 0.9610089063644409, |
| "rewards/preference_model_reward/std": 0.15405260026454926, |
| "step": 549 |
| }, |
| { |
| "clip_ratio": 7.24008132237941e-05, |
| "epoch": 0.7152145643693107, |
| "grad_norm": 0.46289523230086543, |
| "kl": 0.19140625, |
| "learning_rate": 2e-06, |
| "loss": 0.0054, |
| "step": 550 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.90625, |
| "epoch": 0.716514954486346, |
| "grad_norm": 0.7777436132276164, |
| "kl": 0.1708984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 0.9174792766571045, |
| "reward_std": 0.22822144627571106, |
| "rewards/preference_model_reward": 0.9174792766571045, |
| "rewards/preference_model_reward/std": 0.22594521939754486, |
| "step": 551 |
| }, |
| { |
| "clip_ratio": 0.0002606313209980726, |
| "epoch": 0.717815344603381, |
| "grad_norm": 0.6834258660506523, |
| "kl": 0.171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "step": 552 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.5, |
| "epoch": 0.7191157347204161, |
| "grad_norm": 0.505052776040059, |
| "kl": 0.189453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0026, |
| "reward": 0.4816475510597229, |
| "reward_std": 0.10513729602098465, |
| "rewards/preference_model_reward": 0.4816475510597229, |
| "rewards/preference_model_reward/std": 0.4959093928337097, |
| "step": 553 |
| }, |
| { |
| "clip_ratio": 0.00036867460585199296, |
| "epoch": 0.7204161248374512, |
| "grad_norm": 0.42150869730426016, |
| "kl": 0.1904296875, |
| "learning_rate": 2e-06, |
| "loss": -0.003, |
| "step": 554 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.28125, |
| "epoch": 0.7217165149544863, |
| "grad_norm": 1.4834420478857298, |
| "kl": 0.177734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0091, |
| "reward": 0.5543996095657349, |
| "reward_std": 0.425930917263031, |
| "rewards/preference_model_reward": 0.5543996095657349, |
| "rewards/preference_model_reward/std": 0.42936739325523376, |
| "step": 555 |
| }, |
| { |
| "clip_ratio": 0.00012094823614461347, |
| "epoch": 0.7230169050715215, |
| "grad_norm": 1.6593571165239325, |
| "kl": 0.1767578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0102, |
| "step": 556 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 377.9375, |
| "epoch": 0.7243172951885566, |
| "grad_norm": 0.8643935299403351, |
| "kl": 0.1806640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0059, |
| "reward": 0.7919847965240479, |
| "reward_std": 0.3126184642314911, |
| "rewards/preference_model_reward": 0.7919847965240479, |
| "rewards/preference_model_reward/std": 0.3270798325538635, |
| "step": 557 |
| }, |
| { |
| "clip_ratio": 0.0002360784710617736, |
| "epoch": 0.7256176853055917, |
| "grad_norm": 0.8316400864873517, |
| "kl": 0.181640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0066, |
| "step": 558 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.21875, |
| "epoch": 0.7269180754226268, |
| "grad_norm": 0.8796910032990837, |
| "kl": 0.2255859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0119, |
| "reward": 0.7465857863426208, |
| "reward_std": 0.3626917600631714, |
| "rewards/preference_model_reward": 0.7465857863426208, |
| "rewards/preference_model_reward/std": 0.35811734199523926, |
| "step": 559 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.7282184655396619, |
| "grad_norm": 0.8380735342987923, |
| "kl": 0.2265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0126, |
| "step": 560 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 408.03125, |
| "epoch": 0.729518855656697, |
| "grad_norm": 0.512186793959597, |
| "kl": 0.21484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0057, |
| "reward": 0.9450701475143433, |
| "reward_std": 0.2109910547733307, |
| "rewards/preference_model_reward": 0.9450701475143433, |
| "rewards/preference_model_reward/std": 0.20757848024368286, |
| "step": 561 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.7308192457737321, |
| "grad_norm": 0.4831254168108591, |
| "kl": 0.216796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0061, |
| "step": 562 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 563.46875, |
| "epoch": 0.7321196358907672, |
| "grad_norm": 1.6126446142051276, |
| "kl": 0.1533203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0137, |
| "reward": 0.5664808750152588, |
| "reward_std": 0.41850006580352783, |
| "rewards/preference_model_reward": 0.5664808750152588, |
| "rewards/preference_model_reward/std": 0.41366487741470337, |
| "step": 563 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.7334200260078023, |
| "grad_norm": 1.4735887582214497, |
| "kl": 0.1552734375, |
| "learning_rate": 2e-06, |
| "loss": 0.0128, |
| "step": 564 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.4375, |
| "epoch": 0.7347204161248374, |
| "grad_norm": 1.0717577501192503, |
| "kl": 0.18359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0034, |
| "reward": 0.913676917552948, |
| "reward_std": 0.1866173893213272, |
| "rewards/preference_model_reward": 0.913676917552948, |
| "rewards/preference_model_reward/std": 0.2740388512611389, |
| "step": 565 |
| }, |
| { |
| "clip_ratio": 0.00016552054148633033, |
| "epoch": 0.7360208062418726, |
| "grad_norm": 1.0305545524954813, |
| "kl": 0.185546875, |
| "learning_rate": 2e-06, |
| "loss": 0.0026, |
| "step": 566 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 494.28125, |
| "epoch": 0.7373211963589077, |
| "grad_norm": 0.40807537593869514, |
| "kl": 0.1767578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0033, |
| "reward": 0.9716057777404785, |
| "reward_std": 0.1135769933462143, |
| "rewards/preference_model_reward": 0.9716057777404785, |
| "rewards/preference_model_reward/std": 0.16062211990356445, |
| "step": 567 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.7386215864759428, |
| "grad_norm": 0.3435926951880284, |
| "kl": 0.1796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0036, |
| "step": 568 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 578.71875, |
| "epoch": 0.7399219765929779, |
| "grad_norm": 1.3861764799614187, |
| "kl": 0.203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "reward": 0.805233359336853, |
| "reward_std": 0.35668760538101196, |
| "rewards/preference_model_reward": 0.805233359336853, |
| "rewards/preference_model_reward/std": 0.35277989506721497, |
| "step": 569 |
| }, |
| { |
| "clip_ratio": 9.771350596565753e-05, |
| "epoch": 0.741222366710013, |
| "grad_norm": 1.241876701253313, |
| "kl": 0.2060546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0034, |
| "step": 570 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.9375, |
| "epoch": 0.7425227568270482, |
| "grad_norm": 0.5960426098398067, |
| "kl": 0.1845703125, |
| "learning_rate": 2e-06, |
| "loss": 0.017, |
| "reward": 0.9699637293815613, |
| "reward_std": 0.12014515697956085, |
| "rewards/preference_model_reward": 0.9699637293815613, |
| "rewards/preference_model_reward/std": 0.16991090774536133, |
| "step": 571 |
| }, |
| { |
| "clip_ratio": 0.0003436754341237247, |
| "epoch": 0.7438231469440832, |
| "grad_norm": 0.5183287490814747, |
| "kl": 0.1875, |
| "learning_rate": 2e-06, |
| "loss": 0.0166, |
| "step": 572 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.4375, |
| "epoch": 0.7451235370611183, |
| "grad_norm": 1.5416862528532593, |
| "kl": 0.212890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "reward": 0.9567732214927673, |
| "reward_std": 0.15070945024490356, |
| "rewards/preference_model_reward": 0.9567732214927673, |
| "rewards/preference_model_reward/std": 0.14909282326698303, |
| "step": 573 |
| }, |
| { |
| "clip_ratio": 0.0004944581887684762, |
| "epoch": 0.7464239271781534, |
| "grad_norm": 0.5001084547889261, |
| "kl": 0.2158203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0013, |
| "step": 574 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.15625, |
| "epoch": 0.7477243172951885, |
| "grad_norm": 0.3547953762763419, |
| "kl": 0.2080078125, |
| "learning_rate": 2e-06, |
| "loss": 0.0005, |
| "reward": 0.9812067747116089, |
| "reward_std": 0.07517301291227341, |
| "rewards/preference_model_reward": 0.9812067747116089, |
| "rewards/preference_model_reward/std": 0.10631068795919418, |
| "step": 575 |
| }, |
| { |
| "clip_ratio": 0.0002910027978941798, |
| "epoch": 0.7490247074122237, |
| "grad_norm": 0.3164538486238686, |
| "kl": 0.2119140625, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 576 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 392.625, |
| "epoch": 0.7503250975292588, |
| "grad_norm": 0.6989008521134648, |
| "kl": 0.232421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0004, |
| "reward": 0.44408947229385376, |
| "reward_std": 0.18061134219169617, |
| "rewards/preference_model_reward": 0.44408947229385376, |
| "rewards/preference_model_reward/std": 0.47470250725746155, |
| "step": 577 |
| }, |
| { |
| "clip_ratio": 0.0004471123975235969, |
| "epoch": 0.7516254876462939, |
| "grad_norm": 0.6192349801951256, |
| "kl": 0.236328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "step": 578 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.84375, |
| "epoch": 0.752925877763329, |
| "grad_norm": 1.164670357893636, |
| "kl": 0.251953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0097, |
| "reward": 0.8941581845283508, |
| "reward_std": 0.27277839183807373, |
| "rewards/preference_model_reward": 0.8941581845283508, |
| "rewards/preference_model_reward/std": 0.268955260515213, |
| "step": 579 |
| }, |
| { |
| "clip_ratio": 0.0004549244767986238, |
| "epoch": 0.7542262678803641, |
| "grad_norm": 1.0205324380629648, |
| "kl": 0.251953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0088, |
| "step": 580 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 536.03125, |
| "epoch": 0.7555266579973993, |
| "grad_norm": 1.1026359782739126, |
| "kl": 0.236328125, |
| "learning_rate": 2e-06, |
| "loss": 0.011, |
| "reward": 0.9393888115882874, |
| "reward_std": 0.16562925279140472, |
| "rewards/preference_model_reward": 0.9393888115882874, |
| "rewards/preference_model_reward/std": 0.23851299285888672, |
| "step": 581 |
| }, |
| { |
| "clip_ratio": 0.00014628437929786742, |
| "epoch": 0.7568270481144344, |
| "grad_norm": 0.9887761797962026, |
| "kl": 0.23828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0103, |
| "step": 582 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.5, |
| "epoch": 0.7581274382314694, |
| "grad_norm": 0.36250831588618054, |
| "kl": 0.234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "reward": 0.9745615720748901, |
| "reward_std": 0.08803649246692657, |
| "rewards/preference_model_reward": 0.9745615720748901, |
| "rewards/preference_model_reward/std": 0.1251751035451889, |
| "step": 583 |
| }, |
| { |
| "clip_ratio": 0.0007471668068319559, |
| "epoch": 0.7594278283485045, |
| "grad_norm": 0.3337433977585642, |
| "kl": 0.2333984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0027, |
| "step": 584 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.875, |
| "epoch": 0.7607282184655396, |
| "grad_norm": 1.1970923812923153, |
| "kl": 0.25, |
| "learning_rate": 2e-06, |
| "loss": 0.0039, |
| "reward": 0.42788398265838623, |
| "reward_std": 0.3588111996650696, |
| "rewards/preference_model_reward": 0.42788398265838623, |
| "rewards/preference_model_reward/std": 0.405824214220047, |
| "step": 585 |
| }, |
| { |
| "clip_ratio": 0.0007635854999534786, |
| "epoch": 0.7620286085825748, |
| "grad_norm": 1.1707496444316037, |
| "kl": 0.1865234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0031, |
| "step": 586 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.5, |
| "epoch": 0.7633289986996099, |
| "grad_norm": 1.2912574830783405, |
| "kl": 0.25390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0089, |
| "reward": 0.8629287481307983, |
| "reward_std": 0.2172880619764328, |
| "rewards/preference_model_reward": 0.8629287481307983, |
| "rewards/preference_model_reward/std": 0.33283141255378723, |
| "step": 587 |
| }, |
| { |
| "clip_ratio": 0.00011639429430942982, |
| "epoch": 0.764629388816645, |
| "grad_norm": 1.448212757233341, |
| "kl": 0.255859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0098, |
| "step": 588 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 401.46875, |
| "epoch": 0.7659297789336801, |
| "grad_norm": 0.5876962392618675, |
| "kl": 0.2734375, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "reward": 0.9528677463531494, |
| "reward_std": 0.1301821768283844, |
| "rewards/preference_model_reward": 0.9528677463531494, |
| "rewards/preference_model_reward/std": 0.18388831615447998, |
| "step": 589 |
| }, |
| { |
| "clip_ratio": 0.0006060305167920887, |
| "epoch": 0.7672301690507152, |
| "grad_norm": 0.5076147378502047, |
| "kl": 0.271484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "step": 590 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.46875, |
| "epoch": 0.7685305591677504, |
| "grad_norm": 1.2814868227092682, |
| "kl": 0.2333984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0029, |
| "reward": 0.7781925797462463, |
| "reward_std": 0.36613741517066956, |
| "rewards/preference_model_reward": 0.7781925797462463, |
| "rewards/preference_model_reward/std": 0.363247811794281, |
| "step": 591 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.7698309492847855, |
| "grad_norm": 1.22748966955473, |
| "kl": 0.234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0019, |
| "step": 592 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 270.25, |
| "epoch": 0.7711313394018205, |
| "grad_norm": 0.011637130289628438, |
| "kl": 0.279296875, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 593 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.7724317295188556, |
| "grad_norm": 0.010229706612576805, |
| "kl": 0.263671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0001, |
| "step": 594 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.0, |
| "epoch": 0.7737321196358907, |
| "grad_norm": 0.02168733906561227, |
| "kl": 0.234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/preference_model_reward": 1.0, |
| "rewards/preference_model_reward/std": 0.0, |
| "step": 595 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.7750325097529259, |
| "grad_norm": 0.014641318125133244, |
| "kl": 0.2177734375, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 596 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 453.65625, |
| "epoch": 0.776332899869961, |
| "grad_norm": 0.8255334973681036, |
| "kl": 0.259765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0101, |
| "reward": 0.8930084705352783, |
| "reward_std": 0.26911142468452454, |
| "rewards/preference_model_reward": 0.8930084705352783, |
| "rewards/preference_model_reward/std": 0.2652178108692169, |
| "step": 597 |
| }, |
| { |
| "clip_ratio": 0.0011451852042227983, |
| "epoch": 0.7776332899869961, |
| "grad_norm": 0.7541031893153506, |
| "kl": 0.232421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0107, |
| "step": 598 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.0, |
| "epoch": 0.7789336801040312, |
| "grad_norm": 0.8665016761502143, |
| "kl": 0.1923828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0142, |
| "reward": 0.9606556296348572, |
| "reward_std": 0.14067065715789795, |
| "rewards/preference_model_reward": 0.9606556296348572, |
| "rewards/preference_model_reward/std": 0.16641117632389069, |
| "step": 599 |
| }, |
| { |
| "clip_ratio": 0.0006148220272734761, |
| "epoch": 0.7802340702210663, |
| "grad_norm": 0.7376855642027353, |
| "kl": 0.1875, |
| "learning_rate": 2e-06, |
| "loss": 0.0136, |
| "step": 600 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 2048, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 150, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|