| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.2546689303904924, | |
| "eval_steps": 500, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 154.90625, | |
| "epoch": 0.0008488964346349745, | |
| "grad_norm": 1.373261530904728, | |
| "kl": 0.0003566741943359375, | |
| "learning_rate": 0.0, | |
| "loss": -0.0035, | |
| "reward": 0.12956976890563965, | |
| "reward_std": 0.10243552178144455, | |
| "rewards/preference_model_reward": 0.12956976890563965, | |
| "rewards/preference_model_reward/std": 0.10243552923202515, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "epoch": 0.001697792869269949, | |
| "grad_norm": 1.373744508768238, | |
| "kl": 0.0003566741943359375, | |
| "learning_rate": 1e-07, | |
| "loss": -0.0035, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 426.03125, | |
| "epoch": 0.0025466893039049238, | |
| "grad_norm": 0.01976094802569778, | |
| "kl": 0.0003337860107421875, | |
| "learning_rate": 2e-07, | |
| "loss": -0.0, | |
| "reward": 0.007162425667047501, | |
| "reward_std": 0.002347785048186779, | |
| "rewards/preference_model_reward": 0.007162425667047501, | |
| "rewards/preference_model_reward/std": 0.0023477852810174227, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.00029364757938310504, | |
| "epoch": 0.003395585738539898, | |
| "grad_norm": 0.019704841225345854, | |
| "kl": 0.000339508056640625, | |
| "learning_rate": 3e-07, | |
| "loss": -0.0001, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 205.59375, | |
| "epoch": 0.004244482173174873, | |
| "grad_norm": 0.8550671585380242, | |
| "kl": 0.000408172607421875, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0011, | |
| "reward": 0.0704927146434784, | |
| "reward_std": 0.06750915944576263, | |
| "rewards/preference_model_reward": 0.0704927146434784, | |
| "rewards/preference_model_reward/std": 0.06750915199518204, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "epoch": 0.0050933786078098476, | |
| "grad_norm": 0.7361877708957172, | |
| "kl": 0.0003910064697265625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0011, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 227.15625, | |
| "epoch": 0.005942275042444821, | |
| "grad_norm": 0.38706524237759515, | |
| "kl": 0.0003662109375, | |
| "learning_rate": 6e-07, | |
| "loss": -0.0007, | |
| "reward": 0.03637976944446564, | |
| "reward_std": 0.037161991000175476, | |
| "rewards/preference_model_reward": 0.03637976944446564, | |
| "rewards/preference_model_reward/std": 0.037161991000175476, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0003041362506337464, | |
| "epoch": 0.006791171477079796, | |
| "grad_norm": 0.3886457621776694, | |
| "kl": 0.000339508056640625, | |
| "learning_rate": 7e-07, | |
| "loss": -0.0007, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 384.0625, | |
| "epoch": 0.007640067911714771, | |
| "grad_norm": 1.3360350931326528, | |
| "kl": 0.0003337860107421875, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2957379221916199, | |
| "reward_std": 0.1667662262916565, | |
| "rewards/preference_model_reward": 0.2957379221916199, | |
| "rewards/preference_model_reward/std": 0.1667662262916565, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0004007347160950303, | |
| "epoch": 0.008488964346349746, | |
| "grad_norm": 1.3492099330380622, | |
| "kl": 0.00035858154296875, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0003, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 271.34375, | |
| "epoch": 0.00933786078098472, | |
| "grad_norm": 0.5844743318217549, | |
| "kl": 0.0004730224609375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0055, | |
| "reward": 0.06409081071615219, | |
| "reward_std": 0.05993795394897461, | |
| "rewards/preference_model_reward": 0.06409081071615219, | |
| "rewards/preference_model_reward/std": 0.05993795767426491, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0003285869024693966, | |
| "epoch": 0.010186757215619695, | |
| "grad_norm": 0.5853835659686467, | |
| "kl": 0.0005340576171875, | |
| "learning_rate": 1.1e-06, | |
| "loss": -0.0055, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 323.875, | |
| "epoch": 0.011035653650254669, | |
| "grad_norm": 0.5037341718615376, | |
| "kl": 0.00077056884765625, | |
| "learning_rate": 1.2e-06, | |
| "loss": -0.0005, | |
| "reward": 0.05134192109107971, | |
| "reward_std": 0.05402546375989914, | |
| "rewards/preference_model_reward": 0.05134192109107971, | |
| "rewards/preference_model_reward/std": 0.05402546748518944, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.00019549165153875947, | |
| "epoch": 0.011884550084889643, | |
| "grad_norm": 0.5354006783262033, | |
| "kl": 0.00098419189453125, | |
| "learning_rate": 1.3e-06, | |
| "loss": -0.0006, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 389.53125, | |
| "epoch": 0.012733446519524618, | |
| "grad_norm": 0.4376233615388839, | |
| "kl": 0.00057220458984375, | |
| "learning_rate": 1.4e-06, | |
| "loss": 0.0, | |
| "reward": 0.07725013792514801, | |
| "reward_std": 0.0637926235795021, | |
| "rewards/preference_model_reward": 0.07725013792514801, | |
| "rewards/preference_model_reward/std": 0.0637926235795021, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0001568605366628617, | |
| "epoch": 0.013582342954159592, | |
| "grad_norm": 0.43520351533225, | |
| "kl": 0.000732421875, | |
| "learning_rate": 1.5e-06, | |
| "loss": -0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 225.625, | |
| "epoch": 0.014431239388794566, | |
| "grad_norm": 0.009342079121610454, | |
| "kl": 0.00156402587890625, | |
| "learning_rate": 1.6e-06, | |
| "loss": -0.0, | |
| "reward": 0.003957257140427828, | |
| "reward_std": 0.0007005692459642887, | |
| "rewards/preference_model_reward": 0.003957257140427828, | |
| "rewards/preference_model_reward/std": 0.0007005691877566278, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "epoch": 0.015280135823429542, | |
| "grad_norm": 0.009135995120751321, | |
| "kl": 0.0016937255859375, | |
| "learning_rate": 1.6999999999999998e-06, | |
| "loss": -0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.53125, | |
| "epoch": 0.016129032258064516, | |
| "grad_norm": 0.8342082066432049, | |
| "kl": 0.00165557861328125, | |
| "learning_rate": 1.8e-06, | |
| "loss": -0.003, | |
| "reward": 0.30697351694107056, | |
| "reward_std": 0.12625738978385925, | |
| "rewards/preference_model_reward": 0.30697351694107056, | |
| "rewards/preference_model_reward/std": 0.12625740468502045, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.00025422428734600544, | |
| "epoch": 0.01697792869269949, | |
| "grad_norm": 0.8045973414054722, | |
| "kl": 0.00189208984375, | |
| "learning_rate": 1.8999999999999998e-06, | |
| "loss": -0.0032, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 341.0, | |
| "epoch": 0.017826825127334467, | |
| "grad_norm": 0.15747378780960536, | |
| "kl": 0.0018157958984375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0013, | |
| "reward": 0.025423400104045868, | |
| "reward_std": 0.022510820999741554, | |
| "rewards/preference_model_reward": 0.025423400104045868, | |
| "rewards/preference_model_reward/std": 0.022510822862386703, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.00045937151298858225, | |
| "epoch": 0.01867572156196944, | |
| "grad_norm": 0.15703129987750525, | |
| "kl": 0.0022125244140625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0012, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 232.5625, | |
| "epoch": 0.019524617996604415, | |
| "grad_norm": 0.4416727840666515, | |
| "kl": 0.0036468505859375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0003, | |
| "reward": 0.04581625759601593, | |
| "reward_std": 0.0430915392935276, | |
| "rewards/preference_model_reward": 0.04581625759601593, | |
| "rewards/preference_model_reward/std": 0.0430915392935276, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0006831242935732007, | |
| "epoch": 0.02037351443123939, | |
| "grad_norm": 0.44514421266142573, | |
| "kl": 0.0042724609375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0001, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 212.34375, | |
| "epoch": 0.021222410865874362, | |
| "grad_norm": 0.965193272956362, | |
| "kl": 0.006103515625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "reward": 0.11097941547632217, | |
| "reward_std": 0.0762963593006134, | |
| "rewards/preference_model_reward": 0.11097941547632217, | |
| "rewards/preference_model_reward/std": 0.0762963593006134, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "epoch": 0.022071307300509338, | |
| "grad_norm": 0.9125624994776861, | |
| "kl": 0.00677490234375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0016, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 270.6875, | |
| "epoch": 0.022920203735144314, | |
| "grad_norm": 1.021646949062738, | |
| "kl": 0.00738525390625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0024, | |
| "reward": 0.14990350604057312, | |
| "reward_std": 0.10197865962982178, | |
| "rewards/preference_model_reward": 0.14990350604057312, | |
| "rewards/preference_model_reward/std": 0.10197865217924118, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0003397603868506849, | |
| "epoch": 0.023769100169779286, | |
| "grad_norm": 1.0520153952968034, | |
| "kl": 0.00872802734375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0027, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 758.28125, | |
| "epoch": 0.02461799660441426, | |
| "grad_norm": 0.7119844877358423, | |
| "kl": 0.006744384765625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0009, | |
| "reward": 0.09009624272584915, | |
| "reward_std": 0.09022250026464462, | |
| "rewards/preference_model_reward": 0.09009624272584915, | |
| "rewards/preference_model_reward/std": 0.09022250026464462, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0006554110441356897, | |
| "epoch": 0.025466893039049237, | |
| "grad_norm": 0.5478668890905144, | |
| "kl": 0.007568359375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0008, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 317.96875, | |
| "epoch": 0.02631578947368421, | |
| "grad_norm": 0.3101174990761895, | |
| "kl": 0.00958251953125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0011, | |
| "reward": 0.05284169688820839, | |
| "reward_std": 0.028878774493932724, | |
| "rewards/preference_model_reward": 0.05284169688820839, | |
| "rewards/preference_model_reward/std": 0.028878774493932724, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0008713441202417016, | |
| "epoch": 0.027164685908319185, | |
| "grad_norm": 0.2969256230681903, | |
| "kl": 0.01007080078125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0009, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 346.5, | |
| "epoch": 0.02801358234295416, | |
| "grad_norm": 0.5964158292526848, | |
| "kl": 0.01025390625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "reward": 0.10442396998405457, | |
| "reward_std": 0.0710761621594429, | |
| "rewards/preference_model_reward": 0.10442396998405457, | |
| "rewards/preference_model_reward/std": 0.0710761621594429, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.001167232054285705, | |
| "epoch": 0.028862478777589132, | |
| "grad_norm": 0.6459875868432908, | |
| "kl": 0.011474609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0004, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 325.125, | |
| "epoch": 0.029711375212224108, | |
| "grad_norm": 0.8665540483399039, | |
| "kl": 0.012451171875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0007, | |
| "reward": 0.20713286101818085, | |
| "reward_std": 0.08856458961963654, | |
| "rewards/preference_model_reward": 0.20713286101818085, | |
| "rewards/preference_model_reward/std": 0.08856458961963654, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0004757290589623153, | |
| "epoch": 0.030560271646859084, | |
| "grad_norm": 0.8458082294842536, | |
| "kl": 0.013916015625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.001, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 259.5625, | |
| "epoch": 0.031409168081494056, | |
| "grad_norm": 0.6451884349368371, | |
| "kl": 0.018798828125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0021, | |
| "reward": 0.07655464112758636, | |
| "reward_std": 0.06220533698797226, | |
| "rewards/preference_model_reward": 0.07655464112758636, | |
| "rewards/preference_model_reward/std": 0.06220533698797226, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.00036129303043708205, | |
| "epoch": 0.03225806451612903, | |
| "grad_norm": 0.6180427670131671, | |
| "kl": 0.020751953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0023, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 217.6875, | |
| "epoch": 0.03310696095076401, | |
| "grad_norm": 0.15623847767793025, | |
| "kl": 0.027099609375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0006, | |
| "reward": 0.01216259878128767, | |
| "reward_std": 0.015137026086449623, | |
| "rewards/preference_model_reward": 0.01216259878128767, | |
| "rewards/preference_model_reward/std": 0.015137026086449623, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "epoch": 0.03395585738539898, | |
| "grad_norm": 0.16166488666255469, | |
| "kl": 0.02880859375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0005, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.25, | |
| "epoch": 0.03480475382003396, | |
| "grad_norm": 1.2594757575533009, | |
| "kl": 0.022216796875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0002, | |
| "reward": 0.36821770668029785, | |
| "reward_std": 0.10101611167192459, | |
| "rewards/preference_model_reward": 0.36821770668029785, | |
| "rewards/preference_model_reward/std": 0.101016104221344, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0007307034684345126, | |
| "epoch": 0.035653650254668934, | |
| "grad_norm": 1.027698571984344, | |
| "kl": 0.0245361328125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 303.3125, | |
| "epoch": 0.0365025466893039, | |
| "grad_norm": 0.4510741866786348, | |
| "kl": 0.0252685546875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0, | |
| "reward": 0.056258413940668106, | |
| "reward_std": 0.05226214602589607, | |
| "rewards/preference_model_reward": 0.056258413940668106, | |
| "rewards/preference_model_reward/std": 0.05226214602589607, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0008270645630545914, | |
| "epoch": 0.03735144312393888, | |
| "grad_norm": 0.4538945689960808, | |
| "kl": 0.02685546875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 696.4375, | |
| "epoch": 0.038200339558573854, | |
| "grad_norm": 0.6552940518465648, | |
| "kl": 0.02490234375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.001, | |
| "reward": 0.28145015239715576, | |
| "reward_std": 0.09661795943975449, | |
| "rewards/preference_model_reward": 0.28145015239715576, | |
| "rewards/preference_model_reward/std": 0.09661795198917389, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0003099275636486709, | |
| "epoch": 0.03904923599320883, | |
| "grad_norm": 0.6063830680426688, | |
| "kl": 0.026123046875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0007, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 185.5625, | |
| "epoch": 0.039898132427843805, | |
| "grad_norm": 0.8803723441580334, | |
| "kl": 0.036376953125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0057, | |
| "reward": 0.14047113060951233, | |
| "reward_std": 0.07379527390003204, | |
| "rewards/preference_model_reward": 0.14047113060951233, | |
| "rewards/preference_model_reward/std": 0.07379526644945145, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.00016545334074180573, | |
| "epoch": 0.04074702886247878, | |
| "grad_norm": 0.8768664086923781, | |
| "kl": 0.03857421875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0053, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 409.84375, | |
| "epoch": 0.04159592529711375, | |
| "grad_norm": 0.790693661049164, | |
| "kl": 0.0303955078125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0013, | |
| "reward": 0.16792196035385132, | |
| "reward_std": 0.08975110948085785, | |
| "rewards/preference_model_reward": 0.16792196035385132, | |
| "rewards/preference_model_reward/std": 0.08975110203027725, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0004502690862864256, | |
| "epoch": 0.042444821731748725, | |
| "grad_norm": 0.7661168129852652, | |
| "kl": 0.03173828125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.001, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 186.875, | |
| "epoch": 0.0432937181663837, | |
| "grad_norm": 0.5403939950650409, | |
| "kl": 0.044189453125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0019, | |
| "reward": 0.06892818212509155, | |
| "reward_std": 0.036003705114126205, | |
| "rewards/preference_model_reward": 0.06892818212509155, | |
| "rewards/preference_model_reward/std": 0.036003705114126205, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.002485671080648899, | |
| "epoch": 0.044142614601018676, | |
| "grad_norm": 0.5730308488231836, | |
| "kl": 0.047119140625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0021, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 335.25, | |
| "epoch": 0.04499151103565365, | |
| "grad_norm": 0.7389454596845632, | |
| "kl": 0.041748046875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0063, | |
| "reward": 0.28060293197631836, | |
| "reward_std": 0.07954739779233932, | |
| "rewards/preference_model_reward": 0.28060293197631836, | |
| "rewards/preference_model_reward/std": 0.07954739034175873, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0005645600031130016, | |
| "epoch": 0.04584040747028863, | |
| "grad_norm": 0.7275148836573597, | |
| "kl": 0.04296875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0066, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 321.6875, | |
| "epoch": 0.0466893039049236, | |
| "grad_norm": 0.673383864573446, | |
| "kl": 0.031494140625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0018, | |
| "reward": 0.13006240129470825, | |
| "reward_std": 0.07470076531171799, | |
| "rewards/preference_model_reward": 0.13006240129470825, | |
| "rewards/preference_model_reward/std": 0.07470076531171799, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.00018761330284178257, | |
| "epoch": 0.04753820033955857, | |
| "grad_norm": 0.5932243482520125, | |
| "kl": 0.03125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0015, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.0625, | |
| "epoch": 0.04838709677419355, | |
| "grad_norm": 0.5943555323097186, | |
| "kl": 0.03759765625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0022, | |
| "reward": 0.16671660542488098, | |
| "reward_std": 0.08239807188510895, | |
| "rewards/preference_model_reward": 0.16671660542488098, | |
| "rewards/preference_model_reward/std": 0.08239807188510895, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0007027126266621053, | |
| "epoch": 0.04923599320882852, | |
| "grad_norm": 0.5944317831243726, | |
| "kl": 0.0390625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.002, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 221.90625, | |
| "epoch": 0.0500848896434635, | |
| "grad_norm": 0.4673677413132102, | |
| "kl": 0.06689453125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0017, | |
| "reward": 0.031242549419403076, | |
| "reward_std": 0.04061814025044441, | |
| "rewards/preference_model_reward": 0.031242549419403076, | |
| "rewards/preference_model_reward/std": 0.04061814025044441, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0010640884283930063, | |
| "epoch": 0.050933786078098474, | |
| "grad_norm": 0.46769299125491254, | |
| "kl": 0.0693359375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0019, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 235.625, | |
| "epoch": 0.05178268251273345, | |
| "grad_norm": 0.9433079745488324, | |
| "kl": 0.041259765625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0046, | |
| "reward": 0.3223969340324402, | |
| "reward_std": 0.08566058427095413, | |
| "rewards/preference_model_reward": 0.3223969340324402, | |
| "rewards/preference_model_reward/std": 0.08566058427095413, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0005407010903581977, | |
| "epoch": 0.05263157894736842, | |
| "grad_norm": 0.8827749256609521, | |
| "kl": 0.043212890625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0051, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 321.21875, | |
| "epoch": 0.053480475382003394, | |
| "grad_norm": 1.376741075127532, | |
| "kl": 0.072265625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0027, | |
| "reward": 0.18953999876976013, | |
| "reward_std": 0.0982605516910553, | |
| "rewards/preference_model_reward": 0.18953999876976013, | |
| "rewards/preference_model_reward/std": 0.0982605367898941, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0017892650794237852, | |
| "epoch": 0.05432937181663837, | |
| "grad_norm": 1.0129637166861898, | |
| "kl": 0.076171875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.003, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 225.25, | |
| "epoch": 0.055178268251273345, | |
| "grad_norm": 1.0280761369580704, | |
| "kl": 0.08203125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0121, | |
| "reward": 0.3603067398071289, | |
| "reward_std": 0.09477485716342926, | |
| "rewards/preference_model_reward": 0.3603067398071289, | |
| "rewards/preference_model_reward/std": 0.09477484971284866, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.00028635968919843435, | |
| "epoch": 0.05602716468590832, | |
| "grad_norm": 1.012643043603393, | |
| "kl": 0.08544921875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0126, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 718.90625, | |
| "epoch": 0.056876061120543296, | |
| "grad_norm": 0.5544711698032301, | |
| "kl": 0.049560546875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0015, | |
| "reward": 0.0976465493440628, | |
| "reward_std": 0.08025789260864258, | |
| "rewards/preference_model_reward": 0.0976465493440628, | |
| "rewards/preference_model_reward/std": 0.08025789260864258, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0004293117090128362, | |
| "epoch": 0.057724957555178265, | |
| "grad_norm": 0.5758752923955168, | |
| "kl": 0.05029296875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0012, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 313.8125, | |
| "epoch": 0.05857385398981324, | |
| "grad_norm": 0.5384984962195909, | |
| "kl": 0.08056640625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0024, | |
| "reward": 0.037503279745578766, | |
| "reward_std": 0.050285980105400085, | |
| "rewards/preference_model_reward": 0.037503279745578766, | |
| "rewards/preference_model_reward/std": 0.05028597638010979, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.000297203310765326, | |
| "epoch": 0.059422750424448216, | |
| "grad_norm": 0.5193566163583858, | |
| "kl": 0.08251953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0026, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 289.84375, | |
| "epoch": 0.06027164685908319, | |
| "grad_norm": 1.5206690115948938, | |
| "kl": 0.0849609375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0006, | |
| "reward": 0.3103345036506653, | |
| "reward_std": 0.14627772569656372, | |
| "rewards/preference_model_reward": 0.3103345036506653, | |
| "rewards/preference_model_reward/std": 0.14627772569656372, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0004342186148278415, | |
| "epoch": 0.06112054329371817, | |
| "grad_norm": 1.4731091282595996, | |
| "kl": 0.0927734375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0001, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 338.90625, | |
| "epoch": 0.06196943972835314, | |
| "grad_norm": 0.5932547022811241, | |
| "kl": 0.0556640625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0004, | |
| "reward": 0.10221391916275024, | |
| "reward_std": 0.07499799132347107, | |
| "rewards/preference_model_reward": 0.10221391916275024, | |
| "rewards/preference_model_reward/std": 0.07499799132347107, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.00036755931796506047, | |
| "epoch": 0.06281833616298811, | |
| "grad_norm": 0.5752683509803187, | |
| "kl": 0.05810546875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0006, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 311.28125, | |
| "epoch": 0.0636672325976231, | |
| "grad_norm": 0.5598161958043475, | |
| "kl": 0.07080078125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0015, | |
| "reward": 0.07023796439170837, | |
| "reward_std": 0.06094999983906746, | |
| "rewards/preference_model_reward": 0.07023796439170837, | |
| "rewards/preference_model_reward/std": 0.06094999611377716, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 9.596929157851264e-05, | |
| "epoch": 0.06451612903225806, | |
| "grad_norm": 0.5569802415068833, | |
| "kl": 0.07275390625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0018, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 227.625, | |
| "epoch": 0.06536502546689305, | |
| "grad_norm": 1.2813688126905285, | |
| "kl": 0.09521484375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0028, | |
| "reward": 0.1808125078678131, | |
| "reward_std": 0.10328490287065506, | |
| "rewards/preference_model_reward": 0.1808125078678131, | |
| "rewards/preference_model_reward/std": 0.10328490287065506, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0006906483322381973, | |
| "epoch": 0.06621392190152801, | |
| "grad_norm": 1.2343717842035047, | |
| "kl": 0.09765625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0035, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 239.25, | |
| "epoch": 0.06706281833616298, | |
| "grad_norm": 1.2476565697443593, | |
| "kl": 0.09423828125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0004, | |
| "reward": 0.3275872468948364, | |
| "reward_std": 0.10609177500009537, | |
| "rewards/preference_model_reward": 0.3275872468948364, | |
| "rewards/preference_model_reward/std": 0.10609177500009537, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.00013171759201213717, | |
| "epoch": 0.06791171477079797, | |
| "grad_norm": 1.191757488980422, | |
| "kl": 0.09765625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0003, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 356.5, | |
| "epoch": 0.06876061120543293, | |
| "grad_norm": 0.10573846819647138, | |
| "kl": 0.07666015625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0002, | |
| "reward": 0.023218905553221703, | |
| "reward_std": 0.013258407823741436, | |
| "rewards/preference_model_reward": 0.023218905553221703, | |
| "rewards/preference_model_reward/std": 0.013258407823741436, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0003507659712340683, | |
| "epoch": 0.06960950764006792, | |
| "grad_norm": 0.1103870844147634, | |
| "kl": 0.0771484375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0001, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 388.15625, | |
| "epoch": 0.07045840407470289, | |
| "grad_norm": 1.1684465615315396, | |
| "kl": 0.11474609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0114, | |
| "reward": 0.16451352834701538, | |
| "reward_std": 0.12782737612724304, | |
| "rewards/preference_model_reward": 0.16451352834701538, | |
| "rewards/preference_model_reward/std": 0.12782739102840424, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.00016542727826163173, | |
| "epoch": 0.07130730050933787, | |
| "grad_norm": 1.1475084655450116, | |
| "kl": 0.115234375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.012, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 717.90625, | |
| "epoch": 0.07215619694397284, | |
| "grad_norm": 0.1671276234234309, | |
| "kl": 0.0439453125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0003, | |
| "reward": 0.004496478941291571, | |
| "reward_std": 0.01374930702149868, | |
| "rewards/preference_model_reward": 0.004496478941291571, | |
| "rewards/preference_model_reward/std": 0.01374930702149868, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0011028368026018143, | |
| "epoch": 0.0730050933786078, | |
| "grad_norm": 0.11851554080319895, | |
| "kl": 0.04150390625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0003, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.875, | |
| "epoch": 0.07385398981324279, | |
| "grad_norm": 0.8930226438210058, | |
| "kl": 0.10107421875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "reward": 0.2464292198419571, | |
| "reward_std": 0.1284564882516861, | |
| "rewards/preference_model_reward": 0.2464292198419571, | |
| "rewards/preference_model_reward/std": 0.1284564733505249, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0003217374032828957, | |
| "epoch": 0.07470288624787776, | |
| "grad_norm": 0.8976677967365754, | |
| "kl": 0.10205078125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0007, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 309.0, | |
| "epoch": 0.07555178268251274, | |
| "grad_norm": 0.8544800016185599, | |
| "kl": 0.1005859375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0008, | |
| "reward": 0.11918962001800537, | |
| "reward_std": 0.08243891596794128, | |
| "rewards/preference_model_reward": 0.11918962001800537, | |
| "rewards/preference_model_reward/std": 0.08243890851736069, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0003866804763674736, | |
| "epoch": 0.07640067911714771, | |
| "grad_norm": 0.9945706586547373, | |
| "kl": 0.10107421875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0013, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 603.15625, | |
| "epoch": 0.07724957555178268, | |
| "grad_norm": 0.6302983763196456, | |
| "kl": 0.10009765625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0001, | |
| "reward": 0.13992220163345337, | |
| "reward_std": 0.08116798847913742, | |
| "rewards/preference_model_reward": 0.13992220163345337, | |
| "rewards/preference_model_reward/std": 0.08116798847913742, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.00037479729508049786, | |
| "epoch": 0.07809847198641766, | |
| "grad_norm": 0.6164704370528037, | |
| "kl": 0.099609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 312.15625, | |
| "epoch": 0.07894736842105263, | |
| "grad_norm": 1.3844161112543023, | |
| "kl": 0.10888671875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0053, | |
| "reward": 0.36870962381362915, | |
| "reward_std": 0.134088933467865, | |
| "rewards/preference_model_reward": 0.36870962381362915, | |
| "rewards/preference_model_reward/std": 0.1340889185667038, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.001708789262920618, | |
| "epoch": 0.07979626485568761, | |
| "grad_norm": 1.2611380216933197, | |
| "kl": 0.10888671875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0061, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 345.84375, | |
| "epoch": 0.08064516129032258, | |
| "grad_norm": 0.7996421963852738, | |
| "kl": 0.1142578125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0001, | |
| "reward": 0.11349868029356003, | |
| "reward_std": 0.08194027096033096, | |
| "rewards/preference_model_reward": 0.11349868029356003, | |
| "rewards/preference_model_reward/std": 0.08194026350975037, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.000727824226487428, | |
| "epoch": 0.08149405772495756, | |
| "grad_norm": 0.851345288015861, | |
| "kl": 0.11474609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0004, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 496.8125, | |
| "epoch": 0.08234295415959253, | |
| "grad_norm": 0.7233456374057684, | |
| "kl": 0.0771484375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0005, | |
| "reward": 0.062306515872478485, | |
| "reward_std": 0.06466341018676758, | |
| "rewards/preference_model_reward": 0.062306515872478485, | |
| "rewards/preference_model_reward/std": 0.06466341018676758, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0024484877940267324, | |
| "epoch": 0.0831918505942275, | |
| "grad_norm": 0.5356124474945269, | |
| "kl": 0.0751953125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0003, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 346.46875, | |
| "epoch": 0.08404074702886248, | |
| "grad_norm": 0.9856700443791508, | |
| "kl": 0.08447265625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0026, | |
| "reward": 0.128938689827919, | |
| "reward_std": 0.10464771091938019, | |
| "rewards/preference_model_reward": 0.128938689827919, | |
| "rewards/preference_model_reward/std": 0.10464771091938019, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.00026480897213332355, | |
| "epoch": 0.08488964346349745, | |
| "grad_norm": 0.9630686055340228, | |
| "kl": 0.08642578125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0032, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 265.71875, | |
| "epoch": 0.08573853989813243, | |
| "grad_norm": 1.4006922971408575, | |
| "kl": 0.099609375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0017, | |
| "reward": 0.20826829969882965, | |
| "reward_std": 0.08805741369724274, | |
| "rewards/preference_model_reward": 0.20826829969882965, | |
| "rewards/preference_model_reward/std": 0.08805741369724274, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.00047059552161954343, | |
| "epoch": 0.0865874363327674, | |
| "grad_norm": 1.0093723749145036, | |
| "kl": 0.099609375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0014, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 352.34375, | |
| "epoch": 0.08743633276740238, | |
| "grad_norm": 0.7907866558806076, | |
| "kl": 0.099609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0069, | |
| "reward": 0.1357034146785736, | |
| "reward_std": 0.08183176815509796, | |
| "rewards/preference_model_reward": 0.1357034146785736, | |
| "rewards/preference_model_reward/std": 0.08183176815509796, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.00016943408991210163, | |
| "epoch": 0.08828522920203735, | |
| "grad_norm": 0.8163727108346147, | |
| "kl": 0.1005859375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0074, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 405.1875, | |
| "epoch": 0.08913412563667232, | |
| "grad_norm": 1.2455238091620369, | |
| "kl": 0.1015625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0006, | |
| "reward": 0.32809197902679443, | |
| "reward_std": 0.13614021241664886, | |
| "rewards/preference_model_reward": 0.32809197902679443, | |
| "rewards/preference_model_reward/std": 0.13614021241664886, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.00047510667354799807, | |
| "epoch": 0.0899830220713073, | |
| "grad_norm": 1.175271310166888, | |
| "kl": 0.1044921875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0012, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 507.28125, | |
| "epoch": 0.09083191850594227, | |
| "grad_norm": 1.4891793433061915, | |
| "kl": 0.09521484375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0058, | |
| "reward": 0.3271234631538391, | |
| "reward_std": 0.11905878782272339, | |
| "rewards/preference_model_reward": 0.3271234631538391, | |
| "rewards/preference_model_reward/std": 0.11905878782272339, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0007332629174925387, | |
| "epoch": 0.09168081494057725, | |
| "grad_norm": 0.9610577492843277, | |
| "kl": 0.09814453125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0054, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 677.375, | |
| "epoch": 0.09252971137521222, | |
| "grad_norm": 0.5710959299764182, | |
| "kl": 0.1005859375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0005, | |
| "reward": 0.3096367120742798, | |
| "reward_std": 0.08406942337751389, | |
| "rewards/preference_model_reward": 0.3096367120742798, | |
| "rewards/preference_model_reward/std": 0.08406941592693329, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.00041483400855213404, | |
| "epoch": 0.0933786078098472, | |
| "grad_norm": 0.5381736695457521, | |
| "kl": 0.10205078125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0001, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 408.78125, | |
| "epoch": 0.09422750424448217, | |
| "grad_norm": 1.04134918622721, | |
| "kl": 0.130859375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0019, | |
| "reward": 0.16213001310825348, | |
| "reward_std": 0.09974581748247147, | |
| "rewards/preference_model_reward": 0.16213001310825348, | |
| "rewards/preference_model_reward/std": 0.09974581748247147, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0008149376371875405, | |
| "epoch": 0.09507640067911714, | |
| "grad_norm": 0.9160731616594122, | |
| "kl": 0.1328125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0024, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 373.96875, | |
| "epoch": 0.09592529711375213, | |
| "grad_norm": 1.0946577521442298, | |
| "kl": 0.12158203125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.007, | |
| "reward": 0.2697640657424927, | |
| "reward_std": 0.10352278500795364, | |
| "rewards/preference_model_reward": 0.2697640657424927, | |
| "rewards/preference_model_reward/std": 0.10352278500795364, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0013311142101883888, | |
| "epoch": 0.0967741935483871, | |
| "grad_norm": 0.9667455701612728, | |
| "kl": 0.12353515625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0065, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 306.0, | |
| "epoch": 0.09762308998302208, | |
| "grad_norm": 1.3217916635151856, | |
| "kl": 0.12890625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0095, | |
| "reward": 0.26093003153800964, | |
| "reward_std": 0.125474750995636, | |
| "rewards/preference_model_reward": 0.26093003153800964, | |
| "rewards/preference_model_reward/std": 0.125474750995636, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "epoch": 0.09847198641765705, | |
| "grad_norm": 1.249179472861132, | |
| "kl": 0.1318359375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0088, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 418.71875, | |
| "epoch": 0.09932088285229201, | |
| "grad_norm": 1.0159825028105314, | |
| "kl": 0.126953125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0004, | |
| "reward": 0.21273019909858704, | |
| "reward_std": 0.10706693679094315, | |
| "rewards/preference_model_reward": 0.21273019909858704, | |
| "rewards/preference_model_reward/std": 0.10706692934036255, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.00037145469104871154, | |
| "epoch": 0.100169779286927, | |
| "grad_norm": 0.932911697631841, | |
| "kl": 0.12890625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.0625, | |
| "epoch": 0.10101867572156197, | |
| "grad_norm": 1.3593075889098412, | |
| "kl": 0.1123046875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0152, | |
| "reward": 0.42055854201316833, | |
| "reward_std": 0.1595481038093567, | |
| "rewards/preference_model_reward": 0.42055854201316833, | |
| "rewards/preference_model_reward/std": 0.1595481038093567, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0009689436410553753, | |
| "epoch": 0.10186757215619695, | |
| "grad_norm": 1.5262381686745565, | |
| "kl": 0.115234375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0144, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 357.5625, | |
| "epoch": 0.10271646859083192, | |
| "grad_norm": 0.6284715836457387, | |
| "kl": 0.1533203125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0006, | |
| "reward": 0.5042369365692139, | |
| "reward_std": 0.05617382749915123, | |
| "rewards/preference_model_reward": 0.5042369365692139, | |
| "rewards/preference_model_reward/std": 0.05617383494973183, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0016699727857485414, | |
| "epoch": 0.1035653650254669, | |
| "grad_norm": 0.5676292275365208, | |
| "kl": 0.154296875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0009, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 413.6875, | |
| "epoch": 0.10441426146010187, | |
| "grad_norm": 0.6166063462451017, | |
| "kl": 0.1240234375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0012, | |
| "reward": 0.4225958585739136, | |
| "reward_std": 0.06309302896261215, | |
| "rewards/preference_model_reward": 0.4225958585739136, | |
| "rewards/preference_model_reward/std": 0.06309301406145096, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.001325472490862012, | |
| "epoch": 0.10526315789473684, | |
| "grad_norm": 0.5030461455471626, | |
| "kl": 0.1240234375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 654.625, | |
| "epoch": 0.10611205432937182, | |
| "grad_norm": 0.9061656748551837, | |
| "kl": 0.1416015625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0003, | |
| "reward": 0.26992088556289673, | |
| "reward_std": 0.12210524082183838, | |
| "rewards/preference_model_reward": 0.26992088556289673, | |
| "rewards/preference_model_reward/std": 0.12210523337125778, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0003366470627952367, | |
| "epoch": 0.10696095076400679, | |
| "grad_norm": 0.8859074333947582, | |
| "kl": 0.142578125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0008, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 318.96875, | |
| "epoch": 0.10780984719864177, | |
| "grad_norm": 0.7830937452002261, | |
| "kl": 0.07763671875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0029, | |
| "reward": 0.31064295768737793, | |
| "reward_std": 0.0781577080488205, | |
| "rewards/preference_model_reward": 0.31064295768737793, | |
| "rewards/preference_model_reward/std": 0.07815771549940109, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0004959848592989147, | |
| "epoch": 0.10865874363327674, | |
| "grad_norm": 0.7465755882530519, | |
| "kl": 0.07861328125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0034, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.4375, | |
| "epoch": 0.10950764006791172, | |
| "grad_norm": 0.6716474625246404, | |
| "kl": 0.150390625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "reward": 0.4774589240550995, | |
| "reward_std": 0.04319845512509346, | |
| "rewards/preference_model_reward": 0.4774589240550995, | |
| "rewards/preference_model_reward/std": 0.04319845885038376, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0006459264550358057, | |
| "epoch": 0.11035653650254669, | |
| "grad_norm": 0.4277286893768862, | |
| "kl": 0.15234375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0016, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 230.78125, | |
| "epoch": 0.11120543293718166, | |
| "grad_norm": 0.9935245836857972, | |
| "kl": 0.169921875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0023, | |
| "reward": 0.3256058692932129, | |
| "reward_std": 0.07717268913984299, | |
| "rewards/preference_model_reward": 0.3256058692932129, | |
| "rewards/preference_model_reward/std": 0.07717268913984299, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.00220286101102829, | |
| "epoch": 0.11205432937181664, | |
| "grad_norm": 0.9136352224117723, | |
| "kl": 0.16796875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0028, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 350.46875, | |
| "epoch": 0.11290322580645161, | |
| "grad_norm": 0.5870925563537904, | |
| "kl": 0.1640625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0002, | |
| "reward": 0.08773044496774673, | |
| "reward_std": 0.06896770745515823, | |
| "rewards/preference_model_reward": 0.08773044496774673, | |
| "rewards/preference_model_reward/std": 0.06896770745515823, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0005350956926122308, | |
| "epoch": 0.11375212224108659, | |
| "grad_norm": 0.5639384325042808, | |
| "kl": 0.16015625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 528.0625, | |
| "epoch": 0.11460101867572156, | |
| "grad_norm": 0.48480451064325536, | |
| "kl": 0.12109375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "reward": 0.4244787096977234, | |
| "reward_std": 0.05737914890050888, | |
| "rewards/preference_model_reward": 0.4244787096977234, | |
| "rewards/preference_model_reward/std": 0.05737914890050888, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.000636638724245131, | |
| "epoch": 0.11544991511035653, | |
| "grad_norm": 0.4122336602818054, | |
| "kl": 0.11328125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0017, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 257.625, | |
| "epoch": 0.11629881154499151, | |
| "grad_norm": 1.4922729095300895, | |
| "kl": 0.203125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0004, | |
| "reward": 0.3075970411300659, | |
| "reward_std": 0.10743933171033859, | |
| "rewards/preference_model_reward": 0.3075970411300659, | |
| "rewards/preference_model_reward/std": 0.10743933171033859, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0006169785629026592, | |
| "epoch": 0.11714770797962648, | |
| "grad_norm": 1.1808501339893651, | |
| "kl": 0.203125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.001, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 306.9375, | |
| "epoch": 0.11799660441426146, | |
| "grad_norm": 1.0160072383114551, | |
| "kl": 0.16796875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0061, | |
| "reward": 0.46265456080436707, | |
| "reward_std": 0.11661313474178314, | |
| "rewards/preference_model_reward": 0.46265456080436707, | |
| "rewards/preference_model_reward/std": 0.11661314219236374, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0003987574018537998, | |
| "epoch": 0.11884550084889643, | |
| "grad_norm": 0.9531276983222098, | |
| "kl": 0.166015625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0068, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 503.5, | |
| "epoch": 0.11969439728353141, | |
| "grad_norm": 1.1419798492524438, | |
| "kl": 0.1787109375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0048, | |
| "reward": 0.20830851793289185, | |
| "reward_std": 0.13461197912693024, | |
| "rewards/preference_model_reward": 0.20830851793289185, | |
| "rewards/preference_model_reward/std": 0.13461197912693024, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.00026064369012601674, | |
| "epoch": 0.12054329371816638, | |
| "grad_norm": 1.072087725228961, | |
| "kl": 0.1806640625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.004, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 384.1875, | |
| "epoch": 0.12139219015280135, | |
| "grad_norm": 1.363810772987835, | |
| "kl": 0.1884765625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0039, | |
| "reward": 0.24709536135196686, | |
| "reward_std": 0.13785149157047272, | |
| "rewards/preference_model_reward": 0.24709536135196686, | |
| "rewards/preference_model_reward/std": 0.1378515064716339, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.00023824731761123985, | |
| "epoch": 0.12224108658743633, | |
| "grad_norm": 1.280172314576326, | |
| "kl": 0.19140625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0047, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 425.90625, | |
| "epoch": 0.1230899830220713, | |
| "grad_norm": 0.5570698552144494, | |
| "kl": 0.115234375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0017, | |
| "reward": 0.12276525795459747, | |
| "reward_std": 0.06314485520124435, | |
| "rewards/preference_model_reward": 0.12276525795459747, | |
| "rewards/preference_model_reward/std": 0.06314485520124435, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0003630488063208759, | |
| "epoch": 0.12393887945670629, | |
| "grad_norm": 0.5707319334424644, | |
| "kl": 0.11474609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.002, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 331.625, | |
| "epoch": 0.12478777589134125, | |
| "grad_norm": 0.8955034856932126, | |
| "kl": 0.189453125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.002, | |
| "reward": 0.06658346205949783, | |
| "reward_std": 0.05682339146733284, | |
| "rewards/preference_model_reward": 0.06658346205949783, | |
| "rewards/preference_model_reward/std": 0.05682339146733284, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0011066581355407834, | |
| "epoch": 0.12563667232597622, | |
| "grad_norm": 0.6244418580201853, | |
| "kl": 0.19140625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0023, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 393.15625, | |
| "epoch": 0.1264855687606112, | |
| "grad_norm": 1.2895611117206778, | |
| "kl": 0.1875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0053, | |
| "reward": 0.25148850679397583, | |
| "reward_std": 0.12007515132427216, | |
| "rewards/preference_model_reward": 0.25148850679397583, | |
| "rewards/preference_model_reward/std": 0.12007514387369156, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0006402829312719405, | |
| "epoch": 0.1273344651952462, | |
| "grad_norm": 1.288543979995357, | |
| "kl": 0.189453125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0061, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 440.78125, | |
| "epoch": 0.12818336162988114, | |
| "grad_norm": 1.2555636888842705, | |
| "kl": 0.193359375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0041, | |
| "reward": 0.2623947262763977, | |
| "reward_std": 0.11035064607858658, | |
| "rewards/preference_model_reward": 0.2623947262763977, | |
| "rewards/preference_model_reward/std": 0.11035064607858658, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.003197396406903863, | |
| "epoch": 0.12903225806451613, | |
| "grad_norm": 1.1625036279466894, | |
| "kl": 0.197265625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0047, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 271.59375, | |
| "epoch": 0.1298811544991511, | |
| "grad_norm": 1.096093984535528, | |
| "kl": 0.251953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0016, | |
| "reward": 0.3598458170890808, | |
| "reward_std": 0.08515099436044693, | |
| "rewards/preference_model_reward": 0.3598458170890808, | |
| "rewards/preference_model_reward/std": 0.08515099436044693, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0012925309129059315, | |
| "epoch": 0.1307300509337861, | |
| "grad_norm": 1.00178630558753, | |
| "kl": 0.255859375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0023, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 415.5, | |
| "epoch": 0.13157894736842105, | |
| "grad_norm": 1.1081030432870604, | |
| "kl": 0.177734375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0115, | |
| "reward": 0.3061649799346924, | |
| "reward_std": 0.11071331799030304, | |
| "rewards/preference_model_reward": 0.3061649799346924, | |
| "rewards/preference_model_reward/std": 0.11071331799030304, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0003086737706325948, | |
| "epoch": 0.13242784380305603, | |
| "grad_norm": 1.0874086536996357, | |
| "kl": 0.177734375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0109, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 417.03125, | |
| "epoch": 0.133276740237691, | |
| "grad_norm": 0.7599994731662741, | |
| "kl": 0.177734375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0007, | |
| "reward": 0.48063063621520996, | |
| "reward_std": 0.06754690408706665, | |
| "rewards/preference_model_reward": 0.48063063621520996, | |
| "rewards/preference_model_reward/std": 0.06754691153764725, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0018070859368890524, | |
| "epoch": 0.13412563667232597, | |
| "grad_norm": 0.597446098810492, | |
| "kl": 0.1650390625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.001, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 383.9375, | |
| "epoch": 0.13497453310696095, | |
| "grad_norm": 0.7677380389143111, | |
| "kl": 0.2041015625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0046, | |
| "reward": 0.1170925423502922, | |
| "reward_std": 0.07494159787893295, | |
| "rewards/preference_model_reward": 0.1170925423502922, | |
| "rewards/preference_model_reward/std": 0.07494159787893295, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0012223758967593312, | |
| "epoch": 0.13582342954159593, | |
| "grad_norm": 0.7399797210592777, | |
| "kl": 0.203125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0051, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 722.25, | |
| "epoch": 0.1366723259762309, | |
| "grad_norm": 0.8765706673008182, | |
| "kl": 0.189453125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0001, | |
| "reward": 0.3232240676879883, | |
| "reward_std": 0.09787525236606598, | |
| "rewards/preference_model_reward": 0.3232240676879883, | |
| "rewards/preference_model_reward/std": 0.09787525236606598, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0010285093449056149, | |
| "epoch": 0.13752122241086587, | |
| "grad_norm": 0.762896466304412, | |
| "kl": 0.1875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 714.09375, | |
| "epoch": 0.13837011884550085, | |
| "grad_norm": 1.1627137317630705, | |
| "kl": 0.12353515625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0081, | |
| "reward": 0.3034539818763733, | |
| "reward_std": 0.15702968835830688, | |
| "rewards/preference_model_reward": 0.3034539818763733, | |
| "rewards/preference_model_reward/std": 0.15702970325946808, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0006113144336268306, | |
| "epoch": 0.13921901528013583, | |
| "grad_norm": 1.110258121456432, | |
| "kl": 0.12158203125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0073, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 391.21875, | |
| "epoch": 0.1400679117147708, | |
| "grad_norm": 1.089926535448989, | |
| "kl": 0.1708984375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0037, | |
| "reward": 0.22248202562332153, | |
| "reward_std": 0.11353754997253418, | |
| "rewards/preference_model_reward": 0.22248202562332153, | |
| "rewards/preference_model_reward/std": 0.11353754997253418, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0005485577858053148, | |
| "epoch": 0.14091680814940577, | |
| "grad_norm": 0.9711244878108193, | |
| "kl": 0.1708984375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0031, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 426.21875, | |
| "epoch": 0.14176570458404075, | |
| "grad_norm": 1.2634303413205847, | |
| "kl": 0.2392578125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0015, | |
| "reward": 0.23580655455589294, | |
| "reward_std": 0.12878787517547607, | |
| "rewards/preference_model_reward": 0.23580655455589294, | |
| "rewards/preference_model_reward/std": 0.12878787517547607, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0008147264015860856, | |
| "epoch": 0.14261460101867574, | |
| "grad_norm": 1.2475202369294187, | |
| "kl": 0.2392578125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0007, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 420.84375, | |
| "epoch": 0.1434634974533107, | |
| "grad_norm": 0.9256116658563932, | |
| "kl": 0.1806640625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0017, | |
| "reward": 0.19842864573001862, | |
| "reward_std": 0.09470146149396896, | |
| "rewards/preference_model_reward": 0.19842864573001862, | |
| "rewards/preference_model_reward/std": 0.09470146149396896, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0016190335154533386, | |
| "epoch": 0.14431239388794567, | |
| "grad_norm": 0.8308927390156897, | |
| "kl": 0.1796875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0012, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 511.125, | |
| "epoch": 0.14516129032258066, | |
| "grad_norm": 0.9408637846666039, | |
| "kl": 0.181640625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0081, | |
| "reward": 0.19552090764045715, | |
| "reward_std": 0.1133582592010498, | |
| "rewards/preference_model_reward": 0.19552090764045715, | |
| "rewards/preference_model_reward/std": 0.11335825175046921, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.000971193250734359, | |
| "epoch": 0.1460101867572156, | |
| "grad_norm": 0.946800803701003, | |
| "kl": 0.181640625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0076, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 645.34375, | |
| "epoch": 0.1468590831918506, | |
| "grad_norm": 0.8208513143064141, | |
| "kl": 0.22265625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.002, | |
| "reward": 0.4708970785140991, | |
| "reward_std": 0.10244568437337875, | |
| "rewards/preference_model_reward": 0.4708970785140991, | |
| "rewards/preference_model_reward/std": 0.10244568437337875, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0008575035026296973, | |
| "epoch": 0.14770797962648557, | |
| "grad_norm": 0.777907252016811, | |
| "kl": 0.220703125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0015, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 352.5625, | |
| "epoch": 0.14855687606112053, | |
| "grad_norm": 0.7724021747968359, | |
| "kl": 0.26171875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0009, | |
| "reward": 0.28050410747528076, | |
| "reward_std": 0.07774435728788376, | |
| "rewards/preference_model_reward": 0.28050410747528076, | |
| "rewards/preference_model_reward/std": 0.07774436473846436, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0007047850522212684, | |
| "epoch": 0.1494057724957555, | |
| "grad_norm": 0.8617498914678887, | |
| "kl": 0.26171875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 528.8125, | |
| "epoch": 0.1502546689303905, | |
| "grad_norm": 1.0109383309762656, | |
| "kl": 0.19921875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0031, | |
| "reward": 0.400208055973053, | |
| "reward_std": 0.11498203873634338, | |
| "rewards/preference_model_reward": 0.400208055973053, | |
| "rewards/preference_model_reward/std": 0.11498204618692398, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0015235163737088442, | |
| "epoch": 0.15110356536502548, | |
| "grad_norm": 0.9751593675624571, | |
| "kl": 0.19921875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0024, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 364.84375, | |
| "epoch": 0.15195246179966043, | |
| "grad_norm": 1.0436702904156618, | |
| "kl": 0.1962890625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0041, | |
| "reward": 0.431307315826416, | |
| "reward_std": 0.10992471128702164, | |
| "rewards/preference_model_reward": 0.431307315826416, | |
| "rewards/preference_model_reward/std": 0.10992471128702164, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.00033796619391068816, | |
| "epoch": 0.15280135823429541, | |
| "grad_norm": 0.9638853656528253, | |
| "kl": 0.1953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0048, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 421.03125, | |
| "epoch": 0.1536502546689304, | |
| "grad_norm": 1.0786137221446535, | |
| "kl": 0.2158203125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0106, | |
| "reward": 0.26917120814323425, | |
| "reward_std": 0.11035769432783127, | |
| "rewards/preference_model_reward": 0.26917120814323425, | |
| "rewards/preference_model_reward/std": 0.11035769432783127, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0005866018473170698, | |
| "epoch": 0.15449915110356535, | |
| "grad_norm": 1.0797645934269513, | |
| "kl": 0.21484375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.01, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.40625, | |
| "epoch": 0.15534804753820033, | |
| "grad_norm": 0.9516406822692116, | |
| "kl": 0.21484375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0008, | |
| "reward": 0.4257683753967285, | |
| "reward_std": 0.08008842915296555, | |
| "rewards/preference_model_reward": 0.4257683753967285, | |
| "rewards/preference_model_reward/std": 0.08008842915296555, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0007616700022481382, | |
| "epoch": 0.15619694397283532, | |
| "grad_norm": 1.0243333961812868, | |
| "kl": 0.2158203125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 309.3125, | |
| "epoch": 0.1570458404074703, | |
| "grad_norm": 1.0966611105512216, | |
| "kl": 0.19140625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0, | |
| "reward": 0.44364723563194275, | |
| "reward_std": 0.08954507112503052, | |
| "rewards/preference_model_reward": 0.44364723563194275, | |
| "rewards/preference_model_reward/std": 0.08954507112503052, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0007273735827766359, | |
| "epoch": 0.15789473684210525, | |
| "grad_norm": 0.8995850425186181, | |
| "kl": 0.19140625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0006, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 350.03125, | |
| "epoch": 0.15874363327674024, | |
| "grad_norm": 1.0365032710666704, | |
| "kl": 0.1943359375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0025, | |
| "reward": 0.503156304359436, | |
| "reward_std": 0.06975705921649933, | |
| "rewards/preference_model_reward": 0.503156304359436, | |
| "rewards/preference_model_reward/std": 0.06975706666707993, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.003309250809252262, | |
| "epoch": 0.15959252971137522, | |
| "grad_norm": 0.7719048533542018, | |
| "kl": 0.1953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0028, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 469.40625, | |
| "epoch": 0.16044142614601017, | |
| "grad_norm": 1.2152567376815486, | |
| "kl": 0.19140625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0018, | |
| "reward": 0.2653403878211975, | |
| "reward_std": 0.13634686172008514, | |
| "rewards/preference_model_reward": 0.2653403878211975, | |
| "rewards/preference_model_reward/std": 0.13634686172008514, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.00013139564543962479, | |
| "epoch": 0.16129032258064516, | |
| "grad_norm": 1.1599347877086612, | |
| "kl": 0.19140625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0027, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 387.0625, | |
| "epoch": 0.16213921901528014, | |
| "grad_norm": 1.1561729382206092, | |
| "kl": 0.21484375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0003, | |
| "reward": 0.3497307002544403, | |
| "reward_std": 0.11881572753190994, | |
| "rewards/preference_model_reward": 0.3497307002544403, | |
| "rewards/preference_model_reward/std": 0.11881572753190994, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0006593581638298929, | |
| "epoch": 0.16298811544991512, | |
| "grad_norm": 1.0934785615900324, | |
| "kl": 0.216796875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0005, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 466.8125, | |
| "epoch": 0.16383701188455008, | |
| "grad_norm": 0.5519710984838219, | |
| "kl": 0.1845703125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0013, | |
| "reward": 0.10650002956390381, | |
| "reward_std": 0.059550777077674866, | |
| "rewards/preference_model_reward": 0.10650002956390381, | |
| "rewards/preference_model_reward/std": 0.059550777077674866, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0010145865380764008, | |
| "epoch": 0.16468590831918506, | |
| "grad_norm": 0.8499180844539812, | |
| "kl": 0.185546875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.001, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 556.59375, | |
| "epoch": 0.16553480475382004, | |
| "grad_norm": 0.9414095054306995, | |
| "kl": 0.13671875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0036, | |
| "reward": 0.35372745990753174, | |
| "reward_std": 0.11836274713277817, | |
| "rewards/preference_model_reward": 0.35372745990753174, | |
| "rewards/preference_model_reward/std": 0.11836273968219757, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.00011072463530581445, | |
| "epoch": 0.166383701188455, | |
| "grad_norm": 0.8517491157753083, | |
| "kl": 0.13671875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0031, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 441.21875, | |
| "epoch": 0.16723259762308998, | |
| "grad_norm": 1.0176591850675871, | |
| "kl": 0.203125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.003, | |
| "reward": 0.32411473989486694, | |
| "reward_std": 0.10493102669715881, | |
| "rewards/preference_model_reward": 0.32411473989486694, | |
| "rewards/preference_model_reward/std": 0.10493102669715881, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0005816287593916059, | |
| "epoch": 0.16808149405772496, | |
| "grad_norm": 0.9532792399626693, | |
| "kl": 0.2041015625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0036, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 693.71875, | |
| "epoch": 0.16893039049235994, | |
| "grad_norm": 0.6157709458820001, | |
| "kl": 0.17578125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0012, | |
| "reward": 0.5120692849159241, | |
| "reward_std": 0.08368350565433502, | |
| "rewards/preference_model_reward": 0.5120692849159241, | |
| "rewards/preference_model_reward/std": 0.08368349820375443, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.000810971308965236, | |
| "epoch": 0.1697792869269949, | |
| "grad_norm": 0.5744963108218382, | |
| "kl": 0.1748046875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0008, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 499.34375, | |
| "epoch": 0.17062818336162988, | |
| "grad_norm": 1.171086543231942, | |
| "kl": 0.181640625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0044, | |
| "reward": 0.3279721438884735, | |
| "reward_std": 0.14420974254608154, | |
| "rewards/preference_model_reward": 0.3279721438884735, | |
| "rewards/preference_model_reward/std": 0.14420974254608154, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0011094075161963701, | |
| "epoch": 0.17147707979626486, | |
| "grad_norm": 1.121059254698811, | |
| "kl": 0.1806640625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0052, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 292.6875, | |
| "epoch": 0.17232597623089982, | |
| "grad_norm": 0.8874841846577225, | |
| "kl": 0.1083984375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0089, | |
| "reward": 0.27286988496780396, | |
| "reward_std": 0.09546167403459549, | |
| "rewards/preference_model_reward": 0.27286988496780396, | |
| "rewards/preference_model_reward/std": 0.09546167403459549, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.00020234723342582583, | |
| "epoch": 0.1731748726655348, | |
| "grad_norm": 0.8726296645362996, | |
| "kl": 0.1083984375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0095, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 504.875, | |
| "epoch": 0.17402376910016978, | |
| "grad_norm": 2.7391485484558045, | |
| "kl": 0.1904296875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0018, | |
| "reward": 0.41756588220596313, | |
| "reward_std": 0.11001207679510117, | |
| "rewards/preference_model_reward": 0.41756588220596313, | |
| "rewards/preference_model_reward/std": 0.11001206934452057, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0005510338814929128, | |
| "epoch": 0.17487266553480477, | |
| "grad_norm": 0.8607142192205527, | |
| "kl": 0.189453125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.002, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 452.84375, | |
| "epoch": 0.17572156196943972, | |
| "grad_norm": 0.4685652506244187, | |
| "kl": 0.1923828125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0003, | |
| "reward": 0.48838815093040466, | |
| "reward_std": 0.039198972284793854, | |
| "rewards/preference_model_reward": 0.48838815093040466, | |
| "rewards/preference_model_reward/std": 0.03919896483421326, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.001379701541736722, | |
| "epoch": 0.1765704584040747, | |
| "grad_norm": 0.36885619072170894, | |
| "kl": 0.1904296875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0002, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 457.8125, | |
| "epoch": 0.1774193548387097, | |
| "grad_norm": 0.660983587706389, | |
| "kl": 0.173828125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0025, | |
| "reward": 0.08188852667808533, | |
| "reward_std": 0.06869849562644958, | |
| "rewards/preference_model_reward": 0.08188852667808533, | |
| "rewards/preference_model_reward/std": 0.06869849562644958, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0004064367385581136, | |
| "epoch": 0.17826825127334464, | |
| "grad_norm": 0.5900136843275542, | |
| "kl": 0.1728515625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0029, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.34375, | |
| "epoch": 0.17911714770797962, | |
| "grad_norm": 1.192737456986498, | |
| "kl": 0.208984375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0126, | |
| "reward": 0.25954583287239075, | |
| "reward_std": 0.09377846866846085, | |
| "rewards/preference_model_reward": 0.25954583287239075, | |
| "rewards/preference_model_reward/std": 0.09377846121788025, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0005144176539033651, | |
| "epoch": 0.1799660441426146, | |
| "grad_norm": 0.9034533154555449, | |
| "kl": 0.19921875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0122, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 503.5625, | |
| "epoch": 0.1808149405772496, | |
| "grad_norm": 0.9811032516295555, | |
| "kl": 0.208984375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0015, | |
| "reward": 0.22673586010932922, | |
| "reward_std": 0.1181153729557991, | |
| "rewards/preference_model_reward": 0.22673586010932922, | |
| "rewards/preference_model_reward/std": 0.1181153655052185, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0006308910087682307, | |
| "epoch": 0.18166383701188454, | |
| "grad_norm": 1.2177987016178247, | |
| "kl": 0.2060546875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0009, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 380.9375, | |
| "epoch": 0.18251273344651953, | |
| "grad_norm": 1.1883998105439184, | |
| "kl": 0.2041015625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0026, | |
| "reward": 0.3673512935638428, | |
| "reward_std": 0.11825986206531525, | |
| "rewards/preference_model_reward": 0.3673512935638428, | |
| "rewards/preference_model_reward/std": 0.11825986206531525, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0004939221544191241, | |
| "epoch": 0.1833616298811545, | |
| "grad_norm": 1.1187570941679779, | |
| "kl": 0.2060546875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0018, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 482.78125, | |
| "epoch": 0.18421052631578946, | |
| "grad_norm": 1.2336234795768202, | |
| "kl": 0.1943359375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0007, | |
| "reward": 0.22679108381271362, | |
| "reward_std": 0.13555686175823212, | |
| "rewards/preference_model_reward": 0.22679108381271362, | |
| "rewards/preference_model_reward/std": 0.13555686175823212, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0012310510501265526, | |
| "epoch": 0.18505942275042445, | |
| "grad_norm": 1.1626357202169864, | |
| "kl": 0.197265625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.25, | |
| "epoch": 0.18590831918505943, | |
| "grad_norm": 0.90605092796741, | |
| "kl": 0.18359375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0023, | |
| "reward": 0.44145655632019043, | |
| "reward_std": 0.09349598735570908, | |
| "rewards/preference_model_reward": 0.44145655632019043, | |
| "rewards/preference_model_reward/std": 0.09349598735570908, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0001996077917283401, | |
| "epoch": 0.1867572156196944, | |
| "grad_norm": 0.8461078416569522, | |
| "kl": 0.185546875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0028, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 469.25, | |
| "epoch": 0.18760611205432937, | |
| "grad_norm": 1.2596387776384212, | |
| "kl": 0.2177734375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0053, | |
| "reward": 0.3222573697566986, | |
| "reward_std": 0.10504135489463806, | |
| "rewards/preference_model_reward": 0.3222573697566986, | |
| "rewards/preference_model_reward/std": 0.10504135489463806, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.00020092798513360322, | |
| "epoch": 0.18845500848896435, | |
| "grad_norm": 0.9722158375203807, | |
| "kl": 0.21875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0048, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 360.34375, | |
| "epoch": 0.18930390492359933, | |
| "grad_norm": 1.365145072141222, | |
| "kl": 0.2392578125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0032, | |
| "reward": 0.4276666045188904, | |
| "reward_std": 0.12780673801898956, | |
| "rewards/preference_model_reward": 0.4276666045188904, | |
| "rewards/preference_model_reward/std": 0.12780673801898956, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0016522787045687437, | |
| "epoch": 0.19015280135823429, | |
| "grad_norm": 1.286088004349321, | |
| "kl": 0.2421875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0023, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 517.375, | |
| "epoch": 0.19100169779286927, | |
| "grad_norm": 1.0225752401513326, | |
| "kl": 0.2138671875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0013, | |
| "reward": 0.3944551348686218, | |
| "reward_std": 0.11224386841058731, | |
| "rewards/preference_model_reward": 0.3944551348686218, | |
| "rewards/preference_model_reward/std": 0.11224386096000671, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0007841808255761862, | |
| "epoch": 0.19185059422750425, | |
| "grad_norm": 0.9149056778209514, | |
| "kl": 0.197265625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0008, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 291.59375, | |
| "epoch": 0.1926994906621392, | |
| "grad_norm": 1.288986938092742, | |
| "kl": 0.2734375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0003, | |
| "reward": 0.4484859108924866, | |
| "reward_std": 0.04202309623360634, | |
| "rewards/preference_model_reward": 0.4484859108924866, | |
| "rewards/preference_model_reward/std": 0.04202309623360634, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.00215684762224555, | |
| "epoch": 0.1935483870967742, | |
| "grad_norm": 0.4977519171941583, | |
| "kl": 0.275390625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0004, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 518.6875, | |
| "epoch": 0.19439728353140917, | |
| "grad_norm": 1.1492317219584003, | |
| "kl": 0.23046875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.007, | |
| "reward": 0.39957520365715027, | |
| "reward_std": 0.12616394460201263, | |
| "rewards/preference_model_reward": 0.39957520365715027, | |
| "rewards/preference_model_reward/std": 0.12616392970085144, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0006631789728999138, | |
| "epoch": 0.19524617996604415, | |
| "grad_norm": 1.1237887647036364, | |
| "kl": 0.2314453125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0063, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 264.15625, | |
| "epoch": 0.1960950764006791, | |
| "grad_norm": 1.5446214982606454, | |
| "kl": 0.3828125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.003, | |
| "reward": 0.051013268530368805, | |
| "reward_std": 0.04769134148955345, | |
| "rewards/preference_model_reward": 0.051013268530368805, | |
| "rewards/preference_model_reward/std": 0.04769134148955345, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0015723377000540495, | |
| "epoch": 0.1969439728353141, | |
| "grad_norm": 0.7937791917154097, | |
| "kl": 0.37890625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0032, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 653.4375, | |
| "epoch": 0.19779286926994907, | |
| "grad_norm": 1.285817858002923, | |
| "kl": 0.1953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0031, | |
| "reward": 0.4576322138309479, | |
| "reward_std": 0.12770842015743256, | |
| "rewards/preference_model_reward": 0.4576322138309479, | |
| "rewards/preference_model_reward/std": 0.12770840525627136, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0005655796267092228, | |
| "epoch": 0.19864176570458403, | |
| "grad_norm": 0.9988328328952946, | |
| "kl": 0.1943359375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0037, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.875, | |
| "epoch": 0.199490662139219, | |
| "grad_norm": 0.5736428283590878, | |
| "kl": 0.3359375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0007, | |
| "reward": 0.5476886034011841, | |
| "reward_std": 0.019688162952661514, | |
| "rewards/preference_model_reward": 0.5476886034011841, | |
| "rewards/preference_model_reward/std": 0.01968817040324211, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.004047113005071878, | |
| "epoch": 0.200339558573854, | |
| "grad_norm": 0.3004588551896628, | |
| "kl": 0.267578125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0006, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 397.1875, | |
| "epoch": 0.20118845500848898, | |
| "grad_norm": 0.7407875649985818, | |
| "kl": 0.255859375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0036, | |
| "reward": 0.5196166038513184, | |
| "reward_std": 0.0838971957564354, | |
| "rewards/preference_model_reward": 0.5196166038513184, | |
| "rewards/preference_model_reward/std": 0.0838971957564354, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.000780011061578989, | |
| "epoch": 0.20203735144312393, | |
| "grad_norm": 0.7989807862404035, | |
| "kl": 0.251953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.004, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 663.1875, | |
| "epoch": 0.2028862478777589, | |
| "grad_norm": 1.1362959567666375, | |
| "kl": 0.201171875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0049, | |
| "reward": 0.3731197118759155, | |
| "reward_std": 0.13519592583179474, | |
| "rewards/preference_model_reward": 0.3731197118759155, | |
| "rewards/preference_model_reward/std": 0.13519594073295593, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0008930441690608859, | |
| "epoch": 0.2037351443123939, | |
| "grad_norm": 0.9826027457053308, | |
| "kl": 0.197265625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0043, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 638.65625, | |
| "epoch": 0.20458404074702885, | |
| "grad_norm": 1.2194567876973144, | |
| "kl": 0.12890625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0058, | |
| "reward": 0.4657590687274933, | |
| "reward_std": 0.16892072558403015, | |
| "rewards/preference_model_reward": 0.4657590687274933, | |
| "rewards/preference_model_reward/std": 0.16892069578170776, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0005752947181463242, | |
| "epoch": 0.20543293718166383, | |
| "grad_norm": 2.051110848359452, | |
| "kl": 0.12451171875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0051, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 337.1875, | |
| "epoch": 0.20628183361629882, | |
| "grad_norm": 0.4124833822582155, | |
| "kl": 0.23046875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0006, | |
| "reward": 0.5514668226242065, | |
| "reward_std": 0.03596644848585129, | |
| "rewards/preference_model_reward": 0.5514668226242065, | |
| "rewards/preference_model_reward/std": 0.03596644848585129, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0035508163273334503, | |
| "epoch": 0.2071307300509338, | |
| "grad_norm": 0.4171934728891184, | |
| "kl": 0.224609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0008, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 408.9375, | |
| "epoch": 0.20797962648556875, | |
| "grad_norm": 2.9889867378208055, | |
| "kl": 0.2333984375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0025, | |
| "reward": 0.2543810307979584, | |
| "reward_std": 0.11507824808359146, | |
| "rewards/preference_model_reward": 0.2543810307979584, | |
| "rewards/preference_model_reward/std": 0.11507824808359146, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.002395933959633112, | |
| "epoch": 0.20882852292020374, | |
| "grad_norm": 1.1132478545817386, | |
| "kl": 0.2314453125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0024, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 451.6875, | |
| "epoch": 0.20967741935483872, | |
| "grad_norm": 0.16838590128648132, | |
| "kl": 0.1953125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0009, | |
| "reward": 0.5358486175537109, | |
| "reward_std": 0.017411619424819946, | |
| "rewards/preference_model_reward": 0.5358486175537109, | |
| "rewards/preference_model_reward/std": 0.0174116063863039, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0004304340109229088, | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 0.16083907124687224, | |
| "kl": 0.19140625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0008, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 373.0, | |
| "epoch": 0.21137521222410866, | |
| "grad_norm": 1.336981028115652, | |
| "kl": 0.1953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0051, | |
| "reward": 0.15268400311470032, | |
| "reward_std": 0.11865763366222382, | |
| "rewards/preference_model_reward": 0.15268400311470032, | |
| "rewards/preference_model_reward/std": 0.11865763366222382, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0005081939161755145, | |
| "epoch": 0.21222410865874364, | |
| "grad_norm": 1.2864787994059055, | |
| "kl": 0.1923828125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0058, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.5625, | |
| "epoch": 0.21307300509337862, | |
| "grad_norm": 0.692959080153562, | |
| "kl": 0.126953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "reward": 0.09329767525196075, | |
| "reward_std": 0.08787816017866135, | |
| "rewards/preference_model_reward": 0.09329767525196075, | |
| "rewards/preference_model_reward/std": 0.08787816762924194, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0005807211855426431, | |
| "epoch": 0.21392190152801357, | |
| "grad_norm": 0.724372938551975, | |
| "kl": 0.12451171875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0006, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 262.46875, | |
| "epoch": 0.21477079796264856, | |
| "grad_norm": 1.1375236466013237, | |
| "kl": 0.220703125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0023, | |
| "reward": 0.20732049643993378, | |
| "reward_std": 0.09915804862976074, | |
| "rewards/preference_model_reward": 0.20732049643993378, | |
| "rewards/preference_model_reward/std": 0.09915804862976074, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0012860854621976614, | |
| "epoch": 0.21561969439728354, | |
| "grad_norm": 1.1117646406622659, | |
| "kl": 0.21875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0015, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 488.15625, | |
| "epoch": 0.2164685908319185, | |
| "grad_norm": 0.83190430608096, | |
| "kl": 0.19921875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0001, | |
| "reward": 0.16658729314804077, | |
| "reward_std": 0.0986584797501564, | |
| "rewards/preference_model_reward": 0.16658729314804077, | |
| "rewards/preference_model_reward/std": 0.0986584797501564, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.00012755101488437504, | |
| "epoch": 0.21731748726655348, | |
| "grad_norm": 0.6949799847951833, | |
| "kl": 0.1962890625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0005, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 306.8125, | |
| "epoch": 0.21816638370118846, | |
| "grad_norm": 0.8460209137088387, | |
| "kl": 0.1962890625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0024, | |
| "reward": 0.13335028290748596, | |
| "reward_std": 0.08512399345636368, | |
| "rewards/preference_model_reward": 0.13335028290748596, | |
| "rewards/preference_model_reward/std": 0.08512399345636368, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "epoch": 0.21901528013582344, | |
| "grad_norm": 0.8304477251507847, | |
| "kl": 0.1943359375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.003, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 381.78125, | |
| "epoch": 0.2198641765704584, | |
| "grad_norm": 1.097994057597742, | |
| "kl": 0.1923828125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.004, | |
| "reward": 0.20581325888633728, | |
| "reward_std": 0.11145073920488358, | |
| "rewards/preference_model_reward": 0.20581325888633728, | |
| "rewards/preference_model_reward/std": 0.11145073920488358, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.000915912794880569, | |
| "epoch": 0.22071307300509338, | |
| "grad_norm": 1.0577251056985664, | |
| "kl": 0.1904296875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0049, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 410.8125, | |
| "epoch": 0.22156196943972836, | |
| "grad_norm": 0.9488227851729216, | |
| "kl": 0.150390625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.01, | |
| "reward": 0.14306291937828064, | |
| "reward_std": 0.08440835028886795, | |
| "rewards/preference_model_reward": 0.14306291937828064, | |
| "rewards/preference_model_reward/std": 0.08440835028886795, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0006555670406669378, | |
| "epoch": 0.22241086587436332, | |
| "grad_norm": 0.9625461445287203, | |
| "kl": 0.1494140625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0097, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 441.15625, | |
| "epoch": 0.2232597623089983, | |
| "grad_norm": 1.3378528546974586, | |
| "kl": 0.201171875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0111, | |
| "reward": 0.3541616201400757, | |
| "reward_std": 0.14822125434875488, | |
| "rewards/preference_model_reward": 0.3541616201400757, | |
| "rewards/preference_model_reward/std": 0.14822125434875488, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0002928848844021559, | |
| "epoch": 0.22410865874363328, | |
| "grad_norm": 1.44097954206129, | |
| "kl": 0.2021484375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0103, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 328.5625, | |
| "epoch": 0.22495755517826826, | |
| "grad_norm": 1.0594223141668664, | |
| "kl": 0.228515625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0016, | |
| "reward": 0.4411153793334961, | |
| "reward_std": 0.11199039965867996, | |
| "rewards/preference_model_reward": 0.4411153793334961, | |
| "rewards/preference_model_reward/std": 0.11199039220809937, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0003811471979133785, | |
| "epoch": 0.22580645161290322, | |
| "grad_norm": 1.048487426692034, | |
| "kl": 0.2314453125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0024, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 501.90625, | |
| "epoch": 0.2266553480475382, | |
| "grad_norm": 0.764207880163219, | |
| "kl": 0.1845703125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0022, | |
| "reward": 0.4925358295440674, | |
| "reward_std": 0.09214819222688675, | |
| "rewards/preference_model_reward": 0.4925358295440674, | |
| "rewards/preference_model_reward/std": 0.09214819967746735, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.000623343454208225, | |
| "epoch": 0.22750424448217318, | |
| "grad_norm": 0.7021748192613388, | |
| "kl": 0.1845703125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0027, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 406.78125, | |
| "epoch": 0.22835314091680814, | |
| "grad_norm": 1.2534216873477357, | |
| "kl": 0.2109375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0004, | |
| "reward": 0.346091091632843, | |
| "reward_std": 0.11527692526578903, | |
| "rewards/preference_model_reward": 0.346091091632843, | |
| "rewards/preference_model_reward/std": 0.11527692526578903, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0006988497916609049, | |
| "epoch": 0.22920203735144312, | |
| "grad_norm": 1.028430046135459, | |
| "kl": 0.2119140625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0001, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 474.875, | |
| "epoch": 0.2300509337860781, | |
| "grad_norm": 13.308680917152811, | |
| "kl": 0.7265625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0046, | |
| "reward": 0.39890241622924805, | |
| "reward_std": 0.10348767042160034, | |
| "rewards/preference_model_reward": 0.39890241622924805, | |
| "rewards/preference_model_reward/std": 0.10348766297101974, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0009435814572498202, | |
| "epoch": 0.23089983022071306, | |
| "grad_norm": 0.9508081839275468, | |
| "kl": 0.1708984375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0057, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 335.78125, | |
| "epoch": 0.23174872665534804, | |
| "grad_norm": 0.9495076380585504, | |
| "kl": 0.1650390625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0052, | |
| "reward": 0.19895681738853455, | |
| "reward_std": 0.10379483550786972, | |
| "rewards/preference_model_reward": 0.19895681738853455, | |
| "rewards/preference_model_reward/std": 0.10379482805728912, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "epoch": 0.23259762308998302, | |
| "grad_norm": 0.9358387855912544, | |
| "kl": 0.1650390625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0059, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 430.8125, | |
| "epoch": 0.233446519524618, | |
| "grad_norm": 1.172070244878363, | |
| "kl": 0.1748046875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0189, | |
| "reward": 0.4197525382041931, | |
| "reward_std": 0.11745458096265793, | |
| "rewards/preference_model_reward": 0.4197525382041931, | |
| "rewards/preference_model_reward/std": 0.11745458096265793, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.00059707515174523, | |
| "epoch": 0.23429541595925296, | |
| "grad_norm": 1.1205887604531841, | |
| "kl": 0.173828125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0182, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 380.84375, | |
| "epoch": 0.23514431239388794, | |
| "grad_norm": 1.0667324137631118, | |
| "kl": 0.189453125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "reward": 0.4866299033164978, | |
| "reward_std": 0.10881662368774414, | |
| "rewards/preference_model_reward": 0.4866299033164978, | |
| "rewards/preference_model_reward/std": 0.10881662368774414, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0003319675161037594, | |
| "epoch": 0.23599320882852293, | |
| "grad_norm": 1.4205333564695013, | |
| "kl": 0.2265625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0021, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 409.96875, | |
| "epoch": 0.23684210526315788, | |
| "grad_norm": 1.2256158038709593, | |
| "kl": 0.1904296875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0022, | |
| "reward": 0.3310420513153076, | |
| "reward_std": 0.13203255832195282, | |
| "rewards/preference_model_reward": 0.3310420513153076, | |
| "rewards/preference_model_reward/std": 0.13203254342079163, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.00029370313859544694, | |
| "epoch": 0.23769100169779286, | |
| "grad_norm": 1.140780771833912, | |
| "kl": 0.1923828125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0031, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 299.8125, | |
| "epoch": 0.23853989813242785, | |
| "grad_norm": 0.5773592648416564, | |
| "kl": 0.18359375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0001, | |
| "reward": 0.06664206832647324, | |
| "reward_std": 0.046818241477012634, | |
| "rewards/preference_model_reward": 0.06664206832647324, | |
| "rewards/preference_model_reward/std": 0.046818237751722336, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.002476999070495367, | |
| "epoch": 0.23938879456706283, | |
| "grad_norm": 0.5215923415469397, | |
| "kl": 0.1796875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0002, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 641.5, | |
| "epoch": 0.24023769100169778, | |
| "grad_norm": 0.3036782782176285, | |
| "kl": 0.1865234375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0012, | |
| "reward": 0.4948778748512268, | |
| "reward_std": 0.042491715401411057, | |
| "rewards/preference_model_reward": 0.4948778748512268, | |
| "rewards/preference_model_reward/std": 0.042491719126701355, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0014662991743534803, | |
| "epoch": 0.24108658743633277, | |
| "grad_norm": 1.2693793919900047, | |
| "kl": 0.1865234375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0014, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 326.25, | |
| "epoch": 0.24193548387096775, | |
| "grad_norm": 0.9299842399613616, | |
| "kl": 0.2353515625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0024, | |
| "reward": 0.46875959634780884, | |
| "reward_std": 0.09210902452468872, | |
| "rewards/preference_model_reward": 0.46875959634780884, | |
| "rewards/preference_model_reward/std": 0.09210902452468872, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0002825378905981779, | |
| "epoch": 0.2427843803056027, | |
| "grad_norm": 1.269996090531689, | |
| "kl": 0.236328125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0031, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 358.0, | |
| "epoch": 0.2436332767402377, | |
| "grad_norm": 0.845178527409992, | |
| "kl": 0.1904296875, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0032, | |
| "reward": 0.14985330402851105, | |
| "reward_std": 0.07015400379896164, | |
| "rewards/preference_model_reward": 0.14985330402851105, | |
| "rewards/preference_model_reward/std": 0.07015399634838104, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.002010664436966181, | |
| "epoch": 0.24448217317487267, | |
| "grad_norm": 0.7374485292445523, | |
| "kl": 0.1923828125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0037, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 494.5625, | |
| "epoch": 0.24533106960950765, | |
| "grad_norm": 0.9545115708280292, | |
| "kl": 0.2119140625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0043, | |
| "reward": 0.44061005115509033, | |
| "reward_std": 0.1013847216963768, | |
| "rewards/preference_model_reward": 0.44061005115509033, | |
| "rewards/preference_model_reward/std": 0.1013847216963768, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0008882409892976284, | |
| "epoch": 0.2461799660441426, | |
| "grad_norm": 1.0760062283966654, | |
| "kl": 0.212890625, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0037, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.78125, | |
| "epoch": 0.2470288624787776, | |
| "grad_norm": 0.2204401442573945, | |
| "kl": 0.216796875, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0006, | |
| "reward": 0.5982410907745361, | |
| "reward_std": 0.01771736703813076, | |
| "rewards/preference_model_reward": 0.5982410907745361, | |
| "rewards/preference_model_reward/std": 0.01771736331284046, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.000654590898193419, | |
| "epoch": 0.24787775891341257, | |
| "grad_norm": 0.14373274603213665, | |
| "kl": 0.203125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0005, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 474.4375, | |
| "epoch": 0.24872665534804753, | |
| "grad_norm": 1.0199416518092594, | |
| "kl": 0.197265625, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0064, | |
| "reward": 0.3177827000617981, | |
| "reward_std": 0.13039816915988922, | |
| "rewards/preference_model_reward": 0.3177827000617981, | |
| "rewards/preference_model_reward/std": 0.13039815425872803, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0005917281378060579, | |
| "epoch": 0.2495755517826825, | |
| "grad_norm": 0.9840967197672208, | |
| "kl": 0.1953125, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0071, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 397.0625, | |
| "epoch": 0.25042444821731746, | |
| "grad_norm": 1.8823223194664953, | |
| "kl": 0.2099609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0006, | |
| "reward": 0.31412482261657715, | |
| "reward_std": 0.1166299358010292, | |
| "rewards/preference_model_reward": 0.31412482261657715, | |
| "rewards/preference_model_reward/std": 0.11662992835044861, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0003579020267352462, | |
| "epoch": 0.25127334465195245, | |
| "grad_norm": 1.1537409422856328, | |
| "kl": 0.208984375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0011, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 628.5625, | |
| "epoch": 0.25212224108658743, | |
| "grad_norm": 1.1927096956384737, | |
| "kl": 0.1708984375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0164, | |
| "reward": 0.3190678358078003, | |
| "reward_std": 0.14439481496810913, | |
| "rewards/preference_model_reward": 0.3190678358078003, | |
| "rewards/preference_model_reward/std": 0.14439481496810913, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.000986331608146429, | |
| "epoch": 0.2529711375212224, | |
| "grad_norm": 1.1803756478909, | |
| "kl": 0.1708984375, | |
| "learning_rate": 2e-06, | |
| "loss": 0.0156, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 357.5, | |
| "epoch": 0.2538200339558574, | |
| "grad_norm": 2.207926479632091, | |
| "kl": 0.2109375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0057, | |
| "reward": 0.3464009761810303, | |
| "reward_std": 0.1354563981294632, | |
| "rewards/preference_model_reward": 0.3464009761810303, | |
| "rewards/preference_model_reward/std": 0.135456383228302, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0006329367170110345, | |
| "epoch": 0.2546689303904924, | |
| "grad_norm": 1.4143240393325465, | |
| "kl": 0.2099609375, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0064, | |
| "step": 300 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 625, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |