| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0543293718166384, |
| "eval_steps": 500, |
| "global_step": 300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.2109375, |
| "epoch": 0.006791171477079796, |
| "grad_norm": 1.1658549601952852, |
| "kl": 0.0003286600112915039, |
| "learning_rate": 0.0, |
| "loss": -0.0057, |
| "reward": 0.4946059621870518, |
| "reward_std": 0.3181956857442856, |
| "rewards/preference_model_reward": 0.4946059621870518, |
| "rewards/preference_model_reward/std": 0.3181956773623824, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.013582342954159592, |
| "grad_norm": 1.1661357306239586, |
| "kl": 0.0003286600112915039, |
| "learning_rate": 8e-08, |
| "loss": -0.0057, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 379.8359375, |
| "epoch": 0.02037351443123939, |
| "grad_norm": 0.9395065263567765, |
| "kl": 0.00034868717193603516, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0023, |
| "reward": 0.4930149093270302, |
| "reward_std": 0.24642075644806027, |
| "rewards/preference_model_reward": 0.4930149093270302, |
| "rewards/preference_model_reward/std": 0.24642075458541512, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0005332227265171241, |
| "epoch": 0.027164685908319185, |
| "grad_norm": 0.9559331452106371, |
| "kl": 0.00041222572326660156, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0023, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.19921875, |
| "epoch": 0.03395585738539898, |
| "grad_norm": 1.0659097324920952, |
| "kl": 0.000335693359375, |
| "learning_rate": 3.2e-07, |
| "loss": -0.0052, |
| "reward": 0.5752501655369997, |
| "reward_std": 0.2759046368300915, |
| "rewards/preference_model_reward": 0.5752501655369997, |
| "rewards/preference_model_reward/std": 0.2759046256542206, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.00037182615051278844, |
| "epoch": 0.04074702886247878, |
| "grad_norm": 1.0694980180681193, |
| "kl": 0.0003333091735839844, |
| "learning_rate": 4e-07, |
| "loss": -0.0052, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 348.94921875, |
| "epoch": 0.04753820033955857, |
| "grad_norm": 1.0374782935041194, |
| "kl": 0.0003757476806640625, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0011, |
| "reward": 0.5447775591164827, |
| "reward_std": 0.25542482268065214, |
| "rewards/preference_model_reward": 0.5447775591164827, |
| "rewards/preference_model_reward/std": 0.25542481429874897, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.00040936221739684697, |
| "epoch": 0.05432937181663837, |
| "grad_norm": 1.094306972800085, |
| "kl": 0.00039839744567871094, |
| "learning_rate": 5.6e-07, |
| "loss": 0.0011, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.2890625, |
| "epoch": 0.06112054329371817, |
| "grad_norm": 1.0830810632499763, |
| "kl": 0.00036144256591796875, |
| "learning_rate": 6.4e-07, |
| "loss": -0.0091, |
| "reward": 0.44168128073215485, |
| "reward_std": 0.29115105979144573, |
| "rewards/preference_model_reward": 0.44168128073215485, |
| "rewards/preference_model_reward/std": 0.29115105606615543, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.00021800790273118764, |
| "epoch": 0.06791171477079797, |
| "grad_norm": 1.0686900063629763, |
| "kl": 0.00036394596099853516, |
| "learning_rate": 7.2e-07, |
| "loss": -0.0091, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.38671875, |
| "epoch": 0.07470288624787776, |
| "grad_norm": 0.9072962579375315, |
| "kl": 0.00039005279541015625, |
| "learning_rate": 8e-07, |
| "loss": -0.0006, |
| "reward": 0.3613455481827259, |
| "reward_std": 0.25880729779601097, |
| "rewards/preference_model_reward": 0.3613455481827259, |
| "rewards/preference_model_reward/std": 0.2588073033839464, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0003210598952136934, |
| "epoch": 0.08149405772495756, |
| "grad_norm": 0.9072123086741707, |
| "kl": 0.0003895759582519531, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": -0.0006, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 367.1328125, |
| "epoch": 0.08828522920203735, |
| "grad_norm": 1.031552247551817, |
| "kl": 0.0003743171691894531, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0019, |
| "reward": 0.48316149413585663, |
| "reward_std": 0.2832601722329855, |
| "rewards/preference_model_reward": 0.48316149413585663, |
| "rewards/preference_model_reward/std": 0.2832601750269532, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.00029072189227008494, |
| "epoch": 0.09507640067911714, |
| "grad_norm": 1.020095801170388, |
| "kl": 0.00040650367736816406, |
| "learning_rate": 1.04e-06, |
| "loss": 0.0018, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.61328125, |
| "epoch": 0.10186757215619695, |
| "grad_norm": 0.9054135077436368, |
| "kl": 0.000415802001953125, |
| "learning_rate": 1.12e-06, |
| "loss": -0.0095, |
| "reward": 0.6715657562017441, |
| "reward_std": 0.26127034425735474, |
| "rewards/preference_model_reward": 0.6715657562017441, |
| "rewards/preference_model_reward/std": 0.2612703386694193, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.00016775515268818708, |
| "epoch": 0.10865874363327674, |
| "grad_norm": 0.9126437934856992, |
| "kl": 0.00041675567626953125, |
| "learning_rate": 1.2e-06, |
| "loss": -0.0095, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 269.35546875, |
| "epoch": 0.11544991511035653, |
| "grad_norm": 1.3411908637656689, |
| "kl": 0.0004000663757324219, |
| "learning_rate": 1.28e-06, |
| "loss": -0.0131, |
| "reward": 0.5323104355484247, |
| "reward_std": 0.2701443340629339, |
| "rewards/preference_model_reward": 0.5323104355484247, |
| "rewards/preference_model_reward/std": 0.2701443322002888, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0002587076596682891, |
| "epoch": 0.12224108658743633, |
| "grad_norm": 1.161588313842898, |
| "kl": 0.0004303455352783203, |
| "learning_rate": 1.3600000000000001e-06, |
| "loss": -0.013, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 310.2265625, |
| "epoch": 0.12903225806451613, |
| "grad_norm": 1.0392498163961779, |
| "kl": 0.0005252361297607422, |
| "learning_rate": 1.44e-06, |
| "loss": -0.0118, |
| "reward": 0.5056532379239798, |
| "reward_std": 0.24357289634644985, |
| "rewards/preference_model_reward": 0.5056532379239798, |
| "rewards/preference_model_reward/std": 0.24357289355248213, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0002536060192142031, |
| "epoch": 0.13582342954159593, |
| "grad_norm": 1.0214823504151023, |
| "kl": 0.0005702972412109375, |
| "learning_rate": 1.5199999999999998e-06, |
| "loss": -0.0119, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.74609375, |
| "epoch": 0.14261460101867574, |
| "grad_norm": 0.9892882217675961, |
| "kl": 0.0005764961242675781, |
| "learning_rate": 1.6e-06, |
| "loss": 0.0017, |
| "reward": 0.6471737138926983, |
| "reward_std": 0.28144341707229614, |
| "rewards/preference_model_reward": 0.6471737138926983, |
| "rewards/preference_model_reward/std": 0.2814434114843607, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.00024805667453620117, |
| "epoch": 0.1494057724957555, |
| "grad_norm": 0.8611996612129298, |
| "kl": 0.0006487369537353516, |
| "learning_rate": 1.6799999999999998e-06, |
| "loss": 0.0016, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 288.34375, |
| "epoch": 0.15619694397283532, |
| "grad_norm": 1.5143681967656712, |
| "kl": 0.0007748603820800781, |
| "learning_rate": 1.7599999999999999e-06, |
| "loss": -0.0053, |
| "reward": 0.735472559928894, |
| "reward_std": 0.3065086118876934, |
| "rewards/preference_model_reward": 0.735472559928894, |
| "rewards/preference_model_reward/std": 0.3065086044371128, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.00037343262283684453, |
| "epoch": 0.16298811544991512, |
| "grad_norm": 1.1570786184830504, |
| "kl": 0.0008769035339355469, |
| "learning_rate": 1.84e-06, |
| "loss": -0.0053, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 381.19140625, |
| "epoch": 0.1697792869269949, |
| "grad_norm": 0.8177439257742645, |
| "kl": 0.0009984970092773438, |
| "learning_rate": 1.92e-06, |
| "loss": -0.0008, |
| "reward": 0.6719343699514866, |
| "reward_std": 0.2588585498742759, |
| "rewards/preference_model_reward": 0.6719343699514866, |
| "rewards/preference_model_reward/std": 0.2588585487101227, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0002841594987330609, |
| "epoch": 0.1765704584040747, |
| "grad_norm": 0.8284952878885445, |
| "kl": 0.0011463165283203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.75390625, |
| "epoch": 0.1833616298811545, |
| "grad_norm": 1.0612960800439712, |
| "kl": 0.001338958740234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0062, |
| "reward": 0.5385704413056374, |
| "reward_std": 0.2679297383874655, |
| "rewards/preference_model_reward": 0.5385704413056374, |
| "rewards/preference_model_reward/std": 0.2679297402501106, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0003048431572096888, |
| "epoch": 0.19015280135823429, |
| "grad_norm": 1.0535161088618343, |
| "kl": 0.0014734268188476562, |
| "learning_rate": 2e-06, |
| "loss": 0.0061, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 406.28125, |
| "epoch": 0.1969439728353141, |
| "grad_norm": 0.9688907390375263, |
| "kl": 0.0017023086547851562, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "reward": 0.5423681996762753, |
| "reward_std": 0.26603892212733626, |
| "rewards/preference_model_reward": 0.5423681996762753, |
| "rewards/preference_model_reward/std": 0.26603891397826374, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0004548036777123343, |
| "epoch": 0.2037351443123939, |
| "grad_norm": 1.056840409643398, |
| "kl": 0.0018987655639648438, |
| "learning_rate": 2e-06, |
| "loss": -0.0026, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 441.890625, |
| "epoch": 0.21052631578947367, |
| "grad_norm": 0.8637647596491498, |
| "kl": 0.0018672943115234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0001, |
| "reward": 0.6307996772229671, |
| "reward_std": 0.24354635225608945, |
| "rewards/preference_model_reward": 0.6307996772229671, |
| "rewards/preference_model_reward/std": 0.24354635691270232, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.00037552460526057985, |
| "epoch": 0.21731748726655348, |
| "grad_norm": 0.8722655459487942, |
| "kl": 0.0019969940185546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.55859375, |
| "epoch": 0.22410865874363328, |
| "grad_norm": 1.1940804556285844, |
| "kl": 0.002285003662109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0097, |
| "reward": 0.5836777277290821, |
| "reward_std": 0.30259183794260025, |
| "rewards/preference_model_reward": 0.5836777277290821, |
| "rewards/preference_model_reward/std": 0.30259183421730995, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.00024095292246784084, |
| "epoch": 0.23089983022071306, |
| "grad_norm": 1.3077624309877895, |
| "kl": 0.00241851806640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0097, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.63671875, |
| "epoch": 0.23769100169779286, |
| "grad_norm": 1.0665132545980283, |
| "kl": 0.0022363662719726562, |
| "learning_rate": 2e-06, |
| "loss": -0.0159, |
| "reward": 0.594868753105402, |
| "reward_std": 0.2588401613757014, |
| "rewards/preference_model_reward": 0.594868753105402, |
| "rewards/preference_model_reward/std": 0.25884015765041113, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0003941934592148755, |
| "epoch": 0.24448217317487267, |
| "grad_norm": 1.076309978844894, |
| "kl": 0.00234222412109375, |
| "learning_rate": 2e-06, |
| "loss": -0.016, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.75, |
| "epoch": 0.25127334465195245, |
| "grad_norm": 1.0267247550638525, |
| "kl": 0.002719879150390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0067, |
| "reward": 0.5739418976008892, |
| "reward_std": 0.2615406233817339, |
| "rewards/preference_model_reward": 0.5739418976008892, |
| "rewards/preference_model_reward/std": 0.26154061406850815, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.00040432674541079905, |
| "epoch": 0.25806451612903225, |
| "grad_norm": 0.9172779196341795, |
| "kl": 0.0029506683349609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0068, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 327.375, |
| "epoch": 0.26485568760611206, |
| "grad_norm": 1.086086180237484, |
| "kl": 0.00321197509765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0043, |
| "reward": 0.6816785149276257, |
| "reward_std": 0.2440826129168272, |
| "rewards/preference_model_reward": 0.6816785149276257, |
| "rewards/preference_model_reward/std": 0.24408261477947235, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.00011399965751479613, |
| "epoch": 0.27164685908319186, |
| "grad_norm": 1.1219880555909083, |
| "kl": 0.0034637451171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0044, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 399.7109375, |
| "epoch": 0.27843803056027167, |
| "grad_norm": 0.959926063361782, |
| "kl": 0.00341796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0028, |
| "reward": 0.5822978504002094, |
| "reward_std": 0.2836025133728981, |
| "rewards/preference_model_reward": 0.5822978504002094, |
| "rewards/preference_model_reward/std": 0.28360251151025295, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0002537540549383266, |
| "epoch": 0.28522920203735147, |
| "grad_norm": 0.9737278797687372, |
| "kl": 0.0036487579345703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0027, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.25390625, |
| "epoch": 0.2920203735144312, |
| "grad_norm": 0.8110847666278888, |
| "kl": 0.0040760040283203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0015, |
| "reward": 0.7039333023130894, |
| "reward_std": 0.24644886306487024, |
| "rewards/preference_model_reward": 0.7039333023130894, |
| "rewards/preference_model_reward/std": 0.2464488591067493, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.00036502836792351445, |
| "epoch": 0.298811544991511, |
| "grad_norm": 0.8251482957928842, |
| "kl": 0.00433349609375, |
| "learning_rate": 2e-06, |
| "loss": 0.0015, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 326.37890625, |
| "epoch": 0.30560271646859083, |
| "grad_norm": 0.9093484912696803, |
| "kl": 0.004657745361328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0035, |
| "reward": 0.5050893509760499, |
| "reward_std": 0.23734514694660902, |
| "rewards/preference_model_reward": 0.5050893509760499, |
| "rewards/preference_model_reward/std": 0.23734513530507684, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.00032999391623889096, |
| "epoch": 0.31239388794567063, |
| "grad_norm": 0.8994019896097728, |
| "kl": 0.0048351287841796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0034, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 378.30859375, |
| "epoch": 0.31918505942275044, |
| "grad_norm": 1.0879235017943376, |
| "kl": 0.004894256591796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0055, |
| "reward": 0.7660590410232544, |
| "reward_std": 0.24720557313412428, |
| "rewards/preference_model_reward": 0.7660590410232544, |
| "rewards/preference_model_reward/std": 0.24720556661486626, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.00039132226856963825, |
| "epoch": 0.32597623089983024, |
| "grad_norm": 0.9067283296578724, |
| "kl": 0.00513458251953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0055, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 309.609375, |
| "epoch": 0.33276740237691, |
| "grad_norm": 1.417503400810007, |
| "kl": 0.005855560302734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0132, |
| "reward": 0.6303279139101505, |
| "reward_std": 0.29637874104082584, |
| "rewards/preference_model_reward": 0.6303279139101505, |
| "rewards/preference_model_reward/std": 0.29637873359024525, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.00033494585659354925, |
| "epoch": 0.3395585738539898, |
| "grad_norm": 1.2450962191215096, |
| "kl": 0.006114959716796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0133, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 333.18359375, |
| "epoch": 0.3463497453310696, |
| "grad_norm": 1.044571491400686, |
| "kl": 0.0055866241455078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0093, |
| "reward": 0.4536947198212147, |
| "reward_std": 0.25838964246213436, |
| "rewards/preference_model_reward": 0.4536947198212147, |
| "rewards/preference_model_reward/std": 0.25838964618742466, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.00034966163912031334, |
| "epoch": 0.3531409168081494, |
| "grad_norm": 1.0319046384759274, |
| "kl": 0.0059833526611328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0094, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.0390625, |
| "epoch": 0.3599320882852292, |
| "grad_norm": 1.0209389153993411, |
| "kl": 0.00640106201171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0063, |
| "reward": 0.6109997481107712, |
| "reward_std": 0.27825887873768806, |
| "rewards/preference_model_reward": 0.6109997481107712, |
| "rewards/preference_model_reward/std": 0.2782588703557849, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.00033825510763563216, |
| "epoch": 0.366723259762309, |
| "grad_norm": 1.1686483370046787, |
| "kl": 0.0067596435546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0064, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 385.97265625, |
| "epoch": 0.3735144312393888, |
| "grad_norm": 1.2276766352908126, |
| "kl": 0.00732421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0131, |
| "reward": 0.5218756012618542, |
| "reward_std": 0.32545966282486916, |
| "rewards/preference_model_reward": 0.5218756012618542, |
| "rewards/preference_model_reward/std": 0.32545965164899826, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.00023414421593770385, |
| "epoch": 0.38030560271646857, |
| "grad_norm": 1.123144036568887, |
| "kl": 0.0076751708984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0131, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.5625, |
| "epoch": 0.3870967741935484, |
| "grad_norm": 0.9127683567363678, |
| "kl": 0.00778961181640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0072, |
| "reward": 0.5140541326254606, |
| "reward_std": 0.23541682050563395, |
| "rewards/preference_model_reward": 0.5140541326254606, |
| "rewards/preference_model_reward/std": 0.23541681352071464, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.00020465617490117438, |
| "epoch": 0.3938879456706282, |
| "grad_norm": 0.9240907400149151, |
| "kl": 0.00821685791015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0071, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 400.1171875, |
| "epoch": 0.400679117147708, |
| "grad_norm": 1.2755019834131804, |
| "kl": 0.00856781005859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0049, |
| "reward": 0.5248972652480006, |
| "reward_std": 0.2680952288210392, |
| "rewards/preference_model_reward": 0.5248972652480006, |
| "rewards/preference_model_reward/std": 0.26809522369876504, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.00044215139951120364, |
| "epoch": 0.4074702886247878, |
| "grad_norm": 1.0137811933079786, |
| "kl": 0.008953094482421875, |
| "learning_rate": 2e-06, |
| "loss": -0.005, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 384.32421875, |
| "epoch": 0.4142614601018676, |
| "grad_norm": 0.8023274386554214, |
| "kl": 0.00972747802734375, |
| "learning_rate": 2e-06, |
| "loss": -0.007, |
| "reward": 0.7109678499400616, |
| "reward_std": 0.22113378904759884, |
| "rewards/preference_model_reward": 0.7109678499400616, |
| "rewards/preference_model_reward/std": 0.22113378625363111, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.00018141404598281952, |
| "epoch": 0.42105263157894735, |
| "grad_norm": 0.8315096106222806, |
| "kl": 0.01013946533203125, |
| "learning_rate": 2e-06, |
| "loss": -0.007, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 364.890625, |
| "epoch": 0.42784380305602715, |
| "grad_norm": 0.9668397185089038, |
| "kl": 0.0107421875, |
| "learning_rate": 2e-06, |
| "loss": 0.0031, |
| "reward": 0.6806612908840179, |
| "reward_std": 0.2775609251111746, |
| "rewards/preference_model_reward": 0.6806612908840179, |
| "rewards/preference_model_reward/std": 0.27756091207265854, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.00043790563358925283, |
| "epoch": 0.43463497453310695, |
| "grad_norm": 0.9404870267113553, |
| "kl": 0.01111602783203125, |
| "learning_rate": 2e-06, |
| "loss": 0.0031, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.453125, |
| "epoch": 0.44142614601018676, |
| "grad_norm": 1.0311940848708463, |
| "kl": 0.009700775146484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0037, |
| "reward": 0.6332272328436375, |
| "reward_std": 0.25745808193460107, |
| "rewards/preference_model_reward": 0.6332272328436375, |
| "rewards/preference_model_reward/std": 0.2574580740183592, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0003352455223648576, |
| "epoch": 0.44821731748726656, |
| "grad_norm": 1.041796930728353, |
| "kl": 0.009906768798828125, |
| "learning_rate": 2e-06, |
| "loss": 0.0037, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 382.29296875, |
| "epoch": 0.45500848896434637, |
| "grad_norm": 0.9676368782571226, |
| "kl": 0.01045989990234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0018, |
| "reward": 0.6579710729420185, |
| "reward_std": 0.2728098388761282, |
| "rewards/preference_model_reward": 0.6579710729420185, |
| "rewards/preference_model_reward/std": 0.2728098277002573, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0004586090890370542, |
| "epoch": 0.4617996604414261, |
| "grad_norm": 0.9733174758393729, |
| "kl": 0.01084136962890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0018, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 329.83984375, |
| "epoch": 0.4685908319185059, |
| "grad_norm": 0.9029956997185686, |
| "kl": 0.01184844970703125, |
| "learning_rate": 2e-06, |
| "loss": -0.0082, |
| "reward": 0.6309689432382584, |
| "reward_std": 0.2526115436339751, |
| "rewards/preference_model_reward": 0.6309689432382584, |
| "rewards/preference_model_reward/std": 0.2526115436339751, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0002797141951305093, |
| "epoch": 0.47538200339558573, |
| "grad_norm": 0.8995188914064342, |
| "kl": 0.01227569580078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0082, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.5234375, |
| "epoch": 0.48217317487266553, |
| "grad_norm": 1.1446636787137474, |
| "kl": 0.01305389404296875, |
| "learning_rate": 2e-06, |
| "loss": -0.0037, |
| "reward": 0.5975633077323437, |
| "reward_std": 0.27538682520389557, |
| "rewards/preference_model_reward": 0.5975633077323437, |
| "rewards/preference_model_reward/std": 0.2753868168219924, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.00037468447590072174, |
| "epoch": 0.48896434634974534, |
| "grad_norm": 1.0052357390576183, |
| "kl": 0.013519287109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0037, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.359375, |
| "epoch": 0.49575551782682514, |
| "grad_norm": 0.9195418190581706, |
| "kl": 0.01299285888671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0037, |
| "reward": 0.6704220920801163, |
| "reward_std": 0.2678078021854162, |
| "rewards/preference_model_reward": 0.6704220920801163, |
| "rewards/preference_model_reward/std": 0.26780780404806137, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0002866502591132303, |
| "epoch": 0.5025466893039049, |
| "grad_norm": 1.097145506469791, |
| "kl": 0.01361083984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0038, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.50390625, |
| "epoch": 0.5093378607809848, |
| "grad_norm": 0.9564676492531332, |
| "kl": 0.01482391357421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0056, |
| "reward": 0.6842997781932354, |
| "reward_std": 0.25731130968779325, |
| "rewards/preference_model_reward": 0.6842997781932354, |
| "rewards/preference_model_reward/std": 0.25731129944324493, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.00033780875583033776, |
| "epoch": 0.5161290322580645, |
| "grad_norm": 0.9451343780893788, |
| "kl": 0.0153350830078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0056, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.171875, |
| "epoch": 0.5229202037351444, |
| "grad_norm": 1.028583508211413, |
| "kl": 0.0159912109375, |
| "learning_rate": 2e-06, |
| "loss": -0.005, |
| "reward": 0.7383619099855423, |
| "reward_std": 0.24221469252370298, |
| "rewards/preference_model_reward": 0.7383619099855423, |
| "rewards/preference_model_reward/std": 0.2422146894969046, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0004982726022717543, |
| "epoch": 0.5297113752122241, |
| "grad_norm": 0.9589189463847356, |
| "kl": 0.01644134521484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0051, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 407.79296875, |
| "epoch": 0.5365025466893039, |
| "grad_norm": 1.0543595544736126, |
| "kl": 0.0185699462890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0072, |
| "reward": 0.6588779911398888, |
| "reward_std": 0.2795031704008579, |
| "rewards/preference_model_reward": 0.6588779911398888, |
| "rewards/preference_model_reward/std": 0.2795031573623419, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.00030141435672703665, |
| "epoch": 0.5432937181663837, |
| "grad_norm": 1.0758354050017793, |
| "kl": 0.018951416015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0073, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.18359375, |
| "epoch": 0.5500848896434635, |
| "grad_norm": 0.781066685649031, |
| "kl": 0.0158233642578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "reward": 0.7797287777066231, |
| "reward_std": 0.2147242482751608, |
| "rewards/preference_model_reward": 0.7797287777066231, |
| "rewards/preference_model_reward/std": 0.21472424920648336, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0003111153791905963, |
| "epoch": 0.5568760611205433, |
| "grad_norm": 0.7433718148656135, |
| "kl": 0.0160369873046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0023, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 383.03125, |
| "epoch": 0.5636672325976231, |
| "grad_norm": 0.8390742957401245, |
| "kl": 0.01795196533203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0066, |
| "reward": 0.6639863140881062, |
| "reward_std": 0.2489920537918806, |
| "rewards/preference_model_reward": 0.6639863140881062, |
| "rewards/preference_model_reward/std": 0.24899205192923546, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.00024405560725426767, |
| "epoch": 0.5704584040747029, |
| "grad_norm": 0.8449552595534167, |
| "kl": 0.0183258056640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0066, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 406.01953125, |
| "epoch": 0.5772495755517827, |
| "grad_norm": 0.9325132487643661, |
| "kl": 0.01790618896484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0019, |
| "reward": 0.7170650623738766, |
| "reward_std": 0.2119898241944611, |
| "rewards/preference_model_reward": 0.7170650623738766, |
| "rewards/preference_model_reward/std": 0.21198982140049338, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.000315660272462992, |
| "epoch": 0.5840407470288624, |
| "grad_norm": 0.8561811844655657, |
| "kl": 0.0183258056640625, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 373.5390625, |
| "epoch": 0.5908319185059423, |
| "grad_norm": 0.9759881466782773, |
| "kl": 0.01984405517578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 0.6445738822221756, |
| "reward_std": 0.24776791501790285, |
| "rewards/preference_model_reward": 0.6445738822221756, |
| "rewards/preference_model_reward/std": 0.2477679206058383, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0004009394979220815, |
| "epoch": 0.597623089983022, |
| "grad_norm": 0.9762704806385095, |
| "kl": 0.0201568603515625, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 370.92578125, |
| "epoch": 0.6044142614601019, |
| "grad_norm": 1.0730306390874693, |
| "kl": 0.01903533935546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0058, |
| "reward": 0.7051359005272388, |
| "reward_std": 0.2701743124052882, |
| "rewards/preference_model_reward": 0.7051359005272388, |
| "rewards/preference_model_reward/std": 0.2701743012294173, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0002958584018415422, |
| "epoch": 0.6112054329371817, |
| "grad_norm": 11.272175064066806, |
| "kl": 0.0193634033203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0058, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.8515625, |
| "epoch": 0.6179966044142614, |
| "grad_norm": 1.2532081169597848, |
| "kl": 0.024749755859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0107, |
| "reward": 0.5385027192533016, |
| "reward_std": 0.2979753892868757, |
| "rewards/preference_model_reward": 0.5385027192533016, |
| "rewards/preference_model_reward/std": 0.2979753725230694, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.000490643553348491, |
| "epoch": 0.6247877758913413, |
| "grad_norm": 1.2110410142820385, |
| "kl": 0.025421142578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0108, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 339.84765625, |
| "epoch": 0.631578947368421, |
| "grad_norm": 0.9347491475779949, |
| "kl": 0.0230560302734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "reward": 0.6585216615349054, |
| "reward_std": 0.2630395647138357, |
| "rewards/preference_model_reward": 0.6585216615349054, |
| "rewards/preference_model_reward/std": 0.2630395656451583, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0002996912953676656, |
| "epoch": 0.6383701188455009, |
| "grad_norm": 0.958835774432475, |
| "kl": 0.0236053466796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 363.64453125, |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.8272066984013973, |
| "kl": 0.020721435546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0064, |
| "reward": 0.8653465658426285, |
| "reward_std": 0.1561016822233796, |
| "rewards/preference_model_reward": 0.8653465658426285, |
| "rewards/preference_model_reward/std": 0.15610167011618614, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0002611535892356187, |
| "epoch": 0.6519524617996605, |
| "grad_norm": 0.8209441010001175, |
| "kl": 0.021331787109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0064, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 337.61328125, |
| "epoch": 0.6587436332767402, |
| "grad_norm": 0.9707335419756125, |
| "kl": 0.0257415771484375, |
| "learning_rate": 2e-06, |
| "loss": -0.003, |
| "reward": 0.6451991870999336, |
| "reward_std": 0.24027688056230545, |
| "rewards/preference_model_reward": 0.6451991870999336, |
| "rewards/preference_model_reward/std": 0.24027687963098288, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.000394973511902208, |
| "epoch": 0.66553480475382, |
| "grad_norm": 0.9686579468801768, |
| "kl": 0.0262908935546875, |
| "learning_rate": 2e-06, |
| "loss": -0.003, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 407.8203125, |
| "epoch": 0.6723259762308998, |
| "grad_norm": 1.0490178740647227, |
| "kl": 0.0236663818359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0056, |
| "reward": 0.6188035290688276, |
| "reward_std": 0.23126866854727268, |
| "rewards/preference_model_reward": 0.6188035290688276, |
| "rewards/preference_model_reward/std": 0.23126865550875664, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0002818793718688539, |
| "epoch": 0.6791171477079796, |
| "grad_norm": 0.9455082290037559, |
| "kl": 0.024139404296875, |
| "learning_rate": 2e-06, |
| "loss": -0.0056, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.52734375, |
| "epoch": 0.6859083191850595, |
| "grad_norm": 0.8251731717477002, |
| "kl": 0.02386474609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "reward": 0.8521472215652466, |
| "reward_std": 0.2337375245988369, |
| "rewards/preference_model_reward": 0.8521472215652466, |
| "rewards/preference_model_reward/std": 0.2337375171482563, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0003581516129997908, |
| "epoch": 0.6926994906621392, |
| "grad_norm": 0.9184460532438401, |
| "kl": 0.024505615234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.6171875, |
| "epoch": 0.6994906621392191, |
| "grad_norm": 1.0860106347765217, |
| "kl": 0.0284881591796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "reward": 0.7299420312047005, |
| "reward_std": 0.2999110519886017, |
| "rewards/preference_model_reward": 0.7299420312047005, |
| "rewards/preference_model_reward/std": 0.29991103895008564, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.00037642833285644883, |
| "epoch": 0.7062818336162988, |
| "grad_norm": 1.0035915159320414, |
| "kl": 0.029083251953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0005, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 314.51171875, |
| "epoch": 0.7130730050933786, |
| "grad_norm": 0.9150031551000603, |
| "kl": 0.0306854248046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "reward": 0.6948796380311251, |
| "reward_std": 0.23453088384121656, |
| "rewards/preference_model_reward": 0.6948796380311251, |
| "rewards/preference_model_reward/std": 0.23453087732195854, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.00046619037311756983, |
| "epoch": 0.7198641765704584, |
| "grad_norm": 0.9057949433659955, |
| "kl": 0.0311737060546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0015, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.51171875, |
| "epoch": 0.7266553480475382, |
| "grad_norm": 1.058402386367552, |
| "kl": 0.029266357421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0072, |
| "reward": 0.7044901698827744, |
| "reward_std": 0.27433344163000584, |
| "rewards/preference_model_reward": 0.7044901698827744, |
| "rewards/preference_model_reward/std": 0.2743334397673607, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0002761358628049493, |
| "epoch": 0.733446519524618, |
| "grad_norm": 1.0936652673840959, |
| "kl": 0.02972412109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0073, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 332.05078125, |
| "epoch": 0.7402376910016978, |
| "grad_norm": 1.1862717131671745, |
| "kl": 0.0320892333984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0043, |
| "reward": 0.6147979386150837, |
| "reward_std": 0.30731342919170856, |
| "rewards/preference_model_reward": 0.6147979386150837, |
| "rewards/preference_model_reward/std": 0.30731342546641827, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.00022946991703065578, |
| "epoch": 0.7470288624787776, |
| "grad_norm": 1.1478795200225282, |
| "kl": 0.0328369140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0044, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 365.30859375, |
| "epoch": 0.7538200339558574, |
| "grad_norm": 0.5771078052939254, |
| "kl": 0.0330810546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0051, |
| "reward": 0.7481855824589729, |
| "reward_std": 0.1441822382621467, |
| "rewards/preference_model_reward": 0.7481855824589729, |
| "rewards/preference_model_reward/std": 0.1441822382621467, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.00022052829172025668, |
| "epoch": 0.7606112054329371, |
| "grad_norm": 0.5662276617721039, |
| "kl": 0.033966064453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0051, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.23046875, |
| "epoch": 0.767402376910017, |
| "grad_norm": 0.9029133714996065, |
| "kl": 0.0342559814453125, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "reward": 0.7898782268166542, |
| "reward_std": 0.24535357393324375, |
| "rewards/preference_model_reward": 0.7898782268166542, |
| "rewards/preference_model_reward/std": 0.24535356741398573, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.00040923262531578075, |
| "epoch": 0.7741935483870968, |
| "grad_norm": 0.8879456702067533, |
| "kl": 0.0348358154296875, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 377.7265625, |
| "epoch": 0.7809847198641766, |
| "grad_norm": 0.825741580156274, |
| "kl": 0.035675048828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0054, |
| "reward": 0.7737665809690952, |
| "reward_std": 0.22196321282535791, |
| "rewards/preference_model_reward": 0.7737665809690952, |
| "rewards/preference_model_reward/std": 0.22196321096271276, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.00022133419952297118, |
| "epoch": 0.7877758913412564, |
| "grad_norm": 0.8402195364210961, |
| "kl": 0.03631591796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0055, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 343.15234375, |
| "epoch": 0.7945670628183361, |
| "grad_norm": 0.8554129883813052, |
| "kl": 0.0380859375, |
| "learning_rate": 2e-06, |
| "loss": 0.004, |
| "reward": 0.8098233491182327, |
| "reward_std": 0.21788090001791716, |
| "rewards/preference_model_reward": 0.8098233491182327, |
| "rewards/preference_model_reward/std": 0.21788090048357844, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0003840877070615534, |
| "epoch": 0.801358234295416, |
| "grad_norm": 0.8638777221139845, |
| "kl": 0.0389556884765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0039, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.3046875, |
| "epoch": 0.8081494057724957, |
| "grad_norm": 0.6793458238067924, |
| "kl": 0.03741455078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0004, |
| "reward": 0.7930763997137547, |
| "reward_std": 0.16387870162725449, |
| "rewards/preference_model_reward": 0.7930763997137547, |
| "rewards/preference_model_reward/std": 0.1638787006959319, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.000321710589560098, |
| "epoch": 0.8149405772495756, |
| "grad_norm": 0.6962155610206633, |
| "kl": 0.0380096435546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0004, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.8359375, |
| "epoch": 0.8217317487266553, |
| "grad_norm": 0.9565211478116729, |
| "kl": 0.0385589599609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0026, |
| "reward": 0.7755959965288639, |
| "reward_std": 0.23735665366984904, |
| "rewards/preference_model_reward": 0.7755959965288639, |
| "rewards/preference_model_reward/std": 0.23735664342530072, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0003308643190393923, |
| "epoch": 0.8285229202037352, |
| "grad_norm": 1.0302822979795947, |
| "kl": 0.0387725830078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0027, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.22265625, |
| "epoch": 0.8353140916808149, |
| "grad_norm": 1.156154466362683, |
| "kl": 0.03570556640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0075, |
| "reward": 0.8653455749154091, |
| "reward_std": 0.17953538481378928, |
| "rewards/preference_model_reward": 0.8653455749154091, |
| "rewards/preference_model_reward/std": 0.179535374045372, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.000370404065506591, |
| "epoch": 0.8421052631578947, |
| "grad_norm": 1.0589963494452141, |
| "kl": 0.036041259765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0074, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.0546875, |
| "epoch": 0.8488964346349746, |
| "grad_norm": 0.8042794508114846, |
| "kl": 0.041229248046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0115, |
| "reward": 0.7803249768912792, |
| "reward_std": 0.2213962199166417, |
| "rewards/preference_model_reward": 0.7803249768912792, |
| "rewards/preference_model_reward/std": 0.221396217122674, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0002928560206783004, |
| "epoch": 0.8556876061120543, |
| "grad_norm": 0.8094825207954228, |
| "kl": 0.0414276123046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0115, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 367.30859375, |
| "epoch": 0.8624787775891342, |
| "grad_norm": 0.999252643417212, |
| "kl": 0.0374755859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "reward": 0.7459425553679466, |
| "reward_std": 0.27967323176562786, |
| "rewards/preference_model_reward": 0.7459425553679466, |
| "rewards/preference_model_reward/std": 0.27967322804033756, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.00029575879580079345, |
| "epoch": 0.8692699490662139, |
| "grad_norm": 1.4461960825342008, |
| "kl": 0.037933349609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0009, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.65234375, |
| "epoch": 0.8760611205432938, |
| "grad_norm": 1.0554062228696781, |
| "kl": 0.0406494140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0028, |
| "reward": 0.7245122045278549, |
| "reward_std": 0.2613325589336455, |
| "rewards/preference_model_reward": 0.7245122045278549, |
| "rewards/preference_model_reward/std": 0.2613325538113713, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.00030801673892710824, |
| "epoch": 0.8828522920203735, |
| "grad_norm": 1.0792443458054302, |
| "kl": 0.04156494140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 294.59375, |
| "epoch": 0.8896434634974533, |
| "grad_norm": 1.0374198522552833, |
| "kl": 0.0457000732421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0079, |
| "reward": 0.7023610211908817, |
| "reward_std": 0.2407909445464611, |
| "rewards/preference_model_reward": 0.7023610211908817, |
| "rewards/preference_model_reward/std": 0.24079094640910625, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.00040924851782619953, |
| "epoch": 0.8964346349745331, |
| "grad_norm": 0.9902604514127353, |
| "kl": 0.046234130859375, |
| "learning_rate": 2e-06, |
| "loss": -0.008, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 363.4609375, |
| "epoch": 0.9032258064516129, |
| "grad_norm": 1.060000229364824, |
| "kl": 0.042449951171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "reward": 0.7669150531291962, |
| "reward_std": 0.2660955060273409, |
| "rewards/preference_model_reward": 0.7669150531291962, |
| "rewards/preference_model_reward/std": 0.26609550788998604, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.00026973526109941304, |
| "epoch": 0.9100169779286927, |
| "grad_norm": 0.9936418006191627, |
| "kl": 0.043121337890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.22265625, |
| "epoch": 0.9168081494057725, |
| "grad_norm": 0.7982535557698439, |
| "kl": 0.051361083984375, |
| "learning_rate": 2e-06, |
| "loss": -0.0028, |
| "reward": 0.8265567198395729, |
| "reward_std": 0.21873105503618717, |
| "rewards/preference_model_reward": 0.8265567198395729, |
| "rewards/preference_model_reward/std": 0.21873104479163885, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0003086660126427887, |
| "epoch": 0.9235993208828522, |
| "grad_norm": 0.7943180189345149, |
| "kl": 0.05242919921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0029, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 320.4296875, |
| "epoch": 0.9303904923599321, |
| "grad_norm": 1.1156997998544687, |
| "kl": 0.061492919921875, |
| "learning_rate": 2e-06, |
| "loss": -0.004, |
| "reward": 0.7904198691248894, |
| "reward_std": 0.216522429138422, |
| "rewards/preference_model_reward": 0.7904198691248894, |
| "rewards/preference_model_reward/std": 0.21652242727577686, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.00037556743882305454, |
| "epoch": 0.9371816638370118, |
| "grad_norm": 1.017141741754524, |
| "kl": 0.057098388671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0041, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.8046875, |
| "epoch": 0.9439728353140917, |
| "grad_norm": 0.7728732979067988, |
| "kl": 0.047637939453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0115, |
| "reward": 0.7384522631764412, |
| "reward_std": 0.20472911931574345, |
| "rewards/preference_model_reward": 0.7384522631764412, |
| "rewards/preference_model_reward/std": 0.20472911186516285, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.00038547036820091307, |
| "epoch": 0.9507640067911715, |
| "grad_norm": 0.8459776106505102, |
| "kl": 0.0485687255859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0116, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 296.31640625, |
| "epoch": 0.9575551782682513, |
| "grad_norm": 1.0555084972169586, |
| "kl": 0.051055908203125, |
| "learning_rate": 2e-06, |
| "loss": -0.0092, |
| "reward": 0.7005413100123405, |
| "reward_std": 0.2579723782837391, |
| "rewards/preference_model_reward": 0.7005413100123405, |
| "rewards/preference_model_reward/std": 0.2579723745584488, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.00016731805226299912, |
| "epoch": 0.9643463497453311, |
| "grad_norm": 1.120763560485812, |
| "kl": 0.051971435546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0093, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 377.7734375, |
| "epoch": 0.9711375212224108, |
| "grad_norm": 0.9897948935948673, |
| "kl": 0.0577392578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0067, |
| "reward": 0.7895881161093712, |
| "reward_std": 0.23325645178556442, |
| "rewards/preference_model_reward": 0.7895881161093712, |
| "rewards/preference_model_reward/std": 0.23325645574368536, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.00022080717917560833, |
| "epoch": 0.9779286926994907, |
| "grad_norm": 1.4320582249764962, |
| "kl": 0.05889892578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0067, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.28515625, |
| "epoch": 0.9847198641765704, |
| "grad_norm": 0.880015088177386, |
| "kl": 0.055999755859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0012, |
| "reward": 0.7704083099961281, |
| "reward_std": 0.22987801115959883, |
| "rewards/preference_model_reward": 0.7704083099961281, |
| "rewards/preference_model_reward/std": 0.22987799905240536, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.00039433108031516895, |
| "epoch": 0.9915110356536503, |
| "grad_norm": 0.8831875786359421, |
| "kl": 0.056640625, |
| "learning_rate": 2e-06, |
| "loss": -0.0013, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 471.2734375, |
| "epoch": 1.0067911714770799, |
| "grad_norm": 0.8457751003945985, |
| "kl": 0.0511474609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0095, |
| "reward": 0.7864305526018143, |
| "reward_std": 0.19768846221268177, |
| "rewards/preference_model_reward": 0.7864305526018143, |
| "rewards/preference_model_reward/std": 0.19768844894133508, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.000296334306767676, |
| "epoch": 1.0135823429541595, |
| "grad_norm": 0.8298995880374975, |
| "kl": 0.05181884765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0096, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.83984375, |
| "epoch": 1.0203735144312394, |
| "grad_norm": 0.8194267156898773, |
| "kl": 0.052215576171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0078, |
| "reward": 0.7310561165213585, |
| "reward_std": 0.21428223699331284, |
| "rewards/preference_model_reward": 0.7310561165213585, |
| "rewards/preference_model_reward/std": 0.21428224071860313, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.00043329124673618935, |
| "epoch": 1.0271646859083192, |
| "grad_norm": 0.8240545782064995, |
| "kl": 0.05267333984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0078, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 370.30859375, |
| "epoch": 1.033955857385399, |
| "grad_norm": 0.760034227142178, |
| "kl": 0.057525634765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0031, |
| "reward": 0.9029070436954498, |
| "reward_std": 0.15814024256542325, |
| "rewards/preference_model_reward": 0.9029070436954498, |
| "rewards/preference_model_reward/std": 0.15814024163410068, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.00020186395067867124, |
| "epoch": 1.0407470288624787, |
| "grad_norm": 0.7614495802573132, |
| "kl": 0.057952880859375, |
| "learning_rate": 2e-06, |
| "loss": 0.003, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 314.09375, |
| "epoch": 1.0475382003395586, |
| "grad_norm": 0.8397238705572047, |
| "kl": 0.0572509765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0023, |
| "reward": 0.8715953528881073, |
| "reward_std": 0.16063255118206143, |
| "rewards/preference_model_reward": 0.8715953528881073, |
| "rewards/preference_model_reward/std": 0.1606325414031744, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.00035122232020512456, |
| "epoch": 1.0543293718166384, |
| "grad_norm": 0.6751976056560207, |
| "kl": 0.057525634765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0023, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 338.484375, |
| "epoch": 1.061120543293718, |
| "grad_norm": 0.8883816112044249, |
| "kl": 0.053192138671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "reward": 0.6162162441760302, |
| "reward_std": 0.22661381494253874, |
| "rewards/preference_model_reward": 0.6162162441760302, |
| "rewards/preference_model_reward/std": 0.22661380283534527, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.00026881803660216974, |
| "epoch": 1.067911714770798, |
| "grad_norm": 0.8996727390846359, |
| "kl": 0.0533447265625, |
| "learning_rate": 2e-06, |
| "loss": -0.0024, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.72265625, |
| "epoch": 1.0747028862478778, |
| "grad_norm": 1.3700956028048723, |
| "kl": 0.0594482421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0047, |
| "reward": 0.8017089515924454, |
| "reward_std": 0.2577928486280143, |
| "rewards/preference_model_reward": 0.8017089515924454, |
| "rewards/preference_model_reward/std": 0.25779283652082086, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0002924563341366593, |
| "epoch": 1.0814940577249577, |
| "grad_norm": 0.989617562955661, |
| "kl": 0.060272216796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0047, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 342.76953125, |
| "epoch": 1.0882852292020373, |
| "grad_norm": 0.9655234182650031, |
| "kl": 0.058380126953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0101, |
| "reward": 0.7295246534049511, |
| "reward_std": 0.2533393930643797, |
| "rewards/preference_model_reward": 0.7295246534049511, |
| "rewards/preference_model_reward/std": 0.2533393818885088, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.00045342851990426425, |
| "epoch": 1.0950764006791172, |
| "grad_norm": 1.2087305406513047, |
| "kl": 0.05926513671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0102, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.4921875, |
| "epoch": 1.101867572156197, |
| "grad_norm": 0.6940296847029844, |
| "kl": 0.066864013671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0046, |
| "reward": 0.8526426330208778, |
| "reward_std": 0.1869236477650702, |
| "rewards/preference_model_reward": 0.8526426330208778, |
| "rewards/preference_model_reward/std": 0.18692364171147346, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0005265060426609125, |
| "epoch": 1.1086587436332767, |
| "grad_norm": 0.7298134766362122, |
| "kl": 0.06866455078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0047, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.22265625, |
| "epoch": 1.1154499151103565, |
| "grad_norm": 0.7453936371234922, |
| "kl": 0.06085205078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "reward": 0.8591005653142929, |
| "reward_std": 0.15921288449317217, |
| "rewards/preference_model_reward": 0.8591005653142929, |
| "rewards/preference_model_reward/std": 0.15921288169920444, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0002904113680415321, |
| "epoch": 1.1222410865874364, |
| "grad_norm": 0.669910512576077, |
| "kl": 0.06268310546875, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 407.80078125, |
| "epoch": 1.129032258064516, |
| "grad_norm": 0.909192169606273, |
| "kl": 0.06158447265625, |
| "learning_rate": 2e-06, |
| "loss": 0.0011, |
| "reward": 0.7943635508418083, |
| "reward_std": 0.25198143534362316, |
| "rewards/preference_model_reward": 0.7943635508418083, |
| "rewards/preference_model_reward/std": 0.25198143906891346, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.00025186290713463677, |
| "epoch": 1.1358234295415959, |
| "grad_norm": 0.9692712527409699, |
| "kl": 0.061767578125, |
| "learning_rate": 2e-06, |
| "loss": 0.001, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.6484375, |
| "epoch": 1.1426146010186757, |
| "grad_norm": 1.1179428269106786, |
| "kl": 0.0601806640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "reward": 0.9444838091731071, |
| "reward_std": 0.11847700248472393, |
| "rewards/preference_model_reward": 0.9444838091731071, |
| "rewards/preference_model_reward/std": 0.11847699934151024, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0002802106146191363, |
| "epoch": 1.1494057724957556, |
| "grad_norm": 0.6405441409913845, |
| "kl": 0.06036376953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0002, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 353.76953125, |
| "epoch": 1.1561969439728352, |
| "grad_norm": 0.7782378270820641, |
| "kl": 0.06005859375, |
| "learning_rate": 2e-06, |
| "loss": -0.0016, |
| "reward": 0.8752250149846077, |
| "reward_std": 0.21684391144663095, |
| "rewards/preference_model_reward": 0.8752250149846077, |
| "rewards/preference_model_reward/std": 0.21684390027076006, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.00020744246558024315, |
| "epoch": 1.162988115449915, |
| "grad_norm": 0.850587103078974, |
| "kl": 0.06024169921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0016, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 370.96484375, |
| "epoch": 1.169779286926995, |
| "grad_norm": 0.8701439536858631, |
| "kl": 0.054901123046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "reward": 0.8039593771100044, |
| "reward_std": 0.18934147339314222, |
| "rewards/preference_model_reward": 0.8039593771100044, |
| "rewards/preference_model_reward/std": 0.18934146966785192, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0003747612081497209, |
| "epoch": 1.1765704584040746, |
| "grad_norm": 0.8335090925710185, |
| "kl": 0.054718017578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.390625, |
| "epoch": 1.1833616298811545, |
| "grad_norm": 0.6304735929486551, |
| "kl": 0.0595703125, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "reward": 0.9124765843153, |
| "reward_std": 0.17223104648292065, |
| "rewards/preference_model_reward": 0.9124765843153, |
| "rewards/preference_model_reward/std": 0.17223104741424322, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0004532161146926228, |
| "epoch": 1.1901528013582343, |
| "grad_norm": 0.5908523851183375, |
| "kl": 0.059295654296875, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 424.578125, |
| "epoch": 1.1969439728353142, |
| "grad_norm": 0.8992704878494081, |
| "kl": 0.06396484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "reward": 0.804033525288105, |
| "reward_std": 0.2301926789805293, |
| "rewards/preference_model_reward": 0.804033525288105, |
| "rewards/preference_model_reward/std": 0.2301926789805293, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0003341037518111989, |
| "epoch": 1.2037351443123938, |
| "grad_norm": 0.9196956764410971, |
| "kl": 0.06427001953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.2421875, |
| "epoch": 1.2105263157894737, |
| "grad_norm": 0.7220744702495395, |
| "kl": 0.058349609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0077, |
| "reward": 0.7273744996637106, |
| "reward_std": 0.2286776825785637, |
| "rewards/preference_model_reward": 0.7273744996637106, |
| "rewards/preference_model_reward/std": 0.22867767792195082, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0003454066154517932, |
| "epoch": 1.2173174872665535, |
| "grad_norm": 0.8716110246400824, |
| "kl": 0.058624267578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0078, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 378.078125, |
| "epoch": 1.2241086587436332, |
| "grad_norm": 0.6663285066650045, |
| "kl": 0.0638427734375, |
| "learning_rate": 2e-06, |
| "loss": -0.0059, |
| "reward": 0.9165123328566551, |
| "reward_std": 0.16831889003515244, |
| "rewards/preference_model_reward": 0.9165123328566551, |
| "rewards/preference_model_reward/std": 0.16831888817250729, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.00019379381046746857, |
| "epoch": 1.230899830220713, |
| "grad_norm": 0.7055628007733394, |
| "kl": 0.0640869140625, |
| "learning_rate": 2e-06, |
| "loss": -0.006, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 422.25, |
| "epoch": 1.237691001697793, |
| "grad_norm": 0.7762768084046409, |
| "kl": 0.06463623046875, |
| "learning_rate": 2e-06, |
| "loss": -0.0043, |
| "reward": 0.9134577289223671, |
| "reward_std": 0.1393869406019803, |
| "rewards/preference_model_reward": 0.9134577289223671, |
| "rewards/preference_model_reward/std": 0.13938693181262352, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0003034833280253224, |
| "epoch": 1.2444821731748728, |
| "grad_norm": 0.6186463874614553, |
| "kl": 0.064910888671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0044, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.81640625, |
| "epoch": 1.2512733446519524, |
| "grad_norm": 0.8899638180343089, |
| "kl": 0.07208251953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0069, |
| "reward": 0.7580052837729454, |
| "reward_std": 0.19877337908837944, |
| "rewards/preference_model_reward": 0.7580052837729454, |
| "rewards/preference_model_reward/std": 0.1987733746645972, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0004808369849342853, |
| "epoch": 1.2580645161290323, |
| "grad_norm": 0.8688793068634583, |
| "kl": 0.072906494140625, |
| "learning_rate": 2e-06, |
| "loss": -0.007, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 391.66796875, |
| "epoch": 1.2648556876061121, |
| "grad_norm": 0.7448326779186991, |
| "kl": 0.0616455078125, |
| "learning_rate": 2e-06, |
| "loss": -0.007, |
| "reward": 0.8168806880712509, |
| "reward_std": 0.16082846838980913, |
| "rewards/preference_model_reward": 0.8168806880712509, |
| "rewards/preference_model_reward/std": 0.1608284618705511, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.00020933753694407642, |
| "epoch": 1.2716468590831917, |
| "grad_norm": 0.7190507214269751, |
| "kl": 0.06219482421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0071, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 483.05078125, |
| "epoch": 1.2784380305602716, |
| "grad_norm": 0.8188458679532237, |
| "kl": 0.0731201171875, |
| "learning_rate": 2e-06, |
| "loss": -0.005, |
| "reward": 0.8175918683409691, |
| "reward_std": 0.22522686189040542, |
| "rewards/preference_model_reward": 0.8175918683409691, |
| "rewards/preference_model_reward/std": 0.22522685630246997, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0003966744334320538, |
| "epoch": 1.2852292020373515, |
| "grad_norm": 1.0458363130153399, |
| "kl": 0.073944091796875, |
| "learning_rate": 2e-06, |
| "loss": -0.005, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.33203125, |
| "epoch": 1.2920203735144313, |
| "grad_norm": 0.9152240807229773, |
| "kl": 0.059906005859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0056, |
| "reward": 0.8635009974241257, |
| "reward_std": 0.22975661419332027, |
| "rewards/preference_model_reward": 0.8635009974241257, |
| "rewards/preference_model_reward/std": 0.22975661046802998, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.00035369363831705414, |
| "epoch": 1.298811544991511, |
| "grad_norm": 0.9207458262412241, |
| "kl": 0.06072998046875, |
| "learning_rate": 2e-06, |
| "loss": 0.0056, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 442.76171875, |
| "epoch": 1.3056027164685908, |
| "grad_norm": 0.6570145377312667, |
| "kl": 0.068359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "reward": 0.9078862443566322, |
| "reward_std": 0.15442213136702776, |
| "rewards/preference_model_reward": 0.9078862443566322, |
| "rewards/preference_model_reward/std": 0.15442212857306004, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.000297196668725519, |
| "epoch": 1.3123938879456707, |
| "grad_norm": 0.6739036562465582, |
| "kl": 0.069244384765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0025, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 387.12890625, |
| "epoch": 1.3191850594227503, |
| "grad_norm": 0.8574027394239306, |
| "kl": 0.064239501953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0045, |
| "reward": 0.824281245470047, |
| "reward_std": 0.2169697443023324, |
| "rewards/preference_model_reward": 0.824281245470047, |
| "rewards/preference_model_reward/std": 0.2169697443023324, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0002967717100545997, |
| "epoch": 1.3259762308998302, |
| "grad_norm": 0.7290085924664141, |
| "kl": 0.065155029296875, |
| "learning_rate": 2e-06, |
| "loss": -0.0045, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.8203125, |
| "epoch": 1.33276740237691, |
| "grad_norm": 0.7309486318631441, |
| "kl": 0.07769775390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0008, |
| "reward": 0.9312445744872093, |
| "reward_std": 0.15284666204388486, |
| "rewards/preference_model_reward": 0.9312445744872093, |
| "rewards/preference_model_reward/std": 0.15284666297520744, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0002473961694704485, |
| "epoch": 1.33955857385399, |
| "grad_norm": 0.8256107757168715, |
| "kl": 0.07861328125, |
| "learning_rate": 2e-06, |
| "loss": 0.0008, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.015625, |
| "epoch": 1.3463497453310695, |
| "grad_norm": 0.7651818013398796, |
| "kl": 0.074951171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "reward": 0.903270959854126, |
| "reward_std": 0.15746590262278914, |
| "rewards/preference_model_reward": 0.903270959854126, |
| "rewards/preference_model_reward/std": 0.15746590006165206, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.00028777409352187533, |
| "epoch": 1.3531409168081494, |
| "grad_norm": 0.695431821161012, |
| "kl": 0.0751953125, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.35546875, |
| "epoch": 1.3599320882852293, |
| "grad_norm": 0.6072593985440756, |
| "kl": 0.068878173828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0046, |
| "reward": 0.8996652066707611, |
| "reward_std": 0.15736312349326909, |
| "rewards/preference_model_reward": 0.8996652066707611, |
| "rewards/preference_model_reward/std": 0.15736312372609973, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.00029625675415445585, |
| "epoch": 1.366723259762309, |
| "grad_norm": 0.6467946669294912, |
| "kl": 0.069000244140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0046, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 453.8671875, |
| "epoch": 1.3735144312393888, |
| "grad_norm": 0.6011320960228079, |
| "kl": 0.0772705078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0019, |
| "reward": 0.9229187443852425, |
| "reward_std": 0.1588728630449623, |
| "rewards/preference_model_reward": 0.9229187443852425, |
| "rewards/preference_model_reward/std": 0.15887285268399864, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0004093242150702281, |
| "epoch": 1.3803056027164686, |
| "grad_norm": 0.6166416974233886, |
| "kl": 0.07733154296875, |
| "learning_rate": 2e-06, |
| "loss": -0.0019, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.33203125, |
| "epoch": 1.3870967741935485, |
| "grad_norm": 0.821145885871034, |
| "kl": 0.071319580078125, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "reward": 0.7927182205021381, |
| "reward_std": 0.15776410112448502, |
| "rewards/preference_model_reward": 0.7927182205021381, |
| "rewards/preference_model_reward/std": 0.1577640951873036, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.00040718307718634605, |
| "epoch": 1.3938879456706281, |
| "grad_norm": 0.6436174776451671, |
| "kl": 0.071380615234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.05859375, |
| "epoch": 1.400679117147708, |
| "grad_norm": 0.9941878622739161, |
| "kl": 0.067535400390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0005, |
| "reward": 0.8290302827954292, |
| "reward_std": 0.20328299421817064, |
| "rewards/preference_model_reward": 0.8290302827954292, |
| "rewards/preference_model_reward/std": 0.20328299049288034, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0004087122197233839, |
| "epoch": 1.4074702886247878, |
| "grad_norm": 0.758001459822151, |
| "kl": 0.066619873046875, |
| "learning_rate": 2e-06, |
| "loss": 0.0006, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.47265625, |
| "epoch": 1.4142614601018675, |
| "grad_norm": 0.8719842748126723, |
| "kl": 0.067352294921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0102, |
| "reward": 0.8164683133363724, |
| "reward_std": 0.22312493529170752, |
| "rewards/preference_model_reward": 0.8164683133363724, |
| "rewards/preference_model_reward/std": 0.22312493529170752, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0002680147335922811, |
| "epoch": 1.4210526315789473, |
| "grad_norm": 0.8534423110795387, |
| "kl": 0.067230224609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0103, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 468.77734375, |
| "epoch": 1.4278438030560272, |
| "grad_norm": 0.8752993311606954, |
| "kl": 0.0692138671875, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "reward": 0.8254417404532433, |
| "reward_std": 0.24196279793977737, |
| "rewards/preference_model_reward": 0.8254417404532433, |
| "rewards/preference_model_reward/std": 0.24196279048919678, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0003697285646921955, |
| "epoch": 1.434634974533107, |
| "grad_norm": 1.0128390633646167, |
| "kl": 0.069732666015625, |
| "learning_rate": 2e-06, |
| "loss": -0.0021, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.9765625, |
| "epoch": 1.4414261460101867, |
| "grad_norm": 0.9508406874837924, |
| "kl": 0.07574462890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0036, |
| "reward": 0.7649775668978691, |
| "reward_std": 0.27024316415190697, |
| "rewards/preference_model_reward": 0.7649775668978691, |
| "rewards/preference_model_reward/std": 0.2702431622892618, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0004117395801586099, |
| "epoch": 1.4482173174872666, |
| "grad_norm": 0.9621517839351592, |
| "kl": 0.07647705078125, |
| "learning_rate": 2e-06, |
| "loss": 0.0035, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 458.64453125, |
| "epoch": 1.4550084889643464, |
| "grad_norm": 0.810982449228881, |
| "kl": 0.08087158203125, |
| "learning_rate": 2e-06, |
| "loss": -0.001, |
| "reward": 0.8479541018605232, |
| "reward_std": 0.22648475086316466, |
| "rewards/preference_model_reward": 0.8479541018605232, |
| "rewards/preference_model_reward/std": 0.2264847457408905, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0003555577713996172, |
| "epoch": 1.461799660441426, |
| "grad_norm": 0.80101478641754, |
| "kl": 0.08135986328125, |
| "learning_rate": 2e-06, |
| "loss": -0.001, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 574.29296875, |
| "epoch": 1.468590831918506, |
| "grad_norm": 0.9452762932433028, |
| "kl": 0.0784912109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0021, |
| "reward": 0.874680757522583, |
| "reward_std": 0.1911849994212389, |
| "rewards/preference_model_reward": 0.874680757522583, |
| "rewards/preference_model_reward/std": 0.1911849956959486, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.00028368424500513356, |
| "epoch": 1.4753820033955858, |
| "grad_norm": 0.8371740511238688, |
| "kl": 0.078369140625, |
| "learning_rate": 2e-06, |
| "loss": -0.0021, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.421875, |
| "epoch": 1.4821731748726656, |
| "grad_norm": 1.0152679244033544, |
| "kl": 0.077392578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0064, |
| "reward": 0.8196995928883553, |
| "reward_std": 0.1846959013491869, |
| "rewards/preference_model_reward": 0.8196995928883553, |
| "rewards/preference_model_reward/std": 0.1846959034446627, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0004560712586680893, |
| "epoch": 1.4889643463497453, |
| "grad_norm": 0.8237042745863938, |
| "kl": 0.07757568359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0064, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 442.375, |
| "epoch": 1.4957555178268251, |
| "grad_norm": 0.7237731143426731, |
| "kl": 0.07708740234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0046, |
| "reward": 0.9068370684981346, |
| "reward_std": 0.16706688702106476, |
| "rewards/preference_model_reward": 0.9068370684981346, |
| "rewards/preference_model_reward/std": 0.1670668888837099, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.00027096804114989936, |
| "epoch": 1.5025466893039048, |
| "grad_norm": 0.7076398007967601, |
| "kl": 0.0770263671875, |
| "learning_rate": 2e-06, |
| "loss": 0.0046, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.953125, |
| "epoch": 1.5093378607809846, |
| "grad_norm": 0.869997583364439, |
| "kl": 0.082275390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0055, |
| "reward": 0.7990810945630074, |
| "reward_std": 0.18924825318390504, |
| "rewards/preference_model_reward": 0.7990810945630074, |
| "rewards/preference_model_reward/std": 0.18924825073918328, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0004075437354913447, |
| "epoch": 1.5161290322580645, |
| "grad_norm": 0.9017629071808718, |
| "kl": 0.0819091796875, |
| "learning_rate": 2e-06, |
| "loss": 0.0055, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.15234375, |
| "epoch": 1.5229202037351444, |
| "grad_norm": 0.7451783779397045, |
| "kl": 0.074432373046875, |
| "learning_rate": 2e-06, |
| "loss": 0.007, |
| "reward": 0.9096781089901924, |
| "reward_std": 0.16962200123816729, |
| "rewards/preference_model_reward": 0.9096781089901924, |
| "rewards/preference_model_reward/std": 0.16962200321722776, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.00042257635868736543, |
| "epoch": 1.5297113752122242, |
| "grad_norm": 0.7058693281421018, |
| "kl": 0.07421875, |
| "learning_rate": 2e-06, |
| "loss": 0.007, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.25, |
| "epoch": 1.5365025466893039, |
| "grad_norm": 0.6935303191071553, |
| "kl": 0.070648193359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0079, |
| "reward": 0.8487136289477348, |
| "reward_std": 0.1877696868032217, |
| "rewards/preference_model_reward": 0.8487136289477348, |
| "rewards/preference_model_reward/std": 0.18776968773454428, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.00021090211157570593, |
| "epoch": 1.5432937181663837, |
| "grad_norm": 0.7145245121074538, |
| "kl": 0.0701904296875, |
| "learning_rate": 2e-06, |
| "loss": -0.008, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 381.94140625, |
| "epoch": 1.5500848896434634, |
| "grad_norm": 0.7994299854278107, |
| "kl": 0.07879638671875, |
| "learning_rate": 2e-06, |
| "loss": 0.001, |
| "reward": 0.8791565969586372, |
| "reward_std": 0.1796162803657353, |
| "rewards/preference_model_reward": 0.8791565969586372, |
| "rewards/preference_model_reward/std": 0.17961628688499331, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.00023016269369691145, |
| "epoch": 1.5568760611205432, |
| "grad_norm": 0.8175812413468173, |
| "kl": 0.07855224609375, |
| "learning_rate": 2e-06, |
| "loss": 0.001, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 568.01171875, |
| "epoch": 1.563667232597623, |
| "grad_norm": 0.6094276281977793, |
| "kl": 0.0706787109375, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "reward": 0.8882196396589279, |
| "reward_std": 0.13881529681384563, |
| "rewards/preference_model_reward": 0.8882196396589279, |
| "rewards/preference_model_reward/std": 0.13881529681384563, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.00029951393207738874, |
| "epoch": 1.570458404074703, |
| "grad_norm": 1.7950954969207382, |
| "kl": 0.07061767578125, |
| "learning_rate": 2e-06, |
| "loss": -0.0011, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.640625, |
| "epoch": 1.5772495755517828, |
| "grad_norm": 0.821807107387434, |
| "kl": 0.07537841796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "reward": 0.8326325863599777, |
| "reward_std": 0.222384094260633, |
| "rewards/preference_model_reward": 0.8326325863599777, |
| "rewards/preference_model_reward/std": 0.22238408401608467, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0003851418096019188, |
| "epoch": 1.5840407470288624, |
| "grad_norm": 0.7724457393034863, |
| "kl": 0.07427978515625, |
| "learning_rate": 2e-06, |
| "loss": -0.0032, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 450.34765625, |
| "epoch": 1.5908319185059423, |
| "grad_norm": 0.9314695228796444, |
| "kl": 0.070556640625, |
| "learning_rate": 2e-06, |
| "loss": 0.0058, |
| "reward": 0.815672542899847, |
| "reward_std": 0.22120152600109577, |
| "rewards/preference_model_reward": 0.815672542899847, |
| "rewards/preference_model_reward/std": 0.2212015176191926, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.00026556486773188226, |
| "epoch": 1.597623089983022, |
| "grad_norm": 0.8986538432374337, |
| "kl": 0.07061767578125, |
| "learning_rate": 2e-06, |
| "loss": 0.0058, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 403.4140625, |
| "epoch": 1.6044142614601018, |
| "grad_norm": 1.0978197262889375, |
| "kl": 0.080474853515625, |
| "learning_rate": 2e-06, |
| "loss": -0.0007, |
| "reward": 0.8273614943027496, |
| "reward_std": 0.20897717960178852, |
| "rewards/preference_model_reward": 0.8273614943027496, |
| "rewards/preference_model_reward/std": 0.208977174712345, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0002703697555261897, |
| "epoch": 1.6112054329371817, |
| "grad_norm": 1.0483625406174306, |
| "kl": 0.079681396484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0008, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.9453125, |
| "epoch": 1.6179966044142615, |
| "grad_norm": 0.5732340261918957, |
| "kl": 0.068572998046875, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "reward": 0.9326564967632294, |
| "reward_std": 0.14762203209102154, |
| "rewards/preference_model_reward": 0.9326564967632294, |
| "rewards/preference_model_reward/std": 0.1476220367476344, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0003460040388745256, |
| "epoch": 1.6247877758913414, |
| "grad_norm": 0.5752766993082183, |
| "kl": 0.068756103515625, |
| "learning_rate": 2e-06, |
| "loss": 0.0007, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 386.03515625, |
| "epoch": 1.631578947368421, |
| "grad_norm": 0.6056663933056317, |
| "kl": 0.075103759765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0021, |
| "reward": 0.8928236216306686, |
| "reward_std": 0.12510262243449688, |
| "rewards/preference_model_reward": 0.8928236216306686, |
| "rewards/preference_model_reward/std": 0.12510262383148074, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.0002695773910090793, |
| "epoch": 1.6383701188455009, |
| "grad_norm": 0.6137961375915928, |
| "kl": 0.07489013671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 407.54296875, |
| "epoch": 1.6451612903225805, |
| "grad_norm": 0.42071122454808335, |
| "kl": 0.0718994140625, |
| "learning_rate": 2e-06, |
| "loss": 0.0013, |
| "reward": 0.9794645011425018, |
| "reward_std": 0.08479468178120442, |
| "rewards/preference_model_reward": 0.9794645011425018, |
| "rewards/preference_model_reward/std": 0.08479468079167418, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.00025576782900316175, |
| "epoch": 1.6519524617996604, |
| "grad_norm": 0.3919119735586361, |
| "kl": 0.07177734375, |
| "learning_rate": 2e-06, |
| "loss": 0.0013, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.42578125, |
| "epoch": 1.6587436332767402, |
| "grad_norm": 0.6548770476035524, |
| "kl": 0.0699462890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0013, |
| "reward": 0.8988784328103065, |
| "reward_std": 0.1754322163760662, |
| "rewards/preference_model_reward": 0.8988784328103065, |
| "rewards/preference_model_reward/std": 0.17543221032246947, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0002102269518218236, |
| "epoch": 1.66553480475382, |
| "grad_norm": 0.6267254419265091, |
| "kl": 0.06951904296875, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.73828125, |
| "epoch": 1.6723259762309, |
| "grad_norm": 0.7815256424102707, |
| "kl": 0.070068359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "reward": 0.8519543707370758, |
| "reward_std": 0.22265557665377855, |
| "rewards/preference_model_reward": 0.8519543707370758, |
| "rewards/preference_model_reward/std": 0.22265558131039143, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.00024074002521956572, |
| "epoch": 1.6791171477079796, |
| "grad_norm": 0.7771753382163754, |
| "kl": 0.0697021484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0003, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 468.9140625, |
| "epoch": 1.6859083191850595, |
| "grad_norm": 0.8258890143100246, |
| "kl": 0.064239501953125, |
| "learning_rate": 2e-06, |
| "loss": 0.0068, |
| "reward": 0.6989383921027184, |
| "reward_std": 0.21238140459172428, |
| "rewards/preference_model_reward": 0.6989383921027184, |
| "rewards/preference_model_reward/std": 0.212381407385692, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.00029818056009389693, |
| "epoch": 1.692699490662139, |
| "grad_norm": 0.8433009778411044, |
| "kl": 0.06402587890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0068, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.68359375, |
| "epoch": 1.699490662139219, |
| "grad_norm": 0.564079360506697, |
| "kl": 0.081146240234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0013, |
| "reward": 0.813681973144412, |
| "reward_std": 0.15440709423273802, |
| "rewards/preference_model_reward": 0.813681973144412, |
| "rewards/preference_model_reward/std": 0.1544070926029235, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.00033701639495120617, |
| "epoch": 1.7062818336162988, |
| "grad_norm": 0.5478613289915798, |
| "kl": 0.08160400390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0014, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 322.9296875, |
| "epoch": 1.7130730050933787, |
| "grad_norm": 1.071419601384365, |
| "kl": 0.07757568359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0057, |
| "reward": 0.729640144854784, |
| "reward_std": 0.24793746508657932, |
| "rewards/preference_model_reward": 0.729640144854784, |
| "rewards/preference_model_reward/std": 0.24793746136128902, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0003556678657332668, |
| "epoch": 1.7198641765704585, |
| "grad_norm": 1.0183565424453855, |
| "kl": 0.078369140625, |
| "learning_rate": 2e-06, |
| "loss": 0.0057, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.984375, |
| "epoch": 1.7266553480475382, |
| "grad_norm": 0.8616822486915081, |
| "kl": 0.07220458984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0018, |
| "reward": 0.8504317253828049, |
| "reward_std": 0.2000572015531361, |
| "rewards/preference_model_reward": 0.8504317253828049, |
| "rewards/preference_model_reward/std": 0.20005719922482967, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.00037185212204349227, |
| "epoch": 1.733446519524618, |
| "grad_norm": 0.7593220266059924, |
| "kl": 0.07269287109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0018, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 422.05078125, |
| "epoch": 1.7402376910016977, |
| "grad_norm": 0.8256725125473834, |
| "kl": 0.06494140625, |
| "learning_rate": 2e-06, |
| "loss": 0.0065, |
| "reward": 0.851757287979126, |
| "reward_std": 0.2138998694717884, |
| "rewards/preference_model_reward": 0.851757287979126, |
| "rewards/preference_model_reward/std": 0.21389986481517553, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.00029054368678771425, |
| "epoch": 1.7470288624787775, |
| "grad_norm": 0.8768036487045519, |
| "kl": 0.065338134765625, |
| "learning_rate": 2e-06, |
| "loss": 0.0064, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.53515625, |
| "epoch": 1.7538200339558574, |
| "grad_norm": 0.7028967106094474, |
| "kl": 0.071868896484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0011, |
| "reward": 0.908553458750248, |
| "reward_std": 0.1910355999134481, |
| "rewards/preference_model_reward": 0.908553458750248, |
| "rewards/preference_model_reward/std": 0.19103559292852879, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0003378584806341678, |
| "epoch": 1.7606112054329373, |
| "grad_norm": 0.6669227299061052, |
| "kl": 0.07257080078125, |
| "learning_rate": 2e-06, |
| "loss": 0.0011, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.171875, |
| "epoch": 1.7674023769100171, |
| "grad_norm": 0.589142092316522, |
| "kl": 0.07623291015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0014, |
| "reward": 0.9417537078261375, |
| "reward_std": 0.15645512472838163, |
| "rewards/preference_model_reward": 0.9417537078261375, |
| "rewards/preference_model_reward/std": 0.15645511914044619, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0003804455500358017, |
| "epoch": 1.7741935483870968, |
| "grad_norm": 0.5638842375184904, |
| "kl": 0.07666015625, |
| "learning_rate": 2e-06, |
| "loss": 0.0014, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.60546875, |
| "epoch": 1.7809847198641766, |
| "grad_norm": 0.8955077816264015, |
| "kl": 0.08245849609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "reward": 0.8265820220112801, |
| "reward_std": 0.21266387501964346, |
| "rewards/preference_model_reward": 0.8265820220112801, |
| "rewards/preference_model_reward/std": 0.21266387071227655, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0004603340476023732, |
| "epoch": 1.7877758913412563, |
| "grad_norm": 0.8188440887100487, |
| "kl": 0.082763671875, |
| "learning_rate": 2e-06, |
| "loss": -0.0022, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 559.12890625, |
| "epoch": 1.7945670628183361, |
| "grad_norm": 0.7420003421814586, |
| "kl": 0.06298828125, |
| "learning_rate": 2e-06, |
| "loss": -0.0019, |
| "reward": 0.8031069450080395, |
| "reward_std": 0.18254950502887368, |
| "rewards/preference_model_reward": 0.8031069450080395, |
| "rewards/preference_model_reward/std": 0.1825494933873415, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0002911918672907632, |
| "epoch": 1.801358234295416, |
| "grad_norm": 0.7226550220045151, |
| "kl": 0.06256103515625, |
| "learning_rate": 2e-06, |
| "loss": -0.002, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.953125, |
| "epoch": 1.8081494057724958, |
| "grad_norm": 0.8002731044836809, |
| "kl": 0.077423095703125, |
| "learning_rate": 2e-06, |
| "loss": -0.0042, |
| "reward": 0.8755168691277504, |
| "reward_std": 0.2164273001253605, |
| "rewards/preference_model_reward": 0.8755168691277504, |
| "rewards/preference_model_reward/std": 0.2164272964000702, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0002747558801274863, |
| "epoch": 1.8149405772495757, |
| "grad_norm": 0.7854043108449068, |
| "kl": 0.07794189453125, |
| "learning_rate": 2e-06, |
| "loss": -0.0042, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.8359375, |
| "epoch": 1.8217317487266553, |
| "grad_norm": 0.9479972413749124, |
| "kl": 0.08074951171875, |
| "learning_rate": 2e-06, |
| "loss": -0.0107, |
| "reward": 0.8254084438085556, |
| "reward_std": 0.1835477078857366, |
| "rewards/preference_model_reward": 0.8254084438085556, |
| "rewards/preference_model_reward/std": 0.18354770765290596, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0003044331115233945, |
| "epoch": 1.8285229202037352, |
| "grad_norm": 0.8555828495346359, |
| "kl": 0.08099365234375, |
| "learning_rate": 2e-06, |
| "loss": -0.0107, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 493.96484375, |
| "epoch": 1.8353140916808148, |
| "grad_norm": 0.6643254122645025, |
| "kl": 0.07830810546875, |
| "learning_rate": 2e-06, |
| "loss": 0.0029, |
| "reward": 0.8742033094167709, |
| "reward_std": 0.17645000852644444, |
| "rewards/preference_model_reward": 0.8742033094167709, |
| "rewards/preference_model_reward/std": 0.1764500148128718, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.00037674069881177275, |
| "epoch": 1.8421052631578947, |
| "grad_norm": 0.7454046347235634, |
| "kl": 0.0787353515625, |
| "learning_rate": 2e-06, |
| "loss": 0.0029, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 429.734375, |
| "epoch": 1.8488964346349746, |
| "grad_norm": 0.7255152868461135, |
| "kl": 0.07861328125, |
| "learning_rate": 2e-06, |
| "loss": -0.001, |
| "reward": 0.8850414156913757, |
| "reward_std": 0.17634376138448715, |
| "rewards/preference_model_reward": 0.8850414156913757, |
| "rewards/preference_model_reward/std": 0.1763437541667372, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0002247508855361957, |
| "epoch": 1.8556876061120544, |
| "grad_norm": 0.7200469091029748, |
| "kl": 0.07861328125, |
| "learning_rate": 2e-06, |
| "loss": -0.001, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.3515625, |
| "epoch": 1.8624787775891343, |
| "grad_norm": 0.7158241923732864, |
| "kl": 0.07818603515625, |
| "learning_rate": 2e-06, |
| "loss": -0.0003, |
| "reward": 0.8412534669041634, |
| "reward_std": 0.1782067846506834, |
| "rewards/preference_model_reward": 0.8412534669041634, |
| "rewards/preference_model_reward/std": 0.17820678371936083, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.00020900591698591597, |
| "epoch": 1.869269949066214, |
| "grad_norm": 0.7312235131971142, |
| "kl": 0.07806396484375, |
| "learning_rate": 2e-06, |
| "loss": -0.0004, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.046875, |
| "epoch": 1.8760611205432938, |
| "grad_norm": 0.7216675721694292, |
| "kl": 0.074462890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "reward": 0.8510579615831375, |
| "reward_std": 0.24259257689118385, |
| "rewards/preference_model_reward": 0.8510579615831375, |
| "rewards/preference_model_reward/std": 0.24259258434176445, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0003766012450796552, |
| "epoch": 1.8828522920203734, |
| "grad_norm": 0.7280400520472221, |
| "kl": 0.074951171875, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 453.65625, |
| "epoch": 1.8896434634974533, |
| "grad_norm": 0.8584892428121671, |
| "kl": 0.079376220703125, |
| "learning_rate": 2e-06, |
| "loss": 0.0052, |
| "reward": 0.8504569008946419, |
| "reward_std": 0.24195780232548714, |
| "rewards/preference_model_reward": 0.8504569008946419, |
| "rewards/preference_model_reward/std": 0.24195779021829367, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.000481072609545663, |
| "epoch": 1.8964346349745331, |
| "grad_norm": 0.8623823368670106, |
| "kl": 0.079437255859375, |
| "learning_rate": 2e-06, |
| "loss": 0.0051, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.984375, |
| "epoch": 1.903225806451613, |
| "grad_norm": 0.65090353158444, |
| "kl": 0.07568359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0078, |
| "reward": 0.9181569963693619, |
| "reward_std": 0.17035586189012975, |
| "rewards/preference_model_reward": 0.9181569963693619, |
| "rewards/preference_model_reward/std": 0.17035586202109698, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.0002422129618935287, |
| "epoch": 1.9100169779286928, |
| "grad_norm": 0.6478462369679827, |
| "kl": 0.07586669921875, |
| "learning_rate": 2e-06, |
| "loss": -0.0078, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.6953125, |
| "epoch": 1.9168081494057725, |
| "grad_norm": 0.7048516661988414, |
| "kl": 0.07611083984375, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "reward": 0.8832282423973083, |
| "reward_std": 0.20259307883679867, |
| "rewards/preference_model_reward": 0.8832282423973083, |
| "rewards/preference_model_reward/std": 0.20259307883679867, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.00031240142743627075, |
| "epoch": 1.9235993208828521, |
| "grad_norm": 0.6842247007649929, |
| "kl": 0.076324462890625, |
| "learning_rate": 2e-06, |
| "loss": 0.0012, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.12109375, |
| "epoch": 1.930390492359932, |
| "grad_norm": 0.7414576613573336, |
| "kl": 0.08099365234375, |
| "learning_rate": 2e-06, |
| "loss": 0.0029, |
| "reward": 0.8914339393377304, |
| "reward_std": 0.15328312013298273, |
| "rewards/preference_model_reward": 0.8914339393377304, |
| "rewards/preference_model_reward/std": 0.1532831130316481, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.00038151090484461747, |
| "epoch": 1.9371816638370118, |
| "grad_norm": 0.7360576429069317, |
| "kl": 0.08160400390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0029, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.34765625, |
| "epoch": 1.9439728353140917, |
| "grad_norm": 0.8076039752992614, |
| "kl": 0.07861328125, |
| "learning_rate": 2e-06, |
| "loss": -0.0035, |
| "reward": 0.8031719997525215, |
| "reward_std": 0.22500982508063316, |
| "rewards/preference_model_reward": 0.8031719997525215, |
| "rewards/preference_model_reward/std": 0.22500981856137514, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.000369085326383356, |
| "epoch": 1.9507640067911716, |
| "grad_norm": 0.8009414546069689, |
| "kl": 0.07916259765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0035, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.796875, |
| "epoch": 1.9575551782682514, |
| "grad_norm": 0.7202067622191793, |
| "kl": 0.072509765625, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "reward": 0.9239321351051331, |
| "reward_std": 0.1347129621426575, |
| "rewards/preference_model_reward": 0.9239321351051331, |
| "rewards/preference_model_reward/std": 0.13471295684576035, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.00027696689903677907, |
| "epoch": 1.964346349745331, |
| "grad_norm": 0.4938288163422082, |
| "kl": 0.0726318359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0031, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 364.72265625, |
| "epoch": 1.9711375212224107, |
| "grad_norm": 0.7669478819520723, |
| "kl": 0.08953857421875, |
| "learning_rate": 2e-06, |
| "loss": -0.0036, |
| "reward": 0.8362223468720913, |
| "reward_std": 0.19234392209909856, |
| "rewards/preference_model_reward": 0.8362223468720913, |
| "rewards/preference_model_reward/std": 0.19234391255304217, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.000403772370191291, |
| "epoch": 1.9779286926994906, |
| "grad_norm": 2.6810045962105375, |
| "kl": 0.089599609375, |
| "learning_rate": 2e-06, |
| "loss": -0.0037, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.55859375, |
| "epoch": 1.9847198641765704, |
| "grad_norm": 0.8500553867554066, |
| "kl": 0.07928466796875, |
| "learning_rate": 2e-06, |
| "loss": -0.0072, |
| "reward": 0.845167201012373, |
| "reward_std": 0.1875469057704322, |
| "rewards/preference_model_reward": 0.845167201012373, |
| "rewards/preference_model_reward/std": 0.18754690227797255, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0004131398400204489, |
| "epoch": 1.9915110356536503, |
| "grad_norm": 0.8328407125726051, |
| "kl": 0.07965087890625, |
| "learning_rate": 2e-06, |
| "loss": -0.0073, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.4375, |
| "epoch": 2.00679117147708, |
| "grad_norm": 0.7832085644539297, |
| "kl": 0.08221435546875, |
| "learning_rate": 2e-06, |
| "loss": 0.001, |
| "reward": 0.8732399269938469, |
| "reward_std": 0.22111400961875916, |
| "rewards/preference_model_reward": 0.8732399269938469, |
| "rewards/preference_model_reward/std": 0.22111400589346886, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0002739406791079091, |
| "epoch": 2.0135823429541597, |
| "grad_norm": 0.7473092873628361, |
| "kl": 0.082275390625, |
| "learning_rate": 2e-06, |
| "loss": 0.0009, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 540.265625, |
| "epoch": 2.0203735144312396, |
| "grad_norm": 0.4811242877169514, |
| "kl": 0.07806396484375, |
| "learning_rate": 2e-06, |
| "loss": 0.0044, |
| "reward": 0.9485599547624588, |
| "reward_std": 0.12165670236572623, |
| "rewards/preference_model_reward": 0.9485599547624588, |
| "rewards/preference_model_reward/std": 0.12165670190006495, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.00019723791228898335, |
| "epoch": 2.027164685908319, |
| "grad_norm": 0.48620380717534856, |
| "kl": 0.0780029296875, |
| "learning_rate": 2e-06, |
| "loss": 0.0043, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.375, |
| "epoch": 2.033955857385399, |
| "grad_norm": 0.6998442696000419, |
| "kl": 0.08050537109375, |
| "learning_rate": 2e-06, |
| "loss": 0.0042, |
| "reward": 0.8833131715655327, |
| "reward_std": 0.18213203502818942, |
| "rewards/preference_model_reward": 0.8833131715655327, |
| "rewards/preference_model_reward/std": 0.18213202757760882, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.00025544029949742253, |
| "epoch": 2.0407470288624787, |
| "grad_norm": 0.7155677820275104, |
| "kl": 0.0804443359375, |
| "learning_rate": 2e-06, |
| "loss": 0.0043, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 417.74609375, |
| "epoch": 2.0475382003395586, |
| "grad_norm": 0.6253438347592372, |
| "kl": 0.0816650390625, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "reward": 0.9034418389201164, |
| "reward_std": 0.13920773862628266, |
| "rewards/preference_model_reward": 0.9034418389201164, |
| "rewards/preference_model_reward/std": 0.13920772803248838, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0002523756156733725, |
| "epoch": 2.0543293718166384, |
| "grad_norm": 0.6171154618996508, |
| "kl": 0.08148193359375, |
| "learning_rate": 2e-06, |
| "loss": -0.0002, |
| "step": 300 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 625, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 150, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|