| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.7802340702210663, |
| "eval_steps": 500, |
| "global_step": 300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 366.5625, |
| "epoch": 0.002600780234070221, |
| "grad_norm": 0.7637246764761187, |
| "kl": 0.00033664703369140625, |
| "learning_rate": 0.0, |
| "loss": 0.0093, |
| "reward": 0.27206526696681976, |
| "reward_std": 0.35790932178497314, |
| "rewards/preference_model_reward": 0.27206526696681976, |
| "rewards/preference_model_reward/std": 0.3698284178972244, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.005201560468140442, |
| "grad_norm": 0.7634221939067439, |
| "kl": 0.00033664703369140625, |
| "learning_rate": 1e-07, |
| "loss": 0.0093, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 286.765625, |
| "epoch": 0.007802340702210663, |
| "grad_norm": 0.6989165605414746, |
| "kl": 0.0003261566162109375, |
| "learning_rate": 2e-07, |
| "loss": 0.0017, |
| "reward": 0.4179777204990387, |
| "reward_std": 0.3505486845970154, |
| "rewards/preference_model_reward": 0.4179777204990387, |
| "rewards/preference_model_reward/std": 0.42789818346500397, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0003455103069427423, |
| "epoch": 0.010403120936280884, |
| "grad_norm": 0.7024752202801559, |
| "kl": 0.00033855438232421875, |
| "learning_rate": 3e-07, |
| "loss": 0.0017, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.421875, |
| "epoch": 0.013003901170351105, |
| "grad_norm": 0.7903982506139909, |
| "kl": 0.0003814697265625, |
| "learning_rate": 4e-07, |
| "loss": 0.0042, |
| "reward": 0.42805667221546173, |
| "reward_std": 0.3343174010515213, |
| "rewards/preference_model_reward": 0.42805667221546173, |
| "rewards/preference_model_reward/std": 0.38788357377052307, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.00042975760879926383, |
| "epoch": 0.015604681404421327, |
| "grad_norm": 0.8018115419028574, |
| "kl": 0.0003833770751953125, |
| "learning_rate": 5e-07, |
| "loss": 0.0042, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.625, |
| "epoch": 0.018205461638491547, |
| "grad_norm": 0.5838143481805789, |
| "kl": 0.0004119873046875, |
| "learning_rate": 6e-07, |
| "loss": -0.0006, |
| "reward": 0.1721051186323166, |
| "reward_std": 0.2126249074935913, |
| "rewards/preference_model_reward": 0.1721051186323166, |
| "rewards/preference_model_reward/std": 0.32822249829769135, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.00028950837804586627, |
| "epoch": 0.02080624187256177, |
| "grad_norm": 0.5890045804822527, |
| "kl": 0.00039958953857421875, |
| "learning_rate": 7e-07, |
| "loss": -0.0006, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 276.53125, |
| "epoch": 0.02340702210663199, |
| "grad_norm": 0.8569937422606633, |
| "kl": 0.000370025634765625, |
| "learning_rate": 8e-07, |
| "loss": 0.0074, |
| "reward": 0.5692853033542633, |
| "reward_std": 0.4373367577791214, |
| "rewards/preference_model_reward": 0.5692853033542633, |
| "rewards/preference_model_reward/std": 0.43920741975307465, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0002302309003425762, |
| "epoch": 0.02600780234070221, |
| "grad_norm": 0.8635017364924986, |
| "kl": 0.000354766845703125, |
| "learning_rate": 9e-07, |
| "loss": 0.0075, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 417.921875, |
| "epoch": 0.02860858257477243, |
| "grad_norm": 0.7581548485527216, |
| "kl": 0.000392913818359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0146, |
| "reward": 0.4504364877939224, |
| "reward_std": 0.3650350868701935, |
| "rewards/preference_model_reward": 0.4504364877939224, |
| "rewards/preference_model_reward/std": 0.4405139982700348, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.00031334128289017826, |
| "epoch": 0.031209362808842653, |
| "grad_norm": 0.7745072225657651, |
| "kl": 0.00039005279541015625, |
| "learning_rate": 1e-06, |
| "loss": 0.0146, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 229.953125, |
| "epoch": 0.033810143042912875, |
| "grad_norm": 0.8433641431962554, |
| "kl": 0.00045299530029296875, |
| "learning_rate": 1e-06, |
| "loss": -0.0043, |
| "reward": 0.2926686853170395, |
| "reward_std": 0.33062444627285004, |
| "rewards/preference_model_reward": 0.2926686853170395, |
| "rewards/preference_model_reward/std": 0.3724062442779541, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.00040468364022672176, |
| "epoch": 0.036410923276983094, |
| "grad_norm": 0.7555936110067955, |
| "kl": 0.0004749298095703125, |
| "learning_rate": 1e-06, |
| "loss": -0.0043, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 266.859375, |
| "epoch": 0.03901170351105332, |
| "grad_norm": 0.8416993973883007, |
| "kl": 0.00039386749267578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0026, |
| "reward": 0.41267864406108856, |
| "reward_std": 0.3813520818948746, |
| "rewards/preference_model_reward": 0.41267864406108856, |
| "rewards/preference_model_reward/std": 0.4197400361299515, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.00021883380395593122, |
| "epoch": 0.04161248374512354, |
| "grad_norm": 0.819553453730802, |
| "kl": 0.000415802001953125, |
| "learning_rate": 1e-06, |
| "loss": 0.0026, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 367.84375, |
| "epoch": 0.044213263979193757, |
| "grad_norm": 0.5697479623176207, |
| "kl": 0.00043487548828125, |
| "learning_rate": 1e-06, |
| "loss": 0.0076, |
| "reward": 0.3072579577565193, |
| "reward_std": 0.20797011256217957, |
| "rewards/preference_model_reward": 0.3072579577565193, |
| "rewards/preference_model_reward/std": 0.3955962061882019, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.00018104003902408294, |
| "epoch": 0.04681404421326398, |
| "grad_norm": 0.5554507387208018, |
| "kl": 0.0004138946533203125, |
| "learning_rate": 1e-06, |
| "loss": 0.0076, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 355.671875, |
| "epoch": 0.0494148244473342, |
| "grad_norm": 0.9534071970661521, |
| "kl": 0.00045013427734375, |
| "learning_rate": 1e-06, |
| "loss": 0.0049, |
| "reward": 0.32867759466171265, |
| "reward_std": 0.4192758649587631, |
| "rewards/preference_model_reward": 0.32867759466171265, |
| "rewards/preference_model_reward/std": 0.41489049792289734, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0001942292437888682, |
| "epoch": 0.05201560468140442, |
| "grad_norm": 0.9582264669727778, |
| "kl": 0.0004558563232421875, |
| "learning_rate": 1e-06, |
| "loss": 0.0049, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.671875, |
| "epoch": 0.054616384915474644, |
| "grad_norm": 0.7518884583016779, |
| "kl": 0.0004329681396484375, |
| "learning_rate": 1e-06, |
| "loss": 0.0054, |
| "reward": 0.47081659734249115, |
| "reward_std": 0.37600383162498474, |
| "rewards/preference_model_reward": 0.47081659734249115, |
| "rewards/preference_model_reward/std": 0.46160686016082764, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.00022455723956227303, |
| "epoch": 0.05721716514954486, |
| "grad_norm": 0.7473411198477925, |
| "kl": 0.0004329681396484375, |
| "learning_rate": 1e-06, |
| "loss": 0.0054, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 328.734375, |
| "epoch": 0.05981794538361508, |
| "grad_norm": 1.9194684284684271, |
| "kl": 0.0005474090576171875, |
| "learning_rate": 1e-06, |
| "loss": 0.0147, |
| "reward": 0.4218391329050064, |
| "reward_std": 0.3977803438901901, |
| "rewards/preference_model_reward": 0.4218391329050064, |
| "rewards/preference_model_reward/std": 0.4245973825454712, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0008293713217426557, |
| "epoch": 0.06241872561768531, |
| "grad_norm": 1.0216332209635672, |
| "kl": 0.00061798095703125, |
| "learning_rate": 1e-06, |
| "loss": 0.0147, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.984375, |
| "epoch": 0.06501950585175553, |
| "grad_norm": 1.2061326379172082, |
| "kl": 0.0006351470947265625, |
| "learning_rate": 1e-06, |
| "loss": 0.0006, |
| "reward": 0.2993537187576294, |
| "reward_std": 0.33115406334400177, |
| "rewards/preference_model_reward": 0.2993537187576294, |
| "rewards/preference_model_reward/std": 0.34632159024477005, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.00020526795196929015, |
| "epoch": 0.06762028608582575, |
| "grad_norm": 0.7646253192627566, |
| "kl": 0.000896453857421875, |
| "learning_rate": 1e-06, |
| "loss": 0.0006, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 333.59375, |
| "epoch": 0.07022106631989597, |
| "grad_norm": 1.093317920354513, |
| "kl": 0.00075531005859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0029, |
| "reward": 0.25294405221939087, |
| "reward_std": 0.26265186071395874, |
| "rewards/preference_model_reward": 0.25294405221939087, |
| "rewards/preference_model_reward/std": 0.31941579282283783, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0002510274134692736, |
| "epoch": 0.07282184655396619, |
| "grad_norm": 0.5733552941634138, |
| "kl": 0.000896453857421875, |
| "learning_rate": 1e-06, |
| "loss": 0.0029, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.234375, |
| "epoch": 0.0754226267880364, |
| "grad_norm": 0.8998443055154545, |
| "kl": 0.000705718994140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0003, |
| "reward": 0.44713057577610016, |
| "reward_std": 0.4038489907979965, |
| "rewards/preference_model_reward": 0.44713057577610016, |
| "rewards/preference_model_reward/std": 0.4337831437587738, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 7.785470006638207e-05, |
| "epoch": 0.07802340702210664, |
| "grad_norm": 0.9074477906453469, |
| "kl": 0.000736236572265625, |
| "learning_rate": 1e-06, |
| "loss": 0.0003, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.03125, |
| "epoch": 0.08062418725617686, |
| "grad_norm": 0.6726628287711655, |
| "kl": 0.000946044921875, |
| "learning_rate": 1e-06, |
| "loss": 0.0073, |
| "reward": 0.2786962687969208, |
| "reward_std": 0.25916408747434616, |
| "rewards/preference_model_reward": 0.2786962687969208, |
| "rewards/preference_model_reward/std": 0.3720279037952423, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0002550777353462763, |
| "epoch": 0.08322496749024708, |
| "grad_norm": 0.6715022976592974, |
| "kl": 0.000949859619140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0072, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 297.015625, |
| "epoch": 0.0858257477243173, |
| "grad_norm": 0.39405296450661176, |
| "kl": 0.0009059906005859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0044, |
| "reward": 0.09238657355308533, |
| "reward_std": 0.2289801463484764, |
| "rewards/preference_model_reward": 0.09238657355308533, |
| "rewards/preference_model_reward/std": 0.22657855600118637, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.00038183602737262845, |
| "epoch": 0.08842652795838751, |
| "grad_norm": 0.4049944734565922, |
| "kl": 0.0009765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0044, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 322.203125, |
| "epoch": 0.09102730819245773, |
| "grad_norm": 0.827294235606583, |
| "kl": 0.00092315673828125, |
| "learning_rate": 1e-06, |
| "loss": 0.009, |
| "reward": 0.3779194802045822, |
| "reward_std": 0.39827682077884674, |
| "rewards/preference_model_reward": 0.3779194802045822, |
| "rewards/preference_model_reward/std": 0.43630431592464447, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.00026937505026580766, |
| "epoch": 0.09362808842652796, |
| "grad_norm": 0.8300155350554602, |
| "kl": 0.0009613037109375, |
| "learning_rate": 1e-06, |
| "loss": 0.009, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.75, |
| "epoch": 0.09622886866059818, |
| "grad_norm": 0.9040133604531, |
| "kl": 0.001125335693359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0078, |
| "reward": 0.3842260241508484, |
| "reward_std": 0.38062165677547455, |
| "rewards/preference_model_reward": 0.3842260241508484, |
| "rewards/preference_model_reward/std": 0.42281022667884827, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.000247176612901967, |
| "epoch": 0.0988296488946684, |
| "grad_norm": 0.8818972451317902, |
| "kl": 0.001178741455078125, |
| "learning_rate": 1e-06, |
| "loss": 0.0078, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 409.0, |
| "epoch": 0.10143042912873862, |
| "grad_norm": 0.9975116421261425, |
| "kl": 0.00125885009765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0066, |
| "reward": 0.46170978248119354, |
| "reward_std": 0.3637985289096832, |
| "rewards/preference_model_reward": 0.46170978248119354, |
| "rewards/preference_model_reward/std": 0.4502808153629303, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.00038266702904365957, |
| "epoch": 0.10403120936280884, |
| "grad_norm": 0.9904624875110181, |
| "kl": 0.00133514404296875, |
| "learning_rate": 1e-06, |
| "loss": 0.0066, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 221.875, |
| "epoch": 0.10663198959687907, |
| "grad_norm": 0.5851402421021292, |
| "kl": 0.001651763916015625, |
| "learning_rate": 1e-06, |
| "loss": -0.0034, |
| "reward": 0.3681895285844803, |
| "reward_std": 0.2873292565345764, |
| "rewards/preference_model_reward": 0.3681895285844803, |
| "rewards/preference_model_reward/std": 0.42807736992836, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.00044402084313333035, |
| "epoch": 0.10923276983094929, |
| "grad_norm": 0.5781183841216399, |
| "kl": 0.00211334228515625, |
| "learning_rate": 1e-06, |
| "loss": -0.0034, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.390625, |
| "epoch": 0.11183355006501951, |
| "grad_norm": 0.8225811515974385, |
| "kl": 0.0016632080078125, |
| "learning_rate": 1e-06, |
| "loss": 0.0131, |
| "reward": 0.5128970742225647, |
| "reward_std": 0.3217063844203949, |
| "rewards/preference_model_reward": 0.5128970742225647, |
| "rewards/preference_model_reward/std": 0.42474237084388733, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.00033481663558632135, |
| "epoch": 0.11443433029908973, |
| "grad_norm": 0.8483070262251596, |
| "kl": 0.001781463623046875, |
| "learning_rate": 1e-06, |
| "loss": 0.0131, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 224.203125, |
| "epoch": 0.11703511053315994, |
| "grad_norm": 0.5395067031089475, |
| "kl": 0.00168609619140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0018, |
| "reward": 0.5381855368614197, |
| "reward_std": 0.2828982323408127, |
| "rewards/preference_model_reward": 0.5381855368614197, |
| "rewards/preference_model_reward/std": 0.4257737398147583, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 8.294625149574131e-05, |
| "epoch": 0.11963589076723016, |
| "grad_norm": 0.5469512514582927, |
| "kl": 0.0019378662109375, |
| "learning_rate": 1e-06, |
| "loss": 0.0018, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.03125, |
| "epoch": 0.1222366710013004, |
| "grad_norm": 0.7038582256559461, |
| "kl": 0.00201416015625, |
| "learning_rate": 1e-06, |
| "loss": 0.0056, |
| "reward": 0.4148406833410263, |
| "reward_std": 0.3201068937778473, |
| "rewards/preference_model_reward": 0.4148406833410263, |
| "rewards/preference_model_reward/std": 0.4175822138786316, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 7.856693991925567e-05, |
| "epoch": 0.12483745123537061, |
| "grad_norm": 0.675491507271435, |
| "kl": 0.0020904541015625, |
| "learning_rate": 1e-06, |
| "loss": 0.0056, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.671875, |
| "epoch": 0.12743823146944083, |
| "grad_norm": 0.8170063918072458, |
| "kl": 0.0021820068359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0091, |
| "reward": 0.2692461907863617, |
| "reward_std": 0.3443475216627121, |
| "rewards/preference_model_reward": 0.2692461907863617, |
| "rewards/preference_model_reward/std": 0.36693260073661804, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.00021530466619879007, |
| "epoch": 0.13003901170351106, |
| "grad_norm": 0.8122674605503017, |
| "kl": 0.00228118896484375, |
| "learning_rate": 1e-06, |
| "loss": 0.0092, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 391.953125, |
| "epoch": 0.13263979193758127, |
| "grad_norm": 1.0413631794553713, |
| "kl": 0.002166748046875, |
| "learning_rate": 1e-06, |
| "loss": 0.0037, |
| "reward": 0.5847213864326477, |
| "reward_std": 0.42418254911899567, |
| "rewards/preference_model_reward": 0.5847213864326477, |
| "rewards/preference_model_reward/std": 0.43372175097465515, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.00017526535884826444, |
| "epoch": 0.1352405721716515, |
| "grad_norm": 0.996238286825058, |
| "kl": 0.00226593017578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0036, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 352.640625, |
| "epoch": 0.1378413524057217, |
| "grad_norm": 0.8962684423825826, |
| "kl": 0.0029296875, |
| "learning_rate": 1e-06, |
| "loss": -0.0128, |
| "reward": 0.45333464443683624, |
| "reward_std": 0.34005598723888397, |
| "rewards/preference_model_reward": 0.45333464443683624, |
| "rewards/preference_model_reward/std": 0.4754510223865509, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.00030642370984423906, |
| "epoch": 0.14044213263979194, |
| "grad_norm": 0.9030383579421986, |
| "kl": 0.00295257568359375, |
| "learning_rate": 1e-06, |
| "loss": -0.0128, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.546875, |
| "epoch": 0.14304291287386217, |
| "grad_norm": 0.9579654034497551, |
| "kl": 0.0024871826171875, |
| "learning_rate": 1e-06, |
| "loss": 0.0174, |
| "reward": 0.46721845865249634, |
| "reward_std": 0.367512583732605, |
| "rewards/preference_model_reward": 0.46721845865249634, |
| "rewards/preference_model_reward/std": 0.38719798624515533, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.00037521994090639055, |
| "epoch": 0.14564369310793238, |
| "grad_norm": 1.1022969678624799, |
| "kl": 0.002593994140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0174, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 263.5, |
| "epoch": 0.1482444733420026, |
| "grad_norm": 0.8463126094043792, |
| "kl": 0.00301361083984375, |
| "learning_rate": 1e-06, |
| "loss": -0.009, |
| "reward": 0.5161054730415344, |
| "reward_std": 0.4402284473180771, |
| "rewards/preference_model_reward": 0.5161054730415344, |
| "rewards/preference_model_reward/std": 0.43457816541194916, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.00023281520407181233, |
| "epoch": 0.1508452535760728, |
| "grad_norm": 0.8638119217410174, |
| "kl": 0.00360107421875, |
| "learning_rate": 1e-06, |
| "loss": -0.009, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.484375, |
| "epoch": 0.15344603381014305, |
| "grad_norm": 0.9063786698187589, |
| "kl": 0.0032806396484375, |
| "learning_rate": 1e-06, |
| "loss": 0.0045, |
| "reward": 0.4931875765323639, |
| "reward_std": 0.3744415044784546, |
| "rewards/preference_model_reward": 0.4931875765323639, |
| "rewards/preference_model_reward/std": 0.41449280083179474, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0004102790408069268, |
| "epoch": 0.15604681404421328, |
| "grad_norm": 0.7069394763111468, |
| "kl": 0.00341796875, |
| "learning_rate": 1e-06, |
| "loss": 0.0045, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.734375, |
| "epoch": 0.15864759427828348, |
| "grad_norm": 0.8614703926848987, |
| "kl": 0.004241943359375, |
| "learning_rate": 1e-06, |
| "loss": -0.0041, |
| "reward": 0.5906274169683456, |
| "reward_std": 0.2752673625946045, |
| "rewards/preference_model_reward": 0.5906274169683456, |
| "rewards/preference_model_reward/std": 0.4438425600528717, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 8.253894338849932e-05, |
| "epoch": 0.16124837451235371, |
| "grad_norm": 4.155206953608738, |
| "kl": 0.0043792724609375, |
| "learning_rate": 1e-06, |
| "loss": -0.0041, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.640625, |
| "epoch": 0.16384915474642392, |
| "grad_norm": 0.7867835517980187, |
| "kl": 0.0041961669921875, |
| "learning_rate": 1e-06, |
| "loss": 0.0109, |
| "reward": 0.6263610422611237, |
| "reward_std": 0.33436986804008484, |
| "rewards/preference_model_reward": 0.6263610422611237, |
| "rewards/preference_model_reward/std": 0.437585785984993, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.00021243346418486908, |
| "epoch": 0.16644993498049415, |
| "grad_norm": 0.776838553332444, |
| "kl": 0.0043792724609375, |
| "learning_rate": 1e-06, |
| "loss": 0.0109, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 348.34375, |
| "epoch": 0.16905071521456436, |
| "grad_norm": 0.8132709521354365, |
| "kl": 0.004913330078125, |
| "learning_rate": 1e-06, |
| "loss": 0.0014, |
| "reward": 0.6121957302093506, |
| "reward_std": 0.3261266052722931, |
| "rewards/preference_model_reward": 0.6121957302093506, |
| "rewards/preference_model_reward/std": 0.43943892419338226, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0002961681311717257, |
| "epoch": 0.1716514954486346, |
| "grad_norm": 0.7884869792445989, |
| "kl": 0.0051422119140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0013, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.34375, |
| "epoch": 0.17425227568270482, |
| "grad_norm": 0.7001728994457754, |
| "kl": 0.0053863525390625, |
| "learning_rate": 1e-06, |
| "loss": 0.0002, |
| "reward": 0.41676153242588043, |
| "reward_std": 0.2956756055355072, |
| "rewards/preference_model_reward": 0.41676153242588043, |
| "rewards/preference_model_reward/std": 0.33416300266981125, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.00037663579132640734, |
| "epoch": 0.17685305591677503, |
| "grad_norm": 0.701103186201825, |
| "kl": 0.0054931640625, |
| "learning_rate": 1e-06, |
| "loss": 0.0002, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 337.234375, |
| "epoch": 0.17945383615084526, |
| "grad_norm": 1.2570820396809244, |
| "kl": 0.1349945068359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0013, |
| "reward": 0.4304375946521759, |
| "reward_std": 0.3242499828338623, |
| "rewards/preference_model_reward": 0.4304375946521759, |
| "rewards/preference_model_reward/std": 0.33702679723501205, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.00043001653102692217, |
| "epoch": 0.18205461638491546, |
| "grad_norm": 1.6894684029251643, |
| "kl": 0.099365234375, |
| "learning_rate": 1e-06, |
| "loss": 0.0012, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.09375, |
| "epoch": 0.1846553966189857, |
| "grad_norm": 0.740223544455656, |
| "kl": 0.0048065185546875, |
| "learning_rate": 1e-06, |
| "loss": -0.0079, |
| "reward": 0.38418276607990265, |
| "reward_std": 0.32340583205223083, |
| "rewards/preference_model_reward": 0.38418276607990265, |
| "rewards/preference_model_reward/std": 0.4079796075820923, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0002601840387796983, |
| "epoch": 0.18725617685305593, |
| "grad_norm": 0.7356062747183508, |
| "kl": 0.0050048828125, |
| "learning_rate": 1e-06, |
| "loss": -0.0079, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 244.640625, |
| "epoch": 0.18985695708712613, |
| "grad_norm": 0.5133791549848619, |
| "kl": 0.0059051513671875, |
| "learning_rate": 1e-06, |
| "loss": -0.0043, |
| "reward": 0.19469847530126572, |
| "reward_std": 0.21297892928123474, |
| "rewards/preference_model_reward": 0.19469847530126572, |
| "rewards/preference_model_reward/std": 0.2838420420885086, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.00011593603994697332, |
| "epoch": 0.19245773732119636, |
| "grad_norm": 0.511693644544892, |
| "kl": 0.0062713623046875, |
| "learning_rate": 1e-06, |
| "loss": -0.0043, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.515625, |
| "epoch": 0.19505851755526657, |
| "grad_norm": 0.9763617288864301, |
| "kl": 0.005584716796875, |
| "learning_rate": 1e-06, |
| "loss": 0.0132, |
| "reward": 0.678843080997467, |
| "reward_std": 0.438846230506897, |
| "rewards/preference_model_reward": 0.678843080997467, |
| "rewards/preference_model_reward/std": 0.43212489783763885, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.00010097982158185914, |
| "epoch": 0.1976592977893368, |
| "grad_norm": 1.0362215753715969, |
| "kl": 0.0058135986328125, |
| "learning_rate": 1e-06, |
| "loss": 0.0132, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.25, |
| "epoch": 0.20026007802340703, |
| "grad_norm": 0.836837589110761, |
| "kl": 0.0065460205078125, |
| "learning_rate": 1e-06, |
| "loss": 0.0036, |
| "reward": 0.26709984242916107, |
| "reward_std": 0.29253821820020676, |
| "rewards/preference_model_reward": 0.26709984242916107, |
| "rewards/preference_model_reward/std": 0.33563828468322754, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.00026085925492225215, |
| "epoch": 0.20286085825747724, |
| "grad_norm": 0.8750312130202887, |
| "kl": 0.006805419921875, |
| "learning_rate": 1e-06, |
| "loss": 0.0036, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.515625, |
| "epoch": 0.20546163849154747, |
| "grad_norm": 0.92295112375941, |
| "kl": 0.00604248046875, |
| "learning_rate": 1e-06, |
| "loss": 0.0045, |
| "reward": 0.35681067407131195, |
| "reward_std": 0.378703311085701, |
| "rewards/preference_model_reward": 0.35681067407131195, |
| "rewards/preference_model_reward/std": 0.3927233815193176, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 8.110202907118946e-05, |
| "epoch": 0.20806241872561768, |
| "grad_norm": 0.8460897952801627, |
| "kl": 0.0062713623046875, |
| "learning_rate": 1e-06, |
| "loss": 0.0045, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.53125, |
| "epoch": 0.2106631989596879, |
| "grad_norm": 0.8210294793253463, |
| "kl": 0.0078582763671875, |
| "learning_rate": 1e-06, |
| "loss": -0.0127, |
| "reward": 0.47505128383636475, |
| "reward_std": 0.3855440318584442, |
| "rewards/preference_model_reward": 0.47505128383636475, |
| "rewards/preference_model_reward/std": 0.4009372293949127, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.00014372689474839717, |
| "epoch": 0.21326397919375814, |
| "grad_norm": 0.8027749648905551, |
| "kl": 0.0082244873046875, |
| "learning_rate": 1e-06, |
| "loss": -0.0126, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 323.140625, |
| "epoch": 0.21586475942782835, |
| "grad_norm": 0.6142723349382537, |
| "kl": 0.00732421875, |
| "learning_rate": 1e-06, |
| "loss": 0.0002, |
| "reward": 0.4587654545903206, |
| "reward_std": 0.29028914868831635, |
| "rewards/preference_model_reward": 0.4587654545903206, |
| "rewards/preference_model_reward/std": 0.3300708681344986, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.00034744998265523463, |
| "epoch": 0.21846553966189858, |
| "grad_norm": 0.5861784424361753, |
| "kl": 0.0074920654296875, |
| "learning_rate": 1e-06, |
| "loss": 0.0002, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.484375, |
| "epoch": 0.22106631989596878, |
| "grad_norm": 0.9076959060952224, |
| "kl": 0.0075836181640625, |
| "learning_rate": 1e-06, |
| "loss": 0.0072, |
| "reward": 0.5031348764896393, |
| "reward_std": 0.2952383682131767, |
| "rewards/preference_model_reward": 0.5031348764896393, |
| "rewards/preference_model_reward/std": 0.44817858934402466, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0002725635713431984, |
| "epoch": 0.22366710013003901, |
| "grad_norm": 0.9288531435242793, |
| "kl": 0.007843017578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0073, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 339.09375, |
| "epoch": 0.22626788036410922, |
| "grad_norm": 0.8923680637356045, |
| "kl": 0.00830078125, |
| "learning_rate": 1e-06, |
| "loss": 0.0105, |
| "reward": 0.4401181936264038, |
| "reward_std": 0.4361417144536972, |
| "rewards/preference_model_reward": 0.4401181936264038, |
| "rewards/preference_model_reward/std": 0.4305359721183777, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.00014025312702869996, |
| "epoch": 0.22886866059817945, |
| "grad_norm": 0.8984033497034181, |
| "kl": 0.008697509765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0106, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 321.375, |
| "epoch": 0.23146944083224968, |
| "grad_norm": 2.827688577013721, |
| "kl": 0.008758544921875, |
| "learning_rate": 1e-06, |
| "loss": 0.0001, |
| "reward": 0.5173258185386658, |
| "reward_std": 0.4247867166996002, |
| "rewards/preference_model_reward": 0.5173258185386658, |
| "rewards/preference_model_reward/std": 0.4418337345123291, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.00032082964025903493, |
| "epoch": 0.2340702210663199, |
| "grad_norm": 0.9909499799511914, |
| "kl": 0.009033203125, |
| "learning_rate": 1e-06, |
| "loss": 0.0002, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 397.25, |
| "epoch": 0.23667100130039012, |
| "grad_norm": 0.958697923736949, |
| "kl": 0.00958251953125, |
| "learning_rate": 1e-06, |
| "loss": -0.0032, |
| "reward": 0.7365403473377228, |
| "reward_std": 0.37412843108177185, |
| "rewards/preference_model_reward": 0.7365403473377228, |
| "rewards/preference_model_reward/std": 0.38982725143432617, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0003890225198119879, |
| "epoch": 0.23927178153446033, |
| "grad_norm": 0.9071467126090668, |
| "kl": 0.00982666015625, |
| "learning_rate": 1e-06, |
| "loss": -0.0031, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 208.75, |
| "epoch": 0.24187256176853056, |
| "grad_norm": 0.6085390765827805, |
| "kl": 0.010498046875, |
| "learning_rate": 1e-06, |
| "loss": -0.0095, |
| "reward": 0.6177057921886444, |
| "reward_std": 0.36169178783893585, |
| "rewards/preference_model_reward": 0.6177057921886444, |
| "rewards/preference_model_reward/std": 0.3608778268098831, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0002914143551606685, |
| "epoch": 0.2444733420026008, |
| "grad_norm": 0.6159917640161859, |
| "kl": 0.010528564453125, |
| "learning_rate": 1e-06, |
| "loss": -0.0095, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.9375, |
| "epoch": 0.247074122236671, |
| "grad_norm": 0.9013427802646543, |
| "kl": 0.011474609375, |
| "learning_rate": 1e-06, |
| "loss": 0.0033, |
| "reward": 0.4740111082792282, |
| "reward_std": 0.39952021837234497, |
| "rewards/preference_model_reward": 0.4740111082792282, |
| "rewards/preference_model_reward/std": 0.4284791499376297, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0003251660345995333, |
| "epoch": 0.24967490247074123, |
| "grad_norm": 0.9042971795796162, |
| "kl": 0.011688232421875, |
| "learning_rate": 1e-06, |
| "loss": 0.0032, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 368.859375, |
| "epoch": 0.25227568270481143, |
| "grad_norm": 0.7543210465997314, |
| "kl": 0.01019287109375, |
| "learning_rate": 1e-06, |
| "loss": -0.015, |
| "reward": 0.732722133398056, |
| "reward_std": 0.3390260487794876, |
| "rewards/preference_model_reward": 0.732722133398056, |
| "rewards/preference_model_reward/std": 0.37897253036499023, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.00034867875365307555, |
| "epoch": 0.25487646293888166, |
| "grad_norm": 0.7756495986001384, |
| "kl": 0.0103759765625, |
| "learning_rate": 1e-06, |
| "loss": -0.015, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 269.609375, |
| "epoch": 0.2574772431729519, |
| "grad_norm": 0.8621435865558427, |
| "kl": 0.012908935546875, |
| "learning_rate": 1e-06, |
| "loss": 0.0035, |
| "reward": 0.5670541375875473, |
| "reward_std": 0.3086087480187416, |
| "rewards/preference_model_reward": 0.5670541375875473, |
| "rewards/preference_model_reward/std": 0.4115421622991562, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0002304340960108675, |
| "epoch": 0.26007802340702213, |
| "grad_norm": 0.8566337212306596, |
| "kl": 0.01300048828125, |
| "learning_rate": 1e-06, |
| "loss": 0.0035, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 321.296875, |
| "epoch": 0.2626788036410923, |
| "grad_norm": 0.7524769990332698, |
| "kl": 0.01031494140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0056, |
| "reward": 0.5453621596097946, |
| "reward_std": 0.3028823733329773, |
| "rewards/preference_model_reward": 0.5453621596097946, |
| "rewards/preference_model_reward/std": 0.41332288086414337, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0004275508617865853, |
| "epoch": 0.26527958387516254, |
| "grad_norm": 0.760555697889034, |
| "kl": 0.010467529296875, |
| "learning_rate": 1e-06, |
| "loss": 0.0056, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.53125, |
| "epoch": 0.26788036410923277, |
| "grad_norm": 0.7828074285838857, |
| "kl": 0.01123046875, |
| "learning_rate": 1e-06, |
| "loss": -0.0056, |
| "reward": 0.5679600983858109, |
| "reward_std": 0.38669461011886597, |
| "rewards/preference_model_reward": 0.5679600983858109, |
| "rewards/preference_model_reward/std": 0.3954748064279556, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.00024837666569510475, |
| "epoch": 0.270481144343303, |
| "grad_norm": 0.7708099657439608, |
| "kl": 0.011322021484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0056, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.03125, |
| "epoch": 0.27308192457737324, |
| "grad_norm": 0.8471146947264236, |
| "kl": 0.0101318359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0058, |
| "reward": 0.7424919009208679, |
| "reward_std": 0.31481410562992096, |
| "rewards/preference_model_reward": 0.7424919009208679, |
| "rewards/preference_model_reward/std": 0.3854113817214966, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0001572734909132123, |
| "epoch": 0.2756827048114434, |
| "grad_norm": 0.870295151902565, |
| "kl": 0.01031494140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0058, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 460.96875, |
| "epoch": 0.27828348504551365, |
| "grad_norm": 1.2070074597423983, |
| "kl": 0.012939453125, |
| "learning_rate": 1e-06, |
| "loss": 0.0009, |
| "reward": 0.5596677958965302, |
| "reward_std": 0.4118568003177643, |
| "rewards/preference_model_reward": 0.5596677958965302, |
| "rewards/preference_model_reward/std": 0.4305266737937927, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.00020162294094916433, |
| "epoch": 0.2808842652795839, |
| "grad_norm": 1.1701677024313162, |
| "kl": 0.0130615234375, |
| "learning_rate": 1e-06, |
| "loss": 0.0009, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 279.953125, |
| "epoch": 0.2834850455136541, |
| "grad_norm": 0.4951581311208896, |
| "kl": 0.012420654296875, |
| "learning_rate": 1e-06, |
| "loss": -0.0018, |
| "reward": 0.677459716796875, |
| "reward_std": 0.21432576701045036, |
| "rewards/preference_model_reward": 0.677459716796875, |
| "rewards/preference_model_reward/std": 0.40531064569950104, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.00015336842989199795, |
| "epoch": 0.28608582574772434, |
| "grad_norm": 0.4891314801199321, |
| "kl": 0.0125732421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0017, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.359375, |
| "epoch": 0.2886866059817945, |
| "grad_norm": 0.8219128100663494, |
| "kl": 0.011444091796875, |
| "learning_rate": 1e-06, |
| "loss": 0.0046, |
| "reward": 0.5518685728311539, |
| "reward_std": 0.28263746947050095, |
| "rewards/preference_model_reward": 0.5518685728311539, |
| "rewards/preference_model_reward/std": 0.42345236241817474, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0004249960547895171, |
| "epoch": 0.29128738621586475, |
| "grad_norm": 1.0133739094911354, |
| "kl": 0.011749267578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0046, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 346.765625, |
| "epoch": 0.293888166449935, |
| "grad_norm": 0.8158114597582814, |
| "kl": 0.01141357421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0107, |
| "reward": 0.6804526448249817, |
| "reward_std": 0.3598247319459915, |
| "rewards/preference_model_reward": 0.6804526448249817, |
| "rewards/preference_model_reward/std": 0.38642027974128723, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0003029858708032407, |
| "epoch": 0.2964889466840052, |
| "grad_norm": 0.8253998137210009, |
| "kl": 0.0115966796875, |
| "learning_rate": 1e-06, |
| "loss": -0.0108, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 385.359375, |
| "epoch": 0.29908972691807545, |
| "grad_norm": 0.9275019225790542, |
| "kl": 0.013031005859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0001, |
| "reward": 0.6333288848400116, |
| "reward_std": 0.35428452491760254, |
| "rewards/preference_model_reward": 0.6333288848400116, |
| "rewards/preference_model_reward/std": 0.3997349590063095, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.00022703978174831718, |
| "epoch": 0.3016905071521456, |
| "grad_norm": 0.9059205023346738, |
| "kl": 0.013153076171875, |
| "learning_rate": 1e-06, |
| "loss": 0.0001, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 316.828125, |
| "epoch": 0.30429128738621586, |
| "grad_norm": 0.9713664565394787, |
| "kl": 0.01373291015625, |
| "learning_rate": 1e-06, |
| "loss": 0.0139, |
| "reward": 0.7312487959861755, |
| "reward_std": 0.38407662510871887, |
| "rewards/preference_model_reward": 0.7312487959861755, |
| "rewards/preference_model_reward/std": 0.40977030992507935, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0002469179962645285, |
| "epoch": 0.3068920676202861, |
| "grad_norm": 1.01290147214235, |
| "kl": 0.013885498046875, |
| "learning_rate": 1e-06, |
| "loss": 0.0139, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.09375, |
| "epoch": 0.3094928478543563, |
| "grad_norm": 0.8042535298912621, |
| "kl": 0.01300048828125, |
| "learning_rate": 1e-06, |
| "loss": -0.0069, |
| "reward": 0.6101844310760498, |
| "reward_std": 0.3753702640533447, |
| "rewards/preference_model_reward": 0.6101844310760498, |
| "rewards/preference_model_reward/std": 0.41265669465065, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.00021792916231788695, |
| "epoch": 0.31209362808842656, |
| "grad_norm": 0.7861169995186488, |
| "kl": 0.01348876953125, |
| "learning_rate": 1e-06, |
| "loss": -0.0069, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 343.8125, |
| "epoch": 0.31469440832249673, |
| "grad_norm": 1.0936736694491056, |
| "kl": 0.0130615234375, |
| "learning_rate": 1e-06, |
| "loss": -0.008, |
| "reward": 0.7429376542568207, |
| "reward_std": 0.34224678575992584, |
| "rewards/preference_model_reward": 0.7429376542568207, |
| "rewards/preference_model_reward/std": 0.39303846657276154, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0004085178370587528, |
| "epoch": 0.31729518855656696, |
| "grad_norm": 0.8105152189436506, |
| "kl": 0.013336181640625, |
| "learning_rate": 1e-06, |
| "loss": -0.0081, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 351.578125, |
| "epoch": 0.3198959687906372, |
| "grad_norm": 1.1492446304526747, |
| "kl": 0.012359619140625, |
| "learning_rate": 1e-06, |
| "loss": -0.0031, |
| "reward": 0.666700005531311, |
| "reward_std": 0.4078214764595032, |
| "rewards/preference_model_reward": 0.666700005531311, |
| "rewards/preference_model_reward/std": 0.4199042469263077, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0003381448841537349, |
| "epoch": 0.32249674902470743, |
| "grad_norm": 0.9700568661797079, |
| "kl": 0.0125732421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0031, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 295.890625, |
| "epoch": 0.3250975292587776, |
| "grad_norm": 0.7872274173675015, |
| "kl": 0.012054443359375, |
| "learning_rate": 1e-06, |
| "loss": -0.0016, |
| "reward": 0.6267447769641876, |
| "reward_std": 0.38382989168167114, |
| "rewards/preference_model_reward": 0.6267447769641876, |
| "rewards/preference_model_reward/std": 0.40342220664024353, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.00043341246782802045, |
| "epoch": 0.32769830949284784, |
| "grad_norm": 0.7404724454292584, |
| "kl": 0.01226806640625, |
| "learning_rate": 1e-06, |
| "loss": -0.0016, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 283.328125, |
| "epoch": 0.33029908972691807, |
| "grad_norm": 0.7130605700783115, |
| "kl": 0.0150146484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0016, |
| "reward": 0.5046076327562332, |
| "reward_std": 0.32486245036125183, |
| "rewards/preference_model_reward": 0.5046076327562332, |
| "rewards/preference_model_reward/std": 0.35284098982810974, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0002618366925162263, |
| "epoch": 0.3328998699609883, |
| "grad_norm": 0.7176392089800268, |
| "kl": 0.01519775390625, |
| "learning_rate": 1e-06, |
| "loss": -0.0016, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 364.796875, |
| "epoch": 0.33550065019505854, |
| "grad_norm": 0.442241343056545, |
| "kl": 0.013427734375, |
| "learning_rate": 1e-06, |
| "loss": -0.0004, |
| "reward": 0.9121778607368469, |
| "reward_std": 0.16598587855696678, |
| "rewards/preference_model_reward": 0.9121778607368469, |
| "rewards/preference_model_reward/std": 0.16689887270331383, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0002460579635226168, |
| "epoch": 0.3381014304291287, |
| "grad_norm": 0.4491215566539981, |
| "kl": 0.0135498046875, |
| "learning_rate": 1e-06, |
| "loss": -0.0004, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 399.328125, |
| "epoch": 0.34070221066319895, |
| "grad_norm": 0.9316255175025087, |
| "kl": 0.0152587890625, |
| "learning_rate": 1e-06, |
| "loss": 0.0096, |
| "reward": 0.4223726838827133, |
| "reward_std": 0.36841753125190735, |
| "rewards/preference_model_reward": 0.4223726838827133, |
| "rewards/preference_model_reward/std": 0.39657390117645264, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0002102968719555065, |
| "epoch": 0.3433029908972692, |
| "grad_norm": 0.9385675024218874, |
| "kl": 0.01544189453125, |
| "learning_rate": 1e-06, |
| "loss": 0.0096, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 331.515625, |
| "epoch": 0.3459037711313394, |
| "grad_norm": 0.8377549816770966, |
| "kl": 0.01513671875, |
| "learning_rate": 1e-06, |
| "loss": -0.014, |
| "reward": 0.519294261932373, |
| "reward_std": 0.36041052639484406, |
| "rewards/preference_model_reward": 0.519294261932373, |
| "rewards/preference_model_reward/std": 0.44742684066295624, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.00024680225760675967, |
| "epoch": 0.34850455136540964, |
| "grad_norm": 0.8569254578138688, |
| "kl": 0.015411376953125, |
| "learning_rate": 1e-06, |
| "loss": -0.0141, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 272.734375, |
| "epoch": 0.3511053315994798, |
| "grad_norm": 0.7108994159400903, |
| "kl": 0.01519775390625, |
| "learning_rate": 1e-06, |
| "loss": -0.0135, |
| "reward": 0.6890691518783569, |
| "reward_std": 0.3600848317146301, |
| "rewards/preference_model_reward": 0.6890691518783569, |
| "rewards/preference_model_reward/std": 0.3913665860891342, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.00033790143788792193, |
| "epoch": 0.35370611183355005, |
| "grad_norm": 0.7107451748958499, |
| "kl": 0.015350341796875, |
| "learning_rate": 1e-06, |
| "loss": -0.0136, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 304.53125, |
| "epoch": 0.3563068920676203, |
| "grad_norm": 0.7112930237728833, |
| "kl": 0.01519775390625, |
| "learning_rate": 1e-06, |
| "loss": -0.0098, |
| "reward": 0.7829216420650482, |
| "reward_std": 0.34336017072200775, |
| "rewards/preference_model_reward": 0.7829216420650482, |
| "rewards/preference_model_reward/std": 0.34507185220718384, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 7.81005946919322e-05, |
| "epoch": 0.3589076723016905, |
| "grad_norm": 0.7120408275310693, |
| "kl": 0.01519775390625, |
| "learning_rate": 1e-06, |
| "loss": -0.0099, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 338.390625, |
| "epoch": 0.36150845253576075, |
| "grad_norm": 0.7547500044898569, |
| "kl": 0.013763427734375, |
| "learning_rate": 1e-06, |
| "loss": -0.0076, |
| "reward": 0.5637124627828598, |
| "reward_std": 0.2805905416607857, |
| "rewards/preference_model_reward": 0.5637124627828598, |
| "rewards/preference_model_reward/std": 0.39679694175720215, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.00016640447574900463, |
| "epoch": 0.3641092327698309, |
| "grad_norm": 0.8421728078893187, |
| "kl": 0.01385498046875, |
| "learning_rate": 1e-06, |
| "loss": -0.0076, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 322.21875, |
| "epoch": 0.36671001300390116, |
| "grad_norm": 0.9221596077030108, |
| "kl": 0.01507568359375, |
| "learning_rate": 1e-06, |
| "loss": -0.0052, |
| "reward": 0.5962317585945129, |
| "reward_std": 0.4129558801651001, |
| "rewards/preference_model_reward": 0.5962317585945129, |
| "rewards/preference_model_reward/std": 0.43260352313518524, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0002706990926526487, |
| "epoch": 0.3693107932379714, |
| "grad_norm": 0.9429041940751776, |
| "kl": 0.01531982421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0051, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 383.921875, |
| "epoch": 0.3719115734720416, |
| "grad_norm": 0.7528201748690613, |
| "kl": 0.01739501953125, |
| "learning_rate": 1e-06, |
| "loss": 0.0005, |
| "reward": 0.8064576983451843, |
| "reward_std": 0.2952372878789902, |
| "rewards/preference_model_reward": 0.8064576983451843, |
| "rewards/preference_model_reward/std": 0.33498962223529816, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0002466185833327472, |
| "epoch": 0.37451235370611186, |
| "grad_norm": 0.7343692384638275, |
| "kl": 0.01751708984375, |
| "learning_rate": 1e-06, |
| "loss": 0.0005, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 331.671875, |
| "epoch": 0.37711313394018203, |
| "grad_norm": 0.6099207337964894, |
| "kl": 0.012603759765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0025, |
| "reward": 0.47770172357559204, |
| "reward_std": 0.2876940220594406, |
| "rewards/preference_model_reward": 0.47770172357559204, |
| "rewards/preference_model_reward/std": 0.46976715326309204, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.00027403252897784114, |
| "epoch": 0.37971391417425226, |
| "grad_norm": 0.6935396105177946, |
| "kl": 0.01275634765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0025, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 291.0625, |
| "epoch": 0.3823146944083225, |
| "grad_norm": 0.6431240710584735, |
| "kl": 0.015350341796875, |
| "learning_rate": 1e-06, |
| "loss": -0.0049, |
| "reward": 0.4983751177787781, |
| "reward_std": 0.3121785521507263, |
| "rewards/preference_model_reward": 0.4983751177787781, |
| "rewards/preference_model_reward/std": 0.4164520502090454, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.00021908171038376167, |
| "epoch": 0.38491547464239273, |
| "grad_norm": 0.644019538703639, |
| "kl": 0.01507568359375, |
| "learning_rate": 1e-06, |
| "loss": -0.0049, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 342.734375, |
| "epoch": 0.38751625487646296, |
| "grad_norm": 1.010397732794426, |
| "kl": 0.0172119140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0067, |
| "reward": 0.6010620892047882, |
| "reward_std": 0.39562711119651794, |
| "rewards/preference_model_reward": 0.6010620892047882, |
| "rewards/preference_model_reward/std": 0.4233546853065491, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.00015757188884890638, |
| "epoch": 0.39011703511053314, |
| "grad_norm": 0.9973363628824508, |
| "kl": 0.0172119140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0067, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.890625, |
| "epoch": 0.39271781534460337, |
| "grad_norm": 1.0100632336398712, |
| "kl": 0.01690673828125, |
| "learning_rate": 1e-06, |
| "loss": 0.0051, |
| "reward": 0.5471232235431671, |
| "reward_std": 0.408640593290329, |
| "rewards/preference_model_reward": 0.5471232235431671, |
| "rewards/preference_model_reward/std": 0.4592936784029007, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0002920969855040312, |
| "epoch": 0.3953185955786736, |
| "grad_norm": 0.9929713506706197, |
| "kl": 0.017333984375, |
| "learning_rate": 1e-06, |
| "loss": 0.005, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 378.53125, |
| "epoch": 0.39791937581274384, |
| "grad_norm": 0.7606426983085363, |
| "kl": 0.017578125, |
| "learning_rate": 1e-06, |
| "loss": -0.01, |
| "reward": 0.7775087058544159, |
| "reward_std": 0.34680168330669403, |
| "rewards/preference_model_reward": 0.7775087058544159, |
| "rewards/preference_model_reward/std": 0.35080482065677643, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0002456825313856825, |
| "epoch": 0.40052015604681407, |
| "grad_norm": 0.7710939909605372, |
| "kl": 0.0179443359375, |
| "learning_rate": 1e-06, |
| "loss": -0.01, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 250.796875, |
| "epoch": 0.40312093628088425, |
| "grad_norm": 0.5451857762518953, |
| "kl": 0.02081298828125, |
| "learning_rate": 1e-06, |
| "loss": -0.0028, |
| "reward": 0.5519833117723465, |
| "reward_std": 0.2390262335538864, |
| "rewards/preference_model_reward": 0.5519833117723465, |
| "rewards/preference_model_reward/std": 0.45026274025440216, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.00017806489631766453, |
| "epoch": 0.4057217165149545, |
| "grad_norm": 0.554072362585335, |
| "kl": 0.02093505859375, |
| "learning_rate": 1e-06, |
| "loss": -0.0028, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.078125, |
| "epoch": 0.4083224967490247, |
| "grad_norm": 0.5558645545689939, |
| "kl": 0.0223388671875, |
| "learning_rate": 1e-06, |
| "loss": 0.0021, |
| "reward": 0.7436761558055878, |
| "reward_std": 0.2672045975923538, |
| "rewards/preference_model_reward": 0.7436761558055878, |
| "rewards/preference_model_reward/std": 0.35222816467285156, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0003454240650171414, |
| "epoch": 0.41092327698309494, |
| "grad_norm": 0.6575119278635855, |
| "kl": 0.0224609375, |
| "learning_rate": 1e-06, |
| "loss": 0.0022, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.375, |
| "epoch": 0.4135240572171652, |
| "grad_norm": 0.7637043400607006, |
| "kl": 0.01995849609375, |
| "learning_rate": 1e-06, |
| "loss": 0.0036, |
| "reward": 0.5944642722606659, |
| "reward_std": 0.3134625256061554, |
| "rewards/preference_model_reward": 0.5944642722606659, |
| "rewards/preference_model_reward/std": 0.4180496633052826, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.00020651466911658645, |
| "epoch": 0.41612483745123535, |
| "grad_norm": 0.741675886273459, |
| "kl": 0.02020263671875, |
| "learning_rate": 1e-06, |
| "loss": 0.0035, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.96875, |
| "epoch": 0.4187256176853056, |
| "grad_norm": 0.5908208021088289, |
| "kl": 0.01922607421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0014, |
| "reward": 0.8729158341884613, |
| "reward_std": 0.2293628454208374, |
| "rewards/preference_model_reward": 0.8729158341884613, |
| "rewards/preference_model_reward/std": 0.2594291567802429, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.000392273606848903, |
| "epoch": 0.4213263979193758, |
| "grad_norm": 0.5839404720619328, |
| "kl": 0.01947021484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0013, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.125, |
| "epoch": 0.42392717815344605, |
| "grad_norm": 0.5173474832622873, |
| "kl": 0.01922607421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0006, |
| "reward": 0.8816950023174286, |
| "reward_std": 0.17555147409439087, |
| "rewards/preference_model_reward": 0.8816950023174286, |
| "rewards/preference_model_reward/std": 0.24751071631908417, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.00020699262677226216, |
| "epoch": 0.4265279583875163, |
| "grad_norm": 0.5253345880647052, |
| "kl": 0.01947021484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0006, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.40625, |
| "epoch": 0.42912873862158646, |
| "grad_norm": 0.642701967238361, |
| "kl": 0.02044677734375, |
| "learning_rate": 1e-06, |
| "loss": -0.0059, |
| "reward": 0.5235294103622437, |
| "reward_std": 0.3236909657716751, |
| "rewards/preference_model_reward": 0.5235294103622437, |
| "rewards/preference_model_reward/std": 0.3493155986070633, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.00025575608015060425, |
| "epoch": 0.4317295188556567, |
| "grad_norm": 0.6590828609212557, |
| "kl": 0.0208740234375, |
| "learning_rate": 1e-06, |
| "loss": -0.0059, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 322.296875, |
| "epoch": 0.4343302990897269, |
| "grad_norm": 0.6256801460180289, |
| "kl": 0.01947021484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0001, |
| "reward": 0.8114274740219116, |
| "reward_std": 0.26273050904273987, |
| "rewards/preference_model_reward": 0.8114274740219116, |
| "rewards/preference_model_reward/std": 0.33220019936561584, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.00031010911334306, |
| "epoch": 0.43693107932379716, |
| "grad_norm": 0.6426336745378424, |
| "kl": 0.02008056640625, |
| "learning_rate": 1e-06, |
| "loss": -0.0001, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 294.84375, |
| "epoch": 0.43953185955786733, |
| "grad_norm": 0.5161332894447401, |
| "kl": 0.02392578125, |
| "learning_rate": 1e-06, |
| "loss": -0.0001, |
| "reward": 0.572134867310524, |
| "reward_std": 0.21724799275398254, |
| "rewards/preference_model_reward": 0.572134867310524, |
| "rewards/preference_model_reward/std": 0.3613039702177048, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.0004953075695084408, |
| "epoch": 0.44213263979193757, |
| "grad_norm": 0.5949338340498116, |
| "kl": 0.02392578125, |
| "learning_rate": 1e-06, |
| "loss": -0.0001, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.921875, |
| "epoch": 0.4447334200260078, |
| "grad_norm": 0.770205409238572, |
| "kl": 0.02276611328125, |
| "learning_rate": 1e-06, |
| "loss": 0.0068, |
| "reward": 0.6543514132499695, |
| "reward_std": 0.3576260805130005, |
| "rewards/preference_model_reward": 0.6543514132499695, |
| "rewards/preference_model_reward/std": 0.3618515580892563, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0002886750335164834, |
| "epoch": 0.44733420026007803, |
| "grad_norm": 0.7881488307294554, |
| "kl": 0.02288818359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0068, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 256.265625, |
| "epoch": 0.44993498049414826, |
| "grad_norm": 0.6609030894984753, |
| "kl": 0.02105712890625, |
| "learning_rate": 1e-06, |
| "loss": 0.003, |
| "reward": 0.7981027066707611, |
| "reward_std": 0.23525837063789368, |
| "rewards/preference_model_reward": 0.7981027066707611, |
| "rewards/preference_model_reward/std": 0.3589998483657837, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.00019836763385683298, |
| "epoch": 0.45253576072821844, |
| "grad_norm": 0.5635575828296628, |
| "kl": 0.020751953125, |
| "learning_rate": 1e-06, |
| "loss": 0.003, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.453125, |
| "epoch": 0.45513654096228867, |
| "grad_norm": 0.8369196057998636, |
| "kl": 0.0262451171875, |
| "learning_rate": 1e-06, |
| "loss": -0.0051, |
| "reward": 0.7033383548259735, |
| "reward_std": 0.2920212224125862, |
| "rewards/preference_model_reward": 0.7033383548259735, |
| "rewards/preference_model_reward/std": 0.3324955254793167, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.00029345130315050483, |
| "epoch": 0.4577373211963589, |
| "grad_norm": 0.7558084723044584, |
| "kl": 0.02667236328125, |
| "learning_rate": 1e-06, |
| "loss": -0.0051, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.46875, |
| "epoch": 0.46033810143042914, |
| "grad_norm": 0.6043611202887026, |
| "kl": 0.02191162109375, |
| "learning_rate": 1e-06, |
| "loss": -0.0042, |
| "reward": 0.7082796394824982, |
| "reward_std": 0.2582213580608368, |
| "rewards/preference_model_reward": 0.7082796394824982, |
| "rewards/preference_model_reward/std": 0.369435116648674, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.00041518576472299173, |
| "epoch": 0.46293888166449937, |
| "grad_norm": 0.6071654228678836, |
| "kl": 0.0220947265625, |
| "learning_rate": 1e-06, |
| "loss": -0.0042, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.84375, |
| "epoch": 0.46553966189856955, |
| "grad_norm": 0.8328067968880296, |
| "kl": 0.0191650390625, |
| "learning_rate": 1e-06, |
| "loss": 0.0056, |
| "reward": 0.6826794147491455, |
| "reward_std": 0.39410941302776337, |
| "rewards/preference_model_reward": 0.6826794147491455, |
| "rewards/preference_model_reward/std": 0.41487593948841095, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0003094414860242978, |
| "epoch": 0.4681404421326398, |
| "grad_norm": 0.8355485459107456, |
| "kl": 0.0191650390625, |
| "learning_rate": 1e-06, |
| "loss": 0.0056, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 187.390625, |
| "epoch": 0.47074122236671, |
| "grad_norm": 0.9822709833700972, |
| "kl": 0.02197265625, |
| "learning_rate": 1e-06, |
| "loss": -0.0071, |
| "reward": 0.7466834783554077, |
| "reward_std": 0.3152329549193382, |
| "rewards/preference_model_reward": 0.7466834783554077, |
| "rewards/preference_model_reward/std": 0.3539666682481766, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.00015157322195591405, |
| "epoch": 0.47334200260078024, |
| "grad_norm": 0.6823337060903064, |
| "kl": 0.0220947265625, |
| "learning_rate": 1e-06, |
| "loss": -0.0071, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 366.078125, |
| "epoch": 0.4759427828348505, |
| "grad_norm": 0.5703827869859761, |
| "kl": 0.02264404296875, |
| "learning_rate": 1e-06, |
| "loss": 0.0004, |
| "reward": 0.760188639163971, |
| "reward_std": 0.2790553569793701, |
| "rewards/preference_model_reward": 0.760188639163971, |
| "rewards/preference_model_reward/std": 0.3441888093948364, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.00012506498387665488, |
| "epoch": 0.47854356306892065, |
| "grad_norm": 0.5735123344340435, |
| "kl": 0.02252197265625, |
| "learning_rate": 1e-06, |
| "loss": 0.0004, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 336.71875, |
| "epoch": 0.4811443433029909, |
| "grad_norm": 0.8852644183946003, |
| "kl": 0.0257568359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0035, |
| "reward": 0.7332727313041687, |
| "reward_std": 0.3565700799226761, |
| "rewards/preference_model_reward": 0.7332727313041687, |
| "rewards/preference_model_reward/std": 0.39707188308238983, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.00018871420979849063, |
| "epoch": 0.4837451235370611, |
| "grad_norm": 0.8348047351432295, |
| "kl": 0.0257568359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0035, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 334.140625, |
| "epoch": 0.48634590377113135, |
| "grad_norm": 0.4595696403338184, |
| "kl": 0.0205078125, |
| "learning_rate": 1e-06, |
| "loss": 0.0017, |
| "reward": 0.7008107006549835, |
| "reward_std": 0.137342881411314, |
| "rewards/preference_model_reward": 0.7008107006549835, |
| "rewards/preference_model_reward/std": 0.2884506806731224, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.00015839685511309654, |
| "epoch": 0.4889466840052016, |
| "grad_norm": 0.4569164783179551, |
| "kl": 0.0206298828125, |
| "learning_rate": 1e-06, |
| "loss": 0.0017, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 341.6875, |
| "epoch": 0.49154746423927176, |
| "grad_norm": 0.6567111545906984, |
| "kl": 0.0224609375, |
| "learning_rate": 1e-06, |
| "loss": -0.0003, |
| "reward": 0.7975671887397766, |
| "reward_std": 0.2910696864128113, |
| "rewards/preference_model_reward": 0.7975671887397766, |
| "rewards/preference_model_reward/std": 0.34180814027786255, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.00016007465819711797, |
| "epoch": 0.494148244473342, |
| "grad_norm": 0.6589594674011164, |
| "kl": 0.0228271484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0003, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 258.34375, |
| "epoch": 0.4967490247074122, |
| "grad_norm": 1.797805114255913, |
| "kl": 0.02618408203125, |
| "learning_rate": 1e-06, |
| "loss": -0.0071, |
| "reward": 0.8337420225143433, |
| "reward_std": 0.2474193051457405, |
| "rewards/preference_model_reward": 0.8337420225143433, |
| "rewards/preference_model_reward/std": 0.3199751079082489, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0002587813069112599, |
| "epoch": 0.49934980494148246, |
| "grad_norm": 0.5660164857332879, |
| "kl": 0.02655029296875, |
| "learning_rate": 1e-06, |
| "loss": -0.0071, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 288.46875, |
| "epoch": 0.5019505851755527, |
| "grad_norm": 0.6740716396818753, |
| "kl": 0.0220947265625, |
| "learning_rate": 1e-06, |
| "loss": -0.0029, |
| "reward": 0.6083263158798218, |
| "reward_std": 0.3238120675086975, |
| "rewards/preference_model_reward": 0.6083263158798218, |
| "rewards/preference_model_reward/std": 0.3790005147457123, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.00020429812866495922, |
| "epoch": 0.5045513654096229, |
| "grad_norm": 0.6657232986537709, |
| "kl": 0.02227783203125, |
| "learning_rate": 1e-06, |
| "loss": -0.0029, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.203125, |
| "epoch": 0.5071521456436932, |
| "grad_norm": 0.9139150980318331, |
| "kl": 0.01904296875, |
| "learning_rate": 1e-06, |
| "loss": -0.0009, |
| "reward": 0.7892916798591614, |
| "reward_std": 0.3459463268518448, |
| "rewards/preference_model_reward": 0.7892916798591614, |
| "rewards/preference_model_reward/std": 0.3491186946630478, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.00033000129042193294, |
| "epoch": 0.5097529258777633, |
| "grad_norm": 0.7044330356942116, |
| "kl": 0.019287109375, |
| "learning_rate": 1e-06, |
| "loss": -0.001, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 318.5, |
| "epoch": 0.5123537061118335, |
| "grad_norm": 1.097976379791459, |
| "kl": 0.0263671875, |
| "learning_rate": 1e-06, |
| "loss": 0.0085, |
| "reward": 0.6349725127220154, |
| "reward_std": 0.22165381908416748, |
| "rewards/preference_model_reward": 0.6349725127220154, |
| "rewards/preference_model_reward/std": 0.3097042515873909, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0003432096855249256, |
| "epoch": 0.5149544863459038, |
| "grad_norm": 1.0743598050872851, |
| "kl": 0.02667236328125, |
| "learning_rate": 1e-06, |
| "loss": 0.0085, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.09375, |
| "epoch": 0.517555266579974, |
| "grad_norm": 0.8055405857842888, |
| "kl": 0.0220947265625, |
| "learning_rate": 1e-06, |
| "loss": -0.0007, |
| "reward": 0.3948906809091568, |
| "reward_std": 0.2313927859067917, |
| "rewards/preference_model_reward": 0.3948906809091568, |
| "rewards/preference_model_reward/std": 0.44849249720573425, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.00019127286213915795, |
| "epoch": 0.5201560468140443, |
| "grad_norm": 0.7564346551970879, |
| "kl": 0.02227783203125, |
| "learning_rate": 1e-06, |
| "loss": -0.0007, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 372.8125, |
| "epoch": 0.5227568270481144, |
| "grad_norm": 0.8534629203388931, |
| "kl": 0.0289306640625, |
| "learning_rate": 1e-06, |
| "loss": 0.0042, |
| "reward": 0.753670871257782, |
| "reward_std": 0.3137911409139633, |
| "rewards/preference_model_reward": 0.753670871257782, |
| "rewards/preference_model_reward/std": 0.35431434214115143, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0001328055550402496, |
| "epoch": 0.5253576072821846, |
| "grad_norm": 0.8274306102017099, |
| "kl": 0.02947998046875, |
| "learning_rate": 1e-06, |
| "loss": 0.0042, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 187.734375, |
| "epoch": 0.5279583875162549, |
| "grad_norm": 0.5411453143013689, |
| "kl": 0.02337646484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0073, |
| "reward": 0.6845378577709198, |
| "reward_std": 0.35077695548534393, |
| "rewards/preference_model_reward": 0.6845378577709198, |
| "rewards/preference_model_reward/std": 0.381725937128067, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0005341880605556071, |
| "epoch": 0.5305591677503251, |
| "grad_norm": 0.5152105756393677, |
| "kl": 0.023681640625, |
| "learning_rate": 1e-06, |
| "loss": -0.0073, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 271.65625, |
| "epoch": 0.5331599479843954, |
| "grad_norm": 0.5219287048320751, |
| "kl": 0.022216796875, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "reward": 0.8503350913524628, |
| "reward_std": 0.2589127942919731, |
| "rewards/preference_model_reward": 0.8503350913524628, |
| "rewards/preference_model_reward/std": 0.2864740937948227, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0003061444003833458, |
| "epoch": 0.5357607282184655, |
| "grad_norm": 0.5250746479640336, |
| "kl": 0.0225830078125, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 360.9375, |
| "epoch": 0.5383615084525357, |
| "grad_norm": 0.9686490492017382, |
| "kl": 0.0242919921875, |
| "learning_rate": 1e-06, |
| "loss": -0.0064, |
| "reward": 0.8047667145729065, |
| "reward_std": 0.34497836232185364, |
| "rewards/preference_model_reward": 0.8047667145729065, |
| "rewards/preference_model_reward/std": 0.3676797151565552, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.00017128348554251716, |
| "epoch": 0.540962288686606, |
| "grad_norm": 0.8929961952129637, |
| "kl": 0.02447509765625, |
| "learning_rate": 1e-06, |
| "loss": -0.0064, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 333.5625, |
| "epoch": 0.5435630689206762, |
| "grad_norm": 0.4541825457318801, |
| "kl": 0.02734375, |
| "learning_rate": 1e-06, |
| "loss": -0.0084, |
| "reward": 0.8458181023597717, |
| "reward_std": 0.23653991520404816, |
| "rewards/preference_model_reward": 0.8458181023597717, |
| "rewards/preference_model_reward/std": 0.2930053174495697, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0007538689824286848, |
| "epoch": 0.5461638491547465, |
| "grad_norm": 0.45249924575704586, |
| "kl": 0.02783203125, |
| "learning_rate": 1e-06, |
| "loss": -0.0085, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 309.703125, |
| "epoch": 0.5487646293888166, |
| "grad_norm": 1.0654026235581544, |
| "kl": 0.031982421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0069, |
| "reward": 0.5554362535476685, |
| "reward_std": 0.45049290359020233, |
| "rewards/preference_model_reward": 0.5554362535476685, |
| "rewards/preference_model_reward/std": 0.4461488127708435, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 9.597632015356794e-05, |
| "epoch": 0.5513654096228868, |
| "grad_norm": 1.0653292288478493, |
| "kl": 0.032470703125, |
| "learning_rate": 1e-06, |
| "loss": -0.0068, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 348.078125, |
| "epoch": 0.5539661898569571, |
| "grad_norm": 0.8314788226660293, |
| "kl": 0.02728271484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0033, |
| "reward": 0.842810720205307, |
| "reward_std": 0.2719964236021042, |
| "rewards/preference_model_reward": 0.842810720205307, |
| "rewards/preference_model_reward/std": 0.317636638879776, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0002034323406405747, |
| "epoch": 0.5565669700910273, |
| "grad_norm": 0.8392092498413791, |
| "kl": 0.0277099609375, |
| "learning_rate": 1e-06, |
| "loss": -0.0033, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 389.40625, |
| "epoch": 0.5591677503250976, |
| "grad_norm": 0.9192249339442864, |
| "kl": 0.03179931640625, |
| "learning_rate": 1e-06, |
| "loss": -0.0103, |
| "reward": 0.622369259595871, |
| "reward_std": 0.35231079161167145, |
| "rewards/preference_model_reward": 0.622369259595871, |
| "rewards/preference_model_reward/std": 0.38653427362442017, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0005518103025679011, |
| "epoch": 0.5617685305591678, |
| "grad_norm": 0.9297117283091114, |
| "kl": 0.0323486328125, |
| "learning_rate": 1e-06, |
| "loss": -0.0103, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.859375, |
| "epoch": 0.5643693107932379, |
| "grad_norm": 0.9336213718132743, |
| "kl": 0.0318603515625, |
| "learning_rate": 1e-06, |
| "loss": 0.0165, |
| "reward": 0.7810149788856506, |
| "reward_std": 0.30521372705698013, |
| "rewards/preference_model_reward": 0.7810149788856506, |
| "rewards/preference_model_reward/std": 0.3358805924654007, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.00014426001871470362, |
| "epoch": 0.5669700910273082, |
| "grad_norm": 0.9051217250973354, |
| "kl": 0.03265380859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0165, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 320.59375, |
| "epoch": 0.5695708712613784, |
| "grad_norm": 0.7224839222870695, |
| "kl": 0.02740478515625, |
| "learning_rate": 1e-06, |
| "loss": -0.0023, |
| "reward": 0.7635192573070526, |
| "reward_std": 0.324033185839653, |
| "rewards/preference_model_reward": 0.7635192573070526, |
| "rewards/preference_model_reward/std": 0.353701576590538, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0002873854355129879, |
| "epoch": 0.5721716514954487, |
| "grad_norm": 0.7595301461919822, |
| "kl": 0.0274658203125, |
| "learning_rate": 1e-06, |
| "loss": -0.0024, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 388.3125, |
| "epoch": 0.5747724317295189, |
| "grad_norm": 0.9070734191648187, |
| "kl": 0.025146484375, |
| "learning_rate": 1e-06, |
| "loss": -0.0015, |
| "reward": 0.7658654153347015, |
| "reward_std": 0.2642327696084976, |
| "rewards/preference_model_reward": 0.7658654153347015, |
| "rewards/preference_model_reward/std": 0.3203909620642662, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.00011779637134168297, |
| "epoch": 0.577373211963589, |
| "grad_norm": 0.7137387787434728, |
| "kl": 0.02520751953125, |
| "learning_rate": 1e-06, |
| "loss": -0.0015, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 256.203125, |
| "epoch": 0.5799739921976593, |
| "grad_norm": 0.5417591118857858, |
| "kl": 0.0323486328125, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "reward": 0.7153788208961487, |
| "reward_std": 0.29760105162858963, |
| "rewards/preference_model_reward": 0.7153788208961487, |
| "rewards/preference_model_reward/std": 0.3605159521102905, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0002042335836449638, |
| "epoch": 0.5825747724317295, |
| "grad_norm": 0.541117637465139, |
| "kl": 0.032470703125, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.75, |
| "epoch": 0.5851755526657998, |
| "grad_norm": 0.7271472193596403, |
| "kl": 0.02532958984375, |
| "learning_rate": 1e-06, |
| "loss": 0.0076, |
| "reward": 0.37321533262729645, |
| "reward_std": 0.22989524900913239, |
| "rewards/preference_model_reward": 0.37321533262729645, |
| "rewards/preference_model_reward/std": 0.4283030182123184, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0003217906632926315, |
| "epoch": 0.58777633289987, |
| "grad_norm": 0.7195866194223471, |
| "kl": 0.025634765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0075, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 323.0625, |
| "epoch": 0.5903771131339401, |
| "grad_norm": 0.8041443917617668, |
| "kl": 0.03070068359375, |
| "learning_rate": 1e-06, |
| "loss": -0.0025, |
| "reward": 0.7684324085712433, |
| "reward_std": 0.36028069257736206, |
| "rewards/preference_model_reward": 0.7684324085712433, |
| "rewards/preference_model_reward/std": 0.3624133765697479, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.00026055803027702495, |
| "epoch": 0.5929778933680104, |
| "grad_norm": 0.8283620393853188, |
| "kl": 0.03125, |
| "learning_rate": 1e-06, |
| "loss": -0.0024, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 283.03125, |
| "epoch": 0.5955786736020806, |
| "grad_norm": 0.6860393905348983, |
| "kl": 0.031494140625, |
| "learning_rate": 1e-06, |
| "loss": -0.0018, |
| "reward": 0.7659209966659546, |
| "reward_std": 0.26482800394296646, |
| "rewards/preference_model_reward": 0.7659209966659546, |
| "rewards/preference_model_reward/std": 0.37613604962825775, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0002554464590502903, |
| "epoch": 0.5981794538361509, |
| "grad_norm": 0.6751506226066358, |
| "kl": 0.03173828125, |
| "learning_rate": 1e-06, |
| "loss": -0.0019, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.890625, |
| "epoch": 0.6007802340702211, |
| "grad_norm": 0.6931747416922038, |
| "kl": 0.03240966796875, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "reward": 0.7967671155929565, |
| "reward_std": 0.3124672695994377, |
| "rewards/preference_model_reward": 0.7967671155929565, |
| "rewards/preference_model_reward/std": 0.3221246153116226, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.00034807444899342954, |
| "epoch": 0.6033810143042913, |
| "grad_norm": 0.8871992669497728, |
| "kl": 0.03277587890625, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 272.71875, |
| "epoch": 0.6059817945383615, |
| "grad_norm": 0.22600317259271924, |
| "kl": 0.03021240234375, |
| "learning_rate": 1e-06, |
| "loss": -0.001, |
| "reward": 0.7308064997196198, |
| "reward_std": 0.09175470843911171, |
| "rewards/preference_model_reward": 0.7308064997196198, |
| "rewards/preference_model_reward/std": 0.3288180008530617, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.00035572612250689417, |
| "epoch": 0.6085825747724317, |
| "grad_norm": 0.2269448326022545, |
| "kl": 0.0306396484375, |
| "learning_rate": 1e-06, |
| "loss": -0.001, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 288.0, |
| "epoch": 0.611183355006502, |
| "grad_norm": 0.5678201319537751, |
| "kl": 0.0279541015625, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "reward": 0.7927112579345703, |
| "reward_std": 0.2680581137537956, |
| "rewards/preference_model_reward": 0.7927112579345703, |
| "rewards/preference_model_reward/std": 0.29819803684949875, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0005594654503511265, |
| "epoch": 0.6137841352405722, |
| "grad_norm": 0.5696451305988013, |
| "kl": 0.0277099609375, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 387.71875, |
| "epoch": 0.6163849154746424, |
| "grad_norm": 0.2826076441567154, |
| "kl": 0.0311279296875, |
| "learning_rate": 1e-06, |
| "loss": 0.0002, |
| "reward": 0.9802200198173523, |
| "reward_std": 0.0791199654340744, |
| "rewards/preference_model_reward": 0.9802200198173523, |
| "rewards/preference_model_reward/std": 0.08191458880901337, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0001503910607425496, |
| "epoch": 0.6189856957087126, |
| "grad_norm": 0.27566046682842354, |
| "kl": 0.03076171875, |
| "learning_rate": 1e-06, |
| "loss": 0.0002, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 278.359375, |
| "epoch": 0.6215864759427828, |
| "grad_norm": 0.6496794092367957, |
| "kl": 0.0299072265625, |
| "learning_rate": 1e-06, |
| "loss": 0.0041, |
| "reward": 0.8209348022937775, |
| "reward_std": 0.2692207768559456, |
| "rewards/preference_model_reward": 0.8209348022937775, |
| "rewards/preference_model_reward/std": 0.3124052509665489, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.00021162991833989508, |
| "epoch": 0.6241872561768531, |
| "grad_norm": 0.6797772811922748, |
| "kl": 0.02978515625, |
| "learning_rate": 1e-06, |
| "loss": 0.0041, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.09375, |
| "epoch": 0.6267880364109233, |
| "grad_norm": 0.6610475273967841, |
| "kl": 0.03094482421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0017, |
| "reward": 0.5471342355012894, |
| "reward_std": 0.3384169638156891, |
| "rewards/preference_model_reward": 0.5471342355012894, |
| "rewards/preference_model_reward/std": 0.3545994460582733, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.00036541650479193777, |
| "epoch": 0.6293888166449935, |
| "grad_norm": 0.6636307821447273, |
| "kl": 0.03094482421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0018, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 314.640625, |
| "epoch": 0.6319895968790638, |
| "grad_norm": 1.007770555329949, |
| "kl": 0.0347900390625, |
| "learning_rate": 1e-06, |
| "loss": -0.0072, |
| "reward": 0.8875998258590698, |
| "reward_std": 0.22977813333272934, |
| "rewards/preference_model_reward": 0.8875998258590698, |
| "rewards/preference_model_reward/std": 0.26288190484046936, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.00036119218566454947, |
| "epoch": 0.6345903771131339, |
| "grad_norm": 0.5276320493290289, |
| "kl": 0.0350341796875, |
| "learning_rate": 1e-06, |
| "loss": -0.0072, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 263.828125, |
| "epoch": 0.6371911573472041, |
| "grad_norm": 0.6961169112868046, |
| "kl": 0.03265380859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0047, |
| "reward": 0.7485232055187225, |
| "reward_std": 0.27340711653232574, |
| "rewards/preference_model_reward": 0.7485232055187225, |
| "rewards/preference_model_reward/std": 0.35039061307907104, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.000293049102765508, |
| "epoch": 0.6397919375812744, |
| "grad_norm": 0.9025214332063679, |
| "kl": 0.0330810546875, |
| "learning_rate": 1e-06, |
| "loss": 0.0047, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 362.390625, |
| "epoch": 0.6423927178153446, |
| "grad_norm": 0.9207891120898004, |
| "kl": 0.034423828125, |
| "learning_rate": 1e-06, |
| "loss": 0.0009, |
| "reward": 0.6746810376644135, |
| "reward_std": 0.37852445244789124, |
| "rewards/preference_model_reward": 0.6746810376644135, |
| "rewards/preference_model_reward/std": 0.39785870909690857, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0004364640772109851, |
| "epoch": 0.6449934980494149, |
| "grad_norm": 0.9428596077830829, |
| "kl": 0.0345458984375, |
| "learning_rate": 1e-06, |
| "loss": 0.0008, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 355.953125, |
| "epoch": 0.647594278283485, |
| "grad_norm": 0.9002426185767372, |
| "kl": 0.0325927734375, |
| "learning_rate": 1e-06, |
| "loss": 0.0043, |
| "reward": 0.7399424314498901, |
| "reward_std": 0.34358392655849457, |
| "rewards/preference_model_reward": 0.7399424314498901, |
| "rewards/preference_model_reward/std": 0.41077572107315063, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0002715677837841213, |
| "epoch": 0.6501950585175552, |
| "grad_norm": 0.9388574486276239, |
| "kl": 0.033447265625, |
| "learning_rate": 1e-06, |
| "loss": 0.0043, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 275.640625, |
| "epoch": 0.6527958387516255, |
| "grad_norm": 0.6358294351644407, |
| "kl": 0.031005859375, |
| "learning_rate": 1e-06, |
| "loss": -0.008, |
| "reward": 0.7702620029449463, |
| "reward_std": 0.30297737568616867, |
| "rewards/preference_model_reward": 0.7702620029449463, |
| "rewards/preference_model_reward/std": 0.31037599593400955, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 7.407407247228548e-05, |
| "epoch": 0.6553966189856957, |
| "grad_norm": 0.6496709525090261, |
| "kl": 0.03118896484375, |
| "learning_rate": 1e-06, |
| "loss": -0.008, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 383.171875, |
| "epoch": 0.657997399219766, |
| "grad_norm": 0.9259779511089412, |
| "kl": 0.0374755859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0011, |
| "reward": 0.7584672272205353, |
| "reward_std": 0.34140945971012115, |
| "rewards/preference_model_reward": 0.7584672272205353, |
| "rewards/preference_model_reward/std": 0.36738522350788116, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.00042696832679212093, |
| "epoch": 0.6605981794538361, |
| "grad_norm": 0.9195597011995847, |
| "kl": 0.0379638671875, |
| "learning_rate": 1e-06, |
| "loss": 0.0011, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.90625, |
| "epoch": 0.6631989596879063, |
| "grad_norm": 0.7640236735659361, |
| "kl": 0.0379638671875, |
| "learning_rate": 1e-06, |
| "loss": -0.0019, |
| "reward": 0.8538275361061096, |
| "reward_std": 0.23895438015460968, |
| "rewards/preference_model_reward": 0.8538275361061096, |
| "rewards/preference_model_reward/std": 0.3124992400407791, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.00029875659674871713, |
| "epoch": 0.6657997399219766, |
| "grad_norm": 0.9201327061263956, |
| "kl": 0.0382080078125, |
| "learning_rate": 1e-06, |
| "loss": -0.0018, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 360.078125, |
| "epoch": 0.6684005201560468, |
| "grad_norm": 0.627065020941867, |
| "kl": 0.041015625, |
| "learning_rate": 1e-06, |
| "loss": -0.01, |
| "reward": 0.8764857351779938, |
| "reward_std": 0.20473513007164001, |
| "rewards/preference_model_reward": 0.8764857351779938, |
| "rewards/preference_model_reward/std": 0.20165851712226868, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.00010048231342807412, |
| "epoch": 0.6710013003901171, |
| "grad_norm": 0.6604841172659577, |
| "kl": 0.04052734375, |
| "learning_rate": 1e-06, |
| "loss": -0.01, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 343.15625, |
| "epoch": 0.6736020806241872, |
| "grad_norm": 0.5406956351974641, |
| "kl": 0.0341796875, |
| "learning_rate": 1e-06, |
| "loss": 0.0003, |
| "reward": 0.6130577623844147, |
| "reward_std": 0.18850323930382729, |
| "rewards/preference_model_reward": 0.6130577623844147, |
| "rewards/preference_model_reward/std": 0.4316726624965668, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.00025136396288871765, |
| "epoch": 0.6762028608582574, |
| "grad_norm": 0.5511157087672558, |
| "kl": 0.0343017578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0003, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.421875, |
| "epoch": 0.6788036410923277, |
| "grad_norm": 0.9136732590296446, |
| "kl": 0.0321044921875, |
| "learning_rate": 1e-06, |
| "loss": 0.0049, |
| "reward": 0.7294110059738159, |
| "reward_std": 0.36171969771385193, |
| "rewards/preference_model_reward": 0.7294110059738159, |
| "rewards/preference_model_reward/std": 0.3690430223941803, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0002961536665679887, |
| "epoch": 0.6814044213263979, |
| "grad_norm": 0.9224194439572596, |
| "kl": 0.03228759765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0049, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.265625, |
| "epoch": 0.6840052015604682, |
| "grad_norm": 1.0125086260119556, |
| "kl": 0.02874755859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0151, |
| "reward": 0.6575124561786652, |
| "reward_std": 0.3728269934654236, |
| "rewards/preference_model_reward": 0.6575124561786652, |
| "rewards/preference_model_reward/std": 0.377619668841362, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.00027208159008296207, |
| "epoch": 0.6866059817945384, |
| "grad_norm": 0.9988921149577585, |
| "kl": 0.02899169921875, |
| "learning_rate": 1e-06, |
| "loss": 0.015, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.328125, |
| "epoch": 0.6892067620286085, |
| "grad_norm": 0.48044088482562075, |
| "kl": 0.03973388671875, |
| "learning_rate": 1e-06, |
| "loss": -0.0047, |
| "reward": 0.5718486905097961, |
| "reward_std": 0.15403037518262863, |
| "rewards/preference_model_reward": 0.5718486905097961, |
| "rewards/preference_model_reward/std": 0.452066108584404, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0002960472193080932, |
| "epoch": 0.6918075422626788, |
| "grad_norm": 0.43607234235741066, |
| "kl": 0.04034423828125, |
| "learning_rate": 1e-06, |
| "loss": -0.0047, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 407.84375, |
| "epoch": 0.694408322496749, |
| "grad_norm": 0.05537902433689471, |
| "kl": 0.0294189453125, |
| "learning_rate": 1e-06, |
| "loss": -0.0003, |
| "reward": 0.750793993473053, |
| "reward_std": 0.02123763016425073, |
| "rewards/preference_model_reward": 0.750793993473053, |
| "rewards/preference_model_reward/std": 0.27511218935251236, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "epoch": 0.6970091027308193, |
| "grad_norm": 0.05537572405853725, |
| "kl": 0.0291748046875, |
| "learning_rate": 1e-06, |
| "loss": -0.0003, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 406.765625, |
| "epoch": 0.6996098829648895, |
| "grad_norm": 0.859848076243301, |
| "kl": 0.0335693359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0232, |
| "reward": 0.7944641709327698, |
| "reward_std": 0.25520364195108414, |
| "rewards/preference_model_reward": 0.7944641709327698, |
| "rewards/preference_model_reward/std": 0.28688880801200867, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.00023486852296628058, |
| "epoch": 0.7022106631989596, |
| "grad_norm": 0.8205533173715761, |
| "kl": 0.0333251953125, |
| "learning_rate": 1e-06, |
| "loss": 0.0233, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 281.796875, |
| "epoch": 0.7048114434330299, |
| "grad_norm": 0.36632377721796217, |
| "kl": 0.0333251953125, |
| "learning_rate": 1e-06, |
| "loss": -0.001, |
| "reward": 0.48682448267936707, |
| "reward_std": 0.16525335051119328, |
| "rewards/preference_model_reward": 0.48682448267936707, |
| "rewards/preference_model_reward/std": 0.4657934308052063, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0002138561540050432, |
| "epoch": 0.7074122236671001, |
| "grad_norm": 0.38569263532759096, |
| "kl": 0.03302001953125, |
| "learning_rate": 1e-06, |
| "loss": -0.001, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.09375, |
| "epoch": 0.7100130039011704, |
| "grad_norm": 0.5414845135880526, |
| "kl": 0.03057861328125, |
| "learning_rate": 1e-06, |
| "loss": -0.0044, |
| "reward": 0.8333463966846466, |
| "reward_std": 0.21173010021448135, |
| "rewards/preference_model_reward": 0.8333463966846466, |
| "rewards/preference_model_reward/std": 0.3135555535554886, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.00042767466220539063, |
| "epoch": 0.7126137841352406, |
| "grad_norm": 0.5814268658836278, |
| "kl": 0.0303955078125, |
| "learning_rate": 1e-06, |
| "loss": -0.0044, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 300.625, |
| "epoch": 0.7152145643693107, |
| "grad_norm": 0.29110876248573814, |
| "kl": 0.0308837890625, |
| "learning_rate": 1e-06, |
| "loss": -0.0006, |
| "reward": 0.9611281454563141, |
| "reward_std": 0.0763670519227162, |
| "rewards/preference_model_reward": 0.9611281454563141, |
| "rewards/preference_model_reward/std": 0.1133881090208888, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.00015091049135662615, |
| "epoch": 0.717815344603381, |
| "grad_norm": 0.2924332763673859, |
| "kl": 0.03094482421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0006, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 367.765625, |
| "epoch": 0.7204161248374512, |
| "grad_norm": 0.6810325664521416, |
| "kl": 0.03387451171875, |
| "learning_rate": 1e-06, |
| "loss": -0.004, |
| "reward": 0.5777665078639984, |
| "reward_std": 0.2041564560495317, |
| "rewards/preference_model_reward": 0.5777665078639984, |
| "rewards/preference_model_reward/std": 0.4507894814014435, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0004186662699794397, |
| "epoch": 0.7230169050715215, |
| "grad_norm": 0.6837062367984963, |
| "kl": 0.033935546875, |
| "learning_rate": 1e-06, |
| "loss": -0.0039, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 193.3125, |
| "epoch": 0.7256176853055917, |
| "grad_norm": 0.4916940767331736, |
| "kl": 0.0399169921875, |
| "learning_rate": 1e-06, |
| "loss": -0.0046, |
| "reward": 0.7320626676082611, |
| "reward_std": 0.30857832729816437, |
| "rewards/preference_model_reward": 0.7320626676082611, |
| "rewards/preference_model_reward/std": 0.3521760255098343, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.00017064846178982407, |
| "epoch": 0.7282184655396619, |
| "grad_norm": 0.49237527836693745, |
| "kl": 0.0400390625, |
| "learning_rate": 1e-06, |
| "loss": -0.0046, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.984375, |
| "epoch": 0.7308192457737321, |
| "grad_norm": 0.7857224409730065, |
| "kl": 0.0343017578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0194, |
| "reward": 0.6453294306993484, |
| "reward_std": 0.33257874846458435, |
| "rewards/preference_model_reward": 0.6453294306993484, |
| "rewards/preference_model_reward/std": 0.33213719725608826, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.000580175816139672, |
| "epoch": 0.7334200260078023, |
| "grad_norm": 0.7785268642528227, |
| "kl": 0.03460693359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0194, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 460.59375, |
| "epoch": 0.7360208062418726, |
| "grad_norm": 0.4261062042986948, |
| "kl": 0.0321044921875, |
| "learning_rate": 1e-06, |
| "loss": -0.0108, |
| "reward": 0.943671464920044, |
| "reward_std": 0.15016759932041168, |
| "rewards/preference_model_reward": 0.943671464920044, |
| "rewards/preference_model_reward/std": 0.21713975816965103, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 5.9215664805378765e-05, |
| "epoch": 0.7386215864759428, |
| "grad_norm": 0.42689251687876145, |
| "kl": 0.0323486328125, |
| "learning_rate": 1e-06, |
| "loss": -0.0108, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 398.328125, |
| "epoch": 0.741222366710013, |
| "grad_norm": 0.8784480579767534, |
| "kl": 0.0347900390625, |
| "learning_rate": 1e-06, |
| "loss": -0.0157, |
| "reward": 0.7145366668701172, |
| "reward_std": 0.3674587905406952, |
| "rewards/preference_model_reward": 0.7145366668701172, |
| "rewards/preference_model_reward/std": 0.37918268144130707, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.00027935014077229425, |
| "epoch": 0.7438231469440832, |
| "grad_norm": 0.8835084511138045, |
| "kl": 0.03515625, |
| "learning_rate": 1e-06, |
| "loss": -0.0157, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 307.59375, |
| "epoch": 0.7464239271781534, |
| "grad_norm": 0.4936033601748067, |
| "kl": 0.0343017578125, |
| "learning_rate": 1e-06, |
| "loss": -0.0056, |
| "reward": 0.876055896282196, |
| "reward_std": 0.22134239226579666, |
| "rewards/preference_model_reward": 0.876055896282196, |
| "rewards/preference_model_reward/std": 0.27353301644325256, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.00022383942268788815, |
| "epoch": 0.7490247074122237, |
| "grad_norm": 0.48882075708940054, |
| "kl": 0.0345458984375, |
| "learning_rate": 1e-06, |
| "loss": -0.0056, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 302.296875, |
| "epoch": 0.7516254876462939, |
| "grad_norm": 0.6099100220796305, |
| "kl": 0.03253173828125, |
| "learning_rate": 1e-06, |
| "loss": 0.0018, |
| "reward": 0.6450457721948624, |
| "reward_std": 0.16428439319133759, |
| "rewards/preference_model_reward": 0.6450457721948624, |
| "rewards/preference_model_reward/std": 0.4192444086074829, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 5.056634472566657e-05, |
| "epoch": 0.7542262678803641, |
| "grad_norm": 0.6204674610660649, |
| "kl": 0.0330810546875, |
| "learning_rate": 1e-06, |
| "loss": 0.0018, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 273.171875, |
| "epoch": 0.7568270481144344, |
| "grad_norm": 0.4785669921706492, |
| "kl": 0.0518798828125, |
| "learning_rate": 1e-06, |
| "loss": -0.0009, |
| "reward": 0.8897781372070312, |
| "reward_std": 0.1889289878308773, |
| "rewards/preference_model_reward": 0.8897781372070312, |
| "rewards/preference_model_reward/std": 0.23168149590492249, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0002485646546119824, |
| "epoch": 0.7594278283485045, |
| "grad_norm": 0.47439042676333165, |
| "kl": 0.05322265625, |
| "learning_rate": 1e-06, |
| "loss": -0.0009, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 399.4375, |
| "epoch": 0.7620286085825748, |
| "grad_norm": 0.9071650770110183, |
| "kl": 0.037353515625, |
| "learning_rate": 1e-06, |
| "loss": 0.0054, |
| "reward": 0.5175963789224625, |
| "reward_std": 0.266024149954319, |
| "rewards/preference_model_reward": 0.5175963789224625, |
| "rewards/preference_model_reward/std": 0.4135463237762451, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.00034057936863973737, |
| "epoch": 0.764629388816645, |
| "grad_norm": 0.8822613784734961, |
| "kl": 0.037353515625, |
| "learning_rate": 1e-06, |
| "loss": 0.0054, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 338.1875, |
| "epoch": 0.7672301690507152, |
| "grad_norm": 0.7861793558844458, |
| "kl": 0.035400390625, |
| "learning_rate": 1e-06, |
| "loss": -0.0005, |
| "reward": 0.7742494642734528, |
| "reward_std": 0.2848004475235939, |
| "rewards/preference_model_reward": 0.7742494642734528, |
| "rewards/preference_model_reward/std": 0.3529687523841858, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0001368406847177539, |
| "epoch": 0.7698309492847855, |
| "grad_norm": 0.7910539014156112, |
| "kl": 0.03515625, |
| "learning_rate": 1e-06, |
| "loss": -0.0005, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 272.359375, |
| "epoch": 0.7724317295188556, |
| "grad_norm": 0.5546679508846072, |
| "kl": 0.0374755859375, |
| "learning_rate": 1e-06, |
| "loss": -0.005, |
| "reward": 0.8659342527389526, |
| "reward_std": 0.1929013878107071, |
| "rewards/preference_model_reward": 0.8659342527389526, |
| "rewards/preference_model_reward/std": 0.2005062848329544, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 9.600614430382848e-05, |
| "epoch": 0.7750325097529259, |
| "grad_norm": 0.49104877942341174, |
| "kl": 0.037353515625, |
| "learning_rate": 1e-06, |
| "loss": -0.005, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.375, |
| "epoch": 0.7776332899869961, |
| "grad_norm": 0.5420595207231925, |
| "kl": 0.044921875, |
| "learning_rate": 1e-06, |
| "loss": -0.0076, |
| "reward": 0.7609247267246246, |
| "reward_std": 0.2636025846004486, |
| "rewards/preference_model_reward": 0.7609247267246246, |
| "rewards/preference_model_reward/std": 0.31968455016613007, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.00036888108297716826, |
| "epoch": 0.7802340702210663, |
| "grad_norm": 0.5424021192905694, |
| "kl": 0.044921875, |
| "learning_rate": 1e-06, |
| "loss": -0.0076, |
| "step": 300 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1280, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 150, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|