| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 66, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.030303030303030304, | |
| "grad_norm": 3.359375, | |
| "kl_divergence": 0.0, | |
| "learning_rate": 2e-05, | |
| "log_prob_preferred": -40.3125, | |
| "log_prob_rejected": -38.6875, | |
| "loss": 0.69140625, | |
| "policy_diff": -1.625, | |
| "ref_diff": -1.625, | |
| "reward_margin": 0.0, | |
| "step": 1, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.06060606060606061, | |
| "grad_norm": 2.28125, | |
| "kl_divergence": -0.00021457672119140625, | |
| "learning_rate": 1.96969696969697e-05, | |
| "log_prob_preferred": -38.5, | |
| "log_prob_rejected": -37.5625, | |
| "loss": 0.6796875, | |
| "policy_diff": -0.9375, | |
| "ref_diff": -1.0625, | |
| "reward_margin": 0.0250244140625, | |
| "step": 2, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.09090909090909091, | |
| "grad_norm": 1.921875, | |
| "kl_divergence": -0.000499725341796875, | |
| "learning_rate": 1.9393939393939395e-05, | |
| "log_prob_preferred": -37.3125, | |
| "log_prob_rejected": -36.5625, | |
| "loss": 0.6796875, | |
| "policy_diff": -0.75, | |
| "ref_diff": -0.875, | |
| "reward_margin": 0.0250244140625, | |
| "step": 3, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.12121212121212122, | |
| "grad_norm": 1.84375, | |
| "kl_divergence": -0.0011119842529296875, | |
| "learning_rate": 1.9090909090909094e-05, | |
| "log_prob_preferred": -37.375, | |
| "log_prob_rejected": -36.3125, | |
| "loss": 0.6796875, | |
| "policy_diff": -1.0625, | |
| "ref_diff": -1.1875, | |
| "reward_margin": 0.0250244140625, | |
| "step": 4, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.15151515151515152, | |
| "grad_norm": 2.546875, | |
| "kl_divergence": -0.0003497600555419922, | |
| "learning_rate": 1.8787878787878792e-05, | |
| "log_prob_preferred": -35.875, | |
| "log_prob_rejected": -34.65625, | |
| "loss": 0.65625, | |
| "policy_diff": -1.21875, | |
| "ref_diff": -1.59375, | |
| "reward_margin": 0.07513427734375, | |
| "step": 5, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.18181818181818182, | |
| "grad_norm": 1.71875, | |
| "kl_divergence": -0.00025653839111328125, | |
| "learning_rate": 1.8484848484848487e-05, | |
| "log_prob_preferred": -38.375, | |
| "log_prob_rejected": -37.5, | |
| "loss": 0.66796875, | |
| "policy_diff": -0.875, | |
| "ref_diff": -1.125, | |
| "reward_margin": 0.050048828125, | |
| "step": 6, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.21212121212121213, | |
| "grad_norm": 2.03125, | |
| "kl_divergence": -6.866455078125e-05, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "log_prob_preferred": -38.625, | |
| "log_prob_rejected": -37.3125, | |
| "loss": 0.66796875, | |
| "policy_diff": -1.3125, | |
| "ref_diff": -1.5625, | |
| "reward_margin": 0.050048828125, | |
| "step": 7, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.24242424242424243, | |
| "grad_norm": 1.6015625, | |
| "kl_divergence": 0.00064849853515625, | |
| "learning_rate": 1.787878787878788e-05, | |
| "log_prob_preferred": -38.5625, | |
| "log_prob_rejected": -37.75, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.8125, | |
| "ref_diff": -1.125, | |
| "reward_margin": 0.06256103515625, | |
| "step": 8, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.2727272727272727, | |
| "grad_norm": 1.734375, | |
| "kl_divergence": 0.0003542900085449219, | |
| "learning_rate": 1.7575757575757576e-05, | |
| "log_prob_preferred": -39.5625, | |
| "log_prob_rejected": -38.75, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.8125, | |
| "ref_diff": -1.125, | |
| "reward_margin": 0.06256103515625, | |
| "step": 9, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.30303030303030304, | |
| "grad_norm": 1.3515625, | |
| "kl_divergence": -7.82012939453125e-05, | |
| "learning_rate": 1.7272727272727274e-05, | |
| "log_prob_preferred": -36.9375, | |
| "log_prob_rejected": -36.375, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.5625, | |
| "ref_diff": -0.875, | |
| "reward_margin": 0.06256103515625, | |
| "step": 10, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 1.71875, | |
| "kl_divergence": 0.0005383491516113281, | |
| "learning_rate": 1.6969696969696972e-05, | |
| "log_prob_preferred": -35.6875, | |
| "log_prob_rejected": -34.75, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.9375, | |
| "ref_diff": -1.25, | |
| "reward_margin": 0.06256103515625, | |
| "step": 11, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.36363636363636365, | |
| "grad_norm": 1.6328125, | |
| "kl_divergence": -6.29425048828125e-05, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "log_prob_preferred": -38.3125, | |
| "log_prob_rejected": -37.375, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.9375, | |
| "ref_diff": -1.25, | |
| "reward_margin": 0.06256103515625, | |
| "step": 12, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.3939393939393939, | |
| "grad_norm": 1.7109375, | |
| "kl_divergence": 0.0003426074981689453, | |
| "learning_rate": 1.6363636363636366e-05, | |
| "log_prob_preferred": -40.1875, | |
| "log_prob_rejected": -39.375, | |
| "loss": 0.64453125, | |
| "policy_diff": -0.8125, | |
| "ref_diff": -1.3125, | |
| "reward_margin": 0.10015869140625, | |
| "step": 13, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.42424242424242425, | |
| "grad_norm": 1.6328125, | |
| "kl_divergence": 0.0017499923706054688, | |
| "learning_rate": 1.606060606060606e-05, | |
| "log_prob_preferred": -35.125, | |
| "log_prob_rejected": -34.5, | |
| "loss": 0.6591796875, | |
| "policy_diff": -0.625, | |
| "ref_diff": -0.96875, | |
| "reward_margin": 0.0687255859375, | |
| "step": 14, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.45454545454545453, | |
| "grad_norm": 1.6484375, | |
| "kl_divergence": 0.0004515647888183594, | |
| "learning_rate": 1.575757575757576e-05, | |
| "log_prob_preferred": -38.6875, | |
| "log_prob_rejected": -37.875, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.8125, | |
| "ref_diff": -1.125, | |
| "reward_margin": 0.0626220703125, | |
| "step": 15, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.48484848484848486, | |
| "grad_norm": 1.5546875, | |
| "kl_divergence": 0.0008220672607421875, | |
| "learning_rate": 1.5454545454545454e-05, | |
| "log_prob_preferred": -36.75, | |
| "log_prob_rejected": -36.0625, | |
| "loss": 0.64453125, | |
| "policy_diff": -0.6875, | |
| "ref_diff": -1.1875, | |
| "reward_margin": 0.10015869140625, | |
| "step": 16, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.5151515151515151, | |
| "grad_norm": 1.6953125, | |
| "kl_divergence": 0.0004444122314453125, | |
| "learning_rate": 1.5151515151515153e-05, | |
| "log_prob_preferred": -35.125, | |
| "log_prob_rejected": -34.25, | |
| "loss": 0.6416015625, | |
| "policy_diff": -0.875, | |
| "ref_diff": -1.40625, | |
| "reward_margin": 0.1063232421875, | |
| "step": 17, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.5454545454545454, | |
| "grad_norm": 1.7421875, | |
| "kl_divergence": 0.000736236572265625, | |
| "learning_rate": 1.484848484848485e-05, | |
| "log_prob_preferred": -38.1875, | |
| "log_prob_rejected": -37.5625, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.625, | |
| "ref_diff": -1.0625, | |
| "reward_margin": 0.087646484375, | |
| "step": 18, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.5757575757575758, | |
| "grad_norm": 1.578125, | |
| "kl_divergence": 0.0011148452758789062, | |
| "learning_rate": 1.4545454545454546e-05, | |
| "log_prob_preferred": -39.5625, | |
| "log_prob_rejected": -38.9375, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.625, | |
| "ref_diff": -0.9375, | |
| "reward_margin": 0.06256103515625, | |
| "step": 19, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.6060606060606061, | |
| "grad_norm": 1.5, | |
| "kl_divergence": -0.00011205673217773438, | |
| "learning_rate": 1.4242424242424245e-05, | |
| "log_prob_preferred": -38.9375, | |
| "log_prob_rejected": -38.3125, | |
| "loss": 0.638671875, | |
| "policy_diff": -0.625, | |
| "ref_diff": -1.1875, | |
| "reward_margin": 0.112548828125, | |
| "step": 20, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.6363636363636364, | |
| "grad_norm": 1.671875, | |
| "kl_divergence": 0.0005855560302734375, | |
| "learning_rate": 1.3939393939393942e-05, | |
| "log_prob_preferred": -40.0625, | |
| "log_prob_rejected": -39.125, | |
| "loss": 0.62109375, | |
| "policy_diff": -0.9375, | |
| "ref_diff": -1.6875, | |
| "reward_margin": 0.150146484375, | |
| "step": 21, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 1.75, | |
| "kl_divergence": 0.0017135143280029297, | |
| "learning_rate": 1.3636363636363637e-05, | |
| "log_prob_preferred": -37.875, | |
| "log_prob_rejected": -37.1875, | |
| "loss": 0.638671875, | |
| "policy_diff": -0.6875, | |
| "ref_diff": -1.25, | |
| "reward_margin": 0.1126708984375, | |
| "step": 22, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.696969696969697, | |
| "grad_norm": 1.2265625, | |
| "kl_divergence": 0.0012230873107910156, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "log_prob_preferred": -37.4375, | |
| "log_prob_rejected": -36.875, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.5625, | |
| "ref_diff": -1.0, | |
| "reward_margin": 0.087646484375, | |
| "step": 23, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.7272727272727273, | |
| "grad_norm": 1.1796875, | |
| "kl_divergence": 0.00089263916015625, | |
| "learning_rate": 1.3030303030303032e-05, | |
| "log_prob_preferred": -37.0625, | |
| "log_prob_rejected": -36.59375, | |
| "loss": 0.6474609375, | |
| "policy_diff": -0.46875, | |
| "ref_diff": -0.9375, | |
| "reward_margin": 0.093902587890625, | |
| "step": 24, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.7575757575757576, | |
| "grad_norm": 1.1953125, | |
| "kl_divergence": 0.0015249252319335938, | |
| "learning_rate": 1.2727272727272728e-05, | |
| "log_prob_preferred": -38.0625, | |
| "log_prob_rejected": -37.5, | |
| "loss": 0.64453125, | |
| "policy_diff": -0.5625, | |
| "ref_diff": -1.0625, | |
| "reward_margin": 0.1002197265625, | |
| "step": 25, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.7878787878787878, | |
| "grad_norm": 1.4921875, | |
| "kl_divergence": -0.0013937950134277344, | |
| "learning_rate": 1.2424242424242425e-05, | |
| "log_prob_preferred": -38.3125, | |
| "log_prob_rejected": -37.75, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.5625, | |
| "ref_diff": -1.0, | |
| "reward_margin": 0.087646484375, | |
| "step": 26, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.8181818181818182, | |
| "grad_norm": 1.1328125, | |
| "kl_divergence": 0.000850677490234375, | |
| "learning_rate": 1.2121212121212122e-05, | |
| "log_prob_preferred": -37.375, | |
| "log_prob_rejected": -37.0625, | |
| "loss": 0.626953125, | |
| "policy_diff": -0.3125, | |
| "ref_diff": -1.0, | |
| "reward_margin": 0.1376953125, | |
| "step": 27, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.8484848484848485, | |
| "grad_norm": 1.6171875, | |
| "kl_divergence": 0.000988006591796875, | |
| "learning_rate": 1.181818181818182e-05, | |
| "log_prob_preferred": -36.0, | |
| "log_prob_rejected": -35.3125, | |
| "loss": 0.6142578125, | |
| "policy_diff": -0.6875, | |
| "ref_diff": -1.53125, | |
| "reward_margin": 0.16888427734375, | |
| "step": 28, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.8787878787878788, | |
| "grad_norm": 1.59375, | |
| "kl_divergence": 0.0015506744384765625, | |
| "learning_rate": 1.1515151515151517e-05, | |
| "log_prob_preferred": -39.0, | |
| "log_prob_rejected": -38.3125, | |
| "loss": 0.638671875, | |
| "policy_diff": -0.6875, | |
| "ref_diff": -1.25, | |
| "reward_margin": 0.1126708984375, | |
| "step": 29, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 1.65625, | |
| "kl_divergence": -0.0009489059448242188, | |
| "learning_rate": 1.1212121212121212e-05, | |
| "log_prob_preferred": -41.0, | |
| "log_prob_rejected": -40.125, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.875, | |
| "ref_diff": -1.3125, | |
| "reward_margin": 0.08758544921875, | |
| "step": 30, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.9393939393939394, | |
| "grad_norm": 1.828125, | |
| "kl_divergence": 0.001901388168334961, | |
| "learning_rate": 1.0909090909090909e-05, | |
| "log_prob_preferred": -35.0625, | |
| "log_prob_rejected": -34.1875, | |
| "loss": 0.6181640625, | |
| "policy_diff": -0.875, | |
| "ref_diff": -1.65625, | |
| "reward_margin": 0.15625, | |
| "step": 31, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 0.9696969696969697, | |
| "grad_norm": 1.2734375, | |
| "kl_divergence": 0.0008959770202636719, | |
| "learning_rate": 1.0606060606060606e-05, | |
| "log_prob_preferred": -37.25, | |
| "log_prob_rejected": -36.6875, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.5625, | |
| "ref_diff": -1.0, | |
| "reward_margin": 0.08758544921875, | |
| "step": 32, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.3828125, | |
| "kl_divergence": 0.0013256072998046875, | |
| "learning_rate": 1.0303030303030304e-05, | |
| "log_prob_preferred": -37.1875, | |
| "log_prob_rejected": -36.6875, | |
| "loss": 0.626953125, | |
| "policy_diff": -0.5, | |
| "ref_diff": -1.1875, | |
| "reward_margin": 0.13763427734375, | |
| "step": 33, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.0303030303030303, | |
| "grad_norm": 1.0234375, | |
| "kl_divergence": 0.0015974044799804688, | |
| "learning_rate": 1e-05, | |
| "log_prob_preferred": -35.75, | |
| "log_prob_rejected": -35.25, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.5, | |
| "ref_diff": -0.9375, | |
| "reward_margin": 0.087646484375, | |
| "step": 34, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.0606060606060606, | |
| "grad_norm": 0.953125, | |
| "kl_divergence": 0.0008296966552734375, | |
| "learning_rate": 9.696969696969698e-06, | |
| "log_prob_preferred": -37.1875, | |
| "log_prob_rejected": -36.8125, | |
| "loss": 0.64453125, | |
| "policy_diff": -0.375, | |
| "ref_diff": -0.875, | |
| "reward_margin": 0.10015869140625, | |
| "step": 35, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.0909090909090908, | |
| "grad_norm": 1.1015625, | |
| "kl_divergence": -0.0008130073547363281, | |
| "learning_rate": 9.393939393939396e-06, | |
| "log_prob_preferred": -37.375, | |
| "log_prob_rejected": -36.875, | |
| "loss": 0.6416015625, | |
| "policy_diff": -0.5, | |
| "ref_diff": -1.03125, | |
| "reward_margin": 0.10626220703125, | |
| "step": 36, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.121212121212121, | |
| "grad_norm": 1.3984375, | |
| "kl_divergence": 0.0008101463317871094, | |
| "learning_rate": 9.090909090909091e-06, | |
| "log_prob_preferred": -38.75, | |
| "log_prob_rejected": -38.125, | |
| "loss": 0.609375, | |
| "policy_diff": -0.625, | |
| "ref_diff": -1.5, | |
| "reward_margin": 0.17529296875, | |
| "step": 37, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.1515151515151516, | |
| "grad_norm": 1.3515625, | |
| "kl_divergence": 0.0025653839111328125, | |
| "learning_rate": 8.787878787878788e-06, | |
| "log_prob_preferred": -37.5, | |
| "log_prob_rejected": -36.9375, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.5625, | |
| "ref_diff": -1.0, | |
| "reward_margin": 0.087646484375, | |
| "step": 38, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.1818181818181819, | |
| "grad_norm": 1.25, | |
| "kl_divergence": -0.000659942626953125, | |
| "learning_rate": 8.484848484848486e-06, | |
| "log_prob_preferred": -38.5, | |
| "log_prob_rejected": -38.0, | |
| "loss": 0.62109375, | |
| "policy_diff": -0.5, | |
| "ref_diff": -1.25, | |
| "reward_margin": 0.150146484375, | |
| "step": 39, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.2121212121212122, | |
| "grad_norm": 1.328125, | |
| "kl_divergence": -7.82012939453125e-05, | |
| "learning_rate": 8.181818181818183e-06, | |
| "log_prob_preferred": -39.125, | |
| "log_prob_rejected": -38.5, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.625, | |
| "ref_diff": -1.0625, | |
| "reward_margin": 0.08758544921875, | |
| "step": 40, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.2424242424242424, | |
| "grad_norm": 1.2578125, | |
| "kl_divergence": 0.0023040771484375, | |
| "learning_rate": 7.87878787878788e-06, | |
| "log_prob_preferred": -35.1875, | |
| "log_prob_rejected": -34.5625, | |
| "loss": 0.626953125, | |
| "policy_diff": -0.625, | |
| "ref_diff": -1.3125, | |
| "reward_margin": 0.13763427734375, | |
| "step": 41, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.2727272727272727, | |
| "grad_norm": 1.4453125, | |
| "kl_divergence": 0.0010166168212890625, | |
| "learning_rate": 7.5757575757575764e-06, | |
| "log_prob_preferred": -37.25, | |
| "log_prob_rejected": -36.625, | |
| "loss": 0.6171875, | |
| "policy_diff": -0.625, | |
| "ref_diff": -1.4375, | |
| "reward_margin": 0.16259765625, | |
| "step": 42, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.303030303030303, | |
| "grad_norm": 1.625, | |
| "kl_divergence": 0.0015864372253417969, | |
| "learning_rate": 7.272727272727273e-06, | |
| "log_prob_preferred": -39.0, | |
| "log_prob_rejected": -38.375, | |
| "loss": 0.615234375, | |
| "policy_diff": -0.625, | |
| "ref_diff": -1.4375, | |
| "reward_margin": 0.16265869140625, | |
| "step": 43, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 0.97265625, | |
| "kl_divergence": -0.000118255615234375, | |
| "learning_rate": 6.969696969696971e-06, | |
| "log_prob_preferred": -34.625, | |
| "log_prob_rejected": -34.21875, | |
| "loss": 0.6337890625, | |
| "policy_diff": -0.40625, | |
| "ref_diff": -1.03125, | |
| "reward_margin": 0.12530517578125, | |
| "step": 44, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 1.8203125, | |
| "kl_divergence": 0.0010471343994140625, | |
| "learning_rate": 6.666666666666667e-06, | |
| "log_prob_preferred": -38.75, | |
| "log_prob_rejected": -37.75, | |
| "loss": 0.56640625, | |
| "policy_diff": -1.0, | |
| "ref_diff": -2.375, | |
| "reward_margin": 0.275146484375, | |
| "step": 45, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.393939393939394, | |
| "grad_norm": 0.9453125, | |
| "kl_divergence": 0.0012845993041992188, | |
| "learning_rate": 6.363636363636364e-06, | |
| "log_prob_preferred": -38.6875, | |
| "log_prob_rejected": -38.3125, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.375, | |
| "ref_diff": -0.8125, | |
| "reward_margin": 0.08758544921875, | |
| "step": 46, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.4242424242424243, | |
| "grad_norm": 0.8984375, | |
| "kl_divergence": -0.0010428428649902344, | |
| "learning_rate": 6.060606060606061e-06, | |
| "log_prob_preferred": -37.8125, | |
| "log_prob_rejected": -37.5, | |
| "loss": 0.64453125, | |
| "policy_diff": -0.3125, | |
| "ref_diff": -0.8125, | |
| "reward_margin": 0.10015869140625, | |
| "step": 47, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.4545454545454546, | |
| "grad_norm": 1.5, | |
| "kl_divergence": 0.0023941993713378906, | |
| "learning_rate": 5.7575757575757586e-06, | |
| "log_prob_preferred": -39.375, | |
| "log_prob_rejected": -38.625, | |
| "loss": 0.6171875, | |
| "policy_diff": -0.75, | |
| "ref_diff": -1.5625, | |
| "reward_margin": 0.16253662109375, | |
| "step": 48, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.4848484848484849, | |
| "grad_norm": 0.8203125, | |
| "kl_divergence": 1.52587890625e-05, | |
| "learning_rate": 5.4545454545454545e-06, | |
| "log_prob_preferred": -34.0, | |
| "log_prob_rejected": -33.6875, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.3125, | |
| "ref_diff": -0.75, | |
| "reward_margin": 0.087646484375, | |
| "step": 49, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.5151515151515151, | |
| "grad_norm": 1.5, | |
| "kl_divergence": 0.0006866455078125, | |
| "learning_rate": 5.151515151515152e-06, | |
| "log_prob_preferred": -38.125, | |
| "log_prob_rejected": -37.375, | |
| "loss": 0.609375, | |
| "policy_diff": -0.75, | |
| "ref_diff": -1.625, | |
| "reward_margin": 0.1751708984375, | |
| "step": 50, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.5454545454545454, | |
| "grad_norm": 1.140625, | |
| "kl_divergence": 5.7220458984375e-05, | |
| "learning_rate": 4.848484848484849e-06, | |
| "log_prob_preferred": -38.6875, | |
| "log_prob_rejected": -38.125, | |
| "loss": 0.638671875, | |
| "policy_diff": -0.5625, | |
| "ref_diff": -1.125, | |
| "reward_margin": 0.11260986328125, | |
| "step": 51, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.5757575757575757, | |
| "grad_norm": 0.99609375, | |
| "kl_divergence": 0.0010547637939453125, | |
| "learning_rate": 4.5454545454545455e-06, | |
| "log_prob_preferred": -37.5625, | |
| "log_prob_rejected": -37.125, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.4375, | |
| "ref_diff": -0.75, | |
| "reward_margin": 0.06256103515625, | |
| "step": 52, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.606060606060606, | |
| "grad_norm": 1.0078125, | |
| "kl_divergence": 0.0012416839599609375, | |
| "learning_rate": 4.242424242424243e-06, | |
| "log_prob_preferred": -39.1875, | |
| "log_prob_rejected": -38.6875, | |
| "loss": 0.65625, | |
| "policy_diff": -0.5, | |
| "ref_diff": -0.875, | |
| "reward_margin": 0.0750732421875, | |
| "step": 53, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.6363636363636362, | |
| "grad_norm": 1.125, | |
| "kl_divergence": 0.0001773834228515625, | |
| "learning_rate": 3.93939393939394e-06, | |
| "log_prob_preferred": -34.9375, | |
| "log_prob_rejected": -34.5, | |
| "loss": 0.62109375, | |
| "policy_diff": -0.4375, | |
| "ref_diff": -1.1875, | |
| "reward_margin": 0.15020751953125, | |
| "step": 54, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 1.3046875, | |
| "kl_divergence": 0.0010638236999511719, | |
| "learning_rate": 3.6363636363636366e-06, | |
| "log_prob_preferred": -34.21875, | |
| "log_prob_rejected": -33.6875, | |
| "loss": 0.615234375, | |
| "policy_diff": -0.53125, | |
| "ref_diff": -1.34375, | |
| "reward_margin": 0.16259765625, | |
| "step": 55, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.696969696969697, | |
| "grad_norm": 1.71875, | |
| "kl_divergence": 0.0024175643920898438, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "log_prob_preferred": -37.625, | |
| "log_prob_rejected": -36.9375, | |
| "loss": 0.609375, | |
| "policy_diff": -0.6875, | |
| "ref_diff": -1.5625, | |
| "reward_margin": 0.1751708984375, | |
| "step": 56, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.7272727272727273, | |
| "grad_norm": 1.25, | |
| "kl_divergence": 0.0025844573974609375, | |
| "learning_rate": 3.0303030303030305e-06, | |
| "log_prob_preferred": -37.8125, | |
| "log_prob_rejected": -37.25, | |
| "loss": 0.6044921875, | |
| "policy_diff": -0.5625, | |
| "ref_diff": -1.5, | |
| "reward_margin": 0.1878662109375, | |
| "step": 57, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.7575757575757576, | |
| "grad_norm": 1.03125, | |
| "kl_divergence": -0.000789642333984375, | |
| "learning_rate": 2.7272727272727272e-06, | |
| "log_prob_preferred": -35.59375, | |
| "log_prob_rejected": -35.28125, | |
| "loss": 0.6533203125, | |
| "policy_diff": -0.3125, | |
| "ref_diff": -0.71875, | |
| "reward_margin": 0.08135986328125, | |
| "step": 58, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.7878787878787878, | |
| "grad_norm": 1.109375, | |
| "kl_divergence": 0.00030517578125, | |
| "learning_rate": 2.4242424242424244e-06, | |
| "log_prob_preferred": -37.6875, | |
| "log_prob_rejected": -37.1875, | |
| "loss": 0.662109375, | |
| "policy_diff": -0.5, | |
| "ref_diff": -0.8125, | |
| "reward_margin": 0.06256103515625, | |
| "step": 59, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.8181818181818183, | |
| "grad_norm": 1.4453125, | |
| "kl_divergence": 0.0015001296997070312, | |
| "learning_rate": 2.1212121212121216e-06, | |
| "log_prob_preferred": -39.1875, | |
| "log_prob_rejected": -38.5625, | |
| "loss": 0.626953125, | |
| "policy_diff": -0.625, | |
| "ref_diff": -1.3125, | |
| "reward_margin": 0.13775634765625, | |
| "step": 60, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.8484848484848486, | |
| "grad_norm": 0.921875, | |
| "kl_divergence": 0.00011348724365234375, | |
| "learning_rate": 1.8181818181818183e-06, | |
| "log_prob_preferred": -38.625, | |
| "log_prob_rejected": -38.3125, | |
| "loss": 0.650390625, | |
| "policy_diff": -0.3125, | |
| "ref_diff": -0.75, | |
| "reward_margin": 0.08758544921875, | |
| "step": 61, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.878787878787879, | |
| "grad_norm": 1.4609375, | |
| "kl_divergence": 0.002395153045654297, | |
| "learning_rate": 1.5151515151515152e-06, | |
| "log_prob_preferred": -36.8125, | |
| "log_prob_rejected": -36.125, | |
| "loss": 0.59375, | |
| "policy_diff": -0.6875, | |
| "ref_diff": -1.75, | |
| "reward_margin": 0.2125244140625, | |
| "step": 62, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.9090909090909092, | |
| "grad_norm": 1.21875, | |
| "kl_divergence": -0.0005307197570800781, | |
| "learning_rate": 1.2121212121212122e-06, | |
| "log_prob_preferred": -38.25, | |
| "log_prob_rejected": -37.8125, | |
| "loss": 0.64453125, | |
| "policy_diff": -0.4375, | |
| "ref_diff": -0.9375, | |
| "reward_margin": 0.10015869140625, | |
| "step": 63, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.9393939393939394, | |
| "grad_norm": 1.2421875, | |
| "kl_divergence": 0.0015139579772949219, | |
| "learning_rate": 9.090909090909091e-07, | |
| "log_prob_preferred": -37.75, | |
| "log_prob_rejected": -37.3125, | |
| "loss": 0.62109375, | |
| "policy_diff": -0.4375, | |
| "ref_diff": -1.1875, | |
| "reward_margin": 0.1502685546875, | |
| "step": 64, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 1.9696969696969697, | |
| "grad_norm": 1.2578125, | |
| "kl_divergence": 0.0006976127624511719, | |
| "learning_rate": 6.060606060606061e-07, | |
| "log_prob_preferred": -37.6875, | |
| "log_prob_rejected": -37.1875, | |
| "loss": 0.62109375, | |
| "policy_diff": -0.5, | |
| "ref_diff": -1.25, | |
| "reward_margin": 0.150146484375, | |
| "step": 65, | |
| "valid_samples_per_batch": 4.0 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.25, | |
| "kl_divergence": 0.000301361083984375, | |
| "learning_rate": 3.0303030303030305e-07, | |
| "log_prob_preferred": -38.0625, | |
| "log_prob_rejected": -37.5625, | |
| "loss": 0.615234375, | |
| "policy_diff": -0.5, | |
| "ref_diff": -1.3125, | |
| "reward_margin": 0.1627197265625, | |
| "step": 66, | |
| "valid_samples_per_batch": 4.0 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 66, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |