{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 66, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030303030303030304, "grad_norm": 3.359375, "kl_divergence": 0.0, "learning_rate": 2e-05, "log_prob_preferred": -40.3125, "log_prob_rejected": -38.6875, "loss": 0.69140625, "policy_diff": -1.625, "ref_diff": -1.625, "reward_margin": 0.0, "step": 1, "valid_samples_per_batch": 4.0 }, { "epoch": 0.06060606060606061, "grad_norm": 2.28125, "kl_divergence": -0.00021457672119140625, "learning_rate": 1.96969696969697e-05, "log_prob_preferred": -38.5, "log_prob_rejected": -37.5625, "loss": 0.6796875, "policy_diff": -0.9375, "ref_diff": -1.0625, "reward_margin": 0.0250244140625, "step": 2, "valid_samples_per_batch": 4.0 }, { "epoch": 0.09090909090909091, "grad_norm": 1.921875, "kl_divergence": -0.000499725341796875, "learning_rate": 1.9393939393939395e-05, "log_prob_preferred": -37.3125, "log_prob_rejected": -36.5625, "loss": 0.6796875, "policy_diff": -0.75, "ref_diff": -0.875, "reward_margin": 0.0250244140625, "step": 3, "valid_samples_per_batch": 4.0 }, { "epoch": 0.12121212121212122, "grad_norm": 1.84375, "kl_divergence": -0.0011119842529296875, "learning_rate": 1.9090909090909094e-05, "log_prob_preferred": -37.375, "log_prob_rejected": -36.3125, "loss": 0.6796875, "policy_diff": -1.0625, "ref_diff": -1.1875, "reward_margin": 0.0250244140625, "step": 4, "valid_samples_per_batch": 4.0 }, { "epoch": 0.15151515151515152, "grad_norm": 2.546875, "kl_divergence": -0.0003497600555419922, "learning_rate": 1.8787878787878792e-05, "log_prob_preferred": -35.875, "log_prob_rejected": -34.65625, "loss": 0.65625, "policy_diff": -1.21875, "ref_diff": -1.59375, "reward_margin": 0.07513427734375, "step": 5, "valid_samples_per_batch": 4.0 }, { "epoch": 0.18181818181818182, "grad_norm": 1.71875, "kl_divergence": -0.00025653839111328125, "learning_rate": 1.8484848484848487e-05, "log_prob_preferred": -38.375, "log_prob_rejected": -37.5, "loss": 0.66796875, "policy_diff": -0.875, "ref_diff": -1.125, "reward_margin": 0.050048828125, "step": 6, "valid_samples_per_batch": 4.0 }, { "epoch": 0.21212121212121213, "grad_norm": 2.03125, "kl_divergence": -6.866455078125e-05, "learning_rate": 1.8181818181818182e-05, "log_prob_preferred": -38.625, "log_prob_rejected": -37.3125, "loss": 0.66796875, "policy_diff": -1.3125, "ref_diff": -1.5625, "reward_margin": 0.050048828125, "step": 7, "valid_samples_per_batch": 4.0 }, { "epoch": 0.24242424242424243, "grad_norm": 1.6015625, "kl_divergence": 0.00064849853515625, "learning_rate": 1.787878787878788e-05, "log_prob_preferred": -38.5625, "log_prob_rejected": -37.75, "loss": 0.662109375, "policy_diff": -0.8125, "ref_diff": -1.125, "reward_margin": 0.06256103515625, "step": 8, "valid_samples_per_batch": 4.0 }, { "epoch": 0.2727272727272727, "grad_norm": 1.734375, "kl_divergence": 0.0003542900085449219, "learning_rate": 1.7575757575757576e-05, "log_prob_preferred": -39.5625, "log_prob_rejected": -38.75, "loss": 0.662109375, "policy_diff": -0.8125, "ref_diff": -1.125, "reward_margin": 0.06256103515625, "step": 9, "valid_samples_per_batch": 4.0 }, { "epoch": 0.30303030303030304, "grad_norm": 1.3515625, "kl_divergence": -7.82012939453125e-05, "learning_rate": 1.7272727272727274e-05, "log_prob_preferred": -36.9375, "log_prob_rejected": -36.375, "loss": 0.662109375, "policy_diff": -0.5625, "ref_diff": -0.875, "reward_margin": 0.06256103515625, "step": 10, "valid_samples_per_batch": 4.0 }, { "epoch": 0.3333333333333333, "grad_norm": 1.71875, "kl_divergence": 0.0005383491516113281, "learning_rate": 1.6969696969696972e-05, "log_prob_preferred": -35.6875, "log_prob_rejected": -34.75, "loss": 0.662109375, "policy_diff": -0.9375, "ref_diff": -1.25, "reward_margin": 0.06256103515625, "step": 11, "valid_samples_per_batch": 4.0 }, { "epoch": 0.36363636363636365, "grad_norm": 1.6328125, "kl_divergence": -6.29425048828125e-05, "learning_rate": 1.6666666666666667e-05, "log_prob_preferred": -38.3125, "log_prob_rejected": -37.375, "loss": 0.662109375, "policy_diff": -0.9375, "ref_diff": -1.25, "reward_margin": 0.06256103515625, "step": 12, "valid_samples_per_batch": 4.0 }, { "epoch": 0.3939393939393939, "grad_norm": 1.7109375, "kl_divergence": 0.0003426074981689453, "learning_rate": 1.6363636363636366e-05, "log_prob_preferred": -40.1875, "log_prob_rejected": -39.375, "loss": 0.64453125, "policy_diff": -0.8125, "ref_diff": -1.3125, "reward_margin": 0.10015869140625, "step": 13, "valid_samples_per_batch": 4.0 }, { "epoch": 0.42424242424242425, "grad_norm": 1.6328125, "kl_divergence": 0.0017499923706054688, "learning_rate": 1.606060606060606e-05, "log_prob_preferred": -35.125, "log_prob_rejected": -34.5, "loss": 0.6591796875, "policy_diff": -0.625, "ref_diff": -0.96875, "reward_margin": 0.0687255859375, "step": 14, "valid_samples_per_batch": 4.0 }, { "epoch": 0.45454545454545453, "grad_norm": 1.6484375, "kl_divergence": 0.0004515647888183594, "learning_rate": 1.575757575757576e-05, "log_prob_preferred": -38.6875, "log_prob_rejected": -37.875, "loss": 0.662109375, "policy_diff": -0.8125, "ref_diff": -1.125, "reward_margin": 0.0626220703125, "step": 15, "valid_samples_per_batch": 4.0 }, { "epoch": 0.48484848484848486, "grad_norm": 1.5546875, "kl_divergence": 0.0008220672607421875, "learning_rate": 1.5454545454545454e-05, "log_prob_preferred": -36.75, "log_prob_rejected": -36.0625, "loss": 0.64453125, "policy_diff": -0.6875, "ref_diff": -1.1875, "reward_margin": 0.10015869140625, "step": 16, "valid_samples_per_batch": 4.0 }, { "epoch": 0.5151515151515151, "grad_norm": 1.6953125, "kl_divergence": 0.0004444122314453125, "learning_rate": 1.5151515151515153e-05, "log_prob_preferred": -35.125, "log_prob_rejected": -34.25, "loss": 0.6416015625, "policy_diff": -0.875, "ref_diff": -1.40625, "reward_margin": 0.1063232421875, "step": 17, "valid_samples_per_batch": 4.0 }, { "epoch": 0.5454545454545454, "grad_norm": 1.7421875, "kl_divergence": 0.000736236572265625, "learning_rate": 1.484848484848485e-05, "log_prob_preferred": -38.1875, "log_prob_rejected": -37.5625, "loss": 0.650390625, "policy_diff": -0.625, "ref_diff": -1.0625, "reward_margin": 0.087646484375, "step": 18, "valid_samples_per_batch": 4.0 }, { "epoch": 0.5757575757575758, "grad_norm": 1.578125, "kl_divergence": 0.0011148452758789062, "learning_rate": 1.4545454545454546e-05, "log_prob_preferred": -39.5625, "log_prob_rejected": -38.9375, "loss": 0.662109375, "policy_diff": -0.625, "ref_diff": -0.9375, "reward_margin": 0.06256103515625, "step": 19, "valid_samples_per_batch": 4.0 }, { "epoch": 0.6060606060606061, "grad_norm": 1.5, "kl_divergence": -0.00011205673217773438, "learning_rate": 1.4242424242424245e-05, "log_prob_preferred": -38.9375, "log_prob_rejected": -38.3125, "loss": 0.638671875, "policy_diff": -0.625, "ref_diff": -1.1875, "reward_margin": 0.112548828125, "step": 20, "valid_samples_per_batch": 4.0 }, { "epoch": 0.6363636363636364, "grad_norm": 1.671875, "kl_divergence": 0.0005855560302734375, "learning_rate": 1.3939393939393942e-05, "log_prob_preferred": -40.0625, "log_prob_rejected": -39.125, "loss": 0.62109375, "policy_diff": -0.9375, "ref_diff": -1.6875, "reward_margin": 0.150146484375, "step": 21, "valid_samples_per_batch": 4.0 }, { "epoch": 0.6666666666666666, "grad_norm": 1.75, "kl_divergence": 0.0017135143280029297, "learning_rate": 1.3636363636363637e-05, "log_prob_preferred": -37.875, "log_prob_rejected": -37.1875, "loss": 0.638671875, "policy_diff": -0.6875, "ref_diff": -1.25, "reward_margin": 0.1126708984375, "step": 22, "valid_samples_per_batch": 4.0 }, { "epoch": 0.696969696969697, "grad_norm": 1.2265625, "kl_divergence": 0.0012230873107910156, "learning_rate": 1.3333333333333333e-05, "log_prob_preferred": -37.4375, "log_prob_rejected": -36.875, "loss": 0.650390625, "policy_diff": -0.5625, "ref_diff": -1.0, "reward_margin": 0.087646484375, "step": 23, "valid_samples_per_batch": 4.0 }, { "epoch": 0.7272727272727273, "grad_norm": 1.1796875, "kl_divergence": 0.00089263916015625, "learning_rate": 1.3030303030303032e-05, "log_prob_preferred": -37.0625, "log_prob_rejected": -36.59375, "loss": 0.6474609375, "policy_diff": -0.46875, "ref_diff": -0.9375, "reward_margin": 0.093902587890625, "step": 24, "valid_samples_per_batch": 4.0 }, { "epoch": 0.7575757575757576, "grad_norm": 1.1953125, "kl_divergence": 0.0015249252319335938, "learning_rate": 1.2727272727272728e-05, "log_prob_preferred": -38.0625, "log_prob_rejected": -37.5, "loss": 0.64453125, "policy_diff": -0.5625, "ref_diff": -1.0625, "reward_margin": 0.1002197265625, "step": 25, "valid_samples_per_batch": 4.0 }, { "epoch": 0.7878787878787878, "grad_norm": 1.4921875, "kl_divergence": -0.0013937950134277344, "learning_rate": 1.2424242424242425e-05, "log_prob_preferred": -38.3125, "log_prob_rejected": -37.75, "loss": 0.650390625, "policy_diff": -0.5625, "ref_diff": -1.0, "reward_margin": 0.087646484375, "step": 26, "valid_samples_per_batch": 4.0 }, { "epoch": 0.8181818181818182, "grad_norm": 1.1328125, "kl_divergence": 0.000850677490234375, "learning_rate": 1.2121212121212122e-05, "log_prob_preferred": -37.375, "log_prob_rejected": -37.0625, "loss": 0.626953125, "policy_diff": -0.3125, "ref_diff": -1.0, "reward_margin": 0.1376953125, "step": 27, "valid_samples_per_batch": 4.0 }, { "epoch": 0.8484848484848485, "grad_norm": 1.6171875, "kl_divergence": 0.000988006591796875, "learning_rate": 1.181818181818182e-05, "log_prob_preferred": -36.0, "log_prob_rejected": -35.3125, "loss": 0.6142578125, "policy_diff": -0.6875, "ref_diff": -1.53125, "reward_margin": 0.16888427734375, "step": 28, "valid_samples_per_batch": 4.0 }, { "epoch": 0.8787878787878788, "grad_norm": 1.59375, "kl_divergence": 0.0015506744384765625, "learning_rate": 1.1515151515151517e-05, "log_prob_preferred": -39.0, "log_prob_rejected": -38.3125, "loss": 0.638671875, "policy_diff": -0.6875, "ref_diff": -1.25, "reward_margin": 0.1126708984375, "step": 29, "valid_samples_per_batch": 4.0 }, { "epoch": 0.9090909090909091, "grad_norm": 1.65625, "kl_divergence": -0.0009489059448242188, "learning_rate": 1.1212121212121212e-05, "log_prob_preferred": -41.0, "log_prob_rejected": -40.125, "loss": 0.650390625, "policy_diff": -0.875, "ref_diff": -1.3125, "reward_margin": 0.08758544921875, "step": 30, "valid_samples_per_batch": 4.0 }, { "epoch": 0.9393939393939394, "grad_norm": 1.828125, "kl_divergence": 0.001901388168334961, "learning_rate": 1.0909090909090909e-05, "log_prob_preferred": -35.0625, "log_prob_rejected": -34.1875, "loss": 0.6181640625, "policy_diff": -0.875, "ref_diff": -1.65625, "reward_margin": 0.15625, "step": 31, "valid_samples_per_batch": 4.0 }, { "epoch": 0.9696969696969697, "grad_norm": 1.2734375, "kl_divergence": 0.0008959770202636719, "learning_rate": 1.0606060606060606e-05, "log_prob_preferred": -37.25, "log_prob_rejected": -36.6875, "loss": 0.650390625, "policy_diff": -0.5625, "ref_diff": -1.0, "reward_margin": 0.08758544921875, "step": 32, "valid_samples_per_batch": 4.0 }, { "epoch": 1.0, "grad_norm": 1.3828125, "kl_divergence": 0.0013256072998046875, "learning_rate": 1.0303030303030304e-05, "log_prob_preferred": -37.1875, "log_prob_rejected": -36.6875, "loss": 0.626953125, "policy_diff": -0.5, "ref_diff": -1.1875, "reward_margin": 0.13763427734375, "step": 33, "valid_samples_per_batch": 4.0 }, { "epoch": 1.0303030303030303, "grad_norm": 1.0234375, "kl_divergence": 0.0015974044799804688, "learning_rate": 1e-05, "log_prob_preferred": -35.75, "log_prob_rejected": -35.25, "loss": 0.650390625, "policy_diff": -0.5, "ref_diff": -0.9375, "reward_margin": 0.087646484375, "step": 34, "valid_samples_per_batch": 4.0 }, { "epoch": 1.0606060606060606, "grad_norm": 0.953125, "kl_divergence": 0.0008296966552734375, "learning_rate": 9.696969696969698e-06, "log_prob_preferred": -37.1875, "log_prob_rejected": -36.8125, "loss": 0.64453125, "policy_diff": -0.375, "ref_diff": -0.875, "reward_margin": 0.10015869140625, "step": 35, "valid_samples_per_batch": 4.0 }, { "epoch": 1.0909090909090908, "grad_norm": 1.1015625, "kl_divergence": -0.0008130073547363281, "learning_rate": 9.393939393939396e-06, "log_prob_preferred": -37.375, "log_prob_rejected": -36.875, "loss": 0.6416015625, "policy_diff": -0.5, "ref_diff": -1.03125, "reward_margin": 0.10626220703125, "step": 36, "valid_samples_per_batch": 4.0 }, { "epoch": 1.121212121212121, "grad_norm": 1.3984375, "kl_divergence": 0.0008101463317871094, "learning_rate": 9.090909090909091e-06, "log_prob_preferred": -38.75, "log_prob_rejected": -38.125, "loss": 0.609375, "policy_diff": -0.625, "ref_diff": -1.5, "reward_margin": 0.17529296875, "step": 37, "valid_samples_per_batch": 4.0 }, { "epoch": 1.1515151515151516, "grad_norm": 1.3515625, "kl_divergence": 0.0025653839111328125, "learning_rate": 8.787878787878788e-06, "log_prob_preferred": -37.5, "log_prob_rejected": -36.9375, "loss": 0.650390625, "policy_diff": -0.5625, "ref_diff": -1.0, "reward_margin": 0.087646484375, "step": 38, "valid_samples_per_batch": 4.0 }, { "epoch": 1.1818181818181819, "grad_norm": 1.25, "kl_divergence": -0.000659942626953125, "learning_rate": 8.484848484848486e-06, "log_prob_preferred": -38.5, "log_prob_rejected": -38.0, "loss": 0.62109375, "policy_diff": -0.5, "ref_diff": -1.25, "reward_margin": 0.150146484375, "step": 39, "valid_samples_per_batch": 4.0 }, { "epoch": 1.2121212121212122, "grad_norm": 1.328125, "kl_divergence": -7.82012939453125e-05, "learning_rate": 8.181818181818183e-06, "log_prob_preferred": -39.125, "log_prob_rejected": -38.5, "loss": 0.650390625, "policy_diff": -0.625, "ref_diff": -1.0625, "reward_margin": 0.08758544921875, "step": 40, "valid_samples_per_batch": 4.0 }, { "epoch": 1.2424242424242424, "grad_norm": 1.2578125, "kl_divergence": 0.0023040771484375, "learning_rate": 7.87878787878788e-06, "log_prob_preferred": -35.1875, "log_prob_rejected": -34.5625, "loss": 0.626953125, "policy_diff": -0.625, "ref_diff": -1.3125, "reward_margin": 0.13763427734375, "step": 41, "valid_samples_per_batch": 4.0 }, { "epoch": 1.2727272727272727, "grad_norm": 1.4453125, "kl_divergence": 0.0010166168212890625, "learning_rate": 7.5757575757575764e-06, "log_prob_preferred": -37.25, "log_prob_rejected": -36.625, "loss": 0.6171875, "policy_diff": -0.625, "ref_diff": -1.4375, "reward_margin": 0.16259765625, "step": 42, "valid_samples_per_batch": 4.0 }, { "epoch": 1.303030303030303, "grad_norm": 1.625, "kl_divergence": 0.0015864372253417969, "learning_rate": 7.272727272727273e-06, "log_prob_preferred": -39.0, "log_prob_rejected": -38.375, "loss": 0.615234375, "policy_diff": -0.625, "ref_diff": -1.4375, "reward_margin": 0.16265869140625, "step": 43, "valid_samples_per_batch": 4.0 }, { "epoch": 1.3333333333333333, "grad_norm": 0.97265625, "kl_divergence": -0.000118255615234375, "learning_rate": 6.969696969696971e-06, "log_prob_preferred": -34.625, "log_prob_rejected": -34.21875, "loss": 0.6337890625, "policy_diff": -0.40625, "ref_diff": -1.03125, "reward_margin": 0.12530517578125, "step": 44, "valid_samples_per_batch": 4.0 }, { "epoch": 1.3636363636363638, "grad_norm": 1.8203125, "kl_divergence": 0.0010471343994140625, "learning_rate": 6.666666666666667e-06, "log_prob_preferred": -38.75, "log_prob_rejected": -37.75, "loss": 0.56640625, "policy_diff": -1.0, "ref_diff": -2.375, "reward_margin": 0.275146484375, "step": 45, "valid_samples_per_batch": 4.0 }, { "epoch": 1.393939393939394, "grad_norm": 0.9453125, "kl_divergence": 0.0012845993041992188, "learning_rate": 6.363636363636364e-06, "log_prob_preferred": -38.6875, "log_prob_rejected": -38.3125, "loss": 0.650390625, "policy_diff": -0.375, "ref_diff": -0.8125, "reward_margin": 0.08758544921875, "step": 46, "valid_samples_per_batch": 4.0 }, { "epoch": 1.4242424242424243, "grad_norm": 0.8984375, "kl_divergence": -0.0010428428649902344, "learning_rate": 6.060606060606061e-06, "log_prob_preferred": -37.8125, "log_prob_rejected": -37.5, "loss": 0.64453125, "policy_diff": -0.3125, "ref_diff": -0.8125, "reward_margin": 0.10015869140625, "step": 47, "valid_samples_per_batch": 4.0 }, { "epoch": 1.4545454545454546, "grad_norm": 1.5, "kl_divergence": 0.0023941993713378906, "learning_rate": 5.7575757575757586e-06, "log_prob_preferred": -39.375, "log_prob_rejected": -38.625, "loss": 0.6171875, "policy_diff": -0.75, "ref_diff": -1.5625, "reward_margin": 0.16253662109375, "step": 48, "valid_samples_per_batch": 4.0 }, { "epoch": 1.4848484848484849, "grad_norm": 0.8203125, "kl_divergence": 1.52587890625e-05, "learning_rate": 5.4545454545454545e-06, "log_prob_preferred": -34.0, "log_prob_rejected": -33.6875, "loss": 0.650390625, "policy_diff": -0.3125, "ref_diff": -0.75, "reward_margin": 0.087646484375, "step": 49, "valid_samples_per_batch": 4.0 }, { "epoch": 1.5151515151515151, "grad_norm": 1.5, "kl_divergence": 0.0006866455078125, "learning_rate": 5.151515151515152e-06, "log_prob_preferred": -38.125, "log_prob_rejected": -37.375, "loss": 0.609375, "policy_diff": -0.75, "ref_diff": -1.625, "reward_margin": 0.1751708984375, "step": 50, "valid_samples_per_batch": 4.0 }, { "epoch": 1.5454545454545454, "grad_norm": 1.140625, "kl_divergence": 5.7220458984375e-05, "learning_rate": 4.848484848484849e-06, "log_prob_preferred": -38.6875, "log_prob_rejected": -38.125, "loss": 0.638671875, "policy_diff": -0.5625, "ref_diff": -1.125, "reward_margin": 0.11260986328125, "step": 51, "valid_samples_per_batch": 4.0 }, { "epoch": 1.5757575757575757, "grad_norm": 0.99609375, "kl_divergence": 0.0010547637939453125, "learning_rate": 4.5454545454545455e-06, "log_prob_preferred": -37.5625, "log_prob_rejected": -37.125, "loss": 0.662109375, "policy_diff": -0.4375, "ref_diff": -0.75, "reward_margin": 0.06256103515625, "step": 52, "valid_samples_per_batch": 4.0 }, { "epoch": 1.606060606060606, "grad_norm": 1.0078125, "kl_divergence": 0.0012416839599609375, "learning_rate": 4.242424242424243e-06, "log_prob_preferred": -39.1875, "log_prob_rejected": -38.6875, "loss": 0.65625, "policy_diff": -0.5, "ref_diff": -0.875, "reward_margin": 0.0750732421875, "step": 53, "valid_samples_per_batch": 4.0 }, { "epoch": 1.6363636363636362, "grad_norm": 1.125, "kl_divergence": 0.0001773834228515625, "learning_rate": 3.93939393939394e-06, "log_prob_preferred": -34.9375, "log_prob_rejected": -34.5, "loss": 0.62109375, "policy_diff": -0.4375, "ref_diff": -1.1875, "reward_margin": 0.15020751953125, "step": 54, "valid_samples_per_batch": 4.0 }, { "epoch": 1.6666666666666665, "grad_norm": 1.3046875, "kl_divergence": 0.0010638236999511719, "learning_rate": 3.6363636363636366e-06, "log_prob_preferred": -34.21875, "log_prob_rejected": -33.6875, "loss": 0.615234375, "policy_diff": -0.53125, "ref_diff": -1.34375, "reward_margin": 0.16259765625, "step": 55, "valid_samples_per_batch": 4.0 }, { "epoch": 1.696969696969697, "grad_norm": 1.71875, "kl_divergence": 0.0024175643920898438, "learning_rate": 3.3333333333333333e-06, "log_prob_preferred": -37.625, "log_prob_rejected": -36.9375, "loss": 0.609375, "policy_diff": -0.6875, "ref_diff": -1.5625, "reward_margin": 0.1751708984375, "step": 56, "valid_samples_per_batch": 4.0 }, { "epoch": 1.7272727272727273, "grad_norm": 1.25, "kl_divergence": 0.0025844573974609375, "learning_rate": 3.0303030303030305e-06, "log_prob_preferred": -37.8125, "log_prob_rejected": -37.25, "loss": 0.6044921875, "policy_diff": -0.5625, "ref_diff": -1.5, "reward_margin": 0.1878662109375, "step": 57, "valid_samples_per_batch": 4.0 }, { "epoch": 1.7575757575757576, "grad_norm": 1.03125, "kl_divergence": -0.000789642333984375, "learning_rate": 2.7272727272727272e-06, "log_prob_preferred": -35.59375, "log_prob_rejected": -35.28125, "loss": 0.6533203125, "policy_diff": -0.3125, "ref_diff": -0.71875, "reward_margin": 0.08135986328125, "step": 58, "valid_samples_per_batch": 4.0 }, { "epoch": 1.7878787878787878, "grad_norm": 1.109375, "kl_divergence": 0.00030517578125, "learning_rate": 2.4242424242424244e-06, "log_prob_preferred": -37.6875, "log_prob_rejected": -37.1875, "loss": 0.662109375, "policy_diff": -0.5, "ref_diff": -0.8125, "reward_margin": 0.06256103515625, "step": 59, "valid_samples_per_batch": 4.0 }, { "epoch": 1.8181818181818183, "grad_norm": 1.4453125, "kl_divergence": 0.0015001296997070312, "learning_rate": 2.1212121212121216e-06, "log_prob_preferred": -39.1875, "log_prob_rejected": -38.5625, "loss": 0.626953125, "policy_diff": -0.625, "ref_diff": -1.3125, "reward_margin": 0.13775634765625, "step": 60, "valid_samples_per_batch": 4.0 }, { "epoch": 1.8484848484848486, "grad_norm": 0.921875, "kl_divergence": 0.00011348724365234375, "learning_rate": 1.8181818181818183e-06, "log_prob_preferred": -38.625, "log_prob_rejected": -38.3125, "loss": 0.650390625, "policy_diff": -0.3125, "ref_diff": -0.75, "reward_margin": 0.08758544921875, "step": 61, "valid_samples_per_batch": 4.0 }, { "epoch": 1.878787878787879, "grad_norm": 1.4609375, "kl_divergence": 0.002395153045654297, "learning_rate": 1.5151515151515152e-06, "log_prob_preferred": -36.8125, "log_prob_rejected": -36.125, "loss": 0.59375, "policy_diff": -0.6875, "ref_diff": -1.75, "reward_margin": 0.2125244140625, "step": 62, "valid_samples_per_batch": 4.0 }, { "epoch": 1.9090909090909092, "grad_norm": 1.21875, "kl_divergence": -0.0005307197570800781, "learning_rate": 1.2121212121212122e-06, "log_prob_preferred": -38.25, "log_prob_rejected": -37.8125, "loss": 0.64453125, "policy_diff": -0.4375, "ref_diff": -0.9375, "reward_margin": 0.10015869140625, "step": 63, "valid_samples_per_batch": 4.0 }, { "epoch": 1.9393939393939394, "grad_norm": 1.2421875, "kl_divergence": 0.0015139579772949219, "learning_rate": 9.090909090909091e-07, "log_prob_preferred": -37.75, "log_prob_rejected": -37.3125, "loss": 0.62109375, "policy_diff": -0.4375, "ref_diff": -1.1875, "reward_margin": 0.1502685546875, "step": 64, "valid_samples_per_batch": 4.0 }, { "epoch": 1.9696969696969697, "grad_norm": 1.2578125, "kl_divergence": 0.0006976127624511719, "learning_rate": 6.060606060606061e-07, "log_prob_preferred": -37.6875, "log_prob_rejected": -37.1875, "loss": 0.62109375, "policy_diff": -0.5, "ref_diff": -1.25, "reward_margin": 0.150146484375, "step": 65, "valid_samples_per_batch": 4.0 }, { "epoch": 2.0, "grad_norm": 1.25, "kl_divergence": 0.000301361083984375, "learning_rate": 3.0303030303030305e-07, "log_prob_preferred": -38.0625, "log_prob_rejected": -37.5625, "loss": 0.615234375, "policy_diff": -0.5, "ref_diff": -1.3125, "reward_margin": 0.1627197265625, "step": 66, "valid_samples_per_batch": 4.0 } ], "logging_steps": 1.0, "max_steps": 66, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }