{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.380952380952381, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 51.85, "epoch": 0.07936507936507936, "grad_norm": 2.2403488159179688, "kl": 0.0027098871301859616, "learning_rate": 2.6666666666666667e-07, "loss": 0.0001, "reward": 0.16500000339001417, "reward_std": 0.11435296162962913, "rewards/cbt_content_reward": 0.16500000339001417, "rewards/check_cbt_structure": 0.0, "step": 5 }, { "completion_length": 58.675, "epoch": 0.15873015873015872, "grad_norm": 3.0842247009277344, "kl": 0.002581492770696059, "learning_rate": 6e-07, "loss": 0.0001, "reward": 0.18250000290572643, "reward_std": 0.14434738419950008, "rewards/cbt_content_reward": 0.18250000439584255, "rewards/check_cbt_structure": 0.0, "step": 10 }, { "completion_length": 55.7, "epoch": 0.23809523809523808, "grad_norm": 1.792191982269287, "kl": 0.0019858392188325524, "learning_rate": 9.333333333333333e-07, "loss": 0.0001, "reward": 0.1250000039115548, "reward_std": 0.1089518491178751, "rewards/cbt_content_reward": 0.1250000039115548, "rewards/check_cbt_structure": 0.0, "step": 15 }, { "completion_length": 73.025, "epoch": 0.31746031746031744, "grad_norm": 2.7672274112701416, "kl": 0.0017400937096681446, "learning_rate": 9.703703703703704e-07, "loss": 0.0001, "reward": 0.15250000283122062, "reward_std": 0.1529246997088194, "rewards/cbt_content_reward": 0.15250000432133676, "rewards/check_cbt_structure": 0.0, "step": 20 }, { "completion_length": 52.825, "epoch": 0.3968253968253968, "grad_norm": 1.1795265674591064, "kl": 0.0027943541877903043, "learning_rate": 9.333333333333333e-07, "loss": 0.0001, "reward": 0.14500000327825546, "reward_std": 0.10991083942353726, "rewards/cbt_content_reward": 0.14500000178813935, "rewards/check_cbt_structure": 0.0, "step": 25 }, { "completion_length": 53.3375, "epoch": 0.47619047619047616, "grad_norm": 2.038141965866089, "kl": 0.002141835092334077, "learning_rate": 8.962962962962963e-07, "loss": 0.0001, "reward": 0.1750000048428774, "reward_std": 0.11487694047391414, "rewards/cbt_content_reward": 0.17500000335276128, "rewards/check_cbt_structure": 0.0, "step": 30 }, { "completion_length": 66.55, "epoch": 0.5555555555555556, "grad_norm": 2.512485980987549, "kl": 0.0019801797869149597, "learning_rate": 8.592592592592592e-07, "loss": 0.0001, "reward": 0.1575000027194619, "reward_std": 0.1333638343960047, "rewards/cbt_content_reward": 0.1575000027194619, "rewards/check_cbt_structure": 0.0, "step": 35 }, { "completion_length": 56.8, "epoch": 0.6349206349206349, "grad_norm": 4.515315055847168, "kl": 0.002533014601795003, "learning_rate": 8.222222222222221e-07, "loss": 0.0001, "reward": 0.18250000663101673, "reward_std": 0.1475393757224083, "rewards/cbt_content_reward": 0.18250000514090062, "rewards/check_cbt_structure": 0.0, "step": 40 }, { "completion_length": 64.275, "epoch": 0.7142857142857143, "grad_norm": 2.102088212966919, "kl": 0.002205433923518285, "learning_rate": 7.851851851851852e-07, "loss": 0.0001, "reward": 0.180000002682209, "reward_std": 0.13480928540229797, "rewards/cbt_content_reward": 0.180000002682209, "rewards/check_cbt_structure": 0.0, "step": 45 }, { "completion_length": 47.3, "epoch": 0.7936507936507936, "grad_norm": 3.474039077758789, "kl": 0.0022507158428197727, "learning_rate": 7.481481481481481e-07, "loss": 0.0001, "reward": 0.15000000335276126, "reward_std": 0.1302133210003376, "rewards/cbt_content_reward": 0.15000000335276126, "rewards/check_cbt_structure": 0.0, "step": 50 }, { "completion_length": 64.8125, "epoch": 0.873015873015873, "grad_norm": 2.440229654312134, "kl": 0.00192810871230904, "learning_rate": 7.111111111111111e-07, "loss": 0.0001, "reward": 0.16500000488013028, "reward_std": 0.0976612851023674, "rewards/cbt_content_reward": 0.16500000488013028, "rewards/check_cbt_structure": 0.0, "step": 55 }, { "completion_length": 59.7375, "epoch": 0.9523809523809523, "grad_norm": 1.8490701913833618, "kl": 0.002136132796294987, "learning_rate": 6.74074074074074e-07, "loss": 0.0001, "reward": 0.17750000581145287, "reward_std": 0.14892779104411602, "rewards/cbt_content_reward": 0.17750000432133675, "rewards/check_cbt_structure": 0.0, "step": 60 }, { "completion_length": 54.7125, "epoch": 1.0317460317460316, "grad_norm": 2.5725176334381104, "kl": 0.002353719263919629, "learning_rate": 6.37037037037037e-07, "loss": 0.0001, "reward": 0.1800000037997961, "reward_std": 0.12799057997763158, "rewards/cbt_content_reward": 0.18000000230968, "rewards/check_cbt_structure": 0.0, "step": 65 }, { "completion_length": 66.5625, "epoch": 1.1111111111111112, "grad_norm": 1.7574554681777954, "kl": 0.00198215174023062, "learning_rate": 6e-07, "loss": 0.0001, "reward": 0.22500000521540642, "reward_std": 0.12019186988472938, "rewards/cbt_content_reward": 0.2250000037252903, "rewards/check_cbt_structure": 0.0, "step": 70 }, { "completion_length": 58.825, "epoch": 1.1904761904761905, "grad_norm": 2.7693963050842285, "kl": 0.002443282786407508, "learning_rate": 5.62962962962963e-07, "loss": 0.0001, "reward": 0.1500000048428774, "reward_std": 0.13859827741980552, "rewards/cbt_content_reward": 0.15000000186264514, "rewards/check_cbt_structure": 0.0, "step": 75 }, { "completion_length": 63.875, "epoch": 1.2698412698412698, "grad_norm": 2.8838014602661133, "kl": 0.002126309886807576, "learning_rate": 5.259259259259258e-07, "loss": 0.0001, "reward": 0.1775000037625432, "reward_std": 0.11867224015295505, "rewards/cbt_content_reward": 0.1775000037625432, "rewards/check_cbt_structure": 0.0, "step": 80 }, { "completion_length": 53.85, "epoch": 1.3492063492063493, "grad_norm": 3.608692169189453, "kl": 0.002317077317275107, "learning_rate": 4.888888888888889e-07, "loss": 0.0001, "reward": 0.18250000216066836, "reward_std": 0.1294781118631363, "rewards/cbt_content_reward": 0.18250000514090062, "rewards/check_cbt_structure": 0.0, "step": 85 }, { "completion_length": 56.3375, "epoch": 1.4285714285714286, "grad_norm": 2.851497173309326, "kl": 0.0024501425621565433, "learning_rate": 4.5185185185185183e-07, "loss": 0.0001, "reward": 0.1675000036135316, "reward_std": 0.11531616114079953, "rewards/cbt_content_reward": 0.1675000036135316, "rewards/check_cbt_structure": 0.0, "step": 90 }, { "completion_length": 54.6625, "epoch": 1.507936507936508, "grad_norm": 2.257418632507324, "kl": 0.002075143059482798, "learning_rate": 4.1481481481481476e-07, "loss": 0.0001, "reward": 0.14500000271946192, "reward_std": 0.12950329035520552, "rewards/cbt_content_reward": 0.14500000271946192, "rewards/check_cbt_structure": 0.0, "step": 95 }, { "completion_length": 60.6, "epoch": 1.5873015873015874, "grad_norm": 2.6931891441345215, "kl": 0.002077191596617922, "learning_rate": 3.7777777777777775e-07, "loss": 0.0001, "reward": 0.13250000309199095, "reward_std": 0.13716318383812903, "rewards/cbt_content_reward": 0.13250000458210706, "rewards/check_cbt_structure": 0.0, "step": 100 }, { "completion_length": 78.9, "epoch": 1.6666666666666665, "grad_norm": 2.117309808731079, "kl": 0.0018815583549439906, "learning_rate": 3.407407407407407e-07, "loss": 0.0001, "reward": 0.21000000461935997, "reward_std": 0.14268166311085223, "rewards/cbt_content_reward": 0.21000000461935997, "rewards/check_cbt_structure": 0.0, "step": 105 }, { "completion_length": 54.3125, "epoch": 1.746031746031746, "grad_norm": 2.469907760620117, "kl": 0.002604751317994669, "learning_rate": 3.037037037037037e-07, "loss": 0.0001, "reward": 0.21750000603497027, "reward_std": 0.1493647824972868, "rewards/cbt_content_reward": 0.21750000603497027, "rewards/check_cbt_structure": 0.0, "step": 110 }, { "completion_length": 56.85, "epoch": 1.8253968253968254, "grad_norm": 2.3624932765960693, "kl": 0.002011225459864363, "learning_rate": 2.6666666666666667e-07, "loss": 0.0001, "reward": 0.1475000012665987, "reward_std": 0.12692904993891715, "rewards/cbt_content_reward": 0.14750000275671482, "rewards/check_cbt_structure": 0.0, "step": 115 }, { "completion_length": 55.625, "epoch": 1.9047619047619047, "grad_norm": 3.7563695907592773, "kl": 0.002532219042768702, "learning_rate": 2.296296296296296e-07, "loss": 0.0001, "reward": 0.16250000335276127, "reward_std": 0.11915707401931286, "rewards/cbt_content_reward": 0.16250000037252904, "rewards/check_cbt_structure": 0.0, "step": 120 }, { "completion_length": 57.4625, "epoch": 1.9841269841269842, "grad_norm": 2.7810513973236084, "kl": 0.002170709293568507, "learning_rate": 1.9259259259259257e-07, "loss": 0.0001, "reward": 0.15750000439584255, "reward_std": 0.1208796363323927, "rewards/cbt_content_reward": 0.15750000439584255, "rewards/check_cbt_structure": 0.0, "step": 125 }, { "completion_length": 49.35, "epoch": 2.0634920634920633, "grad_norm": 2.166604518890381, "kl": 0.0023496907786466183, "learning_rate": 1.5555555555555556e-07, "loss": 0.0001, "reward": 0.1425000036135316, "reward_std": 0.14228889718651772, "rewards/cbt_content_reward": 0.1425000036135316, "rewards/check_cbt_structure": 0.0, "step": 130 }, { "completion_length": 63.175, "epoch": 2.142857142857143, "grad_norm": 3.5904221534729004, "kl": 0.002583104814402759, "learning_rate": 1.1851851851851851e-07, "loss": 0.0001, "reward": 0.20750000271946192, "reward_std": 0.12724252939224243, "rewards/cbt_content_reward": 0.20750000420957804, "rewards/check_cbt_structure": 0.0, "step": 135 }, { "completion_length": 57.625, "epoch": 2.2222222222222223, "grad_norm": 2.712351083755493, "kl": 0.0022835541458334774, "learning_rate": 8.148148148148149e-08, "loss": 0.0001, "reward": 0.1375000037252903, "reward_std": 0.118898082152009, "rewards/cbt_content_reward": 0.13750000223517417, "rewards/check_cbt_structure": 0.0, "step": 140 }, { "completion_length": 52.4375, "epoch": 2.3015873015873014, "grad_norm": 3.027590751647949, "kl": 0.0020462898130062966, "learning_rate": 4.444444444444444e-08, "loss": 0.0001, "reward": 0.1750000027939677, "reward_std": 0.11545386202633381, "rewards/cbt_content_reward": 0.1750000027939677, "rewards/check_cbt_structure": 0.0, "step": 145 }, { "completion_length": 70.325, "epoch": 2.380952380952381, "grad_norm": 3.1476638317108154, "kl": 0.0017971335852053016, "learning_rate": 7.407407407407407e-09, "loss": 0.0001, "reward": 0.19500000346451998, "reward_std": 0.12282740026712417, "rewards/cbt_content_reward": 0.19500000346451998, "rewards/check_cbt_structure": 0.0, "step": 150 } ], "logging_steps": 5, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }