| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 35, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 123.9375, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.6184732913970947, |
| "kl": 0.0, |
| "learning_rate": 9.714285714285715e-06, |
| "loss": 0.0, |
| "reward": 2.375, |
| "reward_std": 0.7682067155838013, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.6875, |
| "rewards/low_level_action_reward": 0.6875, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 138.5625, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.5803099870681763, |
| "kl": 0.00021266937255859375, |
| "learning_rate": 9.42857142857143e-06, |
| "loss": 0.0, |
| "reward": 2.3125, |
| "reward_std": 0.6477618217468262, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.625, |
| "rewards/low_level_action_reward": 0.6875, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 152.6875, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.5307255983352661, |
| "kl": 0.00022840499877929688, |
| "learning_rate": 9.142857142857144e-06, |
| "loss": 0.0, |
| "reward": 2.5625, |
| "reward_std": 0.44403792917728424, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.75, |
| "rewards/low_level_action_reward": 0.8125, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.75, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.8789212107658386, |
| "kl": 0.00019073486328125, |
| "learning_rate": 8.857142857142858e-06, |
| "loss": 0.0, |
| "reward": 2.90625, |
| "reward_std": 0.2651650384068489, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.90625, |
| "rewards/low_level_action_reward": 1.0, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 154.75, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.5785586833953857, |
| "kl": 0.00037479400634765625, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": 0.0, |
| "reward": 2.46875, |
| "reward_std": 0.7718936204910278, |
| "rewards/format_reward_custom": 0.9375, |
| "rewards/high_level_action_reward": 0.71875, |
| "rewards/low_level_action_reward": 0.8125, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 122.9375, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.6176652312278748, |
| "kl": 0.0002498626708984375, |
| "learning_rate": 8.285714285714287e-06, |
| "loss": 0.0, |
| "reward": 2.5625, |
| "reward_std": 0.6470449417829514, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.8125, |
| "rewards/low_level_action_reward": 0.75, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 127.75, |
| "epoch": 0.2, |
| "grad_norm": 0.46473655104637146, |
| "kl": 0.000518798828125, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.0, |
| "reward": 2.71875, |
| "reward_std": 0.33905068039894104, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.78125, |
| "rewards/low_level_action_reward": 0.9375, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 126.4375, |
| "epoch": 0.22857142857142856, |
| "grad_norm": 0.6035409569740295, |
| "kl": 0.000499725341796875, |
| "learning_rate": 7.714285714285716e-06, |
| "loss": 0.0, |
| "reward": 2.125, |
| "reward_std": 0.4355512708425522, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.5, |
| "rewards/low_level_action_reward": 0.625, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 137.0625, |
| "epoch": 0.2571428571428571, |
| "grad_norm": 0.5820505023002625, |
| "kl": 0.0006046295166015625, |
| "learning_rate": 7.428571428571429e-06, |
| "loss": 0.0, |
| "reward": 2.59375, |
| "reward_std": 0.4180135875940323, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.78125, |
| "rewards/low_level_action_reward": 0.8125, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.25, |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.5779363512992859, |
| "kl": 0.000782012939453125, |
| "learning_rate": 7.1428571428571436e-06, |
| "loss": 0.0, |
| "reward": 2.625, |
| "reward_std": 0.48037588596343994, |
| "rewards/format_reward_custom": 0.9375, |
| "rewards/high_level_action_reward": 0.9375, |
| "rewards/low_level_action_reward": 0.75, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 125.25, |
| "epoch": 0.3142857142857143, |
| "grad_norm": 0.704531192779541, |
| "kl": 0.0004863739013671875, |
| "learning_rate": 6.857142857142858e-06, |
| "loss": 0.0, |
| "reward": 2.03125, |
| "reward_std": 0.4498222768306732, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.65625, |
| "rewards/low_level_action_reward": 0.375, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 150.25, |
| "epoch": 0.34285714285714286, |
| "grad_norm": 0.5799468159675598, |
| "kl": 0.0006017684936523438, |
| "learning_rate": 6.571428571428572e-06, |
| "loss": 0.0, |
| "reward": 2.625, |
| "reward_std": 0.45624347031116486, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.8125, |
| "rewards/low_level_action_reward": 0.8125, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 139.5625, |
| "epoch": 0.37142857142857144, |
| "grad_norm": 0.6031074523925781, |
| "kl": 0.000988006591796875, |
| "learning_rate": 6.285714285714286e-06, |
| "loss": 0.0, |
| "reward": 1.78125, |
| "reward_std": 0.43056435883045197, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.40625, |
| "rewards/low_level_action_reward": 0.375, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 125.25, |
| "epoch": 0.4, |
| "grad_norm": 0.6519690752029419, |
| "kl": 0.0007038116455078125, |
| "learning_rate": 6e-06, |
| "loss": 0.0, |
| "reward": 2.34375, |
| "reward_std": 0.48591597378253937, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.53125, |
| "rewards/low_level_action_reward": 0.8125, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 131.125, |
| "epoch": 0.42857142857142855, |
| "grad_norm": 0.5583562254905701, |
| "kl": 0.000743865966796875, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 0.0, |
| "reward": 2.5625, |
| "reward_std": 0.5751546919345856, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.75, |
| "rewards/low_level_action_reward": 0.8125, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.9375, |
| "epoch": 0.45714285714285713, |
| "grad_norm": 0.0059979381039738655, |
| "kl": 0.000995635986328125, |
| "learning_rate": 5.428571428571429e-06, |
| "loss": 0.0, |
| "reward": 2.5, |
| "reward_std": 0.0, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.5, |
| "rewards/low_level_action_reward": 1.0, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 119.875, |
| "epoch": 0.4857142857142857, |
| "grad_norm": 0.7269045114517212, |
| "kl": 0.001373291015625, |
| "learning_rate": 5.142857142857142e-06, |
| "loss": 0.0001, |
| "reward": 2.09375, |
| "reward_std": 0.5073517560958862, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.40625, |
| "rewards/low_level_action_reward": 0.6875, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 115.6875, |
| "epoch": 0.5142857142857142, |
| "grad_norm": 0.6689741015434265, |
| "kl": 0.00109100341796875, |
| "learning_rate": 4.857142857142858e-06, |
| "loss": 0.0, |
| "reward": 2.0625, |
| "reward_std": 0.38532208651304245, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.4375, |
| "rewards/low_level_action_reward": 0.625, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.625, |
| "epoch": 0.5428571428571428, |
| "grad_norm": 0.6664654016494751, |
| "kl": 0.0009002685546875, |
| "learning_rate": 4.571428571428572e-06, |
| "loss": 0.0, |
| "reward": 2.28125, |
| "reward_std": 0.5278384983539581, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.71875, |
| "rewards/low_level_action_reward": 0.5625, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 136.875, |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.597842812538147, |
| "kl": 0.0012664794921875, |
| "learning_rate": 4.2857142857142855e-06, |
| "loss": 0.0001, |
| "reward": 2.5, |
| "reward_std": 0.680880218744278, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.8125, |
| "rewards/low_level_action_reward": 0.6875, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 152.625, |
| "epoch": 0.6, |
| "grad_norm": 0.5350140929222107, |
| "kl": 0.00119781494140625, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.0, |
| "reward": 2.21875, |
| "reward_std": 0.7195108830928802, |
| "rewards/format_reward_custom": 0.9375, |
| "rewards/high_level_action_reward": 0.53125, |
| "rewards/low_level_action_reward": 0.75, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.4375, |
| "epoch": 0.6285714285714286, |
| "grad_norm": 0.0055608744733035564, |
| "kl": 0.00130462646484375, |
| "learning_rate": 3.7142857142857146e-06, |
| "loss": 0.0001, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.5, |
| "rewards/low_level_action_reward": 0.5, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 130.6875, |
| "epoch": 0.6571428571428571, |
| "grad_norm": 0.6009066700935364, |
| "kl": 0.00139617919921875, |
| "learning_rate": 3.428571428571429e-06, |
| "loss": 0.0001, |
| "reward": 1.9375, |
| "reward_std": 0.5121889710426331, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.375, |
| "rewards/low_level_action_reward": 0.5625, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.0, |
| "epoch": 0.6857142857142857, |
| "grad_norm": 0.6292827129364014, |
| "kl": 0.001522064208984375, |
| "learning_rate": 3.142857142857143e-06, |
| "loss": 0.0001, |
| "reward": 1.8125, |
| "reward_std": 0.7284566164016724, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.4375, |
| "rewards/low_level_action_reward": 0.375, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 129.6875, |
| "epoch": 0.7142857142857143, |
| "grad_norm": 0.6546318531036377, |
| "kl": 0.001224517822265625, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 0.0, |
| "reward": 2.28125, |
| "reward_std": 0.4532671868801117, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.53125, |
| "rewards/low_level_action_reward": 0.75, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 132.4375, |
| "epoch": 0.7428571428571429, |
| "grad_norm": 0.6277039051055908, |
| "kl": 0.001224517822265625, |
| "learning_rate": 2.571428571428571e-06, |
| "loss": 0.0, |
| "reward": 2.28125, |
| "reward_std": 0.7378469705581665, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.59375, |
| "rewards/low_level_action_reward": 0.6875, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 136.75, |
| "epoch": 0.7714285714285715, |
| "grad_norm": 0.6006151437759399, |
| "kl": 0.00202178955078125, |
| "learning_rate": 2.285714285714286e-06, |
| "loss": 0.0001, |
| "reward": 2.375, |
| "reward_std": 0.48037588596343994, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.625, |
| "rewards/low_level_action_reward": 0.75, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 137.75, |
| "epoch": 0.8, |
| "grad_norm": 0.6642078757286072, |
| "kl": 0.0014190673828125, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0001, |
| "reward": 2.375, |
| "reward_std": 0.48718400299549103, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.5625, |
| "rewards/low_level_action_reward": 0.8125, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 119.25, |
| "epoch": 0.8285714285714286, |
| "grad_norm": 0.6026730537414551, |
| "kl": 0.00200653076171875, |
| "learning_rate": 1.7142857142857145e-06, |
| "loss": 0.0001, |
| "reward": 2.5625, |
| "reward_std": 0.47558383643627167, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.875, |
| "rewards/low_level_action_reward": 0.6875, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 122.125, |
| "epoch": 0.8571428571428571, |
| "grad_norm": 0.6274431347846985, |
| "kl": 0.00164794921875, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 0.0001, |
| "reward": 1.875, |
| "reward_std": 0.5239592343568802, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.4375, |
| "rewards/low_level_action_reward": 0.4375, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 122.25, |
| "epoch": 0.8857142857142857, |
| "grad_norm": 0.6134248971939087, |
| "kl": 0.001708984375, |
| "learning_rate": 1.142857142857143e-06, |
| "loss": 0.0001, |
| "reward": 2.8125, |
| "reward_std": 0.31539323925971985, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.875, |
| "rewards/low_level_action_reward": 0.9375, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.5625, |
| "epoch": 0.9142857142857143, |
| "grad_norm": 0.5456552505493164, |
| "kl": 0.001468658447265625, |
| "learning_rate": 8.571428571428572e-07, |
| "loss": 0.0001, |
| "reward": 2.46875, |
| "reward_std": 0.8058048486709595, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.78125, |
| "rewards/low_level_action_reward": 0.6875, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 134.375, |
| "epoch": 0.9428571428571428, |
| "grad_norm": 0.6829774379730225, |
| "kl": 0.001861572265625, |
| "learning_rate": 5.714285714285715e-07, |
| "loss": 0.0001, |
| "reward": 1.53125, |
| "reward_std": 0.6504871249198914, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.28125, |
| "rewards/low_level_action_reward": 0.25, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 135.875, |
| "epoch": 0.9714285714285714, |
| "grad_norm": 0.6511191129684448, |
| "kl": 0.001590728759765625, |
| "learning_rate": 2.8571428571428575e-07, |
| "loss": 0.0001, |
| "reward": 2.46875, |
| "reward_std": 0.5606519877910614, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.625, |
| "rewards/low_level_action_reward": 0.84375, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 141.33333587646484, |
| "epoch": 1.0, |
| "grad_norm": 0.6342370510101318, |
| "kl": 0.0015106201171875, |
| "learning_rate": 0.0, |
| "loss": 0.0001, |
| "reward": 2.583333373069763, |
| "reward_std": 0.5056425184011459, |
| "rewards/format_reward_custom": 1.0, |
| "rewards/high_level_action_reward": 0.6666666865348816, |
| "rewards/low_level_action_reward": 0.9166666865348816, |
| "step": 35 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 35, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 2, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|