| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 69, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 131.1875, | |
| "epoch": 0.014492753623188406, | |
| "grad_norm": 0.6139106154441833, | |
| "kl": 0.0, | |
| "learning_rate": 9.855072463768118e-06, | |
| "loss": 0.0, | |
| "reward": 2.4375, | |
| "reward_std": 0.5868084877729416, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.625, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 127.5625, | |
| "epoch": 0.028985507246376812, | |
| "grad_norm": 0.5722443461418152, | |
| "kl": 0.00024318695068359375, | |
| "learning_rate": 9.710144927536233e-06, | |
| "loss": 0.0, | |
| "reward": 2.5, | |
| "reward_std": 0.5239592343568802, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.6875, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 131.4375, | |
| "epoch": 0.043478260869565216, | |
| "grad_norm": 0.5824880003929138, | |
| "kl": 0.00020122528076171875, | |
| "learning_rate": 9.565217391304349e-06, | |
| "loss": 0.0, | |
| "reward": 2.65625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.71875, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.6875, | |
| "epoch": 0.057971014492753624, | |
| "grad_norm": 0.36597952246665955, | |
| "kl": 0.00018024444580078125, | |
| "learning_rate": 9.420289855072464e-06, | |
| "loss": 0.0, | |
| "reward": 2.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.9375, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 152.5625, | |
| "epoch": 0.07246376811594203, | |
| "grad_norm": 0.6035860180854797, | |
| "kl": 0.0002532005310058594, | |
| "learning_rate": 9.275362318840581e-06, | |
| "loss": 0.0, | |
| "reward": 2.5625, | |
| "reward_std": 0.5609941333532333, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.8125, | |
| "rewards/low_level_action_reward": 0.75, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 127.4375, | |
| "epoch": 0.08695652173913043, | |
| "grad_norm": 0.6012734174728394, | |
| "kl": 0.00024127960205078125, | |
| "learning_rate": 9.130434782608697e-06, | |
| "loss": 0.0, | |
| "reward": 2.53125, | |
| "reward_std": 0.619232714176178, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.6875, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 144.125, | |
| "epoch": 0.10144927536231885, | |
| "grad_norm": 0.5445747375488281, | |
| "kl": 0.000244140625, | |
| "learning_rate": 8.985507246376812e-06, | |
| "loss": 0.0, | |
| "reward": 2.59375, | |
| "reward_std": 0.38138842582702637, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.75, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 129.375, | |
| "epoch": 0.11594202898550725, | |
| "grad_norm": 0.5511504411697388, | |
| "kl": 0.0002574920654296875, | |
| "learning_rate": 8.840579710144929e-06, | |
| "loss": 0.0, | |
| "reward": 2.5625, | |
| "reward_std": 0.4802234023809433, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.75, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 146.3125, | |
| "epoch": 0.13043478260869565, | |
| "grad_norm": 0.39530378580093384, | |
| "kl": 0.000354766845703125, | |
| "learning_rate": 8.695652173913044e-06, | |
| "loss": 0.0, | |
| "reward": 2.65625, | |
| "reward_std": 0.35197147727012634, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 140.375, | |
| "epoch": 0.14492753623188406, | |
| "grad_norm": 0.6552218794822693, | |
| "kl": 0.0005092620849609375, | |
| "learning_rate": 8.55072463768116e-06, | |
| "loss": 0.0, | |
| "reward": 2.4375, | |
| "reward_std": 0.5546489059925079, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.75, | |
| "rewards/low_level_action_reward": 0.6875, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 133.8125, | |
| "epoch": 0.15942028985507245, | |
| "grad_norm": 0.5951517224311829, | |
| "kl": 0.00035858154296875, | |
| "learning_rate": 8.405797101449275e-06, | |
| "loss": 0.0, | |
| "reward": 2.78125, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 135.3125, | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 0.5633934140205383, | |
| "kl": 0.0006542205810546875, | |
| "learning_rate": 8.260869565217392e-06, | |
| "loss": 0.0, | |
| "reward": 2.625, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.8125, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 123.5625, | |
| "epoch": 0.18840579710144928, | |
| "grad_norm": 0.6226204037666321, | |
| "kl": 0.0006656646728515625, | |
| "learning_rate": 8.115942028985508e-06, | |
| "loss": 0.0, | |
| "reward": 2.59375, | |
| "reward_std": 0.35564958304166794, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.78125, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.625, | |
| "epoch": 0.2028985507246377, | |
| "grad_norm": 0.6195806264877319, | |
| "kl": 0.000820159912109375, | |
| "learning_rate": 7.971014492753623e-06, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.5, | |
| "rewards/low_level_action_reward": 0.5, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 125.1875, | |
| "epoch": 0.21739130434782608, | |
| "grad_norm": 0.6649656891822815, | |
| "kl": 0.000728607177734375, | |
| "learning_rate": 7.82608695652174e-06, | |
| "loss": 0.0, | |
| "reward": 2.625, | |
| "reward_std": 0.5720614045858383, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.8125, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 133.9375, | |
| "epoch": 0.2318840579710145, | |
| "grad_norm": 0.5864853262901306, | |
| "kl": 0.000736236572265625, | |
| "learning_rate": 7.681159420289856e-06, | |
| "loss": 0.0, | |
| "reward": 2.4375, | |
| "reward_std": 0.5585024058818817, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.8125, | |
| "rewards/low_level_action_reward": 0.625, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 134.5, | |
| "epoch": 0.2463768115942029, | |
| "grad_norm": 0.6138740181922913, | |
| "kl": 0.0010128021240234375, | |
| "learning_rate": 7.536231884057972e-06, | |
| "loss": 0.0, | |
| "reward": 2.5, | |
| "reward_std": 0.5500157475471497, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.6875, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 121.875, | |
| "epoch": 0.2608695652173913, | |
| "grad_norm": 0.5959410667419434, | |
| "kl": 0.001262664794921875, | |
| "learning_rate": 7.391304347826087e-06, | |
| "loss": 0.0001, | |
| "reward": 2.8125, | |
| "reward_std": 0.31539323925971985, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 141.375, | |
| "epoch": 0.2753623188405797, | |
| "grad_norm": 0.5705474615097046, | |
| "kl": 0.001049041748046875, | |
| "learning_rate": 7.246376811594203e-06, | |
| "loss": 0.0, | |
| "reward": 2.28125, | |
| "reward_std": 0.6677263081073761, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.4375, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 135.5625, | |
| "epoch": 0.2898550724637681, | |
| "grad_norm": 0.43123292922973633, | |
| "kl": 0.001407623291015625, | |
| "learning_rate": 7.10144927536232e-06, | |
| "loss": 0.0001, | |
| "reward": 2.90625, | |
| "reward_std": 0.18600594997406006, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.96875, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 122.0, | |
| "epoch": 0.30434782608695654, | |
| "grad_norm": 0.6939437985420227, | |
| "kl": 0.001312255859375, | |
| "learning_rate": 6.956521739130435e-06, | |
| "loss": 0.0001, | |
| "reward": 2.625, | |
| "reward_std": 0.39837799966335297, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.8125, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.3125, | |
| "epoch": 0.3188405797101449, | |
| "grad_norm": 0.6187354326248169, | |
| "kl": 0.00099945068359375, | |
| "learning_rate": 6.811594202898551e-06, | |
| "loss": 0.0, | |
| "reward": 2.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.96875, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 137.125, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.619391143321991, | |
| "kl": 0.00128173828125, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.0001, | |
| "reward": 2.53125, | |
| "reward_std": 0.3966485261917114, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.65625, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.375, | |
| "epoch": 0.34782608695652173, | |
| "grad_norm": 0.6596890687942505, | |
| "kl": 0.001399993896484375, | |
| "learning_rate": 6.521739130434783e-06, | |
| "loss": 0.0001, | |
| "reward": 2.75, | |
| "reward_std": 0.32261285185813904, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.75, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 132.6875, | |
| "epoch": 0.36231884057971014, | |
| "grad_norm": 0.6133448481559753, | |
| "kl": 0.0009860992431640625, | |
| "learning_rate": 6.376811594202898e-06, | |
| "loss": 0.0, | |
| "reward": 2.8125, | |
| "reward_std": 0.45117098093032837, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.9375, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 149.1875, | |
| "epoch": 0.37681159420289856, | |
| "grad_norm": 0.6592795848846436, | |
| "kl": 0.00177764892578125, | |
| "learning_rate": 6.2318840579710145e-06, | |
| "loss": 0.0001, | |
| "reward": 2.59375, | |
| "reward_std": 0.6028470396995544, | |
| "rewards/format_reward_custom": 0.9375, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 129.6875, | |
| "epoch": 0.391304347826087, | |
| "grad_norm": 0.5780444145202637, | |
| "kl": 0.00179290771484375, | |
| "learning_rate": 6.086956521739132e-06, | |
| "loss": 0.0001, | |
| "reward": 2.75, | |
| "reward_std": 0.35841864347457886, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 122.4375, | |
| "epoch": 0.4057971014492754, | |
| "grad_norm": 0.004627756774425507, | |
| "kl": 0.00113677978515625, | |
| "learning_rate": 5.942028985507247e-06, | |
| "loss": 0.0, | |
| "reward": 3.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 1.0, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 141.0625, | |
| "epoch": 0.42028985507246375, | |
| "grad_norm": 0.5492585301399231, | |
| "kl": 0.001590728759765625, | |
| "learning_rate": 5.797101449275363e-06, | |
| "loss": 0.0001, | |
| "reward": 2.78125, | |
| "reward_std": 0.2630179077386856, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.78125, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 141.0, | |
| "epoch": 0.43478260869565216, | |
| "grad_norm": 0.5583959221839905, | |
| "kl": 0.002197265625, | |
| "learning_rate": 5.652173913043479e-06, | |
| "loss": 0.0001, | |
| "reward": 2.34375, | |
| "reward_std": 0.38138842582702637, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.65625, | |
| "rewards/low_level_action_reward": 0.6875, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 132.8125, | |
| "epoch": 0.4492753623188406, | |
| "grad_norm": 0.5492476224899292, | |
| "kl": 0.002262115478515625, | |
| "learning_rate": 5.507246376811595e-06, | |
| "loss": 0.0001, | |
| "reward": 2.4375, | |
| "reward_std": 0.5260358154773712, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 0.5625, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 138.875, | |
| "epoch": 0.463768115942029, | |
| "grad_norm": 0.5487068891525269, | |
| "kl": 0.00237274169921875, | |
| "learning_rate": 5.362318840579711e-06, | |
| "loss": 0.0001, | |
| "reward": 2.59375, | |
| "reward_std": 0.34475886821746826, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.78125, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 142.875, | |
| "epoch": 0.4782608695652174, | |
| "grad_norm": 0.5532354712486267, | |
| "kl": 0.001415252685546875, | |
| "learning_rate": 5.2173913043478265e-06, | |
| "loss": 0.0001, | |
| "reward": 2.65625, | |
| "reward_std": 0.35098859667778015, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.71875, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 124.75, | |
| "epoch": 0.4927536231884058, | |
| "grad_norm": 0.5815756916999817, | |
| "kl": 0.00170135498046875, | |
| "learning_rate": 5.072463768115943e-06, | |
| "loss": 0.0001, | |
| "reward": 2.75, | |
| "reward_std": 0.39837799966335297, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.9375, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.4375, | |
| "epoch": 0.5072463768115942, | |
| "grad_norm": 0.6308125257492065, | |
| "kl": 0.0013580322265625, | |
| "learning_rate": 4.927536231884059e-06, | |
| "loss": 0.0001, | |
| "reward": 2.625, | |
| "reward_std": 0.3535533770918846, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 0.75, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 148.625, | |
| "epoch": 0.5217391304347826, | |
| "grad_norm": 0.5474228858947754, | |
| "kl": 0.00164031982421875, | |
| "learning_rate": 4.782608695652174e-06, | |
| "loss": 0.0001, | |
| "reward": 2.625, | |
| "reward_std": 0.4355512708425522, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 0.75, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 124.75, | |
| "epoch": 0.5362318840579711, | |
| "grad_norm": 0.46679696440696716, | |
| "kl": 0.002105712890625, | |
| "learning_rate": 4.637681159420291e-06, | |
| "loss": 0.0001, | |
| "reward": 2.78125, | |
| "reward_std": 0.33905068039894104, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.90625, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 131.125, | |
| "epoch": 0.5507246376811594, | |
| "grad_norm": 0.6113550662994385, | |
| "kl": 0.002593994140625, | |
| "learning_rate": 4.492753623188406e-06, | |
| "loss": 0.0001, | |
| "reward": 2.78125, | |
| "reward_std": 0.3749881833791733, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.90625, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 132.25, | |
| "epoch": 0.5652173913043478, | |
| "grad_norm": 0.6611613035202026, | |
| "kl": 0.00252532958984375, | |
| "learning_rate": 4.347826086956522e-06, | |
| "loss": 0.0001, | |
| "reward": 2.625, | |
| "reward_std": 0.5487885922193527, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.75, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.5, | |
| "epoch": 0.5797101449275363, | |
| "grad_norm": 0.6072784662246704, | |
| "kl": 0.002056121826171875, | |
| "learning_rate": 4.202898550724638e-06, | |
| "loss": 0.0001, | |
| "reward": 2.84375, | |
| "reward_std": 0.3808925449848175, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.90625, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 129.875, | |
| "epoch": 0.5942028985507246, | |
| "grad_norm": 0.5880132913589478, | |
| "kl": 0.0030670166015625, | |
| "learning_rate": 4.057971014492754e-06, | |
| "loss": 0.0001, | |
| "reward": 2.65625, | |
| "reward_std": 0.3966485261917114, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.78125, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 120.75, | |
| "epoch": 0.6086956521739131, | |
| "grad_norm": 0.6412323117256165, | |
| "kl": 0.0033416748046875, | |
| "learning_rate": 3.91304347826087e-06, | |
| "loss": 0.0001, | |
| "reward": 2.21875, | |
| "reward_std": 0.3818188011646271, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.59375, | |
| "rewards/low_level_action_reward": 0.625, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 131.3125, | |
| "epoch": 0.6231884057971014, | |
| "grad_norm": 0.6180394887924194, | |
| "kl": 0.00208282470703125, | |
| "learning_rate": 3.768115942028986e-06, | |
| "loss": 0.0001, | |
| "reward": 2.65625, | |
| "reward_std": 0.5133327841758728, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 128.5625, | |
| "epoch": 0.6376811594202898, | |
| "grad_norm": 0.6940774917602539, | |
| "kl": 0.002685546875, | |
| "learning_rate": 3.6231884057971017e-06, | |
| "loss": 0.0001, | |
| "reward": 2.625, | |
| "reward_std": 0.44099316000938416, | |
| "rewards/format_reward_custom": 0.9375, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 126.6875, | |
| "epoch": 0.6521739130434783, | |
| "grad_norm": 0.6032717227935791, | |
| "kl": 0.00244140625, | |
| "learning_rate": 3.4782608695652175e-06, | |
| "loss": 0.0001, | |
| "reward": 2.65625, | |
| "reward_std": 0.5762138962745667, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 137.25, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.3984113335609436, | |
| "kl": 0.001659393310546875, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.0001, | |
| "reward": 2.84375, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.90625, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 134.3125, | |
| "epoch": 0.6811594202898551, | |
| "grad_norm": 0.37174344062805176, | |
| "kl": 0.003200531005859375, | |
| "learning_rate": 3.188405797101449e-06, | |
| "loss": 0.0001, | |
| "reward": 2.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.625, | |
| "epoch": 0.6956521739130435, | |
| "grad_norm": 0.6417880654335022, | |
| "kl": 0.0034027099609375, | |
| "learning_rate": 3.043478260869566e-06, | |
| "loss": 0.0001, | |
| "reward": 2.125, | |
| "reward_std": 0.5175491571426392, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.5, | |
| "rewards/low_level_action_reward": 0.625, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 120.875, | |
| "epoch": 0.7101449275362319, | |
| "grad_norm": 0.3155536353588104, | |
| "kl": 0.003082275390625, | |
| "learning_rate": 2.8985507246376816e-06, | |
| "loss": 0.0001, | |
| "reward": 2.84375, | |
| "reward_std": 0.18600594997406006, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.90625, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 133.3125, | |
| "epoch": 0.7246376811594203, | |
| "grad_norm": 0.6605053544044495, | |
| "kl": 0.00164031982421875, | |
| "learning_rate": 2.7536231884057974e-06, | |
| "loss": 0.0001, | |
| "reward": 2.65625, | |
| "reward_std": 0.36348532140254974, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.78125, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 141.9375, | |
| "epoch": 0.7391304347826086, | |
| "grad_norm": 0.5570524334907532, | |
| "kl": 0.002655029296875, | |
| "learning_rate": 2.6086956521739132e-06, | |
| "loss": 0.0001, | |
| "reward": 2.59375, | |
| "reward_std": 0.7191916108131409, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.78125, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 141.1875, | |
| "epoch": 0.7536231884057971, | |
| "grad_norm": 0.4170561730861664, | |
| "kl": 0.001430511474609375, | |
| "learning_rate": 2.4637681159420295e-06, | |
| "loss": 0.0001, | |
| "reward": 2.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.90625, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 121.5625, | |
| "epoch": 0.7681159420289855, | |
| "grad_norm": 0.6030449867248535, | |
| "kl": 0.0033111572265625, | |
| "learning_rate": 2.3188405797101453e-06, | |
| "loss": 0.0001, | |
| "reward": 2.212499976158142, | |
| "reward_std": 0.46406444907188416, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.5250000059604645, | |
| "rewards/low_level_action_reward": 0.6875, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 130.0625, | |
| "epoch": 0.782608695652174, | |
| "grad_norm": 0.40535715222358704, | |
| "kl": 0.001888275146484375, | |
| "learning_rate": 2.173913043478261e-06, | |
| "loss": 0.0001, | |
| "reward": 2.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.90625, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 133.25, | |
| "epoch": 0.7971014492753623, | |
| "grad_norm": 0.5601004958152771, | |
| "kl": 0.00250244140625, | |
| "learning_rate": 2.028985507246377e-06, | |
| "loss": 0.0001, | |
| "reward": 2.5, | |
| "reward_std": 0.5945880711078644, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.8125, | |
| "rewards/low_level_action_reward": 0.6875, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 123.3125, | |
| "epoch": 0.8115942028985508, | |
| "grad_norm": 0.6125327348709106, | |
| "kl": 0.00266265869140625, | |
| "learning_rate": 1.884057971014493e-06, | |
| "loss": 0.0001, | |
| "reward": 2.625, | |
| "reward_std": 0.3380180299282074, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.75, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 139.875, | |
| "epoch": 0.8260869565217391, | |
| "grad_norm": 0.525627613067627, | |
| "kl": 0.002033233642578125, | |
| "learning_rate": 1.7391304347826088e-06, | |
| "loss": 0.0001, | |
| "reward": 2.65625, | |
| "reward_std": 0.5649385899305344, | |
| "rewards/format_reward_custom": 0.9375, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 121.0, | |
| "epoch": 0.8405797101449275, | |
| "grad_norm": 0.6244301795959473, | |
| "kl": 0.0027923583984375, | |
| "learning_rate": 1.5942028985507246e-06, | |
| "loss": 0.0001, | |
| "reward": 2.875, | |
| "reward_std": 0.27439429610967636, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.9375, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 134.6875, | |
| "epoch": 0.855072463768116, | |
| "grad_norm": 0.6077686548233032, | |
| "kl": 0.0023651123046875, | |
| "learning_rate": 1.4492753623188408e-06, | |
| "loss": 0.0001, | |
| "reward": 2.5, | |
| "reward_std": 0.42632102966308594, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.8125, | |
| "rewards/low_level_action_reward": 0.6875, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 129.125, | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 0.549598217010498, | |
| "kl": 0.001483917236328125, | |
| "learning_rate": 1.3043478260869566e-06, | |
| "loss": 0.0001, | |
| "reward": 2.6875, | |
| "reward_std": 0.4150373041629791, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 132.0625, | |
| "epoch": 0.8840579710144928, | |
| "grad_norm": 0.3656002879142761, | |
| "kl": 0.00152587890625, | |
| "learning_rate": 1.1594202898550726e-06, | |
| "loss": 0.0001, | |
| "reward": 2.837499976158142, | |
| "reward_std": 0.18077217042446136, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.9000000059604645, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 150.25, | |
| "epoch": 0.8985507246376812, | |
| "grad_norm": 0.5994426012039185, | |
| "kl": 0.0025482177734375, | |
| "learning_rate": 1.0144927536231885e-06, | |
| "loss": 0.0001, | |
| "reward": 2.71875, | |
| "reward_std": 0.36348532140254974, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 121.3125, | |
| "epoch": 0.9130434782608695, | |
| "grad_norm": 0.6515089273452759, | |
| "kl": 0.0019683837890625, | |
| "learning_rate": 8.695652173913044e-07, | |
| "loss": 0.0001, | |
| "reward": 2.21875, | |
| "reward_std": 0.4966200590133667, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.65625, | |
| "rewards/low_level_action_reward": 0.5625, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 124.625, | |
| "epoch": 0.927536231884058, | |
| "grad_norm": 0.6351967453956604, | |
| "kl": 0.002838134765625, | |
| "learning_rate": 7.246376811594204e-07, | |
| "loss": 0.0001, | |
| "reward": 2.78125, | |
| "reward_std": 0.3808925449848175, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 121.6875, | |
| "epoch": 0.9420289855072463, | |
| "grad_norm": 0.6020182967185974, | |
| "kl": 0.001895904541015625, | |
| "learning_rate": 5.797101449275363e-07, | |
| "loss": 0.0001, | |
| "reward": 2.8125, | |
| "reward_std": 0.3535533770918846, | |
| "rewards/format_reward_custom": 0.9375, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 125.0, | |
| "epoch": 0.9565217391304348, | |
| "grad_norm": 0.6860438585281372, | |
| "kl": 0.00278472900390625, | |
| "learning_rate": 4.347826086956522e-07, | |
| "loss": 0.0001, | |
| "reward": 2.65625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.71875, | |
| "rewards/low_level_action_reward": 0.9375, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 123.875, | |
| "epoch": 0.9710144927536232, | |
| "grad_norm": 0.587708592414856, | |
| "kl": 0.003173828125, | |
| "learning_rate": 2.8985507246376816e-07, | |
| "loss": 0.0001, | |
| "reward": 2.625, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.8125, | |
| "rewards/low_level_action_reward": 0.8125, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 131.125, | |
| "epoch": 0.9855072463768116, | |
| "grad_norm": 0.5342994928359985, | |
| "kl": 0.002532958984375, | |
| "learning_rate": 1.4492753623188408e-07, | |
| "loss": 0.0001, | |
| "reward": 2.71875, | |
| "reward_std": 0.4419417232275009, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.84375, | |
| "rewards/low_level_action_reward": 0.875, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 122.875, | |
| "epoch": 1.0, | |
| "grad_norm": 0.43572136759757996, | |
| "kl": 0.00244140625, | |
| "learning_rate": 0.0, | |
| "loss": 0.0001, | |
| "reward": 2.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/format_reward_custom": 1.0, | |
| "rewards/high_level_action_reward": 0.875, | |
| "rewards/low_level_action_reward": 1.0, | |
| "step": 69 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 69, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 2, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |