{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 69, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 131.1875, "epoch": 0.014492753623188406, "grad_norm": 0.6139106154441833, "kl": 0.0, "learning_rate": 9.855072463768118e-06, "loss": 0.0, "reward": 2.4375, "reward_std": 0.5868084877729416, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.625, "rewards/low_level_action_reward": 0.8125, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 127.5625, "epoch": 0.028985507246376812, "grad_norm": 0.5722443461418152, "kl": 0.00024318695068359375, "learning_rate": 9.710144927536233e-06, "loss": 0.0, "reward": 2.5, "reward_std": 0.5239592343568802, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.6875, "rewards/low_level_action_reward": 0.8125, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 131.4375, "epoch": 0.043478260869565216, "grad_norm": 0.5824880003929138, "kl": 0.00020122528076171875, "learning_rate": 9.565217391304349e-06, "loss": 0.0, "reward": 2.65625, "reward_std": 0.2651650384068489, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.71875, "rewards/low_level_action_reward": 0.9375, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 118.6875, "epoch": 0.057971014492753624, "grad_norm": 0.36597952246665955, "kl": 0.00018024444580078125, "learning_rate": 9.420289855072464e-06, "loss": 0.0, "reward": 2.9375, "reward_std": 0.1157275140285492, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.9375, "rewards/low_level_action_reward": 1.0, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 152.5625, "epoch": 0.07246376811594203, "grad_norm": 0.6035860180854797, "kl": 0.0002532005310058594, "learning_rate": 9.275362318840581e-06, "loss": 0.0, "reward": 2.5625, "reward_std": 0.5609941333532333, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.8125, "rewards/low_level_action_reward": 0.75, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 127.4375, "epoch": 0.08695652173913043, "grad_norm": 0.6012734174728394, "kl": 0.00024127960205078125, "learning_rate": 9.130434782608697e-06, "loss": 0.0, "reward": 2.53125, "reward_std": 0.619232714176178, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.6875, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 144.125, "epoch": 0.10144927536231885, "grad_norm": 0.5445747375488281, "kl": 0.000244140625, "learning_rate": 8.985507246376812e-06, "loss": 0.0, "reward": 2.59375, "reward_std": 0.38138842582702637, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.75, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 129.375, "epoch": 0.11594202898550725, "grad_norm": 0.5511504411697388, "kl": 0.0002574920654296875, "learning_rate": 8.840579710144929e-06, "loss": 0.0, "reward": 2.5625, "reward_std": 0.4802234023809433, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.75, "rewards/low_level_action_reward": 0.8125, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 146.3125, "epoch": 0.13043478260869565, "grad_norm": 0.39530378580093384, "kl": 0.000354766845703125, "learning_rate": 8.695652173913044e-06, "loss": 0.0, "reward": 2.65625, "reward_std": 0.35197147727012634, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.8125, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 140.375, "epoch": 0.14492753623188406, "grad_norm": 0.6552218794822693, "kl": 0.0005092620849609375, "learning_rate": 8.55072463768116e-06, "loss": 0.0, "reward": 2.4375, "reward_std": 0.5546489059925079, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.75, "rewards/low_level_action_reward": 0.6875, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 133.8125, "epoch": 0.15942028985507245, "grad_norm": 0.5951517224311829, "kl": 0.00035858154296875, "learning_rate": 8.405797101449275e-06, "loss": 0.0, "reward": 2.78125, "reward_std": 0.3061639815568924, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.9375, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 135.3125, "epoch": 0.17391304347826086, "grad_norm": 0.5633934140205383, "kl": 0.0006542205810546875, "learning_rate": 8.260869565217392e-06, "loss": 0.0, "reward": 2.625, "reward_std": 0.2925042062997818, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.8125, "rewards/low_level_action_reward": 0.8125, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 123.5625, "epoch": 0.18840579710144928, "grad_norm": 0.6226204037666321, "kl": 0.0006656646728515625, "learning_rate": 8.115942028985508e-06, "loss": 0.0, "reward": 2.59375, "reward_std": 0.35564958304166794, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.78125, "rewards/low_level_action_reward": 0.8125, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 105.625, "epoch": 0.2028985507246377, "grad_norm": 0.6195806264877319, "kl": 0.000820159912109375, "learning_rate": 7.971014492753623e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.26726123690605164, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.5, "rewards/low_level_action_reward": 0.5, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 125.1875, "epoch": 0.21739130434782608, "grad_norm": 0.6649656891822815, "kl": 0.000728607177734375, "learning_rate": 7.82608695652174e-06, "loss": 0.0, "reward": 2.625, "reward_std": 0.5720614045858383, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.8125, "rewards/low_level_action_reward": 0.8125, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 133.9375, "epoch": 0.2318840579710145, "grad_norm": 0.5864853262901306, "kl": 0.000736236572265625, "learning_rate": 7.681159420289856e-06, "loss": 0.0, "reward": 2.4375, "reward_std": 0.5585024058818817, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.8125, "rewards/low_level_action_reward": 0.625, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 134.5, "epoch": 0.2463768115942029, "grad_norm": 0.6138740181922913, "kl": 0.0010128021240234375, "learning_rate": 7.536231884057972e-06, "loss": 0.0, "reward": 2.5, "reward_std": 0.5500157475471497, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.6875, "rewards/low_level_action_reward": 0.8125, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 121.875, "epoch": 0.2608695652173913, "grad_norm": 0.5959410667419434, "kl": 0.001262664794921875, "learning_rate": 7.391304347826087e-06, "loss": 0.0001, "reward": 2.8125, "reward_std": 0.31539323925971985, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 0.9375, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 141.375, "epoch": 0.2753623188405797, "grad_norm": 0.5705474615097046, "kl": 0.001049041748046875, "learning_rate": 7.246376811594203e-06, "loss": 0.0, "reward": 2.28125, "reward_std": 0.6677263081073761, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.4375, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 135.5625, "epoch": 0.2898550724637681, "grad_norm": 0.43123292922973633, "kl": 0.001407623291015625, "learning_rate": 7.10144927536232e-06, "loss": 0.0001, "reward": 2.90625, "reward_std": 0.18600594997406006, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.96875, "rewards/low_level_action_reward": 0.9375, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 122.0, "epoch": 0.30434782608695654, "grad_norm": 0.6939437985420227, "kl": 0.001312255859375, "learning_rate": 6.956521739130435e-06, "loss": 0.0001, "reward": 2.625, "reward_std": 0.39837799966335297, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.8125, "rewards/low_level_action_reward": 0.8125, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 118.3125, "epoch": 0.3188405797101449, "grad_norm": 0.6187354326248169, "kl": 0.00099945068359375, "learning_rate": 6.811594202898551e-06, "loss": 0.0, "reward": 2.90625, "reward_std": 0.2651650384068489, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.96875, "rewards/low_level_action_reward": 0.9375, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 137.125, "epoch": 0.3333333333333333, "grad_norm": 0.619391143321991, "kl": 0.00128173828125, "learning_rate": 6.666666666666667e-06, "loss": 0.0001, "reward": 2.53125, "reward_std": 0.3966485261917114, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.65625, "rewards/low_level_action_reward": 0.875, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 113.375, "epoch": 0.34782608695652173, "grad_norm": 0.6596890687942505, "kl": 0.001399993896484375, "learning_rate": 6.521739130434783e-06, "loss": 0.0001, "reward": 2.75, "reward_std": 0.32261285185813904, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.75, "rewards/low_level_action_reward": 1.0, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 132.6875, "epoch": 0.36231884057971014, "grad_norm": 0.6133448481559753, "kl": 0.0009860992431640625, "learning_rate": 6.376811594202898e-06, "loss": 0.0, "reward": 2.8125, "reward_std": 0.45117098093032837, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.9375, "rewards/low_level_action_reward": 0.875, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 149.1875, "epoch": 0.37681159420289856, "grad_norm": 0.6592795848846436, "kl": 0.00177764892578125, "learning_rate": 6.2318840579710145e-06, "loss": 0.0001, "reward": 2.59375, "reward_std": 0.6028470396995544, "rewards/format_reward_custom": 0.9375, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.8125, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 129.6875, "epoch": 0.391304347826087, "grad_norm": 0.5780444145202637, "kl": 0.00179290771484375, "learning_rate": 6.086956521739132e-06, "loss": 0.0001, "reward": 2.75, "reward_std": 0.35841864347457886, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 0.875, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 122.4375, "epoch": 0.4057971014492754, "grad_norm": 0.004627756774425507, "kl": 0.00113677978515625, "learning_rate": 5.942028985507247e-06, "loss": 0.0, "reward": 3.0, "reward_std": 0.0, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 1.0, "rewards/low_level_action_reward": 1.0, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 141.0625, "epoch": 0.42028985507246375, "grad_norm": 0.5492585301399231, "kl": 0.001590728759765625, "learning_rate": 5.797101449275363e-06, "loss": 0.0001, "reward": 2.78125, "reward_std": 0.2630179077386856, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.78125, "rewards/low_level_action_reward": 1.0, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 141.0, "epoch": 0.43478260869565216, "grad_norm": 0.5583959221839905, "kl": 0.002197265625, "learning_rate": 5.652173913043479e-06, "loss": 0.0001, "reward": 2.34375, "reward_std": 0.38138842582702637, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.65625, "rewards/low_level_action_reward": 0.6875, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 132.8125, "epoch": 0.4492753623188406, "grad_norm": 0.5492476224899292, "kl": 0.002262115478515625, "learning_rate": 5.507246376811595e-06, "loss": 0.0001, "reward": 2.4375, "reward_std": 0.5260358154773712, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 0.5625, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 138.875, "epoch": 0.463768115942029, "grad_norm": 0.5487068891525269, "kl": 0.00237274169921875, "learning_rate": 5.362318840579711e-06, "loss": 0.0001, "reward": 2.59375, "reward_std": 0.34475886821746826, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.78125, "rewards/low_level_action_reward": 0.8125, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 142.875, "epoch": 0.4782608695652174, "grad_norm": 0.5532354712486267, "kl": 0.001415252685546875, "learning_rate": 5.2173913043478265e-06, "loss": 0.0001, "reward": 2.65625, "reward_std": 0.35098859667778015, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.71875, "rewards/low_level_action_reward": 0.9375, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 124.75, "epoch": 0.4927536231884058, "grad_norm": 0.5815756916999817, "kl": 0.00170135498046875, "learning_rate": 5.072463768115943e-06, "loss": 0.0001, "reward": 2.75, "reward_std": 0.39837799966335297, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.9375, "rewards/low_level_action_reward": 0.8125, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 114.4375, "epoch": 0.5072463768115942, "grad_norm": 0.6308125257492065, "kl": 0.0013580322265625, "learning_rate": 4.927536231884059e-06, "loss": 0.0001, "reward": 2.625, "reward_std": 0.3535533770918846, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 0.75, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 148.625, "epoch": 0.5217391304347826, "grad_norm": 0.5474228858947754, "kl": 0.00164031982421875, "learning_rate": 4.782608695652174e-06, "loss": 0.0001, "reward": 2.625, "reward_std": 0.4355512708425522, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 0.75, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 124.75, "epoch": 0.5362318840579711, "grad_norm": 0.46679696440696716, "kl": 0.002105712890625, "learning_rate": 4.637681159420291e-06, "loss": 0.0001, "reward": 2.78125, "reward_std": 0.33905068039894104, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.90625, "rewards/low_level_action_reward": 0.875, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 131.125, "epoch": 0.5507246376811594, "grad_norm": 0.6113550662994385, "kl": 0.002593994140625, "learning_rate": 4.492753623188406e-06, "loss": 0.0001, "reward": 2.78125, "reward_std": 0.3749881833791733, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.90625, "rewards/low_level_action_reward": 0.875, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 132.25, "epoch": 0.5652173913043478, "grad_norm": 0.6611613035202026, "kl": 0.00252532958984375, "learning_rate": 4.347826086956522e-06, "loss": 0.0001, "reward": 2.625, "reward_std": 0.5487885922193527, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.75, "rewards/low_level_action_reward": 0.875, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 115.5, "epoch": 0.5797101449275363, "grad_norm": 0.6072784662246704, "kl": 0.002056121826171875, "learning_rate": 4.202898550724638e-06, "loss": 0.0001, "reward": 2.84375, "reward_std": 0.3808925449848175, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.90625, "rewards/low_level_action_reward": 0.9375, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 129.875, "epoch": 0.5942028985507246, "grad_norm": 0.5880132913589478, "kl": 0.0030670166015625, "learning_rate": 4.057971014492754e-06, "loss": 0.0001, "reward": 2.65625, "reward_std": 0.3966485261917114, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.78125, "rewards/low_level_action_reward": 0.875, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 120.75, "epoch": 0.6086956521739131, "grad_norm": 0.6412323117256165, "kl": 0.0033416748046875, "learning_rate": 3.91304347826087e-06, "loss": 0.0001, "reward": 2.21875, "reward_std": 0.3818188011646271, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.59375, "rewards/low_level_action_reward": 0.625, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 131.3125, "epoch": 0.6231884057971014, "grad_norm": 0.6180394887924194, "kl": 0.00208282470703125, "learning_rate": 3.768115942028986e-06, "loss": 0.0001, "reward": 2.65625, "reward_std": 0.5133327841758728, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.8125, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 128.5625, "epoch": 0.6376811594202898, "grad_norm": 0.6940774917602539, "kl": 0.002685546875, "learning_rate": 3.6231884057971017e-06, "loss": 0.0001, "reward": 2.625, "reward_std": 0.44099316000938416, "rewards/format_reward_custom": 0.9375, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 0.8125, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 126.6875, "epoch": 0.6521739130434783, "grad_norm": 0.6032717227935791, "kl": 0.00244140625, "learning_rate": 3.4782608695652175e-06, "loss": 0.0001, "reward": 2.65625, "reward_std": 0.5762138962745667, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.8125, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 137.25, "epoch": 0.6666666666666666, "grad_norm": 0.3984113335609436, "kl": 0.001659393310546875, "learning_rate": 3.3333333333333333e-06, "loss": 0.0001, "reward": 2.84375, "reward_std": 0.2651650309562683, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.90625, "rewards/low_level_action_reward": 0.9375, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 134.3125, "epoch": 0.6811594202898551, "grad_norm": 0.37174344062805176, "kl": 0.003200531005859375, "learning_rate": 3.188405797101449e-06, "loss": 0.0001, "reward": 2.875, "reward_std": 0.13363061845302582, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 1.0, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 113.625, "epoch": 0.6956521739130435, "grad_norm": 0.6417880654335022, "kl": 0.0034027099609375, "learning_rate": 3.043478260869566e-06, "loss": 0.0001, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.5, "rewards/low_level_action_reward": 0.625, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 120.875, "epoch": 0.7101449275362319, "grad_norm": 0.3155536353588104, "kl": 0.003082275390625, "learning_rate": 2.8985507246376816e-06, "loss": 0.0001, "reward": 2.84375, "reward_std": 0.18600594997406006, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.90625, "rewards/low_level_action_reward": 0.9375, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 133.3125, "epoch": 0.7246376811594203, "grad_norm": 0.6605053544044495, "kl": 0.00164031982421875, "learning_rate": 2.7536231884057974e-06, "loss": 0.0001, "reward": 2.65625, "reward_std": 0.36348532140254974, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.78125, "rewards/low_level_action_reward": 0.875, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 141.9375, "epoch": 0.7391304347826086, "grad_norm": 0.5570524334907532, "kl": 0.002655029296875, "learning_rate": 2.6086956521739132e-06, "loss": 0.0001, "reward": 2.59375, "reward_std": 0.7191916108131409, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.78125, "rewards/low_level_action_reward": 0.8125, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 141.1875, "epoch": 0.7536231884057971, "grad_norm": 0.4170561730861664, "kl": 0.001430511474609375, "learning_rate": 2.4637681159420295e-06, "loss": 0.0001, "reward": 2.90625, "reward_std": 0.1293872892856598, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.90625, "rewards/low_level_action_reward": 1.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 121.5625, "epoch": 0.7681159420289855, "grad_norm": 0.6030449867248535, "kl": 0.0033111572265625, "learning_rate": 2.3188405797101453e-06, "loss": 0.0001, "reward": 2.212499976158142, "reward_std": 0.46406444907188416, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.5250000059604645, "rewards/low_level_action_reward": 0.6875, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 130.0625, "epoch": 0.782608695652174, "grad_norm": 0.40535715222358704, "kl": 0.001888275146484375, "learning_rate": 2.173913043478261e-06, "loss": 0.0001, "reward": 2.90625, "reward_std": 0.1293872892856598, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.90625, "rewards/low_level_action_reward": 1.0, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 133.25, "epoch": 0.7971014492753623, "grad_norm": 0.5601004958152771, "kl": 0.00250244140625, "learning_rate": 2.028985507246377e-06, "loss": 0.0001, "reward": 2.5, "reward_std": 0.5945880711078644, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.8125, "rewards/low_level_action_reward": 0.6875, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 123.3125, "epoch": 0.8115942028985508, "grad_norm": 0.6125327348709106, "kl": 0.00266265869140625, "learning_rate": 1.884057971014493e-06, "loss": 0.0001, "reward": 2.625, "reward_std": 0.3380180299282074, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.75, "rewards/low_level_action_reward": 0.875, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 139.875, "epoch": 0.8260869565217391, "grad_norm": 0.525627613067627, "kl": 0.002033233642578125, "learning_rate": 1.7391304347826088e-06, "loss": 0.0001, "reward": 2.65625, "reward_std": 0.5649385899305344, "rewards/format_reward_custom": 0.9375, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.875, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 121.0, "epoch": 0.8405797101449275, "grad_norm": 0.6244301795959473, "kl": 0.0027923583984375, "learning_rate": 1.5942028985507246e-06, "loss": 0.0001, "reward": 2.875, "reward_std": 0.27439429610967636, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.9375, "rewards/low_level_action_reward": 0.9375, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 134.6875, "epoch": 0.855072463768116, "grad_norm": 0.6077686548233032, "kl": 0.0023651123046875, "learning_rate": 1.4492753623188408e-06, "loss": 0.0001, "reward": 2.5, "reward_std": 0.42632102966308594, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.8125, "rewards/low_level_action_reward": 0.6875, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 129.125, "epoch": 0.8695652173913043, "grad_norm": 0.549598217010498, "kl": 0.001483917236328125, "learning_rate": 1.3043478260869566e-06, "loss": 0.0001, "reward": 2.6875, "reward_std": 0.4150373041629791, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 0.8125, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 132.0625, "epoch": 0.8840579710144928, "grad_norm": 0.3656002879142761, "kl": 0.00152587890625, "learning_rate": 1.1594202898550726e-06, "loss": 0.0001, "reward": 2.837499976158142, "reward_std": 0.18077217042446136, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.9000000059604645, "rewards/low_level_action_reward": 0.9375, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 150.25, "epoch": 0.8985507246376812, "grad_norm": 0.5994426012039185, "kl": 0.0025482177734375, "learning_rate": 1.0144927536231885e-06, "loss": 0.0001, "reward": 2.71875, "reward_std": 0.36348532140254974, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.875, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 121.3125, "epoch": 0.9130434782608695, "grad_norm": 0.6515089273452759, "kl": 0.0019683837890625, "learning_rate": 8.695652173913044e-07, "loss": 0.0001, "reward": 2.21875, "reward_std": 0.4966200590133667, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.65625, "rewards/low_level_action_reward": 0.5625, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 124.625, "epoch": 0.927536231884058, "grad_norm": 0.6351967453956604, "kl": 0.002838134765625, "learning_rate": 7.246376811594204e-07, "loss": 0.0001, "reward": 2.78125, "reward_std": 0.3808925449848175, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.9375, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 121.6875, "epoch": 0.9420289855072463, "grad_norm": 0.6020182967185974, "kl": 0.001895904541015625, "learning_rate": 5.797101449275363e-07, "loss": 0.0001, "reward": 2.8125, "reward_std": 0.3535533770918846, "rewards/format_reward_custom": 0.9375, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 1.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 125.0, "epoch": 0.9565217391304348, "grad_norm": 0.6860438585281372, "kl": 0.00278472900390625, "learning_rate": 4.347826086956522e-07, "loss": 0.0001, "reward": 2.65625, "reward_std": 0.2651650384068489, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.71875, "rewards/low_level_action_reward": 0.9375, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 123.875, "epoch": 0.9710144927536232, "grad_norm": 0.587708592414856, "kl": 0.003173828125, "learning_rate": 2.8985507246376816e-07, "loss": 0.0001, "reward": 2.625, "reward_std": 0.2925042062997818, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.8125, "rewards/low_level_action_reward": 0.8125, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 131.125, "epoch": 0.9855072463768116, "grad_norm": 0.5342994928359985, "kl": 0.002532958984375, "learning_rate": 1.4492753623188408e-07, "loss": 0.0001, "reward": 2.71875, "reward_std": 0.4419417232275009, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.84375, "rewards/low_level_action_reward": 0.875, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 122.875, "epoch": 1.0, "grad_norm": 0.43572136759757996, "kl": 0.00244140625, "learning_rate": 0.0, "loss": 0.0001, "reward": 2.875, "reward_std": 0.13363061845302582, "rewards/format_reward_custom": 1.0, "rewards/high_level_action_reward": 0.875, "rewards/low_level_action_reward": 1.0, "step": 69 } ], "logging_steps": 1.0, "max_steps": 69, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }